Python Dependencytree примеры использования

Язык программирования: Python

Пространство имен/Пакет: DependencyTree

Класс/Тип: Dependencytree

Примеров на hotexamples.com: 6

Найдено 6 примеров использования Python Dependencytree. Это лучшие примеры кода на Python для DependencyTree.Dependencytree, взятые из проектов с открытым исходным кодом. Вы можете оценить каждый пример, чтобы помочь нам улучшить качество подборки.

Основные методы

Показать Скрыть

updateTree(3)

wordTags(3)

getPhrase(1)

isConstituent(1)

isContiguous(1)

sentLen(1)

Пример #1

Показать файл

Файл: CSHandler.py Проект: phanigadde/CSRelated

 def __init__(self):
   """Initialise every piece of handler state to an empty default."""
   # Language tag strings; all start out empty.
   self.__L1 = self.__L2 = ""
   self.__curL1 = self.__curL2 = ""
   # One dependency tree per side of a sentence pair (an earlier variant
   # of this code used Parsetree objects here instead).
   self.__L1Tree = Dependencytree()
   self.__L2Tree = Dependencytree()
   # Alignment tables and the phrase map are separate, initially empty dicts.
   self.__align, self.__revAlign = {}, {}
   self.__utils = Utils()
   self.__phraseMap = {}
   self.__l1Index = 0
   self.__clausalChunks = [
     "CCP",
     "VGF",
     "NULL__CCP",
     "NULL__VGF",
   ]

Пример #2

Показать файл

Файл: DataGenerator.py Проект: phanigadde/CSRelated

 def __init__(self, outDir):
   """Store outDir as the output directory and create the helper objects."""
   # Plain values first.
   self.config = None
   self.__outputDir = outDir
   self.__fileSuffix = ""
   self.__csHash = set()
   # Collaborators, created in this fixed order.
   self.__csInstance = CSHandler()
   self.__dataHandler = DataHandler()
   self.__utils = Utils()
   self.__Tree = Dependencytree()
   # Called last, after self.config has been reset to None above;
   # presumably it installs the real configuration.
   self.prepareConfig()

Пример #3

Показать файл

Файл: DataGenerator.py Проект: phanigadde/CSRelated

 def __init__(self, outDir):
   """Set up all generator state and run the pre-processing steps.

   outDir is stored as the output directory. The constructor also logs a
   line to stderr, builds the POS and phrase maps through the private
   __genPosMap / __genPhraseMap helpers and hands the phrase map and the
   two language IDs to the CSHandler instance.
   """
   sys.stderr.write("DataGenerator: Constructor\n")
   ## Languages and Order
   self.__LID = ["HI","EN"]
   # NOTE(review): absolute, machine-specific paths are hard-coded here.
   self.__l2MapFile = "/usr0/home/pgadde/Work/CodeSwitching/Hinglish/Data/UniMaps/en-ptb.map"
   self.__l1MapFile = "/usr0/home/pgadde/Work/CodeSwitching/Hinglish/Data/UniMaps/hi-hyd.map"
   ## Data containers
   self.__parL1 = []
   self.__parL2 = []
   self.__align = []
   self.__pureL1 = []
   self.__pureL2 = []
   self.__outputDir = outDir
   self.__posMap = {}
   # dd is presumably an alias of collections.defaultdict -- confirm at the
   # file's import block.
   self.__phraseMap = dd(list)
   self.__csInstance = CSHandler()
   self.__utils = Utils()
   self.__Tree = Dependencytree()
   
   ## Generation Variants
   self.__csVariants = [0,1,2,3,4]
   # __tagsetVariants and __dataRange are assigned again further down
   # ("Real test overwrites"); the values set here are only visible to the
   # pre-processing calls that run in between.
   self.__tagsetVariants = ["",".uni"]
   self.__dataRange = range(50,900,50)
   ##self.__dataRange = [200]
   self.__splits = [(50,50),(60,40),(70,30),(80,20),(90,10)]
   self.__csHash = set()
   ##LID stuff
   self.__L1Tags = set()
   self.__L2Tags = set()
   self.__commonTags = set()
   ## Pre processing
   # Order matters: the phrase map is generated before it is passed on to
   # the CS handler.
   self.__genPosMap()
   self.__genPhraseMap()
   self.__csInstance.updatePhraseMap(self.__phraseMap)
   self.__csInstance.updateLIDTags(self.__LID[0], self.__LID[1])
   
   ## Real test overwrites
   # These replace the "Generation Variants" defaults assigned above.
   #self.__csVariants = [1,2,3,4]
   self.__tagsetVariants = [""]
   self.__dataRange = [400]
   # Per-CS-type data sizes, keyed by the CS variant number.
   self.__dataRanges = {0:range(40,601,40), 1:range(40,601,40), 2:range(35,540,35), 3:range(30,451,30), 4:range(15,231,15)}
   #self.__dataRanges = {0:[880], 1:[880], 2:[800], 3:[630], 4:[330]}
   #self.__dataRanges = {0:[60], 1:[60], 2:[60], 3:[60], 4:[60]}
   #self.__splits = [(50,50)]
   #for i in range(0,51,5):
   #  split = (100-i, i)
   #  self.__splits.append(split)
   self.__fileSuffix = ""

Пример #4

Показать файл

Файл: DataGenerator.py Проект: phanigadde/CSRelated

class Generator:
  def __init__(self, outDir):
    """Store outDir as the output directory, create helpers, load the config."""
    # Plain values first.
    self.config = None
    self.__outputDir = outDir
    self.__fileSuffix = ""
    self.__csHash = set()
    # Collaborators, created in this fixed order.
    self.__csInstance = CSHandler()
    self.__dataHandler = DataHandler()
    self.__utils = Utils()
    self.__Tree = Dependencytree()
    # Replaces the None placeholder in self.config with a GeneratingConfig.
    self.prepareConfig()
    
  def prepareConfig(self):
    """Create a GeneratingConfig and fill in the default generation settings.

    Defaults: CS variants 0-4, data sizes 50..1000 in steps of 50 for every
    variant, five pure/CS splits from 50/50 to 90/10 and the tagset
    variants ".uniq" and ".uni".
    """
    settings = GeneratingConfig()
    self.config = settings
    settings.setCSVariants([0, 1, 2, 3, 4])
    # Every variant gets its own, freshly built range object.
    perVariantSizes = dict((variant, range(50, 1001, 50)) for variant in range(5))
    settings.setDataRanges(perVariantSizes)
    defaultSplits = [(50, 50), (60, 40), (70, 30), (80, 20), (90, 10)]
    settings.setSplits(defaultSplits)
    settings.setTagsetVariants([".uniq", ".uni"])
  
  def prepareGenerator(self):
    """Pass the data handler's first two language IDs to the CS handler."""
    langIds = self.__dataHandler.LID
    firstLang = langIds[0]
    secondLang = langIds[1]
    self.__csInstance.updateLIDTags(firstLang, secondLang)
  
  def prepareRealTest(self, dataFile, outFile):
    """Rewrite a tagged data file through DataHandler.mapLD2Uni.

    Every input line is split on whitespace into tokens and every token on
    '_#' into its fields; the resulting sentence is passed to mapLD2Uni and
    written out again in the same space-separated, '_#'-joined layout, one
    sentence per line.

    dataFile -- path of the file to read
    outFile  -- path of the file to write (opened with mode 'w')
    """
    # Context managers guarantee that both handles are closed, also when
    # mapLD2Uni or a write raises. Nested form keeps Python 2.6 compatibility.
    with open(dataFile) as source:
      with open(outFile, 'w') as sink:
        for rawLine in source:
          sentence = [token.split('_#') for token in rawLine.strip().split()]
          uniLine = self.__dataHandler.mapLD2Uni(sentence)
          sink.write(' '.join(['_#'.join(fields) for fields in uniLine]) + '\n')

  def generateTestData(self):
    """Write CS and pure ("_Control") data files for data sizes 30, 80, 130.

    For every configured CS variant, every data size and every pure/CS
    split, four files are written below the output directory: the CS data
    and the pure control data, each in its native form and in the ".uni"
    form produced by DataHandler.mapLD2Uni.

    NOTE(review): despite the method name the files are written with the
    "TrainCSType..." prefix.
    """
    # Replaces whatever data ranges were configured before this call.
    self.config.setDataRanges({0:range(30, 151, 50), 1:range(30, 151, 50), 2:range(30, 151, 50), 3:range(30, 151, 50), 4:range(30, 151, 50)})
    for csType in self.config.csVariants:
      print "type" + str(csType)
      for data in self.config.dataRanges[csType]:
        print
        # Trailing comma: Python 2 print without a newline.
        print " numSents:" + str(data * 2),
        # CS pairs generated for the first split; later splits sample from it.
        initialSplitCSData = []
        for splitIndex in range(len(self.config.splits)):
          csData = []
          Split = self.config.splits[splitIndex]
          pureData = []
          
          # NOTE(review): the four handles are closed only at the end of this
          # iteration, so an exception in between leaves them open.
          pureFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + "_Control" + self.__fileSuffix, 'w')
          dataFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + self.__fileSuffix, 'w')
          pureUniFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + "_Control" + ".uni" + self.__fileSuffix, 'w')
          dataUniFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + ".uni" + self.__fileSuffix, 'w')
          
          # pr: pure sentences drawn per language; tr: CS sentence pairs.
          pr = int((Split[0] * 1.0 / (Split[0] + Split[1])) * data)
          tr = data - pr
          print " Pure:" + str(2 * pr),
          print " CS:" + str(2 * tr),
          # No seed argument: the generator is reseeded from a system source,
          # so the sampled data differs between runs.
          random.seed()
          
          # random.sample raises ValueError when pr exceeds the number of
          # available pure sentences.
          pIndicesL1 = random.sample(range(len(self.__dataHandler.pureL1)), pr)
          pIndicesL2 = random.sample(range(len(self.__dataHandler.pureL2)), pr)
          
          # The sampled pure sentences go into BOTH the control and the CS set.
          for index in pIndicesL1:
            line = self.__dataHandler.pureL1[index]
            line = self.__dataHandler.addLangTags(line, self.__dataHandler.LID[0])
            line = self.__dataHandler.makeLD(line)
            pureData.append(tuple(line))
            csData.append(tuple(line))
          
          for index in pIndicesL2:
            line = self.__dataHandler.pureL2[index]
            line = self.__dataHandler.addLangTags(line, self.__dataHandler.LID[1])
            line = self.__dataHandler.makeLD(line)
            pureData.append(tuple(line))
            csData.append(tuple(line))

          if splitIndex != 0:
            # Later splits reuse a random subset of the pairs built for split 0.
            # NOTE(review): needs tr <= len(initialSplitCSData), i.e. the first
            # split must carry the largest CS share -- verify for custom splits.
            random.seed()
            csSample = random.sample(initialSplitCSData, tr)
            for sample in csSample:
              # sample = (csLine order 0, csLine order 1, pureLine1, pureLine2)
              csData.append(sample[0])
              csData.append(sample[1])
              pureData.append(sample[2])
              pureData.append(sample[3])
          else:
            # First split: generate fresh CS pairs from the parallel data.
            self.__csHash = set()
            stopLength = tr
            index = -1
            # The only exit is the break at the bottom (stopLength <= 0).
            # NOTE(review): the index wraps around, so this never terminates if
            # too few sentence pairs produce CS output for both orders.
            while 1:
              index += 1
              
              if index == len(self.__dataHandler.parL1):
                ##break
                index = 0
                print "Still:", stopLength, " Looping.."
              
              csLines = []
              # Filled below but not read anywhere in this method.
              csSeqs = []
              
              hashKeys = ["", ""]
              # Try both orders; order is handed to updateHandler as its
              # fourth argument.
              for order in range(2):
              #order = stopLength%2
                self.__csInstance.updateHandler(self.__dataHandler.parL1[index], self.__dataHandler.parL2[index], self.__dataHandler.align[index], order)
                csReturn = self.__csInstance.csSentence(csType)
                csLine = csReturn[0]
                # A first element of -1 is treated as "no CS sentence produced".
                if csLine != -1:
                  hashKeys[order] = (index, order, tuple(csReturn[1]))
                  csLines.append(csLine)
                  csSeqs.append(csReturn[1])
              
              # Keep the pair only when both orders produced a CS sentence.
              if len(csLines) == 2:
                csWords = set([x[0] for x in csLines[0]]) | set([x[0] for x in csLines[1]])
                # Build the matching pure lines for the control set.
                self.__Tree.updateTree(self.__dataHandler.parL1[index])
                pureLine1 = self.__Tree.wordTags()
                pureLine1 = self.__dataHandler.addLangTags(pureLine1, self.__dataHandler.LID[0])
                pureLine1 = self.__dataHandler.makeLD(pureLine1)
                self.__Tree.updateTree(self.__dataHandler.parL2[index])
                pureLine2 = self.__Tree.wordTags()
                pureLine2 = self.__dataHandler.addLangTags(pureLine2, self.__dataHandler.LID[1])
                pureLine2 = self.__dataHandler.makeLD(pureLine2)
                pureWords = set([x[0] for x in pureLine1]) | set([x[0] for x in pureLine2])
                # NOTE(review): "True or ..." short-circuits, so the word-set and
                # duplicate checks are never evaluated; every pair is accepted
                # and __csHash is written but never consulted.
                if True or pureWords == csWords and hashKeys[0] not in self.__csHash and hashKeys[1] not in self.__csHash:
                  pureData.append(tuple(pureLine1))
                  pureData.append(tuple(pureLine2))
                  csData.append(tuple(csLines[0]))
                  csData.append(tuple(csLines[1]))
                  # Always true here: this branch only runs for splitIndex == 0.
                  if splitIndex == 0:
                    initialSplitCSData.append((tuple(csLines[0]), tuple(csLines[1]), tuple(pureLine1), tuple(pureLine2)))
                  stopLength -= 1
                  for hashKey in hashKeys:
                    self.__csHash.add(hashKey)
              else:
                # Nothing was added; this also skips the stop check below.
                continue
              
              if stopLength <= 0:
                break
              
            # NOTE(review): unreachable -- the loop above only breaks once
            # stopLength <= 0.
            if stopLength > 0:
              print tr, stopLength, "Testing Break!!"
              dummy = raw_input()
            
          # Each sentence is written twice: mapped through mapLD2Uni for the
          # ".uni" file and unchanged for the plain file.
          for csLine in csData:
            dataUniFile.write(self.makeString(self.__dataHandler.mapLD2Uni(csLine)))
            dataFile.write(self.makeString(csLine))
          for pureLine in pureData:
            pureFile.write(self.makeString(pureLine))
            pureUniFile.write(self.makeString(self.__dataHandler.mapLD2Uni(pureLine)))
          pureFile.close()
          dataFile.close()
          pureUniFile.close()
          dataUniFile.close()
  
  
  def generateDataForTest(self):
    """Run generateTrainDataForTest ten times with file suffixes ".0" to ".9"."""
    runId = 0
    while runId < 10:
      # The suffix ends up in every file name written by the call below.
      self.__fileSuffix = "." + str(runId)
      self.generateTrainDataForTest()
      runId += 1
  
  def generateTrainDataForTest(self):
    """Write CS and pure ("_Control") training files for the fixed size 450.

    For every configured CS variant and every pure/CS split, four files are
    written below the output directory: the CS data and the pure control
    data, each in its native form and in the ".uni" form produced by
    DataHandler.mapLD2Uni. The current file suffix is appended to every
    file name. Progress is printed to stdout.
    """
    # Replaces whatever data ranges were configured before this call.
    self.config.setDataRanges({0:[450], 1:[450], 2:[450], 3:[450], 4:[450]})
    # Number of (csType, size, split) combinations written so far.
    statusCount = 0
    for csType in self.config.csVariants:
      # Trailing comma: Python 2 print without a newline.
      print "type" + str(csType),
      for data in self.config.dataRanges[csType]:
        print " numSents:" + str(data * 2),
        # CS pairs generated for the first split; later splits sample from it.
        initialSplitCSData = []
        for splitIndex in range(len(self.config.splits)):
          csData = []
          Split = self.config.splits[splitIndex]
          pureData = []
          
          # NOTE(review): the four handles are closed only at the end of this
          # iteration, so an exception in between leaves them open.
          pureFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + "_Control" + self.__fileSuffix, 'w')
          dataFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + self.__fileSuffix, 'w')
          pureUniFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + "_Control" + ".uni" + self.__fileSuffix, 'w')
          dataUniFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + ".uni" + self.__fileSuffix, 'w')
          
          # pr: pure sentences drawn per language; tr: CS sentence pairs.
          pr = int((Split[0] * 1.0 / (Split[0] + Split[1])) * data)
          tr = data - pr
          print " Pure:" + str(2 * pr),
          print " CS:" + str(2 * tr),
          # End the progress line after the last split.
          if splitIndex == len(self.config.splits) - 1:
            print
          # No seed argument: the generator is reseeded from a system source,
          # so the sampled data differs between runs.
          random.seed()
          
          # random.sample raises ValueError when pr exceeds the number of
          # available pure sentences.
          pIndicesL1 = random.sample(range(len(self.__dataHandler.pureL1)), pr)
          pIndicesL2 = random.sample(range(len(self.__dataHandler.pureL2)), pr)
          
          # The sampled pure sentences go into BOTH the control and the CS set.
          for index in pIndicesL1:
            line = self.__dataHandler.pureL1[index]
            line = self.__dataHandler.addLangTags(line, self.__dataHandler.LID[0])
            line = self.__dataHandler.makeLD(line)
            pureData.append(tuple(line))
            csData.append(tuple(line))
          
          for index in pIndicesL2:
            line = self.__dataHandler.pureL2[index]
            line = self.__dataHandler.addLangTags(line, self.__dataHandler.LID[1])
            line = self.__dataHandler.makeLD(line)
            pureData.append(tuple(line))
            csData.append(tuple(line))

          if splitIndex != 0:
            # Later splits reuse a random subset of the pairs built for split 0.
            # NOTE(review): needs tr <= len(initialSplitCSData), i.e. the first
            # split must carry the largest CS share -- verify for custom splits.
            random.seed()
            csSample = random.sample(initialSplitCSData, tr)
            for sample in csSample:
              # sample = (csLine order 0, csLine order 1, pureLine1, pureLine2)
              csData.append(sample[0])
              csData.append(sample[1])
              pureData.append(sample[2])
              pureData.append(sample[3])
          else:
            # First split: generate fresh CS pairs from the parallel data.
            self.__csHash = set()
            stopLength = tr
            index = -1
            # The only exit is the break at the bottom (stopLength <= 0).
            # NOTE(review): the index wraps around, so this never terminates if
            # too few sentence pairs produce CS output for both orders.
            while 1:
              index += 1
              
              if index == len(self.__dataHandler.parL1):
                ##break
                index = 0
                print "Still:", stopLength, " Looping.. ",
              
              csLines = []
              # Filled below but not read anywhere in this method.
              csSeqs = []
              
              hashKeys = ["", ""]
              # Try both orders; order is handed to updateHandler as its
              # fourth argument.
              for order in range(2):
              #order = stopLength%2
                self.__csInstance.updateHandler(self.__dataHandler.parL1[index], self.__dataHandler.parL2[index], self.__dataHandler.align[index], order)
                csReturn = self.__csInstance.csSentence(csType)
                csLine = csReturn[0]
                # A first element of -1 is treated as "no CS sentence produced".
                if csLine != -1:
                  hashKeys[order] = (index, order, tuple(csReturn[1]))
                  csLines.append(csLine)
                  csSeqs.append(csReturn[1])
              
              # Keep the pair only when both orders produced a CS sentence.
              if len(csLines) == 2:
                csWords = set([x[0] for x in csLines[0]]) | set([x[0] for x in csLines[1]])
                # Build the matching pure lines for the control set.
                self.__Tree.updateTree(self.__dataHandler.parL1[index])
                pureLine1 = self.__Tree.wordTags()
                pureLine1 = self.__dataHandler.addLangTags(pureLine1, self.__dataHandler.LID[0])
                pureLine1 = self.__dataHandler.makeLD(pureLine1)
                self.__Tree.updateTree(self.__dataHandler.parL2[index])
                pureLine2 = self.__Tree.wordTags()
                pureLine2 = self.__dataHandler.addLangTags(pureLine2, self.__dataHandler.LID[1])
                pureLine2 = self.__dataHandler.makeLD(pureLine2)
                pureWords = set([x[0] for x in pureLine1]) | set([x[0] for x in pureLine2])
                # NOTE(review): "True or ..." short-circuits, so the word-set and
                # duplicate checks are never evaluated; every pair is accepted
                # and __csHash is written but never consulted.
                if True or pureWords == csWords and hashKeys[0] not in self.__csHash and hashKeys[1] not in self.__csHash:
                  pureData.append(tuple(pureLine1))
                  pureData.append(tuple(pureLine2))
                  csData.append(tuple(csLines[0]))
                  csData.append(tuple(csLines[1]))
                  # Always true here: this branch only runs for splitIndex == 0.
                  if splitIndex == 0:
                    initialSplitCSData.append((tuple(csLines[0]), tuple(csLines[1]), tuple(pureLine1), tuple(pureLine2)))
                  stopLength -= 1
                  for hashKey in hashKeys:
                    self.__csHash.add(hashKey)
              else:
                # Nothing was added; this also skips the stop check below.
                continue
              
              if stopLength <= 0:
                break
              
            # NOTE(review): unreachable -- the loop above only breaks once
            # stopLength <= 0.
            if stopLength > 0:
              print tr, stopLength, "Training Break!!"
              dummy = raw_input()
            
          # Each sentence is written twice: mapped through mapLD2Uni for the
          # ".uni" file and unchanged for the plain file.
          for csLine in csData:
            dataUniFile.write(self.makeString(self.__dataHandler.mapLD2Uni(csLine)))
            dataFile.write(self.makeString(csLine))
          for pureLine in pureData:
            pureFile.write(self.makeString(pureLine))
            pureUniFile.write(self.makeString(self.__dataHandler.mapLD2Uni(pureLine)))
          pureFile.close()
          dataFile.close()
          pureUniFile.close()
          dataUniFile.close()

          # Progress report every 50 written combinations.
          statusCount += 1
          if statusCount % 50 == 0:
            print statusCount,
            sys.stdout.flush()
    print statusCount

  
  def generateTrainData(self):
    """Write CS and pure ("_Control") training files for the configured sizes.

    For every configured CS variant, every data size in the configured data
    ranges and every pure/CS split, four files are written below the output
    directory: the CS data and the pure control data, each in its native
    form and in the ".uni" form produced by DataHandler.mapLD2Uni.
    Progress is printed to stdout.
    """
    # Number of (csType, size, split) combinations written so far.
    statusCount = 0
    for csType in self.config.csVariants:
      print "type" + str(csType)
      for data in self.config.dataRanges[csType]:
        print
        # Trailing comma: Python 2 print without a newline.
        print " numSents:" + str(data * 2),
        # CS pairs generated for the first split; later splits sample from it.
        initialSplitCSData = []
        for splitIndex in range(len(self.config.splits)):
          csData = []
          Split = self.config.splits[splitIndex]
          pureData = []
          
          # NOTE(review): the four handles are closed only at the end of this
          # iteration, so an exception in between leaves them open.
          pureFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + "_Control" + self.__fileSuffix, 'w')
          dataFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + self.__fileSuffix, 'w')
          pureUniFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + "_Control" + ".uni" + self.__fileSuffix, 'w')
          dataUniFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + ".uni" + self.__fileSuffix, 'w')
          
          # pr: pure sentences drawn per language; tr: CS sentence pairs.
          pr = int((Split[0] * 1.0 / (Split[0] + Split[1])) * data)
          tr = data - pr
          print " Pure:" + str(2 * pr),
          print " CS:" + str(2 * tr),
          # No seed argument: the generator is reseeded from a system source,
          # so the sampled data differs between runs.
          random.seed()
          
          # random.sample raises ValueError when pr exceeds the number of
          # available pure sentences.
          pIndicesL1 = random.sample(range(len(self.__dataHandler.pureL1)), pr)
          pIndicesL2 = random.sample(range(len(self.__dataHandler.pureL2)), pr)
          
          # The sampled pure sentences go into BOTH the control and the CS set.
          for index in pIndicesL1:
            line = self.__dataHandler.pureL1[index]
            line = self.__dataHandler.addLangTags(line, self.__dataHandler.LID[0])
            line = self.__dataHandler.makeLD(line)
            pureData.append(tuple(line))
            csData.append(tuple(line))
          
          for index in pIndicesL2:
            line = self.__dataHandler.pureL2[index]
            line = self.__dataHandler.addLangTags(line, self.__dataHandler.LID[1])
            line = self.__dataHandler.makeLD(line)
            pureData.append(tuple(line))
            csData.append(tuple(line))

          if splitIndex != 0:
            # Later splits reuse a random subset of the pairs built for split 0.
            # NOTE(review): needs tr <= len(initialSplitCSData), i.e. the first
            # split must carry the largest CS share -- verify for custom splits.
            random.seed()
            csSample = random.sample(initialSplitCSData, tr)
            for sample in csSample:
              # sample = (csLine order 0, csLine order 1, pureLine1, pureLine2)
              csData.append(sample[0])
              csData.append(sample[1])
              pureData.append(sample[2])
              pureData.append(sample[3])
          else:
            # First split: generate fresh CS pairs from the parallel data.
            self.__csHash = set()
            stopLength = tr
            index = -1
            # The only exit is the break at the bottom (stopLength <= 0).
            # NOTE(review): the index wraps around, so this never terminates if
            # too few sentence pairs produce CS output for both orders.
            while 1:
              index += 1
              
              if index == len(self.__dataHandler.parL1):
                ##break
                index = 0
                print "Still:", stopLength, " Looping.."
              
              csLines = []
              # Filled below but not read anywhere in this method.
              csSeqs = []
              
              hashKeys = ["", ""]
              # Try both orders; order is handed to updateHandler as its
              # fourth argument.
              for order in range(2):
              #order = stopLength%2
                self.__csInstance.updateHandler(self.__dataHandler.parL1[index], self.__dataHandler.parL2[index], self.__dataHandler.align[index], order)
                csReturn = self.__csInstance.csSentence(csType)
                csLine = csReturn[0]
                # A first element of -1 is treated as "no CS sentence produced".
                if csLine != -1:
                  hashKeys[order] = (index, order, tuple(csReturn[1]))
                  csLines.append(csLine)
                  csSeqs.append(csReturn[1])
              
              # Keep the pair only when both orders produced a CS sentence.
              if len(csLines) == 2:
                csWords = set([x[0] for x in csLines[0]]) | set([x[0] for x in csLines[1]])
                # Build the matching pure lines for the control set.
                self.__Tree.updateTree(self.__dataHandler.parL1[index])
                pureLine1 = self.__Tree.wordTags()
                pureLine1 = self.__dataHandler.addLangTags(pureLine1, self.__dataHandler.LID[0])
                pureLine1 = self.__dataHandler.makeLD(pureLine1)
                self.__Tree.updateTree(self.__dataHandler.parL2[index])
                pureLine2 = self.__Tree.wordTags()
                pureLine2 = self.__dataHandler.addLangTags(pureLine2, self.__dataHandler.LID[1])
                pureLine2 = self.__dataHandler.makeLD(pureLine2)
                pureWords = set([x[0] for x in pureLine1]) | set([x[0] for x in pureLine2])
                # NOTE(review): "True or ..." short-circuits, so the word-set and
                # duplicate checks are never evaluated; every pair is accepted
                # and __csHash is written but never consulted.
                if True or pureWords == csWords and hashKeys[0] not in self.__csHash and hashKeys[1] not in self.__csHash:
                  pureData.append(tuple(pureLine1))
                  pureData.append(tuple(pureLine2))
                  csData.append(tuple(csLines[0]))
                  csData.append(tuple(csLines[1]))
                  # Always true here: this branch only runs for splitIndex == 0.
                  if splitIndex == 0:
                    initialSplitCSData.append((tuple(csLines[0]), tuple(csLines[1]), tuple(pureLine1), tuple(pureLine2)))
                  stopLength -= 1
                  for hashKey in hashKeys:
                    self.__csHash.add(hashKey)
              else:
                # Nothing was added; this also skips the stop check below.
                continue
              
              if stopLength <= 0:
                break
              
            # NOTE(review): unreachable -- the loop above only breaks once
            # stopLength <= 0.
            if stopLength > 0:
              print tr, stopLength, "Training Break!!"
              dummy = raw_input()
            
          # Each sentence is written twice: mapped through mapLD2Uni for the
          # ".uni" file and unchanged for the plain file.
          for csLine in csData:
            dataUniFile.write(self.makeString(self.__dataHandler.mapLD2Uni(csLine)))
            dataFile.write(self.makeString(csLine))
          for pureLine in pureData:
            pureFile.write(self.makeString(pureLine))
            pureUniFile.write(self.makeString(self.__dataHandler.mapLD2Uni(pureLine)))
          pureFile.close()
          dataFile.close()
          pureUniFile.close()
          dataUniFile.close()

          # Progress report every 50 written combinations.
          statusCount += 1
          if statusCount % 50 == 0:
            print statusCount,
            sys.stdout.flush()
    print statusCount
    
  def generateUCTrainData(self): # Unknown words constrained training data
    """Write training files whose CS pairs use exactly the pure pairs' words.

    Same file layout as the other generators in this class (CS data and
    pure "_Control" data, each in native and ".uni" form), but a generated
    CS pair is only accepted when the set of first elements of its tokens
    equals that of the corresponding pure sentence pair and neither of its
    hash keys has been accepted before.
    """
    # Number of (csType, size, split) combinations written so far.
    statusCount = 0
    for csType in self.config.csVariants:
      for data in self.config.dataRanges[csType]:
        # CS pairs generated for the first split; later splits sample from it.
        initialSplitCSData = []
        for splitIndex in range(len(self.config.splits)):
          csData = []
          Split = self.config.splits[splitIndex]
          pureData = []
          
          # NOTE(review): the four handles are closed only at the end of this
          # iteration, so an exception in between leaves them open.
          pureFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + "_Control" + self.__fileSuffix, 'w')
          dataFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + self.__fileSuffix, 'w')
          pureUniFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + "_Control" + ".uni" + self.__fileSuffix, 'w')
          dataUniFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + ".uni" + self.__fileSuffix, 'w')
          
          # pr: pure sentences drawn per language; tr: CS sentence pairs.
          pr = int((Split[0] * 1.0 / (Split[0] + Split[1])) * data)
          tr = data - pr
          print pr
          # No seed argument: the generator is reseeded from a system source,
          # so the sampled data differs between runs.
          random.seed()
          
          # random.sample raises ValueError when pr exceeds the number of
          # available pure sentences.
          pIndicesL1 = random.sample(range(len(self.__dataHandler.pureL1)), pr)
          pIndicesL2 = random.sample(range(len(self.__dataHandler.pureL2)), pr)
          
          # The sampled pure sentences go into BOTH the control and the CS set.
          for index in pIndicesL1:
            line = self.__dataHandler.pureL1[index]
            line = self.__dataHandler.addLangTags(line, self.__dataHandler.LID[0])
            line = self.__dataHandler.makeLD(line)
            pureData.append(tuple(line))
            csData.append(tuple(line))
          
          for index in pIndicesL2:
            line = self.__dataHandler.pureL2[index]
            line = self.__dataHandler.addLangTags(line, self.__dataHandler.LID[1])
            line = self.__dataHandler.makeLD(line)
            pureData.append(tuple(line))
            csData.append(tuple(line))

          if splitIndex != 0:
            # Later splits reuse a random subset of the pairs built for split 0.
            # NOTE(review): needs tr <= len(initialSplitCSData), i.e. the first
            # split must carry the largest CS share -- verify for custom splits.
            random.seed()
            csSample = random.sample(initialSplitCSData, tr)
            for sample in csSample:
              # sample = (csLine order 0, csLine order 1, pureLine1, pureLine2)
              csData.append(sample[0])
              csData.append(sample[1])
              pureData.append(sample[2])
              pureData.append(sample[3])
          else:
            # First split: generate fresh CS pairs from the parallel data.
            self.__csHash = set()
            stopLength = tr
            index = -1
            # The only exit is the break at the bottom (stopLength <= 0).
            # NOTE(review): the index wraps around and accepted pairs are
            # rejected on later passes, so this never terminates if the data
            # cannot supply tr acceptable pairs.
            while 1:
              index += 1
              
              if index == len(self.__dataHandler.parL1):
                ##break
                index = 0
                print "Still:", stopLength, " Looping.."
              
              csLines = []
              # Filled below but not read anywhere in this method.
              csSeqs = []
              
              hashKeys = ["", ""]
              # Try both orders; order is handed to updateHandler as its
              # fourth argument.
              for order in range(2):
              #order = stopLength%2
                self.__csInstance.updateHandler(self.__dataHandler.parL1[index], self.__dataHandler.parL2[index], self.__dataHandler.align[index], order)
                csReturn = self.__csInstance.csSentence(csType)
                csLine = csReturn[0]
                # A first element of -1 is treated as "no CS sentence produced".
                if csLine != -1:
                  hashKeys[order] = (index, order, tuple(csReturn[1]))
                  csLines.append(csLine)
                  csSeqs.append(csReturn[1])
              
              # Consider the pair only when both orders produced a CS sentence.
              if len(csLines) == 2:
                csWords = set([x[0] for x in csLines[0]]) | set([x[0] for x in csLines[1]])
                # Build the matching pure lines for the control set.
                self.__Tree.updateTree(self.__dataHandler.parL1[index])
                pureLine1 = self.__Tree.wordTags()
                pureLine1 = self.__dataHandler.addLangTags(pureLine1, self.__dataHandler.LID[0])
                pureLine1 = self.__dataHandler.makeLD(pureLine1)
                self.__Tree.updateTree(self.__dataHandler.parL2[index])
                pureLine2 = self.__Tree.wordTags()
                pureLine2 = self.__dataHandler.addLangTags(pureLine2, self.__dataHandler.LID[1])
                pureLine2 = self.__dataHandler.makeLD(pureLine2)
                pureWords = set([x[0] for x in pureLine1]) | set([x[0] for x in pureLine2])
                # Accept only pairs that introduce no word outside the pure
                # pair (and lose none) and that were not accepted before.
                if pureWords == csWords and hashKeys[0] not in self.__csHash and hashKeys[1] not in self.__csHash:
                  pureData.append(tuple(pureLine1))
                  pureData.append(tuple(pureLine2))
                  csData.append(tuple(csLines[0]))
                  csData.append(tuple(csLines[1]))
                  # Always true here: this branch only runs for splitIndex == 0.
                  if splitIndex == 0:
                    initialSplitCSData.append((tuple(csLines[0]), tuple(csLines[1]), tuple(pureLine1), tuple(pureLine2)))
                  stopLength -= 1
                  for hashKey in hashKeys:
                    self.__csHash.add(hashKey)
              else:
                # Nothing was added; this also skips the stop check below.
                continue
              
              if stopLength <= 0:
                break
              
            # NOTE(review): unreachable -- the loop above only breaks once
            # stopLength <= 0.
            if stopLength > 0:
              print tr, stopLength, "Training Break!!"
              dummy = raw_input()
            
          # Each sentence is written twice: mapped through mapLD2Uni for the
          # ".uni" file and unchanged for the plain file.
          for csLine in csData:
            dataUniFile.write(self.makeString(self.__dataHandler.mapLD2Uni(csLine)))
            dataFile.write(self.makeString(csLine))
          for pureLine in pureData:
            pureFile.write(self.makeString(pureLine))
            pureUniFile.write(self.makeString(self.__dataHandler.mapLD2Uni(pureLine)))
          pureFile.close()
          dataFile.close()
          pureUniFile.close()
          dataUniFile.close()

          # Progress report every 50 written combinations.
          statusCount += 1
          if statusCount % 50 == 0:
            print statusCount,
            sys.stdout.flush()
    print statusCount

  def makeString(self, wordsTagsLangs):
    """Serialise one sentence as a single output line.

    wordsTagsLangs is an iterable of string sequences (one per token). The
    fields of each token are joined with "_#", the tokens with a single
    space, and a newline is appended.
    """
    tokens = ["_#".join(fields) for fields in wordsTagsLangs]
    sentence = ' '.join(tokens)
    return sentence + '\n'
    
  def loadData(self, l1Data, l2Data, l1Aligns, l2Aligns, pureL1Data, pureL2Data):
    """Forward all six arguments, unchanged and in order, to the data handler."""
    handler = self.__dataHandler
    handler.loadData(
      l1Data,
      l2Data,
      l1Aligns,
      l2Aligns,
      pureL1Data,
      pureL2Data,
    )

Пример #5

Показать файл

Файл: CSHandler.py Проект: phanigadde/CSRelated

class CSHandler:
  """Chooses the span at which one aligned sentence pair is code-switched.

  A handler is loaded with a sentence pair and its word alignment through
  updateHandler(); csSentence() then picks a span of L1 token indices
  according to the requested strategy and asks Utils to build the switched
  sentence.  Every span selector returns -1 when it cannot find a span.
  """

  # Attempt budgets of the randomized selectors.  They equal the number of
  # draws the original counters allowed on the regular (non-skipping) path.
  _MAX_ATTEMPTS = 101
  _MAX_CONTIGUOUS_ATTEMPTS = 99
  _MAX_SAME_CONST_ATTEMPTS = 499

  def __init__(self):
    self.__L1 = ""
    self.__L2 = ""
    self.__curL1 = ""
    self.__curL2 = ""
    self.__L1Tree = Dependencytree()
    self.__L2Tree = Dependencytree()
    self.__align = {}
    self.__revAlign = {}
    self.__utils = Utils()
    self.__phraseMap = {}
    self.__l1Index = 0
    self.__clausalChunks = ["CCP","VGF", "NULL__CCP","NULL__VGF"]
    # Created here as well so updateBadSwitch() works before the first
    # updateHandler() call; updateHandler() resets it for every sentence pair.
    self.__csHash = dd(set)

  def updatePhraseMap(self, phraseMap):
    """Replace the phrase-tag map consulted by the same-constituent selector."""
    self.__phraseMap = phraseMap

  def updateLIDTags(self, L1, L2):
    """Store the language-ID labels of the two languages."""
    self.__L1 = L1
    self.__L2 = L2

  def updateHandler(self, l1Sent, l2Sent, alignLine, l1Index):
    """Load one sentence pair and its alignment.

    l1Index (0 or 1) says which side of every "a-b" alignment pair acts as
    L1.  When it is 1 the roles are swapped: l2Sent is loaded into the L1
    tree, l1Sent into the L2 tree, and the language labels are exchanged.
    """
    l2Index = 1-l1Index
    self.__l1Index = l1Index
    if l1Index:
      self.__curL1 = self.__L2
      self.__curL2 = self.__L1
      self.__L1Tree.updateTree(l2Sent)
      self.__L2Tree.updateTree(l1Sent)
    else:
      self.__curL1 = self.__L1
      self.__curL2 = self.__L2
      self.__L1Tree.updateTree(l1Sent)
      self.__L2Tree.updateTree(l2Sent)

    self.__align = self.__parseAlign(alignLine, l1Index, l2Index)
    self.__revAlign = self.__parseAlign(alignLine, l2Index, l1Index)
    self.__csHash = dd(set)

  def updateBadSwitch(self, index, l1Switch, l2Switch):
    """Record two switches under the given sentence index."""
    self.__csHash[index].add(l1Switch)
    self.__csHash[index].add(l2Switch)

  def __parseAlign(self, alignLine, l1Index, l2Index):
    """Parse whitespace-separated "a-b" pairs into {key: [values]}.

    The key is field l1Index of each pair and the value is field l2Index,
    both converted to int.  Values keep their order of appearance.
    """
    align = {}
    for pair in alignLine.split():
      fields = pair.split("-")
      align.setdefault(int(fields[l1Index]), []).append(int(fields[l2Index]))
    return align

  def csSentence(self, csType):
    """Build one code-switched sentence for the loaded pair.

    Returns [csSentence, sequence] on success and [-1, []] when no span was
    found.  Raises ValueError for an unsupported csType.
    """
    validSequences = self.__utils.validSequences(self.__L1Tree.sentLen())
    # Assumption that a sentence will have a single code switch.
    sequence = self.__selectSequence(validSequences, csType)
    if sequence == -1:
      return [-1,[]]
    csSentence = self.__utils.makeCSSentence(self.__L1Tree.wordTags(), sequence, self.__L2Tree.wordTags(), self.__align, self.__curL1, self.__curL2)
    return [csSentence,sequence]

  ## Assumptions:
  ## There is always a single code switch
  ## The selection among the valid candidate sequences is random
  def __selectSequence(self, validSequences, csType):
    """Dispatch to the span selector for csType (0-4)."""
    if csType == 0:
      return self.__random(validSequences)
    elif csType == 1:
      return self.__seqL1SeqL2Cont(validSequences)
    elif csType == 2:
      return self.__seqL1Const()
    elif csType == 4:
      return self.__seqHindiClausal()
    elif csType == 3:
      return self.__seqL1ConstL2Const()
    # Previously fell through and returned None, which csSentence() then
    # passed on as if it were a span.
    raise ValueError("Unsupported csType: "+str(csType))

  def __checkHindiClause(self, sequence, OBJ):
    """Return True when sequence is a clausal chunk of tree OBJ.

    The span must have a head entry in OBJ.heads, must not be a chunk
    subtree, its head chunk tag must be one of the clausal chunk tags and the
    head must have no NP child.  Any error raised by the tree queries counts
    as "not a clause".
    """
    key = frozenset(sequence)
    if key not in OBJ.heads:
      return False
    try:
      head = OBJ.heads[key]
      return (not OBJ.isChunkSubtree(head[1], sequence)
              and head[0] in self.__clausalChunks
              and not OBJ.hasNPChild(head[1]))
    except Exception:
      # Best effort, as before, but no longer swallows KeyboardInterrupt or
      # SystemExit the way the bare except did.
      return False

  def __seqHindiClausal(self):
    """Pick a random clausal subtree of the Hindi side; return its L1 span or -1.

    The number of draws is bounded even when every candidate is skipped for
    covering a whole sentence (that case used to loop forever).
    """
    if self.__l1Index: ## Hindi is L2
      # list() lets random.sample accept set-like subtree collections too.
      subtrees = list(self.__L2Tree.subtrees)
      if len(subtrees) == 0:
        return -1
      for _ in range(self._MAX_ATTEMPTS):
        l2Sequence = list(random.sample(subtrees,1)[0])
        l1Sequence = self.__utils.l2Sequence(l2Sequence, self.__revAlign)
        if len(l1Sequence) == self.__L1Tree.sentLen():
          continue
        if len(l2Sequence) == self.__L2Tree.sentLen():
          continue
        if len(l2Sequence)>0 and len(l1Sequence)>0 and self.__checkHindiClause(l2Sequence, self.__L2Tree):
          return l1Sequence
    else: ## Hin is L1
      subtrees = list(self.__L1Tree.subtrees)
      if len(subtrees) == 0:
        return -1
      for _ in range(self._MAX_ATTEMPTS):
        l1Sequence = list(random.sample(subtrees,1)[0])
        if len(l1Sequence) == self.__L1Tree.sentLen():
          continue
        l2Sequence = self.__utils.l2Sequence(l1Sequence, self.__align)
        if len(l2Sequence) == self.__L2Tree.sentLen():
          continue
        if len(l1Sequence)>0 and len(l2Sequence)>0 and self.__checkHindiClause(l1Sequence, self.__L1Tree):
          return l1Sequence
    return -1

  def __random(self, sequences):
    """Return a random sequence, preferring one with a non-empty L2 projection.

    After the retry budget is used up the last drawn sequence is returned
    even if its projection is empty.  Returns -1 for an empty candidate list.
    """
    random.seed()
    if len(sequences) == 0:
      return -1
    l1Sequence = -1
    for _ in range(self._MAX_ATTEMPTS + 1):
      l1Sequence = random.sample(sequences, 1)[0]
      if len(self.__utils.l2Sequence(l1Sequence, self.__align)) > 0:
        break
    return l1Sequence

  def __randomStrict(self, sequences):
    """Like __random, but return -1 unless a non-empty L2 projection was found."""
    random.seed()
    if len(sequences) == 0:
      return -1
    for _ in range(self._MAX_ATTEMPTS):
      l1Sequence = random.sample(sequences, 1)[0]
      if len(self.__utils.l2Sequence(l1Sequence, self.__align)) > 0:
        return l1Sequence
    return -1

  def __seqL1SeqL2Cont(self, sequences):
    """Return a random sequence whose L2 projection is contiguous, or -1."""
    random.seed()
    if len(sequences) == 0:
      return -1
    for _ in range(self._MAX_CONTIGUOUS_ATTEMPTS):
      l1Sequence = sequences[random.randrange(len(sequences))]
      l2Sequence = self.__utils.l2Sequence(l1Sequence, self.__align)
      if self.__L2Tree.isContiguous(l2Sequence):
        return l1Sequence
    return -1

  def __drawSpan(self, candidates, accept):
    """Shared sampling loop of the constituent/LWG selectors.

    Draws random members of candidates, skips those covering the whole L1
    sentence, and returns the first one (sorted) whose L2 projection
    satisfies accept(l2Sequence).  Returns -1 when candidates is empty or the
    attempt budget runs out.
    """
    pool = list(candidates)
    if len(pool) == 0:
      return -1
    for _ in range(self._MAX_ATTEMPTS):
      l1Sequence = list(random.sample(pool,1)[0])
      if len(l1Sequence) == self.__L1Tree.sentLen():
        continue
      l2Sequence = self.__utils.l2Sequence(l1Sequence, self.__align)
      if accept(l2Sequence):
        l1Sequence.sort()
        return l1Sequence
    return -1

  def __seqL1Const(self):
    """Return a random L1 subtree with a non-empty L2 projection, or -1."""
    random.seed()
    return self.__drawSpan(self.__L1Tree.subtrees, lambda l2Sequence: len(l2Sequence) > 0)

  def __seqL1LWG(self):
    """Return a random L1 LWG with a non-empty L2 projection, or -1."""
    random.seed()
    return self.__drawSpan(self.__L1Tree.LWGs(), lambda l2Sequence: len(l2Sequence) > 0)

  def __seqL1ConstL2Const(self):
    """Return a random L1 subtree whose L2 projection is an L2 subtree, or -1."""
    random.seed()
    return self.__drawSpan(self.__L1Tree.subtrees, lambda l2Sequence: frozenset(l2Sequence) in self.__L2Tree.subtrees)

  def __seqL1LWGL2LWG(self):
    """Return a random L1 LWG whose L2 projection is an L2 LWG, or -1."""
    random.seed()
    return self.__drawSpan(self.__L1Tree.LWGs(), lambda l2Sequence: frozenset(l2Sequence) in self.__L2Tree.LWGs())

  def __seqL1ConstL2Cont(self, sequences):
    """Return a random L1 constituent whose L2 projection is contiguous, or -1."""
    random.seed()
    if len(sequences) == 0:
      return -1
    for _ in range(self._MAX_CONTIGUOUS_ATTEMPTS):
      l1Sequence = sequences[random.randrange(len(sequences))]
      l2Sequence = self.__utils.l2Sequence(l1Sequence, self.__align)
      if self.__L1Tree.isConstituent(l1Sequence) and self.__L2Tree.isContiguous(l2Sequence):
        return l1Sequence
    return -1

  def __seqL1ConstL2SameConst(self, sequences):
    """Return a random span that is a constituent with matching phrase tags on both sides, or -1.

    Tags match when they are equal or when the L2 tag is listed under the L1
    tag in the phrase map.
    """
    random.seed()
    if len(sequences) == 0:
      return -1
    for _ in range(self._MAX_SAME_CONST_ATTEMPTS):
      l1Sequence = sequences[random.randrange(len(sequences))]
      l2Sequence = self.__utils.l2Sequence(l1Sequence, self.__align)
      if self.__L1Tree.isConstituent(l1Sequence) and self.__L2Tree.isConstituent(l2Sequence):
        l1PhraseTag = self.__L1Tree.getPhrase(l1Sequence[0])
        l2PhraseTag = self.__L2Tree.getPhrase(l2Sequence[0])
        ## Both the phrases are same, for dual structure principle
        # .get() avoids a KeyError when the map is a plain dict without the tag.
        if l1PhraseTag == l2PhraseTag or l2PhraseTag in self.__phraseMap.get(l1PhraseTag, ()):
          return l1Sequence
    return -1

Пример #6

Показать файл

Файл: DataGenerator.py Проект: phanigadde/CSRelated

class DataGenerator:
  def __init__(self, outDir):
    """Create empty data containers, set the generation parameters and build the tag maps.

    outDir is kept as the prefix that the generator methods concatenate
    directly with each output file name.
    """
    sys.stderr.write("DataGenerator: Constructor\n")
    ## Languages and Order
    # Language-ID labels; element 0 is handed to CSHandler as L1, element 1 as L2.
    self.__LID = ["HI","EN"]
    # NOTE(review): machine-specific absolute paths; presumably read by
    # __genPosMap()/__genPhraseMap() -- confirm before running elsewhere.
    self.__l2MapFile = "/usr0/home/pgadde/Work/CodeSwitching/Hinglish/Data/UniMaps/en-ptb.map"
    self.__l1MapFile = "/usr0/home/pgadde/Work/CodeSwitching/Hinglish/Data/UniMaps/hi-hyd.map"
    ## Data containers
    # Filled by loadData(): parallel sentences, their alignments, and the
    # monolingual ("pure") sentences of each language.
    self.__parL1 = []
    self.__parL2 = []
    self.__align = []
    self.__pureL1 = []
    self.__pureL2 = []
    self.__outputDir = outDir
    self.__posMap = {}
    self.__phraseMap = dd(list)
    self.__csInstance = CSHandler()
    self.__utils = Utils()
    self.__Tree = Dependencytree()
    
    ## Generation Variants
    # Code-switch strategies, passed one at a time to CSHandler.csSentence().
    self.__csVariants = [0,1,2,3,4]
    self.__tagsetVariants = ["",".uni"]
    self.__dataRange = range(50,900,50)
    ##self.__dataRange = [200]
    # (pure, code-switched) proportions; see the pr/tr computation in the generators.
    self.__splits = [(50,50),(60,40),(70,30),(80,20),(90,10)]
    self.__csHash = set()
    ##LID stuff
    self.__L1Tags = set()
    self.__L2Tags = set()
    self.__commonTags = set()
    ## Pre processing
    # NOTE(review): these helpers are defined outside this view; presumably
    # they populate __posMap and __phraseMap -- confirm.
    self.__genPosMap()
    self.__genPhraseMap()
    self.__csInstance.updatePhraseMap(self.__phraseMap)
    self.__csInstance.updateLIDTags(self.__LID[0], self.__LID[1])
    
    ## Real test overwrites
    # The assignments below replace the __tagsetVariants and __dataRange
    # values set above; __dataRanges maps each csType to its own list of
    # total sizes.
    #self.__csVariants = [1,2,3,4]
    self.__tagsetVariants = [""]
    self.__dataRange = [400]
    self.__dataRanges = {0:range(40,601,40), 1:range(40,601,40), 2:range(35,540,35), 3:range(30,451,30), 4:range(15,231,15)}
    #self.__dataRanges = {0:[880], 1:[880], 2:[800], 3:[630], 4:[330]}
    #self.__dataRanges = {0:[60], 1:[60], 2:[60], 3:[60], 4:[60]}
    #self.__splits = [(50,50)]
    #for i in range(0,51,5):
    #  split = (100-i, i)
    #  self.__splits.append(split)
    # Appended to every output file name.
    self.__fileSuffix = ""
 
  def loadData(self, l1Data, l2Data, l1Aligns, l2Aligns, pureL1Data, pureL2Data):
    self.__parL1 = self.__utils.readSentences(l1Data)
    self.__parL2 = self.__utils.readSentences(l2Data)
    self.__align = self.__utils.readAligns(l1Aligns, l2Aligns)
    self.__pureL1 = self.__utils.readSentencesPlain(pureL1Data)
    self.__pureL2 = self.__utils.readSentencesPlain(pureL2Data)
    sys.stderr.write("parL1:"+str(len(self.__parL1))+"\n")
    sys.stderr.write("parL2:"+str(len(self.__parL2))+"\n")
    sys.stderr.write("align:"+str(len(self.__align))+"\n")
    sys.stderr.write("pureL1:"+str(len(self.__pureL1))+"\n")
    sys.stderr.write("pureL2:"+str(len(self.__pureL2))+"\n")
  
  def __genTrainData(self):
    """Write one training file per (total size, split, csType) combination.

    For every total size in __dataRange and every split, the same random
    sample of pure sentences is reused for all csTypes.  Each file holds the
    tagged pure L1 lines, the tagged pure L2 lines and then the code-switched
    lines, every line in "field_#field ..." form.
    """
    statusCount = 0
    for data in self.__dataRange:
      #control = 0
      #while 1:
      for Split in self.__splits:
      #for control in range(3):
        #if control == 3:
        #  break
        #pr= int(control*1.0/2 * data)
        # pr: number of pure sentences for this split; tr: number of
        # code-switched sentences.  pr is then halved so that it counts the
        # pure sentences taken from each language (integer division in Python 2).
        pr= int((Split[0]*1.0/(Split[0]+Split[1])) * data)
        tr = data - pr
        pr = pr/2
        
        print pr
        random.seed()
        # random.sample raises ValueError when pr exceeds the corpus size.
        pIndicesL1 = random.sample(range(len(self.__pureL1)),pr)
        pIndicesL2 = random.sample(range(len(self.__pureL2)),pr)
        
        for csType in self.__csVariants:
          # Fresh duplicate filter for every output file.
          self.__csHash = set()
          ##sys.stderr.write("csType:"+str(csType)+'\n')
          # Debugging !!
          #switch = ""
          #############
          #for tag in self.__tagsetVariants:
            # Debugging !!
            #if switch == "yes":
            #    break
            ###################
            #sys.stderr.write(outputDir+"Train"+cs+str(len(trainVariants[tr]))+"Pure"+str(len(pureVariants[pr]))+tag+"\n")
          
          sys.stderr.write(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(data)+self.__fileSuffix+'\n')
          # NOTE(review): the file is closed only on the normal path below; an
          # exception in between leaves it open.
          dataFile = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(data)+self.__fileSuffix,'w')
          ##dataFileUni = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(data)+".uni",'w')
          ##dataFileUniq = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(data)+".uniq",'w')
          ##dataFileUniUniq = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(data)+".uni.uniq",'w')
          
          for index in pIndicesL1:
            line = self.__pureL1[index]
            #sys.stderr.write("L1 Line:"+str(line)+'\n')
            line = self.__addLangTags(line, self.__LID[0])
            dataFile.write(' '.join(map(lambda x:'_#'.join(x), line))+'\n')
            
            ##Commented for real test
            # The triple-quoted block below is a bare string expression used to
            # disable code; it has no effect at run time.
            '''
            dataFileUniq.write(' '.join(map(lambda x:'_#'.join(x), map(lambda x:(x[0],x[1]+'_'+x[2],x[2]), line)))+'\n')
            lineUni = self.__map2Uni(line)
            dataFileUni.write(' '.join(map(lambda x:'_#'.join(x), lineUni))+'\n')
            dataFileUniUniq.write(' '.join(map(lambda x:'_#'.join(x), map(lambda x:(x[0],x[1]+'_'+x[2],x[2]), lineUni)))+'\n')'''
            
          for index in pIndicesL2:
            line = self.__pureL2[index]
            #sys.stderr.write("L2 Line:"+str(line)+'\n')
            line = self.__addLangTags(line, self.__LID[1])
            dataFile.write(' '.join(map(lambda x:'_#'.join(x), line))+'\n')
            
            ##Commented for real test
            '''
            dataFileUniq.write(' '.join(map(lambda x:'_#'.join(x), map(lambda x:(x[0],x[1]+'_'+x[2],x[2]), line)))+'\n')
            lineUni = self.__map2Uni(line)
            dataFileUni.write(' '.join(map(lambda x:'_#'.join(x), lineUni))+'\n')
            dataFileUniUniq.write(' '.join(map(lambda x:'_#'.join(x), map(lambda x:(x[0],x[1]+'_'+x[2],x[2]), lineUni)))+'\n')'''
            
          # Starts at tr+1 because the loop breaks before writing the sentence
          # that brings the counter to 0, so exactly tr lines are written.
          stopLength = tr+1
          index = -1
          while 1:
            index += 1
            # Wrap around: the parallel corpus is scanned repeatedly.
            if index == len(self.__parL1):
              index = 0
            csLine = ""
            # Alternates 0/1 as sentences are accepted; passed to
            # updateHandler() as l1Index (which language acts as L1).
            order = stopLength%2
            #sys.stderr.write("order:"+str(order)+'\n')
            self.__csInstance.updateHandler(self.__parL1[index], self.__parL2[index], self.__align[index], order)
            csReturn = self.__csInstance.csSentence(csType)
            # Debugging !!                         
            #sys.stderr.write("Switch to another CS variant?? ")
            #switch = raw_input()
            #if switch == "yes":
            #    break
            ###############
            csLine = csReturn[0]
            #csSequence = csReturn[1]
            #print csReturn[1]
            # A switch is identified by the sentence index plus the switched span.
            hashKey = (index, tuple(csReturn[1]))
            #print hashKey
            if csLine != -1 and hashKey not in self.__csHash:
              self.__csHash.add(hashKey)
              stopLength -= 1
            else:
              continue
            #sys.stderr.write("csLine:"+str(csLine)+'\n')
            #csLine = self.__addLangTags(csLine)
            
            if stopLength <= 0:
              break
            dataFile.write(' '.join(map(lambda x:'_#'.join(x), csLine))+'\n')
            ##Commented for real test
            '''
            dataFileUniq.write(' '.join(map(lambda x:'_#'.join(x), map(lambda x:(x[0],x[1]+'_'+x[2],x[2]), csLine)))+'\n')
            csLineUni = self.__map2Uni(csLine)
            dataFileUni.write(' '.join(map(lambda x:'_#'.join(x), csLineUni))+'\n')
            dataFileUniUniq.write(' '.join(map(lambda x:'_#'.join(x), map(lambda x:(x[0],x[1]+'_'+x[2],x[2]), csLineUni)))+'\n')'''
            
          dataFile.close()
          ##dataFileUni.close()
          ##dataFileUniq.close()
          ##dataFileUniUniq.close()
          # NOTE(review): the loop above only exits once stopLength <= 0, so this
          # branch looks unreachable here, and the loop cannot end if fewer than
          # tr distinct switches can ever be produced.
          if stopLength > 0:
            print tr, stopLength, "Training Break!!"
            #pr -= 1
            dummy = raw_input()
          statusCount += 1
          # Progress indicator: print the running file count every 50 files.
          if statusCount%50 == 0:
            print statusCount,
            sys.stdout.flush()
        #pr += 1
    print statusCount
    
  def __genTrainDataDiverse(self):
    """Write one training file per (csType, total size, split) using each parallel sentence at most once.

    Total sizes come from __dataRanges[csType].  Unlike __genTrainData, pure
    sentences are re-sampled for every file and the parallel corpus is
    scanned in a single pass without wrapping around.
    """
    statusCount = 0
    for csType in self.__csVariants:
      for data in self.__dataRanges[csType]:
        for Split in self.__splits:
          # pr: pure sentences for this split, halved below to a per-language
          # count (integer division in Python 2); tr: code-switched sentences.
          pr= int((Split[0]*1.0/(Split[0]+Split[1])) * data)
          tr = data - pr
          pr = pr/2
          
          print pr
          random.seed()
          
          # random.sample raises ValueError when pr exceeds the corpus size.
          pIndicesL1 = random.sample(range(len(self.__pureL1)),pr)
          pIndicesL2 = random.sample(range(len(self.__pureL2)),pr)
          
          ##for csType in self.__csVariants:
          self.__csHash = set()
          
          sys.stderr.write(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(data)+self.__fileSuffix+'\n')
          # NOTE(review): closed only on the normal path; an exception before
          # dataFile.close() leaves the file open.
          dataFile = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(data)+self.__fileSuffix,'w')
          
          #### Dangerous ####
          
          ##pIndicesL1 = []
          ##pIndicesL2 = []
          
          #### End of Dangerous ####
          
          for index in pIndicesL1:
            line = self.__pureL1[index]
            line = self.__addLangTags(line, self.__LID[0])
            dataFile.write(' '.join(map(lambda x:'_#'.join(x), line))+'\n')
            
          for index in pIndicesL2:
            line = self.__pureL2[index]
            line = self.__addLangTags(line, self.__LID[1])
            dataFile.write(' '.join(map(lambda x:'_#'.join(x), line))+'\n')
            
          # Starts at tr+1 because the loop breaks before writing the sentence
          # that brings the counter to 0, so at most tr lines are written.
          stopLength = tr+1
          index = -1
          while 1:
            index += 1
            # Stop at the end of the parallel corpus; the assignment after the
            # break is never executed.
            if index == len(self.__parL1):
              break
              index = 0
            csLine = ""
            # Alternates 0/1 as sentences are accepted; passed to
            # updateHandler() as l1Index.
            order = stopLength%2
            self.__csInstance.updateHandler(self.__parL1[index], self.__parL2[index], self.__align[index], order)
            csReturn = self.__csInstance.csSentence(csType)
            csLine = csReturn[0]
            hashKey = (index, tuple(csReturn[1]))
            # NOTE(review): the set stores bare indices while the membership test
            # uses the (index, span) tuple, so the test never finds a match;
            # since each index is visited once this has no visible effect.
            if csLine != -1 and hashKey not in self.__csHash:
              #self.__csHash.add(hashKey)
              self.__csHash.add(index)
              stopLength -= 1
            else:
              continue
            
            if stopLength <= 0:
              break
            dataFile.write(' '.join(map(lambda x:'_#'.join(x), csLine))+'\n')
            
          dataFile.close()
          # Reached when the corpus ran out before enough sentences were
          # produced: report it and wait for the operator to press Enter.
          if stopLength > 0:
            print tr, stopLength, "Training Break!!"
            dummy = raw_input()
          statusCount += 1
          # Progress indicator: print the running file count every 50 files.
          if statusCount%50 == 0:
            print statusCount,
            sys.stdout.flush()
    print statusCount
    
  def __genTrainDataDup(self):
    """Write paired data/control files per (csType, total size, split).

    Both files start with the same sampled pure sentences.  For every
    parallel sentence that can be code-switched in both directions, the data
    file receives the two code-switched lines while the "_Control" file
    receives the two original (tagged) sentences.
    """
    statusCount = 0
    for csType in self.__csVariants:
      for data in self.__dataRanges[csType]:
        
        for splitIndex in range(len(self.__splits)):
          csData = []
          Split = self.__splits[splitIndex]
          pureData = []
          # File names carry 2*data as the total.
          # NOTE(review): both files are closed only on the normal path at the
          # end of this iteration.
          pureFile = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(2*data)+"_Control"+self.__fileSuffix,'w')
          dataFile = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(2*data)+self.__fileSuffix,'w')
          # pr pure sentences are drawn from each language (no halving here);
          # tr is the number of parallel sentences to convert.
          pr= int((Split[0]*1.0/(Split[0]+Split[1])) * data)
          tr = data - pr
          #pr = pr/2
          
          print pr
          random.seed()
          # random.sample raises ValueError when pr exceeds the corpus size.
          pIndicesL1 = random.sample(range(len(self.__pureL1)),pr)
          pIndicesL2 = random.sample(range(len(self.__pureL2)),pr)
          for index in pIndicesL1:
            line = self.__pureL1[index]
            line = self.__addLangTags(line, self.__LID[0])
            pureLine = ' '.join(map(lambda x:'_#'.join(x), line))+'\n'
            ##pureFile.write(pureLine)
            ##if splitIndex == 0:
            # Sampled pure lines go to both outputs.
            pureData.append(pureLine)
            csData.append(pureLine)
          for index in pIndicesL2:
            line = self.__pureL2[index]
            line = self.__addLangTags(line, self.__LID[1])
            pureLine = ' '.join(map(lambda x:'_#'.join(x), line))+'\n'
            ##pureFile.write(pureLine)
            ##if splitIndex == 0:
            pureData.append(pureLine)
            csData.append(pureLine)

          ##for csType in self.__csVariants:
          self.__csHash = set()
          stopLength = tr
          index = -1
          while 1:
            index += 1
            # Single pass over the parallel corpus; the assignment after the
            # break is never executed.
            if index == len(self.__parL1):
              break
              index = 0
            csLines = []
            # Try the switch in both directions (l1Index 0 and 1).
            for order in range(2):
            #order = stopLength%2
              self.__csInstance.updateHandler(self.__parL1[index], self.__parL2[index], self.__align[index], order)
              csReturn = self.__csInstance.csSentence(csType)
              csLine = csReturn[0]
              if csLine != -1:
                csLines.append(csLine)
            # Keep the sentence only when both directions succeeded.
            if len(csLines) == 2:
              csData.append(' '.join(map(lambda x:'_#'.join(x), csLines[0]))+'\n')
              csData.append(' '.join(map(lambda x:'_#'.join(x), csLines[1]))+'\n')
              ##if splitIndex == 0:
              # Control counterpart: the two unswitched sentences, read back
              # from the shared tree and tagged with their language.
              self.__Tree.updateTree(self.__parL1[index])
              pureLine = self.__Tree.wordTags()
              pureLine = self.__addLangTags(pureLine, self.__LID[0])
              pureData.append(' '.join(map(lambda x:'_#'.join(x), pureLine))+'\n')
              self.__Tree.updateTree(self.__parL2[index])
              pureLine = self.__Tree.wordTags()
              #print pureLine
              pureLine = self.__addLangTags(pureLine, self.__LID[1])
              #print pureLine
              #sys.exit(0)
              pureData.append(' '.join(map(lambda x:'_#'.join(x), pureLine))+'\n')
              self.__csHash.add(index)
              stopLength -= 1
            else:
              continue
            
            if stopLength <= 0:
              break
            
          # Reached when the corpus ran out before tr sentences were converted:
          # report it and wait for the operator to press Enter.
          if stopLength > 0:
            print tr, stopLength, "Training Break!!"
            dummy = raw_input()
          
          for csLine in csData:
            dataFile.write(csLine)
          ##if splitIndex == 0:
          for pureLine in pureData:
            pureFile.write(pureLine)
          pureFile.close()
          dataFile.close()

          statusCount += 1
          # Progress indicator: print the running file count every 50 files.
          if statusCount%50 == 0:
            print statusCount,
            sys.stdout.flush()
    print statusCount
    
  def collectData(self):
    """Build the data/control files for csType 4 and return the collected sentence pool.

    The returned list mixes two kinds of items: the formatted pure lines
    (plain strings) and, for every parallel sentence that was accepted, a
    (l1 sentence, l2 sentence, alignment) tuple.

    NOTE(review): the function returns as soon as the second split is
    reached, so only the first total size and the first split are ever
    processed; with a single configured split it falls through all sizes and
    returns None.
    """
    statusCount = 0
    for csType in [4]:
      for data in self.__dataRanges[csType]:
        initialSlitCSData = []
        for splitIndex in range(len(self.__splits)):
          # Early exit: hand back what the first split collected.
          if splitIndex > 0:
            return initialSlitCSData
          csData = []
          Split = self.__splits[splitIndex]
          pureData = []
          # NOTE(review): both files are closed only on the normal path at the
          # end of this iteration.
          pureFile = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(2*data)+"_Control"+self.__fileSuffix,'w')
          dataFile = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(2*data)+self.__fileSuffix,'w')
          # pr pure sentences are drawn from each language; tr is the number of
          # parallel sentences to convert.
          pr= int((Split[0]*1.0/(Split[0]+Split[1])) * data)
          tr = data - pr
          #pr = pr/2
          
          print pr
          random.seed()
          # random.sample raises ValueError when pr exceeds the corpus size.
          pIndicesL1 = random.sample(range(len(self.__pureL1)),pr)
          pIndicesL2 = random.sample(range(len(self.__pureL2)),pr)
          for index in pIndicesL1:
            line = self.__pureL1[index]
            line = self.__addLangTags(line, self.__LID[0])
            pureLine = ' '.join(map(lambda x:'_#'.join(x), line))+'\n'
            ##pureFile.write(pureLine)
            ##if splitIndex == 0:
            pureData.append(pureLine)
            csData.append(pureLine)
            # (pureLine) is just a parenthesized string, not a 1-tuple.
            initialSlitCSData.append((pureLine))
          for index in pIndicesL2:
            line = self.__pureL2[index]
            line = self.__addLangTags(line, self.__LID[1])
            pureLine = ' '.join(map(lambda x:'_#'.join(x), line))+'\n'
            ##pureFile.write(pureLine)
            ##if splitIndex == 0:
            pureData.append(pureLine)
            csData.append(pureLine)
            initialSlitCSData.append((pureLine))

          # NOTE(review): because of the early return above, splitIndex is always
          # 0 here, so this first branch appears unreachable.
          if splitIndex != 0:
            random.seed()
            csSample = random.sample(initialSlitCSData, tr)
            for sample in csSample:
              csData.append(sample[0])
              csData.append(sample[1])
              pureData.append(sample[2])
              pureData.append(sample[3])
          else:

            ##for csType in self.__csVariants:
            self.__csHash = set()
            stopLength = tr
            index = -1
            while 1:
              index += 1
              # Wrap around and keep scanning until tr sentences were accepted.
              if index == len(self.__parL1):
                ##break
                index = 0
                print "Still:",stopLength," Looping.."
              csLines = []
              csSeqs = []
              # Try the switch in both directions (l1Index 0 and 1).
              for order in range(2):
              #order = stopLength%2
                self.__csInstance.updateHandler(self.__parL1[index], self.__parL2[index], self.__align[index], order)
                csReturn = self.__csInstance.csSentence(csType)
                csLine = csReturn[0]
                if csLine != -1:
                  csLines.append(csLine)
                  csSeqs.append(csReturn[1])
              if len(csLines) == 2:
                # Accept the pair only if the two switched sentences together
                # use exactly the same set of words as the two originals.
                csWords = set([x[0] for x in csLines[0]])|set([x[0] for x in csLines[1]])
                self.__Tree.updateTree(self.__parL1[index])
                pureLine1 = self.__Tree.wordTags()
                pureLine1 = self.__addLangTags(pureLine1, self.__LID[0])
                self.__Tree.updateTree(self.__parL2[index])
                pureLine2 = self.__Tree.wordTags()
                pureLine2 = self.__addLangTags(pureLine2, self.__LID[1])
                pureWords = set([x[0] for x in pureLine1])|set([x[0] for x in pureLine2])
                if pureWords == csWords:
                  p1 = ' '.join(map(lambda x:'_#'.join(x), pureLine1))+'\n'
                  p2 = ' '.join(map(lambda x:'_#'.join(x), pureLine2))+'\n'
                  pureData.append(p1)
                  pureData.append(p2)
                  cs1 = ' '.join(map(lambda x:'_#'.join(x), csLines[0]))+'\n'
                  csData.append(cs1)
                  cs2 = ' '.join(map(lambda x:'_#'.join(x), csLines[1]))+'\n'
                  csData.append(cs2)
                  if splitIndex == 0:
                    initialSlitCSData.append((self.__parL1[index],self.__parL2[index], self.__align[index]))
                  # NOTE(review): indices are recorded but never checked, so a
                  # sentence can be accepted again after the scan wraps around.
                  self.__csHash.add(index)
                  stopLength -= 1
                ##else:
                ##  l1Switch = (0, tuple(csSeqs[0]))
                ##  l2Switch = (1, tuple(csSeqs[1]))
                ##  self.__csInstance.updateBadSwitch(index, l1Switch, l2Switch)
              else:
                continue
              
              if stopLength <= 0:
                break
              
            # NOTE(review): the loop above only exits once stopLength <= 0, so
            # this branch looks unreachable.
            if stopLength > 0:
              print tr, stopLength, "Training Break!!"
              dummy = raw_input()
            
          for csLine in csData:
            dataFile.write(csLine)
          ##if splitIndex == 0:
          for pureLine in pureData:
            pureFile.write(pureLine)
          pureFile.close()
          dataFile.close()

          statusCount += 1
          # Progress indicator: print the running file count every 50 files.
          if statusCount%50 == 0:
            print statusCount,
            sys.stdout.flush()
    print statusCount
  
  def __genFromSingleData(self):
    dataset = self.collectData()
    CSData = [d for d in dataset if len(d)==3]
    PUREData = [d[1] for d in dataset if len(d)==1]
    pureFile = open(self.__outputDir+"Baseline"+self.__fileSuffix,'w')
    pureFlag = 1
    for csType in self.__csVariants:
      for splitIndex in range(len(self.__splits)):
        Split = self.__splits[splitIndex]
        dataFile = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(len(dataset))+self.__fileSuffix,'w')
        stopLength = len(CSData)/2
        csData = []
        pureData = []
        index = -1
        while 1:
          index += 1
          if index == len(self.__parL1):
            ##break
            index = 0
            print "Still:",stopLength," Looping.."
          csLines = []
          csSeqs = []
          for order in range(2):
          #order = stopLength%2
            self.__csInstance.updateHandler(CSData[index][0], CSData[index][1], CSData[index][2], order)
            csReturn = self.__csInstance.csSentence(csType)
            csLine = csReturn[0]
            if csLine != -1:
              csLines.append(csLine)
              csSeqs.append(csReturn[1])
          if len(csLines) == 2:
            csWords = set([x[0] for x in csLines[0]])|set([x[0] for x in csLines[1]])
            self.__Tree.updateTree(self.__parL1[index])
            pureLine1 = self.__Tree.wordTags()
            pureLine1 = self.__addLangTags(pureLine1, self.__LID[0])
            self.__Tree.updateTree(self.__parL2[index])
            pureLine2 = self.__Tree.wordTags()
            pureLine2 = self.__addLangTags(pureLine2, self.__LID[1])
            pureWords = set([x[0] for x in pureLine1])|set([x[0] for x in pureLine2])
            if pureWords == csWords:
              p1 = ' '.join(map(lambda x:'_#'.join(x), pureLine1))+'\n'
              p2 = ' '.join(map(lambda x:'_#'.join(x), pureLine2))+'\n'
              pureData.append(p1)
              pureData.append(p2)
              cs1 = ' '.join(map(lambda x:'_#'.join(x), csLines[0]))+'\n'
              csData.append(cs1)
              cs2 = ' '.join(map(lambda x:'_#'.join(x), csLines[1]))+'\n'
              csData.append(cs2)
              self.__csHash.add(index)
              stopLength -= 1
          else:
            continue
          
          if stopLength <= 0:
            break
          
        if stopLength > 0:
          print tr, stopLength, "Training Break!!"
          dummy = raw_input()
        
        for csLine in csData:
          dataFile.write(csLine)
        if pureFlag:
          pureFlag = 0
          for pureLine in pureData:
            pureFile.write(pureLine)
          for pureLine in PUREData:
            pureFile.write(pureLine)
          pureFile.close()
        dataFile.close()
  
  def __genTrainDataDupStrict(self):
    statusCount = 0
    for csType in self.__csVariants:
      for data in self.__dataRanges[csType]:
        initialSlitCSData = []
        for splitIndex in range(len(self.__splits)):
          csData = []
          Split = self.__splits[splitIndex]
          pureData = []
          pureFile = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(2*data)+"_Control"+self.__fileSuffix,'w')
          dataFile = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(2*data)+self.__fileSuffix,'w')
          pr= int((Split[0]*1.0/(Split[0]+Split[1])) * data)
          tr = data - pr
          #pr = pr/2
          
          print pr
          random.seed()
          pIndicesL1 = random.sample(range(len(self.__pureL1)),pr)
          pIndicesL2 = random.sample(range(len(self.__pureL2)),pr)
          for index in pIndicesL1:
            line = self.__pureL1[index]
            line = self.__addLangTags(line, self.__LID[0])
            pureLine = ' '.join(map(lambda x:'_#'.join(x), line))+'\n'
            ##pureFile.write(pureLine)
            ##if splitIndex == 0:
            pureData.append(pureLine)
            csData.append(pureLine)
          for index in pIndicesL2:
            line = self.__pureL2[index]
            line = self.__addLangTags(line, self.__LID[1])
            pureLine = ' '.join(map(lambda x:'_#'.join(x), line))+'\n'
            ##pureFile.write(pureLine)
            ##if splitIndex == 0:
            pureData.append(pureLine)
            csData.append(pureLine)

          if splitIndex != 0:
            random.seed()
            csSample = random.sample(initialSlitCSData, tr)
            for sample in csSample:
              csData.append(sample[0])
              csData.append(sample[1])
              pureData.append(sample[2])
              pureData.append(sample[3])
          else:

            ##for csType in self.__csVariants:
            self.__csHash = set()
            stopLength = tr
            index = -1
            while 1:
              index += 1
              if index == len(self.__parL1):
                ##break
                index = 0
                print "Still:",stopLength," Looping.."
              csLines = []
              csSeqs = []
              for order in range(2):
              #order = stopLength%2
                self.__csInstance.updateHandler(self.__parL1[index], self.__parL2[index], self.__align[index], order)
                csReturn = self.__csInstance.csSentence(csType)
                csLine = csReturn[0]
                if csLine != -1:
                  csLines.append(csLine)
                  csSeqs.append(csReturn[1])
              if len(csLines) == 2:
                csWords = set([x[0] for x in csLines[0]])|set([x[0] for x in csLines[1]])
                self.__Tree.updateTree(self.__parL1[index])
                pureLine1 = self.__Tree.wordTags()
                pureLine1 = self.__addLangTags(pureLine1, self.__LID[0])
                self.__Tree.updateTree(self.__parL2[index])
                pureLine2 = self.__Tree.wordTags()
                pureLine2 = self.__addLangTags(pureLine2, self.__LID[1])
                pureWords = set([x[0] for x in pureLine1])|set([x[0] for x in pureLine2])
                if pureWords == csWords:
                  p1 = ' '.join(map(lambda x:'_#'.join(x), pureLine1))+'\n'
                  p2 = ' '.join(map(lambda x:'_#'.join(x), pureLine2))+'\n'
                  pureData.append(p1)
                  pureData.append(p2)
                  cs1 = ' '.join(map(lambda x:'_#'.join(x), csLines[0]))+'\n'
                  csData.append(cs1)
                  cs2 = ' '.join(map(lambda x:'_#'.join(x), csLines[1]))+'\n'
                  csData.append(cs2)
                  if splitIndex == 0:
                    initialSlitCSData.append((cs1,cs2, p1, p2))
                  self.__csHash.add(index)
                  stopLength -= 1
                ##else:
                ##  l1Switch = (0, tuple(csSeqs[0]))
                ##  l2Switch = (1, tuple(csSeqs[1]))
                ##  self.__csInstance.updateBadSwitch(index, l1Switch, l2Switch)
              else:
                continue
              
              if stopLength <= 0:
                break
              
            if stopLength > 0:
              print tr, stopLength, "Training Break!!"
              dummy = raw_input()
            
          for csLine in csData:
            dataFile.write(csLine)
          ##if splitIndex == 0:
          for pureLine in pureData:
            pureFile.write(pureLine)
          pureFile.close()
          dataFile.close()

          statusCount += 1
          if statusCount%50 == 0:
            print statusCount,
            sys.stdout.flush()
    print statusCount
  
    
  def __addLangTags(self, WordTags, lTag):
    wordTags = []
    for wt in WordTags:
      newWT = [i for i in wt]
      wordTags.append(newWT)
    for index in range(len(wordTags)):
      wordTags[index].append(lTag)
    return wordTags
  
  def __genPosMap(self):
    for i in open(self.__l1MapFile):
      i = i.strip()
      srcTag = i.split()[0]
      uniTag = i.split()[1]
      self.__posMap[srcTag] = uniTag

    for i in open(self.__l2MapFile):
      i = i.strip()
      srcTag = i.split()[0]
      uniTag = i.split()[1]
      self.__posMap[srcTag] = uniTag  
    
    self.__L1Tags = set()
    for line in open(self.__l1MapFile):
      tag = line.split()[0]
      self.__L1Tags.add(tag)
    for line in open(self.__l2MapFile):
      tag = line.split()[0]
      self.__L2Tags.add(tag)
    self.__commonTags = set([c for c in self.__L1Tags if c in self.__L2Tags])
  
  def __map2Uni(self, wordTagsLangs):
    newLine = []
    for index in range(len(wordTagsLangs)):
      newLine.append(wordTagsLangs[index])
      tag = wordTagsLangs[index][1]
      try:
        newLine[index][1] = self.__posMap[tag]
      except:
        newLine[index][1] = 'X'
    return newLine
  
  def __map2UniControl(self, wordTagsLangs):
    newLine = []
    for index in range(len(wordTagsLangs)):
      newLine.append(wordTagsLangs[index])
      tag = wordTagsLangs[index][1]
      lang = wordTagsLangs[index][2]
      try:
        newLine[index][1] = self.__posMap[tag]+'_'+lang
      except:
        newLine[index][1] = 'X'+'_'+lang
    return newLine
  
  def __genPhraseMap(self):
    phraseMapFile = open("/usr0/home/pgadde/Work/CodeSwitching/FrenchEnglish/NewsCommentary/E17/mapping")
    for i in phraseMapFile:
      i = i.strip()
      self.__phraseMap[i.split()[0]].extend(i.split()[1].split(","))
    
  def generateData(self):
      """Generate the training data by running __genTrainDataDupStrict().

      The commented-out lines below are disabled variants: a ten-iteration
      loop that sets a per-run file suffix ("." + run number), and a call to
      __genTrainDataDiverse().
      """
    ##for i in range(10):
      ##self.__fileSuffix = "."+str(i)
      #self.__genTrainDataDiverse()
      self.__genTrainDataDupStrict()