Exemplo n.º 1
0
class CSHandler:
  """Builds code-switched (CS) sentences from a word-aligned pair of
  dependency-parsed sentences.

  Usage: updateLIDTags()/updatePhraseMap() configure the language-ID tags
  and the phrase-tag compatibility map once; updateHandler() loads one
  sentence pair plus its word alignment; csSentence() then produces a CS
  sentence for the requested CS variant (0-4).
  """

  def __init__(self):
    self.__L1 = ""     # LID tag of language 1
    self.__L2 = ""     # LID tag of language 2
    self.__curL1 = ""  # LID tag currently playing the L1 role
    self.__curL2 = ""  # LID tag currently playing the L2 role
    self.__L1Tree = Dependencytree()  # parse of the current L1 sentence
    self.__L2Tree = Dependencytree()  # parse of the current L2 sentence
    self.__align = {}     # L1 word index -> list of aligned L2 indices
    self.__revAlign = {}  # L2 word index -> list of aligned L1 indices
    self.__utils = Utils()
    self.__phraseMap = {}  # L1 phrase tag -> compatible L2 phrase tags
    self.__l1Index = 0     # 0: first input plays L1; 1: roles swapped
    # Chunk tags treated as clausal heads by the Hindi-clausal variant (type 4).
    self.__clausalChunks = ["CCP", "VGF", "NULL__CCP", "NULL__VGF"]

  def updatePhraseMap(self, phraseMap):
    """Install the L1 -> L2 phrase-tag compatibility map."""
    self.__phraseMap = phraseMap

  def updateLIDTags(self, L1, L2):
    """Install the language-ID tags for the two languages."""
    self.__L1 = L1
    self.__L2 = L2

  def updateHandler(self, l1Sent, l2Sent, alignLine, l1Index):
    """Load one parsed sentence pair and its word alignment.

    l1Index selects which input sentence plays the L1 role (0: l1Sent,
    1: l2Sent); parse trees, current LID tags, and both alignment
    directions are set accordingly.
    """
    l2Index = 1 - l1Index
    self.__l1Index = l1Index
    if l1Index:
      # Roles swapped: l2Sent becomes the L1 side.
      self.__curL1 = self.__L2
      self.__curL2 = self.__L1
      self.__L1Tree.updateTree(l2Sent)
      self.__L2Tree.updateTree(l1Sent)
    else:
      self.__curL1 = self.__L1
      self.__curL2 = self.__L2
      self.__L1Tree.updateTree(l1Sent)
      self.__L2Tree.updateTree(l2Sent)

    self.__align = self.__parseAlign(alignLine, l1Index, l2Index)
    self.__revAlign = self.__parseAlign(alignLine, l2Index, l1Index)
    self.__csHash = dd(set)  # sentence index -> set of rejected switches

  def updateBadSwitch(self, index, l1Switch, l2Switch):
    """Record a rejected switch pair for sentence *index*."""
    self.__csHash[index].add(l1Switch)
    self.__csHash[index].add(l2Switch)

  def __parseAlign(self, alignLine, l1Index, l2Index):
    """Parse a Moses-style alignment line ("i-j k-l ...") into a dict
    mapping the l1Index-th field of each pair to the list of l2Index-th
    fields it aligns to."""
    align = {}
    for pair in alignLine.split():
      fields = pair.split("-")  # split once per pair (was split twice)
      align.setdefault(int(fields[l1Index]), []).append(int(fields[l2Index]))
    return align

  def csSentence(self, csType):
    """Return [csSentence, sequence] carrying one code switch of *csType*,
    or [-1, []] when no valid switch sequence could be found.

    Assumes a sentence carries a single code switch.
    """
    validSequences = self.__utils.validSequences(self.__L1Tree.sentLen())
    sequence = self.__selectSequence(validSequences, csType)
    if sequence == -1:
      return [-1, []]
    csSentence = self.__utils.makeCSSentence(self.__L1Tree.wordTags(), sequence,
                                             self.__L2Tree.wordTags(), self.__align,
                                             self.__curL1, self.__curL2)
    return [csSentence, sequence]

  def __selectSequence(self, validSequences, csType):
    """Dispatch to the selection strategy for *csType* (0-4).

    Selection among valid candidate sequences is random; each strategy
    returns -1 when no sequence qualifies. Unknown types return None.
    """
    if csType == 0:
      return self.__random(validSequences)
    elif csType == 1:
      return self.__seqL1SeqL2Cont(validSequences)
    elif csType == 2:
      return self.__seqL1Const()
    elif csType == 3:
      return self.__seqL1ConstL2Const()
    elif csType == 4:
      return self.__seqHindiClausal()

  def __checkHindiClause(self, sequence, OBJ):
    """Return True when *sequence* forms a clausal constituent in tree
    *OBJ*: its head chunk tag is clausal, the head lies outside the
    sequence's own chunk subtree, and the head has no NP child."""
    if frozenset(sequence) not in OBJ.heads:
      return False
    try:
      head = OBJ.heads[frozenset(sequence)]
      if not OBJ.isChunkSubtree(head[1], sequence) and head[0] in self.__clausalChunks and not OBJ.hasNPChild(head[1]):
        return True
    except Exception:
      # Malformed head entries count as "not clausal".  Was a bare
      # except; narrowed so KeyboardInterrupt/SystemExit are not swallowed.
      pass
    return False

  def __seqHindiClausal(self):
    """CS type 4: pick a random clausal subtree on the Hindi side and
    return the corresponding L1 word sequence, or -1 after ~100 tries."""
    if self.__l1Index:  # Hindi is the L2 side
      subtrees = self.__L2Tree.subtrees
      if len(subtrees) == 0:
        return -1
      count = 0
      while True:
        l2Sequence = list(random.sample(subtrees, 1)[0])
        l1Sequence = self.__utils.l2Sequence(l2Sequence, self.__revAlign)
        # Reject switches that would cover a whole sentence on either side.
        if len(l1Sequence) == self.__L1Tree.sentLen():
          count += 1
          continue
        if len(l2Sequence) == self.__L2Tree.sentLen():
          count += 1
          continue
        if len(l2Sequence) > 0 and len(l1Sequence) > 0 and self.__checkHindiClause(l2Sequence, self.__L2Tree):
          return l1Sequence
        count += 1
        if count > 100:
          break
    else:  # Hindi is the L1 side
      subtrees = self.__L1Tree.subtrees
      if len(subtrees) == 0:
        return -1
      count = 0
      while True:
        l1Sequence = list(random.sample(subtrees, 1)[0])
        if len(l1Sequence) == self.__L1Tree.sentLen():
          count += 1
          continue
        l2Sequence = self.__utils.l2Sequence(l1Sequence, self.__align)
        if len(l2Sequence) == self.__L2Tree.sentLen():
          count += 1
          continue
        if len(l1Sequence) > 0 and len(l2Sequence) > 0 and self.__checkHindiClause(l1Sequence, self.__L1Tree):
          return l1Sequence
        count += 1
        if count > 100:
          break
    return -1

  def __random(self, sequences):
    """CS type 0: random sequence whose L2 projection is non-empty.

    NOTE: after ~100 failed draws the last candidate is returned even if
    its projection is empty (original lenient behavior, kept).
    """
    random.seed()
    l1Sequence = -1
    count = 0
    while True:
      l1Sequence = random.sample(sequences, 1)[0]
      l2Sequence = self.__utils.l2Sequence(l1Sequence, self.__align)
      if len(l2Sequence) > 0 or count > 100:
        break
      count += 1
    return l1Sequence

  def __randomStrict(self, sequences):
    """Like __random, but returns -1 instead of a possibly-unaligned
    fallback after ~100 failed draws."""
    random.seed()
    count = 0
    while True:
      l1Sequence = random.sample(sequences, 1)[0]
      l2Sequence = self.__utils.l2Sequence(l1Sequence, self.__align)
      if len(l2Sequence) > 0:
        return l1Sequence
      count += 1
      if count > 100:
        break
    return -1

  def __seqL1SeqL2Cont(self, sequences):
    """CS type 1: random L1 sequence whose aligned L2 words are
    contiguous; -1 after 100 failed draws."""
    random.seed()
    count = 0
    while True:
      count += 1
      if count % 100 == 0:
        return -1  # give up: no contiguous projection found
      l1Sequence = sequences[random.randrange(len(sequences))]
      l2Sequence = self.__utils.l2Sequence(l1Sequence, self.__align)
      if self.__L2Tree.isContiguous(l2Sequence):
        return l1Sequence

  def __seqL1Const(self):
    """CS type 2: random proper (non-whole-sentence) L1 subtree with a
    non-empty L2 projection; returns its sorted word indices or -1."""
    random.seed()
    subtrees = self.__L1Tree.subtrees
    if len(subtrees) > 0:
      count = 0
      while True:
        l1Sequence = list(random.sample(subtrees, 1)[0])
        if len(l1Sequence) == self.__L1Tree.sentLen():
          count += 1
          continue
        l2Sequence = self.__utils.l2Sequence(l1Sequence, self.__align)
        if len(l2Sequence) > 0:
          l1Sequence.sort()
          return l1Sequence
        count += 1
        if count > 100:
          break
    return -1

  def __seqL1LWG(self):
    """Like __seqL1Const but samples from the L1 local word groups
    (LWGs) instead of dependency subtrees."""
    random.seed()
    LWGs = self.__L1Tree.LWGs()
    if len(LWGs) > 0:
      count = 0
      while True:
        l1Sequence = list(random.sample(LWGs, 1)[0])
        if len(l1Sequence) == self.__L1Tree.sentLen():
          count += 1
          continue
        l2Sequence = self.__utils.l2Sequence(l1Sequence, self.__align)
        if len(l2Sequence) > 0:
          l1Sequence.sort()
          return l1Sequence
        count += 1
        if count > 100:
          break
    return -1

  def __seqL1ConstL2Const(self):
    """CS type 3: random proper L1 subtree whose L2 projection is itself
    an L2 subtree; returns its sorted word indices or -1."""
    random.seed()
    subtrees = self.__L1Tree.subtrees
    if len(subtrees) > 0:
      count = 0
      while True:
        l1Sequence = list(random.sample(subtrees, 1)[0])
        if len(l1Sequence) == self.__L1Tree.sentLen():
          count += 1
          continue
        l2Sequence = self.__utils.l2Sequence(l1Sequence, self.__align)
        if frozenset(l2Sequence) in self.__L2Tree.subtrees:
          l1Sequence.sort()
          return l1Sequence
        count += 1
        if count > 100:
          break
    return -1

  def __seqL1LWGL2LWG(self):
    """Like __seqL1ConstL2Const but over local word groups (LWGs) on
    both sides."""
    random.seed()
    LWGs = self.__L1Tree.LWGs()
    if len(LWGs) > 0:
      count = 0
      while True:
        l1Sequence = list(random.sample(LWGs, 1)[0])
        if len(l1Sequence) == self.__L1Tree.sentLen():
          count += 1
          continue
        l2Sequence = self.__utils.l2Sequence(l1Sequence, self.__align)
        if frozenset(l2Sequence) in self.__L2Tree.LWGs():
          l1Sequence.sort()
          return l1Sequence
        count += 1
        if count > 100:
          break
    return -1

  def __seqL1ConstL2Cont(self, sequences):
    """Random sequence that is an L1 constituent whose L2 projection is
    contiguous; -1 after 100 failed draws."""
    random.seed()
    count = 0
    while True:
      count += 1
      if count % 100 == 0:
        return -1
      l1Sequence = sequences[random.randrange(len(sequences))]
      l2Sequence = self.__utils.l2Sequence(l1Sequence, self.__align)
      if self.__L1Tree.isConstituent(l1Sequence) and self.__L2Tree.isContiguous(l2Sequence):
        return l1Sequence

  def __seqL1ConstL2SameConst(self, sequences):
    """Random sequence that is a constituent on both sides carrying the
    same (or mapped-compatible) phrase tag; -1 after 500 failed draws."""
    random.seed()
    count = 0
    while True:
      count += 1
      if count % 500 == 0:
        return -1
      l1Sequence = sequences[random.randrange(len(sequences))]
      l2Sequence = self.__utils.l2Sequence(l1Sequence, self.__align)
      if self.__L1Tree.isConstituent(l1Sequence) and self.__L2Tree.isConstituent(l2Sequence):
        l1PhraseTag = self.__L1Tree.getPhrase(l1Sequence[0])
        l2PhraseTag = self.__L2Tree.getPhrase(l2Sequence[0])
        # Dual-structure principle: both phrases must carry the same tag,
        # or tags compatible under the configured phrase map.
        if l1PhraseTag == l2PhraseTag or l2PhraseTag in self.__phraseMap[l1PhraseTag]:
          return l1Sequence
Exemplo n.º 2
0
class Generator:
  """Generates paired pure / code-switched (CS) data files for every
  configured CS variant, data size, and pure/CS split.

  For each (csType, data, split) combination four files are written:
  CS data and a pure "control" set, each in the native and universal
  ('.uni') tagsets.  Python 2 code (print statements).
  """
  def __init__(self, outDir):
    # outDir: directory that all generated files are written into.
    self.config = None
    self.__csHash = set()          # de-dup keys of generated CS sentences
    self.__outputDir = outDir
    self.__csInstance = CSHandler()
    self.__dataHandler = DataHandler()
    self.__utils = Utils()
    self.__Tree = Dependencytree()
    self.__fileSuffix = ""         # appended to output names (e.g. ".0")
    self.prepareConfig()
    
  def prepareConfig(self):
    """Install the default generation config: CS variants 0-4, per-variant
    data sizes, pure/CS split ratios, and tagset variants."""
    self.config = GeneratingConfig()
    self.config.setCSVariants([0, 1, 2, 3, 4])
    self.config.setDataRanges({0:range(50, 1001, 50), 1:range(50, 1001, 50), 2:range(50, 1001, 50), 3:range(50, 1001, 50), 4:range(50, 1001, 50)})
    self.config.setSplits([(50, 50), (60, 40), (70, 30), (80, 20), (90, 10)])
    self.config.setTagsetVariants([".uniq", ".uni"])
  
  def prepareGenerator(self):
    """Push the data handler's LID tags into the CS handler."""
    self.__csInstance.updateLIDTags(self.__dataHandler.LID[0], self.__dataHandler.LID[1])
  
  def prepareRealTest(self, dataFile, outFile):
    """Map an LD-tagged data file to the universal tagset and write it out.

    NOTE(review): the path parameters are rebound to file handles below.
    """
    dataFile = open(dataFile)
    outFile = open(outFile, 'w')
    for line in dataFile:
      # Each token is "word_#tag_#lang"; split, map tags, rejoin.
      line = map(lambda x:x.split('_#'), line.strip().split())
      uniLine = self.__dataHandler.mapLD2Uni(line)
      outFile.write(' '.join(map(lambda x:'_#'.join(x), uniLine)) + '\n')
    outFile.close()

  def generateTestData(self):
    """Generate test-sized data files (overrides dataRanges with small
    sizes); otherwise identical in structure to generateTrainData."""
    self.config.setDataRanges({0:range(30, 151, 50), 1:range(30, 151, 50), 2:range(30, 151, 50), 3:range(30, 151, 50), 4:range(30, 151, 50)})
    for csType in self.config.csVariants:
      print "type" + str(csType)
      for data in self.config.dataRanges[csType]:
        print
        print " numSents:" + str(data * 2),
        initialSplitCSData = []
        for splitIndex in range(len(self.config.splits)):
          csData = []
          Split = self.config.splits[splitIndex]
          pureData = []
          
          # Four output files: CS data and pure control, each in the
          # native and universal tagsets.
          pureFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + "_Control" + self.__fileSuffix, 'w')
          dataFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + self.__fileSuffix, 'w')
          pureUniFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + "_Control" + ".uni" + self.__fileSuffix, 'w')
          dataUniFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + ".uni" + self.__fileSuffix, 'w')
          
          # pr pure sentence pairs and tr CS sentence pairs for this split.
          pr = int((Split[0] * 1.0 / (Split[0] + Split[1])) * data)
          tr = data - pr
          print " Pure:" + str(2 * pr),
          print " CS:" + str(2 * tr),
          random.seed()
          
          pIndicesL1 = random.sample(range(len(self.__dataHandler.pureL1)), pr)
          pIndicesL2 = random.sample(range(len(self.__dataHandler.pureL2)), pr)
          
          # Pure sentences go into both the control set and the CS set.
          for index in pIndicesL1:
            line = self.__dataHandler.pureL1[index]
            line = self.__dataHandler.addLangTags(line, self.__dataHandler.LID[0])
            line = self.__dataHandler.makeLD(line)
            pureData.append(tuple(line))
            csData.append(tuple(line))
          
          for index in pIndicesL2:
            line = self.__dataHandler.pureL2[index]
            line = self.__dataHandler.addLangTags(line, self.__dataHandler.LID[1])
            line = self.__dataHandler.makeLD(line)
            pureData.append(tuple(line))
            csData.append(tuple(line))

          if splitIndex != 0:
            # Later splits reuse CS sentences generated for the first split.
            random.seed()
            csSample = random.sample(initialSplitCSData, tr)
            for sample in csSample:
              csData.append(sample[0])
              csData.append(sample[1])
              pureData.append(sample[2])
              pureData.append(sample[3])
          else:
            # First split: generate fresh CS sentence pairs until tr pairs
            # have been collected.
            self.__csHash = set()
            stopLength = tr
            index = -1
            while 1:
              index += 1
              
              if index == len(self.__dataHandler.parL1):
                # Ran out of parallel sentences; wrap around and resample.
                ##break
                index = 0
                print "Still:", stopLength, " Looping.."
              
              csLines = []
              csSeqs = []
              
              # One CS sentence per switch direction (order 0 and 1).
              hashKeys = ["", ""]
              for order in range(2):
              #order = stopLength%2
                self.__csInstance.updateHandler(self.__dataHandler.parL1[index], self.__dataHandler.parL2[index], self.__dataHandler.align[index], order)
                csReturn = self.__csInstance.csSentence(csType)
                csLine = csReturn[0]
                if csLine != -1:
                  hashKeys[order] = (index, order, tuple(csReturn[1]))
                  csLines.append(csLine)
                  csSeqs.append(csReturn[1])
              
              # Keep the pair only when both directions produced a CS sentence.
              if len(csLines) == 2:
                csWords = set([x[0] for x in csLines[0]]) | set([x[0] for x in csLines[1]])
                self.__Tree.updateTree(self.__dataHandler.parL1[index])
                pureLine1 = self.__Tree.wordTags()
                pureLine1 = self.__dataHandler.addLangTags(pureLine1, self.__dataHandler.LID[0])
                pureLine1 = self.__dataHandler.makeLD(pureLine1)
                self.__Tree.updateTree(self.__dataHandler.parL2[index])
                pureLine2 = self.__Tree.wordTags()
                pureLine2 = self.__dataHandler.addLangTags(pureLine2, self.__dataHandler.LID[1])
                pureLine2 = self.__dataHandler.makeLD(pureLine2)
                pureWords = set([x[0] for x in pureLine1]) | set([x[0] for x in pureLine2])
                # NOTE(review): "True or ..." short-circuits, so the word-
                # preservation and duplicate checks never run here — confirm
                # intentional (generateUCTrainData enforces them).
                if True or pureWords == csWords and hashKeys[0] not in self.__csHash and hashKeys[1] not in self.__csHash:
                  pureData.append(tuple(pureLine1))
                  pureData.append(tuple(pureLine2))
                  csData.append(tuple(csLines[0]))
                  csData.append(tuple(csLines[1]))
                  if splitIndex == 0:
                    initialSplitCSData.append((tuple(csLines[0]), tuple(csLines[1]), tuple(pureLine1), tuple(pureLine2)))
                  stopLength -= 1
                  for hashKey in hashKeys:
                    self.__csHash.add(hashKey)
              else:
                continue
              
              if stopLength <= 0:
                break
              
            if stopLength > 0:
              print tr, stopLength, "Testing Break!!"
              dummy = raw_input()
            
          for csLine in csData:
            dataUniFile.write(self.makeString(self.__dataHandler.mapLD2Uni(csLine)))
            dataFile.write(self.makeString(csLine))
          for pureLine in pureData:
            pureFile.write(self.makeString(pureLine))
            pureUniFile.write(self.makeString(self.__dataHandler.mapLD2Uni(pureLine)))
          pureFile.close()
          dataFile.close()
          pureUniFile.close()
          dataUniFile.close()
  
  
  def generateDataForTest(self):
    """Generate ten resampled training sets (file suffixes .0 - .9)."""
    for i in range(10):
      self.__fileSuffix = "."+str(i)
      self.generateTrainDataForTest()
  
  def generateTrainDataForTest(self):
    """Generate one fixed-size (450-pair) training set per CS variant;
    same structure as generateTrainData but with fixed dataRanges and a
    progress counter."""
    self.config.setDataRanges({0:[450], 1:[450], 2:[450], 3:[450], 4:[450]})
    statusCount = 0
    for csType in self.config.csVariants:
      print "type" + str(csType),
      for data in self.config.dataRanges[csType]:
        print " numSents:" + str(data * 2),
        initialSplitCSData = []
        for splitIndex in range(len(self.config.splits)):
          csData = []
          Split = self.config.splits[splitIndex]
          pureData = []
          
          # Four output files: CS data and pure control, native and universal tagsets.
          pureFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + "_Control" + self.__fileSuffix, 'w')
          dataFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + self.__fileSuffix, 'w')
          pureUniFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + "_Control" + ".uni" + self.__fileSuffix, 'w')
          dataUniFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + ".uni" + self.__fileSuffix, 'w')
          
          # pr pure pairs and tr CS pairs for this split.
          pr = int((Split[0] * 1.0 / (Split[0] + Split[1])) * data)
          tr = data - pr
          print " Pure:" + str(2 * pr),
          print " CS:" + str(2 * tr),
          if splitIndex == len(self.config.splits) - 1:
            print
          random.seed()
          
          pIndicesL1 = random.sample(range(len(self.__dataHandler.pureL1)), pr)
          pIndicesL2 = random.sample(range(len(self.__dataHandler.pureL2)), pr)
          
          # Pure sentences go into both the control set and the CS set.
          for index in pIndicesL1:
            line = self.__dataHandler.pureL1[index]
            line = self.__dataHandler.addLangTags(line, self.__dataHandler.LID[0])
            line = self.__dataHandler.makeLD(line)
            pureData.append(tuple(line))
            csData.append(tuple(line))
          
          for index in pIndicesL2:
            line = self.__dataHandler.pureL2[index]
            line = self.__dataHandler.addLangTags(line, self.__dataHandler.LID[1])
            line = self.__dataHandler.makeLD(line)
            pureData.append(tuple(line))
            csData.append(tuple(line))

          if splitIndex != 0:
            # Later splits reuse CS sentences generated for the first split.
            random.seed()
            csSample = random.sample(initialSplitCSData, tr)
            for sample in csSample:
              csData.append(sample[0])
              csData.append(sample[1])
              pureData.append(sample[2])
              pureData.append(sample[3])
          else:
            # First split: generate fresh CS pairs until tr pairs collected.
            self.__csHash = set()
            stopLength = tr
            index = -1
            while 1:
              index += 1
              
              if index == len(self.__dataHandler.parL1):
                # Ran out of parallel sentences; wrap around and resample.
                ##break
                index = 0
                print "Still:", stopLength, " Looping.. ",
              
              csLines = []
              csSeqs = []
              
              # One CS sentence per switch direction (order 0 and 1).
              hashKeys = ["", ""]
              for order in range(2):
              #order = stopLength%2
                self.__csInstance.updateHandler(self.__dataHandler.parL1[index], self.__dataHandler.parL2[index], self.__dataHandler.align[index], order)
                csReturn = self.__csInstance.csSentence(csType)
                csLine = csReturn[0]
                if csLine != -1:
                  hashKeys[order] = (index, order, tuple(csReturn[1]))
                  csLines.append(csLine)
                  csSeqs.append(csReturn[1])
              
              # Keep the pair only when both directions produced a CS sentence.
              if len(csLines) == 2:
                csWords = set([x[0] for x in csLines[0]]) | set([x[0] for x in csLines[1]])
                self.__Tree.updateTree(self.__dataHandler.parL1[index])
                pureLine1 = self.__Tree.wordTags()
                pureLine1 = self.__dataHandler.addLangTags(pureLine1, self.__dataHandler.LID[0])
                pureLine1 = self.__dataHandler.makeLD(pureLine1)
                self.__Tree.updateTree(self.__dataHandler.parL2[index])
                pureLine2 = self.__Tree.wordTags()
                pureLine2 = self.__dataHandler.addLangTags(pureLine2, self.__dataHandler.LID[1])
                pureLine2 = self.__dataHandler.makeLD(pureLine2)
                pureWords = set([x[0] for x in pureLine1]) | set([x[0] for x in pureLine2])
                # NOTE(review): "True or ..." disables the word-preservation
                # and duplicate checks — confirm intentional.
                if True or pureWords == csWords and hashKeys[0] not in self.__csHash and hashKeys[1] not in self.__csHash:
                  pureData.append(tuple(pureLine1))
                  pureData.append(tuple(pureLine2))
                  csData.append(tuple(csLines[0]))
                  csData.append(tuple(csLines[1]))
                  if splitIndex == 0:
                    initialSplitCSData.append((tuple(csLines[0]), tuple(csLines[1]), tuple(pureLine1), tuple(pureLine2)))
                  stopLength -= 1
                  for hashKey in hashKeys:
                    self.__csHash.add(hashKey)
              else:
                continue
              
              if stopLength <= 0:
                break
              
            if stopLength > 0:
              print tr, stopLength, "Training Break!!"
              dummy = raw_input()
            
          for csLine in csData:
            dataUniFile.write(self.makeString(self.__dataHandler.mapLD2Uni(csLine)))
            dataFile.write(self.makeString(csLine))
          for pureLine in pureData:
            pureFile.write(self.makeString(pureLine))
            pureUniFile.write(self.makeString(self.__dataHandler.mapLD2Uni(pureLine)))
          pureFile.close()
          dataFile.close()
          pureUniFile.close()
          dataUniFile.close()

          statusCount += 1
          if statusCount % 50 == 0:
            print statusCount,
            sys.stdout.flush()
    print statusCount

  
  def generateTrainData(self):
    """Generate training data for every configured (csType, size, split)
    combination using the dataRanges already installed in config."""
    statusCount = 0
    for csType in self.config.csVariants:
      print "type" + str(csType)
      for data in self.config.dataRanges[csType]:
        print
        print " numSents:" + str(data * 2),
        initialSplitCSData = []
        for splitIndex in range(len(self.config.splits)):
          csData = []
          Split = self.config.splits[splitIndex]
          pureData = []
          
          # Four output files: CS data and pure control, native and universal tagsets.
          pureFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + "_Control" + self.__fileSuffix, 'w')
          dataFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + self.__fileSuffix, 'w')
          pureUniFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + "_Control" + ".uni" + self.__fileSuffix, 'w')
          dataUniFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + ".uni" + self.__fileSuffix, 'w')
          
          # pr pure pairs and tr CS pairs for this split.
          pr = int((Split[0] * 1.0 / (Split[0] + Split[1])) * data)
          tr = data - pr
          print " Pure:" + str(2 * pr),
          print " CS:" + str(2 * tr),
          random.seed()
          
          pIndicesL1 = random.sample(range(len(self.__dataHandler.pureL1)), pr)
          pIndicesL2 = random.sample(range(len(self.__dataHandler.pureL2)), pr)
          
          # Pure sentences go into both the control set and the CS set.
          for index in pIndicesL1:
            line = self.__dataHandler.pureL1[index]
            line = self.__dataHandler.addLangTags(line, self.__dataHandler.LID[0])
            line = self.__dataHandler.makeLD(line)
            pureData.append(tuple(line))
            csData.append(tuple(line))
          
          for index in pIndicesL2:
            line = self.__dataHandler.pureL2[index]
            line = self.__dataHandler.addLangTags(line, self.__dataHandler.LID[1])
            line = self.__dataHandler.makeLD(line)
            pureData.append(tuple(line))
            csData.append(tuple(line))

          if splitIndex != 0:
            # Later splits reuse CS sentences generated for the first split.
            random.seed()
            csSample = random.sample(initialSplitCSData, tr)
            for sample in csSample:
              csData.append(sample[0])
              csData.append(sample[1])
              pureData.append(sample[2])
              pureData.append(sample[3])
          else:
            # First split: generate fresh CS pairs until tr pairs collected.
            self.__csHash = set()
            stopLength = tr
            index = -1
            while 1:
              index += 1
              
              if index == len(self.__dataHandler.parL1):
                # Ran out of parallel sentences; wrap around and resample.
                ##break
                index = 0
                print "Still:", stopLength, " Looping.."
              
              csLines = []
              csSeqs = []
              
              # One CS sentence per switch direction (order 0 and 1).
              hashKeys = ["", ""]
              for order in range(2):
              #order = stopLength%2
                self.__csInstance.updateHandler(self.__dataHandler.parL1[index], self.__dataHandler.parL2[index], self.__dataHandler.align[index], order)
                csReturn = self.__csInstance.csSentence(csType)
                csLine = csReturn[0]
                if csLine != -1:
                  hashKeys[order] = (index, order, tuple(csReturn[1]))
                  csLines.append(csLine)
                  csSeqs.append(csReturn[1])
              
              # Keep the pair only when both directions produced a CS sentence.
              if len(csLines) == 2:
                csWords = set([x[0] for x in csLines[0]]) | set([x[0] for x in csLines[1]])
                self.__Tree.updateTree(self.__dataHandler.parL1[index])
                pureLine1 = self.__Tree.wordTags()
                pureLine1 = self.__dataHandler.addLangTags(pureLine1, self.__dataHandler.LID[0])
                pureLine1 = self.__dataHandler.makeLD(pureLine1)
                self.__Tree.updateTree(self.__dataHandler.parL2[index])
                pureLine2 = self.__Tree.wordTags()
                pureLine2 = self.__dataHandler.addLangTags(pureLine2, self.__dataHandler.LID[1])
                pureLine2 = self.__dataHandler.makeLD(pureLine2)
                pureWords = set([x[0] for x in pureLine1]) | set([x[0] for x in pureLine2])
                # NOTE(review): "True or ..." disables the word-preservation
                # and duplicate checks — confirm intentional.
                if True or pureWords == csWords and hashKeys[0] not in self.__csHash and hashKeys[1] not in self.__csHash:
                  pureData.append(tuple(pureLine1))
                  pureData.append(tuple(pureLine2))
                  csData.append(tuple(csLines[0]))
                  csData.append(tuple(csLines[1]))
                  if splitIndex == 0:
                    initialSplitCSData.append((tuple(csLines[0]), tuple(csLines[1]), tuple(pureLine1), tuple(pureLine2)))
                  stopLength -= 1
                  for hashKey in hashKeys:
                    self.__csHash.add(hashKey)
              else:
                continue
              
              if stopLength <= 0:
                break
              
            if stopLength > 0:
              print tr, stopLength, "Training Break!!"
              dummy = raw_input()
            
          for csLine in csData:
            dataUniFile.write(self.makeString(self.__dataHandler.mapLD2Uni(csLine)))
            dataFile.write(self.makeString(csLine))
          for pureLine in pureData:
            pureFile.write(self.makeString(pureLine))
            pureUniFile.write(self.makeString(self.__dataHandler.mapLD2Uni(pureLine)))
          pureFile.close()
          dataFile.close()
          pureUniFile.close()
          dataUniFile.close()

          statusCount += 1
          if statusCount % 50 == 0:
            print statusCount,
            sys.stdout.flush()
    print statusCount
    
  def generateUCTrainData(self): # Unknown words constrained training data
    """Like generateTrainData, but a CS pair is accepted only when its word
    set equals the pure pair's word set and it has not been generated
    before (the checks that the other generators leave disabled)."""
    statusCount = 0
    for csType in self.config.csVariants:
      for data in self.config.dataRanges[csType]:
        initialSplitCSData = []
        for splitIndex in range(len(self.config.splits)):
          csData = []
          Split = self.config.splits[splitIndex]
          pureData = []
          
          # Four output files: CS data and pure control, native and universal tagsets.
          pureFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + "_Control" + self.__fileSuffix, 'w')
          dataFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + self.__fileSuffix, 'w')
          pureUniFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + "_Control" + ".uni" + self.__fileSuffix, 'w')
          dataUniFile = open(self.__outputDir + "TrainCSType" + str(csType) + "CS" + str(Split[1]) + "Pure" + str(Split[0]) + "Total" + str(2 * data) + ".uni" + self.__fileSuffix, 'w')
          
          # pr pure pairs and tr CS pairs for this split.
          pr = int((Split[0] * 1.0 / (Split[0] + Split[1])) * data)
          tr = data - pr
          print pr
          random.seed()
          
          pIndicesL1 = random.sample(range(len(self.__dataHandler.pureL1)), pr)
          pIndicesL2 = random.sample(range(len(self.__dataHandler.pureL2)), pr)
          
          # Pure sentences go into both the control set and the CS set.
          for index in pIndicesL1:
            line = self.__dataHandler.pureL1[index]
            line = self.__dataHandler.addLangTags(line, self.__dataHandler.LID[0])
            line = self.__dataHandler.makeLD(line)
            pureData.append(tuple(line))
            csData.append(tuple(line))
          
          for index in pIndicesL2:
            line = self.__dataHandler.pureL2[index]
            line = self.__dataHandler.addLangTags(line, self.__dataHandler.LID[1])
            line = self.__dataHandler.makeLD(line)
            pureData.append(tuple(line))
            csData.append(tuple(line))

          if splitIndex != 0:
            # Later splits reuse CS sentences generated for the first split.
            random.seed()
            csSample = random.sample(initialSplitCSData, tr)
            for sample in csSample:
              csData.append(sample[0])
              csData.append(sample[1])
              pureData.append(sample[2])
              pureData.append(sample[3])
          else:
            # First split: generate fresh CS pairs until tr pairs collected.
            self.__csHash = set()
            stopLength = tr
            index = -1
            while 1:
              index += 1
              
              if index == len(self.__dataHandler.parL1):
                # Ran out of parallel sentences; wrap around and resample.
                ##break
                index = 0
                print "Still:", stopLength, " Looping.."
              
              csLines = []
              csSeqs = []
              
              # One CS sentence per switch direction (order 0 and 1).
              hashKeys = ["", ""]
              for order in range(2):
              #order = stopLength%2
                self.__csInstance.updateHandler(self.__dataHandler.parL1[index], self.__dataHandler.parL2[index], self.__dataHandler.align[index], order)
                csReturn = self.__csInstance.csSentence(csType)
                csLine = csReturn[0]
                if csLine != -1:
                  hashKeys[order] = (index, order, tuple(csReturn[1]))
                  csLines.append(csLine)
                  csSeqs.append(csReturn[1])
              
              # Keep the pair only when both directions produced a CS sentence.
              if len(csLines) == 2:
                csWords = set([x[0] for x in csLines[0]]) | set([x[0] for x in csLines[1]])
                self.__Tree.updateTree(self.__dataHandler.parL1[index])
                pureLine1 = self.__Tree.wordTags()
                pureLine1 = self.__dataHandler.addLangTags(pureLine1, self.__dataHandler.LID[0])
                pureLine1 = self.__dataHandler.makeLD(pureLine1)
                self.__Tree.updateTree(self.__dataHandler.parL2[index])
                pureLine2 = self.__Tree.wordTags()
                pureLine2 = self.__dataHandler.addLangTags(pureLine2, self.__dataHandler.LID[1])
                pureLine2 = self.__dataHandler.makeLD(pureLine2)
                pureWords = set([x[0] for x in pureLine1]) | set([x[0] for x in pureLine2])
                # Unlike the other generators, the word-preservation and
                # duplicate checks ARE enforced here.
                if pureWords == csWords and hashKeys[0] not in self.__csHash and hashKeys[1] not in self.__csHash:
                  pureData.append(tuple(pureLine1))
                  pureData.append(tuple(pureLine2))
                  csData.append(tuple(csLines[0]))
                  csData.append(tuple(csLines[1]))
                  if splitIndex == 0:
                    initialSplitCSData.append((tuple(csLines[0]), tuple(csLines[1]), tuple(pureLine1), tuple(pureLine2)))
                  stopLength -= 1
                  for hashKey in hashKeys:
                    self.__csHash.add(hashKey)
              else:
                continue
              
              if stopLength <= 0:
                break
              
            if stopLength > 0:
              print tr, stopLength, "Training Break!!"
              dummy = raw_input()
            
          for csLine in csData:
            dataUniFile.write(self.makeString(self.__dataHandler.mapLD2Uni(csLine)))
            dataFile.write(self.makeString(csLine))
          for pureLine in pureData:
            pureFile.write(self.makeString(pureLine))
            pureUniFile.write(self.makeString(self.__dataHandler.mapLD2Uni(pureLine)))
          pureFile.close()
          dataFile.close()
          pureUniFile.close()
          dataUniFile.close()

          statusCount += 1
          if statusCount % 50 == 0:
            print statusCount,
            sys.stdout.flush()
    print statusCount

  def makeString(self, wordsTagsLangs):
    """Serialize one sentence: each (word, tag, lang) joined by '_#',
    tokens space-separated, newline-terminated."""
    return ' '.join(map(lambda x:"_#".join(x), wordsTagsLangs)) + '\n'
    
  def loadData(self, l1Data, l2Data, l1Aligns, l2Aligns, pureL1Data, pureL2Data):
    """Delegate corpus loading to the data handler."""
    self.__dataHandler.loadData(l1Data, l2Data, l1Aligns, l2Aligns, pureL1Data, pureL2Data)
Exemplo n.º 3
0
class DataGenerator:
  def __init__(self, outDir):
    """Initialize language configuration, empty data containers, helper
    objects and the generation parameters (CS variants, tagsets, data
    sizes, pure/CS splits).  The "Real test overwrites" section at the end
    deliberately narrows the configured ranges for the current run."""
    sys.stderr.write("DataGenerator: Constructor\n")
    ## Languages and Order
    self.__LID = ["HI","EN"]
    # Hard-coded source-tagset -> universal-tagset map file locations.
    self.__l2MapFile = "/usr0/home/pgadde/Work/CodeSwitching/Hinglish/Data/UniMaps/en-ptb.map"
    self.__l1MapFile = "/usr0/home/pgadde/Work/CodeSwitching/Hinglish/Data/UniMaps/hi-hyd.map"
    ## Data containers
    self.__parL1 = []
    self.__parL2 = []
    self.__align = []
    self.__pureL1 = []
    self.__pureL2 = []
    self.__outputDir = outDir
    self.__posMap = {}
    self.__phraseMap = dd(list)
    self.__csInstance = CSHandler()
    self.__utils = Utils()
    self.__Tree = Dependencytree()
    
    ## Generation Variants
    self.__csVariants = [0,1,2,3,4]
    self.__tagsetVariants = ["",".uni"]
    self.__dataRange = range(50,900,50)
    ##self.__dataRange = [200]
    # (pure%, CS%) splits of each generated training set.
    self.__splits = [(50,50),(60,40),(70,30),(80,20),(90,10)]
    self.__csHash = set()
    ##LID stuff
    self.__L1Tags = set()
    self.__L2Tags = set()
    self.__commonTags = set()
    ## Pre processing
    self.__genPosMap()
    self.__genPhraseMap()
    self.__csInstance.updatePhraseMap(self.__phraseMap)
    self.__csInstance.updateLIDTags(self.__LID[0], self.__LID[1])
    
    ## Real test overwrites
    #self.__csVariants = [1,2,3,4]
    self.__tagsetVariants = [""]
    self.__dataRange = [400]
    # Per-csType data sizes used by the *Diverse/*Dup generators.
    self.__dataRanges = {0:range(40,601,40), 1:range(40,601,40), 2:range(35,540,35), 3:range(30,451,30), 4:range(15,231,15)}
    #self.__dataRanges = {0:[880], 1:[880], 2:[800], 3:[630], 4:[330]}
    #self.__dataRanges = {0:[60], 1:[60], 2:[60], 3:[60], 4:[60]}
    #self.__splits = [(50,50)]
    #for i in range(0,51,5):
    #  split = (100-i, i)
    #  self.__splits.append(split)
    self.__fileSuffix = ""
 
  def loadData(self, l1Data, l2Data, l1Aligns, l2Aligns, pureL1Data, pureL2Data):
    """Read the parallel corpora, their alignments and the two monolingual
    data sets into memory, then report each container's size on stderr."""
    self.__parL1 = self.__utils.readSentences(l1Data)
    self.__parL2 = self.__utils.readSentences(l2Data)
    self.__align = self.__utils.readAligns(l1Aligns, l2Aligns)
    self.__pureL1 = self.__utils.readSentencesPlain(pureL1Data)
    self.__pureL2 = self.__utils.readSentencesPlain(pureL2Data)
    # Size report, mirroring the load order above.
    report = (("parL1", self.__parL1), ("parL2", self.__parL2),
              ("align", self.__align), ("pureL1", self.__pureL1),
              ("pureL2", self.__pureL2))
    for label, store in report:
      sys.stderr.write(label + ":" + str(len(store)) + "\n")
  
  def __genTrainData(self):
    """Generate one training file per (data size, split, csType).

    Each file holds pr pure L1 + pr pure L2 sentences (sampled without
    replacement) followed by tr synthetic code-switched sentences produced
    by the CSHandler; duplicates are avoided via a hash on
    (corpus index, switch sequence).
    """
    statusCount = 0
    for data in self.__dataRange:
      #control = 0
      #while 1:
      for Split in self.__splits:
      #for control in range(3):
        #if control == 3:
        #  break
        #pr= int(control*1.0/2 * data)
        # pr: pure sentences (halved below -> pr per language);
        # tr: code-switched sentences.
        pr= int((Split[0]*1.0/(Split[0]+Split[1])) * data)
        tr = data - pr
        pr = pr/2
        
        print pr
        random.seed()
        pIndicesL1 = random.sample(range(len(self.__pureL1)),pr)
        pIndicesL2 = random.sample(range(len(self.__pureL2)),pr)
        
        for csType in self.__csVariants:
          self.__csHash = set()
          ##sys.stderr.write("csType:"+str(csType)+'\n')
          # Debugging !!
          #switch = ""
          #############
          #for tag in self.__tagsetVariants:
            # Debugging !!
            #if switch == "yes":
            #    break
            ###################
            #sys.stderr.write(outputDir+"Train"+cs+str(len(trainVariants[tr]))+"Pure"+str(len(pureVariants[pr]))+tag+"\n")
          
          sys.stderr.write(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(data)+self.__fileSuffix+'\n')
          dataFile = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(data)+self.__fileSuffix,'w')
          ##dataFileUni = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(data)+".uni",'w')
          ##dataFileUniq = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(data)+".uniq",'w')
          ##dataFileUniUniq = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(data)+".uni.uniq",'w')
          
          # Pure L1 block: tag each sampled sentence with the L1 language id.
          for index in pIndicesL1:
            line = self.__pureL1[index]
            #sys.stderr.write("L1 Line:"+str(line)+'\n')
            line = self.__addLangTags(line, self.__LID[0])
            dataFile.write(' '.join(map(lambda x:'_#'.join(x), line))+'\n')
            
            ##Commented for real test
            '''
            dataFileUniq.write(' '.join(map(lambda x:'_#'.join(x), map(lambda x:(x[0],x[1]+'_'+x[2],x[2]), line)))+'\n')
            lineUni = self.__map2Uni(line)
            dataFileUni.write(' '.join(map(lambda x:'_#'.join(x), lineUni))+'\n')
            dataFileUniUniq.write(' '.join(map(lambda x:'_#'.join(x), map(lambda x:(x[0],x[1]+'_'+x[2],x[2]), lineUni)))+'\n')'''
            
          # Pure L2 block.
          for index in pIndicesL2:
            line = self.__pureL2[index]
            #sys.stderr.write("L2 Line:"+str(line)+'\n')
            line = self.__addLangTags(line, self.__LID[1])
            dataFile.write(' '.join(map(lambda x:'_#'.join(x), line))+'\n')
            
            ##Commented for real test
            '''
            dataFileUniq.write(' '.join(map(lambda x:'_#'.join(x), map(lambda x:(x[0],x[1]+'_'+x[2],x[2]), line)))+'\n')
            lineUni = self.__map2Uni(line)
            dataFileUni.write(' '.join(map(lambda x:'_#'.join(x), lineUni))+'\n')
            dataFileUniUniq.write(' '.join(map(lambda x:'_#'.join(x), map(lambda x:(x[0],x[1]+'_'+x[2],x[2]), lineUni)))+'\n')'''
            
          # CS block: cycle over the parallel corpus (wrapping at the end)
          # until tr CS sentences are written.  stopLength starts at tr+1
          # because the loop breaks *before* writing the line that takes
          # the counter to 0, so exactly tr lines reach the file.
          stopLength = tr+1
          index = -1
          while 1:
            index += 1
            if index == len(self.__parL1):
              index = 0
            csLine = ""
            order = stopLength%2
            #sys.stderr.write("order:"+str(order)+'\n')
            self.__csInstance.updateHandler(self.__parL1[index], self.__parL2[index], self.__align[index], order)
            csReturn = self.__csInstance.csSentence(csType)
            # Debugging !!                         
            #sys.stderr.write("Switch to another CS variant?? ")
            #switch = raw_input()
            #if switch == "yes":
            #    break
            ###############
            csLine = csReturn[0]
            #csSequence = csReturn[1]
            #print csReturn[1]
            # Deduplicate on (corpus index, switch-point sequence).
            hashKey = (index, tuple(csReturn[1]))
            #print hashKey
            if csLine != -1 and hashKey not in self.__csHash:
              self.__csHash.add(hashKey)
              stopLength -= 1
            else:
              continue
            #sys.stderr.write("csLine:"+str(csLine)+'\n')
            #csLine = self.__addLangTags(csLine)
            
            if stopLength <= 0:
              break
            dataFile.write(' '.join(map(lambda x:'_#'.join(x), csLine))+'\n')
            ##Commented for real test
            '''
            dataFileUniq.write(' '.join(map(lambda x:'_#'.join(x), map(lambda x:(x[0],x[1]+'_'+x[2],x[2]), csLine)))+'\n')
            csLineUni = self.__map2Uni(csLine)
            dataFileUni.write(' '.join(map(lambda x:'_#'.join(x), csLineUni))+'\n')
            dataFileUniUniq.write(' '.join(map(lambda x:'_#'.join(x), map(lambda x:(x[0],x[1]+'_'+x[2],x[2]), csLineUni)))+'\n')'''
            
          dataFile.close()
          ##dataFileUni.close()
          ##dataFileUniq.close()
          ##dataFileUniUniq.close()
          # Manual checkpoint when the corpus could not yield tr sentences.
          if stopLength > 0:
            print tr, stopLength, "Training Break!!"
            #pr -= 1
            dummy = raw_input()
          statusCount += 1
          if statusCount%50 == 0:
            print statusCount,
            sys.stdout.flush()
        #pr += 1
    print statusCount
    
  def __genTrainDataDiverse(self):
    """Like __genTrainData, but data sizes come from the per-csType
    self.__dataRanges, and the duplicate hash is meant to store corpus
    indices so each parallel sentence is used at most once per file."""
    statusCount = 0
    for csType in self.__csVariants:
      for data in self.__dataRanges[csType]:
        for Split in self.__splits:
          # pr: pure sentences per language; tr: code-switched sentences.
          pr= int((Split[0]*1.0/(Split[0]+Split[1])) * data)
          tr = data - pr
          pr = pr/2
          
          print pr
          random.seed()
          
          pIndicesL1 = random.sample(range(len(self.__pureL1)),pr)
          pIndicesL2 = random.sample(range(len(self.__pureL2)),pr)
          
          ##for csType in self.__csVariants:
          self.__csHash = set()
          
          sys.stderr.write(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(data)+self.__fileSuffix+'\n')
          dataFile = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(data)+self.__fileSuffix,'w')
          
          #### Dangerous ####
          
          ##pIndicesL1 = []
          ##pIndicesL2 = []
          
          #### End of Dangerous ####
          
          for index in pIndicesL1:
            line = self.__pureL1[index]
            line = self.__addLangTags(line, self.__LID[0])
            dataFile.write(' '.join(map(lambda x:'_#'.join(x), line))+'\n')
            
          for index in pIndicesL2:
            line = self.__pureL2[index]
            line = self.__addLangTags(line, self.__LID[1])
            dataFile.write(' '.join(map(lambda x:'_#'.join(x), line))+'\n')
            
          # tr+1: the loop breaks before writing the final accepted line.
          stopLength = tr+1
          index = -1
          while 1:
            index += 1
            if index == len(self.__parL1):
              # NOTE(review): unlike __genTrainData this variant stops at
              # the end of the corpus; the 'index = 0' below is unreachable
              # after the break.
              break
              index = 0
            csLine = ""
            order = stopLength%2
            self.__csInstance.updateHandler(self.__parL1[index], self.__parL2[index], self.__align[index], order)
            csReturn = self.__csInstance.csSentence(csType)
            csLine = csReturn[0]
            hashKey = (index, tuple(csReturn[1]))
            # NOTE(review): the membership test checks hashKey but only the
            # bare index is stored below, so this dedup check can never
            # fire -- presumably it should test 'index not in self.__csHash'.
            if csLine != -1 and hashKey not in self.__csHash:
              #self.__csHash.add(hashKey)
              self.__csHash.add(index)
              stopLength -= 1
            else:
              continue
            
            if stopLength <= 0:
              break
            dataFile.write(' '.join(map(lambda x:'_#'.join(x), csLine))+'\n')
            
          dataFile.close()
          # Manual checkpoint when the corpus ran out before tr sentences.
          if stopLength > 0:
            print tr, stopLength, "Training Break!!"
            dummy = raw_input()
          statusCount += 1
          if statusCount%50 == 0:
            print statusCount,
            sys.stdout.flush()
    print statusCount
    
  def __genTrainDataDup(self):
    """Generate paired files per (csType, data size, split): the training
    file (pure sentences plus CS sentences in BOTH switch orders per pair)
    and a matching "_Control" file holding the pure versions of the same
    content."""
    statusCount = 0
    for csType in self.__csVariants:
      for data in self.__dataRanges[csType]:
        
        for splitIndex in range(len(self.__splits)):
          csData = []
          Split = self.__splits[splitIndex]
          pureData = []
          pureFile = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(2*data)+"_Control"+self.__fileSuffix,'w')
          dataFile = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(2*data)+self.__fileSuffix,'w')
          # pr: pure sentences; tr: CS sentence *pairs* (two lines each).
          pr= int((Split[0]*1.0/(Split[0]+Split[1])) * data)
          tr = data - pr
          #pr = pr/2
          
          print pr
          random.seed()
          pIndicesL1 = random.sample(range(len(self.__pureL1)),pr)
          pIndicesL2 = random.sample(range(len(self.__pureL2)),pr)
          # Pure sentences go to both the training and the control data.
          for index in pIndicesL1:
            line = self.__pureL1[index]
            line = self.__addLangTags(line, self.__LID[0])
            pureLine = ' '.join(map(lambda x:'_#'.join(x), line))+'\n'
            ##pureFile.write(pureLine)
            ##if splitIndex == 0:
            pureData.append(pureLine)
            csData.append(pureLine)
          for index in pIndicesL2:
            line = self.__pureL2[index]
            line = self.__addLangTags(line, self.__LID[1])
            pureLine = ' '.join(map(lambda x:'_#'.join(x), line))+'\n'
            ##pureFile.write(pureLine)
            ##if splitIndex == 0:
            pureData.append(pureLine)
            csData.append(pureLine)

          ##for csType in self.__csVariants:
          self.__csHash = set()
          stopLength = tr
          index = -1
          while 1:
            index += 1
            if index == len(self.__parL1):
              # NOTE(review): stops at the end of the corpus; the
              # 'index = 0' below is unreachable after the break.
              break
              index = 0
            csLines = []
            # A pair is kept only if it switches successfully in BOTH orders.
            for order in range(2):
            #order = stopLength%2
              self.__csInstance.updateHandler(self.__parL1[index], self.__parL2[index], self.__align[index], order)
              csReturn = self.__csInstance.csSentence(csType)
              csLine = csReturn[0]
              if csLine != -1:
                csLines.append(csLine)
            if len(csLines) == 2:
              csData.append(' '.join(map(lambda x:'_#'.join(x), csLines[0]))+'\n')
              csData.append(' '.join(map(lambda x:'_#'.join(x), csLines[1]))+'\n')
              ##if splitIndex == 0:
              # Control data: the untouched L1 and L2 sentences of the pair.
              self.__Tree.updateTree(self.__parL1[index])
              pureLine = self.__Tree.wordTags()
              pureLine = self.__addLangTags(pureLine, self.__LID[0])
              pureData.append(' '.join(map(lambda x:'_#'.join(x), pureLine))+'\n')
              self.__Tree.updateTree(self.__parL2[index])
              pureLine = self.__Tree.wordTags()
              #print pureLine
              pureLine = self.__addLangTags(pureLine, self.__LID[1])
              #print pureLine
              #sys.exit(0)
              pureData.append(' '.join(map(lambda x:'_#'.join(x), pureLine))+'\n')
              self.__csHash.add(index)
              stopLength -= 1
            else:
              continue
            
            if stopLength <= 0:
              break
            
          # Manual checkpoint when the corpus ran out before tr pairs.
          if stopLength > 0:
            print tr, stopLength, "Training Break!!"
            dummy = raw_input()
          
          for csLine in csData:
            dataFile.write(csLine)
          ##if splitIndex == 0:
          for pureLine in pureData:
            pureFile.write(pureLine)
          pureFile.close()
          dataFile.close()

          statusCount += 1
          if statusCount%50 == 0:
            print statusCount,
            sys.stdout.flush()
    print statusCount
    
  def collectData(self):
    """Collect and return the material used by __genFromSingleData while
    also writing the splitIndex-0 data/control files for csType 4.

    Returns a mixed list: pure-line strings (one per sampled monolingual
    sentence) and 3-tuples (l1Sent, l2Sent, align) for parallel pairs that
    produced valid CS sentences in both orders with matching word sets.

    NOTE(review): the method returns at the top of the second split
    iteration, so only splitIndex 0 is ever generated here and the
    'splitIndex != 0' branch below is unreachable.
    """
    statusCount = 0
    for csType in [4]:
      for data in self.__dataRanges[csType]:
        initialSlitCSData = []
        for splitIndex in range(len(self.__splits)):
          if splitIndex > 0:
            return initialSlitCSData
          csData = []
          Split = self.__splits[splitIndex]
          pureData = []
          pureFile = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(2*data)+"_Control"+self.__fileSuffix,'w')
          dataFile = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(2*data)+self.__fileSuffix,'w')
          pr= int((Split[0]*1.0/(Split[0]+Split[1])) * data)
          tr = data - pr
          #pr = pr/2
          
          print pr
          random.seed()
          pIndicesL1 = random.sample(range(len(self.__pureL1)),pr)
          pIndicesL2 = random.sample(range(len(self.__pureL2)),pr)
          for index in pIndicesL1:
            line = self.__pureL1[index]
            line = self.__addLangTags(line, self.__LID[0])
            pureLine = ' '.join(map(lambda x:'_#'.join(x), line))+'\n'
            ##pureFile.write(pureLine)
            ##if splitIndex == 0:
            pureData.append(pureLine)
            csData.append(pureLine)
            # NOTE(review): appends a plain string (the parentheses do not
            # make a tuple), unlike the 3-tuples added for CS pairs below.
            initialSlitCSData.append((pureLine))
          for index in pIndicesL2:
            line = self.__pureL2[index]
            line = self.__addLangTags(line, self.__LID[1])
            pureLine = ' '.join(map(lambda x:'_#'.join(x), line))+'\n'
            ##pureFile.write(pureLine)
            ##if splitIndex == 0:
            pureData.append(pureLine)
            csData.append(pureLine)
            initialSlitCSData.append((pureLine))

          # NOTE(review): unreachable -- the early return above fires for
          # every splitIndex > 0.
          if splitIndex != 0:
            random.seed()
            csSample = random.sample(initialSlitCSData, tr)
            for sample in csSample:
              csData.append(sample[0])
              csData.append(sample[1])
              pureData.append(sample[2])
              pureData.append(sample[3])
          else:

            ##for csType in self.__csVariants:
            self.__csHash = set()
            stopLength = tr
            index = -1
            while 1:
              index += 1
              if index == len(self.__parL1):
                ##break
                index = 0
                print "Still:",stopLength," Looping.."
              csLines = []
              csSeqs = []
              # A pair is kept only if it switches in BOTH orders.
              for order in range(2):
              #order = stopLength%2
                self.__csInstance.updateHandler(self.__parL1[index], self.__parL2[index], self.__align[index], order)
                csReturn = self.__csInstance.csSentence(csType)
                csLine = csReturn[0]
                if csLine != -1:
                  csLines.append(csLine)
                  csSeqs.append(csReturn[1])
              if len(csLines) == 2:
                # Strict check: the CS word set must equal the pure word set.
                csWords = set([x[0] for x in csLines[0]])|set([x[0] for x in csLines[1]])
                self.__Tree.updateTree(self.__parL1[index])
                pureLine1 = self.__Tree.wordTags()
                pureLine1 = self.__addLangTags(pureLine1, self.__LID[0])
                self.__Tree.updateTree(self.__parL2[index])
                pureLine2 = self.__Tree.wordTags()
                pureLine2 = self.__addLangTags(pureLine2, self.__LID[1])
                pureWords = set([x[0] for x in pureLine1])|set([x[0] for x in pureLine2])
                if pureWords == csWords:
                  p1 = ' '.join(map(lambda x:'_#'.join(x), pureLine1))+'\n'
                  p2 = ' '.join(map(lambda x:'_#'.join(x), pureLine2))+'\n'
                  pureData.append(p1)
                  pureData.append(p2)
                  cs1 = ' '.join(map(lambda x:'_#'.join(x), csLines[0]))+'\n'
                  csData.append(cs1)
                  cs2 = ' '.join(map(lambda x:'_#'.join(x), csLines[1]))+'\n'
                  csData.append(cs2)
                  if splitIndex == 0:
                    initialSlitCSData.append((self.__parL1[index],self.__parL2[index], self.__align[index]))
                  self.__csHash.add(index)
                  stopLength -= 1
                ##else:
                ##  l1Switch = (0, tuple(csSeqs[0]))
                ##  l2Switch = (1, tuple(csSeqs[1]))
                ##  self.__csInstance.updateBadSwitch(index, l1Switch, l2Switch)
              else:
                continue
              
              if stopLength <= 0:
                break
              
            # Manual checkpoint when tr pairs could not be collected.
            if stopLength > 0:
              print tr, stopLength, "Training Break!!"
              dummy = raw_input()
            
          for csLine in csData:
            dataFile.write(csLine)
          ##if splitIndex == 0:
          for pureLine in pureData:
            pureFile.write(pureLine)
          pureFile.close()
          dataFile.close()

          statusCount += 1
          if statusCount%50 == 0:
            print statusCount,
            sys.stdout.flush()
    print statusCount
  
  def __genFromSingleData(self):
    dataset = self.collectData()
    CSData = [d for d in dataset if len(d)==3]
    PUREData = [d[1] for d in dataset if len(d)==1]
    pureFile = open(self.__outputDir+"Baseline"+self.__fileSuffix,'w')
    pureFlag = 1
    for csType in self.__csVariants:
      for splitIndex in range(len(self.__splits)):
        Split = self.__splits[splitIndex]
        dataFile = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(len(dataset))+self.__fileSuffix,'w')
        stopLength = len(CSData)/2
        csData = []
        pureData = []
        index = -1
        while 1:
          index += 1
          if index == len(self.__parL1):
            ##break
            index = 0
            print "Still:",stopLength," Looping.."
          csLines = []
          csSeqs = []
          for order in range(2):
          #order = stopLength%2
            self.__csInstance.updateHandler(CSData[index][0], CSData[index][1], CSData[index][2], order)
            csReturn = self.__csInstance.csSentence(csType)
            csLine = csReturn[0]
            if csLine != -1:
              csLines.append(csLine)
              csSeqs.append(csReturn[1])
          if len(csLines) == 2:
            csWords = set([x[0] for x in csLines[0]])|set([x[0] for x in csLines[1]])
            self.__Tree.updateTree(self.__parL1[index])
            pureLine1 = self.__Tree.wordTags()
            pureLine1 = self.__addLangTags(pureLine1, self.__LID[0])
            self.__Tree.updateTree(self.__parL2[index])
            pureLine2 = self.__Tree.wordTags()
            pureLine2 = self.__addLangTags(pureLine2, self.__LID[1])
            pureWords = set([x[0] for x in pureLine1])|set([x[0] for x in pureLine2])
            if pureWords == csWords:
              p1 = ' '.join(map(lambda x:'_#'.join(x), pureLine1))+'\n'
              p2 = ' '.join(map(lambda x:'_#'.join(x), pureLine2))+'\n'
              pureData.append(p1)
              pureData.append(p2)
              cs1 = ' '.join(map(lambda x:'_#'.join(x), csLines[0]))+'\n'
              csData.append(cs1)
              cs2 = ' '.join(map(lambda x:'_#'.join(x), csLines[1]))+'\n'
              csData.append(cs2)
              self.__csHash.add(index)
              stopLength -= 1
          else:
            continue
          
          if stopLength <= 0:
            break
          
        if stopLength > 0:
          print tr, stopLength, "Training Break!!"
          dummy = raw_input()
        
        for csLine in csData:
          dataFile.write(csLine)
        if pureFlag:
          pureFlag = 0
          for pureLine in pureData:
            pureFile.write(pureLine)
          for pureLine in PUREData:
            pureFile.write(pureLine)
          pureFile.close()
        dataFile.close()
  
  def __genTrainDataDupStrict(self):
    """Like __genTrainDataDup, but a pair is accepted only when the word
    set of its two CS sentences exactly equals the word set of its two
    pure sentences ('strict').  splitIndex 0 caches its accepted
    (cs1, cs2, pure1, pure2) lines in initialSlitCSData; later splits
    resample from that cache instead of regenerating."""
    statusCount = 0
    for csType in self.__csVariants:
      for data in self.__dataRanges[csType]:
        initialSlitCSData = []
        for splitIndex in range(len(self.__splits)):
          csData = []
          Split = self.__splits[splitIndex]
          pureData = []
          pureFile = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(2*data)+"_Control"+self.__fileSuffix,'w')
          dataFile = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(2*data)+self.__fileSuffix,'w')
          # pr: pure sentences; tr: CS sentence pairs (two lines each).
          pr= int((Split[0]*1.0/(Split[0]+Split[1])) * data)
          tr = data - pr
          #pr = pr/2
          
          print pr
          random.seed()
          pIndicesL1 = random.sample(range(len(self.__pureL1)),pr)
          pIndicesL2 = random.sample(range(len(self.__pureL2)),pr)
          # Pure sentences go to both the training and the control data.
          for index in pIndicesL1:
            line = self.__pureL1[index]
            line = self.__addLangTags(line, self.__LID[0])
            pureLine = ' '.join(map(lambda x:'_#'.join(x), line))+'\n'
            ##pureFile.write(pureLine)
            ##if splitIndex == 0:
            pureData.append(pureLine)
            csData.append(pureLine)
          for index in pIndicesL2:
            line = self.__pureL2[index]
            line = self.__addLangTags(line, self.__LID[1])
            pureLine = ' '.join(map(lambda x:'_#'.join(x), line))+'\n'
            ##pureFile.write(pureLine)
            ##if splitIndex == 0:
            pureData.append(pureLine)
            csData.append(pureLine)

          # Later splits resample the pairs cached by splitIndex 0.
          if splitIndex != 0:
            random.seed()
            csSample = random.sample(initialSlitCSData, tr)
            for sample in csSample:
              csData.append(sample[0])
              csData.append(sample[1])
              pureData.append(sample[2])
              pureData.append(sample[3])
          else:

            ##for csType in self.__csVariants:
            self.__csHash = set()
            stopLength = tr
            index = -1
            while 1:
              index += 1
              if index == len(self.__parL1):
                ##break
                index = 0
                print "Still:",stopLength," Looping.."
              csLines = []
              csSeqs = []
              # A pair is kept only if it switches in BOTH orders.
              for order in range(2):
              #order = stopLength%2
                self.__csInstance.updateHandler(self.__parL1[index], self.__parL2[index], self.__align[index], order)
                csReturn = self.__csInstance.csSentence(csType)
                csLine = csReturn[0]
                if csLine != -1:
                  csLines.append(csLine)
                  csSeqs.append(csReturn[1])
              if len(csLines) == 2:
                # Strict check: the CS word set must equal the pure word set.
                csWords = set([x[0] for x in csLines[0]])|set([x[0] for x in csLines[1]])
                self.__Tree.updateTree(self.__parL1[index])
                pureLine1 = self.__Tree.wordTags()
                pureLine1 = self.__addLangTags(pureLine1, self.__LID[0])
                self.__Tree.updateTree(self.__parL2[index])
                pureLine2 = self.__Tree.wordTags()
                pureLine2 = self.__addLangTags(pureLine2, self.__LID[1])
                pureWords = set([x[0] for x in pureLine1])|set([x[0] for x in pureLine2])
                if pureWords == csWords:
                  p1 = ' '.join(map(lambda x:'_#'.join(x), pureLine1))+'\n'
                  p2 = ' '.join(map(lambda x:'_#'.join(x), pureLine2))+'\n'
                  pureData.append(p1)
                  pureData.append(p2)
                  cs1 = ' '.join(map(lambda x:'_#'.join(x), csLines[0]))+'\n'
                  csData.append(cs1)
                  cs2 = ' '.join(map(lambda x:'_#'.join(x), csLines[1]))+'\n'
                  csData.append(cs2)
                  if splitIndex == 0:
                    initialSlitCSData.append((cs1,cs2, p1, p2))
                  self.__csHash.add(index)
                  stopLength -= 1
                ##else:
                ##  l1Switch = (0, tuple(csSeqs[0]))
                ##  l2Switch = (1, tuple(csSeqs[1]))
                ##  self.__csInstance.updateBadSwitch(index, l1Switch, l2Switch)
              else:
                continue
              
              if stopLength <= 0:
                break
              
            # Manual checkpoint when tr pairs could not be collected.
            if stopLength > 0:
              print tr, stopLength, "Training Break!!"
              dummy = raw_input()
            
          for csLine in csData:
            dataFile.write(csLine)
          ##if splitIndex == 0:
          for pureLine in pureData:
            pureFile.write(pureLine)
          pureFile.close()
          dataFile.close()

          statusCount += 1
          if statusCount%50 == 0:
            print statusCount,
            sys.stdout.flush()
    print statusCount
  
    
  def __addLangTags(self, WordTags, lTag):
    """Return a copy of WordTags where every (word, tag, ...) entry is a
    fresh list with the language tag lTag appended; the input entries are
    left untouched."""
    return [list(entry) + [lTag] for entry in WordTags]
  
  def __genPosMap(self):
    """Populate self.__posMap (source tag -> universal tag) from both map
    files and collect the source tagsets self.__L1Tags / self.__L2Tags
    plus their intersection self.__commonTags.

    Each map-file line is "<srcTag> <uniTag>".  L2 entries overwrite L1
    entries in __posMap on tag collisions, as before.
    """
    # Reset L1Tags (the original did too); L2Tags is extended in place.
    self.__L1Tags = set()
    for mapFileName, tagSet in ((self.__l1MapFile, self.__L1Tags),
                                (self.__l2MapFile, self.__L2Tags)):
      # 'with' closes the handles the original left open; each file is
      # also read once instead of twice.
      with open(mapFileName) as mapFile:
        for line in mapFile:
          fields = line.split()
          srcTag = fields[0]
          self.__posMap[srcTag] = fields[1]
          tagSet.add(srcTag)
    self.__commonTags = self.__L1Tags & self.__L2Tags
  
  def __map2Uni(self, wordTagsLangs):
    """Return a copy of wordTagsLangs with each POS tag (field 1) mapped to
    its universal tag via self.__posMap; unmapped tags become 'X'.

    Fix: each entry is copied before its tag is rewritten -- the original
    appended the caller's own list objects and mutated them in place,
    clobbering the source tags as a side effect.  dict.get replaces the
    bare except so only a missing key falls back to 'X'.
    """
    newLine = []
    for entry in wordTagsLangs:
      mapped = list(entry)
      mapped[1] = self.__posMap.get(mapped[1], 'X')
      newLine.append(mapped)
    return newLine
  
  def __map2UniControl(self, wordTagsLangs):
    """Like __map2Uni, but the mapped tag is suffixed with the token's
    language ('<uniTag>_<lang>'); unmapped tags become 'X_<lang>'.

    Fix: each entry is copied before its tag is rewritten -- the original
    aliased and mutated the caller's lists in place.  dict.get replaces
    the bare except so only a missing key falls back to 'X'.
    """
    newLine = []
    for entry in wordTagsLangs:
      mapped = list(entry)
      lang = mapped[2]
      mapped[1] = self.__posMap.get(mapped[1], 'X') + '_' + lang
      newLine.append(mapped)
    return newLine
  
  def __genPhraseMap(self):
    """Load the phrase mapping file into self.__phraseMap (a
    defaultdict(list)): each line is "<key> <v1,v2,...>" and the
    comma-separated values are appended under key."""
    # 'with' closes the handle the original leaked.
    with open("/usr0/home/pgadde/Work/CodeSwitching/FrenchEnglish/NewsCommentary/E17/mapping") as phraseMapFile:
      for line in phraseMapFile:
        fields = line.strip().split()
        self.__phraseMap[fields[0]].extend(fields[1].split(","))
    
  def generateData(self):
    """Public entry point: run the currently selected generator.
    Alternative generators / multi-run suffixing are kept commented out."""
    ##for i in range(10):
      ##self.__fileSuffix = "."+str(i)
      #self.__genTrainDataDiverse()
      self.__genTrainDataDupStrict()