Exemplo n.º 1
0
class CSHandler:
  def __init__(self):
    """Create a handler with empty tags, trees, alignments and switch table.

    Everything set here is placeholder state: ``updateLIDTags``,
    ``updatePhraseMap`` and ``updateHandler`` overwrite it.
    """
    # Function-scope import so this method does not depend on a module alias.
    from collections import defaultdict

    # Language-ID tags as supplied through updateLIDTags.
    self.__L1 = ""
    self.__L2 = ""
    # Tags for the current sentence pair; updateHandler may assign them swapped.
    self.__curL1 = ""
    self.__curL2 = ""
    self.__L1Tree = Dependencytree()
    self.__L2Tree = Dependencytree()
    # Alignment in both directions, filled by updateHandler.
    self.__align = {}
    self.__revAlign = {}
    self.__utils = Utils()
    self.__phraseMap = {}
    self.__l1Index = 0
    # Head tags accepted by the clausal check in __checkHindiClause.
    self.__clausalChunks = ["CCP","VGF", "NULL__CCP","NULL__VGF"]
    # Previously this attribute was only created by updateHandler, so calling
    # updateBadSwitch on a fresh handler raised AttributeError.
    self.__csHash = defaultdict(set)
  
  def updatePhraseMap(self, phraseMap):
    self.__phraseMap = phraseMap
    
  def updateLIDTags(self, L1, L2):
    self.__L1 = L1
    self.__L2 = L2
  
  def updateHandler(self, l1Sent, l2Sent, alignLine, l1Index):
    #sys.stderr.write(alignLine+'\n')
    l2Index = 1-l1Index
    self.__l1Index = l1Index
    if l1Index:
      self.__curL1 = self.__L2
      self.__curL2 = self.__L1
      self.__L1Tree.updateTree(l2Sent)
      self.__L2Tree.updateTree(l1Sent)
    else:
      self.__curL1 = self.__L1
      self.__curL2 = self.__L2
      self.__L1Tree.updateTree(l1Sent)
      self.__L2Tree.updateTree(l2Sent)
      
    self.__align = self.__parseAlign(alignLine, l1Index, l2Index)
    self.__revAlign = self.__parseAlign(alignLine, l2Index, l1Index)
    self.__csHash = dd(lambda:set())
    
  def updateBadSwitch(self, index, l1Switch, l2Switch):
    self.__csHash[index].add(l1Switch)
    self.__csHash[index].add(l2Switch)
    
  def __parseAlign(self, alignLine, l1Index, l2Index):
    align = {}
    for i in alignLine.split():
      key = int(i.split("-")[l1Index])
      value = int(i.split("-")[l2Index])
      if key in align.keys():
        align[key].append(value)
      else:
        align[key] = [value]
    return align
    
  def csSentence(self, csType):
    validSequences = self.__utils.validSequences(self.__L1Tree.sentLen())
    # Debugging !!
    #sys.stderr.write("L1SeqL2Cont Valid Sequences:\n")
    #for sequence in validSequences:
    #    sys.stderr.write(str(sequence)+"\n")
    #dummy=raw_input()
    ###############
    # Assumption that a sentence will have a single code switch.
    sequence = self.__selectSequence(validSequences, csType) 
    # Debugging !!
    ##sys.stderr.write("Selected Sequence: "+str(sequence)+"\n")
    ##sys.stderr.write("Align:"+str(self.__align)+'\n')
    #sys.stderr.write(l1Parse)
    #sys.stderr.write(l2Parse)
    #dummy=raw_input()
    if sequence == -1:
      return [-1,[]]
    ##print "Sequence:", sequence
    csSentence = self.__utils.makeCSSentence(self.__L1Tree.wordTags(), sequence, self.__L2Tree.wordTags(), self.__align, self.__curL1, self.__curL2)
    return [csSentence,sequence]
    
  ## Assumptions:
  ## There is always a single code switch
  ## The selection among the valid candidate sequences is random
  def __selectSequence(self, validSequences, csType):
    if csType == 0:
      return self.__random(validSequences)
    elif csType == 1:
      return self.__seqL1SeqL2Cont(validSequences)
    elif csType == 2:
      return self.__seqL1Const()
    elif csType == 4:
      return self.__seqHindiClausal()
    elif csType == 3:
      return self.__seqL1ConstL2Const()
    
  def __checkHindiClause(self, sequence, OBJ):
    if frozenset(sequence) not in OBJ.heads:
      return False
    try:
      head = OBJ.heads[frozenset(sequence)]
      #print "Head:", head
      if not OBJ.isChunkSubtree(head[1], sequence) and head[0] in self.__clausalChunks and not OBJ.hasNPChild(head[1]):
        return True
    except:
      pass
    return False
    
  def __seqHindiClausal(self):
    l1Sequence = -1
    if self.__l1Index: ## Hindi is L2
      subtrees = self.__L2Tree.subtrees
      if len(subtrees) == 0:
        return -1
      count = 0
      while 1:
        l2Sequence = list(random.sample(subtrees,1)[0])
        l1Sequence = self.__utils.l2Sequence(l2Sequence, self.__revAlign)
        if len(l1Sequence) == self.__L1Tree.sentLen():
          count += 1
          continue
        if len(l2Sequence) == self.__L2Tree.sentLen():
          count += 1
          continue
        #if (frozenset(l2Sequence) in self.__L2Tree.heads and self.__L2Tree.heads[frozenset(l2Sequence)] in self.__clausalChunks and len(l1Sequence) > 0 )or count > 100:
        if len(l2Sequence)>0 and len(l1Sequence)>0 and self.__checkHindiClause(l2Sequence, self.__L2Tree):
          #if len(l1Sequence) == 0:
          #  print "l1Seq:",l1Sequence
          #  print "subtrees:", subtrees
          #  print "Count:", count
          #  dummy = raw_input(
          return l1Sequence
        count += 1
        if count > 100:
            break
    else: ## Hin is L1
      subtrees = self.__L1Tree.subtrees
      if len(subtrees) == 0:
        return -1
      count = 0
      while 1:
        l1Sequence = list(random.sample(subtrees,1)[0])
        if len(l1Sequence) == self.__L1Tree.sentLen():
          count += 1
          continue
        l2Sequence = self.__utils.l2Sequence(l1Sequence, self.__align)
        if len(l2Sequence) == self.__L2Tree.sentLen():
          count += 1
          continue
        #if (frozenset(l1Sequence) in self.__L1Tree.heads and self.__L1Tree.heads[frozenset(l1Sequence)] in self.__clausalChunks and len(l2Sequence) > 0 )or count > 100:
        if len(l1Sequence)>0 and len(l2Sequence)>0 and self.__checkHindiClause(l1Sequence, self.__L1Tree):
          return l1Sequence 
        count += 1
        if count > 100:
            break
    return -1
    
  def __random(self, sequences):
    random.seed()
    l1Sequence = -1
    count = 0
    while 1:
      l1Sequence = random.sample(sequences, 1)[0]
      l2Sequence = self.__utils.l2Sequence(l1Sequence, self.__align)
      if len(l2Sequence) > 0 or count > 100:
        break
      count += 1
    return l1Sequence
  
  def __randomStrict(self, sequences):
    random.seed()
    l1Sequence = -1
    count = 0
    while 1:
      l1Sequence = random.sample(sequences, 1)[0]
      l2Sequence = self.__utils.l2Sequence(l1Sequence, self.__align)
      if len(l2Sequence) > 0:
        return l1Sequence
      count += 1
      if count > 100:
        break
    return -1
  
  def __seqL1SeqL2Cont(self, sequences):
    random.seed()
    sequenceIndex = -1
    l1Sequence = -1
    count = 0
    while 1:
      count += 1
      if count%100 == 0:
        return -1
        sys.stderr.write("L1SeqL2Cont"+"InfLoop:"+str(count)+"\n")
      sequenceIndex = random.randrange(len(sequences))
      l1Sequence = sequences[sequenceIndex]
      l2Sequence = self.__utils.l2Sequence(l1Sequence, self.__align)
      if self.__L2Tree.isContiguous(l2Sequence):
        # Debugging !!
        #sys.stderr.write("Alignment: "+str(align)+"\n")
        #sys.stderr.write("Contiguous l2Sequence: "+str(l2Sequence)+"\n")
        ##############
        break
    return l1Sequence
 
  def __seqL1Const(self):
    random.seed()
    subtrees = self.__L1Tree.subtrees
    l1Sequence = -1
    if len(subtrees)>0:
      count = 0
      while 1:
        l1Sequence = list(random.sample(subtrees,1)[0])
        if len(l1Sequence) == self.__L1Tree.sentLen():
          count += 1
          continue
        l2Sequence = self.__utils.l2Sequence(l1Sequence, self.__align)
        if len(l2Sequence) > 0:
          l1Sequence.sort()
          return l1Sequence
        count += 1
        if count > 100:
            break
    return -1
  
  def __seqL1LWG(self):
    random.seed()
    LWGs = self.__L1Tree.LWGs()
    l1Sequence = -1
    if len(LWGs)>0:
      count = 0
      while 1:
        l1Sequence = list(random.sample(LWGs,1)[0])
        if len(l1Sequence) == self.__L1Tree.sentLen():
          count += 1
          continue
        l2Sequence = self.__utils.l2Sequence(l1Sequence, self.__align)
        if len(l2Sequence) > 0:
          l1Sequence.sort()
          return l1Sequence
        count += 1
        if count > 100:
            break
    return -1
 
  def __seqL1ConstL2Const(self):
    random.seed()
    subtrees = self.__L1Tree.subtrees
    l1Sequence = -1
    if len(subtrees)>0:
      count = 0
      while 1:
        l1Sequence = list(random.sample(subtrees,1)[0])
        if len(l1Sequence) == self.__L1Tree.sentLen():
          count += 1
          continue
        l2Sequence = self.__utils.l2Sequence(l1Sequence, self.__align)
        if frozenset(l2Sequence) in self.__L2Tree.subtrees:
          l1Sequence.sort()
          return l1Sequence
        else:
          ##print "L1 Sentence:",self.__L1Tree.word_tags()
          ##print "L2 Sentence:",self.__L2Tree.word_tags()
          ##print "L1 Sequence:",l1Sequence
          ##print "L2 Sequence:", l2Sequence
          ##dummy = raw_input()
          dummy = 1
          
        count += 1
        if count > 100:
            break
    return -1
  
  def __seqL1LWGL2LWG(self):
    random.seed()
    LWGs = self.__L1Tree.LWGs()
    l1Sequence = -1
    if len(LWGs)>0:
      count = 0
      while 1:
        l1Sequence = list(random.sample(LWGs,1)[0])
        if len(l1Sequence) == self.__L1Tree.sentLen():
          count += 1
          continue
        l2Sequence = self.__utils.l2Sequence(l1Sequence, self.__align)
        if frozenset(l2Sequence) in self.__L2Tree.LWGs():
          l1Sequence.sort()
          return l1Sequence
        else:
          ##print "L1 Sentence:",self.__L1Tree.word_tags()
          ##print "L2 Sentence:",self.__L2Tree.word_tags()
          ##print "L1 Sequence:",l1Sequence
          ##print "L2 Sequence:", l2Sequence
          ##dummy = raw_input()
          dummy = 1
        count += 1
        if count > 100:
            break
    return -1
  
  def __seqL1ConstL2Cont(self, sequences):
    random.seed()
    sequenceIndex = -1
    l1Sequence = -1
    count = 0
    while 1:
      count += 1
      if count%100 == 0:
        return -1
        sys.stderr.write("L1Const2Cont"+"InfLoop:"+str(count)+"\n")
      sequenceIndex = random.randrange(len(sequences))
      l1Sequence = sequences[sequenceIndex]
      l2Sequence = self.__utils.l2Sequence(l1Sequence, self.__align)
      if self.__L1Tree.isConstituent(l1Sequence):
        if self.__L2Tree.isContiguous(l2Sequence):
          # Debugging !!
          #sys.stderr.write("Alignment: "+str(align)+"\n")
          #sys.stderr.write("Contiguous l2Sequence: "+str(l2Sequence)+"\n")
          ##############
          break
    return l1Sequence
  
  def __seqL1ConstL2SameConst(self, sequences):
    random.seed()
    sequenceIndex = -1
    l1Sequence =-1
    count = 0
    while 1:
      count += 1
      if count%500 == 0:
        return -1
        sys.stderr.write("L1Const2SameConst"+"InfLoop:"+str(count)+"\n")
      sequenceIndex = random.randrange(len(sequences))
      l1Sequence = sequences[sequenceIndex]
      l2Sequence = self.__utils.l2Sequence(l1Sequence, self.__align)
      if self.__L1Tree.isConstituent(l1Sequence):
        if self.__L2Tree.isConstituent(l2Sequence):
          l1PhraseTag = self.__L1Tree.getPhrase(l1Sequence[0])
          l2PhraseTag = self.__L2Tree.getPhrase(l2Sequence[0])
          ## Both the phrases are same, for dual structure principle
          if l1PhraseTag == l2PhraseTag or l2PhraseTag in self.__phraseMap[l1PhraseTag]:
            # Debugging !!
            #sys.stderr.write("Alignment: "+str(align)+"\n")
            #sys.stderr.write("L2Sequence: "+str(l2Sequence)+" Same Const: "+l1PhraseTag+"\n")
            ##############
            break
    return l1Sequence