class CSHandler:
    """Select a single code-switch span for an aligned sentence pair.

    The handler keeps one ``Parsetree`` per language and a word alignment
    between the two sentences.  ``csSentence`` draws a random candidate span
    of the L1 sentence that satisfies the requested structural constraint
    and hands it to ``Utils.makeCSSentence``.

    Failure is signalled with the sentinel ``-1`` instead of an exception.
    """

    # A sampler gives up on the iteration whose 1-based number equals the
    # limit, so at most ``limit - 1`` candidates are actually examined.
    __TRY_LIMIT = 100
    __SAME_CONST_TRY_LIMIT = 500

    def __init__(self):
        self.__L1Tree = Parsetree()
        self.__L2Tree = Parsetree()
        self.__align = {}       # L1 word index -> list of aligned L2 indices
        self.__utils = Utils()
        self.__phraseMap = {}   # L1 phrase tag -> collection of matching L2 tags

    def updatePhraseMap(self, phraseMap):
        """Replace the phrase-tag equivalence map used by csType 3."""
        self.__phraseMap = phraseMap

    def updateHandler(self, l1Sent, l2Sent, alignLine, l1Index):
        """Load a new sentence pair and its alignment.

        ``l1Index`` (0 or 1) is the field of every ``a-b`` alignment pair
        that is used as the key.  When it is truthy the two sentences also
        swap roles, i.e. ``l2Sent`` is loaded into the L1 tree.
        """
        l2Index = 1 - l1Index
        if l1Index:
            firstSent, secondSent = l2Sent, l1Sent
        else:
            firstSent, secondSent = l1Sent, l2Sent
        self.__L1Tree.updateTree(firstSent)
        self.__L2Tree.updateTree(secondSent)
        self.__align = self.__parseAlign(alignLine, l1Index, l2Index)

    def __parseAlign(self, alignLine, l1Index, l2Index):
        """Parse ``"0-1 0-2 3-4"`` into ``{key: [values, ...]}``.

        Each whitespace-separated token is split on ``-``; the field at
        ``l1Index`` is the key and the field at ``l2Index`` is appended to
        that key's list, preserving the order of appearance.
        """
        align = {}
        for pair in alignLine.split():
            fields = pair.split("-")
            align.setdefault(int(fields[l1Index]), []).append(int(fields[l2Index]))
        return align

    def csSentence(self, csType):
        """Build a code-switched sentence for the current pair.

        Returns ``[csSentence, sequence]`` on success and ``[-1, []]`` when
        no acceptable span was found (or ``csType`` is not supported).
        """
        validSequences = self.__utils.validSequences(self.__L1Tree.sentLen())
        # Assumption that a sentence will have a single code switch.
        sequence = self.__selectSequence(validSequences, csType)
        if sequence == -1:
            return [-1, []]
        csSentence = self.__utils.makeCSSentence(
            self.__L1Tree.wordTags(), sequence,
            self.__L2Tree.wordTags(), self.__align)
        return [csSentence, sequence]

    ## Assumptions:
    ## There is always a single code switch
    ## The selection among the valid candidate sequences is random
    def __selectSequence(self, validSequences, csType):
        """Dispatch on ``csType`` (0-3); unknown types yield ``-1``."""
        if csType == 0:
            return self.__seqL1SeqL2Cont(validSequences)
        if csType == 1:
            return self.__seqL1ConstL2Cont(validSequences)
        if csType == 2:
            return self.__seqL1ConstL2Const(validSequences)
        if csType == 3:
            return self.__seqL1ConstL2SameConst(validSequences)
        # Previously fell through and returned None, which slipped past the
        # ``== -1`` failure check in csSentence.
        return -1

    def __drawSequence(self, sequences, accept, limit):
        """Rejection-sample one L1 span from ``sequences``.

        ``accept(l1Sequence, l2Sequence)`` decides whether a candidate and
        its aligned L2 span are usable.  Returns the accepted L1 span, or
        ``-1`` when ``sequences`` is empty or ``limit - 1`` draws all failed.
        """
        random.seed()
        if len(sequences) == 0:
            # random.randrange(0) would raise ValueError.
            return -1
        for _ in range(limit - 1):
            l1Sequence = sequences[random.randrange(len(sequences))]
            l2Sequence = self.__utils.l2Sequence(l1Sequence, self.__align)
            if accept(l1Sequence, l2Sequence):
                return l1Sequence
        return -1

    def __acceptL2Cont(self, l1Sequence, l2Sequence):
        """Any L1 span whose aligned L2 span is contiguous."""
        return self.__L2Tree.isContiguous(l2Sequence)

    def __acceptL1ConstL2Cont(self, l1Sequence, l2Sequence):
        """L1 span is a constituent and the L2 span is contiguous."""
        return (self.__L1Tree.isConstituent(l1Sequence)
                and self.__L2Tree.isContiguous(l2Sequence))

    def __acceptL1ConstL2Const(self, l1Sequence, l2Sequence):
        """Both spans are constituents of their trees."""
        return (self.__L1Tree.isConstituent(l1Sequence)
                and self.__L2Tree.isConstituent(l2Sequence))

    def __acceptSameConst(self, l1Sequence, l2Sequence):
        """Both spans are constituents carrying equivalent phrase tags."""
        if not self.__acceptL1ConstL2Const(l1Sequence, l2Sequence):
            return False
        if len(l1Sequence) == 0 or len(l2Sequence) == 0:
            # No first word to read the phrase tag from.
            return False
        l1PhraseTag = self.__L1Tree.getPhrase(l1Sequence[0])
        l2PhraseTag = self.__L2Tree.getPhrase(l2Sequence[0])
        ## Both the phrases are same, for dual structure principle
        if l1PhraseTag == l2PhraseTag:
            return True
        # A tag missing from the map simply has no equivalents.
        return l2PhraseTag in self.__phraseMap.get(l1PhraseTag, ())

    def __seqL1SeqL2Cont(self, sequences):
        return self.__drawSequence(
            sequences, self.__acceptL2Cont, self.__TRY_LIMIT)

    def __seqL1ConstL2Cont(self, sequences):
        return self.__drawSequence(
            sequences, self.__acceptL1ConstL2Cont, self.__TRY_LIMIT)

    def __seqL1ConstL2Const(self, sequences):
        return self.__drawSequence(
            sequences, self.__acceptL1ConstL2Const, self.__TRY_LIMIT)

    def __seqL1ConstL2SameConst(self, sequences):
        return self.__drawSequence(
            sequences, self.__acceptSameConst, self.__SAME_CONST_TRY_LIMIT)
class CSHandler:
    """Select a single code-switch span using dependency trees.

    The handler keeps one ``Dependencytree`` per language, the forward and
    reverse word alignment, and the language-ID tags of both languages.
    ``csSentence`` draws a random span of the L1 sentence that satisfies the
    requested constraint and hands it to ``Utils.makeCSSentence``.

    Failure is signalled with the sentinel ``-1`` instead of an exception.
    """

    # Upper bound on random draws per selection for the span samplers.
    __MAX_TRIES = 101
    # Retry limits of the validSequences-based samplers: they give up on the
    # iteration whose 1-based number equals the limit.
    __TRY_LIMIT = 100
    __SAME_CONST_TRY_LIMIT = 500

    def __init__(self):
        self.__L1 = ""
        self.__L2 = ""
        self.__curL1 = ""
        self.__curL2 = ""
        self.__L1Tree = Dependencytree()
        self.__L2Tree = Dependencytree()
        self.__align = {}       # L1 word index -> list of aligned L2 indices
        self.__revAlign = {}    # L2 word index -> list of aligned L1 indices
        self.__utils = Utils()
        self.__phraseMap = {}
        self.__l1Index = 0
        self.__clausalChunks = ["CCP","VGF", "NULL__CCP","NULL__VGF"]
        # Created here as well so updateBadSwitch works before the first
        # updateHandler call.
        self.__csHash = dd(set)

    def updatePhraseMap(self, phraseMap):
        """Replace the phrase-tag equivalence map."""
        self.__phraseMap = phraseMap

    def updateLIDTags(self, L1, L2):
        """Set the language-ID tags passed on to ``makeCSSentence``."""
        self.__L1 = L1
        self.__L2 = L2

    def updateHandler(self, l1Sent, l2Sent, alignLine, l1Index):
        """Load a new sentence pair and its alignment.

        ``l1Index`` (0 or 1) is the field of every ``a-b`` alignment pair
        used as the forward-alignment key.  When it is truthy the sentences
        and the language-ID tags swap roles.  The bad-switch table is reset.
        """
        l2Index = 1 - l1Index
        self.__l1Index = l1Index
        if l1Index:
            self.__curL1 = self.__L2
            self.__curL2 = self.__L1
            self.__L1Tree.updateTree(l2Sent)
            self.__L2Tree.updateTree(l1Sent)
        else:
            self.__curL1 = self.__L1
            self.__curL2 = self.__L2
            self.__L1Tree.updateTree(l1Sent)
            self.__L2Tree.updateTree(l2Sent)
        self.__align = self.__parseAlign(alignLine, l1Index, l2Index)
        self.__revAlign = self.__parseAlign(alignLine, l2Index, l1Index)
        self.__csHash = dd(set)

    def updateBadSwitch(self, index, l1Switch, l2Switch):
        """Record both switch values under ``index`` in the bad-switch table."""
        self.__csHash[index].add(l1Switch)
        self.__csHash[index].add(l2Switch)

    def __parseAlign(self, alignLine, l1Index, l2Index):
        """Parse ``"0-1 0-2 3-4"`` into ``{key: [values, ...]}``.

        Each whitespace-separated token is split on ``-``; the field at
        ``l1Index`` is the key and the field at ``l2Index`` is appended to
        that key's list, preserving the order of appearance.
        """
        align = {}
        for pair in alignLine.split():
            fields = pair.split("-")
            align.setdefault(int(fields[l1Index]), []).append(int(fields[l2Index]))
        return align

    def csSentence(self, csType):
        """Build a code-switched sentence for the current pair.

        Returns ``[csSentence, sequence]`` on success and ``[-1, []]`` when
        no acceptable span was found (or ``csType`` is not supported).
        """
        validSequences = self.__utils.validSequences(self.__L1Tree.sentLen())
        # Assumption that a sentence will have a single code switch.
        sequence = self.__selectSequence(validSequences, csType)
        if sequence == -1:
            return [-1, []]
        csSentence = self.__utils.makeCSSentence(
            self.__L1Tree.wordTags(), sequence,
            self.__L2Tree.wordTags(), self.__align,
            self.__curL1, self.__curL2)
        return [csSentence, sequence]

    ## Assumptions:
    ## There is always a single code switch
    ## The selection among the valid candidate sequences is random
    def __selectSequence(self, validSequences, csType):
        """Dispatch on ``csType`` (0-4); unknown types yield ``-1``."""
        if csType == 0:
            return self.__random(validSequences)
        if csType == 1:
            return self.__seqL1SeqL2Cont(validSequences)
        if csType == 2:
            return self.__seqL1Const()
        if csType == 3:
            return self.__seqL1ConstL2Const()
        if csType == 4:
            return self.__seqHindiClausal()
        # Previously fell through and returned None, which slipped past the
        # ``== -1`` failure check in csSentence.
        return -1

    def __checkHindiClause(self, sequence, OBJ):
        """Return True when ``sequence`` is headed by a clausal chunk.

        ``OBJ`` is the tree the span belongs to.  The span must be a key of
        ``OBJ.heads``; its head must carry one of the clausal chunk tags,
        must not be a chunk subtree, and must have no NP child.  Any error
        raised while inspecting the tree counts as "not a clause".
        """
        key = frozenset(sequence)
        if key not in OBJ.heads:
            return False
        try:
            head = OBJ.heads[key]
            return (not OBJ.isChunkSubtree(head[1], sequence)
                    and head[0] in self.__clausalChunks
                    and not OBJ.hasNPChild(head[1]))
        except Exception:
            # Best effort, but no longer swallows KeyboardInterrupt/SystemExit
            # the way the former bare ``except`` did.
            return False

    def __seqHindiClausal(self):
        """Pick a Hindi clausal subtree and return the matching L1 span.

        Hindi is L2 when ``l1Index`` is truthy, otherwise it is L1.  Spans
        that cover a whole sentence on either side, or have no aligned
        counterpart, are rejected.  Returns ``-1`` after ``__MAX_TRIES``
        unsuccessful draws or when the Hindi tree has no subtrees.
        """
        hindiIsL2 = bool(self.__l1Index)
        if hindiIsL2:
            hindiTree, otherTree = self.__L2Tree, self.__L1Tree
            alignment = self.__revAlign
        else:
            hindiTree, otherTree = self.__L1Tree, self.__L2Tree
            alignment = self.__align
        # random.sample() on a set is rejected since Python 3.11, so draw
        # from a list copy instead.
        pool = list(hindiTree.subtrees)
        if len(pool) == 0:
            return -1
        # Bounded loop: the former ``continue`` branches bypassed the retry
        # limit and could spin forever.
        for _ in range(self.__MAX_TRIES):
            hindiSequence = list(random.choice(pool))
            if len(hindiSequence) == hindiTree.sentLen():
                continue
            otherSequence = self.__utils.l2Sequence(hindiSequence, alignment)
            if len(otherSequence) == otherTree.sentLen():
                continue
            if (len(hindiSequence) > 0 and len(otherSequence) > 0
                    and self.__checkHindiClause(hindiSequence, hindiTree)):
                return otherSequence if hindiIsL2 else hindiSequence
        return -1

    def __random(self, sequences):
        """Draw a random span, preferring one with a non-empty L2 side.

        Unlike ``__randomStrict`` the last draw is returned even when its
        aligned L2 span is empty.  Returns ``-1`` for no candidates.
        """
        random.seed()
        pool = list(sequences)
        if len(pool) == 0:
            return -1
        l1Sequence = -1
        for _ in range(self.__MAX_TRIES):
            l1Sequence = random.choice(pool)
            if len(self.__utils.l2Sequence(l1Sequence, self.__align)) > 0:
                break
        return l1Sequence

    def __randomStrict(self, sequences):
        """Draw a random span whose aligned L2 span is non-empty, else -1."""
        random.seed()
        pool = list(sequences)
        if len(pool) == 0:
            return -1
        for _ in range(self.__MAX_TRIES):
            l1Sequence = random.choice(pool)
            if len(self.__utils.l2Sequence(l1Sequence, self.__align)) > 0:
                return l1Sequence
        return -1

    def __drawSpan(self, spans, accept):
        """Rejection-sample one L1 span from a collection of index sets.

        Spans covering the whole L1 sentence are skipped.  ``accept`` gets
        the aligned L2 span.  The accepted L1 span is returned as a sorted
        list; ``-1`` is returned when ``spans`` is empty or no draw passed.
        """
        pool = list(spans)
        if len(pool) == 0:
            return -1
        # Bounded loop: skipping a full-sentence span used to bypass the
        # retry limit, so a tree with only such spans never terminated.
        for _ in range(self.__MAX_TRIES):
            l1Sequence = list(random.choice(pool))
            if len(l1Sequence) == self.__L1Tree.sentLen():
                continue
            l2Sequence = self.__utils.l2Sequence(l1Sequence, self.__align)
            if accept(l2Sequence):
                l1Sequence.sort()
                return l1Sequence
        return -1

    def __seqL1Const(self):
        """L1 subtree with at least one aligned L2 word."""
        random.seed()
        return self.__drawSpan(
            self.__L1Tree.subtrees, lambda l2Sequence: len(l2Sequence) > 0)

    def __seqL1LWG(self):
        """L1 local word group with at least one aligned L2 word."""
        random.seed()
        return self.__drawSpan(
            self.__L1Tree.LWGs(), lambda l2Sequence: len(l2Sequence) > 0)

    def __seqL1ConstL2Const(self):
        """L1 subtree whose aligned L2 span is itself an L2 subtree."""
        random.seed()
        return self.__drawSpan(
            self.__L1Tree.subtrees,
            lambda l2Sequence: frozenset(l2Sequence) in self.__L2Tree.subtrees)

    def __seqL1LWGL2LWG(self):
        """L1 word group whose aligned L2 span is an L2 word group."""
        random.seed()
        return self.__drawSpan(
            self.__L1Tree.LWGs(),
            lambda l2Sequence: frozenset(l2Sequence) in self.__L2Tree.LWGs())

    def __drawSequence(self, sequences, accept, limit):
        """Rejection-sample one L1 span from an indexable ``sequences``.

        ``accept(l1Sequence, l2Sequence)`` decides whether a candidate and
        its aligned L2 span are usable.  Returns the accepted L1 span, or
        ``-1`` when ``sequences`` is empty or ``limit - 1`` draws all failed.
        """
        random.seed()
        if len(sequences) == 0:
            # random.randrange(0) would raise ValueError.
            return -1
        for _ in range(limit - 1):
            l1Sequence = sequences[random.randrange(len(sequences))]
            l2Sequence = self.__utils.l2Sequence(l1Sequence, self.__align)
            if accept(l1Sequence, l2Sequence):
                return l1Sequence
        return -1

    def __acceptL1ConstL2Cont(self, l1Sequence, l2Sequence):
        """L1 span is a constituent and the L2 span is contiguous."""
        return (self.__L1Tree.isConstituent(l1Sequence)
                and self.__L2Tree.isContiguous(l2Sequence))

    def __acceptSameConst(self, l1Sequence, l2Sequence):
        """Both spans are constituents carrying equivalent phrase tags."""
        if not (self.__L1Tree.isConstituent(l1Sequence)
                and self.__L2Tree.isConstituent(l2Sequence)):
            return False
        if len(l1Sequence) == 0 or len(l2Sequence) == 0:
            # No first word to read the phrase tag from.
            return False
        l1PhraseTag = self.__L1Tree.getPhrase(l1Sequence[0])
        l2PhraseTag = self.__L2Tree.getPhrase(l2Sequence[0])
        ## Both the phrases are same, for dual structure principle
        if l1PhraseTag == l2PhraseTag:
            return True
        # A tag missing from the map simply has no equivalents.
        return l2PhraseTag in self.__phraseMap.get(l1PhraseTag, ())

    def __seqL1SeqL2Cont(self, sequences):
        """Any L1 span whose aligned L2 span is contiguous."""
        return self.__drawSequence(
            sequences,
            lambda l1Sequence, l2Sequence: self.__L2Tree.isContiguous(l2Sequence),
            self.__TRY_LIMIT)

    def __seqL1ConstL2Cont(self, sequences):
        return self.__drawSequence(
            sequences, self.__acceptL1ConstL2Cont, self.__TRY_LIMIT)

    def __seqL1ConstL2SameConst(self, sequences):
        return self.__drawSequence(
            sequences, self.__acceptSameConst, self.__SAME_CONST_TRY_LIMIT)