def refineML_TopLevelMatch(ml): if PRINT_DEBUG: for m in ml: print "Phrase match\t%s\t-> %s\t\t(%s)\t-> (%s)" % ( m[0].tag(), m[1].tag(), nodeText(m[0]), nodeText( m[1])), topLevelPhrasesMatch(m) return [m for m in ml if topLevelPhrasesMatch(m)]
def linkIdenticalWords(n1, n2, matchList):
    """
    Link the items of two nodes whose words match, even if the node labels
    do not.

    Every item obtained by iterating n1 is compared with every item obtained
    by iterating n2. Words are compared with wordsMatch, which includes
    stemming. Pairs that are already in matchList are not reported again.

    @param n1 Node of the original sentence; must not be a leaf
    @param n2 Node of the highlight sentence; must not be a leaf
    @param matchList Pairs (n1 item, n2 item) that are already matched
    @return List of newly matched (n1 item, n2 item) pairs; empty when
        nothing matches
    """
    assert not n1.isLeaf(), "Original sentence node is a single leaf"
    assert not n2.isLeaf(), "Highlight sentence node is a single leaf"

    # Extract the text of every item once, up front, instead of once per
    # comparison in the nested loop below.
    ch1List = [(ch, nodeText(ch)) for ch in n1]
    ch2List = [(ch, nodeText(ch)) for ch in n2]

    # The loop is written out explicitly so that a recursion error raised
    # while comparing one pair does not abort the remaining comparisons.
    matching = []
    for ch1, txt1 in ch1List:
        for ch2, txt2 in ch2List:
            try:
                if (ch1, ch2) in matchList:
                    continue
                if wordsMatch(txt1, txt2, ch1.tag(), ch2.tag()):
                    matching.append((ch1, ch2))
            except RuntimeError:
                # Recursion problem in matching trees: deliberately
                # best-effort, skip this pair and carry on.
                continue
    return matching
def linkParentNodes(matchList, matchListNoStopList, maxDepth=1):
    """
    Considers pairs of matched nodes in the match list. If two source nodes
    are no further apart than maxDepth (as measured by nodeDistance), their
    common ancestor is matched to the common ancestor of the corresponding
    highlight nodes. A maxDepth=1 allows siblings to be matched.

    @param matchList Existing (source node, highlight node) pairs; each one
        is compared against every pair of matchListNoStopList
    @param matchListNoStopList Pairs used as the second member of each
        comparison (presumably matchList without stop words -- confirm
        against the caller)
    @param maxDepth Largest nodeDistance between two source nodes for which
        their ancestors are linked
    @return List of (source ancestor, highlight ancestor) pairs that are
        neither in matchList nor already in the result
    """
    matchingParentNodes = []

    # NOTE(review): this function was recovered from a whitespace-collapsed
    # source, so the nesting of the diagnostic section below is a best
    # guess -- confirm that all of it belongs under PRINT_DEBUG.
    if PRINT_DEBUG:
        print "\nInvestigating node distances"
        # The first match is used as the reference for the root checks.
        assert len(matchList) > 0, "No matches to work with"
        srcroot = matchList[0][0].root()
        hiroot = matchList[0][1].root()
        print "Roots: ", id(srcroot), "---", id(hiroot)
        # Sanity check: report any match whose node hangs off a different
        # root object (compared by identity) than the first match.
        for ms, mh in matchList:
            print id(ms.root()), "---", id(mh.root())
            if not id(ms.root()) == id(srcroot):
                print "src!!!"
            if not id(mh.root()) == id(hiroot):
                print "hi!!!"
                print "Original tree:"
                print nodeText(hiroot)
                print hiroot
                print "New tree involving:", mh
                print nodeText(mh.root())
                print mh.root()

    for (s1, h1) in matchList:  # was matchListNoStopList
        # for (s2, h2) in matchListNoStopList[i+1:]:
        for (s2, h2) in matchListNoStopList:
            # Only the distance between the source nodes decides whether the
            # pair is considered; the highlight distance is not tested.
            d = nodeDistance(s1, s2)
            if PRINT_DEBUG:
                # The trailing comma suppresses the newline; the bare print
                # in the debug block further down ends the line.
                print "nodeDistance: ", d, "\t\t", s1.treeposition(
                ), "---", s2.treeposition(
                ), "\t; highlights\t", h1.treeposition(
                ), "---", h2.treeposition(),
            if d <= maxDepth:
                # The result of commonAncestor is used to index the root
                # (presumably a tree position -- confirm).
                ca1 = s1.root()[commonAncestor(s1, s2)]
                ca2 = h1.root()[commonAncestor(h1, h2)]
                # Skip ancestor pairs that are already known.
                if (ca1, ca2) in matchList:
                    continue
                if (ca1, ca2) in matchingParentNodes:
                    continue
                if PRINT_DEBUG:
                    print
                    print "Original: ", nodeText(s1), " --- ", nodeText(
                        s2), "\tdistance ", d
                    print "Highlight: ", nodeText(h1), " --- ", nodeText(
                        h2), nodeDistance(h1, h2)
                    print "Ancestors in match list? ", (ca1,
                                                        ca2) in matchList
                    print "Original parent phrase: ", nodeText(ca1)
                    print "Highlight parent phrase:", nodeText(ca2)
                matchingParentNodes.append((ca1, ca2))
    return matchingParentNodes
def _refineML_RemoveMissedProperNounsTest(m): """ Return True if the match conveys any NP present in the source tree """ if m[0].isLeaf() or m[1].isLeaf(): return True # looking for S where NP isn't included # if m[0].tag()=="S" and m[1].tag(): srcTags = [ch.tag() for ch in m[0]] tgtTags = [ch.tag() for ch in m[1]] if "NP" not in srcTags and "NP" in tgtTags: # TODO: check that NP actually contains proper nouns as children if PRINT_DEBUG: # print "Missing NP" print nodeText(m[0]) print nodeText(m[1]) #raise SystemExit return False return True
def printAllMatchListInfo(matchList): print "\nMatch list:" for (ph1, ph2) in matchList: print ph1.treeposition(), "---", ph2.treeposition(), "\t\t", ph1.tag( ), nodeText(ph1), " --- ", ph2.tag(), nodeText(ph2) print