def findmatch(node, start, maxDist, minMatch):
    """Find children of ``node`` that resemble the child at index ``start``.

    The child at ``start`` is the reference.  Children after it are compared
    with the reference one position at a time until the first one whose
    distance is within ``maxDist``.  From then on the scan advances by the
    gap between that first match and the reference, so only children at
    that spacing are compared.  A child that is too distant is simply
    skipped; it does not end the scan.

    The similarity of two children is ``simpleTreeMatching(ref, child)``
    divided by the larger of their ``tags`` lengths (never less than 1);
    the distance is one minus that similarity.
    NOTE(review): ``simpleTreeMatching`` and the ``__debug`` flag are
    defined outside this block; the score is presumably never larger than
    the bigger ``tags`` length -- confirm.

    Args:
        node: object with an indexable ``childNodes`` sequence whose items
            expose a ``tags`` attribute supporting ``len()``.
        start: index of the reference child.
        maxDist: largest distance that still counts as a match.
        minMatch: number of matches after which the scan returns early.

    Returns:
        A ``(matchId, matchCount)`` tuple: the index of the first matching
        child (``False`` if nothing matched) and the number of matches
        counted before the scan ended.
    """
    children = node.childNodes
    ref = children[start]
    matchId = False  # stays False until the first match is found
    matchCount = 0
    j = start
    while True:
        # Step by one until something matched, then by the gap between the
        # first match and the reference.
        j += (matchId - start) if matchId else 1
        if j >= len(children):
            break
        candidate = children[j]
        if __debug:
            # Single-argument print() produces the same text on Python 2
            # and Python 3.
            print("%s %s" % (start, ref.tags))
            print("%s %s" % (j, candidate.tags))
        d = float(simpleTreeMatching(ref, candidate)) / max(
            len(ref.tags), len(candidate.tags), 1)
        if 1 - d <= maxDist:
            matchCount += 1
            if not matchId:
                matchId = j
            if matchCount >= minMatch:
                return matchId, matchCount
    return matchId, matchCount
def match2(node, maxDist=0, height=3, tags=False, printtag=False):
    """Label runs of consecutive, mutually similar children of ``node``.

    Every child except the first is compared with the child directly before
    it, unless its own ``height`` is below ``height``, in which case the
    comparison is skipped.  A skipped child still becomes the predecessor
    for the next comparison.  When the distance between a child and its
    predecessor is within ``maxDist`` both receive the current run label;
    a skip or a mismatch moves on to the next label.

    The similarity of two children is ``simpleTreeMatching(prev, cur)``
    divided by the larger of their ``tags`` lengths (never less than 1);
    the distance is one minus that similarity.
    NOTE(review): ``simpleTreeMatching`` and the ``__debug`` flag are
    defined outside this block.

    Args:
        node: object with a ``childNodes`` sequence whose items expose
            ``height`` and ``tags`` (and ``tag`` when tracing is on).
        maxDist: largest distance that still counts as a match.
        height: children with a smaller ``height`` are not compared with
            their predecessor.
        tags: accepted for interface compatibility; not used here.
        printtag: when true, print the ``tag`` of each compared pair even
            if the module debug flag is off.

    Returns:
        A list with one entry per child.  An entry is ``False`` for a child
        that belongs to no run, otherwise a positive integer shared by all
        members of the same run.  Labels grow along the list but are not
        consecutive.
    """
    if __debug:
        print("Debug mode: eri.utils.match.match2()")
    children = node.childNodes
    result = [False] * len(children)
    label = 0
    prev = None
    for x, cur in enumerate(children):
        # The first child has no predecessor and low children are not
        # tested; both still start a new label, which also guarantees that
        # label 0 is never handed out.
        if x == 0 or cur.height < height:
            prev = cur
            label += 1
            continue
        if printtag or __debug:
            print("str: %s %s" % (prev.tag, cur.tag))
        d = float(simpleTreeMatching(prev, cur)) / max(
            len(prev.tags), len(cur.tags), 1)
        if __debug:
            print("distance: %s" % (1 - d))
        if 1 - d <= maxDist:
            # If the predecessor is already part of this run it carries the
            # same label, so assigning it again changes nothing.
            result[x] = result[x - 1] = label
        else:
            label += 1
        prev = cur
    if __debug:
        print("return: eri.utils.match.match2()")
    return result
def check(node, maxDist, r, first, last):
    """Test whether the ``r`` children starting at ``first`` keep repeating.

    The window ``first .. first + r - 1`` is compared, child by child, with
    each following window of ``r`` children for as long as the window start
    ``i`` satisfies ``i + r <= last``.  The scan stops at the first pair
    whose distance exceeds ``maxDist``.

    The similarity of two children is ``simpleTreeMatching(a, b)`` divided
    by the larger of their ``tags`` lengths (never less than 1); the
    distance is one minus that similarity.
    NOTE(review): ``simpleTreeMatching`` and the ``__debug`` flag are
    defined outside this block.

    Args:
        node: object with an indexable ``childNodes`` sequence whose items
            expose a ``tags`` attribute supporting ``len()``.
        maxDist: largest distance that still counts as a match.
        r: window size, i.e. the number of children per repetition.
        first: index of the first child of the reference window.
        last: upper bound; a window starting at ``i`` is only examined
            while ``i + r <= last``.

    Returns:
        The index of the last child that matched its counterpart in the
        reference window, or ``False`` if no pair matched.
    """
    # A non-positive window never advances the scan: the inner loop is
    # empty and ``i += r`` cannot move past ``last``, which used to loop
    # forever.  There is nothing to compare, so report "no match".
    if r <= 0:
        return False
    if __debug:
        print("Check %s %s %s %s" % (maxDist, r, first, last))
    maxmatch = False
    i = first + r
    while i + r <= last:
        for k in range(r):
            a = node.childNodes[first + k]
            b = node.childNodes[i + k]
            d = float(simpleTreeMatching(a, b)) / max(
                len(a.tags), len(b.tags), 1)
            if 1 - d > maxDist:
                # First mismatch ends the whole scan.
                return maxmatch
            maxmatch = i + k
        i += r
    return maxmatch