def normalized_simple_distance(A, B, get_children=Node.get_children, alpha=1,
                               get_tree_size=None, return_operations=False,
                               **kwargs):
    """Return the edit distance between trees A and B after normalization.

    The raw distance comes from ``simple_distance``; it is then passed to
    ``_compute_normalized_distance`` together with ``alpha`` and the sizes
    of both trees.

    :param A: root of the first tree
    :param B: root of the second tree
    :param get_children: callable returning the children of a node
    :param alpha: forwarded unchanged to ``_compute_normalized_distance``
    :param get_tree_size: callable returning the size of a tree; defaults to
        ``default_tree_size`` bound to ``get_children``
    :param return_operations: if true, also return the edit operations
    :param kwargs: forwarded to ``simple_distance``
    :return: the normalized distance, or ``(distance, operations)`` when
        ``return_operations`` is true
    """
    if get_tree_size is None:
        get_tree_size = functools.partial(default_tree_size,
                                          get_children=get_children)

    # ``return_operations`` is a named parameter here, so it is never part of
    # ``kwargs``; it has to be forwarded explicitly or ``simple_distance``
    # returns a bare distance and the tuple unpacking below would fail.
    result = simple_distance(A, B, get_children=get_children,
                             return_operations=return_operations, **kwargs)
    if return_operations:
        edit_distance, operations = result
    else:
        edit_distance = result

    d = _compute_normalized_distance(edit_distance, alpha,
                                     get_tree_size(A), get_tree_size(B))
    if return_operations:
        return d, operations
    return d
def test_symmetry(self):
    """simple_distance(a, b) must equal simple_distance(b, a) for random trees."""
    random_trees = (randtree(5, repeat=3, width=2) for x in range(N))
    for left, right in itertools.product(random_trees, repeat=2):
        forward = simple_distance(left, right)
        backward = simple_distance(right, left)
        self.assertEqual(forward, backward)
def test_triangle_inequality(self):
    """Check d(a, c) <= d(a, b) + d(b, c) over three independent random pools."""
    def random_trees():
        # ``range`` instead of the Python-2-only ``xrange`` so the test also
        # runs on Python 3; N random trees are produced either way.
        return (randtree(5, repeat=3, width=2) for _ in range(N))

    # Three separate generators: each axis of the product gets its own trees.
    trees = itertools.product(random_trees(), random_trees(), random_trees())
    for a, b, c in trees:
        ab = simple_distance(a, b)
        bc = simple_distance(b, c)
        ac = simple_distance(a, c)
        # assertLessEqual reports both operands when the check fails.
        self.assertLessEqual(ac, ab + bc)
def build_sim_mx(trees, nodelists=None, method='zss', similarity=True, speedup=-1):
    '''Returns a similarity (or dissimilarity) matrix of `trees` using `method`.

    trees : list of trees (Node)
    nodelists : optional precomputed list of node-label lists, one per tree;
        when None the labels are collected with ``t.iter()``
    method : `zss` is normalized TED, `jaccard` is on node lists,
        `both` gives both and can take the speedup parameter
    similarity : if False, returns the dissimilarity (1-sim)
    speedup : [0,1]. With `both`, the normalized TED is left at 0 when
        jaccard < speedup. (Zero = no speedup)

    Returns sim_mx (or if `both`, returns TED then jaccard). For an
    unrecognized method a message is printed and None is returned.
    '''
    # Validate up front: previously the check ran only inside the pair loop,
    # so a bad method went unnoticed when there were fewer than two trees.
    if method not in ('zss', 'jaccard', 'both'):
        print('Method unrecognized.')
        return None

    numt = len(trees)
    # ``is None`` rather than ``== None``: an array-like nodelists would make
    # ``==`` an elementwise comparison.
    if nodelists is None:
        lists = [[c.label for c in t.iter()] for t in trees]
    else:
        lists = nodelists
    sizes = [len(li) for li in lists]

    both = method == 'both'
    sim_mx = np.zeros((numt, numt))
    sim_mx2 = np.zeros((numt, numt)) if both else None

    def normalized_ted(i, j):
        # 1 - 2*TED / (|Ti| + |Tj| + TED)
        ted = simple_distance(trees[i], trees[j])
        return 1 - (2 * ted / float(sizes[i] + sizes[j] + ted))

    for i in range(numt):
        # A tree is always fully similar to itself.
        sim_mx[i, i] = 1
        if both:
            sim_mx2[i, i] = 1
        for j in range(i + 1, numt):
            if both:
                jacc = jaccard(lists[i], lists[j])
                # Skip the expensive TED when the cheap jaccard is too low.
                nted = normalized_ted(i, j) if jacc >= speedup else 0
                sim_mx[i, j], sim_mx[j, i] = nted, nted
                sim_mx2[i, j], sim_mx2[j, i] = jacc, jacc
            else:
                if method == 'zss':
                    val = normalized_ted(i, j)
                else:
                    val = jaccard(lists[i], lists[j])
                sim_mx[i, j], sim_mx[j, i] = val, val

    if both:
        if not similarity:
            sim_mx = 1 - sim_mx
            sim_mx2 = 1 - sim_mx2
        return sim_mx, sim_mx2
    return sim_mx if similarity else (1 - sim_mx)
def test_triangle_inequality(self):
    """Verify d(a, c) <= d(a, b) + d(b, c) across three random tree pools."""
    # One independent generator of N random trees per axis of the product.
    pools = [(randtree(5, repeat=3, width=2) for x in range(N))
             for _ in range(3)]
    for first, second, third in itertools.product(*pools):
        first_second = simple_distance(first, second)
        second_third = simple_distance(second, third)
        first_third = simple_distance(first, third)
        self.assertTrue(first_third <= first_second + second_third)
def kernel_tree_edit_distance(self, other):
    """
    Tree edit distance between this state and `other`.

    The distance is evaluated in both directions and the mean of the two
    values is returned.

    Algorithm: Kaizhong Zhang and Dennis Shasha, "Simple fast algorithms for
    the editing distance between trees and related problems", SIAM Journal
    of Computing, 18:1245-1262, 1989.

    Implementation: https://github.com/timtadh/zhang-shasha
    """
    own_tree = self._kernel_tree_edit_distance_create_tree(self.tree.root)
    other_tree = other._kernel_tree_edit_distance_create_tree(other.tree.root)
    forward = zss.simple_distance(own_tree, other_tree)
    backward = zss.simple_distance(other_tree, own_tree)
    return (forward + backward) / 2.0
def kernel_tree_edit_distance(self, other):
    """
    Average of the tree edit distances self->other and other->self.

    Uses the Zhang-Shasha algorithm (K. Zhang, D. Shasha: "Simple fast
    algorithms for the editing distance between trees and related problems",
    SIAM Journal of Computing, 18:1245-1262, 1989) as implemented in
    https://github.com/timtadh/zhang-shasha
    """
    mine = self._kernel_tree_edit_distance_create_tree(self.tree.root)
    theirs = other._kernel_tree_edit_distance_create_tree(other.tree.root)
    # Evaluate both orderings and take the mean.
    both_ways = ((mine, theirs), (theirs, mine))
    return sum(zss.simple_distance(src, dst) for src, dst in both_ways) / 2.0
def generate_ai_sequence(df, ast_list, best_ast_no):
    """Replace every AST id of each trajectory by its distance to the best AST.

    For every id in ``df.trajectoryId`` the file ``trajectories + "<id>.txt"``
    is read as one integer AST id per line. Trajectories that reference an
    AST id not contained in ``ast_list`` are dropped. Each remaining AST id
    is then replaced by ``zss.simple_distance`` between its tree and the tree
    of ``best_ast_no`` (both loaded with ``load_json`` and converted with
    ``tree_builder``).

    :param df: object exposing an iterable ``trajectoryId`` attribute
    :param ast_list: iterable of AST ids that are available
    :param best_ast_no: id of the reference AST
    :return: dict mapping trajectory id -> list of distances
    """
    trajct = dict()
    for i in df.trajectoryId:
        with open(trajectories + "%s.txt" % i, 'r') as fin:
            rawpath = fin.read().splitlines()
        # A real list (not a lazy ``map`` object) so it can be tested for
        # membership and then overwritten in place below on Python 3 too.
        trajct[i] = [int(x) for x in rawpath]
    print("Length of original trajectories: %d" % len(trajct))

    known_asts = set(ast_list)  # built once instead of once per trajectory
    del_key = []
    print("1. Detecting the key-value paris contain missing ASTs:")
    for key in progressbar.progressbar(trajct):
        if not set(trajct[key]).issubset(known_asts):
            del_key.append(key)

    print("2. Droping the trajectories contain missing ASTs:")
    for key in progressbar.progressbar(del_key):
        del trajct[key]
    print("Lenght of reduced trajectories: %d" % len(trajct))

    print("3. Computing appoaching index for each trajectory:")
    # The reference tree does not depend on the loop; build it once.
    # NOTE(review): assumes tree_builder/simple_distance do not mutate it.
    best_tree = tree_builder(load_json(best_ast_no))
    for key, value in progressbar.progressbar(list(trajct.items())):
        for i, ast_no in enumerate(value):
            value[i] = zss.simple_distance(tree_builder(load_json(ast_no)),
                                           best_tree)
    return trajct
def test_paper_tree():
    """The two example trees f(d(a, c(b)), e) and f(c(d(a, b)), e) are 2 apart."""
    # Tree A: f -> [d -> [a, c -> [b]], e]
    a_subtree = Node("d").addkid(Node("a")).addkid(Node("c").addkid(Node("b")))
    A = Node("f").addkid(a_subtree).addkid(Node("e"))

    # Tree B: f -> [c -> [d -> [a, b]], e]
    b_subtree = Node("c").addkid(Node("d").addkid(Node("a")).addkid(Node("b")))
    B = Node("f").addkid(b_subtree).addkid(Node("e"))

    assert simple_distance(A, B) == 2
def test_paper_tree():
    """Same two example trees, built from WeirdNode and compared with weird_dist."""
    # Tree A: f -> [d -> [a, c -> [b]], e]
    a_subtree = (WeirdNode("d")
                 .addkid(WeirdNode("a"))
                 .addkid(WeirdNode("c").addkid(WeirdNode("b"))))
    A = WeirdNode("f").addkid(a_subtree).addkid(WeirdNode("e"))

    # Tree B: f -> [c -> [d -> [a, b]], e]
    b_subtree = WeirdNode("c").addkid(
        WeirdNode("d").addkid(WeirdNode("a")).addkid(WeirdNode("b")))
    B = WeirdNode("f").addkid(b_subtree).addkid(WeirdNode("e"))

    dist = simple_distance(A, B, WeirdNode.get_children, WeirdNode.get_label,
                           weird_dist)
    assert dist == 20
def compute_display_TED(root1, root2, normalize):
    """Return ``(cost, operations)`` of the edit script between two trees.

    Label costs are delegated to ``display_label_sitance`` with the given
    ``normalize`` argument.
    """
    def label_cost(x, y):
        return display_label_sitance(x, y, normalize)

    cost, ops = simple_distance(root1, root2, label_dist=label_cost,
                                return_operations=True)
    return cost, ops
def edit_distance(self, other_tree):
    """
    Zhang-Shasha edit distance from this tree to `other_tree`, computed by
    zss using the ExpressionNode child and label accessors.
    """
    children_of = ExpressionNode.get_children
    label_of = ExpressionNode.get_label
    return simple_distance(self, other_tree, children_of, label_of)
def syntax_similarity_conversation(self, documents1, average=False):
    """Syntax similarity of each document with the document that follows it.

    Every document is split into sentences and parsed; consecutive documents
    (d, d+1) are compared sentence by sentence with a tree edit distance
    divided by the node counts held in the global ``numnodes``.

    :param documents1: sequence of document strings
    :param average: if True, return ``1 - mean(cost matrix)`` of the first
        comparable pair instead of the per-pair assignment scores
    :return: OrderedDict mapping ``(d1, d2)`` to a similarity score, or a
        single float when ``average`` is True
    """
    # numnodes is reset here and read back after convert_mytree.
    # presumably convert_mytree increments it per converted node — TODO confirm
    global numnodes
    documents1parsed = []
    for d1 in range(len(documents1)):
        sys.stderr.write(str(d1)+"\n")  # progress output, one index per line
        # print documents1[d1]
        tempsents = (self.sent_detector.tokenize(documents1[d1].strip()))
        for s in tempsents:
            # A sentence with more than 100 whitespace-separated tokens marks
            # the whole document as "NA".
            if len(s.split())>100:
                documents1parsed.append("NA")
                break
        else:
            # Reached only when no sentence was too long (for/else).
            temp = list(self.parser.raw_parse_sents((tempsents)))
            for i in range(len(temp)):
                temp[i] = list(temp[i])[0]  # keep the first parse of each sentence
                temp[i] = ParentedTree.convert(temp[i])
            documents1parsed.append(list(temp))
    results = OrderedDict()
    for d1 in range(len(documents1parsed)):
        d2 = d1+1
        if d2 == len(documents1parsed):
            break  # the last document has no successor
        if documents1parsed[d1] == "NA" or documents1parsed[d2]=="NA":
            continue
        costMatrix = []
        for i in range(len(documents1parsed[d1])):
            numnodes = 0
            tempnode = Node(documents1parsed[d1][i].root().label())
            new_sentencedoc1 = self.convert_mytree(documents1parsed[d1][i],tempnode)
            temp_costMatrix = []
            sen1nodes = numnodes  # value of the counter after converting sentence i
            for j in range(len(documents1parsed[d2])):
                numnodes=0.0  # float, so the division below is a true division
                tempnode = Node(documents1parsed[d2][j].root().label())
                new_sentencedoc2 = self.convert_mytree(documents1parsed[d2][j],tempnode)
                ED = simple_distance(new_sentencedoc1, new_sentencedoc2)
                ED = ED / (numnodes + sen1nodes)
                temp_costMatrix.append(ED)
            costMatrix.append(temp_costMatrix)
        costMatrix = np.array(costMatrix)
        if average==True:
            # NOTE(review): returns on the first comparable pair, leaving the
            # remaining pairs unprocessed — confirm this is intended.
            return 1-np.mean(costMatrix)
        else:
            # Assignment over the cost matrix, then every row/column left
            # unassigned contributes its own minimum cost.
            indexes = su.linear_assignment(costMatrix)
            total = 0
            rowMarked = [0] * len(documents1parsed[d1])
            colMarked = [0] * len(documents1parsed[d2])
            for row, column in indexes:
                total += costMatrix[row][column]
                rowMarked[row] = 1
                colMarked [column] = 1
            for k in range(len(rowMarked)):
                if rowMarked[k]==0:
                    total+= np.min(costMatrix[k])
            for c in range(len(colMarked)):
                if colMarked[c]==0:
                    total+= np.min(costMatrix[:,c])
            maxlengraph = max(len(documents1parsed[d1]),len(documents1parsed[d2]))
            results[(d1,d2)] = 1-total/maxlengraph#, minWeight/minlengraph, randtotal/lengraph
    return results
def zss_similarity(node1, node2):
    """Return ``simple_distance`` between Nodes built from two dicts.

    Each dict supplies the ``'name'`` and ``'children'`` passed to ``Node``.
    """
    first, second = [Node(spec['name'], spec['children'])
                     for spec in (node1, node2)]
    return simple_distance(first, second)
def tree_distance(t1, t2, _input1, _input2):
    """
    Smallest edit distance between t1 and t2 over all renamings of t2's inputs.

    The inputs used by each tree are collected with ``get_tree_used_input``.
    For every permutation of t1's used inputs, t2 is copied with its inputs
    renamed accordingly and compared to t1; the minimum distance is returned.

    :param t1: a tree (zss root node)
    :param t2: a tree (zss root node)
    :param _input1: a set of all labels (str) which represent input symbols of t1
    :param _input2: a set of all labels (str) which represent input symbols of t2
    :return: the minimum ``zss.simple_distance`` found, or ``False`` when the
        two trees use a different number of inputs (such expressions are
        tentatively treated as different without further proof)
    """
    t1_input = get_tree_used_input(t1, _input1)
    t2_input = get_tree_used_input(t2, _input2)
    if len(t1_input) != len(t2_input):
        return False

    def distance_for(perm):
        # Map the i-th input of t2 onto the perm[i]-th input of t1.
        replace_map = {t2_input[i]: t1_input[src] for i, src in enumerate(perm)}
        return zss.simple_distance(
            t1, copy_zss_tree_with_replacement(t2, replace_map))

    # permutations() always yields at least one (possibly empty) permutation,
    # so min() never sees an empty sequence. The former sentinel of 1000000
    # silently capped larger distances and is no longer needed.
    return min(distance_for(perm)
               for perm in permutations(range(len(t1_input))))
def evaluate_tree_edit_distance(inputs_1, inputs_2, **kwargs):
    """Return the pairwise distances of the zipped inputs as a list.

    Pairs are formed with ``zip`` and compared with the WeirdNode accessors
    and ``weird_dist``. Extra keyword arguments are accepted and ignored.
    """
    return [
        zss.simple_distance(first, second, WeirdNode.get_children,
                            WeirdNode.get_label, weird_dist)
        for first, second in zip(inputs_1, inputs_2)
    ]
def tree_edit_distance(pipeline1, pipeline2):
    """Edit distance between two pipelines with ``binary_dist`` on labels.

    Arguments that are not already ``zss.Node`` instances are first
    converted with ``to_tree``.
    """
    left = pipeline1 if isinstance(pipeline1, zss.Node) else to_tree(pipeline1)
    right = pipeline2 if isinstance(pipeline2, zss.Node) else to_tree(pipeline2)
    return zss.simple_distance(left, right, label_dist=binary_dist)
def zss_with_descriptor(tree1, tree2, tree_descriptor):
    """Distance between two trees, configured by a descriptor string.

    ``'v1'`` selects plain ``zss.simple_distance`` on the roots. Any other
    descriptor is split on commas; after dropping the first field, fields
    0-2 give integer weights (when at least three are present) and field 3
    gives an integer penalty (when present). Defaults are weights (1, 1, 1)
    and penalty 10. The costs are then computed by ``label_dist`` on the
    nodes' ``enhanced_label``.
    """
    if tree_descriptor == 'v1':
        return zss.simple_distance(tree1.root, tree2.root)

    fields = tree_descriptor.split(',')[1:]
    penalty = int(fields[3]) if len(fields) > 3 else 10
    weights = tuple(map(int, fields[:3])) if len(fields) > 2 else (1, 1, 1)

    def dist_method(label_a, label_b):
        return enhanced_label_dist(label_a, label_b, weights)

    def insert_cost(node):
        return label_dist([], node.enhanced_label, penalty, dist_method)

    def remove_cost(node):
        return label_dist(node.enhanced_label, [], penalty, dist_method)

    def update_cost(node1, node2):
        return label_dist(node1.enhanced_label, node2.enhanced_label,
                          penalty, dist_method)

    def get_children(node):
        return node.children

    return zss.distance(tree1.root, tree2.root,
                        insert_cost=insert_cost,
                        remove_cost=remove_cost,
                        update_cost=update_cost,
                        get_children=get_children)
def test_nondegenercy(self):
    """A tree compared with itself (the same object) must have distance 0."""
    pool = (randtree(5, repeat=3, width=2) for x in range(N))
    for a, b in itertools.product(pool, repeat=2):
        d = simple_distance(a, b)
        # Only pairs that are the very same object are constrained.
        self.assertTrue(a is not b or d == 0)
def simple_tree_distance(self, pair, name=False):
    """
    Simple tree edit distance (TED) between the two annotated trees of `pair`.

    Two nodes match when their (lemma, dependency relation) labels are equal.
    With ``name=True`` only the three column names are returned. Otherwise
    the result is ``(TED, TED / len(T tokens), TED / len(H tokens))``.
    """
    if name:
        return 'Simple TED', 'TED / Length T', 'TED / Length H'

    tree_t, tree_h = pair.annotated_t, pair.annotated_h
    root_t, root_h = tree_t.root, tree_h.root
    size_t = len(tree_t.tokens)
    size_h = len(tree_h.tokens)

    distance = zss.simple_distance(
        root_t,
        root_h,
        lambda node: node.dependents,
        lambda node: (node.lemma, node.dependency_relation),
        lambda label1, label2: int(label1 != label2),
    )
    return distance, distance / size_t, distance / size_h
def compare_graphs(g1, g2):
    '''
    Given two graphs (zss trees) return a normalized similarity between them.

    The result is ``1 - dist / (n1 + n2)`` where ``dist`` is
    ``simple_distance(g1, g2)`` and ``n1``/``n2`` are the numbers of nodes
    reachable from each root. Identical trees therefore score 1.
    '''
    def count_nodes(root):
        # Walk the whole tree. The previous code used only
        # len(root.get_children(root)), i.e. the root's direct children,
        # which contradicted its own comment ("length of the tree") and
        # divided by zero for two childless roots.
        count = 0
        pending = [root]
        while pending:
            node = pending.pop()
            count += 1
            pending.extend(node.get_children(node))
        return count

    g1_node_count = float(count_nodes(g1))
    g2_node_count = float(count_nodes(g2))
    dist = float(simple_distance(g1, g2))
    return 1 - (dist / (g1_node_count + g2_node_count))
def compare(self, comparator=None):
    """Compute and store the edit distance and operations of the two trees.

    ``comparator`` is used as the label-distance function; when omitted the
    instance's default comparator is used. Nothing is returned; the results
    are stored on the instance.
    """
    label_dist = self.__defaultComparator if comparator is None else comparator
    outcome = zss.simple_distance(self.__tree1, self.__tree2,
                                  self.__getChildren, self.__getLabel,
                                  label_dist, return_operations=True)
    self.__treeEditDistance, self.__treeEditOperations = outcome
def calculate_smart_score(self, page_text, url):
    """Score a page against the base event by tree distance and date distance.

    The current event is rebuilt from ``page_text``; its date-less tree is
    compared with the base event's. The score is
    ``0.2 * date_contrib + 0.8 * tree_contrib``. When it exceeds
    ``self.threshold`` the formatted current event is appended to
    ``last_run_file``.

    :param page_text: text the current event is built from
    :param url: URL stored on the current event
    :return: the combined score
    """
    def blank_to_zero(value):
        # Missing date parts are stored as '' and count as 0.
        return 0 if value == '' else value

    self.current_event.clear_data()
    self.current_event.set_url(url)
    self.build_event(page_text)

    A = self.build_tree_no_date(self.current_event)
    B = self.build_tree_no_date(self.base_event)
    simp_dist = simple_distance(A, B)
    # Invert the distance so that closer trees score higher; identical -> 1.
    simp_dist = 1.0 / (1.25 * simp_dist) if simp_dist != 0 else 1

    tempday = blank_to_zero(self.current_event.day)
    tempmonth = blank_to_zero(self.current_event.month)
    tempyear = blank_to_zero(self.current_event.year)
    tempbaseday = blank_to_zero(self.base_event.day)
    tempbasemonth = blank_to_zero(self.base_event.month)
    tempbaseyear = blank_to_zero(self.base_event.year)

    # Rough day difference: 30 days per month, 365 days per year.
    daysdelta = (abs(int(tempbaseday) - int(tempday))
                 + 30 * abs(int(tempmonth) - int(tempbasemonth))
                 + 365 * abs(int(tempyear) - int(tempbaseyear)))
    date_contrib = pow(daysdelta, .2)
    date_contrib = 1.0 / date_contrib if date_contrib != 0 else 1

    # catch case where no date was extracted from article
    if tempday == 0 and tempmonth == 0 and tempyear == 0:
        date_contrib = 0

    score = (.2 * date_contrib) + (.8 * simp_dist)
    if score > self.threshold:
        # ``with`` closes the file even if formatting or writing raises.
        with open(last_run_file, 'a') as f:
            f.write(self.current_event.formated_string())
    return score
def syntax_similarity_two_documents(self, doc1, doc2, average=False):
    """Syntax similarity of two single documents.

    Both documents are split into sentences and parsed; every sentence of
    doc1 is compared with every sentence of doc2 by tree edit distance
    divided by the node counts held in the global ``numnodes``.

    :param doc1: first document string
    :param doc2: second document string
    :param average: if True return ``1 - mean(cost matrix)``, otherwise the
        assignment-based score
    :return: a similarity score, or the string "NA" when a sentence has more
        than 100 whitespace-separated tokens or parsing raises
    """
    # numnodes is reset here and read back after convert_mytree.
    # presumably convert_mytree increments it per converted node — TODO confirm
    global numnodes
    doc1sents = self.sent_detector.tokenize(doc1.strip())
    doc2sents = self.sent_detector.tokenize(doc2.strip())
    for s in doc1sents: # to handle unusually long sentences.
        if len(s.split())>100:
            return "NA"
    for s in doc2sents:
        if len(s.split())>100:
            return "NA"
    try: # to handle parse errors. Parser errors might happen when there is an unusually long word in the sentence.
        doc1parsed = self.parser.raw_parse_sents((doc1sents))
        doc2parsed = self.parser.raw_parse_sents((doc2sents))
    except Exception as e:
        sys.stderr.write(str(e))
        return "NA"
    costMatrix = []
    doc1parsed = list(doc1parsed)
    for i in range(len(doc1parsed)):
        doc1parsed[i] = list(doc1parsed[i])[0]  # keep the first parse of each sentence
    doc2parsed = list(doc2parsed)
    for i in range(len(doc2parsed)):
        doc2parsed[i] = list(doc2parsed[i])[0]
    for i in range(len(doc1parsed)):
        numnodes = 0
        sentencedoc1 = ParentedTree.convert(doc1parsed[i])
        tempnode = Node(sentencedoc1.root().label())
        new_sentencedoc1 = self.convert_mytree(sentencedoc1,tempnode)
        temp_costMatrix = []
        sen1nodes = numnodes  # value of the counter after converting sentence i
        for j in range(len(doc2parsed)):
            numnodes=0.0  # float, so the division below is a true division
            sentencedoc2 = ParentedTree.convert(doc2parsed[j])
            tempnode = Node(sentencedoc2.root().label())
            new_sentencedoc2 = self.convert_mytree(sentencedoc2,tempnode)
            ED = simple_distance(new_sentencedoc1, new_sentencedoc2)
            ED = ED / (numnodes + sen1nodes)
            temp_costMatrix.append(ED)
        costMatrix.append(temp_costMatrix)
    costMatrix = np.array(costMatrix)
    if average==True:
        return 1-np.mean(costMatrix)
    else:
        # Assignment over the cost matrix, then every row/column left
        # unassigned contributes its own minimum cost.
        indexes = su.linear_assignment(costMatrix)
        total = 0
        rowMarked = [0] * len(doc1parsed)
        colMarked = [0] * len(doc2parsed)
        for row, column in indexes:
            total += costMatrix[row][column]
            rowMarked[row] = 1
            colMarked [column] = 1
        for k in range(len(rowMarked)):
            if rowMarked[k]==0:
                total+= np.min(costMatrix[k])
        for c in range(len(colMarked)):
            if colMarked[c]==0:
                total+= np.min(costMatrix[:,c])
        maxlengraph = max(len(doc1parsed),len(doc2parsed))
        return 1-(total/maxlengraph)
def xmltree_dist(xml_1, xml_2):
    """Tree edit distance between the root elements of two XML sources.

    Returns 0.0 without parsing when both arguments compare equal.
    """
    if xml_1 == xml_2:
        return 0.0
    return zss.simple_distance(ET.parse(xml_1).getroot(),
                               ET.parse(xml_2).getroot(),
                               get_children,
                               get_label)
def _calculate_distance(self, prototype, tree):
    """Edit distance between `prototype` and `tree` on signature labels.

    A node's label is the signature of the node together with its parent;
    two labels cost ``True`` when they differ and ``False`` otherwise.
    """
    def children_of(node):
        return list(node.children())

    def signature_of(node):
        return self._signature.get_signature(node, node.parent())

    def labels_differ(prototype_label, tree_label):
        return prototype_label != tree_label

    return zss.simple_distance(prototype, tree, children_of, signature_of,
                               labels_differ)
def test_labelchange(self):
    """Relabelling one random node must cost exactly strdist(old, new)."""
    for _ in range(N * 4):
        A = randtree(5, repeat=3, width=2)
        B = copy.deepcopy(A)
        node = random.choice(list(B.iter()))
        old_label = str(node.label)
        node.label = 'xty'
        assert simple_distance(A, B) == strdist(old_label, node.label)
def compare_graphs(g1, g2):
    '''
    Given two graphs (zss trees) return a normalized similarity between them.

    Computed as ``1 - dist / (n1 + n2)``, with ``dist`` the
    ``simple_distance`` of the two trees and ``n1``/``n2`` the number of
    nodes reachable from each root, so identical trees score 1.
    '''
    def tree_size(root):
        # Count every node, not just the root's direct children: the old
        # len(root.get_children(root)) disagreed with its own comment
        # ("length of the tree") and raised ZeroDivisionError when both
        # roots were leaves.
        size = 0
        stack = [root]
        while stack:
            current = stack.pop()
            size += 1
            stack.extend(current.get_children(current))
        return size

    g1_node_count = float(tree_size(g1))
    g2_node_count = float(tree_size(g2))
    dist = float(simple_distance(g1, g2))
    return 1 - (dist / (g1_node_count + g2_node_count))
def test_de():
    """The edit script from D to E removes b and c and matches a."""
    cost, ops = simple_distance(D, E, return_operations=True)
    expected_ops = [
        Operation(Operation.remove, Node(label), None) for label in ("b", "c")
    ]
    expected_ops.append(Operation(Operation.match, Node("a"), Node("a")))
    assert ops == expected_ops
def add(self, fn):
    """Index `fn` under its own prefix and under the prefix of each delete.

    The entry for the tree itself carries distance 0 and flag True; each
    tree from ``deletes()`` carries its ``simple_distance`` to the original
    and flag False.
    """
    fntree = Node.to_tree(fn)
    own_prefix = fntree.prefix()
    self.index[own_prefix].add(Entry(fn, own_prefix, 0, True))

    for variant in fntree.deletes():
        bucket = self.index[variant.prefix()]
        entry = Entry(fn, variant.prefix(),
                      simple_distance(fntree, variant), False)
        bucket.add(entry)
def computeTreeEditDistance(self, otherTree):
    """Edit distance between this tree and `otherTree` from a shared root key.

    Both trees are converted to the edit-distance format; the comparison
    starts at 'Healthy' when this tree has that key, else at 'Precursor'.
    """
    own_format = self.convertTreeToEditDistanceFormat()
    other_format = otherTree.convertTreeToEditDistanceFormat()

    # From the root we can reach all the nodes.
    root_key = 'Healthy' if 'Healthy' in own_format.keys() else 'Precursor'
    return simple_distance(own_format[root_key], other_format[root_key])
def calculate_distance(tree1, tree2):
    """
    Distance between two trees using the MyNode accessors and my_distance.

    :param tree1: first tree
    :param tree2: second tree
    :return: distance of given trees
    """
    children_of = MyNode.get_children
    label_of = MyNode.get_label
    return zss.simple_distance(tree1, tree2, children_of, label_of,
                               my_distance)
def get_dist(args, s1, s2):
    """Height-scaled tree distance between two JSON-encoded structures.

    Both strings are decoded with ``json.loads`` and converted with
    ``convert`` into a tree and its height. The result is
    ``simple_distance(t1, t2) * (h1 + h2) / (h1 * h2)``.

    :param args: forwarded to ``convert``
    :param s1: JSON text of the first structure
    :param s2: JSON text of the second structure
    :return: the scaled distance, or None if any step raises an exception
    """
    try:
        t1, height_t1 = convert(args, json.loads(s1), 'ROOT', 1)
        t2, height_t2 = convert(args, json.loads(s2), 'ROOT', 1)
        return simple_distance(
            t1, t2) * (height_t1 + height_t2) / (height_t1 * height_t2)
    except Exception:
        # Best effort: any ordinary failure yields None. Unlike a bare
        # ``except:`` this lets KeyboardInterrupt/SystemExit propagate.
        return None
def closest_dist(ast, corpus):
    """Smallest distance from `ast` to any program AST in the corpus.

    Iterates ``corpus['programs']`` and compares against each entry's
    ``'ast'`` using the ZSS accessors and string label distance.
    """
    return min(
        zss.simple_distance(ast, program['ast'],
                            get_children=ZSS.get_children,
                            get_label=ZSS.get_label,
                            label_dist=ZSS.label_dist_string)
        for program in corpus['programs']
    )
def edit_distance(a, b, node_dist_func=lambda a, b: 0 if a == b else 1):
    """Tree edit distance between two ``Tree`` objects via zss.

    ``node_dist_func`` gives the cost of relabelling one node to another
    (default: 0 when equal, 1 otherwise). Returns None when zss is not
    installed.
    """
    try:
        import zss
        for candidate in (a, b):
            assert isinstance(candidate, Tree), candidate
        result = zss.simple_distance(a, b, Tree.get_children,
                                     Tree.get_root_node, node_dist_func)
    except ModuleNotFoundError:
        return None
    return result
def test_simplelabelchange():
    """Three relabelled nodes (h->d, c->r, l->b) must give distance 3."""
    A = (Node("f").addkid(
        Node("a").addkid(Node("h")).addkid(Node("c").addkid(
            Node("l")))).addkid(Node("e")))
    B = (Node("f").addkid(
        Node("a").addkid(Node("d")).addkid(Node("r").addkid(
            Node("b")))).addkid(Node("e")))
    dist = simple_distance(A, B)
    # Parenthesized form prints the same on Python 2 and is valid on Python 3.
    print(dist)
    assert dist == 3
def _tree_edit_distance(node1, node2):
    """Return ``1 - distance / max(size1, size2)`` for two nodes.

    Each node is turned into a ``Node`` labelled with its type name and
    populated by ``_dfs``, whose return value is used as the tree size.
    """
    converted = []
    for source in (node1, node2):
        root = Node(type(source).__name__)
        size = _dfs(source, root)
        converted.append((root, size))

    (root1, tree_size1), (root2, tree_size2) = converted
    distance = simple_distance(root1, root2)
    return 1 - 1.0 * distance / max(tree_size1, tree_size2)
def getRepresentTree(Nodes):
    """Pick the tree with the smallest summed distance to all trees.

    :param Nodes: mapping of key -> tree
    :return: ``(key, tree)`` of the entry with the lowest total distance
    """
    totals = {}
    for key in Nodes:
        totals[key] = sum(simple_distance(Nodes[key], Nodes[other])
                          for other in Nodes)

    ranked = sorted(totals.items(), key=lambda item: item[1])
    best_key = ranked[0][0]
    return best_key, Nodes[best_key]
def test_distance(self):
    """Symmetry, identity and triangle inequality on the four fixture trees."""
    def fixtures():
        return [tree1(), tree2(), tree3(), tree4()]

    for a, b in itertools.product(fixtures(), repeat=2):
        forward = simple_distance(a, b)
        backward = simple_distance(b, a)
        self.assertEqual(forward, backward)
        self.assertTrue((forward == 0 and a is b) or a is not b)

    for a, b, c in itertools.product(fixtures(), repeat=3):
        ab = simple_distance(a, b)
        bc = simple_distance(b, c)
        ac = simple_distance(a, c)
        self.assertTrue(ac <= ab + bc)
def calculate_score(self, page_text):
    """Score a page by its tree distance to the base event.

    Returns 1 for distance 0, otherwise ``1.0 / (1.1423 * distance)``.
    """
    self.current_event.clear_data()
    self.build_event(page_text)

    current_tree = self.build_tree(self.current_event)
    base_tree = self.build_tree(self.base_event)
    simp_dist = simple_distance(current_tree, base_tree)

    if simp_dist != 0:
        self.current_event.to_string()
        return 1.0/(1.1423*simp_dist)
    return 1
def tree_edit_distance(s1,s2):
    """Edit distance between two comma-separated strings as flat trees.

    Each string becomes a root labelled "" with one child per item.
    """
    def flat_tree(text):
        root = Node("")
        for item in text.split(','):
            root.addkid(Node(item))
        return root

    return simple_distance(flat_tree(s1), flat_tree(s2))
def calculate_score(self, page_text):
    """Score a page by its tree distance to the base tree.

    Returns 1 for distance 0, otherwise ``1.0 / (1.1423 * distance)``.
    Scores above 0.5 additionally call ``to_string()`` on the event and
    print the score.
    """
    self.reload_data()
    the_event = build_event(page_text)
    A = build_tree(the_event)
    B = self.build_base_tree()
    simp_dist = simple_distance(A, B)
    if simp_dist == 0:
        return 1

    # Compute the score once instead of three times.
    score = 1.0/(1.1423*simp_dist)
    if score > 0.5:
        the_event.to_string()
        # Parenthesized form prints the same on Python 2 and is valid on
        # Python 3.
        print(score)
    return score
def calculate_smart_score(self, page_text):
    """Score a page against the base event by tree distance and date distance.

    Returns ``0.2 * date_contrib + 0.8 * tree_contrib``; each contribution
    is an inverted distance and equals 1 when that distance is zero.
    """
    self.current_event.clear_data()
    self.build_event(page_text)

    current_tree = self.build_tree_no_date(self.current_event)
    base_tree = self.build_tree_no_date(self.base_event)
    simp_dist = simple_distance(current_tree, base_tree)
    if simp_dist == 0:
        simp_dist = 1
    else:
        simp_dist = 1.0/(1.25*simp_dist)

    # Missing date parts are stored as '' and count as 0.
    day, month, year = [
        0 if part == '' else part
        for part in (self.current_event.day,
                     self.current_event.month,
                     self.current_event.year)
    ]
    base_day, base_month, base_year = [
        0 if part == '' else part
        for part in (self.base_event.day,
                     self.base_event.month,
                     self.base_event.year)
    ]

    # Rough day difference: 30 days per month, 365 days per year.
    day_gap = abs(int(base_day)-int(day))
    month_gap = 30*(abs(int(month)-int(base_month)))
    year_gap = 365*(abs(int(year)-int(base_year)))
    daysdelta = day_gap+month_gap+year_gap

    date_contrib = pow(daysdelta,.2)
    if date_contrib == 0:
        date_contrib = 1
    else:
        date_contrib = 1.0 / date_contrib

    return (.2*date_contrib) + (.8*simp_dist)
def test_incorrect_behavior_regression():
    """Regression: a(b(x, y)) versus a(x, b(y)) must have distance 2."""
    A = (
        Node("a")
            .addkid(Node("b")
                .addkid(Node("x"))
                .addkid(Node("y"))
            )
        )
    B = (
        Node("a")
            .addkid(Node("x"))
            .addkid(Node("b")
                .addkid(Node("y"))
            )
        )
    dist = simple_distance(A, B)
    # Parenthesized form prints the same on Python 2 and is valid on Python 3.
    print(dist)
    assert dist == 2
def calculate_score(self, page_text, url):
    """Score a page by its tree distance to the base event.

    Returns 1 for distance 0, otherwise ``1.0 / (1.1423 * distance)``. When
    the score exceeds ``self.threshold`` the formatted current event is
    appended to ``last_run_file``.

    :param page_text: text the current event is built from
    :param url: URL stored on the current event
    :return: the score
    """
    self.current_event.clear_data()
    self.current_event.set_url(url)
    self.build_event(page_text)
    A = self.build_tree(self.current_event)
    B = self.build_tree(self.base_event)
    simp_dist = simple_distance(A, B)
    if simp_dist == 0:
        return 1
    self.current_event.to_string()
    score = 1.0/(1.1423*simp_dist)
    if score > self.threshold:
        # ``with`` closes the file even if formatting or writing raises.
        with open(last_run_file, 'a') as f:
            f.write(self.current_event.formated_string())
    return score
def edit_distance(self, other_tree):
    """
    Returns the Zhang-Shasha edit distance to another tree, as implemented
    in zss, memoized in each tree's ``dist_cache``.

    The cache is keyed by ``str(tree)``. The module-level counters
    ``cache_hits`` and ``cache_misses`` are updated on every call.
    """
    global cache_hits
    global cache_misses
    cached_dist = self.dist_cache.get(str(other_tree))
    # ``is not None`` so that a cached distance of 0 counts as a hit.
    if cached_dist is not None:
        cache_hits += 1
        return cached_dist

    cache_misses += 1
    d = simple_distance(self, other_tree, ExpressionNode.get_children,
                        ExpressionNode.get_label)
    # Store under the same string keys the lookup uses; previously the tree
    # objects themselves were used as keys, so lookups never hit.
    self.dist_cache[str(other_tree)] = d
    other_tree.dist_cache[str(self)] = d
    return d
def test_simplelabelchange():
    """Three relabelled nodes (h->d, c->r, l->b) must give distance 3."""
    A = (
        Node("f")
            .addkid(Node("a")
                .addkid(Node("h"))
                .addkid(Node("c")
                    .addkid(Node("l"))))
            .addkid(Node("e"))
        )
    B = (
        Node("f")
            .addkid(Node("a")
                .addkid(Node("d"))
                .addkid(Node("r")
                    .addkid(Node("b"))))
            .addkid(Node("e"))
        )
    dist = simple_distance(A, B)
    # Parenthesized form prints the same on Python 2 and is valid on Python 3.
    print(dist)
    assert dist == 3
def process_list(cur, bots, robot_cache, pair_cache):
    """Print the pairwise body-tree distance of the given robots as CSV rows.

    :param cur: ``(exp, run, births)`` tuple identifying the experiment
    :param bots: iterable of robot ids
    :param robot_cache: dict id -> (id, Node); filled for ids not yet loaded
    :param pair_cache: dict (id_a, id_b) -> distance; filled for new pairs

    Run 22 of the 'embodied' experiment is skipped entirely.
    """
    exp, run, births = cur

    if exp != 'embodied':
        base_dir = '/media/expdata/output/'+exp
    else:
        r = int(run)
        if r == 22:
            return
        if 0 <= r <= 5:
            subdir = 'd0-5'
        elif 6 <= r <= 21:
            subdir = 'd6-21'
        else:
            subdir = 'd23-31'
        base_dir = '/media/expdata/online-output/'+subdir

    proto_bots = []
    for robot_id in bots:
        if robot_id not in robot_cache:
            bot = Robot()
            with open(base_dir+'/robot_'+str(robot_id)+'.pb', 'rb') as f:
                bot.ParseFromString(f.read())
            robot_cache[robot_id] = (robot_id, Node(bot.body.root))
        proto_bots.append(robot_cache[robot_id])

    for (aid, a), (bid, b) in combinations(proto_bots, 2):
        k = (aid, bid)
        if k not in pair_cache:
            pair_cache[k] = simple_distance(a, b, get_children, get_label,
                                            label_distance)
        diff = pair_cache[k]
        print("%s,%s,%s,%s,%s,%f" % (exp, run, births, aid, bid, diff))
def syntax_similarity_two_lists(self, documents1, documents2, average = False):
    """Syntax similarity of two lists of documents.

    Every document of both lists is split into sentences and parsed; each
    document of the first list is then compared with each document of the
    second list, sentence by sentence, with a tree edit distance divided by
    the node counts held in the global ``numnodes``.

    :param documents1: first sequence of document strings
    :param documents2: second sequence of document strings
    :param average: if True, return ``1 - mean(cost matrix)`` of the first
        comparable pair instead of the per-pair assignment scores
    :return: dict mapping ``(d1, d2)`` index pairs to a similarity score, or
        a single float when ``average`` is True
    """
    # numnodes is reset here and read back after convert_mytree.
    # presumably convert_mytree increments it per converted node — TODO confirm
    global numnodes
    documents1parsed = []
    documents2parsed = []
    for d1 in range(len(documents1)):
        # print d1
        tempsents = (self.sent_detector.tokenize(documents1[d1].strip()))
        for s in tempsents:
            # A sentence with more than 100 whitespace-separated tokens marks
            # the whole document as "NA".
            if len(s.split())>100:
                documents1parsed.append("NA")
                break
        else:
            # Reached only when no sentence was too long (for/else).
            temp = list(self.parser.raw_parse_sents((tempsents)))
            for i in range(len(temp)):
                temp[i] = list(temp[i])[0]  # keep the first parse of each sentence
                temp[i] = ParentedTree.convert(temp[i])
            documents1parsed.append(list(temp))
    for d2 in range(len(documents2)):
        # print d2
        tempsents = (self.sent_detector.tokenize(documents2[d2].strip()))
        for s in tempsents:
            if len(s.split())>100:
                documents2parsed.append("NA")
                break
        else:
            temp = list(self.parser.raw_parse_sents((tempsents)))
            for i in range(len(temp)):
                temp[i] = list(temp[i])[0]
                temp[i] = ParentedTree.convert(temp[i])
            documents2parsed.append(list(temp))
    results ={}
    for d1 in range(len(documents1parsed)):
        # print d1
        for d2 in range(len(documents2parsed)):
            # print d1,d2
            if documents1parsed[d1]=="NA" or documents2parsed[d2] =="NA":
                # print "skipped"
                continue
            costMatrix = []
            for i in range(len(documents1parsed[d1])):
                numnodes = 0
                tempnode = Node(documents1parsed[d1][i].root().label())
                new_sentencedoc1 = self.convert_mytree(documents1parsed[d1][i],tempnode)
                temp_costMatrix = []
                sen1nodes = numnodes  # value of the counter after converting sentence i
                for j in range(len(documents2parsed[d2])):
                    numnodes=0.0  # float, so the division below is a true division
                    tempnode = Node(documents2parsed[d2][j].root().label())
                    new_sentencedoc2 = self.convert_mytree(documents2parsed[d2][j],tempnode)
                    ED = simple_distance(new_sentencedoc1, new_sentencedoc2)
                    ED = ED / (numnodes + sen1nodes)
                    temp_costMatrix.append(ED)
                costMatrix.append(temp_costMatrix)
            costMatrix = np.array(costMatrix)
            if average==True:
                # NOTE(review): returns on the first comparable pair, leaving
                # the remaining pairs unprocessed — confirm this is intended.
                return 1-np.mean(costMatrix)
            else:
                # Assignment over the cost matrix, then every row/column left
                # unassigned contributes its own minimum cost.
                indexes = su.linear_assignment(costMatrix)
                total = 0
                rowMarked = [0] * len(documents1parsed[d1])
                colMarked = [0] * len(documents2parsed[d2])
                for row, column in indexes:
                    total += costMatrix[row][column]
                    rowMarked[row] = 1
                    colMarked [column] = 1
                for k in range(len(rowMarked)):
                    if rowMarked[k]==0:
                        total+= np.min(costMatrix[k])
                for c in range(len(colMarked)):
                    if colMarked[c]==0:
                        total+= np.min(costMatrix[:,c])
                maxlengraph = max(len(documents1parsed[d1]),len(documents2parsed[d2]))
                results[(d1,d2)] = 1-total/maxlengraph
    return results
def distance_syntactic(in_sentence_a, in_sentence_b):
    """Edit distance of two sentence trees divided by their combined length.

    Labels cost ``True`` when they differ and ``False`` when equal; the raw
    distance is divided by ``len(a) + len(b)``.
    """
    def labels_differ(x, y):
        return x != y

    raw_distance = zss.simple_distance(in_sentence_a.get_root(),
                                       in_sentence_b.get_root(),
                                       label_dist=labels_differ)
    combined_length = float(len(in_sentence_a) + len(in_sentence_b))
    return raw_distance / combined_length
def test_empty_tree_distance():
    """Single-node trees with empty labels: 0 when equal, 1 against one letter."""
    cases = (
        ('', '', 0),
        ('a', '', 1),
        ('', 'b', 1),
    )
    for left_label, right_label, expected in cases:
        assert simple_distance(Node(left_label), Node(right_label)) == expected
def minweight_edit_distance(self, doc1, doc2):
    """Assignment-based syntactic distance between two documents.

    Both documents are split into sentences and parsed; every sentence of
    doc1 is compared with every sentence of doc2 by tree edit distance
    divided by the node counts held in the global ``numnodes``. The summed
    cost of the assignment, plus the minimum cost of every unassigned
    row/column, is divided by the larger sentence count.

    :param doc1: first document string
    :param doc2: second document string
    :return: ``total / max(number of sentences in doc1, doc2)``
    """
    # numnodes is reset here and read back after convert_mytree.
    # presumably convert_mytree increments it per converted node — TODO confirm
    global numnodes
    doc1sents = self.sent_detector.tokenize(doc1.strip())
    doc2sents = self.sent_detector.tokenize(doc2.strip())
    doc1parsed = self.parser.raw_parse_sents((doc1sents))
    doc2parsed = self.parser.raw_parse_sents((doc2sents))
    costMatrix = []
    doc1parsed = list(doc1parsed)
    for i in range(len(doc1parsed)):
        doc1parsed[i] = list(doc1parsed[i])[0]  # keep the first parse of each sentence
    doc2parsed = list(doc2parsed)
    for i in range(len(doc2parsed)):
        doc2parsed[i] = list(doc2parsed[i])[0]
    for i in range(len(doc1parsed)):
        numnodes = 0
        sentencedoc1 = ParentedTree.convert(doc1parsed[i])
        tempnode = Node(sentencedoc1.root().label())
        new_sentencedoc1 = self.convert_mytree(sentencedoc1,tempnode)
        temp_costMatrix = []
        sen1nodes = numnodes  # value of the counter after converting sentence i
        for j in range(len(doc2parsed)):
            numnodes=0.0  # float, so the division below is a true division
            sentencedoc2 = ParentedTree.convert(doc2parsed[j])
            tempnode = Node(sentencedoc2.root().label())
            new_sentencedoc2 = self.convert_mytree(sentencedoc2,tempnode)
            ED = simple_distance(new_sentencedoc1, new_sentencedoc2)
            ED = ED / (numnodes + sen1nodes)
            temp_costMatrix.append(ED)
        costMatrix.append(temp_costMatrix)
    costMatrix = np.array(costMatrix)
    rownum= costMatrix.shape[0]
    colnum = costMatrix.shape[1]
    # Randomly sample rows (or columns) to make a square matrix.
    # NOTE(review): costMatrixRandom, randtotal, lengraph, minWeight and
    # minlengraph do not affect the returned value (see the commented-out
    # tail of the return line), but the sampling does consume numpy RNG state.
    if rownum > colnum:
        costMatrixRandom = costMatrix[np.random.randint(rownum, size=colnum),:]
    else:
        costMatrixRandom = costMatrix[:,np.random.randint(colnum, size=rownum)]
    indexes = su.linear_assignment(costMatrix)
    total = 0
    minWeight = 0
    rowMarked = [0] * len(doc1parsed)
    colMarked = [0] * len(doc2parsed)
    for row, column in indexes:
        total += costMatrix[row][column]
        rowMarked[row] = 1
        colMarked [column] = 1
    minWeight = total  # cost of the assignment alone
    # Every row/column left unassigned contributes its own minimum cost.
    for k in range(len(rowMarked)):
        if rowMarked[k]==0:
            total+= np.min(costMatrix[k])
    for c in range(len(colMarked)):
        if colMarked[c]==0:
            total+= np.min(costMatrix[:,c])
    maxlengraph = max(len(doc1parsed),len(doc2parsed))
    minlengraph = min(len(doc1parsed),len(doc2parsed))
    indexes = su.linear_assignment(costMatrixRandom)
    randtotal = 0
    for row, column in indexes:
        randtotal +=costMatrixRandom[row][column]
    lengraph = costMatrixRandom.shape[0]
    return total/maxlengraph#, minWeight/minlengraph, randtotal/lengraph
ref = " ".join([x.encode('utf-8') for x in wordpunct_tokenize(candidates[2].strip().decode("utf-8"))]) hyp1 = " ".join([x.encode('utf-8') for x in wordpunct_tokenize(candidates[0].strip().decode("utf-8"))]) hyp2 = " ".join([x.encode('utf-8') for x in wordpunct_tokenize(candidates[1].strip().decode("utf-8"))]) ref = treeDict[ref] #candidates[2].strip()[2:-2] hyp1 = treeDict[hyp1] #candidates[0].strip()[2:-2] hyp2 = treeDict[hyp2] #candidates[1].strip()[2:-2] #print ref rootRef = readNode(ref, None) rootHyp1 = readNode(hyp1, None) rootHyp2 = readNode(hyp2, None) #print rootRef.label score1 = simple_distance(rootRef, rootHyp1) score2 = simple_distance(rootRef, rootHyp2) #print score1 fx.write("%d\t%d\n"%(score1,score2)) if score1 >= score2: f.write("1\n") else: f.write("-1\n") f.close() fx.close()
def distance(l1, l2):
    """Return ``simple_distance(l1, l2)``, or -1 if it raises an Exception."""
    try:
        result = simple_distance(l1, l2)
    except Exception:
        result = -1
    return result
def aligned_edit_distance(p1, p2):
    """Labeled tree edit distance computed on the aligned (ordered) trees.

    The trees come from ``create_ordered_trees`` and are converted to zss
    structures before comparison.
    """
    ordered_1, ordered_2 = create_ordered_trees(p1, p2)
    zss_tree_1 = convert_structure_to_zss(ordered_1)
    zss_tree_2 = convert_structure_to_zss(ordered_2)
    return zss.simple_distance(zss_tree_1, zss_tree_2)