def test_rich_api():
    """Check that ``distance`` honours caller-supplied edit costs.

    Two single-node trees labelled 'a' and 'b' are compared under a cheap
    and an expensive update cost, then a one-node tree is compared with a
    two-node tree under a paid and a free insert cost.
    """

    def insert_cost(node):
        return 1

    def remove_cost(node):
        return 1

    def small_update_cost(a, b):
        return 1

    def large_update_cost(a, b):
        return 3

    def no_insert_cost(node):
        return 0

    A = Node('a')
    B = Node('b')

    # prefer update
    cheap_update = distance(
        A, B, Node.get_children,
        insert_cost, remove_cost, small_update_cost)
    assert cheap_update == 1

    # prefer insert/remove
    dear_update = distance(
        A, B, Node.get_children,
        insert_cost, remove_cost, large_update_cost)
    assert dear_update == 2

    C = Node('a', [Node('x')])
    with_paid_insert = distance(
        A, C, Node.get_children,
        insert_cost, remove_cost, small_update_cost)
    with_free_insert = distance(
        A, C, Node.get_children,
        no_insert_cost, remove_cost, small_update_cost)
    # Making inserts free must strictly lower the distance.
    assert with_paid_insert > with_free_insert
def zss_dist(A, B):
    """Return the zss tree edit distance between ``A`` and ``B``.

    Inserting or removing a node costs 1; updating one node into another
    always costs 0, so only structural differences are counted.
    """

    def unit_cost(_node):
        return 1

    def free_update(_old, _new):
        return 0

    return zss.distance(
        A,
        B,
        get_children=zss.Node.get_children,
        insert_cost=unit_cost,
        remove_cost=unit_cost,
        update_cost=free_update,
    )
def zss_with_descriptor(tree1, tree2, tree_descriptor):
    """Compute a zss tree edit distance configured by ``tree_descriptor``.

    ``'v1'`` selects ``zss.simple_distance`` on the two roots. Any other
    descriptor is split on commas and its first field is discarded. If at
    least three fields remain, the first three are parsed as integer
    weights (default ``(1, 1, 1)``). If a fourth remains, it is parsed as
    the integer penalty (default ``10``). Costs are then computed with
    ``label_dist`` over the nodes' ``enhanced_label`` attributes.
    """
    if tree_descriptor == 'v1':
        return zss.simple_distance(tree1.root, tree2.root)

    fields = tree_descriptor.split(',')[1:]
    penalty = int(fields[3]) if len(fields) > 3 else 10
    weights = tuple(int(f) for f in fields[:3]) if len(fields) > 2 else (1, 1, 1)

    def dist_method(label_a, label_b):
        return enhanced_label_dist(label_a, label_b, weights)

    def insert_cost(node):
        # Inserting: distance from an empty label to the node's label.
        return label_dist([], node.enhanced_label, penalty, dist_method)

    def remove_cost(node):
        # Removing: distance from the node's label to an empty label.
        return label_dist(node.enhanced_label, [], penalty, dist_method)

    def update_cost(node1, node2):
        return label_dist(
            node1.enhanced_label, node2.enhanced_label, penalty, dist_method)

    def get_children(node):
        return node.children

    return zss.distance(
        tree1.root,
        tree2.root,
        insert_cost=insert_cost,
        remove_cost=remove_cost,
        update_cost=update_cost,
        get_children=get_children,
    )
def diff(a, b):
    """Return the zss tree edit distance between ``a.func_node`` and ``b.func_node``.

    Nodes are labelled by their type name, except that ``Attribute`` nodes
    use ``.attr`` and ``Str`` nodes use ``.s``. Every insert, remove or
    label mismatch costs 1 (0 when the two labels compared are equal).
    Child lists are cached on each node as a ``children`` attribute, so the
    trees passed in are mutated.
    """
    assert a is not None
    assert b is not None

    def _label_cost(left, right):
        if left == right:
            return 0
        return 1

    def _label(node):
        kind = type(node).__name__
        if kind == "Attribute":
            return node.attr
        if kind == "Str":
            return node.s
        return kind

    def _children(node):
        try:
            return node.children
        except AttributeError:
            # First visit: compute the child list once and cache it.
            node.children = list(ast.iter_child_nodes(node))
            return node.children

    def _insert_cost(node):
        return _label_cost('', _label(node))

    def _remove_cost(node):
        return _label_cost(_label(node), '')

    def _update_cost(old, new):
        return _label_cost(_label(old), _label(new))

    import zss

    return zss.distance(
        a.func_node,
        b.func_node,
        _children,
        _insert_cost,
        _remove_cost,
        _update_cost,
    )
def compare_methods(methods_1, methods_2, data_1, data_2, i, queue):
    """Score entry ``i`` of ``data_1`` against every ``methods_2`` entry.

    For each ``n`` in ``methods_2`` an ``N1 x N2`` matrix is built and
    initialised to NaN. A cell ``[k, j]`` is filled only when the two names
    ``info_1[1][k]`` and ``info_2[1][j]`` are equal; it then holds
    ``(max_len - dist) / max_len``, where ``dist`` is the zss distance
    between ``info_1[2][k]`` and ``info_2[2][j]`` and ``max_len`` is the
    larger of ``info_1[0][k]`` and ``info_2[0][j]``.

    The raw matrix is collected in ``new_i``; the matrix post-processed by
    ``process_df`` is collected in ``temp_i``. The result is delivered as
    ``queue.put((i, temp_i, new_i))``; nothing is returned.

    ``methods_1`` is accepted for interface compatibility but is not read.

    NOTE(review): ``max_len`` is used as a divisor; presumably the values in
    ``info[0]`` are positive sizes — confirm, otherwise this can divide by
    zero.
    """
    info_1 = data_1[i]
    temp_i = []
    new_i = []
    for n in methods_2:
        info_2 = data_2[n]
        N1 = len(info_1[0])
        N2 = len(info_2[0])
        matrix = np.full((N1, N2), np.nan)
        for k in range(N1):
            API_1 = info_1[1][k]
            for j in range(N2):
                API_2 = info_2[1][j]
                if API_1 == API_2:
                    dist = zss.distance(info_1[2][k], info_2[2][j],
                                        Node.get_children,
                                        insert_cost, remove_cost, update_cost)
                    max_len = max(info_1[0][k], info_2[0][j])
                    sim = (max_len - dist) / max_len
                    matrix[k, j] = sim
        new_i.append(matrix)
        # Label rows/columns with the names so process_df can work on them.
        df = pd.DataFrame(matrix, columns=info_2[1], index=info_1[1])
        result_df = process_df(df, info_1[1], info_2[1])
        temp_i.append(result_df.values)
    queue.put((i, temp_i, new_i))
def diff(a, b):
    """Return the zss tree edit distance between ``a.func_node`` and ``b.func_node``.

    Nodes are labelled by their type name. Inserting a node is free;
    removing a node costs 1 unless its label is the empty string; updating
    costs 1 when the two type names differ and 0 otherwise. Child lists are
    cached on each node as a ``children`` attribute, so the trees passed in
    are mutated.
    """
    assert a is not None
    assert b is not None

    def _label_cost(left, right):
        if left == right:
            return 0
        return 1

    def _label(node):
        return type(node).__name__

    def _children(node):
        try:
            return node.children
        except AttributeError:
            # First visit: compute the child list once and cache it.
            node.children = list(ast.iter_child_nodes(node))
            return node.children

    def _insert_cost(node):
        return 0

    def _remove_cost(node):
        return _label_cost(_label(node), '')

    def _update_cost(old, new):
        return _label_cost(_label(old), _label(new))

    import zss

    return zss.distance(
        a.func_node,
        b.func_node,
        _children,
        _insert_cost,
        _remove_cost,
        _update_cost,
    )
# compare_methods(methods_1, methods_2, data_1, data_2, i, queue)
#
# Scores entry `i` of data_1 against every entry `n` of methods_2 and hands the
# result to the caller with queue.put((i, temp_i, new_i)); nothing is returned.
#   temp_i : list with one N1 x N2 similarity matrix per n (np.nan where the two
#            names info_1[1][k] / info_2[1][j] differ).
#   new_i  : dict mapping n -> list of `detail` tuples.
# A matrix cell is (max_len - dist) / max_len, with dist = zss.distance(...) on
# info_1[2][k] / info_2[2][j] and max_len = max(info_1[0][k], info_2[0][j]).
# max_sim tracks the largest similarity seen so far; -1 is the "no match" sentinel.
#
# The statements below have lost their line breaks and indentation in this view;
# the description above follows the most natural nesting — confirm against the
# properly formatted file.
# NOTE(review): `detail` reads `j` (and, in the else branch, `max_len`) after the
#   loop over j, so it appears to report the last pair visited rather than the
#   best-scoring pair — confirm this is intended.
# NOTE(review): `j` looks unbound when N2 == 0 and a `detail` tuple is built.
# NOTE(review): `max_edit` is assigned but never read; `methods_1` is never read.
def compare_methods(methods_1, methods_2, data_1, data_2, i, queue): info_1 = data_1[i] temp_i = [] new_i = {} for n in methods_2: info_2 = data_2[n] N1 = len(info_1[0]) N2 = len(info_2[0]) matrix = np.full((N1, N2), np.nan) new_i[n] = [] for k in range(N1): API_1 = info_1[1][k] max_sim = -1 max_edit = 10000 for j in range(N2): API_2 = info_2[1][j] if API_1 == API_2: dist = zss.distance(info_1[2][k], info_2[2][j], Node.get_children, insert_cost, remove_cost, update_cost) max_len = max(info_1[0][k], info_2[0][j]) sim = (max_len - dist)/max_len matrix[k, j] = sim if sim > max_sim: max_edit = dist max_sim = sim #print(i,n,info_1[1][k], info_2[1][j], sim, dist, info_1[0][k], info_2[0][j]) if max_sim == -1: detail = (info_1[0][k], 0, info_1[1][k], info_2[1][j]) matrix[k, N2-1] = 0 else: detail = (max_len, max_sim, info_1[1][k], info_2[1][j]) new_i[n].append(detail) temp_i.append(matrix) queue.put((i, temp_i, new_i))
def tree_distance(tree1, tree2):
    """Return the zss tree edit distance between ``tree1`` and ``tree2``.

    Insert and remove costs are ``strdist`` between the node's label and the
    empty string; the update cost is ``wordvec_dist`` between the two labels.
    """

    def insert_cost(node):
        return strdist('', Node.get_label(node))

    def remove_cost(node):
        return strdist(Node.get_label(node), '')

    def update_cost(a, b):
        return wordvec_dist(Node.get_label(a), Node.get_label(b))

    return zss.distance(
        tree1,
        tree2,
        Node.get_children,
        insert_cost=insert_cost,
        remove_cost=remove_cost,
        update_cost=update_cost,
    )
def main(args):
    """Print the tree edit distance between two XML files and log timings.

    Args:
        args: parsed command-line namespace. Reads ``args.metric`` (a key of
            ``supported_metrics``), ``args.true`` and ``args.prediction``
            (inputs accepted by ``etree.parse``).

    Raises:
        ValueError: if ``args.metric`` is not in ``supported_metrics``.

    Prints ``0`` and returns early when both arguments name the same input.
    Otherwise prints the cost of turning the prediction tree into the true
    tree under the selected metric class.
    """
    # time.clock() was removed in Python 3.8. Use perf_counter where it
    # exists and fall back to clock on interpreters that predate it.
    _clock = getattr(time, 'perf_counter', None) or time.clock

    logging.info('Starting main...')
    _start_time = _clock()

    if args.metric not in supported_metrics:
        raise ValueError('Metric {0} is not supported.\nSupported metrics:\n\t{1}'
                         ''.format(args.metric,
                                   '\n\t'.join(supported_metrics.keys())))
    metric_class = supported_metrics[args.metric]

    if args.true == args.prediction:
        print('0')
        return

    true = etree.parse(args.true).getroot()
    prediction = etree.parse(args.prediction).getroot()
    if args.metric == 'zss_Levenshtein':
        # Both trees must share one coder so equal notes get equal codes.
        coder = NoteContentCoder()
        true = encode_notes(true, coder)
        prediction = encode_notes(prediction, coder)

    # Preprocess trees: only retain relevant.
    #  - part-list
    #  - part
    # From those, filter:
    #  - midi-instrument
    #  - midi-device
    #  - print

    _parse_time = _clock()
    logging.info('Parsing done in: {0:.3f} s'.format(_parse_time - _start_time))

    # Argument order: "How much does it cost to turn prediction into the true tree?"
    dist = distance(prediction, true,
                    get_children=metric_class.get_children,
                    update_cost=metric_class.update,
                    insert_cost=metric_class.insert,
                    remove_cost=metric_class.remove)
    print('{0}'.format(dist))

    _end_time = _clock()
    _eval_time = _end_time - _parse_time

    # Logging timing:
    n_true_notes = len(list(true.iter('note')))
    n_pred_notes = len(list(prediction.iter('note')))
    # A tree without any <note> element must not crash the timing report.
    per_true_note = _eval_time / n_true_notes if n_true_notes else float('nan')
    per_pred_note = _eval_time / n_pred_notes if n_pred_notes else float('nan')
    logging.info('Timing:')
    logging.info('True notes: {0}, eval. took {1:.4f} s per true note.'
                 ''.format(n_true_notes, per_true_note))
    logging.info('Pred notes: {0}, eval. took {1:.4f} s per pred note.'
                 ''.format(n_pred_notes, per_pred_note))

    logging.info('musicxml_eval done in {0:.3f} s'.format(_end_time - _start_time))
def zss_edit_dist(first, second, insert_cost, remove_cost, update_cost):
    """Return the zss tree edit distance between ``first`` and ``second``.

    The trees' own ``get_children()`` method is used for traversal; its
    result is copied into a list before being handed to zss. The three cost
    callables are forwarded unchanged.
    """
    return zss.distance(
        first,
        second,
        get_children=lambda node: list(node.get_children()),
        insert_cost=insert_cost,
        remove_cost=remove_cost,
        update_cost=update_cost,
    )
def get_distance(node1: ASTWrapper, node2: ASTWrapper, return_operations: bool = False) -> int:
    """Return the zss tree edit distance between ``node1`` and ``node2``.

    Uses the module-level ``get_children``, ``insert_cost``, ``delete_cost``
    and ``update_cost`` callables. ``return_operations`` is forwarded to
    ``zss.distance`` unchanged.

    NOTE(review): with ``return_operations=True`` zss presumably returns a
    ``(distance, operations)`` pair rather than the annotated ``int`` —
    confirm against the zss documentation.
    """
    cost_functions = (insert_cost, delete_cost, update_cost)
    return zss.distance(
        node1,
        node2,
        get_children,
        *cost_functions,
        return_operations=return_operations
    )
def normalized_distance(A, B, get_children, alpha=1, get_tree_size=None,
                        return_operations=False, **kwargs):
    """Return the edit distance between ``A`` and ``B``, normalised by tree size.

    Args:
        A, B: root nodes of the two trees.
        get_children: callable returning the children of a node.
        alpha: forwarded to ``_compute_normalized_distance``.
        get_tree_size: callable returning the size of a tree given its root.
            Defaults to ``default_tree_size`` bound to ``get_children``.
        return_operations: when true, also return the edit operations
            reported by ``distance``.
        **kwargs: forwarded to ``distance`` (e.g. the cost functions).

    Returns:
        The normalised distance, or ``(normalised_distance, operations)``
        when ``return_operations`` is true.
    """
    if get_tree_size is None:
        get_tree_size = functools.partial(default_tree_size,
                                          get_children=get_children)

    if return_operations:
        # The flag must reach distance(); without it a bare number comes
        # back and the two-value unpacking below cannot succeed.
        edit_distance, operations = distance(
            A, B, get_children, return_operations=True, **kwargs)
    else:
        edit_distance = distance(A, B, get_children, **kwargs)

    d = _compute_normalized_distance(edit_distance, alpha,
                                     get_tree_size(A), get_tree_size(B))
    if return_operations:
        return d, operations
    return d
def zss_edit_dist(first, second, insert_cost, remove_cost, update_cost):
    """Compute the zss tree edit distance with caller-supplied costs.

    Children are obtained from each node's ``get_children()`` method and
    copied into a fresh list before zss sees them.
    """

    def children_as_list(node):
        return [child for child in node.get_children()]

    cost_functions = {
        'insert_cost': insert_cost,
        'remove_cost': remove_cost,
        'update_cost': update_cost,
    }
    return zss.distance(
        first, second, get_children=children_as_list, **cost_functions)
def cal_edit_distance(Ta, Tb):
    '''
    Return the tree edit distance between trees Ta and Tb, using the
    traversal and cost functions defined on WeirdNode.
    '''
    node_callbacks = {
        'get_children': WeirdNode.get_children,
        'insert_cost': WeirdNode.insert_cost,
        'remove_cost': WeirdNode.remove_cost,
        'update_cost': WeirdNode.update_cost,
    }
    return distance(Ta, Tb, **node_callbacks)
def test_rich_api():
    """Verify that ``distance`` respects custom insert/remove/update costs."""

    def constant(value):
        # Build a cost function that ignores its node arguments.
        def cost(*_nodes):
            return value
        return cost

    insert_cost = constant(1)
    remove_cost = constant(1)
    small_update_cost = constant(1)
    large_update_cost = constant(3)
    no_insert_cost = constant(0)

    def measure(left, right, ins, upd):
        return distance(left, right, Node.get_children, ins, remove_cost, upd)

    A = Node('a')
    B = Node('b')

    # prefer update
    assert measure(A, B, insert_cost, small_update_cost) == 1

    # prefer insert/remove
    assert measure(A, B, insert_cost, large_update_cost) == 2

    C = Node('a', [Node('x')])
    paid = measure(A, C, insert_cost, small_update_cost)
    free = measure(A, C, no_insert_cost, small_update_cost)
    assert paid > free
# find_score(q_dependency_list, a_dependency_list) -> final_cost
#
# Scores answer dependency parses against question dependency parses.
# For each (q_dep, a_dep) pair it takes the tree edit distance between
# tree(a_dep).children[0] and tree(q_dep).children[0], divides it by the number
# of entities found in a_dep, then by (number of newly seen shared entities + 1),
# and appends the result to `costs`. `costs` is later divided by
# (len(common_entities) + 1). The final value is min(costs) plus a carried
# term accumulated over the reversed costs with decay factor k = 0.8.
# Progress is printed along the way (Python 2 print statements, ANSI colours).
#
# Line breaks and indentation are lost in this view, so the exact nesting of the
# `costs = [...]` rescaling relative to the two loops cannot be determined
# here — confirm against the formatted file.
# NOTE(review): float(len(a_entities)) is a divisor; an a_dep with no entities
#   would divide by zero.
# NOTE(review): min(costs) raises ValueError when no pair was scored.
def find_score(q_dependency_list, a_dependency_list): """Used to find the edit distance of the dependency trees generated for the answer and the question :type a_dependency_list: list :type q_dependency_list: list """ costs = [] for q_dep in q_dependency_list: print q_dep print '_____________________________________________________________________________________________________________' q_entities = set(find_entities(q_dep)) common_entities = set() for a_dep in a_dependency_list: a_entities = list(find_entities(a_dep)) print a_dep # Finding the edit distance # edit_distance = simple_distance(a_tree, q_tree) enhanced_distance = distance( tree(a_dep).children[0], tree(q_dep).children[0], get_children, insert_cost, remove_cost, update_cost) print '\033[94m', 'Enhanced distance :', enhanced_distance, '\033[0m' # Finding common entities common = q_entities.intersection(set(a_entities)) # Find new distance new_distance = enhanced_distance / float(len(a_entities)) print 'New distance :', new_distance # Finding entities which intersect with the question entities, but not in the common_entities new_entities = [x for x in common if x not in common_entities] print 'new entities :', len(new_entities) common_entities.update(new_entities) # new_distance is re-assigned new_distance /= float(len(new_entities) + 1) costs.append(new_distance) costs = [x / (len(common_entities) + 1) for x in costs] print 'costs :', costs min_cost = min(costs) k = 0.8 # Computing final_cost with k% carrying cost contributions of previous sentences carrying_cost = 0 for c in costs[::-1]: carrying_cost = k * carrying_cost + (c - min_cost) * (1 - k) final_cost = min_cost + carrying_cost print '\033[92m', 'final cost :', final_cost, '\033[0m' return final_cost
def get_distance(file_a, file_b):
    """Return the tree edit distance between two XML files.

    Each file is parsed and its root element is loaded under a fresh
    ``WeirdNode("a")`` via ``setNode``. If the annotated tree built from the
    second file has more than 200 nodes, the sentinel ``100000`` is returned
    without computing a distance. Only the second tree is size-checked.
    """
    root_a = ET.parse(file_a).getroot()
    A = WeirdNode("a")
    setNode(root_a, A)

    root_b = ET.parse(file_b).getroot()
    B = WeirdNode("a")
    setNode(root_b, B)

    annotated_b = AnnotatedTree(B, WeirdNode.get_children)
    too_large = len(annotated_b.nodes) > 200
    if too_large:
        return 100000

    return distance(
        A,
        B,
        WeirdNode.get_children,
        weird_insert_dist,
        weird_remove_dist,
        weird_update_dist,
    )
# find_score(q_dependency_list, a_dependency_list) -> final_cost
#
# Scores answer dependency parses against question dependency parses.
# For each (q_dep, a_dep) pair it takes the tree edit distance between
# tree(a_dep).children[0] and tree(q_dep).children[0], divides it by the number
# of entities found in a_dep, then by (number of newly seen shared entities + 1),
# and appends the result to `costs`. `costs` is later divided by
# (len(common_entities) + 1). The final value is min(costs) plus a carried
# term accumulated over the reversed costs with decay factor k = 0.8.
# Progress is printed along the way (Python 2 print statements, ANSI colours).
#
# Line breaks and indentation are lost in this view, so the exact nesting of the
# `costs = [...]` rescaling relative to the two loops cannot be determined
# here — confirm against the formatted file.
# NOTE(review): float(len(a_entities)) is a divisor; an a_dep with no entities
#   would divide by zero.
# NOTE(review): min(costs) raises ValueError when no pair was scored.
def find_score(q_dependency_list, a_dependency_list): """Used to find the edit distance of the dependency trees generated for the answer and the question :type a_dependency_list: list :type q_dependency_list: list """ costs = [] for q_dep in q_dependency_list: print q_dep print'_____________________________________________________________________________________________________________' q_entities = set(find_entities(q_dep)) common_entities = set() for a_dep in a_dependency_list: a_entities = list(find_entities(a_dep)) print a_dep # Finding the edit distance # edit_distance = simple_distance(a_tree, q_tree) enhanced_distance = distance(tree(a_dep).children[0], tree(q_dep).children[0], get_children, insert_cost, remove_cost, update_cost) print '\033[94m', 'Enhanced distance :', enhanced_distance, '\033[0m' # Finding common entities common = q_entities.intersection(set(a_entities)) # Find new distance new_distance = enhanced_distance / float(len(a_entities)) print 'New distance :', new_distance # Finding entities which intersect with the question entities, but not in the common_entities new_entities = [x for x in common if x not in common_entities] print 'new entities :', len(new_entities) common_entities.update(new_entities) # new_distance is re-assigned new_distance /= float(len(new_entities) + 1) costs.append(new_distance) costs = [x / (len(common_entities) + 1) for x in costs] print 'costs :', costs min_cost = min(costs) k = 0.8 # Computing final_cost with k% carrying cost contributions of previous sentences carrying_cost = 0 for c in costs[::-1]: carrying_cost = k * carrying_cost + (c - min_cost) * (1 - k) final_cost = min_cost + carrying_cost print '\033[92m', 'final cost :', final_cost, '\033[0m' return final_cost
def _zhangshasha(
    tree1: Tree,
    tree2: Tree,
    children: Callable[[Tree], Iterable[Tree]],
    insert_cost: Callable[[Tree], float] = constantfunc(1),
    delete_cost: Callable[[Tree], float] = constantfunc(1),
    rename_cost: Callable[[Tree, Tree], float] = lambda x, y: int(x != y),
) -> float:
    """Return the zss tree edit distance between ``tree1`` and ``tree2``.

    ``children`` may return any iterable of subtrees; it is materialised
    into a list for every node before being handed to zss. ``delete_cost``
    and ``rename_cost`` map onto zss's ``remove_cost`` and ``update_cost``.
    By default a rename costs 1 when the two nodes compare unequal and 0
    otherwise.
    """

    def children_as_list(node):
        return list(children(node))

    cost_functions = dict(
        insert_cost=insert_cost,
        remove_cost=delete_cost,
        update_cost=rename_cost,
    )
    return zss.distance(
        tree1, tree2, get_children=children_as_list, **cost_functions
    )  # type: ignore
def pageSimilarity(page1, page2):
    '''
    Tree edit distance between the block trees of two pages.
    Each page file is parsed into a BlockTree, converted to zss nodes and
    compared with the Zhang-Shasha algorithm
    (https://github.com/timtadh/zhang-shasha/).
    Insert, remove and update all cost 1; the update cost does not look at
    the node labels.
    @param page1: {String} the first page's file path
    @param page2: {String} the second page's file path
    @return: {Integer} page similarity
    '''
    def to_zss(block):
        # Mirror a block subtree as zss.Node objects labelled with block.info.
        node = zss.Node(block.info)
        for child in block.children:
            node.addkid(to_zss(child))
        return node

    def insert_cost(node):
        return 1

    def remove_cost(node):
        return 1

    def update_cost(a, b):
        return 1

    first_tree = to_zss(BlockTree.parseBlockTreeFromFile(page1).root)
    second_tree = to_zss(BlockTree.parseBlockTreeFromFile(page2).root)
    return zss.distance(
        first_tree,
        second_tree,
        zss.Node.get_children,
        insert_cost,
        remove_cost,
        update_cost,
    )
def cosine_tree_distance(self, pair, name=False):
    """
    Compute the tree edit distance between the two sentences of `pair`,
    where replacing one word by another costs the cosine distance stored
    in `self.cosine_distances`. Inserting or removing a node costs 1.

    :param pair: object exposing `annotated_t` and `annotated_h`, each with
        a `root` node and a `tokens` sequence
    :param name: when true, return only the names of the three features
    :return: a tuple (distance, distance / number of tokens in T,
        distance / number of tokens in H), or the tuple of feature names
        when `name` is true
    """
    if name:
        return ('Cosine TED', 'Cosine TED / Length T', 'Cosine TED / Length H')

    def children_of(node):
        return node.dependents

    def unit_cost(node):
        return 1

    # TODO: different update costs for stopwords?
    def embedding_cost(node1, node2):
        # -1 because conll id's start from 1
        row = node1.id - 1
        col = node2.id - 1
        return self.cosine_distances[row, col]

    sentence_t = pair.annotated_t
    sentence_h = pair.annotated_h
    root_t, root_h = sentence_t.root, sentence_h.root
    length_t = len(sentence_t.tokens)
    length_h = len(sentence_h.tokens)

    ted = zss.distance(root_t, root_h, children_of,
                       unit_cost, unit_cost, embedding_cost)
    return ted, ted / length_t, ted / length_h
# Command-line script body (fragment: `parser` is created above this view).
# Registers the positional `source` and `target` corpus files, the optional
# `-writetrees` output file and the `-binary` flag, then builds one tree per
# corpus with build_tree().
# With -writetrees, both trees are written in bracket notation and sys.exit()
# is called. Otherwise the tree edit distance is printed, using BinaryCosts
# when -binary is set and EditCost when it is not.
# NOTE(review): the EditCost call passes EditCost.remove_cost in both the insert
#   and the remove argument positions of distance() — confirm this is intended.
# NOTE(review): uses the Python 2 print statement.
parser.add_argument('source', type=argparse.FileType('r'), help='source corpus') parser.add_argument('target', type=argparse.FileType('r'), help='target corpus') parser.add_argument('-writetrees', type=argparse.FileType('w'), help='write trees in bracket notation') parser.add_argument('-binary', action='store_true') args = parser.parse_args(sys.argv[1:]) source_tree = build_tree(args.source) target_tree = build_tree(args.target) if args.writetrees: args.writetrees.write( fix_bracket_format(MyDOM.bracket_notation(source_tree))) args.writetrees.write("\n") args.writetrees.write( fix_bracket_format(MyDOM.bracket_notation(target_tree))) sys.exit() if args.binary: print distance(source_tree, target_tree, MyDOM.get_children, BinaryCosts.insert_delete_cost, BinaryCosts.insert_delete_cost, BinaryCosts.update_cost) else: print distance(source_tree, target_tree, MyDOM.get_children, EditCost.remove_cost, EditCost.remove_cost, EditCost.update_cost)
# Fragment: save_plots, gen_graph, mst_file, plot_dir, gt_nodes_dict, start_time
# and the cost callbacks are defined above this view.
# When save_plots is set, gen_graph is drawn with node labels and saved as a PNG
# in plot_dir, named after mst_file's base name (text before the first dot).
# The BFS tree of gen_graph rooted at "root" is then mirrored as zss.Node
# objects (one per graph node, children attached edge by edge) and compared
# with the ground-truth tree gt_nodes_dict['root'] using zss.distance.
# The distance and the seconds elapsed since start_time are printed.
# NOTE(review): line breaks are lost here; presumably only the four plotting
#   statements belong to the `if save_plots:` body — confirm.
# NOTE(review): the figure is created with figsize=(300, 300) and never closed
#   in the visible code — confirm this is intended if the code runs in a loop.
if save_plots: # Save the Plot: plt.figure(figsize=(300, 300)) nx.draw(gen_graph, with_labels=True) gen_file_name = mst_file.split("/")[-1].split(".")[0] plt.savefig(plot_dir + gen_file_name + ".png") gen_tree = nx.bfs_tree(gen_graph, source="root") gen_nodes_dict = {} for edge in gen_tree.edges(): if edge[0] not in gen_nodes_dict: gen_nodes_dict[edge[0]] = zss.Node(edge[0]) if edge[1] not in gen_nodes_dict: gen_nodes_dict[edge[1]] = zss.Node(edge[1]) gen_nodes_dict[edge[0]].addkid(gen_nodes_dict[edge[1]]) # Computing the Tree edit distance: tree_edit_distance = zss.distance(gt_nodes_dict['root'], gen_nodes_dict['root'], zss.Node.get_children, insert_cost=insertCost, remove_cost=removeCost, update_cost=updateCost) print("Tree Edit Distance = ", tree_edit_distance) # print(zss.simple_distance(gt_nodes_dict['root'], gen_nodes_dict['root'])) cur_time = time.time() seconds_elapsed = cur_time - start_time print("Seconds Elapsed = ", seconds_elapsed)
type=argparse.FileType('r'), help='source corpus') parser.add_argument('target', type=argparse.FileType('r'), help='target corpus') parser.add_argument('-writetrees', type=argparse.FileType('w'), help='write trees in bracket notation') parser.add_argument('-binary', action='store_true') args = parser.parse_args(sys.argv[1:]) source_tree = build_tree(args.source) target_tree = build_tree(args.target) if args.writetrees: args.writetrees.write( fix_bracket_format(MyDOM.bracket_notation(source_tree))) args.writetrees.write("\n") args.writetrees.write( fix_bracket_format(MyDOM.bracket_notation(target_tree))) sys.exit() if args.binary: print distance(source_tree, target_tree, MyDOM.get_children, BinaryCosts.insert_delete_cost, BinaryCosts.insert_delete_cost, BinaryCosts.update_cost) else: print distance(source_tree, target_tree, MyDOM.get_children, EditCost.remove_cost, EditCost.remove_cost, EditCost.update_cost)
# Fragment: the enclosing loop that produces tree, l, linenr and line, and the
# definitions of lines_en, lines_fr, args and debug, are above this view.
# Lines whose tree is None are skipped; the rest are collected as
# (linenr, tree, line) into lines_en when l == "en" and into lines_fr otherwise.
# Every English tree is then compared with every French tree (BinaryCosts when
# args.binary is set, EditCost otherwise). The candidates are sorted by
# distance, and the ten closest are written to args.outfile as one line:
# the 1-based English line number followed by tab-separated
# "<1-based French line number>:<distance>" entries.
# NOTE(review): the EditCost call passes EditCost.remove_cost in both the insert
#   and the remove argument positions of distance() — confirm this is intended.
# NOTE(review): `tree == None` would more safely be `tree is None`.
if tree == None: continue if l == "en": lines_en.append((linenr, tree, line)) else: lines_fr.append((linenr, tree, line)) for linenr_en, tree_en, line_en in lines_en: dists = [] for linenr_fr, tree_fr, line_fr in lines_fr: d = -1 if args.binary: d = distance(tree_en, tree_fr, MyDOM.get_children, BinaryCosts.insert_delete_cost, BinaryCosts.insert_delete_cost, BinaryCosts.update_cost) else: d = distance(tree_en, tree_fr, MyDOM.get_children, EditCost.remove_cost, EditCost.remove_cost, EditCost.update_cost) dists.append((d, linenr_fr, line_fr)) dists.sort() args.outfile.write("%d" % (linenr_en + 1)) for d, linenr_fr, line_fr in dists[:10]: debug(line_en, line_fr, d) args.outfile.write("\t%d:%f" % (linenr_fr + 1, d)) args.outfile.write("\n")
# compare_trees(tree_size, number_of_trees)
#
# Benchmark driver. Generates random binary tree pairs with
# create_random_binary_trees(), then loads them from
# 'examples/example_trees_size_<tree_size>.json' if that file exists.
# For each of the first min(len(tree_list), number_of_trees) pairs it:
#   * prints elapsed / estimated total time and progress;
#   * rebuilds tree_one and tree_two from their list form and, if missing,
#     stores an adapted copy of tree_one under 'one_adapted';
#   * stores the number of invalid edges under '#GRFRestr' if missing;
#   * for k in [1, 4, 16, 64], when the 'GRF<k>' entry is missing and
#     tree_size <= 32, builds and solves an LP and, on an "Optimal" status,
#     stores {"cost", "time", "time_creation"} under 'GRF<k>';
#   * for each entry of `keys` (ATED=0.5, CTED=0, STED=1) stores the zss
#     distance and its runtime for (tree_one, tree_two) under the key, and
#     for (tree_one_adapted, tree_two) under '<key>_a', if missing.
# Finally tree_list is written back to the same JSON file.
#
# Line breaks and indentation are lost in this view; the nesting described
# above is the most natural reading — confirm against the formatted file.
# NOTE(review): `solution` is built but never read in the visible code.
# NOTE(review): the '<key>_a' distance passes tree_one.get_children while
#   comparing tree_one_adapted — presumably the same method; confirm.
def compare_trees(tree_size, number_of_trees): print('Create instances') create_random_binary_trees(tree_size, number_of_trees) file_name = 'examples/example_trees_size_' + tree_size.__str__() + '.json' print('Instances created successfully!') print('Instances can be found in ' + file_name) if os.path.exists(file_name): with open(file_name) as tree_file: tree_list = json.load(tree_file) #Only compare with ated keys = {"ATED": 0.5, "CTED": 0, "STED": 1} size_start = time.time() for i in range(0, min(len(tree_list),number_of_trees)): #Loop output loop_time = time.time() j = i + 1 needed_time = loop_time - size_start estimation = needed_time / j * number_of_trees print("(" + str(timedelta(seconds=round(needed_time))) + " / " + str(timedelta(seconds=round(estimation))) + ") (" + str(j) + "/" + str(number_of_trees) + ") tree size: " + str(tree_size)) tree_one = create_binary_tree_from_list(tree_list[i]['one']) tree_two = create_binary_tree_from_list(tree_list[i]['two']) if ('one_adapted' not in tree_list[i]): tree_one_adapted = adapt_tree_one(tree_one, tree_two) tree_list[i]['one_adapted'] = tree_one_adapted.get_tree_list(tree_one_adapted) tree_one_adapted = create_binary_tree_from_list(tree_list[i]['one_adapted']) if ('#GRFRestr' not in tree_list[i]): I = compute_invalid_edges(tree_one.get_clusters(1), tree_two.get_clusters(1)) tree_list[i]['#GRFRestr'] = len(I) #Compute gRF distance with varying 'k' for k in [1,4,16,64]: key = 'GRF' + str(k) if (key not in tree_list[i] and tree_size <= 32): start = time.time() print( "k is " + str(k)) lpProblem = createLPproblem(tree_one, tree_two, k) lp = lpProblem.get("lp") time_creation = time.time() - start lp.solve() c1 = lpProblem.get("c1") c2 = lpProblem.get("c2") if LpStatus[lp.status] == "Optimal": end = time.time() varsdict = {} for v in lp.variables(): varsdict[v.name] = v.varValue gRF = 0 for m in range(0,len(c1)): gRF = gRF + 1 for l in range(0,len(c2)): kex = "x_" + str(m) + "_" + str(l) if (varsdict[kex] == 1.0): cup = 
[i for i in c1[m] if i in c2[l]] gRF = gRF - len(cup)/(len(c1[m]) + len(c2[l]) - len(cup)) for m in range(0,len(c2)): used = 0 for l in range(0,len(c1)): kex = "x_" + str(l) + "_" + str(m) if (varsdict[kex] == 1.0): used = 1 if used == 0: gRF = gRF + 1 solution = {'clusterOne': c1, 'clusterTwo': c2, 'vardsDict': json.dumps(varsdict)} tree_list[i]['GRF' + str(k)] = {"cost": gRF, "time": end - start, "time_creation": time_creation} #Compute all TEDs defined in variable 'keys' for key,k in keys.items(): if (key not in tree_list[i]): start = time.time() print(key) cost = zss.distance( tree_one, tree_two, tree_one.get_children,insert_cost_delta(k), remove_cost_delta(k), update_cost=lambda a, b: strdist(ExtendedNode.get_label(a), ExtendedNode.get_label(b))) end = time.time() tree_list[i][key] = {"cost": cost, "time": end - start} key2 = key + "_a" if (key2 not in tree_list[i]): start = time.time() print(key2) cost = zss.distance( tree_one_adapted, tree_two, tree_one.get_children,insert_cost_delta(k), remove_cost_delta(k), update_cost=lambda a, b: strdist(ExtendedNode.get_label(a), ExtendedNode.get_label(b))) end = time.time() tree_list[i][key2] = {"cost": cost, "time": end - start} with open(file_name, 'w') as outfile: json.dump(tree_list, outfile)