def threshold_iterator(threshold, data_path="data/RTE2_dev.preprocessed.xml"):
    """
    Compute normalized tree edit distances for every text/hypothesis pair of
    a preprocessed XML file and pass them to syntax_matching.

    The module-level ``idf_dict`` is rebuilt from the loaded pairs before any
    distance is computed (presumably ``idf_cost`` reads it -- confirm against
    its definition).

    Parameters:
        threshold -- the threshold handed to syntax_matching.  The sentinel
                     value -1 requests a sweep instead: syntax_matching is
                     called 200 times with thresholds 1.0, 0.995, ... down
                     to 0.005.
        data_path -- path of the preprocessed XML file to load.  Defaults to
                     the development set that used to be hard-coded.

    Returns nothing; the results are whatever syntax_matching does with the
    pair attributes, the distances and the threshold.
    """
    global idf_dict

    dom_doc = xml_util.get_dom_from_xml(data_path)
    pairs = xml_util.get_pairs(dom_doc)
    pair_attributes = xml_util.get_attributes_from_preprocessed_pair_nodes(pairs)
    idf_dict = calculate_idf_dictionary(pair_attributes)
    # Single-argument print(...) behaves the same as a Python 2 print statement.
    print(len(idf_dict))

    # Extract the minipar values of the text and hypothesis node of each pair.
    # Each attribute tuple has five fields; only the first two are used here.
    tree_value_pairs = []
    for t, h, _id_num, _e, _ta in pair_attributes:
        t_values = xml_util.get_minipar_values_from_text_node(t)
        h_values = xml_util.get_minipar_values_from_text_node(h)
        tree_value_pairs.append((t_values, h_values))

    # One normalized distance per pair, in the same order as pair_attributes.
    distances = []
    for value_pair in tree_value_pairs:
        t_tree, h_tree = build_tree(value_pair)
        distances.append(_normalized_tree_distance(t_tree, h_tree))

    if threshold == -1:
        # Sweep mode: try every candidate threshold from 1.0 downwards.
        for step in range(200):
            candidate = 1.0 - (0.005 * step)
            syntax_matching(pair_attributes, distances, candidate)
    else:
        syntax_matching(pair_attributes, distances, threshold)


def _normalized_tree_distance(t_tree, h_tree):
    """
    Return the edit distance from t_tree to h_tree divided by the edit
    distance from a bare "root" node to h_tree.

    When the normalizer is zero the ratio is undefined; instead of raising
    ZeroDivisionError this returns 0.0 if the raw distance is also zero and
    float("inf") otherwise, so such a pair can never fall below a finite
    threshold.
    """
    dist = tree_edit_dist.distance(t_tree, h_tree, idf_cost)
    normalizer = tree_edit_dist.distance(tree_edit_dist.Node("root"), h_tree, idf_cost)
    if not normalizer:
        return 0.0 if not dist else float("inf")
    return float(dist) / float(normalizer)
# NOTE(review): `soup`, `text` and `hypo` are initialised outside the visible
# part of the file; this fragment only finishes processing them.

# For every "hypothesis" match, store the list of its nested "node" matches.
for node in soup.findAll("hypothesis"):
    hypo.append(node.findAll("node"))
# NOTE(review): `sentence` is not read anywhere in the visible code.
sentence = soup.findAll("node")
# Only the first entry of each list is converted; any further entries are ignored.
text = make_nodes(text[0])
hypo = make_nodes(hypo[0])
text = process_node(text)
hypo = process_node(hypo)
text_tree = make_tree(text)
hypo_tree = make_tree(hypo)
print hypo_tree
# Distance between the two trees, called without an explicit cost argument.
res = tree_edit_dist.distance(text_tree, hypo_tree)
print "distance =", res

# Disabled experiments with hand-written trees, kept as found:
##t1 = {"c":["b"],"d":["a","c"],"f":["d","e"], "b":[], "e":[], "a":[]}
##t2 = {"c":["d"],"d":["a","b"],"f":["c","e"], "b":[], "e":[], "a":[]}
##test_tree1 = recur_tree(t1["f"], t1, "f", [])
##test_tree2 = recur_tree(t2["f"], t2, "f", [])
#gg = recur_tree(d["Root"], d, "Root", [])
#print gg
#print test_tree1
#print test_tree2
#print gg
#for i in gg:
#    print i[0],
#    for j in i[1]: