示例#1
0
def threshold_iterator(threshold):
    """
    Score the RTE2 development pairs by normalized tree edit distance.

    Reads "data/RTE2_dev.preprocessed.xml", rebuilds the module-level
    ``idf_dict`` from the pair attributes, computes one normalized distance
    per text/hypothesis pair and passes the distances to ``syntax_matching``.

    threshold -- the threshold handed to ``syntax_matching``.  The special
                 value -1 requests a sweep instead: ``syntax_matching`` is
                 called for 200 thresholds, from 1.0 down to 0.005 in steps
                 of 0.005.

    Returns None.  Side effects: rebinds the global ``idf_dict`` and prints
    the number of entries it holds.
    """
    global idf_dict
    dom_doc = xml_util.get_dom_from_xml("data/RTE2_dev.preprocessed.xml")
    pairs = xml_util.get_pairs(dom_doc)
    pair_attributes = xml_util.get_attributes_from_preprocessed_pair_nodes(pairs)
    # Rebuilt before any distance is computed (idf_cost presumably reads
    # this global -- confirm against its definition).
    idf_dict = calculate_idf_dictionary(pair_attributes)
    print(len(idf_dict))

    # Extract the minipar values of the text and the hypothesis of each pair.
    # Every attribute record carries five fields; only the first two (text
    # node, hypothesis node) are needed here.
    tree_value_pairs = []
    for text_node, hypothesis_node, _, _, _ in pair_attributes:
        tree_value_pairs.append((
            xml_util.get_minipar_values_from_text_node(text_node),
            xml_util.get_minipar_values_from_text_node(hypothesis_node),
        ))

    # Distance between text and hypothesis, divided by the distance between
    # a lone "root" node and the hypothesis tree.
    distances = []
    for value_pair in tree_value_pairs:
        t_tree, h_tree = build_tree(value_pair)
        dist = tree_edit_dist.distance(t_tree, h_tree, idf_cost)
        normalizer = tree_edit_dist.distance(
            tree_edit_dist.Node("root"), h_tree, idf_cost)
        # NOTE(review): raises ZeroDivisionError if the normalizer is 0 --
        # confirm whether a hypothesis tree can yield a zero distance.
        distances.append(float(dist) / float(normalizer))

    if threshold == -1:
        # Sweep mode: evaluate every candidate without rebinding the argument.
        for step in range(200):
            syntax_matching(pair_attributes, distances, 1.0 - (0.005 * step))
    else:
        syntax_matching(pair_attributes, distances, threshold)
示例#2
0
# Script fragment: `soup`, `text` and `hypo` are expected to be defined earlier
# in the script (not visible here); `hypo` and `text` must be lists at this point.
# For every "hypothesis" element, store the list of its "node" descendants.
for node in soup.findAll("hypothesis"):
	hypo.append(node.findAll("node"))
	
# All "node" elements in the document; not used in the visible part of the script.
sentence = soup.findAll("node")
# Only the first entry of each list is processed; `text` and `hypo` are rebound
# from the lists of raw elements to the result of make_nodes.
text = make_nodes(text[0])
hypo = make_nodes(hypo[0])

text = process_node(text)
hypo = process_node(hypo)

text_tree = make_tree(text)
hypo_tree = make_tree(hypo)

# Debug output of the hypothesis tree.
print hypo_tree

# Edit distance between the text tree and the hypothesis tree; no cost function
# is passed here.
res = tree_edit_dist.distance(text_tree, hypo_tree)
print "distance =", res

##t1 = {"c":["b"],"d":["a","c"],"f":["d","e"], "b":[], "e":[], "a":[]} 
##t2 = {"c":["d"],"d":["a","b"],"f":["c","e"], "b":[], "e":[], "a":[]}
##test_tree1 = recur_tree(t1["f"], t1, "f", [])
##test_tree2 = recur_tree(t2["f"], t2, "f", [])
#gg = recur_tree(d["Root"], d, "Root", [])
#print gg
#print test_tree1
#print test_tree2

#print gg
#for i in gg:
#	print i[0],
#	for j in i[1]: