예제 #1
0
def inner_node_match(node1, node2, match_final, f, t):
    """Decide whether two inner nodes match and record the result in place.

    Nodes are only compared when their labels are equal. Two scores are used:

    * ``sim_inner`` -- the number of already-recorded matches in
      ``match_final`` that link a matched leaf under ``node1`` to a leaf under
      ``node2``, divided by the larger of the two leaf counts.
    * ``sim_value`` -- the result of
      ``string_similarity_ngram(node1.value, node2.value, 2)``.

    The nodes are matched when ``sim_inner >= 0.8``, or when
    ``sim_inner > t`` and ``sim_value > f``. On a match the tuple
    ``(node1.id, node2.id, sim_value)`` is appended to ``match_final`` and
    ``matched`` is set to 1 on both nodes.

    Args:
        node1: Root of the first subtree (reads ``label``, ``value``, ``id``).
        node2: Root of the second subtree (same attributes).
        match_final: List of match tuples whose first two items are node ids;
            it is read to find leaf matches and appended to on success.
        f: Threshold for the value similarity.
        t: Threshold for the leaf-overlap ratio. It is replaced by 0.4 when
            either subtree has at most 4 leaves.

    Returns:
        None. The outcome is reported through ``match_final`` and the
        ``matched`` attribute of the two nodes.
    """
    if node1.label != node2.label:
        return

    # node_get's result is unpacked as (all nodes, inner nodes, leaf nodes);
    # only the leaf lists are needed here.
    _, _, leaves1 = node_get(node1)
    _, _, leaves2 = node_get(node2)

    max_num = max(len(leaves1), len(leaves2))
    if max_num == 0:
        # No leaves on either side: the overlap ratio would be 0/0. Such a
        # pair could never pass the thresholds below (t would be 0.4 and the
        # ratio 0), so return instead of raising ZeroDivisionError.
        return

    # Count the matched leaves shared by the two subtrees. list.count compares
    # with ==, exactly like an explicit scan over the right-hand leaves.
    right_leaf_ids = [leaf.id for leaf in leaves2]
    common = 0
    for leaf in leaves1:
        if leaf.matched == 0:  # this leaf has no partner, skip it
            continue
        left_id = leaf.id
        for item in match_final:
            if item[0] == left_id:
                common += right_leaf_ids.count(item[1])

    # The threshold t is adapted to the subtree size.
    if len(leaves1) <= 4 or len(leaves2) <= 4:
        t = 0.4

    sim_inner = common / max_num
    sim_value = string_similarity_ngram(node1.value, node2.value, 2)

    # The common-leaves score carries more weight than the value similarity:
    # even when the value similarity is below f, a subtree similarity of at
    # least 0.8 is enough to consider the two inner nodes a match.
    if sim_inner >= 0.8 or (sim_inner > t and sim_value > f):
        match_final.append((node1.id, node2.id, sim_value))
        node1.matched = 1
        node2.matched = 1
예제 #2
0
def inner_node_match(node1, node2, match_final, f, t):
    """Match two inner nodes based on shared matched leaves and value text.

    Nothing happens unless ``node1.label == node2.label``. Otherwise the
    function computes:

    * the leaf-overlap ratio: how many entries of ``match_final`` pair a
      matched leaf of ``node1``'s subtree with a leaf of ``node2``'s subtree,
      divided by the larger leaf count of the two subtrees;
    * the value similarity returned by
      ``string_similarity_ngram(node1.value, node2.value, 2)``.

    A match is recorded when the overlap ratio is at least 0.8, or when the
    ratio exceeds ``t`` and the value similarity exceeds ``f``.

    Args:
        node1: Root of the first subtree (reads ``label``, ``value``, ``id``).
        node2: Root of the second subtree (same attributes).
        match_final: List of match tuples ``(id1, id2, similarity)``; read to
            look up leaf matches and extended in place on success.
        f: Threshold applied to the value similarity.
        t: Threshold applied to the overlap ratio; forced to 0.4 when either
            subtree has at most 4 leaves.

    Returns:
        None. On a match, ``(node1.id, node2.id, sim_value)`` is appended to
        ``match_final`` and both nodes get ``matched = 1``.
    """
    if node1.label != node2.label:
        return

    # Only the third item of node_get's result (unpacked as the leaf list)
    # is used.
    _, _, left_leaves = node_get(node1)
    _, _, right_leaves = node_get(node2)

    left_count = len(left_leaves)
    right_count = len(right_leaves)
    max_num = max(left_count, right_count)
    if max_num == 0:
        # Guard against ZeroDivisionError: with no leaves on either side the
        # ratio is undefined, and a ratio of 0 could not satisfy either
        # acceptance rule below, so there is nothing to record.
        return

    # Number of matched leaves common to both subtrees. Every match entry
    # whose left id is a matched leaf of node1 contributes once per
    # right-hand leaf carrying the paired id.
    right_ids = [leaf.id for leaf in right_leaves]
    common = sum(
        right_ids.count(pair[1])
        for leaf in left_leaves
        if leaf.matched != 0  # leaves without a partner are skipped
        for pair in match_final
        if pair[0] == leaf.id
    )

    # The threshold t changes with the size of the subtrees.
    if left_count <= 4 or right_count <= 4:
        t = 0.4

    sim_inner = common / max_num
    sim_value = string_similarity_ngram(node1.value, node2.value, 2)

    # The common-leaves measure is weighted higher than the value similarity:
    # a very high subtree overlap is sufficient on its own, even if the value
    # similarity is below f.
    strong_overlap = sim_inner >= 0.8
    combined = sim_inner > t and sim_value > f
    if strong_overlap or combined:
        match_final.append((node1.id, node2.id, sim_value))
        node1.matched = 1
        node2.matched = 1
예제 #3
0
def leaf_match(left_leaf_node_list, right_leaf_node_list, f):
    """Collect candidate matches between two lists of leaf nodes.

    Every left leaf is compared with every right leaf that has the same
    ``label``. The pair is kept when
    ``string_similarity_ngram(left.value, right.value, 2)`` is strictly
    greater than ``f``.

    Args:
        left_leaf_node_list: Leaves of the first tree (each exposes
            ``label``, ``value`` and ``id``).
        right_leaf_node_list: Leaves of the second tree, same attributes.
        f: Similarity threshold a pair must exceed to be kept.

    Returns:
        A list of ``(left_id, right_id, similarity)`` tuples, ordered by left
        leaf and then by right leaf.
    """
    candidates = []
    for left in left_leaf_node_list:
        for right in right_leaf_node_list:
            if left.label != right.label:
                continue
            score = string_similarity_ngram(left.value, right.value, 2)
            if score > f:
                candidates.append((left.id, right.id, score))
    return candidates
예제 #4
0
def leaf_match(left_leaf_node_list, right_leaf_node_list, f):
    """Pair up leaves of two trees whose labels agree and whose values are similar.

    For each left leaf, the right leaves with an equal ``label`` are scored
    with ``string_similarity_ngram(left.value, right.value, 2)``; pairs whose
    score is strictly above ``f`` are returned.

    Args:
        left_leaf_node_list: Leaf nodes of the first tree (``label``,
            ``value`` and ``id`` are read).
        right_leaf_node_list: Leaf nodes of the second tree, same attributes.
        f: Threshold the similarity score has to exceed.

    Returns:
        List of ``(left_id, right_id, similarity)`` tuples in left-major
        order.
    """
    pairs = []
    for left_leaf in left_leaf_node_list:
        # Narrow down to the right-hand leaves that share this leaf's label.
        same_label = [
            right_leaf
            for right_leaf in right_leaf_node_list
            if left_leaf.label == right_leaf.label
        ]
        for right_leaf in same_label:
            similarity = string_similarity_ngram(
                left_leaf.value, right_leaf.value, 2)
            if similarity > f:
                pairs.append((left_leaf.id, right_leaf.id, similarity))
    return pairs