示例#1
0
def inner_node_match(node1, node2, match_final, f, t):
    """Try to match two inner nodes and record the pair in ``match_final``.

    Nodes with different labels are never matched. For equal labels the
    subtree similarity is the number of already matched leaf pairs shared by
    both subtrees divided by the larger leaf count. The nodes are matched
    when that similarity is at least 0.8, or when it exceeds ``t`` and the
    bigram similarity of the two node values exceeds ``f``.

    Args:
        node1: Inner node of the left tree (needs ``label``, ``value``, ``id``
            and ``matched``).
        node2: Inner node of the right tree (same attributes).
        match_final: List of ``(left_id, right_id, similarity)`` tuples. It is
            read to look up leaf matches and extended in place on success.
        f: Threshold for the value similarity.
        t: Threshold for the subtree similarity; replaced by 0.4 when either
            subtree has at most four leaves.

    Returns:
        None. On a match the pair is appended to ``match_final`` and
        ``matched`` is set to 1 on both nodes.
    """
    if node1.label != node2.label:
        return

    _, _, leaf_node_list1 = node_get(node1)
    _, _, leaf_node_list2 = node_get(node2)

    # Index the right leaves and the known matches once, instead of
    # rescanning both lists for every left leaf. Counts (not sets) are used
    # so that duplicates contribute exactly as in a pairwise scan.
    right_leaf_counts = {}
    for leaf in leaf_node_list2:
        right_leaf_counts[leaf.id] = right_leaf_counts.get(leaf.id, 0) + 1
    partners_by_left_id = {}
    for item in match_final:
        partners_by_left_id.setdefault(item[0], []).append(item[1])

    # Number of matched leaf pairs that live in both subtrees.
    common = 0
    for leaf in leaf_node_list1:
        if leaf.matched == 0:  # leaf has no partner, nothing to count
            continue
        for right_id in partners_by_left_id.get(leaf.id, ()):
            common += right_leaf_counts.get(right_id, 0)

    left_size = len(leaf_node_list1)
    right_size = len(leaf_node_list2)
    max_num = max(left_size, right_size)

    # Small subtrees use a more lenient subtree threshold.
    if left_size <= 4 or right_size <= 4:
        t = 0.4

    # Two subtrees without any leaves share nothing; avoid dividing by zero.
    sim_inner = common / max_num if max_num else 0.0
    sim_value = string_similarity_ngram(node1.value, node2.value, 2)

    # The common-leaves measure dominates: a very high subtree similarity is
    # accepted even when the value similarity stays below f.
    if sim_inner >= 0.8 or (sim_inner > t and sim_value > f):
        match_final.append((node1.id, node2.id, sim_value))
        node1.matched = 1
        node2.matched = 1
示例#2
0
def inner_node_match(node1, node2, match_final, f, t):
    """Match two inner nodes by label, shared matched leaves and value.

    Nothing happens unless both nodes carry the same label. Otherwise the
    subtree similarity is computed as (matched leaf pairs found in both
    subtrees) / (larger leaf count). The pair is accepted when that
    similarity is >= 0.8, or when it is > ``t`` while the bigram similarity
    of the node values is > ``f``.

    Args:
        node1: Inner node from the left tree.
        node2: Inner node from the right tree.
        match_final: List of ``(left_id, right_id, similarity)`` tuples; read
            for existing leaf matches and appended to in place on success.
        f: Value-similarity threshold.
        t: Subtree-similarity threshold; overridden with 0.4 when either
            subtree has four leaves or fewer.

    Returns:
        None. A successful match appends to ``match_final`` and sets
        ``matched = 1`` on both nodes.
    """
    if node1.label != node2.label:
        return

    _, _, leaf_node_list1 = node_get(node1)
    _, _, leaf_node_list2 = node_get(node2)

    # Build both lookups once; the pairwise scan they replace was cubic.
    # Occurrence counts keep the result identical when ids repeat.
    right_leaf_counts = {}
    for leaf in leaf_node_list2:
        right_leaf_counts[leaf.id] = right_leaf_counts.get(leaf.id, 0) + 1
    partners_by_left_id = {}
    for item in match_final:
        partners_by_left_id.setdefault(item[0], []).append(item[1])

    # Count matched leaf pairs whose two ends lie in the two subtrees.
    common = 0
    for leaf in leaf_node_list1:
        if leaf.matched == 0:  # unmatched leaf contributes nothing
            continue
        for right_id in partners_by_left_id.get(leaf.id, ()):
            common += right_leaf_counts.get(right_id, 0)

    left_size = len(leaf_node_list1)
    right_size = len(leaf_node_list2)
    max_num = max(left_size, right_size)

    # The subtree threshold adapts to the subtree size.
    if left_size <= 4 or right_size <= 4:
        t = 0.4

    # Guard the division: with no leaves on either side there is no overlap.
    sim_inner = common / max_num if max_num else 0.0
    sim_value = string_similarity_ngram(node1.value, node2.value, 2)

    # Subtree similarity carries more weight than value similarity, so a
    # very similar subtree is accepted regardless of the value threshold f.
    if sim_inner >= 0.8 or (sim_inner > t and sim_value > f):
        match_final.append((node1.id, node2.id, sim_value))
        node1.matched = 1
        node2.matched = 1
示例#3
0
def editscript_calculate(left_node_list, match_final, left_id_to_node,
                         right_id_to_node, right_node_list):
    """Compute the edit operations that turn the left tree (T1) into the right tree (T2).

    Args:
        left_node_list: Nodes of T1; the original notes say they are stored
            in breadth-first order.
        match_final: Iterable of tuples whose first two items are a T1 node
            id and its matched T2 node id.
        left_id_to_node: Mapping from T1 node id to node; extended in place
            with every node created by an INS operation.
        right_id_to_node: Mapping from T2 node id to node.
        right_node_list: Nodes of T2.

    Returns:
        A tuple ``(edit_script, change_information, change_information2)``:
        the textual operations, the ``('INS'|'UPD'|'DEL', node, parent)``
        records, and the ``('MOV', node, old_parent, new_parent)`` records.
    """
    # Matching in both directions; a later pair overrides an earlier one
    # that uses the same key.
    map_left_to_right = {pair[0]: pair[1] for pair in match_final}
    map_right_to_left = {pair[1]: pair[0] for pair in match_final}

    next_id = len(left_node_list)  # ids handed out to nodes inserted into T1
    edit_script = []
    change_information = []   # INS / UPD / DEL records
    change_information2 = []  # MOV records (need old and new parent)

    # For each T2 node x with parent y: w is x's partner in T1, z is y's.
    for right_node in right_node_list:
        x = right_node.id
        y = right_node.parent
        w = map_right_to_left.get(x)
        z = map_right_to_left.get(y)
        if x == 0:  # the head pointer is never edited
            continue
        if right_node.inserted != 0:  # already covered by an earlier INS
            continue

        x_node = right_id_to_node.get(x)
        z_node = left_id_to_node.get(z)

        if w is None:
            # x has no partner: it is new, so insert a copy below z in T1.
            new_node = Node(x_node.label, x_node.value)
            z_node.insertchild(new_node)
            new_node.id = next_id  # existing T1 ids stay untouched
            map_right_to_left[x] = new_node.id
            map_left_to_right[new_node.id] = x
            left_id_to_node[next_id] = new_node
            new_node.inserted = 1
            # Everything below x counts as inserted together with x.
            subtree_nodes, _, _ = node_get(x_node)
            for member in subtree_nodes:
                member.inserted = 1
            inserted_desc = str((x_node.label, x_node.value, new_node.id))
            parent_desc = str((z_node.label, z))
            # The position among z's children is not recorded.
            edit_script.append(
                'INS (' + inserted_desc + ', ' + parent_desc + ')')
            change_information.append(('INS', new_node, z_node))
            next_id += 1
            continue

        # x has a partner w in T1; v is w's parent.
        w_node = left_id_to_node.get(w)
        v_id = w_node.parent
        v_node = left_id_to_node.get(v_id)
        v_match = map_left_to_right.get(v_id)

        if w_node.value != x_node.value:
            # Same node, different value: UPD(w, value(x)).
            w_desc = str((w_node.label, w_node.value, w))
            edit_script.append('UPD (' + w_desc + ', ' + x_node.value + ')')
            change_information.append(('UPD', w_node, v_node))

        if v_match != y:
            # The parents are not partners, so w moved below z: MOV(w, z).
            w_desc = str((w_node.label, w_node.value, w))
            z_desc = str((z_node.label, z_node.id))
            edit_script.append('MOV (' + w_desc + ', ' + z_desc + ')')
            change_information2.append(('MOV', w_node, v_node, z_node))

    # Every T1 node left without a partner is deleted: DEL(w).
    for left_node in left_node_list:
        if (map_left_to_right.get(left_node.id) is not None
                or left_node.deleted != 0):
            continue
        left_node.deleted = 1
        # Deleting an inner node removes its whole subtree.
        subtree_nodes, _, _ = node_get(left_node)
        for member in subtree_nodes:
            member.deleted = 1
        removed_desc = str((left_node.label, left_node.value, left_node.id))
        edit_script.append('DEL(' + removed_desc + ')')
        change_information.append(
            ('DEL', left_node, left_id_to_node.get(left_node.parent)))

    return edit_script, change_information, change_information2
示例#4
0
def editscript_calculate(left_node_list, match_final, left_id_to_node, right_id_to_node, right_node_list):
    """Build the edit script that transforms tree T1 (left) into tree T2 (right).

    Args:
        left_node_list: Nodes of T1 (breadth-first order according to the
            original notes).
        match_final: Iterable of tuples; items 0 and 1 are a T1 node id and
            the id of its matched T2 node.
        left_id_to_node: Dict from T1 id to node; nodes created by INS are
            added to it in place.
        right_id_to_node: Dict from T2 id to node.
        right_node_list: Nodes of T2.

    Returns:
        ``(edit_script, change_information, change_information2)`` where
        ``edit_script`` holds the operation strings, ``change_information``
        the INS/UPD/DEL records ``(op, node, parent)`` and
        ``change_information2`` the MOV records
        ``('MOV', node, old_parent, new_parent)``.
    """
    # Two lookup tables derived from the matching; for repeated keys the
    # last pair wins.
    map_left_to_right = {entry[0]: entry[1] for entry in match_final}
    map_right_to_left = {entry[1]: entry[0] for entry in match_final}

    fresh_id = len(left_node_list)  # first id available for inserted nodes
    edit_script = []
    change_information = []   # records for INS, UPD and DEL
    change_information2 = []  # records for MOV (old and new parent needed)

    # x: T2 node id, y: id of x's parent, w / z: their partners in T1.
    for right_node in right_node_list:
        x = right_node.id
        y = right_node.parent
        w = map_right_to_left.get(x)
        z = map_right_to_left.get(y)
        if x == 0:  # skip the head pointer
            continue
        if right_node.inserted != 0:  # handled as part of an inserted subtree
            continue

        x_node = right_id_to_node.get(x)
        z_node = left_id_to_node.get(z)

        if w is None:
            # No partner for x: create INS(x, z) and apply it to T1.
            new_node = Node(x_node.label, x_node.value)
            z_node.insertchild(new_node)
            new_node.id = fresh_id  # ids of the original T1 nodes are kept
            map_right_to_left[x] = new_node.id
            map_left_to_right[new_node.id] = x
            left_id_to_node[fresh_id] = new_node
            new_node.inserted = 1
            # Mark the complete subtree rooted at x as inserted.
            covered_nodes, _, _ = node_get(x_node)
            for covered in covered_nodes:
                covered.inserted = 1
            child_text = str((x_node.label, x_node.value, new_node.id))
            parent_text = str((z_node.label, z))
            # Where exactly below z the node goes is not tracked.
            edit_script.append('INS (' + child_text + ', ' + parent_text + ')')
            change_information.append(('INS', new_node, z_node))
            fresh_id += 1
            continue

        # Partner w exists; v is the parent of w in T1.
        w_node = left_id_to_node.get(w)
        v_id = w_node.parent
        v_node = left_id_to_node.get(v_id)
        v_match = map_left_to_right.get(v_id)

        if w_node.value != x_node.value:
            # Values differ: UPD(w, value(x)).
            node_text = str((w_node.label, w_node.value, w))
            edit_script.append('UPD (' + node_text + ', ' + x_node.value + ')')
            change_information.append(('UPD', w_node, v_node))

        if v_match != y:
            # Parent of w is not matched to parent of x: MOV(w, z).
            node_text = str((w_node.label, w_node.value, w))
            target_text = str((z_node.label, z_node.id))
            edit_script.append('MOV (' + node_text + ', ' + target_text + ')')
            change_information2.append(('MOV', w_node, v_node, z_node))

    # T1 nodes without a partner in T2 become DEL operations.
    for left_node in left_node_list:
        if map_left_to_right.get(left_node.id) is not None or left_node.deleted != 0:
            continue
        left_node.deleted = 1
        # The whole subtree below a deleted node is flagged as deleted too.
        covered_nodes, _, _ = node_get(left_node)
        for covered in covered_nodes:
            covered.deleted = 1
        node_text = str((left_node.label, left_node.value, left_node.id))
        edit_script.append('DEL(' + node_text + ')')
        parent_of_deleted = left_id_to_node.get(left_node.parent)
        change_information.append(('DEL', left_node, parent_of_deleted))

    return edit_script, change_information, change_information2
def code_change_extraction(left_file, right_file):
    """Extract the changes between two source files and return their type distribution.

    Both files are parsed into intermediate trees, leaves and inner nodes are
    matched, an edit script is derived from the matching, and the resulting
    change types are counted.

    Args:
        left_file: Source file of the old version, passed to
            ``customast.parse_file``.
        right_file: Source file of the new version, passed to
            ``customast.parse_file``.

    Returns:
        A list with one entry per element of ``change_type_enum()``; entry
        ``i`` is the share of extracted changes whose type equals
        ``change_type_enum()[i]``. All entries are 0 when no change was found.
    """
    left_node_list, left_inner_node_list, left_leaf_node_list, left_id_to_node = \
        _build_annotated_tree(left_file, 'left_head')
    right_node_list, right_inner_node_list, right_leaf_node_list, right_id_to_node = \
        _build_annotated_tree(right_file, 'right_head')

    # Match the leaves first.
    match_temp = leaf_match(left_leaf_node_list, right_leaf_node_list, 0.6)
    match_final = best_match(match_temp)
    leaf_matched_set(left_leaf_node_list, right_leaf_node_list, match_final)

    # Match the inner nodes with a first-match strategy: every still
    # unmatched T1 inner node is paired with the first T2 inner node that
    # inner_node_match accepts.
    for node1 in left_inner_node_list:
        for node2 in right_inner_node_list:
            if node1.matched != 0:
                break  # node1 is settled; the remaining candidates are moot
            if node2.matched == 0:
                inner_node_match(node1, node2, match_final, 0.4, 0.6)

    # Force the head pointer, its virtual parent and the root to be matched.
    # NOTE(review): presumably id_set assigns id 0 to the head and id 1 to
    # the root - confirm against id_set.
    match_final.append((0, 0, 1.0))
    match_final.append(('head_parent', 'head_parent', 1.0))
    match_final.append((1, 1, 1.0))

    # Derive the edit operations that turn T1 into T2 from the matching.
    edit_script, change_information, change_information2 = \
        editscript_calculate(left_node_list, match_final, left_id_to_node, right_id_to_node, right_node_list)

    # Translate the edit operations into change types.
    change_type_list, scc_list, parent_entity_list, changed_entity_list = \
        changetype_generation(change_information, change_information2)

    # Distribution of the change types: slot i holds the relative frequency
    # of change_type[i]; everything stays 0 when nothing changed.
    change_type = change_type_enum()
    change_type_percentage = [0] * len(change_type)
    change_type_dict = changetype_statistic(change_type_list)
    total_changes = len(change_type_list)
    if total_changes:
        for index, name in enumerate(change_type):
            if name in change_type_dict:
                change_type_percentage[index] = change_type_dict[name] / total_changes

    return change_type_percentage


def _build_annotated_tree(source_file, head_value):
    """Parse one file into the intermediate tree and index its nodes.

    Returns ``(node_list, inner_node_list, leaf_node_list, id_to_node)`` for
    the tree rooted at the head pointer, after storing each node's parent id
    in its ``parent`` attribute.
    """
    ast_content = customast.parse_file(source_file)
    tree = Tree('头指针', head_value)
    root = Node('AstRoot', 'root')
    tree.linktohead(root)
    ast_process(ast_content, root)
    id_set(tree.head)
    child_to_parent = child_parent_information(tree.head)
    node_list, inner_node_list, leaf_node_list = node_get(tree.head)
    id_to_node = id_to_node_get(node_list)
    # Direct lookup per (child_id, parent_id) pair instead of scanning every
    # known id for each pair.
    for pair in child_to_parent:
        if pair[0] in id_to_node:
            id_to_node[pair[0]].parent = pair[1]
    return node_list, inner_node_list, leaf_node_list, id_to_node