Пример #1
0
def read_node2vec_result(path):
    """Read a node2vec output file and attach each embedding to its node.

    The first line of the file is treated as a header (node count and
    embedding dimension) and skipped. Every following non-blank line is
    split on single spaces: the first field is the node's serial number,
    the remaining fields are stored, unconverted (as strings), in the
    matching node's ``embedded_vector`` attribute.

    After loading, a warning is printed once for every node whose
    ``embedded_vector`` does not have exactly 64 entries.

    Args:
        path: Path of the node2vec result file.

    Raises:
        KeyError: If a serial number in the file is not present in the
            serial-number-to-index mapping built from the node list.
        ValueError: If the first field of a data line is not an integer.
    """
    print('读入,node2vec的结果')
    node_list, _ = return_node_list_and_edge_list()
    serialNumber_listIndex_dict = nodeSerialNumber_listIndex_dict_generation(
        node_list)

    # The context manager guarantees the file is closed even when a line
    # fails to parse.
    with open(path, mode='r') as node2vec_result_file:
        # The first line contains: number of nodes, embedding dimension.
        next(node2vec_result_file, None)
        for line in node2vec_result_file:
            stripped = line.strip()
            if not stripped:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            serial_number, *vector = stripped.split(' ')
            node_index = serialNumber_listIndex_dict[int(serial_number)]
            node_list[node_index].embedded_vector = vector

    for node in node_list:
        if len(node.embedded_vector) != 64:
            print('node2vec读入有问题')
Пример #2
0
def read_node2vec_result(path):
    """Read a node2vec output file and attach each embedding to its node.

    The first line of the file is treated as a header (node count and
    embedding dimension) and skipped. Every following non-blank line is
    split on single spaces: the first field is the node's serial number,
    the remaining fields are stored, unconverted (as strings), in the
    matching node's ``embedded_vector`` attribute.

    Any node whose ``embedded_vector`` does not end up with exactly 64
    entries gets a vector of 64 zeros instead, and the number of such
    nodes is printed.

    Args:
        path: Path of the node2vec result file.

    Raises:
        KeyError: If a serial number in the file is not present in the
            serial-number-to-index mapping built from the node list.
        ValueError: If the first field of a data line is not an integer.
    """
    print('read node2vec result')
    node_list, _ = return_node_list_and_edge_list()
    serialNumber_listIndex_dict = nodeSerialNumber_listIndex_dict_generation(node_list)

    # The context manager guarantees the file is closed even when a line
    # fails to parse.
    with open(path, mode='r') as node2vec_result_file:
        # The first line contains: number of nodes, embedding dimension.
        next(node2vec_result_file, None)
        for line in node2vec_result_file:
            stripped = line.strip()
            if not stripped:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            serial_number, *vector = stripped.split(' ')
            node_index = serialNumber_listIndex_dict[int(serial_number)]
            node_list[node_index].embedded_vector = vector

    # Nodes without a full 64-entry embedding fall back to a zero vector.
    count_node_without_node2vecResult = 0
    for node in node_list:
        if len(node.embedded_vector) != 64:
            count_node_without_node2vecResult += 1
            node.embedded_vector = [0] * 64
    print(f'没有node2vec结果的节点数:{count_node_without_node2vecResult}')
Пример #3
0
def _try_delete_interaction(G, interaction_list, index_interaction,
                            list_lncRNA, list_protein,
                            dict_serialNumber_listIndex_lncRNA,
                            dict_serialNumber_listIndex_protein):
    """Try to drop one interaction without fragmenting the graph.

    Returns ``(graph, deleted)``: the graph to keep using (a modified copy
    when the deletion was accepted, the unchanged input otherwise) and
    whether the interaction was removed from ``interaction_list``.
    """
    interaction = interaction_list[index_interaction]
    # Locate the lncRNA and protein objects that this interaction joins.
    lncRNA = list_lncRNA[dict_serialNumber_listIndex_lncRNA[
        interaction.lncRNA.serial_number]]
    protein = list_protein[dict_serialNumber_listIndex_protein[
        interaction.protein.serial_number]]

    # Remove the edge on a copy and check how connectivity is affected.
    G_temp = G.copy()
    G_temp.remove_edge(lncRNA.serial_number, protein.serial_number)
    num_connected_components = nx.number_connected_components(G_temp)

    if num_connected_components == 1:
        # The graph is still a single connected component.
        isolated_node = None
    elif num_connected_components == 2 and len(lncRNA.interaction_list) == 1:
        # The graph split in two and this was the lncRNA's only
        # interaction, so the lncRNA is dropped from the graph as well.
        isolated_node = lncRNA
    elif num_connected_components == 2 and len(protein.interaction_list) == 1:
        # Same situation, with the protein being the isolated node.
        isolated_node = protein
    else:
        # Deleting this edge would fragment the graph: reject it.
        return G, False

    delete_interaction_from_lncRNA_protein(lncRNA, protein)
    if isolated_node is not None:
        G_temp.remove_node(isolated_node.serial_number)
    del interaction_list[index_interaction]
    return G_temp, True


def reduce_dataset_mentainConnected(G, ratio, list_interaction,
                                    list_negativeInteraction, list_lncRNA,
                                    list_protein):
    """Shrink the positive and negative interaction lists to ``ratio``.

    Interactions are removed at random, alternating between a negative
    and a positive candidate (negative first). A candidate is removed
    only if the graph stays in one connected component after its edge is
    deleted, or if the only node split off is an lncRNA/protein whose
    ``interaction_list`` had exactly one entry (that node is then removed
    from the graph too). Rejected candidates are simply retried with a
    new random pick on a later iteration.

    Both interaction lists are shuffled and modified in place.
    Progress is printed every 100 loop iterations.

    NOTE(review): ``G`` is only rebound locally to modified copies and the
    function returns nothing, so the caller's graph object is left
    unchanged — confirm this is intended.
    NOTE(review): ``remove_edge`` is applied to negative interactions as
    well, so ``G`` presumably contains an edge for every negative sample —
    verify against the caller.
    NOTE(review): there is no bail-out; if no remaining edge can be
    deleted under the rules above the loop does not terminate.

    Args:
        G: Graph exposing ``copy``, ``remove_edge`` and ``remove_node`` and
            accepted by the ``nx`` connected-component functions; nodes
            are addressed by serial number.
        ratio: Fraction of each interaction list to keep.
        list_interaction: Positive interactions (mutated in place).
        list_negativeInteraction: Negative interactions (mutated in place).
        list_lncRNA: lncRNA objects referenced by the interactions.
        list_protein: Protein objects referenced by the interactions.
    """
    print(f'reduce the dataset to its {ratio}')
    # Work out how many samples have to be removed from each list.
    len_list_interaction = len(list_interaction)
    len_list_negativeInteraction = len(list_negativeInteraction)
    num_delete_interaction = int(
        len_list_interaction - (len_list_interaction * ratio))
    num_delete_negativeInteraction = int(
        len_list_negativeInteraction -
        (len_list_negativeInteraction * ratio))

    # Serial-number -> list-index maps for quick lncRNA / protein lookup.
    dict_serialNumber_listIndex_lncRNA = nodeSerialNumber_listIndex_dict_generation(
        list_lncRNA)
    dict_serialNumber_listIndex_protein = nodeSerialNumber_listIndex_dict_generation(
        list_protein)

    # Shuffle both lists.
    random.shuffle(list_interaction)
    random.shuffle(list_negativeInteraction)

    num_deleted_positive = 0
    num_deleted_negative = 0
    delete_positive = True
    count = 0
    while num_deleted_positive + num_deleted_negative < num_delete_interaction + num_delete_negativeInteraction:
        # Alternate between the negative and the positive list.
        delete_positive = not delete_positive
        if delete_positive and num_deleted_positive < num_delete_interaction:
            # Pick a positive sample to delete.
            index_interaction = random.sample(
                range(0, len_list_interaction - num_deleted_positive), 1)[0]
            G, deleted = _try_delete_interaction(
                G, list_interaction, index_interaction, list_lncRNA,
                list_protein, dict_serialNumber_listIndex_lncRNA,
                dict_serialNumber_listIndex_protein)
            if deleted:
                num_deleted_positive += 1
            # Reclaim the discarded graph copies.
            gc.collect()
        elif not delete_positive and num_deleted_negative < num_delete_negativeInteraction:
            # Pick a negative sample to delete.
            index_interaction = random.sample(
                range(0, len_list_negativeInteraction - num_deleted_negative),
                1)[0]
            G, deleted = _try_delete_interaction(
                G, list_negativeInteraction, index_interaction, list_lncRNA,
                list_protein, dict_serialNumber_listIndex_lncRNA,
                dict_serialNumber_listIndex_protein)
            if deleted:
                num_deleted_negative += 1
            # Reclaim the discarded graph copies.
            gc.collect()

        if count % 100 == 0:
            print(
                f'number of positive samples need to be deleted: {num_delete_interaction-num_deleted_positive},number of negative samples need to be deleted: {num_delete_negativeInteraction-num_deleted_negative}'
            )
        count += 1
    print('reduce process done')