Пример #1
0
    def __init__(self, base_path, origin_folder, embedding_folder, node_list, model, loss, max_time_num, model_folder="model"):
        """Store paths, node bookkeeping, the compute device, model and loss.

        Args:
            base_path: Root directory that every folder argument is joined onto.
            origin_folder: Sub-folder whose directory entries are listed and
                sorted into ``self.timestamp_list``.
            embedding_folder: Sub-folder handed to ``check_and_make_path``.
            node_list: Full node list; its length is stored as ``self.node_num``.
            model: Stored unchanged as ``self.model``.
            loss: Stored unchanged as ``self.loss``.
            max_time_num: Stored unchanged as ``self.max_time_num``.
            model_folder: Sub-folder handed to ``check_and_make_path``.

        Raises:
            OSError: If the origin folder cannot be listed.
        """
        # file paths
        self.base_path = base_path
        self.origin_base_path = os.path.abspath(os.path.join(base_path, origin_folder))
        self.embedding_base_path = os.path.abspath(os.path.join(base_path, embedding_folder))
        self.model_base_path = os.path.abspath(os.path.join(base_path, model_folder))

        self.full_node_list = node_list
        self.node_num = len(self.full_node_list)  # node num
        self.timestamp_list = sorted(os.listdir(self.origin_base_path))

        # cpu gpu
        if torch.cuda.is_available():
            print("GPU")
            # Canonical device string has no whitespace ("cuda:0"); the
            # previous "cuda: 0" is rejected by stricter torch.device parsers.
            device = torch.device("cuda:0")
        else:
            print("CPU")
            device = torch.device("cpu")
            # set_thread() is only invoked on the CPU path.
            self.set_thread()
        self.device = device

        self.model = model
        self.loss = loss
        self.max_time_num = max_time_num

        # Presumably creates the directories when missing -- helper is defined elsewhere.
        check_and_make_path(self.embedding_base_path)
        check_and_make_path(self.model_base_path)
Пример #2
0
def process_result(dataset, rep_num, method_list):
    """Merge per-repetition node-classification accuracy records.

    For every method, ``<method>_acc_record.csv`` is read from
    ``../../data/<dataset>/node_classification_res_<i>`` for each repetition
    ``i``; the accuracy columns are placed side by side, row-wise ``avg``,
    ``max`` and ``min`` columns are appended, and the table is written to
    ``../../data/<dataset>/node_classification_res``.

    Args:
        dataset: Dataset folder name under ``../../data/``.
        rep_num: Number of repetitions; repetition 0 is always read.
        method_list: Iterable of method names used as file-name prefixes.
    """
    data_root = '../../data/' + dataset
    merged_dir = os.path.join(data_root, 'node_classification_res')
    acc_columns = ['acc' + str(rep) for rep in range(rep_num)]

    for method in method_list:
        record_name = method + '_acc_record.csv'

        # Repetition 0 supplies the 'date' column plus the first accuracy column.
        first_record = os.path.join(data_root, 'node_classification_res_0', record_name)
        df_method = pd.read_csv(first_record, sep=',', header=0, names=['date', 'acc0'])

        for rep in range(1, rep_num):
            rep_record = os.path.join(
                data_root, 'node_classification_res_' + str(rep), record_name)
            df_rep = pd.read_csv(
                rep_record, sep=',', header=0, names=['date', 'acc' + str(rep)])
            # Only the accuracy column (position 1) is appended.
            df_method = pd.concat([df_method, df_rep.iloc[:, [1]]], axis=1)

        check_and_make_path(merged_dir)

        # Statistics are taken over the per-repetition columns only.
        per_run = df_method.loc[:, acc_columns]
        df_method['avg'] = per_run.mean(axis=1)
        df_method['max'] = per_run.max(axis=1)
        df_method['min'] = per_run.min(axis=1)

        df_method.to_csv(os.path.join(merged_dir, record_name), sep=',', index=False)
Пример #3
0
    def __init__(self, base_path, input_folder, output_folder, node_file):
        """Record input/output locations and load the full node list.

        Args:
            base_path: Root directory the other arguments are joined onto.
            input_folder: Sub-folder stored as ``self.input_base_path``.
            output_folder: Sub-folder stored as ``self.output_base_path``.
            node_file: CSV file under ``base_path``; read into a single
                column named ``node``.
        """
        self.base_path = base_path
        self.input_base_path = os.path.join(base_path, input_folder)
        self.output_base_path = os.path.join(base_path, output_folder)

        node_frame = pd.read_csv(os.path.join(base_path, node_file), names=['node'])
        self.full_node_list = node_frame['node'].tolist()
        self.node_num = len(self.full_node_list)

        # Helper is defined elsewhere; presumably creates missing directories.
        for folder in (self.input_base_path, self.output_base_path):
            check_and_make_path(folder)
Пример #4
0
    def __init__(self, base_path, origin_folder, core_folder, node_file):
        """Resolve the origin/core folders and load the full node list.

        Args:
            base_path: Root directory the other arguments are joined onto.
            origin_folder: Sub-folder stored (absolute) as ``self.origin_base_path``.
            core_folder: Sub-folder stored (absolute) as ``self.core_base_path``.
            node_file: CSV file under ``base_path``; read into a single
                column named ``node``.
        """
        def _absolute(name):
            # Join onto base_path, then normalise to an absolute path.
            return os.path.abspath(os.path.join(base_path, name))

        self.base_path = base_path
        self.origin_base_path = _absolute(origin_folder)
        self.core_base_path = _absolute(core_folder)

        node_frame = pd.read_csv(_absolute(node_file), names=['node'])
        self.full_node_list = node_frame['node'].tolist()
        self.node_num = len(self.full_node_list)

        # An existing core folder is not removed here; it is only passed to
        # check_and_make_path (defined elsewhere).
        check_and_make_path(self.core_base_path)
Пример #5
0
def get_graph_from_edges(file_path, node_file, output_node_dir, output_edge_dir, sep='\t', edge_num_list=None):
    """Write random edge-sampled subgraphs of increasing size.

    For the i-th requested size, that many edge rows are drawn without
    replacement from the edge file; the sampled rows go to
    ``<output_edge_dir>/<i>.csv`` and the distinct endpoints (``from_id`` /
    ``to_id``) go to ``<output_node_dir>/<i>.csv``, both tab-separated.

    Args:
        file_path: Edge file with a header row and ``from_id`` / ``to_id`` columns.
        node_file: Node file; it is still parsed, but its contents are not used.
        output_node_dir: Directory receiving the node lists.
        output_edge_dir: Directory receiving the edge lists.
        sep: Field separator of the edge file.
        edge_num_list: Sample sizes to draw. Defaults to
            ``[50, 100, 500, 1000, 5000, 10000, 70000]``.

    Raises:
        ValueError: From ``random.sample`` if a requested size exceeds the
            number of edges in the file.
    """
    import random

    if edge_num_list is None:
        edge_num_list = [50, 100, 500, 1000, 5000, 10000, 70000]

    df_edges = pd.read_csv(file_path, sep=sep, header=0)
    # Parsed only so that a missing or unreadable node file still fails here,
    # as it did before; the parsed contents are not needed.
    pd.read_csv(node_file, names=['node'])

    check_and_make_path(output_node_dir)
    check_and_make_path(output_edge_dir)

    # read_csv without index_col yields a 0..n-1 index, so these positions
    # double as row labels for .loc below.
    edge_idxs = range(df_edges.shape[0])
    for i, edge_num in enumerate(edge_num_list):
        sample_edge_idxs = random.sample(edge_idxs, edge_num)
        df_subgraph = df_edges.loc[sample_edge_idxs, :]
        endpoints = pd.concat([df_subgraph['from_id'], df_subgraph['to_id']], axis=0)
        df_nodes = pd.DataFrame(pd.unique(endpoints).tolist(), columns=['node'])
        out_name = str(i) + '.csv'
        df_nodes.to_csv(os.path.join(output_node_dir, out_name), sep='\t', index=False)
        df_subgraph.to_csv(os.path.join(output_edge_dir, out_name), sep='\t', index=False)
Пример #6
0
    def __init__(self,
                 base_path,
                 input_folder,
                 output_folder,
                 node_file,
                 label_file,
                 trans_label_file,
                 sep=' ',
                 test_ratio=0.1,
                 val_ratio=0.2):
        """Load nodes and labels, write an index-aligned label file, store split ratios.

        The label file is read as ``node`` / ``label`` string columns. Each
        node id is prefixed with ``'U'``, mapped to its position in the full
        node list, and the rows are reordered so that row ``k`` holds the
        label of node index ``k``. The reordered table is written
        tab-separated to ``trans_label_file``.

        Args:
            base_path: Root directory the file/folder arguments are joined onto.
            input_folder: Sub-folder stored as ``self.input_base_path``.
            output_folder: Sub-folder stored as ``self.output_base_path``.
            node_file: CSV with one node name per row.
            label_file: Label file with a header row and two columns.
            trans_label_file: Output file name for the index-aligned labels.
            sep: Field separator of ``label_file``.
            test_ratio: Stored as ``self.test_ratio``.
            val_ratio: Stored as ``self.val_ratio``.

        Raises:
            KeyError: If a (prefixed) label node is not in the node list, or
                some node index has no label row.
            AssertionError: If ``test_ratio + val_ratio`` is not below 1.0.
        """
        self.base_path = base_path
        self.input_base_path = os.path.join(base_path, input_folder)
        self.output_base_path = os.path.join(base_path, output_folder)

        nodes_set = pd.read_csv(os.path.join(base_path, node_file),
                                names=['node'])
        self.full_node_list = nodes_set['node'].tolist()
        self.node_num = len(self.full_node_list)

        # node name -> position in full_node_list
        node2idx_dict = {node: idx for idx, node in enumerate(self.full_node_list)}
        df_label = pd.read_csv(os.path.join(base_path, label_file),
                               sep=sep,
                               header=0,
                               names=['node', 'label'],
                               dtype=str)
        # Raw ids in the label file lack the 'U' prefix used in the node list.
        df_label['node'] = df_label['node'].apply(lambda x: 'U' + x)
        # np.int was only an alias of the builtin int and was removed in
        # NumPy 1.24; use the builtin directly.
        df_label['label'] = df_label['label'].apply(int)
        df_label['node'] = df_label['node'].apply(lambda x: node2idx_dict[x])
        # Re-index by node position, then select 0..node_num-1 in order so
        # the label list lines up with full_node_list.
        df_label.index = df_label['node'].tolist()
        df_label = df_label.loc[list(range(self.node_num)), :]
        self.label_list = df_label['label'].tolist()
        df_label.to_csv(os.path.join(base_path, trans_label_file),
                        sep='\t',
                        index=False)

        assert test_ratio + val_ratio < 1.0
        self.test_ratio = test_ratio
        self.val_ratio = val_ratio

        check_and_make_path(self.input_base_path)
        check_and_make_path(self.output_base_path)
Пример #7
0
    def __init__(self, base_path, origin_folder, walk_pair_folder, node_freq_folder,  node_file, walk_time=100, walk_length=5):
        """Resolve working folders, load the node list and store walk settings.

        Args:
            base_path: Root directory the other path arguments are joined onto.
            origin_folder: Sub-folder stored (absolute) as ``self.origin_base_path``.
            walk_pair_folder: Sub-folder stored (absolute) as ``self.walk_pair_base_path``.
            node_freq_folder: Sub-folder stored (absolute) as ``self.node_freq_base_path``.
            node_file: CSV file under ``base_path``; read into a single
                column named ``node``.
            walk_time: Stored as ``self.walk_time``.
            walk_length: Stored as ``self.walk_length``.
        """
        def _absolute(name):
            # Join onto base_path, then normalise to an absolute path.
            return os.path.abspath(os.path.join(base_path, name))

        self.base_path = base_path
        self.origin_base_path = _absolute(origin_folder)
        self.walk_pair_base_path = _absolute(walk_pair_folder)
        self.node_freq_base_path = _absolute(node_freq_folder)

        node_frame = pd.read_csv(_absolute(node_file), names=['node'])
        self.full_node_list = node_frame['node'].tolist()

        self.walk_time, self.walk_length = walk_time, walk_length

        # Existing output folders are not removed here; they are only passed
        # to check_and_make_path (defined elsewhere).
        for folder in (self.walk_pair_base_path, self.node_freq_base_path):
            check_and_make_path(folder)
Пример #8
0
def get_graph_from_nodes(file_path, node_file, output_node_dir, output_edge_dir, sep='\t'):
    """Write BFS-sampled subgraphs of increasing size, plus the full graph.

    For each target size in ``[50, 100, 500, 1000, 5000, 10000]`` a random
    start node is taken from the largest connected component and nodes are
    collected breadth-first. Expansion stops once at least the target number
    of nodes is collected or the frontier is exhausted; because every
    neighbour of the node being expanded is added, the sample can exceed the
    target. The induced subgraph's node list and weighted edge list are
    written as ``<i>.csv`` (tab-separated) into the two output directories.
    Finally the full node list and the original edge table are written under
    the next index.

    Graph nodes are used as integer positions into the full node list when
    the output is written.

    Args:
        file_path: Edge file with a header row.
        node_file: CSV with one node name per row.
        output_node_dir: Directory receiving the node lists.
        output_edge_dir: Directory receiving the edge lists.
        sep: Field separator of the edge file.
    """
    import random
    df_edges = pd.read_csv(file_path, sep=sep, header=0)
    nodes_set = pd.read_csv(node_file, names=['node'])
    full_node_list = nodes_set['node'].tolist()
    check_and_make_path(output_node_dir)
    check_and_make_path(output_edge_dir)
    nx_graph = get_nx_graph(file_path, full_node_list, sep=sep)
    node_num_list = [50, 100, 500, 1000, 5000, 10000]
    max_cc = max(nx.connected_components(nx_graph), key=len)
    node_list = list(max_cc)
    print(len(node_list))
    adj = nx_graph.adj
    for i, node_num in enumerate(node_num_list):
        start_node = random.sample(node_list, 1)[0]
        visited = {start_node}
        sample_list = [start_node]
        head = 0  # position of the next node to expand
        # Stop when enough nodes are collected OR nothing is left to expand.
        # The previous `front < cnt` test let the cursor run one past the end
        # of sample_list (IndexError) when the component was smaller than
        # node_num.
        while head < len(sample_list) and len(sample_list) < node_num:
            cur = sample_list[head]
            head += 1
            for neighbor in adj[cur]:
                if neighbor not in visited:
                    visited.add(neighbor)
                    sample_list.append(neighbor)
        nx_subgraph = nx_graph.subgraph(sample_list)
        edge_list = []
        df_nodes = pd.DataFrame([full_node_list[node_idx] for node_idx in sample_list], columns=['node'])
        df_nodes.to_csv(os.path.join(output_node_dir, str(i) + '.csv'), sep='\t', index=False)
        # Each adjacency entry yields one row, so an undirected edge appears
        # once per direction.
        for node, neighbors in nx_subgraph.adj.items():
            for neighbor, edge_attr in neighbors.items():
                edge_list.append([full_node_list[node], full_node_list[neighbor], edge_attr['weight']])
        edges_arr = np.array(edge_list)
        print('edges arr shape: ', edges_arr.shape[0])
        df_output = pd.DataFrame(edges_arr, columns=['from_id', 'to_id', 'weight'])
        df_output.to_csv(os.path.join(output_edge_dir, str(i) + '.csv'), sep='\t', index=False)
    # The final index holds the complete graph.
    df_nodes = pd.DataFrame(np.array(full_node_list), columns=['node'])
    df_nodes.to_csv(os.path.join(output_node_dir, str(len(node_num_list)) + '.csv'), sep='\t', index=False)
    df_edges.to_csv(os.path.join(output_edge_dir, str(len(node_num_list)) + '.csv'), sep='\t', index=False)
Пример #9
0
    def __init__(self,
                 base_path,
                 origin_folder,
                 embedding_folder,
                 lp_edge_folder,
                 output_folder,
                 node_file,
                 train_ratio=1.0,
                 test_ratio=1.0):
        """Record working folders and ratios, and load the full node list.

        Args:
            base_path: Root directory the other path arguments are joined onto.
            origin_folder: Sub-folder stored as ``self.origin_base_path``.
            embedding_folder: Sub-folder stored as ``self.embedding_base_path``.
            lp_edge_folder: Sub-folder stored as ``self.lp_edge_base_path``
                (not passed to ``check_and_make_path`` here).
            output_folder: Sub-folder stored as ``self.output_base_path``.
            node_file: CSV file under ``base_path``; read into a single
                column named ``node``.
            train_ratio: Stored as ``self.train_ratio``.
            test_ratio: Stored as ``self.test_ratio``.
        """
        def _under_base(name):
            return os.path.join(base_path, name)

        self.base_path = base_path
        self.origin_base_path = _under_base(origin_folder)
        self.embedding_base_path = _under_base(embedding_folder)
        self.lp_edge_base_path = _under_base(lp_edge_folder)
        self.output_base_path = _under_base(output_folder)
        self.train_ratio, self.test_ratio = train_ratio, test_ratio

        node_frame = pd.read_csv(_under_base(node_file), names=['node'])
        self.full_node_list = node_frame['node'].tolist()

        # Helper is defined elsewhere; presumably creates missing directories.
        for folder in (self.embedding_base_path,
                       self.origin_base_path,
                       self.output_base_path):
            check_and_make_path(folder)
Пример #10
0
    def __init__(self,
                 base_path,
                 input_folder,
                 output_folder,
                 node_file,
                 test_ratio=0.3,
                 val_ratio=0.2):
        """Record input/output folders, load the node list and store split ratios.

        Args:
            base_path: Root directory the other path arguments are joined onto.
            input_folder: Sub-folder stored as ``self.input_base_path``.
            output_folder: Sub-folder stored as ``self.output_base_path``.
            node_file: CSV file under ``base_path``; read into a single
                column named ``node``.
            test_ratio: Stored as ``self.test_ratio``.
            val_ratio: Stored as ``self.val_ratio``.

        Raises:
            AssertionError: If ``test_ratio + val_ratio`` is not below 1.0.
        """
        self.base_path = base_path
        self.input_base_path = os.path.join(base_path, input_folder)
        self.output_base_path = os.path.join(base_path, output_folder)

        node_csv = os.path.join(base_path, node_file)
        node_frame = pd.read_csv(node_csv, names=['node'])
        self.full_node_list = node_frame['node'].tolist()
        self.node_num = len(self.full_node_list)

        # The two ratios must leave a non-empty remainder.
        assert test_ratio + val_ratio < 1.0
        self.test_ratio, self.val_ratio = test_ratio, val_ratio

        for folder in (self.input_base_path, self.output_base_path):
            check_and_make_path(folder)
Пример #11
0
    def get_kcore_graph(self,
                        input_file,
                        output_dir,
                        core_list=None,
                        degree_list=None):
        """Save the adjacency matrix of every k-core of a graph as ``.npz``.

        The graph is loaded from ``input_file``; for each ``k`` from 1 up to
        the maximum core number, the k-core is extracted, node ids
        ``0 .. self.node_num - 1`` are added so the matrix covers every node,
        and the sparse adjacency matrix is written to
        ``<output_dir>/<signature>.npz`` where the signature comes from
        ``get_format_str(max_core_num).format(k)``.

        Args:
            input_file: Tab-separated edge file passed to ``get_nx_graph``.
            output_dir: Directory receiving the ``.npz`` files.
            core_list: Currently unused.
            degree_list: Currently unused.

        Raises:
            ValueError: If the graph has no nodes (``max`` of an empty sequence).
        """
        graph = get_nx_graph(input_file, self.full_node_list, sep='\t')
        core_num_dict = nx.core_number(graph)
        max_core_num = max(core_num_dict.values())
        print('max core num: ', max_core_num)
        check_and_make_path(output_dir)

        format_str = get_format_str(max_core_num)
        all_node_ids = np.arange(self.node_num)
        # NetworkX 3.0 removed to_scipy_sparse_matrix; prefer the array
        # variant when present and convert back to csr_matrix so the saved
        # object type matches what older versions produced.
        use_sparse_array = hasattr(nx, 'to_scipy_sparse_array')
        for i in range(1, max_core_num + 1):
            k_core_graph = nx.k_core(graph, k=i, core_number=core_num_dict)
            # Nodes outside the k-core are re-added as isolated nodes.
            k_core_graph.add_nodes_from(all_node_ids)
            # NOTE(review): no nodelist is passed, so row order follows the
            # graph's node insertion order -- confirm this matches what
            # consumers of the .npz files expect.
            if use_sparse_array:
                A = sp.csr_matrix(nx.to_scipy_sparse_array(k_core_graph))
            else:
                A = nx.to_scipy_sparse_matrix(k_core_graph)
            signature = format_str.format(i)
            sp.save_npz(os.path.join(output_dir, signature + ".npz"), A)
Пример #12
0
    def __init__(self, base_path, origin_folder, embedding_folder, equ_folder, output_folder, node_file):
        """Record working folders and load the full node list.

        Args:
            base_path: Root directory the other path arguments are joined onto.
            origin_folder: Sub-folder stored as ``self.origin_base_path``.
            embedding_folder: Sub-folder stored as ``self.embedding_base_path``.
            equ_folder: Sub-folder stored as ``self.equ_base_path``
                (not passed to ``check_and_make_path`` here).
            output_folder: Sub-folder stored as ``self.output_base_path``.
            node_file: CSV file under ``base_path``; read into a single
                column named ``node``.
        """
        self.base_path = base_path
        (self.origin_base_path,
         self.embedding_base_path,
         self.equ_base_path,
         self.output_base_path) = [
            os.path.join(base_path, folder)
            for folder in (origin_folder, embedding_folder, equ_folder, output_folder)
        ]

        node_frame = pd.read_csv(os.path.join(base_path, node_file), names=['node'])
        self.full_node_list = node_frame['node'].tolist()

        # Helper is defined elsewhere; presumably creates missing directories.
        check_and_make_path(self.embedding_base_path)
        check_and_make_path(self.origin_base_path)
        check_and_make_path(self.output_base_path)
Пример #13
0
    def __init__(self, base_path, origin_folder, embedding_folder,
                 nodeclas_folder, output_folder, node_file, trans_label_file):
        """Record working folders and load the node list and label list.

        Args:
            base_path: Root directory the other path arguments are joined onto.
            origin_folder: Sub-folder stored as ``self.origin_base_path``.
            embedding_folder: Sub-folder stored as ``self.embedding_base_path``.
            nodeclas_folder: Sub-folder stored as ``self.nodeclas_base_path``
                (not passed to ``check_and_make_path`` here).
            output_folder: Sub-folder stored as ``self.output_base_path``.
            node_file: CSV file under ``base_path``; read into a single
                column named ``node``.
            trans_label_file: Tab-separated file under ``base_path`` with a
                ``label`` column.
        """
        def _under_base(name):
            return os.path.join(base_path, name)

        self.base_path = base_path
        self.origin_base_path = _under_base(origin_folder)
        self.embedding_base_path = _under_base(embedding_folder)
        self.nodeclas_base_path = _under_base(nodeclas_folder)
        self.output_base_path = _under_base(output_folder)

        node_frame = pd.read_csv(_under_base(node_file), names=['node'])
        self.full_node_list = node_frame['node'].tolist()

        label_frame = pd.read_csv(_under_base(trans_label_file), sep='\t')
        self.label_list = label_frame['label'].tolist()

        # Helper is defined elsewhere; presumably creates missing directories.
        for folder in (self.embedding_base_path,
                       self.origin_base_path,
                       self.output_base_path):
            check_and_make_path(folder)
Пример #14
0
def process_result(dataset, rep_num, method_list):
    """Merge per-repetition link-prediction AUC records, one table per metric.

    For every method, ``<method>_auc_record.csv`` is read from
    ``../../data/<dataset>/link_prediction_res_<i>`` for each repetition
    ``i``. The four metric columns (avg, had, l1, l2) are split into separate
    tables with the repetitions side by side; row-wise ``avg``, ``max`` and
    ``min`` columns are appended and each table is written to
    ``../../data/<dataset>/link_prediction_res/<method>_<metric>_record.csv``.

    Args:
        dataset: Dataset folder name under ``../../data/``.
        rep_num: Number of repetitions; repetition 0 is always read.
        method_list: Iterable of method names used as file-name prefixes.
    """
    data_root = '../../data/' + dataset
    output_base_path = os.path.join(data_root, 'link_prediction_res')
    # (file tag, column-name prefix), in the order the outputs are written.
    metrics = [('avg', 'avg'), ('had', 'had'), ('l1', 'l1_'), ('l2', 'l2_')]
    prefixes = [prefix for _, prefix in metrics]

    def _read_rep(method, rep):
        # Load one repetition's record with repetition-suffixed column names.
        rep_dir = os.path.join(data_root, 'link_prediction_res_' + str(rep))
        return pd.read_csv(os.path.join(rep_dir, method + '_auc_record.csv'),
                           sep=',',
                           header=0,
                           names=['date'] + [prefix + str(rep) for prefix in prefixes])

    for method in method_list:
        # Repetition 0 seeds each metric table with 'date' plus its own column.
        df_first = _read_rep(method, 0)
        tables = {
            prefix: df_first.loc[:, ['date', prefix + '0']].copy()
            for prefix in prefixes
        }

        for rep in range(1, rep_num):
            df_rep = _read_rep(method, rep)
            for prefix in prefixes:
                tables[prefix] = pd.concat(
                    [tables[prefix], df_rep.loc[:, [prefix + str(rep)]]], axis=1)

        check_and_make_path(output_base_path)

        for tag, prefix in metrics:
            table = tables[prefix]
            run_columns = [prefix + str(rep) for rep in range(rep_num)]
            # Statistics are taken over the per-repetition columns only.
            per_run = table.loc[:, run_columns]
            table['avg'] = per_run.mean(axis=1)
            table['max'] = per_run.max(axis=1)
            table['min'] = per_run.min(axis=1)
            output_path = os.path.join(output_base_path,
                                       method + '_' + tag + '_record.csv')
            table.to_csv(output_path, sep=',', index=False)