def get_graph_from_edges(file_path, node_file, output_node_dir, output_edge_dir, sep='\t'):
    """Write randomly edge-sampled subgraphs of an edge list to disk.

    The edge file is read with its first row as the header and must contain
    ``from_id`` and ``to_id`` columns. For each target size in a fixed list
    (50 ... 70000 edges) a random subset of rows is drawn without replacement;
    the sampled edges and the unique endpoints they touch are written as
    ``<i>.csv`` (tab-separated) into ``output_edge_dir`` and
    ``output_node_dir`` respectively, where ``i`` is the position of the size
    in the list.

    :param file_path: path of the edge file to sample from.
    :param node_file: unused; kept so existing callers keep working.
    :param output_node_dir: directory receiving the per-sample node files.
    :param output_edge_dir: directory receiving the per-sample edge files.
    :param sep: column separator of the input edge file.
    """
    import random

    df_edges = pd.read_csv(file_path, sep=sep, header=0)
    all_edge_num = df_edges.shape[0]
    check_and_make_path(output_node_dir)
    check_and_make_path(output_edge_dir)

    edge_num_list = [50, 100, 500, 1000, 5000, 10000, 70000]
    for i, edge_num in enumerate(edge_num_list):
        # random.sample raises ValueError when asked for more items than the
        # population holds, so never request more edges than the file has.
        sample_size = min(edge_num, all_edge_num)
        sample_edge_indices = random.sample(range(all_edge_num), sample_size)
        # The sampled values are row positions, so select by position.
        df_subgraph = df_edges.iloc[sample_edge_indices, :]
        endpoints = pd.concat([df_subgraph['from_id'], df_subgraph['to_id']], axis=0)
        node_list = pd.unique(endpoints).tolist()
        df_nodes = pd.DataFrame(node_list, columns=['node'])
        file_name = str(i) + '.csv'
        df_nodes.to_csv(os.path.join(output_node_dir, file_name), sep='\t', index=False)
        df_subgraph.to_csv(os.path.join(output_edge_dir, file_name), sep='\t', index=False)
def __init__(self, base_path, origin_folder, walk_pair_folder, node_freq_folder, node_file, walk_time=100, walk_length=5):
    """Store folder locations, the node list and the walk settings.

    All folder arguments and ``node_file`` are interpreted relative to
    ``base_path`` and stored as absolute paths. The node file is read as a
    single column named ``node`` (every row is treated as data). The walk-pair
    and node-frequency folders are created through ``check_and_make_path``.
    """
    def absolute(relative_name):
        # Resolve a name located under base_path to an absolute path.
        return os.path.abspath(os.path.join(base_path, relative_name))

    self.base_path = base_path
    self.origin_base_path = absolute(origin_folder)
    self.walk_pair_base_path = absolute(walk_pair_folder)
    self.node_freq_base_path = absolute(node_freq_folder)

    node_frame = pd.read_csv(absolute(node_file), names=['node'])
    self.full_node_list = node_frame['node'].tolist()

    self.walk_time = walk_time
    self.walk_length = walk_length

    for folder in (self.walk_pair_base_path, self.node_freq_base_path):
        check_and_make_path(folder)
def get_kcore_graph(self, input_file, output_dir, sep='\t', core_list=None, degree_list=None):
    """Save the adjacency matrix of every k-core of one graph file.

    The graph is loaded from ``input_file`` (under ``self.origin_base_path``)
    via ``get_nx_graph``. For each ``k`` from 1 to the maximum core number,
    the k-core is extracted, padded with every node of
    ``self.full_node_list`` and stored in ``output_dir`` as a sparse ``.npz``
    file named by ``get_format_str(max_core_num).format(k)``.

    ``core_list`` and ``degree_list`` are accepted but not used here.
    """
    graph_path = os.path.join(self.origin_base_path, input_file)
    graph = get_nx_graph(graph_path, self.full_node_list, sep=sep)

    core_num_dict = nx.core_number(graph)
    core_values = list(core_num_dict.values())
    print("unique core nums: ", len(np.unique(np.array(core_values))))
    max_core_num = max(core_values)
    print('file name: ', input_file, 'max core num: ', max_core_num)

    check_and_make_path(output_dir)
    format_str = get_format_str(max_core_num)
    for k in range(1, max_core_num + 1):
        k_core_graph = nx.k_core(graph, k=k, core_number=core_num_dict)
        k_core_graph.add_nodes_from(self.full_node_list)
        # The explicit nodelist is important: it fixes the row/column order of
        # the adjacency matrix to the full node list. Without it the matrix
        # layout would follow the k-core's own node order and differ per file.
        adjacency = nx.to_scipy_sparse_matrix(k_core_graph, nodelist=self.full_node_list)
        out_path = os.path.join(output_dir, format_str.format(k) + '.npz')
        sp.save_npz(out_path, adjacency)
def aggregate_results(base_path, lp_res_folder, start_idx, rep_num, method_list, measure_list):
    """Merge repeated link prediction results into per-measure summary files.

    For every method, ``<method>_auc_record.csv`` is read from each folder
    ``<lp_res_folder>_<i>`` with ``i`` in ``[start_idx, start_idx + rep_num)``.
    The header row of each file is replaced by ``date`` followed by
    ``<measure>_<i>`` for every measure. For each measure the repetition
    columns are placed side by side, ``avg``/``max``/``min`` columns are
    computed across them, and the result is written to
    ``<lp_res_folder>/<method>_<measure>_record.csv``.

    Does nothing when ``rep_num`` is not positive.
    """
    if rep_num <= 0:
        return

    def column_name(measure_name, rep_idx):
        return '_'.join((measure_name, str(rep_idx)))

    def load_repetition(method_name, rep_idx):
        # Read one repetition's record file with repetition-specific headers.
        rep_dir = os.path.join(base_path, '_'.join((lp_res_folder, str(rep_idx))))
        record_path = os.path.join(rep_dir, method_name + '_auc_record.csv')
        header = ['date'] + [column_name(name, rep_idx) for name in measure_list]
        return pd.read_csv(record_path, sep=',', header=0, names=header)

    rep_indices = range(start_idx, start_idx + rep_num)
    for method in method_list:
        # The first repetition supplies the 'date' column of every summary.
        first_df = load_repetition(method, start_idx)
        per_measure = {}
        for name in measure_list:
            per_measure[name] = first_df.loc[:, ['date', column_name(name, start_idx)]].copy()

        for rep_idx in rep_indices[1:]:
            rep_df = load_repetition(method, rep_idx)
            for name in measure_list:
                rep_column = rep_df.loc[:, [column_name(name, rep_idx)]]
                per_measure[name] = pd.concat([per_measure[name], rep_column], axis=1)

        out_dir = os.path.join(base_path, lp_res_folder)
        check_and_make_path(out_dir)
        for name in measure_list:
            rep_columns = [column_name(name, rep_idx) for rep_idx in rep_indices]
            summary = per_measure[name]
            summary['avg'] = summary.loc[:, rep_columns].mean(axis=1)
            summary['max'] = summary.loc[:, rep_columns].max(axis=1)
            summary['min'] = summary.loc[:, rep_columns].min(axis=1)
            out_path = os.path.join(out_dir, method + '_' + name + '_record.csv')
            summary.to_csv(out_path, sep=',', index=False)
def __init__(self, base_path, origin_folder, core_folder, node_file):
    """Store folder locations and load the full node list.

    ``origin_folder``, ``core_folder`` and ``node_file`` are interpreted
    relative to ``base_path`` and resolved to absolute paths. The node file is
    read as a single column named ``node`` (every row is treated as data).
    The core folder is created through ``check_and_make_path``.
    """
    def absolute(relative_name):
        # Resolve a name located under base_path to an absolute path.
        return os.path.abspath(os.path.join(base_path, relative_name))

    self.base_path = base_path
    self.origin_base_path = absolute(origin_folder)
    self.core_base_path = absolute(core_folder)

    node_frame = pd.read_csv(absolute(node_file), names=['node'])
    self.full_node_list = node_frame['node'].tolist()
    self.node_num = len(self.full_node_list)

    check_and_make_path(self.core_base_path)
def copy_labels():
    """Copy the node label file of each air-traffic dataset.

    For every dataset, ``labels.csv`` in the dataset's ``nodes_set`` folder is
    handed to ``copy_node_labels`` together with the dataset's ``nodes_label``
    folder, which is created first through ``check_and_make_path``.
    """
    label_file = 'labels.csv'
    # (input folder, output folder) pairs, processed in this order.
    dataset_dirs = (
        ('/data/america_air/nodes_set', '/data/america_air/nodes_label'),
        ('/data/europe_air/nodes_set', '/data/europe_air/nodes_label'),
    )
    for input_dir, output_dir in dataset_dirs:
        label_path = os.path.join(input_dir, label_file)
        check_and_make_path(output_dir)
        copy_node_labels(label_path, output_dir)
def __init__(self, base_path, input_folder, output_folder, node_file, file_sep='\t', alpha=0.5, iter_num=100):
    """Store folder locations, the node list and the iteration settings.

    ``input_folder``, ``output_folder`` and ``node_file`` are interpreted
    relative to ``base_path`` and resolved to absolute paths. The node file is
    read as a single column named ``node`` (every row is treated as data).
    ``alpha`` must lie strictly between 0 and 1; this is enforced with an
    ``assert``. Both folders are created through ``check_and_make_path``.
    """
    def absolute(relative_name):
        # Resolve a name located under base_path to an absolute path.
        return os.path.abspath(os.path.join(base_path, relative_name))

    self.base_path = base_path
    self.input_base_path = absolute(input_folder)
    self.output_base_path = absolute(output_folder)
    self.file_sep = file_sep

    node_frame = pd.read_csv(absolute(node_file), names=['node'])
    self.full_node_list = node_frame['node'].tolist()
    self.node_num = len(self.full_node_list)

    self.alpha = alpha
    self.iter_num = iter_num
    assert 0 < self.alpha < 1

    for folder in (self.input_base_path, self.output_base_path):
        check_and_make_path(folder)
def __init__(self, base_path, origin_folder, embedding_folder, node_list, model, loss, model_folder='model', file_sep='\t', has_cuda=False):
    """Store paths, the model/loss objects, the device and the node list.

    :param base_path: root directory; all folder arguments are relative to it.
    :param origin_folder: folder whose entries are listed (sorted) into
        ``self.timestamp_list``.
    :param embedding_folder: folder created via ``check_and_make_path``.
    :param node_list: full node list; its length is stored as ``node_num``.
    :param model: stored as ``self.model``.
    :param loss: stored as ``self.loss``.
    :param model_folder: folder created via ``check_and_make_path``.
    :param file_sep: separator stored as ``self.file_sep``.
    :param has_cuda: when true the device is the first CUDA device, otherwise
        the CPU.
    """
    # file paths
    self.base_path = base_path
    self.origin_base_path = os.path.abspath(os.path.join(base_path, origin_folder))
    self.embedding_base_path = os.path.abspath(os.path.join(base_path, embedding_folder))
    self.model_base_path = os.path.abspath(os.path.join(base_path, model_folder))

    self.has_cuda = has_cuda
    # The device string must be 'cuda:0' with no space after the colon;
    # 'cuda: 0' is rejected as an invalid device string by current PyTorch.
    self.device = torch.device('cuda:0') if has_cuda else torch.device('cpu')

    self.model = model
    self.loss = loss
    self.file_sep = file_sep
    self.full_node_list = node_list
    self.node_num = len(self.full_node_list)  # node num
    self.timestamp_list = sorted(os.listdir(self.origin_base_path))

    check_and_make_path(self.embedding_base_path)
    check_and_make_path(self.model_base_path)
def __init__(self, base_path, input_folder, output_folder, node_file, file_sep='\t', train_ratio=0.5, val_ratio=0.2, test_ratio=0.3):
    """Store folder locations, the node index mapping and the split ratios.

    ``input_folder``, ``output_folder`` and ``node_file`` are joined onto
    ``base_path`` (not made absolute). The node file is read as a single
    column named ``node`` (every row is treated as data), and
    ``node2idx_dict`` maps each node to its position in that list. The three
    ratios must sum to at most 1.0; this is enforced with an ``assert``.
    Both folders are created through ``check_and_make_path``.
    """
    self.base_path = base_path
    self.input_base_path = os.path.join(base_path, input_folder)
    self.output_base_path = os.path.join(base_path, output_folder)
    self.file_sep = file_sep

    node_path = os.path.join(base_path, node_file)
    node_frame = pd.read_csv(node_path, names=['node'])
    self.full_node_list = node_frame['node'].tolist()
    self.node_num = len(self.full_node_list)
    positions = np.arange(self.node_num)
    self.node2idx_dict = {node: idx for node, idx in zip(self.full_node_list, positions)}

    assert train_ratio + test_ratio + val_ratio <= 1.0
    self.train_ratio = train_ratio
    self.test_ratio = test_ratio
    self.val_ratio = val_ratio

    for folder in (self.input_base_path, self.output_base_path):
        check_and_make_path(folder)
def aggregate_results(base_path, edgecls_res_folder, start_idx, rep_num, method_list):
    """Merge repeated edge classification results into one file per method.

    For every method, ``<method>_acc_record.csv`` is read from each folder
    ``<edgecls_res_folder>_<i>`` with ``i`` in
    ``[start_idx, start_idx + rep_num)``. The header row of each file is
    replaced by ``date`` and ``acc_<i>``. The accuracy columns are placed side
    by side, ``avg``/``max``/``min`` columns are computed across them, and the
    result is written to ``<edgecls_res_folder>/<method>_acc_record.csv``.

    Does nothing when ``rep_num`` is not positive.
    """
    if rep_num <= 0:
        return

    rep_indices = range(start_idx, start_idx + rep_num)
    for method in method_list:
        record_name = method + '_acc_record.csv'
        merged = None
        for rep_idx in rep_indices:
            rep_dir = os.path.join(base_path, edgecls_res_folder + '_' + str(rep_idx))
            rep_df = pd.read_csv(os.path.join(rep_dir, record_name), sep=',', header=0,
                                 names=['date', 'acc_' + str(rep_idx)])
            if merged is None:
                # The first repetition keeps its 'date' column.
                merged = rep_df
            else:
                # Later repetitions contribute only their accuracy column.
                merged = pd.concat([merged, rep_df.iloc[:, [1]]], axis=1)

        out_dir = os.path.join(base_path, edgecls_res_folder)
        check_and_make_path(out_dir)

        acc_columns = ['acc_' + str(rep_idx) for rep_idx in rep_indices]
        merged['avg'] = merged.loc[:, acc_columns].mean(axis=1)
        merged['max'] = merged.loc[:, acc_columns].max(axis=1)
        merged['min'] = merged.loc[:, acc_columns].min(axis=1)
        merged.to_csv(os.path.join(out_dir, record_name), sep=',', index=False)
def __init__(self, base_path, origin_folder, embedding_folder, centrality_folder, output_folder, node_file, file_sep='\t', alpha_list=None, split_fold=5):
    """Store folder locations, the node list and the evaluation settings.

    All folder arguments and ``node_file`` are interpreted relative to
    ``base_path`` and resolved to absolute paths. The node file is read as a
    single column named ``node`` (every row is treated as data). The
    embedding, origin and output folders are passed to
    ``check_and_make_path``; the centrality folder is not.
    """
    def absolute(relative_name):
        # Resolve a name located under base_path to an absolute path.
        return os.path.abspath(os.path.join(base_path, relative_name))

    self.base_path = base_path
    self.origin_base_path = absolute(origin_folder)
    self.embedding_base_path = absolute(embedding_folder)
    self.centrality_base_path = absolute(centrality_folder)
    self.output_base_path = absolute(output_folder)
    self.file_sep = file_sep

    node_frame = pd.read_csv(absolute(node_file), names=['node'])
    self.full_node_list = node_frame['node'].tolist()

    self.alpha_list = alpha_list
    self.split_fold = split_fold

    for folder in (self.embedding_base_path, self.origin_base_path, self.output_base_path):
        check_and_make_path(folder)
def __init__(self, base_path, origin_folder, embedding_folder, similarity_folder, output_folder, node_file, file_sep='\t'):
    """Store folder locations and load the full node list.

    All folder arguments and ``node_file`` are interpreted relative to
    ``base_path`` and resolved to absolute paths. The node file is read as a
    single column named ``node`` (every row is treated as data). The
    embedding, origin and output folders are passed to
    ``check_and_make_path``; the similarity folder is not.
    """
    def absolute(relative_name):
        # Resolve a name located under base_path to an absolute path.
        return os.path.abspath(os.path.join(base_path, relative_name))

    self.base_path = base_path
    self.origin_base_path = absolute(origin_folder)
    self.embedding_base_path = absolute(embedding_folder)
    self.similarity_base_path = absolute(similarity_folder)
    self.output_base_path = absolute(output_folder)
    self.file_sep = file_sep

    node_frame = pd.read_csv(absolute(node_file), names=['node'])
    self.full_node_list = node_frame['node'].tolist()

    for folder in (self.embedding_base_path, self.origin_base_path, self.output_base_path):
        check_and_make_path(folder)
def __init__(self, base_path, origin_folder, embedding_folder, lp_edge_folder, output_folder, node_file, file_sep='\t', C_list=None, measure_list=None, max_iter=5000):
    """Store folder locations, the node list and the classifier settings.

    All folder arguments and ``node_file`` are joined onto ``base_path`` (not
    made absolute). The node file is read as a single column named ``node``
    (every row is treated as data). The embedding, origin and output folders
    are passed to ``check_and_make_path``; the link prediction edge folder is
    not.
    """
    def under_base(relative_name):
        # Join a name onto base_path without normalising the result.
        return os.path.join(base_path, relative_name)

    self.base_path = base_path
    self.origin_base_path = under_base(origin_folder)
    self.embedding_base_path = under_base(embedding_folder)
    self.lp_edge_base_path = under_base(lp_edge_folder)
    self.output_base_path = under_base(output_folder)
    self.file_sep = file_sep
    self.measure_list = measure_list

    node_frame = pd.read_csv(under_base(node_file), names=['node'])
    self.full_node_list = node_frame['node'].tolist()

    self.C_list = C_list
    self.max_iter = max_iter

    for folder in (self.embedding_base_path, self.origin_base_path, self.output_base_path):
        check_and_make_path(folder)
def __init__(self, base_path, origin_folder, embedding_folder, edgeclas_folder, output_folder, node_file, label_folder, file_sep='\t', C_list=None, max_iter=5000):
    """Store folder locations, the node list, the labels and classifier settings.

    All folder arguments and ``node_file`` are interpreted relative to
    ``base_path`` and resolved to absolute paths. The node file is read as a
    single column named ``node`` (every row is treated as data).

    The label folder must contain at least one entry (enforced with an
    ``assert``). The first entry returned by ``os.listdir`` is read with
    ``file_sep`` and the distinct values of its ``label`` column are stored
    as ``unique_labels``.

    The embedding, origin and output folders are passed to
    ``check_and_make_path``; the edge classification folder is not.
    """
    def absolute(relative_name):
        # Resolve a name located under base_path to an absolute path.
        return os.path.abspath(os.path.join(base_path, relative_name))

    self.base_path = base_path
    self.origin_base_path = absolute(origin_folder)
    self.embedding_base_path = absolute(embedding_folder)
    self.edgeclas_base_path = absolute(edgeclas_folder)
    self.output_base_path = absolute(output_folder)
    self.file_sep = file_sep

    node_frame = pd.read_csv(absolute(node_file), names=['node'])
    self.full_node_list = node_frame['node'].tolist()

    self.label_base_path = absolute(label_folder)
    label_files = os.listdir(self.label_base_path)
    assert len(label_files) > 0
    first_label_path = os.path.join(self.label_base_path, label_files[0])
    label_frame = pd.read_csv(first_label_path, sep=file_sep)
    self.unique_labels = label_frame['label'].unique()

    self.C_list = C_list
    self.max_iter = max_iter

    for folder in (self.embedding_base_path, self.origin_base_path, self.output_base_path):
        check_and_make_path(folder)
def get_graph_from_nodes(file_path, node_file, output_node_dir, output_edge_dir, sep='\t'):
    """Write breadth-first node-sampled subgraphs of a graph to disk.

    The graph is built by ``get_nx_graph`` from the edge file and the node
    list read from ``node_file`` (a single column named ``node``; every row
    is treated as data). For each target size in a fixed list (50 ... 10000
    nodes) a random start node is drawn from the largest connected component
    and nodes are collected breadth-first until the target is reached or no
    unvisited reachable node is left. For sample ``i`` the collected nodes go
    to ``<output_node_dir>/<i>.csv`` and the edges of the induced subgraph
    (``from_id``, ``to_id``, ``weight``) go to ``<output_edge_dir>/<i>.csv``.
    Finally the full node list and the full edge table are written under the
    index that follows the last sample.

    :param file_path: path of the edge file (first row is the header).
    :param node_file: path of the node file.
    :param output_node_dir: directory receiving the node files.
    :param output_edge_dir: directory receiving the edge files.
    :param sep: column separator of the input edge file.
    """
    import random

    df_edges = pd.read_csv(file_path, sep=sep, header=0)
    nodes_set = pd.read_csv(node_file, names=['node'])
    full_node_list = nodes_set['node'].tolist()
    print('node number: ', len(full_node_list))
    check_and_make_path(output_node_dir)
    check_and_make_path(output_edge_dir)

    nx_graph = get_nx_graph(file_path, full_node_list, sep=sep)
    node_num_list = [50, 100, 500, 1000, 5000, 10000]
    max_cc = max(nx.connected_components(nx_graph), key=len)
    node_list = list(max_cc)
    print(node_list[:10])
    print(len(node_list))

    adj = nx_graph.adj
    for i, node_num in enumerate(node_num_list):
        start_node = random.sample(node_list, 1)[0]
        sample_list = _bfs_sample_nodes(adj, start_node, node_num)
        print('i = ', i, 'cnt = ', len(sample_list))

        nx_subgraph = nx_graph.subgraph(sample_list)
        df_nodes = pd.DataFrame(sample_list, columns=['node'])
        df_nodes.to_csv(os.path.join(output_node_dir, str(i) + '.csv'), sep='\t', index=False, header=False)

        edge_list = []
        for node, neighbors in nx_subgraph.adj.items():
            for neighbor, edge_attr in neighbors.items():
                edge_list.append([node, neighbor, edge_attr['weight']])
        # reshape keeps three columns even when no edge was collected, so the
        # DataFrame construction below does not fail on an empty sample.
        edges_arr = np.array(edge_list).reshape(-1, 3)
        print('edges arr shape: ', edges_arr.shape[0])
        df_output = pd.DataFrame(edges_arr, columns=['from_id', 'to_id', 'weight'])
        df_output.to_csv(os.path.join(output_edge_dir, str(i) + '.csv'), sep='\t', index=False)

    # The complete graph is stored under the index after the last sample.
    full_name = str(len(node_num_list)) + '.csv'
    df_nodes = pd.DataFrame(np.array(full_node_list), columns=['node'])
    df_nodes.to_csv(os.path.join(output_node_dir, full_name), sep='\t', index=False, header=False)
    df_edges.to_csv(os.path.join(output_edge_dir, full_name), sep='\t', index=False)


def _bfs_sample_nodes(adj, start_node, node_num):
    """Return up to ``node_num`` nodes reachable from ``start_node`` in BFS order.

    ``adj`` maps each node to an iterable of its neighbours. The walk stops
    as soon as ``node_num`` nodes are collected, or earlier when every
    reachable node has been expanded. The earlier stop is what prevents
    reading past the end of the queue when fewer than ``node_num`` nodes are
    reachable.
    """
    visited = {start_node}
    sample_list = [start_node]
    front = 0
    while front < len(sample_list) and len(sample_list) < node_num:
        cur = sample_list[front]
        front += 1
        for neighbor in adj[cur]:
            if neighbor in visited:
                continue
            visited.add(neighbor)
            sample_list.append(neighbor)
            if len(sample_list) >= node_num:
                break
    return sample_list