import json
import time
from argparse import ArgumentParser
from pathlib import Path

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
from networkx import read_gpickle  # removed in networkx >= 3.0; use pickle there
from networkx.linalg.laplacianmatrix import laplacian_matrix
from networkx.readwrite import json_graph
from scipy.io import mmread, mmwrite
from scipy.sparse import identity
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import normalize

# NOTE: helpers such as mtx2graph, mtx2matrix, deepwalk, node2vec,
# linkprediction, run_regression and lr are assumed to come from this
# repository's own modules and are not shown here.


def load_dataset(dataset, prefix=''):
    """Load the graph Laplacian and node features for `dataset`.

    The Laplacian is cached as a MatrixMarket file on first use and
    reloaded from disk afterwards. Note that the cache path is relative
    to the working directory, while the input files honor `prefix`
    (preserving the original behavior).
    """
    mtx_path = Path("dataset/{}/{}.mtx".format(dataset, dataset))
    prefix = Path(prefix, 'dataset', dataset)
    feats_path = Path(prefix, f'{dataset}-feats.npy')
    feats = np.load(str(feats_path))

    if mtx_path.exists():
        print('load previous mtx file')
        laplacian = mmread(str(mtx_path))
    else:
        if dataset in ['citeseer', 'cora', 'pubmed']:
            dataset_path = str(Path(prefix, f'{dataset}-G.json'))
            with open(dataset_path) as f:
                G_data = json.load(f)
            G = json_graph.node_link_graph(G_data)
        elif dataset in ['Amazon2M', 'reddit', 'ppi']:
            start = time.time()
            G = read_gpickle(str(Path(prefix, f'{dataset}.gpickle')))
            print(f'gpickle load finish: {time.time() - start}')
        else:
            raise ValueError('dataset not known')

        print('calculating laplacian')
        start = time.time()
        laplacian = laplacian_matrix(G)
        print(f'calculating finished: {time.time() - start}')
        # mmwrite opens the target file itself; no separate handle is needed
        mmwrite(str(mtx_path), laplacian)

    return laplacian, feats
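# Usage sketch for load_dataset (illustrative only; assumes the
# GraphSAGE-style files dataset/cora/cora-feats.npy and
# dataset/cora/cora-G.json exist on disk):
#
#   laplacian, feats = load_dataset('cora')
#   print(laplacian.shape, feats.shape)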
# NOTE: draw() is a method of a plotting class (not shown) that stores the
# graph in self.G, a spring layout in self.pos, and provides a
# standardization() helper for the eigenvector matrix.
def draw(self):
    """Plot the graph, its spectral embedding, and the DBSCAN clusters
    found in that embedding, as three side-by-side panels."""
    colors = ['c', 'm', 'g', 'y', 'r', 'k']

    # panel 1: the graph itself, with edge widths scaled by the 'w' weight
    plt.subplot(131)
    nx.draw_networkx_nodes(self.G, self.pos, node_size=500)
    edges = [(u, v) for (u, v, d) in self.G.edges(data=True)]
    edges_width = [10 * d['w'] for (u, v, d) in self.G.edges(data=True)]
    nx.draw_networkx_edges(self.G, self.pos, edgelist=edges, width=edges_width)
    nx.draw_networkx_labels(self.G, self.pos, font_size=20, font_family='sans-serif')
    plt.axis('off')

    # panel 2: nodes placed at their coordinates in the space spanned by the
    # k eigenvectors of the smallest Laplacian eigenvalues
    plt.subplot(132)
    lap_matrix = np.asarray(laplacian_matrix(self.G, weight='w').todense(), dtype=float)
    node_list = list(self.G.nodes())
    k = 3
    # eigh is the right solver here: the Laplacian is symmetric, so the
    # eigenpairs are real and numerically stable
    eigenvalues, eigenvectors = np.linalg.eigh(lap_matrix)
    sorted_indices = np.argsort(eigenvalues)
    topk_evecs = eigenvectors[:, sorted_indices[:k]]
    topk_evecs = np.asarray(self.standardization(topk_evecs))
    x = topk_evecs[:, 0]
    y = topk_evecs[:, 1]
    plt.scatter(x, y, c='r', s=50)
    for i, txt in enumerate(node_list):
        plt.annotate(txt, (x[i], y[i]))

    # panel 3: DBSCAN clusters in the spectral embedding, drawn back onto the
    # original graph (colors recycle if there are more than len(colors) clusters)
    plt.subplot(133)
    clustering = DBSCAN(eps=0.7, min_samples=2)
    c_res = clustering.fit_predict(topk_evecs)
    dict_res = {}
    for c, n in zip(c_res, node_list):
        dict_res.setdefault(c, []).append(n)
    for idx, i in enumerate(dict_res):
        nx.draw_networkx_nodes(self.G, self.pos, nodelist=dict_res[i],
                               node_size=500, node_color=colors[idx % len(colors)])
    nx.draw_networkx_edges(self.G, self.pos, edgelist=edges, width=edges_width)
    nx.draw_networkx_labels(self.G, self.pos, font_size=20, font_family='sans-serif')
    plt.axis('off')
    plt.show()
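# The eigenvector step inside draw() is a standard spectral embedding: the k
# eigenvectors with the smallest Laplacian eigenvalues give every node a
# k-dimensional coordinate. A minimal standalone sketch of just that step
# (the helper name is ours, not part of the original code):
def spectral_coordinates(G, k=3, weight=None):
    L = np.asarray(laplacian_matrix(G, weight=weight).todense(), dtype=float)
    # eigh returns real eigenpairs in ascending eigenvalue order, which is
    # exactly what we want for a symmetric Laplacian
    _, eigenvectors = np.linalg.eigh(L)
    return eigenvectors[:, :k]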
def json2mtx(dataset):
    """Convert a json node-link graph to a MatrixMarket Laplacian file."""
    with open("dataset/{}/{}-G.json".format(dataset, dataset)) as f:
        G_data = json.load(f)
    G = json_graph.node_link_graph(G_data)
    laplacian = laplacian_matrix(G, nodelist=range(len(G.nodes)))
    # mmwrite opens the target file itself; no separate handle is needed
    mmwrite("dataset/{}/{}.mtx".format(dataset, dataset), laplacian)
    # the edge list is returned alongside the Laplacian because the
    # evaluation step in main() unpacks both
    return laplacian, G.edges()
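# Usage sketch (illustrative; assumes dataset/cora/cora-G.json exists):
#
#   laplacian, edges = json2mtx('cora')
#   # the Laplacian is also cached at dataset/cora/cora.mtx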
def laplacian_cluster(g, partitions=8):
    """Recursive spectral bisection: split `g` by the sign of the Fiedler
    vector (eigenvector of the second-smallest Laplacian eigenvalue) until
    the requested number of partitions is reached."""
    nodes = list(g.nodes())
    if partitions < 2 or len(nodes) <= 1:
        return [nodes]
    L = np.asarray(laplacian_matrix(g).todense(), dtype=float)
    # the Laplacian is symmetric, so eigh returns real eigenpairs already
    # sorted in ascending order of eigenvalue
    eig_vals, eig_vecs = np.linalg.eigh(L)
    fiedler = eig_vecs[:, 1]  # second-smallest eigenvalue
    n1, n2 = [], []
    for i, v in enumerate(fiedler):
        if v < 0:
            n1.append(nodes[i])
        else:
            n2.append(nodes[i])
    g1 = g.subgraph(n1)
    g2 = g.subgraph(n2)
    out = []
    out.extend(laplacian_cluster(g1, partitions=partitions // 2))
    out.extend(laplacian_cluster(g2, partitions=partitions // 2))
    return out
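# A small, hypothetical sanity check for laplacian_cluster (not part of the
# original pipeline): spectrally bisect Zachary's karate club into 4 parts.
def _demo_laplacian_cluster():
    demo_g = nx.karate_club_graph()
    parts = laplacian_cluster(demo_g, partitions=4)
    print('partition sizes:', [len(p) for p in parts])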
def main():
    parser = ArgumentParser(description="ne")
    parser.add_argument("-d", "--dataset", type=str, default="cora",
                        help="input dataset")
    parser.add_argument("-o", "--coarse", type=str, default="simple",
                        help="choose either simple_coarse or lamg_coarse, [simple, lamg]")
    parser.add_argument("-c", "--mcr_dir", type=str, default="/opt/matlab/R2018A/",
                        help="directory of matlab compiler runtime (only required by lamg_coarsen)")
    parser.add_argument("-s", "--search_ratio", type=int, default=12,
                        help="control the search space in graph fusion process (only required by lamg_coarsen)")
    parser.add_argument("-r", "--reduce_ratio", type=int, default=2,
                        help="control graph coarsening levels (only required by lamg_coarsen)")
    parser.add_argument("-v", "--level", type=int, default=1,
                        help="number of coarsening levels (only required by simple_coarsen)")
    parser.add_argument("-n", "--num_neighs", type=int, default=2,
                        help="control k-nearest neighbors in graph fusion process")
    parser.add_argument("-l", "--lda", type=float, default=0.1,
                        help="control self loop in adjacency matrix")
    parser.add_argument("-e", "--embed_path", type=str,
                        default="embed_results/embeddings_palone_deepwalk.npy",
                        help="path of embedding result")
    parser.add_argument("-m", "--embed_method", type=str, default="deepwalk",
                        help="[deepwalk, node2vec, graphsage]")
    parser.add_argument("-f", "--fusion", default=True, action="store_false",
                        help="whether to use graph fusion (pass -f to disable)")
    parser.add_argument("-p", "--power", default=False, action="store_true",
                        help="strong power of graph filter, set True to enhance filter power")
    parser.add_argument("-g", "--sage_model", type=str, default="mean",
                        help="aggregation function in graphsage")
    parser.add_argument("-w", "--sage_weighted", default=True, action="store_false",
                        help="whether to consider the weighted reduced graph (pass -w to disable)")

    args = parser.parse_args()
    dataset = args.dataset
    feature_path = "dataset/{}/{}-feats.npy".format(dataset, dataset)
    fusion_input_path = "dataset/{}/{}.mtx".format(dataset, dataset)
    reduce_results = "reduction_results/"
    mapping_path = "{}Mapping.mtx".format(reduce_results)

    # NOTE: the fusion/coarsening stages themselves are not invoked in this
    # script; these paths are kept for compatibility with that pipeline
    if args.fusion:
        coarsen_input_path = "dataset/{}/fused_{}.mtx".format(dataset, dataset)
    else:
        coarsen_input_path = "dataset/{}/{}.mtx".format(dataset, dataset)

    ######Load Data######
    print("%%%%%% Loading Graph Data %%%%%%")
    if args.dataset == "ogb":
        d_name = "ogbl-ppa"
        from ogb.linkproppred import LinkPropPredDataset
        dataset = LinkPropPredDataset(name=d_name)
        print(dataset)
        print(dataset[0])
        split_edge = dataset.get_edge_split()
        print(split_edge)
        # train_edge, valid_edge, test_edge = split_edge["train"], split_edge["valid"], split_edge["test"]
        graph = dataset[0]  # graph: library-agnostic graph object
        print(graph['edge_index'].shape)
        print(graph['edge_feat'])
        print(graph['node_feat'])
        graph['directed'] = False
        print(graph)
        # rebuild the graph as a networkx object from the edge index
        graph_nodes = [i for i in range(0, graph['num_nodes'])]
        G = nx.Graph()
        G.add_nodes_from(graph_nodes)
        G.add_edges_from(graph['edge_index'].T)
        print(G.nodes)
        laplacian = laplacian_matrix(G)
        print(laplacian)
    else:
        # write the .mtx Laplacian from the json graph first, then load it
        # back as a networkx graph
        laplacian, edges = json2mtx(dataset)
        G = mtx2graph("dataset/{}/{}.mtx".format(dataset, dataset))

    ## node features are only required for graph fusion and graphsage
    if args.fusion or args.embed_method == "graphsage":
        if args.dataset == 'ogb':
            feature = graph['node_feat']
        else:
            feature = np.load(feature_path)

    ######Embed Reduced Graph######
    print("%%%%%% Starting Graph Embedding %%%%%%")
    if args.embed_method == "deepwalk":
        embed_start = time.process_time()
        embeddings = deepwalk(G)
    elif args.embed_method == "node2vec":
        embed_start = time.process_time()
        embeddings = node2vec(G)
    elif args.embed_method == "graphsage":
        from embed_methods.graphsage.graphsage import graphsage
        nx.set_node_attributes(G, False, "test")
        nx.set_node_attributes(G, False, "val")
        ## obtain the mapping operator; `projections` is expected to be
        ## produced by the coarsening stage, which is not shown in this script
        if args.coarse == "lamg":
            mapping = normalize(mtx2matrix(mapping_path), norm='l1', axis=1)
        else:
            mapping = identity(feature.shape[0])
            for p in projections:
                mapping = mapping @ p
            mapping = normalize(mapping, norm='l1', axis=1).transpose()
        ## control the number of training iterations via the coarsening ratio
        coarse_ratio = mapping.shape[1] / mapping.shape[0]
        ## map node feats to the coarse graph
        feats = mapping @ feature
        embed_start = time.process_time()
        embeddings = graphsage(G, feats, args.sage_model, args.sage_weighted,
                               int(1000 / coarse_ratio))
    else:
        raise ValueError(f"unknown embed method: {args.embed_method}")
    embed_time = time.process_time() - embed_start

    ######Save Embeddings######
    np.save(args.embed_path, embeddings)

    ######Evaluation######
    print("%%%%%% Starting Evaluation %%%%%%")
    # link prediction
    embeds = np.load(args.embed_path)
    '''
    if args.dataset == "ogb":
        acc, pre, sen, mcc, auc = linkprediction_ogb(split_edge, embeds)
    else:
        acc, pre, sen, mcc, auc = linkprediction(edges, embeds, dataset)
    '''
    print("Running regression..")
    # node prediction
    # run_regression(np.array(train_embeds), np.array(train_labels), np.array(test_embeds), np.array(test_labels))
    # lr("dataset/{}/".format(dataset), args.embed_path, dataset)

    ######Report timing information######
    print("%%%%%% CPU time %%%%%%")
    # fusion and coarsening are not timed in this script, so the total is
    # just the embedding time
    total_time = embed_time
    if args.fusion:
        print("Graph Fusion Time: not measured in this script")
    else:
        print("Graph Fusion Time: 0")
    print(f"Graph Embedding Time: {embed_time:.3f}")
    print(f"Total Time = Embedding_time = {total_time:.3f}")
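# Entry point plus a hypothetical example invocation (the script name below is
# an assumption; the flags map to the ArgumentParser options defined in main):
if __name__ == "__main__":
    main()

# Example:
#   python main.py -d cora -m deepwalk -e embed_results/cora_deepwalk.npy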