def node2vec_embedding(graph):
    """Compute Node2Vec embeddings for every node of *graph*.

    Runs biased (p/q) random walks over the graph, trains a skip-gram
    Word2Vec model on the walks, and returns the embeddings as a DataFrame.

    Parameters
    ----------
    graph : networkx graph or anything StellarGraph() accepts.

    Returns
    -------
    pandas.DataFrame of shape (num_nodes, 128), indexed by the node id
    cast to ``str``.
    """
    # Node2Vec hyper-parameters. p == q == 1.0 degenerates to uniform
    # (DeepWalk-style) walks.
    p = 1.0
    q = 1.0
    dimensions = 128
    num_walks = 10
    walk_length = 80
    window_size = 10
    num_iter = 1
    workers = multiprocessing.cpu_count()

    graph = StellarGraph(graph)
    rw = BiasedRandomWalk(graph)
    walks = rw.run(graph.nodes(), n=num_walks, length=walk_length, p=p, q=q)
    print(f"Number of random walks: {len(walks)}")

    # NOTE: ``size=`` / ``iter=`` are the gensim<4 keyword names.
    model = Word2Vec(
        walks,
        size=dimensions,
        window=window_size,
        min_count=0,
        sg=1,  # skip-gram
        workers=workers,
        iter=num_iter,
    )

    # BUG FIX: gensim orders ``wv.vectors`` by vocabulary frequency, NOT by
    # graph.nodes() order, so pairing the raw matrix with the node list can
    # assign embeddings to the wrong nodes. Look each vector up explicitly.
    node_ids = list(graph.nodes())
    features = pd.DataFrame(
        data=[model.wv[node] for node in node_ids], index=node_ids
    )
    features.index = features.index.map(str)
    return features
def execute_model(dict_node_df, df_edges):
    """Train (or load cached) metapath2vec node embeddings.

    Random walks and the final embedding matrix are cached on disk; both
    caches are keyed by the walk/embedding hyper-parameters so different
    configurations do not collide.

    Parameters
    ----------
    dict_node_df : dict of node-type -> DataFrame, passed to StellarGraph.
    df_edges : edge DataFrame, passed to StellarGraph.

    Returns
    -------
    numpy.ndarray of shape (num_nodes, emb_dim); row ``i`` is the embedding
    of node id ``i`` (assumes node ids are the contiguous integers 0..N-1).

    Notes
    -----
    Reads module-level globals: dataset, model_save_path, metapaths,
    emb_dim, num_walks_per_node, walk_length, model_use_data_DIR.
    """
    global dataset
    global model_save_path
    global metapaths
    global emb_dim
    global num_walks_per_node
    global walk_length

    print('Metapaths ', metapaths)
    emb_fpath = os.path.join(
        model_save_path,
        'n2v_{}_{}_{}.npy'.format(emb_dim, num_walks_per_node, walk_length)
    )
    graph_obj = StellarGraph(dict_node_df, df_edges)

    walks_save_file = "n2v_random_walks_{}_{}.npy".format(
        walk_length, num_walks_per_node)
    walks_save_file = os.path.join(model_use_data_DIR, walks_save_file)
    try:
        walks_np_arr = np.load(walks_save_file)
        walks = [list(_) for _ in walks_np_arr]
    except (OSError, ValueError):
        # BUG FIX: was a bare ``except:`` which also swallowed
        # KeyboardInterrupt/SystemExit and hid real bugs. Only a missing or
        # unreadable cache file should trigger regeneration.
        walks = generate_random_walks(
            graph_obj, num_walks_per_node, walk_length, metapaths)
        walks_np_arr = np.array(walks)
        np.save(walks_save_file, walks_np_arr)

    print("Number of random walks: {}".format(len(walks)))
    # Word2Vec expects string tokens.
    str_walks = [[str(n) for n in walk] for walk in walks]

    if not os.path.exists(emb_fpath):
        # gensim<4 keyword names (size=, iter=).
        word2vec_params = {
            'sg': 0,
            "size": emb_dim,
            "alpha": 0.5,
            "min_alpha": 0.001,
            'window': 5,
            'min_count': 0,
            "workers": multiprocessing.cpu_count(),
            "negative": 1,
            "hs": 0,  # 0: negative sampling, 1: hierarchical softmax
            'compute_loss': True,
            'iter': 10,
            'cbow_mean': 1,
        }
        iters = 20
        mp2v_model = Word2Vec(**word2vec_params)
        mp2v_model.build_vocab(str_walks)

        losses = []
        # Manual learning-rate schedule: one epoch per call with a fixed
        # alpha, decayed linearly from 0.5 down to 0.001 over ``iters`` steps.
        learning_rate = 0.5
        step_size = (0.5 - 0.001) / iters
        for i in tqdm(range(iters)):
            trained_word_count, raw_word_count = mp2v_model.train(
                str_walks,
                compute_loss=True,
                start_alpha=learning_rate,
                end_alpha=learning_rate,
                total_examples=mp2v_model.corpus_count,
                epochs=1
            )
            loss = mp2v_model.get_latest_training_loss()
            losses.append(loss)
            print('>> ', i, ' Loss:: ', loss, learning_rate)
            learning_rate -= step_size

        # ======== Save node weights ============ #
        # Assumes node ids are contiguous ints 0..N-1 (vocab keys are their
        # string forms) — TODO confirm against graph construction.
        node_embeddings = []
        for i in range(len(graph_obj.nodes())):
            vec = mp2v_model.wv[str(i)]
            node_embeddings.append(vec)
        node_embeddings = np.array(node_embeddings)
        np.save(emb_fpath, node_embeddings)
    else:
        node_embeddings = np.load(emb_fpath)

    return node_embeddings
def ctdne(dataset="ia-contact", output="/nfs/zty/Graph/Dynamic-Graph/ctdne_output"):
    """Train CTDNE (temporal-walk) node embeddings for *dataset*.

    Loads the positive edges of ``<train_dir>/<dataset>.csv``, runs
    time-respecting random walks, trains a skip-gram Word2Vec model, and
    writes the embeddings to ``<output>/<dataset>.emb`` (CSV, no header).

    Parameters
    ----------
    dataset : str, base name of the training csv.
    output : str, directory for the ``.emb`` file.
    """
    # if os.path.exists("{}/{}.emb".format(output, dataset)):
    #     return
    print("Begin CTDNE {} embeddings.".format(dataset))
    train_dir = "/nfs/zty/Graph/train_data"
    df = pd.read_csv("{}/{}.csv".format(train_dir, dataset))
    # Keep only positive edges.
    edges_df = df[df["label"] == 1].copy()
    print("original edges: {} filtering edges: {}".format(
        len(df), len(edges_df)))
    edges_df = edges_df[["from_idx", "to_idx", "timestamp"]].copy()
    edges_df.columns = ["source", "target", "time"]
    edges_df["source"] = edges_df["source"].astype(str)
    edges_df["target"] = edges_df["target"].astype(str)
    # nodes = list(set(edges_df["source"]).union(set(edges_df["target"])))

    # TemporalRandomWalk reads timestamps from the edge-weight column.
    graph = StellarGraph(
        edges=edges_df,
        edge_weight_column="time",
    )

    num_walks_per_node = 10
    walk_length = 80
    context_window_size = 10
    # Total number of context windows the walker must produce.
    num_cw = len(graph.nodes()) * num_walks_per_node *\
        (walk_length - context_window_size + 1)
    print("Begin CTDNE TemporalRandomWalk.")
    temporal_rw = TemporalRandomWalk(graph)
    temporal_walks = temporal_rw.run(
        num_cw=num_cw,
        cw_size=context_window_size,
        max_walk_length=walk_length,
        walk_bias="exponential",
    )
    print("End CTDNE TemporalRandomWalk.")

    embedding_size = 128
    # gensim<4 keyword names (size=, iter=).
    temporal_model = Word2Vec(temporal_walks, size=embedding_size,
                              window=context_window_size, min_count=0,
                              sg=1, workers=16, iter=1)
    print("Done CTDNE {} embeddings.".format(dataset))

    # if not os.path.exists("{}/{}.emb".format(output, dataset)):
    wv = temporal_model.wv
    vecs = np.array([wv[u] for u in wv.vocab])
    df = pd.DataFrame(vecs, index=wv.vocab)

    # Diagnostics: nodes that never appeared in any walk get no embedding.
    walks_nodes = set([s for l in temporal_walks for s in l])
    embed_nodes = set(wv.vocab.keys())
    nodes = set(edges_df["source"]).union(set(edges_df["target"]))
    print("{} nodes not exist in walk_nodes.".format(len(nodes - walks_nodes)))
    print("{} nodes not exist in embed_nodes.".format(len(nodes - embed_nodes)))

    emb_path = "{}/{}.emb".format(output, dataset)
    if not os.path.exists(emb_path):
        # IDIOM FIX: was ``f = open(...); f.close()`` — use a context manager
        # to touch the file, then relax its permissions for other users.
        with open(emb_path, "w"):
            pass
        os.chmod(emb_path, 0o777)
    df.to_csv("{}/{}.emb".format(output, dataset), header=None)
def get_node_feats(adj):  # input is cur_adj
    """Build sparse node features from a snapshot adjacency via Node2Vec.

    Trains a small weighted Node2Vec model on the edges of ``adj`` and packs
    the resulting embeddings into a sparse tensor, returned in the project's
    ``{'idx': indices, 'vals': values}`` format.

    NOTE(review): this function reads ``self.feats_per_node`` and
    ``self.data.num_nodes`` but has no ``self`` parameter — ``self`` is
    presumably captured from an enclosing scope; confirm at the call site.

    Parameters
    ----------
    adj : dict with key 'idx' holding an (E, 2) edge-index tensor.

    Returns
    -------
    dict with 'idx' (LongTensor of coordinates, shape (nnz, 2)) and
    'vals' (FloatTensor of embedding values).
    """
    edgelist = adj['idx'].cpu().data.numpy()
    source = edgelist[:, 0]
    target = edgelist[:, 1]
    weight = np.ones(len(source))  # unweighted graph: all edges weigh 1
    G = pd.DataFrame({
        'source': source,
        'target': target,
        'weight': weight
    })
    G = StellarGraph(edges=G)

    rw = BiasedRandomWalk(G)
    weighted_walks = rw.run(
        nodes=list(G.nodes()),  # root nodes
        length=2,               # maximum length of a random walk
        n=5,                    # number of random walks per root node
        p=1,                    # (unnormalised) prob 1/p of returning to source
        q=0.5,                  # (unnormalised) prob 1/q of moving away
        weighted=True,          # for weighted random walks
        seed=42,                # fixed for reproducibility
    )
    str_walks = [[str(n) for n in walk] for walk in weighted_walks]
    # gensim<4 keyword names (size=, iter=).
    weighted_model = Word2Vec(str_walks, size=self.feats_per_node, window=5,
                              min_count=0, sg=1, workers=1, iter=1)

    # IDIOM FIX: was an index loop mutating node_ids in place; a single
    # comprehension converts the vocab tokens back to integer node ids.
    node_ids = [int(n) for n in weighted_model.wv.index2word]
    # index2word order matches the rows of wv.vectors, so zip aligns them.
    weighted_node_embeddings = weighted_model.wv.vectors
    dic = dict(zip(node_ids, weighted_node_embeddings.tolist()))
    # (The original also sorted ``dic`` by key; per-row assignment below is
    # order-independent, so the sort was redundant and is dropped.)

    # Scatter each embedding into its node's row; nodes with no walk stay 0.
    adj_mat = sp.lil_matrix((self.data.num_nodes, self.feats_per_node))
    for row_idx in node_ids:
        adj_mat[row_idx, :] = dic[row_idx]

    adj_mat = adj_mat.tocsr().tocoo()
    coords = np.vstack((adj_mat.row, adj_mat.col)).transpose()
    values = adj_mat.data

    row = list(coords[:, 0])
    col = list(coords[:, 1])
    indexx = torch.LongTensor([row, col])
    tensor_size = torch.Size([self.data.num_nodes, self.feats_per_node])

    degs_out = torch.sparse.FloatTensor(indexx,
                                        torch.FloatTensor(values),
                                        tensor_size)
    hot_1 = {
        'idx': degs_out._indices().t(),
        'vals': degs_out._values()
    }
    return hot_1