from node2vec.edges import HadamardEmbedder
from sklearn.neural_network import MLPClassifier


def classify_embeddings(model, training_set, test_set, training_labels):
    """
    Calculate scores using edge embeddings and a binary classifier

    :param model: Node2Vec model
    :param training_set: list, whole training set
    :param test_set: list, whole test set
    :param training_labels: list, labels for each pair in the training set
    :return: dict, scores, key=node pair, value=probability of being labelled 1
    """
    print("Embedding...")
    # Use the Hadamard product of the two node vectors as the edge embedding
    edge_embeddings = HadamardEmbedder(keyed_vectors=model.wv)
    x_train_embedded = [edge_embeddings[pair] for pair in training_set]
    x_test_embedded = [edge_embeddings[pair] for pair in test_set]

    # Use sklearn's MLPClassifier as the binary classifier
    classifier = MLPClassifier(random_state=1)
    print("Classifying...")
    classifier.fit(x_train_embedded, training_labels)
    predictions = classifier.predict_proba(x_test_embedded)

    score = dict()
    for i, pair in enumerate(test_set):
        node1, node2 = pair[0], pair[1]
        score[node1 + ' ' + node2] = predictions[i][1]
    return score
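# Usage sketch (illustrative, not from the original): classify_embeddings on a
# toy graph. The node pairs, labels, and hyperparameters below are hypothetical;
# node ids are passed as strings because gensim keys the vectors by str(node).
import networkx as nx
from node2vec import Node2Vec

G = nx.karate_club_graph()
model = Node2Vec(G, dimensions=16, walk_length=10, num_walks=20, workers=1).fit(window=5, min_count=1)

train_pairs = [('0', '1'), ('0', '33'), ('2', '3'), ('5', '16')]
train_labels = [1, 0, 1, 1]  # hypothetical link labels
test_pairs = [('1', '2'), ('5', '30')]
scores = classify_embeddings(model, train_pairs, test_pairs, train_labels)
print(scores)  # e.g. {'1 2': 0.87, '5 30': 0.12}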
from typing import Union

from networkx import MultiDiGraph, MultiGraph
from node2vec.edges import HadamardEmbedder


def get_edge_embeddings(G: Union[MultiDiGraph, MultiGraph], debug: bool = False) -> HadamardEmbedder:
    if G is None:
        raise TypeError('G must be a MultiDiGraph or MultiGraph')
    wv = HadamardEmbedder(keyed_vectors=get_node_embedings(G))
    if debug:
        # most_similar lives on the KeyedVectors view, which keys edges by
        # sorted string tuples; this debug loop assumes nodes are labelled 0..n-1
        edges_kv = wv.as_keyed_vectors()
        for idx_i in range(len(G.nodes())):
            for idx_j in range(idx_i):
                print('similar vector: ', (idx_i, idx_j))
                print('similar_edge', edges_kv.most_similar(str(tuple(sorted((str(idx_i), str(idx_j)))))))
                print('')
    return wv
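# get_node_embedings is not defined in this snippet. A plausible minimal
# version (a sketch, not the original helper): fit node2vec on G and return
# the gensim KeyedVectors that HadamardEmbedder expects.
from node2vec import Node2Vec

def get_node_embedings(G):
    model = Node2Vec(G, dimensions=64, walk_length=30, num_walks=50, workers=1)
    return model.fit(window=10, min_count=1).wv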
import networkx as nx
from node2vec import Node2Vec
from node2vec.edges import HadamardEmbedder


def graph_embedding(train_data):
    # Graph embedding with node2vec
    train_G = nx.from_pandas_edgelist(train_data, 'node1', 'node2')
    n2v = Node2Vec(train_G, dimensions=32, walk_length=150, num_walks=250, workers=4)
    model = n2v.fit(window=10, min_count=1, batch_words=4)
    edges_embs = HadamardEmbedder(keyed_vectors=model.wv)
    return edges_embs
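# Usage sketch (hypothetical data): graph_embedding expects an edge list with
# 'node1'/'node2' columns; gensim stores node ids as strings, so edges are
# looked up with string pairs.
import pandas as pd

train_df = pd.DataFrame({'node1': [0, 0, 1, 2], 'node2': [1, 2, 2, 3]})
edge_vectors = graph_embedding(train_df)
print(edge_vectors[('0', '1')].shape)  # (32,) -- one Hadamard vector per pair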
import numpy as np
from node2vec import Node2Vec
from node2vec.edges import HadamardEmbedder


def embed_graph(G, dimensions=3, walk_length=30, num_walks=200, workers=4,
                window=10, min_count=1, batch_words=4):
    # Assumes node labels are the integers 0..n-1, since they index the arrays directly
    node_embedding = np.zeros((len(G.nodes()), dimensions))
    edge_embedding = np.zeros((len(G.nodes()), len(G.nodes()), dimensions))
    # Pass the hyperparameters by keyword: positionally, Node2Vec's fifth
    # argument is the return parameter p, not workers.
    # Use temp_folder for big graphs.
    node2vec = Node2Vec(G, dimensions=dimensions, walk_length=walk_length,
                        num_walks=num_walks, workers=workers)
    model = node2vec.fit(window=window, min_count=min_count, batch_words=batch_words)
    edges_embs = HadamardEmbedder(keyed_vectors=model.wv)
    for node in G.nodes():
        node_embedding[node, :] = model.wv.get_vector(str(node))
        for node2 in G.nodes():
            edge_embedding[node, node2] = edges_embs[(str(node), str(node2))]
    return node_embedding, edge_embedding, model
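# Usage sketch (illustrative): embed_graph indexes its arrays with the node
# labels themselves, so relabel to consecutive integers first.
import networkx as nx

G = nx.convert_node_labels_to_integers(nx.karate_club_graph())
node_emb, edge_emb, model = embed_graph(G, dimensions=3)
print(node_emb.shape, edge_emb.shape)  # (34, 3) (34, 34, 3)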
def compute_similarity(self, first_node, second_node):
    edges_embs = HadamardEmbedder(keyed_vectors=self.model.wv)

    # Embeddings are computed on the fly, so plain tuples work for lookups:
    # edges_embs[(first_node, second_node)] returns something like
    # array([ 5.75068220e-03, -1.10937878e-02, 3.76693785e-01, ...], dtype=float32)

    # Materialise all edges in a separate KeyedVectors instance --
    # use with caution, this can be huge for big networks
    edges_kv = edges_embs.as_keyed_vectors()

    # Query the most similar edges; here the tuple must be sorted and passed as str
    results = edges_kv.most_similar(str(tuple(sorted((first_node, second_node)))))

    # Save the embeddings for later use:
    # edges_kv.save_word2vec_format(EDGES_EMBEDDING_FILENAME)
    return results
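# Sanity-check sketch (assumes a fitted Node2Vec model `model` whose graph
# contains nodes '0' and '1'): the Hadamard edge vector is just the
# element-wise product of the two node vectors.
import numpy as np
from node2vec.edges import HadamardEmbedder

edges_embs = HadamardEmbedder(keyed_vectors=model.wv)
assert np.allclose(edges_embs[('0', '1')], model.wv['0'] * model.wv['1'])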
from node2vec.edges import (AverageEmbedder, HadamardEmbedder,
                            WeightedL1Embedder, WeightedL2Embedder)


def edges(self):
    # The following step is generally not used: build and return the edge embedder
    if self.embedder_type == "Edge":
        # EdgeEmbedder is the abstract base class in node2vec.edges and cannot
        # be instantiated; AverageEmbedder stands in as the concrete default here
        edges_embs = AverageEmbedder(keyed_vectors=self.model.wv)
    elif self.embedder_type == "Wl1":
        edges_embs = WeightedL1Embedder(keyed_vectors=self.model.wv)
    elif self.embedder_type == "Wl2":
        edges_embs = WeightedL2Embedder(keyed_vectors=self.model.wv)
    else:
        edges_embs = HadamardEmbedder(keyed_vectors=self.model.wv)
    return edges_embs
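# For reference, the embedders above differ only in the element-wise operator
# applied to the two node vectors; a small sketch with hand-made vectors
# (not tied to any fitted model) makes the difference concrete.
import numpy as np

u = np.array([1.0, -2.0, 0.5])
v = np.array([0.5, 1.0, 2.0])
print(u * v)          # HadamardEmbedder: element-wise product
print(np.abs(u - v))  # WeightedL1Embedder: absolute difference
print((u - v) ** 2)   # WeightedL2Embedder: squared difference
print((u + v) / 2)    # AverageEmbedder: mean of the two vectors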
def _node2vec_encode(self, save=False):
    if self.all_connectivity is None:
        print("Need to run _find_all_connectivity() first.")
        return
    # from_numpy_matrix was removed in networkx 3.0; from_numpy_array replaces it
    graph = nx.from_numpy_array(self.all_connectivity, create_using=nx.DiGraph)
    node2vec = Node2Vec(graph, dimensions=64, walk_length=30, num_walks=200, workers=4)
    model = node2vec.fit(window=10, min_count=1, batch_words=4)

    # Save the node2vec results
    if save:
        model.wv.save_word2vec_format(NODE2VEC_EMBEDDING_FILENAME)
        model.save(NODE2VEC_EMBEDDING_MODEL_FILENAME)
        edges_embs = HadamardEmbedder(keyed_vectors=model.wv)
        edges_kv = edges_embs.as_keyed_vectors()
        edges_kv.save_word2vec_format(NODE2VEC_EDGES_EMBEDDING_FILENAME)

    url_embedding = model.wv.vectors[0:len(self.url_vocab_list)]
    nav_url_embedding = model.wv.vectors[len(self.url_vocab_list):]
    return url_embedding, nav_url_embedding
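# Caveat, with a safer sketch: slicing model.wv.vectors positionally assumes
# gensim's internal ordering matches the node order, which is not guaranteed
# (gensim orders keys by frequency). Looking each vector up by key avoids that
# assumption; url_vocab_list and model are assumed as in _node2vec_encode above.
import numpy as np

url_embedding = np.stack([model.wv[str(i)] for i in range(len(url_vocab_list))])
nav_url_embedding = np.stack([model.wv[str(i)] for i in range(len(url_vocab_list), len(model.wv))])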
G_validation = get_graph(lists=links_validation)
links_validation = list(G_validation.edges())

# Use the positive validation set to evaluate the score
G_valid_pos = get_graph(VAL_POS)
X_test = list(G_valid_pos.edges())

'''NODE2VEC'''
# On Windows, workers may need to be limited to 1
n2v = Node2Vec(G_train, dimensions=128, walk_length=16, num_walks=10, workers=6)
print("Fitting model...")
model = n2v.fit(window=5, min_count=1, sg=1, hs=0)

print("Embedding nodes...")
hadamard_embedded_links = HadamardEmbedder(keyed_vectors=model.wv)
X_validation = preprocess_lists(links_validation)
X_validation_embedded = [hadamard_embedded_links[x] for x in X_validation]
X_train = preprocess_lists(X_train)
X_train_embedded = [hadamard_embedded_links[x] for x in X_train]

'''CLASSIFICATION'''
logit = LogisticRegression()
clf = logit.fit(X_train_embedded, y_train)
probabilities = clf.predict_proba(X_validation_embedded)

'''EVALUATION'''
score = dict()
X_validation = preprocess_lists(X_validation, cast=int)
for i, prob in enumerate(probabilities):
    score[links_validation[i]] = prob[1]
top100 = get_top_n_links(score, out=1)
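# preprocess_lists is not defined in this excerpt. A plausible minimal version
# (a hypothetical reconstruction, not the original helper): cast each pair's
# endpoints, str for the embedding lookups and int to recover node ids.
def preprocess_lists(pairs, cast=str):
    return [(cast(u), cast(v)) for u, v in pairs]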
import csv

from node2vec import Node2Vec
from node2vec.edges import HadamardEmbedder

# Pre-compute the transition probabilities and generate the walks
node2vec = Node2Vec(G, dimensions=64, walk_length=30, num_walks=10, workers=1)

# Embed the nodes
model = node2vec.fit(window=10, min_count=1, batch_words=4)

# Embed the edges
edges_embs = HadamardEmbedder(keyed_vectors=model.wv)

# NOTE: all the approaches tried are gathered in this section; they were run
# separately, so they will not work all at once -- uncomment/comment each one
# to get its valid/invalid edge prediction as 1 or 0
with open('comp2_submission.csv', 'w') as csvfile:
    fieldnames = ['edge', 'label']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    with open('sample_submission.csv') as csvfile2:
        reader = csv.reader(csvfile2, delimiter=',')
        for row in reader:
import csv
import os
import pickle
import random

import lightgbm
import networkx as nx
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from tqdm import tqdm

from node2vec import Node2Vec
from node2vec.edges import HadamardEmbedder


def main():
    with open(r"training.txt", "r") as f:
        reader = csv.reader(f)
        training = list(reader)  # in the order of the training examples
    training = [element[0].split(" ") for element in training]
    training = pd.DataFrame(training, columns=['Node1', 'Node2', 'Link'])
    print("Training examples shape: {}".format(training.shape))

    with open(r"testing.txt", "r") as f:
        reader = csv.reader(f)
        testing = list(reader)  # in the order of the testing examples
    testing = [element[0].split(" ") for element in testing]
    testing = pd.DataFrame(testing, columns=['Node1', 'Node2'])
    print("Testing examples shape: {}".format(testing.shape))

    train_graph_split_path = 'pickles/train_graph_split.PICKLE'
    if os.path.exists(train_graph_split_path):
        with open(train_graph_split_path, 'rb') as f:
            keep_indices = pickle.load(f)
    else:
        keep_indices = random.sample(range(len(training)), k=int(len(training) * 0.05))
        with open(train_graph_split_path, 'wb') as f:
            pickle.dump(keep_indices, f)

    data_train_val = training.iloc[keep_indices]
    data_train = training.loc[~training.index.isin(keep_indices)]
    linked_nodes = data_train.loc[data_train['Link'] == '1']
    linked_nodes = linked_nodes[['Node1', 'Node2']]
    linked_nodes.to_csv('linked_nodes.txt', sep=' ', index=False, header=False)
    graph = nx.read_edgelist('linked_nodes.txt', create_using=nx.Graph(), nodetype=str)

    dicti = {}
    p_d, p_wl, p_nw, p_w, p_mc, p_bw = 64, 30, 200, 10, 1, 4
    node2vec = Node2Vec(graph, dimensions=p_d, walk_length=p_wl, num_walks=p_nw, workers=15)
    # Any keyword accepted by gensim.Word2Vec can be passed to fit()
    model = node2vec.fit(window=p_w, min_count=p_mc, batch_words=p_bw)

    # Embed edges using the Hadamard method
    edges_embs = HadamardEmbedder(keyed_vectors=model.wv)
    embed_size = len(edges_embs[('0', '1')])
    df_train = pd.DataFrame(0, index=np.arange(len(data_train_val)), columns=range(embed_size))
    for j, i in tqdm(enumerate(data_train_val.index), position=0, leave=True, total=len(data_train_val)):
        try:
            df_train.loc[j] = edges_embs[(data_train_val.loc[i]['Node1'], data_train_val.loc[i]['Node2'])]
        except KeyError:
            # Pairs with nodes missing from the training graph get a zero vector
            df_train.loc[j] = np.zeros(embed_size)

    X = df_train
    y = [int(label) for label in data_train_val['Link']]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=1)
    lgbm = lightgbm.LGBMClassifier()
    model_lgbm = lgbm.fit(X_train, y_train)
    predictions = model_lgbm.predict(X_test)
    key = 'params:{}:{}:{}:{}:{}:{}'.format(p_d, p_wl, p_nw, p_w, p_mc, p_bw)
    dicti[key] = f1_score(y_test, predictions)
    print(dicti[key])
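# Filling df_train row by row through .loc is slow for large splits. An
# equivalent sketch (same names as in main() above, same zero-vector fallback)
# that collects the vectors first and builds the frame in one shot:
rows = []
for i in data_train_val.index:
    pair = (data_train_val.loc[i, 'Node1'], data_train_val.loc[i, 'Node2'])
    try:
        rows.append(edges_embs[pair])
    except KeyError:
        rows.append(np.zeros(embed_size))
df_train = pd.DataFrame(np.vstack(rows))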
data['link'] = 0
# fb_df_partial = edge_list.drop(index=edge_list_ghost.index.values)
data = data.sample(frac=2 * (len(edge_list) / len(data)), random_state=SEED)
fb_df_partial = edge_list.copy()
G_data = nx.from_pandas_edgelist(fb_df_partial, "node_1", "node_2", create_using=nx.Graph())
fb_df_partial["link"] = 1
# data = data.append(edge_list_ghost[['node_1', 'node_2', 'link']], ignore_index=True)
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement
data = pd.concat([data, fb_df_partial], ignore_index=True)

if not os.path.exists(os.path.join(os.curdir, "author.embedding")):
    node2vec = Node2Vec(G_data, dimensions=NODE2VEC_DIMENSION)
    n2w_model = node2vec.fit(window=7, min_count=1, workers=12)
    n2w_model.save("author.embedding")
else:
    n2w_model = Word2Vec.load("author.embedding")

edge_embeddings = HadamardEmbedder(keyed_vectors=n2w_model.wv)
if not os.path.exists(os.path.join(os.curdir, "mapping.npy")):
    x = list()
    for i, j in zip(data['node_1'], data['node_2']):
        try:
            x.append(edge_embeddings[(str(i), str(j))])
        except KeyError:
            # Node pairs absent from the embedding vocabulary fall back to zeros
            x.append(np.zeros(NODE2VEC_DIMENSION))
    x = np.array(x)
    np.save("mapping", x)
else:
    x = np.load("mapping.npy")

# y = list()
# for i, j in zip(data['node_1'], data['node_2']):
#     y.append(list(set(nx.neighbors(author_graph, i)).intersection(set(nx.neighbors(author_graph, j)))))
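# The commented-out block above sketches a common-neighbours feature. A working
# version (a sketch assuming author_graph is an undirected networkx graph that
# contains every id in data) would count the shared neighbours per pair rather
# than keep the raw lists:
y_cn = [
    len(set(author_graph.neighbors(i)) & set(author_graph.neighbors(j)))
    for i, j in zip(data['node_1'], data['node_2'])
]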