def initiEmbeddings():
    """Return the LINE embeddings for the network, training and caching on first use.

    If ``basepath + 'line_embeddings.emb'`` is missing, the graph at
    ``basepath + 'network.gml'`` is loaded, a second-order LINE model is
    trained (128 dims, 150 epochs), and the vectors are written to that file.
    The cached file is then loaded and returned as a 2-D numpy array.

    NOTE(review): row order in the saved matrix follows dict iteration order
    of ``get_embeddings()`` — confirm callers rely only on that implicit order.
    """
    emb_path = basepath + 'line_embeddings.emb'
    if not os.path.exists(emb_path):
        graph = nx.read_gml(basepath + "network.gml", label=None)
        line_model = LINE(graph, embedding_size=128, order='second')
        line_model.train(batch_size=1024, epochs=150, verbose=2)
        vectors = line_model.get_embeddings()
        # Stack per-node vectors into one matrix and persist as plain text.
        np.savetxt(emb_path, np.array([vectors[node] for node in vectors]))
    return np.loadtxt(emb_path)
def embedding_trainer(G, embedder, epochs=250, seed=1234, learning_rate=0.05,
                      embedding_dim=96, batch_size=1024, walk_length=30,
                      num_walks=200, window=10, p=1.0, q=1.0, workers=1,
                      temp_folder=None):
    """Train node embeddings for graph ``G`` with the requested algorithm.

    Parameters mirror the underlying libraries: ``epochs``/``batch_size`` are
    used by LINE; ``walk_length``/``num_walks``/``window``/``p``/``q`` by
    node2vec; ``temp_folder`` is node2vec's on-disk scratch directory.

    Returns a dict mapping node id -> embedding vector.

    Raises:
        ValueError: if ``embedder`` is not one of 'node2vec', 'line', 'rolx'.
    """
    if embedder == 'node2vec':
        # node2vec may spill walks to disk; create the scratch dir if needed.
        # BUG FIX: guard against the default temp_folder=None, which made
        # os.path.isdir() raise TypeError.
        if temp_folder is not None and not os.path.isdir(temp_folder):
            os.mkdir(temp_folder)
        model = Node2Vec(G, dimensions=embedding_dim, walk_length=walk_length,
                         num_walks=num_walks, p=p, q=q, weight_key='weight',
                         # BUG FIX: was hard-coded to 5, silently ignoring the
                         # ``workers`` parameter.
                         workers=workers, temp_folder=temp_folder)
        model = model.fit(window=window, min_count=1, seed=seed,
                          alpha=learning_rate, batch_words=4)
        # Round-trip through word2vec format so node2vec_embedder can parse it.
        model.wv.save_word2vec_format('./temp_embeddings_file.emb')
        embeddings = node2vec_embedder('temp_embeddings_file.emb')
        os.remove('./temp_embeddings_file.emb')
    elif embedder == 'line':
        model = LINE(G, embedding_size=embedding_dim, order='second')
        model.train(batch_size=batch_size, epochs=epochs, verbose=2)
        embeddings = model.get_embeddings()
    elif embedder == 'rolx':
        # BUG FIX: the whole expression was quoted as a single (nonexistent)
        # dict key: args["directory+'/embeddings/rolx_embedding.json'"].
        # Build the path from args['directory'] instead.
        with open(args['directory'] + '/embeddings/rolx_embedding.json') as fp:
            embeddings = json.load(fp)
    else:
        # Previously fell through and raised UnboundLocalError at return.
        raise ValueError("unknown embedder: {}".format(embedder))
    return embeddings
# t-SNE visualisation: project embeddings of the nodes in X down to 2-D and
# scatter-plot them coloured by each node's first label in Y.
emb_list = np.array([embeddings[node] for node in X])
tsne = TSNE(n_components=2)
node_pos = tsne.fit_transform(emb_list)

# Bucket sample indices by class label so every class gets one colour.
color_idx = {}
for i in range(len(X)):
    color_idx.setdefault(Y[i][0], []).append(i)

for label, indices in color_idx.items():
    plt.scatter(node_pos[indices, 0], node_pos[indices, 1], label=label)
plt.legend()
plt.show()


if __name__ == "__main__":
    G = nx.read_edgelist('../data/wiki/Wiki_edgelist.txt',
                         create_using=nx.DiGraph(), nodetype=None,
                         data=[('weight', int)])
    model = LINE(G, embedding_size=128, order='second')
    model.train(batch_size=1024, epochs=1, verbose=1)
    embeddings = model.get_embeddings()
    evaluate_embeddings(embeddings)
    plot_embeddings(embeddings)
# NOTE(review): ``emb_list`` is expected to exist already (initialised before
# this chunk) — this loop extends it with one vector per node in X.
for node in X:
    emb_list.append(embeddings[node])
emb_list = np.array(emb_list)

# Project to 2-D with t-SNE and plot, one colour per first-label class in Y.
tsne = TSNE(n_components=2)
node_pos = tsne.fit_transform(emb_list)

color_idx = {}
for i in range(len(X)):
    color_idx.setdefault(Y[i][0], []).append(i)

for label, indices in color_idx.items():
    plt.scatter(node_pos[indices, 0], node_pos[indices, 1], label=label)
plt.legend()
plt.show()


if __name__ == "__main__":
    G = nx.read_edgelist('../data/zachary/zachary_edgelist.txt',
                         create_using=nx.DiGraph(), nodetype=None,
                         data=[('weight', int)])
    model = LINE(G, embedding_size=128, order='second')
    model.train(batch_size=1024, epochs=50, verbose=2)
    embeddings = model.get_embeddings()
    evaluate_embeddings(embeddings)
    plot_embeddings(embeddings)
# Alternative embedders kept for reference (disabled in the original source):
#   Struc2Vec(G, 10, 80, workers=4, verbose=40)
#   SDNE(G, hidden_size=[256, 128])
#   Node2Vec(G, walk_length=config.walk_length, num_walks=config.num_walks,
#            p=0.25, q=4, workers=config.workers)
#   DeepWalk(G, walk_length=config.walk_length, num_walks=config.num_walks,
#            workers=config.workers)
#   (original note, translated: "model name in the config file is wrong;
#    falling back to the default DeepWalk model")
model = LINE(G, embedding_size=config.embedding_size, order=config.line_order,
             negative_ratio=config.negative_ratio)
model.train(batch_size=config.line_batch_size, epochs=50, verbose=2)
# Downstream steps (get_embeddings / evaluation / plotting, plus a pairwise
# similarity scan over the XunYiWenYao category file) were commented out in
# the original source and are intentionally not executed here.
def embedding_feature(self, full_data, key='uid', target='task_id', embedding_size=16,
                      epoch=10, window_size=5, mode='LINE', suffix='cnt',
                      order='second', graph='day'):
    """Build graph-embedding features via LINE or DeepWalk (cached on disk).

    Trains an embedding model over the ``target`` co-occurrence graph (or
    loads a previously pickled one), then returns two frames:
      * per-``key`` features: the mean embedding over that key's historical
        ``target`` list from ``full_data``;
      * the per-``target`` embedding table itself, one named column per dim.
    """
    model_path = './yangzhe/model/n2v/{}_{}_{}_{}_{}_{}.pkl'.format(
        mode, suffix, key, target, graph, embedding_size)
    if not os.path.exists(model_path):
        G = nx.read_edgelist(
            './yangzhe/feature/graph/{}_{}_{}_graph.csv'.format(target, suffix, graph),
            create_using=nx.DiGraph(), nodetype=None, data=[('weight', int)])
        tf.keras.backend.clear_session()
        if mode == 'LINE':
            model = LINE(graph=G, embedding_size=embedding_size, order=order)
            model.train(batch_size=64, epochs=epoch, verbose=1)
        else:
            model = DeepWalk(G, walk_length=10, num_walks=80, workers=1)
            model.train(embed_size=embedding_size, window_size=window_size, workers=5)
        with open(model_path, 'wb') as f:
            pickle.dump(model.get_embeddings(), f)

    # LINE with order='all' concatenates first- and second-order vectors,
    # doubling the effective dimensionality.
    if order == 'all':
        embedding_size = embedding_size * 2

    # Some targets never receive an embedding; they are simply absent from
    # the pickled dict (and therefore from the rows below).
    with open(model_path, 'rb') as f:
        embedding_dict = pickle.load(f)

    embedding = pd.DataFrame()
    embedding[target] = embedding_dict.keys()
    embedding['embedding'] = [embedding_dict[i] for i in embedding[target].values]
    embedding[target] = embedding[target].astype(int)

    # A key's feature vector is the mean embedding over the targets in its
    # history — restricted to targets that actually have an embedding.
    sentences = full_data[[key, target]].groupby([key])[target].agg(list)
    task_id_have_embedding = set(embedding[target])
    lbl = LabelEncoder()
    lbl.fit(embedding[target])
    emb_matrix = np.array([i for i in embedding['embedding'].values])
    emb_mean = []
    for idx_list in sentences.values.tolist():
        need_key = [x for x in idx_list if x in task_id_have_embedding]
        if len(need_key) == 0:
            # No known target in this key's history -> zero vector.
            mean = np.zeros((embedding_size,))
        else:
            index = lbl.transform(need_key)
            mean = np.mean(emb_matrix[index], axis=0)
        emb_mean.append(mean)
    emb_feature = np.asarray(emb_mean)
    mean_col = ['{}_{}(MainKEY)_{}_MEAN_Window{}_{}'.format(mode, key, target, window_size, i)
                for i in range(embedding_size)]
    emb_feature = pd.DataFrame(emb_feature, columns=mean_col)
    emb_feature[key] = sentences.index

    # Also materialise the per-target embedding matrix as named columns.
    embeddings = np.concatenate(embedding['embedding'].values).reshape(-1, embedding_size)
    embeddings = pd.DataFrame(
        embeddings,
        columns=["{}_{}_{}(MainKEY)_Window{}_{}".format(mode, key, target, window_size, i)
                 for i in range(embedding_size)])
    embedding[embeddings.columns] = embeddings
    del embedding['embedding']

    return emb_feature.reset_index(drop=True), embedding.reset_index(drop=True)