Example #1
import os

import networkx as nx
import numpy as np
from ge import LINE  # assumed to come from the GraphEmbedding (ge) package


def initEmbeddings():
    # Train LINE embeddings once and cache them to disk; `basepath` is
    # assumed to be a module-level variable.
    if not os.path.exists(basepath + 'line_embeddings.emb'):
        G = nx.read_gml(basepath + "network.gml", label=None)
        model = LINE(G, embedding_size=128, order='second')
        model.train(batch_size=1024, epochs=150, verbose=2)
        embeddings = model.get_embeddings()
        # Stack the embedding vectors (dict order; node IDs are not saved)
        # and cache them as plain text.
        t = [embeddings[key] for key in embeddings]
        np.savetxt(basepath + 'line_embeddings.emb', np.array(t))

    Embeddings = np.loadtxt(basepath + 'line_embeddings.emb')
    return Embeddings
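
A minimal usage sketch; `basepath` is assumed to point at the directory holding network.gml:

basepath = './data/'  # assumed data directory
Embeddings = initEmbeddings()
print(Embeddings.shape)  # (num_nodes, 128)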
Example #2

import json
import os

from ge import LINE
from node2vec import Node2Vec  # node2vec PyPI package

# `node2vec_embedder` (parses a saved word2vec file) and `args` (a config
# dict) are assumed to be defined elsewhere in the module.
def embedding_trainer(G,
                      embedder,
                      epochs=250,
                      seed=1234,
                      learning_rate=0.05,
                      embedding_dim=96,
                      batch_size=1024,
                      walk_length=30,
                      num_walks=200,
                      window=10,
                      p=1.0,
                      q=1.0,
                      workers=1,
                      temp_folder=None):
    if embedder == 'node2vec':
        # Guard against temp_folder=None and create nested paths if needed.
        if temp_folder and not os.path.isdir(temp_folder):
            os.makedirs(temp_folder)
        model = Node2Vec(G,
                         dimensions=embedding_dim,
                         walk_length=walk_length,
                         num_walks=num_walks,
                         p=p,
                         q=q,
                         weight_key='weight',
                         workers=workers,
                         temp_folder=temp_folder)
        model = model.fit(window=window,
                          min_count=1,
                          seed=seed,
                          alpha=learning_rate,
                          batch_words=4)
        model.wv.save_word2vec_format('./temp_embeddings_file.emb')
        embeddings = node2vec_embedder('./temp_embeddings_file.emb')
        os.remove('./temp_embeddings_file.emb')
    elif embedder == 'line':
        model = LINE(G, embedding_size=embedding_dim, order='second')
        model.train(batch_size=batch_size, epochs=epochs, verbose=2)
        embeddings = model.get_embeddings()
    elif embedder == 'rolx':
        # Load pre-computed RolX embeddings from disk.
        with open(args['directory'] + '/embeddings/rolx_embedding.json') as fp:
            embeddings = json.load(fp)
    else:
        raise ValueError('unknown embedder: {}'.format(embedder))
    return embeddings
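
A hedged usage sketch for the LINE branch (the toy graph is illustrative):

import networkx as nx

G = nx.karate_club_graph()  # illustrative toy graph
embeddings = embedding_trainer(G, embedder='line', epochs=5)
print(len(embeddings))  # one embedding vector per node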
Example #3
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from ge import LINE
from ge.classify import read_node_label


def plot_embeddings(embeddings):
    # Node IDs (X) and their labels (Y), as in the GraphEmbedding wiki example.
    X, Y = read_node_label('../data/wiki/wiki_labels.txt')

    emb_list = []
    for k in X:
        emb_list.append(embeddings[k])
    emb_list = np.array(emb_list)

    model = TSNE(n_components=2)
    node_pos = model.fit_transform(emb_list)

    color_idx = {}
    for i in range(len(X)):
        color_idx.setdefault(Y[i][0], [])
        color_idx[Y[i][0]].append(i)

    for c, idx in color_idx.items():
        plt.scatter(node_pos[idx, 0], node_pos[idx, 1], label=c)
    plt.legend()
    plt.show()


if __name__ == "__main__":
    G = nx.read_edgelist('../data/wiki/Wiki_edgelist.txt',
                         create_using=nx.DiGraph(), nodetype=None, data=[('weight', int)])

    model = LINE(G, embedding_size=128, order='second')
    model.train(batch_size=1024, epochs=1, verbose=1)
    embeddings = model.get_embeddings()

    evaluate_embeddings(embeddings)
    plot_embeddings(embeddings)
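
evaluate_embeddings, called above, is defined elsewhere in the example file; a minimal sketch of such a helper, assuming one label per node and using scikit-learn directly (the classifier choice is illustrative):

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

def evaluate_embeddings(embeddings):
    X, Y = read_node_label('../data/wiki/wiki_labels.txt')
    feats = np.array([embeddings[k] for k in X])
    labels = [y[0] for y in Y]  # assume a single label per node
    X_tr, X_te, y_tr, y_te = train_test_split(feats, labels, train_size=0.8, random_state=0)
    clf = LogisticRegression(max_iter=1000).fit(X_tr, y_tr)
    print('micro-F1:', f1_score(y_te, clf.predict(X_te), average='micro'))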
Example #4

# Imports as in Example #3.

def plot_embeddings(embeddings):
    # X (node IDs) and Y (labels) are read from a label file matching the
    # edge list below; the original snippet does not show the path.
    X, Y = read_node_label(labels_path)  # labels_path: not given in the source

    emb_list = []
    for k in X:
        emb_list.append(embeddings[k])
    emb_list = np.array(emb_list)

    model = TSNE(n_components=2)
    node_pos = model.fit_transform(emb_list)

    color_idx = {}
    for i in range(len(X)):
        color_idx.setdefault(Y[i][0], [])
        color_idx[Y[i][0]].append(i)

    for c, idx in color_idx.items():
        plt.scatter(node_pos[idx, 0], node_pos[idx, 1], label=c)
    plt.legend()
    plt.show()


if __name__ == "__main__":
    G = nx.read_edgelist('../data/zachary/zachary_edgelist.txt',
                         create_using=nx.DiGraph(),
                         nodetype=None,
                         data=[('weight', int)])

    model = LINE(G, embedding_size=128, order='second')
    model.train(batch_size=1024, epochs=50, verbose=2)
    embeddings = model.get_embeddings()

    evaluate_embeddings(embeddings)
    plot_embeddings(embeddings)
Example #5
    #     model = LINE(G, embedding_size=config.embedding_size, order='second')
    # elif config.model_name.lower() == 'struc2vec':
    #     model = Struc2Vec(G, 10, 80, workers=4, verbose=40, )
    # elif config.model_name.lower() == 'sdne':
    #     model = SDNE(G, hidden_size=[256, 128], )
    # elif config.model_name.lower() == 'node2vec':
    #     model = Node2Vec(G, walk_length=config.walk_length, num_walks=config.num_walks,p=0.25, q=4, workers=config.workers)
    # else:
    #     model = DeepWalk(G, walk_length=config.walk_length, num_walks=config.num_walks, workers=config.workers)
    #     print('Invalid model name in the config file; falling back to the default DeepWalk model')

    model = LINE(G,
                 embedding_size=config.embedding_size,
                 order=config.line_order,
                 negative_ratio=config.negative_ratio)
    model.train(batch_size=config.line_batch_size, epochs=50, verbose=2)

    # embeddings = model.get_embeddings()
    # evaluate_embeddings(embeddings)
    # plot_embeddings(embeddings)

    # entity_dict = {}
    # f = open('/Users/admin/Desktop/GraphEmbedding-deeplearning/data/XunYiWenYao/寻医问药category.txt','r',encoding='utf-8')
    # for i in f.readlines():
    #     entity_dict[i.strip().split(' ')[0]] = i.strip().split(' ')[1]
    #
    # max_similarity = 0
    # max_similarity_tuple = []
    #
    # for k1, v1 in entity_dict.items():
    #
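
The snippet above pulls hyperparameters from a `config` object that is not shown; a minimal stand-in with the attributes the live code uses (values are illustrative):

from types import SimpleNamespace

config = SimpleNamespace(
    embedding_size=128,    # dimensionality of the LINE embedding
    line_order='second',   # 'first', 'second', or 'all'
    negative_ratio=5,      # negative samples per positive edge
    line_batch_size=1024,  # training batch size
)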
Example #6

    # Method of a feature-engineering class (definition not shown). Requires
    # os, pickle, numpy as np, pandas as pd, networkx as nx, tensorflow as tf,
    # sklearn's LabelEncoder, and ge's LINE / DeepWalk.
    def embedding_feature(self, full_data, key='uid', target='task_id', embedding_size=16, epoch=10, window_size=5,
                          mode='LINE', suffix='cnt', order='second', graph='day'):
        # Use GraphEmbedding (LINE or DeepWalk) to learn a global embedding
        # for `target` nodes; the result is cached at model_path.
        model_path = './yangzhe/model/n2v/{}_{}_{}_{}_{}_{}.pkl'.format(mode, suffix, key, target, graph,
                                                                        embedding_size)

        if not os.path.exists(model_path):
            G = nx.read_edgelist('./yangzhe/feature/graph/{}_{}_{}_graph.csv'.format(target, suffix, graph),
                                 create_using=nx.DiGraph(), nodetype=None, data=[('weight', int)])
            tf.keras.backend.clear_session()
            if mode == 'LINE':
                model = LINE(graph=G, embedding_size=embedding_size, order=order)
                model.train(batch_size=64, epochs=epoch, verbose=1)
            else:
                model = DeepWalk(G, walk_length=10, num_walks=80, workers=1)
                model.train(embed_size=embedding_size, window_size=window_size, workers=5)
            with open(model_path, 'wb') as f:
                pickle.dump(model.get_embeddings(), f)

        # With order='all', LINE concatenates first- and second-order
        # embeddings, so the effective embedding size doubles.
        if order == 'all':
            embedding_size = embedding_size * 2

        # Some targets never get an embedding learned; they are missing from
        # the dict, so the embedding table has no rows for them.
        with open(model_path, 'rb') as f:
            embedding_dict = pickle.load(f)

        embedding = pd.DataFrame()
        embedding[target] = embedding_dict.keys()
        embedding['embedding'] = [embedding_dict[i] for i in embedding[target].values]
        embedding[target] = embedding[target].astype(int)
        # Sort by target so that emb_matrix row order matches the sorted class
        # order LabelEncoder uses below.
        embedding = embedding.sort_values(target).reset_index(drop=True)

        sentences = full_data[[key, target]].groupby([key])[target].agg(list)

        # A user's embedding is the mean over the embeddings of the targets in
        # their exposure history; those targets must appear in embedding[target].
        task_id_have_embedding = set(embedding[target])
        lbl = LabelEncoder()
        lbl.fit(embedding[target])
        emb_matrix = np.array([i for i in embedding['embedding'].values])
        emb_mean = []
        for idx_list in sentences.values.tolist():
            need_key = [x for x in idx_list if x in task_id_have_embedding]
            if len(need_key) == 0:
                mean = np.zeros((embedding_size,))
            else:
                index = lbl.transform(need_key)
                mean = np.mean(emb_matrix[index], axis=0)
            emb_mean.append(mean)
        emb_feature = np.asarray(emb_mean)
        mean_col = ['{}_{}(MainKEY)_{}_MEAN_Window{}_{}'.format(mode, key, target, window_size, i) for i in
                    range(embedding_size)]
        emb_feature = pd.DataFrame(emb_feature, columns=mean_col)
        emb_feature[key] = sentences.index

        # Also keep the per-target embedding matrix.
        embeddings = np.concatenate(embedding['embedding'].values).reshape(-1, embedding_size)
        embeddings = pd.DataFrame(embeddings,
                                  columns=["{}_{}_{}(MainKEY)_Window{}_{}".format(mode, key, target, window_size, i)
                                           for i in range(embedding_size)])
        embedding[embeddings.columns] = embeddings
        del embedding['embedding']

        return emb_feature.reset_index(drop=True), embedding.reset_index(drop=True)
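
A hedged usage sketch; FeatureExtractor stands in for whatever class owns this method (not shown), and the graph file under ./yangzhe/ must already exist:

import pandas as pd

# Toy exposure log: one row per (uid, task_id) event.
df = pd.DataFrame({'uid': [1, 1, 2], 'task_id': [10, 11, 10]})
extractor = FeatureExtractor()  # hypothetical owner of embedding_feature
user_emb, task_emb = extractor.embedding_feature(df, key='uid', target='task_id',
                                                 embedding_size=16, mode='LINE')
# user_emb: one mean-pooled embedding row per uid;
# task_emb: one embedding row per task_id.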