Exemplo n.º 1
0
 def test_n2v(self):
     """Fit Node2Vec on a 50-node complete graph and check its outputs.

     Verifies that:
       * ``predict(9)`` returns a vector of length ``wordsize`` for both
         models,
       * the ``walks`` attribute exists only on the model built with
         ``keep_walks=True``,
       * a model written with ``save`` and read back with ``load`` yields
         (almost) the same vector from ``predict(9)``.
     """
     tt = nx.generators.complete_graph(50)
     wordsize = 32

     def build_model(keep_walks):
         # A fresh w2vparams dict is created per model so the two
         # instances never share a mutable argument.
         return nodevectors.Node2Vec(walklen=5,
                                     epochs=5,
                                     threads=6,
                                     n_components=wordsize,
                                     keep_walks=keep_walks,
                                     verbose=False,
                                     w2vparams={
                                         "window": 3,
                                         "negative": 3,
                                         "iter": 3,
                                         "batch_words": 32,
                                         "workers": 2
                                     })

     # Gensim triggers deprecation warnings...
     # catch_warnings() restores the previous filter list on exit, even
     # when an assertion fails, and does not wipe filters installed by the
     # test runner the way warnings.resetwarnings() would.
     with warnings.catch_warnings():
         warnings.simplefilter("ignore", category=PendingDeprecationWarning)
         warnings.simplefilter("ignore", category=DeprecationWarning)
         g2v = build_model(keep_walks=True)
         g2v2 = build_model(keep_walks=False)
         g2v.fit(tt)
         g2v2.fit(tt)
         res_v = g2v.predict(9)
         self.assertEqual(len(res_v), wordsize)
         self.assertEqual(len(g2v2.predict(9)), wordsize)
         self.assertTrue(hasattr(g2v, 'walks'))
         self.assertFalse(hasattr(g2v2, 'walks'))

     # Test save/load
     fname = 'test_saving'
     zip_path = fname + '.zip'
     try:
         g2v.save(fname)
         g2v_l = nodevectors.Node2Vec.load(zip_path)
         res_l = g2v_l.predict(9)
         self.assertEqual(len(res_l), wordsize)
         np.testing.assert_array_almost_equal(res_l, res_v)
     finally:
         # If save() failed the archive may not exist; an unconditional
         # remove would raise FileNotFoundError and hide the real error.
         if os.path.exists(zip_path):
             os.remove(zip_path)
Exemplo n.º 2
0
 def test_node2vec_factored_names(self):
     """Fit Node2Vec on the unfactored edge list fixture and round-trip it.

     Checks that ``predict(9)`` returns a vector of length ``ndim`` and that
     a model written with ``save`` and read back with ``load`` yields
     (almost) the same vector.
     """
     tt = cg.read_edgelist("./tests/unfactored_edgelist.csv", sep=",")
     ndim = 3
     skle = nodevectors.Node2Vec(walklen=5,
                                 epochs=5,
                                 threads=1,
                                 n_components=ndim,
                                 keep_walks=True,
                                 verbose=False,
                                 w2vparams={
                                     "window": 3,
                                     "negative": 3,
                                     "iter": 3,
                                     "batch_words": 32,
                                     "workers": 2
                                 })
     skle.fit(tt)
     res_v = skle.predict(9)
     self.assertEqual(len(res_v), ndim)
     # Test save/load
     fname = 'test_saving'
     zip_path = fname + '.zip'
     try:
         skle.save(fname)
         # NOTE(review): the model is a Node2Vec but is loaded through
         # SKLearnEmbedder.load -- presumably a loader shared between the
         # embedder classes; confirm this is intentional.
         g2v_l = nodevectors.SKLearnEmbedder.load(zip_path)
         res_l = g2v_l.predict(9)
         self.assertEqual(len(res_l), ndim)
         np.testing.assert_array_almost_equal(res_l, res_v)
     finally:
         # If save() failed the archive may not exist; an unconditional
         # remove would raise FileNotFoundError and hide the real error.
         if os.path.exists(zip_path):
             os.remove(zip_path)
Exemplo n.º 3
0
 def test_node2vec_fit_transform(self):
     """Smoke-test ``Node2Vec.fit_transform`` on the edge list fixture.

     The return value is discarded; the test passes as long as reading the
     graph, building the model and calling ``fit_transform`` raise nothing.
     """
     graph = cg.read_edgelist("./tests/unfactored_edgelist.csv", sep=",")
     n_dims = 3
     # Options forwarded to the model through its ``w2vparams`` argument.
     word2vec_options = dict(window=3,
                             negative=3,
                             iter=3,
                             batch_words=32,
                             workers=2)
     model_options = dict(walklen=5,
                          epochs=5,
                          threads=1,
                          n_components=n_dims,
                          keep_walks=True,
                          verbose=False,
                          w2vparams=word2vec_options)
     embedder = nodevectors.Node2Vec(**model_options)
     embedder.fit_transform(graph)
Exemplo n.º 4
0
def _parse_bool_flag(value):
    """Convert command-line text such as 'true' or '0' into a bool.

    Raises:
        argparse.ArgumentTypeError: if the text is not a recognised boolean
            spelling, so argparse reports a usage error instead of silently
            treating any non-empty string (including 'False') as true.
    """
    if isinstance(value, bool):
        return value
    text = str(value).strip().lower()
    if text in ('1', 'true', 't', 'yes', 'y', 'on'):
        return True
    if text in ('', '0', 'false', 'f', 'no', 'n', 'off'):
        return False
    raise argparse.ArgumentTypeError(
        'expected a boolean value (e.g. true/false), got {!r}'.format(value))


def main():
    """Command-line entry point: fit Node2Vec on an edge list and save it.

    Parses the command-line options, echoes them to the console, reads the
    space-separated edge list with ``cg.read_edgelist``, fits a
    ``nodevectors.Node2Vec`` model and writes the vectors next to the input
    file as ``<input without extension>_node2vec_Embeddings.emb``. When
    ``--save_model`` is true the model itself is also written to
    ``<input without extension>_node2vec_Model.pckl``.

    ``--keep_walks`` and ``--save_model`` accept an optional boolean value
    (``-m``, ``-m true``, ``-m False`` ...); numeric options are validated
    by argparse so bad input is reported as a usage error.
    """
    # Imported locally so this function does not depend on a module-level
    # ``import os`` being present in the file.
    import os

    parser = argparse.ArgumentParser(
        description='A wrapper for running Node2Vec on Very Large Graphs')
    parser.add_argument(
        '-e',
        '--edgelist',
        help='Name/path to text file containing graph edge list',
        required=True)
    parser.add_argument('-d',
                        '--dim',
                        help='Embedding dimensions',
                        type=int,
                        required=True)
    parser.add_argument('-l',
                        '--walklen',
                        help='Random walk length',
                        type=int,
                        required=True)
    parser.add_argument('-r',
                        '--walknum',
                        help='Number of walks',
                        type=int,
                        required=True)
    parser.add_argument('-t',
                        '--threads',
                        help='# threads to use',
                        type=int,
                        default=0)
    parser.add_argument('-p',
                        '--return_weight',
                        help='Return node probability',
                        type=float,
                        default=1.)
    parser.add_argument('-q',
                        '--explore_weight',
                        help='Node visit probability',
                        type=float,
                        default=1.)
    parser.add_argument('-k',
                        '--window',
                        help='Context window size',
                        type=int,
                        required=True)
    # nargs='?' + const=True keeps "-w VALUE" working while also allowing a
    # bare "-w"; the type converter makes "-w False" actually mean False.
    parser.add_argument('-w',
                        '--keep_walks',
                        help='Save the random walks',
                        type=_parse_bool_flag,
                        nargs='?',
                        const=True,
                        default=False)
    parser.add_argument('-m',
                        '--save_model',
                        help='Save Gensim node2vec model',
                        type=_parse_bool_flag,
                        nargs='?',
                        const=True,
                        default=False)
    args = parser.parse_args()

    # Strip only the final extension. Splitting on the first '.' would
    # truncate paths such as './data/graph.txt' to an empty string.
    output_stem = os.path.splitext(args.edgelist)[0]
    embeddings_path = output_stem + '_node2vec_Embeddings.emb'
    model_path = output_stem + '_node2vec_Model.pckl'

    # print user parameters to console
    print(
        '\n#######################################################################\n'
    )
    print('NODE2VEC Parameters:')
    print('Edge List: {input_file}'.format(
        input_file=os.path.basename(args.edgelist)))
    print('Embedding Dimensions: {dim}'.format(dim=args.dim))
    print('Random walk Length: {walk_len}'.format(walk_len=args.walklen))
    print('Number of random walks: {walk_num}'.format(walk_num=args.walknum))
    print('Threads: {threads}'.format(threads=args.threads))
    print('Return Weight (p): {p}'.format(p=args.return_weight))
    print('Explore Weight (q): {q}'.format(q=args.explore_weight))
    print('Context Window Size: {window_size}'.format(window_size=args.window))
    print('Save Random Walks with Node2Vec Model: {keep_walks}'.format(
        keep_walks=args.keep_walks))
    print('Save Gensim Node2Vec Model: {save_model}'.format(
        save_model=args.save_model))
    print('Embedding output: {write_loc}'.format(write_loc=embeddings_path))
    print(
        '\n#######################################################################\n'
    )

    print('\n#### STEP 1: Convert Edge List to CSR Graph ####')
    graph = cg.read_edgelist(args.edgelist, sep=' ', header=None)

    print('\n#### STEP 2: Fit Embedding Model to Graph ####')
    g2v = nodevectors.Node2Vec(n_components=args.dim,
                               walklen=args.walklen,
                               epochs=args.walknum,
                               return_weight=args.return_weight,
                               neighbor_weight=args.explore_weight,
                               threads=args.threads,
                               keep_walks=args.keep_walks,
                               verbose=True,
                               w2vparams={
                                   'window': args.window,
                                   'iter': 10
                               })
    g2v.fit(graph)

    print('\n#### STEP 3: Save Model Output and Embeddings ####')
    # save embeddings (gensim.KeyedVector format)
    g2v.save_vectors(embeddings_path)

    if args.save_model:
        # save node2vec model -- uses a lot of memory and takes a very long time to run on large graphs
        g2v.save(model_path)