Example #1
def learn_embeddings(args):
    ############
    G = graph.load_edgelist(args.input, undirected=args.undirected)
    print("Walking...")
    walks = graph.build_deepwalk_corpus(G,
                                        num_paths=args.num_walks,
                                        path_length=args.walk_length,
                                        alpha=0,
                                        rand=random.Random(args.seed))

    print("Training...")
    model = Word2Vec(walks,
                     size=args.dimensions,
                     window=args.window_size,
                     min_count=0,
                     sg=1,
                     hs=1,
                     workers=1)

    model.wv.save_word2vec_format(args.output)
    ### Train with the skip-gram model to obtain the node embeddings
    print('done node embedding')
    ####          Fill in code here                 ####

    #####################
    return
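learn_embeddings() reads its settings off an args namespace. A minimal, hypothetical driver sketch; every flag name below is inferred from the attributes the function uses, not taken from the original script:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--input', required=True)
parser.add_argument('--output', required=True)
parser.add_argument('--undirected', action='store_true')
parser.add_argument('--num-walks', dest='num_walks', type=int, default=10)
parser.add_argument('--walk-length', dest='walk_length', type=int, default=40)
parser.add_argument('--dimensions', type=int, default=64)
parser.add_argument('--window-size', dest='window_size', type=int, default=5)
parser.add_argument('--seed', type=int, default=0)

learn_embeddings(parser.parse_args())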
Example #2
def run_from_params(params):
    # We load the walks if the current parameterization of them is already done.
    walk_path = join(base_dir, 'walks', 'walks-%(num_paths)ix%(path_length)i-paths.txt' % walk_params)
    if exists(walk_path):
        #print('Walk path already exists.')
        pass
    else:
        #print('Walking...')
        walk_model = params['walk_model']
        if walk_model == 'dw':
            walks = graph.build_deepwalk_corpus(G, params['num_paths'], params['path_length'], alpha=0)
        elif walk_model == 'n2v':
            G = G.preprocess_transition_probs(p=params['n2v_p'], q=params['n2v_q'])
            walks = G.simulate_walks(params['num_paths'], params['path_length'])
        else:
            raise Exception("Unknown walk model: %s" % walk_model)

        #print('Writing walks...')
        with open(walk_path, 'w+') as f:
            f.write(' '.join([node for path in walks for node in path]))

    params['corpus_file'] = walk_path
    params['output_file'] = join(base_dir, 'models', 'model-%s.vec' % str(datetime.datetime.utcnow()))
    _ = run_embedding(params)

    res = eval_blogcat(params['output_file'],
                       G=G, 
                       labels_matrix=labels_matrix, 
                       training_percents=[0.6],
                       normalize=1, verbose=0)

    # negative b/c we're minimizing
    score = res[0.6][0]['micro']
    #print 'micro-f1: %.3f' % score
    return -1.*score
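One subtlety in the walk-writing step above: joining every walk into a single space-separated line erases walk boundaries. If run_embedding treats each line of corpus_file as one sentence (an assumption about its corpus format), a variant that writes one walk per line preserves them:

# One walk per line keeps sentence boundaries for line-oriented corpus readers.
with open(walk_path, 'w') as f:
    for path in walks:
        f.write(' '.join(str(node) for node in path) + '\n')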
Example #3
def getdeepwalks(dir, number_walks=50, walk_length=10, seed=1):
    G = graph.load_adjacencylist(dir + '/adjedges.txt')

    print("Number of nodes: {}".format(len(G.nodes())))
    num_walks = len(G.nodes()) * number_walks
    print("Number of walks: {}".format(num_walks))

    print("Walking...")
    walks = graph.build_deepwalk_corpus(G,
                                        num_paths=number_walks,
                                        path_length=walk_length,
                                        alpha=0,
                                        rand=random.Random(seed))
    networksentence = []
    raw_walks = []
    for i, x in enumerate(walks):

        sentence = [gensim.utils.to_unicode(str(t)) for t in x]
        if i == 0:
            print("first walk is: ", x)
            print("sentence is: ", sentence)

        s = NetworkSentence(sentence, [sentence[0]], None,
                            0)  # label information is not used by random walk
        networksentence.append(s)
        raw_walks.append(sentence)

    return raw_walks, networksentence
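NetworkSentence is not defined in this snippet. Given the call NetworkSentence(sentence, [sentence[0]], None, 0) and the comment that label information is unused, a plausible stand-in is a namedtuple; the field names below are guesses:

from collections import namedtuple

# Hypothetical definition matching the call site above; field names are guesses.
NetworkSentence = namedtuple('NetworkSentence', 'words tags labels index')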
Example #4
def process(args):

  if args.format == "adjlist":
    G = graph.load_adjacencylist(args.input, undirected=args.undirected)
  elif args.format == "edgelist":
    G = graph.load_edgelist(args.input, undirected=args.undirected)
  elif args.format == "mat":
    G = graph.load_matfile(args.input, variable_name=args.matfile_variable_name, undirected=args.undirected)
  elif args.format == 'weighted_edgelist':
    G = nx.read_weighted_edgelist(args.input)
  else:
    raise Exception("Unknown file format: '%s'.  Valid formats: 'adjlist', 'edgelist', 'mat'" % args.format)

  print("Number of nodes: {}".format(len(G.nodes())))

  num_walks = len(G.nodes()) * args.number_walks

  print("Number of walks: {}".format(num_walks))

  data_size = num_walks * args.walk_length

  print("Data size (walks*length): {}".format(data_size))

  if data_size < args.max_memory_data_size:
    print("Walking...")
    if args.format == 'weighted_edgelist':
      # only changed this part -- shun
      walks = weighted_random_walk.random_walk(G, num_paths=args.number_walks,path_length=args.walk_length, alpha=0)
    else:
      walks = graph.build_deepwalk_corpus(G, num_paths=args.number_walks,path_length=args.walk_length, alpha=0, rand=random.Random(args.seed))
 
    print("Training...")
    model = Word2Vec(walks, size=args.representation_size, window=args.window_size, min_count=0, workers=args.workers)
  else:
    # `model` would otherwise be undefined on this path.
    raise ValueError("data_size %d exceeds max_memory_data_size %d" %
                     (data_size, args.max_memory_data_size))

  model.save_word2vec_format(args.output)
Example #5
 def _get_random_walks(self, G, num_walks, wlength, gtype='graph'):
     walks = graph.build_deepwalk_corpus(G,
                                         num_paths=num_walks,
                                         path_length=wlength,
                                         alpha=0,
                                         rand=random.Random(self._seed))
     return walks
Example #6
def deep_walk_data(args):
    nodedict = data_processor.records_to_graph()

    G = graph.load_adjacencylist("out.adj", undirected=True)

    # walk = graph.build_deepwalk_corpus(G, 2, 4, alpha=0,rand=random.Random(0))
    walk = graph.build_deepwalk_corpus(G,
                                       args.number_walks,
                                       args.walk_length,
                                       alpha=0,
                                       rand=random.Random(0))
    print(len(walk))
    model = Word2Vec(walk,
                     size=args.representation_size,
                     window=args.window_size,
                     min_count=0,
                     workers=args.workers)
    print(model)
    # # Namespace(csv_to_graph=True, loo=True, max_memory_data_size=1000000000, number_walks=10, representation_size=64, seed=0, walk_length=40, window_size=5, workers=1)
    # # Perform some evaluation of the model on the test dataset
    with open("./data/test_user_ratings.dat") as fin:
        fin.readline()
        groundtruth = [line.strip().split("\t")[:3]
                       for line in fin]  # (user, movie, rating)
    tr = [int(round(float(g[2]))) for g in groundtruth]
    print(groundtruth)
    pr = [
        predict_rating(model, nodedict, "u" + g[0], "m" + g[1])
        for g in groundtruth
    ]
    print(pr)
    print("MSE = %f" % mean_squared_error(tr, pr))
    print("accuracy = %f" % accuracy_score(tr, pr))
    cm = confusion_matrix(tr, pr, labels=range(1, 6))
    print(cm)
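predict_rating() and data_processor are external helpers not shown here. A hypothetical stand-in consistent with the call signature, assuming nodedict maps names like "u1" or "m42" to the Word2Vec vocabulary keys, and mapping cosine similarity onto the 1-5 rating scale:

import numpy as np

# Hypothetical stand-in for the missing predict_rating(): sim is in [-1, 1],
# so 3 + 2*sim lands in [1, 5].
def predict_rating(model, nodedict, user_key, movie_key):
    u = model.wv[str(nodedict[user_key])]
    m = model.wv[str(nodedict[movie_key])]
    sim = float(np.dot(u, m) / (np.linalg.norm(u) * np.linalg.norm(m)))
    return int(round(3 + 2 * sim))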
Example #7
 def _get_random_walks(self, G, num_walks, wlength):
     """return random walks."""
     walks = graph.build_deepwalk_corpus(G,
                                         num_paths=num_walks,
                                         path_length=wlength,
                                         alpha=0,
                                         rand=random.Random(self._seed))
     return walks
Example #8
    def load_graph(self, input_address, output_name="g1_out.embeddings", number_walks=10, walk_length=40,
                   max_memory_data_size=1000000000, matfile_variable_name="network", format='adjlist', undirected=True,
                   representation_size=16, workers=1, window_size=5, vertex_freq_degree=False, seed=0):
        if format == "adjlist":
            G = graph.load_adjacencylist(input_address, undirected=undirected)
        elif format == "edgelist":
            G = graph.load_edgelist(input_address, undirected=undirected)
        elif format == "mat":
            G = graph.load_matfile(input_address, variable_name=matfile_variable_name, undirected=undirected)
        else:
            raise Exception("Unknown file format: '%s'.  Valid formats: 'adjlist', 'edgelist', 'mat'" % format)

        print("Number of nodes: {}".format(len(G.nodes())))

        num_walks = len(G.nodes()) * number_walks

        print("Number of walks: {}".format(num_walks))

        data_size = num_walks * walk_length

        print("Data size (walks*length): {}".format(data_size))

        if data_size < max_memory_data_size:
            print("Walking...")
            walks = graph.build_deepwalk_corpus(G, num_paths=number_walks,
                                                path_length=walk_length, alpha=0, rand=random.Random(seed))
            print("Training...")
            model = Word2Vec(walks, size=representation_size, window=window_size, min_count=0, sg=1, hs=1,
                             workers=workers)
        else:
            print("Data size {} is larger than limit (max-memory-data-size: {}).  Dumping walks to disk.".format(
                data_size,
                max_memory_data_size))
            print("Walking...")

            walks_filebase = output_name + ".walks"
            walk_files = serialized_walks.write_walks_to_disk(G, walks_filebase, num_paths=number_walks,
                                                              path_length=walk_length, alpha=0,
                                                              rand=random.Random(seed),
                                                              num_workers=workers)

            print("Counting vertex frequency...")
            if not vertex_freq_degree:
                vertex_counts = serialized_walks.count_textfiles(walk_files, workers)
            else:
                # use degree distribution for frequency in tree
                vertex_counts = G.degree(nodes=G.iterkeys())

            print("Training...")
            walks_corpus = serialized_walks.WalksCorpus(walk_files)
            model = Skipgram(sentences=walks_corpus, vocabulary_counts=vertex_counts,
                             size=representation_size,
                             window=window_size, min_count=0, trim_rule=None, workers=workers)

        model.wv.save_word2vec_format("./dataset/{}".format(output_name))
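Most snippets on this page lean on helper modules from the original DeepWalk package plus gensim; the imports they assume are typically these (matching deepwalk's own entry point):

import random

from deepwalk import graph
from deepwalk import walks as serialized_walks
from deepwalk.skipgram import Skipgram
from gensim.models import Word2Vec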
Example #9
def process(args):

    if args.format == "adjlist":
        G = graph.load_adjacencylist(args.input, undirected=args.undirected)
    elif args.format == "edgelist":
        G = graph.load_edgelist(args.input, undirected=args.undirected)
    elif args.format == "mat":
        G = graph.load_matfile(
            args.input, variable_name=args.matfile_variable_name, undirected=args.undirected)
    else:
        raise Exception(
            "Unknown file format: '%s'.  Valid formats: 'adjlist', 'edgelist', 'mat'" % args.format)

    print("Number of nodes: {}".format(len(G.nodes())))

    num_walks = len(G.nodes()) * args.number_walks

    print("Number of walks: {}".format(num_walks))

    data_size = num_walks * args.walk_length

    print("Data size (walks*length): {}".format(data_size))

    if data_size < args.max_memory_data_size:
        print("Walking...")
        walks = graph.build_deepwalk_corpus(G, num_paths=args.number_walks,
                                            path_length=args.walk_length, alpha=0, rand=random.Random(args.seed))
        print("Training...")
        model = Word2Vec(walks, size=args.representation_size,
                         window=args.window_size, min_count=0, workers=args.workers)
    else:
        print("Data size {} is larger than limit (max-memory-data-size: {}).  Dumping walks to disk.".format(
            data_size, args.max_memory_data_size))
        print("Walking...")

        walks_filebase = args.output + ".walks"
        walk_files = serialized_walks.write_walks_to_disk(G, walks_filebase, num_paths=args.number_walks,
                                                          path_length=args.walk_length, alpha=0, rand=random.Random(args.seed),
                                                          num_workers=args.workers)

        print("Counting vertex frequency...")
        if not args.vertex_freq_degree:
            vertex_counts = serialized_walks.count_textfiles(
                walk_files, args.workers)
        else:
            # use degree distribution for frequency in tree
            vertex_counts = G.degree(nodes=G.iterkeys())

        print("Training...")
        model = Skipgram(sentences=serialized_walks.combine_files_iter(walk_files), vocabulary_counts=vertex_counts,
                         size=args.representation_size,
                         window=args.window_size, min_count=0, workers=args.workers)

    model.save_word2vec_format(args.output)
Example #10
def process(args):

  G = graph.load_adjacencylist(args.input, undirected=True)
  
  print("Number of nodes: {}".format(len(G.nodes())))

  walks = graph.build_deepwalk_corpus(G, num_paths=args.number_walks, path_length=args.walk_length, alpha=0, rand=random.Random(args.seed))

  print("Number of walks: {}".format(len(walks)))

  model = Word2Vec(walks, size=args.representation_size, window=args.window_size, min_count=0, workers=args.workers)

  model.save_word2vec_format(args.output)
Example #11
def process(edgelistFile, indexToTokens, undirected, number_walks, walk_length, seed):
  G = graph.load_edgelist(edgelistFile, undirected=undirected)

  print("Number of nodes: {}".format(len(G.nodes())))
  num_walks = len(G.nodes()) * number_walks
  print("Number of walks: {}".format(num_walks))
  data_size = num_walks * walk_length
  print("Data size (walks*length): {}".format(data_size))

  print("Walking...")
  walks = graph.build_deepwalk_corpus(G, num_paths=number_walks, path_length=walk_length, alpha=0, rand=random.Random(seed))
  

  return walks
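Note that indexToTokens is accepted but never used inside process(); presumably it maps node ids back to vocabulary tokens. A hypothetical post-processing step:

# Hypothetical use of the otherwise-unused indexToTokens:
# rewrite each walk of node ids as a walk of tokens.
token_walks = [[indexToTokens[int(node)] for node in walk] for walk in walks]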
Example #12
def process(args):

    print("Building Graph...")
    G_apa = buildGraphPCP()
#     G_apa = buildGraphAPA()
    
    print("walk...")
    walks = graph.build_deepwalk_corpus(G_apa, num_paths=args.number_walks,
                                        path_length=args.walk_length, alpha=0, rand=random.Random(args.seed))
    print("Training...")
    model = Word2Vec(walks, size=args.representation_size, window=args.window_size, min_count=0, workers=args.workers)
 
    print("Saving...")
    model.save_word2vec_format(args.output)
Example #13
def process(args):

  if args.format == "adjlist":
    G = graph.load_adjacencylist(args.input, undirected=args.undirected)
  elif args.format == "edgelist":
    G = graph.load_edgelist(args.input, undirected=args.undirected)
  elif args.format == "mat":
    G = graph.load_matfile(args.input, variable_name=args.matfile_variable_name, undirected=args.undirected)
  else:
    raise Exception("Unknown file format: '%s'.  Valid formats: 'adjlist', 'edgelist', 'mat'" % args.format)

  print("Number of nodes: {}".format(len(G.nodes())))

  num_walks = len(G.nodes()) * args.number_walks

  print("Number of walks: {}".format(num_walks))

  data_size = num_walks * args.walk_length

  print("Data size (walks*length): {}".format(data_size))

  if data_size < args.max_memory_data_size:
    print("Walking...")
    walks = graph.build_deepwalk_corpus(G, num_paths=args.number_walks,
                                        path_length=args.walk_length, alpha=0, rand=random.Random(args.seed))
    print("Training...")
    model = Word2Vec(walks, size=args.representation_size, window=args.window_size, min_count=0, workers=args.workers)
  else:
    print("Data size {} is larger than limit (max-memory-data-size: {}).  Dumping walks to disk.".format(data_size, args.max_memory_data_size))
    print("Walking...")

    walks_filebase = args.output + ".walks"
    walk_files = serialized_walks.write_walks_to_disk(G, walks_filebase, num_paths=args.number_walks,
                                         path_length=args.walk_length, alpha=0, rand=random.Random(args.seed),
                                         num_workers=args.workers)

    print("Counting vertex frequency...")
    if not args.vertex_freq_degree:
      vertex_counts = serialized_walks.count_textfiles(walk_files, args.workers)
    else:
      # use degree distribution for frequency in tree
      vertex_counts = G.degree(nodes=G.iterkeys())

    print("Training...")
    model = Skipgram(sentences=serialized_walks.combine_files_iter(walk_files), vocabulary_counts=vertex_counts,
                     size=args.representation_size,
                     window=args.window_size, min_count=0, workers=args.workers)

  model.save_word2vec_format(args.output)
Example #14
 def _gen_paths(self, num_walks, walk_length):
     print "Number of nodes(train graph): %d." % (len(self.G.nodes()))
     walks = graph.build_deepwalk_corpus(self.G,
                                         num_walks,
                                         walk_length,
                                         alpha=0,
                                         rand=random.Random(0))
     fo = open('%s_paths.%d' % (self.name, self.path_len), 'w+')
     for path in walks:
         labels = ''
         for node in path:
             if node < self.num_u:
                 labels += 'u%d ' % node
             else:
                 labels += 'q%d ' % (node - self.num_u)
         fo.write('%s\n' % labels)
     fo.close()
     return walks
Example #15
def run_from_params(params):
    # We load the walks if the current parameterization of them is already done.
    walk_path = join(
        base_dir, 'walks',
        'walks-%(num_paths)ix%(path_length)i-paths.txt' % walk_params)
    if exists(walk_path):
        #print('Walk path already exists.')
        pass
    else:
        #print('Walking...')
        walk_model = params['walk_model']
        if walk_model == 'dw':
            walks = graph.build_deepwalk_corpus(G,
                                                params['num_paths'],
                                                params['path_length'],
                                                alpha=0)
        elif walk_model == 'n2v':
            G = G.preprocess_transition_probs(p=params['n2v_p'],
                                              q=params['n2v_q'])
            walks = G.simulate_walks(params['num_paths'],
                                     params['path_length'])
        else:
            raise Exception("Unknown walk model: %s" % walk_model)

        #print('Writing walks...')
        with open(walk_path, 'w+') as f:
            f.write(' '.join([node for path in walks for node in path]))

    params['corpus_file'] = walk_path
    params['output_file'] = join(
        base_dir, 'models', 'model-%s.vec' % str(datetime.datetime.utcnow()))
    _ = run_embedding(params)

    res = eval_blogcat(params['output_file'],
                       G=G,
                       labels_matrix=labels_matrix,
                       training_percents=[0.6],
                       normalize=1,
                       verbose=0)

    # negative b/c we're minimizing
    score = res[0.6][0]['micro']
    #print 'micro-f1: %.3f' % score
    return -1. * score
Example #16
    def _get_random_walks(self, num_walks, wlength, label=False, type='graph'):
        if label:
            fname = self.dataset_dir + self.dataset + type + '.adjlist'
        else:
            fname = self.dataset_dir + self.dataset + '_' + type + '.adjlist'
        print(fname)

        # type = graph refers to structural graph
        if type == 'graph':
            G = graph.load_adjacencylist(fname)
        else:
            G = graph.load_adjacencylist(fname, undirected=True)

        print("Number of nodes: {}".format(len(G.nodes())))
        total_walks = len(G.nodes()) * num_walks
        print("Number of walks: {}".format(total_walks))
        walks = graph.build_deepwalk_corpus(G, num_paths=num_walks, path_length=wlength, alpha=0,
                                        rand=random.Random(self._seed))
        return walks, len(G.nodes())
Example #17
def process(args):

    G = graph.load_adjacencylist(args.input, undirected=True)

    print("Number of nodes: {}".format(len(G.nodes())))

    walks = graph.build_deepwalk_corpus(G,
                                        num_paths=args.number_walks,
                                        path_length=args.walk_length,
                                        alpha=0,
                                        rand=random.Random(args.seed))

    print("Number of walks: {}".format(len(walks)))

    model = Word2Vec(walks,
                     size=args.representation_size,
                     window=args.window_size,
                     min_count=0,
                     workers=args.workers)

    model.save_word2vec_format(args.output)
Example #18
def generateWalks(dir, T=5, number_walks=20, walk_length=10, topicN=1):
    adjlistfile = os.path.join(dir, "adjlist.txt")
    print('topicN:', topicN)
    print('number_walks:', number_walks)
    print('T:', T)
    #     labelfile = os.path.join(dir,"topics"+str(topicN)+".txt")
    labelfile = os.path.join(dir,
                             'topics' + str(topicN) + '_T_' + str(T) + '.txt')
    G = graph.load_adjacencylist(adjlistfile)
    num_walks = len(G.nodes()) * number_walks
    print("Number of nodes:", len(G.nodes()))
    print("Number of walks:", num_walks)

    print("walking...")
    walks = graph.build_deepwalk_corpus(G,
                                        num_paths=number_walks,
                                        path_length=walk_length,
                                        alpha=0,
                                        rand=random.Random(1))
    node2topics = dict()
    unique_labels = []
    with open(labelfile) as lr:
        for lline in lr:
            node_index = lline.split()[0]
            node2topics[node_index] = lline.split()[1].split(",")
            for label in lline.split()[1].split(","):
                if label not in unique_labels:
                    unique_labels.append(label)

    topical_raw_walks = []
    walkfile = os.path.join(dir, "walks.txt")
    with open(walkfile, 'w') as ww:
        for i, x in enumerate(walks):
            nodes = [str(t) for t in x]
            topics = [node2topics[str(t)] for t in x]
            topical_raw_walks.append(NodeTopic(nodes, topics))

            ww.write(' '.join(nodes) + "\n")
    return topical_raw_walks, len(unique_labels), node2topics
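NodeTopic is not defined in the snippet. Given the call NodeTopic(nodes, topics), a plausible hypothetical definition:

from collections import namedtuple

# Hypothetical definition matching the call NodeTopic(nodes, topics) above.
NodeTopic = namedtuple('NodeTopic', 'nodes topics')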
Example #19
def process(args):
  if args.format == "adjlist":
    G = graph.load_adjacencylist(args.input, undirected=args.undirected)
  else:
    raise Exception("Unknown file format: '%s'.  Valid formats: 'adjlist', 'edgelist', 'mat'" % args.format)

  print("Number of nodes: {}".format(len(G.nodes())))

  num_walks = len(G.nodes()) * args.number_walks
  print("Number of walks: {}".format(num_walks))

  data_size = num_walks * args.walk_length
  print("Data size (walks*length): {}".format(data_size))

  # Start walking
  print("Random walking...")
  walk_seq = graph.build_deepwalk_corpus(G, num_paths=args.number_walks,
                                      path_length=args.walk_length)
  print("Training...")
  model = Word2Vec(walk_seq, size=args.representation_size, window=args.window_size, min_count=0, sg=1, hs=1,
                   workers=args.workers)

  print("Training completed!")
  model.wv.save_word2vec_format(args.output)
Example #20
    print('Graph: {:>4}'.format(idx))
    G = graph.from_networkx(g)

    print("Number of nodes: {}".format(len(G.nodes())))
    if len(G.nodes()) == 0:
        continue

    num_walks = len(G.nodes()) * number_walks

    print("Number of walks: {}".format(num_walks))

    data_size = num_walks * walk_length

    print("Data size (walks*length): {}".format(data_size))

    print("Walking...")
    walks = graph.build_deepwalk_corpus(G,
                                        num_paths=number_walks,
                                        path_length=walk_length,
                                        alpha=0,
                                        rand=random.Random(seed))
    print("Training...")
    model = Word2Vec(walks,
                     size=representation_size,
                     window=window_size,
                     min_count=0,
                     workers=workers)

    # model.wv.save_word2vec_format(output)
    models.append(model)
print('Finished')
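This snippet is evidently the body of a loop over a collection of NetworkX graphs. A plausible self-contained scaffold around it; the graph list and hyperparameters here are assumptions:

import random
import networkx as nx
from deepwalk import graph
from gensim.models import Word2Vec

# Assumed context for the loop body above; all of these names are guesses.
graphs = [nx.karate_club_graph(), nx.cycle_graph(10)]
number_walks, walk_length = 10, 40
representation_size, window_size, workers, seed = 64, 5, 1, 0

models = []
for idx, g in enumerate(graphs):
    print('Graph: {:>4}'.format(idx))
    G = graph.from_networkx(g)
    if len(G.nodes()) == 0:
        continue
    walks = graph.build_deepwalk_corpus(G, num_paths=number_walks,
                                        path_length=walk_length, alpha=0,
                                        rand=random.Random(seed))
    models.append(Word2Vec(walks, size=representation_size,
                           window=window_size, min_count=0, workers=workers))
print('Finished')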
Example #21
    def features(self, number_walks = 1, walk_length = 100, 
                        max_memory_data_size = 1000000000, seed = 0, 
                        representation_size = 64, window_size = 5,
                        workers = 1, output = "SocialGraph",
                        vertex_freq_degree = False, nodes=None, take=100):
        
        path = self.path
        takenNodes = self.takenNodes
        
        print("Importing Social Graph Dataset")
        edges = self._loadData(path,self.sample,graph=True)
        
        print("Generating List of Edges")
        edgeTuples = edges.itertuples(False)
        
        del(edges)
        
        print("Generating Graph")
        socialNet = nx.DiGraph()
        sanitizedEdgeTuples=[]
        for edge in edgeTuples:
            sanitizedEdgeTuples.append(tuple(reversed(edge)))
        
        socialNet.add_edges_from(sanitizedEdgeTuples)
        del(edgeTuples)
        
        print("Computing Centrality Measures")
        #k = int(socialNet.number_of_nodes()*0.001)
        
        inDegree = dict(socialNet.in_degree())
        outDegree = dict(socialNet.out_degree())
        
        maxInDegree = max(inDegree.values())
        maxOutDegree = max(outDegree.values())
        
        minInDegree = min(inDegree.values())
        minOutDegree = min(outDegree.values())
        
        #closeness = nx.closeness_centrality(socialNet)
        #betweenness = nx.betweenness_centrality(socialNet,k=k)
        #eigenvector = nx.eigenvector_centrality(socialNet,k=k)
        
        # remove self loops
        print("Removing Self Loops")
        socialNet.remove_edges_from(socialNet.selfloop_edges())
        
        # convert to deepwalk format
        print("Transforming Graph Data Format")
        dwSocialNet = graph.from_networkx(socialNet, False)
        del(socialNet)
        
        # deepwalk process
        G = dwSocialNet
        del(dwSocialNet)
       
        num_walks = len(G.nodes()) * number_walks
        
        data_size = num_walks * walk_length
        
        if data_size < max_memory_data_size:
            print("Generating Random Walks")
            walks = graph.build_deepwalk_corpus(G, num_paths=number_walks, 
                                                path_length=walk_length, alpha=0, 
                                                rand=random.Random(seed))
        else:
            # `walks` would otherwise be undefined below.
            raise ValueError("data_size exceeds max_memory_data_size; "
                             "disk-backed walks are not implemented here")
        print("Applying Sampling Procedure")
        if nodes is None:
            nodes = [walk[0] for walk in walks]
        nodeWalks = pd.DataFrame({'node':nodes})
        nodeWalks = pd.concat([nodeWalks, pd.DataFrame.from_records(walks)], axis=1)
        nodeWalkSample = nodeWalks[nodeWalks.node.isin(self.sample)]
        del(G)
        
        if takenNodes is None:
            allNodes = []
            for walk in walks:
                for node in walk:
                    allNodes.append(node)
            
            nodeFrequencies = Counter(allNodes)
            
            del(allNodes)
            
            orderedNodeFrequencies = nodeFrequencies.most_common()
            
            del(nodeFrequencies)
            
            takenNodes = [node[0] for node in orderedNodeFrequencies[1:take]]
            
            del(orderedNodeFrequencies)
        
        print("Building One-Hot Encoded Matrix")
        encodedWalks = scipy.sparse.dok_matrix((len(nodeWalkSample['node'])*len(self.timestamps),take))
        nodeCodes = self._assignCodes(takenNodes)
        
        dayNodes = []
        row=0
        for node in nodeWalkSample['node']:
            #encode first row
            node_walk = nodeWalkSample.set_index('node').loc[node].tolist()
            other_flag=0
            col=0
            for element in takenNodes:
                if(element in node_walk):
                    encodedWalks[row,col]=1
                else:
                    encodedWalks[row,col]=0
                    other_flag=1
                col+=1
            encodedWalks[row,col]=other_flag
            row+=1
            # copy the row just filled; indexing `row` here would read the
            # next, still-empty row
            encodedRow = encodedWalks[row-1]
            dayNodes.append((node,self.timestamps[0]))
            #encode all other rows for user
            for time in self.timestamps[1:]:
                dayNodes.append((node,time))
                encodedWalks[row] = encodedRow
                row+=1
        del(nodeCodes)

        num_rows, num_cols = encodedWalks.shape
        
        columns=[]
        for walk in range(0,num_cols):
            columns.append("node"+str(walk))

        dayNodes = pd.SparseDataFrame({
            'id': [row[0] for row in dayNodes],
            'timestamp': [row[1] for row in dayNodes],
            'indegree': [(inDegree[int(row[0])] - minInDegree) / (maxInDegree - minInDegree) for row in dayNodes],
            # normalize the out-degree (not the in-degree) for this column
            'outdegree': [(outDegree[int(row[0])] - minOutDegree) / (maxOutDegree - minOutDegree) for row in dayNodes]})
        graphDayNodes = pd.concat([dayNodes, pd.SparseDataFrame(encodedWalks, columns=columns)], axis=1)
        
        return takenNodes,graphDayNodes.to_sparse()
Example #22
def process(edges_list,
            undirected=True,
            number_walks=10,
            walk_length=40,
            window_size=5,
            workers=1,
            dimensions=64,
            max_memory_data_size=1000000000,
            seed=0,
            vertex_freq_degree=False):
    G = graph.load_edgelist(edges_list, undirected=undirected)

    #print("Number of nodes: {}".format(len(G.nodes())))

    num_walks = len(G.nodes()) * number_walks

    #print("Number of walks: {}".format(num_walks))

    data_size = num_walks * walk_length

    #print("Data size (walks*length): {}".format(data_size))

    if data_size < max_memory_data_size:
        #  print("Walking...")
        walks = graph.build_deepwalk_corpus(G,
                                            num_paths=number_walks,
                                            path_length=walk_length,
                                            alpha=0,
                                            rand=random.Random(seed))
        #  print("Training...")
        model = Word2Vec(walks,
                         size=dimensions,
                         window=window_size,
                         min_count=0,
                         workers=workers)
    else:
        #  print("Data size {} is larger than limit (max-memory-data-size: {}).  Dumping walks to disk.".format(data_size, max_memory_data_size))
        #  print("Walking...")

        walks_filebase = "karate.embeddings" + ".walks"
        walk_files = serialized_walks.write_walks_to_disk(
            G,
            walks_filebase,
            num_paths=number_walks,
            path_length=walk_length,
            alpha=0,
            rand=random.Random(seed),
            num_workers=workers)

        #  print("Counting vertex frequency...")
        if not vertex_freq_degree:
            vertex_counts = serialized_walks.count_textfiles(
                walk_files, workers)
        else:
            # use degree distribution for frequency in tree
            vertex_counts = G.degree(nodes=G.iterkeys())

    #  print("Training...")
        model = Skipgram(
            sentences=serialized_walks.combine_files_iter(walk_files),
            vocabulary_counts=vertex_counts,
            size=dimensions,
            window=window_size,
            min_count=0,
            workers=workers)

    #model.save_word2vec_format("karate.embeddings")
    return model
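Unlike most snippets on this page, this process() returns the model instead of writing it out. A usage sketch; the edge-list path and node id are hypothetical:

model = process("karate.edgelist", number_walks=10, walk_length=40)
print(model.wv.most_similar("1", topn=5))  # nearest neighbors of node "1"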
Example #23
    def build_deepwalk_model(self, number_walks=10,
                             walk_length=40,
                             seed=0, representation_size=64,
                             window_size=5,
                             workers=0, epochs=50, sg=0, hs=0,
                             concepts_only=False):
        from deepwalk.graph import load_edgelist, build_deepwalk_corpus
        self.preload_merged_dist_graph()
        nmap_inv = {}
        for k, v in self.udg_nodemap.items():
            nmap_inv.setdefault(str(v), []).append(k)
        assert nx.number_connected_components(self.udg) == 1


        if workers <= 0:
            workers = multiprocessing.cpu_count()
        if workers <= 0:
            workers = multiprocessing.cpu_count()
        with tempfile.NamedTemporaryFile(prefix='graph', suffix='.edgelist') \
                as tf:
            tf.flush()
            temp_name = tf.name
            nx.write_edgelist(self.udg, temp_name, data=False)
            print("%s saved" % temp_name)
            G = load_edgelist(temp_name, undirected=True)
        print("Number of nodes: {}".format(len(G.nodes())))
        num_walks = len(G.nodes()) * number_walks
        print("Number of walks: {}".format(num_walks))

        data_size = num_walks * walk_length
        print("Data size (walks*length): {}".format(data_size))
        print("Walking...")
        walks = build_deepwalk_corpus(G, num_paths=number_walks,
                                      path_length=walk_length,
                                      alpha=0,
                                      rand=random.Random(seed))
        print("Training...")
        model = Word2Vec(walks, size=representation_size,
                         window=window_size, min_count=0, sg=sg, hs=hs,
                         iter=epochs,
                         workers=workers, callbacks=[GensimEpochLogger()])
        keys = list(model.wv.vocab.keys())
        wc = len(self.udg_nodemap)
        awc = 0
        dim = model.vector_size
        if len(nmap_inv) != len(keys):
            kd = set(nmap_inv.keys()).difference(set(keys))
            print("MISSING GROUPS:")
            print(kd)
            for g in kd:
                print(nmap_inv[str(g)])
            raise RuntimeError("Groups are inconsistent!")

        with tempfile.NamedTemporaryFile(mode='w', prefix='gemb',
                                         suffix='.txt') as tf:
            tf.write("%d %d\n" % (wc, dim))
            for w in keys:
                v = model.wv.vocab.pop(w)
                vec = model.wv.vectors[v.index]
                for inv_w in nmap_inv[w]:
                    tf.write("%s %s" % (inv_w, " ".join([str(i) for i in vec])))
                    tf.write("\n")
                    awc += 1
            tf.seek(0)
            tf.flush()
            assert wc == awc
            model = KeyedVectors.load_word2vec_format(tf.name, binary=False)
        return model
Example #24
def process(params, save=True):
    """
    :param params:  传入参数用于训练
    :param save:   是否保存 训练的数据
    :return:
    """
    if params["format"] == "adjlist":
        G = graph.load_adjacencylist(params["input"],
                                     undirected=params["undirected"])
    elif params["format"] == "edgelist":
        G = graph.load_edgelist(params["input"],
                                undirected=params["undirected"])
    elif params["format"] == "mat":
        G = graph.load_matfile(params["input"],
                               variable_name=params["matfile_variable_name"],
                               undirected=params["undirected"])
    else:
        print("Invalid input format: %s" % params["format"])
        raise Exception(
            "Unknown file format: '%s'.  Valid formats: 'adjlist', 'edgelist', "
            "'mat'" % params["format"])
    print("Number of nodes: {}".format(len(G.nodes())))

    num_walks = len(G.nodes()) * params["number_walks"]

    print("Number of walks:{}".format(num_walks))

    data_size = num_walks * params["walk_length"]

    print("Data size (walks*length):{}".format(data_size))

    if data_size < params["max_memory_data_size"]:
        print("Walking...")
        walks = graph.build_deepwalk_corpus(
            G,
            num_paths=params.get("number_walks", 10),
            path_length=params.get("walk_length", 40),
            alpha=params.get("alpha", 0),
            rand=random.Random(params.get("seed", 0)))

        print("Training...")
        model = Word2Vec(walks,
                         size=params.get("representation_size", 64),
                         window=params.get("window_siz", 5),
                         min_count=params.get("min_count", 0),
                         sg=params.get("sg", 1),
                         hs=params.get("hs", 1),
                         workers=params.get("workers", 1))
    else:
        print(
            "Data size {} is larger than limit (max-memory-data-size: {}).  Dumping walks to disk."
            .format(data_size, params.get("max_memory_data_size")))

        print("Walking...")

        walks_filebase = params["output"] + ".walks"
        walks_files = wk.write_walks_to_disk(
            G,
            walks_filebase,
            num_paths=params.get("number_walks", 10),
            path_length=params.get("walk_length", 40),
            alpha=params.get("alpha", 0),
            rand=random.Random(params.get("seed", 0)),
            num_workers=params.get("workers", 1))

        print("Counting vertex frequecy...")  # 统计节点频次

        if params["vertex_freq_degree"]:
            vertex_counts = wk.count_textfiles(walks_files, params["workers"])

        else:
            vertex_counts = G.degree(nodes=G.iterkeys())

        print("Training...")

        walks_corpus = wk.WalksCorpus(walks_files)  # walk corpus

        model = Skipgram(sentences=walks_corpus,
                         vocabulary_counts=vertex_counts,
                         size=params.get("representation_size"),
                         window=params.get("windows_size", 80),
                         min_count=params.get("min_count", 0),
                         trim_rule=params.get("trim_rule", None),
                         workers=params.get("workers", 8))
    if save:
        model.wv.save_word2vec_format(params["output"])  # save the model
    else:
        models = model.wv.load_word2vec_format(params["output"])  # load the model
        return models
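A minimal parameter dict for driving process(); the keys mirror exactly what the function reads, and the input/output paths are hypothetical:

# Minimal params dict for process(); file paths are placeholders.
params = {
    "format": "adjlist",
    "input": "karate.adjlist",
    "output": "karate.embeddings",
    "undirected": True,
    "matfile_variable_name": "network",
    "number_walks": 10,
    "walk_length": 40,
    "max_memory_data_size": 1000000000,
    "representation_size": 64,
    "window_size": 5,
    "workers": 1,
    "seed": 0,
    "vertex_freq_degree": False,
}
process(params, save=True)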
Example #25
def process(args):
    """Section1:
      deepwalk的输入可以是三种格式的数据,因此针对这三种格式的数据分别构建图类对象G
      我们这里主要关心的是karae.adjlist和p2p-Gnutella08.edgelist两个文件对应的格式,
      这两个文件都在文件夹example_graphs里面
      对于.mat格式我们不关心,也不使用
    """
    if args.format == "adjlist":
        G = graph.load_adjacencylist(args.input, undirected=args.undirected)
    elif args.format == "edgelist":
        G = graph.load_edgelist(args.input, undirected=args.undirected)
    elif args.format == "mat":
        G = graph.load_matfile(args.input,
                               variable_name=args.matfile_variable_name,
                               undirected=args.undirected)
    else:
        raise Exception(
            "Unknown file format: '%s'.  Valid formats: 'adjlist', 'edgelist', 'mat'"
            % args.format)
    """Section2:
    下面打印的都是一些基本信息
    """
    print("Number of nodes: {}".format(len(G.nodes())))

    num_walks = len(G.nodes()) * args.number_walks

    print("Number of walks: {}".format(num_walks))

    data_size = num_walks * args.walk_length

    print("Data size (walks*length): {}".format(data_size))
    """Section3:
      这里的data_size不是实际的内存大小,只是节点使用总次数的计数。
      在这个if-else代码段中,更常出现的情况是if,而else里的代码几乎从来不会出现,除非data_size超过了max_memory_data_size
      max_memory_data_size默认的是10亿次。
    """
    if data_size < args.max_memory_data_size:
        print("Walking...")
        walks = graph.build_deepwalk_corpus(G,
                                            num_paths=args.number_walks,
                                            path_length=args.walk_length,
                                            alpha=0,
                                            rand=random.Random(args.seed))
        """
        这里直接调用图对象里的build_deepwalk_corpus,它的主要作用是,对每一个点进行多次随机游走,最后的到全部的walks。
        相当于deepwalk论文里两层for循环randomwalk,
        也相当于node2vec里两层for循环node2vecWalk,
        也相当于你算法里的两层for循环下的adawalk
        num_paths=args.number_walks:是采样的轮数,即每一个起点采样的次数,相当于你论文里的$\gamma$
        path_length=args.walk_length, 
        alpha=0, 指的是以概率1-alpha从当前节点继续走下去(可以回到上一个节点),或者以alpha的概率停止继续走,回到路径的起点重新走。
                 请注意:这里的随机游走路径未必是连续的,有可能是走着走着突然回到起点接着走
                 即便alpha=0,也不能保证以后的路径不会出现起点,因为有可能从起点开始走到第二个点时,这第二个点以1的概率继续走下去,
                 这时候它可能会回到起点。
                 但是如果alpha=1,那么就意味着它不可能继续走下去,只能每次都从起点开始重新走,所以这个时候打印出来的路径序列是一连串的起点。
        rand=random.Random(args.seed)
        """

        print("Training...")

        model = Word2Vec(walks,
                         size=args.representation_size,
                         window=args.window_size,
                         min_count=0,
                         sg=1,
                         hs=1,
                         workers=args.workers)
        """
        这段代码几乎不需要改动,它就是你算法里的SkipGram,你算发里有三个输入:
        random walks---->walks
        context size---->window=args.window_size
        embedding size---->size=args.representation_size
        如果你想对这个模型了解的更多,或者是做更加人性化的定制,需要继续了解from gensim.models import Word2Vec下的源码。
        我们会在下一篇博文中专门解读这个源码
        Word2Vec:

        Initialize the model from an iterable of `sentences`. Each sentence is a
        list of words (unicode strings) that will be used for training.

        Parameters
        ----------
        sentences : iterable of iterables
            The `sentences` iterable can be simply a list of lists of tokens, but for larger corpora,
            consider an iterable that streams the sentences directly from disk/network.
            See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus`
            or :class:`~gensim.models.word2vec.LineSentence` in :mod:`~gensim.models.word2vec` module for such examples.
            If you don't supply `sentences`, the model is left uninitialized -- use if you plan to initialize it
            in some other way.

        sg : int {1, 0}
            Defines the training algorithm. If 1, skip-gram is employed; otherwise, CBOW is used.
        size : int
            Dimensionality of the feature vectors.
        window : int
            The maximum distance between the current and predicted word within a sentence.
        alpha : float
            The initial learning rate.
        min_alpha : float
            Learning rate will linearly drop to `min_alpha` as training progresses.
        seed : int
            Seed for the random number generator. Initial vectors for each word are seeded with a hash of
            the concatenation of word + `str(seed)`. Note that for a fully deterministically-reproducible run,
            you must also limit the model to a single worker thread (`workers=1`), to eliminate ordering jitter
            from OS thread scheduling. (In Python 3, reproducibility between interpreter launches also requires
            use of the `PYTHONHASHSEED` environment variable to control hash randomization).
        min_count : int
            Ignores all words with total frequency lower than this.
        max_vocab_size : int
            Limits the RAM during vocabulary building; if there are more unique
            words than this, then prune the infrequent ones. Every 10 million word types need about 1GB of RAM.
            Set to `None` for no limit.
        sample : float
            The threshold for configuring which higher-frequency words are randomly downsampled,
            useful range is (0, 1e-5).
        workers : int
            Use these many worker threads to train the model (=faster training with multicore machines).
        hs : int {1,0}
            If 1, hierarchical softmax will be used for model training.
            If set to 0, and `negative` is non-zero, negative sampling will be used.
        negative : int
            If > 0, negative sampling will be used, the int for negative specifies how many "noise words"
            should be drawn (usually between 5-20).
            If set to 0, no negative sampling is used.
        cbow_mean : int {1,0}
            If 0, use the sum of the context word vectors. If 1, use the mean, only applies when cbow is used.
        hashfxn : function
            Hash function to use to randomly initialize weights, for increased training reproducibility.
        iter : int
            Number of iterations (epochs) over the corpus.
        trim_rule : function
            Vocabulary trimming rule, specifies whether certain words should remain in the vocabulary,
            be trimmed away, or handled using the default (discard if word count < min_count).
            Can be None (min_count will be used, look to :func:`~gensim.utils.keep_vocab_item`),
            or a callable that accepts parameters (word, count, min_count) and returns either
            :attr:`gensim.utils.RULE_DISCARD`, :attr:`gensim.utils.RULE_KEEP` or :attr:`gensim.utils.RULE_DEFAULT`.
            Note: The rule, if given, is only used to prune vocabulary during build_vocab() and is not stored as part
            of the model.
        sorted_vocab : int {1,0}
            If 1, sort the vocabulary by descending frequency before assigning word indexes.
        batch_words : int
            Target size (in words) for batches of examples passed to worker threads (and
            thus cython routines).(Larger batches will be passed if individual
            texts are longer than 10000 words, but the standard cython code truncates to that maximum.)
        compute_loss: bool
            If True, computes and stores loss value which can be retrieved using `model.get_latest_training_loss()`.
        callbacks : :obj: `list` of :obj: `~gensim.models.callbacks.CallbackAny2Vec`
            List of callbacks that need to be executed/run at specific stages during training.

        Examples
        --------
        Initialize and train a `Word2Vec` model

        >>> from gensim.models import Word2Vec
        >>> sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]]
        >>>
        >>> model = Word2Vec(sentences, min_count=1)
        >>> say_vector = model['say']  # get vector for word
        """

    else:
        print(
            "Data size {} is larger than limit (max-memory-data-size: {}).  Dumping walks to disk."
            .format(data_size, args.max_memory_data_size))
        print("Walking...")
        """
        在这种情况下,游走路径会被存入到一系列文件output.walks.x中,即如果你在命令行指定了--output file_path
        那么程序结束后会出现两个文件,一个是file_path,一个是file_path.walks.0,file_path.walks.1,...file_path.walks.n。
        file_path存的是各个节点的embedding,
        output.walks.x存的是采样的路径,x表示这个文件是第x个处理器存入的
        x的个数与处理器个数相同
        """
        walks_filebase = args.output + ".walks"
        walk_files = serialized_walks.write_walks_to_disk(
            G,
            walks_filebase,
            num_paths=args.number_walks,
            path_length=args.walk_length,
            alpha=0,
            rand=random.Random(args.seed),
            num_workers=args.workers)
        """
        在目前的层次上,你只需要知道serialized_walks.write_walks_to_disk本质上也是在调用graph里的randwalk,只不过包装一下
        加入了并行化代码和写到磁盘的程序。
        这个函数具体的细节需要稍后剖析另一个文件walk.py
        函数里各个参数的含义可以参考if语句里面的代码
        """

        # print("Counting vertex frequency...")
        # if not args.vertex_freq_degree:
        #     """Use vertex degree to estimate the frequency of nodes
        #     in the random walks. This option is faster than
        #     calculating the vocabulary."""
        #     vertex_counts = serialized_walks.count_textfiles(walk_files, args.workers)
        # else:
        #     # use degree distribution for frequency in tree
        #     vertex_counts = G.degree(nodes=G.iterkeys())
        #
        # print("Training...")
        # walks_corpus = serialized_walks.WalksCorpus(walk_files)
        # model = Skipgram(sentences=walks_corpus, vocabulary_counts=vertex_counts,
        #                  size=args.representation_size,
        #                  window=args.window_size, min_count=0, trim_rule=None, workers=args.workers)
        """
        上面注释掉的代码是原来的代码,这里没有使用Word2Vec而是使用了自己编写的Skipgram,
        但是Skipgram仅仅是对Word2Vec的输入参数做了一些保装,里面仍然在调用word2vec,
        word2vec本身也可以实现并行化运算,不知道这里为什么要这么做。
        需要说明的是:从现有的情况类看vertex_counts似乎并没什么卵用
        直接向下面这样写就可以:         
        """
        print("Training...")
        walks_corpus = serialized_walks.WalksCorpus(walk_files)
        model = Word2Vec(sentences=walks_corpus,
                         size=args.representation_size,
                         window=args.window_size,
                         min_count=0,
                         sg=1,
                         hs=1,
                         workers=args.workers)
    """Section?:
      训练完成之后,将模型输出
    """
    model.wv.save_word2vec_format(args.output)
Example #26
import random
from deepwalk import graph
from gensim.models import Word2Vec
# 1. Load the graph
G = graph.load_matfile("./blogcatalog.mat")
print(G.values())
print("Number of nodes: {}".format(len(G.nodes())))
num_walks = len(G.nodes()) * 10  # (number of random walks per node)
print("Number of walks: {}".format(num_walks))
data_size = num_walks * 40  # walk length
print("Data size (walks*length): {}".format(data_size))
# 2. Start the random walks
walks = graph.build_deepwalk_corpus(G,
                                    num_paths=10,
                                    path_length=40,
                                    alpha=0,
                                    rand=random.Random(0))
# 3. Turn the walks into vectors
model = Word2Vec(walks, size=64, window=5, min_count=0, sg=1, hs=1, workers=1)
# The in-memory approach above suits small datasets;
# large corpora need their walks written to disk.
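A sketch of that disk-backed path, using the same deepwalk helpers the larger examples on this page use (the walks filename here is arbitrary):

# Disk-backed variant for corpora too large for memory.
from deepwalk import walks as serialized_walks
from deepwalk.skipgram import Skipgram

walk_files = serialized_walks.write_walks_to_disk(
    G, "blogcatalog.walks", num_paths=10, path_length=40,
    alpha=0, rand=random.Random(0), num_workers=1)
vertex_counts = serialized_walks.count_textfiles(walk_files, 1)
corpus = serialized_walks.WalksCorpus(walk_files)
model = Skipgram(sentences=corpus, vocabulary_counts=vertex_counts,
                 size=64, window=5, min_count=0, trim_rule=None, workers=1)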
Example #27
def run_dw(matrix,
           num_walks=100,
           walk_length=5,
           representation_size=32,
           window_size=2,
           undirected=True,
           seed=0,
           workers=1):
    random.seed(seed)
    np.random.seed(seed)
    adj_list = []
    for n, edges in enumerate(matrix):
        adj_list.append([n] + edges.nonzero()[0].tolist())

    print(adj_list)

    G = graph.from_adjlist(adj_list)
    if undirected:
        G.make_undirected()

    print("Number of nodes: {}".format(len(G.nodes())))
    total_walks = len(G.nodes()) * num_walks

    print("Number of walks: {}".format(total_walks))

    data_size = total_walks * walk_length

    print("Data size (walks*length): {}".format(data_size))

    if data_size < 1000000000:
        print("Walking...")
        # num_paths is the per-node walk count (total walks = nodes * num_walks),
        # so pass num_walks here, not the grand total.
        walks = graph.build_deepwalk_corpus(G,
                                            num_paths=num_walks,
                                            path_length=walk_length,
                                            alpha=0,
                                            rand=random.Random(seed))
        print("Training...")
        model = Word2Vec(walks,
                         size=representation_size,
                         window=window_size,
                         min_count=0,
                         sg=1,
                         hs=1,
                         workers=workers)
    else:
        print(
            "Data size {} is larger than limit (max-memory-data-size: {}).  Dumping walks to disk."
            .format(data_size, 1000000000))
        print("Walking...")

        walks_filebase = str(adj_list) + ".walks"
        walk_files = serialized_walks.write_walks_to_disk(
            G,
            walks_filebase,
            num_paths=num_walks,
            path_length=walk_length,
            alpha=0,
            rand=random.Random(seed),
            num_workers=workers)

        print("Counting vertex frequency...")
        #if not args.vertex_freq_degree:
        vertex_counts = serialized_walks.count_textfiles(walk_files, workers)
        #else:
        #  # use degree distribution for frequency in tree
        #  vertex_counts = G.degree(nodes=G.iterkeys())

        print("Training...")
        walks_corpus = serialized_walks.WalksCorpus(walk_files)
        model = Skipgram(sentences=walks_corpus,
                         vocabulary_counts=vertex_counts,
                         size=representation_size,
                         window=window_size,
                         min_count=0,
                         trim_rule=None,
                         workers=workers,
                         seed=seed)

    embeddings = np.zeros((len(G.nodes()), representation_size))

    for i in range(len(G.nodes())):
        embeddings[i] = model.wv.get_vector(str(i))

    return embeddings
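run_dw() takes a dense adjacency matrix and returns one embedding row per node. A toy usage sketch on a 4-node cycle:

import numpy as np

# Toy usage: a 4-node cycle as a dense adjacency matrix.
A = np.array([[0, 1, 0, 1],
              [1, 0, 1, 0],
              [0, 1, 0, 1],
              [1, 0, 1, 0]])
emb = run_dw(A, num_walks=10, walk_length=5, representation_size=8)
print(emb.shape)  # (4, 8)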
Example #28
def process(args):

  # Build "(Node, Layer)" map
  if args.floor != "":
    floorFile = open(args.floor, 'r')
    for line in floorFile:
      nd, layer = line.strip().split()[:2]
      nd = int(nd)
      layer = int(layer)
      #print nd, layer
      if nd not in graph.Graph.nodePos:
        graph.Graph.nodeList.append(graph.NodeType(nd,layer))
        graph.Graph.nodePos[nd] = len(graph.Graph.nodeList)-1

  # read input Graph
  if args.format == "adjlist":
    G = graph.load_adjacencylist(args.input, undirected=args.undirected)
  elif args.format == "edgelist":
    G = graph.load_edgelist(args.input, undirected=args.undirected)
  elif args.format == "mat":
    G = graph.load_matfile(args.input, variable_name=args.matfile_variable_name, undirected=args.undirected)
  else:
    raise Exception("Unknown file format: '%s'.  Valid formats: 'adjlist', 'edgelist', 'mat'" % args.format)
  
  timelog = ""

  print("Number of nodes: {}".format(len(G.nodes())))
  num_walks = len(G.nodes()) * args.number_walks
  print("Number of walks: {}".format(num_walks))
  data_size = num_walks * args.walk_length
  print("Data size (walks*length): {}".format(data_size))

  # Centrality calculation >> store in File
  '''
  centrality = nxGraph(args.input)
  print centrality
  fo = open("closeness.txt","wb")
  for k in centrality.keys():
    fo.write("{} {}\n".format(k,centrality[k]))
  fo.close()
  '''
  #exit()
  lsfile = open(args.LSfile, 'r')
  calculateBC(lsfile)
  #exit()

  #building (Unit)Metapath Table
  MPList = []
  graph.Graph.mpath = []
  if args.metapath != "":
    mpfile = open(args.metapath, 'r')
    for line in mpfile:
      MPList.append(int(line.strip().split()[0]))
  print "(Unit)Metapath: {}".format(MPList)
  while len(graph.Graph.mpath) < args.walk_length:
    graph.Graph.mpath.extend(MPList)
  args.walk_length = len(graph.Graph.mpath)
  print "(Full)Metapath: {}\nargs.walk_length: {}".format(graph.Graph.mpath, args.walk_length)
  
  tStart = time.time()

  if data_size < args.max_memory_data_size:
    print("Walking...")
    walks = graph.build_deepwalk_corpus(G, num_paths=args.number_walks,
                                        path_length=args.walk_length, alpha=0, rand=random.Random())
    tEnd = time.time()
    print "Walking takes {} seconds".format(round(tEnd - tStart, 3))
    timelog = "{}, {}".format( timelog, round(tEnd-tStart, 3) )
    print "Number of walks generated: {}".format(len(walks))

    tStart = time.time()
    print("Training...")
    model = Word2Vec(walks, size=args.representation_size, window=args.window_size, min_count=0, workers=args.workers)
    tEnd = time.time()

    print "Training takes {} seconds".format(round(tEnd - tStart, 3))
    timelog = "{}, {}, ,{}".format( timelog, round(tEnd-tStart, 3), len(walks) )
  else:
    print("Data size {} is larger than limit (max-memory-data-size: {}).  Dumping walks to disk.".format(data_size, args.max_memory_data_size))
    print("Walking...")

    walks_filebase = args.output + ".walks"
    walk_files = serialized_walks.write_walks_to_disk(G, walks_filebase, num_paths=args.number_walks,
                                         path_length=args.walk_length, alpha=0, rand=random.Random(args.seed),
                                         num_workers=args.workers)

    print("Counting vertex frequency...")
    if not args.vertex_freq_degree:
      vertex_counts = serialized_walks.count_textfiles(walk_files, args.workers)
    else:
      # use degree distribution for frequency in tree
      vertex_counts = G.degree(nodes=G.iterkeys())

    print("Training...")
    model = Skipgram(sentences=serialized_walks.combine_files_iter(walk_files), vocabulary_counts=vertex_counts,
                     size=args.representation_size,
                     window=args.window_size, min_count=0, workers=args.workers)

  model.save_word2vec_format(args.output)
  with open(args.output, 'r') as f:
    timelog = "{}, {}\n".format( timelog, f.readline().split()[0] )
  with open(args.timelog, 'a') as tl:
    tl.write(timelog)