def process(args):
    if args.format == "adjlist":
        G = graph.load_adjacencylist(args.input, undirected=args.undirected)
    elif args.format == "edgelist":
        G = graph.load_edgelist(args.input, undirected=args.undirected)
    elif args.format == "mat":
        G = graph.load_matfile(args.input, variable_name=args.matfile_variable_name, undirected=args.undirected)
    else:
        raise Exception("Unknown file format: '%s'. Valid formats: 'adjlist', 'edgelist', 'mat'" % args.format)

    # G = graphConstruction.buildGraphAPA()

    print("Number of nodes: {}".format(len(G.nodes())))

    num_walks = len(G.nodes()) * args.number_walks
    print("Number of walks: {}".format(num_walks))

    data_size = num_walks * args.walk_length
    print("Data size (walks*length): {}".format(data_size))

    if data_size < args.max_memory_data_size:
        print("Walking...")
        walks = graph.build_deepwalk_corpus(G, num_paths=args.number_walks,
                                            path_length=args.walk_length, alpha=0,
                                            rand=random.Random(args.seed))
        print("Training...")
        model = Word2Vec(walks, size=args.representation_size, window=args.window_size,
                         min_count=0, workers=args.workers)
    else:
        print("Data size {} is larger than limit (max-memory-data-size: {}). Dumping walks to disk.".format(data_size, args.max_memory_data_size))
        print("Walking...")
        walks_filebase = args.output + ".walks"
        walk_files = serialized_walks.write_walks_to_disk(G, walks_filebase, num_paths=args.number_walks,
                                                          path_length=args.walk_length, alpha=0,
                                                          rand=random.Random(args.seed),
                                                          num_workers=args.workers)
        print("Counting vertex frequency...")
        if not args.vertex_freq_degree:
            vertex_counts = serialized_walks.count_textfiles(walk_files, args.workers)
        else:
            # use degree distribution for frequency in tree
            vertex_counts = G.degree(nodes=G.iterkeys())

        print("Training...")
        model = Skipgram(sentences=serialized_walks.combine_files_iter(walk_files),
                         vocabulary_counts=vertex_counts,
                         size=args.representation_size,
                         window=args.window_size, min_count=0, workers=args.workers)

    model.save_word2vec_format(args.output)
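# Usage sketch (an assumption, not part of the original script): process() above is
# normally driven by argparse. The attribute names below are exactly the ones the
# function reads; the values mirror the common DeepWalk defaults.
from argparse import Namespace

demo_args = Namespace(
    format="edgelist", input="karate.edgelist", undirected=True,
    number_walks=10, walk_length=40, max_memory_data_size=int(1e9),
    seed=0, representation_size=64, window_size=5, workers=1,
    vertex_freq_degree=False, output="karate.embeddings")
# process(demo_args)  # uncomment to run; writes embeddings to karate.embeddings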
def train(corpus):
    if args.retrain:
        Utils.remove_models('model')

    losses = []
    model, start_epoch = Utils.load_previous_model('model')
    if model is None:
        model = Skipgram(corpus.n_words, const.EMBEDDING_SIZE)
    if torch.cuda.is_available():
        model.cuda()

    optimizer = optim.Adam(model.parameters(), const.LR_RATE)
    for epoch in range(start_epoch, const.EPOCH):
        for i, batch in enumerate(corpus.batch_data(const.BATCH_SIZE)):
            inputs, targets = zip(*batch)  # unzip list of (input, target) pairs
            inputs = torch.cat(inputs)
            targets = torch.cat(targets)
            negs = corpus.negative_sampling(targets)

            model.zero_grad()
            loss = model(inputs, targets, negs)
            loss.backward()
            optimizer.step()

            losses.append(loss.item())

        if epoch % 10 == 0:
            print("Epoch : %d, mean_loss : %.02f" % (epoch, np.mean(losses)))
            Utils.save_model(model, epoch, 'model')
            losses = []
    Utils.save_model(model, epoch, 'model')
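# The Skipgram module that train() above calls as model(inputs, targets, negs) is
# not shown here. A minimal sketch matching that call signature with a standard
# negative-sampling objective (an assumption about the original class, not its code):
import torch
import torch.nn as nn
import torch.nn.functional as F

class SkipgramSketch(nn.Module):
    def __init__(self, vocab_size, embedding_size):
        super().__init__()
        self.in_embed = nn.Embedding(vocab_size, embedding_size)   # center-word vectors
        self.out_embed = nn.Embedding(vocab_size, embedding_size)  # context-word vectors

    def forward(self, inputs, targets, negs):
        # inputs, targets: (batch,) word indices; negs: (batch, k) negative samples
        v = self.in_embed(inputs)      # (batch, d)
        u = self.out_embed(targets)    # (batch, d)
        u_neg = self.out_embed(negs)   # (batch, k, d)
        pos_score = F.logsigmoid((v * u).sum(dim=1))
        neg_score = F.logsigmoid(-torch.bmm(u_neg, v.unsqueeze(2)).squeeze(2)).sum(dim=1)
        return -(pos_score + neg_score).mean()  # negative-sampling loss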
def train(self, embed_size=128, w_size=5, workers=3, iter_num=5, **kwargs):
    kwargs["sentences"] = self.sentences
    kwargs["min_count"] = kwargs.get("min_count", 0)
    kwargs["workers"] = workers
    kwargs["window"] = w_size
    kwargs["size"] = embed_size
    kwargs["iter"] = iter_num

    skipgram = Skipgram(**kwargs)
    self.w2v_model = skipgram
    return skipgram
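# Example call (hypothetical object name): assuming self.sentences already holds the
# random-walk corpus, this trains a 128-dimensional model and caches it on the
# instance. The keyword names (size/iter) follow the gensim 3.x API used above.
# model = walker.train(embed_size=128, w_size=10, workers=4, iter_num=5)
# vector = model.wv["42"]  # embedding for node "42"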
def process(args):
    if args.format == "adjlist":
        G = graph.load_adjacencylist(args.input, undirected=args.undirected)
    elif args.format == "edgelist":
        G = graph.load_edgelist(args.input, undirected=args.undirected)
    elif args.format == "mat":
        G = graph.load_matfile(args.input, variable_name=args.matfile_variable_name, undirected=args.undirected)
    else:
        raise Exception("Unknown file format: '%s'. Valid formats: 'adjlist', 'edgelist', 'mat'" % args.format)

    print("Number of nodes: {}".format(len(G.nodes())))

    num_walks = len(G.nodes()) * args.number_walks
    print("Number of walks: {}".format(num_walks))

    data_size = num_walks * args.walk_length
    print("Data size (walks*length): {}".format(data_size))

    if data_size < args.max_memory_data_size:
        print("Walking...")
        walks = graph.build_deepwalk_corpus_1(G, num_paths=args.number_walks,
                                              path_length=args.walk_length, alpha=0,
                                              rand=random.Random(args.seed))
        print("Training...")
        model = Word2Vec(walks, size=args.representation_size, window=args.window_size,
                         min_count=0, sg=1, hs=1, workers=args.workers)
    else:
        print("Data size {} is larger than limit (max-memory-data-size: {}). Dumping walks to disk.".format(data_size, args.max_memory_data_size))
        print("Walking...")
        walks_filebase = args.output + ".walks"
        walk_files = serialized_walks.write_walks_to_disk(G, walks_filebase, num_paths=args.number_walks,
                                                          path_length=args.walk_length, alpha=0,
                                                          rand=random.Random(args.seed),
                                                          num_workers=args.workers)
        print("Counting vertex frequency...")
        if not args.vertex_freq_degree:
            vertex_counts = serialized_walks.count_textfiles(walk_files, args.workers)
        else:
            # use degree distribution for frequency in tree
            vertex_counts = G.degree(nodes=G.iterkeys())

        print("Training...")
        walks_corpus = serialized_walks.WalksCorpus(walk_files)
        model = Skipgram(sentences=walks_corpus, vocabulary_counts=vertex_counts,
                         size=args.representation_size, window=args.window_size,
                         min_count=0, trim_rule=None, workers=args.workers)

    # export the embedding matrix and the matching node ids as plain-text files
    nodes = [word for word, vcab in model.wv.vocab.iteritems()]
    inds = [vcab.index for word, vcab in model.wv.vocab.iteritems()]
    X = model.wv.syn0[inds]
    nodes = [int(i) for i in nodes]
    np.savetxt('D:/deepwalk/' + args.output + '/embs_' + str(len(X[0])) + '.txt', X, fmt='%f')
    np.savetxt('D:/deepwalk/' + args.output + '/nodes_' + str(len(X[0])) + '.txt', np.array(nodes), fmt='%d')
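# Reload sketch (assumption): the two files written above are aligned row-for-row, so
# embeddings can be reattached to node ids without retraining. The paths are
# placeholders matching the hard-coded D:/deepwalk layout above.
# X = np.loadtxt('D:/deepwalk/<output>/embs_64.txt')
# nodes = np.loadtxt('D:/deepwalk/<output>/nodes_64.txt', dtype=int)
# node2vec = dict(zip(nodes, X))  # node id -> embedding row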
def process(args):
    #if args.format == "adjlist":
    #    G = graph.load_adjacencylist(args.input, undirected=args.undirected)
    #elif args.format == "edgelist":
    #    G = graph.load_edgelist(args.input, undirected=args.undirected)
    #elif args.format == "mat":
    #    G = graph.load_matfile(args.input, variable_name=args.matfile_variable_name, undirected=args.undirected)
    if args.format == "w_edgelist":
        G = graph.load_weighted_edgelist(args.input, undirected=args.undirected)
    else:
        raise Exception("Unknown file format: '%s'. This version supports only 'w_edgelist'" % args.format)

    print("Number of nodes: {}".format(len(G.nodes())))
    num_walks = len(G.nodes()) * args.number_walks
    print("Number of walks: {}".format(num_walks))
    data_size = num_walks * args.walk_length
    print("Data size (walks*length): {}".format(data_size))

    if True:
        print("Initializing...")
        vertex_counts = G.degree(nodes=G.iterkeys())
        #model = Word2Vec(None, size=args.representation_size, window=args.window_size, min_count=0, workers=args.workers)
        model = Skipgram(sentences=None, vocabulary_counts=vertex_counts,
                         size=args.representation_size, window=args.window_size,
                         min_count=0, workers=args.workers, sg=args.sg)

        print("Walking & Training...")
        sys.stderr.write("\rprogress: 0.00 %% [0/%d]" % (args.number_walks + 1))
        for i in xrange(args.number_walks):
            sys.stderr.write("\rprogress: %.2f %% [%d/%d] (walk step) "
                             % (i * 100. / (args.number_walks + 1), i + 1, args.number_walks + 1))
            sys.stderr.flush()
            walks = graph.build_deepwalk_corpus(G, num_paths=args.number_walks,
                                                path_length=args.walk_length, alpha=0.,
                                                rand=random.Random(args.seed),
                                                workers=args.workers)
            sys.stderr.write("\rprogress: %.2f %% [%d/%d] (train step) "
                             % ((i + .5) * 100. / (args.number_walks + 1), i + 1, args.number_walks + 1))
            sys.stderr.flush()
            #model.build_vocab(walks)
            model.train(walks)
        sys.stderr.write("\rprogress: 100.00 %%\n")
        sys.stderr.flush()
    else:
        print("Data size {} is larger than limit (max-memory-data-size: {}). Dumping walks to disk.".format(data_size, args.max_memory_data_size))
        print("Walking...")
        walks_filebase = args.output + ".walks"
        walk_files = serialized_walks.write_walks_to_disk(G, walks_filebase, num_paths=args.number_walks,
                                                          path_length=args.walk_length, alpha=0.1,
                                                          rand=random.Random(args.seed),
                                                          num_workers=args.workers)
        print("Counting vertex frequency...")
        if not args.vertex_freq_degree:
            vertex_counts = serialized_walks.count_textfiles(walk_files, args.workers)
        else:
            # use degree distribution for frequency in tree
            vertex_counts = G.degree(nodes=G.iterkeys())
        print("Training...")
        model = Skipgram(sentences=serialized_walks.combine_files_iter(walk_files),
                         vocabulary_counts=vertex_counts,
                         size=args.representation_size,
                         window=args.window_size, min_count=0, workers=args.workers)

    model.save_word2vec_format(args.output)
# main.py
#
# author: sean lee
# email: [email protected]
#
#--------------------------------------------#

import argparse

parser = argparse.ArgumentParser(description='main.py')
parser.add_argument('-train', action='store_true', default=False, help='train model')
parser.add_argument('-test', action='store_true', default=False, help='test model')
args = parser.parse_args()

from dataset import Corpus, load_data
from skipgram import Skipgram

if __name__ == '__main__':
    data = list(load_data())
    corpus = Corpus(data)

    skipgram = Skipgram(corpus)
    if args.train:
        skipgram.train()
    elif args.test:
        word = input('Input word> ')
        print(skipgram.test(word))
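# Typical invocations, inferred from the flags defined above:
#   python main.py -train   # build the corpus and train the skip-gram model
#   python main.py -test    # prompt for a word and print the model's output for it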
def process(args):
    if args.format == "adjlist":
        G = graph.load_adjacencylist(args.input, undirected=args.undirected)
    elif args.format == "edgelist":
        G = graph.load_edgelist(args.input, undirected=args.undirected,
                                attr_file_name=args.sensitive_attr_file,
                                test_links_ratio=args.test_links,
                                test_links_file=args.test_links_file,
                                train_links_file=args.train_links_file)
    elif args.format == "mat":
        G = graph.load_matfile(args.input, variable_name=args.matfile_variable_name, undirected=args.undirected)
    else:
        raise Exception("Unknown file format: '%s'. Valid formats: 'adjlist', 'edgelist', 'mat'" % args.format)

    if args.heuristic_wrb_for_wbr is not None:
        wrb, err = graph.compute_heuristic_wrb(G, float(args.heuristic_wrb_for_wbr))
        print(wrb, err)
        return

    if (args.weighted is not None) and (args.weighted != 'unweighted'):
        G = graph.set_weights(G, args.weighted)

    if args.just_write_graph:
        with open('wgraph.out', 'w') as fout:
            if args.weighted == 'unweighted':
                for v in G:
                    s = len(G[v])
                    for u in G[v]:
                        fout.write(str(v) + ' ' + str(u) + ' ' + str(1 / s) + '\n')
            elif args.weighted.startswith('random_walk'):
                for v in G:
                    for u, w in zip(G[v], G.edge_weights[v]):
                        fout.write(str(v) + ' ' + str(u) + ' ' + str(w) + '\n')
            else:
                raise Exception('just-write-graph is not supported for this weighting method')
        return None

    num_walks = len(G.nodes()) * args.number_walks
    print("Number of walks: {}".format(num_walks))

    data_size = num_walks * args.walk_length
    print("Data size (walks*length): {}".format(data_size))

    if data_size < args.max_memory_data_size:
        print("Walking...")
        walks = graph.build_deepwalk_corpus(G, num_paths=args.number_walks,
                                            path_length=args.walk_length,
                                            p_modified=args.pmodified,
                                            alpha=0, rand=random.Random(args.seed))
        print("Training...")
        model = Word2Vec(walks, size=args.representation_size, window=args.window_size,
                         min_count=0, sg=1, hs=1, workers=args.workers)
    else:
        print("Data size {} is larger than limit (max-memory-data-size: {}). Dumping walks to disk.".format(data_size, args.max_memory_data_size))
        print("Walking...")
        walks_filebase = args.output + ".walks"
        walk_files = serialized_walks.write_walks_to_disk(G, walks_filebase, num_paths=args.number_walks,
                                                          path_length=args.walk_length,
                                                          p_modified=args.pmodified,
                                                          alpha=0, rand=random.Random(args.seed),
                                                          num_workers=args.workers)
        print("Counting vertex frequency...")
        if not args.vertex_freq_degree:
            vertex_counts = serialized_walks.count_textfiles(walk_files, args.workers)
        else:
            # use degree distribution for frequency in tree
            vertex_counts = G.degree(nodes=G.iterkeys())

        print("Training...")
        walks_corpus = serialized_walks.WalksCorpus(walk_files)
        model = Skipgram(sentences=walks_corpus, vocabulary_counts=vertex_counts,
                         size=args.representation_size, window=args.window_size,
                         min_count=0, trim_rule=None, workers=args.workers)

    model.wv.save_word2vec_format(args.output)
def process(edges_list, undirected=True, number_walks=10, walk_length=40,
            window_size=5, workers=1, dimensions=64,
            max_memory_data_size=1000000000, seed=0, vertex_freq_degree=False):
    G = graph.load_edgelist(edges_list, undirected=undirected)
    #print("Number of nodes: {}".format(len(G.nodes())))

    num_walks = len(G.nodes()) * number_walks
    #print("Number of walks: {}".format(num_walks))

    data_size = num_walks * walk_length
    #print("Data size (walks*length): {}".format(data_size))

    if data_size < max_memory_data_size:
        #print("Walking...")
        walks = graph.build_deepwalk_corpus(G, num_paths=number_walks,
                                            path_length=walk_length, alpha=0,
                                            rand=random.Random(seed))
        #print("Training...")
        model = Word2Vec(walks, size=dimensions, window=window_size, min_count=0, workers=workers)
    else:
        #print("Data size {} is larger than limit (max-memory-data-size: {}). Dumping walks to disk.".format(data_size, max_memory_data_size))
        #print("Walking...")
        walks_filebase = "karate.embeddings" + ".walks"
        walk_files = serialized_walks.write_walks_to_disk(G, walks_filebase, num_paths=number_walks,
                                                          path_length=walk_length, alpha=0,
                                                          rand=random.Random(seed),
                                                          num_workers=workers)
        #print("Counting vertex frequency...")
        if not vertex_freq_degree:
            vertex_counts = serialized_walks.count_textfiles(walk_files, workers)
        else:
            # use degree distribution for frequency in tree
            vertex_counts = G.degree(nodes=G.iterkeys())

        #print("Training...")
        model = Skipgram(sentences=serialized_walks.combine_files_iter(walk_files),
                         vocabulary_counts=vertex_counts, size=dimensions,
                         window=window_size, min_count=0, workers=workers)

    #model.save_word2vec_format("karate.embeddings")
    return model
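# Usage sketch (assumption): unlike the argparse-driven variants above, this
# process() takes the edge-list path directly and returns the trained model, so it
# can be called in-process:
# model = process("karate.edgelist", number_walks=10, walk_length=40, dimensions=64)
# print(model.wv.most_similar("1"))  # nodes embedded closest to node "1" (gensim 3.x)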
def deepwalk_get_feature(args, adj_indices, result_path):
    model_path = result_path + '.model'
    if os.path.exists(model_path):
        return Word2Vec.load(model_path)

    G = graph.load_edgelist(adj_indices, undirected=args.undirected)
    print(G)
    if len(G) < 10:
        print('Too few nodes for random walks')
        return []

    print("Number of nodes: {}".format(len(G.nodes())))
    num_walks = len(G.nodes()) * args.number_walks
    print("Number of walks: {}".format(num_walks))
    data_size = num_walks * args.walk_length
    print("Data size (walks*length): {}".format(data_size))

    if data_size < args.max_memory_data_size:
        print("Walking...")
        walks = graph.build_deepwalk_corpus(G, num_paths=args.number_walks,
                                            path_length=args.walk_length, alpha=0,
                                            rand=random.Random(args.seed))
        print("Training...")
        model = Word2Vec(walks, size=args.representation_size, window=args.window_size,
                         min_count=0, sg=1, hs=1, workers=args.workers)
    else:
        print("Data size {} is larger than limit (max-memory-data-size: {}). Dumping walks to disk."
              .format(data_size, args.max_memory_data_size))
        print("Walking...")
        walks_filebase = args.dataset + ".walks"
        walk_files = serialized_walks.write_walks_to_disk(G, walks_filebase, num_paths=args.number_walks,
                                                          path_length=args.walk_length, alpha=0,
                                                          rand=random.Random(args.seed),
                                                          num_workers=args.workers)
        print("Counting vertex frequency...")
        if not args.vertex_freq_degree:
            vertex_counts = serialized_walks.count_textfiles(walk_files, args.workers)
        else:
            # use degree distribution for frequency in tree
            vertex_counts = G.degree(nodes=G.iterkeys())

        print("Training...")
        walks_corpus = serialized_walks.WalksCorpus(walk_files)
        model = Skipgram(sentences=walks_corpus, vocabulary_counts=vertex_counts,
                         size=args.representation_size, window=args.window_size,
                         min_count=0, trim_rule=None, workers=args.workers)

    model.wv.save_word2vec_format(result_path + '.feature')
    model.save(model_path)
    return model
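# Downstream sketch (assumption): the '.feature' file written above is in word2vec
# text format, so it can be reloaded later without the full model:
# from gensim.models import KeyedVectors
# wv = KeyedVectors.load_word2vec_format(result_path + '.feature')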
def process(args):
    # Build "(Node, Layer)" map
    if args.floor != "":
        floorFile = open(args.floor, 'r')
        for line in floorFile:
            nd, layer = line.strip().split()[:2]
            nd = int(nd)
            layer = int(layer)
            #print nd, layer
            if nd not in graph.Graph.nodePos:
                graph.Graph.nodeList.append(graph.NodeType(nd, layer))
                graph.Graph.nodePos[nd] = len(graph.Graph.nodeList) - 1

    # read input Graph
    if args.format == "adjlist":
        G = graph.load_adjacencylist(args.input, undirected=args.undirected)
    elif args.format == "edgelist":
        G = graph.load_edgelist(args.input, undirected=args.undirected)
    elif args.format == "mat":
        G = graph.load_matfile(args.input, variable_name=args.matfile_variable_name, undirected=args.undirected)
    else:
        raise Exception("Unknown file format: '%s'. Valid formats: 'adjlist', 'edgelist', 'mat'" % args.format)

    timelog = ""

    print("Number of nodes: {}".format(len(G.nodes())))
    num_walks = len(G.nodes()) * args.number_walks
    print("Number of walks: {}".format(num_walks))
    data_size = num_walks * args.walk_length
    print("Data size (walks*length): {}".format(data_size))

    # Centrality calculation >> store in File
    '''
    centrality = nxGraph(args.input)
    print centrality
    fo = open("closeness.txt", "wb")
    for k in centrality.keys():
        fo.write("{} {}\n".format(k, centrality[k]))
    fo.close()
    '''
    #exit()

    lsfile = open(args.LSfile, 'r')
    calculateBC(lsfile)
    #exit()

    # building (Unit)Metapath Table
    MPList = []
    graph.Graph.mpath = []
    if args.metapath != "":
        mpfile = open(args.metapath, 'r')
        for line in mpfile:
            MPList.append(int(line.strip().split()[0]))
        print("(Unit)Metapath: {}".format(MPList))
        while len(graph.Graph.mpath) < args.walk_length:
            graph.Graph.mpath.extend(MPList)
        args.walk_length = len(graph.Graph.mpath)
        print("(Full)Metapath: {}\nargs.walk_length: {}".format(graph.Graph.mpath, args.walk_length))

    tStart = time.time()
    if data_size < args.max_memory_data_size:
        print("Walking...")
        walks = graph.build_deepwalk_corpus(G, num_paths=args.number_walks,
                                            path_length=args.walk_length, alpha=0,
                                            rand=random.Random())
        tEnd = time.time()
        print("Walking takes {} seconds".format(round(tEnd - tStart, 3)))
        timelog = "{}, {}".format(timelog, round(tEnd - tStart, 3))
        print("Number of walks generated: {}".format(len(walks)))

        tStart = time.time()
        print("Training...")
        model = Word2Vec(walks, size=args.representation_size, window=args.window_size,
                         min_count=0, workers=args.workers)
        tEnd = time.time()
        print("Training takes {} seconds".format(round(tEnd - tStart, 3)))
        timelog = "{}, {}, ,{}".format(timelog, round(tEnd - tStart, 3), len(walks))
    else:
        print("Data size {} is larger than limit (max-memory-data-size: {}). Dumping walks to disk.".format(data_size, args.max_memory_data_size))
        print("Walking...")
        walks_filebase = args.output + ".walks"
        walk_files = serialized_walks.write_walks_to_disk(G, walks_filebase, num_paths=args.number_walks,
                                                          path_length=args.walk_length, alpha=0,
                                                          rand=random.Random(args.seed),
                                                          num_workers=args.workers)
        print("Counting vertex frequency...")
        if not args.vertex_freq_degree:
            vertex_counts = serialized_walks.count_textfiles(walk_files, args.workers)
        else:
            # use degree distribution for frequency in tree
            vertex_counts = G.degree(nodes=G.iterkeys())

        print("Training...")
        model = Skipgram(sentences=serialized_walks.combine_files_iter(walk_files),
                         vocabulary_counts=vertex_counts,
                         size=args.representation_size,
                         window=args.window_size, min_count=0, workers=args.workers)

    model.save_word2vec_format(args.output)

    with open(args.output, 'r') as f:
        timelog = "{}, {}\n".format(timelog, f.readline().split()[0])
    with open(args.timelog, 'ab') as tl:
        tl.write(timelog)
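# Input sketch (assumption): the metapath file read above holds one layer id per
# line, e.g. a 3-step unit metapath
#   0
#   1
#   2
# which the while-loop tiles until graph.Graph.mpath reaches args.walk_length.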
def getEmbeddings(self, relationships):
    G = graph.load_py4jclient(relationships)

    print("Number of nodes: {}".format(len(G.nodes())))
    num_walks = len(G.nodes()) * self.args.number_walks
    print("Number of walks: {}".format(num_walks))
    data_size = num_walks * self.args.walk_length
    print("Data size (walks*length): {}".format(data_size))

    if data_size < self.args.max_memory_data_size:
        print("Walking...")
        walks = graph.build_deepwalk_corpus(G, num_paths=self.args.number_walks,
                                            path_length=self.args.walk_length,
                                            alpha=0, rand=random.Random(self.args.seed))
        print("Training...")
        model = Word2Vec(walks, size=self.args.representation_size,
                         window=self.args.window_size, min_count=0, sg=1, hs=1,
                         workers=self.args.workers)
    else:
        print("Data size {} is larger than limit (max-memory-data-size: {}). Dumping walks to disk."
              .format(data_size, self.args.max_memory_data_size))
        print("Walking...")
        walks_filebase = self.args.output + ".walks"
        walk_files = serialized_walks.write_walks_to_disk(G, walks_filebase,
                                                          num_paths=self.args.number_walks,
                                                          path_length=self.args.walk_length,
                                                          alpha=0, rand=random.Random(self.args.seed),
                                                          num_workers=self.args.workers)
        print("Counting vertex frequency...")
        if not self.args.vertex_freq_degree:
            vertex_counts = serialized_walks.count_textfiles(walk_files, self.args.workers)
        else:
            # use degree distribution for frequency in tree
            vertex_counts = G.degree(nodes=G.iterkeys())

        print("Training...")
        walks_corpus = serialized_walks.WalksCorpus(walk_files)
        model = Skipgram(sentences=walks_corpus, vocabulary_counts=vertex_counts,
                         size=self.args.representation_size,
                         window=self.args.window_size, min_count=0,
                         trim_rule=None, workers=self.args.workers)

    # serialize the vocabulary and vectors as "<node>\t<vector>" lines for the py4j caller
    # to_return = {}
    # for word, vec in zip(model.wv.vocab, model.wv.vectors):
    #     to_return[word] = " ".join([str(x) for x in vec])
    to_return = ""
    for word, vec in zip(model.wv.vocab, model.wv.vectors):
        vector_str = " ".join([str(x) for x in vec])
        to_return = to_return + word + "\t" + vector_str + "\n"
    print(to_return)

    return to_return
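# Consumer-side sketch (assumption): the tab-separated string returned above can be
# parsed back into a node -> vector mapping like so:
# embeddings = {}
# for line in to_return.strip().split("\n"):
#     node, vector_str = line.split("\t")
#     embeddings[node] = [float(x) for x in vector_str.split()]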
corpus = PVDBOWCorpus(corpus_dir, extension=".wld2", max_files=0, min_count=0)
dataloader = DataLoader(corpus, batch_size=256, shuffle=False, num_workers=0,
                        collate_fn=corpus.collate)

output_file = "Embeddings.testfile"  # A needed parameter
output_file_name = output_file
num_targets = corpus.num_graphs
vocab_size = corpus.num_subgraphs
emb_dimension = 100   # A needed parameter
batch_size = 256      # A needed parameter
epochs = 100          # A needed parameter
initial_lr = 0.001    # A needed parameter

skipgram = Skipgram(num_targets, vocab_size, emb_dimension)
if torch.cuda.is_available():
    device = torch.device("cuda")
    skipgram.cuda()
else:
    device = torch.device("cpu")

for epoch in range(epochs):
    print("### Epoch: " + str(epoch))
    optimizer = optim.SparseAdam(skipgram.parameters(), lr=initial_lr)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, len(dataloader))

    running_loss = 0.0
    for i, sample_batched in enumerate(tqdm(dataloader)):
        # Assumed loop body (the snippet breaks off here): the standard PV-DBOW
        # negative-sampling step over a batch of (target graph, positive subgraph,
        # negative subgraphs) triples produced by corpus.collate.
        if len(sample_batched[0]) > 1:
            pos_target = sample_batched[0].to(device)
            pos_context = sample_batched[1].to(device)
            neg_context = sample_batched[2].to(device)

            optimizer.zero_grad()
            loss = skipgram(pos_target, pos_context, neg_context)
            loss.backward()
            optimizer.step()
            scheduler.step()

            running_loss = running_loss * 0.9 + loss.item() * 0.1
    print(" Loss: " + str(running_loss))
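# Export sketch (an assumption about the surrounding trainer): in PV-DBOW the
# target-side embedding table holds one row per graph, which is what would be
# written to output_file. skipgram.target_embeddings and id2graph are hypothetical
# names for the embedding layer and the id-to-graph-name map.
def save_graph_embeddings(skipgram, id2graph, path, dim):
    import numpy as np  # local import keeps the sketch self-contained
    matrix = skipgram.target_embeddings.weight.detach().cpu().numpy()
    with open(path, "w") as f:
        f.write("%d %d\n" % (len(id2graph), dim))  # word2vec-style text header
        for gid, name in id2graph.items():
            f.write("%s %s\n" % (name, " ".join(str(x) for x in matrix[gid])))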