def __init__(self):
    self.db = MySQLdb.connect(host="127.0.0.1", user="******", passwd="wmmkscsie",
                              db="recommender_system", charset="utf8")
    self.cursor = self.db.cursor()
    # sql = "SELECT a.relationship_type, a.scenario_type, b.id, b.scenario_e2v_bert FROM movies as a, movies_vector as b Where a.id=b.id and a.id >= 1 and a.id <= 1171 and b.scenario_e2v_bert !=''"
    sql = "SELECT a.relationship_type, a.scenario_type, b.id, b.scenario_e2v_w2v_sg FROM movies as a, movies_vector as b Where a.id=b.id and a.id >= 1 and a.id <= 1171 and b.scenario_e2v_w2v_sg !=''"
    print(sql)
    self.cursor.execute(sql)
    self.movies_information = self.cursor.fetchall()

    # Relationship Model
    #######################
    # self.model = CNN_E2V_BERT()
    # # For Produce Vector
    # self.bert_embedding = BertEmbedding(model='bert_12_768_12', dataset_name='wiki_cn', max_seq_length=50)
    # self.relationship_e2v_bert = []
    # self.scenario_e2v_bert = []
    #######################
    self.model = CNN_E2V_W2V_SG()
    # Create a word2vec object
    self.t = Word2Vec()
    self.t.train_file_setting("segmentation.txt", "e2v_w2v_sg")
    self.t.load_model()
    self.dimension = self.t.size
    self.relationship_e2v_w2v_sg = []
    self.scenario_e2v_w2v_sg = []
def main():
    tagged_words = brown.tagged_words()
    words_corpus = brown.words()

    word2vec = Word2Vec()
    word2vec.train(words_corpus)
    word_vecs = [word2vec.word2vec(word) for word in words_corpus]

    n_clusters = 10  # random number for now
    kmeans = KMeans(n_clusters)
    kmeans.compute(word_vecs)

    # word-cluster HMM
    p_word = {}
    p_cluster = {}
    p_cluster_given_word = None  # softmax
    p_word_given_cluster = None  # joint probability formula
    p_transition_cluster = None  # count
    p_initial_cluster = None  # count

    # cluster-tag HMM
    p_cluster_given_tag = None  # softmax
    p_transition_tag = None  # count from tagged data
    p_initial_tag = None  # count from tagged data

    hmm_word_cluster = HMM(p_initial_cluster, p_transition_cluster, p_word_given_cluster)
    hmm_cluster_tag = HMM(p_initial_tag, p_transition_tag, p_cluster_given_tag)

    words = []
    clusters = hmm_word_cluster.viterbi(words)
    tags = hmm_cluster_tag.viterbi(clusters)
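# The probability tables above are left as None placeholders. A minimal sketch of the
# Bayes inversion hinted at by the "joint probability formula" comment, assuming
# p_cluster_given_word, p_word and p_cluster are plain dicts (names hypothetical):
def invert_emission(p_cluster_given_word, p_word, p_cluster):
    """Return p(word | cluster) from p(cluster | word), p(word) and p(cluster)."""
    p_word_given_cluster = {}
    for w, row in p_cluster_given_word.items():
        for c, p in row.items():
            # Bayes rule: p(w | c) = p(c | w) * p(w) / p(c)
            p_word_given_cluster.setdefault(c, {})[w] = p * p_word[w] / p_cluster[c]
    return p_word_given_cluster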
def test_cbow(self):
    cbow = Word2Vec(learning_rate=self.learning_rate)
    W1_m, W2_m, loss_m = cbow.cbow(np.asmatrix(self.context_words), np.asmatrix(self.center_word),
                                   self.W1, self.W2, 0.)

    with tf.name_scope("cbow"):
        x = tf.placeholder(shape=[self.V, len(self.context_words)], dtype=tf.float32, name="x")
        W1_tf = tf.Variable(self.W1, dtype=tf.float32)
        W2_tf = tf.Variable(self.W2, dtype=tf.float32)
        hh = [tf.matmul(tf.transpose(W1_tf), tf.reshape(x[:, i], [self.V, 1]))
              for i in range(len(self.context_words))]
        h = tf.reduce_mean(tf.stack(hh), axis=0)
        u = tf.matmul(tf.transpose(W2_tf), h)
        loss_tf = -u[int(np.where(self.center_word == 1)[0])] + tf.log(tf.reduce_sum(tf.exp(u), axis=0))
        grad_W1, grad_W2 = tf.gradients(loss_tf, [W1_tf, W2_tf])
        init = tf.global_variables_initializer()

    with tf.Session() as sess:
        sess.run(init)
        W1_tf, W2_tf, loss_tf, dW1_tf, dW2_tf = sess.run(
            [W1_tf, W2_tf, loss_tf, grad_W1, grad_W2],
            feed_dict={x: self.context_words.T})
        W1_tf -= self.learning_rate * dW1_tf
        W2_tf -= self.learning_rate * dW2_tf

    for i in range(self.V):
        for j in range(self.N):
            self.assertAlmostEqual(W1_m[i, j], W1_tf[i, j], places=5)
    for i in range(self.N):
        for j in range(self.V):
            self.assertAlmostEqual(W2_m[i, j], W2_tf[i, j], places=5)
    self.assertAlmostEqual(loss_m, float(loss_tf), places=5)
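# For reference, loss_tf above is the standard softmax cross-entropy of the score vector u
# against the one-hot center word: -u[c] + log(sum_j exp(u_j)). A minimal NumPy sketch of
# the same quantity (function name is hypothetical, not part of the tested API):
import numpy as np

def softmax_xent(u, center_index):
    """-log softmax(u)[center_index], written in the numerically naive form used above."""
    u = np.asarray(u, dtype=float).ravel()
    return -u[center_index] + np.log(np.sum(np.exp(u)))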
def test_skipgram(self):
    skipgram = Word2Vec(learning_rate=self.learning_rate)
    W1_m, W2_m, loss_m = skipgram.skipgram(np.asmatrix(self.context_words), np.asmatrix(self.center_word),
                                           self.W1, self.W2, 0.)

    with tf.name_scope("skipgram"):
        x = tf.placeholder(shape=[self.V, 1], dtype=tf.float32, name="x")
        W1_tf = tf.Variable(self.W1, dtype=tf.float32)
        W2_tf = tf.Variable(self.W2, dtype=tf.float32)
        h = tf.matmul(tf.transpose(W1_tf), x)
        u = tf.stack([tf.matmul(tf.transpose(W2_tf), h) for i in range(len(self.context_words))])
        loss_tf = -tf.reduce_sum([u[i][int(np.where(c == 1)[0])]
                                  for i, c in zip(range(len(self.context_words)), self.context_words)],
                                 axis=0) \
                  + tf.reduce_sum(tf.log(tf.reduce_sum(tf.exp(u), axis=1)), axis=0)
        grad_W1, grad_W2 = tf.gradients(loss_tf, [W1_tf, W2_tf])
        init = tf.global_variables_initializer()

    with tf.Session() as sess:
        sess.run(init)
        W1_tf, W2_tf, loss_tf, dW1_tf, dW2_tf = sess.run(
            [W1_tf, W2_tf, loss_tf, grad_W1, grad_W2],
            feed_dict={x: self.center_word.reshape(self.V, 1)})
        W1_tf -= self.learning_rate * dW1_tf
        W2_tf -= self.learning_rate * dW2_tf

    for i in range(self.V):
        for j in range(self.N):
            self.assertAlmostEqual(W1_m[i, j], W1_tf[i, j], places=5)
    for i in range(self.N):
        for j in range(self.V):
            self.assertAlmostEqual(W2_m[i, j], W2_tf[i, j], places=5)
    self.assertAlmostEqual(loss_m, float(loss_tf), places=5)
def main(input, output, iter=5, size=128, worker=4, batch_nodes=10000, negative=5,
         sample=1e-4, output_format="gensim"):
    # load karate graph in csr matrix
    RWG = RandomWalksGeneratorCSR(path=input)

    # init model
    skipgram = Word2Vec(sg=1, iter=iter, min_count=0, size=size, workers=worker,
                        batch_words=batch_nodes, sample=sample, negative=negative)

    # build vocab
    skipgram.build_vocab(RWG)

    # learn embedding
    skipgram.train(RWG)

    if output_format == "gensim":
        skipgram.save(output)
    elif output_format == "txt":
        skipgram.save_word2vec_format(output)
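# A hypothetical invocation of the function above (the walk file and output paths are
# made-up names, assuming the CSR random-walk file has already been generated):
if __name__ == "__main__":
    main("karate.walks.csr", "karate.embeddings", iter=5, size=128, output_format="txt")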
def load_embeddings(self):
    # check if embeddings are saved in cache
    if os.path.exists(R.EMB.format(self.dim)):
        # read from cache and return
        return pickle.load(open(R.EMB.format(self.dim), 'rb'))

    # read model from word2vec
    model = Word2Vec(self.dim).get_model()
    embeddings = []
    for w in self._vocab:
        # if word in model
        if w in model:
            emb = model[w]
        # else check if lower-case of w is in model
        elif w.lower() in model:
            emb = model[w.lower()]
        # otherwise fall back to a zero vector
        else:
            emb = np.zeros(self.dim)
        # keep track of embedding
        embeddings.append(emb)

    # stack into an np.array
    embeddings = np.stack(embeddings)
    # attach to self
    self.emb = embeddings
    # write to cache
    pickle.dump(self.emb, open(R.EMB.format(self.dim), 'wb'))
    # make sure vocab size == number of embeddings
    assert self.vocab_size() == self.emb.shape[0]
    return self.emb
def process(args):
    if args.format == "adjlist":
        G = graph.load_adjacencylist(args.input, undirected=args.undirected)
    elif args.format == "edgelist":
        G = graph.load_edgelist(args.input, undirected=args.undirected)
    else:
        raise Exception("unknown file format: '%s'. valid formats: 'adjlist', 'edgelist'" % args.format)

    print("number of nodes: {}".format(len(G.nodes())))

    num_walks = len(G.nodes()) * args.number_walks  # several walks start from every node
    print("number of walks: {}".format(num_walks))

    data_size = num_walks * args.walk_length
    print("data size (walks * length): {}".format(data_size))

    print("walking...")
    walk_file = walks.write_walks_to_disk(G, args.output, num_paths=args.number_walks,
                                          path_length=args.walk_length, alpha=0,
                                          rand=random.Random(args.seed))

    model = Word2Vec(walk_file, args.output, emb_dimension=args.representation_size,
                     window_size=args.window_size, min_count=0)

    print("Training...")
    model.skip_gram_train()
def trainWord2Vec(doc_list=None, buildvoc=1, passes=10, sg=1, size=100, dm_mean=0,
                  window=5, hs=0, negative=5, min_count=1, workers=1):
    model = Word2Vec(size=size, sg=sg, window=window, hs=hs, negative=negative,
                     min_count=min_count, workers=workers, compute_loss=True)

    if buildvoc == 1:
        print('Building Vocabulary')
        model.build_vocab(doc_list)  # build vocabulary with words + nodeID

    for epoch in range(passes):
        print('Iteration %d ....' % epoch)
        # shuffle(doc_list)  # shuffling gets best results
        model.train(doc_list, total_examples=len(doc_list), epochs=1)
        print(model.running_training_loss)

    print(model.sg, model.window, model.hs, model.min_count)
    print('batch words', model.batch_words)
    return model
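# Hypothetical usage with a toy corpus (a list of token lists, as gensim expects);
# this assumes the same pre-4.0 gensim Word2Vec API used above, where size= is valid:
if __name__ == '__main__':
    docs = [["alice", "likes", "movies"], ["bob", "likes", "music"], ["alice", "knows", "bob"]]
    toy_model = trainWord2Vec(doc_list=docs, passes=2, size=50)
    print(toy_model.wv.most_similar("alice", topn=2))  # nearest neighbours in the toy space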
def test_word2vec():
    data = [
        'Merge multiple sorted inputs into a single sorted output',
        'The API below differs from textbook heap algorithms in two aspects'
    ]
    wv = Word2Vec(vec_len=50)
    wv.train(data, model='cbow')
    print(wv['into'])
def key_words(self, string, top_number=10):
    # use the pretrained '5.9m' model (trained on 5.9 million documents)
    model_name = '5.9m'
    model_download = W2VModelDownload(bq_project)
    model_download.download_w2v_model('patent_landscapes', model_name)
    word2vec5_9m = Word2Vec('5.9m')
    w2v_runtime = word2vec5_9m.restore_runtime()
    return w2v_runtime.find_similar(string, top_number)
def __init__(self, sentences, model_file=None, size=200, alpha=0.025, window=5, min_count=5,
             sample=0, seed=1, workers=16, min_alpha=0.0001, model="cb", hs=1, negative=0,
             cbow_mean=0, iteration=1, word_learn=1, init_adjust=True, update_mode=0,
             normalize_each_epoch=False):
    self.sg = 1 if model == "sg" or model == "dbow" else 0
    self.table = None  # for negative sampling --> this needs a lot of RAM! consider setting back to None before saving
    self.alpha = float(alpha)
    self.window = int(window)
    self.seed = seed
    self.sample = sample
    self.workers = workers
    self.min_alpha = min_alpha
    self.hs = hs
    self.negative = negative
    self.cbow_mean = int(cbow_mean)
    self.iteration = iteration
    self.word_learn = int(word_learn)
    self.layer1_size = size
    self.min_count = min_count
    self.sent_no_hash = {}  # mapping sent_id to index of self.sents
    self.sent_id_list = []  # mapping sent_no to sent_id
    self.sane_vec_len = 100000  # for sanity check
    self.sane_max_sim10 = 0.9  # for sanity check
    self.init_adjust = init_adjust  # for adjustment of initialization
    self.update_mode = update_mode  # 0: SGD, 1: AdaGrad, 2: AdaDelta (3: ADAM not implemented)
    self.normalize_each_epoch = normalize_each_epoch

    if sentences:
        if model_file:
            self.w2v = Word2Vec.load(model_file)
            self.vocab = self.w2v.vocab
            self.layer1_size = self.w2v.layer1_size
            self.build_vec(sentences, has_vocab=True)
        else:
            self.word_learn = 1
            self.w2v = Word2Vec(None, self.layer1_size, self.alpha, self.window, self.min_count,
                                self.sample, self.seed, self.workers, self.min_alpha, self.sg,
                                self.hs, self.negative, self.cbow_mean)
            self.build_vec(sentences, has_vocab=False)
        self.train_iteration(sentences, iteration=iteration)
def initialise_model(data):
    input_file = 'test.txt'
    input_txt = get_all_text(data)
    with open(input_file, 'w') as f:
        f.write(input_txt)

    model = Word2Vec(LineSentence(input_file), size=100, window=5, sg=0, min_count=1, workers=8)
    model.save(input_file + '.model')
    model.save_word2vec_format(input_file + '.vec')
def train_model(window_size, embedding_dim, batch_size_word2vec):
    file_to_save_trained_data = ('../../results/word2vec/ver6/ws-' + str(window_size) +
                                 '-embed-' + str(embedding_dim) +
                                 'batch_size-' + str(batch_size_word2vec) + '.pkl')
    word2vec = Word2Vec(window_size=window_size,
                        epoch_word2vec=epoch_word2vec,
                        embedding_dim=embedding_dim,
                        batch_size_word2vec=batch_size_word2vec,
                        file_to_save_trained_data=file_to_save_trained_data)
    vectors, word2int, int2word = word2vec.train()
def main(args: argparse.Namespace) -> None:
    """Main entrypoint for the script."""
    # Make sure we have at least one file.
    if len(args.filenames) == 0:
        logger.error('At least one text file is required!')
        exit(1)

    set_seed(args.seed)
    tokenizer = Tokenizer(max_tokens=args.max_tokens,
                          min_word_frequency=args.min_word_frequency,
                          sample_threshold=args.sample_threshold)

    start_time = time.time()
    logger.info('Building vocabulary from corpus...')
    tokenizer.build(filenames=args.filenames)
    logger.info('Finished building vocabulary (took {:.2f} seconds)'.format(time.time() - start_time))

    dataset = make_dataset(args.filenames, tokenizer,
                           window_size=args.window_size,
                           batch_size=args.batch_size,
                           epochs=args.epochs)

    model = Word2Vec(tokenizer,
                     hidden_size=args.hidden_size,
                     batch_size=args.batch_size,
                     n_negative_samples=args.n_negative_samples,
                     lambda_power=args.lambda_power,
                     bias=args.bias)

    # Create output directory
    args.output_dir.mkdir(parents=True, exist_ok=True)
    run_name = args.run_name or args.filenames[0].stem
    logdir = args.output_dir / get_next_run_id(args.output_dir, run_name)

    logger.info('Starting training (for {} epochs).'.format(args.epochs))
    model.train(dataset, logdir, args.initial_lr, args.target_lr, args.log_freq, args.save_freq)

    # Save embeddings and vocab
    #
    # The weights of the projection layer are the components of the
    # embedding vectors. The i-th row of the weight matrix is the
    # embedding vector for the word whose encoded index is i.
    proj = model.weights[0].numpy()
    np.save(logdir / 'proj_weights', proj)

    # Save the tokenizer state
    tokenizer.save(logdir / 'tokenizer.json')

    # Save a list of the vocabulary words
    with open(logdir / 'vocab.txt', 'w') as file:
        for word in tokenizer.words:
            file.write(f'{word}\n')
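# A minimal follow-up sketch (paths and function name are hypothetical) showing how the
# artifacts saved above could be reloaded into a word -> vector lookup. It assumes the
# lines of vocab.txt follow the tokenizer's encoded index order, and relies on the fact
# that np.save appends ".npy" to the given filename:
import numpy as np

def load_embedding(logdir):
    proj = np.load(f'{logdir}/proj_weights.npy')  # shape: (vocab_size, hidden_size)
    with open(f'{logdir}/vocab.txt') as fp:
        words = [line.strip() for line in fp]
    return {w: proj[i] for i, w in enumerate(words)}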
def __init__(self, dataset_file=None, cv_folds=10):
    """
    :param dataset_file: path to a labeled dataset file.
    :param cv_folds: int, number of folds for cross validation.
    """
    self.dataset_file = dataset_file
    self.cv_folds = cv_folds

    # read dataset
    dataset = pd.read_csv(self.dataset_file)
    text = dataset['tweet']
    self.Y = dataset['label']

    # -- Option 1: word2vec embeddings -- #
    w2v = Word2Vec()
    self.X = w2v.getVectors(text)
    # -- Option 2: count vectorization -- #
    # self.X = self.features_extraction(text)
    # -- Option 3: TF-IDF -- #
    # self.X = self.tfidfFeatureExtraction(text)

    info('Done loading and vectorizing data.')
    info("--- Sentiment CLASSIFIERS ---")
    info("fitting ... ")
    self.accuracies = {}

    # classifiers to use
    classifiers = [
        # RandomForestClassifier(n_estimators=100),
        # SGDClassifier(),
        LinearSVC(),
        # LinearDiscriminantAnalysis(),
        # LogisticRegression(),
        # GaussianNB(),
        # DecisionTreeClassifier()
    ]

    # run classifiers
    for c in classifiers:
        self.classify(c)

    info('results ...')
    for k, v in self.accuracies.items():
        string = '\tAcc. {:.2f}% F1. {:.2f}% P. {:.2f} R. {:.2f} : {}'
        print(string.format(v[0] * 100, v[1] * 100, v[2] * 100, v[3] * 100, k))
    info("DONE!")
def main():
    contexts = np.fromfile("./data/npcontexts.dat", dtype=int)
    neighbors = np.fromfile("./data/npneighbors.dat", dtype=int)
    skipgram = Word2Vec(contexts, neighbors, 35000, 10, 0.001, 64, "sg.ckpt", batch_size=500)
    skipgram.train(2)
def ExtractSent2Vec(filename):
    model = Word2Vec(LineSentence(filename), size=512, window=5, sg=0, min_count=5, workers=8)
    model.save(filename + '.model')
    model.save_word2vec_format(filename + '-01.vec')

    model = Sent2Vec(LineSentence(filename), model_file=filename + '.model')
    model.save_sent2vec_format(filename + '-02.vec')
def word2vec(rdd):
    sentences = parse_sentences(rdd)
    # drop the ids, keeping only the sentence text (Python 3: no tuple-unpacking lambdas)
    sentences_without_id = sentences.map(lambda id_sent: id_sent[1])
    model = Word2Vec(size=100, hs=0, negative=8)
    dd2v = DistDoc2VecFast(model, learn_hidden=True, num_partitions=15, num_iterations=20)
    dd2v.build_vocab_from_rdd(sentences_without_id)
    print("*** done training words ****")
    print("*** len(model.vocab): %d ****" % len(model.vocab))
    return dd2v, sentences
def __init__(self, data):
    self.data = data
    self.corpus = None
    self.liu = LiuLexicon()
    self.subj = SubjLexicon()
    self.buildTweetCorpus()
    self.word_vec_model = Word2Vec(self.corpus)
    self.glove_vec_model = Glove(100, self.corpus)
    self.clusters = Cluster(100)
    self.initEncoders()
    self.topicVecs = self.word_vec_model.getVectorsForTopics(self.topicenc.classes_)
    self.collectTopUnigrams()
    self.collectTopBigrams()
def main(text):
    params = getattr(parameters, text)
    w2v = Word2Vec(params['file'],
                   window_size=params['window_size'],
                   learning_rate=params['learning_rate'],
                   vocab_size=params['vocab_size'],
                   embedding_size=params['embedding_size'],
                   n_negative=params['n_negative'])
    w2v.fit(n_iter=params['n_iter'], num_proc=params['num_proc'])
    print(w2v.process_time)
    print(w2v.process_time[-1] - w2v.process_time[0])
def main():
    # with open("/Users/johnkarasev/PycharmProjects/TweetGrouper/word2vec/contexts.json") as fp:
    #     contexts = json.load(fp)
    # with open("/Users/johnkarasev/PycharmProjects/TweetGrouper/word2vec/neighbors.json") as fp:
    #     neighbors = json.load(fp)
    print("Reading dat files")
    npn = np.fromfile("npneighbors.dat", dtype=int)
    print(str(npn.shape[0]))
    npc = np.fromfile("npcontexts.dat", dtype=int)
    print(str(npc.shape[0]))
    print("finished read")

    # train skipgram model
    skipgram = Word2Vec(npn, npc, 35000, 10, 0.001, 64, "sg.ckpt", batch_size=500)
    skipgram.train(5)

    # train cbow model
    cbow = Word2Vec(npc, npn, 35000, 10, 0.001, 64, "sg.ckpt", batch_size=500)
    cbow.train(5)
def getTextualFeature(text_reading_path):
    # Train and save the Word2Vec model for the text file.
    # Note that you can change the dimension of the resulting feature vector
    # by modifying the value of 'size'.
    model = Word2Vec(LineSentence(text_reading_path), size=500, window=5, sg=0, min_count=5, workers=8)
    model.save(text_reading_path + '.model')

    # Train and save the Sentence2Vec model for the sentence file.
    model = Sent2Vec(LineSentence(text_reading_path), model_file=text_reading_path + '.model')
    model.save_sent2vec_format(text_reading_path + '.vec')


program = os.path.basename(sys.argv[0])
def __init__(self, batchop, batch_size=1, datapoints=[], w2v=None):
    # current iteration offset
    self._offset = 0
    # number of examples
    self.n = len(datapoints)
    # default batch size
    self.B = batch_size
    # create a Word2Vec model if one is not supplied
    self._w2v = Word2Vec() if not w2v else w2v
    # batch process operation
    self._batchop = batchop
    # if data is available, bind it
    if len(datapoints):
        self.bind(datapoints)
def main(_):
    model = Word2Vec()
    norm_w_embed = tf.nn.l2_normalize(model._w_embed_in, 1)  # [vocab_size, embed_size]
    embeddings = model._sess.run(norm_w_embed)
    results = bh_tsne(embeddings, no_dims=2, perplexity=50, theta=DEFAULT_THETA,
                      randseed=EMPTY_SEED, verbose=VERBOSE)
    with open(os.path.join(word_config.output_dir, "tsne.txt"), "w") as f:
        for result in results:
            fmt = ''
            for i in range(1, len(result)):
                fmt = fmt + '{}\t'
            fmt = fmt + '{}\n'
            f.write(fmt.format(*result))
if ARGS.mode == "train":
    print("Starting Training branch...")
    print("Loading data ...")
    data = data_processing.get_w2v_data(ARGS)

    print("Initializing dataset and data loader...")
    word2vec_dataset = Word2VecDataset(data, ARGS)
    data_loader = DataLoader(word2vec_dataset, batch_size=ARGS.batch_size,
                             shuffle=True, num_workers=2)

    print("Initializing model ...")
    model = Word2Vec(vocab, ARGS.embed_dim).to(ARGS.device)

    print("Train...")
    train(ARGS, data_loader, model)
elif ARGS.mode in ["ret_words", "eval"]:
    model_path = os.path.join("models", f"ww_{ARGS.ww_size}_{ARGS.freq_thresh}")
    model = load_model(ARGS, model_path)
    print(model)
    print(f"Load docs: filtered_docs/filtered_docs_{ARGS.freq_thresh}.pkl...")
    docs_by_id = data_processing.load_pickle(f"filtered_docs/filtered_docs_{ARGS.freq_thresh}.pkl")
        None, None
    ]

training = not args.no_training
test = args.test
ensemble = args.ensemble
function = args.model_function
lr = args.lr

os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)
K.set_session(sess)

max_seq_len = 32
w2v_model = Word2Vec().load(word2vec_model_path)
word2idx = w2v_model.get_word2idx()
embedding = w2v_model.get_embedding()
vocabulary_size = len(word2idx)
print(f'\033[32;1mvocabulary_size: {vocabulary_size}\033[0m')

if function not in globals():
    globals()[function] = getattr(importlib.import_module(function[:function.rfind('.')]),
                                  function.split('.')[-1])
model = globals()[function](embedding)
model.compile(Adam(lr), loss='binary_crossentropy', metrics=['acc'])
model.summary()

if training:
    trainX, trainY = utils.load_train_data(labeled_path, word2idx,
# print(row["word1"], row["word2"]) # print(tmpdf) # 各データセットとword2vecの比較 datasets = [{ "df": wordsim, "name": "WordSim" }, { "df": simlex, "name": "SimLex" }, { "df": men, "name": "MEN" }] modelname = "glove-wiki-gigaword-100" w2v = Word2Vec(modelname=modelname) for dataset in datasets: print(dataset["name"]) target_dict = { "word1": [], "word2": [], dataset["name"]: [], "cos": [] } for _, row in dataset["df"].iterrows(): cos = w2v.get_cosine_similarity(row["word1"], row["word2"]) if cos: target_dict["word1"].append(row["word1"]) target_dict["word2"].append(row["word2"]) target_dict[dataset["name"]].append(row["result"]) target_dict["cos"].append(cos)
def train_word2vec():
    trainpath = 'dl4j.txt'
    w2v = Word2Vec()
    w2v.build(trainpath)
    w2v.fit(trainpath)
    w2v.evaluate()
logging.basicConfig(format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s',
                    level=logging.INFO)
logging.info("running %s" % " ".join(sys.argv))

category = 'Diseases_and_disorders'

logging.basicConfig(format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s',
                    level=logging.INFO)
logging.info("running %s" % " ".join(sys.argv))

# input_file = 'test2.txt'
input_file = '../inputFile/' + category + '.corpus.txt'
model = Word2Vec(LineSentence(input_file), size=50, window=7, sg=0, min_count=3, workers=8)
model.save(input_file + '.model')
model.save_word2vec_format(input_file + '.vec')

# f_wv = codecs.open('../inputFile/word-vec.txt', 'w', 'utf-8')
# with open('../inputFile/vocab.txt') as textfile1, open('../inputFile/wordVectors.txt') as textfile2:
#     for x, y in izip(textfile1, textfile2):
#         x = x.strip()
#         y = y.strip()
#         f_wv.write(x + '\t' + y + '\n')
# f_wv.close()

sent_file = input_file
model = Sent2Vec(LineSentence(sent_file), model_file=input_file + '.model')
def train_word2vec(data_len=90000, vocab_size=1000, embed_size=300, end_iter=2000,
                   verbose=True, verbose_freq=100, save=True, save_freq=100):
    """
    Trains the Word2Vec model based on:
    - data_len: number of data points to train and test on
    - vocab_size: number of top words the model will choose a solution from
    - embed_size: dimension of the question embedding
    - verbose: (boolean) prints out train and test losses every step
    - save: (boolean) saves model in ./savedir/
    """
    data_arr = (get_by_ques_type([], train=True) + get_by_ques_type([], train=False))[:data_len]
    p = Pipeline(data_arr, embed_type=embed_type)
    p.create_split()

    train_step = 0
    curr_samples = 0
    train_losses = []
    test_losses = []

    w2v = Word2Vec(vocab_size + 1, embed_size)

    run = True
    while run:
        p.next_batch(train=True, replace=True)
        train_inp, train_out = p.batch_word2vec()
        train_step += 1
        batch_samples = len(train_inp)
        curr_samples += batch_samples
        train_loss = w2v.train_step(np.array(train_inp), np.array(train_out), sess)

        p.next_batch(train=False, replace=True)
        test_inp, test_out = p.batch_word2vec()
        test_samples = len(test_inp)
        test_loss = w2v.evaluate(np.array(test_inp), np.array(test_out), sess)

        train_losses.append(train_loss)
        test_losses.append(test_loss)

        if train_step % save_freq == 0 and save:
            tf.train.Saver().save(sess, "saved_models/word2vec_model/word2vec_%d" % train_step,
                                  global_step=train_step)
            np.savez("saved_models/word2vec_model/word2vec_train_losses_%d" % train_step,
                     np.array(train_losses))
            np.savez("saved_models/word2vec_model/word2vec_test_losses_%d" % train_step,
                     np.array(test_losses))

        if train_step % verbose_freq == 0 and verbose:
            print("TRAIN STEP: %d | SAMPLES IN TRAIN BATCH: %d | TRAIN SAMPLES SO FAR: %d | "
                  "TRAIN LOSS: %f | TEST LOSS: %f"
                  % (train_step, batch_samples, curr_samples, train_loss, test_loss))

        if train_step == end_iter:
            run = False

    if save:
        tf.train.Saver().save(sess, "saved_models/word2vec_model/word2vec_%d" % train_step,
                              global_step=train_step)
        np.savez("saved_models/word2vec_model/word2vec_train_losses_%d" % train_step,
                 np.array(train_losses))
        np.savez("saved_models/word2vec_model/word2vec_test_losses_%d" % train_step,
                 np.array(test_losses))