def train_model(in_file_name, out_file_name, use_plain_word2vec=False, size=100, phrases_n_gram=1, threads=4):
    options = {'size': size}
    if use_plain_word2vec:
        if phrases_n_gram > 1:
            phrases_file_name = '{}.phrases'.format(in_file_name)
            word2vec.word2phrase(in_file_name, phrases_file_name, verbose=True)
            in_file_name = phrases_file_name
        if threads:
            options['threads'] = threads
        # noinspection PyCallingNonCallable
        word2vec.word2vec(in_file_name, out_file_name, verbose=True, **options)
    else:
        sentences = LineSentence(in_file_name)
        for i in range(phrases_n_gram - 1):
            n_gram_transformer = Phrases(sentences)
            sentences = n_gram_transformer[sentences]
        if threads:
            options['workers'] = threads
        model = Word2Vec(sentences, **options)
        model.save(out_file_name)

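A minimal usage sketch for the helper above; corpus.txt and the output names are hypothetical placeholders:

# Hypothetical file names; corpus.txt must be a whitespace-tokenized text file.
train_model('corpus.txt', 'corpus-c.bin', use_plain_word2vec=True, phrases_n_gram=2)
train_model('corpus.txt', 'corpus-gensim.model', phrases_n_gram=2, threads=4)
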
def main():
    word2vec.word2phrase('./text8', './text8-phrases', verbose=True)
    word2vec.word2vec('./text8-phrases', './text8.bin', size=100, verbose=True)
    word2vec.word2clusters('./text8', './text8-clusters.txt', 100, verbose=True)

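Once main() has produced the three files, the model and clusters can be queried; a minimal sketch ('dog' is an arbitrary query word, assuming it survives the frequency cutoff):

# Query the artifacts produced by main() above (sketch)
model = word2vec.load('./text8.bin')
indexes, metrics = model.cosine('dog')
print(model.vocab[indexes])

clusters = word2vec.load_clusters('./text8-clusters.txt')
print(clusters['dog'])
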
def train():
    word2vec.word2phrase('all.txt', 'phrase.txt', verbose=True)
    word2vec.word2vec('phrase.txt', 'vec.bin', min_count=50, size=50, verbose=False)

def preprocess():
    """Preprocess given data and build vocabulary file."""

    def extend_vocab():
        """Append the ground-truth vocabulary to the text8 phrase vocabulary."""
        ground_truth_vocab = ""
        with open("gt_input.txt", "r") as f:
            for line in f:
                ground_truth_vocab += line.strip()
                ground_truth_vocab += " "
        ground_truth_vocab = ground_truth_vocab[:-1]
        text8_vocab = ""
        with open("./text8-phrases", "r") as f:
            for line in f:
                text8_vocab += line
        return text8_vocab + " " + ground_truth_vocab

    logging.info("Beginning preprocessing:")
    # convert text8 vocabulary to include bigram phrases
    word2vec.word2phrase("./text8", "./text8-phrases", verbose=True, min_count=1)
    logging.info("Done creating text8-phrases.")
    # extend text8-phrases vocab with ground truth vocab, then write to file
    full_vocab = extend_vocab()
    with open("./text8-phrases-extra", "w") as f:
        f.write(full_vocab)
    logging.info("Done creating text8-phrases-extra.")
    logging.info("Done preprocessing.")

def word_clusters(
        corpora,
        size=100,
        verbose=True,
        text='text.txt',
        phrases='phrases.txt',
        binary='text.bin',
        clusters='clusters.txt'
):
    """Produce word2vec word clusters."""
    words = []
    for corpus in corpora:
        for document in corpus.documents:
            for sentence in document.sentences:
                for word in sentence.words:
                    words.append(word.lower().strip(punctuation + whitespace))
    with io.open(text, mode='w', encoding='utf-8') as file:
        file.write(u' '.join(words))
    word2vec.word2phrase(text, phrases, verbose=verbose)
    word2vec.word2vec(phrases, binary, size=size, verbose=verbose)
    word2vec.word2clusters(text, clusters, size, verbose=verbose)
    # rstrip('.txt') strips a character set, not a suffix; swap the extension instead
    json_clusters = os.path.splitext(clusters)[0] + '.json'
    with io.open(clusters, mode='r', encoding='utf-8') as file:
        d = dict(
            (w, int(c))
            for w, c in (line.split() for line in file.read().splitlines())
        )
    with io.open(json_clusters, mode='w', encoding='utf-8') as file:
        json.dump(d, file, indent=4, ensure_ascii=False)
    return d

def main():
    words, pos_tags = load_data('all.txt')
    word2vec.word2phrase('all.txt', 'word2phrase.txt', verbose=False)
    word2vec.word2vec('word2phrase.txt', 'word2vec.bin', alpha=0.087, hs=1, size=100, verbose=False)
    model = word2vec.load('word2vec.bin')
    words_table, words_vec = get_most_frequent_words(500, model, pos_tags)
    tsne = TSNE(n_components=2, random_state=87)
    words_t_vec = tsne.fit_transform(words_vec)
    # show
    figure = pyplot.figure(figsize=(12, 6), dpi=150)
    pyplot.scatter(words_t_vec[:, 0], words_t_vec[:, 1], c='b', alpha=0.2, s=15)
    texts = []
    for vec, text in zip(words_t_vec, words_table):
        texts.append(pyplot.text(vec[0], vec[1], text, size=5))
    adjust_text(texts, arrowprops=dict(arrowstyle='-', color='k', lw=0.5))
    pyplot.show()
    figure.savefig('figure.png')

def main():
    if '--download-nltk' in argv:
        nltk.download('punkt')
        nltk.download('maxent_treebank_pos_tagger')
        nltk.download('averaged_perceptron_tagger')
        nltk.download('brown')

    if not isfile('wordvec.bin') or '--train' in argv:
        print("\nwords to phrases...")
        wv.word2phrase('./HarryPotter/HarryPotter.txt', 'phrase', verbose=1)
        print("\nphrases to vectors...")
        wv.word2vec('phrase', 'wordvec.bin', size=50, verbose=1)
        print("")

    print("\nload model...")
    model = wv.load('wordvec.bin')
    print("model shape: " + repr(model.vectors.shape))

    X, Y = [], []
    if '--load-vector' in argv:
        if isfile('X.npy') and isfile('Y.npy'):
            X = np.load('X.npy')
            Y = np.load('Y.npy')
        else:
            print("can't load X.npy, Y.npy")
            return
    else:
        print("TSNE...")
        tsne = TSNE(n_components=2, learning_rate=10, random_state=0)
        vectors = tsne.fit_transform(X=model.vectors[:SIZE, :])
        X = vectors[:, 0]
        Y = vectors[:, 1]

    print("start plot...(using nltk.corpus.brown)")
    brown_tagged_sents = brown.tagged_sents(categories='news')
    unigram_tagger = nltk.UnigramTagger(brown_tagged_sents)
    words = unigram_tagger.tag(model.vocab[:SIZE])
    texts = []
    plt.figure(figsize=(12, 8))
    for x, y, word in zip(X, Y, words):
        print("word: (%s, %s)" % (word[0], word[1]), end="")
        if filter_words(word[0], word[1]):
            print("\r\t\t\t\tplot")
            plt.plot(x, y, 'o')
            texts.append(plt.text(x, y, word[0], fontsize=8))
        else:
            print("\r\t\t\t\tignore")
    adjust_text(texts, force_text=1, arrowprops=dict(arrowstyle="-", color="k", lw=1))
    plt.savefig("wordvec.png", dpi=100)
    plt.show()

def checkForSemanticIndex(carrel):
    # configure
    MODEL = 'reader.bin'
    TXT = 'model.txt'
    PHRASES = 'model.phrases'

    # require
    from pathlib import Path
    from word2vec import word2vec, word2phrase
    import os

    # initialize
    localLibrary = configuration('localLibrary')
    model = localLibrary / carrel / ETC / MODEL

    # see if we have been here previously
    if not model.exists():
        # initialize some more
        stopwords = localLibrary / carrel / ETC / STOPWORDS
        corpus = localLibrary / carrel / ETC / CORPUS
        txt = str(Path.home() / TXT)
        phrases = str(Path.home() / PHRASES)

        # tokenize
        click.echo('Indexing. This needs to be done only once.', err=True)
        click.echo('Step #1 of 6: Tokenizing corpus...', err=True)
        tokens = open(corpus).read().split()

        # normalize
        click.echo('Step #2 of 6: Normalizing tokens...', err=True)
        tokens = [token.lower() for token in tokens if token.isalpha()]

        # remove stop words
        click.echo('Step #3 of 6: Removing stop words...', err=True)
        stopwords = open(stopwords).read().split()
        tokens = [token for token in tokens if token not in stopwords]

        # save
        click.echo('Step #4 of 6: Saving tokens...', err=True)
        with open(txt, 'w') as handle:
            handle.write(' '.join(tokens))

        # create phrases
        click.echo('Step #5 of 6: Creating phrases...', err=True)
        word2phrase(txt, phrases, verbose=True)

        # do the work
        click.echo('Step #6 of 6: Indexing...', err=True)
        word2vec(phrases, str(model), size=100, binary=True, verbose=True)

        # clean up and done
        os.remove(txt)
        os.remove(phrases)
        click.echo('\nDone. Happy searching!', err=True)

def extract(dim, data, trained):
    if not trained:
        word2vec.word2phrase(data, data + '-phrases', verbose=True)
        word2vec.word2vec(data + '-phrases', data + '.bin', size=dim, verbose=True)
    model = word2vec.load(data + '.bin')
    keys = model.vocab
    features = model.vectors
    dic = dict(zip(keys, features))
    print(len(dic))
    return dic

def word_training(path, embedded_size):
    dirname = os.path.dirname(path)
    filename = os.path.basename(path)
    phrasesname = os.path.join(dirname, '{}-phrases'.format(filename))
    modelname = os.path.join(dirname, '{}.bin'.format(filename))
    print('Training...')
    word2vec.word2phrase(path, phrasesname)
    word2vec.word2vec(phrasesname, modelname, size=embedded_size)
    print('Training Done!!!')
    return modelname

def build_vector_model(self, file_name, verbose=True, size=200):
    # Build word vector phrase models
    word2vec.word2phrase(self.token_dir + "/" + file_name,
                         self.phrase_dir + "/" + file_name,
                         verbose=verbose)
    # Build the word vectors now
    word2vec.word2vec(self.phrase_dir + "/" + file_name,
                      self.vector_dir + "/" + file_name + ".bin",
                      size=size, verbose=verbose)

def word_to_vec(config_path: str, dimension: int, T=""):
    folder = "data/word2vec"
    words_file = os.path.join(folder, f"{T}words-noisefiltered-{dimension}")
    phrases_file = os.path.join(folder, f"{T}phrases-noisefiltered-{dimension}")
    w2v_file = os.path.join(folder, f"{T}noisefiltered-{dimension}.bin")
    import word2vec
    word2vec.word2phrase(words_file, phrases_file, verbose=True)
    word2vec.word2vec(phrases_file, w2v_file, size=dimension)
    logging.info("wrote to " + w2v_file)
    return word2vec.load(w2v_file)

def train():
    movie_set = cornell_movie_set.MovieSet()
    movie_set.parse_movie_set('train')
    word2vec.word2phrase('cornell_movie_train.txt', 'movie_phrases_train.txt', verbose=True)
    word2vec.word2vec('movie_phrases_train.txt', 'movie_train.bin', size=100, verbose=True)
    model = word2vec.load('movie_train.bin')
    return model

def getWordVector(_fileName):
    phraseName = _fileName + "-phrases.txt"
    binName = _fileName + ".bin"
    outPutName = _fileName + "_100d.vocab"
    file = open(outPutName, 'wt')
    word2vec.word2phrase(_fileName, phraseName, verbose=True)
    word2vec.word2vec(phraseName, binName, size=100, verbose=True)
    model = word2vec.load(binName)
    for key in model.vocab:
        vectorLine = key
        for value in model[key]:
            vectorLine = vectorLine + " " + str(value)
        file.write(vectorLine + "\n")
    file.close()

def train_model(docfile_root="corpora/Thaliana/documents-processed"):
    print("phrases...")
    word2vec.word2phrase(docfile_root + ".txt", docfile_root + "-phrases.txt", verbose=True)
    # print("word2vec")
    # word2vec.word2vec(docfile_root + "-phrases.txt", docfile_root + ".bin",
    #                   size=1000, verbose=True, min_count=1)
    print("word2cluster")
    word2vec.word2clusters(docfile_root + ".txt", docfile_root + '-clusters.txt', 10000,
                           verbose=True, min_count=1, threads=4)

def create_word2vec_model(save_text_file):
    '''Run word2vec on the text corpus and create a model.'''
    save_phrases = save_text_file + '_phrases'
    save_model = save_text_file + '.bin'
    save_cluster = save_text_file + '-cluster.txt'
    # create phrases for processing
    word2vec.word2phrase(save_text_file, save_phrases, verbose=True)
    # create model
    word2vec.word2vec(save_phrases, save_model, size=100, verbose=True)
    # create cluster
    word2vec.word2clusters(save_text_file, save_cluster, 100, verbose=True)

def create_model():
    global created
    if created:
        model = word2vec.load('movie_all.bin')
        return model
    else:
        movie_set = parse_movie_set.MovieSet()
        movie_set.parse_movie_set('train')
        movie_set.parse_movie_set('test')
        movie_set.parse_movie_set('all')
        word2vec.word2phrase('parsed_all.txt', 'parsed_all_phrases.txt', verbose=True)
        word2vec.word2vec('parsed_all_phrases.txt', 'movie_all.bin', size=100, verbose=True)
        model = word2vec.load('movie_all.bin')
        created = True
        return model

def files_10():
    stopwordsss = [
        "in", "it", "as", "my", "do", "is", "don't", "doesn't", "am", "it's", "i",
        "you", "and", "to", "the", "on", "but", "that", "are", "so", "me", "of",
        "with", "try", "a", "about", "after", "all", "also", "always", "an", "any",
        "at", "be", "been", "being", "by", "came", "can", "can't", "come", "could",
        "did", "does", "doing", "else", "for", "from", "get", "give", "goes",
        "going", "had", "happen", "has", "have", "having", "how", "into", "really",
        "if", "see", "plus", "then", "i'll", "or", "will", "i'm", "too", "that's",
        "-", "i've", "would", "making", "usually", "what", "hasn't", "hmmm",
        "this", "someone", "not", "like", "e", "=", "just", "more", "actually",
        "most", "one", ":", "very", "b", "yes", "same",
    ]
    # process text1 .. text10
    for count in range(1, 11):
        for name in os.listdir(directory):
            if name != 'text' + str(count):
                continue
            text = "text" + str(count) + "-adapted"
            vec = "text" + str(count) + "-vec.bin"
            cluster = "text" + str(count) + "-clusters.txt"
            result = "text" + str(count) + "-result.txt"
            word2vec.word2phrase(name, text, verbose=True)
            # strip punctuation, lowercase, and remove stop words
            reading = (open(text).read().lower()
                       .replace(',', ' ').replace('.', ' ').replace('/', ' ')
                       .replace('-', ' ').replace('(', ' ').replace(')', ' ')
                       .replace('?', ' ').split())
            reading = [token for token in reading if token not in stopwordsss]
            print(reading)
            with open(text, "w") as result_file:
                result_file.write(' '.join(reading))
            word2vec.word2vec(text, vec, size=100, verbose=True)
            # word2clusters expects the text corpus, not the binary model
            word2vec.word2clusters(text, cluster, 100, verbose=True)
            # train a gensim model once per file, then query it per word
            word2vec_model = Word2Vec([reading], min_count=1)
            with open(result, "w") as result_file:
                for word in reading:
                    sim_words = word2vec_model.wv.most_similar(word)[:3]
                    result_file.write(str(word) + '\n' + str(sim_words) + '\n\n')

def main():
    argv1 = "sparkler.config"
    argv2 = "Sparkler"
    configs = read_config(argv1, argv2)
    train_file_obj = codecs.open(configs['train_file'], 'r')
    train_lines = train_file_obj.readlines()
    train_file_obj.close()
    dimension_input = int(configs['dimension_input'])
    word2vec.word2phrase(cleanText('data/200andOcean.txt', 'clean200Ocean'),
                         'ocean-full-phrases', verbose=True)
    word2vec.word2vec('ocean-full-phrases', 'ocean.bin', size=dimension_input,
                      verbose=True, min_count=5)
    model = word2vec.load('ocean.bin')
    word = 'ocean'
    print(closeWords(model, word, 5))
    print(model.vectors.shape)

def w2v(dim_w2v=dim_w2v, data_training=None):
    if data_training is None:
        print('nooooooo')
        return 0
    with open("word2vec_corpus.tmp", "w") as f:
        f.write("\n".join(data_training) + "\n")
    print('running word2vec ...')
    word2vec.word2phrase('word2vec_corpus.tmp', 'word2vec_corpus_phrases', verbose=True)
    word2vec.word2vec('word2vec_corpus_phrases', 'word2vec_corpus.bin', size=dim_w2v,
                      verbose=True, window=5, cbow=0, binary=1, min_count=1,
                      sample='1e-5', hs=1, iter_=5)

# step 1: install word2vec
# (ref: http://nbviewer.jupyter.org/github/danielfrg/word2vec/blob/master/examples/word2vec.ipynb)
import word2vec
import numpy as np
import scipy.io as sio

vector_size = 100
amount_nearest = 100

word2vec.word2phrase('text8', 'text8-phrases', verbose=True)
word2vec.word2vec('text8-phrases', 'text8.bin', size=vector_size, verbose=True)
word2vec.word2clusters('text8', 'text8-clusters.txt', vector_size, verbose=True)

# read the trained model
model = word2vec.load('text8.bin')

# list of vague motivations coming to mind (topic: potential problems for enterprise)
motivation = ['enterprise', 'business', 'solution', 'entrepreneur', 'latent',
              'problem', 'funds', 'management', 'quality', 'projects']

# start getting the nearest clusters by picking the similar words
amount_motivation = len(motivation)
motivation_vector = []
nearest_indexes = []

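The two lists are still empty where the snippet ends; one plausible continuation (a sketch, not the original author's code) fills them with each word's vector and its nearest neighbours:

# Sketch of a continuation: vectors and nearest-neighbour indexes per motivation word.
for word in motivation:
    if word in model.vocab:
        motivation_vector.append(model[word])
        indexes, metrics = model.cosine(word, n=amount_nearest)
        nearest_indexes.append(indexes)
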
def testWord2Phrase():
    # Run word2phrase to group similar words: "Los Angeles" becomes "Los_Angeles".
    # This creates a text8-phrases file that we can use as a better input for word2vec.
    #
    # Note that you could easily skip this step and use the original data as input for word2vec.
    word2vec.word2phrase('/D/test/text8/text8', '/D/test/text8/text8-phrases', verbose=True)

#!/usr/bin/env python3
import word2vec
import nltk
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from adjustText import adjust_text
import sys

txt = sys.argv[1]
phrase_file_name = "text_phrases.bin"
word2vec.word2phrase(txt, phrase_file_name, verbose=True)
model_file_name = "text.bin"
word2vec.word2vec(phrase_file_name, model_file_name, verbose=True, cbow=1)
model = word2vec.load(model_file_name)

words = []
word_vectors = []
for word in model.vocab:
    words.append(word)
    word_vectors.append(model[word])
words = np.array(words)
word_vectors = np.array(word_vectors)
num_plotted_words = 1000

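The script stops right after num_plotted_words is set; a plausible continuation (my sketch, reusing only the imports already present) projects those vectors with t-SNE and plots labeled points:

# Hypothetical continuation: project and plot the first num_plotted_words vectors.
tsne = TSNE(n_components=2, random_state=0)
projected = tsne.fit_transform(word_vectors[:num_plotted_words])
plt.figure(figsize=(12, 8))
plt.scatter(projected[:, 0], projected[:, 1], s=5, alpha=0.3)
texts = [plt.text(x, y, w, fontsize=6)
         for (x, y), w in zip(projected, words[:num_plotted_words])]
adjust_text(texts)
plt.savefig("tsne.png", dpi=150)
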
def setup_module(module):
    word2vec.word2phrase(input_, output_phrases, verbose=False)
    word2vec.word2vec(input_, output_bin, size=10, binary=1, verbose=False)
    word2vec.word2vec(input_, output_txt, size=10, binary=0, verbose=False)
    word2vec.word2clusters(input_, output_clusters, 10, verbose=True)

# coding: utf-8
import word2vec
import gensim

word2vec.word2phrase('./refined_text.txt', './wiki-phrase', verbose=True)
word2vec.word2vec('./wiki-phrase', './word2vec_model.bin', size=100, verbose=True)
word2vec.word2clusters(
    '/Users/KYD/Documents/wiki_project/refined_text.txt',
    '/Users/KYD/Documents/wiki_project/refined_text_cluster.txt',
    100, verbose=True)

# -*- coding: utf-8 -*-
import plyvel
import re
import os
import word2vec

model_path = os.path.abspath('model.bin')
text_path = os.path.abspath('text.txt')
phrase_path = os.path.abspath('phrases.txt')

word2vec.word2phrase(text_path, phrase_path, verbose=True)
word2vec.word2vec(phrase_path, model_path, binary=1, verbose=True)

model = word2vec.load(model_path)
indexes, metrics = model.cosine('seymour')
print(' '.join(model.vocab[indexes]))

    else:
        features[7] = len(sentence1) / len(sentence2)
    return features

# Uses treetagger-python
# (Installation: https://github.com/miotto/treetagger-python ;
#  http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/)
try:
    semanticsimilarity_lookuptable = pickle.load(open('semanticsimilarity_lookuptable.pkl', 'rb'))
except Exception:
    semanticsimilarity_lookuptable = {}

print("Build Word2Vec Corpus")
dir = os.path.dirname(os.path.abspath(__file__))
try:
    # on OSX, for some reason, this does not work
    word2vec.word2phrase(dir + '/text8', dir + '/text8-phrases', verbose=True)
    word2vec.word2vec(dir + '/text8-phrases', dir + '/text8.bin', size=100, verbose=True)
except Exception as e:
    print(e)

model = word2vec.load(dir + '/text8.bin')
print("Finish")


def computeSemantics(sentence1, sentence2):
    ...


def computeSemanticSimilarityFeatures(sentence1, sentence2):
    features = [0] * 9
    if (sentence1 + sentence2) not in semanticsimilarity_lookuptable:
        def prepareSentence(sentence):
            return sentence.replace('-', ' ').replace('$', ' ')

def test_run_word2phrase():
    word2vec.word2phrase(input_text, output_phrases)
    assert os.path.exists(output_phrases)

import word2vec
import os

print(os.getcwd())
for file in os.listdir(os.getcwd()):
    print(file)

# use raw strings so backslashes (e.g. the \t in \text8) are not treated as escapes
word2vec.word2phrase(r'C:\devbox\DeepSphere\word2vec\text8',
                     r'C:\devbox\DeepSphere\word2vec\text8-phrases',
                     verbose=True)

from gensim.models.keyedvectors import KeyedVectors
import word2vec
import timeit

start_time = timeit.default_timer()
word2vec.word2phrase('3g-p.txt', '3g-phrases.txt', verbose=True)
word2vec.word2vec('3g-phrases.txt', '3g.bin', size=100, verbose=True)
elapsed = timeit.default_timer() - start_time
InMinutes = elapsed / 60
word2vec.word2clusters('3g-p.txt', '3g-clusters.txt', 100, verbose=True)

model = KeyedVectors.load_word2vec_format('3g.bin', binary=True)
model.save_word2vec_format('3g-vectors.txt', binary=False)
print("The Total Execution Time in Minutes is: ", InMinutes)

import word2vec as w2v
import clean_articles
from pprint import pprint
import sys

if len(sys.argv) > 1 and sys.argv[1] in ['-t', '-train']:
    # Add new articles to file
    clean_articles.clean()
    # Train new model
    w2v.word2phrase('combined', './text-phrases', verbose=True)
    w2v.word2vec('./text-phrases', 'text.bin', size=100, verbose=True)
    w2v.word2clusters('combined', 'text-clusters.txt', 100, verbose=True)

# Initialize pre-trained model
model_old = w2v.load('text8.bin')
model = w2v.load('text.bin')
clusters = w2v.load_clusters('text-clusters.txt')
model.clusters = clusters
# ind = clusters['Trump']
# print(clusters.get_words_on_cluster(ind))
print(len(model_old.vocab))
print(len(model.vocab))

# King - man + woman: "Man is to King as Woman is to Queen"
# Trump - America + Germany
pos = ['Putin', 'America']
neg = ['Russia']
leader = model.analogy(pos, neg)

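analogy() returns parallel index and similarity arrays, so a short follow-up (a sketch, mirroring the generate_response usage elsewhere in these examples) turns the result into readable words:

# leader is an (indexes, metrics) pair; map it back to vocabulary entries
indexes, metrics = leader
print(model.generate_response(indexes, metrics).tolist())
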
import sys

if sys.argv[1] == "train":
    print("Phrasing starts")
    MIN_COUNT = 5
    WORDVEC_DIM = 100
    WINDOW = 5
    NEGATIVE_SAMPLES = 5
    ITERATIONS = 0
    MODEL = 1
    LEARNING_RATE = 0.025
    model_name = sys.argv[2] + ".txt"
    model_phrase = sys.argv[2] + "-phrase.txt"
    word2vec.word2phrase(model_name, model_phrase, verbose=True)
    print("===============")
    print("Training starts")
    # train model
    word2vec.word2vec(train=model_phrase,
                      output=sys.argv[3] + ".bin",
                      size=WORDVEC_DIM,
                      min_count=MIN_COUNT,
                      window=WINDOW,
                      negative=NEGATIVE_SAMPLES,
                      alpha=LEARNING_RATE,
                      verbose=False)
else:
    # load model for plotting
    model = word2vec.load("hp/" + sys.argv[1])

import word2vec

word2vec.word2phrase('topics.xml', 'topics-phrases.txt', verbose=True)

    line = line.split()[1:]
    for term in line:
        try:
            if stop.stop3(term.split('/')[1]):
                continue
        except:
            print(count)
            print('ERROR')
            print(term)
            # quit()
        term = term.split('/')[0]
        l.write(term + ' ')
    l.write('\n')

f.close()
l.close()
quit()

word2vec.doc2vec(cleanFileName, fileName, cbow=0, size=50, window=10, negative=5,
                 hs=0, sample='1e-4', threads=12, iter_=20, min_count=1, verbose=True)
word2vec.word2phrase(cleanFileName, fileName, verbose=True)
word2vec.word2clusters(cleanFileName, fileName, 100, verbose=True)

model = word2vec.load(fileName)
print(model.vectors.shape)
quit()

indexes, metrics = model.cosine('_*1')
model.generate_response(indexes, metrics).tolist()

'''
Find the words similar to the important terms
and save them to vectorSim.json
'''

# In[ ]:
data = re.sub(
    '\s+', ' ',
    re.sub('[:\.\(\)0123456789%,–\?\\\&\']', '',
           ' '.join(product_fr.values).replace('-', ' ').lower()))

# In[ ]:
with open('data/names.txt', 'w') as f:
    f.write(data)

# In[ ]:
word2vec.word2phrase('data/names.txt', 'data/names-phrases.txt', verbose=True)

# In[ ]:
word2vec.word2vec('data/names.txt', 'data/names-model.bin', size=100, verbose=True)

# In[ ]:
word2vec.word2clusters('data/names.txt', 'data/names-clusters.txt', 100, verbose=True)

def train(path):
    word2vec.word2phrase(path, path + '-phrases', verbose=True)
    word2vec.word2vec(path + '-phrases', path + '.bin', size=100, binary=True, verbose=True)
    word2vec.word2clusters(path, path + '.clusters.txt', 100, verbose=True)
    model = word2vec.load(path + '.bin')
    return model

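A usage sketch for train(); the corpus name and query word are placeholders:

# Hypothetical corpus file; any whitespace-tokenized text works.
model = train('text8')
indexes, metrics = model.cosine('king')
print(model.vocab[indexes])
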
from gensim.models import Word2Vec
import word2vec
import numpy as np
from argparse import ArgumentParser
import os
import sys
import multiprocessing

word2vec.word2phrase('all.txt', 'phrases.txt', verbose=True)
# word2vec.word2vec('phrases.txt', 'my_model.bin', size=100, verbose=True)

# parser = ArgumentParser()
# parser.add_argument('--train', action='store_true',
#                     help='Set this flag to train word2vec model')
# parser.add_argument('--corpus-path', type=str, default='all.txt',
#                     help='Text file for training')
# parser.add_argument('--model-path', type=str, default='my_model.bin',
#                     help='Path to save word2vec model')
# parser.add_argument('--plot-num', type=int, default=600,
#                     help='Number of words to perform dimensionality reduction')
# args = parser.parse_args()

MIN_COUNT = 6
WORDVEC_DIM = 300
WINDOW = 5
NEGATIVE_SAMPLES = 10
ITERATIONS = 5
MODEL = 1
LEARNING_RATE = 1e-3
# CPU_COUNT = multiprocessing.cpu_count()

# train model (the call is cut off in the source after its first argument)
word2vec.word2vec(
    train='phrases.txt',

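Based on the constants defined above and the library's word2vec() keyword arguments, a plausible completion of the truncated call is the following reconstruction, not the author's verified code:

# Reconstructed completion (assumption): wire the constants above into the call;
# the output path mirrors the commented-out call, and MODEL=1 is read as cbow=1.
word2vec.word2vec(
    train='phrases.txt',
    output='my_model.bin',
    size=WORDVEC_DIM,
    min_count=MIN_COUNT,
    window=WINDOW,
    negative=NEGATIVE_SAMPLES,
    iter_=ITERATIONS,
    alpha=LEARNING_RATE,
    cbow=MODEL,
    verbose=True)
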
import word2vec
import sys
import getpass

user = getpass.getuser()
if user == 'ctnuser':
    root = '/home/ctnuser/bjkomer/'
elif user == 'bjkomer':
    root = '/home/bjkomer/'

if len(sys.argv) == 2:
    dim = int(sys.argv[1])
else:
    dim = 100

word2vec.word2phrase(root + 'word2vec/text8',
                     root + 'semantic-network/data/text8-phrases',
                     verbose=True)
word2vec.word2vec(root + 'semantic-network/data/text8-phrases',
                  root + 'semantic-network/data/text8-%s.bin' % dim,
                  size=dim, verbose=True)
word2vec.word2clusters(root + 'word2vec/text8',
                       root + 'semantic-network/data/text8-%s-clusters.txt' % dim,
                       dim, verbose=True)

# train['link_pred'] = (train.temp2 >= 1) | (train.temp1 >= train.temp1.quantile(0.7))
# accuracy = (train.link_pred == train.link.astype(bool)).mean()
# print 'Accuracy is {acc}'.format(acc=accuracy)

## Try word2vec train
import word2vec
from sklearn.metrics.pairwise import cosine_similarity as cosine

# Create txt file from node_info
all_abst_file_name = 'all_abstracts.txt'
all_phrases_file_name = 'all_abstracts_phrases.txt'
word2vec_out_file_name = 'all_abstracts.bin'

with open(pth(all_abst_file_name), 'w') as f:
    for abstract in node_info.abstract.as_matrix():
        f.write(abstract + '\n')

word2vec.word2phrase(pth(all_abst_file_name), pth(all_phrases_file_name), verbose=True)
word2vec.word2vec(pth(all_phrases_file_name), pth(word2vec_out_file_name),
                  size=30, iter_=3, verbose=True)

model = word2vec.load(pth(word2vec_out_file_name))
indexes, metrics = model.cosine('applications', 20)
indexes, metrics = model.analogy(pos=['theorem', 'false'], neg=['true'], n=10)
model.vocab[indexes]

def train_word2phrase():
    word2vec.word2phrase("all_of_sefaria.txt", "word2phrase.bin", verbose=True)
