def preprocess():
    train_df = load_data_from_csv(train_file)
    val_df = load_data_from_csv(validation_file)
    test_df = load_data_from_csv(test_file)
    train_df = data_clean(train_df, train_after_clean)
    val_df = data_clean(val_df, val_after_clean)
    test_df = data_clean(test_df, test_after_clean)
    train_content = train_df.iloc[:, 1]
    val_content = val_df.iloc[:, 1]
    test_content = test_df.iloc[:, 1]
    all_content = []
    all_content.extend(train_content)
    all_content.extend(val_content)
    all_content.extend(test_content)
    print(len(all_content))
    all_seg_words = seg_words(all_content)
    with open(seg_text, "w+") as txt_write:
        for sentence in tqdm(all_seg_words):
            sentence = sentence.replace("\n", "") + "\n"
            txt_write.write(sentence)
    word2vec.word2vec(seg_text, embedding_bin, min_count=5, size=100, verbose=True)

def main():
    words, pos_tags = load_data('all.txt')
    word2vec.word2phrase('all.txt', 'word2phrase.txt', verbose=False)
    word2vec.word2vec('word2phrase.txt', 'word2vec.bin', alpha=0.087, hs=1, size=100, verbose=False)
    model = word2vec.load('word2vec.bin')
    words_table, words_vec = get_most_frequent_words(500, model, pos_tags)
    tsne = TSNE(n_components=2, random_state=87)
    words_t_vec = tsne.fit_transform(words_vec)

    # show
    figure = pyplot.figure(figsize=(12, 6), dpi=150)
    pyplot.scatter(words_t_vec[:, 0], words_t_vec[:, 1], c='b', alpha=0.2, s=15)
    texts = []
    for vec, text in zip(words_t_vec, words_table):
        texts.append(pyplot.text(vec[0], vec[1], text, size=5))
    adjust_text(texts, arrowprops=dict(arrowstyle='-', color='k', lw=0.5))
    pyplot.show()
    figure.savefig('figure.png')

def main():
    # size: dimensionality, threads: number of threads,
    # binary: output format (0 = plain text output)
    word2vec.word2vec(train='out_rename.txt', output='knock90.txt', size=300, threads=4, binary=0)
    t_index = OrderedDict()  # word -> index dictionary
    with open('knock90.txt', 'rt') as f:
        for i, line in enumerate(f):
            line = line.strip().split(' ')
            if i == 0:
                # the first line holds the vocabulary size and the dimensionality
                words_count = int(line[0])
                size = int(line[1])
                # create the embedding matrix
                matrix_90 = np.zeros([words_count, size], dtype=np.float64)
                continue
            # occasionally a line has fewer than 300 dimensions
            if len(line[1:]) < 300:
                continue
            word = line[0]
            t_index[word] = i - 1
            matrix_90[i - 1] = line[1:]
    io.savemat('knock90_300', {'knock90_300': matrix_90})
    with open('./pickles/knock90_idx_t', 'wb') as f:
        pickle.dump(t_index, f)

def test_single_static_model(args):
    params = json.load(open("conf/{}.json".format(args.model), "r"))
    embedding_size = params['embedding_size']
    filter_sizes = params['filter_sizes']
    num_filters = params['num_filters']
    dropout_keep_prob = params['dropout_keep_prob']
    sequence_length = params['sequence_length']
    num_classes = params['num_classes']
    batch_size = params['batch_size']
    num_epochs = params['num_epochs']
    train_data = params['train_data']
    test_data = params['test_data']
    w2c = word2vec(args.word2vec).word2vec
    embedding_size = word2vec(args.word2vec).embedding_size
    datas = DataSetWord2vecEval(sequence_length=sequence_length,
                                batch_size=batch_size,
                                train_data=train_data,
                                test_data=test_data,
                                word2vec=w2c,
                                embedding_size=embedding_size)
    vocab_size = datas.vocab_size
    checkpoint_dir = os.path.abspath(os.path.join("{}".format(args.model), "checkpoints"))
    checkpoint_file = tf.train.latest_checkpoint(checkpoint_dir)
    model = eval(args.model)(sequence_length, num_classes, embedding_size, filter_sizes, num_filters)
    model.load(checkpoint_file)
    model.eval(datas)

def train_model(in_file_name, out_file_name, use_plain_word2vec=False, size=100, phrases_n_gram=1, threads=4):
    options = {
        'size': size,
    }
    if use_plain_word2vec:
        if phrases_n_gram > 1:
            phrases_file_name = '{}.phrases'.format(in_file_name)
            word2vec.word2phrase(in_file_name, phrases_file_name, verbose=True)
            in_file_name = phrases_file_name
        if threads:
            options['threads'] = threads
        # noinspection PyCallingNonCallable
        word2vec.word2vec(in_file_name, out_file_name, verbose=True, **options)
    else:
        sentences = LineSentence(in_file_name)
        for i in range(phrases_n_gram - 1):
            n_gram_transformer = Phrases(sentences)
            sentences = n_gram_transformer[sentences]
        if threads:
            options['workers'] = threads
        model = Word2Vec(sentences, **options)
        model.save(out_file_name)

def train_lstm_mean_model(args):
    sequence_length = args.sequence_length
    num_classes = args.num_classes
    batch_size = args.batch_size
    num_epochs = args.num_epochs
    train_data = args.train_data
    test_data = args.test_data
    data_exists = args.data_exists
    w2c = word2vec(args.word2vec).word2vec
    embedding_size = word2vec(args.word2vec).embedding_size
    datas = DataSetWord2vecMeanRnn(sequence_length=sequence_length,
                                   batch_size=batch_size,
                                   train_data=train_data,
                                   test_data=test_data,
                                   exists=data_exists,
                                   word2vec=w2c,
                                   embedding_size=embedding_size)
    params = {
        "embedding_size": embedding_size,
        "sequence_length": sequence_length,
        "num_classes": num_classes,
        "batch_size": batch_size,
        "num_epochs": num_epochs,
        "train_data": train_data,
        "test_data": test_data,
        "model": args.model
    }
    if not os.path.exists("conf"):
        os.mkdir("conf")
    json.dump(params, open("conf/{}.json".format(args.model), "w"))
    model = eval(args.model)(sequence_length, num_classes, embedding_size)
    model.fit(datas, num_epochs)

def build_vectors():
    word2vec.word2vec(training_file, './by1.bin', size=vector_size, verbose=True)
    model = word2vec.load('./by1.bin')
    return model

def train_word2vec(word2vec_size=128):
    seg_file = "/home/chenyu/intent_reco/output/seg.txt"
    word2vec_output_file = "/home/chenyu/intent_reco/output/word2vec_" + str(word2vec_size) + ".bin"
    print "Start training word2vec"
    word2vec.word2vec(seg_file, word2vec_output_file, size=word2vec_size, verbose=True)
    print "End training word2vec"
    print "Start creating dictionary ..."
    word_dic = {}
    model = word2vec.load(word2vec_output_file)
    voc_size = model.vocab.size
    for i in range(voc_size):
        word_dic[model.vocab[i]] = model.vectors[i].tolist()
    print "End creating dictionary"
    word_dict_path = "/home/chenyu/intent_reco/output/word_dic_" + str(word2vec_size) + ".json"
    print "Start storing dictionary ..."
    with open(word_dict_path, "w") as f:
        json.dump(word_dic, f)
    print "End storing dictionary"

def get_pos_dictionary_matrix():
    txt_fname = 'tags.txt'
    vec_fname = 'vec.bin'
    vec_size = 15
    with open(txt_fname, 'w') as tags_file:
        words = masc_tagged.tagged_words()
        tags_file.write(' '.join([w[1] for w in words if w[1]]))
    word2vec.word2vec(txt_fname, vec_fname, size=vec_size, negative=5, sample=1, cbow=1, window=3, verbose=False)
    model = word2vec.load(vec_fname)
    pos_dictionary = {}
    count = 2
    for tag in model.vocab:
        pos_dictionary[tag] = count
        count += 1
    pos_dictionary['UNK'] = 1
    pos_dictionary['<pad>'] = 0
    pos_matrix = np.concatenate((np.zeros((2, 15), dtype='float'), model.vectors), axis=0)
    return pos_dictionary, torch.tensor(pos_matrix)

def main():
    word2vec.word2phrase('./text8', './text8-phrases', verbose=True)
    word2vec.word2vec('./text8-phrases', './text8.bin', size=100, verbose=True)
    word2vec.word2clusters('./text8', './text8-clusters.txt', 100, verbose=True)

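# Hedged follow-up sketch (not part of the original snippet): once main() above has
# produced ./text8.bin, the model can be loaded and queried with the same word2vec
# package, as later snippets in this collection also demonstrate. The query word
# 'socks' is an example only.
import word2vec

model = word2vec.load('./text8.bin')
print(model.vectors.shape)                                  # (vocabulary size, 100)
indexes, metrics = model.cosine('socks')                    # nearest words by cosine similarity
print(model.generate_response(indexes, metrics).tolist())   # (word, similarity) pairs
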
def main():
    # learn embeddings
    word2vec.word2vec()
    # convert training, test and eval data into np arrays
    DataProcessor.build_data()
    # this calculates sentiments for the data
    lstm.lstm_script()

def main():
    train_file = 'tokens81.txt'
    output_file = 'vectors.txt'
    maxtrix_file = 'matrix_x300'
    dict_index_file = 'dict_index_t'
    word2vec.word2vec(train=train_file, output=output_file, size=300, threads=3, binary=0)
    with open(output_file, 'rt') as f:
        status = f.readline().split(' ')
        size_dict = int(status[0])
        size_x = int(status[1])
        dict_index_t = OrderedDict()
        matrix_x = np.zeros([size_dict, size_x], dtype=np.float64)
        for i, line in enumerate(f):
            vecs = line.strip().split(' ')
            dict_index_t[vecs[0]] = i
            matrix_x[i] = vecs[1:]
    io.savemat(maxtrix_file, {'matrix_x300': matrix_x})
    with open(dict_index_file, 'wb') as f:
        pickle.dump(dict_index_t, f)

def word_clusters(
    corpora,
    size=100,
    verbose=True,
    text='text.txt',
    phrases='phrases.txt',
    binary='text.bin',
    clusters='clusters.txt'
):
    """Produce word2vec word clusters."""
    words = []
    for corpus in corpora:
        for document in corpus.documents:
            for sentence in document.sentences:
                for word in sentence.words:
                    words.append(word.lower().strip(punctuation + whitespace))
    with io.open(text, mode='w', encoding='utf-8') as file:
        file.write(u' '.join(words))
    word2vec.word2phrase(text, phrases, verbose=verbose)
    word2vec.word2vec(phrases, binary, size=size, verbose=verbose)
    word2vec.word2clusters(text, clusters, size, verbose=verbose)
    json_clusters = clusters.rstrip('.txt') + '.json'
    with io.open(clusters, mode='r', encoding='utf-8') as file:
        d = dict(
            (w, int(c)) for w, c in map(split, file.read().splitlines())
        )
    with io.open(json_clusters, mode='w', encoding='utf-8') as file:
        json.dump(d, file, indent=4, ensure_ascii=False)
    return d

def build_model(self, record_list_fname, cache_dir, tmp_file):
    self.__load_seg_plainstr(record_list_fname, cache_dir, tmp_file)
    word2vec.word2vec(tmp_file, self.model_name + '.bin', size=self.vec_out, verbose=True)
    return word2vec.load(self.model_name + '.bin')

def solve(dataId, usingExist=True):
    dataId = str(dataId)
    dataPath = './data/' + dataId + '.txt'
    binPath = './out/' + dataId + '.bin'
    outputPath = "out/ans" + dataId + ".txt"
    if not os.path.exists(binPath) or not usingExist:
        word2vec.word2vec(dataPath, binPath, size=100, verbose=True)
    # load the trained vectors from binPath
    model = word2vec.load(binPath)
    # open the output file
    output = codecs.open(outputPath, "w", "utf-8")
    ClustersNumber = 10
    WordNumber = len(model.vectors)
    # run the KMeans algorithm
    kmeans = KMeans(n_clusters=ClustersNumber, random_state=0).fit(model.vectors)
    # cluster label assigned to each word ID; labels range over [0, ClustersNumber)
    label = kmeans.labels_
    # score of each word: the negative of its distance to its cluster centre
    scores = []
    for i in xrange(WordNumber):
        scores.append(kmeans.score([model.vectors[i]]))
    # group word IDs that belong to the same cluster into the same list
    allCluster = []
    for i in xrange(ClustersNumber):
        allCluster.append([])
    for i in xrange(len(label)):
        allCluster[label[i]].append(i)

    # order two word IDs by comparing their scores
    def comparator(a, b):
        vala = scores[a]
        valb = scores[b]
        if vala > valb:
            return 1
        elif vala == valb:
            return 0
        else:
            return -1

    # handle each cluster separately
    for clusterId in xrange(len(allCluster)):
        output.write("-----------------------------------cluster " + str(clusterId) + ":\n")
        # sort by score, highest first
        allCluster[clusterId].sort(cmp=comparator, reverse=True)
        # take the top 30 words
        for x in allCluster[clusterId][:30]:
            # write the negated score, i.e. the distance to the cluster centre
            output.write(model.vocab[x] + " " + str(-scores[x]) + "\n")
        print '\n'

def create():
    base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    data_path_ = base_dir + '/data_.txt'
    glove_data = base_dir + '/code/GloVe-master/data_.txt'
    shutil.copyfile(data_path_, glove_data)
    os.system("cd GloVe-master;sh demo.sh")
    os.remove(glove_data)
    os.remove(base_dir + '/code/GloVe-master/vectors.bin')
    if not os.path.exists(base_dir + '/vectors'):
        os.mkdir(base_dir + '/vectors')
    shutil.move(base_dir + '/code/GloVe-master/vectors.txt',
                base_dir + '/vectors/glove_vectors.txt')
    shutil.move(base_dir + '/code/GloVe-master/vocab.txt',
                base_dir + '/vocab.txt')
    word2vec.word2vec(data_path_,
                      base_dir + '/vectors/word2vec_vectors.txt',
                      size=300,
                      verbose=True,
                      binary=0,
                      min_count=5)

def train_vector(input_set, vector_size, min_count, words_path, bin_path):
    f = open(words_path, 'w')
    for i in input_set:
        f.write(i)
        f.write(' ')
    f.close()
    word2vec.word2vec(words_path, bin_path, size=vector_size, min_count=min_count, verbose=True)

def word_vector(self):
    print('%.2f: start converting word vectors' % (time.time() - now))
    word2vec.word2vec(self.word_file, self.bin_file, binary=1, verbose=False)
    print('%.2f: word vector conversion finished' % (time.time() - now))

def train_word2vec(filepath='./data/hp_allphrase.txt', size=100, window=5, neg=5):
    # Turn the training data into a better input for word2vec
    # word2vec.word2phrase('./data/hp_all.txt', './data/hp_allphrase.txt', verbose=True)

    # Train model
    word2vec.word2vec(filepath, './model/hp.bin', size=size, window=window, negative=neg, verbose=False)

def train():
    word2vec.word2phrase('all.txt', 'phrase.txt', verbose=True)
    word2vec.word2vec('phrase.txt', 'vec.bin', min_count=50, size=50, verbose=False)

def create_model(input_path, output_path):
    word2vec.word2vec(input_path, output_path, size=10, binary=1, verbose=True)
    assert os.path.isfile(output_path)
    # return word2vec.load(output_path)
    return word2vec.WordVectors.from_binary(output_path, encoding='ISO-8859-1')

def get_word_vec(self, file_in, size):
    """
    Args:
        file_in (string): path to the input text file
        size (int): size of the word embeddings

    The trained model is stored in self.file_word2vec_bin.
    """
    word2vec.word2vec(file_in, self.file_word2vec_bin, size, verbose=False)

def generate_model(_size=150):
    model_file = 'model90.bin'
    word2vec.word2vec('undersore.txt', model_file, size=_size, min_count=10, verbose=True)
    print('Model generated: ' + model_file)

def train_model(self):
    word2vec(self.src_file, self.model_file,
             window=self.window,
             hs=self.hs,
             alpha=self.alpha,
             size=self.size,
             verbose=self.verbose)

def main():
    if '--download-nltk' in argv:
        nltk.download('punkt')
        nltk.download('maxent_treebank_pos_tagger')
        nltk.download('averaged_perceptron_tagger')
        nltk.download('brown')

    if not isfile('wordvec.bin') or '--train' in argv:
        print("\nwords to phrases...")
        wv.word2phrase('./HarryPotter/HarryPotter.txt', 'phrase', verbose=1)
        print("\nphrases to vectors...")
        wv.word2vec('phrase', 'wordvec.bin', size=50, verbose=1)
        print("")

    print("\nload model...")
    model = wv.load('wordvec.bin')
    print("model shape: " + repr(model.vectors.shape))

    X, Y = [], []
    if '--load-vector' in argv:
        if isfile('X.npy') and isfile('Y.npy'):
            X = np.load('X.npy')
            Y = np.load('Y.npy')
        else:
            print("can't load X.npy, Y.npy")
            return
    else:
        print("TSNE...")
        tsne = TSNE(n_components=2, learning_rate=10, random_state=0)
        vectors = tsne.fit_transform(X=model.vectors[:SIZE, :])
        X = vectors[:, 0]
        Y = vectors[:, 1]

    print("start plot...(using nltk.corpus.brown)")
    brown_tagged_sents = brown.tagged_sents(categories='news')
    unigram_tagger = nltk.UnigramTagger(brown_tagged_sents)
    words = unigram_tagger.tag(model.vocab[:SIZE])
    texts = []
    plt.figure(figsize=(12, 8))
    for x, y, word in zip(X, Y, words):
        print("word: (%s, %s)" % (word[0], word[1]), end="")
        if filter_words(word[0], word[1]):
            print("\r\t\t\t\tplot")
            plt.plot(x, y, 'o')
            texts.append(plt.text(x, y, word[0], fontsize=8))
        else:
            print("\r\t\t\t\tignore")
    adjust_text(texts, force_text=1, arrowprops=dict(arrowstyle="-", color="k", lw=1))
    plt.savefig("wordvec.png", dpi=100)
    plt.show()

def pre_train_word_embedding():
    word2vec.word2vec('./data2/word2vec_corpus.txt',
                      './data2/word_embedding.bin',
                      size=200,
                      window=10,
                      sample='1e-5',
                      cbow=0,
                      save_vocab='./data2/worddict',
                      min_count=6)

def checkForSemanticIndex(carrel):

    # configure
    MODEL = 'reader.bin'
    TXT = 'model.txt'
    PHRASES = 'model.phrases'

    # require
    from pathlib import Path
    from word2vec import word2vec, word2phrase
    import os

    # initialize
    localLibrary = configuration('localLibrary')
    model = localLibrary / carrel / ETC / MODEL

    # see if we have been here previously
    if not model.exists():

        # initialize some more
        stopwords = localLibrary / carrel / ETC / STOPWORDS
        corpus = localLibrary / carrel / ETC / CORPUS
        txt = str(Path.home() / TXT)
        phrases = str(Path.home() / PHRASES)

        # tokenize
        click.echo('Indexing. This needs to be done only once.', err=True)
        click.echo('Step #1 of 6: Tokenizing corpus...', err=True)
        tokens = open(corpus).read().split()

        # normalize
        click.echo('Step #2 of 6: Normalizing tokens...', err=True)
        tokens = [token.lower() for token in tokens if token.isalpha()]

        # remove stop words
        click.echo('Step #3 of 6: Removing stop words...', err=True)
        stopwords = open(stopwords).read().split()
        tokens = [token for token in tokens if token not in stopwords]

        # save
        click.echo('Step #4 of 6: Saving tokens...', err=True)
        with open(txt, 'w') as handle:
            handle.write(' '.join(tokens))

        # create phrases
        click.echo('Step #5 of 6: Creating phrases...', err=True)
        word2phrase(txt, phrases, verbose=True)

        # do the work
        click.echo('Step #6 of 6: Indexing...', err=True)
        word2vec(phrases, str(model), size=100, binary=True, verbose=True)

        # clean up and done
        os.remove(txt)
        os.remove(phrases)
        click.echo('\nDone. Happy searching!', err=True)

def __init__(self, originData=None, w2vModelPath="vectors.w2v", vectorSize=100):
    self.__model = None
    self.__vectorSize = vectorSize
    if type(originData) is str:
        word2vec.word2vec(originData, w2vModelPath, size=vectorSize, verbose=True)
    self.__model = word2vec.load(w2vModelPath)

def w2v_train():
    print '.....train word2vec start at ', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
    word2vec.word2vec(corpus_file, model_file, size=300, verbose=True, threads=30)
    print '.....finish training word2vec end at ', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))

def word_training(path, embedded_size):
    dirname = os.path.dirname(path)
    filename = os.path.basename(path)
    phrasesname = os.path.join(dirname, '{}-phrases'.format(filename))
    modelname = os.path.join(dirname, '{}.bin'.format(filename))
    print('Training...')
    word2vec.word2phrase(path, phrasesname)
    word2vec.word2vec(phrasesname, modelname, size=embedded_size)
    print('Training Done!!!')
    return modelname

def extract(dim, data, trained):
    if not trained:
        word2vec.word2phrase(data, data + '-phrases', verbose=True)
        word2vec.word2vec(data + '-phrases', data + '.bin', size=dim, verbose=True)
    model = word2vec.load(data + '.bin')
    keys = model.vocab
    features = model.vectors
    dic = dict(zip(keys, features))
    print(len(dic))
    return dic

def train(self):
    if not os.path.isfile(self.trained_fname):
        print("Previous training '" + self.trained_fname + "' not found. Begin training on input '" +
              self.input_fname + "' into " + str(self.train_dimensions) + " dimensions ...")
        self.trained_fname = 'src/resources/output' + str(self.train_dimensions)
        word2vec.word2vec(self.input_fname, self.trained_fname, size=self.train_dimensions)
    else:
        print("Trained data seems to exist at '" + self.trained_fname + "'")
    print("Loading training results...")
    self.model = word2vec.load(self.trained_fname, kind='bin')

def create_word2vec_model(save_text_file):
    '''run word2vec on the text corpus and create a model'''
    save_phrases = save_text_file + '_phrases'
    save_model = save_text_file + '.bin'
    save_cluster = save_text_file + '-cluster.txt'
    # create phrases for processing
    word2vec.word2phrase(save_text_file, save_phrases, verbose=True)
    # create model
    word2vec.word2vec(save_phrases, save_model, size=100, verbose=True)
    # create cluster
    word2vec.word2clusters(save_text_file, save_cluster, 100, verbose=True)

def test_verbose():
    saved_stdout = sys.stdout
    try:
        sys.stdout = io.StringIO()
        word2vec.word2vec(input_, output_bin, size=10, binary=1, verbose=True)
        output = sys.stdout.getvalue()
        assert "b'" not in output
        assert "Starting training" in output
        assert "\\r" not in output
        assert "\r" in output
    finally:
        sys.stdout = saved_stdout

def w2v_bin(general_bin_file_path, general_corpus_file_path, corpus_name):
    """
    :param general_bin_file_path:
    :param general_corpus_file_path:
    :param corpus_name:
    :return:
    """
    # combine all files into one corpus
    text_file_path = ''.join((general_bin_file_path, corpus_name, '.text'))
    corpus_path = ''.join((general_corpus_file_path, corpus_name, '\\'))
    # create .text file for word2vec
    concatenate_files(corpus_path, text_file_path)
    # create word2vec .bin file
    word2vec_bin_path = ''.join((general_bin_file_path, corpus_name, '.bin'))
    word2vec.word2vec(text_file_path, word2vec_bin_path, size=200, verbose=True)  # size of word vectors

def testWord2Vec():
    # Train the model using the word2phrase output.
    # That generates a text8-phrases.bin file containing the word vectors in a binary format.
    word2vec.word2vec('/D/test/text8/text8-phrases.txt', '/D/test/text8/text8-phrases.bin', size=100, verbose=True)
    # The .bin file layout:
    # the first line stores vocab_size and vector_size, read as: vocab_size, vector_size = list(map(int, header.split()))
    # the remaining lines store the word vectors

    # Predictions
    model = word2vec.load('/D/test/text8/text8-phrases.bin')
    # take a look at the vocabulary as a numpy array
    print model.vocab  # vocabulary
    # or take a look at the whole matrix
    print model.vectors.shape  # word vectors
    print model.vectors
    # retrieve the vector of individual words
    print model['dog'].shape
    print model['dog'][:10]
    # We can do simple queries to retrieve words similar to "socks" based on cosine similarity:
    indexes, metrics = model.cosine('socks')
    # It is possible to get the words for those indexes
    print model.vocab[indexes]
    # There is a helper function to create a combined response: a numpy record array
    print model.generate_response(indexes, metrics).tolist()
    # Since we trained the model with the output of word2phrase we can ask for the similarity of "phrases"
    indexes, metrics = model.cosine('los_angeles')  # word indexes and cosine similarities
    print model.generate_response(indexes, metrics).tolist()  # words and cosine similarities
    # It is possible to do more complex queries like analogies, such as: king - man + woman = queen.
    # Like cosine, this method returns the indexes of the words in the vocab and the metric.
    indexes, metrics = model.analogy(pos=['king', 'woman'], neg=['man'], n=10)
    print model.generate_response(indexes, metrics).tolist()

def setup_model(input, output, binary=1, cbow=0, size=300, window=10, negative=5,
                hs=0, threads=12, iter_=5, min_count=5, verbose=False):
    """ setup default value here for word2vec parameters """
    return word2vec.word2vec(input, output, binary=binary, cbow=cbow, size=size,
                             window=window, negative=negative, hs=hs, threads=threads,
                             iter_=iter_, min_count=min_count, verbose=verbose)

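# Hedged usage sketch for setup_model above; the corpus and output paths are
# illustrative assumptions, not taken from the original source.
if __name__ == '__main__':
    setup_model('corpus.txt', 'corpus.bin', size=300, threads=4, verbose=True)
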
def training():
    '''
    Training uses the file '../tmp/book-seg.txt'.
    Make sure that you have run the function 'text_seg' to do segmentation first.
    '''
    word2vec.word2vec('../tmp/book-seg.txt', '../tmp/book.bin', size=300, verbose=True)

import word2vec
import sys
import getpass

user = getpass.getuser()
if user == 'ctnuser':
    root = '/home/ctnuser/bjkomer/'
elif user == 'bjkomer':
    root = '/home/bjkomer/'

if len(sys.argv) == 2:
    dim = int(sys.argv[1])
else:
    dim = 100

word2vec.word2phrase(root + 'word2vec/text8',
                     root + 'semantic-network/data/text8-phrases',
                     verbose=True)
word2vec.word2vec(root + 'semantic-network/data/text8-phrases',
                  root + 'semantic-network/data/text8-%s.bin' % dim,
                  size=dim, verbose=True)
word2vec.word2clusters(root + 'word2vec/text8',
                       root + 'semantic-network/data/text8-%s-clusters.txt' % dim,
                       dim, verbose=True)

def wordvec():
    # words seen fewer than min_count times are discarded
    word2vec.word2vec(r'D:\nlp\corpora\segs.txt', 'vectors.bin',
                      size=100, window=10, sample='1e-3', hs=1,
                      negative=0, threads=12, iter_=5, min_count=10,
                      binary=1, cbow=0, verbose=True)

# -*- coding: utf-8 -*-
"""
Created on Sat Jan 30 21:17:57 2016

@author: dudu
"""
import word2vec

if __name__ == '__main__':
    path = '/home/dudu/hack_cambridge/all.txt'
    out_path = '/home/dudu/hack_cambridge/cambridge/word2vec_model.bin'
    word2vec.word2vec(path, out_path, size=10, verbose=True)

# -*- coding: utf-8 -*-
import word2vec

if __name__ == '__main__':
    word2vec.word2vec("raw_text", "text8.bin", size=100, verbose=True)

sentences = sent_tokenize(textbooks.decode('utf8'))
print 'sentence tokenization finished'

count = 0
outLines = list()
for s in sentences:
    count = count + 1
    if count % 10000 == 0:
        print count
    tokens = word_tokenize(s)
    if len(tokens) < 3:
        continue
    outLines.append(str.join(' ', tokens))
print 'word tokenization finished'

outFile.write((str.join('\n', outLines)).encode('utf8'))
textbooksFile.close()
outFile.close()

#
# word2vec.word2phrase(
#     'data/books/textbooks.txt', 'data/books/phrases', verbose=True)

print 'starting word2vec'
word2vec.word2vec(
    'data/allTokenized.txt', 'data/model_allTokenized.bin', verbose=True,
    min_count=5, threads=4, size=300, window=9, iter_=10)
print 'finish'

def preprocess():
    wakati_text_file = config.get("word2vec", "wakati.file.path")
    word2vec.word2vec(wakati_text_file, wakati_bin_file, size=300, verbose=True)

def preprocess():
    word2vec.word2vec('wakati_text.txt', 'wakati_text.bin', size=300, verbose=True)

np.random.seed(10)
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]

# Split train/test set
# TODO: This is very crude, should use cross-validation
x_train, x_dev = x_shuffled[:-1000], x_shuffled[-1000:]
y_train, y_dev = y_shuffled[:-1000], y_shuffled[-1000:]
vocab_size = len(vocabulary)
print("Vocabulary Size: {:d}".format(vocab_size))
print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev)))

data = x_train.flatten()
iterations = 1000
data = data[data != 468]
w2v = word2vec.word2vec(data, vocabulary, vocabulary_inv, vocab_size, iterations)
final_embeddings = w2v.runWord2Vec()
accuracies = []

# Training
# ==================================================
with tf.Graph().as_default():
    session_conf = tf.ConfigProto(
        allow_soft_placement=FLAGS.allow_soft_placement,
        log_device_placement=FLAGS.log_device_placement)
    sess = tf.Session(config=session_conf)
    with sess.as_default():
        cnn = TextCNN(
            sequence_length=x_train.shape[1],
            num_classes=2,

# step 1: install word2vec (ref: http://nbviewer.jupyter.org/github/danielfrg/word2vec/blob/master/examples/word2vec.ipynb)
import word2vec
import numpy as np
import scipy.io as sio

vector_size = 100
amount_nearest = 100

word2vec.word2phrase('text8', 'text8-phrases', verbose=True)
word2vec.word2vec('text8-phrases', 'text8.bin', size=vector_size, verbose=True)
word2vec.word2clusters('text8', 'text8-clusters.txt', vector_size, verbose=True)

# read the trained model
model = word2vec.load('text8.bin')

# list of loosely related seed words brainstormed around the topic:
# potential problems for enterprises
motivation = ['enterprise',
              'business',
              'solution',
              'entrepreneur',
              'latent',
              'problem',
              'funds',
              'management',
              'quality',
              'projects']

# find the nearest clusters by picking words similar to the seeds
amount_motivation = len(motivation)
motivation_vector = []
nearest_indexes = []

    features[7] = len(sentence1) / len(sentence2)

    return features

# Uses treetagger-python (Installation: https://github.com/miotto/treetagger-python ;
# http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/)
try:
    semanticsimilarity_lookuptable = pickle.load(open('semanticsimilarity_lookuptable.pkl', 'rb'))
except Exception:
    semanticsimilarity_lookuptable = {}

print "Build Word2Vec Corpus"
dir = os.path.dirname(os.path.abspath(__file__))
try:
    # on OSX, for some reason, this does not work
    word2vec.word2phrase(dir + '/text8', dir + '/text8-phrases', verbose=True)
    word2vec.word2vec(dir + '/text8-phrases', dir + '/text8.bin', size=100, verbose=True)
except Exception as e:
    print e

model = word2vec.load(dir + '/text8.bin')
print "Finish"


def computeSemantics(sentence1, sentence2):
    def computeSemanticSimilarityFeatures(sentence1, sentence2):
        features = [0] * 9
        if (sentence1 + sentence2) not in semanticsimilarity_lookuptable:
            def prepareSentence(sentence):
                return sentence.replace('-', ' ').replace('$', ' ')

            tt = TreeTagger(language='english')

import word2vec

word2vec.word2phrase('text8', 'text8-phrases', verbose=True)
word2vec.word2vec('text8-phrases', 'text8.bin', size=100, verbose=True)
# word2vec.word2clusters('text8', 'text8-clusters.txt', 10, verbose=True)
# word2vec.word2phrase('enwik9', 'enwik9-phrases', verbose=True)
# word2vec.word2vec('enwik9-phrases', 'enwik9.bin', size=100, verbose=True)

def setup_module(module):
    word2vec.word2phrase(input_, output_phrases, verbose=False)
    word2vec.word2vec(input_, output_bin, size=10, binary=1, verbose=False)
    word2vec.word2vec(input_, output_txt, size=10, binary=0, verbose=False)
    word2vec.word2clusters(input_, output_clusters, 10, verbose=True)

# -*- coding: utf-8 -*-
import plyvel
import re, string
import sys, locale
import word2vec
import os

reload(sys)
sys.setdefaultencoding(locale.getdefaultlocale()[1])

model_path = os.path.abspath('model.bin')
text_path = os.path.abspath('text.txt')
phrase_path = os.path.abspath('phrases.txt')

word2vec.word2phrase(text_path, phrase_path, verbose=True)
word2vec.word2vec(phrase_path, model_path, binary=1, verbose=True)

model = word2vec.load(model_path)
indexes, metrics = model.cosine('seymour')
print (string.join(model.vocab[indexes], ' '))

print args

# read the training parameters for each run from a JSON file
with open(args.parameter_file, 'r') as input:
    runs = json.load(input)

# print runs
for run_code in runs:
    print "Processing %s" % run_code
    output = "dewiki-" + run_code + ".bin"
    word2vec.word2vec(args.text_file, output=output, verbose=True, **runs[run_code])
    print "Successfully finished processing %s" % run_code

import word2vec
from timeit import default_timer as timer

start = timer()
word2vec.word2vec('data-test.txt', 'vectors-model.bin', cbow=0, size=100, window=10,
                  negative=5, hs=0, sample='1e-4', threads=8, iter_=20, min_count=1,
                  verbose=True)
end = timer()
print('Model generated in %f seconds' % (end - start))

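# Hedged follow-up sketch (an assumption, not part of the original script): a quick
# sanity check that the timed run above produced a loadable model, reusing the
# word2vec import from the script.
model = word2vec.load('vectors-model.bin')
print('vocabulary size: %d, vector size: %d' % model.vectors.shape)
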
# train['link_pred'] = (train.temp2 >= 1) | (train.temp1 >= train.temp1.quantile(0.7))
# accuracy = (train.link_pred == train.link.astype(bool)).mean()
# print 'Accuracy is {acc}'.format(acc=accuracy)

## Try word2vec train
import word2vec
from sklearn.metrics.pairwise import cosine_similarity as cosine

# Create txt file from node_info
all_abst_file_name = 'all_abstracts.txt'
all_phrases_file_name = 'all_abstracts_phrases.txt'
word2vec_out_file_name = 'all_abstracts.bin'

with open(pth(all_abst_file_name), 'w') as f:
    for abstract in node_info.abstract.as_matrix():
        f.write(abstract + '\n')

word2vec.word2phrase(pth(all_abst_file_name), pth(all_phrases_file_name), verbose=True)
word2vec.word2vec(pth(all_phrases_file_name), pth(word2vec_out_file_name),
                  size=30, iter_=3, verbose=True)

model = word2vec.load(pth(word2vec_out_file_name))
indexes, metrics = model.cosine('applications', 20)
indexes, metrics = model.analogy(pos=['theorem', 'false'], neg=['true'], n=10)
model.vocab[indexes]

import os
import sys
sys.path.append(os.path.abspath(__file__ + "/../../"))

import pandas as pd
import word2vec as w2v

# w2v.word2phrase('text8.txt', 'text8-phrases', verbose=True)
w2v.word2vec('training_text.txt', 'training_text_clean.bin', size=100, verbose=True)
# w2v.word2clusters('/Users/Henrik/Downloads/test/text8', '/Users/Henrik/Downloads/test/text8-clusters.txt', 100, verbose=True)