def main():
    word2vec.word2phrase('./text8', './text8-phrases', verbose=True)
    word2vec.word2vec('./text8-phrases', './text8.bin', size=100, verbose=True)
    word2vec.word2clusters('./text8', './text8-clusters.txt', 100, verbose=True)
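# Follow-up sketch (not part of the snippet above): reading back the files that
# main() writes, using the same danielfrg/word2vec API; the paths are assumed
# from the calls above.
import word2vec

model = word2vec.load('./text8.bin')                       # binary word vectors
clusters = word2vec.load_clusters('./text8-clusters.txt')  # word -> cluster id
print(model.vocab.shape)  # vocabulary as a numpy array
print(clusters['dog'])    # cluster id assigned to 'dog'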
import io
import json
import os
from string import punctuation, whitespace

import word2vec


def word_clusters(
    corpora, size=100, verbose=True, text='text.txt',
    phrases='phrases.txt', binary='text.bin', clusters='clusters.txt'
):
    """Produce word2vec word clusters."""
    words = []
    for corpus in corpora:
        for document in corpus.documents:
            for sentence in document.sentences:
                for word in sentence.words:
                    words.append(word.lower().strip(punctuation + whitespace))
    with io.open(text, mode='w', encoding='utf-8') as file:
        file.write(u' '.join(words))
    word2vec.word2phrase(text, phrases, verbose=verbose)
    word2vec.word2vec(phrases, binary, size=size, verbose=verbose)
    word2vec.word2clusters(text, clusters, size, verbose=verbose)
    # str.rstrip('.txt') strips a character set, not a suffix; use splitext instead.
    json_clusters = os.path.splitext(clusters)[0] + '.json'
    with io.open(clusters, mode='r', encoding='utf-8') as file:
        d = dict(
            (w, int(c)) for w, c in
            (line.split() for line in file.read().splitlines())
        )
    with io.open(json_clusters, mode='w', encoding='utf-8') as file:
        json.dump(d, file, indent=4, ensure_ascii=False)
    return d
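# The clusters -> JSON step above can be exercised on its own; a minimal sketch
# using a hypothetical hand-written clusters file in the same "word cluster-id"
# per-line format that word2clusters emits.
import io
import json
import os

with io.open('demo-clusters.txt', mode='w', encoding='utf-8') as f:
    f.write(u'dog 7\ncat 7\nfish 12\n')

with io.open('demo-clusters.txt', mode='r', encoding='utf-8') as f:
    d = dict((w, int(c)) for w, c in (line.split() for line in f.read().splitlines()))

with io.open(os.path.splitext('demo-clusters.txt')[0] + '.json', mode='w', encoding='utf-8') as f:
    json.dump(d, f, indent=4, ensure_ascii=False)

print(d)  # {'dog': 7, 'cat': 7, 'fish': 12}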
def __preprocess(self, _model_dir, _training, _seg_file, _cluster_file):
    if not os.path.exists(_model_dir):
        os.mkdir(_model_dir)
    if not os.path.exists(_seg_file):
        fin = open(_training, "r")
        fout = open(_seg_file, "w")
        pattern = re.compile(r"(?:([a-zA-Z]+))")
        counter = 0
        for line in fin:
            if counter % 5000 == 0:
                fout.flush()
                print("preprocessed: %d" % counter)
            counter += 1
            line = line.strip("\r\n")
            matches = pattern.finditer(line)
            results = []
            for match in matches:
                if len(match.group()) > 0:
                    results.extend(word_seg.segment(match.group()))
            if len(results) > 0:
                fout.write("%s\n" % " ".join(results))
        print("preprocess done! %d" % counter)
        fin.close()
        fout.flush()
        fout.close()
    # eng_parse.segmentation(_training, _seg_file, separation=self.__separation,
    #                        window=self.__window_size - 1, special_chr=self.__special_chr)
    if not os.path.exists(_cluster_file):
        wv.word2clusters(train=_seg_file, output=_cluster_file,
                         classes=self.__class_number,
                         min_count=self.__min_count,
                         window=self.__window_size)
def testWord2Cluter():
    """Cluster"""
    # Cluster the vectors based on the trained model. This creates
    # text8-clusters with the cluster for every word in the vocabulary.
    word2vec.word2clusters('/D/test/text8/text8', '/D/test/text8/text8-clusters',
                           100, verbose=True)
    # Load the clusters file written just above.
    clusters = word2vec.load_clusters('/D/test/text8/text8-clusters')
    print(clusters['dog'])
    print(clusters.get_words_on_cluster(90).shape)
    print(clusters.get_words_on_cluster(90)[:10])
import word2vec as wv


def word2vec(_seg_file, _cluster_file, _classes, _size, _window, _min_count, _cbow):
    """
    :param _seg_file: path to the segmented training text
    :param _cluster_file: path for the cluster output file
    :param _classes: number of clusters
    :param _size: word vector dimensionality
    :param _window: context window size
    :param _min_count: minimum word frequency
    :param _cbow: cbow = 1 means the CBOW strategy is used
    :return:
    """
    # Pass the trailing options as keywords: positionally, the argument after
    # `window` in word2clusters is `sample`, not `min_count`.
    wv.word2clusters(_seg_file, _cluster_file, _classes, size=_size,
                     window=_window, min_count=_min_count, cbow=_cbow)
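# Hypothetical invocation of the wrapper above; the file names and values are
# placeholders, and seg.txt is assumed to already contain space-separated tokens.
word2vec('seg.txt', 'clusters.txt', 100, 128, 5, 5, 1)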
def train_model(docfile_root="corpora/Thaliana/documents-processed"):
    print("phrases...")
    word2vec.word2phrase(docfile_root + ".txt", docfile_root + "-phrases.txt", verbose=True)
    # print("word2vec")
    # word2vec.word2vec(docfile_root + "-phrases.txt", docfile_root + ".bin",
    #                   size=1000, verbose=True, min_count=1)
    print("word2cluster")
    word2vec.word2clusters(docfile_root + ".txt", docfile_root + '-clusters.txt',
                           10000, verbose=True, min_count=1, threads=4)
def main(): """Main method.""" k = 35 # write ground truth vocabulary to gt_input.txt and get ground truth # dictionary ldict = aggregate_input_and_ground_truths() logging.info("Done generating ldict and ground truth text file.") # if file containing clusters hasn't already been created, create it if not os.path.isfile("./clusters.txt"): preprocess() # train word2vec and cluster output from the full vocab word2vec.word2clusters("./text8-phrases-extra", "./clusters.txt", k, verbose=True, min_count=1) logging.info("Done training.") logging.info("Done creating clusters.") # load clusters clusters = word2vec.load_clusters("./clusters.txt") # build cluster dictionary from full vocabulary cdict = {} for i in range(0, k): for word in clusters.get_words_on_cluster(i): cdict[word] = set([i]) logging.info("Done generating cdict.") # trim cluster dictionary down to only keys included in ground truths trimmed_cdict = {} for key in ldict.keys(): try: trimmed_cdict[key] = cdict[key] except: pass logging.info("done trimming cdict; begining scoring\n") # compute bcubed score precision = bcubed.precision(trimmed_cdict, ldict) recall = bcubed.recall(trimmed_cdict, ldict) fscore = bcubed.fscore(precision, recall) print "precision: {p}, \t recall: {r}, \t fscore: {f}".format(p=precision, r=recall, f=fscore) logging.info("done scoring\n")
def create_word2vec_model(save_text_file):
    '''run word2vec on the text corpus and create a model'''
    save_phrases = save_text_file + '_phrases'
    save_model = save_text_file + '.bin'
    save_cluster = save_text_file + '-cluster.txt'
    # create phrases for processing
    word2vec.word2phrase(save_text_file, save_phrases, verbose=True)
    # create model
    word2vec.word2vec(save_phrases, save_model, size=100, verbose=True)
    # create cluster
    word2vec.word2clusters(save_text_file, save_cluster, 100, verbose=True)
def files_10():
    count = 1
    while count < 11:
        for i in os.listdir(directory):
            if i == 'text' + str(count):
                text = "text" + str(count) + "-adapted"
                vec = "text" + str(count) + "-vec.bin"
                cluster = "text" + str(count) + "-clusters.txt"
                result = "text" + str(count) + "-result.txt"
                stopwordsss = ["in", "it", "as", "my", "do", "is", "don't", "doesn't",
                               "am", "it's", "i", "you", "and", "to", "the", "on", "but",
                               "that", "are", "so", "to", "me", "of", "with", "try", 'a',
                               'about', 'after', 'all', 'also', 'always', 'am', 'an',
                               'and', 'any', 'are', 'at', 'be', 'been', 'being', 'but',
                               'by', 'came', 'can', "can't", 'come', 'could', 'did',
                               'do', 'does', 'doing', 'else', 'for', 'from', 'get',
                               'give', 'goes', 'going', 'had', 'happen', 'has', 'have',
                               'having', 'how', 'in', 'into', 'really', 'if', 'see',
                               'plus', 'then', "i'll", "then", "or", "will", "i'm",
                               "too", "doesn't", "don't", "will", "that's", "-", "i've",
                               "would", "making", "usually", "what", "hasn't", "it's",
                               "hmmm", "really", "this", "someone", "not", "i'll",
                               "like", "this", "e", "=", "just", "more", "actually",
                               "most", "one", ":", "very", "b", "yes", "same"]
                word2vec.word2phrase(i, text, verbose=True)
                # model = word2vec.load(vec)
                reading = (open(text).read().lower()
                           .replace(',', ' ').replace('.', ' ').replace('/', ' ')
                           .replace('-', ' ').replace('(', ' ').replace(')', ' ')
                           .replace('?', ' ').split())
                # drop stop words (the per-word filter loop collapsed into one pass)
                reading = [w for w in reading if w not in stopwordsss]
                print(reading)
                with open(text, "w") as result_file:
                    result_file.write(' '.join(reading))
                word2vec.word2vec(text, vec, 100, verbose=True)
                # word2clusters expects the training text, not the binary vector file
                word2vec.word2clusters(text, cluster, 100, verbose=True)
                # train the gensim model once, not once per word
                word2vec_model = Word2Vec([reading], min_count=1)
                vocabulary = word2vec_model.wv.vocab
                with open(result, "w") as result_file:
                    for word in reading:
                        sim_words = word2vec_model.wv.most_similar(word)[:3]
                        result_file.write(str(word) + '\n' + str(sim_words) + '\n\n')
                count += 1
def test_word2vec_main():
    """
    Test the Word2Vec model.
    """
    # initialize parameters
    root_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    # path to the training data
    train_data_file = os.path.join(root_path, r"data/train_data.txt")
    # path where the model is saved
    type_vec_file = os.path.join(root_path, r"data/type_vec.txt")
    # path where the vector clustering result is stored, i.e. the model path
    out_put_clusters = os.path.join(root_path, r"data/output_clusters.txt")

    # 1. generate the training data file
    # 2. generate a file containing the word vectors (binary=0 writes text format)
    word2vec.word2vec(train_data_file, type_vec_file, size=128, window=5,
                      sample='1e-3', hs=0, negative=5, iter_=100, min_count=1,
                      binary=0, cbow=0, verbose=True)
    # 3. cluster the vectors based on the trained model; this creates a txt file
    #    with a cluster assignment for every word in the vocabulary
    word2vec.word2clusters(train_data_file, out_put_clusters, 100, verbose=True)
    # 4. load the word2vec file created above
    model = word2vec.load(type_vec_file)
    # 5. inspect the model data
    print("=========Word2Vec=========")
    # view the vocabulary as a numpy array
    print(model.vocab)
with open("text8") as myfile: firstNlines=myfile.readlines()[0:5] #put here the interval you want word2vec.word2phrase('text8', 'text8-phrases', verbose=True) # ## Train word2vec model -> create word vectors in binary format word2vec.word2vec('text8-phrases', 'text8.bin', size=100, verbose=True) # ## Create vector clusters based on trained model word2vec.word2clusters('text8', 'text8-clusters.txt', 100, verbose=True) # ## Predictions model = word2vec.load('text8.bin') model.vocab model.vectors.shape #retrieve vector of individual words model['dog'].shape
import sys
sys.path.append('danielfrg')
import os
os.environ['PATH'] += ':mikolov'
import word2vec

word2vec.word2clusters('../corpus/mnsz2_only/press.clean.txt',
                       'tmp/press.clusters.txt', 100, verbose=True)
def train_clusters():
    word2vec.word2clusters("word2phrase.bin", "word2vec.bin", 1000, verbose=True)
# coding: utf-8
import word2vec
import gensim

word2vec.word2phrase('./refined_text.txt', './wiki-phrase', verbose=True)
word2vec.word2vec('./wiki-phrase', './word2vec_model.bin', size=100, verbose=True)
word2vec.word2clusters(
    '/Users/KYD/Documents/wiki_project/refined_text.txt',
    '/Users/KYD/Documents/wiki_project/refined_text_cluster.txt',
    100, verbose=True)
import word2vec

word2vec.word2phrase('/home/guohf/AI_tutorial/ch8/data/text8',
                     '/home/guohf/AI_tutorial/ch8/data/text8-phrases', verbose=True)
word2vec.word2vec('/home/guohf/AI_tutorial/ch8/data/text8-phrases',
                  '/home/guohf/AI_tutorial/ch8/data/text8.bin', size=100, verbose=True)
word2vec.word2clusters('/home/guohf/AI_tutorial/ch8/data/text8',
                       '/home/guohf/AI_tutorial/ch8/data/text8-clusters.txt',
                       100, verbose=True)
model = word2vec.load('/home/guohf/AI_tutorial/ch8/data/text8.bin')

# take a look at the vocabulary as a numpy array
print(model.vocab)

# take a look at the whole matrix
print(model.vectors.shape)
print(model.vectors)

# retrieve the vector of individual words
print(model['dog'].shape)
print(model['dog'][:10])

# calculate the distance between two or more (all combinations) words
print(model.distance("dog", "cat", "fish"))

# do simple queries to retrieve words similar to "dog" based on cosine similarity
indexes, metrics = model.similar("dog")
import word2vec
import sys
import getpass

user = getpass.getuser()
if user == 'ctnuser':
    root = '/home/ctnuser/bjkomer/'
elif user == 'bjkomer':
    root = '/home/bjkomer/'

if len(sys.argv) == 2:
    dim = int(sys.argv[1])
else:
    dim = 100

word2vec.word2phrase(root + 'word2vec/text8',
                     root + 'semantic-network/data/text8-phrases', verbose=True)
word2vec.word2vec(root + 'semantic-network/data/text8-phrases',
                  root + 'semantic-network/data/text8-%s.bin' % dim,
                  size=dim, verbose=True)
word2vec.word2clusters(root + 'word2vec/text8',
                       root + 'semantic-network/data/text8-%s-clusters.txt' % dim,
                       dim, verbose=True)
import word2vec
import os

path_dataset = os.path.abspath('dataset/text8')
path_clusters = os.path.abspath('dataset/text8.clusters')

word2vec.word2clusters(path_dataset, path_clusters, 100)
clusters = word2vec.load_clusters(path_clusters)
print(clusters)
def setup_module(module):
    word2vec.word2phrase(input_, output_phrases, verbose=False)
    word2vec.word2vec(input_, output_bin, size=10, binary=1, verbose=False)
    word2vec.word2vec(input_, output_txt, size=10, binary=0, verbose=False)
    word2vec.word2clusters(input_, output_clusters, 10, verbose=True)
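# A sketch of the module-level names the pytest setup above assumes; the
# concrete paths here are hypothetical.
import os
import tempfile

data_dir = tempfile.mkdtemp()
input_ = 'text8'  # small training corpus shipped with the tests (assumed)
output_phrases = os.path.join(data_dir, 'test-phrases.txt')
output_bin = os.path.join(data_dir, 'test.bin')
output_txt = os.path.join(data_dir, 'test.txt')
output_clusters = os.path.join(data_dir, 'test-clusters.txt')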
# coding=utf-8
# Copyright 2020 Yedaffon Author.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import word2vec

word2vec.word2phrase("text8", "text8-phrases", verbose=True)
word2vec.word2vec("text8-phrases", "text8.bin", size=100, verbose=True)
word2vec.word2clusters("text8", "text8-clusters.txt", 100, verbose=True)
model = word2vec.load("text8.bin")
print(model.vocab)
indexes, metrics = model.cosine("you")
output = model.generate_response(indexes, metrics).tolist()
print(output)
import word2vec

# Create phrase file
word2vec.word2phrase('material/kjv.txt', 'kjv-phrases', verbose=True)
# Do training
word2vec.word2vec('kjv-phrases', 'kjv.bin', size=100, verbose=True)
# Do clustering
word2vec.word2clusters('material/kjv.txt', 'kjv-clusters.txt', 100, verbose=True)
from gensim.models.keyedvectors import KeyedVectors
import word2vec
import timeit

start_time = timeit.default_timer()
word2vec.word2phrase('3g-p.txt', '3g-phrases.txt', verbose=True)
word2vec.word2vec('3g-phrases.txt', '3g.bin', size=100, verbose=True)
elapsed = timeit.default_timer() - start_time
InMinutes = elapsed / 60
word2vec.word2clusters('3g-p.txt', '3g-clusters.txt', 100, verbose=True)
model = KeyedVectors.load_word2vec_format('3g.bin', binary=True)
model.save_word2vec_format('3g-vectors.txt', binary=False)
print("The Total Execution Time in Minutes is: ", InMinutes)
word2vec.word2phrase('./txt_file/text8', './txt_file/text8-phrases', verbose=True)

""" 2. Train the skip-gram model to obtain the word2vec word-vector representation """
word2vec.word2vec('./txt_file/text8-phrases', './word2vectors/text8.bin',
                  size=100, verbose=True)

""" 3. Applying the word vectors: word clustering. Produces text8-clusters.txt with
the clustering result for every word; the number of clusters is at most the
vocabulary size """
word2vec.word2clusters('./txt_file/text8', './word2vectors/text8-clusters.txt',
                       100, verbose=True)

""" 4. Using the model """
model = word2vec.load('./word2vectors/text8.bin')
print(model.vocab.size)
print(model.vectors[0])
print(model['dog'][:10])
print(model.distance("dog", "cat", "fish"))
indexes, metrics = model.similar("dog")
print(model.vocab[indexes])
print(model.generate_response(indexes, metrics).tolist())
indexes, metrics = model.analogy(pos=['king', 'woman'], neg=['man'])
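# Continuation sketch: analogy() returns (indexes, metrics) just like similar(),
# so the same lookup calls used above turn the result into readable words.
print(model.vocab[indexes])
print(model.generate_response(indexes, metrics).tolist())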
import nltk
import numpy as np
import word2vec
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from adjustText import adjust_text

w2v_size = 1000
k = 750

word2vec.word2phrase('./Book5TheOrderOfThePhoenix/all.txt',
                     './Book5TheOrderOfThePhoenix/all-phrases', verbose=True)
word2vec.word2vec('./Book5TheOrderOfThePhoenix/all-phrases',
                  './Book5TheOrderOfThePhoenix/all.bin', size=w2v_size, verbose=True)
w2v_model = word2vec.load('./Book5TheOrderOfThePhoenix/all.bin')
"""
word2vec.word2clusters('./Book5TheOrderOfThePhoenix/all.txt',
                       './Book5TheOrderOfThePhoenix/all-clusters.txt',
                       w2v_size, verbose=True)
w2v_model.clusters = word2vec.load_clusters('./Book5TheOrderOfThePhoenix/all-clusters.txt')
"""
tsne_model = TSNE(n_components=2, random_state=0)
np.set_printoptions(suppress=True)
vocab_tsne = tsne_model.fit_transform(w2v_model.vectors[:k])
tag_list = [item[1] for item in nltk.pos_tag(w2v_model.vocab[:k])]
selected_vocab = []
selected_tsne = []
punc = ", . : ; ! ? ' ’ ‘ ” “ \" _ - "
def train():
    # wv.word2phrase(VOCAB_PATH, PHRASE_PATH, verbose=True)
    wv.word2vec(VOCAB_PATH, BIN_PATH, size=VEC_SIZE, verbose=True)
    wv.word2clusters(VOCAB_PATH, CLUSTER_PATH, VEC_SIZE, verbose=True)
import word2vec
import nltk
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from adjustText import adjust_text

word2vec.word2phrase('p2/all_text.txt', 'p2/all_phrase', verbose=True)
word2vec.word2vec('p2/all_phrase', 'p2/all_text.bin', size=100, verbose=True)
word2vec.word2clusters('p2/all_text.txt', 'p2/all_clusters.txt', 100, verbose=True)

TRAINED = 800
USED = 80
origin_data = 'p2/all_text.txt'
model = 'p2/all_text.bin'


def plot(used_words, x, y, texts):
    color_array = np.arange(used_words)
    plt.figure(figsize=(15, 8))
    plt.scatter(x, y, c=color_array, linewidths=0)
    text = []
    for xi, yi, txt in zip(x, y, texts):
        text.append(plt.text(xi, yi, txt))
    return text
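# Hypothetical driver for plot(), following the t-SNE pattern used elsewhere in
# these snippets; TRAINED, USED, and model come from the assignments above.
w2v_model = word2vec.load(model)
tsne = TSNE(n_components=2, random_state=0)
coords = tsne.fit_transform(w2v_model.vectors[:TRAINED])

texts = plot(USED, coords[:USED, 0], coords[:USED, 1], w2v_model.vocab[:USED])
adjust_text(texts)
plt.show()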
import word2vec as w2v
import clean_articles
from pprint import pprint
import sys

if len(sys.argv) > 1 and sys.argv[1] in ['-t', '-train']:
    # Add new articles to file
    clean_articles.clean()
    # Train new model (word2vec consumes the phrases file created just above)
    w2v.word2phrase('combined', './text-phrases', verbose=True)
    w2v.word2vec('./text-phrases', 'text.bin', size=100, verbose=True)
    w2v.word2clusters('combined', 'text-clusters.txt', 100, verbose=True)

# Initialize pre-trained model
model_old = w2v.load('text8.bin')
model = w2v.load('text.bin')
clusters = w2v.load_clusters('text-clusters.txt')
model.clusters = clusters

# ind = clusters['Trump']
# print(clusters.get_words_on_cluster(ind))
print(len(model_old.vocab))
print(len(model.vocab))

# King - man + woman : "Man is to King as Woman is to ..."
# Trump - America + Germany
pos = ['Putin', 'America']
neg = ['Russia']
leader = model.analogy(pos, neg)
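# Note: analogy() returns an (indexes, metrics) pair, so `leader` must be
# unpacked before it can be turned into words; a short continuation sketch.
indexes, metrics = leader
print(model.generate_response(indexes, metrics).tolist())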
def train(path):
    word2vec.word2phrase(path, path + '-phrases', verbose=True)
    word2vec.word2vec(path + '-phrases', path + '.bin', size=100, binary=True, verbose=True)
    word2vec.word2clusters(path, path + '.clusters.txt', 100, verbose=True)
    model = word2vec.load(path + '.bin')
    return model
try:
    model_1 = word2vec.Word2Vec.load(model_file)
    # compute the list of words related to a given word
    y2 = model_1.most_similar(u"吸烟", topn=19)  # the 19 most related words
    print(u"The words most related to 吸烟 are:\n")
    for item in y2:
        print("%s: %g" % (item[0], item[1]))
    print("-------------------------------\n")
except Exception:
    print("Exception")
print('word2vec_test end.')

# word2vec_test()
model = word2vec.load('cnword2vec.bin')
word2vec.word2clusters('cuttedwords.txt', 'cncluster.txt', 100, verbose=True)
clusters = word2vec.load_clusters('cncluster.txt')
print(clusters)
# clusters.vocab
# print(clusters.get_words_on_cluster(90)[:10])
# model.clusters = clusters
# indexes, metrics = model.analogy(pos=["吸", "戒烟"], neg=["抽"])
# print(model.generate_response(indexes, metrics).tolist())
# In[ ]:
word2vec.word2phrase('data/names.txt', 'data/names-phrases.txt', verbose=True)

# In[ ]:
word2vec.word2vec('data/names.txt', 'data/names-model.bin', size=100, verbose=True)

# In[ ]:
word2vec.word2clusters('data/names.txt', 'data/names-clusters.txt', 100, verbose=True)

# In[ ]:
model = word2vec.load('data/names-model.bin')

# In[ ]:
clusters = word2vec.load_clusters('data/names-clusters.txt')

# In[ ]:
model.vocab
# step 1: install word2vec
# (ref: http://nbviewer.jupyter.org/github/danielfrg/word2vec/blob/master/examples/word2vec.ipynb)
import word2vec
import numpy as np
import scipy.io as sio

vector_size = 100
amount_nearest = 100

word2vec.word2phrase('text8', 'text8-phrases', verbose=True)
word2vec.word2vec('text8-phrases', 'text8.bin', size=vector_size, verbose=True)
word2vec.word2clusters('text8', 'text8-clusters.txt', vector_size, verbose=True)

# read the trained model
model = word2vec.load('text8.bin')

# list of vague motivation words that come to mind
# (topic: potential problems for enterprises)
motivation = ['enterprise',
              'business',
              'solution',
              'entrepreneur',
              'latent',
              'problem',
              'funds',
              'management',
              'quality',
              'projects']

# start getting the nearest clusters by picking similar words
amount_motivation = len(motivation)
motivation_vector = []
nearest_indexes = []
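# A sketch of how the loop might continue: collect each motivation word's vector
# and its nearest neighbours via model.similar (its `n` argument, assumed here,
# controls how many neighbours are returned); the in-vocabulary check is an
# added safeguard.
for word in motivation:
    if word in model.vocab:
        motivation_vector.append(model[word])
        indexes, metrics = model.similar(word, n=amount_nearest)
        nearest_indexes.append(indexes)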
    line = line.split()[1:]
    for term in line:
        try:
            if stop.stop3(term.split('/')[1]):
                continue
        except:
            print(count)
            print('ERROR')
            print(term)
            # quit()
        term = term.split('/')[0]
        l.write(term + ' ')
    l.write('\n')
f.close()
l.close()
quit()

word2vec.doc2vec(cleanFileName, fileName, cbow=0, size=50, window=10, negative=5,
                 hs=0, sample='1e-4', threads=12, iter_=20, min_count=1, verbose=True)
word2vec.word2phrase(cleanFileName, fileName, verbose=True)
word2vec.word2clusters(cleanFileName, fileName, 100, verbose=True)
model = word2vec.load(fileName)
print(model.vectors.shape)
quit()

indexes, metrics = model.cosine('_*1')
model.generate_response(indexes, metrics).tolist()
'''
Find the similar words for the important vocabulary
and save them as vectorSim.json
'''
def test_run_word2clusters():
    word2vec.word2clusters(input_text, output_clusters, 10)
    assert os.path.exists(output_clusters)
import word2vec
from config import *

word2vec.word2phrase(filename_start, filename_phrases, verbose=True)
word2vec.word2vec(filename_phrases, filename_bin, size=100, verbose=True)
word2vec.word2clusters(filename_start, filename_clusters, 100, verbose=True)