from gensim.models import word2vec
import os.path

if __name__ == '__main__':
    path_model = 'result/knock90.bin'
    if not os.path.exists(path_model):
        data = word2vec.LineSentence('../chapter09/result/knock81_result.txt')
        model = word2vec.Word2Vec(data, min_count=1, window=3, size=100)
        model.save(path_model)
    else:
        model = word2vec.Word2Vec.load(path_model)

    with open('result/knock90.txt', 'w') as data_out:
        print('<knock86_result>', file=data_out)
        print(model['United_States'], file=data_out)

        print('\n<knock87_result>', file=data_out)
        print(model.similarity('United_States', 'U.S'), file=data_out)

        print('\n<knock88_result>', file=data_out)
        simi_England = model.most_similar('England', topn=10)
        print('\n'.join(map(lambda x: '{}'.format(x), simi_England)), file=data_out)

        print('\n<knock89_result>', file=data_out)
        simi_Answer = model.most_similar(positive=['Spain', 'Athens'], negative=['Madrid'], topn=10)
        print('\n'.join(map(lambda x: '{}'.format(x), simi_Answer)), file=data_out)
import codecs
from konlpy.tag import Twitter
from gensim.models import word2vec  # word2vec: turns the words inside sentences into vectors

# open the file with utf-8 encoding and read its text --- (※1)
fp = codecs.open("hong.txt", "r", encoding="utf-8")
text = fp.read()

# process the text line by line --- (※2)
twitter = Twitter()
results = []
lines = text.split("\r\n")
for line in lines:
    # morphological analysis --- (※3)
    # use the base form of each word
    malist = twitter.pos(line, norm=True, stem=True)
    r = []
    for word in malist:
        # exclude postpositions, verb endings, punctuation, etc.
        if not word[1] in ["Josa", "Eomi", "Punctuation"]:
            r.append(word[0])
    rl = (" ".join(r)).strip()
    results.append(rl)
    print(rl)

# write the result to a file --- (※4)
wakati_file = 'hong.wakati'
with open(wakati_file, 'w', encoding='utf-8') as fp:
    fp.write("\n".join(results))

# build the Word2Vec model --- (※5)
data = word2vec.LineSentence(wakati_file)
model = word2vec.Word2Vec(data, size=200, window=10, hs=1, min_count=2, sg=1)
model.save("hong.model")
print("\n\n================= analysis complete ==================")
SIZE)
# write all model parameters to a separate text file
with open(filename + '.info', 'w+') as info_file:
    print('corpus_path=', corpus_path, file=info_file)
    print('SIZE=', SIZE, file=info_file)
    print('WINDOW=', WINDOW, file=info_file)
    print('CBOW=', CBOW, file=info_file)
    print('MIN_COUNT=', MIN_COUNT, file=info_file)
    print('NB_ITERS=', NB_ITERS, file=info_file)

if ADD_WALLS:
    sentences = SentencesWithWalls(corpus_path)
else:
    #sentences = word2vec.Text8Corpus(corpus_path)
    sentences = word2vec.LineSentence(corpus_path)

# start the w2v training
model = word2vec.Word2Vec(sentences, size=SIZE, window=WINDOW, cbow_mean=CBOW,
                          min_count=MIN_COUNT, workers=6, sorted_vocab=1, iter=NB_ITERS)
model.init_sims(replace=True)

# save the finished w2v model
model.wv.save_word2vec_format(filename + '.bin', binary=True)
for word in pos:
    if not word[1] in ['Josa', 'Eomi', 'Punctuation', 'Verb'] and len(word[0]) >= 2:
        temp.append(word[0])
# print(temp)
temp2 = ' '.join(temp).strip()
# print(temp2)
result.append(temp2)
# print(result)

prepro_file = 'word2vec.prepro'
with open(prepro_file, mode='wt', encoding='utf-8') as resultfile:
    resultfile.write('\n'.join(result))

# vector: a quantity with both direction and magnitude
# scalar: a quantity with magnitude only
# word similarity measures: cosine, Euclidean, Manhattan
from gensim.models import word2vec

# LineSentence: builds the sentence stream used for analysis
data = word2vec.LineSentence(prepro_file)
print(type(data))

# word2vec: the word-to-vector algorithm
# Word2Vec(): builds a model from the sentences produced by LineSentence
# size: vector dimensionality, window: context window size
# min_count: minimum frequency for a word to be kept
# sg: 1 (skip-gram), 0 (CBOW)
model = word2vec.Word2Vec(data, size=200, window=10, min_count=2, sg=1)
model_filename = 'word2vec.model'
model.save(model_filename)  # save the model file
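# The comments above mention cosine similarity; a minimal sketch (assuming the model
# saved above) showing that model.wv.similarity is the cosine of the two word vectors:
import numpy as np
from gensim.models import word2vec

model = word2vec.Word2Vec.load('word2vec.model')
w1, w2 = model.wv.index2word[0], model.wv.index2word[1]  # two words guaranteed to be in the vocabulary
v1, v2 = model.wv[w1], model.wv[w2]
cos = np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))
print(cos, model.wv.similarity(w1, w2))  # the two numbers should agree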
import sys
from gensim.corpora import Dictionary
from gensim.models.ldamodel import LdaModel
from gensim.models import word2vec

data_file = sys.argv[1]
topic_num = int(sys.argv[2])
alpha = float(sys.argv[3])

sentences = [s for s in word2vec.LineSentence(data_file) if len(s) >= 2]
dic = Dictionary(sentences)
corpus = [dic.doc2bow(s) for s in sentences]

lda = LdaModel(corpus=corpus, num_topics=topic_num, id2word=dic, alpha=alpha, random_state=1)

for t in lda.top_topics(corpus=corpus):
    print(f"coherence = {t[1]}, topic = {t[0]}")
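# Hypothetical invocation (the script name and argument values are illustrative):
#   python lda_topics.py tokenized_corpus.txt 10 0.1
# i.e. a whitespace-tokenized, one-sentence-per-line file, a topic count, and an LDA alpha.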
def train_model():
    sentences = word2vec.LineSentence('sentences.txt')  # Path
    model = word2vec.Word2Vec(sentences=sentences, size=100, min_count=3)
    model.wv.save_word2vec_format('word2vec.bin', binary=True)  # Path
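# A minimal sketch of reading the vectors back, assuming the word2vec.bin written by
# train_model() above; load_word2vec_format returns KeyedVectors, not a trainable model:
from gensim.models import KeyedVectors

def load_vectors():
    return KeyedVectors.load_word2vec_format('word2vec.bin', binary=True)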
def make_word2vec(path, embed_size):
    word2vec_model = word2vec.Word2Vec(word2vec.LineSentence(path), size=embed_size, min_count=1)
    return word2vec_model
print(results)
fileName = 'daumenews2.txt'
with open(fileName, mode='w', encoding='utf-8') as fw:
    fw.write('\n'.join(results))
print('saved successfully')
print()

# word2vec, one kind of word embedding
from gensim.models import word2vec

sentence = [['python', 'len', 'program', 'computer', 'say']]
model = word2vec.Word2Vec(sentence, min_count=1)
print(model.wv.most_similar('python'))

genObj = word2vec.LineSentence(fileName)
print(genObj)
model = word2vec.Word2Vec(genObj, size=100, window=10, min_count=2, sg=1)
# size = vector dimensionality, window = 10 context words on each side
# sg=0 (CBOW: predict the centre word from its context, e.g. "I go to ~"), sg=1 (skip-gram)
print(model)
model.init_sims(replace=True)  # free memory no longer needed once training is done

# run only once
# try:
#     model.save('news.model')
#     print('ok')
# except Exception as e:
#     print('err ', e)
print()
# read the model back
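# A minimal sketch of reading the model back, assuming the commented-out save above
# was actually run once ('파이썬' is only an illustrative query word):
model = word2vec.Word2Vec.load('news.model')
print(model.wv.most_similar('파이썬', topn=5))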
import os
import sys
#sys.path.append(os.environ['HOME']+'/lib/python')
import string
import re
from gensim.models import word2vec
from gensim.models import KeyedVectors

if __name__ == '__main__':
    import argparse
    # command-line options
    ap = argparse.ArgumentParser(description="train a word2vec model")
    ap.add_argument('file', type=str, metavar='TXT', help='tokenized text (one sentence per line)')
    ap.add_argument('--model', '-m', type=str, metavar='MODEL', help='model name (default: vector.bin)', default='vector.bin')
    ap.add_argument('--size', '-s', type=int, metavar='INT', help='dimensionality of the feature vector (default: 100)', default=100)
    ap.add_argument('--window', '-w', type=int, metavar='INT', help='maximum distance between the current and predicted word within a sentence (default: 5)', default=5)
    ap.add_argument('--alpha', '-a', type=float, metavar='FLOAT', help='the initial learning rate (default: 0.025)', default=0.025)
    ap.add_argument('--min_count', '-min', type=int, metavar='INT', help='ignore all words with total frequency lower than this (default: 5)', default=5)
    ap.add_argument('--sg', '-sg', type=int, choices=[0, 1], help='the training algorithm (0: CBOW, 1: skip-gram)', default=0)
    ap.add_argument('--negative', '-n', type=int, metavar='INT', help='the number of "noise words" on negative sampling (default: 5)', default=5)
    ap.add_argument('--iter', '-i', type=int, metavar='INT', help='the number of iterations (default: 5)', default=5)
    ap.add_argument('--not-binary', '-nb', action='store_false', help='save the model in non-binary (text) format', default=True)
    args = ap.parse_args()

    sentence = word2vec.LineSentence(args.file)
    model = word2vec.Word2Vec(sentence, min_count=args.min_count, size=args.size, alpha=args.alpha,
                              sg=args.sg, negative=args.negative, iter=args.iter)
    # model.save(args.model)
    model.wv.save_word2vec_format(args.model, binary=args.not_binary)
# word2vec model training
from gensim.models import word2vec
import logging

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

sentences_one = word2vec.LineSentence('chinese_corpus/cut_zh_wiki.txt')
#sentences_two = word2vec.LineSentence(u'./cut_zh_wiki_01.txt')

model_one = word2vec.Word2Vec(sentences_one, size=200, window=10, min_count=64, sg=1, hs=1, iter=10, workers=25)
#model_two = word2vec.Word2Vec(sentences_two, size=200, window=10, min_count=64, sg=1, hs=1, iter=10, workers=25)

model_one.save(u'./train_test_x/word2vec2')
#model_one.wv.save_word2vec_format(u'./w2v', binary=False)
#model_two.save(u'./word2vec2')

# import logging
# import multiprocessing
# import os.path
# import sys
# import jieba
#
# from gensim.models import Word2Vec
# from gensim.models.word2vec import PathLineSentences
#
# Radim Rehurek's tutorial on word2vec
# https://rare-technologies.com/word2vec-tutorial/

# import modules and set up logging
from gensim.models import word2vec
from gensim.models import KeyedVectors
import numpy as np
import logging

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

sentences = word2vec.LineSentence("quote.txt")
model = word2vec.Word2Vec(sentences, min_count=1)
word_vectors = model.wv
del model
word_vectors.save("word_vectors.bin")

len(word_vectors.vocab)
predicted = word_vectors.seeded_vector("bobby")
word_vectors["bob"]
type(word_vectors.similar_by_vector(.1 + np.zeros(100, dtype=np.float32))[0][0])
arg('--target', '-t', help='Path to target words', required=True)
arg('--model', '-m', help='Path to LSI model', required=True)
arg('--output', '-o', help='Output path (tsv)', required=False)
args = parser.parse_args()

target_words = set([w.strip() for w in open(args.target, 'r', encoding='utf-8').readlines()])

try:
    f_out = open(args.output, 'w', encoding='utf-8')
except TypeError:
    f_out = None

model = LsiModel.load(args.model)
texts = word2vec.LineSentence(args.input)

data = {w: {0: [], 'density0': 0} for w in target_words}
WINDOW = 15

logger.info('Reading corpus...')
for line in texts:
    words = set(line)
    found = words.intersection(target_words)
    if found:
        for target in found:
            cv = extract_context(target, line, model, WINDOW)
            data[target][0].append(cv)

logger.info('Calculating densities...')
for word in target_words:
import sys
import pyLDAvis.gensim
from statistics import mean
from gensim.corpora import Dictionary
from gensim.models.ldamodel import LdaModel
from gensim.models import word2vec

data_file = sys.argv[1]
topic_num = int(sys.argv[2])
dest_file = sys.argv[3]

sentences = list(word2vec.LineSentence(data_file))
dic = Dictionary(sentences)
corpus = [dic.doc2bow(s) for s in sentences if len(s) >= 2]

lda = LdaModel(corpus=corpus, id2word=dic, num_topics=topic_num, random_state=1)

doc_topics = [lda[c] for c in corpus]
avg_doc_topics = mean([len(t) for t in doc_topics])
print(f"average topics per doc = {avg_doc_topics}")

vis = pyLDAvis.gensim.prepare(lda, corpus, dic, n_jobs=1, mds='mmds', sort_topics=False)
pyLDAvis.save_html(vis, dest_file)
__author__ = "Huangxuanyu Gong" from gensim.models import word2vec import os for file in os.listdir('./tokenized'): data = word2vec.LineSentence('./tokenized/' + file) model = word2vec.Word2Vec(data, size=100, window=3, hs=1, min_count=1, sg=1) model.save('./tokenizedmodels/' + file + '.model') print('Done with ' + str(file) + ' model.') print('All Done')
def testLineSentenceWorksWithCompressedFile(self):
    """Does LineSentence work with a compressed file object argument?"""
    with utils.smart_open(datapath('head500.noblanks.cor')) as orig:
        sentences = word2vec.LineSentence(bz2.BZ2File(datapath('head500.noblanks.cor.bz2')))
        for words in sentences:
            self.assertEqual(words, utils.to_unicode(orig.readline()).split())
import logging
import gensim
from gensim.models import word2vec

# configure logging output
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# read the txt file directly with the APIs gensim provides:
# LineSentence, Text8Corpus, PathLineSentences, etc.
sentences = word2vec.LineSentence("solar.txt")

# train the model: vector length 200, 12 iterations, skip-gram; save in bin format
model = gensim.models.Word2Vec(sentences, size=200, sg=1, iter=12, sample=1e-4, window=6)
model.wv.save_word2vec_format("./word2Vecsolar" + ".bin", binary=True)
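# The comment above also lists PathLineSentences; a minimal sketch, assuming a
# hypothetical directory corpus_dir/ that holds several one-sentence-per-line files:
from gensim.models.word2vec import PathLineSentences

sentences = PathLineSentences("corpus_dir")  # iterates over every file in the directory, line by line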
import multiprocessing
from gensim.models import Word2Vec, word2vec

# parameter reference: https://blog.csdn.net/u011748542/article/details/85880852
dirpath = ".\\data\\utlp2\\vector"
sentences = word2vec.LineSentence(dirpath + '\\random_walks.txt')
model = Word2Vec(sentences, sg=1, hs=0, negative=5, size=200, window=5, min_count=1,
                 workers=multiprocessing.cpu_count())
model.save(dirpath + '\\word2vec.model')

# model = Word2Vec.load(dirpath + '\\word2vec.model')
# print(model['USER_1083'])
#
# for key in model.similar_by_word('USER_1083', topn=10):
#     print(key)
# print(model.similarity('u2099', 'u2099'))
# print(model.wv.doesnt_match(u"u2099 u650 u1".split()))
#
document = f.read()
# document_decode = document.decode('GBK')
document_cut = jieba.cut(document)
# print(' '.join(jieba_cut))  # printing would exhaust the generator, so the later result would come out empty
result = ' '.join(document_cut)
# result = result.encode('utf-8')
with open('data/in_the_name_of_people_segment.txt', 'w') as f2:
    f2.write(result)
f.close()
f2.close()

import logging
import os
from gensim.models import word2vec

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

sentences = word2vec.LineSentence('data/in_the_name_of_people_segment.txt')
model = word2vec.Word2Vec(sentences, hs=1, min_count=1, window=5, size=300)

req_count = 5
for key in model.wv.similar_by_word('沙瑞金', topn=100):
    if len(key[0]) == 3:
        req_count -= 1
        print(key[0], key[1])
        if req_count == 0:
            break
# -*- coding: utf-8 -*-
from gensim.models import word2vec
import logging
import sys

# for saving logs
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# the first argument is the name of the file to read
sentences = word2vec.LineSentence(sys.argv[1])

# size: vector dimensionality, min_count: minimum word frequency, window: window size
model = word2vec.Word2Vec(sentences, size=100, min_count=1, window=10)

# save the model under the name given as the second argument
model.save(sys.argv[2])
def train(model_file='cbow.model', epoch=1):
    model = word2vec.Word2Vec.load(model_file)
    sentences = word2vec.LineSentence('./only_word.txt')
    model.train(sentences, total_examples=1035846, epochs=epoch)
    return model  # hand the updated model back so the extra training is not lost
def get_word2vec_model(text_path):
    """Train a word-vector model."""
    sentences = word2vec.LineSentence(text_path)
    model = word2vec.Word2Vec(sentences, hs=1, min_count=1, window=3, size=100)
    return model
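# A minimal usage sketch; 'corpus.txt' and the two query words are hypothetical
# (any tokenized one-sentence-per-line file and any two in-vocabulary words work):
model = get_word2vec_model('corpus.txt')
print(model.wv.similarity('word_a', 'word_b'))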
-Google questions-phrases (Just for test, you can use any raw text data that is sentence separated)
#-------------------------------------------------------------------------------------------------#
'''
import gensim, logging
from gensim.models import word2vec
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from nltk.corpus import brown

# Showing log of training vectors
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

sentences = [['اول', 'جمله'], ['دوم', 'جمله']]

# Building Models
model1 = gensim.models.Word2Vec(word2vec.LineSentence("questions-phrases.txt"), min_count=1)
model2 = gensim.models.Word2Vec(sentences, min_count=1)
# model3 = gensim.models.Word2Vec(brown.sents(), min_count=1)

# Saving models
model1.save("test.model")
model2.save("persian.model")
# model3.save("brown.model")

# Loading models
model11 = gensim.models.Word2Vec.load("test.model")
model12 = gensim.models.Word2Vec.load("persian.model")
# model13 = gensim.models.Word2Vec.load("brown.model")

# Finding similarities
print(model11.most_similar("Bill_Gates"))
print(model12.most_similar("جمله"))
# machine-learning utility packages
import gensim.models.word2vec as w2v
import sklearn.cross_validation as scv
import sklearn.externals.joblib as sej
import sklearn.preprocessing as sp
import sklearn.svm as ss
import sklearn.decomposition as sd
import sklearn.metrics as sm

# #### 4.2 Build the word-vector file for the target documents

# In[24]:

# train the model and generate the word vectors
sentences = w2v.LineSentence("文本分析/流浪地球_corpus.csv")
model = w2v.Word2Vec(sentences, size=20, window=5, min_count=5, workers=4)
model.save("文本分析/corpus01.model")

# #### 4.3 Vectorize the sample documents

# In[26]:

# read the positive and negative review documents separately
posInput = buildVecs('train_pos.txt', model)
negInput = buildVecs('train_neg.txt', model)

# initialize the label vector: 1 = positive review, 0 = negative review
y = np.concatenate((np.ones(len(posInput)), np.zeros(len(negInput))))
X = posInput[:]
import sys
import matplotlib.pyplot as plt
from gensim.models import word2vec
from sklearn import manifold

data_file = sys.argv[1]
dest_img = sys.argv[2]

sentences = word2vec.LineSentence(data_file)
model = word2vec.Word2Vec(sentences, size=30, sg=1, window=2, min_count=2, iter=5000)

tsne = manifold.TSNE(n_components=2)
x = [model[k] for k in model.wv.index2word]
xt = tsne.fit_transform(x)
print(xt)

xt_min = min(xt.flatten())
xt_max = max(xt.flatten())
plt.xlim(xt_min, xt_max)
plt.ylim(xt_min, xt_max)
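# The script above stops before drawing anything; a minimal sketch of a plausible
# ending, assuming the intent was to scatter the t-SNE points, label each with its
# word, and write the figure to dest_img:
for (px, py), label in zip(xt, model.wv.index2word):
    plt.scatter(px, py, s=3)
    plt.annotate(label, xy=(px, py), fontsize=8)
plt.savefig(dest_img)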
if __name__ == "__main__": file = r"F:\NLP_learnings\data\word2vec\chinese\in_the_name_of_people\in_the_name_of_people.txt" cleanData = r"F:\NLP_learnings\data\word2vec\chinese\in_the_name_of_people\cleaned2.txt" outmodel = r"F:\NLP_learnings\data\word2vec\chinese\in_the_name_of_people\model2" model_1 = r"E:\BaiduNetdiskDownload\实战-机器学习\word2vec\word2vec_c_from_weixin\word2vec_c" startTime = time.time() # model = mode(file, outModel) # endTime = time.time() # print("消费时间(分钟):", (endTime-startTime) // 60 ) # print(model['man']) # 清洗数据。分词 # clean_data(file, cleanData) train = False if train: sentences = word2vec.LineSentence(cleanData) # 使其格式化 # workers参数用于设置并发训练时候的线程数,不过仅当Cython安装的情况下才会起作用 # 模型保存和训练 model = word2vec.Word2Vec(sentences, sg=0, size=256, window=8, min_count=20, negative=3, sample=0.001, hs=1, workers=4) model.save(outmodel) # 加载模型 model = word2vec.Word2Vec.load(model_1)
twit = Twitter()
# split on \r\n
print(text)
lines = text.split("\r\n")
result = []

# list comprehension over the words // keep only the needed elements
#wordList = [i for line in lines for i, j in twit.pos(line, norm=True, stem=True) if not j in ["Josa", "Eomi", "Punctuation", "Foreign"]]
for line in lines:
    resList = twit.pos(line, norm=True, stem=True)
    res = []
    for word in resList:
        if not word[1] in ["Josa", "Eomi", "Punctuation", "Foreign"]:
            res.append(word[0])
    r1 = (" ".join(res))
    result.append(r1)
#print(result)

# word embedding
toji = "toji.data"
with open(toji, "w", encoding="utf-8") as fp:
    fp.write("\n".join(result))

# W2V
data = word2vec.LineSentence(toji)
model = word2vec.Word2Vec(data, size=200, window=2, min_count=5, sg=1, iter=10)
model.save("toji.mode")

# load the W2V model back
model = word2vec.Word2Vec.load("toji.mode")
print(model.most_similar(positive=["집"]))
fp=codecs.open("2BEXXX01.txt","r",encoding="utf-16") soup = BeautifulSoup(fp,"html.parser") body = soup.select_one("text body") text = body.getText() twitter = Twitter() lines = text.split("\r\n") results = [] for line in lines: r = [] malist = twitter.pos(line, norm=True, stem=True) for (word, pumsa) in malist: if not word in ["Josa","Eomi","Punctuation"]: r.append(word) #print(r) #break results.append((" ".join(r)).strip()) output = (" ".join(r)).strip() #print(output) with open("toji.wakati","w",encoding="utf-8") as fp: fp.write(output) data = word2vec.LineSentence("toji.wakati") model = word2vec.Word2Vec(data, size=200, window=10, hs=1, min_count=2, sg=1) model.save("toji.model")
def testLineSentenceWorksWithFilename(self):
    """Does LineSentence work with a filename argument?"""
    with utils.smart_open(datapath('lee_background.cor')) as orig:
        sentences = word2vec.LineSentence(datapath('lee_background.cor'))
        for words in sentences:
            self.assertEqual(words, utils.to_unicode(orig.readline()).split())
from gensim.models import word2vec
from konlpy.tag import _mecab

# build the model
source = 'update_sentences.prepro'
sentence = word2vec.LineSentence(source=source)
print(type(sentence))
print(sentence.max_sentence_length)

model = word2vec.Word2Vec(sentence, size=200, window=10, min_count=1, hs=1, sg=1)
print(type(model))

model_filename = 'update_model.model'
model.save(model_filename)
print(model_filename + ' saved')
print('finished')

# visualize the saved model using word similarities
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

plt.rc('font', family='Malgun Gothic')

model_filename = 'update_model.model'
model = word2vec.Word2Vec.load(model_filename)
print(model)
level=logging.INFO)
import sys
reload(sys)
sys.setdefaultencoding('utf8')

# set values for various parameters
num_features = 300    # word vector dimensionality
min_word_count = 20   # minimum word count
num_workers = 10      # number of threads to run in parallel
context = 20          # context window size
downsampling = 1e-3   # downsample setting for frequent words

# Initialize and train the model (this will take some time)
from gensim.models import word2vec
print "Training model..."
sentences = word2vec.LineSentence('corpus_stem.txt')
model = word2vec.Word2Vec(sentences, workers=num_workers, size=num_features,
                          min_count=min_word_count, window=context, sample=downsampling)

# If you don't plan to train the model any further, calling
# init_sims will make the model much more memory-efficient
model.init_sims(replace=True)

# It can be helpful to create a meaningful model name and save
# the model for later use. You can load it later using Word2Vec.load()
model_name = "300features_40minwords_10context"