from gensim.models import word2vec

data = word2vec.Text8Corpus("wiki_wakati.txt")
model = word2vec.Word2Vec(data, size=100)
model.save("wiki.model")
print("ok")
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Author: fuli.shen
Date: 2017-06-27
"""

from gensim.models import word2vec
import logging

# main program
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
sentences = word2vec.Text8Corpus(
    u"E:\\work_document\\part-r-00000-news_w2v_sampling.text")  # load the corpus
model = word2vec.Word2Vec(sentences, size=200)  # train the model; default window=5
model.save(u"./word2vec_model/news_word2vec.custompy")
print("save news_word2vec custompy successfully")

if __name__ == "__main__":
    pass
def train_val(self):
    result = {}
    for n, name in tqdm(enumerate(self.val_author_data)):
        pubs = []
        # collect all papers of this author
        for clusters in self.val_author_data[name]:
            pubs.append(clusters)
        # print(pubs)
        name_pubs_raw = {}
        for i, pid in enumerate(pubs):
            name_pubs_raw[pid] = self.val_pub_data[pid]
        # load the author's features
        save_relation(name_pubs_raw, name)
        mpg = MetaPathGenerator()
        mpg.read_data("gene")

        all_embs = []
        rw_num = 10
        cp = set()
        # start the random walks
        for k in range(rw_num):
            mpg.generate_WMRW("gene/RW.txt", 5, 20)
            sentences = word2vec.Text8Corpus(r'gene/RW.txt')
            # use word2vec to train the paper embeddings
            model = word2vec.Word2Vec(sentences, size=128, negative=25, min_count=1, window=10)
            embs = []
            for i, pid in enumerate(pubs):
                if pid in model:
                    embs.append(model[pid])
                else:
                    cp.add(i)
                    embs.append(np.zeros(128))
            all_embs.append(embs)
        all_embs = np.array(all_embs)

        # load the semantic features
        ptext_emb = load_data('gene', 'ptext_emb.pkl')
        tcp = load_data('gene', 'tcp.pkl')
        tembs = []
        for i, pid in enumerate(pubs):
            tembs.append(ptext_emb[pid])

        # cosine distance matrix of the papers' random-walk (relational) embeddings
        sk_sim = np.zeros((len(pubs), len(pubs)))
        for k in range(rw_num):
            sk_sim = sk_sim + pairwise_distances(all_embs[k], metric="cosine")
        sk_sim = sk_sim / rw_num

        # cosine distance matrix of the papers' semantic embeddings
        tembs = pairwise_distances(tembs, metric="cosine")
        w = 1
        sim = (np.array(sk_sim) + w * np.array(tembs)) / (1 + w)

        pre = DBSCAN(eps=0.2, min_samples=4, metric="precomputed").fit_predict(sim)
        pre = np.array(pre)

        # papers treated as outliers
        outlier = set()
        for i in range(len(pre)):
            if pre[i] == -1:
                outlier.add(i)
        for i in cp:
            outlier.add(i)
        for i in tcp:
            outlier.add(i)

        # threshold-based similarity matching for outliers
        paper_pair = generate_pair(pubs, outlier)
        paper_pair1 = paper_pair.copy()
        K = len(set(pre))
        for i in range(len(pre)):
            if i not in outlier:
                continue
            j = np.argmax(paper_pair[i])
            while j in outlier:
                paper_pair[i][j] = -1
                j = np.argmax(paper_pair[i])
            if paper_pair[i][j] >= 1.5:
                pre[i] = pre[j]
            else:
                pre[i] = K
                K = K + 1

        for ii, i in enumerate(outlier):
            for jj, j in enumerate(outlier):
                if jj <= ii:
                    continue
                if paper_pair1[i][j] >= 1.5:
                    pre[j] = pre[i]
        # print(pre, len(set(pre)))

        result[name] = []
        for i in set(pre):
            oneauthor = []
            for idx, j in enumerate(pre):
                if i == j:
                    oneauthor.append(pubs[idx])
            result[name].append(oneauthor)

    json.dump(result, open(self.args['val_result'], 'w', encoding='utf-8'), indent=4)
    f1 = f1_score(result, self.args)
    print("f1:", f1)
def train(path=Dir.res + "/sen_data/604_corpus.txt", save_path=Dir.res + "/w2v/w2v.model"):
    sentences = word2vec.Text8Corpus(path)  # load the corpus
    model = word2vec.Word2Vec(sentences, size=10, window=3, min_count=1)
    model.save(save_path)
    return save_path
        header=0, delimiter="\t", quoting=3)  # read in the preset unlabeled data (fairly small for now)

    pat = re.compile(r'[A-Za-z]+')  # extract all words; to further account for punctuation's effect on semantics,
    # symbols such as [!@#$%^&*] that are common in spam messages could be added to the pattern as well
    with open('imdb_text', 'a', encoding='utf-8') as f:
        for rev in data_un.review:  # iterate over the reviews in the corpus
            str_list = pat.findall(rev)  # extract all the words first
            # str_list = [x.lower() for x in str_list]  # lowercase everything; in practice case can carry meaning, so decide per task
            string = ' '.join(str_list)
            f.write(string + '\n')
    # after the steps above we have a file full of cleaned strings
    del data_un

    sentences = word2vec.Text8Corpus("imdb_text")  # load the corpus
    model = word2vec.Word2Vec(sentences, size=50)
    # vector length set to 50 (not sure whether that is a bit large); default window=5, i.e. 5 words of
    # context on each side; words occurring fewer than min_count=5 times are treated as rare and dropped
    model.save('mymodel')  # save the model so it can be reused next time
else:
    model = word2vec.Word2Vec.load('mymodel')  # if a trained model already exists, just load it
# In[3] is mainly used to train our word vectors

# In[4]:
word_vectors = model.wv  # the word-to-vector mapping lives in wv; take it from the trained model
del model

# In[5]:
data_t['vec'] = data_t.review.apply(
    lambda x: [word_vectors[w] for w in x.split() if w in word_vectors])
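# The per-word vector lists stored in data_t['vec'] above are usually collapsed into one
# fixed-length feature per review before classification. A minimal sketch of that step,
# assuming the same 50-dimensional word_vectors and data_t frame as in the snippet above
# (mean pooling is an illustrative choice here, not the original author's code):
import numpy as np

def mean_pool(vec_list, dim=50):
    # reviews with no in-vocabulary words fall back to a zero vector
    if len(vec_list) == 0:
        return np.zeros(dim)
    return np.mean(vec_list, axis=0)

data_t['feature'] = data_t['vec'].apply(mean_pool)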
""" Word2Vec 模型: * Word2Vec 通过训练,可以把对文本内容的处理简化为K维向量空间中的向量运算.(而向量空间上的相似度可以用来表示文本语义上的相似度) * 采用的模型有CBOW(Continuous Bag-Of-Words,即连续的词袋模型)和 Skip-Gram 两种. * 因此,Word2Vec 输出的词向量可以被用来做很多NLP相关的工作,比如聚类、找同义词、词性分析等等. * CBOW 模型: 能够根据输入周围n-1个词来预测出这个词本身. * 也就是说,CBOW模型的输入是某个词A周围的n个单词的词向量之和,输出是词A本身的词向量. * Skip-gram 模型: 能够根据词本身来预测周围有哪些词. * 也就是说,Skip-gram模型的输入是词A本身,输出是词A周围的n个单词的词向量. """ import pandas as pd from gensim.models import word2vec # 加载语料 sentences = word2vec.Text8Corpus(u"/opt/data/NLP/4.word2vec/text8.txt") model = word2vec.Word2Vec(sentences, size=200) # 训练skip-gram模型; 默认window=5 # 计算两个词的相似度/相关程度 y1 = model.similarity("woman", "man") print(u"woman和man的相似度为:", y1) print("--------\n") # 计算某个词的相关词列表 y2 = model.most_similar("good", topn=20) # 20个最相关的 print(pd.Series(y2)) # print(u"和good最相关的词有:\n") # for item in y2: # print(item[0], item[1]) print("--------\n")
from gensim.models import word2vec

sentences = word2vec.Text8Corpus(u'分词后的爽肤水评论.txt')
model = word2vec.Word2Vec(sentences, size=50)
# y2 = model.similarity(u"好", u"还行")
# print(y2)
for i in model.most_similar(u"滋润"):
    print(i[0], i[1])
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File   : word2vec
# @Author : 张志毅
# @Time   : 2020/9/10 9:41

from gensim.models import word2vec
import logging

# main program
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
sentences = word2vec.Text8Corpus(
    u"D:\\Python\\WorkSpace\\word2vec\\Data\\text8")  # load the corpus
model = word2vec.Word2Vec(sentences, size=32)  # train the model; default window=5

# save the model for later reuse
model.save("text8.model")
# corresponding load
# model_2 = word2vec.Word2Vec.load("text8.model")

# store the word vectors in a form the original C tool can parse
model.wv.save_word2vec_format('embedding1.txt', binary=False)
# corresponding load
# model_3 = word2vec.Word2Vec.load_word2vec_format("text8.model.bin", binary=True)

if __name__ == "__main__":
    pass
# -*- coding: utf-8 -*-
import gensim
from gensim.models import word2vec

sentences = word2vec.Text8Corpus(
    'D:/gitHubRes/python/词向量/text-classification-cnn-rnn/data/baike_triples.txt'
)
model = word2vec.Word2Vec(sentences, min_count=5, size=100)
model.save('D:/baike.model')
print("Training finished")
# -*- encoding:utf-8 -*-
__author__ = ''

from gensim.models import word2vec
import logging
import numpy as np

filename = "D:\\chinese\\word2vec_corpus\\merged_ehr_2_segdone.txt"
sentence = word2vec.Text8Corpus(filename)
n_dim = 100
model = word2vec.Word2Vec(sentence, size=n_dim)
model.save(u"abc.model")

# test
from gensim.models import word2vec
model_2 = word2vec.Word2Vec.load("gensim_train.model")
y1 = model_2.most_similar(u'肝脏', topn=6)
y2 = model_2.wv[u'肝脏']

# import word2vec

# segment word part
import jieba
import struct

filePath = 'merged.txt'
fileSegWordDonePath = 'corpusSegDone.txt'

# read the file by line
fileTrainRead = []
# fileTestRead = []
from gensim.models import word2vec

data = word2vec.Text8Corpus("wiki.gubun")
model = word2vec.Word2Vec(data)
model.save("wiki.model")
print("ok")
# print(seg_lists)
#
# # After segmentation, save the results to a new txt file
# with open('fenci_0225.txt', 'w', encoding='utf-8') as f:
#     for i in seg_lists:
#         if i == '':
#             pass
#         else:
#             f.write(i)
#             # f.write('\n')
# print("Segmentation results saved")
# ------------------------------------------
# Train with word2vec
sentences = word2vec.Text8Corpus('slurm-13014726.out')  # 52776; handles the segmented text corpus
print(sentences)
# This single call trains the model; words occurring fewer than min_count=5 times are dropped
model = word2vec.Word2Vec(sentences, size=100, window=5, min_count=5, workers=5, sg=1, hs=1)
# model = word2vec.Word2Vec(sentences, sg=1, size=100, window=5, min_count=5, negative=3, sample=0.001, hs=1, workers=4)
# http://blog.csdn.net/szlcw1/article/details/52751314  Trains a skip-gram model; the first argument is the
# training corpus, min_count drops words occurring fewer than that many times (default 5), size is the number
# of hidden units / vector dimensions written to model.txt (default 100), and window defaults to 5.
# model = word2vec.Word2Vec(sentences, min_count=3, size=50, window=5, workers=4)
# -*- coding: utf-8 -*-
"""
Use jieba and gensim to create word2vec embeddings (generate word vectors).
"""
import jieba
from gensim.models import word2vec
import time
import os

dir_path = "/Users/luheng/dureader/data/preprocessed/trainset/"
file_path = dir_path + 'train.conll'
vocabfile = dir_path + 'vocab.txt'
word2vecfile = dir_path + 'vec.txt'

sentences = word2vec.Text8Corpus(file_path)
model = word2vec.Word2Vec(sentences, size=50, min_count=5, max_vocab_size=100000)
model.save(dir_path + 'mymodel')

vocab = open(vocabfile, 'w+')
vec = open(word2vecfile, 'w+')

'''
The vocab entries need to be made unique here.
'''
model = word2vec.Word2Vec.load(dir_path + 'mymodel')
all_words = set()
for line in open(file_path):
flags.DEFINE_string('save_path', '../model/Word2Vec/', 'path for saving data')
flags.DEFINE_integer('min_count', 2, 'term occurs less than this is ignored')
flags.DEFINE_integer('size', 50, 'embedding dimensions')
flags.DEFINE_integer('window', 4, 'terms occur within a window-neighborhood of a term')
flags.DEFINE_integer('sg', 1, 'sg=1: skip-gram model; sg=other: CBoW model')
# flags.DEFINE_float()
# flags.DEFINE_boolean()
FLAGS = flags.FLAGS

# the major part
if __name__ == '__main__':
    # logging information
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    # load-in training sentences
    sentences = word2vec.Text8Corpus(FLAGS.data_path)

    # training step:
    model = word2vec.Word2Vec(sentences, min_count=FLAGS.min_count, size=FLAGS.size,
                              window=FLAGS.window, sg=FLAGS.sg)

    # save the trained model
    if not os.path.exists(FLAGS.save_path):
        os.makedirs(FLAGS.save_path)
    cur_time = datetime.datetime.now().strftime("%m-%d_%H:%M:%S")
    model.save(os.path.join(FLAGS.save_path, 'luru_news_' + cur_time + '.model'))
    model.wv.save_word2vec_format(os.path.join(FLAGS.save_path, 'luru_news_' + cur_time + '.model.bin'),
                                  binary=True)
    comment = comment.replace('~', '')
    comment = comment.replace('{"error_message": "EMPTY SENTENCE"}', '')
    comment = comment.replace('…', '')
    comment = comment.replace('\r', '')
    comment = comment.replace('\t', ' ')
    comment = comment.replace('\f', ' ')
    comment = comment.replace('/', '')
    comment = comment.replace('、', ' ')
    comment = comment.replace('/', '')
    comment = comment.replace(' ', '')
    comment = comment.replace(' ', '')
    comment = comment.replace('_', '')
    comment = comment.replace('?', ' ')
    comment = comment.replace('?', ' ')
    comment = comment.replace('了', '')
    comment = comment.replace('➕', '')
    return comment

comment = open('test.txt').read()
comment = ' '.join(jieba.cut(comment))
fo = open("afterSeg.txt", "w")
fo.write(comment)
print("finished!")
fo.close()

sentences = word2vec.Text8Corpus(u'afterSeg.txt')
model = word2vec.Word2Vec(sentences, min_count=3, size=50, window=5, workers=1)
1) Radim Řehůřek (author of gensim) - http://rare-technologies.com/performance-shootout-of-nearest-neighbours-intro
2) Erik Bernhardsson (author of annoy) - https://github.com/erikbern/ann-benchmarks
"""
import time, random
import numpy as np
from gensim.models import word2vec
from sklearn.neighbors import KDTree

# Download text8 dataset from:
# http://mattmahoney.net/dc/text8.zip
# and unzip the file
sentences = word2vec.Text8Corpus('text8')
model = word2vec.Word2Vec(sentences, size=200, workers=8)
model.init_sims(replace=True)  # normalize the vectors

words = random.sample(model.vocab.keys(), 100)


class ANNSearch:
    word2idx = {}
    idx2word = {}
    data = []

    def __init__(self, model):
        for counter, key in enumerate(model.vocab.keys()):
            self.data.append(model[key])
            self.word2idx[key] = counter
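# The ANNSearch class above is cut off; the following is a self-contained sketch of the
# KDTree-based nearest-neighbour lookup it appears to be building. The class and method names
# here are illustrative assumptions, not the original file's code:
import numpy as np
from sklearn.neighbors import KDTree

class SimpleKDTreeSearch:
    def __init__(self, vectors, words):
        # vectors: (n_words, dim) array of unit-normalized embeddings; words: matching labels
        self.tree = KDTree(np.asarray(vectors))
        self.idx2word = dict(enumerate(words))

    def query(self, vector, k=10):
        # KDTree.query returns (distances, indices); map indices back to words
        dist, idx = self.tree.query(np.asarray(vector).reshape(1, -1), k=k)
        return [(self.idx2word[i], d) for i, d in zip(idx[0], dist[0])]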
'''
90. Training with word2vec
Apply word2vec to the corpus created in exercise 81 and learn word vectors.
Then convert the format of the learned word vectors and run the programs from exercises 86-89.
'''
from gensim.models import word2vec
import logging

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

sentences = word2vec.Text8Corpus('../chapter09/corpus81')
model = word2vec.Word2Vec(sentences, size=300, window=5)
model.save('w2v')
def my_word2vec(cut_filename):
    mysetence = word2vec.Text8Corpus(cut_filename)
    # model = word2vec.Word2Vec(mysetence, size=100, min_count=1, window=5, hs=5)
    model = word2vec.Word2Vec(mysetence, size=100, min_count=1, window=5, hs=5)
    model.save('./model/zh_wiki_global.model')
            else:
                avgword2vec = avgword2vec + word2idf[word] * model[word]

        # if at least one word in the sentence has a word embedding:
        if avgword2vec is not None:
            avgword2vec = avgword2vec / sumidf  # normalize the weighted sum
            array_sentences.append(line)
            array_embeddings.append(avgword2vec)
    print('avg_word2vec_idf: Generated embeddings for {0} sentences from {1} dataset.'.format(
        len(array_sentences), dataset))
    return array_sentences, array_embeddings


if __name__ == "__main__":
    if False:
        # FIRST PART
        sentences = word2vec.Text8Corpus('data/text8')
        # Train a word2vec model
        embedding_size = 200
        model = word2vec.Word2Vec(sentences, size=embedding_size)

        # Train a word2vec model with phrases
        bigram_transformer = gensim.models.Phrases(sentences)
        model_phrase = Word2Vec(bigram_transformer[sentences], size=200)
    else:
        # Loading model trained on words
        model = word2vec.Word2Vec.load('models/text8.model')
        # Loading model enhanced with phrases (2-grams)
        model_phrase = word2vec.Word2Vec.load('models/text8.phrase.model')
"""
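# The fragment above only shows the tail of the idf-weighted averaging routine; the following is a
# self-contained sketch of the same idea, with hypothetical argument names (lines, model, word2idf),
# assuming the older gensim API ("word in model", model[word]) used throughout these snippets:
import numpy as np

def avg_word2vec_idf_sketch(lines, model, word2idf):
    array_sentences, array_embeddings = [], []
    for line in lines:
        avgword2vec, sumidf = None, 0.0
        for word in line.split():
            if word in model and word in word2idf:
                weighted = word2idf[word] * model[word]
                avgword2vec = weighted if avgword2vec is None else avgword2vec + weighted
                sumidf += word2idf[word]
        # keep the sentence only if at least one word had an embedding
        if avgword2vec is not None:
            array_sentences.append(line)
            array_embeddings.append(avgword2vec / sumidf)
    return array_sentences, array_embeddings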
if __name__ == '__main__':
    # Read data from files
    train = pd.read_csv(os.path.join(os.path.dirname(__file__), 'electronics', 'trainData.tsv'),
                        header=0, delimiter="\t", quoting=3)
    test = pd.read_csv(os.path.join(os.path.dirname(__file__), 'electronics', 'testData.tsv'),
                       header=0, delimiter="\t", quoting=3)
    # unlabeled_train = pd.read_csv(os.path.join(os.path.dirname(__file__), 'data', "unlabeledTrainData.tsv"),
    #                               header=0, delimiter="\t", quoting=3)

    print("Read %d labeled train reviews, %d labeled test reviews" % (train["review"].size, test["review"].size))

    # Load the punkt tokenizer
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

    # ****** Split the labeled and unlabeled training sets into clean sentences
    # sentences = cPickle.load(open('sentences.p', 'rb'))
    sentences = word2vec.Text8Corpus('electronics/alldata.txt')

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    # Set values for various parameters
    num_features = 200    # Word vector dimensionality
    min_word_count = 10   # Minimum word count
    num_workers = 16      # Number of threads to run in parallel
    context = 10          # Context window size
    downsampling = 1e-3   # Downsample setting for frequent words

    # Initialize and train the model (this will take some time)
    print("Training Word2Vec model...")
    # model = word2vec.Word2Vec(sentences, workers=num_workers, size=num_features, min_count=min_word_count,
    #                           window=context, sample=downsampling, seed=1)
    # model.init_sims(replace=True)
    # model_name = "200features_10minwords_10context_electronics"
    # model.save(model_name)
    model = word2vec.Word2Vec.load("200features_10minwords_10context_electronics")
    #
# -*- coding: utf-8 -*-
from gensim.models import word2vec
import logging
import os

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

sentences = word2vec.Text8Corpus('DATASET/merge.txt')
model = word2vec.Word2Vec(sentences, size=5, min_count=0, window=1)
# size: dimensionality of each word vector
# window: context window scanned during training; a window of 5 means 5 words before and 5 words after
# min_count: minimum frequency threshold, default 5; words occurring fewer times are dropped
# workers: number of training processes

model.save('text.model')                         # save the model
model.wv.save_word2vec_format('text.model.bin')  # save in the word2vec interchange format

model['單位']                     # vector representation of a single word
model.most_similar(['上班'])      # most similar words
model.similarity('單位', '上班')  # similarity between two words
def train_w2v():
    data = pd.read_csv('../data/full_tobe_classify_180316.csv', encoding='GBK')
    data['cutted_Dis'].to_csv('../data/lg_all_data.txt', index=False, encoding='utf-8')
    sentences = word2vec.Text8Corpus('../data/lg_all_data.txt')
    model = word2vec.Word2Vec(sentences, min_count=2, size=256)
    model.save('./word2vec/lg_data_model_comment_256dim')
from gensim.models import word2vec
import logging

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

sentences = word2vec.Text8Corpus("../input/text8")
model = word2vec.Word2Vec(sentences, size=200)
model.save("text8.model")

from gensim.models.keyedvectors import KeyedVectors
from gensim.models import word2vec

# Reload the full model saved above; the auxiliary .npy file written next to "text8.model"
# only holds raw arrays and cannot be loaded on its own with KeyedVectors.load.
model = word2vec.Word2Vec.load("text8.model")
print(model.wv['word'])
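# Gensim offers two separate persistence routes, which the snippet above mixes up; a minimal
# sketch of both, assuming the text8.model file saved above and a hypothetical
# text8.vectors.txt export path:
from gensim.models import word2vec
from gensim.models.keyedvectors import KeyedVectors

# Option 1: save/load the full model (training can be resumed later)
model = word2vec.Word2Vec.load("text8.model")

# Option 2: export only the word vectors in word2vec format and reload them as KeyedVectors
model.wv.save_word2vec_format("text8.vectors.txt", binary=False)
vectors = KeyedVectors.load_word2vec_format("text8.vectors.txt", binary=False)
print(vectors['word'])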
from gensim.models import word2vec

sentences = word2vec.Text8Corpus(r'gene/all_text.txt')
model = word2vec.Word2Vec(sentences, size=100, negative=5, min_count=2, window=5)
model.save('word2vec/Aword2vec.model')
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Purpose: test gensim on a Chinese corpus
Date: 2016-05-21 20:49:07
"""

from gensim.models import word2vec
import logging

# main program
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
sentences = word2vec.Text8Corpus(u"C:\\Users\\lenovo\\Desktop\\word2vec实验\\亚马逊中文书评语料.txt")  # load the corpus
model = word2vec.Word2Vec(sentences, size=200)  # default window=5

# similarity / relatedness of two words
y1 = model.similarity(u"不错", u"好")
print(u"Similarity between 不错 and 好:", y1)
print("--------\n")

# list of words related to a given word
y2 = model.most_similar(u"书", topn=20)  # the 20 most related words
print(u"Words most related to 书:\n")
for item in y2:
    print(item[0], item[1])
print("--------\n")

# analogy relations
print(u"书 - 不错, 质量 - ?")
y3 = model.most_similar([u'质量', u'不错'], [u'书'], topn=3)
    print('Segmenting {}...'.format(novel))
    with codecs.open('novel/{}.txt'.format(novel), encoding='UTF-8') as f:
        sentences += [list(jieba.cut(line.strip())) for line in f]
    print('Segmentation finished.')

print('Saving segmentation results to sentences.txt.')
f = open('sentences.txt', 'w', encoding='UTF-8')
text = ''
for line in sentences:
    text += ' '.join(line)
    text += '\n'
f.write(text)
f.close()
print('Saved.')

print("Training...")
# Load file
sentence = word2vec.Text8Corpus("sentences.txt")
# Set parameters and produce (train) the model
model = word2vec.Word2Vec(sentence)  # size=500, window=5, min_count=5, workers=4, sg=1, max_vocab_size=120000000

try:
    # remove a previously saved model with the same name
    os.remove("model/{}.model.bin".format(query.replace(",", "-")))
except:
    pass
else:
    print('Found a model with the same name; it has been removed automatically')

# Save model
model.wv.save_word2vec_format("model/{}.model.bin".format(query.replace(",", "-")), binary=True)
print("Trained model saved")
from gensim.models import word2vec

# load the sentence-split file
sentences = word2vec.Text8Corpus('NewsCar_new_after_process/1/2.txt')

# convert each sentence into a list of individual characters
# (Word2Vec expects an iterable of token lists, so characters from the same
# sentence are kept together to preserve their context)
tokens = []
for sen in sentences:
    print(type(sen))
    chars = []
    for j in sen:
        for token in j:
            chars.append(token)
    tokens.append(chars)

# size is the vector dimensionality, min_count the minimum occurrence count
model = word2vec.Word2Vec(tokens, size=100, min_count=1)

# the 5 characters most similar to 车
x = model.most_similar("车", topn=5)
print(x)

# print the vector for 车
print(model['车'])

# save the model
model.save("res.model")
# corresponding load
# model_2 = word2vec.Word2Vec.load("text8.model")
# # save seg file and load it as Text8Corpus

# In[41]:
with open('seg/allseg.txt', 'w', encoding='utf8') as output:
    for line in seg_train:
        for word in line:
            output.write(word + ' ')
        output.write('\n')

# In[42]:
sentences = word2vec.Text8Corpus('seg/allseg.txt')

# # train w2v model

# In[43]:
dim = 64
min_count = 1
window = 20
iteration = 150
sg = 1
neg = 5
note = 'all'
fname = str(dim) + 'm' + str(min_count) + 'w' + str(window) + 'it' + str(iteration) + 'sg' + str(1) + 'neg' + str(neg) + note
from gensim.models import word2vec
import logging

sentences = word2vec.Text8Corpus('/tmp/text8')
model = word2vec.Word2Vec(sentences, size=200)
model.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)
def main(limit=30,  # sentence length limit
         x_limit=1,
         y_limit=2):
    from word_sequence import WordSequence

    print('extract lines')
    """dgk corpus"""
    # fp = open("raw_data/dgk_shooter_min.conv", 'r', errors='ignore', encoding='utf-8')
    """xiaohuangji corpus"""
    fp = open("raw_data/xiaohaungji50w_test.conv", 'r', errors='ignore', encoding='utf-8')

    # all dialogue groups
    groups = []
    # one dialogue group
    group = []

    for line in tqdm(fp):  # show a progress bar
        if line.startswith('M '):  # sentence lines start with 'M '
            line = line.replace('\n', '')  # strip the newline
            if '/' in line:
                line = line[2:].split('/')  # drop the 'M ' prefix and split on '/' -> list
                line = list(regular(''.join(line)))  # clean the text with regular()
                line = jieba.lcut(''.join(line))  # re-segment with jieba
            else:
                line = list(line[2:])
            group.append(line)
            # print(group)
        else:  # lines starting with 'E ' separate dialogues
            if group:
                groups.append(group)
                group = []
    if group:
        groups.append(group)
        group = []
    print('\nextract group')

    """build question-answer pairs"""
    x_data = []
    y_data = []
    for group in tqdm(groups):
        # print(group)
        for index, line in enumerate(group):
            if index == 0 and good_line(line):
                x_data.append(line)
            if index == 1 and good_line(line):
                y_data.append(line)
    print(x_data)
    print(y_data)

    # number of question-answer pairs
    print('\nNumber of questions: ' + str(len(x_data)), 'Number of answers: ' + str(len(y_data)))

    # preview the first question-answer pairs
    for ask, answer in zip(x_data[:30], y_data[:30]):
        print(''.join(ask))
        print(''.join(answer))
        print('-' * 20)

    """assemble the data"""
    data = list(zip(x_data, y_data))
    # filtering rule:
    data = [(x, y) for x, y in data
            if len(x) < limit and len(y) < limit and len(y) >= y_limit and len(x) >= x_limit]
    x_data, y_data = zip(*data)

    # word_sequence model training
    print('fit word_sequence')
    from gensim.models import word2vec
    import gensim
    sentences = word2vec.Text8Corpus(train_file_name)  # load the corpus
    model = gensim.models.Word2Vec(sentences, size=200)  # train the model; default window=5