def get_or_create_cut_word(input_path, output_path):
    """
    Get (or create) the word-segmented file and return it as sentences.
    :param input_path: path to the raw text file
    :param output_path: path to the segmented text file
    :return: word2vec.PathLineSentences over output_path
    """
    if not os.path.exists(output_path):
        # segment the raw text line by line and write it out before reading it back
        with open(input_path, 'r') as input_data, open(output_path, 'w') as output_data:
            for line in input_data:
                output_data.write(segment_depart(line) + '\n')
    return word2vec.PathLineSentences(output_path)
def testPathLineSentencesOneFile(self):
    """Does PathLineSentences work with a single file argument?"""
    test_file = os.path.join(datapath('PathLineSentences'), '1.txt')
    with utils.smart_open(test_file) as orig:
        sentences = word2vec.PathLineSentences(test_file)
        for words in sentences:
            self.assertEqual(words, utils.to_unicode(orig.readline()).split())
def train_THUCNews(segment_dir, out_word2vec_path):
    sentences = word2vec.PathLineSentences(segment_dir)
    model = train_wordVectors(sentences, embedding_size=128, window=5, min_count=5)
    save_wordVectors(model, out_word2vec_path)
def extract_sentences(self):
    """Extract sentences from the data set for the Word2Vec model.

    See https://radimrehurek.com/gensim/models/word2vec.html#gensim.models.word2vec.Word2Vec for detail.

    :return: sentences as a list of lists of words.
    """
    pickle_path = os.path.join(self.root, self.pickled_folder)
    pickle_file = 'sentences.pickle'
    if self.test_mode:
        try:
            os.remove(os.path.join(pickle_path, pickle_file))
            os.rmdir(pickle_path)
        except FileNotFoundError:
            pass
    try:
        with open(os.path.join(pickle_path, pickle_file), 'rb') as f:
            print("Sentences will be loaded from pickled file: " + pickle_file)
            return pickle.load(f)
    except FileNotFoundError:
        print("Cannot find pickled file to load sentences.")
    except Exception as error:
        raise error
    print("Extracting...")
    sentences = []
    for mode in ['train', 'test']:
        for classification in ['pos', 'neg', 'unsup']:
            if mode == 'test' and classification == 'unsup':
                # There is no test/unsup in our data.
                continue
            path = os.path.join(self.root, mode, classification)
            # sentences would be 12,500 review data sentences list.
            test_index = 0
            for sentence in word2vec.PathLineSentences(path):
                test_index += 1
                if self.test_mode and test_index > TEST_DATA_SIZE:
                    break
                alphabetic_words = list(map(lambda x: to_alphabetic(x), sentence))
                words = list(filter(lambda x: len(x) != 0, alphabetic_words))
                sentences += words
    # Sentences look like [[review.split()], [...], ...].
    sentences = [sentences]
    try:
        os.mkdir(pickle_path)
    except FileExistsError:
        # 'processed' folder already exists.
        pass
    with open(os.path.join(pickle_path, pickle_file), 'wb') as f:
        pickle.dump(sentences, f, pickle.HIGHEST_PROTOCOL)
    print("Done.")
    return sentences
def train_w2v():
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)
    # 1. format: specifies what each log record contains; useful fields include:
    # %(asctime)s: time the record was emitted
    # %(levelname)s: log level name
    # %(message)s: log message
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    # emit an informational log record
    logger.info("running %s" % ' '.join(sys.argv))

    # [1] With a single file, use LineSentence to read it:
    # segment_path = './data/segment/segment_0.txt'
    # sentences = word2vec.LineSentence(segment_path)

    # [2] With multiple files, use PathLineSentences to read the whole directory:
    segment_dir = './train_data'
    sentences = word2vec.PathLineSentences(segment_dir)

    # For ordinary training, setting the following few parameters is enough:
    word2vec_path = './models/train/word2vec.model'
    model = train_wordVectors(sentences, embedding_size=256, window=5, min_count=5)
    print(model.alpha)
    save_wordVectors(model, word2vec_path)
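The commented-out lines above contrast LineSentence (a single file) with PathLineSentences (a whole directory). As a quick hedged check of what PathLineSentences actually picks up, reusing the .input_files attribute that appears in later snippets (the directory is the snippet's own './train_data'):

from gensim.models import word2vec

sentences = word2vec.PathLineSentences('./train_data')
print(sentences.input_files)   # files found under the directory (sorted by filename)
for words in sentences:        # each item is one line of one file, already split on whitespace
    print(words)
    break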
def train(self, corpus_path, size=100, min_count=1, window=5, iter=20,
          out_path='./model/word2vec/word2vec.model'):
    logging.basicConfig(format="%(asctime)s : %(levelname)s : %(message)s", level=logging.INFO)
    try:
        sentences = word2vec.PathLineSentences(corpus_path)
    except Exception as e:
        print(e)
        return
    if self.__model is None:
        self.__model = word2vec.Word2Vec(sentences, size=size, min_count=min_count,
                                         window=window, iter=iter)
    else:
        # incremental training: extend the vocabulary, then continue training
        self.__model.build_vocab(sentences, update=True)
        self.__model.train(sentences, total_examples=self.__model.corpus_count,
                           epochs=self.__model.iter)
    self.__vocab_index = self.__model.wv.index2word
    self.__model.save(out_path)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--corpus-dir", "-i",
        default="/app/workspace/data",
        help="Location of pre-training text files.",
    )
    parser.add_argument('--output', '-o')
    parser.add_argument('--dimension', '-d', type=int, default=256)
    parser.add_argument('--window', '-w', type=int, default=16)
    parser.add_argument('--min-count', type=int, default=10)
    parser.add_argument('--max-vocab-size', type=int, default=30000)
    parser.add_argument('--max-sentence-length', type=int, default=30000)
    parser.add_argument('--workers', type=int, default=-1)
    parser.add_argument('--sg', type=int, default=1)
    args = parser.parse_args()

    outputpath = args.output
    mc = multiprocessing.cpu_count() // 2
    workers = mc if args.workers == -1 else args.workers
    sentences = word2vec.PathLineSentences(args.corpus_dir,
                                           max_sentence_length=args.max_sentence_length)
    model = word2vec.Word2Vec(sentences, size=args.dimension, window=args.window,
                              min_count=args.min_count, max_vocab_size=args.max_vocab_size,
                              workers=workers, sg=args.sg)
    # not saving temporary data
    model.delete_temporary_training_data()
    model.save(outputpath)
    model.wv.save_word2vec_format(f'{outputpath}.txt')
def pre_train(segmented_dir):
    sys.path.append('..')
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))
    model = word2vec.Word2Vec(word2vec.PathLineSentences(segmented_dir),
                              size=300, min_count=2, workers=8, iter=10)
    # write a plain-text dictionary: one word followed by its vector per line
    with open(os.path.join(segmented_dir, 'w2v_dic.data'), 'w', encoding='utf-8') as f:
        for word in model.wv.vocab:
            f.write(word + ' ')
            f.write(' '.join(list(map(str, model.wv[word]))))
            f.write('\n')
    model.wv.save_word2vec_format(os.path.join(segmented_dir, 'w2v_model.bin'), binary=True)
def w2v_train(segment_dir='./data/segment/oil.txt',
              word2vec_path='./models/w2v/oil.model'):
    sentences = word2vec.PathLineSentences(segment_dir)
    model2 = train_wordVectors(sentences, embedding_size=300, window=5, min_count=1)
    save_wordVectors(model2, word2vec_path)
def testPathLineSentences(self):
    """Does PathLineSentences work with a path argument?"""
    with utils.smart_open(os.path.join(datapath('PathLineSentences'), '1.txt')) as orig1, \
            utils.smart_open(os.path.join(datapath('PathLineSentences'), '2.txt.bz2')) as orig2:
        sentences = word2vec.PathLineSentences(datapath('PathLineSentences'))
        orig = orig1.readlines() + orig2.readlines()
        orig_counter = 0  # to go through orig while matching PathLineSentences
        for words in sentences:
            self.assertEqual(words, utils.to_unicode(orig[orig_counter]).split())
            orig_counter += 1
def word2vec(self):
    sentences = word2vec.PathLineSentences("nlp/cut_words.txt")
    model = Word2Vec(sentences, size=20, window=5, min_count=1, workers=4)
    model.save("nlp/word2vec.model")
    model = Word2Vec.load("nlp/word2vec.model")
    # a = model.train([["吸收公众存款", "吸收公众存款"]], total_examples=1, epochs=1)
    vector = model.wv['新材料']
    a = model.wv.similar_by_vector(vector)
    print(a)
    print(vector)
def w2v_training(seg_corpus_dir, embedding_size):
    w2v_model_file = 'w2v_embed_' + str(embedding_size) + '.model'
    w2v_vector_file = 'w2v_embed_' + str(embedding_size) + '.txt'
    sentences = word2vec.PathLineSentences(seg_corpus_dir)
    workers = multiprocessing.cpu_count()
    # basic settings for w2v
    w2v_model = word2vec.Word2Vec(sentences=sentences, size=embedding_size, window=5,
                                  min_count=5, workers=workers, sg=1, hs=0, negative=10,
                                  ns_exponent=0.75, iter=10, sorted_vocab=1)
    w2v_model.save(config.params_dir + w2v_model_file)
    w2v_model.wv.save_word2vec_format(config.params_dir + w2v_vector_file, binary=False)
    logging.info('Word2Vec training is done and data are saved..')
def word2vec_vectorizer(rst1, rst2, embedding_size=1024, in_window=20, in_min_count=5):
    sentences = word2vec.PathLineSentences('./segwords')
    w2vModel = word2vec.Word2Vec(sentences, sg=1, size=embedding_size,
                                 window=in_window, min_count=in_min_count)
    return w2vModel
def w2v_train(self):
    # train the word2vec model
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    sentences = word2vec.PathLineSentences('./warrior-data-wakati')
    model = word2vec.Word2Vec(sentences, sg=1, size=100, min_count=1, window=10, hs=1, negative=0)
    model.save(self.modelpath)
def __init__(self, file_name, dataset, vocab, vec_dim, epoch):
    super(FastEmbedding, self).__init__(file_name, dataset, vocab, vec_dim, epoch)
    file_list = word2vec.PathLineSentences(self.file_name).input_files
    res = []
    for file_name in file_list:
        with open(file_name, 'r') as f:
            res.append(f.read())
    if not os.path.isdir('../tmp_res'):
        os.mkdir('../tmp_res')
    with open('../tmp_res/tmp_file', 'w') as f:
        f.writelines(res)
def __init__(self, file_name, dataset, vocab, vec_dim, epoch):
    super(Doc2VecEmbedding, self).__init__(file_name, dataset, vocab, vec_dim, epoch)
    sentences = word2vec.PathLineSentences(self.file_name)
    docLabels = sentences.input_files
    data = []
    for doc in docLabels:
        try:
            with open(doc) as f:
                doc_data = f.read()
            data.append(doc_data)
        except Exception:
            # skip files that cannot be read
            pass
    self.it = LabeledLineSentence(data, docLabels)
def train(self, **kwargs):
    arg_string = '_'.join(key + '=' + str(value) for key, value in kwargs.items())
    print(self.algorithm + '_' + arg_string)
    if self.algorithm == 'word2vec':
        sentences = word2vec.PathLineSentences(self.save_dir.joinpath('line_sentences'))
        self.model = word2vec.Word2Vec(sentences, **kwargs)
    if self.algorithm == 'doc2vec':
        self.model = doc2vec.Doc2Vec(self, **kwargs)
    savepath = self.save_dir.joinpath(self.algorithm + '_' + arg_string)
    self.model.save(str(savepath))
    return self.model
def train():
    logging.basicConfig(format="%(asctime)s:%(levelname)s:%(message)s", level=logging.INFO)
    log = 'Loading corpus...\n'
    sentences = word2vec.PathLineSentences(common.CORPUS_PATH)
    log = log + ' \n'.join(sentences.input_files)
    log = log + '\nStart training..., word2vec.Word2Vec(sentences, min_count=1)\n'
    model = word2vec.Word2Vec(sentences, min_count=1)
    # save the model
    if not os.path.exists(common.MODEL_PATH):
        os.mkdir(common.MODEL_PATH)
    model.save(common.MODEL_PATH + '/' + common.MODEL_FILE)
    log = log + 'Training finished..., model saved as ' + common.MODEL_FILE
    return log
def word_vec(path):
    logging.basicConfig(format="%(asctime)s:%(levelname)s:%(message)s", level=logging.INFO)
    # sentences = word2vec.LineSentence(r"D:\pycharm_project\knowledge_rule\datas\wiki\AA\wiki_corpus")
    sentences = word2vec.PathLineSentences(path)
    model = word2vec.Word2Vec(sentences, size=200, window=5, min_count=5,
                              workers=multiprocessing.cpu_count())
    # save the model
    model.save("../model/20200928/corpus00.model")
    # save the word vectors
    model.wv.save_word2vec_format("../model/20200928/corpus00.vector", binary=False)
def doc2vec(self):
    sentences = word2vec.PathLineSentences("nlp/cut_words.txt")
    documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(sentences)]
    # for i, doc in enumerate(sentences):
    #     ddd = TaggedDocument(doc, [i])
    #     print(ddd)
    model = Doc2Vec(documents, vector_size=20, window=2, min_count=1, workers=4)
    model.save("nlp/doc2vec.model")
    model = Doc2Vec.load("nlp/doc2vec.model")
    vector = model.infer_vector(["电器开关零部件及附件制造"])
    model.similar_by_vector(vector)
def build_word2vec():
    tuples = [
        (train_path, 'train.txt'),
        (test_path, 'test.txt'),
        (dev_path, 'dev.txt'),
    ]
    for (path, name) in tuples:
        transform_only_sentences(path, name)
    sentences = word2vec.PathLineSentences(sentences_path)
    model = word2vec.Word2Vec(sentences, size=embedding_size, hs=1, min_count=5)
    print(len(model.wv.vocab))
    model.wv.add(padding_letter, np.zeros(model.wv.vector_size))
    print(len(model.wv.vocab))
    model.wv.save_word2vec_format(word2vec_path)
    return model.wv
def generate_embedding(self, model_type):
    sentences = word2vec.PathLineSentences(self.file_name)
    # Train on the corpus to generate the co-occurrence matrix which is used in GloVe
    corpus = Corpus()  # Creating a corpus object
    corpus.fit(sentences, window=self.window)
    # Training GloVe model
    glove = Glove(no_components=self.vec_dim, learning_rate=self.learning_rate)
    glove.fit(corpus.matrix, epochs=self.epoch, no_threads=self.no_threads, verbose=self.verbose)
    glove.add_dictionary(corpus.dictionary)
    return trans_vocab(glove.dictionary, glove.word_vectors)
def embedding_train(self):
    print("Word embedding training start")
    starttime = datetime.datetime.now()
    # read every file in the folder
    sent = word2vec.PathLineSentences(self.save_path)
    # the concrete parameters are set in self.parms
    model = word2vec.Word2Vec(sentences=tqdm(sent), **self.parms)
    endtime = datetime.datetime.now()
    print('seconds:', (endtime - starttime).seconds)
    # save the model --- reload later with word2vec.Word2Vec.load("\\name.model")
    model.save(self.embedding_path + '\\' + "word_embedding1.model")
    model.wv.save_word2vec_format(self.embedding_path + '\\' + "word_embedding1.txt", binary=False)
    # reload bin/txt files with gensim.models.KeyedVectors.load_word2vec_format('/ .txt/bin', binary=False)
    model.wv.save_word2vec_format(self.embedding_path + '\\' + "word_embedding1.bin", binary=True)
    print("## Word embedding training finished ## the word_embedding and model are stored in " + self.embedding_path)
    return model
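The comments above note that the saved artifacts can be reloaded with word2vec.Word2Vec.load and gensim.models.KeyedVectors.load_word2vec_format; a minimal sketch follows, assuming the same file names as in the snippet and treating embedding_path and the query word as placeholders.

from gensim.models import word2vec
from gensim.models import KeyedVectors

embedding_path = r'.\embeddings'  # placeholder standing in for self.embedding_path above
# reload the full model (can be trained further)
model = word2vec.Word2Vec.load(embedding_path + '\\' + "word_embedding1.model")
# reload the exported text vectors as query-only KeyedVectors
vectors = KeyedVectors.load_word2vec_format(embedding_path + '\\' + "word_embedding1.txt", binary=False)
print(vectors.most_similar('词语', topn=5))  # '词语' is a placeholder query word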
def train_embeddings(self):
    sys.path.append('..')
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    self.logger.info("running %s" % ' '.join(sys.argv))
    model = word2vec.Word2Vec(word2vec.PathLineSentences(self.segmented_dir),
                              size=300, min_count=2, workers=8, iter=15)
    w2v_dict = {}
    for word in model.wv.vocab:
        w2v_dict[word] = model.wv[word]
    with open(os.path.join(self.prepared_dir, 'w2v_dic.pkl'), 'wb') as f:
        pkl.dump(w2v_dict, f)
    model.wv.save_word2vec_format(os.path.join(self.prepared_dir, 'w2v_model.bin'), binary=True)
def pre_train(brc_data, segmented_dir):
    # parser = argparse.ArgumentParser('Reading Comprehension on BaiduRC dataset')
    # path_settings = parser.add_argument_group('path settings')
    # path_settings.add_argument('--train_files', nargs='+',
    #                            default=['../data/trainset/search.train.json'],
    #                            help='list of files that contain the preprocessed train data')
    # path_settings.add_argument('--dev_files', nargs='+',
    #                            default=['../data/devset/search.dev.json'],
    #                            help='list of files that contain the preprocessed dev data')
    # path_settings.add_argument('--test_files', nargs='+',
    #                            default=['../data/testset/search.test.json'],
    #                            help='list of files that contain the preprocessed test data')
    # path_settings.add_argument('--segmented_dir', default='../data/segmented',
    #                            help='the dir to store segmented sentences')
    sys.path.append('..')
    # args = parser.parse_args()
    # for files in args.train_files + args.dev_files + args.test_files:
    #     json_to_sentence.load_data(files, args.segmented_dir)
    load_data(brc_data, segmented_dir)
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))
    model = word2vec.Word2Vec(word2vec.PathLineSentences(segmented_dir),
                              size=300, min_count=2, workers=8, iter=10)
    with open(os.path.join(segmented_dir, 'w2v_dic.data'), 'w', encoding='utf-8') as f:
        for word in model.wv.vocab:
            f.write(word + ' ')
            f.write(' '.join(list(map(str, model.wv[word]))))
            f.write('\n')
def Word2vec_train(file_path, save_path, dir_path=None, save_name='word2vec_model',
                   replace_old=False, model_size=300, model_window=10,
                   model_min_count=5, **kw):
    """
    Batch-train word2vec.
    Usage: for multiple files, set dir_path and save_name and pass
    file_path=None, save_path=None; a single file uses file_path and save_path.
    """
    from gensim.models import word2vec
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    # https://radimrehurek.com/gensim/models/word2vec.html
    if file_path is not None:
        # single file
        sentences = word2vec.LineSentence(file_path)
        model = word2vec.Word2Vec(sentences, size=model_size, window=model_window,
                                  min_count=model_min_count, **kw)
        # save the model for later use
        model.save(save_path)
    if dir_path is not None and file_path is None:
        # multiple files
        sentences = word2vec.PathLineSentences(dir_path)
        model = word2vec.Word2Vec(sentences, size=model_size, window=model_window,
                                  min_count=model_min_count, **kw)
        # save the model for later use
        model.save(os.path.join(dir_path, save_name))
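The docstring above describes two call modes for Word2vec_train; a hedged usage sketch with placeholder corpus and output paths:

# single file: takes the LineSentence branch
Word2vec_train(file_path='./data/segment/corpus.txt', save_path='./models/word2vec_single.model')
# directory of files: takes the PathLineSentences branch (file_path and save_path stay None)
Word2vec_train(file_path=None, save_path=None, dir_path='./data/segment/', save_name='word2vec_model')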
def pre_train(segmented_dir, embed_size):
    """
    Train word vectors on the segmented training corpus
    (or possibly on the full corpus plus the Baidu Zhidao data set??).
    :param segmented_dir:
    :param embed_size:
    :return:
    """
    sys.path.append('..')
    # save the segmentation result of the raw data
    # save_seg_data(brc_data, segmented_dir)
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))
    # The corpus here has been preprocessed into several files; PathLineSentences handles
    # multiple large files and is memory-friendly.
    # If the corpus is a single large file, use LineSentence(file) instead; it is equally memory-friendly.
    # default embed_size=300
    model = word2vec.Word2Vec(word2vec.PathLineSentences(segmented_dir),
                              size=embed_size, min_count=2, workers=12, iter=10)
    # save the full model (to a separate file, so the plain-text dictionary written
    # below does not overwrite it)
    model.save(os.path.join(segmented_dir, 'w2v_model.data'))
    with open(os.path.join(segmented_dir, 'w2v_dic.data'), 'w', encoding='utf-8') as f:
        for word in model.wv.vocab:
            f.write(word + ' ')
            f.write(' '.join(list(map(str, model.wv[word]))))
            f.write('\n')
def train_test():
    # [1] With a single file, use LineSentence to read it:
    # segment_path = './data/segment/segment_0.txt'
    # sentences = word2vec.LineSentence(segment_path)

    # [2] With multiple files, use PathLineSentences to read the whole directory:
    segment_dir = './data/segment'
    sentences = word2vec.PathLineSentences(segment_dir)

    # simple training
    model = word2vec.Word2Vec(sentences, hs=1, min_count=1, window=3, size=100)
    print(model.wv.similarity('沙瑞金', '高育良'))
    # print(model.wv.similarity('李达康'.encode('utf-8'), '王大路'.encode('utf-8')))

    # For ordinary training, setting the following few parameters is enough:
    word2vec_path = './models/word2Vec.model'
    model2 = train_wordVectors(sentences, embedding_size=128, window=5, min_count=5)
    save_wordVectors(model2, word2vec_path)
    model2 = load_wordVectors(word2vec_path)
    print(model2.wv.similarity('沙瑞金', '高育良'))
from gensim.models import word2vec

segment_folder = 'word2vec/three_kingdoms/segment'
sentences = word2vec.PathLineSentences(segment_folder)
model = word2vec.Word2Vec(sentences, size=100, window=3, min_count=3)
# model.wv.save_word2vec_format('file1.txt', binary=False)
# model.wv.similarity('刘备', '关羽')
print(model.wv.most_similar(positive=['曹操']))
print(model.wv.most_similar(positive=['曹操', '刘备'], negative=['张飞']))
# [('孙权', 0.986218273639679), ('荆州', 0.9801917672157288), ('夫人', 0.9764574766159058), ('周瑜', 0.9756923913955688), ('今反', 0.9745445847511292), ('孔明', 0.9739490747451782), ('已', 0.9734069108963013), ('拜', 0.9730291366577148), ('拜谢', 0.9727320671081543), ('袁绍', 0.9722797870635986)]
# [('今', 0.9847639799118042), ('臣', 0.9846991300582886), ('吾', 0.9833989143371582), ('主公', 0.9833654165267944), ('丞相', 0.9818264842033386), ('某', 0.9800719022750854), ('问', 0.9799109697341919), ('此', 0.9775131940841675), ('告', 0.9753938317298889), ('卿', 0.9734485149383545)]
from gensim.models import word2vec

word_file = './three_kingdoms/segment/seg_threekingdoms.txt'
sentences = word2vec.PathLineSentences(word_file)
model1 = word2vec.Word2Vec(sentences, size=128, window=3, min_count=2)
print(model1.wv.most_similar('曹操'))
print(model1.wv.most_similar(positive=['曹操', '刘备'], negative=['张飞']))
model1.save('./models/word2Vec_threekingdim.model')
'''
[('孙权', 0.9883049726486206), ('先主', 0.9877791404724121), ('回报', 0.9873332977294922), ('夫人', 0.9860264658927917), ('关公', 0.9857215881347656), ('孔明', 0.9843080043792725), ('荆州', 0.983728289604187), ('周瑜', 0.9833334684371948), ('往', 0.9825193285942078), ('又', 0.9818975329399109)]
[('丞相', 0.9887984395027161), ('臣', 0.9875719547271729), ('某', 0.9866517782211304), ('此', 0.9865485429763794), ('大叫', 0.9859899282455444), ('皆曰', 0.9858393669128418), ('朕', 0.9830409288406372), ('书略', 0.9822883605957031), ('乃曰', 0.9815787076950073), ('既', 0.9811386466026306)]
'''
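As a short follow-up, a sketch of reloading the model saved above and repeating the queries; it assumes the same save path ('./models/word2Vec_threekingdim.model') used in the snippet.

from gensim.models import word2vec

model1 = word2vec.Word2Vec.load('./models/word2Vec_threekingdim.model')
print(model1.wv.most_similar('曹操'))
print(model1.wv.similarity('曹操', '刘备'))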