class Word2Vector(object): def __init__(self, src_file, dst_file, size=300, window=5, min_count=10, hs=0, sg=0, learning_rate=0.025): self.src_file = src_file self.model_file = dst_file self.size = size self.window = window self.min_count = min_count self.hs = hs # 1: 分层softmax, 0: 不使用分层softmax self.sg = sg # 1: skip-gram, 0: CBOW self.alpha = learning_rate self.workers = multiprocessing.cpu_count() def train(self, sentences): self.model = FastText(sentences, size=self.size, window=self.window, min_count=self.min_count, sg=self.sg, workers=self.workers) self.model.save(self.model_file) self.model.save_word2vec_format(self.model_file + '.bin', binary=True) def train_model(self): sentences = LineSentence(self.src_file) self.train(sentences) def online_train_model(self, sentences): # 在线训练 self.model.build_vocab(LineSentence(sentences)) self.model.train(total_examples=self.model.corpus_count, epochs=self.model.iter) def online_train_model(self, file_name, isdir=True): # 在线训练 if isdir: sentences = PathLineSentences(self.src_file) else: sentences = LineSentence(self.src_file) self.online_train_model(sentences) def train_dir_model(self): sentences = PathLineSentences(self.src_file) self.train(sentences) def load_model(self, model_name): self.model = FastText.load(model_name) def show_similarity(self, word1, word2): return self.model.wv.similarity(word1, word2) def show_word_vector(self, word): return self.model.wv[word]
''' total_examples = new_wv.corpus_count new_wv.build_vocab([list(pubmed_wv.vocab.keys())], update=True) new_wv.intersect_word2vec_format(preTrainedPath, binary=True, lockf=1.0) ''' ### Train for 2 epochs new_wv.train(sentences, epochs=2) # , total_examples=total_examples print('Time to train the model 2 epochs: {} mins'.format(round((time() - t) / 60, 2))) print('----------------------------') print(new_wv.most_similar(positive=['treatment'])) print(new_wv.most_similar(positive=['female'])) print(new_wv.most_similar(positive=['history'])) print(new_wv.most_similar(positive=['disease'])) print(new_wv.most_similar(positive=['brain'])) new_wv.save_word2vec_format('mimic-pubmed_2.bin', binary=True) print('----------------------------') # Train for 10 epochs new_wv.train(sentences, epochs=8) # , total_examples=total_examples print('Time to train the model 10 epochs: {} mins'.format(round((time() - t) / 60, 2))) print('----------------------------') print(new_wv.most_similar(positive=['treatment'])) print(new_wv.most_similar(positive=['female'])) print(new_wv.most_similar(positive=['history'])) print(new_wv.most_similar(positive=['disease'])) print(new_wv.most_similar(positive=['brain'])) new_wv.save_word2vec_format('mimic-pubmed_10.bin', binary=True) print('----------------------------')