def _train_model(self, model: Word2Vec, texts):
    """Extend ``model``'s vocabulary with ``texts`` and train on them.

    The texts are fitted on a fresh ``Tokenizer``, encoded to integer
    sequences and decoded back to strings, then split on single spaces to
    obtain one token list per text. The model's ``min_count`` is set to 2
    before its vocabulary is updated (``update=True``), and the model is then
    trained on the token lists for ``model.epochs`` epochs.

    Args:
        model: Word2Vec model to update; it is mutated in place.
        texts: Collection of raw text strings.
            NOTE(review): it is passed to the tokenizer twice, so it
            presumably has to be re-iterable -- confirm against callers.

    Returns:
        None.
    """
    text_tokenizer = Tokenizer()
    text_tokenizer.fit_on_texts(texts)

    # Encode/decode round trip; presumably used to get the tokenizer's
    # normalized form of each text -- TODO confirm.
    encoded = text_tokenizer.texts_to_sequences(texts)
    decoded = text_tokenizer.sequences_to_texts(encoded)
    token_lists = [line.split(" ") for line in decoded]

    print("Adding to word2vec vocabulary...")
    model.min_count = 2
    model.build_vocab(token_lists, update=True)

    print("Training word2vec ...")
    model.train(
        token_lists,
        total_examples=len(token_lists),
        epochs=model.epochs,
    )
def train_model_w2vec(
    sentences: Iterable,
    model: Word2Vec,
    trigram_model_path: Path,
    save_model_path: Path,
    epochs: int,
):
    """Train a Word2Vec model on a phrase-transformed corpus and save it.

    Args:
        sentences: Corpus of tokenized sentences.
            NOTE(review): the transformed corpus is handed to both
            ``build_vocab`` and ``train``, so this presumably must be
            re-iterable rather than a one-shot generator -- confirm.
        model: Word2Vec model to train; it is mutated in place.
        trigram_model_path: Location of the saved ``Phraser`` model that is
            applied to ``sentences`` before training.
        save_model_path: Destination where the trained model is saved.
        epochs: Number of training epochs passed to ``model.train``.

    Returns:
        None.
    """
    # TODO: Add kwargs to forward to the gensim methods.
    phraser = Phraser.load(str(trigram_model_path))
    phrased_corpus = phraser[sentences]

    model.build_vocab(phrased_corpus, progress_per=10000)
    model.train(
        phrased_corpus,
        total_examples=model.corpus_count,
        epochs=epochs,
    )

    # NOTE(review): init_sims(replace=True) is believed to normalize the
    # vectors in place and is deprecated in newer gensim releases -- confirm
    # against the pinned gensim version before changing.
    model.init_sims(replace=True)
    model.save(str(save_model_path))
def train(model: Word2Vec, word_lists, num_epoches: int = None):
    """Build ``model``'s vocabulary from ``word_lists`` and train on it.

    Args:
        model: Word2Vec model to train; it is mutated in place.
        word_lists: Corpus of tokenized sentences. It is iterated once by
            ``build_vocab`` and again by ``train``, so it must be
            re-iterable; it does not need to support ``len()``.
        num_epoches: Number of training epochs. When ``None`` (the default),
            the model's own ``epochs`` setting is used.

    Returns:
        The same ``model`` instance, after training.
    """
    # Forwarding ``epochs=None`` to ``Word2Vec.train`` is rejected by gensim,
    # so fall back to the epoch count configured on the model.
    if num_epoches is None:
        num_epoches = model.epochs

    model.build_vocab(word_lists, progress_per=10000)

    # ``corpus_count`` is populated by ``build_vocab``; using it instead of
    # ``len(word_lists)`` also supports corpora without ``__len__``.
    model.train(
        word_lists,
        total_examples=model.corpus_count,
        epochs=num_epoches,
    )
    return model