def train_model_w2vec(
    sentences: Iterable,
    model: Word2Vec,
    trigram_model_path: Path,
    save_model_path: Path,
    epochs: int,
):
    """Train *model* (Word2Vec) on *sentences* and save it to disk.

    The sentences are first passed through a pre-trained gensim Phraser
    (trigram model) loaded from *trigram_model_path*, so frequent
    collocations are merged into single tokens before training.

    Args:
        sentences: Iterable of token lists, one list per sentence.
        model: An initialised (not yet trained) gensim Word2Vec instance.
        trigram_model_path: Path to a saved gensim Phraser model.
        save_model_path: Destination path for the trained model.
        epochs: Number of training epochs.
    """
    # TODO: forward **kwargs to the gensim methods
    trigram = Phraser.load(str(trigram_model_path))
    # Materialise the transformed corpus into a list: build_vocab() and
    # train() each iterate over it, and a one-shot generator would be
    # exhausted after the first pass, silently training on no data.
    sentences = [trigram[sentence] for sentence in sentences]
    model.build_vocab(sentences, progress_per=10000)
    model.train(
        sentences,
        total_examples=model.corpus_count,
        epochs=epochs,
    )
    # NOTE(review): init_sims(replace=True) L2-normalises the vectors in
    # place and discards the raw weights, so the saved model cannot be
    # trained further; it is also deprecated in gensim 4.x. Kept to
    # preserve the behaviour existing consumers of the saved file expect.
    model.init_sims(replace=True)
    model.save(str(save_model_path))
# (a) Sentence (tokens) list
def build_sentence_list(dataset):
    """Return the token list of every sample in *dataset*.

    Each element of *dataset* is expected to be a mapping with a
    'tokens' key holding the tokenised sentence.
    """
    return [data['tokens'] for data in dataset]


train_sentence_list = build_sentence_list(train_data)
test_sentence_list = build_sentence_list(test_data)

# (b) Word2vec model
word2vec_dim = 200

# One-off training run, kept for reference; the saved model is reused below.
# word2vec_tr_model = Word2Vec(train_sentence_list + test_sentence_list,
#                              size=word2vec_dim, workers=2, min_count=3,
#                              iter=10, window=3)
# Word2Vec.save(word2vec_tr_model, 'savings/word2vec_model.model')

word2vec_tr_model = Word2Vec.load('savings/word2vec_model.model')

# 1533 words + <PAD>, <SOS>, <EOS>, <UNK> => 1537
num_words = len(word2vec_tr_model.wv.vocab) + 4

# Special tokens reserved in the vocabulary:
#   <PAD>: padding
#   <SOS>: start of sentence
#   <EOS>: end of sentence
#   <UNK>: unknown words
def save(self, w2v: Word2Vec):
    """Persist *w2v* inside this store's folder, using the model's own
    ``name`` attribute as the file name."""
    # `path` is presumably os.path (imported at file level) — TODO confirm.
    w2v.save(path.join(self._folder_path, w2v.name))
from gensim.models import Word2Vec
# Fixed: the original `from gensim.utils.SaveLoad` was a SyntaxError
# (missing the `import` clause). SaveLoad is the gensim base class that
# provides the .save()/.load() machinery.
from gensim.utils import SaveLoad

# Load the pre-trained model from disk.
model = Word2Vec.load("data/en.model")

# word_vectors = model.wv
# Word2Vec.save(model, replace_word_vectors_with_normalized = true)