Exemplo n.º 1
0
def train_model_w2vec(
    sentences: Iterable,
    model: Word2Vec,
    trigram_model_path: Path,
    save_model_path: Path,
    epochs: int,
):
    """Train ``model`` on phrase-transformed sentences and save it to disk.

    Args:
        sentences: Iterable of sentences fed through the loaded phrase model.
            NOTE(review): the transformed corpus is passed to both
            ``build_vocab`` and ``train``, so it presumably has to be
            re-iterable (not a one-shot generator) -- confirm with callers.
        model: Word2Vec instance whose vocabulary is built and which is
            trained in place.
        trigram_model_path: Location of the saved ``Phraser`` to load.
        save_model_path: Destination passed to ``model.save``.
        epochs: Number of training epochs passed to ``model.train``.
    """
    # TODO: Add kwargs so they can be forwarded to the gensim methods.

    phraser = Phraser.load(str(trigram_model_path))
    phrased_sentences = phraser[sentences]

    model.build_vocab(phrased_sentences, progress_per=10000)
    model.train(
        phrased_sentences,
        total_examples=model.corpus_count,
        epochs=epochs,
    )

    # NOTE(review): per the gensim docs, replace=True keeps only the
    # normalized vectors, so the saved model presumably cannot be trained
    # further -- confirm this is intended.
    model.init_sims(replace=True)
    model.save(str(save_model_path))
# (a) Sentence (tokens) list

def build_sentence_list(dataset):
    """Return a list holding the ``'tokens'`` entry of every item in ``dataset``.

    Items are visited in iteration order and their ``'tokens'`` values are
    stored as-is (no copy is made).
    """
    return [entry['tokens'] for entry in dataset]

# Collect the token lists of both datasets; train_data and test_data are
# expected to be defined before this point (not shown in this snippet).
train_sentence_list = build_sentence_list(train_data)
test_sentence_list = build_sentence_list(test_data)
    

# (b) Word2vec model
# Vector dimensionality; used as `size` by the disabled training code below.
word2vec_dim = 200
# The string literal below is a disabled one-off training run: it fitted
# Word2Vec on the train + test sentences and saved the result to the same
# path that is loaded right after it.
'''
word2vec_tr_model = Word2Vec(train_sentence_list + test_sentence_list, 
                             size = word2vec_dim, workers = 2, 
                             min_count = 3, iter = 10, window = 3)

Word2Vec.save(word2vec_tr_model, 'savings/word2vec_model.model') 
'''
# Load the previously saved model instead of retraining.
word2vec_tr_model = Word2Vec.load('savings/word2vec_model.model') 

# Vocabulary size of the loaded model plus the four special tokens.
num_words = len(word2vec_tr_model.wv.vocab) + 4 # 1533 words +  <PAD>, <SOS>, <EOS>, <UNK> => 1537

'''
<PAD>: padding
<SOS>: start of sentence
<EOS>: end of sentence
<UNK>: unknown words
Exemplo n.º 3
0
 def save(self, w2v: Word2Vec):
     """Save ``w2v`` to a file named ``w2v.name`` inside ``self._folder_path``."""
     target = path.join(self._folder_path, w2v.name)
     w2v.save(target)
Exemplo n.º 4
0
from gensim.models import Word2Vec
from gensim.utils import SaveLoad

# Load a previously trained model from disk.
model = Word2Vec.load("data/en.model")

# Replace the word vectors with their normalized versions, then persist the
# model. `Word2Vec.save` needs a target path and has no
# `replace_word_vectors_with_normalized` option, so the normalization is done
# explicitly with `init_sims(replace=True)` before saving.
# NOTE(review): the original call gave no output path; this writes to a
# separate file so the source model is not overwritten -- confirm the
# intended destination.
model.init_sims(replace=True)
model.save("data/en.normalized.model")