def test_sg_neg_training_fromfile(self):
    """Skip-gram + negative-sampling training from a corpus file updates vectors
    and produces plausible nearest neighbours for 'night'."""
    with temporary_file(get_tmpfile('gensim_fasttext.tst')) as corpus_file:
        model_gensim = FT_gensim(
            size=50, sg=1, cbow_mean=1, alpha=0.025, window=5, hs=0, negative=5,
            min_count=5, iter=5, batch_words=1000, word_ngrams=1, sample=1e-3,
            min_n=3, max_n=6, sorted_vocab=1, workers=1, min_alpha=0.0)

        lee_data = LineSentence(datapath('lee_background.cor'))
        utils.save_as_line_sentence(lee_data, corpus_file)

        model_gensim.build_vocab(corpus_file=corpus_file)
        before = np.copy(model_gensim.wv.vectors[0])
        model_gensim.train(
            corpus_file=corpus_file,
            total_words=model_gensim.corpus_total_words,
            epochs=model_gensim.epochs)
        # vector should vary after training
        self.assertFalse((before == model_gensim.wv.vectors[0]).all())

        # neighbours of 'night' must overlap the expected word set
        neighbour_words = [word for word, _ in model_gensim.wv.most_similar('night', topn=10)]
        expected_sims_words = [
            u'night.', u'night,', u'eight', u'overnight', u'overnight.',
            u'month', u'land', u'firm', u'singles', u'death']
        overlap = set(neighbour_words) & set(expected_sims_words)
        self.assertGreaterEqual(len(overlap), 2)
def test_training(self):
    """End-to-end training: matrix shapes, similarity queries, one-step
    construction equivalence, and in-vocab/OOV vector retrieval."""
    model = FT_gensim(size=10, min_count=1, hs=1, negative=0, seed=42, workers=1)
    model.build_vocab(sentences)
    self.model_sanity(model)

    model.train(sentences, total_examples=model.corpus_count, epochs=model.iter)
    sims = model.most_similar('graph', topn=10)

    self.assertEqual(model.wv.syn0.shape, (12, 10))
    self.assertEqual(len(model.wv.vocab), 12)
    self.assertEqual(model.wv.syn0_vocab.shape[1], 10)
    self.assertEqual(model.wv.syn0_ngrams.shape[1], 10)
    self.model_sanity(model)

    # querying for "most similar" by vector must match querying by word
    graph_vector = model.wv.syn0norm[model.wv.vocab['graph'].index]
    sims2 = model.most_similar(positive=[graph_vector], topn=11)
    sims2 = [(w, sim) for w, sim in sims2 if w != 'graph']  # ignore 'graph' itself
    self.assertEqual(sims, sims2)

    # build vocab and train in one step; must be the same as above
    model2 = FT_gensim(sentences, size=10, min_count=1, hs=1, negative=0, seed=42, workers=1)
    self.models_equal(model, model2)

    # verify vector retrieval has the embedding dimensionality
    invocab_vec = model['minors']  # invocab word
    self.assertEqual(len(invocab_vec), 10)
    oov_vec = model['minor']  # oov word
    self.assertEqual(len(oov_vec), 10)
def test_cbow_neg_training(self):
    """CBOW + negative-sampling training changes vectors and yields sane
    neighbours for 'night'."""
    model_gensim = FT_gensim(
        size=50, sg=0, cbow_mean=1, alpha=0.05, window=5, hs=0, negative=5,
        min_count=5, iter=5, batch_words=1000, word_ngrams=1, sample=1e-3,
        min_n=3, max_n=6, sorted_vocab=1, workers=1, min_alpha=0.0)

    lee_data = LineSentence(datapath('lee_background.cor'))
    model_gensim.build_vocab(lee_data)
    before = np.copy(model_gensim.wv.vectors[0])
    model_gensim.train(lee_data, total_examples=model_gensim.corpus_count,
                       epochs=model_gensim.epochs)
    # vector should vary after training
    self.assertFalse((before == model_gensim.wv.vectors[0]).all())

    neighbour_words = [word for word, _ in model_gensim.wv.most_similar('night', topn=10)]
    expected_sims_words = [
        u'night.', u'night,', u'eight', u'fight', u'month', u'hearings',
        u'Washington', u'remains', u'overnight', u'running']
    overlap = set(neighbour_words) & set(expected_sims_words)
    self.assertGreaterEqual(len(overlap), 2)
def trainVectors(corpus, implementation, dim=300, min_n=3, max_n=6, min_count=1,
                 model='skipgram', epochs=5, threads=12, window=5, lr=0.05,
                 t=1e-4, neg=5):
    """Train word embeddings on a line-based corpus file.

    :param corpus: path to a whitespace-tokenized, one-sentence-per-line file
    :param implementation: 'fasttext' (gensim FastText) or 'w2v' (Word2Vec)
    :param model: 'skipgram' or 'cbow' (fasttext branch only)
    :return: the trained gensim model
    :raises ValueError: if `implementation` is not recognised
    """
    if implementation == 'fasttext':
        ### PARSE TRAIN DATA
        train_data = LineSentence(corpus)
        ### INITIALIZE MODEL
        # BUG FIX: FT_gensim.train() does not accept model/threads/lr/t/neg —
        # those are fastText-CLI hyperparameters and must be mapped onto the
        # gensim constructor (sg/workers/alpha/sample/negative) instead.
        model_gensim = FT_gensim(
            size=dim, min_n=min_n, max_n=max_n, min_count=min_count,
            iter=epochs, window=window,
            sg=1 if model == 'skipgram' else 0,
            workers=threads, alpha=lr, sample=t, negative=neg)
        # BUILD VOCABULARY
        model_gensim.build_vocab(train_data)
        ### TRAIN THE MODEL
        model_gensim.train(train_data,
                           total_examples=model_gensim.corpus_count,
                           epochs=model_gensim.iter)
    elif implementation == 'w2v':
        ### PARSE TRAIN DATA
        train_data = LineSentence(corpus)
        ### TRAIN THE MODEL
        model_gensim = Word2Vec(size=dim, min_count=min_count, iter=epochs,
                                window=window, workers=threads)
        # BUILD VOCABULARY
        model_gensim.build_vocab(train_data)
        ### TRAIN THE MODEL
        model_gensim.train(train_data,
                           total_examples=model_gensim.corpus_count,
                           epochs=model_gensim.iter)
    else:
        # previously fell through to a NameError on the return statement
        raise ValueError("unknown implementation: %r" % (implementation,))
    return model_gensim
def test_sg_neg_training(self):
    """Skip-gram + negative-sampling training (iterable corpus) updates vectors
    and produces plausible nearest neighbours for 'night'."""
    model_gensim = FT_gensim(
        size=50, sg=1, cbow_mean=1, alpha=0.025, window=5, hs=0, negative=5,
        min_count=5, iter=5, batch_words=1000, word_ngrams=1, sample=1e-3,
        min_n=3, max_n=6, sorted_vocab=1, workers=1, min_alpha=0.0)

    lee_data = LineSentence(datapath('lee_background.cor'))
    model_gensim.build_vocab(lee_data)
    before = np.copy(model_gensim.wv.vectors[0])
    model_gensim.train(lee_data, total_examples=model_gensim.corpus_count,
                       epochs=model_gensim.epochs)
    # vector should vary after training
    self.assertFalse((before == model_gensim.wv.vectors[0]).all())

    neighbour_words = [word for word, _ in model_gensim.wv.most_similar('night', topn=10)]
    expected_sims_words = [
        u'night.', u'night,', u'eight', u'overnight', u'overnight.',
        u'month', u'land', u'firm', u'singles', u'death']
    overlap = set(neighbour_words) & set(expected_sims_words)
    self.assertGreaterEqual(len(overlap), 2)
class ModelFastText:
    """Thin convenience wrapper around gensim's FastText (FT_gensim):
    create-or-load, train, save, and similarity queries."""

    def __init__(self, path, existModel=False):
        # `path` is either a saved-model file (existModel=True) or a
        # training-corpus path (existModel=False).
        if existModel:
            self.loadModel(path)
        else:
            self.createModel(path)

    def createModel(self, pathTrain, size=300, min_count=50, sg=1, workers=8,
                    progress_per=50000):
        """Instantiate a fresh model and build its vocabulary from a corpus file.

        BUG FIX: the keyword parameters were previously ignored — hard-coded
        literals (size=300, min_count=50, ...) were used instead. Forward them.
        """
        self.model = FT_gensim(size=size, min_count=min_count, sg=sg, workers=workers)
        sentences = datapath(pathTrain)
        self.model.build_vocab(corpus_file=sentences, progress_per=progress_per)

    def loadModel(self, path):
        """Load a previously saved model from disk."""
        self.model = FT_gensim.load(path)

    def trainModel(self, pathTrain, epochs=5, compute_loss=False):
        """Train the model on the given corpus.

        BUG FIX: `epochs` and `compute_loss` were previously ignored in favour
        of hard-coded `epochs=5, compute_loss=False`.
        """
        sentences = datapath(pathTrain)
        self.model.train(sentences, epochs=epochs,
                         total_examples=self.model.corpus_count,
                         compute_loss=compute_loss)

    def saveModel(self, nameFile):
        """Persist the model to `<nameFile>.model`."""
        self.model.save(nameFile + ".model")

    def getSimilar(self, word):
        """Return the 50 nearest neighbours of `word`."""
        return self.model.wv.most_similar(word, topn=50)
def build_fast_text_model(fasttext_entity_path):
    """Train a fastText model over the entity corpus and persist it.

    :param fasttext_entity_path: destination path for the saved model
    :return: the trained FastText model
    """
    # build fastText
    fasttext_params = {
        "hs": 1,
        "window": 10,
        "min_count": 1,
        "workers": 7,
        "min_n": 1,
        "max_n": 10,
    }
    print("building corpus")
    entity_corpus = list(entity_generator(entity_collection))

    fasttext_entity = FastText(**fasttext_params)
    print("count corpus")
    fasttext_entity.build_vocab(sentences=entity_corpus)

    print("train fasttext")
    fasttext_entity.train(sentences=entity_corpus,
                          total_examples=fasttext_entity.corpus_count,
                          epochs=5)

    print("saving fasttext")
    fasttext_entity.save(fasttext_entity_path)
    return fasttext_entity
def test_training(self):
    """Train on the fixture corpus and check shapes, similarity symmetry,
    one-shot construction equivalence, and OOV vector retrieval."""
    model = FT_gensim(size=10, min_count=1, hs=1, negative=0, seed=42, workers=1)
    model.build_vocab(sentences)
    self.model_sanity(model)

    model.train(sentences, total_examples=model.corpus_count, epochs=model.iter)
    sims_by_word = model.most_similar('graph', topn=10)

    self.assertEqual(model.wv.syn0.shape, (12, 10))
    self.assertEqual(len(model.wv.vocab), 12)
    self.assertEqual(model.wv.syn0_vocab.shape[1], 10)
    self.assertEqual(model.wv.syn0_ngrams.shape[1], 10)
    self.model_sanity(model)

    # test querying for "most similar" by vector
    graph_vector = model.wv.syn0norm[model.wv.vocab['graph'].index]
    sims_by_vector = model.most_similar(positive=[graph_vector], topn=11)
    sims_by_vector = [(w, s) for w, s in sims_by_vector if w != 'graph']  # ignore 'graph' itself
    self.assertEqual(sims_by_word, sims_by_vector)

    # build vocab and train in one step; must be the same as above
    model2 = FT_gensim(sentences, size=10, min_count=1, hs=1, negative=0, seed=42, workers=1)
    self.models_equal(model, model2)

    # verify oov-word vector retrieval
    invocab_vec = model['minors']  # invocab word
    self.assertEqual(len(invocab_vec), 10)
    oov_vec = model['minor']  # oov word
    self.assertEqual(len(oov_vec), 10)
def test_training_fromfile(self):
    """Same end-to-end training checks as test_training, but the corpus is
    consumed via the corpus_file code path."""
    with temporary_file(get_tmpfile('gensim_fasttext.tst')) as corpus_file:
        utils.save_as_line_sentence(sentences, corpus_file)

        model = FT_gensim(size=10, min_count=1, hs=1, negative=0, seed=42, workers=1)
        model.build_vocab(corpus_file=corpus_file)
        self.model_sanity(model)

        model.train(corpus_file=corpus_file,
                    total_words=model.corpus_total_words, epochs=model.iter)
        sims_by_word = model.most_similar('graph', topn=10)

        self.assertEqual(model.wv.syn0.shape, (12, 10))
        self.assertEqual(len(model.wv.vocab), 12)
        self.assertEqual(model.wv.syn0_vocab.shape[1], 10)
        self.assertEqual(model.wv.syn0_ngrams.shape[1], 10)
        self.model_sanity(model)

        # test querying for "most similar" by vector
        graph_vector = model.wv.syn0norm[model.wv.vocab['graph'].index]
        sims_by_vector = model.most_similar(positive=[graph_vector], topn=11)
        sims_by_vector = [(w, s) for w, s in sims_by_vector if w != 'graph']  # ignore 'graph' itself
        self.assertEqual(sims_by_word, sims_by_vector)

        # verify oov-word vector retrieval
        invocab_vec = model['minors']  # invocab word
        self.assertEqual(len(invocab_vec), 10)
        oov_vec = model['minor']  # oov word
        self.assertEqual(len(oov_vec), 10)
def test_get_vocab_word_vecs(self):
    """get_vocab_word_vecs must leave freshly-initialised vocab vectors unchanged."""
    model = FT_gensim(size=10, min_count=1, seed=42)
    model.build_vocab(sentences)
    snapshot = np.copy(model.wv.syn0_vocab)
    model.trainables.get_vocab_word_vecs(model.wv)
    self.assertTrue(np.all(np.equal(model.wv.syn0_vocab, snapshot)))
def test_online_learning(self):
    """Vocabulary can be expanded in place with build_vocab(update=True)."""
    model_hs = FT_gensim(sentences, size=10, min_count=1, seed=42, hs=1, negative=0)
    # BUG FIX: assertTrue(x, y) treats y as the failure *message* and only
    # checks that x is truthy — no comparison was being made. assertEqual
    # performs the intended check (the expected values match the
    # assertEqual(..., 14) already used below after the update).
    self.assertEqual(len(model_hs.wv.vocab), 12)
    self.assertEqual(model_hs.wv.vocab['graph'].count, 3)

    model_hs.build_vocab(new_sentences, update=True)  # update vocab
    self.assertEqual(len(model_hs.wv.vocab), 14)
    self.assertEqual(model_hs.wv.vocab['graph'].count, 4)
    self.assertEqual(model_hs.wv.vocab['artificial'].count, 4)
def test_online_learning(self):
    """Vocabulary can be expanded in place with build_vocab(update=True)."""
    model_hs = FT_gensim(sentences, size=10, min_count=1, seed=42, hs=1, negative=0)
    # BUG FIX: assertTrue(x, y) treats y as the failure *message* and only
    # checks truthiness of x; assertEqual performs the intended comparison.
    self.assertEqual(len(model_hs.wv.vocab), 12)
    self.assertEqual(model_hs.wv.vocab['graph'].count, 3)

    model_hs.build_vocab(new_sentences, update=True)  # update vocab
    self.assertEqual(len(model_hs.wv.vocab), 14)
    self.assertEqual(model_hs.wv.vocab['graph'].count, 4)
    self.assertEqual(model_hs.wv.vocab['artificial'].count, 4)
def train_weights(iSentencesList, **kwargs):
    """Build and train a FastText model on the given tokenized sentences.

    :param iSentencesList: iterable of token lists
    :param kwargs: forwarded verbatim to the FastText constructor
    :return: the trained FastText model
    """
    # BUG FIX: **kwargs is always a dict (possibly empty), never None, so the
    # old `if kwargs is not None` test was always true. FastText(**{}) is
    # identical to FastText(), so the branch is unnecessary altogether.
    model = FastText(**kwargs)
    model.build_vocab(iSentencesList)
    model.train(iSentencesList, total_examples=model.corpus_count, epochs=model.epochs)
    print("Custom model trained.")
    return model
def test_online_learning_after_save(self):
    """A saved and reloaded model can still grow its vocab and retrain."""
    tmpf = get_tmpfile('gensim_fasttext.tst')
    model_neg = FT_gensim(sentences, size=10, min_count=0, seed=42, hs=0, negative=5)
    model_neg.save(tmpf)
    model_neg = FT_gensim.load(tmpf)
    # BUG FIX: assertTrue(x, y) treats y as the failure *message* and only
    # checks truthiness of x; assertEqual performs the intended comparison
    # (matching the assertEqual(..., 14) below after the update).
    self.assertEqual(len(model_neg.wv.vocab), 12)
    model_neg.build_vocab(new_sentences, update=True)  # update vocab
    model_neg.train(new_sentences, total_examples=model_neg.corpus_count, epochs=model_neg.iter)
    self.assertEqual(len(model_neg.wv.vocab), 14)
def test_online_learning_after_save(self):
    """A saved and reloaded model can still grow its vocab and retrain."""
    tmpf = get_tmpfile('gensim_fasttext.tst')
    model_neg = FT_gensim(sentences, size=10, min_count=0, seed=42, hs=0, negative=5)
    model_neg.save(tmpf)
    model_neg = FT_gensim.load(tmpf)
    # BUG FIX: assertTrue(x, y) only checks that x is truthy (y is the msg);
    # assertEqual performs the intended comparison.
    self.assertEqual(len(model_neg.wv.vocab), 12)
    model_neg.build_vocab(new_sentences, update=True)  # update vocab
    model_neg.train(new_sentences, total_examples=model_neg.corpus_count, epochs=model_neg.iter)
    self.assertEqual(len(model_neg.wv.vocab), 14)
def trainFastTextModel(vectorSize, trainingModel):
    """Train a fastText model over the Data/starGEO.txt corpus file.

    :param vectorSize: embedding dimensionality
    :param trainingModel: 'skipgram' or 'cbow' architecture selector
    :return: the trained FT_gensim model
    """
    # BUG FIX: FT_gensim.train() has no `model` parameter (that is a fastText
    # CLI option); the architecture is selected via `sg` at construction time.
    model = FT_gensim(size=vectorSize, sg=1 if trainingModel == 'skipgram' else 0)
    model.build_vocab(corpus_file='Data/starGEO.txt')
    model.train(corpus_file='Data/starGEO.txt',
                epochs=model.epochs,
                total_examples=model.corpus_count,
                total_words=model.corpus_total_words)
    return model
def fasttext_train_model():
    """Read the corpus file, train a FastText model, and save it to model_dir."""
    # read corpus
    with open(corpus_dir) as fp:
        # BUG FIX: FastText expects tokenized sentences (lists of tokens).
        # Feeding raw line strings makes gensim iterate each string
        # character-by-character, training on single characters.
        corpus = [line.split() for line in fp]
    model = FastText(size=embed_size, window=3, min_count=1)  # instantiate
    model.build_vocab(sentences=corpus)
    model.train(sentences=corpus, total_examples=len(corpus), epochs=epoch)  # train
    model.save(model_dir)  # save model
def test_online_learning_after_save(self):
    """A reloaded model grows its vocab and ngram set after an online update."""
    model_neg = FT_gensim(sentences, size=10, min_count=0, seed=42, hs=0, negative=5)
    model_neg.save(testfile())
    model_neg = FT_gensim.load(testfile())
    # BUG FIX: assertTrue(x, y) treats y as the failure *message* and only
    # checks truthiness of x; assertEqual performs the intended comparison.
    self.assertEqual(len(model_neg.wv.vocab), 12)
    self.assertEqual(len(model_neg.wv.ngrams), 202)
    model_neg.build_vocab(new_sentences, update=True)  # update vocab
    model_neg.train(new_sentences, total_examples=model_neg.corpus_count, epochs=model_neg.iter)
    self.assertEqual(len(model_neg.wv.vocab), 14)
    self.assertEqual(len(model_neg.wv.ngrams), 271)
def fasttext_embedding(source, method, emb_dim):
    """Train a skip-gram fastText model over sentences streamed from `source`.

    :param source: handed to the Sentences iterator (streamed twice: once for
        the vocab scan, once for training)
    :param method: unused here — NOTE(review): kept for interface
        compatibility with callers; confirm before removing
    :param emb_dim: embedding dimensionality
    :return: the trained FT_gensim model
    """
    model = FT_gensim(
        size=emb_dim, window=10, sg=1, min_count=5,
        workers=multiprocessing.cpu_count(), negative=10)
    # build the vocabulary, then train
    model.build_vocab(sentences=Sentences(source))
    model.train(
        sentences=Sentences(source),
        epochs=15,
        total_examples=model.corpus_count,
        total_words=model.corpus_total_words)
    return model
def create_model_from_corpus(corpus_file: str) -> FastText:
    """Reads the corpus file and trains a FastText model on it.

    :param corpus_file: path to a whitespace-tokenized, line-per-sentence corpus
    :return: the trained FastText model
    """
    # BUG FIX: min_n/max_n are constructor parameters that size the character
    # n-gram buckets; FastText.train() does not accept them.
    model = FastText(vector_size=VECTOR_SIZE, min_n=MIN_N, max_n=MAX_N)
    model.build_vocab(corpus_file=corpus_file)
    model.train(corpus_file=corpus_file,
                epochs=EPOCHS,
                total_examples=model.corpus_count,
                total_words=model.corpus_total_words)
    return model
def test_estimate_memory(self):
    """estimate_memory reports the expected per-component byte counts."""
    model = FT_gensim(sg=1, hs=1, size=10, negative=5, min_count=3)
    model.build_vocab(sentences)
    report = model.estimate_memory()
    expected = {
        'vocab': 2800,
        'syn0_vocab': 160,
        'syn1': 160,
        'syn1neg': 160,
        'syn0_ngrams': 2240,
        'buckets_word': 640,
        'total': 6160,
    }
    for component, size_bytes in expected.items():
        self.assertEqual(report[component], size_bytes)
def train_model(data_path, size_embeddings, epochs=64):
    """Train a fastText model on a corpus file resolved via gensim's datapath.

    :param data_path: corpus file name, resolved with datapath()
    :param size_embeddings: embedding dimensionality
    :param epochs: number of training passes
    :return: the trained FT_gensim model
    """
    corpus_file = datapath(data_path)
    ft_model = FT_gensim(size=size_embeddings, workers=4)
    # build the vocabulary, then train
    ft_model.build_vocab(corpus_file=corpus_file)
    ft_model.train(corpus_file=corpus_file,
                   epochs=epochs,
                   total_examples=ft_model.corpus_count,
                   total_words=ft_model.corpus_total_words)
    return ft_model
def test_estimate_memory(self):
    """estimate_memory reports the expected byte count for each component."""
    model = FT_gensim(sg=1, hs=1, size=10, negative=5, min_count=3)
    model.build_vocab(sentences)
    report = model.estimate_memory()
    self.assertEqual(report['vocab'], 2800)
    self.assertEqual(report['syn0_vocab'], 160)
    self.assertEqual(report['syn1'], 160)
    self.assertEqual(report['syn1neg'], 160)
    self.assertEqual(report['syn0_ngrams'], 2240)
    self.assertEqual(report['buckets_word'], 640)
    self.assertEqual(report['total'], 6160)
def create_FastText_model(skip_gram, tokenized_sentences, model_path):
    """Load a FastText model from `model_path`, or train and save a new one.

    :param skip_gram: 1 for skip-gram, 0 for CBOW
    :param tokenized_sentences: list of token lists to train on
    :param model_path: where the model is loaded from / saved to
    :return: the loaded or freshly trained FastText model
    """
    try:
        model = FastText.load(model_path)
    except Exception:  # BUG FIX: bare `except:` also swallowed SystemExit/KeyboardInterrupt
        # BUG FIX: `vector_size` is a constructor parameter; FastText.train()
        # does not accept it, so it now configures the model at creation.
        model = FastText(min_count=1, window=5, sg=skip_gram, vector_size=5)
        model.build_vocab(sentences=tokenized_sentences)
        model.train(sentences=tokenized_sentences,
                    total_examples=len(tokenized_sentences),
                    epochs=10)
        model.save(model_path)
    return model
def test_online_learning(self):
    """Online vocab update adds both new words and their character ngrams."""
    model_hs = FT_gensim(sentences, size=10, min_count=1, seed=42, hs=1, negative=0)
    # BUG FIX: assertTrue(x, y) treats y as the failure *message* and only
    # checks truthiness of x; assertEqual performs the intended comparison.
    self.assertEqual(len(model_hs.wv.vocab), 12)
    self.assertEqual(len(model_hs.wv.ngrams), 202)
    self.assertEqual(model_hs.wv.vocab['graph'].count, 3)
    self.assertFalse('tif' in model_hs.wv.ngrams)
    model_hs.build_vocab(new_sentences, update=True)  # update vocab
    self.assertEqual(len(model_hs.wv.vocab), 14)
    self.assertEqual(len(model_hs.wv.ngrams), 271)
    self.assertEqual(model_hs.wv.vocab['graph'].count, 4)
    self.assertEqual(model_hs.wv.vocab['artificial'].count, 4)
    self.assertTrue('tif' in model_hs.wv.ngrams)  # ngram added because of the word `artificial`
def test_sg_hs_against_wrapper(self):
    """Gensim's skip-gram/HS implementation should track the fastText wrapper
    trained with matching hyperparameters."""
    if self.ft_path is None:
        logger.info("FT_HOME env variable not set, skipping test")
        return

    tmpf = get_tmpfile('gensim_fasttext.tst')
    model_wrapper = FT_wrapper.train(
        ft_path=self.ft_path, corpus_file=datapath('lee_background.cor'),
        output_file=tmpf, model='skipgram', size=50, alpha=0.025, window=5,
        min_count=5, word_ngrams=1, loss='hs', sample=1e-3, negative=0,
        iter=5, min_n=3, max_n=6, sorted_vocab=1, threads=12)

    model_gensim = FT_gensim(
        size=50, sg=1, cbow_mean=1, alpha=0.025, window=5, hs=1, negative=0,
        min_count=5, iter=5, batch_words=1000, word_ngrams=1, sample=1e-3,
        min_n=3, max_n=6, sorted_vocab=1, workers=1, min_alpha=0.0)

    lee_data = LineSentence(datapath('lee_background.cor'))
    model_gensim.build_vocab(lee_data)
    before = np.copy(model_gensim.wv.syn0[0])
    model_gensim.train(lee_data, total_examples=model_gensim.corpus_count,
                       epochs=model_gensim.iter)
    # vector should vary after training
    self.assertFalse((before == model_gensim.wv.syn0[0]).all())

    self.compare_with_wrapper(model_gensim, model_wrapper)
def _fastText(self, medical_texts):
    """Train a fastText model on tokenized medical texts and save its vectors.

    :param medical_texts: list of token lists
    """
    print('fastText')
    # BUG FIX: passing `sentences` to the constructor already builds the vocab
    # and fully trains the model; the subsequent build_vocab(update=True) +
    # train() then trained it a second time. Build and train exactly once.
    model = FastText(size=150, min_count=2, window=5)
    model.build_vocab(sentences=medical_texts)
    model.train(sentences=medical_texts, total_examples=len(medical_texts), epochs=7)
    model.wv.save(
        os.path.join(abspath, "../vectors/fastText/medical.fasttext.model"))
def init_model(path):
    """Build and train a skip-gram fastText model over the documents in `path`.

    :param path: corpus location (consumed by iter_doc for the vocab scan and
        read as a corpus_file during training)
    :return: the trained FT model
    """
    # BUG FIX: FT.train() has no `model` parameter ('skipgram' is a fastText
    # CLI option); skip-gram is selected with sg=1 on the constructor.
    model = FT(size=150, window=5, min_count=3, workers=4, sg=1)
    # NOTE(review): the vocab is scanned from iter_doc(path) while training
    # reads the raw corpus_file — these should tokenize identically; confirm
    # that iter_doc matches the corpus file's whitespace tokenization.
    model.build_vocab(sentences=iter_doc(path))
    model.train(corpus_file=path,
                epochs=model.epochs,
                total_examples=model.corpus_count,
                total_words=model.corpus_total_words)
    show_vocab_size(model)
    return model
def fastText_embedding(daten):
    """Get embedding for training, dev, test daten.

    :param daten: path handed to fileread()
    :return: the trained FastText model (vectors also saved to
        'daten_embedding.model' in word2vec format)
    """
    token_lists = fileread(daten)
    model = FastText(size=4, window=3, min_count=1)
    model.build_vocab(sentences=token_lists)
    # train
    model.train(sentences=token_lists, total_examples=len(token_lists), epochs=10)
    model.wv.save_word2vec_format('daten_embedding.model')
    return model
def test_online_learning_fromfile(self):
    """Vocab can be expanded via build_vocab(corpus_file=..., update=True)."""
    with temporary_file(get_tmpfile('gensim_fasttext1.tst')) as corpus_file, \
            temporary_file(get_tmpfile('gensim_fasttext2.tst')) as new_corpus_file:
        utils.save_as_line_sentence(sentences, corpus_file)
        utils.save_as_line_sentence(new_sentences, new_corpus_file)

        model_hs = FT_gensim(corpus_file=corpus_file, size=10, min_count=1, seed=42, hs=1, negative=0)
        # BUG FIX: assertTrue(x, y) treats y as the failure *message* and only
        # checks truthiness of x; assertEqual performs the intended comparison.
        self.assertEqual(len(model_hs.wv.vocab), 12)
        self.assertEqual(model_hs.wv.vocab['graph'].count, 3)

        model_hs.build_vocab(corpus_file=new_corpus_file, update=True)  # update vocab
        self.assertEqual(len(model_hs.wv.vocab), 14)
        self.assertEqual(model_hs.wv.vocab['graph'].count, 4)
        self.assertEqual(model_hs.wv.vocab['artificial'].count, 4)
def create_fastText(file):
    """Train fastText on a corpus file and print the ten most frequent words.

    :param file: path to a whitespace-tokenized, line-per-sentence corpus
    :return: the trained FastText model
    """
    fast_model = FastText(size=300, window=6, min_count=10, workers=8,
                          negative=5, iter=10)
    fast_model.build_vocab(corpus_file=file)
    fast_model.train(corpus_file=file,
                     epochs=fast_model.epochs,
                     callbacks=[callback_log()],
                     total_examples=fast_model.corpus_count,
                     total_words=fast_model.corpus_total_words)

    # Verify vocabulary size and first ten words (index2word is frequency-sorted)
    for index, word in enumerate(fast_model.wv.index2word):
        if index == 10:
            break
        print(f"word #{index}/{len(fast_model.wv.index2word)} is {word}")

    return fast_model
def main():
    """Build, train, and save a fastText model over sentences from MyIter()."""
    print('Instantiating the model')
    model = FT_gensim(size=100, window=5, min_count=5)  # instantiate the model

    print('Building the vocabulary')
    model.build_vocab(sentences=MyIter())
    total_examples = model.corpus_count

    print('Training the model')
    model.train(sentences=MyIter(), total_examples=total_examples, epochs=5)  # train the model

    # Save the model (can be loaded using gensim); destination comes from argv
    print('Saving the model to specified filepath')
    save_file = sys.argv[2]
    model.save(save_file)
def generate_model(lang):
    """Train a 300-dim fastText model from 'embedding/corpus_<lang>' and save it
    to 'embedding/fasttext_<lang>.vec'."""
    corpus_path = 'embedding/corpus_' + lang
    ft_model = FT_gensim(size=300)
    # build the vocabulary, then train
    ft_model.build_vocab(corpus_file=corpus_path)
    ft_model.train(corpus_file=corpus_path,
                   epochs=ft_model.epochs,
                   total_examples=ft_model.corpus_count,
                   total_words=ft_model.corpus_total_words)
    ft_model.save('embedding/fasttext_' + lang + '.vec')
def test_online_learning_after_save_fromfile(self):
    """Save/load a model, then expand its vocab and retrain from a new corpus file."""
    with temporary_file(get_tmpfile('gensim_fasttext1.tst')) as corpus_file, \
            temporary_file(get_tmpfile('gensim_fasttext2.tst')) as new_corpus_file:
        utils.save_as_line_sentence(sentences, corpus_file)
        utils.save_as_line_sentence(new_sentences, new_corpus_file)

        tmpf = get_tmpfile('gensim_fasttext.tst')
        model_neg = FT_gensim(corpus_file=corpus_file, size=10, min_count=0, seed=42, hs=0, negative=5)
        model_neg.save(tmpf)
        model_neg = FT_gensim.load(tmpf)
        # BUG FIX: assertTrue(x, y) treats y as the failure *message* and only
        # checks truthiness of x; assertEqual performs the intended comparison.
        self.assertEqual(len(model_neg.wv.vocab), 12)

        model_neg.build_vocab(corpus_file=new_corpus_file, update=True)  # update vocab
        model_neg.train(corpus_file=new_corpus_file,
                        total_words=model_neg.corpus_total_words,
                        epochs=model_neg.iter)
        self.assertEqual(len(model_neg.wv.vocab), 14)
def get_ft_model(documents, settings):
    """Train a fastText model configured by a settings dict.

    :param documents: iterable of token lists
    :param settings: dict with 'min_count', 'size', 'window', 'sg',
        'negative' and 'iter' hyperparameters
    :return: the trained FT_gensim model
    """
    ft_model = FT_gensim(
        min_count=settings['min_count'],
        size=settings['size'],
        window=settings['window'],
        workers=40,
        sg=settings['sg'],
        negative=settings['negative'],
        iter=settings['iter'])
    ft_model.build_vocab(documents)
    ft_model.train(documents, total_examples=ft_model.corpus_count, epochs=ft_model.iter)
    return ft_model
def createFastTextModel(data, isSG, vectorSize, maxNgram, modelFilePath):
    """Train a fastText model on `data` and save it to `modelFilePath`.

    :param data: iterable of token lists
    :param isSG: 1 for skip-gram, 0 for CBOW
    :param vectorSize: embedding dimensionality
    :param maxNgram: maximum character-ngram length (minimum is fixed at 1)
    :param modelFilePath: save destination
    """
    ft_model = FT_gensim(sg=isSG, size=vectorSize, min_count=1,
                         min_n=1, max_n=maxNgram)
    # build the vocabulary, train, and persist
    ft_model.build_vocab(data)
    ft_model.train(data, total_examples=ft_model.corpus_count, epochs=ft_model.iter)
    ft_model.save(modelFilePath)
def BuildFastText(reviews, label):
    """Train a FastText model on tokenized reviews and return one averaged
    vector per review.

    :param reviews: list of token lists (already tokenized sentences)
    :param label: per-review sentiment labels (0 = positive keyword fallback,
        1 = negative keyword fallback), aligned with `reviews`
    :return: numpy array with one mean token vector per review
    """
    model = FastText(size=200, window=3, min_count=4)
    model.build_vocab(sentences=reviews)
    model.train(sentences=reviews, total_examples=len(reviews), epochs=50)
    model.save("Model/fasttext.bin")
    print("Save model done!")

    failed_count = 0
    dataf = list()
    for idx, review in enumerate(reviews):
        try:
            # mean of the per-token vectors for this review
            vectors = model.wv[review]
            mean_vec = vectors.sum(axis=0) / len(review)
        except (KeyError, ZeroDivisionError):
            # OOV-only or empty review: fall back to a sentiment keyword vector.
            # BUG FIX: the original indexed `label[a]` but never incremented
            # `a` (the increment was commented out), so every fallback used
            # label[0]; enumerate() indexes the correct review's label.
            failed_count += 1
            mean_vec = 0  # preserved fallback when the label is neither 0 nor 1
            if label[idx] == 0:
                mean_vec = model.wv["positive"]
            if label[idx] == 1:
                mean_vec = model.wv["negative"]
        dataf.append(mean_vec)

    return np.array(dataf)
def test_sg_hs_against_wrapper(self):
    """Gensim's skip-gram/HS model should track the fastText CLI wrapper
    trained with matching hyperparameters."""
    if self.ft_path is None:
        logger.info("FT_HOME env variable not set, skipping test")
        return

    model_wrapper = FT_wrapper.train(
        ft_path=self.ft_path, corpus_file=datapath('lee_background.cor'),
        output_file=testfile(), model='skipgram', size=50, alpha=0.025,
        window=5, min_count=5, word_ngrams=1, loss='hs', sample=1e-3,
        negative=0, iter=5, min_n=3, max_n=6, sorted_vocab=1, threads=12)

    model_gensim = FT_gensim(
        size=50, sg=1, cbow_mean=1, alpha=0.025, window=5, hs=1, negative=0,
        min_count=5, iter=5, batch_words=1000, word_ngrams=1, sample=1e-3,
        min_n=3, max_n=6, sorted_vocab=1, workers=1, min_alpha=0.0)

    lee_data = LineSentence(datapath('lee_background.cor'))
    model_gensim.build_vocab(lee_data)
    before = np.copy(model_gensim.wv.syn0[0])
    model_gensim.train(lee_data, total_examples=model_gensim.corpus_count,
                       epochs=model_gensim.iter)
    # vector should vary after training
    self.assertFalse((before == model_gensim.wv.syn0[0]).all())

    self.compare_with_wrapper(model_gensim, model_wrapper)
def test_get_vocab_word_vecs(self):
    """Calling get_vocab_word_vecs on a freshly built model must not alter
    the vocab-word vector matrix."""
    model = FT_gensim(size=10, min_count=1, seed=42)
    model.build_vocab(sentences)
    before = np.copy(model.wv.syn0_vocab)
    model.trainables.get_vocab_word_vecs(model.wv)
    self.assertTrue(np.all(np.equal(model.wv.syn0_vocab, before)))
def test_bucket_ngrams(self):
    """The ngram matrix keeps its bucket-limited shape across vocab updates."""
    model = FT_gensim(size=10, min_count=1, bucket=20)
    expected_shape = (20, 10)  # bucket count x embedding size
    model.build_vocab(sentences)
    self.assertEqual(model.wv.syn0_ngrams.shape, expected_shape)
    model.build_vocab(new_sentences, update=True)
    self.assertEqual(model.wv.syn0_ngrams.shape, expected_shape)