def test_training(self):
    """Train FastText with hierarchical softmax and check the resulting model.

    Covers: vector/vocabulary shapes after training, agreement between a
    "most similar" query by word and by that word's normalised vector,
    equivalence of two-step (build_vocab + train) and one-step construction,
    and vector retrieval for both in-vocabulary and out-of-vocabulary words.

    All lookups go through ``model.wv`` and ``model.epochs``; the model-level
    shims (``model.iter``, ``model.most_similar``, ``model[...]``) and the
    ``syn0*`` aliases are deprecated.
    NOTE(review): assumes a gensim 3.x release that already exposes
    ``epochs`` and the ``vectors*`` names (>= 3.3) -- confirm.
    """
    model = FT_gensim(size=10, min_count=1, hs=1, negative=0, seed=42, workers=1)
    model.build_vocab(sentences)
    self.model_sanity(model)

    model.train(sentences, total_examples=model.corpus_count, epochs=model.epochs)
    # Queried before the shape checks on purpose: the by-vector query below
    # reads the normalised vectors, which this call is relied on to populate.
    sims = model.wv.most_similar('graph', topn=10)

    # 12 vocabulary words, each with a 10-dimensional vector (size=10).
    self.assertEqual(model.wv.vectors.shape, (12, 10))
    self.assertEqual(len(model.wv.vocab), 12)
    self.assertEqual(model.wv.vectors_vocab.shape[1], 10)
    self.assertEqual(model.wv.vectors_ngrams.shape[1], 10)
    self.model_sanity(model)

    # test querying for "most similar" by vector
    graph_vector = model.wv.vectors_norm[model.wv.vocab['graph'].index]
    # Ask for one extra neighbour because 'graph' itself is among the results.
    sims2 = model.wv.most_similar(positive=[graph_vector], topn=11)
    sims2 = [(w, sim) for w, sim in sims2 if w != 'graph']  # ignore 'graph' itself
    self.assertEqual(sims, sims2)

    # build vocab and train in one step; must be the same as above
    model2 = FT_gensim(sentences, size=10, min_count=1, hs=1, negative=0, seed=42, workers=1)
    self.models_equal(model, model2)

    # verify oov-word vector retrieval
    invocab_vec = model.wv['minors']  # invocab word
    self.assertEqual(len(invocab_vec), 10)

    oov_vec = model.wv['minor']  # oov word
    self.assertEqual(len(oov_vec), 10)
def test_training_fromfile(self):
    """Train FastText from a line-sentence corpus file and check the model.

    The shared ``sentences`` fixture is written to a temporary file and the
    model is built and trained through the ``corpus_file`` code path. The
    checks mirror the in-memory training test: vector/vocabulary shapes,
    agreement of "most similar" by word and by normalised vector, and vector
    retrieval for in-vocabulary and out-of-vocabulary words.

    All lookups go through ``model.wv`` and ``model.epochs``; the model-level
    shims (``model.iter``, ``model.most_similar``, ``model[...]``) and the
    ``syn0*`` aliases are deprecated.
    """
    with temporary_file(get_tmpfile('gensim_fasttext.tst')) as corpus_file:
        utils.save_as_line_sentence(sentences, corpus_file)

        model = FT_gensim(size=10, min_count=1, hs=1, negative=0, seed=42, workers=1)
        model.build_vocab(corpus_file=corpus_file)
        self.model_sanity(model)

        # The corpus_file path is sized by word count rather than example count.
        model.train(corpus_file=corpus_file, total_words=model.corpus_total_words, epochs=model.epochs)
        # Queried before the shape checks on purpose: the by-vector query
        # below reads the normalised vectors, which this call is relied on
        # to populate.
        sims = model.wv.most_similar('graph', topn=10)

        # 12 vocabulary words, each with a 10-dimensional vector (size=10).
        self.assertEqual(model.wv.vectors.shape, (12, 10))
        self.assertEqual(len(model.wv.vocab), 12)
        self.assertEqual(model.wv.vectors_vocab.shape[1], 10)
        self.assertEqual(model.wv.vectors_ngrams.shape[1], 10)
        self.model_sanity(model)

        # test querying for "most similar" by vector
        graph_vector = model.wv.vectors_norm[model.wv.vocab['graph'].index]
        # Ask for one extra neighbour because 'graph' itself is among the results.
        sims2 = model.wv.most_similar(positive=[graph_vector], topn=11)
        sims2 = [(w, sim) for w, sim in sims2 if w != 'graph']  # ignore 'graph' itself
        self.assertEqual(sims, sims2)

        # verify oov-word vector retrieval
        invocab_vec = model.wv['minors']  # invocab word
        self.assertEqual(len(invocab_vec), 10)

        oov_vec = model.wv['minor']  # oov word
        self.assertEqual(len(oov_vec), 10)
#
print(model.wv.similarity("night", "nights"))

###############################################################################
#
# Syntactically similar words generally have high similarity in fastText
# models, since a large number of the component char-ngrams will be the same.
# As a result, fastText generally does better at syntactic tasks than Word2Vec.
# A detailed comparison is provided
# `here <Word2Vec_FastText_Comparison.ipynb>`_.
#

###############################################################################
#
# Other similarity operations
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^
#
# The example training corpus is a toy corpus, so the results are not expected
# to be good; the queries below are for proof-of-concept only.
print(model.wv.most_similar("nights"))

###############################################################################
#
print(model.wv.n_similarity(['sushi', 'shop'], ['japanese', 'restaurant']))

###############################################################################
#
print(model.wv.doesnt_match("breakfast cereal dinner lunch".split()))

###############################################################################
#
print(model.wv.most_similar(positive=['baghdad', 'england'], negative=['london']))

###############################################################################
#
# Hyper-parameter sweep: for every `hs` setting in `activations`, train a
# FastText model and (when hs == 0) plot a 2-D t-SNE projection of the 30
# nearest neighbours of each query word, once per `negative` value.
for activation in activations:
    if activation == 0:
        # With hs == 0 the `negative` parameter is passed as well, so one model
        # is trained per candidate value in `negatives`.
        for negat in negatives:
            ftxt = FastText(
                min_count=min_cnt, size=300, window=3, workers=3,
                sg=architecture, hs=activation, negative=negat, seed=0,
            )
            ftxt.build_vocab(sentences=sent)
            ftxt.train(sentences=sent, total_examples=len(sent), epochs=30)

            keys = ['sval', 'oko', 'srdce']
            # Query through `ftxt.wv`: the model-level `most_similar` and
            # `ftxt[word]` lookups are deprecated delegates to the same calls.
            neighbours = [ftxt.wv.most_similar(word, topn=30) for word in keys]
            word_clusters = [[similar for similar, _ in hits] for hits in neighbours]
            embedding_clusters = np.array(
                [[ftxt.wv[similar] for similar, _ in hits] for hits in neighbours]
            )

            # t-SNE takes a 2-D matrix: flatten (n, m, k) to (n * m, k), project
            # to two components, then restore the per-key grouping as (n, m, 2).
            n, m, k = embedding_clusters.shape
            tsne_model_en_2d = TSNE(
                perplexity=15, n_components=2, init='pca', n_iter=3500, random_state=32,
            )
            embeddings_en_2d = np.array(
                tsne_model_en_2d.fit_transform(embedding_clusters.reshape(n * m, k))
            ).reshape(n, m, 2)

            # The last argument encodes the hyper-parameters of this run and
            # ends in '.png' -- presumably the output file name; verify against
            # tsne_plot_similar_words.
            tsne_plot_similar_words(
                'Sval oko srdce', keys, embeddings_en_2d, word_clusters, 0.7,
                f'Sval oko srdce FastText OPTIMED NZIP min_count = {min_cnt}'
                f' , size = 300, workers = 3, window = 3, sg = {architecture}'
                f', hs = {activation}, negative = {negat}, seed = 0.png',
            )
    else:
        # NOTE(review): in the visible code this branch only constructs the
        # model (no `negative` argument, no training or plotting) -- confirm
        # whether the rest of the branch continues elsewhere.
        ftxt = FastText(
            min_count=min_cnt, size=300, window=3, workers=3,
            sg=architecture, hs=activation, seed=0,
        )
## Note: Not necessary. Uses sentences.csv from the word2vec model

# %% Train Fasttext
from gensim.models.fasttext import FastText as FT_gensim
from gensim.models.word2vec import LineSentence

# Stream the corpus from disk, one sentence per line.
# NOTE(review): `dataPath` is defined elsewhere and is concatenated directly,
# so it presumably ends with a path separator -- confirm.
sentences = LineSentence(dataPath + "sentences.csv")

model_gensim = FT_gensim(size=300)

# build the vocabulary
model_gensim.build_vocab(sentences)

# train the model
model_gensim.train(
    sentences,
    total_examples=model_gensim.corpus_count,
    epochs=model_gensim.epochs,
)

print(model_gensim)

# %% save model
model_gensim.save(dataPath + "fasttext.model")

# %% sanity check
# All queries go through `model_gensim.wv` (the model-level `most_similar` is
# a deprecated delegate), and every result is printed so that none of them is
# silently discarded when the cell is run as a script.
print(model_gensim.wv.most_similar("cattle"))
print(model_gensim.wv.most_similar(["super", "market"]))
print(model_gensim.wv.most_similar(["pharma"]))
print(model_gensim.wv.doesnt_match("pharma medical pharmaceutic cattle".split()))
print(model_gensim.wv.doesnt_match("pharma medical pharmaceutic".split()))
# Train a gensim FastText model on the Lee background corpus, then report the
# five words most similar to "rest" together with the elapsed time.
#
# The builtin `print` is used on purpose: aliasing `pprint` to `print` made the
# status messages come out as quoted reprs (with a literal "\n") and showed the
# model's repr instead of its str().
import time

from gensim.models.fasttext import FastText as FT_gensim
from gensim.test.utils import datapath

# Path of the training corpus, resolved through gensim's test-data helper
corpus_file = datapath('lee_background.cor')

model = FT_gensim(size=100)

# Monotonic clock: the measured interval cannot be skewed by system clock changes.
start = time.perf_counter()

# build the vocabulary
model.build_vocab(corpus_file=corpus_file)

# train the model
model.train(
    corpus_file=corpus_file,
    epochs=model.epochs,
    total_examples=model.corpus_count,
    total_words=model.corpus_total_words,
)

print(model)
print("Model trained")

# Run the query before reading the clock so that the reported time covers
# vocabulary building, training and the similarity lookup.
similar = model.wv.most_similar("rest", topn=5)
elapsed = time.perf_counter() - start
print(f"{similar}\nTook {elapsed}")