# Initialize the fastText model (32-dimensional vectors).
model = FT_gensim(size=32)

# Build the vocabulary from the corpus file.
model.build_vocab(corpus_file=corpus_file)

# Train the model; corpus_count / corpus_total_words were collected
# during the build_vocab scan above.
model.epochs = 15
model.train(
    corpus_file=corpus_file,
    epochs=model.epochs,
    total_examples=model.corpus_count,
    total_words=model.corpus_total_words,
)
print(model)

# Save the model trained via Gensim's fastText implementation.
# separately=[] forces all large arrays into the single save file.
model.save(save_file, separately=[])

# Run some basic sanity checks on the learned vocabulary.
print("job" in model.wv.vocab)
print("salary" in model.wv.vocab)
print("learn" in model.wv.vocab)

# Print a vector representation.
# model["job"] is deprecated; model.wv["job"] is the supported accessor
# and returns the same vector.
print(model.wv["job"])

# Test pairwise similarity.
# model.similarity is a deprecated alias of model.wv.similarity; use the
# wv accessor directly (same values, no DeprecationWarning).
print(model.wv.similarity("job", "salary"))
print(model.wv.similarity("job", "learn"))
print(model.wv.similarity("job", "the"))
###############################################################################
# Similarity operations work the same way as word2vec.  **Out-of-vocabulary
# words can also be used, provided they have at least one character ngram
# present in the training data.**
#
print("nights" in model.wv.vocab)

###############################################################################
#
print("night" in model.wv.vocab)

###############################################################################
#
print(model.similarity("night", "nights"))

###############################################################################
# Syntactically similar words generally have high similarity in fastText
# models, since a large number of the component char-ngrams will be the same.
# As a result, fastText generally does better at syntactic tasks than
# Word2Vec.  A detailed comparison is provided
# `here <Word2Vec_FastText_Comparison.ipynb>`_.
#

###############################################################################
# Other similarity operations
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^
#
# The example training corpus is a toy corpus; results are not expected to be
# good, for proof-of-concept only.
print(model.most_similar("nights"))
# In[ ]:
from gensim.models.fasttext import FastText

# Train a skip-gram (sg=1) fastText model on the tokenized corpus for
# 10 iterations.  Hyperparameters (embedding_size, window_size, min_word,
# down_sampling) are defined earlier in the file.
ft_model = FastText(
    train_data,
    size=embedding_size,
    window=window_size,
    min_count=min_word,
    sample=down_sampling,
    sg=1,
    iter=10,
)

# In[ ]:
# Top-5 nearest neighbours for a few probe words.
semantically_similar_words = {
    word: [item[0] for item in ft_model.wv.most_similar([word], topn=5)]
    for word in ['kitchen', 'death', 'king', 'queen', 'strong', 'weak', 'woman', 'man']
}
for k, v in semantically_similar_words.items():
    print(k + ":" + str(v))

# In[ ]:
# ft_model.similarity is a deprecated alias that delegates to
# ft_model.wv.similarity and returns the same value; use the wv accessor
# directly, consistent with the wv.most_similar call above.
ft_model.wv.similarity("annabeth", "percy")

# In[ ]:
# Export the word vectors in word2vec text format.
ft_model.wv.save_word2vec_format('FTvectors')