Example #1
    def test_training(self):
        model = FT_gensim(size=10, min_count=1, hs=1, negative=0, seed=42, workers=1)
        model.build_vocab(sentences)
        self.model_sanity(model)

        model.train(sentences, total_examples=model.corpus_count, epochs=model.iter)
        sims = model.most_similar('graph', topn=10)

        self.assertEqual(model.wv.syn0.shape, (12, 10))
        self.assertEqual(len(model.wv.vocab), 12)
        self.assertEqual(model.wv.syn0_vocab.shape[1], 10)
        self.assertEqual(model.wv.syn0_ngrams.shape[1], 10)
        self.model_sanity(model)

        # test querying for "most similar" by vector
        graph_vector = model.wv.syn0norm[model.wv.vocab['graph'].index]
        sims2 = model.most_similar(positive=[graph_vector], topn=11)
        sims2 = [(w, sim) for w, sim in sims2 if w != 'graph']  # ignore 'graph' itself
        self.assertEqual(sims, sims2)

        # build vocab and train in one step; must be the same as above
        model2 = FT_gensim(sentences, size=10, min_count=1, hs=1, negative=0, seed=42, workers=1)
        self.models_equal(model, model2)

        # verify vector retrieval for both in-vocab and oov words
        invocab_vec = model['minors']  # in-vocab word
        self.assertEqual(len(invocab_vec), 10)

        oov_vec = model['minor']  # oov word
        self.assertEqual(len(oov_vec), 10)
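The OOV lookup at the end works because fastText represents a word through its character n-grams, so a vector can be composed even for a word never seen in training. A minimal sketch of the idea, assuming a hypothetical ngram_vectors dict mapping n-gram strings to arrays (gensim's real implementation hashes n-grams into a fixed number of buckets):

import numpy as np

def oov_vector(word, ngram_vectors, min_n=3, max_n=6):
    # average the vectors of the word's character n-grams,
    # with '<' and '>' marking the word boundaries
    word = '<' + word + '>'
    grams = [word[i:i + n] for n in range(min_n, max_n + 1)
             for i in range(len(word) - n + 1)]
    vecs = [ngram_vectors[g] for g in grams if g in ngram_vectors]
    return np.mean(vecs, axis=0)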
Example #2
    def test_training_fromfile(self):
        with temporary_file(get_tmpfile('gensim_fasttext.tst')) as corpus_file:
            utils.save_as_line_sentence(sentences, corpus_file)

            model = FT_gensim(size=10, min_count=1, hs=1, negative=0, seed=42, workers=1)
            model.build_vocab(corpus_file=corpus_file)
            self.model_sanity(model)

            model.train(corpus_file=corpus_file, total_words=model.corpus_total_words, epochs=model.iter)
            sims = model.most_similar('graph', topn=10)

            self.assertEqual(model.wv.syn0.shape, (12, 10))
            self.assertEqual(len(model.wv.vocab), 12)
            self.assertEqual(model.wv.syn0_vocab.shape[1], 10)
            self.assertEqual(model.wv.syn0_ngrams.shape[1], 10)
            self.model_sanity(model)

            # test querying for "most similar" by vector
            graph_vector = model.wv.syn0norm[model.wv.vocab['graph'].index]
            sims2 = model.most_similar(positive=[graph_vector], topn=11)
            sims2 = [(w, sim) for w, sim in sims2 if w != 'graph']  # ignore 'graph' itself
            self.assertEqual(sims, sims2)

            # verify vector retrieval for both in-vocab and oov words
            invocab_vec = model['minors']  # in-vocab word
            self.assertEqual(len(invocab_vec), 10)

            oov_vec = model['minor']  # oov word
            self.assertEqual(len(oov_vec), 10)
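For reference, save_as_line_sentence writes the corpus in the plain-text format that corpus_file training expects: one sentence per line, tokens separated by spaces. A quick way to inspect the result (reusing the corpus_file path from the test; the sample output is illustrative):

with open(corpus_file, encoding='utf-8') as fin:
    for line in list(fin)[:3]:
        print(line.rstrip())  # e.g. 'human interface computer'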
Example #3
#
print(model.similarity("night", "nights"))

###############################################################################
#
# Syntactically similar words generally have high similarity in fastText
# models, since a large number of the component char-ngrams will be the same.
# As a result, fastText generally does better at syntactic tasks than
# Word2Vec. A detailed comparison is provided
# `here <Word2Vec_FastText_Comparison.ipynb>`_.
#
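
###############################################################################
#
# To see why, we can count the character n-grams the two words share. This is
# a rough sketch of fastText's n-gram extraction (gensim's defaults are
# min_n=3 and max_n=6, with '<' and '>' marking word boundaries); the real
# model additionally hashes these n-grams into buckets.
#
def char_ngrams(word, min_n=3, max_n=6):
    word = '<' + word + '>'
    return {word[i:i + n] for n in range(min_n, max_n + 1)
            for i in range(len(word) - n + 1)}

print(sorted(char_ngrams('night') & char_ngrams('nights')))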


###############################################################################
#
# Other similarity operations
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^
#
# The example training corpus is a toy corpus; results are not expected to be
# good and are shown for proof of concept only.
print(model.most_similar("nights"))

###############################################################################
#
print(model.n_similarity(['sushi', 'shop'], ['japanese', 'restaurant']))

###############################################################################
#
print(model.doesnt_match("breakfast cereal dinner lunch".split()))

###############################################################################
#
print(model.most_similar(positive=['baghdad', 'england'], negative=['london']))

###############################################################################
#
Example #4
# Assumes sent (tokenised corpus), min_cnt, architecture, and the
# hyperparameter lists activations and negatives are defined earlier in the
# script; tsne_plot_similar_words is a plotting helper (see the sketch below).
import numpy as np
from gensim.models import FastText
from sklearn.manifold import TSNE

for activation in activations:
    if activation == 0:
        for negat in negatives:
            ftxt = FastText(min_count=min_cnt, size=300, window=3, workers=3,
                            sg=architecture, hs=activation, negative=negat,
                            seed=0)
            ftxt.build_vocab(sentences=sent)
            ftxt.train(sentences=sent, total_examples=len(sent), epochs=30)

            keys = ['sval', 'oko', 'srdce']

            # collect the 30 nearest neighbours of each key word,
            # together with their vectors
            embedding_clusters = []
            word_clusters = []
            for word in keys:
                embeddings = []
                words = []
                for similar_word, _ in ftxt.most_similar(word, topn=30):
                    words.append(similar_word)
                    embeddings.append(ftxt[similar_word])
                embedding_clusters.append(embeddings)
                word_clusters.append(words)

            # flatten to (n * m, k) for t-SNE, then restore the cluster shape
            embedding_clusters = np.array(embedding_clusters)
            n, m, k = embedding_clusters.shape
            tsne_model_en_2d = TSNE(perplexity=15, n_components=2, init='pca',
                                    n_iter=3500, random_state=32)
            embeddings_en_2d = np.array(
                tsne_model_en_2d.fit_transform(embedding_clusters.reshape(n * m, k))
            ).reshape(n, m, 2)

            tsne_plot_similar_words(
                'Sval oko srdce', keys, embeddings_en_2d, word_clusters, 0.7,
                f'Sval oko srdce FastText OPTIMED NZIP min_count = {min_cnt}, '
                f'size = 300, workers = 3, window = 3, sg = {architecture}, '
                f'hs = {activation}, negative = {negat}, seed = 0.png'
            )
    else:
        ftxt = FastText(min_count=min_cnt, size=300, window=3, workers=3,
                        sg=architecture, hs=activation, seed=0)
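
tsne_plot_similar_words is not defined in this fragment. Below is a minimal matplotlib sketch consistent with how it is called above (title, key words, the (n, m, 2) embedding array, per-cluster word labels, point alpha, output filename); everything inside the function body is an assumption:

import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm

def tsne_plot_similar_words(title, labels, embedding_clusters, word_clusters, alpha, filename):
    # one colour per key word; each cluster is that word's nearest neighbours
    plt.figure(figsize=(16, 9))
    colors = cm.rainbow(np.linspace(0, 1, len(labels)))
    for label, embeddings, words, color in zip(labels, embedding_clusters, word_clusters, colors):
        x, y = embeddings[:, 0], embeddings[:, 1]
        plt.scatter(x, y, c=[color], alpha=alpha, label=label)
        for i, word in enumerate(words):
            plt.annotate(word, xy=(x[i], y[i]), size=8)
    plt.legend()
    plt.title(title)
    plt.savefig(filename)
    plt.close()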
Example #5
## Note: Not necessary. Uses sentences.csv from the word2vec model

# %% Train Fasttext

from gensim.models.fasttext import FastText as FT_gensim
from gensim.models.word2vec import LineSentence

sentences = LineSentence(dataPath + "sentences.csv")

model_gensim = FT_gensim(size=300)

# build the vocabulary
model_gensim.build_vocab(sentences)

# train the model
model_gensim.train(
    sentences, total_examples=model_gensim.corpus_count, epochs=model_gensim.epochs
)

print(model_gensim)

# %% save model
model_gensim.save(dataPath + "fasttext.model")

# %% sanity check
print(model_gensim.wv.most_similar("cattle"))
print(model_gensim.wv.most_similar(["super", "market"]))
print(model_gensim.wv.most_similar(["pharma"]))
print(model_gensim.wv.doesnt_match("pharma medical pharmaceutic cattle".split()))
print(model_gensim.wv.doesnt_match("pharma medical pharmaceutic".split()))
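
Since the model was saved above, a later session can reload it without retraining; a brief sketch reusing the same dataPath:

# %% load model
model_loaded = FT_gensim.load(dataPath + "fasttext.model")
print(model_loaded.wv.most_similar("cattle", topn=3))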
Example #6
from pprint import pprint as print
from gensim.models.fasttext import FastText as FT_gensim
from gensim.test.utils import datapath
import time

# Set file names for train and test data
corpus_file = datapath('lee_background.cor')

model = FT_gensim(size=100)

start = time.time()

# build the vocabulary
model.build_vocab(corpus_file=corpus_file)

# train the model
model.train(
    corpus_file=corpus_file, epochs=model.epochs,
    total_examples=model.corpus_count, total_words=model.corpus_total_words
)

print(model)

print("Model trained")

print(model.most_similar("rest", topn=5))
print("Took %.2f s to build vocab and train" % (time.time() - start))
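
Because fastText composes vectors from character n-grams, the trained model can also be queried with words that never occur in the Lee corpus; a quick check (the probe word is only illustrative):

# fastText can return a vector and neighbours for the word
# whether or not it made it into the vocabulary
print('rests' in model.wv.vocab)
print(model.most_similar('rests', topn=3))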