Example #1
    def test_sg_neg_training_fromfile(self):
        with temporary_file(get_tmpfile('gensim_fasttext.tst')) as corpus_file:
            model_gensim = FT_gensim(
                size=50, sg=1, cbow_mean=1, alpha=0.025, window=5, hs=0, negative=5,
                min_count=5, iter=5, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6,
                sorted_vocab=1, workers=1, min_alpha=0.0)

            lee_data = LineSentence(datapath('lee_background.cor'))
            utils.save_as_line_sentence(lee_data, corpus_file)

            model_gensim.build_vocab(corpus_file=corpus_file)
            orig0 = np.copy(model_gensim.wv.vectors[0])
            model_gensim.train(corpus_file=corpus_file,
                               total_words=model_gensim.corpus_total_words,
                               epochs=model_gensim.epochs)
            self.assertFalse((orig0 == model_gensim.wv.vectors[0]).all())  # vector should vary after training

            sims_gensim = model_gensim.wv.most_similar('night', topn=10)
            sims_gensim_words = [word for (word, distance) in sims_gensim]  # get similar words
            expected_sims_words = [
                u'night.',
                u'night,',
                u'eight',
                u'overnight',
                u'overnight.',
                u'month',
                u'land',
                u'firm',
                u'singles',
                u'death']
            overlap_count = len(set(sims_gensim_words).intersection(expected_sims_words))
            self.assertGreaterEqual(overlap_count, 2)
Example #2
    def test_training(self):
        model = FT_gensim(size=10, min_count=1, hs=1, negative=0, seed=42, workers=1)
        model.build_vocab(sentences)
        self.model_sanity(model)

        model.train(sentences, total_examples=model.corpus_count, epochs=model.iter)
        sims = model.most_similar('graph', topn=10)

        self.assertEqual(model.wv.syn0.shape, (12, 10))
        self.assertEqual(len(model.wv.vocab), 12)
        self.assertEqual(model.wv.syn0_vocab.shape[1], 10)
        self.assertEqual(model.wv.syn0_ngrams.shape[1], 10)
        self.model_sanity(model)

        # test querying for "most similar" by vector
        graph_vector = model.wv.syn0norm[model.wv.vocab['graph'].index]
        sims2 = model.most_similar(positive=[graph_vector], topn=11)
        sims2 = [(w, sim) for w, sim in sims2 if w != 'graph']  # ignore 'graph' itself
        self.assertEqual(sims, sims2)

        # build vocab and train in one step; must be the same as above
        model2 = FT_gensim(sentences, size=10, min_count=1, hs=1, negative=0, seed=42, workers=1)
        self.models_equal(model, model2)

        # verify oov-word vector retrieval
        invocab_vec = model['minors']  # invocab word
        self.assertEqual(len(invocab_vec), 10)

        oov_vec = model['minor']  # oov word
        self.assertEqual(len(oov_vec), 10)
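Note: the `sentences` and `new_sentences` fixtures used by the test examples on this page are never shown. Judging by the asserted counts (a 12-word vocabulary growing to 14, 'graph' going from 3 to 4 occurrences, 'artificial' appearing 4 times), they are almost certainly gensim's classic test corpus; a reconstruction under that assumption:

# Assumed fixtures, reconstructed to match the assertions (not part of the scraped code).
sentences = [
    ['human', 'interface', 'computer'],
    ['survey', 'user', 'computer', 'system', 'response', 'time'],
    ['eps', 'user', 'interface', 'system'],
    ['system', 'human', 'system', 'eps'],
    ['user', 'response', 'time'],
    ['trees'],
    ['graph', 'trees'],
    ['graph', 'minors', 'trees'],
    ['graph', 'minors', 'survey'],
]  # 12 unique words; 'graph' occurs 3 times
new_sentences = [
    ['computer', 'artificial', 'intelligence'],
    ['artificial', 'trees'],
    ['human', 'intelligence'],
    ['artificial', 'graph'],
    ['intelligence'],
    ['artificial', 'intelligence', 'system'],
]  # adds 'artificial' and 'intelligence' (vocab grows to 14); 'graph' count becomes 4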
Example #3
    def test_cbow_neg_training(self):

        model_gensim = FT_gensim(
            size=50, sg=0, cbow_mean=1, alpha=0.05, window=5, hs=0, negative=5,
            min_count=5, iter=5, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6,
            sorted_vocab=1, workers=1, min_alpha=0.0)

        lee_data = LineSentence(datapath('lee_background.cor'))
        model_gensim.build_vocab(lee_data)
        orig0 = np.copy(model_gensim.wv.vectors[0])
        model_gensim.train(lee_data, total_examples=model_gensim.corpus_count, epochs=model_gensim.epochs)
        self.assertFalse((orig0 == model_gensim.wv.vectors[0]).all())  # vector should vary after training

        sims_gensim = model_gensim.wv.most_similar('night', topn=10)
        sims_gensim_words = [word for (word, distance) in sims_gensim]  # get similar words
        expected_sims_words = [
            u'night.',
            u'night,',
            u'eight',
            u'fight',
            u'month',
            u'hearings',
            u'Washington',
            u'remains',
            u'overnight',
            u'running']
        overlap_count = len(set(sims_gensim_words).intersection(expected_sims_words))
        self.assertGreaterEqual(overlap_count, 2)
Example #4
def trainVectors(corpus, implementation, dim=300, min_n=3, max_n=6, min_count=1, model='skipgram', epochs=5, threads=12, window=5, lr=0.05, t=1e-4, neg=5):

    if implementation == 'fasttext':

        ### PARSE TRAIN DATA
        train_data = LineSentence(corpus)
        ### INITIALIZE MODEL
        # architecture, learning rate, sampling and thread settings are constructor
        # arguments; FT_gensim.train() does not accept model/threads/lr/t/neg
        model_gensim = FT_gensim(size=dim, min_n=min_n, max_n=max_n, min_count=min_count,
                                 iter=epochs, window=window, sg=1 if model == 'skipgram' else 0,
                                 workers=threads, alpha=lr, sample=t, negative=neg)
        ### BUILD VOCABULARY
        model_gensim.build_vocab(train_data)
        ### TRAIN THE MODEL
        model_gensim.train(train_data, total_examples=model_gensim.corpus_count, epochs=model_gensim.iter)

    elif implementation == 'w2v':

        ### PARSE TRAIN DATA
        train_data = LineSentence(corpus)
        ### INITIALIZE MODEL
        model_gensim = Word2Vec(size=dim, min_count=min_count, iter=epochs, window=window, workers=threads)
        ### BUILD VOCABULARY
        model_gensim.build_vocab(train_data)
        ### TRAIN THE MODEL
        model_gensim.train(train_data, total_examples=model_gensim.corpus_count, epochs=model_gensim.iter)

    return model_gensim
Example #5
    def test_sg_neg_training(self):

        model_gensim = FT_gensim(
            size=50, sg=1, cbow_mean=1, alpha=0.025, window=5, hs=0, negative=5,
            min_count=5, iter=5, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6,
            sorted_vocab=1, workers=1, min_alpha=0.0)

        lee_data = LineSentence(datapath('lee_background.cor'))
        model_gensim.build_vocab(lee_data)
        orig0 = np.copy(model_gensim.wv.vectors[0])
        model_gensim.train(lee_data, total_examples=model_gensim.corpus_count, epochs=model_gensim.epochs)
        self.assertFalse((orig0 == model_gensim.wv.vectors[0]).all())  # vector should vary after training

        sims_gensim = model_gensim.wv.most_similar('night', topn=10)
        sims_gensim_words = [word for (word, distance) in sims_gensim]  # get similar words
        expected_sims_words = [
            u'night.',
            u'night,',
            u'eight',
            u'overnight',
            u'overnight.',
            u'month',
            u'land',
            u'firm',
            u'singles',
            u'death']
        overlap_count = len(set(sims_gensim_words).intersection(expected_sims_words))
        self.assertGreaterEqual(overlap_count, 2)
Example #6
class ModelFastText:
    def __init__(self, path, existModel=False):
        if existModel:
            self.loadModel(path)
        else:
            self.createModel(path)

    def createModel(self, pathTrain, size=300, min_count=50, sg=1, workers=8, progress_per=50000):
        # use the parameters instead of re-hardcoding their default values
        self.model = FT_gensim(size=size, min_count=min_count, sg=sg, workers=workers)
        sentences = datapath(pathTrain)
        self.model.build_vocab(corpus_file=sentences, progress_per=progress_per)

    def loadModel(self, path):
        self.model = FT_gensim.load(path)

    def trainModel(self, pathTrain, epochs=5):
        # gensim's FastText has no loss tracking, so the dead compute_loss argument is dropped;
        # the vocabulary was built from a file, so train from the file too (a path passed as
        # the first positional argument would be iterated character by character)
        sentences = datapath(pathTrain)
        self.model.train(corpus_file=sentences, epochs=epochs,
                         total_examples=self.model.corpus_count,
                         total_words=self.model.corpus_total_words)

    def saveModel(self, nameFile):
        self.model.save(nameFile + ".model")

    def getSimilar(self, word):
        return self.model.wv.most_similar(word, topn=50)
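A usage sketch for the class above; the paths are illustrative. Note that gensim's datapath() resolves bare names inside its bundled test-data directory but passes absolute paths through unchanged, so an absolute path is safest for your own corpus:

# Hypothetical usage; /data/corpus.txt stands in for a real line-per-sentence corpus.
ft = ModelFastText('/data/corpus.txt')      # builds the vocabulary from a fresh corpus
ft.trainModel('/data/corpus.txt', epochs=5)
ft.saveModel('medical_ft')                  # written as medical_ft.model
print(ft.getSimilar('night')[:3])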
Example #7
def build_fast_text_model(fasttext_entity_path):
    # build fastText

    fasttext_params = {
        "hs": 1,
        "window": 10,
        "min_count": 1,
        "workers": 7,
        "min_n": 1,
        "max_n": 10,
    }

    print("building corpus")

    entity_corpus = [entity for entity in entity_generator(entity_collection)]
    fasttext_entity = FastText(**fasttext_params)

    print("count corpus")
    fasttext_entity.build_vocab(sentences=entity_corpus)
    total_examples = fasttext_entity.corpus_count

    print("train fasttext")
    fasttext_entity.train(sentences=entity_corpus,
                          total_examples=total_examples,
                          epochs=5)

    print("saving fasttext")

    fasttext_entity.save(fasttext_entity_path)

    return fasttext_entity
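Neither entity_generator nor the module-level entity_collection is defined in this snippet. A minimal sketch of what they might look like, assuming each entity contributes one token list (its name plus aliases) as a training "sentence"; both names here are hypothetical reconstructions:

# Hypothetical stand-ins for the undefined names above; the real
# implementations presumably pull entities from a database or file.
entity_collection = [
    {'name': 'Douglas Adams', 'aliases': ['Douglas Noel Adams']},
    {'name': 'HHGTTG', 'aliases': ["The Hitchhiker's Guide to the Galaxy"]},
]

def entity_generator(collection):
    """Yield one token list per entity: its name plus all aliases, lowercased."""
    for entity in collection:
        tokens = entity['name'].split()
        for alias in entity['aliases']:
            tokens.extend(alias.split())
        yield [token.lower() for token in tokens]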
Example #9
    def test_training_fromfile(self):
        with temporary_file(get_tmpfile('gensim_fasttext.tst')) as corpus_file:
            utils.save_as_line_sentence(sentences, corpus_file)

            model = FT_gensim(size=10, min_count=1, hs=1, negative=0, seed=42, workers=1)
            model.build_vocab(corpus_file=corpus_file)
            self.model_sanity(model)

            model.train(corpus_file=corpus_file, total_words=model.corpus_total_words, epochs=model.iter)
            sims = model.most_similar('graph', topn=10)

            self.assertEqual(model.wv.syn0.shape, (12, 10))
            self.assertEqual(len(model.wv.vocab), 12)
            self.assertEqual(model.wv.syn0_vocab.shape[1], 10)
            self.assertEqual(model.wv.syn0_ngrams.shape[1], 10)
            self.model_sanity(model)

            # test querying for "most similar" by vector
            graph_vector = model.wv.syn0norm[model.wv.vocab['graph'].index]
            sims2 = model.most_similar(positive=[graph_vector], topn=11)
            sims2 = [(w, sim) for w, sim in sims2 if w != 'graph']  # ignore 'graph' itself
            self.assertEqual(sims, sims2)

            # verify oov-word vector retrieval
            invocab_vec = model['minors']  # invocab word
            self.assertEqual(len(invocab_vec), 10)

            oov_vec = model['minor']  # oov word
            self.assertEqual(len(oov_vec), 10)
Example #10
 def test_get_vocab_word_vecs(self):
     model = FT_gensim(size=10, min_count=1, seed=42)
     model.build_vocab(sentences)
     original_syn0_vocab = np.copy(model.wv.syn0_vocab)
     model.trainables.get_vocab_word_vecs(model.wv)
     self.assertTrue(
         np.all(np.equal(model.wv.syn0_vocab, original_syn0_vocab)))
Example #11
 def test_online_learning(self):
     model_hs = FT_gensim(sentences, size=10, min_count=1, seed=42, hs=1, negative=0)
     self.assertEqual(len(model_hs.wv.vocab), 12)  # assertEqual, not assertTrue: assertTrue's 2nd argument is only a message
     self.assertEqual(model_hs.wv.vocab['graph'].count, 3)
     model_hs.build_vocab(new_sentences, update=True)  # update vocab
     self.assertEqual(len(model_hs.wv.vocab), 14)
     self.assertEqual(model_hs.wv.vocab['graph'].count, 4)
     self.assertEqual(model_hs.wv.vocab['artificial'].count, 4)
Example #13
def train_weights(iSentencesList, **kwargs):
    # **kwargs is always a dict (possibly empty), never None; test for emptiness instead
    if kwargs:
        model = FastText(**kwargs)
    else:
        model = FastText()
    model.build_vocab(iSentencesList)
    model.train(iSentencesList, total_examples=model.corpus_count, epochs=model.epochs)
    print("Custom model trained.")
    return model
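A usage sketch for the helper above; the corpus and hyperparameter values are illustrative, and any FastText constructor argument can be forwarded through **kwargs:

# Hypothetical call with a toy corpus (min_count=1 keeps every word).
toy_corpus = [['fast', 'subword', 'vectors'], ['fast', 'training', 'loop']]
model = train_weights(toy_corpus, size=24, window=2, min_count=1)
print(model.wv['fast'][:4])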
Example #14
 def test_online_learning_after_save(self):
     tmpf = get_tmpfile('gensim_fasttext.tst')
     model_neg = FT_gensim(sentences, size=10, min_count=0, seed=42, hs=0, negative=5)
     model_neg.save(tmpf)
     model_neg = FT_gensim.load(tmpf)
     self.assertEqual(len(model_neg.wv.vocab), 12)  # assertEqual, not assertTrue: assertTrue's 2nd argument is only a message
     model_neg.build_vocab(new_sentences, update=True)  # update vocab
     model_neg.train(new_sentences, total_examples=model_neg.corpus_count, epochs=model_neg.iter)
     self.assertEqual(len(model_neg.wv.vocab), 14)
Example #16
def trainFastTextModel(vectorSize, trainingModel):
    # the architecture is chosen in the constructor (sg=1 for skip-gram, 0 for CBOW);
    # FT_gensim.train() has no `model` parameter
    model = FT_gensim(size=vectorSize, sg=1 if trainingModel == 'skipgram' else 0)
    model.build_vocab(corpus_file='Data/starGEO.txt')
    model.train(corpus_file='Data/starGEO.txt',
                epochs=model.epochs,
                total_examples=model.corpus_count,
                total_words=model.corpus_total_words)
    return model
Example #17
def fasttext_train_model():
    # read corpus: one whitespace-tokenized sentence per line
    # (gensim expects lists of tokens; a plain string would be iterated character by character)
    with open(corpus_dir) as fp:
        corpus = [line.split() for line in fp]  # list of token lists

    model = FastText(size=embed_size, window=3, min_count=1)  # instantiate
    model.build_vocab(sentences=corpus)
    model.train(sentences=corpus, total_examples=len(corpus),
                epochs=epoch)  # train
    model.save(model_dir)  # save model
Example #18
 def test_online_learning_after_save(self):
     model_neg = FT_gensim(sentences, size=10, min_count=0, seed=42, hs=0, negative=5)
     model_neg.save(testfile())
     model_neg = FT_gensim.load(testfile())
     self.assertEqual(len(model_neg.wv.vocab), 12)  # assertEqual, not assertTrue: assertTrue's 2nd argument is only a message
     self.assertEqual(len(model_neg.wv.ngrams), 202)
     model_neg.build_vocab(new_sentences, update=True)  # update vocab
     model_neg.train(new_sentences, total_examples=model_neg.corpus_count, epochs=model_neg.iter)
     self.assertEqual(len(model_neg.wv.vocab), 14)
     self.assertEqual(len(model_neg.wv.ngrams), 271)
Example #19
def fasttext_embedding(source, method, emb_dim):
    model = FT_gensim(size=emb_dim, window=10, sg=1, min_count=5, workers=multiprocessing.cpu_count(), negative=10)
    # build the vocabulary
    model.build_vocab(sentences=Sentences(source))
    # train the model
    model.train(
        sentences=Sentences(source), epochs=15, 
        total_examples=model.corpus_count, total_words=model.corpus_total_words
    )
    return model
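The Sentences helper is not shown. Since it is instantiated twice (once for build_vocab, once for train) and gensim consumes the corpus once per epoch, it must be a restartable iterable rather than a one-shot generator. A minimal sketch under that assumption:

class Sentences:
    """Hypothetical restartable iterator over a whitespace-tokenized text file.

    gensim re-iterates the corpus on every epoch, so __iter__ reopens the file
    each time instead of exhausting a single generator.
    """
    def __init__(self, path):
        self.path = path

    def __iter__(self):
        with open(self.path, encoding='utf-8') as handle:
            for line in handle:
                yield line.split()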
Example #20
def create_model_from_corpus(corpus_file: str) -> FastText:
    '''Reads the corpus file and trains a FastText model on it.'''
    # character n-gram bounds are model settings, so they go to the constructor;
    # FastText.train() does not accept min_n/max_n
    model = FastText(vector_size=VECTOR_SIZE, min_n=MIN_N, max_n=MAX_N)
    model.build_vocab(corpus_file=corpus_file)
    model.train(corpus_file=corpus_file,
                epochs=EPOCHS,
                total_examples=model.corpus_count,
                total_words=model.corpus_total_words)
    return model
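The upper-case names are assumed to be module-level configuration defined elsewhere in the original project, e.g.:

# Assumed configuration constants (values are placeholders, not from the source).
VECTOR_SIZE = 300  # embedding dimensionality
EPOCHS = 5         # training passes over the corpus
MIN_N = 3          # shortest character n-gram
MAX_N = 6          # longest character n-gram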
Example #21
 def test_estimate_memory(self):
     model = FT_gensim(sg=1, hs=1, size=10, negative=5, min_count=3)
     model.build_vocab(sentences)
     report = model.estimate_memory()
     self.assertEqual(report['vocab'], 2800)
     self.assertEqual(report['syn0_vocab'], 160)
     self.assertEqual(report['syn1'], 160)
     self.assertEqual(report['syn1neg'], 160)
     self.assertEqual(report['syn0_ngrams'], 2240)
     self.assertEqual(report['buckets_word'], 640)
     self.assertEqual(report['total'], 6160)
Example #22
def train_model(data_path, size_embeddings, epochs=64):
    corpus_file = datapath(data_path)
    model_gensim = FT_gensim(size=size_embeddings, workers=4)
    # build the vocabulary
    model_gensim.build_vocab(corpus_file=corpus_file)
    # train the model
    model_gensim.train(corpus_file=corpus_file,
                       epochs=epochs,
                       total_examples=model_gensim.corpus_count,
                       total_words=model_gensim.corpus_total_words)
    return model_gensim
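A usage sketch; datapath() resolves bare names inside gensim's bundled test-data directory, so the lee_background.cor corpus used in several test examples above works out of the box:

model = train_model('lee_background.cor', size_embeddings=50, epochs=5)
print(model.wv.most_similar('night', topn=5))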
Example #24
def create_FastText_model(skip_gram, tokenized_sentences, model_path):
    try:
        model = FastText.load(model_path)
    except Exception:  # no saved model yet, or the file is unreadable
        # the vector size belongs in the constructor; train() has no vector_size parameter
        model = FastText(size=5, min_count=1, window=5, sg=skip_gram)
        model.build_vocab(sentences=tokenized_sentences)
        model.train(sentences=tokenized_sentences, total_examples=len(tokenized_sentences), epochs=10)

        model.save(model_path)

    return model
Example #25
 def test_online_learning(self):
     model_hs = FT_gensim(sentences, size=10, min_count=1, seed=42, hs=1, negative=0)
     self.assertEqual(len(model_hs.wv.vocab), 12)  # assertEqual, not assertTrue: assertTrue's 2nd argument is only a message
     self.assertEqual(len(model_hs.wv.ngrams), 202)
     self.assertEqual(model_hs.wv.vocab['graph'].count, 3)
     self.assertFalse('tif' in model_hs.wv.ngrams)
     model_hs.build_vocab(new_sentences, update=True)  # update vocab
     self.assertEqual(len(model_hs.wv.vocab), 14)
     self.assertEqual(len(model_hs.wv.ngrams), 271)
     self.assertEqual(model_hs.wv.vocab['graph'].count, 4)
     self.assertEqual(model_hs.wv.vocab['artificial'].count, 4)
     self.assertTrue('tif' in model_hs.wv.ngrams)  # ngram added because of the word `artificial`
Example #26
    def test_sg_hs_against_wrapper(self):
        if self.ft_path is None:
            logger.info("FT_HOME env variable not set, skipping test")
            return

        tmpf = get_tmpfile('gensim_fasttext.tst')
        model_wrapper = FT_wrapper.train(
            ft_path=self.ft_path,
            corpus_file=datapath('lee_background.cor'),
            output_file=tmpf,
            model='skipgram',
            size=50,
            alpha=0.025,
            window=5,
            min_count=5,
            word_ngrams=1,
            loss='hs',
            sample=1e-3,
            negative=0,
            iter=5,
            min_n=3,
            max_n=6,
            sorted_vocab=1,
            threads=12)

        model_gensim = FT_gensim(size=50,
                                 sg=1,
                                 cbow_mean=1,
                                 alpha=0.025,
                                 window=5,
                                 hs=1,
                                 negative=0,
                                 min_count=5,
                                 iter=5,
                                 batch_words=1000,
                                 word_ngrams=1,
                                 sample=1e-3,
                                 min_n=3,
                                 max_n=6,
                                 sorted_vocab=1,
                                 workers=1,
                                 min_alpha=0.0)

        lee_data = LineSentence(datapath('lee_background.cor'))
        model_gensim.build_vocab(lee_data)
        orig0 = np.copy(model_gensim.wv.syn0[0])
        model_gensim.train(lee_data,
                           total_examples=model_gensim.corpus_count,
                           epochs=model_gensim.iter)
        self.assertFalse((orig0 == model_gensim.wv.syn0[0]
                          ).all())  # vector should vary after training
        self.compare_with_wrapper(model_gensim, model_wrapper)
Example #27
 def _fastText(self, medical_texts):
     print('fastText')
     # passing `sentences` to the constructor already builds the vocabulary and trains
     # once; the update + train calls below then continue training on the same data
     model = FastText(sentences=medical_texts,
                      size=150,
                      min_count=2,
                      window=5)
     model.build_vocab(sentences=medical_texts, update=True)
     model.train(sentences=medical_texts,
                 total_examples=len(medical_texts),
                 epochs=7)
     model.wv.save(
         os.path.join(abspath,
                      "../vectors/fastText/medical.fasttext.model"))
Example #28
def init_model(path):

    # skip-gram is selected with sg=1 in the constructor; FT.train() has no `model` parameter
    model = FT(size=150, window=5, min_count=3, workers=4, sg=1)
    model.build_vocab(sentences=iter_doc(path))
    model.train(corpus_file=path,
                epochs=model.epochs,
                total_examples=model.corpus_count,
                total_words=model.corpus_total_words)

    show_vocab_size(model)

    return model
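iter_doc and show_vocab_size are not defined here. Plausible minimal versions, assuming the corpus file holds one whitespace-tokenized sentence per line (both are hypothetical reconstructions):

def iter_doc(path):
    """Yield one token list per line (assumed corpus layout)."""
    with open(path, encoding='utf-8') as handle:
        for line in handle:
            yield line.split()

def show_vocab_size(model):
    # gensim 3.x keyed-vector vocabulary, matching the API used across these examples
    print('vocabulary size:', len(model.wv.vocab))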
Example #29
def fastText_embedding(daten):
    """
    get embedding for training, dev, test daten
    :param daten:
    :return:
    """
    ll_daten = fileread(daten)
    model = FastText(size=4, window=3, min_count=1)
    model.build_vocab(sentences=ll_daten)
    model.train(sentences=ll_daten, total_examples=len(ll_daten),
                epochs=10)  # train
    # note: word2vec format stores only full-word vectors, not the subword n-grams
    model.wv.save_word2vec_format('daten_embedding.model')
    return model
Example #30
    def test_online_learning_fromfile(self):
        with temporary_file(get_tmpfile('gensim_fasttext1.tst')) as corpus_file, \
                temporary_file(get_tmpfile('gensim_fasttext2.tst')) as new_corpus_file:
            utils.save_as_line_sentence(sentences, corpus_file)
            utils.save_as_line_sentence(new_sentences, new_corpus_file)

            model_hs = FT_gensim(corpus_file=corpus_file, size=10, min_count=1, seed=42, hs=1, negative=0)
            self.assertEqual(len(model_hs.wv.vocab), 12)  # assertEqual, not assertTrue: assertTrue's 2nd argument is only a message
            self.assertEqual(model_hs.wv.vocab['graph'].count, 3)
            model_hs.build_vocab(corpus_file=new_corpus_file, update=True)  # update vocab
            self.assertEqual(len(model_hs.wv.vocab), 14)
            self.assertEqual(model_hs.wv.vocab['graph'].count, 4)
            self.assertEqual(model_hs.wv.vocab['artificial'].count, 4)
Example #31
def create_fastText(file):
    fast_model = FastText(size=300, window=6, min_count=10, workers=8, negative=5, iter=10)
    fast_model.build_vocab(corpus_file=file)
    fast_model.train(corpus_file=file, epochs=fast_model.epochs, callbacks=[callback_log()],
                     total_examples=fast_model.corpus_count, total_words=fast_model.corpus_total_words)

    # Verify vocabulary size and first ten words
    for index, word in enumerate(fast_model.wv.index2word):
        if index == 10:
            break
        print(f"word #{index}/{len(fast_model.wv.index2word)} is {word}")

    return fast_model
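callback_log is not defined in the snippet. gensim supports training callbacks via CallbackAny2Vec, so it was presumably something along these lines (a sketch, not the original implementation):

from gensim.models.callbacks import CallbackAny2Vec

class callback_log(CallbackAny2Vec):
    """Presumed logging callback: prints a line at the end of each epoch."""
    def __init__(self):
        self.epoch = 0

    def on_epoch_end(self, model):
        self.epoch += 1
        print(f'finished epoch {self.epoch}')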
Example #32
def main():
    print('Instantiating the model')
    model = FT_gensim(size=100, window=5, min_count=5)  # instantiate the model
    print('Building the vocabulary')
    model.build_vocab(sentences=MyIter())
    total_examples = model.corpus_count
    print('Training the model')
    model.train(sentences=MyIter(), total_examples=total_examples,
                epochs=5)  # train the model

    ## Save the model (can be loaded using gensim)
    print('Saving the model to specified filepath')
    save_file = sys.argv[2]
    model.save(save_file)
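MyIter is defined elsewhere in the script. Given that the save path arrives via sys.argv[2] and the iterator is created twice (streams are consumed once per epoch), a plausible restartable implementation, assuming the corpus path is sys.argv[1]:

import sys
from gensim.utils import simple_preprocess

class MyIter:
    """Hypothetical restartable corpus iterator reading the file named in sys.argv[1]."""
    def __iter__(self):
        with open(sys.argv[1], encoding='utf-8') as handle:
            for line in handle:
                yield simple_preprocess(line)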
Example #33
def generate_model(lang):

    model_gensim = FT_gensim(size=300)

    # build the vocabulary
    model_gensim.build_vocab(corpus_file='embedding/corpus_' + lang)

    # train the model
    model_gensim.train(corpus_file='embedding/corpus_' + lang,
                       epochs=model_gensim.epochs,
                       total_examples=model_gensim.corpus_count,
                       total_words=model_gensim.corpus_total_words)

    # .save() writes gensim's native format; the '.vec' suffix does not make it a text file
    model_gensim.save('embedding/fasttext_' + lang + '.vec')
Example #34
    def test_online_learning_after_save_fromfile(self):
        with temporary_file(get_tmpfile('gensim_fasttext1.tst')) as corpus_file, \
                temporary_file(get_tmpfile('gensim_fasttext2.tst')) as new_corpus_file:
            utils.save_as_line_sentence(sentences, corpus_file)
            utils.save_as_line_sentence(new_sentences, new_corpus_file)

            tmpf = get_tmpfile('gensim_fasttext.tst')
            model_neg = FT_gensim(corpus_file=corpus_file, size=10, min_count=0, seed=42, hs=0, negative=5)
            model_neg.save(tmpf)
            model_neg = FT_gensim.load(tmpf)
            self.assertEqual(len(model_neg.wv.vocab), 12)  # assertEqual, not assertTrue: assertTrue's 2nd argument is only a message
            model_neg.build_vocab(corpus_file=new_corpus_file, update=True)  # update vocab
            model_neg.train(corpus_file=new_corpus_file, total_words=model_neg.corpus_total_words,
                            epochs=model_neg.iter)
            self.assertEqual(len(model_neg.wv.vocab), 14)
Example #35
def get_ft_model(documents, settings):
    model = FT_gensim(min_count=settings['min_count'],
                      size=settings['size'],
                      window=settings['window'],
                      workers=40,
                      sg=settings['sg'],
                      negative=settings['negative'],
                      iter=settings['iter'])

    model.build_vocab(documents)

    model.train(documents,
                total_examples=model.corpus_count,
                epochs=model.iter)
    return model
Example #36
def createFastTextModel(data, isSG, vectorSize, maxNgram, modelFilePath):
    # train model
    model_gensim = FT_gensim(sg=isSG,
                             size=vectorSize,
                             min_count=1,
                             min_n=1,
                             max_n=maxNgram)
    # build the vocabulary
    model_gensim.build_vocab(data)
    # train the model
    model_gensim.train(data,
                       total_examples=model_gensim.corpus_count,
                       epochs=model_gensim.iter)
    # save
    model_gensim.save(modelFilePath)
Example #37
def BuildFastText(reviews, label):
    # input: reviews already tokenized into lists of tokens
    model = FastText(size=200, window=3, min_count=4)
    model.build_vocab(sentences=reviews)
    model.train(sentences=reviews, total_examples=len(reviews), epochs=50)
    model.save("Model/fasttext.bin")
    print("Save model done!")

    dataf = list()
    for a, review in enumerate(reviews):  # track the index so label[a] matches the review
        len_sen = len(review)
        try:
            # average the token vectors into one vector per review
            vectors = model.wv[review]
            sumvec = 0
            for i in range(0, len_sen):
                sumvec = sumvec + vectors[i]
            sumvec = sumvec / len_sen
            dataf.append(sumvec)
        except Exception:
            # empty review or failed lookup: fall back to a sentiment-prototype vector
            if label[a] == 0:
                sumvec = model.wv["positive"]
            if label[a] == 1:
                sumvec = model.wv["negative"]
            dataf.append(sumvec)

    dataf = np.array(dataf)
    return dataf
Example #38
    def test_sg_hs_against_wrapper(self):
        if self.ft_path is None:
            logger.info("FT_HOME env variable not set, skipping test")
            return

        model_wrapper = FT_wrapper.train(ft_path=self.ft_path, corpus_file=datapath('lee_background.cor'),
            output_file=testfile(), model='skipgram', size=50, alpha=0.025, window=5, min_count=5, word_ngrams=1,
            loss='hs', sample=1e-3, negative=0, iter=5, min_n=3, max_n=6, sorted_vocab=1, threads=12)

        model_gensim = FT_gensim(size=50, sg=1, cbow_mean=1, alpha=0.025, window=5, hs=1, negative=0,
            min_count=5, iter=5, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6,
            sorted_vocab=1, workers=1, min_alpha=0.0)

        lee_data = LineSentence(datapath('lee_background.cor'))
        model_gensim.build_vocab(lee_data)
        orig0 = np.copy(model_gensim.wv.syn0[0])
        model_gensim.train(lee_data, total_examples=model_gensim.corpus_count, epochs=model_gensim.iter)
        self.assertFalse((orig0 == model_gensim.wv.syn0[0]).all())  # vector should vary after training
        self.compare_with_wrapper(model_gensim, model_wrapper)
Example #40
 def test_bucket_ngrams(self):
     model = FT_gensim(size=10, min_count=1, bucket=20)
     model.build_vocab(sentences)
     self.assertEqual(model.wv.syn0_ngrams.shape, (20, 10))
     model.build_vocab(new_sentences, update=True)
     self.assertEqual(model.wv.syn0_ngrams.shape, (20, 10))
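For context on the assertions above: `bucket` caps the number of hash buckets that all character n-grams share, which is why syn0_ngrams keeps its (20, 10) shape even after the vocabulary grows. A sketch of the practical consequence, assuming gensim 3.x (matching the API used throughout these examples) and the `sentences` fixture reconstructed after Example #2:

model = FT_gensim(size=10, min_count=1, bucket=20)
model.build_vocab(sentences)
model.train(sentences, total_examples=model.corpus_count, epochs=model.epochs)

# An out-of-vocabulary word still gets a vector: it is assembled from the
# hashed n-gram buckets, of which there are only bucket=20 here.
print(model.wv['graphical'].shape)  # (10,)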