Example #1
    def testLoadOldModel(self):
        """Test loading fasttext models from previous version"""

        model_file = 'fasttext_old'
        model = FT_gensim.load(datapath(model_file))
        self.assertTrue(model.wv.vectors.shape == (12, 100))
        self.assertTrue(len(model.wv.vocab) == 12)
        self.assertTrue(len(model.wv.index2word) == 12)
        self.assertTrue(model.syn1neg.shape == (len(model.wv.vocab), model.vector_size))
        self.assertTrue(model.trainables.vectors_lockf.shape == (12, ))
        self.assertTrue(model.vocabulary.cum_table.shape == (12, ))

        self.assertEqual(len(model.wv.hash2index), 202)
        self.assertTrue(model.wv.vectors_vocab.shape == (12, 100))
        self.assertTrue(model.wv.vectors_ngrams.shape == (202, 100))

        # Model stored in multiple files
        model_file = 'fasttext_old_sep'
        model = FT_gensim.load(datapath(model_file))
        self.assertTrue(model.wv.vectors.shape == (12, 100))
        self.assertTrue(len(model.wv.vocab) == 12)
        self.assertTrue(len(model.wv.index2word) == 12)
        self.assertTrue(model.syn1neg.shape == (len(model.wv.vocab), model.vector_size))
        self.assertTrue(model.trainables.vectors_lockf.shape == (12, ))
        self.assertTrue(model.vocabulary.cum_table.shape == (12, ))

        self.assertEqual(len(model.wv.hash2index), 202)
        self.assertTrue(model.wv.vectors_vocab.shape == (12, 100))
        self.assertTrue(model.wv.vectors_ngrams.shape == (202, 100))
Example #2
    def test_sg_neg_training(self):

        model_gensim = FT_gensim(
            size=50, sg=1, cbow_mean=1, alpha=0.025, window=5, hs=0, negative=5,
            min_count=5, iter=5, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6,
            sorted_vocab=1, workers=1, min_alpha=0.0)

        lee_data = LineSentence(datapath('lee_background.cor'))
        model_gensim.build_vocab(lee_data)
        orig0 = np.copy(model_gensim.wv.vectors[0])
        model_gensim.train(lee_data, total_examples=model_gensim.corpus_count, epochs=model_gensim.epochs)
        self.assertFalse((orig0 == model_gensim.wv.vectors[0]).all())  # vector should vary after training

        sims_gensim = model_gensim.wv.most_similar('night', topn=10)
        sims_gensim_words = [word for (word, distance) in sims_gensim]  # get similar words
        expected_sims_words = [
            u'night.',
            u'night,',
            u'eight',
            u'overnight',
            u'overnight.',
            u'month',
            u'land',
            u'firm',
            u'singles',
            u'death']
        overlap_count = len(set(sims_gensim_words).intersection(expected_sims_words))
        self.assertGreaterEqual(overlap_count, 2)
Example #3
 def test_online_learning(self):
     model_hs = FT_gensim(sentences, size=10, min_count=1, seed=42, hs=1, negative=0)
     self.assertEqual(len(model_hs.wv.vocab), 12)
     self.assertEqual(model_hs.wv.vocab['graph'].count, 3)
     model_hs.build_vocab(new_sentences, update=True)  # update vocab
     self.assertEqual(len(model_hs.wv.vocab), 14)
     self.assertEqual(model_hs.wv.vocab['graph'].count, 4)
     self.assertEqual(model_hs.wv.vocab['artificial'].count, 4)
Example #4
 def test_estimate_memory(self):
     model = FT_gensim(sg=1, hs=1, size=10, negative=5, min_count=3)
     model.build_vocab(sentences)
     report = model.estimate_memory()
     self.assertEqual(report['vocab'], 2800)
     self.assertEqual(report['syn0_vocab'], 160)
     self.assertEqual(report['syn1'], 160)
     self.assertEqual(report['syn1neg'], 160)
     self.assertEqual(report['syn0_ngrams'], 2240)
     self.assertEqual(report['buckets_word'], 640)
     self.assertEqual(report['total'], 6160)
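The byte counts above are specific to the tiny test corpus. A minimal hedged sketch of calling estimate_memory() on your own data to budget RAM before training; my_sentences is a placeholder for any iterable of tokenized sentences:

# Hypothetical sketch, not part of the test suite above.
from gensim.models.fasttext import FastText as FT_gensim

ft = FT_gensim(size=100, min_count=5)
ft.build_vocab(my_sentences)  # my_sentences: placeholder corpus
for component, nbytes in sorted(ft.estimate_memory().items()):
    print('%-15s %10d bytes' % (component, nbytes))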
Example #5
 def test_persistence(self):
     model = FT_gensim(sentences, min_count=1)
     model.save(testfile())
     self.models_equal(model, FT_gensim.load(testfile()))
     #  test persistence of the KeyedVectors of a model
     wv = model.wv
     wv.save(testfile())
     loaded_wv = FastTextKeyedVectors.load(testfile())
     self.assertTrue(np.allclose(wv.syn0_ngrams, loaded_wv.syn0_ngrams))
     self.assertEqual(len(wv.vocab), len(loaded_wv.vocab))
     self.assertEqual(len(wv.ngrams), len(loaded_wv.ngrams))
Example #6
 def test_online_learning(self):
     model_hs = FT_gensim(sentences, size=10, min_count=1, seed=42, hs=1, negative=0)
     self.assertEqual(len(model_hs.wv.vocab), 12)
     self.assertEqual(len(model_hs.wv.ngrams), 202)
     self.assertEqual(model_hs.wv.vocab['graph'].count, 3)
     self.assertFalse('tif' in model_hs.wv.ngrams)
     model_hs.build_vocab(new_sentences, update=True)  # update vocab
     self.assertEqual(len(model_hs.wv.vocab), 14)
     self.assertEqual(len(model_hs.wv.ngrams), 271)
     self.assertEqual(model_hs.wv.vocab['graph'].count, 4)
     self.assertEqual(model_hs.wv.vocab['artificial'].count, 4)
     self.assertTrue('tif' in model_hs.wv.ngrams)  # ngram added because of the word `artificial`
Example #7
 def test_load_model_with_non_ascii_vocab(self):
     model = FT_gensim.load_fasttext_format(datapath('non_ascii_fasttext'))
     self.assertTrue(u'který' in model)
     try:
         model[u'který']
     except UnicodeDecodeError:
         self.fail('Unable to access vector for utf8 encoded non-ascii word')
Example #8
 def test_load_model_non_utf8_encoding(self):
     model = FT_gensim.load_fasttext_format(datapath('cp852_fasttext'), encoding='cp852')
     self.assertTrue(u'který' in model)
     try:
         model[u'který']
     except KeyError:
         self.fail('Unable to access vector for cp-852 word')
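A brief usage sketch outside the test harness, assuming a fastText model trained on cp852-encoded text; the file name below is a placeholder, not one of gensim's bundled test models:

# Hypothetical sketch: pass the file's encoding so the vocabulary is decoded correctly.
from gensim.models.fasttext import FastText as FT_gensim

model = FT_gensim.load_fasttext_format('my_cp852_model', encoding='cp852')
if u'který' in model:
    print(model[u'který'][:5])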
Example #9
 def test_online_learning_after_save(self):
     tmpf = get_tmpfile('gensim_fasttext.tst')
     model_neg = FT_gensim(sentences, size=10, min_count=0, seed=42, hs=0, negative=5)
     model_neg.save(tmpf)
     model_neg = FT_gensim.load(tmpf)
     self.assertEqual(len(model_neg.wv.vocab), 12)
     model_neg.build_vocab(new_sentences, update=True)  # update vocab
     model_neg.train(new_sentences, total_examples=model_neg.corpus_count, epochs=model_neg.iter)
     self.assertEqual(len(model_neg.wv.vocab), 14)
Example #10
 def test_online_learning_after_save(self):
     model_neg = FT_gensim(sentences, size=10, min_count=0, seed=42, hs=0, negative=5)
     model_neg.save(testfile())
     model_neg = FT_gensim.load(testfile())
     self.assertEqual(len(model_neg.wv.vocab), 12)
     self.assertEqual(len(model_neg.wv.ngrams), 202)
     model_neg.build_vocab(new_sentences, update=True)  # update vocab
     model_neg.train(new_sentences, total_examples=model_neg.corpus_count, epochs=model_neg.iter)
     self.assertEqual(len(model_neg.wv.vocab), 14)
     self.assertEqual(len(model_neg.wv.ngrams), 271)
Example #11
    def test_training(self):
        model = FT_gensim(size=10, min_count=1, hs=1, negative=0, seed=42, workers=1)
        model.build_vocab(sentences)
        self.model_sanity(model)

        model.train(sentences, total_examples=model.corpus_count, epochs=model.iter)
        sims = model.most_similar('graph', topn=10)

        self.assertEqual(model.wv.syn0.shape, (12, 10))
        self.assertEqual(len(model.wv.vocab), 12)
        self.assertEqual(model.wv.syn0_vocab.shape[1], 10)
        self.assertEqual(model.wv.syn0_ngrams.shape[1], 10)
        self.model_sanity(model)

        # test querying for "most similar" by vector
        graph_vector = model.wv.syn0norm[model.wv.vocab['graph'].index]
        sims2 = model.most_similar(positive=[graph_vector], topn=11)
        sims2 = [(w, sim) for w, sim in sims2 if w != 'graph']  # ignore 'graph' itself
        self.assertEqual(sims, sims2)

        # build vocab and train in one step; must be the same as above
        model2 = FT_gensim(sentences, size=10, min_count=1, hs=1, negative=0, seed=42, workers=1)
        self.models_equal(model, model2)

        # verify vector retrieval for in-vocab and oov words
        invocab_vec = model['minors']  # invocab word
        self.assertEqual(len(invocab_vec), 10)

        oov_vec = model['minor']  # oov word
        self.assertEqual(len(oov_vec), 10)
Example #12
    def test_sg_hs_against_wrapper(self):
        if self.ft_path is None:
            logger.info("FT_HOME env variable not set, skipping test")
            return

        model_wrapper = FT_wrapper.train(ft_path=self.ft_path, corpus_file=datapath('lee_background.cor'),
            output_file=testfile(), model='skipgram', size=50, alpha=0.025, window=5, min_count=5, word_ngrams=1,
            loss='hs', sample=1e-3, negative=0, iter=5, min_n=3, max_n=6, sorted_vocab=1, threads=12)

        model_gensim = FT_gensim(size=50, sg=1, cbow_mean=1, alpha=0.025, window=5, hs=1, negative=0,
            min_count=5, iter=5, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6,
            sorted_vocab=1, workers=1, min_alpha=0.0)

        lee_data = LineSentence(datapath('lee_background.cor'))
        model_gensim.build_vocab(lee_data)
        orig0 = np.copy(model_gensim.wv.syn0[0])
        model_gensim.train(lee_data, total_examples=model_gensim.corpus_count, epochs=model_gensim.iter)
        self.assertFalse((orig0 == model_gensim.wv.syn0[0]).all())  # vector should vary after training
        self.compare_with_wrapper(model_gensim, model_wrapper)
Example #13
    def test_load_fasttext_new_format(self):
        try:
            new_model = FT_gensim.load_fasttext_format(self.test_new_model_file)
        except Exception as exc:
            self.fail('Unable to load FastText model from file %s: %s' % (self.test_new_model_file, exc))
        vocab_size, model_size = 1763, 10
        self.assertEqual(new_model.wv.syn0.shape, (vocab_size, model_size))
        self.assertEqual(len(new_model.wv.vocab), vocab_size)
        self.assertEqual(new_model.wv.syn0_ngrams.shape, (new_model.num_ngram_vectors, model_size))

        expected_vec = [
            -0.025627,
            -0.11448,
            0.18116,
            -0.96779,
            0.2532,
            -0.93224,
            0.3929,
            0.12679,
            -0.19685,
            -0.13179
        ]  # obtained using ./fasttext print-word-vectors lee_fasttext_new.bin
        self.assertTrue(np.allclose(new_model["hundred"], expected_vec, atol=1e-4))

        # vector for oov words are slightly different from original FastText due to discarding unused ngrams
        # obtained using a modified version of ./fasttext print-word-vectors lee_fasttext_new.bin
        expected_vec_oov = [
            -0.53378,
            -0.19,
            0.013482,
            -0.86767,
            -0.21684,
            -0.89928,
            0.45124,
            0.18025,
            -0.14128,
            0.22508
        ]
        self.assertTrue(np.allclose(new_model["rejection"], expected_vec_oov, atol=1e-4))

        self.assertEqual(new_model.min_count, 5)
        self.assertEqual(new_model.window, 5)
        self.assertEqual(new_model.iter, 5)
        self.assertEqual(new_model.negative, 5)
        self.assertEqual(new_model.sample, 0.0001)
        self.assertEqual(new_model.bucket, 1000)
        self.assertEqual(new_model.wv.max_n, 6)
        self.assertEqual(new_model.wv.min_n, 3)
        self.assertEqual(new_model.wv.syn0.shape, (len(new_model.wv.vocab), new_model.vector_size))
        self.assertEqual(new_model.wv.syn0_ngrams.shape, (new_model.num_ngram_vectors, new_model.vector_size))
Example #14
    def test_load_fasttext_format(self):
        try:
            model = FT_gensim.load_fasttext_format(self.test_model_file)
        except Exception as exc:
            self.fail('Unable to load FastText model from file %s: %s' % (self.test_model_file, exc))
        vocab_size, model_size = 1762, 10
        self.assertEqual(model.wv.syn0.shape, (vocab_size, model_size))
        self.assertEqual(len(model.wv.vocab), vocab_size)
        self.assertEqual(model.wv.syn0_ngrams.shape, (model.num_ngram_vectors, model_size))

        expected_vec = [
            -0.57144,
            -0.0085561,
            0.15748,
            -0.67855,
            -0.25459,
            -0.58077,
            -0.09913,
            1.1447,
            0.23418,
            0.060007
        ]  # obtained using ./fasttext print-word-vectors lee_fasttext_new.bin
        self.assertTrue(np.allclose(model["hundred"], expected_vec, atol=1e-4))

        # vector for oov words are slightly different from original FastText due to discarding unused ngrams
        # obtained using a modified version of ./fasttext print-word-vectors lee_fasttext_new.bin
        expected_vec_oov = [
            -0.23825,
            -0.58482,
            -0.22276,
            -0.41215,
            0.91015,
            -1.6786,
            -0.26724,
            0.58818,
            0.57828,
            0.75801
        ]
        self.assertTrue(np.allclose(model["rejection"], expected_vec_oov, atol=1e-4))

        self.assertEqual(model.min_count, 5)
        self.assertEqual(model.window, 5)
        self.assertEqual(model.iter, 5)
        self.assertEqual(model.negative, 5)
        self.assertEqual(model.sample, 0.0001)
        self.assertEqual(model.bucket, 1000)
        self.assertEqual(model.wv.max_n, 6)
        self.assertEqual(model.wv.min_n, 3)
        self.assertEqual(model.wv.syn0.shape, (len(model.wv.vocab), model.vector_size))
        self.assertEqual(model.wv.syn0_ngrams.shape, (model.num_ngram_vectors, model.vector_size))
Example #15
    def test_norm_vectors_not_saved(self):
        model = FT_gensim(sentences, min_count=1)
        model.init_sims()
        model.save(testfile())
        loaded_model = FT_gensim.load(testfile())
        self.assertTrue(loaded_model.wv.syn0norm is None)
        self.assertTrue(loaded_model.wv.syn0_ngrams_norm is None)

        wv = model.wv
        wv.save(testfile())
        loaded_kv = FastTextKeyedVectors.load(testfile())
        self.assertTrue(loaded_kv.syn0norm is None)
        self.assertTrue(loaded_kv.syn0_ngrams_norm is None)
Example #16
import os
from gensim.models.word2vec import LineSentence
from gensim.models.fasttext import FastText as FT_gensim

from fileObject import FileObj

file_obj = FileObj(r"testSet/data")
sentences = file_obj.read_lines_1_words()
"""
model_gensim = FT_gensim(size=100)
model_gensim.build_vocab(sentences)
model_gensim.train(sentences, total_examples=model_gensim.corpus_count, epochs=model_gensim.iter)
model_gensim.save('saved_model_gensim')
"""

loaded_model = FT_gensim.load('saved_model_gensim')
print(loaded_model)
print(loaded_model.most_similar('老人'))
print(loaded_model.doesnt_match("老人 小孩 孕妇 胃疼".split(" ")))

#sentence_obama = ["老人","高血压","怎么办"]
#sentence_president = ["青年","高血压","怎么办"]
#distance = loaded_model.wmdistance(sentence_obama, sentence_president)
#print(distance)

sentence_query = ["晚上", "经常", "失眠", "怎么办"]

sim_max = 0
sim_index = 0
sim_list = []
for i in range(len(sentences)):
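The loop body is truncated in the source above. A hedged sketch of how the ranking might continue, assuming each element of sentences is a list of tokens; this is an illustration, not the original author's code:

# Hypothetical continuation: rank every corpus sentence by similarity to sentence_query.
for i in range(len(sentences)):
    try:
        sim = loaded_model.wv.n_similarity(sentence_query, sentences[i])
    except (KeyError, ZeroDivisionError):  # empty sentence or token with no usable n-grams
        sim = 0.0
    sim_list.append(sim)
    if sim > sim_max:
        sim_max = sim
        sim_index = i
print(sim_index, sim_max)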
Example #17
#print(corpus)

wpt = nltk.WordPunctTokenizer()
tokenized_corpus = [wpt.tokenize(doc) for doc in corpus]
print(tokenized_corpus[1])

#Using Fasttext..............
feature_size = 50
window_context = 10
min_word_count = 5
sample = 1e-3

fasttext_model = FastText(tokenized_corpus,
                          size=feature_size,
                          window=window_context,
                          min_count=min_word_count,
                          sample=sample,
                          sg=1,  # 1 = skip-gram, 0 = CBOW
                          iter=20)

print(fasttext_model.wv['god'])

#Finding similar words..............
similar_words = {
    search_term: [
        item[0]
        for item in fasttext_model.wv.most_similar([search_term], topn=5)
    ]
    for search_term in ['god', 'jesus', 'egypt', 'john']
}
print(similar_words)
Example #18
def load_old_fasttext(*args, **kwargs):
    old_model = FastText.load(*args, **kwargs)
    params = {
        'size': old_model.vector_size,
        'alpha': old_model.alpha,
        'window': old_model.window,
        'min_count': old_model.min_count,
        'max_vocab_size': old_model.__dict__.get('max_vocab_size', None),
        'sample': old_model.sample,
        'seed': old_model.seed,
        'workers': old_model.workers,
        'min_alpha': old_model.min_alpha,
        'sg': old_model.sg,
        'hs': old_model.hs,
        'negative': old_model.negative,
        'cbow_mean': old_model.cbow_mean,
        'hashfxn': old_model.hashfxn,
        'iter': old_model.iter,
        'null_word': old_model.null_word,
        'sorted_vocab': old_model.sorted_vocab,
        'batch_words': old_model.batch_words,
        'min_n': old_model.min_n,
        'max_n': old_model.max_n,
        'word_ngrams': old_model.word_ngrams,
        'bucket': old_model.bucket
    }
    new_model = NewFastText(**params)
    # set trainables attributes
    new_model.wv.vectors = old_model.wv.syn0
    new_model.wv.vectors_vocab = old_model.wv.syn0_vocab
    new_model.wv.vectors_ngrams = old_model.wv.syn0_ngrams
    if hasattr(old_model.wv, 'syn0norm'):
        new_model.wv.vectors_norm = old_model.wv.syn0norm
    if hasattr(old_model, 'syn1'):
        new_model.trainables.syn1 = old_model.syn1
    if hasattr(old_model, 'syn1neg'):
        new_model.trainables.syn1neg = old_model.syn1neg
    if hasattr(old_model, 'syn0_lockf'):
        new_model.trainables.vectors_lockf = old_model.syn0_lockf

    if hasattr(old_model, 'syn0_vocab_lockf'):
        new_model.trainables.vectors_vocab_lockf = old_model.syn0_vocab_lockf
    if hasattr(old_model, 'syn0_ngrams_lockf'):
        new_model.trainables.vectors_ngrams_lockf = old_model.syn0_ngrams_lockf
    if hasattr(old_model.wv, 'syn0_vocab_norm'):
        new_model.trainables.vectors_vocab_norm = old_model.wv.syn0_vocab_norm
    if hasattr(old_model.wv, 'syn0_ngrams_norm'):
        new_model.trainables.vectors_ngrams_norm = old_model.wv.syn0_ngrams_norm

    # set vocabulary attributes
    new_model.wv.vocab = old_model.wv.vocab
    new_model.wv.index2word = old_model.wv.index2word
    new_model.vocabulary.cum_table = old_model.cum_table

    new_model.wv.hash2index = old_model.wv.hash2index

    new_model.train_count = old_model.train_count
    new_model.corpus_count = old_model.corpus_count
    new_model.corpus_total_words = old_model.corpus_total_words
    new_model.running_training_loss = old_model.running_training_loss
    new_model.total_train_time = old_model.total_train_time
    new_model.min_alpha_yet_reached = old_model.min_alpha_yet_reached
    new_model.model_trimmed_post_training = old_model.model_trimmed_post_training

    new_model.trainables.num_ngram_vectors = old_model.num_ngram_vectors

    return new_model
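A one-line usage sketch for the helper above; the path is a placeholder for a model saved by an older gensim release:

# Hypothetical usage: rebuild a current-API FastText model from an old-format save.
new_model = load_old_fasttext('old_fasttext.model')
print(new_model.wv.vectors.shape, len(new_model.wv.vocab))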
Example #19
 def test_bucket_ngrams(self):
     model = FT_gensim(size=10, min_count=1, bucket=20)
     model.build_vocab(sentences)
     self.assertEqual(model.wv.syn0_ngrams.shape, (20, 10))
     model.build_vocab(new_sentences, update=True)
     self.assertEqual(model.wv.syn0_ngrams.shape, (20, 10))
Example #20
 def test_get_vocab_word_vecs(self):
     model = FT_gensim(size=10, min_count=1, seed=42)
     model.build_vocab(sentences)
     original_syn0_vocab = np.copy(model.wv.syn0_vocab)
     model.trainables.get_vocab_word_vecs(model.wv)
     self.assertTrue(np.all(np.equal(model.wv.syn0_vocab, original_syn0_vocab)))
Example #21
from lxml import etree

# Assumed: the snippet starts mid-file; corpora 1 and 3 are presumably parsed the same way as corpus 2.
tree_t_1 = etree.parse("a_corpus_tokens_1.xml")
tree_t_2 = etree.parse("a_corpus_tokens_2.xml")
tree_t_3 = etree.parse("a_corpus_tokens_3.xml")

root_t_1 = tree_t_1.getroot()
root_t_2 = tree_t_2.getroot()
root_t_3 = tree_t_3.getroot()

all_1 = root_t_1.findall(".//s")
all_2 = root_t_2.findall(".//s")
all_3 = root_t_3.findall(".//s")
all_sent = all_1 + all_2 + all_3


class MyCorpus(object):
    def __iter__(self):
        for sentences in all_sent:
            yield [entities.text.lower() for entities in sentences]
from pprint import pprint as print
from gensim.models.fasttext import FastText
from gensim.test.utils import datapath
model = FastText(size=200, min_n=3, window=7)
model.build_vocab(sentences=MyCorpus())
model.train(
    sentences=MyCorpus(), epochs=model.epochs,
    total_examples=model.corpus_count, total_words=model.corpus_total_words)
from gensim.test.utils import get_tmpfile
fname = get_tmpfile("lower_better_bigger.model")
model.save(fname)
model = FastText.load(fname)
import gensim.models

sentences = MyCorpus()
model_vec2 = gensim.models.Word2Vec(sentences=sentences, size=200)
from gensim.test.utils import get_tmpfile
fname = get_tmpfile("lower_better_bigger_2vec.model")
model_vec2.save(fname)
Example #22
    def load_word2vec(self, model_dir: str = config["paths"]["pretrain_dir"][plat][user],
                      model_type: str = 'googlenews', encoding: str = 'latin-1',
                      model_file_name: str = "GoogleNews-vectors-negative300.bin") ->\
            gensim.models.keyedvectors.Word2VecKeyedVectors:
        """
        Loads a pretrained embedding model and returns its vectors for initializing an embedding layer.

        inputs:
        model_dir          # directory containing the pretrained model file
        model_type         # googlenews / fasttext_wiki / glove / bert_multi
        encoding           # text encoding of the pretrained file
        model_file_name    # file name of the pretrained model
        """
        if self.pretrain_model is not None: return self.pretrain_model

        assert exists(
            join(model_dir,
                 model_file_name)), "Model file not found at: [{}].".format(
                     join(model_dir, model_file_name))
        logger.debug("Using [{0}] model from [{1}]".format(
            model_type, join(model_dir, model_file_name)))
        if model_type == 'googlenews' or model_type == "fasttext_wiki":
            if exists(join(model_dir, model_file_name + '.bin')):
                try:
                    pretrain_model = FastText.load_fasttext_format(
                        join(model_dir, model_file_name +
                             '.bin'))  ## For original fasttext *.bin format.
                except Exception as e:
                    pretrain_model = KeyedVectors.load_word2vec_format(
                        join(model_dir, model_file_name + '.bin'),
                        binary=True,
                        encoding=encoding)
            else:
                try:
                    pretrain_model = KeyedVectors.load_word2vec_format(
                        join(model_dir, model_file_name), binary=self.binary)
                except Exception as e:  ## On exception, trying a different format.
                    logger.info(
                        'Loading original word2vec format failed. Trying Gensim format.'
                    )
                    pretrain_model = KeyedVectors.load(
                        join(model_dir, model_file_name))
                ## Save model in binary format for faster loading in future.
                pretrain_model.save_word2vec_format(join(
                    model_dir, model_file_name + ".bin"),
                                                    binary=True)
                logger.info("Saved binary model at: [{0}]".format(
                    join(model_dir, model_file_name + ".bin")))
                logger.info(type(pretrain_model))

        elif model_type == 'glove':
            logger.info('Loading existing Glove model: [{0}]'.format(
                join(model_dir, model_file_name)))
            from gensim.scripts.glove2word2vec import glove2word2vec
            from gensim.test.utils import datapath, get_tmpfile

            glove_file = datapath(join(model_dir, model_file_name))
            tmp_file = get_tmpfile(
                join(model_dir, model_file_name + "_word2vec"))
            _ = glove2word2vec(glove_file, tmp_file)
            pretrain_model = KeyedVectors.load_word2vec_format(tmp_file)

        elif model_type == "bert_multi":
            bert_model = BertModel.from_pretrained('bert-base-uncased')
            bert_model.eval()
            # pretrain_model = FastText.load_fasttext_format(join(model_dir,model_file_name))
            # pretrain_model = FastText.load_binary_data (join(model_dir,model_file_name))
            pretrain_model = KeyedVectors.load_word2vec_format(join(
                model_dir, model_file_name),
                                                               binary=False)
            # import io
            # fin = io.open(join(model_dir, model_file_name), encoding=encoding, newline=newline,
            #               errors=errors)
            # n, d = map(int, fin.readline().split())
            # pretrain_model = OrderedDict()
            # for line in fin:
            #     tokens = line.rstrip().split(' ')
            #     pretrain_model[tokens[0]] = map(float, tokens[1:])
            """embedding_dict = gensim.models.KeyedVectors.load_word2vec_format(dictFileName, binary=False) embedding_dict.save_word2vec_format(dictFileName+".bin", binary=True) embedding_dict = gensim.models.KeyedVectors.load_word2vec_format(dictFileName+".bin", binary=True)"""
        else:
            raise ValueError('Unknown pretrain model type: %s!' % model_type)
        self.pretrain_model = pretrain_model
        return self.pretrain_model
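A hedged call sketch for the method above; the owning class, directory, and file names are placeholders for whatever the surrounding project actually defines:

# Hypothetical usage sketch; TextEmbeddings stands in for the class that defines load_word2vec().
embedder = TextEmbeddings()
wv = embedder.load_word2vec(
    model_dir='/data/pretrained',
    model_type='glove',
    model_file_name='glove.6B.300d.txt')
print(wv['computer'].shape)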
Example #23
macro_cols = [x + '_' + str(i) for x in macro_columns for i in range(N_parts)]
macro_stress_df = create_overall(
    pd.DataFrame(macro_stress, columns=macro_cols), 'max', N_parts)
macro_df = create_overall(pd.DataFrame(macro, columns=macro_cols), 'max',
                          N_parts)

data = pd.concat([data.loc[:, :'BENEFICIAR_ORG'], macro_df], axis=1)
data_stress = pd.concat(
    [data_stress.loc[:, :'BENEFICIAR_ORG'], macro_stress_df], axis=1)

pd.Series('\t'.join(data['PRINCIPAL_NAME'] + ' ' +
                    data['BENEFICIAR_NAME'])).to_csv('text.txt',
                                                     sep='\t',
                                                     index=None)
string = LineSentence('text.txt')
fasttext = FastText(size=50, sg=0, word_ngrams=2, iter=10, min_n=2, max_n=10)
fasttext.build_vocab(string)
fasttext.train(string,
               total_examples=fasttext.corpus_count,
               epochs=fasttext.iter)


def to_sent_emb(string):
    return np.sum([fasttext[x] for x in string.split(' ') if len(x) > 1],
                  axis=0)


emb_df = pd.DataFrame(
    np.vstack((data['PRINCIPAL_NAME'] + ' ' +
               data['BENEFICIAR_NAME']).apply(to_sent_emb)))
emb_df.columns = ['emb_' + str(w) for w in emb_df.columns]
Example #24
 def setUp(self):
     ft_home = os.environ.get('FT_HOME', None)
     self.ft_path = os.path.join(ft_home, 'fasttext') if ft_home else None
     self.test_model_file = datapath('lee_fasttext')
     self.test_model = FT_gensim.load_fasttext_format(self.test_model_file)
     self.test_new_model_file = datapath('lee_fasttext_new')
Example #25
 def test_load_model_supervised(self):
     with self.assertRaises(NotImplementedError):
         FT_gensim.load_fasttext_format(
             datapath('pang_lee_polarity_fasttext'))
Example #26
from gensim.models.fasttext import FastText as FT_gensim
from gensim.test.utils import datapath
import numpy as np
#from gensim.models.fasttext import FastText, load_facebook_vectors  # a closer look at this might make updating possible

cap_path = datapath(
    '/Users/hanjaewon/폴더모음/학교생활/졸업과제/fasttextTest/model_report'
)
model = FT_gensim.load(cap_path)
corpus_file = datapath(
    '/Users/hanjaewon/폴더모음/학교생활/졸업과제/fasttextTest/new.txt')
#model.build_vocab(sentences=common_texts)
#model.build_vocab(sentences=text)
#model.train(sentences=text, total_examples=len(text), epochs=model.epochs)
print('Is "채스" in the vocab?')
print('채스' in model.wv.vocab)
print(model.corpus_count)
#print('vector for "채스"?')
#print(model.wv.__getitem__('채스'))
model.build_vocab(corpus_file=corpus_file, update=True)
model.train(corpus_file=corpus_file,
            epochs=model.epochs,
            total_examples=model.corpus_count,
            total_words=model.corpus_total_words)
print('---------- after update ----------')
print('Is "채스" in the vocab?')
print('채스' in model.wv.vocab)
print(model.corpus_count)
#print('vector for "채스"?')
#print(model.wv.__getitem__('채스'))
Example #27
sentences = list(CreateCorpus(args.protocols))

if args.model_architecture == 'word2vec':
    model = Word2Vec(sentences=sentences,
                     size=args.size,
                     window=args.window,
                     min_count=args.min_count,
                     workers=args.threads,
                     sg=args.sg,
                     hs=args.hs,
                     negative=args.ns)
elif args.model_architecture == 'fasttext':
    model = FT_gensim(size=args.size,
                      window=args.window,
                      min_count=args.min_count,
                      workers=args.threads,
                      sg=args.sg,
                      hs=args.hs,
                      negative=args.ns)

    # build the vocabulary
    model.build_vocab(sentences)

    # train the model
    model.train(sentences,
                epochs=model.epochs,
                total_examples=model.corpus_count,
                total_words=model.corpus_total_words)

elapsed = time.time()
logging.info(f'Training finished. Took {elapsed-start} s')
Example #28
import os
import pickle
import numpy as np
import sys
# from smart_bug import Statement_Norm, Statement_Vec
# from smart_bug import Statement_Vec
from gensim.models.fasttext import FastText
from scipy.spatial.distance import pdist, cdist, squareform
from scipy.spatial import distance

BUG_FASTTEXT_MODEL = FastText.load(
    "../statement_level/Model/FastText/fasttext_model")
print("Statement FastText Model loaded")

sys.path.append('../statement_level/Normalize')
from statement_normalization import Statement_Norm
sys.path.append('../statement_level/Vectorize/')
from statement_vectorize import Statement_Vec


def save_to_file(messagecontent):
    if not os.path.exists('./USERINPUT'):
        os.makedirs('./USERINPUT')
    with open('./USERINPUT/current.sol', 'w') as handle:
        handle.write(messagecontent)


def parser():
    # cmd = "java -classpath ./State_Parse/antlr4.jar:./State_Parse/target/ Tokenize ./Bug/current.sol ./Bug/"
    cmd = "java -classpath ../statement_level/Parse/antlr4.jar:../statement_level/Parse/target/ Tokenize ./USERINPUT/current.sol ./STATEMENT_RESULT/"
    os.system(cmd)
Example #29
def genex_reviews():
    for i in df['Review'].values:
        yield simple_preprocess(i)


reviews = df['Review'].values
# reviews = genex_reviews()

# window=10: use up to 10 context words on each side of the target word (gensim's default window is 5)
# training the CBOW (Continuous Bag of Words) model; sg=0 is the gensim default
model_cbow = Word2Vec(reviews, window=10, min_count=2, workers=10)
model_cbow.train(reviews, total_examples=len(reviews), epochs=50)

# training the char n-gram model (subword information) with fastText
model_subword = FastText(reviews, window=10, min_count=2, workers=10, min_n=3, max_n=6)
model_subword.train(reviews, total_examples=len(reviews), epochs=50)

# training the SkipGram model
model_skipgram = Word2Vec(reviews, window=10, min_count=2, workers=10, sg=1)
model_skipgram.train(reviews, total_examples=len(reviews), epochs=50)

# saving the models
model_cbow.save("cbow.model")
model_subword.save("fasttext.model")
model_skipgram.save("skipgram.model")

# saving the word vectors
model_cbow.wv.save("cbow_vector.bin")
model_subword.wv.save("subword_vector.bin")
model_skipgram.wv.save("skipgram_vector.bin")
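A short hedged sketch of reloading the saved vector files later; gensim restores the class that was saved, so the fastText vectors keep their subword handling for out-of-vocabulary lookups (the query words below are assumed to appear in the review vocabulary):

# Hypothetical reload sketch for the vector files saved above.
from gensim.models import KeyedVectors

cbow_vectors = KeyedVectors.load("cbow_vector.bin")
subword_vectors = KeyedVectors.load("subword_vector.bin")
print(cbow_vectors.most_similar("service", topn=3))  # 'service' assumed to be in the vocabulary
print(subword_vectors["servise"][:5])  # misspelling resolved via character n-grams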
Example #30
 def test_load_model_supervised(self):
     with self.assertRaises(NotImplementedError):
         FT_gensim.load_fasttext_format(datapath('pang_lee_polarity_fasttext'))
Example #31
 def __init__(self, paraphrases):
     self.embeddings = FastText.load(EMBEDDINGS_FOLDER + EMBEDDINGS_FILE,
                                     mmap='r')
     self.paraphrases = paraphrases
     #self.stemmer = SnowballStemmer('russian')
     self.stemmer = Mystem()
Example #32
 def fit_model(self, corpus: List):
     self.model = FastText(sentences=corpus, **self.additional_parameters)
Example #33
"""
Script for ranking a list of pubmed results by date, ft, ...
"""

from gensim.models.fasttext import FastText
from utils import preprocess
import numpy as np
import nltk
from nltk.corpus import stopwords

ft_model = FastText.load('models/ft/med_model_dim300_win5_min100.bin')
stops = set(stopwords.words('english'))


def _remove_stopwords(sentence):
    """

    :param sentence: list of words
    :return: list of words
    """
    if isinstance(sentence, list):
        return [word for word in sentence if not word in stops]

    else:
        sentence = sentence.split()
        sentence = [word for word in sentence if not word in stops]
        sentence = ' '.join(sentence)
        return sentence


def similarity(w1, w2):
Example #34
 def test_bucket_ngrams(self):
     model = FT_gensim(size=10, min_count=1, bucket=20)
     model.build_vocab(sentences)
     self.assertEqual(model.wv.vectors_ngrams.shape, (20, 10))
     model.build_vocab(new_sentences, update=True)
     self.assertEqual(model.wv.vectors_ngrams.shape, (20, 10))
Example #35
 def __init__(self, model_name=None, storage=None, *args, **kwargs):
     BaseEmbeddingModel.__init__(self,
                                 model_name=model_name,
                                 storage=storage)
     FT_gensim.__init__(self, *args, **kwargs)
     self.model_type = FT_gensim.__name__.lower()
Example #36
 def setUp(self):
     ft_home = os.environ.get('FT_HOME', None)
     self.ft_path = os.path.join(ft_home, 'fasttext') if ft_home else None
     self.test_model_file = datapath('lee_fasttext')
     self.test_model = FT_gensim.load_fasttext_format(self.test_model_file)
     self.test_new_model_file = datapath('lee_fasttext_new')
Example #37
    def fit(self, X: np.array, y: csr_matrix):

        if self.verbose:
            #logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
            # TODO revert this
            pass

        X_splitted = np.array([s.split() for s in X])
        #docs = [TaggedDocument(words=tokens, tags=[index]) for index, tokens in enumerate(X_splitted)]

        if self.model.lower() == 'fasttext':
            self.wv_model_ = FastText(sentences=X_splitted.tolist(),
                                      size=self.embedding_dim,
                                      iter=self.epochs,
                                      min_count=self.min_count,
                                      window=self.window_size,
                                      workers=self.n_jobs)
        elif self.model.lower() == 'doc2vec':
            # note: despite the 'doc2vec' option name, this branch trains a plain Word2Vec model
            # (the TaggedDocument construction above is commented out)
            self.wv_model_ = Word2Vec(
                sentences=X_splitted.tolist(),
                size=self.embedding_dim,
                iter=self.epochs,
                min_count=self.min_count,
                window=self.window_size,
                workers=self.n_jobs,
            )

        else:
            raise NotImplementedError

        tag_doc_mapping = self._create_tag_docs(y)

        if self.tfidf_weighting:
            self.tfidf_ = TfidfVectorizer()
            self.texts_tfidf_ = self.tfidf_.fit_transform(X)

        self.tag_embeddings_ = np.empty((y.shape[1], self.embedding_dim),
                                        dtype='float64')

        if self.verbose:
            tac_doc_iterator = tqdm(enumerate(tag_doc_mapping),
                                    desc='Computing tag embeddings')
        else:
            tac_doc_iterator = enumerate(tag_doc_mapping)
        for tag_id, texts_idx in tac_doc_iterator:
            # will be of shape(n_texts, embedding_dim)
            tag_word_embeddings = []
            for text_ind in texts_idx:
                for token in list(set(X_splitted[text_ind])):
                    try:
                        word_embedding = self.wv_model_.wv[token]
                    except KeyError:
                        # if words occur that are ignored due to min_count
                        continue
                    if self.tfidf_weighting:
                        token_ind = self.tfidf_.vocabulary_.get(token, -1)
                        if token_ind > -1:
                            tfidf_value = self.texts_tfidf_[text_ind,
                                                            token_ind]
                            word_embedding = word_embedding * tfidf_value
                    tag_word_embeddings.append(word_embedding)

            self.tag_embeddings_[tag_id] = self.pooling_func(
                tag_word_embeddings)
        return self
Example #38
def load_old_fasttext(*args, **kwargs):
    old_model = FastText.load(*args, **kwargs)
    params = {
        'size': old_model.vector_size,
        'alpha': old_model.alpha,
        'window': old_model.window,
        'min_count': old_model.min_count,
        'max_vocab_size': old_model.__dict__.get('max_vocab_size', None),
        'sample': old_model.sample,
        'seed': old_model.seed,
        'workers': old_model.workers,
        'min_alpha': old_model.min_alpha,
        'sg': old_model.sg,
        'hs': old_model.hs,
        'negative': old_model.negative,
        'cbow_mean': old_model.cbow_mean,
        'hashfxn': old_model.hashfxn,
        'iter': old_model.iter,
        'null_word': old_model.null_word,
        'sorted_vocab': old_model.sorted_vocab,
        'batch_words': old_model.batch_words,
        'min_n': old_model.min_n,
        'max_n': old_model.max_n,
        'word_ngrams': old_model.word_ngrams,
        'bucket': old_model.bucket
    }
    new_model = NewFastText(**params)
    # set trainables attributes
    new_model.wv.vectors = old_model.wv.syn0
    new_model.wv.vectors_vocab = old_model.wv.syn0_vocab
    new_model.wv.vectors_ngrams = old_model.wv.syn0_ngrams
    if hasattr(old_model.wv, 'syn0norm'):
        new_model.wv.vectors_norm = old_model.wv.syn0norm
    if hasattr(old_model, 'syn1'):
        new_model.trainables.syn1 = old_model.syn1
    if hasattr(old_model, 'syn1neg'):
        new_model.trainables.syn1neg = old_model.syn1neg
    if hasattr(old_model, 'syn0_lockf'):
        new_model.trainables.vectors_lockf = old_model.syn0_lockf

    if hasattr(old_model, 'syn0_vocab_lockf'):
        new_model.trainables.vectors_vocab_lockf = old_model.syn0_vocab_lockf
    if hasattr(old_model, 'syn0_ngrams_lockf'):
        new_model.trainables.vectors_ngrams_lockf = old_model.syn0_ngrams_lockf
    if hasattr(old_model.wv, 'syn0_vocab_norm'):
        new_model.trainables.vectors_vocab_norm = old_model.wv.syn0_vocab_norm
    if hasattr(old_model.wv, 'syn0_ngrams_norm'):
        new_model.trainables.vectors_ngrams_norm = old_model.wv.syn0_ngrams_norm

    # set vocabulary attributes
    new_model.wv.vocab = old_model.wv.vocab
    new_model.wv.index2word = old_model.wv.index2word
    new_model.vocabulary.cum_table = old_model.cum_table

    new_model.wv.hash2index = old_model.wv.hash2index

    new_model.train_count = old_model.train_count
    new_model.corpus_count = old_model.corpus_count
    new_model.running_training_loss = old_model.running_training_loss
    new_model.total_train_time = old_model.total_train_time
    new_model.min_alpha_yet_reached = old_model.min_alpha_yet_reached
    new_model.model_trimmed_post_training = old_model.model_trimmed_post_training

    new_model.trainables.num_ngram_vectors = old_model.num_ngram_vectors

    return new_model
Example #39
import torch
import jieba
from torch import nn
from pytorch_pretrained_bert import BertModel, BertAdam
from gensim.models.fasttext import FastText
from tqdm import tqdm
from pyltp import Postagger

MODEL_PATH = 'bert-model'
jieba.load_userdict('bert-model/dict-traditional.txt')
seq_len = 512

# Load vocabularies
print('Building word vectors...')
word2vec = FastText.load_fasttext_format('bert-model/wordvec-large.dim1024')
vocab = {}
id2vocab = {}
vec = []

with open('bert-model/TF.csv') as TF:
    for idx, line in enumerate(tqdm(TF)):
        term = line.split(',')[0]
        vocab[term] = idx
        id2vocab[idx] = term
        vec.append(word2vec[term])

del word2vec

model = torch.load('checkpoint-generator-pretrain/bert-LanGen-last.pt')['full_model']
POS = Postagger()
POS.load('bert-model/ltp_data_v3.4.0/pos.model')
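A hedged follow-up sketch: the vec list collected above can be wrapped as a frozen PyTorch embedding layer; the lookup term is a placeholder and the vector dimensionality is whatever the wordvec-large.dim1024 model provides:

# Hypothetical sketch: expose the collected fastText vectors as a frozen nn.Embedding.
import numpy as np

weights = torch.from_numpy(np.asarray(vec, dtype='float32'))  # shape: (len(vocab), embedding dim)
embedding = nn.Embedding.from_pretrained(weights, freeze=True)
token_id = torch.tensor([vocab.get('臺灣', 0)])  # '臺灣' is a placeholder lookup term
print(embedding(token_id).shape)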