Example #1
 def testPersistenceKeyedVectorsFormatWithVocab(self):
     """Test storing/loading the entire model and vocabulary in word2vec format."""
     tmpf = get_tmpfile('gensim_word2vec.tst')
     model = word2vec.Word2Vec(sentences, min_count=1)
     model.init_sims()
     testvocab = get_tmpfile('gensim_word2vec.vocab')
     model.wv.save_word2vec_format(tmpf, testvocab, binary=True)
     kv_binary_model_with_vocab = keyedvectors.KeyedVectors.load_word2vec_format(tmpf, testvocab, binary=True)
     self.assertEqual(model.wv.vocab['human'].count, kv_binary_model_with_vocab.vocab['human'].count)
Example #2
    def test_persistence_fromfile(self):
        """Test storing/loading the entire model."""
        with temporary_file(get_tmpfile('gensim_word2vec.tst')) as corpus_file:
            save_lee_corpus_as_line_sentence(corpus_file)

            tmpf = get_tmpfile('gensim_doc2vec.tst')
            model = doc2vec.Doc2Vec(corpus_file=corpus_file, min_count=1)
            model.save(tmpf)
            self.models_equal(model, doc2vec.Doc2Vec.load(tmpf))
Example #3
 def testPersistenceWord2VecFormatCombinationWithStandardPersistence(self):
     """Test storing/loading the entire model and vocabulary in word2vec format chained with
      saving and loading via `save` and `load` methods`.
      It was possible prior to 1.0.0 release, now raises Exception"""
     tmpf = get_tmpfile('gensim_word2vec.tst')
     model = word2vec.Word2Vec(sentences, min_count=1)
     model.init_sims()
     testvocab = get_tmpfile('gensim_word2vec.vocab')
     model.wv.save_word2vec_format(tmpf, testvocab, binary=True)
     binary_model_with_vocab_kv = keyedvectors.KeyedVectors.load_word2vec_format(tmpf, testvocab, binary=True)
     binary_model_with_vocab_kv.save(tmpf)
     self.assertRaises(AttributeError, word2vec.Word2Vec.load, tmpf)
Example #4
    def test_persistence(self):
        # Test persistence without using `smartirs`
        fname = get_tmpfile('gensim_models.tst')
        model = tfidfmodel.TfidfModel(self.corpus, normalize=True)
        model.save(fname)
        model2 = tfidfmodel.TfidfModel.load(fname)
        self.assertTrue(model.idfs == model2.idfs)
        tstvec = [corpus[1], corpus[2]]
        self.assertTrue(np.allclose(model[tstvec[0]], model2[tstvec[0]]))
        self.assertTrue(np.allclose(model[tstvec[1]], model2[tstvec[1]]))
        self.assertTrue(np.allclose(model[[]], model2[[]]))  # try projecting an empty vector

        # Test persistence using `smartirs`
        fname = get_tmpfile('gensim_models_smartirs.tst')
        model = tfidfmodel.TfidfModel(self.corpus, smartirs="ntc")
        model.save(fname)
        model2 = tfidfmodel.TfidfModel.load(fname)
        self.assertTrue(model.idfs == model2.idfs)
        tstvec = [corpus[1], corpus[2]]
        self.assertTrue(np.allclose(model[tstvec[0]], model2[tstvec[0]]))
        self.assertTrue(np.allclose(model[tstvec[1]], model2[tstvec[1]]))
        self.assertTrue(np.allclose(model[[]], model2[[]]))  # try projecting an empty vector

        # Test persistence between a Gensim v3.2.0 model and the current model.
        model3 = tfidfmodel.TfidfModel(self.corpus, smartirs="ntc")
        model4 = tfidfmodel.TfidfModel.load(datapath('tfidf_model.tst'))
        idfs3 = [model3.idfs[key] for key in sorted(model3.idfs.keys())]
        idfs4 = [model4.idfs[key] for key in sorted(model4.idfs.keys())]
        self.assertTrue(np.allclose(idfs3, idfs4))
        tstvec = [corpus[1], corpus[2]]
        self.assertTrue(np.allclose(model3[tstvec[0]], model4[tstvec[0]]))
        self.assertTrue(np.allclose(model3[tstvec[1]], model4[tstvec[1]]))
        self.assertTrue(np.allclose(model3[[]], model4[[]]))  # try projecting an empty vector

        # Test persistence using pivoted normalization
        fname = get_tmpfile('gensim_models_smartirs.tst')
        model = tfidfmodel.TfidfModel(self.corpus, pivot=0, slope=1)
        model.save(fname)
        model2 = tfidfmodel.TfidfModel.load(fname, mmap=None)
        self.assertTrue(model.idfs == model2.idfs)
        tstvec = [corpus[1], corpus[2]]
        self.assertTrue(np.allclose(model[tstvec[0]], model2[tstvec[0]]))
        self.assertTrue(np.allclose(model[tstvec[1]], model2[tstvec[1]]))

        # Test persistence between a Gensim v3.2.0 model and a pivoted-normalization compressed model.
        model3 = tfidfmodel.TfidfModel(self.corpus, pivot=0, slope=1)
        model4 = tfidfmodel.TfidfModel.load(datapath('tfidf_model.tst'))
        idfs3 = [model3.idfs[key] for key in sorted(model3.idfs.keys())]
        idfs4 = [model4.idfs[key] for key in sorted(model4.idfs.keys())]
        self.assertTrue(np.allclose(idfs3, idfs4))
        tstvec = [corpus[1], corpus[2]]
        self.assertTrue(np.allclose(model3[tstvec[0]], model4[tstvec[0]]))
        self.assertTrue(np.allclose(model3[tstvec[1]], model4[tstvec[1]]))
Example #5
    def test_saveAsText(self):
        """`Dictionary` can be saved as a text file."""
        tmpf = get_tmpfile('save_dict_test.txt')
        small_text = [
            ["prvé", "slovo"],
            ["slovo", "druhé"],
            ["druhé", "slovo"]
        ]

        d = Dictionary(small_text)

        d.save_as_text(tmpf)
        with codecs.open(tmpf, 'r', encoding='utf-8') as file:
            serialized_lines = file.readlines()
            self.assertEqual(serialized_lines[0], u"3\n")
            self.assertEqual(len(serialized_lines), 4)
            # We do not know which word will have which index
            self.assertEqual(serialized_lines[1][1:], u"\tdruhé\t2\n")
            self.assertEqual(serialized_lines[2][1:], u"\tprvé\t1\n")
            self.assertEqual(serialized_lines[3][1:], u"\tslovo\t3\n")

        d.save_as_text(tmpf, sort_by_word=False)
        with codecs.open(tmpf, 'r', encoding='utf-8') as file:
            serialized_lines = file.readlines()
            self.assertEqual(serialized_lines[0], u"3\n")
            self.assertEqual(len(serialized_lines), 4)
            self.assertEqual(serialized_lines[1][1:], u"\tslovo\t3\n")
            self.assertEqual(serialized_lines[2][1:], u"\tdruhé\t2\n")
            self.assertEqual(serialized_lines[3][1:], u"\tprvé\t1\n")
Example #6
    def testMmap(self):
        if self.cls == similarities.WmdSimilarity and not PYEMD_EXT:
            return

        fname = get_tmpfile('gensim_similarities.tst.pkl')
        if self.cls == similarities.Similarity:
            index = self.cls(None, corpus, num_features=len(dictionary), shardsize=5)
        elif self.cls == similarities.WmdSimilarity:
            index = self.cls(texts, self.w2v_model)
        else:
            index = self.cls(corpus, num_features=len(dictionary))
        # store all arrays separately
        index.save(fname, sep_limit=0)

        # same thing, but use mmap to load arrays
        index2 = self.cls.load(fname, mmap='r')
        if self.cls == similarities.Similarity:
            # for Similarity, only do a basic check
            self.assertTrue(len(index.shards) == len(index2.shards))
            index.destroy()
        else:
            if isinstance(index, similarities.SparseMatrixSimilarity):
                # hack SparseMatrixSim indexes so they're easy to compare
                index.index = index.index.todense()
                index2.index = index2.index.todense()
            self.assertTrue(numpy.allclose(index.index, index2.index))
            self.assertEqual(index.num_best, index2.num_best)
Example #7
    def testLoadOldModel(self):
        """Test loading word2vec models from previous version"""

        model_file = 'word2vec_old'
        model = word2vec.Word2Vec.load(datapath(model_file))
        self.assertTrue(model.wv.vectors.shape == (12, 100))
        self.assertTrue(len(model.wv.vocab) == 12)
        self.assertTrue(len(model.wv.index2word) == 12)
        self.assertTrue(model.syn1neg.shape == (len(model.wv.vocab), model.wv.vector_size))
        self.assertTrue(model.trainables.vectors_lockf.shape == (12,))
        self.assertTrue(model.vocabulary.cum_table.shape == (12,))

        self.onlineSanity(model, trained_model=True)

        # Model stored in multiple files
        model_file = 'word2vec_old_sep'
        model = word2vec.Word2Vec.load(datapath(model_file))
        self.assertTrue(model.wv.vectors.shape == (12, 100))
        self.assertTrue(len(model.wv.vocab) == 12)
        self.assertTrue(len(model.wv.index2word) == 12)
        self.assertTrue(model.syn1neg.shape == (len(model.wv.vocab), model.wv.vector_size))
        self.assertTrue(model.trainables.vectors_lockf.shape == (12,))
        self.assertTrue(model.vocabulary.cum_table.shape == (12,))

        self.onlineSanity(model, trained_model=True)

        # load really old model
        model_file = 'w2v-lee-v0.12.0'
        model = word2vec.Word2Vec.load(datapath(model_file))
        self.onlineSanity(model, trained_model=True)

        # test for max_final_vocab for model saved in 3.3
        model_file = 'word2vec_3.3'
        model = word2vec.Word2Vec.load(datapath(model_file))
        self.assertEqual(model.max_final_vocab, None)
        self.assertEqual(model.vocabulary.max_final_vocab, None)

        # Test loading word2vec models from all previous versions
        old_versions = [
            '0.12.0', '0.12.1', '0.12.2', '0.12.3', '0.12.4',
            '0.13.0', '0.13.1', '0.13.2', '0.13.3', '0.13.4',
            '1.0.0', '1.0.1', '2.0.0', '2.1.0', '2.2.0', '2.3.0',
            '3.0.0', '3.1.0', '3.2.0', '3.3.0', '3.4.0'
        ]

        saved_models_dir = datapath('old_w2v_models/w2v_{}.mdl')
        for old_version in old_versions:
            model = word2vec.Word2Vec.load(saved_models_dir.format(old_version))
            self.assertTrue(len(model.wv.vocab) == 3)
            self.assertTrue(model.wv.vectors.shape == (3, 4))
            # check if similarity search and online training works.
            self.assertTrue(len(model.wv.most_similar('sentence')) == 2)
            model.build_vocab(list_corpus, update=True)
            model.train(list_corpus, total_examples=model.corpus_count, epochs=model.iter)
            # check if similarity search and online training works after saving and loading back the model.
            tmpf = get_tmpfile('gensim_word2vec.tst')
            model.save(tmpf)
            loaded_model = word2vec.Word2Vec.load(tmpf)
            loaded_model.build_vocab(list_corpus, update=True)
            loaded_model.train(list_corpus, total_examples=model.corpus_count, epochs=model.iter)
Example #8
    def testPersistence(self):
        fname = get_tmpfile('gensim_models_nmf.tst')

        self.model.save(fname)
        model2 = nmf.Nmf.load(fname)
        tstvec = []
        self.assertTrue(np.allclose(self.model[tstvec], model2[tstvec]))  # try projecting an empty vector
Example #9
 def testWord2Vec(self):
     corpus = BigCorpus(words_only=True, num_docs=100000, num_terms=3000000, doc_len=200)
     tmpf = get_tmpfile('gensim_big.tst')
     model = gensim.models.Word2Vec(corpus, size=300, workers=4)
     model.save(tmpf, ignore=['syn1'])
     del model
     gensim.models.Word2Vec.load(tmpf)
Example #10
def load_on_instance():
    # Save and load a Doc2Vec Model on instance for test
    tmpf = get_tmpfile('gensim_doc2vec.tst')
    model = doc2vec.Doc2Vec(DocsLeeCorpus(), min_count=1)
    model.save(tmpf)
    model = doc2vec.Doc2Vec()  # should fail at this point
    return model.load(tmpf)
Example #11
    def test_get_offsets_and_start_doctags_win(self):
        # Each line takes 7 bytes (including the '\n' character, which is actually '\r\n' on Windows)
        lines = ['line1\n', 'line2\n', 'line3\n', 'line4\n', 'line5\n']
        tmpf = get_tmpfile('gensim_doc2vec.tst')

        with utils.smart_open(tmpf, 'wb', encoding='utf8') as fout:
            for line in lines:
                fout.write(utils.any2unicode(line))

        offsets, start_doctags = doc2vec.Doc2Vec._get_offsets_and_start_doctags_for_corpusfile(tmpf, 1)
        self.assertEqual(offsets, [0])
        self.assertEqual(start_doctags, [0])

        offsets, start_doctags = doc2vec.Doc2Vec._get_offsets_and_start_doctags_for_corpusfile(tmpf, 2)
        self.assertEqual(offsets, [0, 14])
        self.assertEqual(start_doctags, [0, 2])

        offsets, start_doctags = doc2vec.Doc2Vec._get_offsets_and_start_doctags_for_corpusfile(tmpf, 3)
        self.assertEqual(offsets, [0, 7, 21])
        self.assertEqual(start_doctags, [0, 1, 3])

        offsets, start_doctags = doc2vec.Doc2Vec._get_offsets_and_start_doctags_for_corpusfile(tmpf, 4)
        self.assertEqual(offsets, [0, 7, 14, 21])
        self.assertEqual(start_doctags, [0, 1, 2, 3])

        offsets, start_doctags = doc2vec.Doc2Vec._get_offsets_and_start_doctags_for_corpusfile(tmpf, 5)
        self.assertEqual(offsets, [0, 7, 14, 21, 28])
        self.assertEqual(start_doctags, [0, 1, 2, 3, 4])

        offsets, start_doctags = doc2vec.Doc2Vec._get_offsets_and_start_doctags_for_corpusfile(tmpf, 6)
        self.assertEqual(offsets, [0, 0, 7, 14, 14, 21])
        self.assertEqual(start_doctags, [0, 0, 1, 2, 2, 3])
Example #12
    def test_json_len(self):
        tmpf = get_tmpfile('script.tst.json')
        segment_and_write_all_articles(self.fname, tmpf, workers=1)

        expected_num_articles = 106
        num_articles = sum(1 for line in smart_open(tmpf))
        self.assertEqual(num_articles, expected_num_articles)
Example #13
 def testLdaModel(self):
     corpus = BigCorpus(num_docs=5000)
     tmpf = get_tmpfile('gensim_big.tst')
     model = gensim.models.LdaModel(corpus, num_topics=500, id2word=corpus.dictionary)
     model.save(tmpf)
     del model
     gensim.models.LdaModel.load(tmpf)
Example #14
 def testNormalizeAfterTrainingData(self):
     tmpf = get_tmpfile('gensim_word2vec.tst')
     model = word2vec.Word2Vec(sentences, min_count=1)
     model.save(tmpf)
     norm_only_model = word2vec.Word2Vec.load(tmpf)
     norm_only_model.delete_temporary_training_data(replace_word_vectors_with_normalized=True)
     self.assertFalse(np.allclose(model['human'], norm_only_model['human']))
Example #15
def load_on_instance():
    # Save and load a Word2Vec Model on instance for test
    tmpf = get_tmpfile('gensim_word2vec.tst')
    model = word2vec.Word2Vec(sentences, min_count=1)
    model.save(tmpf)
    model = word2vec.Word2Vec()  # should fail at this point
    return model.load(tmpf)
Example #16
 def testPersistence(self):
     fname = get_tmpfile('gensim_models_atmodel.tst')
     model = self.model
     model.save(fname)
     model2 = self.class_.load(fname)
     self.assertEqual(model.num_topics, model2.num_topics)
     self.assertTrue(np.allclose(model.expElogbeta, model2.expElogbeta))
     self.assertTrue(np.allclose(model.state.gamma, model2.state.gamma))
Example #17
    def test_save_as_line_sentence_ru(self):
        corpus_file = get_tmpfile('gensim_utils.tst')
        ref_sentences = [l.split() for l in utils.any2unicode('привет мир\nкак ты поживаешь').split('\n')]
        utils.save_as_line_sentence(ref_sentences, corpus_file)

        with utils.smart_open(corpus_file, encoding='utf8') as fin:
            sentences = [line.strip().split() for line in fin.read().strip().split('\n')]
            self.assertEqual(sentences, ref_sentences)
Example #18
 def testPersistenceCompressed(self):
     fname = get_tmpfile('gensim_models_logentry.tst.gz')
     model = logentropy_model.LogEntropyModel(self.corpus_ok, normalize=True)
     model.save(fname)
     model2 = logentropy_model.LogEntropyModel.load(fname, mmap=None)
     self.assertTrue(model.entr == model2.entr)
     tstvec = []
     self.assertTrue(np.allclose(model[tstvec], model2[tstvec]))
Example #19
 def testPersistenceCompressed(self):
     fname = get_tmpfile('gensim_models_coherence.tst.gz')
     model = CoherenceModel(
         topics=self.topics1, corpus=self.corpus, dictionary=self.dictionary, coherence='u_mass'
     )
     model.save(fname)
     model2 = CoherenceModel.load(fname)
     self.assertTrue(model.get_coherence() == model2.get_coherence())
Example #20
 def testPersistence(self):
     """Test storing/loading the entire model"""
     if not self.wr_path:
         return
     tmpf = get_tmpfile('gensim_wordrank.test')
     self.test_model.save(tmpf)
     loaded = wordrank.Wordrank.load(tmpf)
     self.models_equal(self.test_model, loaded)
Example #21
 def test_unicode_in_doctag(self):
     """Test storing document vectors of a model with unicode titles."""
     model = doc2vec.Doc2Vec(DocsLeeCorpus(unicode_tags=True), min_count=1)
     tmpf = get_tmpfile('gensim_doc2vec.tst')
     try:
         model.save_word2vec_format(tmpf, doctag_vec=True, word_vec=True, binary=True)
     except UnicodeEncodeError:
         self.fail('Failed storing unicode title.')
Example #22
    def testLargeMmapCompressed(self):
        fname = get_tmpfile('gensim_models_nmf.tst.gz')

        # simulate storing large arrays separately
        self.model.save(fname, sep_limit=0)

        # test loading the large model arrays with mmap
        self.assertRaises(IOError, nmf.Nmf.load, fname, mmap='r')
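For contrast, a minimal standalone sketch of the uncompressed case (assumptions: gensim's bundled `common_corpus`/`common_dictionary` toy data and the `gensim.models.nmf` module; this mirrors the pattern of the example above rather than any specific test): memory-mapping only works when the separately stored arrays end up as plain, uncompressed files on disk, which is why the `.gz` variant above raises IOError.

import numpy as np
from gensim.models import nmf
from gensim.test.utils import common_corpus, common_dictionary, get_tmpfile

model = nmf.Nmf(common_corpus, id2word=common_dictionary, num_topics=2)

fname = get_tmpfile('nmf_uncompressed.tst')
model.save(fname, sep_limit=0)          # large arrays stored as separate plain files
model2 = nmf.Nmf.load(fname, mmap='r')  # mmap succeeds on the uncompressed save
print(np.allclose(model[[]], model2[[]]))  # projecting an empty vector matches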
Example #23
 def test_persistence_word2vec_format(self):
     """Test storing/loading the model in word2vec format."""
     tmpf = get_tmpfile('gensim_fasttext_w2v_format.tst')
     model = FT_gensim(sentences, min_count=1, size=10)
     model.wv.save_word2vec_format(tmpf, binary=True)
     loaded_model_kv = Word2VecKeyedVectors.load_word2vec_format(tmpf, binary=True)
     self.assertEqual(len(model.wv.vocab), len(loaded_model_kv.vocab))
     self.assertTrue(np.allclose(model['human'], loaded_model_kv['human']))
Example #24
 def testNoTrainingCFormat(self):
     tmpf = get_tmpfile('gensim_word2vec.tst')
     model = word2vec.Word2Vec(sentences, min_count=1)
     model.init_sims()
     model.wv.save_word2vec_format(tmpf, binary=True)
     kv = keyedvectors.KeyedVectors.load_word2vec_format(tmpf, binary=True)
     binary_model = word2vec.Word2Vec()
     binary_model.wv = kv
     self.assertRaises(ValueError, binary_model.train, sentences)
Example #25
 def testTooShortTextWord2VecFormat(self):
     tfile = get_tmpfile('gensim_word2vec.tst')
     model = word2vec.Word2Vec(sentences, min_count=1)
     model.init_sims()
     model.wv.save_word2vec_format(tfile, binary=False)
     f = open(tfile, 'r+b')
     f.write(b'13')  # corrupt the header so it claims more vectors than the file contains
     f.close()
     self.assertRaises(EOFError, keyedvectors.KeyedVectors.load_word2vec_format, tfile, binary=False)
Example #26
 def test_dmc_neg_fromfile(self):
     """Test DBOW doc2vec training."""
     with temporary_file(get_tmpfile('gensim_word2vec.tst')) as corpus_file:
         save_lee_corpus_as_line_sentence(corpus_file)
         model = doc2vec.Doc2Vec(
             corpus_file=corpus_file, dm=1, dm_concat=1, vector_size=24, window=4, hs=0,
             negative=10, alpha=0.05, min_count=2, epochs=20
         )
         self.model_sanity(model)
Example #27
 def tearDown(self):
     # remove all temporary test files
     fname = get_tmpfile('gensim_corpus.tst')
     extensions = ['', '', '.bz2', '.gz', '.index', '.vocab']
     for ext in itertools.permutations(extensions, 2):
         try:
             os.remove(fname + ext[0] + ext[1])
         except OSError:
             pass
Example #28
 def testPersistenceWord2VecFormat(self):
     """Test storing the entire model in word2vec format."""
     model = doc2vec.Doc2Vec(DocsLeeCorpus(), min_count=1)
     # test saving both document and word embedding
     test_doc_word = get_tmpfile('gensim_doc2vec.dw')
     model.save_word2vec_format(test_doc_word, doctag_vec=True, word_vec=True, binary=True)
     binary_model_dv = keyedvectors.KeyedVectors.load_word2vec_format(test_doc_word, binary=True)
     self.assertEqual(len(model.wv.vocab) + len(model.docvecs), len(binary_model_dv.vocab))
     # test saving document embedding only
     test_doc = get_tmpfile('gensim_doc2vec.d')
     model.save_word2vec_format(test_doc, doctag_vec=True, word_vec=False, binary=True)
     binary_model_dv = keyedvectors.KeyedVectors.load_word2vec_format(test_doc, binary=True)
     self.assertEqual(len(model.docvecs), len(binary_model_dv.vocab))
     # test saving word embedding only
     test_word = get_tmpfile('gensim_doc2vec.w')
     model.save_word2vec_format(test_word, doctag_vec=False, word_vec=True, binary=True)
     binary_model_dv = keyedvectors.KeyedVectors.load_word2vec_format(test_word, binary=True)
     self.assertEqual(len(model.wv.vocab), len(binary_model_dv.vocab))
Example #29
    def test_save_as_line_sentence_en(self):
        corpus_file = get_tmpfile('gensim_utils.tst')
        ref_sentences = [l.split() for l in utils.any2unicode('hello world\nhow are you').split('\n')]

        utils.save_as_line_sentence(ref_sentences, corpus_file)

        with utils.smart_open(corpus_file, encoding='utf8') as fin:
            sentences = [line.strip().split() for line in fin.read().strip().split('\n')]
            self.assertEqual(sentences, ref_sentences)
Example #30
 def testPersistenceCompressed(self):
     fname = get_tmpfile('gensim_models_lda.tst.gz')
     model = self.model
     model.save(fname)
     model2 = self.class_.load(fname, mmap=None)
     self.assertEqual(model.num_topics, model2.num_topics)
     self.assertTrue(np.allclose(model.expElogbeta, model2.expElogbeta))
     tstvec = []
     self.assertTrue(np.allclose(model[tstvec], model2[tstvec]))  # try projecting an empty vector
Example #31
# imports needed and logging
import gzip
import gensim 
import logging
import pdb
import time

from collections import defaultdict


from gensim.test.utils import common_texts
from gensim.models import Word2Vec
from gensim.test.utils import get_tmpfile
from gensim.models import KeyedVectors

print('initializing ...')
t0 = time.time()
# Load Google's pre-trained Word2Vec model.
model = gensim.models.KeyedVectors.load_word2vec_format('./GoogleNews-vectors-negative300.bin', binary=True, )

word_vectors = model.wv

fname = get_tmpfile("vectors.kv")
word_vectors.save(fname)


t1 = time.time()

print('done', t1-t0)
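A possible continuation of the script above (a sketch; it assumes the run completed, `fname` still points at the saved "vectors.kv", and 'king' is in the GoogleNews vocabulary): the point of dumping plain KeyedVectors is that they reload quickly and can be memory-mapped read-only, so several processes can share one copy of the arrays.

# reload the vectors saved above; mmap='r' maps the arrays read-only from disk
reloaded = KeyedVectors.load(fname, mmap='r')
print(reloaded.most_similar('king', topn=3))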
Example #32
 def test_persistence(self):
     """Test storing/loading the entire model."""
     tmpf = get_tmpfile('gensim_doc2vec.tst')
     model = doc2vec.Doc2Vec(DocsLeeCorpus(), min_count=1)
     model.save(tmpf)
     self.models_equal(model, doc2vec.Doc2Vec.load(tmpf))
Example #33
def vector_loader(vector_path=vector_path):
    fname = get_tmpfile(vector_path)
    return KeyedVectors.load(fname, mmap="r")
Example #34
def glove_to_word2vec():
    glove_file = datapath(GLOVE_FILE)
    tmp_file = get_tmpfile(WORD2VEC_FILE)
    _ = glove2word2vec(glove_file, tmp_file)
Example #35
    # Save the trained word vectors to a temporary .kv file
    fname = "model.kv"
    path = get_tmpfile(fname)
    model.wv.save(path)

    return model.wv


#========================================initialization of data=========================================#
# Load the graph G from disk
G = nx.read_multiline_adjlist("./adjlists/train_networkxAfterRemove.adjlist")

# Load the previously saved KeyedVectors model
fname = "model.kv"
path = get_tmpfile(fname)
model = KeyedVectors.load(path, mmap='r')

# the embedding section
# model = embedding(G)

#convert the json file to list of Conversation objects
data = bz2.BZ2File(
    "saved_objects/conversations_train_dataset_after_remove.pbz2",
    'rb')  # 40820 conversations
conversations = cPickle.load(data)
print("data conversations amount " + str(len(conversations)))

#=======================================preparing the input data for bert========================================#
#get centers with name of all
plotter = Plotter.Plotter(G, model)

def avg_feature_vector(sentence, model, num_features, index2word_set):
    words = sentence.split()
    feature_vec = np.zeros((num_features, ), dtype='float32')
    n_words = 0
    for word in words:
        if word in index2word_set:
            n_words += 1
            feature_vec = np.add(feature_vec, model[word])
    if (n_words > 0):
        feature_vec = np.divide(feature_vec, n_words)
    return feature_vec


path = get_tmpfile("word2vec.model")

#model = gensim.models.KeyedVectors.load_word2vec_format('./model/GoogleNews-vectors-negative300.bin', binary=True)
model = gensim.models.KeyedVectors.load("word2vec.model")
#model.save("word2vec.model")
index2word_set = set(model.wv.index2word)

with open('./txtfiles/testfile1.txt', 'r') as file:
    s1 = file.read().replace('\n', '')

with open('./txtfiles/testfile2.txt', 'r') as file:
    s2 = file.read().replace('\n', '')

s1_afv = avg_feature_vector(s1,
                            model=model,
                            num_features=300,
Example #37
 def setUp(self):
     self.datapath = datapath('word2vec_pre_kv_c')
     self.output_folder = get_tmpfile('w2v2t_test')
     self.metadata_file = self.output_folder + '_metadata.tsv'
     self.tensor_file = self.output_folder + '_tensor.tsv'
     self.vector_file = self.output_folder + '_vector.tsv'
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

glove_file = datapath(r'D:\Downloads\glove.6B\glove.6B.200d.txt')
tmp_file = get_tmpfile(r"D:\Downloads\glove.6B\glove.6B.200d.word2vec")

glove2word2vec(glove_file, tmp_file)
"""
from gensim.test.utils import common_texts, get_tmpfile
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import os

" more info at : https://radimrehurek.com/gensim/models/word2vec.html"

#data loading and preprocessing
class MySentences(object):
    def __init__(self, dirname):
        self.dirname = dirname
 
    def __iter__(self):
        for fname in os.listdir(self.dirname):
            for line in open(os.path.join(self.dirname, fname)):
                yield line.split()
 
sentences = MySentences('C:\\Users\\luhoe\\Documents\\Git_Projects\\Github\\NeuronalNetwork_stuff\\first_steps\\training_data\\texts') # a memory-friendly iterator



path = get_tmpfile("C:\\Users\\luhoe\\Documents\\Git_Projects\\Github\\NeuronalNetwork_stuff\\first_steps\\word2vec_models\\word2vec.model")

print('start_training')
model = Word2Vec(sentences, size=200, window=8, min_count=4, workers=6)
print('finished')

model.save("C:\\Users\\luhoe\\Documents\\Git_Projects\\Github\\NeuronalNetwork_stuff\\first_steps\\word2vec_models\\word2vec.model")

Example #40
#get csv data
allWords = {}
with open("x:\\data\\Simtho-gensim\\model\\simtho999-10-25.csv", encoding="utf-8") as f:
    reader = csv.reader(f, delimiter=",")
    next(reader)
    for x in reader:        
        allWords[x[0]] = [float(x[1]), float(x[2])]        

#in case the user inputs nothing
if word == None:
    print("Content-Type: text/html\n")
    print("Please enter a word")

else:    
    #import the model
    fname2 = get_tmpfile("x:\\data\\Simtho-gensim\\model\\simthovectors-size999-window10-min25.kv")
    word_vectors = KeyedVectors.load(fname2, mmap='r')    

    try:
        modelOutput = word_vectors.wv.most_similar(word, topn=10)
        queryResults = dict(modelOutput)
        
        for key in queryResults:            
            x = allWords[key]
            x.append(queryResults[key])
            queryResults[key] = x

        print(json.dumps(queryResults))
    except:
        print("The word " + word + " is not in the corpus")
Example #41
import argparse
import os
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.test.utils import get_tmpfile
from gensim.models import KeyedVectors

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("data_dir", help="Directory that contains the data")

    args = parser.parse_args()
    data_dir = args.data_dir

    # Create GloVe binary file
    glove_file = os.path.join(args.data_dir,
                              'original/Glove/glove.840B.300d.txt')
    tmp_file = get_tmpfile("glove.840B.300d.w2v.txt")
    _ = glove2word2vec(glove_file, tmp_file)
    model = KeyedVectors.load_word2vec_format(tmp_file)
    model.wv.save_word2vec_format(os.path.join(
        args.data_dir, "original/Glove/glove.840B.300d.bin"),
                                  binary=True)
Example #42
    def predict(self):
        try:
            #ML stuff below.
            #load pre-existing classifier from disk
            self.fp_fasttext_model = os.getcwd(
            ) + '\\assets\\model\\fasttext\\fasttext.model'
            if (os.path.isfile(self.fp_fasttext_model)):
                if (self.model == None):
                    self.model = FastText.load(
                        get_tmpfile(self.fp_fasttext_model))
            else:
                raise Exception(
                    'Fasttext model file cannot be found at "{}"'.format(
                        self.fp_fasttext_model))

            self.fp_classifier = os.getcwd(
            ) + '\\assets\\model\\model_gen_1_0_75803995341754.pkl'
            if (os.path.isfile(self.fp_classifier)):
                with open(self.fp_classifier, 'rb') as fid:
                    self._classifier = cPickle.load(fid)
            else:
                raise Exception(
                    'Machine learning model file cannot be found at "{}"'.
                    format(self.fp_classifier))

            # load data
            if (os.path.isfile(self.input_file_name.get())):
                self._df = pd.read_csv(
                    self.input_file_name.get(),
                    sep=";",
                    skipinitialspace=True,
                    engine='python'
                )  #engine='python' required to run in ipython
            else:
                raise Exception('Input file cannot be found at "{}"'.format(
                    self.input_file_name.get()))

            # prepare
            try:
                self.le = preprocessing.LabelEncoder()
                self.le.fit(classes.classes)
                self._df['source'] = preprocessing.LabelEncoder(
                ).fit_transform(self._df['source'].values)
                self._df['sentence'] = convert_str_array_to_numpy_vector(
                    self.model, self._df['sentence'].values.tolist(),
                    self._df['source'].values.tolist(), 'Average')
            except Exception as ex:
                raise Exception(
                    'Error occured while preparing input data. Are you sure it has only (sentence,source) columns? Trace: {}'
                    .format(str(ex)))

            # predict
            try:
                self._y_pred = self._classifier.predict(
                    self._df['sentence'].values.tolist())
                self._y_pred = self.le.inverse_transform(self._y_pred)
            except Exception as ex:
                raise Exception(
                    'Error occured while predicting input data. Are you sure it has only (sentence,source) columns? Trace: {}'
                    .format(str(ex)))

            # save data
            np.savetxt(self.output_file_name.get(),
                       self._y_pred,
                       delimiter=',',
                       fmt='%s')
            messagebox.showinfo(
                "Info", "Predicting is finished. File is saved to {}".format(
                    self.output_file_name.get()))

        except Exception as e:
            messagebox.showinfo(
                "Error",
                "Error occured while predicting. Send details to alozta\nDetails:\n{}"
                .format(str(e)))
Example #43
    words.append(line.split())
print(words) 

with open('newfileformodel.txt', 'w') as filehandle:  
    for listitem in words:
        filehandle.write('%s\n' % listitem)


# In[ ]:


from gensim.test.utils import common_texts, get_tmpfile
from gensim.models import Word2Vec


path2 = get_tmpfile("word2vecyear1.model")

model23 = Word2Vec(my_texts1, size=100, window=5, min_count=5, workers=4)
model23.save("word2vecyear1.model")


modelyear1 = Word2Vec.load("word2vecyear1.model")
modelyear1.train(newfileformodel, total_examples=1, epochs=1)
print(modelyear1)

# numpy vector of a word
from gensim.models import KeyedVectors

path21 = get_tmpfile("wordvectorsyear1.kv")

modelyear1.wv.save('path/modelyear1.wv')
Example #44
def load_word2vec(filename, binary=False):
    glove_file = datapath(filename)
    tmp_file = get_tmpfile("test_word2vec.txt")
    _ = glove2word2vec(glove_file, tmp_file)
    word2vec = KeyedVectors.load_word2vec_format(tmp_file)
    return word2vec
Example #45
from gensim.models import KeyedVectors
nltk.download("popular")

#Methods required for running the project on colab
from google.colab import drive
drive.mount("/content/drive/")

cd drive/My\ Drive/asap-aes(1)

#Utility function for reading Stanford's Glove vectors using gensim
def readWordVector(tmp_file):
    wv=KeyedVectors.load_word2vec_format(tmp_file, binary=False)
    return wv

file="glove.6B.300d.txt"
tmp_file = get_tmpfile('temp_word2vec.txt')
glove2word2vec(file, tmp_file)
wv = readWordVector(tmp_file)

data = pd.read_csv('training_set_rel3.tsv', encoding = "ISO-8859-1",sep='\t')

data.head(6)

#Removing unnecessary columns since only domain1 score is required
y=data['domain1_score']
data=data.dropna(axis=1)
data=data.drop(axis=1,columns=['rater1_domain1','rater2_domain1'])

#As picked up from essay_set_description.xlsx
maximum_domain1_score=np.array([2,1,0,0,0,0,0,0])
minimum_domain1_score=np.array([12,6,3,3,4,4,30,60])
Example #46
    for _mod in modelo:

        for _dim in dim:

            ini = datetime.now()

            filepath = '/home/nilc_embeddings/'
            filename_txt = filepath + _alg + "/" + _mod + "_s" + _dim + ".txt"
            filename_model = filepath + _alg + "/" + _mod + "_s" + _dim + ".model"

            conhecedor = KeyedVectors.load_word2vec_format(filename_txt,
                                                           encoding='utf8')
            advinhador = KeyedVectors.load_word2vec_format(filename_txt,
                                                           encoding='utf8')
            fname = get_tmpfile(filename_model)
            conhecedor.save(
                fname)  # conhecedor.wv.save(fname) # works the same way
            ##	conhecedor.wv.save_word2vec_format(fname)

            #	conhecedor = KeyedVectors.load(filename_model, mmap='r')
            #	advinhador = KeyedVectors.load(filename_model, mmap='r')

            vocab = []
            f = open(filename_txt)
            for line in f:
                if (line.strip() != ''):
                    cols = [j.strip() for j in line.split(' ')]
                    vocab.append(cols[0])
            f.close()
Example #47
def chunk_cosine_sim(gold_alignments,
                     embedding_file,
                     left_chunks_file,
                     right_chunks_file,
                     bert_tokenizer=None,
                     bert_model=None,
                     emb_type="chunk",
                     output_file=None):
    left_chunks = __read_chunks(left_chunks_file)
    right_chunks = __read_chunks(right_chunks_file)

    sentence_alignments = __read_alignments(gold_alignments, left_chunks,
                                            right_chunks)

    if emb_type == "glove":
        tmp_file = get_tmpfile("_temp_w2v_{}.txt".format(
            datetime.datetime.now().strftime('%H_%M_%S')))
        _ = glove2word2vec(embedding_file, tmp_file)
        embedding = KeyedVectors.load_word2vec_format(tmp_file)
        stoi = {w: i for i, w in enumerate(embedding.index2word)}
        itos = {i: w for i, w in enumerate(embedding.index2word)}
        num_words = len(embedding.index2word)
        unk_token_idx = num_words
        stoi["-UNK-"] = unk_token_idx
        itos[unk_token_idx] = "-UNK-"
        emb_matrix = embedding.vectors.copy()
        unk_symbol_emb = np.zeros((1, embedding.vector_size))
        emb_matrix = np.concatenate((emb_matrix, unk_symbol_emb), axis=0)

    total_alignments_counter = 0
    match_counter = 0
    vectors = {}

    for sid, emb_ids_map in tqdm(sentence_alignments.items()):
        try:
            left_chunk_emb_ids = list(emb_ids_map['left_chunk_emb_ids'])
            right_chunk_emb_ids = list(emb_ids_map['right_chunk_emb_ids'])

            if emb_type == "bert":
                # get bert sentence level embedding

                left_sentence = emb_ids_map['left_sentence']
                tokenized_left_text, left_sentence_embedding = get_bert_sentence_embedding(
                    left_sentence, bert_tokenizer, bert_model)
                right_sentence = emb_ids_map['right_sentence']
                tokenized_right_text, right_sentence_embedding = get_bert_sentence_embedding(
                    right_sentence, bert_tokenizer, bert_model)
                left_matrix = get_bert_chunks_embedding(
                    tokenized_left_text, left_sentence_embedding,
                    left_chunk_emb_ids, left_chunks[int(sid) - 1])
                unwrap_emb(left_chunk_emb_ids, left_matrix, vectors)

                right_matrix = get_bert_chunks_embedding(
                    tokenized_right_text, right_sentence_embedding,
                    right_chunk_emb_ids, right_chunks[int(sid) - 1])
                unwrap_emb(right_chunk_emb_ids, right_matrix, vectors)

            elif emb_type == "glove":
                left_matrix = get_glove_embeddings(emb_matrix,
                                                   left_chunk_emb_ids, stoi,
                                                   left_chunks[int(sid) - 1],
                                                   unk_token_idx)
                right_matrix = get_glove_embeddings(emb_matrix,
                                                    right_chunk_emb_ids, stoi,
                                                    right_chunks[int(sid) - 1],
                                                    unk_token_idx)

            left_norm = left_matrix / left_matrix.norm(dim=1)[:, None]
            right_norm = right_matrix / right_matrix.norm(dim=1)[:, None]
            cosine_sim = torch.mm(left_norm, right_norm.transpose(0, 1))
            values, indices = torch.max(cosine_sim, 1)
            indices = indices.numpy()
            selected_right_chunks = [right_chunk_emb_ids[i] for i in indices]
            total_alignments_counter += len(indices)

            for i in range(len(left_chunk_emb_ids)):
                selected_r_chunk = selected_right_chunks[i]
                current_l_chunk = left_chunk_emb_ids[i]
                r_emb_set = emb_ids_map['chunk_emb_mapping'][current_l_chunk]
                if selected_r_chunk in r_emb_set:
                    match_counter += 1
        except Exception as e:
            print(e)

    if emb_type == "bert" and output_file is not None:
        _write_vectors(vectors, output_file)

    print "Total alignment match percentage ratio : ", (
        float(match_counter) / total_alignments_counter) * 100.0
Example #48
import csv
import json
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.test.utils import get_tmpfile
import numpy as np

fname = get_tmpfile("my_doc2vec_model")
num_features = 5

model = Doc2Vec.load(fname)
query = raw_input("Enter your query\n")
q_features = model.infer_vector(query.split())

shloka = ''
dist_array = {}
with open('result.csv', 'r') as file:
    fd = csv.reader(file)

    for item in fd:
        temp = []
        for i in range(num_features):
            temp.append(float(item[i]))

        dist = np.linalg.norm(q_features - temp)
        dist_array[item[-1]] = dist

final = sorted(dist_array, key=dist_array.__getitem__)

num_shlokas = 5
print("The Famous Quotes from Geeta says that")
print("\n")
Example #49
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

if __name__=='__main__':
	glove_file = datapath('/home/su/HL/FQA/model/vectors_qa.txt') 
	tmp_file = get_tmpfile('/home/su/HL/FQA/model/w2v_glove.txt')
	_ = glove2word2vec(glove_file, tmp_file) 
	model = KeyedVectors.load_word2vec_format('./model/w2v_glove.txt') 
Example #50
import os
import pickle
from gensim.test.utils import datapath, get_tmpfile
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors

# Update the path
embedding_txt = '--------/embedding_text.txt'
embedding_temp = '-------/embedding_temp'
embedding_path = '-------/GCN_embedding.pkl'

with open('--------------/GCN_emb.pkl', 'rb') as f:
    word_embeddings = pickle.load(f)

with open('--------------/GCN_emb_idx2id_dict.pkl', 'rb') as f:
    idx2id_dict = pickle.load(f)

if not os.path.exists(embedding_txt):
    with open(embedding_txt, 'w') as f:
        for item in idx2id_dict:
            f.write(idx2id_dict[item] + ' ' +
                    ' '.join([str(i.item())
                              for i in word_embeddings[item]]) + '\n')

glove_file = datapath(embedding_txt)
temp_file = get_tmpfile(embedding_temp)
_ = glove2word2vec(glove_file, temp_file)

wv = KeyedVectors.load_word2vec_format(temp_file)
wv.save(embedding_path)
Example #51
def doc2vec(dataset_path, dataset_name, write_path, txt_path_list):

    # Convert text to lower-case and strip punctuation/symbols from words
    def normalize_text(text):
        norm_text = text.lower()
        # Replace breaks with spaces
        norm_text = norm_text.replace('<br />', ' ')
        # Pad punctuation with spaces on both sides
        norm_text = re.sub(r"([\.\",\(\)!\?;:])", " \\1 ", norm_text)
        return norm_text

    alldata_path = os.path.join(write_path,
                                'alldata-id_' + dataset_name + '.txt')

    if not os.path.isfile(alldata_path):

        # Collect & normalize test/train data
        print("Cleaning up dataset...")

        # list of the absolute paths of every text file
        print(" %i files" % (len(txt_path_list)))

        # for each file "txt"
        for i, txt in tqdm(enumerate(txt_path_list)):
            with smart_open(txt, "rb") as t:

                try:
                    # "one_text" is the whole document
                    one_text = t.read().decode("utf-8")
                    for c in control_chars:
                        one_text = one_text.replace(c, ' ')
                    one_text = normalize_text(one_text)
                    all_lines.append(one_text)
                except UnicodeDecodeError:

                    # we skip this file, but we need to preserve index pos
                    all_lines.append(" ")
                    continue

        # Save to disk for instant re-use on any future runs
        with smart_open(alldata_path, 'wb') as f:
            for idx, line in enumerate(all_lines):
                num_line = u"_*{0} {1}\n".format(idx, line)
                f.write(num_line.encode("utf-8"))

    assert os.path.isfile(alldata_path), "alldata unavailable"
    print("Success, alldata is available for next steps.")

    #===================================================================
    #=#BLOCK#=#: Read in alldata
    #===================================================================

    # this data object class suffices as a `TaggedDocument`
    # (with `words` and `tags`)
    # plus adds other state helpful for our later evaluation/reporting

    with smart_open(alldata_path, 'rb', encoding='utf-8') as alldata:
        alldata_list = list(alldata)
        print("Iterating up to: ", len(alldata_list))
    with smart_open(alldata_path, 'rb', encoding='utf-8') as alldata:
        documents = [
            TaggedDocument(doc, [i]) for i, doc in tqdm(enumerate(alldata))
        ]
        model = Doc2Vec(documents,
                        vector_size=5,
                        window=2,
                        min_count=1,
                        workers=4)

        fname = get_tmpfile(
            os.path.join(write_path, "doc2vec_model_" + dataset_name))
        model.save(fname)
        model = Doc2Vec.load(
            fname)  # you can continue training with the loaded model!

    return
Example #52
def word2vec(in_txt):
    path = get_tmpfile("word2vec_lstm.model")
    model = Word2Vec(in_txt, size=150, window=6, min_count=1,
                     workers=multiprocessing.cpu_count())
    model.save("word2vec_lstm.model")
Example #53
 def testPersistenceWithConstructorRule(self):
     """Test storing/loading the entire model with a vocab trimming rule passed in the constructor."""
     tmpf = get_tmpfile('gensim_word2vec.tst')
     model = word2vec.Word2Vec(sentences, min_count=1, trim_rule=_rule)
     model.save(tmpf)
     self.models_equal(model, word2vec.Word2Vec.load(tmpf))
Example #54
from glove import Glove
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors, Word2Vec

embedding_dim = 10
glove = Glove.load(f'glove_{embedding_dim}.txt')
print(glove.most_similar('我', number=10))

glove_file_path = f"/data/users/wangyuanzheng/projects/ucas_DL/3-poem/dev/glove_{embedding_dim}.txt"
word2vec_output_path = f"/data/users/wangyuanzheng/projects/ucas_DL/3-poem/dev/glove_{embedding_dim}_w2v.txt"
# input file
glove_file = datapath(glove_file_path)
# output file
tmp_file = get_tmpfile(word2vec_output_path)

# call glove2word2vec script
# default way (through CLI): python -m gensim.scripts.glove2word2vec --input <glove_file> --output <w2v_file>

# start the conversion
from gensim.scripts.glove2word2vec import glove2word2vec
glove2word2vec(glove_file, tmp_file)

# load the converted file
model = KeyedVectors.load_word2vec_format(tmp_file, unicode_errors='ignore')
Example #55
File: utils.py  Project: StuartCHAN/KARL
        "w",
        encoding="UTF-8") as sparql:
    for line in lines:
        assert (3 == len(line))
        en.write(line[0] + "\n")
        sparql.write(line[1] + "\n")

##############################################################

from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

glove_file = datapath('D:/Downloads/Compressed/uniform.txt')

tmp_file = get_tmpfile(
    "D:/Downloads/Compressed/dbpedia_kglove_uniform_200.txt")

_ = glove2word2vec(glove_file, tmp_file)

model = KeyedVectors.load_word2vec_format(tmp_file)
"""
python -m spacy init-model xx ./dbpedia_kglove_uniform_200 --vectors-loc dbpedia_kglove_uniform_200.txt.gz 

"""
"D:/Downloads/Compressed/dbpedia_kglove_uniform_200.clean.txt"

model.save('/tmp/MyModel')
model.save_word2vec_format('/tmp/mymodel.txt', binary=False)
model.save_word2vec_format('/tmp/mymodel.bin.gz', binary=True)

tri = open("D:/Downloads/Compressed/try.txt", "r",
Example #56
model = word2vec.Word2Vec(sentences,
                          workers=num_workers,
                          size=num_features,
                          min_count=min_word_count,
                          window=context,
                          sample=downsampling)
word_vectors = model.wv
# Trained word vectors are independent of the method used to train them, so we can
# represent them as a standalone KeyedVectors structure.
# The Word2Vec load functions are deprecated for this purpose.

# persist the word vectors to disk with...
from gensim.test.utils import get_tmpfile
from gensim.models import KeyedVectors

fname = get_tmpfile("200features_40minwords_10context.kv")
word_vectors.save(fname)
word_vectors = KeyedVectors.load(fname, mmap='r')

#model.most_similar("man")
#Out[26]:
#[(u'woman', 0.6606647968292236),
# (u'lady', 0.6287051439285278),
# (u'lad', 0.5678527355194092),
# (u'guy', 0.5411106944084167),
# (u'men', 0.537365734577179),
# (u'person', 0.530378520488739),
# (u'monk', 0.5267703533172607),
# (u'businessman', 0.5263428688049316),
# (u'millionaire', 0.5201252102851868),
# (u'chap', 0.5184340476989746)]
Example #57
import numpy
from sklearn import cross_validation
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import RandomForestClassifier as RFC
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
import pickle
from sklearn import metrics
import numpy as np
from pprint import pprint
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
import random

glove_file = 'glove.6B.50d.txt'
tmp_file = get_tmpfile("test_word2vec.txt")

# call glove2word2vec script
# default way (through CLI): python -m gensim.scripts.glove2word2vec --input <glove_file> --output <w2v_file>
from gensim.scripts.glove2word2vec import glove2word2vec
glove2word2vec(glove_file, tmp_file)

model = KeyedVectors.load_word2vec_format(tmp_file)


def document_vector(word2vec_model, doc):
    # remove out-of-vocabulary words
    doc = preprocess(doc)
    doc = [word for word in doc if word in word2vec_model.vocab]
    return np.mean(word2vec_model[doc], axis=0)
Example #58
def read_glove_vectors(glove_path):
    glove_file = datapath(glove_path)
    tmp_file = get_tmpfile("word2vec.txt")
    glove2word2vec(glove_file, tmp_file)
    model = KeyedVectors.load_word2vec_format(tmp_file)
    return model
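Many examples above convert GloVe text files with `glove2word2vec` before loading them. In newer gensim releases (4.0+, if memory serves) `KeyedVectors.load_word2vec_format` accepts a `no_header` flag that reads the header-less GloVe text format directly, so the conversion step can often be skipped; a sketch under that assumption (the file path is hypothetical):

from gensim.models import KeyedVectors

glove_path = "glove.6B.50d.txt"  # hypothetical local GloVe file
# no_header=True tells gensim the file has no "<count> <dims>" first line (gensim >= 4.0)
vectors = KeyedVectors.load_word2vec_format(glove_path, binary=False, no_header=True)
print(vectors.most_similar("computer", topn=3))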
Example #59
 def on_epoch_end(self, model):
     logging.info("Epoch " + str(self.epoch) + " completed")
     output_path = get_tmpfile('{}_epoch{}.model'.format(
         self.path, self.epoch))
     model.save(output_path)
     self.epoch += 1
 def convert_glove_word2vec(glove_path, dest_path):
     glove_file = datapath(glove_path)
     tmp_file = get_tmpfile(dest_path)
     _ = glove2word2vec(glove_file, tmp_file)