def testPersistenceKeyedVectorsFormatWithVocab(self):
    """Test storing/loading the entire model and vocabulary in word2vec format."""
    tmpf = get_tmpfile('gensim_word2vec.tst')
    model = word2vec.Word2Vec(sentences, min_count=1)
    model.init_sims()
    testvocab = get_tmpfile('gensim_word2vec.vocab')
    model.wv.save_word2vec_format(tmpf, testvocab, binary=True)
    kv_binary_model_with_vocab = keyedvectors.KeyedVectors.load_word2vec_format(tmpf, testvocab, binary=True)
    self.assertEqual(model.wv.vocab['human'].count, kv_binary_model_with_vocab.vocab['human'].count)
def test_persistence_fromfile(self):
    """Test storing/loading the entire model."""
    with temporary_file(get_tmpfile('gensim_word2vec.tst')) as corpus_file:
        save_lee_corpus_as_line_sentence(corpus_file)

        tmpf = get_tmpfile('gensim_doc2vec.tst')
        model = doc2vec.Doc2Vec(corpus_file=corpus_file, min_count=1)
        model.save(tmpf)
        self.models_equal(model, doc2vec.Doc2Vec.load(tmpf))
def testPersistenceWord2VecFormatCombinationWithStandardPersistence(self):
    """Test storing/loading the entire model and vocabulary in word2vec format, chained with
    saving and loading via the `save` and `load` methods. This was possible prior to the 1.0.0
    release and now raises an exception."""
    tmpf = get_tmpfile('gensim_word2vec.tst')
    model = word2vec.Word2Vec(sentences, min_count=1)
    model.init_sims()
    testvocab = get_tmpfile('gensim_word2vec.vocab')
    model.wv.save_word2vec_format(tmpf, testvocab, binary=True)
    binary_model_with_vocab_kv = keyedvectors.KeyedVectors.load_word2vec_format(tmpf, testvocab, binary=True)
    binary_model_with_vocab_kv.save(tmpf)
    self.assertRaises(AttributeError, word2vec.Word2Vec.load, tmpf)
def test_persistence(self):
    # Test persistence without using `smartirs`
    fname = get_tmpfile('gensim_models.tst')
    model = tfidfmodel.TfidfModel(self.corpus, normalize=True)
    model.save(fname)
    model2 = tfidfmodel.TfidfModel.load(fname)
    self.assertTrue(model.idfs == model2.idfs)
    tstvec = [corpus[1], corpus[2]]
    self.assertTrue(np.allclose(model[tstvec[0]], model2[tstvec[0]]))
    self.assertTrue(np.allclose(model[tstvec[1]], model2[tstvec[1]]))
    self.assertTrue(np.allclose(model[[]], model2[[]]))  # try projecting an empty vector

    # Test persistence using `smartirs`
    fname = get_tmpfile('gensim_models_smartirs.tst')
    model = tfidfmodel.TfidfModel(self.corpus, smartirs="ntc")
    model.save(fname)
    model2 = tfidfmodel.TfidfModel.load(fname)
    self.assertTrue(model.idfs == model2.idfs)
    tstvec = [corpus[1], corpus[2]]
    self.assertTrue(np.allclose(model[tstvec[0]], model2[tstvec[0]]))
    self.assertTrue(np.allclose(model[tstvec[1]], model2[tstvec[1]]))
    self.assertTrue(np.allclose(model[[]], model2[[]]))  # try projecting an empty vector

    # Test persistence between Gensim v3.2.0 and the current model.
    model3 = tfidfmodel.TfidfModel(self.corpus, smartirs="ntc")
    model4 = tfidfmodel.TfidfModel.load(datapath('tfidf_model.tst'))
    idfs3 = [model3.idfs[key] for key in sorted(model3.idfs.keys())]
    idfs4 = [model4.idfs[key] for key in sorted(model4.idfs.keys())]
    self.assertTrue(np.allclose(idfs3, idfs4))
    tstvec = [corpus[1], corpus[2]]
    self.assertTrue(np.allclose(model3[tstvec[0]], model4[tstvec[0]]))
    self.assertTrue(np.allclose(model3[tstvec[1]], model4[tstvec[1]]))
    self.assertTrue(np.allclose(model3[[]], model4[[]]))  # try projecting an empty vector

    # Test persistence using pivoted normalization
    fname = get_tmpfile('gensim_models_smartirs.tst')
    model = tfidfmodel.TfidfModel(self.corpus, pivot=0, slope=1)
    model.save(fname)
    model2 = tfidfmodel.TfidfModel.load(fname, mmap=None)
    self.assertTrue(model.idfs == model2.idfs)
    tstvec = [corpus[1], corpus[2]]
    self.assertTrue(np.allclose(model[tstvec[0]], model2[tstvec[0]]))
    self.assertTrue(np.allclose(model[tstvec[1]], model2[tstvec[1]]))

    # Test persistence between Gensim v3.2.0 and a pivoted-normalization compressed model.
    model3 = tfidfmodel.TfidfModel(self.corpus, pivot=0, slope=1)
    model4 = tfidfmodel.TfidfModel.load(datapath('tfidf_model.tst'))
    idfs3 = [model3.idfs[key] for key in sorted(model3.idfs.keys())]
    idfs4 = [model4.idfs[key] for key in sorted(model4.idfs.keys())]
    self.assertTrue(np.allclose(idfs3, idfs4))
    tstvec = [corpus[1], corpus[2]]
    self.assertTrue(np.allclose(model3[tstvec[0]], model4[tstvec[0]]))
    self.assertTrue(np.allclose(model3[tstvec[1]], model4[tstvec[1]]))
def test_saveAsText(self):
    """`Dictionary` can be saved as a text file."""
    tmpf = get_tmpfile('save_dict_test.txt')
    small_text = [
        ["prvé", "slovo"],
        ["slovo", "druhé"],
        ["druhé", "slovo"],
    ]

    d = Dictionary(small_text)

    d.save_as_text(tmpf)
    with codecs.open(tmpf, 'r', encoding='utf-8') as file:
        serialized_lines = file.readlines()
        self.assertEqual(serialized_lines[0], u"3\n")
        self.assertEqual(len(serialized_lines), 4)
        # We do not know which word will get which id, so skip the id column in the comparison.
        self.assertEqual(serialized_lines[1][1:], u"\tdruhé\t2\n")
        self.assertEqual(serialized_lines[2][1:], u"\tprvé\t1\n")
        self.assertEqual(serialized_lines[3][1:], u"\tslovo\t3\n")

    d.save_as_text(tmpf, sort_by_word=False)
    with codecs.open(tmpf, 'r', encoding='utf-8') as file:
        serialized_lines = file.readlines()
        self.assertEqual(serialized_lines[0], u"3\n")
        self.assertEqual(len(serialized_lines), 4)
        self.assertEqual(serialized_lines[1][1:], u"\tslovo\t3\n")
        self.assertEqual(serialized_lines[2][1:], u"\tdruhé\t2\n")
        self.assertEqual(serialized_lines[3][1:], u"\tprvé\t1\n")
def testMmap(self):
    if self.cls == similarities.WmdSimilarity and not PYEMD_EXT:
        return

    fname = get_tmpfile('gensim_similarities.tst.pkl')
    if self.cls == similarities.Similarity:
        index = self.cls(None, corpus, num_features=len(dictionary), shardsize=5)
    elif self.cls == similarities.WmdSimilarity:
        index = self.cls(texts, self.w2v_model)
    else:
        index = self.cls(corpus, num_features=len(dictionary))

    # store all arrays separately
    index.save(fname, sep_limit=0)

    # same thing, but use mmap to load arrays
    index2 = self.cls.load(fname, mmap='r')

    if self.cls == similarities.Similarity:
        # for Similarity, only do a basic check
        self.assertTrue(len(index.shards) == len(index2.shards))
        index.destroy()
    else:
        if isinstance(index, similarities.SparseMatrixSimilarity):
            # hack SparseMatrixSim indexes so they're easy to compare
            index.index = index.index.todense()
            index2.index = index2.index.todense()
        self.assertTrue(numpy.allclose(index.index, index2.index))
        self.assertEqual(index.num_best, index2.num_best)
def testLoadOldModel(self):
    """Test loading word2vec models from previous versions."""
    model_file = 'word2vec_old'
    model = word2vec.Word2Vec.load(datapath(model_file))
    self.assertTrue(model.wv.vectors.shape == (12, 100))
    self.assertTrue(len(model.wv.vocab) == 12)
    self.assertTrue(len(model.wv.index2word) == 12)
    self.assertTrue(model.syn1neg.shape == (len(model.wv.vocab), model.wv.vector_size))
    self.assertTrue(model.trainables.vectors_lockf.shape == (12,))
    self.assertTrue(model.vocabulary.cum_table.shape == (12,))

    self.onlineSanity(model, trained_model=True)

    # Model stored in multiple files
    model_file = 'word2vec_old_sep'
    model = word2vec.Word2Vec.load(datapath(model_file))
    self.assertTrue(model.wv.vectors.shape == (12, 100))
    self.assertTrue(len(model.wv.vocab) == 12)
    self.assertTrue(len(model.wv.index2word) == 12)
    self.assertTrue(model.syn1neg.shape == (len(model.wv.vocab), model.wv.vector_size))
    self.assertTrue(model.trainables.vectors_lockf.shape == (12,))
    self.assertTrue(model.vocabulary.cum_table.shape == (12,))

    self.onlineSanity(model, trained_model=True)

    # load a really old model
    model_file = 'w2v-lee-v0.12.0'
    model = word2vec.Word2Vec.load(datapath(model_file))
    self.onlineSanity(model, trained_model=True)

    # test max_final_vocab for a model saved in 3.3
    model_file = 'word2vec_3.3'
    model = word2vec.Word2Vec.load(datapath(model_file))
    self.assertEqual(model.max_final_vocab, None)
    self.assertEqual(model.vocabulary.max_final_vocab, None)

    # Test loading word2vec models from all previous versions
    old_versions = [
        '0.12.0', '0.12.1', '0.12.2', '0.12.3', '0.12.4',
        '0.13.0', '0.13.1', '0.13.2', '0.13.3', '0.13.4',
        '1.0.0', '1.0.1', '2.0.0', '2.1.0', '2.2.0', '2.3.0',
        '3.0.0', '3.1.0', '3.2.0', '3.3.0', '3.4.0',
    ]

    saved_models_dir = datapath('old_w2v_models/w2v_{}.mdl')
    for old_version in old_versions:
        model = word2vec.Word2Vec.load(saved_models_dir.format(old_version))
        self.assertTrue(len(model.wv.vocab) == 3)
        self.assertTrue(model.wv.vectors.shape == (3, 4))
        # check that similarity search and online training work
        self.assertTrue(len(model.wv.most_similar('sentence')) == 2)
        model.build_vocab(list_corpus, update=True)
        model.train(list_corpus, total_examples=model.corpus_count, epochs=model.iter)
        # check that similarity search and online training still work after saving and loading the model back
        tmpf = get_tmpfile('gensim_word2vec.tst')
        model.save(tmpf)
        loaded_model = word2vec.Word2Vec.load(tmpf)
        loaded_model.build_vocab(list_corpus, update=True)
        loaded_model.train(list_corpus, total_examples=model.corpus_count, epochs=model.iter)
def testPersistence(self):
    fname = get_tmpfile('gensim_models_nmf.tst')
    self.model.save(fname)
    model2 = nmf.Nmf.load(fname)
    tstvec = []
    self.assertTrue(np.allclose(self.model[tstvec], model2[tstvec]))  # try projecting an empty vector
def testWord2Vec(self):
    corpus = BigCorpus(words_only=True, num_docs=100000, num_terms=3000000, doc_len=200)
    tmpf = get_tmpfile('gensim_big.tst')
    model = gensim.models.Word2Vec(corpus, size=300, workers=4)
    model.save(tmpf, ignore=['syn1'])
    del model
    gensim.models.Word2Vec.load(tmpf)
def load_on_instance():
    # Save and load a Doc2Vec model on an instance, for testing
    tmpf = get_tmpfile('gensim_doc2vec.tst')
    model = doc2vec.Doc2Vec(DocsLeeCorpus(), min_count=1)
    model.save(tmpf)
    model = doc2vec.Doc2Vec()  # should fail at this point
    return model.load(tmpf)
def test_get_offsets_and_start_doctags_win(self):
    # Each line takes 7 bytes (including the '\n' character, which is actually '\r\n' on Windows)
    lines = ['line1\n', 'line2\n', 'line3\n', 'line4\n', 'line5\n']
    tmpf = get_tmpfile('gensim_doc2vec.tst')

    with utils.smart_open(tmpf, 'wb', encoding='utf8') as fout:
        for line in lines:
            fout.write(utils.any2unicode(line))

    offsets, start_doctags = doc2vec.Doc2Vec._get_offsets_and_start_doctags_for_corpusfile(tmpf, 1)
    self.assertEqual(offsets, [0])
    self.assertEqual(start_doctags, [0])

    offsets, start_doctags = doc2vec.Doc2Vec._get_offsets_and_start_doctags_for_corpusfile(tmpf, 2)
    self.assertEqual(offsets, [0, 14])
    self.assertEqual(start_doctags, [0, 2])

    offsets, start_doctags = doc2vec.Doc2Vec._get_offsets_and_start_doctags_for_corpusfile(tmpf, 3)
    self.assertEqual(offsets, [0, 7, 21])
    self.assertEqual(start_doctags, [0, 1, 3])

    offsets, start_doctags = doc2vec.Doc2Vec._get_offsets_and_start_doctags_for_corpusfile(tmpf, 4)
    self.assertEqual(offsets, [0, 7, 14, 21])
    self.assertEqual(start_doctags, [0, 1, 2, 3])

    offsets, start_doctags = doc2vec.Doc2Vec._get_offsets_and_start_doctags_for_corpusfile(tmpf, 5)
    self.assertEqual(offsets, [0, 7, 14, 21, 28])
    self.assertEqual(start_doctags, [0, 1, 2, 3, 4])

    offsets, start_doctags = doc2vec.Doc2Vec._get_offsets_and_start_doctags_for_corpusfile(tmpf, 6)
    self.assertEqual(offsets, [0, 0, 7, 14, 14, 21])
    self.assertEqual(start_doctags, [0, 0, 1, 2, 2, 3])
def test_json_len(self):
    tmpf = get_tmpfile('script.tst.json')
    segment_and_write_all_articles(self.fname, tmpf, workers=1)

    expected_num_articles = 106
    num_articles = sum(1 for line in smart_open(tmpf))
    self.assertEqual(num_articles, expected_num_articles)
def testLdaModel(self):
    corpus = BigCorpus(num_docs=5000)
    tmpf = get_tmpfile('gensim_big.tst')
    model = gensim.models.LdaModel(corpus, num_topics=500, id2word=corpus.dictionary)
    model.save(tmpf)
    del model
    gensim.models.LdaModel.load(tmpf)
def testNormalizeAfterTrainingData(self):
    tmpf = get_tmpfile('gensim_word2vec.tst')
    model = word2vec.Word2Vec(sentences, min_count=1)
    model.save(tmpf)
    norm_only_model = word2vec.Word2Vec.load(tmpf)
    norm_only_model.delete_temporary_training_data(replace_word_vectors_with_normalized=True)
    self.assertFalse(np.allclose(model['human'], norm_only_model['human']))
def load_on_instance():
    # Save and load a Word2Vec model on an instance, for testing
    tmpf = get_tmpfile('gensim_word2vec.tst')
    model = word2vec.Word2Vec(sentences, min_count=1)
    model.save(tmpf)
    model = word2vec.Word2Vec()  # should fail at this point
    return model.load(tmpf)
def testPersistence(self):
    fname = get_tmpfile('gensim_models_atmodel.tst')
    model = self.model
    model.save(fname)
    model2 = self.class_.load(fname)
    self.assertEqual(model.num_topics, model2.num_topics)
    self.assertTrue(np.allclose(model.expElogbeta, model2.expElogbeta))
    self.assertTrue(np.allclose(model.state.gamma, model2.state.gamma))
def test_save_as_line_sentence_ru(self):
    corpus_file = get_tmpfile('gensim_utils.tst')
    ref_sentences = [
        l.split() for l in utils.any2unicode('привет мир\nкак ты поживаешь').split('\n')
    ]
    utils.save_as_line_sentence(ref_sentences, corpus_file)

    with utils.smart_open(corpus_file, encoding='utf8') as fin:
        sentences = [line.strip().split() for line in fin.read().strip().split('\n')]
        self.assertEqual(sentences, ref_sentences)
def testPersistenceCompressed(self):
    fname = get_tmpfile('gensim_models_logentry.tst.gz')
    model = logentropy_model.LogEntropyModel(self.corpus_ok, normalize=True)
    model.save(fname)
    model2 = logentropy_model.LogEntropyModel.load(fname, mmap=None)
    self.assertTrue(model.entr == model2.entr)
    tstvec = []
    self.assertTrue(np.allclose(model[tstvec], model2[tstvec]))
def testPersistenceCompressed(self):
    fname = get_tmpfile('gensim_models_coherence.tst.gz')
    model = CoherenceModel(
        topics=self.topics1, corpus=self.corpus, dictionary=self.dictionary, coherence='u_mass'
    )
    model.save(fname)
    model2 = CoherenceModel.load(fname)
    self.assertTrue(model.get_coherence() == model2.get_coherence())
def testPersistence(self):
    """Test storing/loading the entire model."""
    if not self.wr_path:
        return
    tmpf = get_tmpfile('gensim_wordrank.test')
    self.test_model.save(tmpf)
    loaded = wordrank.Wordrank.load(tmpf)
    self.models_equal(self.test_model, loaded)
def test_unicode_in_doctag(self):
    """Test storing document vectors of a model with unicode titles."""
    model = doc2vec.Doc2Vec(DocsLeeCorpus(unicode_tags=True), min_count=1)
    tmpf = get_tmpfile('gensim_doc2vec.tst')
    try:
        model.save_word2vec_format(tmpf, doctag_vec=True, word_vec=True, binary=True)
    except UnicodeEncodeError:
        self.fail('Failed storing unicode title.')
def testLargeMmapCompressed(self):
    fname = get_tmpfile('gensim_models_nmf.tst.gz')

    # simulate storing large arrays separately
    self.model.save(fname, sep_limit=0)

    # test loading the large model arrays with mmap
    self.assertRaises(IOError, nmf.Nmf.load, fname, mmap='r')
def test_persistence_word2vec_format(self):
    """Test storing/loading the model in word2vec format."""
    tmpf = get_tmpfile('gensim_fasttext_w2v_format.tst')
    model = FT_gensim(sentences, min_count=1, size=10)
    model.wv.save_word2vec_format(tmpf, binary=True)
    loaded_model_kv = Word2VecKeyedVectors.load_word2vec_format(tmpf, binary=True)
    self.assertEqual(len(model.wv.vocab), len(loaded_model_kv.vocab))
    self.assertTrue(np.allclose(model['human'], loaded_model_kv['human']))
def testNoTrainingCFormat(self):
    tmpf = get_tmpfile('gensim_word2vec.tst')
    model = word2vec.Word2Vec(sentences, min_count=1)
    model.init_sims()
    model.wv.save_word2vec_format(tmpf, binary=True)
    kv = keyedvectors.KeyedVectors.load_word2vec_format(tmpf, binary=True)
    binary_model = word2vec.Word2Vec()
    binary_model.wv = kv
    self.assertRaises(ValueError, binary_model.train, sentences)
def testTooShortTextWord2VecFormat(self):
    tfile = get_tmpfile('gensim_word2vec.tst')
    model = word2vec.Word2Vec(sentences, min_count=1)
    model.init_sims()
    model.wv.save_word2vec_format(tfile, binary=False)
    f = open(tfile, 'r+b')
    f.write(b'13')  # write wrong (too-long) vector count
    f.close()
    self.assertRaises(EOFError, keyedvectors.KeyedVectors.load_word2vec_format, tfile, binary=False)
def test_dmc_neg_fromfile(self):
    """Test DM/concatenate doc2vec training with negative sampling."""
    with temporary_file(get_tmpfile('gensim_word2vec.tst')) as corpus_file:
        save_lee_corpus_as_line_sentence(corpus_file)
        model = doc2vec.Doc2Vec(
            list_corpus, dm=1, dm_concat=1, vector_size=24, window=4,
            hs=0, negative=10, alpha=0.05, min_count=2, epochs=20
        )
        self.model_sanity(model)
def tearDown(self):
    # remove all temporary test files
    fname = get_tmpfile('gensim_corpus.tst')
    extensions = ['', '', '.bz2', '.gz', '.index', '.vocab']
    for ext in itertools.permutations(extensions, 2):
        try:
            os.remove(fname + ext[0] + ext[1])
        except OSError:
            pass
def testPersistenceWord2VecFormat(self):
    """Test storing the entire model in word2vec format."""
    model = doc2vec.Doc2Vec(DocsLeeCorpus(), min_count=1)

    # test saving both document and word embeddings
    test_doc_word = get_tmpfile('gensim_doc2vec.dw')
    model.save_word2vec_format(test_doc_word, doctag_vec=True, word_vec=True, binary=True)
    binary_model_dv = keyedvectors.KeyedVectors.load_word2vec_format(test_doc_word, binary=True)
    self.assertEqual(len(model.wv.vocab) + len(model.docvecs), len(binary_model_dv.vocab))

    # test saving document embeddings only
    test_doc = get_tmpfile('gensim_doc2vec.d')
    model.save_word2vec_format(test_doc, doctag_vec=True, word_vec=False, binary=True)
    binary_model_dv = keyedvectors.KeyedVectors.load_word2vec_format(test_doc, binary=True)
    self.assertEqual(len(model.docvecs), len(binary_model_dv.vocab))

    # test saving word embeddings only
    test_word = get_tmpfile('gensim_doc2vec.w')
    model.save_word2vec_format(test_word, doctag_vec=False, word_vec=True, binary=True)
    binary_model_dv = keyedvectors.KeyedVectors.load_word2vec_format(test_word, binary=True)
    self.assertEqual(len(model.wv.vocab), len(binary_model_dv.vocab))
def test_save_as_line_sentence_en(self):
    corpus_file = get_tmpfile('gensim_utils.tst')
    ref_sentences = [
        l.split() for l in utils.any2unicode('hello world\nhow are you').split('\n')
    ]
    utils.save_as_line_sentence(ref_sentences, corpus_file)

    with utils.smart_open(corpus_file, encoding='utf8') as fin:
        sentences = [line.strip().split() for line in fin.read().strip().split('\n')]
        self.assertEqual(sentences, ref_sentences)
def testPersistenceCompressed(self):
    fname = get_tmpfile('gensim_models_lda.tst.gz')
    model = self.model
    model.save(fname)
    model2 = self.class_.load(fname, mmap=None)
    self.assertEqual(model.num_topics, model2.num_topics)
    self.assertTrue(np.allclose(model.expElogbeta, model2.expElogbeta))
    tstvec = []
    self.assertTrue(np.allclose(model[tstvec], model2[tstvec]))  # try projecting an empty vector
# imports needed and logging
import gzip
import gensim
import logging
import pdb
import time
from collections import defaultdict

from gensim.test.utils import common_texts, get_tmpfile
from gensim.models import Word2Vec, KeyedVectors

print('initializing ...')
t0 = time.time()

# Load Google's pre-trained Word2Vec model.
model = gensim.models.KeyedVectors.load_word2vec_format(
    './GoogleNews-vectors-negative300.bin', binary=True,
)
word_vectors = model.wv

fname = get_tmpfile("vectors.kv")
word_vectors.save(fname)

t1 = time.time()
print('done', t1 - t0)
def test_persistence(self):
    """Test storing/loading the entire model."""
    tmpf = get_tmpfile('gensim_doc2vec.tst')
    model = doc2vec.Doc2Vec(DocsLeeCorpus(), min_count=1)
    model.save(tmpf)
    self.models_equal(model, doc2vec.Doc2Vec.load(tmpf))
def vector_loader(vector_path=vector_path):
    fname = get_tmpfile(vector_path)
    return KeyedVectors.load(fname, mmap="r")
def glove_to_word2vec():
    glove_file = datapath(GLOVE_FILE)
    tmp_file = get_tmpfile(WORD2VEC_FILE)
    _ = glove2word2vec(glove_file, tmp_file)
    # Save the model
    fname = "model.kv"
    path = get_tmpfile(fname)
    model.wv.save(path)
    return model.wv


# ======================================== initialization of data ======================================== #

# Load G from disk
G = nx.read_multiline_adjlist("./adjlists/train_networkxAfterRemove.adjlist")

# Load the saved embedding model from disk
fname = "model.kv"
path = get_tmpfile(fname)
model = KeyedVectors.load(path, mmap='r')

# the embedding section
# model = embedding(G)

# convert the json file to a list of Conversation objects
data = bz2.BZ2File("saved_objects/conversations_train_dataset_after_remove.pbz2", 'rb')  # 40820 conversations
conversations = cPickle.load(data)
print("data conversations amount " + str(len(conversations)))

# ======================================= preparing the input data for bert ======================================== #

# get centers with name of all
plotter = Plotter.Plotter(G, model)
def avg_feature_vector(sentence, model, num_features, index2word_set):
    words = sentence.split()
    feature_vec = np.zeros((num_features,), dtype='float32')
    n_words = 0
    for word in words:
        if word in index2word_set:
            n_words += 1
            feature_vec = np.add(feature_vec, model[word])
    if n_words > 0:
        feature_vec = np.divide(feature_vec, n_words)
    return feature_vec


path = get_tmpfile("word2vec.model")
# model = gensim.models.KeyedVectors.load_word2vec_format('./model/GoogleNews-vectors-negative300.bin', binary=True)
model = gensim.models.KeyedVectors.load("word2vec.model")
# model.save("word2vec.model")
index2word_set = set(model.wv.index2word)

with open('./txtfiles/testfile1.txt', 'r') as file:
    s1 = file.read().replace('\n', '')
with open('./txtfiles/testfile2.txt', 'r') as file:
    s2 = file.read().replace('\n', '')

s1_afv = avg_feature_vector(s1, model=model, num_features=300,
                            index2word_set=index2word_set)
def setUp(self):
    self.datapath = datapath('word2vec_pre_kv_c')
    self.output_folder = get_tmpfile('w2v2t_test')
    self.metadata_file = self.output_folder + '_metadata.tsv'
    self.tensor_file = self.output_folder + '_tensor.tsv'
    self.vector_file = self.output_folder + '_vector.tsv'
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

# Windows paths: use escaped backslashes so they survive Python string parsing
glove_file = datapath('D:\\Downloads\\glove.6B\\glove.6B.200d.txt')
tmp_file = get_tmpfile("D:\\Downloads\\glove.6B\\glove.6B.200d.word2vec")

glove2word2vec(glove_file, tmp_file)
""" from gensim.test.utils import common_texts, get_tmpfile from gensim.models import Word2Vec from gensim.models import KeyedVectors import os " more info at : https://radimrehurek.com/gensim/models/word2vec.html" #data loading and preprocessing class MySentences(object): def __init__(self, dirname): self.dirname = dirname def __iter__(self): for fname in os.listdir(self.dirname): for line in open(os.path.join(self.dirname, fname)): yield line.split() sentences = MySentences('C:\\Users\\luhoe\\Documents\\Git_Projects\\Github\\NeuronalNetwork_stuff\\first_steps\\training_data\\texts') # a memory-friendly iterator path = get_tmpfile("C:\\Users\\luhoe\\Documents\\Git_Projects\\Github\\NeuronalNetwork_stuff\\first_steps\\word2vec_models\\word2vec.model") print('start_training') model = Word2Vec(sentences, size=200, window=8, min_count=4, workers=6) print('finished') model.save("C:\\Users\\luhoe\\Documents\\Git_Projects\\Github\\NeuronalNetwork_stuff\\first_steps\\word2vec_models\\word2vec.model")
# get csv data
allWords = {}
with open("x:\\data\\Simtho-gensim\\model\\simtho999-10-25.csv", encoding="utf-8") as f:
    reader = csv.reader(f, delimiter=",")
    next(reader)
    for x in reader:
        allWords[x[0]] = [float(x[1]), float(x[2])]

# in case the user inputs nothing
if word == None:
    print("Content-Type: text/html\n")
    print("Please enter a word")
else:
    # import the model
    fname2 = get_tmpfile("x:\\data\\Simtho-gensim\\model\\simthovectors-size999-window10-min25.kv")
    word_vectors = KeyedVectors.load(fname2, mmap='r')
    try:
        modelOutput = word_vectors.wv.most_similar(word, topn=10)
        queryResults = dict(modelOutput)
        for key in queryResults:
            x = allWords[key]
            x.append(queryResults[key])
            queryResults[key] = x
        print(json.dumps(queryResults))
    except:
        print("The word " + word + " is not in the corpus")
import argparse
import os

from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.test.utils import get_tmpfile
from gensim.models import KeyedVectors

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("data_dir", help="Directory that contains the data")
    args = parser.parse_args()
    data_dir = args.data_dir

    # Create GloVe binary file
    glove_file = os.path.join(args.data_dir, 'original/Glove/glove.840B.300d.txt')
    tmp_file = get_tmpfile("glove.840B.300d.w2v.txt")
    _ = glove2word2vec(glove_file, tmp_file)
    model = KeyedVectors.load_word2vec_format(tmp_file)
    model.wv.save_word2vec_format(
        os.path.join(args.data_dir, "original/Glove/glove.840B.300d.bin"), binary=True)
def predict(self):
    try:
        # ML stuff below.
        # load pre-existing classifier from disk
        self.fp_fasttext_model = os.getcwd() + '\\assets\\model\\fasttext\\fasttext.model'
        if os.path.isfile(self.fp_fasttext_model):
            if self.model is None:
                self.model = FastText.load(get_tmpfile(self.fp_fasttext_model))
        else:
            raise Exception('Fasttext model file cannot be found at "{}"'.format(self.fp_fasttext_model))

        self.fp_classifier = os.getcwd() + '\\assets\\model\\model_gen_1_0_75803995341754.pkl'
        if os.path.isfile(self.fp_classifier):
            with open(self.fp_classifier, 'rb') as fid:
                self._classifier = cPickle.load(fid)
        else:
            raise Exception('Machine learning model file cannot be found at "{}"'.format(self.fp_classifier))

        # load data
        if os.path.isfile(self.input_file_name.get()):
            self._df = pd.read_csv(
                self.input_file_name.get(), sep=";", skipinitialspace=True,
                engine='python')  # engine='python' required to run in ipython
        else:
            raise Exception('Input file cannot be found at "{}"'.format(self.input_file_name.get()))

        # prepare
        try:
            self.le = preprocessing.LabelEncoder()
            self.le.fit(classes.classes)
            self._df['source'] = preprocessing.LabelEncoder().fit_transform(self._df['source'].values)
            self._df['sentence'] = convert_str_array_to_numpy_vector(
                self.model, self._df['sentence'].values.tolist(),
                self._df['source'].values.tolist(), 'Average')
        except Exception as ex:
            raise Exception(
                'Error occurred while preparing input data. '
                'Are you sure it has only (sentence,source) columns? Trace: {}'.format(str(ex)))

        # predict
        try:
            self._y_pred = self._classifier.predict(self._df['sentence'].values.tolist())
            self._y_pred = self.le.inverse_transform(self._y_pred)
        except Exception as ex:
            raise Exception(
                'Error occurred while predicting input data. '
                'Are you sure it has only (sentence,source) columns? Trace: {}'.format(str(ex)))

        # save data
        np.savetxt(self.output_file_name.get(), self._y_pred, delimiter=',', fmt='%s')
        messagebox.showinfo(
            "Info",
            "Predicting is finished. File is saved to {}".format(self.output_file_name.get()))
    except Exception as e:
        messagebox.showinfo(
            "Error",
            "Error occurred while predicting. Send details to alozta\nDetails:\n{}".format(str(e)))
    words.append(line.split())

print(words)

with open('newfileformodel.txt', 'w') as filehandle:
    for listitem in words:
        filehandle.write('%s\n' % listitem)


from gensim.test.utils import common_texts, get_tmpfile
from gensim.models import Word2Vec

path2 = get_tmpfile("word2vecyear1.model")
model23 = Word2Vec(my_texts1, size=100, window=5, min_count=5, workers=4)
model23.save("word2vecyear1.model")

modelyear1 = Word2Vec.load("word2vecyear1.model")
modelyear1.train(newfileformodel, total_examples=1, epochs=1)
print(modelyear1)

# numpy vector of a word
from gensim.models import KeyedVectors

path21 = get_tmpfile("wordvectorsyear1.kv")
modelyear1.wv.save('path/modelyear1.wv')
def load_word2vec(filename, binary=False):
    glove_file = datapath(filename)
    tmp_file = get_tmpfile("test_word2vec.txt")
    _ = glove2word2vec(glove_file, tmp_file)
    word2vec = KeyedVectors.load_word2vec_format(tmp_file)
    return word2vec
import nltk
import numpy as np
import pandas as pd
from gensim.models import KeyedVectors
from gensim.test.utils import get_tmpfile
from gensim.scripts.glove2word2vec import glove2word2vec

nltk.download("popular")

# Methods required for running the project on colab
from google.colab import drive
drive.mount("/content/drive/")

cd drive/My\ Drive/asap-aes(1)


# Utility function for reading Stanford's GloVe vectors using gensim
def readWordVector(tmp_file):
    wv = KeyedVectors.load_word2vec_format(tmp_file, binary=False)
    return wv


file = "glove.6B.300d.txt"
tmp_file = get_tmpfile('temp_word2vec.txt')
glove2word2vec(file, tmp_file)
wv = readWordVector(tmp_file)

data = pd.read_csv('training_set_rel3.tsv', encoding="ISO-8859-1", sep='\t')
data.head(6)

# Removing unnecessary columns since only the domain1 score is required
y = data['domain1_score']
data = data.dropna(axis=1)
data = data.drop(axis=1, columns=['rater1_domain1', 'rater2_domain1'])

# As picked up from essay_set_description.xlsx
maximum_domain1_score = np.array([2, 1, 0, 0, 0, 0, 0, 0])
minimum_domain1_score = np.array([12, 6, 3, 3, 4, 4, 30, 60])
for _mod in modelo:
    for _dim in dim:
        ini = datetime.now()
        filepath = '/home/nilc_embeddings/'
        filename_txt = filepath + _alg + "/" + _mod + "_s" + _dim + ".txt"
        filename_model = filepath + _alg + "/" + _mod + "_s" + _dim + ".model"

        conhecedor = KeyedVectors.load_word2vec_format(filename_txt, encoding='utf8')
        advinhador = KeyedVectors.load_word2vec_format(filename_txt, encoding='utf8')

        fname = get_tmpfile(filename_model)
        conhecedor.save(fname)
        # conhecedor.wv.save(fname)  # works the same way
        # conhecedor.wv.save_word2vec_format(fname)

        # conhecedor = KeyedVectors.load(filename_model, mmap='r')
        # advinhador = KeyedVectors.load(filename_model, mmap='r')

        vocab = []
        f = open(filename_txt)
        for line in f:
            if line.strip() != '':
                cols = [j.strip() for j in line.split(' ')]
                vocab.append(cols[0])
        f.close()
def chunk_cosine_sim(gold_alignments, embedding_file, left_chunks_file, right_chunks_file,
                     bert_tokenizer=None, bert_model=None, emb_type="chunk", output_file=None):
    left_chunks = __read_chunks(left_chunks_file)
    right_chunks = __read_chunks(right_chunks_file)
    sentence_alignments = __read_alignments(gold_alignments, left_chunks, right_chunks)

    if emb_type == "glove":
        tmp_file = get_tmpfile("_temp_w2v_{}.txt".format(datetime.datetime.now().strftime('%H_%M_%S')))
        _ = glove2word2vec(embedding_file, tmp_file)
        embedding = KeyedVectors.load_word2vec_format(tmp_file)

        stoi = {w: i for i, w in enumerate(embedding.index2word)}
        itos = {i: w for i, w in enumerate(embedding.index2word)}
        num_words = len(embedding.index2word)
        unk_token_idx = num_words
        stoi["-UNK-"] = unk_token_idx
        itos[unk_token_idx] = "-UNK-"

        emb_matrix = embedding.vectors.copy()
        unk_symbol_emb = np.zeros((1, embedding.vector_size))
        emb_matrix = np.concatenate((emb_matrix, unk_symbol_emb), axis=0)

    total_alignments_counter = 0
    match_counter = 0
    vectors = {}
    for sid, emb_ids_map in tqdm(sentence_alignments.items()):
        try:
            left_chunk_emb_ids = list(emb_ids_map['left_chunk_emb_ids'])
            right_chunk_emb_ids = list(emb_ids_map['right_chunk_emb_ids'])

            if emb_type == "bert":
                # get bert sentence level embedding
                left_sentence = emb_ids_map['left_sentence']
                tokenized_left_text, left_sentence_embedding = get_bert_sentence_embedding(
                    left_sentence, bert_tokenizer, bert_model)
                right_sentence = emb_ids_map['right_sentence']
                tokenized_right_text, right_sentence_embedding = get_bert_sentence_embedding(
                    right_sentence, bert_tokenizer, bert_model)

                left_matrix = get_bert_chunks_embedding(
                    tokenized_left_text, left_sentence_embedding, left_chunk_emb_ids,
                    left_chunks[int(sid) - 1])
                unwrap_emb(left_chunk_emb_ids, left_matrix, vectors)

                right_matrix = get_bert_chunks_embedding(
                    tokenized_right_text, right_sentence_embedding, right_chunk_emb_ids,
                    right_chunks[int(sid) - 1])
                unwrap_emb(right_chunk_emb_ids, right_matrix, vectors)
            elif emb_type == "glove":
                left_matrix = get_glove_embeddings(
                    emb_matrix, left_chunk_emb_ids, stoi, left_chunks[int(sid) - 1], unk_token_idx)
                right_matrix = get_glove_embeddings(
                    emb_matrix, right_chunk_emb_ids, stoi, right_chunks[int(sid) - 1], unk_token_idx)

            left_norm = left_matrix / left_matrix.norm(dim=1)[:, None]
            right_norm = right_matrix / right_matrix.norm(dim=1)[:, None]
            cosine_sim = torch.mm(left_norm, right_norm.transpose(0, 1))

            values, indices = torch.max(cosine_sim, 1)
            indices = indices.numpy()
            selected_right_chunks = [right_chunk_emb_ids[i] for i in indices]
            total_alignments_counter += len(indices)
            for i in range(len(left_chunk_emb_ids)):
                selected_r_chunk = selected_right_chunks[i]
                current_l_chunk = left_chunk_emb_ids[i]
                r_emb_set = emb_ids_map['chunk_emb_mapping'][current_l_chunk]
                if selected_r_chunk in r_emb_set:
                    match_counter += 1
        except Exception as e:
            print e

    if emb_type == "bert" and output_file is not None:
        _write_vectors(vectors, output_file)

    print "Total alignment match percentage ratio : ", (float(match_counter) / total_alignments_counter) * 100.0
import csv
import json

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.test.utils import get_tmpfile
import numpy as np

fname = get_tmpfile("my_doc2vec_model")
num_features = 5
model = Doc2Vec.load(fname)

query = raw_input("Enter your query\n")
q_features = model.infer_vector(query.split())

shloka = ''
dist_array = {}
with open('result.csv', 'r') as file:
    fd = csv.reader(file)
    for item in fd:
        temp = []
        for i in range(num_features):
            temp.append(float(item[i]))
        dist = np.linalg.norm(q_features - temp)
        dist_array[item[-1]] = dist

final = sorted(dist_array, key=dist_array.__getitem__)
num_shlokas = 5
print("The Famous Quotes from Geeta says that")
print("\n")
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

if __name__ == '__main__':
    glove_file = datapath('/home/su/HL/FQA/model/vectors_qa.txt')
    tmp_file = get_tmpfile('/home/su/HL/FQA/model/w2v_glove.txt')
    _ = glove2word2vec(glove_file, tmp_file)
    model = KeyedVectors.load_word2vec_format('./model/w2v_glove.txt')
import os
import pickle

from gensim.test.utils import datapath, get_tmpfile
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors

# Update the paths
embedding_txt = '--------/embedding_text.txt'
embedding_temp = '-------/embedding_temp'
embedding_path = '-------/GCN_embedding.pkl'

with open('--------------/GCN_emb.pkl', 'rb') as f:
    word_embeddings = pickle.load(f)
with open('--------------/GCN_emb_idx2id_dict.pkl', 'rb') as f:
    idx2id_dict = pickle.load(f)

if not os.path.exists(embedding_txt):
    with open(embedding_txt, 'w') as f:
        for item in idx2id_dict:
            f.write(idx2id_dict[item] + ' ' + ' '.join([str(i.item()) for i in word_embeddings[item]]) + '\n')

glove_file = datapath(embedding_txt)
temp_file = get_tmpfile(embedding_temp)
_ = glove2word2vec(glove_file, temp_file)

wv = KeyedVectors.load_word2vec_format(temp_file)
wv.save(embedding_path)
def doc2vec(dataset_path, dataset_name, write_path, txt_path_list):
    # Convert text to lower-case and strip punctuation/symbols from words
    def normalize_text(text):
        norm_text = text.lower()
        # Replace breaks with spaces
        norm_text = norm_text.replace('<br />', ' ')
        # Pad punctuation with spaces on both sides
        norm_text = re.sub(r"([\.\",\(\)!\?;:])", " \\1 ", norm_text)
        return norm_text

    alldata_path = os.path.join(write_path, 'alldata-id_' + dataset_name + '.txt')
    if not os.path.isfile(alldata_path):
        # Collect & normalize test/train data
        print("Cleaning up dataset...")

        # list of the absolute paths of every text file
        print(" %i files" % (len(txt_path_list)))

        # for each file "txt"
        for i, txt in tqdm(enumerate(txt_path_list)):
            with smart_open(txt, "rb") as t:
                try:
                    # "one_text" is the whole document
                    one_text = t.read().decode("utf-8")
                    for c in control_chars:
                        one_text = one_text.replace(c, ' ')
                    one_text = normalize_text(one_text)
                    all_lines.append(one_text)
                except UnicodeDecodeError:
                    # we skip this file, but we need to preserve index pos
                    all_lines.append(" ")
                    continue

        # Save to disk for instant re-use on any future runs
        with smart_open(alldata_path, 'wb') as f:
            for idx, line in enumerate(all_lines):
                num_line = u"_*{0} {1}\n".format(idx, line)
                f.write(num_line.encode("utf-8"))

    assert os.path.isfile(alldata_path), "alldata unavailable"
    print("Success, alldata is available for next steps.")

    # ===================================================================
    # =#BLOCK#=#: Read in alldata
    # ===================================================================
    # this data object class suffices as a `TaggedDocument`
    # (with `words` and `tags`)
    # plus adds other state helpful for our later evaluation/reporting
    with smart_open(alldata_path, 'rb', encoding='utf-8') as alldata:
        alldata_list = list(alldata)
    print("Iterating up to: ", len(alldata_list))

    with smart_open(alldata_path, 'rb', encoding='utf-8') as alldata:
        documents = [TaggedDocument(doc, [i]) for i, doc in tqdm(enumerate(alldata))]

    model = Doc2Vec(documents, vector_size=5, window=2, min_count=1, workers=4)

    fname = get_tmpfile(os.path.join(write_path, "doc2vec_model_" + dataset_name))
    model.save(fname)
    model = Doc2Vec.load(fname)  # you can continue training with the loaded model!
    return
def word2vec(in_txt):
    path = get_tmpfile("word2vec_lstm.model")
    model = Word2Vec(in_txt, size=150, window=6, min_count=1, workers=multiprocessing.cpu_count())
    model.save("word2vec_lstm.model")
def testPersistenceWithConstructorRule(self):
    """Test storing/loading the entire model with a vocab trimming rule passed in the constructor."""
    tmpf = get_tmpfile('gensim_word2vec.tst')
    model = word2vec.Word2Vec(sentences, min_count=1, trim_rule=_rule)
    model.save(tmpf)
    self.models_equal(model, word2vec.Word2Vec.load(tmpf))
from glove import Glove
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors, Word2Vec

embedding_dim = 10
glove = Glove.load(f'glove_{embedding_dim}.txt')
print(glove.most_similar('我', number=10))

glove_file_path = f"/data/users/wangyuanzheng/projects/ucas_DL/3-poem/dev/glove_{embedding_dim}.txt"
word2vec_output_path = f"/data/users/wangyuanzheng/projects/ucas_DL/3-poem/dev/glove_{embedding_dim}_w2v.txt"

# input file
glove_file = datapath(glove_file_path)
# output file
tmp_file = get_tmpfile(word2vec_output_path)

# call the glove2word2vec script
# default way (through CLI): python -m gensim.scripts.glove2word2vec --input <glove_file> --output <w2v_file>
# run the conversion
from gensim.scripts.glove2word2vec import glove2word2vec
glove2word2vec(glove_file, tmp_file)

# load the converted file
model = KeyedVectors.load_word2vec_format(tmp_file, unicode_errors='ignore')
"w", encoding="UTF-8") as sparql: for line in lines: assert (3 == len(line)) en.write(line[0] + "\n") sparql.write(line[1] + "\n") ############################################################## from gensim.test.utils import datapath, get_tmpfile from gensim.models import KeyedVectors from gensim.scripts.glove2word2vec import glove2word2vec glove_file = datapath('D:/Downloads/Compressed/uniform.txt') tmp_file = get_tmpfile( "D:/Downloads/Compressed/dbpedia_kglove_uniform_200.txt") _ = glove2word2vec(glove_file, tmp_file) model = KeyedVectors.load_word2vec_format(tmp_file) """ python -m spacy init-model xx ./dbpedia_kglove_uniform_200 --vectors-loc dbpedia_kglove_uniform_200.txt.gz """ "D:/Downloads/Compressed/dbpedia_kglove_uniform_200.clean.txt" model.save('/tmp/MyModel') model.save_word2vec_format('/tmp/mymodel.txt', binary=False) model.save_word2vec_format('/tmp/mymodel.bin.gz', binary=True) tri = open("D:/Downloads/Compressed/try.txt", "r",
model = word2vec.Word2Vec(sentences, workers=num_workers, size=num_features,
                          min_count=min_word_count, window=context, sample=downsampling)

word_vectors = model.wv
# trained word vectors are independent of the methods used to train them, so we can
# represent them as a standalone KeyedVectors structure
# Word2Vec load functions are deprecated

# persist the word vectors to disk with...
from gensim.test.utils import get_tmpfile
from gensim.models import KeyedVectors

fname = get_tmpfile("200features_40minwords_10context.kv")
word_vectors.save(fname)
word_vectors = KeyedVectors.load(fname, mmap='r')

# model.most_similar("man")
# Out[26]:
# [(u'woman', 0.6606647968292236),
#  (u'lady', 0.6287051439285278),
#  (u'lad', 0.5678527355194092),
#  (u'guy', 0.5411106944084167),
#  (u'men', 0.537365734577179),
#  (u'person', 0.530378520488739),
#  (u'monk', 0.5267703533172607),
#  (u'businessman', 0.5263428688049316),
#  (u'millionaire', 0.5201252102851868),
#  (u'chap', 0.5184340476989746)]
import numpy
from sklearn import cross_validation
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import RandomForestClassifier as RFC
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
import pickle
from sklearn import metrics
import numpy as np
from pprint import pprint
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
import random

glove_file = 'glove.6B.50d.txt'
tmp_file = get_tmpfile("test_word2vec.txt")

# call the glove2word2vec script
# default way (through CLI): python -m gensim.scripts.glove2word2vec --input <glove_file> --output <w2v_file>
from gensim.scripts.glove2word2vec import glove2word2vec
glove2word2vec(glove_file, tmp_file)

model = KeyedVectors.load_word2vec_format(tmp_file)


def document_vector(word2vec_model, doc):
    # remove out-of-vocabulary words
    doc = preprocess(doc)
    doc = [word for word in doc if word in word2vec_model.vocab]
    return np.mean(word2vec_model[doc], axis=0)
def read_glove_vectors(glove_path):
    glove_file = datapath(glove_path)
    tmp_file = get_tmpfile("word2vec.txt")
    glove2word2vec(glove_file, tmp_file)
    model = KeyedVectors.load_word2vec_format(tmp_file)
    return model
def on_epoch_end(self, model):
    logging.info("Epoch " + str(self.epoch) + " completed")
    output_path = get_tmpfile('{}_epoch{}.model'.format(self.path, self.epoch))
    model.save(output_path)
    self.epoch += 1
def convert_glove_word2vec(glove_path, dest_path):
    glove_file = datapath(glove_path)
    tmp_file = get_tmpfile(dest_path)
    _ = glove2word2vec(glove_file, tmp_file)