def test_online_learning_fromfile(self):
    with temporary_file(get_tmpfile('gensim_fasttext1.tst')) as corpus_file, \
            temporary_file(get_tmpfile('gensim_fasttext2.tst')) as new_corpus_file:
        utils.save_as_line_sentence(sentences, corpus_file)
        utils.save_as_line_sentence(new_sentences, new_corpus_file)

        model_hs = FT_gensim(corpus_file=corpus_file, size=10, min_count=1, seed=42, hs=1, negative=0)
        self.assertEqual(len(model_hs.wv.vocab), 12)
        self.assertEqual(model_hs.wv.vocab['graph'].count, 3)

        model_hs.build_vocab(corpus_file=new_corpus_file, update=True)  # update vocab
        self.assertEqual(len(model_hs.wv.vocab), 14)
        self.assertEqual(model_hs.wv.vocab['graph'].count, 4)
        self.assertEqual(model_hs.wv.vocab['artificial'].count, 4)
def test_online_learning_after_save_fromfile(self):
    with temporary_file(get_tmpfile('gensim_fasttext1.tst')) as corpus_file, \
            temporary_file(get_tmpfile('gensim_fasttext2.tst')) as new_corpus_file:
        utils.save_as_line_sentence(sentences, corpus_file)
        utils.save_as_line_sentence(new_sentences, new_corpus_file)

        tmpf = get_tmpfile('gensim_fasttext.tst')
        model_neg = FT_gensim(corpus_file=corpus_file, size=10, min_count=0, seed=42, hs=0, negative=5)
        model_neg.save(tmpf)
        model_neg = FT_gensim.load(tmpf)
        self.assertEqual(len(model_neg.wv.vocab), 12)

        model_neg.build_vocab(corpus_file=new_corpus_file, update=True)  # update vocab
        model_neg.train(corpus_file=new_corpus_file, total_words=model_neg.corpus_total_words,
                        epochs=model_neg.iter)
        self.assertEqual(len(model_neg.wv.vocab), 14)
def test_training_fromfile(self):
    with temporary_file(get_tmpfile('gensim_fasttext.tst')) as corpus_file:
        utils.save_as_line_sentence(sentences, corpus_file)

        model = FT_gensim(size=10, min_count=1, hs=1, negative=0, seed=42, workers=1)
        model.build_vocab(corpus_file=corpus_file)
        self.model_sanity(model)

        model.train(corpus_file=corpus_file, total_words=model.corpus_total_words, epochs=model.iter)
        sims = model.most_similar('graph', topn=10)

        self.assertEqual(model.wv.syn0.shape, (12, 10))
        self.assertEqual(len(model.wv.vocab), 12)
        self.assertEqual(model.wv.syn0_vocab.shape[1], 10)
        self.assertEqual(model.wv.syn0_ngrams.shape[1], 10)
        self.model_sanity(model)

        # test querying for "most similar" by vector
        graph_vector = model.wv.syn0norm[model.wv.vocab['graph'].index]
        sims2 = model.most_similar(positive=[graph_vector], topn=11)
        sims2 = [(w, sim) for w, sim in sims2 if w != 'graph']  # ignore 'graph' itself
        self.assertEqual(sims, sims2)

        # verify in-vocabulary and oov-word vector retrieval
        invocab_vec = model['minors']  # in-vocabulary word
        self.assertEqual(len(invocab_vec), 10)

        oov_vec = model['minor']  # oov word
        self.assertEqual(len(oov_vec), 10)
def test_sg_neg_training_fromfile(self):
    with temporary_file(get_tmpfile('gensim_fasttext.tst')) as corpus_file:
        model_gensim = FT_gensim(
            size=50, sg=1, cbow_mean=1, alpha=0.025, window=5, hs=0, negative=5,
            min_count=5, iter=5, batch_words=1000, word_ngrams=1, sample=1e-3,
            min_n=3, max_n=6, sorted_vocab=1, workers=1, min_alpha=0.0)

        lee_data = LineSentence(datapath('lee_background.cor'))
        utils.save_as_line_sentence(lee_data, corpus_file)

        model_gensim.build_vocab(corpus_file=corpus_file)
        orig0 = np.copy(model_gensim.wv.vectors[0])
        model_gensim.train(corpus_file=corpus_file,
                           total_words=model_gensim.corpus_total_words,
                           epochs=model_gensim.epochs)
        self.assertFalse((orig0 == model_gensim.wv.vectors[0]).all())  # vector should vary after training

        sims_gensim = model_gensim.wv.most_similar('night', topn=10)
        sims_gensim_words = [word for (word, distance) in sims_gensim]  # get similar words
        expected_sims_words = [
            u'night.', u'night,', u'eight', u'overnight', u'overnight.',
            u'month', u'land', u'firm', u'singles', u'death']
        overlap_count = len(set(sims_gensim_words).intersection(expected_sims_words))
        self.assertGreaterEqual(overlap_count, 2)
def testSaveLoad(self):
    """Saving and loading a Phraser object."""
    with temporary_file("test.pkl") as fpath:
        bigram = Phraser(Phrases(self.sentences, min_count=1, threshold=1))
        bigram.save(fpath)
        bigram_loaded = Phraser.load(fpath)
        self.assertEqual(
            bigram_loaded[['graph', 'minors', 'survey', 'human', 'interface', 'system']],
            ['graph_minors', 'survey', 'human_interface', 'system'])
def test_persistence_fromfile(self):
    """Test storing/loading the entire model."""
    with temporary_file(get_tmpfile('gensim_word2vec.tst')) as corpus_file:
        save_lee_corpus_as_line_sentence(corpus_file)
        tmpf = get_tmpfile('gensim_doc2vec.tst')
        model = doc2vec.Doc2Vec(corpus_file=corpus_file, min_count=1)
        model.save(tmpf)
        self.models_equal(model, doc2vec.Doc2Vec.load(tmpf))
def test_dmc_neg_fromfile(self):
    """Test DM/concatenate doc2vec training."""
    with temporary_file(get_tmpfile('gensim_word2vec.tst')) as corpus_file:
        save_lee_corpus_as_line_sentence(corpus_file)
        model = doc2vec.Doc2Vec(
            corpus_file=corpus_file, dm=1, dm_concat=1, vector_size=24, window=4,
            hs=0, negative=10, alpha=0.05, min_count=2, epochs=20
        )
        self.model_sanity(model)
def testSaveLoadCustomScorer(self):
    """Saving and loading a Phraser object with a custom scorer."""
    with temporary_file("test.pkl") as fpath:
        bigram = Phraser(
            Phrases(self.sentences, min_count=1, threshold=.001, scoring=dumb_scorer))
        bigram.save(fpath)
        bigram_loaded = Phraser.load(fpath)
        # we don't do much with scoring here, just verify it's the one expected
        self.assertEqual(bigram_loaded.scoring, dumb_scorer)
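# Note: `dumb_scorer`, used above and in the scorer tests below, is defined
# elsewhere in the test module. As a minimal sketch -- assuming gensim's
# pluggable Phrases scoring interface, which passes a candidate bigram's
# component-word counts and corpus statistics -- a scorer consistent with the
# "all scores 1" assertions in these tests could look like:
def dumb_scorer(worda_count, wordb_count, bigram_count, len_vocab, min_count, corpus_word_count):
    # Score every candidate bigram identically, regardless of counts.
    return 1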
def test_dbow_hs_fromfile(self):
    """Test DBOW doc2vec training."""
    with temporary_file(get_tmpfile('gensim_doc2vec.tst')) as corpus_file:
        save_lee_corpus_as_line_sentence(corpus_file)
        model = doc2vec.Doc2Vec(corpus_file=corpus_file, dm=0, hs=1, negative=0, min_count=2, epochs=20)
        self.model_sanity(model)
def test_dbow_neg_fromfile(self):
    """Test DBOW doc2vec training."""
    with temporary_file(get_tmpfile('gensim_word2vec.tst')) as corpus_file:
        save_lee_corpus_as_line_sentence(corpus_file)
        model = doc2vec.Doc2Vec(corpus_file=corpus_file, dm=0, hs=0, negative=10, min_count=2, epochs=20)
        self.model_sanity(model)
def test_save_load_with_connector_words(self):
    """Test saving and loading a FrozenPhrases object."""
    connector_words = frozenset({'of'})
    with temporary_file("test.pkl") as fpath:
        bigram = FrozenPhrases(
            Phrases(self.sentences, min_count=1, threshold=1, connector_words=connector_words))
        bigram.save(fpath)
        bigram_loaded = FrozenPhrases.load(fpath)
        self.assertEqual(bigram_loaded.connector_words, connector_words)
def testSaveLoadCustomScorer(self):
    """Test saving and loading a FrozenPhrases object with a custom scorer."""
    with temporary_file("test.pkl") as fpath:
        bigram = FrozenPhrases(
            Phrases(self.sentences, min_count=1, threshold=.001, scoring=dumb_scorer))
        bigram.save(fpath)
        bigram_loaded = FrozenPhrases.load(fpath)
        self.assertEqual(bigram_loaded.scoring, dumb_scorer)
def test_dbow_fixedwindowsize_fromfile(self):
    """Test DBOW doc2vec training with fixed window size, from file."""
    with temporary_file(get_tmpfile('gensim_doc2vec.tst')) as corpus_file:
        save_lee_corpus_as_line_sentence(corpus_file)
        model = doc2vec.Doc2Vec(
            corpus_file=corpus_file, vector_size=16, shrink_windows=False,
            dm=0, hs=0, negative=5, min_count=2, epochs=20)
        self.model_sanity(model)
def test_training_fromfile(self):
    """Test doc2vec training."""
    with temporary_file(get_tmpfile('gensim_word2vec.tst')) as corpus_file:
        save_lee_corpus_as_line_sentence(corpus_file)
        model = doc2vec.Doc2Vec(vector_size=100, min_count=2, epochs=20, workers=1)
        model.build_vocab(corpus_file=corpus_file)
        self.assertEqual(model.docvecs.vectors_docs.shape, (300, 100))
        model.train(corpus_file=corpus_file, total_words=model.corpus_total_words, epochs=model.epochs)

        self.model_sanity(model)

        model = doc2vec.Doc2Vec(corpus_file=corpus_file, vector_size=100, min_count=2, epochs=20, workers=1)
        self.model_sanity(model)
def testSaveLoadCustomScorer(self):
    """Saving and loading a Phrases object with a custom scorer."""
    with temporary_file("test.pkl") as fpath:
        bigram = Phrases(self.sentences, min_count=1, threshold=.001, scoring=dumb_scorer)
        bigram.save(fpath)
        bigram_loaded = Phrases.load(fpath)
        seen_scores = []
        test_sentences = [['graph', 'minors', 'survey', 'human', 'interface', 'system']]
        for phrase, score in bigram_loaded.export_phrases(test_sentences):
            seen_scores.append(score)

        assert all(seen_scores)  # all scores 1
        assert len(seen_scores) == 3  # 'graph minors', 'survey human' and 'interface system'
def test_persistence_fromfile(self):
    with temporary_file(get_tmpfile('gensim_fasttext1.tst')) as corpus_file:
        utils.save_as_line_sentence(sentences, corpus_file)

        tmpf = get_tmpfile('gensim_fasttext.tst')
        model = FT_gensim(corpus_file=corpus_file, min_count=1)
        model.save(tmpf)
        self.models_equal(model, FT_gensim.load(tmpf))

        # test persistence of the KeyedVectors of a model
        wv = model.wv
        wv.save(tmpf)
        loaded_wv = FastTextKeyedVectors.load(tmpf)
        self.assertTrue(np.allclose(wv.syn0_ngrams, loaded_wv.syn0_ngrams))
        self.assertEqual(len(wv.vocab), len(loaded_wv.vocab))
def get_embeddings(
    embeddings: str,
    embeddings_format: str = 'glove',
    embeddings_binary: bool = False,
) -> KeyedVectors:
    """Get the embeddings model and matrix used in the setup function.

    Parameters
    ----------
    embeddings : str
        Path to pretrained embeddings, or a gensim-data alias.
    embeddings_format : str, optional
        The format of the input embeddings; should be one of 'glove',
        'word2vec', 'fasttext' or 'gensim'. The latter can be used to
        download embeddings hosted on gensim on the fly. See
        https://github.com/RaRe-Technologies/gensim-data for the list
        of available embedding aliases. By default 'glove'.
    embeddings_binary : bool, optional
        Whether the input embeddings are provided in binary format,
        by default False.

    Returns
    -------
    KeyedVectors
        The embeddings object specified by the parameters.
    """
    model = None
    if embeddings_format == 'glove':
        with temporary_file('temp.txt') as temp:
            glove2word2vec(embeddings, temp)
            model = KeyedVectors.load_word2vec_format(temp, binary=embeddings_binary)
    elif embeddings_format == 'word2vec':
        model = KeyedVectors.load_word2vec_format(embeddings, binary=embeddings_binary)
    elif embeddings_format == 'fasttext':
        model = fasttext.load_facebook_vectors(embeddings)
    elif embeddings_format == 'gensim':
        try:
            model = KeyedVectors.load(embeddings)
        except FileNotFoundError:
            model = api.load(embeddings)
    else:
        raise ValueError(
            "Only formats supported are glove, word2vec, fasttext and gensim")
    return model
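# Hypothetical usage sketch for get_embeddings -- the GloVe file path and the
# gensim-data alias below are illustrative placeholders, not from the source.
def _demo_get_embeddings():
    # Convert and load local GloVe vectors (requires the file to exist).
    vectors = get_embeddings('glove.6B.50d.txt', embeddings_format='glove')
    print(vectors.most_similar('king', topn=3))

    # Or fetch a gensim-hosted model by alias; downloaded on first use.
    vectors = get_embeddings('glove-wiki-gigaword-50', embeddings_format='gensim')
    print(vectors.most_similar('queen', topn=3))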
def testSaveLoad(self):
    """Saving and loading a Phrases object."""
    with temporary_file("test.pkl") as fpath:
        bigram = Phrases(self.sentences, min_count=1, threshold=1)
        bigram.save(fpath)
        bigram_loaded = Phrases.load(fpath)
        seen_scores = set()
        test_sentences = [['graph', 'minors', 'survey', 'human', 'interface', 'system']]
        for phrase, score in bigram_loaded.export_phrases(test_sentences):
            seen_scores.add(round(score, 3))

        assert seen_scores == set([
            5.167,  # score for graph minors
            3.444   # score for human interface
        ])
def testSaveLoad(self):
    """Test saving and loading a Phrases object."""
    with temporary_file("test.pkl") as fpath:
        bigram = Phrases(self.sentences, min_count=1, threshold=1)
        bigram.save(fpath)
        bigram_loaded = Phrases.load(fpath)
        test_sentences = [['graph', 'minors', 'survey', 'human', 'interface', 'system']]
        seen_scores = set(
            round(score, 3) for score in bigram_loaded.find_phrases(test_sentences).values())
        assert seen_scores == set([
            5.167,  # score for graph minors
            3.444   # score for human interface
        ])
def example_1():
    """Example code from the Gensim documentation on the author-topic class."""
    author2doc = {
        'john': [0, 1, 2, 3, 4, 5, 6],
        'jane': [2, 3, 4, 5, 6, 7, 8],
        'jack': [0, 2, 4, 6, 8],
    }
    corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm'))

    print("Corpus contents:")
    print(f"{corpus}\n")
    print("Documents in the corpus:")
    for document in corpus:
        print(f"{document}")

    print("\nDictionary contents:")
    print(f"{common_dictionary}\n")
    print("Dictionary contents with word index values:")
    print(f"{common_dictionary.token2id}\n")

    with temporary_file("serialized") as s_path:
        model = AuthorTopicModel(
            corpus, author2doc=author2doc, id2word=common_dictionary,
            num_topics=4, serialized=True, serialization_path=s_path)
        # update the author-topic model with additional documents
        model.update(corpus, author2doc)

    # construct vectors for authors
    author_vecs = [model.get_author_topics(author) for author in model.id2author.values()]
    print("Vectors for authors:")
    print(f"{author_vecs}\n")
def testSaveLoadCustomScorer(self):
    """Test saving and loading a Phrases object with a custom scorer."""
    with temporary_file("test.pkl") as fpath:
        bigram = Phrases(self.sentences, min_count=1, threshold=.001, scoring=dumb_scorer)
        bigram.save(fpath)
        bigram_loaded = Phrases.load(fpath)
        test_sentences = [['graph', 'minors', 'survey', 'human', 'interface', 'system']]
        seen_scores = list(bigram_loaded.find_phrases(test_sentences).values())

        assert all(score == 1 for score in seen_scores)
        assert len(seen_scores) == 3  # 'graph minors', 'survey human' and 'interface system'
def setup(self, *data: np.ndarray) -> None:
    """Build the vocabulary and set the embeddings.

    Parameters
    ----------
    data : Iterable[str]
        List of input strings.
    """
    if self.embeddings is not None:
        # Load the embedding model
        embeddings_matrix = []
        if self.embeddings_format == 'glove':
            with temporary_file('temp.txt') as temp:
                glove2word2vec(self.embeddings, temp)
                model = KeyedVectors.load_word2vec_format(temp, binary=self.embeddings_binary)
        elif self.embeddings_format == 'word2vec':
            model = KeyedVectors.load_word2vec_format(self.embeddings, binary=self.embeddings_binary)
        elif self.embeddings_format == 'fasttext':
            model = fasttext.load_facebook_vectors(self.embeddings)
        elif self.embeddings_format == 'gensim':
            try:
                model = KeyedVectors.load(self.embeddings)
            except FileNotFoundError:
                model = api.load(self.embeddings)
        else:
            raise ValueError(
                "Only formats supported are glove, word2vec, fasttext and gensim")

        # Add embeddings for special tokens
        for special in self.specials:
            if special in model:
                embeddings_matrix.append(torch.tensor(model[special]))
            else:
                embeddings_matrix.append(torch.randn(model.vector_size))

    # Iterate over all examples
    examples = (e for dataset in data for e in dataset if dataset is not None)

    # Get the current last id
    index = len(self.vocab) - 1

    for example in examples:
        # Lowercase if requested
        example = example.lower() if self.lower else example
        # Tokenize and add to the vocabulary
        for token in self.tokenizer(example):
            if token not in self.vocab:
                if self.embeddings is not None:
                    if token in model:
                        self.vocab[token] = index = index + 1
                        embeddings_matrix.append(torch.tensor(model[token]))
                    else:
                        if self.unk_init_all:
                            # Give every OOV token its own randomly initialized embedding
                            self.vocab[token] = index = index + 1
                            embeddings_matrix.append(torch.randn(model.vector_size))
                        else:
                            # Collapse all OOV tokens to the same token id
                            self.vocab[token] = self.vocab[self.unk]
                            self.unk_numericals.add(self.vocab[token])
                else:
                    self.vocab[token] = index = index + 1

    if self.embeddings is not None:
        self.embedding_matrix = torch.stack(embeddings_matrix)