Example #1
    def test_online_learning_fromfile(self):
        with temporary_file(get_tmpfile('gensim_fasttext1.tst')) as corpus_file, \
                temporary_file(get_tmpfile('gensim_fasttext2.tst')) as new_corpus_file:
            utils.save_as_line_sentence(sentences, corpus_file)
            utils.save_as_line_sentence(new_sentences, new_corpus_file)

            model_hs = FT_gensim(corpus_file=corpus_file, size=10, min_count=1, seed=42, hs=1, negative=0)
            self.assertEqual(len(model_hs.wv.vocab), 12)
            self.assertEqual(model_hs.wv.vocab['graph'].count, 3)
            model_hs.build_vocab(corpus_file=new_corpus_file, update=True)  # update vocab
            self.assertEqual(len(model_hs.wv.vocab), 14)
            self.assertEqual(model_hs.wv.vocab['graph'].count, 4)
            self.assertEqual(model_hs.wv.vocab['artificial'].count, 4)
Example #2
    def test_online_learning_after_save_fromfile(self):
        with temporary_file(get_tmpfile('gensim_fasttext1.tst')) as corpus_file, \
                temporary_file(get_tmpfile('gensim_fasttext2.tst')) as new_corpus_file:
            utils.save_as_line_sentence(sentences, corpus_file)
            utils.save_as_line_sentence(new_sentences, new_corpus_file)

            tmpf = get_tmpfile('gensim_fasttext.tst')
            model_neg = FT_gensim(corpus_file=corpus_file, size=10, min_count=0, seed=42, hs=0, negative=5)
            model_neg.save(tmpf)
            model_neg = FT_gensim.load(tmpf)
            self.assertEqual(len(model_neg.wv.vocab), 12)
            model_neg.build_vocab(corpus_file=new_corpus_file, update=True)  # update vocab
            model_neg.train(corpus_file=new_corpus_file, total_words=model_neg.corpus_total_words,
                            epochs=model_neg.iter)
            self.assertEqual(len(model_neg.wv.vocab), 14)
Example #3
    def test_training_fromfile(self):
        with temporary_file(get_tmpfile('gensim_fasttext.tst')) as corpus_file:
            utils.save_as_line_sentence(sentences, corpus_file)

            model = FT_gensim(size=10, min_count=1, hs=1, negative=0, seed=42, workers=1)
            model.build_vocab(corpus_file=corpus_file)
            self.model_sanity(model)

            model.train(corpus_file=corpus_file, total_words=model.corpus_total_words, epochs=model.iter)
            sims = model.most_similar('graph', topn=10)

            self.assertEqual(model.wv.syn0.shape, (12, 10))
            self.assertEqual(len(model.wv.vocab), 12)
            self.assertEqual(model.wv.syn0_vocab.shape[1], 10)
            self.assertEqual(model.wv.syn0_ngrams.shape[1], 10)
            self.model_sanity(model)

            # test querying for "most similar" by vector
            graph_vector = model.wv.syn0norm[model.wv.vocab['graph'].index]
            sims2 = model.most_similar(positive=[graph_vector], topn=11)
            sims2 = [(w, sim) for w, sim in sims2 if w != 'graph']  # ignore 'graph' itself
            self.assertEqual(sims, sims2)

            # verify in-vocabulary and oov word vector retrieval
            invocab_vec = model['minors']  # invocab word
            self.assertEqual(len(invocab_vec), 10)

            oov_vec = model['minor']  # oov word
            self.assertEqual(len(oov_vec), 10)
Example #4
    def test_sg_neg_training_fromfile(self):
        with temporary_file(get_tmpfile('gensim_fasttext.tst')) as corpus_file:
            model_gensim = FT_gensim(
                size=50, sg=1, cbow_mean=1, alpha=0.025, window=5, hs=0, negative=5,
                min_count=5, iter=5, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6,
                sorted_vocab=1, workers=1, min_alpha=0.0)

            lee_data = LineSentence(datapath('lee_background.cor'))
            utils.save_as_line_sentence(lee_data, corpus_file)

            model_gensim.build_vocab(corpus_file=corpus_file)
            orig0 = np.copy(model_gensim.wv.vectors[0])
            model_gensim.train(corpus_file=corpus_file,
                               total_words=model_gensim.corpus_total_words,
                               epochs=model_gensim.epochs)
            self.assertFalse((orig0 == model_gensim.wv.vectors[0]).all())  # vector should vary after training

            sims_gensim = model_gensim.wv.most_similar('night', topn=10)
            sims_gensim_words = [word for (word, distance) in sims_gensim]  # get similar words
            expected_sims_words = [
                u'night.',
                u'night,',
                u'eight',
                u'overnight',
                u'overnight.',
                u'month',
                u'land',
                u'firm',
                u'singles',
                u'death']
            overlap_count = len(set(sims_gensim_words).intersection(expected_sims_words))
            self.assertGreaterEqual(overlap_count, 2)
Example #5
 def testSaveLoad(self):
     """ Saving and loading a Phraser object."""
     with temporary_file("test.pkl") as fpath:
         bigram = Phraser(Phrases(self.sentences, min_count=1, threshold=1))
         bigram.save(fpath)
         bigram_loaded = Phraser.load(fpath)
         self.assertEqual(
             bigram_loaded[['graph', 'minors', 'survey', 'human', 'interface', 'system']],
             ['graph_minors', 'survey', 'human_interface', 'system'])
Example #6
    def test_persistence_fromfile(self):
        """Test storing/loading the entire model."""
        with temporary_file(get_tmpfile('gensim_word2vec.tst')) as corpus_file:
            save_lee_corpus_as_line_sentence(corpus_file)

            tmpf = get_tmpfile('gensim_doc2vec.tst')
            model = doc2vec.Doc2Vec(corpus_file=corpus_file, min_count=1)
            model.save(tmpf)
            self.models_equal(model, doc2vec.Doc2Vec.load(tmpf))
Example #7
 def test_dmc_neg_fromfile(self):
     """Test DBOW doc2vec training."""
     with temporary_file(get_tmpfile('gensim_word2vec.tst')) as corpus_file:
         save_lee_corpus_as_line_sentence(corpus_file)
         model = doc2vec.Doc2Vec(
             corpus_file=corpus_file, dm=1, dm_concat=1, vector_size=24, window=4, hs=0,
             negative=10, alpha=0.05, min_count=2, epochs=20
         )
         self.model_sanity(model)
Example #8
    def test_persistence_fromfile(self):
        """Test storing/loading the entire model."""
        with temporary_file(get_tmpfile('gensim_doc2vec_corpus.tst')) as corpus_file:
            save_lee_corpus_as_line_sentence(corpus_file)

            tmpf = get_tmpfile('gensim_doc2vec.tst')
            model = doc2vec.Doc2Vec(corpus_file=corpus_file, min_count=1)
            model.save(tmpf)
            self.models_equal(model, doc2vec.Doc2Vec.load(tmpf))
Example #9
 def testSaveLoad(self):
     """ Saving and loading a Phraser object."""
     with temporary_file("test.pkl") as fpath:
         bigram = Phraser(Phrases(self.sentences, min_count=1, threshold=1))
         bigram.save(fpath)
         bigram_loaded = Phraser.load(fpath)
         self.assertEqual(
             bigram_loaded[[
                 'graph', 'minors', 'survey', 'human', 'interface', 'system'
             ]], ['graph_minors', 'survey', 'human_interface', 'system'])
Example #10
    def testSaveLoadCustomScorer(self):
        """Saving and loading a Phraser object with a custom scorer """

        with temporary_file("test.pkl") as fpath:
            bigram = Phraser(
                Phrases(self.sentences, min_count=1, threshold=.001, scoring=dumb_scorer))
            bigram.save(fpath)
            bigram_loaded = Phraser.load(fpath)
            # we don't do much with scoring, just verify it's the one expected
            self.assertEqual(bigram_loaded.scoring, dumb_scorer)
Example #11
 def test_dbow_hs_fromfile(self):
     """Test DBOW doc2vec training."""
     with temporary_file(get_tmpfile('gensim_doc2vec.tst')) as corpus_file:
         save_lee_corpus_as_line_sentence(corpus_file)
         model = doc2vec.Doc2Vec(corpus_file=corpus_file,
                                 dm=0,
                                 hs=1,
                                 negative=0,
                                 min_count=2,
                                 epochs=20)
         self.model_sanity(model)
Example #12
 def test_dbow_neg_fromfile(self):
     """Test DBOW doc2vec training."""
     with temporary_file(get_tmpfile('gensim_word2vec.tst')) as corpus_file:
         save_lee_corpus_as_line_sentence(corpus_file)
         model = doc2vec.Doc2Vec(corpus_file=corpus_file,
                                 dm=0,
                                 hs=0,
                                 negative=10,
                                 min_count=2,
                                 epochs=20)
         self.model_sanity(model)
Example #13
 def test_save_load_with_connector_words(self):
     """Test saving and loading a FrozenPhrases object."""
     connector_words = frozenset({'of'})
     with temporary_file("test.pkl") as fpath:
         bigram = FrozenPhrases(
             Phrases(self.sentences,
                     min_count=1,
                     threshold=1,
                     connector_words=connector_words))
         bigram.save(fpath)
         bigram_loaded = FrozenPhrases.load(fpath)
         self.assertEqual(bigram_loaded.connector_words, connector_words)
Example #14
    def testSaveLoadCustomScorer(self):
        """Test saving and loading a FrozenPhrases object with a custom scorer."""

        with temporary_file("test.pkl") as fpath:
            bigram = FrozenPhrases(
                Phrases(self.sentences,
                        min_count=1,
                        threshold=.001,
                        scoring=dumb_scorer))
            bigram.save(fpath)
            bigram_loaded = FrozenPhrases.load(fpath)
            self.assertEqual(bigram_loaded.scoring, dumb_scorer)
Example #15
    def testSaveLoadCustomScorer(self):
        """Saving and loading a Phraser object with a custom scorer """

        with temporary_file("test.pkl") as fpath:
            bigram = Phraser(
                Phrases(self.sentences,
                        min_count=1,
                        threshold=.001,
                        scoring=dumb_scorer))
            bigram.save(fpath)
            bigram_loaded = Phraser.load(fpath)
            # we don't do much with scoring, just verify it's the one expected
            self.assertEqual(bigram_loaded.scoring, dumb_scorer)
Example #16
 def test_dbow_fixedwindowsize_fromfile(self):
     """Test DBOW doc2vec training with fixed window size, from file."""
     with temporary_file(get_tmpfile('gensim_doc2vec.tst')) as corpus_file:
         save_lee_corpus_as_line_sentence(corpus_file)
         model = doc2vec.Doc2Vec(corpus_file=corpus_file,
                                 vector_size=16,
                                 shrink_windows=False,
                                 dm=0,
                                 hs=0,
                                 negative=5,
                                 min_count=2,
                                 epochs=20)
         self.model_sanity(model)
Example #17
    def test_training_fromfile(self):
        """Test doc2vec training."""
        with temporary_file(get_tmpfile('gensim_word2vec.tst')) as corpus_file:
            save_lee_corpus_as_line_sentence(corpus_file)

            model = doc2vec.Doc2Vec(vector_size=100, min_count=2, epochs=20, workers=1)
            model.build_vocab(corpus_file=corpus_file)
            self.assertEqual(model.docvecs.vectors_docs.shape, (300, 100))
            model.train(corpus_file=corpus_file, total_words=model.corpus_total_words, epochs=model.epochs)

            self.model_sanity(model)

            model = doc2vec.Doc2Vec(corpus_file=corpus_file, vector_size=100, min_count=2, epochs=20, workers=1)
            self.model_sanity(model)
Example #18
    def testSaveLoadCustomScorer(self):
        """ saving and loading a Phrases object with a custom scorer """

        with temporary_file("test.pkl") as fpath:
            bigram = Phrases(self.sentences, min_count=1, threshold=.001, scoring=dumb_scorer)
            bigram.save(fpath)
            bigram_loaded = Phrases.load(fpath)
            seen_scores = []
            test_sentences = [['graph', 'minors', 'survey', 'human', 'interface', 'system']]
            for phrase, score in bigram_loaded.export_phrases(test_sentences):
                seen_scores.append(score)

            assert all(seen_scores)  # all scores 1
            assert len(seen_scores) == 3  # 'graph minors' and 'survey human' and 'interface system'
Example #19
    def test_persistence_fromfile(self):
        with temporary_file(get_tmpfile('gensim_fasttext1.tst')) as corpus_file:
            utils.save_as_line_sentence(sentences, corpus_file)

            tmpf = get_tmpfile('gensim_fasttext.tst')
            model = FT_gensim(corpus_file=corpus_file, min_count=1)
            model.save(tmpf)
            self.models_equal(model, FT_gensim.load(tmpf))
            #  test persistence of the KeyedVectors of a model
            wv = model.wv
            wv.save(tmpf)
            loaded_wv = FastTextKeyedVectors.load(tmpf)
            self.assertTrue(np.allclose(wv.syn0_ngrams, loaded_wv.syn0_ngrams))
            self.assertEqual(len(wv.vocab), len(loaded_wv.vocab))
Example #20
def get_embeddings(
    embeddings: str,
    embeddings_format: str = 'glove',
    embeddings_binary: bool = False,
) -> KeyedVectors:
    """
    Get the embeddings model and matrix used in the setup function.

    Parameters
    ----------
    embeddings : str
        Path to pretrained embeddings, or a gensim-data alias when
        using the 'gensim' format.
    embeddings_format : str, optional
        The format of the input embeddings, should be one of:
        'glove', 'word2vec', 'fasttext' or 'gensim'. The latter can
        be used to download embeddings hosted on gensim on the fly.
        See https://github.com/RaRe-Technologies/gensim-data
        for the list of available embedding aliases.
    embeddings_binary : bool, optional
        Whether the input embeddings are provided in binary format,
        by default False

    Returns
    -------
    KeyedVectors
        The embeddings object specified by the parameters.
    """
    model = None

    if embeddings_format == 'glove':
        with temporary_file('temp.txt') as temp:
            glove2word2vec(embeddings, temp)
            model = KeyedVectors.load_word2vec_format(temp,
                                                      binary=embeddings_binary)
    elif embeddings_format == 'word2vec':
        model = KeyedVectors.load_word2vec_format(embeddings,
                                                  binary=embeddings_binary)
    elif embeddings_format == 'fasttext':
        model = fasttext.load_facebook_vectors(embeddings)
    elif embeddings_format == 'gensim':
        try:
            model = KeyedVectors.load(embeddings)
        except FileNotFoundError:
            model = api.load(embeddings)
    else:
        raise ValueError(
            "Only formats supported are word2vec, fasttext and gensim")

    return model
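As a quick illustration, get_embeddings above might be called as in the following sketch. This is a hedged example: the local GloVe path is hypothetical, while 'glove-wiki-gigaword-50' is one of the aliases listed at https://github.com/RaRe-Technologies/gensim-data.

# Usage sketch for get_embeddings (assumes the function and its imports above).
# Load GloVe vectors from a hypothetical local file in the standard text format:
vectors = get_embeddings('glove.6B.50d.txt', embeddings_format='glove')

# Or fetch pretrained vectors through gensim-data by alias; KeyedVectors.load
# raises FileNotFoundError for a non-path string, so api.load takes over:
vectors = get_embeddings('glove-wiki-gigaword-50', embeddings_format='gensim')
print(vectors.most_similar('night', topn=3))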
Example #21
    def testSaveLoad(self):
        """ Saving and loading a Phrases object."""

        with temporary_file("test.pkl") as fpath:
            bigram = Phrases(self.sentences, min_count=1, threshold=1)
            bigram.save(fpath)
            bigram_loaded = Phrases.load(fpath)
            seen_scores = set()
            test_sentences = [['graph', 'minors', 'survey', 'human', 'interface', 'system']]
            for phrase, score in bigram_loaded.export_phrases(test_sentences):
                seen_scores.add(round(score, 3))

            assert seen_scores == set([
                5.167,  # score for graph minors
                3.444  # score for human interface
            ])
Example #22
    def testSaveLoad(self):
        """Test saving and loading a Phrases object."""
        with temporary_file("test.pkl") as fpath:
            bigram = Phrases(self.sentences, min_count=1, threshold=1)
            bigram.save(fpath)
            bigram_loaded = Phrases.load(fpath)
            test_sentences = [[
                'graph', 'minors', 'survey', 'human', 'interface', 'system'
            ]]
            seen_scores = set(
                round(score, 3) for score in bigram_loaded.find_phrases(
                    test_sentences).values())

            assert seen_scores == set([
                5.167,  # score for graph minors
                3.444  # score for human interface
            ])
Example #23
def example_1():
    """
    Example code from Gensim documentation on author-topic class.
    :return:
    """
    author2doc = {
        'john': [0, 1, 2, 3, 4, 5, 6],
        'jane': [2, 3, 4, 5, 6, 7, 8],
        'jack': [0, 2, 4, 6, 8]
    }

    corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm'))

    print("Corpus contents:")
    print(f"{corpus}\n")

    print(f"Documents in the corpus: ")
    for document in corpus:
        print(f"{document}")

    print("\nDictionary contents:")
    print(f"{common_dictionary}\n")
    print(f"Dictionary contents with word index value:")
    print(f"{common_dictionary.token2id}\n")

    with temporary_file("serialized") as s_path:
        model = AuthorTopicModel(corpus,
                                 author2doc=author2doc,
                                 id2word=common_dictionary,
                                 num_topics=4,
                                 serialized=True,
                                 serialization_path=s_path)

        model.update(
            corpus, author2doc
        )  # update the author-topic model with additional documents

    # construct vectors for authors
    author_vecs = [
        model.get_author_topics(author) for author in model.id2author.values()
    ]
    print(f"Vectors for authors:")
    print(f"{author_vecs}\n")
Example #24
    def testSaveLoadCustomScorer(self):
        """Test saving and loading a Phrases object with a custom scorer."""
        with temporary_file("test.pkl") as fpath:
            bigram = Phrases(self.sentences,
                             min_count=1,
                             threshold=.001,
                             scoring=dumb_scorer)
            bigram.save(fpath)
            bigram_loaded = Phrases.load(fpath)
            test_sentences = [[
                'graph', 'minors', 'survey', 'human', 'interface', 'system'
            ]]
            seen_scores = list(
                bigram_loaded.find_phrases(test_sentences).values())

            assert all(score == 1 for score in seen_scores)
            assert len(
                seen_scores
            ) == 3  # 'graph minors' and 'survey human' and 'interface system'
Example #25
    def testSaveLoadCustomScorer(self):
        """ saving and loading a Phrases object with a custom scorer """

        with temporary_file("test.pkl") as fpath:
            bigram = Phrases(self.sentences,
                             min_count=1,
                             threshold=.001,
                             scoring=dumb_scorer)
            bigram.save(fpath)
            bigram_loaded = Phrases.load(fpath)
            seen_scores = []
            test_sentences = [[
                'graph', 'minors', 'survey', 'human', 'interface', 'system'
            ]]
            for phrase, score in bigram_loaded.export_phrases(test_sentences):
                seen_scores.append(score)

            assert all(seen_scores)  # all scores 1
            assert len(
                seen_scores
            ) == 3  # 'graph minors' and 'survey human' and 'interface system'
Example #26
File: text.py Project: sean-asapp/flambe
    def setup(self, *data: np.ndarray) -> None:
        """Build the vocabulary and sets embeddings.

        Parameters
        ----------
        *data : np.ndarray
            One or more arrays of input strings.

        """
        if self.embeddings is not None:
            # Load embedding model
            embeddings_matrix = []
            if self.embeddings_format == 'glove':
                with temporary_file('temp.txt') as temp:
                    glove2word2vec(self.embeddings, temp)
                    model = KeyedVectors.load_word2vec_format(
                        temp, binary=self.embeddings_binary)
            elif self.embeddings_format == 'word2vec':
                model = KeyedVectors.load_word2vec_format(
                    self.embeddings, binary=self.embeddings_binary)
            elif self.embeddings_format == 'fasttext':
                model = fasttext.load_facebook_vectors(self.embeddings)
            elif self.embeddings_format == 'gensim':
                try:
                    model = KeyedVectors.load(self.embeddings)
                except FileNotFoundError:
                    model = api.load(self.embeddings)
            else:
                raise ValueError(
                    "Only formats supported are word2vec, fasttext and gensim")

            # Add embeddings for special tokens
            for special in self.specials:
                if special in model:
                    embeddings_matrix.append(torch.tensor(model[special]))
                else:
                    embeddings_matrix.append(torch.randn(model.vector_size))

        # Iterate over all examples
        examples = (e for dataset in data for e in dataset
                    if dataset is not None)

        # Get current last id
        index = len(self.vocab) - 1

        for example in examples:
            # Lowercase if requested
            example = example.lower() if self.lower else example
            # Tokenize and add to vocabulary
            for token in self.tokenizer(example):
                if token not in self.vocab:
                    if self.embeddings is not None:
                        if token in model:
                            self.vocab[token] = index = index + 1
                            embeddings_matrix.append(torch.tensor(
                                model[token]))
                        else:
                            if self.unk_init_all:
                                # Give every OOV its own embedding
                                self.vocab[token] = index = index + 1
                                embeddings_matrix.append(
                                    torch.randn(model.vector_size))
                            else:
                                # Collapse all OOVs onto the same token id
                                self.vocab[token] = self.vocab[self.unk]
                            self.unk_numericals.add(self.vocab[token])
                    else:
                        self.vocab[token] = index = index + 1

        if self.embeddings is not None:
            self.embedding_matrix = torch.stack(embeddings_matrix)
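To see the vocabulary-building and OOV-collapsing step above in isolation, here is a minimal, self-contained sketch; the dict-of-tensors model and all tokens are made-up stand-ins, not flambe's API.

# Toy reconstruction of the OOV handling in setup above (hypothetical data).
import torch

model = {'graph': torch.zeros(4), 'minors': torch.ones(4)}  # stand-in embeddings
vector_size = 4
unk = '<unk>'
vocab = {unk: 0}
unk_numericals = set()
embeddings_matrix = [torch.randn(vector_size)]  # random vector for <unk>

index = len(vocab) - 1
for token in ['graph', 'minors', 'survey']:
    if token not in vocab:
        if token in model:
            vocab[token] = index = index + 1
            embeddings_matrix.append(model[token])
        else:
            # Collapse every OOV token onto the shared <unk> id.
            vocab[token] = vocab[unk]
            unk_numericals.add(vocab[token])

embedding_matrix = torch.stack(embeddings_matrix)
print(vocab)                   # {'<unk>': 0, 'graph': 1, 'minors': 2, 'survey': 0}
print(embedding_matrix.shape)  # torch.Size([3, 4])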
Example #27
 def test_dbow_neg_fromfile(self):
     """Test DBOW doc2vec training."""
     with temporary_file(get_tmpfile('gensim_word2vec.tst')) as corpus_file:
         save_lee_corpus_as_line_sentence(corpus_file)
         model = doc2vec.Doc2Vec(corpus_file=corpus_file, dm=0, hs=0, negative=10, min_count=2, epochs=20)
         self.model_sanity(model)