예제 #1
0
 def test_deterministic_dmc(self):
     """Two DM/concatenate models built with the same seed must be equal.

     Both models use ``seed=42`` and a single worker, and are compared
     with ``self.models_equal``.
     """
     # bigger, dmc
     settings = dict(dm=1, dm_concat=1, size=24, window=4, hs=1, negative=3,
                     seed=42, workers=1)
     first = par2vec.Doc2Vec(DocsLeeCorpus(), **settings)
     second = par2vec.Doc2Vec(DocsLeeCorpus(), **settings)
     self.models_equal(first, second)
예제 #2
0
    def test_training_multi_n02(self):
        """Compare step-by-step training with one-step construction.

        A model configured with a list-valued ``size`` is first built via
        ``build_vocab`` + ``train`` and then compared against a model that
        receives the corpus directly in its constructor.
        """
        docs = DocsLeeCorpus()
        settings = dict(size=[75, 100], hs=0, negative=10, min_count=2, iter=20)

        # Step-by-step: vocabulary first, then an explicit training pass.
        stepwise = par2vec.Doc2Vec(**settings)
        stepwise.build_vocab(docs)
        self.assertEqual(stepwise.docvecs.doctag_syn0.shape, (300, 75))
        stepwise.train(docs)

        # NOTE: self.model_sanity(...) is currently disabled for this test.

        # Handing the corpus to the constructor builds the vocabulary and
        # trains in one step; the result must be the same as above.
        one_shot = par2vec.Doc2Vec(docs, **settings)
        self.models_equal(stepwise, one_shot)
예제 #3
0
 def test_mixed_tag_types(self):
     """Ensure alternating int/string tags don't share indexes in doctag_syn0.

     Every document is tagged twice: with its integer position and with its
     first word. After ``build_vocab`` the number of rows in ``doctag_syn0``
     must equal the number of sentences plus the number of entries in
     ``docvecs.doctags``.
     """
     mixed_tag_corpus = [
         par2vec.TaggedDocument(words, [i, words[0]])
         for i, words in enumerate(raw_sentences)
     ]
     model = par2vec.Doc2Vec()
     model.build_vocab(mixed_tag_corpus)
     # 9 sentences, 7 unique first tokens
     expected_length = len(sentences) + len(model.docvecs.doctags)
     # assertEqual, not assertEquals: the alias is deprecated and no longer
     # exists on TestCase in Python 3.12+.
     self.assertEqual(len(model.docvecs.doctag_syn0), expected_length)
예제 #4
0
    def test_int_doctags(self):
        """Check doctag storage and lookup after building the Lee vocabulary.

        Expects 300 doctag vectors, a ``(300,)``-shaped vector at index 0,
        and a ``KeyError`` when the model itself is indexed with ``'_*0'``.
        """
        docs = DocsLeeCorpus()

        model = par2vec.Doc2Vec(min_count=1)
        model.build_vocab(docs)

        docvecs = model.docvecs
        self.assertEqual(len(docvecs.doctag_syn0), 300)
        self.assertEqual(docvecs[0].shape, (300,))
        # '_*0' is looked up on the model, not on docvecs, and must be rejected
        with self.assertRaises(KeyError):
            model.__getitem__('_*0')
예제 #5
0
    def test_missing_string_doctag(self):
        """Looking up an unknown string tag in docvecs must raise KeyError."""
        docs = list(DocsLeeCorpus(True))
        # prepend the first ten documents again to force duplicated tags
        docs = docs[:10] + docs

        model = par2vec.Doc2Vec(min_count=1)
        model.build_vocab(docs)

        with self.assertRaises(KeyError):
            model.docvecs.__getitem__('not_a_tag')
예제 #6
0
    def test_parallel(self):
        """Test par2vec parallel training.

        Builds a model on a repeated Lee corpus with 2 and then 4 workers and
        runs ``self.model_sanity`` on each. When ``par2vec.FAST_VERSION`` is
        negative (plain NumPy version) the test is skipped because it would
        be too slow.
        """
        if par2vec.FAST_VERSION < 0:
            # Report an explicit skip rather than returning silently, so the
            # test run does not show a pass for a test that checked nothing.
            self.skipTest("plain NumPy version is too slow for the parallelism test")

        corpus = utils.RepeatCorpus(DocsLeeCorpus(), 10000)

        for workers in [2, 4]:
            model = par2vec.Doc2Vec(corpus, workers=workers)
            self.model_sanity(model)
예제 #7
0
    def test_load_mmap(self):
        """Round-trip a model saved with separate array files, with and without mmap."""
        original = par2vec.Doc2Vec(sentences, min_count=1)

        # sep_limit=0: store the internal arrays into separate files
        original.save(testfile(), sep_limit=0)
        reloaded = par2vec.Doc2Vec.load(testfile())
        self.models_equal(original, reloaded)

        # loading the same files with the arrays mmap'ed (mmap='r') must match too
        mmapped = par2vec.Doc2Vec.load(testfile(), mmap='r')
        self.models_equal(original, mmapped)
예제 #8
0
    def test_string_doctags(self):
        """Check doctag lookups on a corpus built with ``DocsLeeCorpus(True)``.

        Verifies vector counts and shapes, that the key ``'_*0'`` and index 0
        give the same vector, that offsets and integer indexes stay in range,
        and that ``most_similar`` reports the doctag stored at offset 0.
        """
        docs = list(DocsLeeCorpus(True))
        # prepend the first ten documents again to force duplicated tags
        docs = docs[:10] + docs

        model = par2vec.Doc2Vec(min_count=1)
        model.build_vocab(docs)
        docvecs = model.docvecs

        self.assertEqual(len(docvecs.doctag_syn0), 300)
        self.assertEqual(docvecs[0].shape, (300,))
        self.assertEqual(docvecs['_*0'].shape, (300,))
        # the string key and the integer index must yield identical vectors
        self.assertTrue(all(docvecs['_*0'] == docvecs[0]))

        # every stored offset must be smaller than the number of doctags
        highest_offset = max(tag.offset for tag in docvecs.doctags.values())
        self.assertTrue(highest_offset < len(docvecs.doctags))

        # every key must map to a valid row of doctag_syn0
        highest_index = max(docvecs._int_index(key) for key in docvecs.doctags.keys())
        self.assertTrue(highest_index < len(docvecs.doctag_syn0))

        # verify docvecs.most_similar() returns string doctags rather than indexes
        expected_tag = docvecs.offset2doctag[0]
        nearest = docvecs.most_similar([docvecs[0]])
        self.assertEqual(expected_tag, nearest[0][0])
예제 #9
0
 def test_dmm_hs(self):
     """Train a DM/mean par2vec model (hs=1, negative=0) and sanity-check it."""
     settings = dict(dm=1, dm_mean=1, size=24, window=4, hs=1, negative=0,
                     alpha=0.05, min_count=2, iter=20)
     trained = par2vec.Doc2Vec(list_corpus, **settings)
     self.model_sanity(trained)
예제 #10
0
 def test_dmm_neg_multi_n03(self):
     """Smoke-test DM/mean par2vec training with a list-valued ``size``.

     No assertions are made; the test only checks that building the model
     from ``list_corpus`` completes without raising.
     """
     # Fails most of the time, so just check that it runs.
     settings = dict(size=[100, 150], dm=1, dm_mean=1, window=4, hs=0,
                     negative=10, alpha=0.05, min_count=2, iter=20)
     par2vec.Doc2Vec(list_corpus, **settings)
예제 #11
0
 def test_persistence(self):
     """Save a model, load it back and compare it with the original."""
     original = par2vec.Doc2Vec(DocsLeeCorpus(), min_count=1)
     original.save(testfile())
     restored = par2vec.Doc2Vec.load(testfile())
     self.models_equal(original, restored)
예제 #12
0
 def test_dmc_hs_n02(self):
     """Smoke-test DM/concatenate par2vec training (hs=1, negative=0).

     No assertions are made; the test only checks that building the model
     from ``list_corpus`` completes without raising.
     """
     settings = dict(dm=1, dm_concat=1, size=24, window=4, hs=1, negative=0,
                     alpha=0.05, min_count=2, iter=20)
     par2vec.Doc2Vec(list_corpus, **settings)
예제 #13
0
 def test_dmc_hs_n03(self):
     """Train a DM/concatenate model (hs=1, negative=0) and run the n03 sanity check."""
     settings = dict(dm=1, dm_concat=1, size=24, window=4, hs=1, negative=0,
                     alpha=0.05, min_count=2, iter=20)
     trained = par2vec.Doc2Vec(list_corpus, **settings)
     # Fails constantly on this one
     self.model_sanity_n03(trained)
예제 #14
0
 def test_dbow_neg_n03(self):
     """Train a DBOW model (dm=0, hs=0, negative=10) and run the n03 sanity check."""
     settings = dict(dm=0, hs=0, negative=10, min_count=2, iter=20)
     trained = par2vec.Doc2Vec(list_corpus, **settings)
     self.model_sanity_n03(trained)
예제 #15
0
 def test_deterministic_neg(self):
     """Two models built with hs=0, negative=3 and the same seed must be equal.

     Both models use ``seed=42`` and a single worker, and are compared
     with ``self.models_equal``.
     """
     # neg
     settings = dict(hs=0, negative=3, seed=42, workers=1)
     first = par2vec.Doc2Vec(DocsLeeCorpus(), **settings)
     second = par2vec.Doc2Vec(DocsLeeCorpus(), **settings)
     self.models_equal(first, second)
예제 #16
0
 def test_dms_neg_n03(self):
     """Train a DM/sum model (dm_mean=0, hs=0, negative=10) and run the n03 sanity check."""
     settings = dict(dm=1, dm_mean=0, size=24, window=4, hs=0, negative=10,
                     alpha=0.05, min_count=2, iter=20)
     trained = par2vec.Doc2Vec(list_corpus, **settings)
     self.model_sanity_n03(trained)
예제 #17
0
 def test_dmc_neg_n03(self):
     """Train a DM/concatenate model (hs=0, negative=10) and run the n03 sanity check."""
     settings = dict(dm=1, dm_concat=1, size=24, window=4, hs=0, negative=10,
                     alpha=0.05, min_count=2, iter=20)
     trained = par2vec.Doc2Vec(list_corpus, **settings)
     # May fail sometimes
     self.model_sanity_n03(trained)