def test_deterministic_dmc(self):
    """Two DM/concat par2vec models built with the same RNG seed must be equal."""
    # bigger, dmc: both models get exactly the same settings, seed and a
    # single worker, and each one reads a fresh DocsLeeCorpus instance.
    params = dict(dm=1, dm_concat=1, size=24, window=4, hs=1, negative=3, seed=42, workers=1)
    first = par2vec.Doc2Vec(DocsLeeCorpus(), **params)
    second = par2vec.Doc2Vec(DocsLeeCorpus(), **params)
    self.models_equal(first, second)
def test_training_multi_n02(self):
    """Train par2vec with a list-valued size, stepwise and in one call, and compare."""
    docs = DocsLeeCorpus()
    settings = dict(size=[75, 100], hs=0, negative=10, min_count=2, iter=20)

    # Stepwise path: construct without a corpus, build the vocabulary, train.
    stepwise = par2vec.Doc2Vec(**settings)
    stepwise.build_vocab(docs)
    self.assertEqual(stepwise.docvecs.doctag_syn0.shape, (300, 75))
    stepwise.train(docs)
    # The model_sanity check is currently disabled for this configuration.

    # build vocab and train in one step; must be the same as above
    one_shot = par2vec.Doc2Vec(docs, **settings)
    self.models_equal(stepwise, one_shot)
def test_mixed_tag_types(self):
    """Ensure alternating int/string tags don't share indexes in doctag_syn0."""
    # Each document carries two tags: its integer position and its first word.
    mixed_tag_corpus = [
        par2vec.TaggedDocument(words, [i, words[0]])
        for i, words in enumerate(raw_sentences)
    ]
    model = par2vec.Doc2Vec()
    model.build_vocab(mixed_tag_corpus)
    # Expected rows: one per sentence plus one per string doctag
    # (original note: 9 sentences, 7 unique first tokens).
    # NOTE(review): assumes `sentences` has one entry per item of
    # `raw_sentences` -- confirm against the module-level fixtures.
    expected_length = len(sentences) + len(model.docvecs.doctags)
    # Use assertEqual: the assertEquals alias is deprecated and was removed
    # from unittest in Python 3.12.
    self.assertEqual(len(model.docvecs.doctag_syn0), expected_length)
def test_int_doctags(self):
    """Test par2vec doctag alternatives: lookups on a corpus from DocsLeeCorpus()."""
    model = par2vec.Doc2Vec(min_count=1)
    model.build_vocab(DocsLeeCorpus())

    doc_vectors = model.docvecs
    self.assertEqual(len(doc_vectors.doctag_syn0), 300)
    self.assertEqual(doc_vectors[0].shape, (300,))

    # Looking up '_*0' on the model itself must raise KeyError.
    model_lookup = model.__getitem__
    self.assertRaises(KeyError, model_lookup, '_*0')
def test_missing_string_doctag(self):
    """Test par2vec doctag alternatives: an unknown string doctag raises KeyError."""
    docs = list(DocsLeeCorpus(True))
    # force duplicated tags by prepending a copy of the first ten documents
    docs = docs[:10] + docs

    model = par2vec.Doc2Vec(min_count=1)
    model.build_vocab(docs)

    doctag_lookup = model.docvecs.__getitem__
    self.assertRaises(KeyError, doctag_lookup, 'not_a_tag')
def test_parallel(self):
    """Test par2vec parallel training with 2 and 4 workers."""
    if par2vec.FAST_VERSION < 0:
        # don't test the plain NumPy version for parallelism (too slow)
        return

    repeated_docs = utils.RepeatCorpus(DocsLeeCorpus(), 10000)
    for worker_count in (2, 4):
        trained = par2vec.Doc2Vec(repeated_docs, workers=worker_count)
        self.model_sanity(trained)
def test_load_mmap(self):
    """Test storing/loading the entire model, including loading with mmap='r'."""
    model = par2vec.Doc2Vec(sentences, min_count=1)

    # test storing the internal arrays into separate files
    model.save(testfile(), sep_limit=0)
    plain_copy = par2vec.Doc2Vec.load(testfile())
    self.models_equal(model, plain_copy)

    # make sure mmaping the arrays back works, too
    mmapped_copy = par2vec.Doc2Vec.load(testfile(), mmap='r')
    self.models_equal(model, mmapped_copy)
def test_string_doctags(self):
    """Test par2vec doctag alternatives: string doctags and their integer indexes."""
    docs = list(DocsLeeCorpus(True))
    # force duplicated tags by prepending a copy of the first ten documents
    docs = docs[:10] + docs

    model = par2vec.Doc2Vec(min_count=1)
    model.build_vocab(docs)
    doc_vectors = model.docvecs

    self.assertEqual(len(doc_vectors.doctag_syn0), 300)
    self.assertEqual(doc_vectors[0].shape, (300,))
    self.assertEqual(doc_vectors['_*0'].shape, (300,))
    # the string tag '_*0' and the index 0 must yield the same vector
    self.assertTrue(all(doc_vectors['_*0'] == doc_vectors[0]))

    # every stored offset must be smaller than the number of doctags
    largest_offset = max(tag.offset for tag in doc_vectors.doctags.values())
    self.assertTrue(largest_offset < len(doc_vectors.doctags))

    # every string key must map to an index inside doctag_syn0
    largest_index = max(doc_vectors._int_index(key) for key in doc_vectors.doctags.keys())
    self.assertTrue(largest_index < len(doc_vectors.doctag_syn0))

    # verify docvecs.most_similar() returns string doctags rather than indexes
    expected_tag = doc_vectors.offset2doctag[0]
    neighbours = doc_vectors.most_similar([doc_vectors[0]])
    self.assertEqual(expected_tag, neighbours[0][0])
def test_dmm_hs(self):
    """Test DM/mean par2vec training (hs=1, negative=0) via model_sanity."""
    options = dict(dm=1, dm_mean=1, size=24, window=4, hs=1, negative=0,
                   alpha=0.05, min_count=2, iter=20)
    trained = par2vec.Doc2Vec(list_corpus, **options)
    self.model_sanity(trained)
def test_dmm_neg_multi_n03(self):
    """Test DM/mean par2vec training with a list-valued size; only checks that it runs."""
    # Fails most of the time, just check if runs
    options = dict(size=[100, 150], dm=1, dm_mean=1, window=4, hs=0, negative=10,
                   alpha=0.05, min_count=2, iter=20)
    model = par2vec.Doc2Vec(list_corpus, **options)
def test_persistence(self):
    """Test storing/loading the entire model: a saved model reloads as an equal one."""
    original = par2vec.Doc2Vec(DocsLeeCorpus(), min_count=1)
    original.save(testfile())
    reloaded = par2vec.Doc2Vec.load(testfile())
    self.models_equal(original, reloaded)
def test_dmc_hs_n02(self):
    """Test DM/concatenate par2vec training (hs=1, negative=0); only checks that it runs."""
    options = dict(dm=1, dm_concat=1, size=24, window=4, hs=1, negative=0,
                   alpha=0.05, min_count=2, iter=20)
    # No assertion follows: constructing the model on list_corpus is the whole test.
    model = par2vec.Doc2Vec(list_corpus, **options)
def test_dmc_hs_n03(self):
    """Test DM/concatenate par2vec training (hs=1, negative=0) via model_sanity_n03."""
    options = dict(dm=1, dm_concat=1, size=24, window=4, hs=1, negative=0,
                   alpha=0.05, min_count=2, iter=20)
    trained = par2vec.Doc2Vec(list_corpus, **options)
    # Fails constantly on this one
    self.model_sanity_n03(trained)
def test_dbow_neg_n03(self):
    """Test DBOW par2vec training (dm=0, hs=0, negative=10) via model_sanity_n03."""
    options = dict(dm=0, hs=0, negative=10, min_count=2, iter=20)
    trained = par2vec.Doc2Vec(list_corpus, **options)
    self.model_sanity_n03(trained)
def test_deterministic_neg(self):
    """Two par2vec models (hs=0, negative=3) built with the same RNG seed must be equal."""
    # neg: identical settings, fixed seed and a single worker for both models;
    # each one reads a fresh DocsLeeCorpus instance.
    params = dict(hs=0, negative=3, seed=42, workers=1)
    first = par2vec.Doc2Vec(DocsLeeCorpus(), **params)
    second = par2vec.Doc2Vec(DocsLeeCorpus(), **params)
    self.models_equal(first, second)
def test_dms_neg_n03(self):
    """Test DM/sum par2vec training (dm_mean=0, hs=0, negative=10) via model_sanity_n03."""
    options = dict(dm=1, dm_mean=0, size=24, window=4, hs=0, negative=10,
                   alpha=0.05, min_count=2, iter=20)
    trained = par2vec.Doc2Vec(list_corpus, **options)
    self.model_sanity_n03(trained)
def test_dmc_neg_n03(self):
    """Test DM/concatenate par2vec training (hs=0, negative=10) via model_sanity_n03."""
    options = dict(dm=1, dm_concat=1, size=24, window=4, hs=0, negative=10,
                   alpha=0.05, min_count=2, iter=20)
    trained = par2vec.Doc2Vec(list_corpus, **options)
    # May fail sometimes
    self.model_sanity_n03(trained)