def setUp(self):
    """Create the LdaSeq wrapper used by the tests and fit it on ``corpus_ldaseq``."""
    wrapper_options = {
        'id2word': dictionary_ldaseq,
        'num_topics': 2,
        'time_slice': [10, 10, 11],
        'initialize': 'own',
        'sstats': sstats_ldaseq,
    }
    self.model = SklLdaSeqModel(**wrapper_options)
    self.model.fit(corpus_ldaseq)
Пример #2
0
 def testPipeline(self):
     """Check that the wrapper works as a feature step inside a scikit-learn Pipeline.

     Loads the ``mini_newsgroup`` fixture (a zlib-compressed pickle), builds a
     bag-of-words corpus from its first two documents, fits an
     LdaSeq -> LogisticRegression pipeline and asserts that the score on the
     training data is above 0.50.
     """
     numpy.random.seed(0)  # set fixed seed to get similar values everytime

     # Only the raw read needs the open handle; decode after it is closed.
     with open(datapath('mini_newsgroup'), 'rb') as f:
         compressed_content = f.read()
     uncompressed_content = codecs.decode(compressed_content, 'zlib_codec')
     # NOTE: pickle is acceptable here only because the input is a test fixture
     # read from the local data path, not external data.
     data = pickle.loads(uncompressed_content)

     test_data = data.data[0:2]
     test_target = data.target[0:2]
     id2word = Dictionary(map(lambda x: x.split(), test_data))
     corpus = [id2word.doc2bow(i.split()) for i in test_data]

     model = SklLdaSeqModel(id2word=id2word, num_topics=2, time_slice=[1, 1, 1], initialize='gensim')
     clf = linear_model.LogisticRegression(penalty='l2', C=0.1)
     # Pipeline documents `steps` as a *list* of (name, estimator) tuples; a
     # tuple of steps is not supported by current scikit-learn versions.
     text_ldaseq = Pipeline([('features', model), ('classifier', clf)])
     text_ldaseq.fit(corpus, test_target)
     score = text_ldaseq.score(corpus, test_target)
     self.assertGreater(score, 0.50)
Пример #3
0
 def testModelNotFitted(self):
     """Calling ``transform`` on a wrapper that was never fitted must raise NotFittedError."""
     unfitted = SklLdaSeqModel(num_topics=2)
     first_doc = list(corpus_ldaseq)[0]
     with self.assertRaises(NotFittedError):
         unfitted.transform(first_doc)
Пример #4
0
 def setUp(self):
     """Create the wrapper under test and fit it on ``corpus_ldaseq``."""
     self.model = SklLdaSeqModel(
         id2word=dictionary_ldaseq,
         num_topics=2,
         time_slice=[10, 10, 11],
         initialize='own',
         sstats=sstats_ldaseq,
     )
     self.model.fit(corpus_ldaseq)
Пример #5
0
class TestSklLdaSeqModelWrapper(unittest.TestCase):
    """Tests for the scikit-learn style wrapper ``SklLdaSeqModel``."""

    def setUp(self):
        """Create a wrapper with two topics and fit it on ``corpus_ldaseq``."""
        self.model = SklLdaSeqModel(
            id2word=dictionary_ldaseq,
            num_topics=2,
            time_slice=[10, 10, 11],
            initialize='own',
            sstats=sstats_ldaseq,
        )
        self.model.fit(corpus_ldaseq)

    def testTransform(self):
        """``transform`` returns one row per document and one column per topic."""
        documents = list(corpus_ldaseq)  # materialize once instead of per lookup

        # transforming two documents
        docs = [documents[0], documents[1]]
        transformed_vecs = self.model.transform(docs)
        self.assertEqual(transformed_vecs.shape[0], 2)
        self.assertEqual(transformed_vecs.shape[1], self.model.num_topics)

        # transforming one document
        doc = documents[0]
        transformed_vecs = self.model.transform(doc)
        self.assertEqual(transformed_vecs.shape[0], 1)
        self.assertEqual(transformed_vecs.shape[1], self.model.num_topics)

    def testSetGetParams(self):
        """Values passed to ``set_params`` are reported back by ``get_params``."""
        # updating only one param
        self.model.set_params(num_topics=3)
        model_params = self.model.get_params()
        self.assertEqual(model_params["num_topics"], 3)

        # updating multiple params
        param_dict = {"passes": 20, "chunksize": 200}
        self.model.set_params(**param_dict)
        model_params = self.model.get_params()
        for key, expected in param_dict.items():
            self.assertEqual(model_params[key], expected)

    def testPipeline(self):
        """The wrapper can be used as a feature step inside a scikit-learn Pipeline.

        Loads the ``mini_newsgroup`` fixture (a zlib-compressed pickle), builds a
        bag-of-words corpus from its first two documents, fits an
        LdaSeq -> LogisticRegression pipeline and asserts that the score on the
        training data is above 0.50.
        """
        numpy.random.seed(0)  # set fixed seed to get similar values everytime

        # Only the raw read needs the open handle; decode after it is closed.
        with open(datapath('mini_newsgroup'), 'rb') as f:
            compressed_content = f.read()
        uncompressed_content = codecs.decode(compressed_content, 'zlib_codec')
        # NOTE: pickle is acceptable here only because the input is a test
        # fixture read from the local data path, not external data.
        data = pickle.loads(uncompressed_content)

        test_data = data.data[0:2]
        test_target = data.target[0:2]
        id2word = Dictionary(map(lambda x: x.split(), test_data))
        corpus = [id2word.doc2bow(i.split()) for i in test_data]

        model = SklLdaSeqModel(id2word=id2word, num_topics=2, time_slice=[1, 1, 1], initialize='gensim')
        clf = linear_model.LogisticRegression(penalty='l2', C=0.1)
        # Pipeline documents `steps` as a *list* of (name, estimator) tuples; a
        # tuple of steps is not supported by current scikit-learn versions.
        text_ldaseq = Pipeline([('features', model), ('classifier', clf)])
        text_ldaseq.fit(corpus, test_target)
        score = text_ldaseq.score(corpus, test_target)
        self.assertGreater(score, 0.50)

    def testPersistence(self):
        """A pickled and restored wrapper transforms a document like the original."""
        model_dump = pickle.dumps(self.model)
        model_load = pickle.loads(model_dump)

        doc = list(corpus_ldaseq)[0]
        loaded_transformed_vecs = model_load.transform(doc)

        # sanity check for transformation operation
        self.assertEqual(loaded_transformed_vecs.shape[0], 1)
        self.assertEqual(loaded_transformed_vecs.shape[1], model_load.num_topics)

        # comparing the original and loaded models
        original_transformed_vecs = self.model.transform(doc)
        passed = numpy.allclose(sorted(loaded_transformed_vecs), sorted(original_transformed_vecs), atol=1e-1)
        self.assertTrue(passed)

    def testModelNotFitted(self):
        """Calling ``transform`` on a wrapper that was never fitted raises NotFittedError."""
        ldaseq_wrapper = SklLdaSeqModel(num_topics=2)
        doc = list(corpus_ldaseq)[0]
        self.assertRaises(NotFittedError, ldaseq_wrapper.transform, doc)