예제 #1
0
 def testPipeline(self):
     """Check that the LSI wrapper works as the feature step of an sklearn Pipeline.

     Loads the ``mini_newsgroup`` fixture, converts its texts to bag-of-words,
     fits an ``SklLsiModel`` -> ``LogisticRegression`` pipeline on them and
     asserts that the accuracy on the same data is above 0.50.
     """
     model = SklLsiModel(num_topics=2)

     # The fixture is a zlib-compressed pickle. Unpickling is acceptable here
     # only because the file is trusted test data, never external input.
     with open(datapath('mini_newsgroup'), 'rb') as f:
         compressed_content = f.read()
     data = pickle.loads(codecs.decode(compressed_content, 'zlib_codec'))

     # Tokenise once and reuse the tokens for both the dictionary and the corpus.
     tokenized_docs = [doc.split() for doc in data.data]
     id2word = Dictionary(tokenized_docs)
     bow_corpus = [id2word.doc2bow(tokens) for tokens in tokenized_docs]

     # Seed numpy's global RNG so repeated runs give the same result.
     # (Merely constructing numpy.random.mtrand.RandomState(1) and discarding
     # it would leave the global generator unseeded.)
     numpy.random.seed(1)

     clf = linear_model.LogisticRegression(penalty='l2', C=0.1)
     # Pipeline documents ``steps`` as a list of (name, estimator) tuples.
     text_lsi = Pipeline([('features', model), ('classifier', clf)])
     text_lsi.fit(bow_corpus, data.target)
     score = text_lsi.score(bow_corpus, data.target)
     self.assertGreater(score, 0.50)
예제 #2
0
 def testModelNotFitted(self):
     """Calling transform() on a wrapper that was never fitted raises NotFittedError."""
     unfitted = SklLsiModel(id2word=dictionary, num_topics=2)
     query_bow = unfitted.id2word.doc2bow(['graph', 'eulerian'])
     with self.assertRaises(NotFittedError):
         unfitted.transform(query_bow)
예제 #3
0
 def setUp(self):
     """Seed numpy's global RNG, then build a two-topic model and fit it on ``corpus``."""
     # Seeding before fitting keeps the fitted values similar between runs.
     numpy.random.seed(0)
     self.model = SklLsiModel(num_topics=2, id2word=dictionary)
     self.model.fit(corpus)
예제 #4
0
class TestSklLsiModelWrapper(unittest.TestCase):
    """Tests for the scikit-learn style wrapper ``SklLsiModel``.

    Every test starts from a two-topic model fitted on the module-level
    ``corpus`` with the module-level ``dictionary`` (see ``setUp``).
    """

    def setUp(self):
        """Seed numpy's global RNG, then build and fit the model used by the tests."""
        numpy.random.seed(0)  # set fixed seed to get similar values everytime
        self.model = SklLsiModel(id2word=dictionary, num_topics=2)
        self.model.fit(corpus)

    def testTransform(self):
        """transform() returns one row per document and ``num_topics`` columns."""
        # A single bag-of-words document yields a single row.
        bow = self.model.id2word.doc2bow(['graph', 'eulerian'])
        matrix = self.model.transform(bow)
        self.assertEqual(matrix.shape[0], 1)
        self.assertEqual(matrix.shape[1], self.model.num_topics)

        # A list of documents yields one row per document.
        texts_new = [['graph', 'eulerian'], ['server', 'flow'], ['path', 'system']]
        bow = [self.model.id2word.doc2bow(text) for text in texts_new]
        matrix = self.model.transform(bow)
        self.assertEqual(matrix.shape[0], 3)
        self.assertEqual(matrix.shape[1], self.model.num_topics)

    def testPartialFit(self):
        """Repeated partial_fit() calls keep the first document's projection near the expected values."""
        doc = list(corpus)[0]  # transform only the first document
        for _ in range(10):
            self.model.partial_fit(X=corpus)  # fit against the model again
            transformed = self.model.transform(doc)
        expected = numpy.array([1.39, 1e-12])
        # Both vectors are sorted before comparison, so the order of the
        # components does not matter; the tolerance is deliberately loose.
        passed = numpy.allclose(sorted(transformed[0]), sorted(expected), atol=1)
        self.assertTrue(passed)

    def testPipeline(self):
        """The wrapper works as the feature step of an sklearn Pipeline.

        Loads the ``mini_newsgroup`` fixture, converts its texts to
        bag-of-words, fits an ``SklLsiModel`` -> ``LogisticRegression``
        pipeline and asserts that the accuracy on the same data is above 0.50.
        """
        model = SklLsiModel(num_topics=2)

        # The fixture is a zlib-compressed pickle. Unpickling is acceptable
        # here only because the file is trusted test data.
        with open(datapath('mini_newsgroup'), 'rb') as f:
            compressed_content = f.read()
        data = pickle.loads(codecs.decode(compressed_content, 'zlib_codec'))

        # Tokenise once and reuse the tokens for the dictionary and the corpus.
        tokenized_docs = [doc.split() for doc in data.data]
        id2word = Dictionary(tokenized_docs)
        bow_corpus = [id2word.doc2bow(tokens) for tokens in tokenized_docs]

        # Seed numpy's global RNG so repeated runs give the same result.
        # (Merely constructing numpy.random.mtrand.RandomState(1) and
        # discarding it would leave the global generator unseeded.)
        numpy.random.seed(1)

        clf = linear_model.LogisticRegression(penalty='l2', C=0.1)
        # Pipeline documents ``steps`` as a list of (name, estimator) tuples.
        text_lsi = Pipeline([('features', model), ('classifier', clf)])
        text_lsi.fit(bow_corpus, data.target)
        score = text_lsi.score(bow_corpus, data.target)
        self.assertGreater(score, 0.50)

    def testSetGetParams(self):
        """Values passed to set_params() are reported back by get_params()."""
        # updating only one param
        self.model.set_params(num_topics=3)
        model_params = self.model.get_params()
        self.assertEqual(model_params["num_topics"], 3)

        # updating multiple params
        param_dict = {"chunksize": 10000, "decay": 0.9}
        self.model.set_params(**param_dict)
        model_params = self.model.get_params()
        for key, value in param_dict.items():
            self.assertEqual(model_params[key], value)

    def testPersistence(self):
        """A pickled and unpickled model transforms a document like the original."""
        model_dump = pickle.dumps(self.model)
        model_load = pickle.loads(model_dump)

        texts_new = ['graph', 'eulerian']
        loaded_bow = model_load.id2word.doc2bow(texts_new)
        loaded_matrix = model_load.transform(loaded_bow)

        # sanity check for transformation operation
        self.assertEqual(loaded_matrix.shape[0], 1)
        self.assertEqual(loaded_matrix.shape[1], model_load.num_topics)

        # comparing the original and loaded models
        original_bow = self.model.id2word.doc2bow(texts_new)
        original_matrix = self.model.transform(original_bow)
        # Compare the matrices directly: sorted() over a 2-D array orders
        # rows, not values, and cannot compare rows once there are several.
        passed = numpy.allclose(loaded_matrix, original_matrix, atol=1e-1)
        self.assertTrue(passed)

    def testModelNotFitted(self):
        """Calling transform() on a wrapper that was never fitted raises NotFittedError."""
        lsi_wrapper = SklLsiModel(id2word=dictionary, num_topics=2)
        bow = lsi_wrapper.id2word.doc2bow(['graph', 'eulerian'])
        with self.assertRaises(NotFittedError):
            lsi_wrapper.transform(bow)