Exemplo n.º 1
0
 def testPipeline(self):
     """Fit an SklRpModel + logistic-regression pipeline and check its training accuracy.

     The ``mini_newsgroup`` fixture is a zlib-compressed pickle whose loaded
     object exposes ``data`` (raw document strings) and ``target`` (labels).
     The pipeline is fitted and scored on the same data; the test passes when
     the resulting score is greater than 0.40.
     """
     numpy.random.seed(0)  # set fixed seed to get similar values every time
     model = SklRpModel(num_topics=2)

     # Hold the file open only for the read; decoding happens afterwards.
     with open(datapath('mini_newsgroup'), 'rb') as f:
         compressed_content = f.read()
     uncompressed_content = codecs.decode(compressed_content, 'zlib_codec')
     # NOTE(security): pickle.loads executes arbitrary code from its input. It
     # is used here on a fixture located through datapath(), which is assumed
     # to be trusted; never point this at externally supplied data.
     data = pickle.loads(uncompressed_content)

     # Tokenize every document once and reuse the tokens for both the
     # dictionary and the bag-of-words corpus.
     tokenized_docs = [text.split() for text in data.data]
     id2word = Dictionary(tokenized_docs)
     corpus = [id2word.doc2bow(tokens) for tokens in tokenized_docs]

     # Seed the classifier through its own ``random_state`` parameter: merely
     # constructing a numpy RandomState object and discarding it seeds nothing.
     clf = linear_model.LogisticRegression(penalty='l2', C=0.1, random_state=1)
     # Pipeline documents ``steps`` as a list of (name, estimator) tuples.
     text_rp = Pipeline([('features', model), ('classifier', clf)])
     text_rp.fit(corpus, data.target)
     score = text_rp.score(corpus, data.target)
     self.assertGreater(score, 0.40)
Exemplo n.º 2
0
 def testModelNotFitted(self):
     """Calling ``transform`` on a wrapper that was never fitted must raise NotFittedError."""
     unfitted_wrapper = SklRpModel(num_topics=2)
     first_doc = list(self.corpus)[0]
     with self.assertRaises(NotFittedError):
         unfitted_wrapper.transform(first_doc)
Exemplo n.º 3
0
 def setUp(self):
     """Seed NumPy's global RNG, load the ``testcorpus.mm`` fixture and fit a two-topic model on it."""
     numpy.random.seed(13)
     corpus_path = datapath('testcorpus.mm')
     self.corpus = mmcorpus.MmCorpus(corpus_path)
     self.model = SklRpModel(num_topics=2)
     self.model.fit(self.corpus)
Exemplo n.º 4
0
class TestSklRpModelWrapper(unittest.TestCase):
    """Tests for the scikit-learn style wrapper ``SklRpModel``.

    ``setUp`` fits a two-topic model on the ``testcorpus.mm`` fixture; the
    individual tests exercise transform, parameter handling, use inside a
    scikit-learn ``Pipeline``, pickling, and the not-fitted error path.
    """

    def setUp(self):
        """Seed NumPy's global RNG and fit a fresh two-topic model on the fixture corpus."""
        numpy.random.seed(13)
        self.model = SklRpModel(num_topics=2)
        self.corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm'))
        self.model.fit(self.corpus)

    def testTransform(self):
        """Check the shape of ``transform`` output for a batch of two documents and for one document."""
        # Materialize the corpus once instead of once per document lookup.
        documents = list(self.corpus)

        # transform two documents
        matrix = self.model.transform(documents[:2])
        self.assertEqual(matrix.shape[0], 2)
        self.assertEqual(matrix.shape[1], self.model.num_topics)

        # transform one document
        matrix = self.model.transform(documents[0])
        self.assertEqual(matrix.shape[0], 1)
        self.assertEqual(matrix.shape[1], self.model.num_topics)

    def testSetGetParams(self):
        """A value written with ``set_params`` must be reported back by ``get_params``."""
        # updating only one param
        self.model.set_params(num_topics=3)
        model_params = self.model.get_params()
        self.assertEqual(model_params["num_topics"], 3)

    def testPipeline(self):
        """Fit an SklRpModel + logistic-regression pipeline and check its training accuracy.

        The ``mini_newsgroup`` fixture is a zlib-compressed pickle whose loaded
        object exposes ``data`` (raw document strings) and ``target`` (labels).
        The pipeline is fitted and scored on the same data; the test passes
        when the resulting score is greater than 0.40.
        """
        numpy.random.seed(0)  # set fixed seed to get similar values every time
        model = SklRpModel(num_topics=2)

        # Hold the file open only for the read; decoding happens afterwards.
        with open(datapath('mini_newsgroup'), 'rb') as f:
            compressed_content = f.read()
        uncompressed_content = codecs.decode(compressed_content, 'zlib_codec')
        # NOTE(security): pickle.loads executes arbitrary code from its input.
        # It is used here on a fixture located through datapath(), which is
        # assumed to be trusted; never point this at externally supplied data.
        data = pickle.loads(uncompressed_content)

        # Tokenize every document once and reuse the tokens for both the
        # dictionary and the bag-of-words corpus.
        tokenized_docs = [text.split() for text in data.data]
        id2word = Dictionary(tokenized_docs)
        corpus = [id2word.doc2bow(tokens) for tokens in tokenized_docs]

        # Seed the classifier through its own ``random_state`` parameter:
        # merely constructing a numpy RandomState object and discarding it
        # seeds nothing.
        clf = linear_model.LogisticRegression(penalty='l2', C=0.1, random_state=1)
        # Pipeline documents ``steps`` as a list of (name, estimator) tuples.
        text_rp = Pipeline([('features', model), ('classifier', clf)])
        text_rp.fit(corpus, data.target)
        score = text_rp.score(corpus, data.target)
        self.assertGreater(score, 0.40)

    def testPersistence(self):
        """Round-trip the fitted model through pickle and compare ``transform`` outputs."""
        model_dump = pickle.dumps(self.model)
        model_load = pickle.loads(model_dump)

        # Only the first document is needed, so do not build the whole list.
        doc = next(iter(self.corpus))
        loaded_transformed_vecs = model_load.transform(doc)

        # sanity check for transformation operation
        self.assertEqual(loaded_transformed_vecs.shape[0], 1)
        self.assertEqual(loaded_transformed_vecs.shape[1], model_load.num_topics)

        # comparing the original and loaded models
        original_transformed_vecs = self.model.transform(doc)
        # Compare element-wise without sorting: sorting would reorder rows
        # before the comparison and cannot order multi-element rows at all.
        passed = numpy.allclose(loaded_transformed_vecs, original_transformed_vecs, atol=1e-1)
        self.assertTrue(passed)

    def testModelNotFitted(self):
        """Calling ``transform`` on a wrapper that was never fitted must raise NotFittedError."""
        rpmodel_wrapper = SklRpModel(num_topics=2)
        # Only the first document is needed, so do not build the whole list.
        doc = next(iter(self.corpus))
        self.assertRaises(NotFittedError, rpmodel_wrapper.transform, doc)