def setUp(self):
    # Fixture: build the scikit-learn wrapper around LdaSeqModel from the
    # module-level dictionary / sufficient statistics, then fit it on the
    # module-level corpus so that every test starts from a fitted model.
    wrapper_kwargs = dict(
        id2word=dictionary_ldaseq,
        num_topics=2,
        time_slice=[10, 10, 11],
        initialize='own',
        sstats=sstats_ldaseq,
    )
    fixture = SklLdaSeqModel(**wrapper_kwargs)
    self.model = fixture
    fixture.fit(corpus_ldaseq)
def testPipeline(self):
    # End-to-end check: the wrapper must work as the feature-extraction step
    # of a scikit-learn Pipeline feeding a logistic-regression classifier.
    numpy.random.seed(0)  # set fixed seed to get similar values every time

    with open(datapath('mini_newsgroup'), 'rb') as f:
        compressed_content = f.read()
    uncompressed_content = codecs.decode(compressed_content, 'zlib_codec')
    # The payload is read from the test-data directory (via `datapath`);
    # pickle must never be used on untrusted input.
    data = pickle.loads(uncompressed_content)

    # Only the first two documents (and their labels) are used.
    test_data = data.data[0:2]
    test_target = data.target[0:2]

    # Tokenise once and reuse the tokens for both the dictionary and the
    # bag-of-words corpus.
    tokenized = [text.split() for text in test_data]
    id2word = Dictionary(tokenized)
    corpus = [id2word.doc2bow(tokens) for tokens in tokenized]

    # NOTE(review): time_slice sums to 3 while at most 2 documents are
    # used here -- confirm this is intended.
    model = SklLdaSeqModel(id2word=id2word, num_topics=2, time_slice=[1, 1, 1], initialize='gensim')
    clf = linear_model.LogisticRegression(penalty='l2', C=0.1)

    # scikit-learn documents `steps` as a *list* of (name, estimator) tuples.
    text_ldaseq = Pipeline([('features', model), ('classifier', clf)])
    text_ldaseq.fit(corpus, test_target)

    score = text_ldaseq.score(corpus, test_target)
    self.assertGreater(score, 0.50)
def testModelNotFitted(self):
    # Calling transform() on a wrapper that was never fitted must raise
    # NotFittedError.
    unfitted = SklLdaSeqModel(num_topics=2)
    first_doc = list(corpus_ldaseq)[0]
    with self.assertRaises(NotFittedError):
        unfitted.transform(first_doc)
class TestSklLdaSeqModelWrapper(unittest.TestCase):
    """Tests for ``SklLdaSeqModel``: transform, params, pipeline use, pickling."""

    def setUp(self):
        # Every test starts from a wrapper fitted on the module-level corpus.
        self.model = SklLdaSeqModel(
            id2word=dictionary_ldaseq,
            num_topics=2,
            time_slice=[10, 10, 11],
            initialize='own',
            sstats=sstats_ldaseq,
        )
        self.model.fit(corpus_ldaseq)

    def testTransform(self):
        # Materialise the corpus once instead of once per indexed document.
        corpus = list(corpus_ldaseq)

        # transforming two documents
        docs = [corpus[0], corpus[1]]
        transformed_vecs = self.model.transform(docs)
        self.assertEqual(transformed_vecs.shape[0], 2)
        self.assertEqual(transformed_vecs.shape[1], self.model.num_topics)

        # transforming one document
        doc = corpus[0]
        transformed_vecs = self.model.transform(doc)
        self.assertEqual(transformed_vecs.shape[0], 1)
        self.assertEqual(transformed_vecs.shape[1], self.model.num_topics)

    def testSetGetParams(self):
        # updating only one param
        self.model.set_params(num_topics=3)
        model_params = self.model.get_params()
        self.assertEqual(model_params["num_topics"], 3)

        # updating multiple params
        param_dict = {"passes": 20, "chunksize": 200}
        self.model.set_params(**param_dict)
        model_params = self.model.get_params()
        for key, expected in param_dict.items():
            self.assertEqual(model_params[key], expected)

    def testPipeline(self):
        # The wrapper must work as the feature step of a scikit-learn Pipeline.
        numpy.random.seed(0)  # set fixed seed to get similar values every time

        with open(datapath('mini_newsgroup'), 'rb') as f:
            compressed_content = f.read()
        uncompressed_content = codecs.decode(compressed_content, 'zlib_codec')
        # The payload is read from the test-data directory (via `datapath`);
        # pickle must never be used on untrusted input.
        data = pickle.loads(uncompressed_content)

        # Only the first two documents (and their labels) are used.
        test_data = data.data[0:2]
        test_target = data.target[0:2]

        # Tokenise once; reuse the tokens for the dictionary and the corpus.
        tokenized = [text.split() for text in test_data]
        id2word = Dictionary(tokenized)
        corpus = [id2word.doc2bow(tokens) for tokens in tokenized]

        # NOTE(review): time_slice sums to 3 while at most 2 documents are
        # used here -- confirm this is intended.
        model = SklLdaSeqModel(id2word=id2word, num_topics=2, time_slice=[1, 1, 1], initialize='gensim')
        clf = linear_model.LogisticRegression(penalty='l2', C=0.1)

        # scikit-learn documents `steps` as a *list* of (name, estimator) tuples.
        text_ldaseq = Pipeline([('features', model), ('classifier', clf)])
        text_ldaseq.fit(corpus, test_target)

        score = text_ldaseq.score(corpus, test_target)
        self.assertGreater(score, 0.50)

    def testPersistence(self):
        # Round-trip the fitted wrapper through pickle.
        model_dump = pickle.dumps(self.model)
        model_load = pickle.loads(model_dump)

        doc = list(corpus_ldaseq)[0]
        loaded_transformed_vecs = model_load.transform(doc)

        # sanity check for transformation operation
        self.assertEqual(loaded_transformed_vecs.shape[0], 1)
        self.assertEqual(loaded_transformed_vecs.shape[1], model_load.num_topics)

        # comparing the original and loaded models; the output holds a single
        # row (asserted above), so the arrays are compared directly
        original_transformed_vecs = self.model.transform(doc)
        passed = numpy.allclose(loaded_transformed_vecs, original_transformed_vecs, atol=1e-1)
        self.assertTrue(passed)

    def testModelNotFitted(self):
        # transform() on a wrapper that was never fitted must raise.
        ldaseq_wrapper = SklLdaSeqModel(num_topics=2)
        doc = list(corpus_ldaseq)[0]
        with self.assertRaises(NotFittedError):
            ldaseq_wrapper.transform(doc)