def testPipeline(self):
    """Fit an LSI + logistic-regression pipeline and check its training score.

    Loads the zlib-compressed, pickled ``mini_newsgroup`` fixture, converts
    its documents to bag-of-words vectors, fits a
    ``SklLsiModel -> LogisticRegression`` pipeline on them and asserts that
    the pipeline's score on the same data is greater than 0.50.
    """
    model = SklLsiModel(num_topics=2)

    with open(datapath('mini_newsgroup'), 'rb') as f:
        compressed_content = f.read()
    uncompressed_content = codecs.decode(compressed_content, 'zlib_codec')
    # NOTE: pickle.loads must never see untrusted input; this payload is
    # assumed to be a trusted fixture resolved through datapath().
    data = pickle.loads(uncompressed_content)

    # Tokenise once and reuse the tokens for both the dictionary and the
    # bag-of-words corpus (presumably data.data is a sequence of raw strings).
    tokenized_docs = [doc.split() for doc in data.data]
    id2word = Dictionary(tokenized_docs)
    corpus = [id2word.doc2bow(tokens) for tokens in tokenized_docs]

    # Seed the global NumPy generator so results are repeatable. Merely
    # constructing a RandomState and discarding it leaves the global state
    # untouched.
    numpy.random.seed(1)

    clf = linear_model.LogisticRegression(penalty='l2', C=0.1)
    # Pipeline documents ``steps`` as a list of (name, estimator) tuples.
    text_lsi = Pipeline([('features', model), ('classifier', clf)])
    text_lsi.fit(corpus, data.target)
    score = text_lsi.score(corpus, data.target)
    self.assertGreater(score, 0.50)
def testModelNotFitted(self):
    """Calling ``transform`` on a wrapper that was never fitted raises NotFittedError."""
    unfitted = SklLsiModel(num_topics=2, id2word=dictionary)
    query_bow = unfitted.id2word.doc2bow(['graph', 'eulerian'])
    with self.assertRaises(NotFittedError):
        unfitted.transform(query_bow)
def setUp(self):
    """Seed NumPy and fit a fresh two-topic ``SklLsiModel`` on ``corpus``."""
    # A fixed seed keeps the fitted values similar from run to run.
    numpy.random.seed(0)
    wrapper_kwargs = {'id2word': dictionary, 'num_topics': 2}
    self.model = SklLsiModel(**wrapper_kwargs)
    self.model.fit(corpus)
class TestSklLsiModelWrapper(unittest.TestCase):
    """Tests for the scikit-learn style ``SklLsiModel`` wrapper.

    Every test starts from a two-topic model fitted on the module-level
    ``corpus`` / ``dictionary`` (see ``setUp``).
    """

    def setUp(self):
        """Seed NumPy and fit a fresh two-topic model on ``corpus``."""
        numpy.random.seed(0)  # set fixed seed to get similar values everytime
        self.model = SklLsiModel(id2word=dictionary, num_topics=2)
        self.model.fit(corpus)

    def testTransform(self):
        """``transform`` yields one row per document and one column per topic."""
        # A single bag-of-words document produces a single row.
        texts_new = ['graph', 'eulerian']
        bow = self.model.id2word.doc2bow(texts_new)
        matrix = self.model.transform(bow)
        self.assertEqual(matrix.shape[0], 1)
        self.assertEqual(matrix.shape[1], self.model.num_topics)

        # A list of documents produces one row per document.
        texts_new = [['graph', 'eulerian'], ['server', 'flow'], ['path', 'system']]
        bow = [self.model.id2word.doc2bow(text) for text in texts_new]
        matrix = self.model.transform(bow)
        self.assertEqual(matrix.shape[0], 3)
        self.assertEqual(matrix.shape[1], self.model.num_topics)

    def testPartialFit(self):
        """Repeated ``partial_fit`` calls keep the first document near the expected vector."""
        for _ in range(10):
            self.model.partial_fit(X=corpus)  # fit against the model again
        doc = list(corpus)[0]  # transform only the first document
        transformed = self.model.transform(doc)
        expected = numpy.array([1.39, 1e-12])
        # Values are sorted before comparing, so the check ignores topic order.
        # NOTE(review): atol=1 is very loose relative to the expected values.
        passed = numpy.allclose(sorted(transformed[0]), sorted(expected), atol=1)
        self.assertTrue(passed)

    def testPipeline(self):
        """Fit an LSI + logistic-regression pipeline and check its training score.

        Loads the zlib-compressed, pickled ``mini_newsgroup`` fixture, converts
        its documents to bag-of-words vectors, fits a
        ``SklLsiModel -> LogisticRegression`` pipeline on them and asserts that
        the pipeline's score on the same data is greater than 0.50.
        """
        model = SklLsiModel(num_topics=2)

        with open(datapath('mini_newsgroup'), 'rb') as f:
            compressed_content = f.read()
        uncompressed_content = codecs.decode(compressed_content, 'zlib_codec')
        # NOTE: pickle.loads must never see untrusted input; this payload is
        # assumed to be a trusted fixture resolved through datapath().
        data = pickle.loads(uncompressed_content)

        # Tokenise once and reuse the tokens for both the dictionary and the
        # bag-of-words corpus (presumably data.data is a sequence of raw
        # strings). Local names avoid shadowing the module-level ``corpus``.
        tokenized_docs = [doc.split() for doc in data.data]
        id2word = Dictionary(tokenized_docs)
        newsgroup_corpus = [id2word.doc2bow(tokens) for tokens in tokenized_docs]

        # Seed the global NumPy generator so results are repeatable. Merely
        # constructing a RandomState and discarding it leaves the global
        # state untouched.
        numpy.random.seed(1)

        clf = linear_model.LogisticRegression(penalty='l2', C=0.1)
        # Pipeline documents ``steps`` as a list of (name, estimator) tuples.
        text_lsi = Pipeline([('features', model), ('classifier', clf)])
        text_lsi.fit(newsgroup_corpus, data.target)
        score = text_lsi.score(newsgroup_corpus, data.target)
        self.assertGreater(score, 0.50)

    def testSetGetParams(self):
        """``set_params`` updates are reflected by ``get_params``."""
        # updating only one param
        self.model.set_params(num_topics=3)
        model_params = self.model.get_params()
        self.assertEqual(model_params["num_topics"], 3)

        # updating multiple params
        param_dict = {"chunksize": 10000, "decay": 0.9}
        self.model.set_params(**param_dict)
        model_params = self.model.get_params()
        for key, value in param_dict.items():
            self.assertEqual(model_params[key], value)

    def testPersistence(self):
        """A pickled and unpickled model transforms like the original."""
        model_dump = pickle.dumps(self.model)
        model_load = pickle.loads(model_dump)

        texts_new = ['graph', 'eulerian']
        loaded_bow = model_load.id2word.doc2bow(texts_new)
        loaded_matrix = model_load.transform(loaded_bow)

        # sanity check for transformation operation
        self.assertEqual(loaded_matrix.shape[0], 1)
        self.assertEqual(loaded_matrix.shape[1], model_load.num_topics)

        # comparing the original and loaded models
        original_bow = self.model.id2word.doc2bow(texts_new)
        original_matrix = self.model.transform(original_bow)
        # Compare the matrices directly: sorted() over a 2-D array orders rows,
        # not values, so it adds nothing for a single-row result.
        passed = numpy.allclose(loaded_matrix, original_matrix, atol=1e-1)
        self.assertTrue(passed)

    def testModelNotFitted(self):
        """Calling ``transform`` on a wrapper that was never fitted raises NotFittedError."""
        lsi_wrapper = SklLsiModel(id2word=dictionary, num_topics=2)
        texts_new = ['graph', 'eulerian']
        bow = lsi_wrapper.id2word.doc2bow(texts_new)
        self.assertRaises(NotFittedError, lsi_wrapper.transform, bow)