class TestTfidfEmbeddingVectorizer(unittest.TestCase):
    """Tests for TfidfEmbeddingVectorizer: fit, transform, save, and load."""

    def setUp(self):
        # Corpus texts come from a fixture file that lives next to this module;
        # read as bytes and decode explicitly so the test is locale-independent.
        with open(
                os.path.join(os.path.dirname(__file__), 'corpus_txt_test.txt'),
                'rb') as f:
            texts = [t.decode('utf-8') for t in f.readlines()]
        self.corpus = Corpus(u'en', texts=texts)
        self.text = u"disease drop due economic disease else"
        self.doc = Doc(self.text)
        # Fake 300-dim word vectors with entries in [-2, 3) for every token.
        self.w2v = {
            w: 5 * np.random.random_sample((300,)) - 2
            for w in self.text.split()
        }

    def test_tfidf_vectorizer(self):
        vectorizer = TfidfEmbeddingVectorizer(self.w2v, self.corpus)
        vectorizer.fit()
        tokenized_doc = [
            list(
                self.doc.to_terms_list(ngrams=1, named_entities=True,
                                       as_strings=True))
        ]
        tfidf_doc = vectorizer.vectorizer.transform(tokenized_doc)
        # tf-idf weight of the term 'drop' in the (single) transformed doc.
        v = tfidf_doc[
            :, vectorizer.vectorizer.vocabulary_terms['drop']].toarray()[0]
        doc_term_matrix = vectorizer.doc_term_matrix
        # Round-trip the doc-term matrix through disk; force=True overwrites
        # the in-memory matrix with the loaded one.
        matrix_path = os.path.join(
            os.path.dirname(__file__), 'test_doc_term_matrix.npz')
        vectorizer.save(matrix_path)
        vectorizer.load(matrix_path, force=True)
        # np.asscalar() was deprecated in NumPy 1.16 and removed in 1.23;
        # ndarray.item() is the documented replacement.
        self.assertAlmostEqual(v.item(), 0.42063495, delta=0.05)
        self.assertEqual(vectorizer.transform(self.doc).shape, (300,))
        # Loading must reproduce the matrix that was saved.
        self.assertTrue(
            np.allclose(doc_term_matrix.toarray(),
                        vectorizer.doc_term_matrix.toarray()))
class DocMethodsTestCase(unittest.TestCase):
    """Tests for Doc accessors using the 'en_core_web_sm' spaCy pipeline.

    NOTE(review): a later class in this module reuses this exact class
    name, which shadows this definition under unittest discovery, so these
    tests likely never run — confirm and rename one of the two classes.
    """

    def setUp(self):
        self.doc = Doc(TEXT.strip(), lang='en_core_web_sm')

    def test_n_tokens_and_sents(self):
        self.assertEqual(self.doc.n_tokens, 241)
        self.assertEqual(self.doc.n_sents, 8)

    def test_term_count(self):
        self.assertEqual(self.doc.count('statistical'), 3)
        self.assertEqual(self.doc.count('machine learning'), 2)
        self.assertEqual(self.doc.count('foo'), 0)

    def test_tokenized_text(self):
        # Expected shape: list of sentences, each a list of unicode tokens.
        tokenized_text = self.doc.tokenized_text
        self.assertIsInstance(tokenized_text, list)
        self.assertIsInstance(tokenized_text[0], list)
        self.assertIsInstance(tokenized_text[0][0], compat.unicode_)
        self.assertEqual(len(tokenized_text), self.doc.n_sents)

    def test_pos_tagged_text(self):
        # Expected shape: list of sentences, each a list of (token, tag) tuples.
        pos_tagged_text = self.doc.pos_tagged_text
        self.assertIsInstance(pos_tagged_text, list)
        self.assertIsInstance(pos_tagged_text[0], list)
        self.assertIsInstance(pos_tagged_text[0][0], tuple)
        self.assertIsInstance(pos_tagged_text[0][0][0], compat.unicode_)
        self.assertEqual(len(pos_tagged_text), self.doc.n_sents)

    def test_to_terms_list(self):
        full_terms_list = list(self.doc.to_terms_list(as_strings=True))
        full_terms_list_ids = list(self.doc.to_terms_list(as_strings=False))
        self.assertEqual(len(full_terms_list), len(full_terms_list_ids))
        self.assertIsInstance(full_terms_list[0], compat.unicode_)
        # Use compat.int_types rather than a bare name/``int`` so the check
        # matches the sibling test class and covers Py2 ``long`` ids.
        self.assertIsInstance(full_terms_list_ids[0], compat.int_types)
        self.assertNotEqual(
            full_terms_list[0],
            list(self.doc.to_terms_list(as_strings=True,
                                        normalize=False))[0])
        # Any restriction of the ngram range must yield fewer terms than the
        # default; the original repeated the ngrams=False case twice.
        for ngrams in (False, 1, (1, 2)):
            self.assertLess(
                len(list(self.doc.to_terms_list(ngrams=ngrams))),
                len(full_terms_list))

    def test_to_bag_of_words(self):
        bow = self.doc.to_bag_of_words(weighting='count')
        self.assertIsInstance(bow, dict)
        self.assertIsInstance(list(bow.keys())[0], compat.int_types)
        self.assertIsInstance(list(bow.values())[0], int)
        bow = self.doc.to_bag_of_words(weighting='binary')
        self.assertIsInstance(bow, dict)
        self.assertIsInstance(list(bow.keys())[0], compat.int_types)
        self.assertIsInstance(list(bow.values())[0], int)
        # Binary weighting only ever yields 0 or 1.
        for value in list(bow.values())[0:10]:
            self.assertLess(value, 2)
        bow = self.doc.to_bag_of_words(weighting='freq')
        self.assertIsInstance(bow, dict)
        self.assertIsInstance(list(bow.keys())[0], compat.int_types)
        self.assertIsInstance(list(bow.values())[0], float)
        bow = self.doc.to_bag_of_words(as_strings=True)
        self.assertIsInstance(bow, dict)
        self.assertIsInstance(list(bow.keys())[0], compat.unicode_)
class DocMethodsTestCase(unittest.TestCase):
    """Tests for Doc accessors, metadata, and pickle save/load round-trips.

    NOTE(review): this class shares its name with an earlier class in this
    module and therefore shadows it under unittest discovery — confirm and
    rename one of the two classes.
    """

    def setUp(self):
        # Temp dir is created next to this file so save/load paths stay local;
        # tearDown removes it.
        self.tempdir = tempfile.mkdtemp(
            prefix='test_doc',
            dir=os.path.dirname(os.path.abspath(__file__)))
        self.doc = Doc(TEXT.strip(), lang='en', metadata={'foo': 'bar!'})

    def test_n_tokens_and_sents(self):
        self.assertEqual(self.doc.n_tokens, 241)
        self.assertEqual(self.doc.n_sents, 8)

    def test_term_count(self):
        self.assertEqual(self.doc.count('statistical'), 3)
        self.assertEqual(self.doc.count('machine learning'), 2)
        self.assertEqual(self.doc.count('foo'), 0)

    def test_tokenized_text(self):
        # Expected shape: list of sentences, each a list of unicode tokens.
        tokenized_text = self.doc.tokenized_text
        self.assertIsInstance(tokenized_text, list)
        self.assertIsInstance(tokenized_text[0], list)
        self.assertIsInstance(tokenized_text[0][0], compat.unicode_)
        self.assertEqual(len(tokenized_text), self.doc.n_sents)

    def test_pos_tagged_text(self):
        # Expected shape: list of sentences, each a list of (token, tag) tuples.
        pos_tagged_text = self.doc.pos_tagged_text
        self.assertIsInstance(pos_tagged_text, list)
        self.assertIsInstance(pos_tagged_text[0], list)
        self.assertIsInstance(pos_tagged_text[0][0], tuple)
        self.assertIsInstance(pos_tagged_text[0][0][0], compat.unicode_)
        self.assertEqual(len(pos_tagged_text), self.doc.n_sents)

    def test_to_terms_list(self):
        full_terms_list = list(self.doc.to_terms_list(as_strings=True))
        full_terms_list_ids = list(self.doc.to_terms_list(as_strings=False))
        self.assertEqual(len(full_terms_list), len(full_terms_list_ids))
        self.assertIsInstance(full_terms_list[0], compat.unicode_)
        self.assertIsInstance(full_terms_list_ids[0], compat.int_types)
        self.assertNotEqual(
            full_terms_list[0],
            list(self.doc.to_terms_list(as_strings=True,
                                        normalize=False))[0])
        # Any restriction of the ngram range must yield fewer terms than the
        # default; the original repeated the ngrams=False case twice.
        for ngrams in (False, 1, (1, 2)):
            self.assertLess(
                len(list(self.doc.to_terms_list(ngrams=ngrams))),
                len(full_terms_list))

    def test_to_bag_of_words(self):
        bow = self.doc.to_bag_of_words(weighting='count')
        self.assertIsInstance(bow, dict)
        self.assertIsInstance(list(bow.keys())[0], compat.int_types)
        self.assertIsInstance(list(bow.values())[0], int)
        bow = self.doc.to_bag_of_words(weighting='binary')
        self.assertIsInstance(bow, dict)
        self.assertIsInstance(list(bow.keys())[0], compat.int_types)
        self.assertIsInstance(list(bow.values())[0], int)
        # Binary weighting only ever yields 0 or 1.
        for value in list(bow.values())[0:10]:
            self.assertLess(value, 2)
        bow = self.doc.to_bag_of_words(weighting='freq')
        self.assertIsInstance(bow, dict)
        self.assertIsInstance(list(bow.keys())[0], compat.int_types)
        self.assertIsInstance(list(bow.values())[0], float)
        bow = self.doc.to_bag_of_words(as_strings=True)
        self.assertIsInstance(bow, dict)
        self.assertIsInstance(list(bow.keys())[0], compat.unicode_)

    def test_doc_save_and_load(self):
        # Round-trip a Doc through disk and check identity-relevant fields.
        filepath = os.path.join(self.tempdir, 'test_doc_save_and_load.pkl')
        self.doc.save(filepath)
        new_doc = Doc.load(filepath)
        self.assertIsInstance(new_doc, Doc)
        self.assertEqual(len(new_doc), len(self.doc))
        self.assertEqual(new_doc.lang, self.doc.lang)
        self.assertEqual(new_doc.metadata, self.doc.metadata)

    def tearDown(self):
        shutil.rmtree(self.tempdir)