Example #1
import unittest

from textacy import Doc, compat

# TEXT is a multi-sentence sample passage defined at module level in the
# original test file; it is not reproduced here.


class DocMethodsTestCase(unittest.TestCase):
    def setUp(self):
        # Parse the sample text with spaCy's small English model.
        self.doc = Doc(TEXT.strip(), lang='en_core_web_sm')

    def test_n_tokens_and_sents(self):
        self.assertEqual(self.doc.n_tokens, 241)
        self.assertEqual(self.doc.n_sents, 8)

    def test_term_count(self):
        self.assertEqual(self.doc.count('statistical'), 3)
        self.assertEqual(self.doc.count('machine learning'), 2)
        self.assertEqual(self.doc.count('foo'), 0)

    def test_tokenized_text(self):
        tokenized_text = self.doc.tokenized_text
        self.assertIsInstance(tokenized_text, list)
        self.assertIsInstance(tokenized_text[0], list)
        self.assertIsInstance(tokenized_text[0][0], compat.unicode_)
        self.assertEqual(len(tokenized_text), self.doc.n_sents)

    def test_pos_tagged_text(self):
        pos_tagged_text = self.doc.pos_tagged_text
        self.assertIsInstance(pos_tagged_text, list)
        self.assertIsInstance(pos_tagged_text[0], list)
        self.assertIsInstance(pos_tagged_text[0][0], tuple)
        self.assertIsInstance(pos_tagged_text[0][0][0], compat.unicode_)
        self.assertEqual(len(pos_tagged_text), self.doc.n_sents)

    def test_to_terms_list(self):
        full_terms_list = list(self.doc.to_terms_list(as_strings=True))
        full_terms_list_ids = list(self.doc.to_terms_list(as_strings=False))
        self.assertEqual(len(full_terms_list), len(full_terms_list_ids))
        self.assertIsInstance(full_terms_list[0], compat.unicode_)
        self.assertIsInstance(full_terms_list_ids[0], int)
        self.assertNotEqual(
            full_terms_list[0],
            list(self.doc.to_terms_list(as_strings=True, normalize=False))[0])
        self.assertLess(len(list(self.doc.to_terms_list(ngrams=False))),
                        len(full_terms_list))
        self.assertLess(len(list(self.doc.to_terms_list(ngrams=1))),
                        len(full_terms_list))
        self.assertLess(len(list(self.doc.to_terms_list(ngrams=(1, 2)))),
                        len(full_terms_list))
        self.assertLess(len(list(self.doc.to_terms_list(ngrams=False))),
                        len(full_terms_list))

    def test_to_bag_of_words(self):
        bow = self.doc.to_bag_of_words(weighting='count')
        self.assertIsInstance(bow, dict)
        self.assertIsInstance(list(bow.keys())[0], compat.int_types)
        self.assertIsInstance(list(bow.values())[0], int)
        bow = self.doc.to_bag_of_words(weighting='binary')
        self.assertIsInstance(bow, dict)
        self.assertIsInstance(list(bow.keys())[0], compat.int_types)
        self.assertIsInstance(list(bow.values())[0], int)
        for value in list(bow.values())[0:10]:
            self.assertLess(value, 2)
        bow = self.doc.to_bag_of_words(weighting='freq')
        self.assertIsInstance(bow, dict)
        self.assertIsInstance(list(bow.keys())[0], compat.int_types)
        self.assertIsInstance(list(bow.values())[0], float)
        bow = self.doc.to_bag_of_words(as_strings=True)
        self.assertIsInstance(bow, dict)
        self.assertIsInstance(list(bow.keys())[0], compat.unicode_)
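
The test above exercises the core methods of what appears to be textacy's Doc class: token and sentence counts, term counting, tokenized and POS-tagged views, n-gram term lists, and bag-of-words export. Below is a minimal sketch of calling those same methods directly, outside unittest, assuming Doc is importable from the top-level textacy package as in these tests; the sample text and the combination of keyword arguments are illustrative, not taken from the TEXT fixture.

from textacy import Doc

# Illustrative stand-in for the TEXT fixture used by the tests.
sample = (
    "Statistical natural language processing relies on machine learning. "
    "Machine learning models are trained on large text corpora."
)

doc = Doc(sample, lang='en_core_web_sm')

print(doc.n_tokens, doc.n_sents)      # token and sentence counts
print(doc.count('machine learning'))  # occurrences of a (multi-word) term

sents = doc.tokenized_text   # list of sentences, each a list of token strings
tagged = doc.pos_tagged_text # list of sentences, each a list of (token, POS) tuples

terms = list(doc.to_terms_list(ngrams=(1, 2), as_strings=True))  # normalized terms
bow = doc.to_bag_of_words(weighting='count')                     # term id -> count
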
Example #2
import os
import shutil
import tempfile
import unittest

from textacy import Doc, compat

# TEXT is a multi-sentence sample passage defined at module level in the
# original test file; it is not reproduced here.


class DocMethodsTestCase(unittest.TestCase):
    def setUp(self):
        # Work in a temporary directory next to this file; it is removed
        # again in tearDown().
        self.tempdir = tempfile.mkdtemp(prefix='test_doc',
                                        dir=os.path.dirname(
                                            os.path.abspath(__file__)))
        self.doc = Doc(TEXT.strip(), lang='en', metadata={'foo': 'bar!'})

    def test_n_tokens_and_sents(self):
        self.assertEqual(self.doc.n_tokens, 241)
        self.assertEqual(self.doc.n_sents, 8)

    def test_term_count(self):
        self.assertEqual(self.doc.count('statistical'), 3)
        self.assertEqual(self.doc.count('machine learning'), 2)
        self.assertEqual(self.doc.count('foo'), 0)

    def test_tokenized_text(self):
        tokenized_text = self.doc.tokenized_text
        self.assertIsInstance(tokenized_text, list)
        self.assertIsInstance(tokenized_text[0], list)
        self.assertIsInstance(tokenized_text[0][0], compat.unicode_)
        self.assertEqual(len(tokenized_text), self.doc.n_sents)

    def test_pos_tagged_text(self):
        pos_tagged_text = self.doc.pos_tagged_text
        self.assertIsInstance(pos_tagged_text, list)
        self.assertIsInstance(pos_tagged_text[0], list)
        self.assertIsInstance(pos_tagged_text[0][0], tuple)
        self.assertIsInstance(pos_tagged_text[0][0][0], compat.unicode_)
        self.assertEqual(len(pos_tagged_text), self.doc.n_sents)

    def test_to_terms_list(self):
        full_terms_list = list(self.doc.to_terms_list(as_strings=True))
        full_terms_list_ids = list(self.doc.to_terms_list(as_strings=False))
        self.assertEqual(len(full_terms_list), len(full_terms_list_ids))
        self.assertIsInstance(full_terms_list[0], compat.unicode_)
        self.assertIsInstance(full_terms_list_ids[0], compat.int_types)
        self.assertNotEqual(
            full_terms_list[0],
            list(self.doc.to_terms_list(as_strings=True, normalize=False))[0])
        self.assertLess(len(list(self.doc.to_terms_list(ngrams=False))),
                        len(full_terms_list))
        self.assertLess(len(list(self.doc.to_terms_list(ngrams=1))),
                        len(full_terms_list))
        self.assertLess(len(list(self.doc.to_terms_list(ngrams=(1, 2)))),
                        len(full_terms_list))
        self.assertLess(len(list(self.doc.to_terms_list(ngrams=False))),
                        len(full_terms_list))

    def test_to_bag_of_words(self):
        bow = self.doc.to_bag_of_words(weighting='count')
        self.assertIsInstance(bow, dict)
        self.assertIsInstance(list(bow.keys())[0], compat.int_types)
        self.assertIsInstance(list(bow.values())[0], int)
        bow = self.doc.to_bag_of_words(weighting='binary')
        self.assertIsInstance(bow, dict)
        self.assertIsInstance(list(bow.keys())[0], compat.int_types)
        self.assertIsInstance(list(bow.values())[0], int)
        for value in list(bow.values())[0:10]:
            self.assertLess(value, 2)
        bow = self.doc.to_bag_of_words(weighting='freq')
        self.assertIsInstance(bow, dict)
        self.assertIsInstance(list(bow.keys())[0], compat.int_types)
        self.assertIsInstance(list(bow.values())[0], float)
        bow = self.doc.to_bag_of_words(as_strings=True)
        self.assertIsInstance(bow, dict)
        self.assertIsInstance(list(bow.keys())[0], compat.unicode_)

    def test_doc_save_and_load(self):
        filepath = os.path.join(self.tempdir, 'test_doc_save_and_load.pkl')
        self.doc.save(filepath)
        new_doc = Doc.load(filepath)
        self.assertIsInstance(new_doc, Doc)
        self.assertEqual(len(new_doc), len(self.doc))
        self.assertEqual(new_doc.lang, self.doc.lang)
        self.assertEqual(new_doc.metadata, self.doc.metadata)

    def tearDown(self):
        shutil.rmtree(self.tempdir)
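
Example #2 additionally covers attached metadata and an on-disk round trip via Doc.save() and Doc.load(). Here is a hedged sketch of that round trip outside the test harness, under the same assumptions as above; the file name and metadata key are made up for illustration.

import os
import tempfile

from textacy import Doc

doc = Doc("A short example document. It has two sentences.",
          lang='en', metadata={'source': 'example'})

# Serialize to disk and read it back; per the assertions above, the length,
# lang, and metadata all survive the round trip.
tmpdir = tempfile.mkdtemp(prefix='doc_roundtrip')
path = os.path.join(tmpdir, 'doc.pkl')
doc.save(path)
restored = Doc.load(path)

assert len(restored) == len(doc)
assert restored.lang == doc.lang
assert restored.metadata == doc.metadata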