class TestTfidfEmbeddingVectorizer(unittest.TestCase):
    """Tests the TfidfEmbeddingVectorizer fit/transform/save/load round-trip."""

    def setUp(self):
        # Build a small corpus from the fixture file, one document per line.
        fixture = os.path.join(os.path.dirname(__file__), 'corpus_txt_test.txt')
        with open(fixture, 'rb') as f:
            texts = [t.decode('utf-8') for t in f.readlines()]
        self.corpus = Corpus(u'en', texts=texts)
        self.text = u"disease drop due economic disease else"
        self.doc = Doc(self.text)
        # Fake 300-dim word vectors with entries uniformly sampled in [-2, 3).
        self.w2v = {
            w: 5 * np.random.random_sample((300, )) - 2
            for w in self.text.split()
        }

    def test_tfidf_vectorizer(self):
        vectorizer = TfidfEmbeddingVectorizer(self.w2v, self.corpus)
        vectorizer.fit()
        # Tokenize the doc the same way the vectorizer expects: a list of
        # per-document term lists (here a single document).
        tokenized_doc = [
            list(
                self.doc.to_terms_list(ngrams=1,
                                       named_entities=True,
                                       as_strings=True))
        ]
        tfidf_doc = vectorizer.vectorizer.transform(tokenized_doc)
        # tf-idf weight of the term 'drop' in the (single) transformed doc.
        v = tfidf_doc[:,
                      vectorizer.vectorizer.vocabulary_terms['drop']].toarray(
                      )[0]
        doc_term_matrix = vectorizer.doc_term_matrix
        # Round-trip the doc-term matrix through disk; force=True overwrites
        # the in-memory matrix with the loaded one.
        matrix_path = os.path.join(os.path.dirname(__file__),
                                   'test_doc_term_matrix.npz')
        vectorizer.save(matrix_path)
        vectorizer.load(matrix_path, force=True)
        # NOTE: np.asscalar was removed in NumPy 1.23; .item() is the
        # supported equivalent for extracting a Python scalar.
        self.assertAlmostEqual(v.item(), 0.42063495, delta=0.05)
        self.assertEqual(vectorizer.transform(self.doc).shape, (300, ))
        # The loaded matrix must equal the one computed before saving.
        self.assertTrue(
            np.allclose(doc_term_matrix.toarray(),
                        vectorizer.doc_term_matrix.toarray()))
# --- Exemplo n.º 2 (snippet-site separator; kept as comment) ---
class DocMethodsTestCase(unittest.TestCase):
    """Tests for core ``Doc`` accessors: counts, tokenization, terms, bags."""

    def setUp(self):
        self.doc = Doc(TEXT.strip(), lang='en_core_web_sm')

    def test_n_tokens_and_sents(self):
        # Counts are pinned to the fixture TEXT tokenized by en_core_web_sm.
        self.assertEqual(self.doc.n_tokens, 241)
        self.assertEqual(self.doc.n_sents, 8)

    def test_term_count(self):
        self.assertEqual(self.doc.count('statistical'), 3)
        self.assertEqual(self.doc.count('machine learning'), 2)
        # Absent terms count as zero rather than raising.
        self.assertEqual(self.doc.count('foo'), 0)

    def test_tokenized_text(self):
        # Expected shape: list of sentences, each a list of unicode tokens.
        tokenized_text = self.doc.tokenized_text
        self.assertIsInstance(tokenized_text, list)
        self.assertIsInstance(tokenized_text[0], list)
        self.assertIsInstance(tokenized_text[0][0], compat.unicode_)
        self.assertEqual(len(tokenized_text), self.doc.n_sents)

    def test_pos_tagged_text(self):
        # Expected shape: list of sentences, each a list of (token, tag) tuples.
        pos_tagged_text = self.doc.pos_tagged_text
        self.assertIsInstance(pos_tagged_text, list)
        self.assertIsInstance(pos_tagged_text[0], list)
        self.assertIsInstance(pos_tagged_text[0][0], tuple)
        self.assertIsInstance(pos_tagged_text[0][0][0], compat.unicode_)
        self.assertEqual(len(pos_tagged_text), self.doc.n_sents)

    def test_to_terms_list(self):
        full_terms_list = list(self.doc.to_terms_list(as_strings=True))
        full_terms_list_ids = list(self.doc.to_terms_list(as_strings=False))
        # String and id views must be parallel.
        self.assertEqual(len(full_terms_list), len(full_terms_list_ids))
        self.assertIsInstance(full_terms_list[0], compat.unicode_)
        # int_types (rather than bare int) for py2/py3 compatibility,
        # consistent with test_to_bag_of_words below.
        self.assertIsInstance(full_terms_list_ids[0], int_types)
        # Un-normalized terms should differ from the normalized default.
        self.assertNotEqual(
            full_terms_list[0],
            list(self.doc.to_terms_list(as_strings=True, normalize=False))[0])
        # Restricting the ngram range must strictly shrink the term list.
        self.assertLess(len(list(self.doc.to_terms_list(ngrams=False))),
                        len(full_terms_list))
        self.assertLess(len(list(self.doc.to_terms_list(ngrams=1))),
                        len(full_terms_list))
        self.assertLess(len(list(self.doc.to_terms_list(ngrams=(1, 2)))),
                        len(full_terms_list))

    def test_to_bag_of_words(self):
        # weighting='count': integer term frequencies keyed by term id.
        bow = self.doc.to_bag_of_words(weighting='count')
        self.assertIsInstance(bow, dict)
        self.assertIsInstance(list(bow.keys())[0], int_types)
        self.assertIsInstance(list(bow.values())[0], int)
        # weighting='binary': presence flags, so every value is 0 or 1.
        bow = self.doc.to_bag_of_words(weighting='binary')
        self.assertIsInstance(bow, dict)
        self.assertIsInstance(list(bow.keys())[0], int_types)
        self.assertIsInstance(list(bow.values())[0], int)
        for value in list(bow.values())[0:10]:
            self.assertLess(value, 2)
        # weighting='freq': normalized frequencies are floats.
        bow = self.doc.to_bag_of_words(weighting='freq')
        self.assertIsInstance(bow, dict)
        self.assertIsInstance(list(bow.keys())[0], int_types)
        self.assertIsInstance(list(bow.values())[0], float)
        # as_strings=True keys the bag by term text instead of id.
        bow = self.doc.to_bag_of_words(as_strings=True)
        self.assertIsInstance(bow, dict)
        self.assertIsInstance(list(bow.keys())[0], compat.unicode_)
# --- Exemplo n.º 3 (snippet-site separator; kept as comment) ---
class DocMethodsTestCase(unittest.TestCase):
    """Tests ``Doc`` accessors plus save/load persistence with metadata."""

    def setUp(self):
        # Temp dir lives next to this test file so save/load paths are local;
        # it is removed in tearDown.
        self.tempdir = tempfile.mkdtemp(prefix='test_doc',
                                        dir=os.path.dirname(
                                            os.path.abspath(__file__)))
        self.doc = Doc(TEXT.strip(), lang='en', metadata={'foo': 'bar!'})

    def test_n_tokens_and_sents(self):
        # Counts are pinned to the fixture TEXT under the 'en' pipeline.
        self.assertEqual(self.doc.n_tokens, 241)
        self.assertEqual(self.doc.n_sents, 8)

    def test_term_count(self):
        self.assertEqual(self.doc.count('statistical'), 3)
        self.assertEqual(self.doc.count('machine learning'), 2)
        # Absent terms count as zero rather than raising.
        self.assertEqual(self.doc.count('foo'), 0)

    def test_tokenized_text(self):
        # Expected shape: list of sentences, each a list of unicode tokens.
        tokenized_text = self.doc.tokenized_text
        self.assertIsInstance(tokenized_text, list)
        self.assertIsInstance(tokenized_text[0], list)
        self.assertIsInstance(tokenized_text[0][0], compat.unicode_)
        self.assertEqual(len(tokenized_text), self.doc.n_sents)

    def test_pos_tagged_text(self):
        # Expected shape: list of sentences, each a list of (token, tag) tuples.
        pos_tagged_text = self.doc.pos_tagged_text
        self.assertIsInstance(pos_tagged_text, list)
        self.assertIsInstance(pos_tagged_text[0], list)
        self.assertIsInstance(pos_tagged_text[0][0], tuple)
        self.assertIsInstance(pos_tagged_text[0][0][0], compat.unicode_)
        self.assertEqual(len(pos_tagged_text), self.doc.n_sents)

    def test_to_terms_list(self):
        full_terms_list = list(self.doc.to_terms_list(as_strings=True))
        full_terms_list_ids = list(self.doc.to_terms_list(as_strings=False))
        # String and id views must be parallel.
        self.assertEqual(len(full_terms_list), len(full_terms_list_ids))
        self.assertIsInstance(full_terms_list[0], compat.unicode_)
        self.assertIsInstance(full_terms_list_ids[0], compat.int_types)
        # Un-normalized terms should differ from the normalized default.
        self.assertNotEqual(
            full_terms_list[0],
            list(self.doc.to_terms_list(as_strings=True, normalize=False))[0])
        # Restricting the ngram range must strictly shrink the term list.
        self.assertLess(len(list(self.doc.to_terms_list(ngrams=False))),
                        len(full_terms_list))
        self.assertLess(len(list(self.doc.to_terms_list(ngrams=1))),
                        len(full_terms_list))
        self.assertLess(len(list(self.doc.to_terms_list(ngrams=(1, 2)))),
                        len(full_terms_list))

    def test_to_bag_of_words(self):
        # weighting='count': integer term frequencies keyed by term id.
        bow = self.doc.to_bag_of_words(weighting='count')
        self.assertIsInstance(bow, dict)
        self.assertIsInstance(list(bow.keys())[0], compat.int_types)
        self.assertIsInstance(list(bow.values())[0], int)
        # weighting='binary': presence flags, so every value is 0 or 1.
        bow = self.doc.to_bag_of_words(weighting='binary')
        self.assertIsInstance(bow, dict)
        self.assertIsInstance(list(bow.keys())[0], compat.int_types)
        self.assertIsInstance(list(bow.values())[0], int)
        for value in list(bow.values())[0:10]:
            self.assertLess(value, 2)
        # weighting='freq': normalized frequencies are floats.
        bow = self.doc.to_bag_of_words(weighting='freq')
        self.assertIsInstance(bow, dict)
        self.assertIsInstance(list(bow.keys())[0], compat.int_types)
        self.assertIsInstance(list(bow.values())[0], float)
        # as_strings=True keys the bag by term text instead of id.
        bow = self.doc.to_bag_of_words(as_strings=True)
        self.assertIsInstance(bow, dict)
        self.assertIsInstance(list(bow.keys())[0], compat.unicode_)

    def test_doc_save_and_load(self):
        # Round-trip through pickle must preserve length, language, metadata.
        filepath = os.path.join(self.tempdir, 'test_doc_save_and_load.pkl')
        self.doc.save(filepath)
        new_doc = Doc.load(filepath)
        self.assertIsInstance(new_doc, Doc)
        self.assertEqual(len(new_doc), len(self.doc))
        self.assertEqual(new_doc.lang, self.doc.lang)
        self.assertEqual(new_doc.metadata, self.doc.metadata)

    def tearDown(self):
        shutil.rmtree(self.tempdir)