def test_html_cleaner_strings(self):
    """Check HtmlCleaner output for a one-element list holding ``self.d1["body"]``.

    The first item yielded by ``process`` is indexed directly. It is
    expected to contain 53 entries, with a period, a newline and a single
    space at positions 33, 34 and 35 respectively.
    """
    cleaner = HtmlCleaner()
    cleaned = list(cleaner.process([self.d1["body"]]))
    tokens = cleaned[0]

    self.assertEqual(len(tokens), 53)

    # Spot-check three consecutive positions of the cleaned output.
    expected_at = ((33, "."), (34, "\n"), (35, " "))
    for position, token in expected_at:
        self.assertEqual(tokens[position], token)
def test_html_cleaner_docs(self):
    """Check HtmlCleaner output when it is fed ``self.docs``.

    The first item yielded by ``process`` is asked for its ``words()``,
    which are expected to number 11, with "pipeline", "." and "another"
    at positions 6, 7 and 8 respectively.
    """
    cleaner = HtmlCleaner()
    cleaned = list(cleaner.process(self.docs))
    tokens = list(cleaned[0].words())

    self.assertEqual(len(tokens), 11)

    # Spot-check three consecutive positions of the first document's words.
    expected_at = ((6, "pipeline"), (7, "."), (8, "another"))
    for position, token in expected_at:
        self.assertEqual(tokens[position], token)