예제 #1
0
 def test_html_cleaner_strings(self):
     """Check HtmlCleaner output when it is fed a plain string.

     Passes ``self.d1["body"]`` (wrapped in a one-element list) through
     ``HtmlCleaner.process`` and verifies the length of the first result
     together with the elements found at positions 33, 34 and 35.
     """
     cleaner = HtmlCleaner()
     # list() drains everything process() yields before the first item is taken.
     cleaned = list(cleaner.process([self.d1["body"]]))
     tokens = cleaned[0]

     self.assertEqual(len(tokens), 53)
     # Positional expectations, verified in ascending order.
     for position, expected in ((33, "."), (34, "\n"), (35, " ")):
         self.assertEqual(tokens[position], expected)
예제 #2
0
 def test_html_cleaner_docs(self):
     """Check HtmlCleaner output when it is fed ``self.docs``.

     Runs ``self.docs`` through ``HtmlCleaner.process``, collects what the
     first result's ``words()`` call produces, and verifies the total count
     together with the elements found at positions 6, 7 and 8.
     """
     cleaner = HtmlCleaner()
     # list() drains everything process() yields before the first item is taken.
     cleaned = list(cleaner.process(self.docs))
     tokens = list(cleaned[0].words())

     self.assertEqual(len(tokens), 11)
     # Positional expectations, verified in ascending order.
     for position, expected in ((6, "pipeline"), (7, "."), (8, "another")):
         self.assertEqual(tokens[position], expected)