def test_pdf_extraction(self):
    """Check title and content extraction from the bundled ``testpdf.pdf``.

    The test is skipped when ``PdfFileReader`` is falsy, i.e. pyPdf could
    not be imported.
    """
    if not PdfFileReader:
        self.skipTest("pyPdf is not installed")

    # PDF is a binary format: read it in 'rb' mode so the data is neither
    # newline-translated nor decoded as text, and use a context manager so
    # the file handle is closed even if the extractor raises.
    with open(os.path.join(my_path, 'testpdf.pdf'), 'rb') as pdf:
        extractor = BaseExtractor(pdf.read())

    self.assertEqual(extractor.get_title(), u'This is a test PDF file')

    expected_content = [
        'universal file format',
        'fonts, formatting, colours and graphics',
        'regardless of the application and platform',
    ]
    content = extractor.get_content()
    for c in expected_content:
        # assertIn reports the missing snippet on failure, unlike
        # assertTrue(c in content).
        self.assertIn(c, content)
def test_extractor(self):
    """Check title, content and heading extraction from ``self.html``.

    Verifies that visible text is present in the extracted content
    (compared case-insensitively), that the header texts appear in the
    extracted headings, and that the listed markup and script/style
    strings appear in neither result.
    """
    extractor = BaseExtractor(self.html)
    self.assertEqual(extractor.get_title(), u'MY TITLE')

    expected_content = [
        u'MY TITLE',
        u'Relative Link',
        u'Absolute Link',
        u'Offsite Link',
        u'THIS IS A H1 HEADER',
        u'Content in a paragraph',
        u'THIS IS A H2 HEADER',
        u'Content In A Span',
    ]
    content = extractor.get_content()
    # Lower-case once instead of on every loop iteration.
    content_lower = content.lower()
    for c in expected_content:
        self.assertIn(c.lower(), content_lower)

    expected_headings = [
        u'THIS IS A H1 HEADER',
        u'THIS IS A H2 HEADER',
    ]
    headings = extractor.get_headings()
    headings_lower = headings.lower()
    for h in expected_headings:
        self.assertIn(h.lower(), headings_lower)

    unexpected_content = [
        'meta',
        'Content-Type',
        'stylesheet',
        'script',
        'style',
        'relative_link',
        'absolute_link',
        'offsite_link',
        'pagelocation',
        'THIS IS A SCRIPT TAG',
        'THIS_IS_A_STYLE_TAG',
    ]
    # These checks are deliberately case-sensitive, as in the original:
    # the raw (not lower-cased) extractor output is searched.
    for u in unexpected_content:
        self.assertNotIn(u, content)
        self.assertNotIn(u, headings)