def test_bmp(self):
    """Parse the ``simple.bmp`` sample and check the archive and its text.

    The sample is parsed with MIME type ``image/bmp``. Afterwards a file must
    exist at ``parser.archive_path`` and the lower-cased extracted text must
    contain the known sample sentence.
    """
    parser = RasterisedDocumentParser(None)
    parser.parse(os.path.join(self.SAMPLE_FILES, "simple.bmp"), "image/bmp")

    self.assertTrue(os.path.isfile(parser.archive_path))
    # assertIn reports both the expected substring and the actual text on
    # failure; assertTrue(x in y) would only say "False is not true".
    self.assertIn("this is a test document", parser.get_text().lower())
def test_skip_noarchive_notext(self):
    """Parse ``multi-page-images.pdf`` and check the archive and page text.

    After parsing the sample as ``application/pdf``, a file must exist at
    ``parser.archive_path`` and the lower-cased extracted text is checked
    against the markers of all three pages.

    NOTE(review): the test name suggests a specific OCR mode is configured
    outside this view (e.g. by a decorator) -- confirm against the full file.
    """
    parser = RasterisedDocumentParser(None)
    parser.parse(
        os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"),
        "application/pdf",
    )

    # The previous check wrapped the path in os.path.join(), which with a
    # single argument simply returns that argument; the assertion therefore
    # passed for any non-empty path, whether or not the file existed. Use
    # os.path.isfile so the archive's presence on disk is actually verified.
    self.assertTrue(os.path.isfile(parser.archive_path))
    self.assertContainsStrings(
        parser.get_text().lower(),
        ["page 1", "page 2", "page 3"],
    )
def test_multi_page_pages_force(self):
    """Parse ``multi-page-digital.pdf`` and check the archive and page text.

    The sample is parsed as ``application/pdf``. A file must then exist at
    the parser's ``archive_path``, and the lower-cased extracted text is
    handed to ``assertContainsStrings`` with the markers of pages 1 to 3.
    """
    document_parser = RasterisedDocumentParser(None)
    sample_path = os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf")
    document_parser.parse(sample_path, "application/pdf")

    archive_exists = os.path.isfile(document_parser.archive_path)
    self.assertTrue(archive_exists)

    extracted = document_parser.get_text().lower()
    expected_markers = ["page 1", "page 2", "page 3"]
    self.assertContainsStrings(extracted, expected_markers)
def test_multi_page_analog_pages_redo(self):
    """Parse ``multi-page-images.pdf`` and expect text from pages 1-2 only.

    After parsing the sample as ``application/pdf``, a file must exist at
    ``parser.archive_path``. The lower-cased extracted text is checked for
    the "page 1" and "page 2" markers and must not contain "page 3".

    NOTE(review): the absence of page 3 presumably relies on a page limit
    configured outside this view (e.g. by a decorator) -- confirm.
    """
    parser = RasterisedDocumentParser(None)
    parser.parse(
        os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"),
        "application/pdf",
    )

    self.assertTrue(os.path.isfile(parser.archive_path))

    # Extract and normalise the text once; both assertions inspect the same
    # string.
    text = parser.get_text().lower()
    self.assertContainsStrings(text, ["page 1", "page 2"])
    # assertNotIn shows the offending text on failure, unlike
    # assertFalse(x in y), which only reports "True is not false".
    self.assertNotIn("page 3", text)
def test_with_form_force(self):
    """Parse ``with-form.pdf`` and check the extracted text.

    The sample is parsed as ``application/pdf`` and the extracted text
    (case preserved) is handed to ``assertContainsStrings`` together with
    the two sentences expected from the form document.
    """
    form_pdf = os.path.join(self.SAMPLE_FILES, "with-form.pdf")
    document_parser = RasterisedDocumentParser(None)
    document_parser.parse(form_pdf, "application/pdf")

    expected_sentences = [
        "Please enter your name in here:",
        "This is a PDF document with a form.",
    ]
    extracted = document_parser.get_text()
    self.assertContainsStrings(extracted, expected_sentences)