def test_fallback_ocr(self):
    # Checks the OCR fallback: the scanned sample opened with
    # images=False must report itself as empty, while opening the
    # same file with ocr_lang="eng" must produce readable text.
    scanned_pdf = "test/files/pdfreader/scanned-ecma-99.pdf"

    def read_without_images():
        # self.datadir is looked up at call time, exactly as the two
        # inline constructor calls would do.
        return PDFReader(filename=scanned_pdf,
                         workdir=self.datadir,
                         images=False)

    try:
        # Really running tesseract takes ages. Unless
        # FERENDA_TEST_TESSERACT is set, jump straight to the except
        # branch and rely on the canned hocr.html files that
        # _copy_sample provides.
        if not os.environ.get("FERENDA_TEST_TESSERACT"):
            raise errors.ExternalCommandError
        pdf = read_without_images()
    except errors.ExternalCommandError:
        self._copy_sample()
        pdf = read_without_images()
    self.assertTrue(pdf.is_empty())

    # Same file again, now asking for english OCR.
    pdf = PDFReader(filename=scanned_pdf,
                    workdir=self.datadir,
                    ocr_lang="eng")
    self.assertFalse(pdf.is_empty())
    self.assertEqual(2, len(pdf))
    second_item_text = util.normalize_space(str(pdf[0][1]))
    self.assertEqual("EUROPEAN COMPUTER MANUFACTURERS ASSOCIATION",
                     second_item_text)
def test_fallback_ocr(self):
    # A reader created with images=False is expected to be empty for
    # the scanned sample; a reader created with ocr_lang="eng" is
    # expected to contain text.
    sample_path = "test/files/pdfreader/scanned-ecma-99.pdf"

    # Actually running tesseract takes ages, so it only happens when
    # FERENDA_TEST_TESSERACT is set in the environment.
    run_real_tesseract = os.environ.get("FERENDA_TEST_TESSERACT")
    try:
        if not run_real_tesseract:
            # Deliberately take the same path as a failed external
            # command, so the canned hocr.html files that
            # _copy_sample provides are used instead.
            raise errors.ExternalCommandError
        doc = PDFReader(filename=sample_path,
                        workdir=self.datadir,
                        images=False)
    except errors.ExternalCommandError:
        self._copy_sample()
        doc = PDFReader(filename=sample_path,
                        workdir=self.datadir,
                        images=False)
    self.assertTrue(doc.is_empty())

    doc = PDFReader(filename=sample_path,
                    workdir=self.datadir,
                    ocr_lang="eng")
    self.assertFalse(doc.is_empty())
    self.assertEqual(2, len(doc))
    self.assertEqual(
        "EUROPEAN COMPUTER MANUFACTURERS ASSOCIATION",
        util.normalize_space(str(doc[0][1])))
def test_basic(self):
    # Reads sample.pdf and checks the workdir contents, the page and
    # textbox structure, bounding-box filtering and (when cropping is
    # possible) the font and text elements of the remaining box.
    sample_pdf = "test/files/pdfreader/sample.pdf"
    try:
        reader = PDFReader(filename=sample_pdf, workdir=self.datadir)
    except errors.ExternalCommandError:
        # The external command failed; retry after _copy_sample has
        # run (presumably it supplies pre-generated files -- confirm
        # against its definition).
        self._copy_sample()
        reader = PDFReader(filename=sample_pdf, workdir=self.datadir)

    # No temporary copy of the pdf file should be left lying around
    # in workdir ...
    leftover_pdf = self.datadir + os.sep + "sample.pdf"
    self.assertFalse(os.path.exists(leftover_pdf))
    # ... but the XML file should be stored for subsequent parses.
    stored_xml = self.datadir + os.sep + "sample.xml"
    self.assertTrue(os.path.exists(stored_xml))

    # The PDF contained actual textboxes.
    self.assertFalse(reader.is_empty())
    self.assertEqual(len(reader), 1)

    # First page, first box.
    title = str(reader[0][0])
    self.assertEqual("Document title ", title)
    self.assertEqual(570, reader.median_box_width())

    page = reader[0]
    expected_summary = ("Page 1 (892 x 1263): "
                        "'Document title This is a simple documen...'")
    self.assertEqual(expected_summary, str(page))

    # An uncropped doc should have nine nonempty textboxes.
    self.assertEqual(9, len(list(page.boundingbox())))
    # A smaller bounding box yields just one.
    region = (190, 130, 230, 460)
    self.assertEqual(1, len(list(page.boundingbox(*region))))

    # Cropping with the same dimensions.
    # NOTE: this fails if convert (from imagemagick) isn't installed.
    try:
        page.crop(*region)
    except errors.ExternalCommandError:
        # The rest of the checks cannot succeed now. FIXME: We
        # should try to find a way to run them anyway.
        return

    # Cropping should also result in just one box -- the bottom one.
    remaining = list(page.boundingbox())
    self.assertEqual(1, len(remaining))
    box = remaining[0]
    self.assertEqual("This is a simple document in PDF format. \n",
                     str(box))

    expected_font = (("color", '#000000'),
                     ("size", 16),
                     ("id", '1'),
                     ("family", 'Cambria'))
    for attr, wanted in expected_font:
        self.assertEqual(wanted, getattr(box.font, attr))

    # This box should have four text elements.
    self.assertEqual(4, len(box))
    for position, wanted_tag in enumerate((None, "i", "ib", None)):
        self.assertEqual(wanted_tag, box[position].tag)
def test_basic(self):
    # End-to-end check of reading sample.pdf: files left in workdir,
    # reader/page/box contents, bounding boxes, and cropping.
    reader_args = {"filename": "test/files/pdfreader/sample.pdf"}
    try:
        reader = PDFReader(workdir=self.datadir, **reader_args)
    except errors.ExternalCommandError:
        # Fall back to whatever _copy_sample prepares and try again.
        self._copy_sample()
        reader = PDFReader(workdir=self.datadir, **reader_args)

    # A temporary copy of the pdf must not remain in workdir, whereas
    # the XML file is kept for subsequent parses.
    self.assertFalse(
        os.path.exists(self.datadir + os.sep + "sample.pdf"))
    self.assertTrue(
        os.path.exists(self.datadir + os.sep + "sample.xml"))

    # The PDF contained actual textboxes.
    self.assertFalse(reader.is_empty())
    self.assertEqual(len(reader), 1)

    # First page, first box.
    first_box_text = str(reader[0][0])
    self.assertEqual("Document title ", first_box_text)
    self.assertEqual(570, reader.median_box_width())

    first_page = reader[0]
    self.assertEqual(
        "Page 1 (892 x 1263): 'Document title This is a simple documen...'",
        str(first_page))

    # Uncropped: nine nonempty textboxes.
    all_boxes = list(first_page.boundingbox())
    self.assertEqual(9, len(all_boxes))
    # Restricted to a smaller bounding box: just one.
    top, left, bottom, right = 190, 130, 230, 460  # NOTE(review): argument meaning not visible here; names are placeholders
    narrowed = list(first_page.boundingbox(top, left, bottom, right))
    self.assertEqual(1, len(narrowed))

    # Crop with the same dimensions.
    # NOTE: this will fail if convert (from imagemagick) isn't
    # installed.
    try:
        first_page.crop(top, left, bottom, right)
    except errors.ExternalCommandError:
        # Nothing below can succeed without the crop. FIXME: We
        # should try to find a way to run the rest anyway.
        return

    # After cropping only one box should remain -- the bottom one.
    boxes_after_crop = list(first_page.boundingbox())
    self.assertEqual(1, len(boxes_after_crop))
    only_box = boxes_after_crop[0]
    self.assertEqual("This is a simple document in PDF format. \n",
                     str(only_box))
    self.assertEqual('#000000', only_box.font.color)
    self.assertEqual(16, only_box.font.size)
    self.assertEqual('1', only_box.font.id)
    self.assertEqual('Cambria', only_box.font.family)

    # The box should hold four text elements with these tags, in
    # order.
    self.assertEqual(4, len(only_box))
    expected_tags = [None, "i", "ib", None]
    for index in range(4):
        self.assertEqual(expected_tags[index], only_box[index].tag)