def test_limited_pages(resources, outpdf, spoof_tesseract_cache):
    """Run OCR restricted to pages '5-6' and check which pages carry text.

    The output must have text at page indices 4 and 5, and none at index 0.
    """
    source_pdf = resources / 'multipage.pdf'
    ocr_kwargs = dict(
        pages='5-6',
        optimize=0,
        output_type='pdf',
        tesseract_env=spoof_tesseract_cache,
    )
    ocrmypdf.ocr(source_pdf, outpdf, **ocr_kwargs)

    page_infos = PdfInfo(outpdf).pages
    # The first page was outside the requested range.
    assert not page_infos[0].has_text
    # Both requested pages must have text.
    assert page_infos[4].has_text
    assert page_infos[5].has_text
def test_oversample(spoof_tesseract_cache, renderer, resources, outpdf):
    """Check that ``--oversample 350`` gives the first page an xres near 350."""
    cli_args = ['--oversample', '350', '-f', '--pdf-renderer', renderer]
    result_pdf = check_ocrmypdf(
        resources / 'skew.pdf', outpdf, *cli_args, env=spoof_tesseract_cache
    )

    first_page = PdfInfo(result_pdf)[0]
    horizontal_res = first_page.xres
    print(horizontal_res)
    # Allow less than one unit of deviation from the requested value.
    assert abs(horizontal_res - 350) < 1
def test_compression_changed(
    spoof_tesseract_noop, ocrmypdf_exec, resources, image, compression, outpdf
):
    """Check the image encoding after forcing a PDF/A image compression.

    The input image is piped to the ocrmypdf command line on stdin with
    ``--output-type pdfa`` and ``--pdfa-image-compression <compression>``.
    The first image of the first output page is then inspected.

    Args:
        spoof_tesseract_noop: environment mapping handed to the subprocess.
        ocrmypdf_exec: list of leading arguments that launch ocrmypdf.
        resources: directory containing the test input files.
        image: file name of the input image inside ``resources``.
        compression: value passed to ``--pdfa-image-compression``.
        outpdf: path of the PDF to produce.
    """
    input_file = str(resources / image)
    output_file = str(outpdf)

    # Only the mode is needed later. Read it inside a context manager so the
    # image's file handle is released even if an assertion below fails.
    with Image.open(input_file) as im:
        input_mode = im.mode

    # Runs: ocrmypdf - output.pdf < testfile
    with open(input_file, 'rb') as input_stream:
        p_args = ocrmypdf_exec + [
            '--image-dpi',
            '150',
            '--output-type',
            'pdfa',
            '--optimize',
            '0',
            '--pdfa-image-compression',
            compression,
            '-',
            output_file,
        ]
        p = run(
            p_args,
            stdout=PIPE,
            stderr=PIPE,
            stdin=input_stream,
            universal_newlines=True,
            env=spoof_tesseract_noop,
        )
    assert p.returncode == ExitCode.ok, p.stderr

    pdfinfo = PdfInfo(output_file)
    pdfimage = pdfinfo[0].images[0]

    if compression == "jpeg":
        assert pdfimage.enc == Encoding.jpeg
    else:
        if ghostscript.jpeg_passthrough_available():
            # Ghostscript 9.23 adds JPEG passthrough, which allows a JPEG to be
            # copied without transcoding - so a JPEG input stays JPEG.
            if image.endswith('jpg'):
                assert pdfimage.enc == Encoding.jpeg
        else:
            assert pdfimage.enc not in (Encoding.jpeg, Encoding.jpeg2000)

    if input_mode.startswith(('RGB', 'BGR')):
        assert pdfimage.color == Colorspace.rgb, "Colorspace changed"
    elif input_mode.startswith('L'):
        assert pdfimage.color == Colorspace.gray, "Colorspace changed"
def test_limited_pages(resources, outpdf):
    """Run OCR restricted to pages '5-6' through the API and check text presence.

    The output must have text at page indices 4 and 5, and none at index 0.
    """
    source_pdf = resources / 'multipage.pdf'
    ocr_kwargs = dict(
        pages='5-6',
        optimize=0,
        output_type='pdf',
        plugins=['tests/plugins/tesseract_cache.py'],
    )
    ocrmypdf.ocr(source_pdf, outpdf, **ocr_kwargs)

    page_infos = PdfInfo(outpdf).pages
    # The first page was outside the requested range.
    assert not page_infos[0].has_text
    # Both requested pages must have text.
    assert page_infos[4].has_text
    assert page_infos[5].has_text
def test_qpdf_merge_correctness(resources, outpdf, max_files, skip):
    """Merge a list of one-page PDFs and check the merged page count.

    The first ``skip`` inputs are left out. The merged output must contain
    one page per remaining input.
    """
    # Each of these files must be exactly one page long, otherwise the page
    # count comparison at the end is meaningless.
    single_page_names = (
        '2400dpi.pdf',
        'aspect.pdf',
        'blank.pdf',
        'ccitt.pdf',
        'linn.pdf',
        'masks.pdf',
        'poster.pdf',
        'overlay.pdf',
        'skew.pdf',
        'trivial.pdf',
    )
    all_paths = [str(resources / name) for name in single_page_names]
    selected = all_paths[skip:]

    qpdf.merge(selected, outpdf, log=logging.getLogger(), max_files=max_files)

    merged_page_count = len(PdfInfo(outpdf).pages)
    assert merged_page_count == len(selected)
def test_compression_preserved(ocrmypdf_exec, resources, image, outpdf):
    """Check that plain PDF output keeps the input image's compression family.

    The input image is piped to the ocrmypdf command line on stdin with
    ``--output-type pdf``. PNG inputs must not become JPEG, JPEG inputs must
    stay JPEG, and the colorspace must match the input mode. Inputs whose
    mode is 'RGBA' or 'LA' are expected to fail with 'alpha' in stderr.

    Args:
        ocrmypdf_exec: list of leading arguments that launch ocrmypdf.
        resources: directory containing the test input files.
        image: file name of the input image inside ``resources``.
        outpdf: path of the PDF to produce.
    """
    input_file = str(resources / image)
    output_file = str(outpdf)

    # Only the mode is needed later. Read it inside a context manager so the
    # image is closed on every path, including the early return for alpha
    # images and any failing assertion.
    with Image.open(input_file) as im:
        input_mode = im.mode

    # Runs: ocrmypdf - output.pdf < testfile
    with open(input_file, 'rb') as input_stream:
        p_args = ocrmypdf_exec + [
            '--optimize',
            '0',
            '--image-dpi',
            '150',
            '--output-type',
            'pdf',
            '--plugin',
            'tests/plugins/tesseract_noop.py',
            '-',
            output_file,
        ]
        p = run(
            p_args,
            stdout=PIPE,
            stderr=PIPE,
            stdin=input_stream,
            universal_newlines=True,  # When dropping support for Python 3.6 change to text=
            check=False,
        )

    if input_mode in ('RGBA', 'LA'):
        # If alpha image is input, expect an error
        assert p.returncode != ExitCode.ok and 'alpha' in p.stderr
        return

    assert p.returncode == ExitCode.ok, p.stderr

    pdfinfo = PdfInfo(output_file)
    pdfimage = pdfinfo[0].images[0]

    if input_file.endswith('.png'):
        assert pdfimage.enc != Encoding.jpeg, "Lossless compression changed to lossy!"
    elif input_file.endswith('.jpg'):
        assert pdfimage.enc == Encoding.jpeg, "Lossy compression changed to lossless!"

    if input_mode.startswith(('RGB', 'BGR')):
        assert pdfimage.color == Colorspace.rgb, "Colorspace changed"
    elif input_mode.startswith('L'):
        assert pdfimage.color == Colorspace.gray, "Colorspace changed"
def test_rotated_skew_timeout(resources, outpdf):
    """Regression test for page size after deskew combined with an OCR timeout.

    The input document holds an image that is rotated 90 degrees into place
    by a /Rotate tag and intentionally skewed by altering the transformation
    matrix. Preprocessing together with a tesseract timeout once produced a
    page whose dimensions did not match the original's.
    """
    source = resources / 'rotated_skew.pdf'

    page_before = PdfInfo(source)[0]
    width_before = page_before.width_pixels
    height_before = page_before.height_pixels
    assert height_before < width_before, "Expected the input page to be landscape"
    assert page_before.rotation == 90, "Expected a rotated page"

    cli_args = ('--pdf-renderer', 'hocr', '--deskew', '--tesseract-timeout', '0')
    result = check_ocrmypdf(source, outpdf, *cli_args)

    page_after = PdfInfo(result)[0]
    width_after = page_after.width_pixels
    height_after = page_after.height_pixels
    assert height_after > width_after, "Expected the output page to be portrait"
    assert page_after.rotation == 0, "Expected no page rotation for output"

    # Width and height must have traded places between input and output.
    dimensions_swapped = (
        width_before == height_after and height_before == width_after
    )
    assert dimensions_swapped, "Expected page rotation to be baked in"
def test_compression_preserved(
    spoof_tesseract_noop, ocrmypdf_exec, resources, image, outpdf
):
    """Check that plain PDF output keeps the input image's compression family.

    The input image is piped to the ocrmypdf command line on stdin with
    ``--output-type pdf``. PNG inputs must not become JPEG, JPEG inputs must
    stay JPEG, and the colorspace must match the input mode. Inputs whose
    mode is 'RGBA' or 'LA' are expected to fail with 'alpha' in stderr.

    Args:
        spoof_tesseract_noop: environment mapping handed to the subprocess.
        ocrmypdf_exec: list of leading arguments that launch ocrmypdf.
        resources: directory containing the test input files.
        image: file name of the input image inside ``resources``.
        outpdf: path of the PDF to produce.
    """
    input_file = str(resources / image)
    output_file = str(outpdf)

    # Only the mode is needed later. Read it inside a context manager so the
    # image's file handle is always released.
    with Image.open(input_file) as im:
        input_mode = im.mode

    # Runs: ocrmypdf - output.pdf < testfile
    with open(input_file, 'rb') as input_stream:
        p_args = ocrmypdf_exec + [
            '--optimize',
            '0',
            '--image-dpi',
            '150',
            '--output-type',
            'pdf',
            '-',
            output_file,
        ]
        p = Popen(
            p_args,
            close_fds=True,
            stdout=PIPE,
            stderr=PIPE,
            stdin=input_stream,
            env=spoof_tesseract_noop,
        )
        # stdout is captured but not inspected; stderr is bytes here.
        _, err = p.communicate()

    if input_mode in ('RGBA', 'LA'):
        # If alpha image is input, expect an error
        assert p.returncode != ExitCode.ok and b'alpha' in err
        return

    assert p.returncode == ExitCode.ok, err.decode('utf-8')

    pdfinfo = PdfInfo(output_file)
    pdfimage = pdfinfo[0].images[0]

    if input_file.endswith('.png'):
        assert pdfimage.enc != Encoding.jpeg, "Lossless compression changed to lossy!"
    elif input_file.endswith('.jpg'):
        assert pdfimage.enc == Encoding.jpeg, "Lossy compression changed to lossless!"

    if input_mode.startswith(('RGB', 'BGR')):
        assert pdfimage.color == Colorspace.rgb, "Colorspace changed"
    elif input_mode.startswith('L'):
        assert pdfimage.color == Colorspace.gray, "Colorspace changed"
def test_oversample(renderer, resources, outpdf):
    """Check that ``--oversample 350`` gives the first page a dpi.x near 350."""
    cli_args = [
        '--oversample',
        '350',
        '-f',
        '--pdf-renderer',
        renderer,
        '--plugin',
        'tests/plugins/tesseract_cache.py',
    ]
    result_pdf = check_ocrmypdf(resources / 'skew.pdf', outpdf, *cli_args)

    first_page = PdfInfo(result_pdf)[0]
    horizontal_dpi = first_page.dpi.x
    print(horizontal_dpi)
    # Allow less than one unit of deviation from the requested value.
    assert abs(horizontal_dpi - 350) < 1
def _gs_version_at_least(version, minimum):
    """Return True when dotted version *version* is numerically >= *minimum*.

    Components are compared as integers, so '10.0' ranks above '9.23'. A
    plain string comparison would order them the other way round.
    """
    import re  # local import keeps this block self-contained

    def as_tuple(text):
        return tuple(int(number) for number in re.findall(r'\d+', str(text)))

    return as_tuple(version) >= as_tuple(minimum)


def test_compression_changed(
    spoof_tesseract_noop, ocrmypdf_exec, resources, image, compression, outpdf
):
    """Check the image encoding after forcing a PDF/A image compression.

    The input image is piped to the ocrmypdf command line on stdin with
    ``--output-type pdfa`` and ``--pdfa-image-compression <compression>``.
    The first image of the first output page is then inspected.

    Args:
        spoof_tesseract_noop: environment mapping handed to the subprocess.
        ocrmypdf_exec: list of leading arguments that launch ocrmypdf.
        resources: directory containing the test input files.
        image: file name of the input image inside ``resources``.
        compression: value passed to ``--pdfa-image-compression``.
        outpdf: path of the PDF to produce.
    """
    from PIL import Image

    input_file = str(resources / image)
    output_file = str(outpdf)

    # Only the mode is needed later. Read it inside a context manager so the
    # image's file handle is always released.
    with Image.open(input_file) as im:
        input_mode = im.mode

    # Runs: ocrmypdf - output.pdf < testfile
    with open(input_file, 'rb') as input_stream:
        p_args = ocrmypdf_exec + [
            '--image-dpi',
            '150',
            '--output-type',
            'pdfa',
            '--pdfa-image-compression',
            compression,
            '-',
            output_file,
        ]
        p = Popen(
            p_args,
            close_fds=True,
            stdout=PIPE,
            stderr=PIPE,
            stdin=input_stream,
            env=spoof_tesseract_noop,
        )
        _, err = p.communicate()

    # Report the captured stderr so a failing run can be diagnosed.
    assert p.returncode == ExitCode.ok, err.decode('utf-8', errors='replace')

    pdfinfo = PdfInfo(output_file)
    pdfimage = pdfinfo[0].images[0]

    if compression == "jpeg":
        assert pdfimage.enc == Encoding.jpeg
    else:
        if _gs_version_at_least(ghostscript.version(), '9.23'):
            # Ghostscript 9.23 adds JPEG passthrough, which allows a JPEG to be
            # copied without transcoding - so a JPEG input stays JPEG.
            if image.endswith('jpg'):
                assert pdfimage.enc == Encoding.jpeg
        else:
            assert pdfimage.enc not in (Encoding.jpeg, Encoding.jpeg2000)

    if input_mode.startswith(('RGB', 'BGR')):
        assert pdfimage.color == Colorspace.rgb, "Colorspace changed"
    elif input_mode.startswith('L'):
        assert pdfimage.color == Colorspace.gray, "Colorspace changed"
def test_sidecar_pagecount(spoof_tesseract_cache, resources, outpdf):
    """Check that the sidecar text has one form feed between consecutive pages.

    OCR is run on 'multipage.pdf' with ``--skip-text`` and ``--sidecar``. The
    number of form feed characters in the sidecar must equal the input's
    page count minus one.
    """
    sidecar = outpdf + '.txt'
    check_ocrmypdf(
        resources / 'multipage.pdf',
        outpdf,
        '--skip-text',
        '--sidecar',
        sidecar,
        env=spoof_tesseract_cache,
    )

    pdfinfo = PdfInfo(resources / 'multipage.pdf')
    num_pages = len(pdfinfo)

    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale default encoding.
    with open(sidecar, 'r', encoding='utf-8') as f:
        ocr_text = f.read()

    # There should be a formfeed between each pair of pages, so the count of
    # formfeeds is the page count less one
    assert (
        ocr_text.count('\f') == num_pages - 1
    ), "Sidecar page count does not match PDF page count"
def get_pdfinfo(
    input_file,
    detailed_analysis=False,
    progbar=False,
    max_workers=None,
    check_pages=None,
):
    """Build a ``PdfInfo`` for *input_file*, translating pikepdf errors.

    All keyword arguments are forwarded unchanged to ``PdfInfo``.

    Returns:
        The constructed ``PdfInfo`` object.

    Raises:
        EncryptedPdfError: if pikepdf raises ``PasswordError``.
        InputFileError: if pikepdf raises any other ``PdfError``.
    """
    try:
        return PdfInfo(
            input_file,
            detailed_analysis=detailed_analysis,
            progbar=progbar,
            max_workers=max_workers,
            check_pages=check_pages,
        )
    except pikepdf.PasswordError as e:
        # Chain explicitly so the pikepdf error is recorded as the cause.
        raise EncryptedPdfError() from e
    except pikepdf.PdfError as e:
        raise InputFileError() from e
def test_no_progress_bar(progress_bar, resources):
    """Check that ``PdfInfo`` passes the inverse of ``progress_bar`` as ``disable``.

    A progress bar subclass records the ``disable`` argument it is built
    with. That value must be set and must differ from ``progress_bar``.
    """
    opts = make_opts(
        progress_bar=progress_bar, input_file=(resources / 'trivial.pdf')
    )
    vd._check_options(opts, get_plugin_manager(opts.plugins), set())

    observed_disable = None

    class CheckProgressBar(NullProgressBar):
        def __init__(self, disable, **kwargs):
            # Capture the flag for the assertions at the end of the test.
            nonlocal observed_disable
            observed_disable = disable
            super().__init__(disable=disable, **kwargs)

    serial_executor = SerialExecutor(pbar_class=CheckProgressBar)
    info = PdfInfo(
        opts.input_file, progbar=opts.progress_bar, executor=serial_executor
    )

    assert info is not None
    assert observed_disable is not None
    assert observed_disable != progress_bar
def test_malformed_docinfo(caplog, resources, outdir):
    """Check that a malformed DocumentInfo block is reported in the log.

    The trailer's Info entry of 'trivial.pdf' is replaced with a stream and
    the result is converted to PDF/A. A log record containing
    'malformed DocumentInfo block' must be captured.
    """
    pdfa_ps = outdir / 'pdfa.ps'
    rendered = outdir / 'layers.rendered.pdf'
    generate_pdfa_ps(pdfa_ps)

    with pikepdf.open(resources / 'trivial.pdf') as pike:
        # Put a stream where the Info entry lives, then write the result out
        # as the file to be converted.
        pike.trailer.Info = pikepdf.Stream(pike, b"<xml></xml>")
        pike.save(rendered, fix_metadata_version=False)

    cli_args = ['-j', '1', '--output-type', 'pdfa-2', 'a.pdf', 'b.pdf']
    options = parser.parse_args(args=cli_args)
    context = PDFContext(options, outdir, rendered, PdfInfo(rendered))
    convert_to_pdfa(str(rendered), str(pdfa_ps), context)

    print(caplog.records)
    messages = [record.message for record in caplog.records]
    assert any('malformed DocumentInfo block' in message for message in messages)
def test_prevent_gs_invalid_xml(resources, outdir):
    """Check that the XMP packet of the PDF/A output contains no NUL.

    'enron1.pdf' is converted to PDF/A. The bytes between the XMP magic
    marker and the closing xpacket tag are then scanned for a raw NUL byte
    and for its escaped form ``&#0;``.
    """
    from ocrmypdf.__main__ import parser
    from ocrmypdf._pipeline import convert_to_pdfa
    from ocrmypdf.pdfa import generate_pdfa_ps
    from ocrmypdf.pdfinfo import PdfInfo

    generate_pdfa_ps(outdir / 'pdfa.ps')
    input_files = [str(outdir / 'layers.rendered.pdf'), str(outdir / 'pdfa.ps')]
    copyfile(resources / 'enron1.pdf', outdir / 'layers.rendered.pdf')
    log = logging.getLogger()
    context = JobContext()

    options = parser.parse_args(
        args=['-j', '1', '--output-type', 'pdfa-2', 'a.pdf', 'b.pdf']
    )
    context.options = options
    context.pdfinfo = PdfInfo(resources / 'enron1.pdf')

    convert_to_pdfa(
        input_files_groups=input_files,
        output_file=outdir / 'pdfa.pdf',
        log=log,
        context=context,
    )

    with open(outdir / 'pdfa.pdf', 'rb') as f:
        with mmap.mmap(
            f.fileno(), 0, flags=mmap.MAP_PRIVATE, prot=mmap.PROT_READ
        ) as mm:
            # Since the XML may be invalid, we scan instead of actually feeding it
            # to a parser.
            XMP_MAGIC = b'W5M0MpCehiHzreSzNTczkc9d'
            xmp_start = mm.find(XMP_MAGIC)
            xmp_end = mm.rfind(b'<?xpacket end', xmp_start)
            assert 0 < xmp_start < xmp_end
            # The escaped NUL is spelled in ASCII; bytes literals cannot hold
            # non-ASCII characters.
            assert mm.find(b'&#0;', xmp_start, xmp_end) == -1, "found escaped nul"
            assert mm.find(b'\x00', xmp_start, xmp_end) == -1
def test_sidecar_pagecount(spoof_tesseract_cache, resources, outpdf):
    """Check that the sidecar text has one form feed between consecutive pages.

    OCR is run on '3small.pdf' with ``--skip-text`` and ``--sidecar``. The
    number of form feed characters in the sidecar must equal the input's
    page count minus one.
    """
    source = resources / '3small.pdf'
    sidecar = outpdf.with_suffix('.txt')
    check_ocrmypdf(
        source,
        outpdf,
        '--skip-text',
        '--sidecar',
        sidecar,
        env=spoof_tesseract_cache,
    )

    # One separator is expected between each pair of adjacent pages.
    expected_breaks = len(PdfInfo(source)) - 1

    with open(sidecar, 'r', encoding='utf-8') as sidecar_file:
        formfeed_count = sidecar_file.read().count('\f')

    assert (
        formfeed_count == expected_breaks
    ), "Sidecar page count does not match PDF page count"
def test_compression_preserved(
    spoof_tesseract_noop, ocrmypdf_exec, resources, image, outpdf
):
    """Check that plain PDF output keeps the input image's compression family.

    The input image is piped to the ocrmypdf command line on stdin with
    ``--output-type pdf``. PNG inputs must not become JPEG, JPEG inputs must
    stay JPEG, and the colorspace must match the input mode.

    Args:
        spoof_tesseract_noop: environment mapping handed to the subprocess.
        ocrmypdf_exec: list of leading arguments that launch ocrmypdf.
        resources: directory containing the test input files.
        image: file name of the input image inside ``resources``.
        outpdf: path of the PDF to produce.
    """
    from PIL import Image

    input_file = str(resources / image)
    output_file = str(outpdf)

    # Only the mode is needed later. Read it inside a context manager so the
    # image's file handle is always released.
    with Image.open(input_file) as im:
        input_mode = im.mode

    # Runs: ocrmypdf - output.pdf < testfile
    with open(input_file, 'rb') as input_stream:
        p_args = ocrmypdf_exec + [
            '--image-dpi',
            '150',
            '--output-type',
            'pdf',
            '-',
            output_file,
        ]
        p = Popen(
            p_args,
            close_fds=True,
            stdout=PIPE,
            stderr=PIPE,
            stdin=input_stream,
            env=spoof_tesseract_noop,
        )
        _, err = p.communicate()

    # Report the captured stderr so a failing run can be diagnosed.
    assert p.returncode == ExitCode.ok, err.decode('utf-8', errors='replace')

    pdfinfo = PdfInfo(output_file)
    pdfimage = pdfinfo[0].images[0]

    if input_file.endswith('.png'):
        assert pdfimage.enc != Encoding.jpeg, "Lossless compression changed to lossy!"
    elif input_file.endswith('.jpg'):
        assert pdfimage.enc == Encoding.jpeg, "Lossy compression changed to lossless!"

    if input_mode.startswith(('RGB', 'BGR')):
        assert pdfimage.color == Colorspace.rgb, "Colorspace changed"
    elif input_mode.startswith('L'):
        assert pdfimage.color == Colorspace.gray, "Colorspace changed"
def get_pdfinfo(
    input_file,
    *,
    executor: Executor,
    detailed_analysis=False,
    progbar=False,
    max_workers=None,
    check_pages=None,
) -> PdfInfo:
    """Build a ``PdfInfo`` for *input_file*, translating pikepdf errors.

    All keyword arguments are forwarded unchanged to ``PdfInfo``.

    Raises:
        EncryptedPdfError: if pikepdf raises ``PasswordError``.
        InputFileError: if pikepdf raises any other ``PdfError``.
    """
    analysis_options = dict(
        detailed_analysis=detailed_analysis,
        progbar=progbar,
        max_workers=max_workers,
        check_pages=check_pages,
        executor=executor,
    )
    try:
        info = PdfInfo(input_file, **analysis_options)
    except pikepdf.PasswordError as password_error:
        # Handled before the general PdfError clause so it wins when both match.
        raise EncryptedPdfError() from password_error
    except pikepdf.PdfError as pdf_error:
        raise InputFileError() from pdf_error
    return info
def test_prevent_gs_invalid_xml(resources, outdir):
    """Check that the XMP packet of the PDF/A output contains no NUL.

    A copy of 'trivial.pdf' is converted to PDF/A. The bytes between the XMP
    magic marker and the closing xpacket tag are then scanned for a raw NUL
    byte and for its escaped form ``&#0;``.
    """
    from ocrmypdf.cli import parser
    from ocrmypdf._pipeline import convert_to_pdfa
    from ocrmypdf.pdfa import generate_pdfa_ps
    from ocrmypdf.pdfinfo import PdfInfo

    generate_pdfa_ps(outdir / 'pdfa.ps')
    copyfile(resources / 'trivial.pdf', outdir / 'layers.rendered.pdf')

    # Inject a string with a trailing nul character into the DocumentInfo
    # dictionary of this PDF, as often occurs in practice.
    # NOTE(review): the modified document is not saved anywhere in this
    # block, so the file on disk stays the unmodified copy - confirm whether
    # a pike.save(...) was intended here.
    with pikepdf.open(outdir / 'layers.rendered.pdf') as pike:
        pike.Root.DocumentInfo = pikepdf.Dictionary(
            Title=b'String with trailing nul\x00'
        )

    options = parser.parse_args(
        args=['-j', '1', '--output-type', 'pdfa-2', 'a.pdf', 'b.pdf']
    )
    pdfinfo = PdfInfo(outdir / 'layers.rendered.pdf')
    context = PDFContext(options, outdir, outdir / 'layers.rendered.pdf', pdfinfo)

    convert_to_pdfa(
        str(outdir / 'layers.rendered.pdf'), str(outdir / 'pdfa.ps'), context
    )

    with open(outdir / 'pdfa.pdf', 'r+b') as f:
        with mmap.mmap(f.fileno(), 0) as mm:
            # Since the XML may be invalid, we scan instead of actually feeding it
            # to a parser.
            XMP_MAGIC = b'W5M0MpCehiHzreSzNTczkc9d'
            xmp_start = mm.find(XMP_MAGIC)
            xmp_end = mm.rfind(b'<?xpacket end', xmp_start)
            assert 0 < xmp_start < xmp_end
            # Ensure we did not carry the nul forward. The escaped NUL is
            # spelled in ASCII; bytes literals cannot hold non-ASCII characters.
            assert mm.find(b'&#0;', xmp_start, xmp_end) == -1, "found escaped nul"
            assert mm.find(b'\x00', xmp_start, xmp_end) == -1
def test_ocr_timeout(renderer, resources, outpdf):
    """Check that a ``--tesseract-timeout`` of 1.0 leaves page one without text.

    ``renderer`` is accepted as a parameter but not used in the body.
    """
    timeout_args = ('--tesseract-timeout', '1.0')
    result_pdf = check_ocrmypdf(resources / 'skew.pdf', outpdf, *timeout_args)

    first_page = PdfInfo(result_pdf)[0]
    assert not first_page.has_text
def test_skip_big(spoof_tesseract_cache, resources, outpdf):
    """Check that ``--skip-big 1`` leaves the first page of 'jbig2.pdf' without text."""
    skip_args = ('--skip-big', '1')
    result_pdf = check_ocrmypdf(
        resources / 'jbig2.pdf', outpdf, *skip_args, env=spoof_tesseract_cache
    )

    first_page = PdfInfo(result_pdf)[0]
    assert not first_page.has_text
def test_skip_ocr(spoof_tesseract_cache, resources, outpdf):
    """Check that running with ``-s`` on 'graph_ocred.pdf' yields a first page with text."""
    source = resources / 'graph_ocred.pdf'
    result_pdf = check_ocrmypdf(source, outpdf, '-s', env=spoof_tesseract_cache)

    first_page = PdfInfo(result_pdf)[0]
    assert first_page.has_text
def test_userunit_qpdf_passes(spoof_tesseract_cache, poster, outpdf):
    """Check that the first page's width in inches is unchanged by the run.

    The ``poster`` file is processed with ``--output-type=pdf``. The first
    page's ``width_inches`` before and after must be close.
    """
    info_before = PdfInfo(poster)
    check_ocrmypdf(poster, outpdf, '--output-type=pdf', env=spoof_tesseract_cache)
    info_after = PdfInfo(outpdf)

    width_before, width_after = (
        info[0].width_inches for info in (info_before, info_after)
    )
    assert isclose(width_before, width_after)