def report_output_file_size(options, input_file, output_file):
    """Warn when the output file is much larger than the input file.

    The output size is compared against the input size plus an allowance for
    data that processing is expected to add (a fixed per-file amount plus a
    per-page amount). If the output still exceeds that by the threshold
    ratio, a warning is logged listing the options and missing optional
    dependencies that may explain the growth.

    Args:
        options: Parsed options object. Reads ``optimize``, ``output_type``
            and ``plugins``, plus the image preprocessing flags (looked up
            with ``getattr``, so those may be absent).
        input_file: Path of the input file.
        output_file: Path of the output PDF.

    Returns:
        None. Returns silently when either file does not exist, when the
        input is smaller than 25000 bytes, or when the size looks reasonable.
    """
    try:
        output_size = Path(output_file).stat().st_size
        input_size = Path(input_file).stat().st_size
    except FileNotFoundError:
        return  # Outputting to stream or something

    # Small inputs are never reported. Testing this first keeps a zero-byte
    # input away from the divisions below and avoids parsing the output PDF
    # when no warning could be issued anyway.
    MIN_REPORTABLE_INPUT_SIZE = 25000
    if input_size < MIN_REPORTABLE_INPUT_SIZE:
        return  # Seems fine

    # Overhead constants obtained by estimating amount of data added by OCR
    # PDF/A conversion, and possible XMP metadata addition, with compression
    FILE_OVERHEAD = 4000
    OCR_PER_PAGE_OVERHEAD = 3000
    with pikepdf.open(output_file) as p:
        reasonable_overhead = FILE_OVERHEAD + OCR_PER_PAGE_OVERHEAD * len(p.pages)

    ratio = output_size / input_size
    reasonable_ratio = output_size / (input_size + reasonable_overhead)
    if reasonable_ratio < 1.35:
        return  # Seems fine

    # A tuple (not a set) so the reasons are always listed in the same order.
    image_preproc = (
        'deskew',
        'clean_final',
        'remove_background',
        'oversample',
        'force_ocr',
    )
    reasons = [
        f"The argument --{arg.replace('_', '-')} was issued, causing transcoding."
        for arg in image_preproc
        if getattr(options, arg, False)
    ]

    if options.optimize == 0:
        reasons.append("Optimization was disabled.")
    else:
        image_optimizers = {
            'jbig2': jbig2enc.available(),
            'pngquant': pngquant.available(),
        }
        for name, available in image_optimizers.items():
            if not available:
                reasons.append(
                    f"The optional dependency '{name}' was not found, so some image "
                    f"optimizations could not be attempted."
                )

    if options.output_type.startswith('pdfa'):
        reasons.append("PDF/A conversion was enabled. (Try `--output-type pdf`.)")
    if options.plugins:
        reasons.append("Plugins were used.")

    if reasons:
        explanation = "Possible reasons for this include:\n" + '\n'.join(reasons) + "\n"
    else:
        explanation = "No reason for this increase is known.  Please report this issue."

    log.warning(
        f"The output file size is {ratio:.2f}× larger than the input file.\n"
        f"{explanation}"
    )
def test_mono_not_inverted(resources, outdir):
    """Check that optimizing '2400dpi.pdf' at level 3 keeps a white background.

    The optimized PDF is rasterized to a PNG with the 'pnggray' device at a
    resolution of 10 x 10, and the pixel at (0, 0) must have the value 255.
    """
    source_pdf = resources / '2400dpi.pdf'
    optimized_pdf = outdir / 'out.pdf'
    raster_png = outdir / 'im.png'

    opt.main(source_pdf, optimized_pdf, level=3)
    rasterize_pdf(
        optimized_pdf,
        raster_png,
        raster_device='pnggray',
        raster_dpi=Resolution(10, 10),
    )

    with Image.open(fspath(raster_png)) as raster:
        corner_pixel = raster.getpixel((0, 0))
        assert corner_pixel == 255, "Expected white background"


@pytest.mark.skipif(not pngquant.available(), reason='need pngquant')
def test_jpg_png_params(resources, outpdf):
    """Run 'crom.png' through check_ocrmypdf with JPEG/PNG quality options.

    Skipped when pngquant is not available.
    """
    cli_args = [
        '--image-dpi', '200',
        '--optimize', '3',
        '--jpg-quality', '50',
        '--png-quality', '20',
        '--plugin', 'tests/plugins/tesseract_noop.py',
    ]
    check_ocrmypdf(resources / 'crom.png', outpdf, *cli_args)
from pathlib import Path from unittest.mock import patch import img2pdf import pikepdf import pytest from PIL import Image, ImageDraw from ocrmypdf import optimize as opt from ocrmypdf._exec import jbig2enc, pngquant from ocrmypdf._exec.ghostscript import rasterize_pdf from ocrmypdf.helpers import Resolution from .conftest import check_ocrmypdf needs_pngquant = pytest.mark.skipif(not pngquant.available(), reason="pngquant not installed") needs_jbig2enc = pytest.mark.skipif(not jbig2enc.available(), reason="jbig2enc not installed") @needs_pngquant @pytest.mark.parametrize('pdf', ['multipage.pdf', 'palette.pdf']) def test_basic(resources, pdf, outpdf): infile = resources / pdf opt.main(infile, outpdf, level=3) assert 0.98 * Path(outpdf).stat().st_size <= Path(infile).stat().st_size @needs_pngquant