def extract_image_jbig2(
    *, pike: pikepdf.Pdf, root: Path, image: Object, xref: Xref, options
) -> Optional[XrefExt]:
    """Write a 1-bit image into ``root`` as a candidate for JBIG2 encoding.

    The image is first passed through ``extract_image_filter``. Extraction is
    attempted only when that call yields a result, the image has one bit per
    component, its first filter entry is not ``/JBIG2Decode``, and
    ``jbig2enc.available()`` is true.

    Args:
        pike: Forwarded to ``extract_image_filter``.
        root: Directory receiving the extracted file. The file is named with
            the zero-padded ``xref`` and the suffix returned by the extractor.
        image: Forwarded to ``extract_image_filter``.
        xref: Object number used for the file name and for the result.
        options: Not used by this function.

    Returns:
        ``XrefExt(xref, ext)`` if a file was written; ``None`` when the image
        is not eligible or pikepdf raises ``UnsupportedImageTypeError``.
    """
    filtered = extract_image_filter(pike, root, image, xref)
    if filtered is None:
        return None
    pim, filtdp = filtered

    is_candidate = (
        pim.bits_per_component == 1
        and filtdp[0] != Name.JBIG2Decode
        and jbig2enc.available()
    )
    if not is_candidate:
        return None

    # Remember whatever colorspace the image carries so that the export is a
    # pure 1-bit PNG with no palette or ICC profile. Showing the palette or
    # ICC to jbig2enc would make it perform its own colorspace transform to
    # 1 bpp, which would conflict with the palette or ICC when one exists.
    saved_colorspace = pim.obj.get(pikepdf.Name.ColorSpace, None)
    if saved_colorspace is None and not pim.image_mask:
        return None

    target = root / f'{xref:08d}'
    try:
        # Temporarily present the image as DeviceGray; it is already 1 bpc.
        pim.obj.ColorSpace = pikepdf.Name.DeviceGray
        with target.open('wb') as stream:
            suffix = pim.extract_to(stream=stream)
        target.rename(target.with_suffix(suffix))
    except pikepdf.UnsupportedImageTypeError:
        return None
    finally:
        # Undo the temporary DeviceGray override, whatever happened above.
        if saved_colorspace is None:
            del pim.obj.ColorSpace
        else:
            pim.obj.ColorSpace = saved_colorspace
    return XrefExt(xref, suffix)
def report_output_file_size(options, input_file, output_file):
    """Warn when the output file is substantially larger than the input.

    The sizes of both files are compared after allowing a fixed overhead plus
    a per-page overhead for the output. If the adjusted ratio reaches 1.35 and
    the input is at least 25000 bytes, a warning is logged that lists the
    option settings and missing optional tools that may explain the growth.

    Args:
        options: Object providing ``optimize``, ``output_type`` and
            ``plugins``; the image preprocessing flags are read with
            ``getattr`` and may be absent.
        input_file: Path of the input file.
        output_file: Path of the output PDF.

    Returns:
        None. Returns silently when either file does not exist, when the
        input is smaller than 25000 bytes, or when the growth is within the
        accepted ratio.
    """
    try:
        output_size = Path(output_file).stat().st_size
        input_size = Path(input_file).stat().st_size
    except FileNotFoundError:
        return  # Outputting to stream or something

    # Small inputs are never reported. Testing this before any arithmetic
    # avoids a ZeroDivisionError for an empty input file and avoids opening
    # the output PDF when the result would be discarded anyway.
    if input_size < 25000:
        return

    # Overhead constants obtained by estimating amount of data added by OCR
    # PDF/A conversion, and possible XMP metadata addition, with compression
    FILE_OVERHEAD = 4000
    OCR_PER_PAGE_OVERHEAD = 3000
    with pikepdf.open(output_file) as p:
        page_count = len(p.pages)
    reasonable_overhead = FILE_OVERHEAD + OCR_PER_PAGE_OVERHEAD * page_count

    ratio = output_size / input_size
    reasonable_ratio = output_size / (input_size + reasonable_overhead)
    if reasonable_ratio < 1.35:
        return  # Seems fine

    # A tuple rather than a set, so the reasons are always listed in the same
    # order regardless of string hash randomization.
    image_preproc = (
        'deskew',
        'clean_final',
        'remove_background',
        'oversample',
        'force_ocr',
    )
    reasons = [
        f"The argument --{arg.replace('_', '-')} was issued, causing transcoding."
        for arg in image_preproc
        if getattr(options, arg, False)
    ]

    if options.optimize == 0:
        reasons.append("Optimization was disabled.")
    else:
        image_optimizers = {
            'jbig2': jbig2enc.available(),
            'pngquant': pngquant.available(),
        }
        reasons.extend(
            f"The optional dependency '{name}' was not found, so some image "
            f"optimizations could not be attempted."
            for name, available in image_optimizers.items()
            if not available
        )
    if options.output_type.startswith('pdfa'):
        reasons.append("PDF/A conversion was enabled. (Try `--output-type pdf`.)")
    if options.plugins:
        reasons.append("Plugins were used.")

    if reasons:
        explanation = "Possible reasons for this include:\n" + '\n'.join(reasons) + "\n"
    else:
        explanation = "No reason for this increase is known. Please report this issue."

    log.warning(
        f"The output file size is {ratio:.2f}× larger than the input file.\n"
        f"{explanation}"
    )
def extract_image_jbig2(
    *, pike: pikepdf.Pdf, root: Path, image: Object, xref: Xref, options
) -> Optional[XrefExt]:
    """Write a 1-bit image into ``root`` as a candidate for JBIG2 encoding.

    The image is first passed through ``extract_image_filter``. Extraction is
    attempted only when that call yields a result, the image has one bit per
    component, its first filter entry is not ``/JBIG2Decode``, and
    ``jbig2enc.available()`` is true.

    While the image is exported its ``/ColorSpace`` is temporarily replaced
    with ``/DeviceGray`` and restored afterwards, so that the exported file
    carries no palette or ICC profile.

    Args:
        pike: Forwarded to ``extract_image_filter``.
        root: Directory receiving the extracted file. The file is named with
            the zero-padded ``xref`` and the suffix returned by the extractor.
        image: Forwarded to ``extract_image_filter``.
        xref: Object number used for the file name and for the result.
        options: Not used by this function.

    Returns:
        ``XrefExt(xref, ext)`` if a file was written; ``None`` when the image
        is not eligible, has neither a colorspace nor the image-mask flag, or
        pikepdf raises ``UnsupportedImageTypeError``.
    """
    result = extract_image_filter(pike, root, image, xref)
    if result is None:
        return None
    pim, filtdp = result

    if (
        pim.bits_per_component == 1
        and filtdp[0] != Name.JBIG2Decode
        and jbig2enc.available()
    ):
        # Save any colorspace associated with the image so the export is a
        # pure 1-bit image without palette or ICC profile. Exposing a palette
        # or ICC profile to jbig2enc would make it apply its own transform to
        # 1 bpp, which conflicts with the palette or ICC kept in the PDF.
        colorspace = pim.obj.get(Name.ColorSpace, None)
        if colorspace is not None or pim.image_mask:
            try:
                # Present the image as DeviceGray for the export only; the
                # data is already 1 bit per component.
                pim.obj.ColorSpace = Name.DeviceGray
                imgname = root / f'{xref:08d}'
                with imgname.open('wb') as f:
                    ext = pim.extract_to(stream=f)
                imgname.rename(imgname.with_suffix(ext))
            except pikepdf.UnsupportedImageTypeError:
                return None
            finally:
                # Always put the image dictionary back the way it was, even
                # when extraction failed.
                if colorspace is not None:
                    pim.obj.ColorSpace = colorspace
                else:
                    del pim.obj.ColorSpace
            return XrefExt(xref, ext)
    return None
resources / 'crom.png', outpdf, '--image-dpi', '200', '--optimize', '3', '--jpg-quality', '50', '--png-quality', '20', '--plugin', 'tests/plugins/tesseract_noop.py', ) @pytest.mark.skipif(not jbig2enc.available(), reason='need jbig2enc') @pytest.mark.parametrize('lossy', [False, True]) def test_jbig2_lossy(lossy, resources, outpdf): args = [ resources / 'ccitt.pdf', outpdf, '--image-dpi', '200', '--optimize', 3, '--jpg-quality', '50', '--png-quality', '20', '--plugin', 'tests/plugins/tesseract_noop.py',
import img2pdf
import pikepdf
import pytest
from PIL import Image, ImageDraw

from ocrmypdf import optimize as opt
from ocrmypdf._exec import jbig2enc, pngquant
from ocrmypdf._exec.ghostscript import rasterize_pdf
from ocrmypdf.helpers import Resolution

from .conftest import check_ocrmypdf

# Skip markers for tests that depend on optional external tools.
needs_pngquant = pytest.mark.skipif(
    not pngquant.available(), reason="pngquant not installed"
)
needs_jbig2enc = pytest.mark.skipif(
    not jbig2enc.available(), reason="jbig2enc not installed"
)


@needs_pngquant
@pytest.mark.parametrize('pdf', ['multipage.pdf', 'palette.pdf'])
def test_basic(resources, pdf, outpdf):
    """Level-3 optimization: 98% of the output size must not exceed the input size."""
    source_pdf = resources / pdf
    opt.main(source_pdf, outpdf, level=3)
    optimized_size = Path(outpdf).stat().st_size
    original_size = Path(source_pdf).stat().st_size
    assert 0.98 * optimized_size <= original_size


@needs_pngquant
def test_mono_not_inverted(resources, outdir):
    infile = resources / '2400dpi.pdf'