def extract_images(
    pike: Pdf,
    root: Path,
    options,
    extract_fn: Callable[..., Optional[XrefExt]],
) -> Iterator[Tuple[int, XrefExt]]:
    """Extract images from the PDF using extract_fn.

    Walks every page's /XObject resources and looks up each image's
    xref/ID number. Images that serve as soft masks (alpha transparency)
    are excluded. Because an image may appear on several pages (or several
    times on one page), the page number recorded for it is the first page
    on which it is seen.

    Currently Form XObjects and other containers that may hold images are
    not examined, nor are alternate images or thumbnails.

    extract_fn decides whether the image is worth extracting in this
    context; if so, it extracts the file itself and returns a tuple
    (xref, ext) where ext is the file extension.

    Yields (first page number, XrefExt) for each extracted image.
    """
    candidates: MutableSet[Xref] = set()
    smask_xrefs: MutableSet[Xref] = set()
    first_page_for_xref = {}
    error_count = 0

    for pageno, page in enumerate(pike.pages):
        try:
            xobjs = page.Resources.XObject
        except AttributeError:
            # Page has no XObject resources at all - nothing to scan.
            continue
        for _imname, image in dict(xobjs).items():
            if image.objgen[1] != 0:
                continue  # Ignore images in an incremental PDF
            xref = Xref(image.objgen[0])
            if hasattr(image, 'SMask'):
                # Ignore soft masks
                smask_xref = Xref(image.SMask.objgen[0])
                smask_xrefs.add(smask_xref)
                log.debug(f"Skipping image {smask_xref} because it is an SMask")
            candidates.add(xref)
            log.debug(f"Treating {xref} as an optimization candidate")
            # Record only the first page that references this image.
            first_page_for_xref.setdefault(xref, pageno)

    for xref in candidates - smask_xrefs:
        image = pike.get_object((xref, 0))
        try:
            result = extract_fn(
                pike=pike, root=root, image=image, xref=xref, options=options
            )
        except Exception:  # pylint: disable=broad-except
            log.exception(f"While extracting image xref {xref}, an error occurred")
            error_count += 1
        else:
            if result:
                _, ext = result
                yield first_page_for_xref[xref], XrefExt(xref, ext)
def convert_to_jbig2(
    pike: Pdf,
    jbig2_groups: Dict[int, List[XrefExt]],
    root: Path,
    options,
    executor: Executor,
) -> None:
    """Convert images to JBIG2 and insert into PDF.

    When the JBIG2 page group size is > 1, we encode several JBIG2 images at
    once and build a symbol dictionary that spans several pages. Each JBIG2
    image must reference its symbol dictionary. If too many pages share the
    same dictionary, JBIG2 encoding becomes more expensive and less
    efficient. The default value of 10 was determined through testing.
    Currently this must be lossy encoding, since jbig2enc does not support
    refinement coding.

    When the JBIG2 symbolic coder is not used, each JBIG2 stands on its own
    and needs no dictionary. Currently this must be lossless JBIG2.
    """
    jbig2_globals_dict: Optional[Dictionary]

    # Produces, per group, a shared symbol file 'groupNNNNNNNN.sym' (symbolic
    # mode only) and per-image files 'groupNNNNNNNN.0000', '.0001', ... in
    # root. The prefix/suffix scheme below must match what it writes.
    _produce_jbig2_images(jbig2_groups, root, options, executor)

    for group, xref_exts in jbig2_groups.items():
        prefix = f'group{group:08d}'
        jbig2_symfile = root / (prefix + '.sym')
        if jbig2_symfile.exists():
            # Symbolic mode: embed the shared symbol dictionary as a stream
            # and reference it from each image via /JBIG2Globals.
            jbig2_globals_data = jbig2_symfile.read_bytes()
            jbig2_globals = Stream(pike, jbig2_globals_data)
            jbig2_globals_dict = Dictionary(JBIG2Globals=jbig2_globals)
        elif options.jbig2_page_group_size == 1:
            # Standalone (generic) JBIG2: no shared dictionary is produced.
            jbig2_globals_dict = None
        else:
            # Group size > 1 but no symbol file: the encoder did not produce
            # its expected output, so fail loudly rather than embed garbage.
            raise FileNotFoundError(jbig2_symfile)

        for n, xref_ext in enumerate(xref_exts):
            xref, _ = xref_ext
            jbig2_im_file = root / (prefix + f'.{n:04d}')
            jbig2_im_data = jbig2_im_file.read_bytes()
            im_obj = pike.get_object(xref, 0)
            # Replace the image stream in place; the xref and page
            # references to it are unchanged.
            im_obj.write(
                jbig2_im_data,
                filter=Name.JBIG2Decode,
                decode_parms=jbig2_globals_dict,
            )
def transcode_jpegs(pike: Pdf, jpegs: Sequence[Xref], root: Path, options) -> None:
    """Re-encode each extracted JPEG at options.jpeg_quality and, when the
    result is smaller than the original, write it back into the PDF image
    object at that xref."""
    progress = tqdm(jpegs, desc="JPEGs", unit='image', disable=not options.progress_bar)
    for xref in progress:
        source_jpg = jpg_name(root, xref)
        candidate_jpg = source_jpg.with_suffix('.opt.jpg')

        # PIL may emit a mostly harmless debug warning while closing:
        # DEBUG:PIL.Image:Error closing: 'NoneType' object has no attribute
        # 'close'. See https://github.com/python-pillow/Pillow/issues/1144
        with Image.open(source_jpg) as im:
            im.save(candidate_jpg, optimize=True, quality=options.jpeg_quality)

        # Keep the original if re-encoding did not actually shrink the file.
        if candidate_jpg.stat().st_size > source_jpg.stat().st_size:
            log.debug("xref %s, jpeg, made larger - skip", xref)
            continue

        compdata = leptonica.CompressedData.open(candidate_jpg)
        im_obj = pike.get_object(xref, 0)
        im_obj.write(compdata.read(), filter=Name.DCTDecode)
def _transcode_png(pike: Pdf, filename: Path, xref: Xref) -> bool:
    """Replace the image at *xref* in *pike* with the PNG at *filename*.

    img2pdf wraps the PNG into a one-page PDF; the image stream from that
    page is copied into *pike* and the target image object's stream and
    dictionary keys are rewritten to match, while preserving keys that are
    essential to correct display and metadata.

    Returns True (the image is unconditionally rewritten).
    """
    output = filename.with_suffix('.png.pdf')
    with output.open('wb') as f:
        img2pdf.convert(fspath(filename), outputstream=f)

    with pikepdf.open(output) as pdf_image:
        # BUGFIX: .values() returns a view/iterable, not an iterator, so
        # calling next() on it directly raises TypeError. next(iter(...))
        # takes the first (sole) image on the generated page and is also
        # safe if the object happens to be an iterator already.
        foreign_image = next(iter(pdf_image.pages[0].images.values()))

        local_image = pike.copy_foreign(foreign_image)

        im_obj = pike.get_object(xref, 0)
        im_obj.write(
            local_image.read_raw_bytes(),
            filter=local_image.Filter,
            decode_parms=local_image.DecodeParms,
        )

        # Don't copy keys from the new image...
        del_keys = set(im_obj.keys()) - set(local_image.keys())
        # ...except for the keep_fields, which are essential to displaying
        # the image correctly and preserving its metadata. (/Decode arrays
        # and /SMaskInData are implicitly discarded prior to this point.)
        keep_fields = {
            '/ID',
            '/Intent',
            '/Interpolate',
            '/Mask',
            '/Metadata',
            '/OC',
            '/OPI',
            '/SMask',
            '/StructParent',
        }
        del_keys -= keep_fields
        for key in local_image.keys():
            # /Length is managed by pikepdf when the stream is written.
            if key != Name.Length and str(key) not in keep_fields:
                im_obj[key] = local_image[key]
        for key in del_keys:
            del im_obj[key]
    return True
def _transcode_png(pike: Pdf, filename: Path, xref: Xref) -> bool:
    """Replace the image at *xref* in *pike* with the PNG at *filename*.

    img2pdf wraps the PNG into a one-page PDF; the image stream from that
    page is copied into *pike*, and every dictionary key of the new image
    (except /Length) overwrites the old image object's keys.

    NOTE(review): this module defines _transcode_png twice with the same
    name; the later definition shadows the earlier one at import time.
    Confirm which variant is intended to be live and remove the other.

    Returns True (the image is unconditionally rewritten).
    """
    output = filename.with_suffix('.png.pdf')
    with output.open('wb') as f:
        img2pdf.convert(fspath(filename), outputstream=f)

    with pikepdf.open(output) as pdf_image:
        # BUGFIX: .values() returns a view/iterable, not an iterator, so
        # calling next() on it directly raises TypeError. next(iter(...))
        # takes the first (sole) image on the generated page.
        foreign_image = next(iter(pdf_image.pages[0].images.values()))

        local_image = pike.copy_foreign(foreign_image)

        im_obj = pike.get_object(xref, 0)
        im_obj.write(
            local_image.read_raw_bytes(),
            filter=local_image.Filter,
            decode_parms=local_image.DecodeParms,
        )

        # Drop keys unique to the old image; copy everything else from the
        # new image. /Length is managed by pikepdf when the stream is written.
        del_keys = set(im_obj.keys()) - set(local_image.keys())
        for key in local_image.keys():
            if key != Name.Length:
                im_obj[key] = local_image[key]
        for key in del_keys:
            del im_obj[key]
    # BUGFIX: the function is annotated -> bool but previously fell off the
    # end and returned None; return True like the sibling implementation.
    return True
def transcode_pngs(
    pike: Pdf,
    images: Sequence[Xref],
    image_name_fn: Callable[[Path, Xref], Path],
    root: Path,
    options,
) -> None:
    """Quantize extracted PNGs with pngquant in parallel and write the
    improved versions back into the PDF.

    Each image in *images* is quantized (via pngquant) from the file named
    by image_name_fn(root, xref) into png_name(root, xref). Re-encoded
    images that are not smaller than the in-PDF original are discarded.
    1-bpp results are re-encoded as CCITT G4; others are embedded as Flate.
    """
    modified: MutableSet[Xref] = set()
    if options.optimize >= 2:
        # Give pngquant a +/-10 window around the requested quality,
        # clamped to [10, 100].
        png_quality = (
            max(10, options.png_quality - 10),
            min(100, options.png_quality + 10),
        )
    # NOTE(review): png_quality is only bound when options.optimize >= 2,
    # but pngquant_args() reads it unconditionally - presumably callers
    # only invoke this function when optimize >= 2; confirm, else this
    # raises NameError when the generator is consumed.

    def pngquant_args():
        # Lazily yields one argument tuple per image; xrefs are added to
        # `modified` as the executor consumes the generator.
        for xref in images:
            log.debug(image_name_fn(root, xref))
            yield (
                image_name_fn(root, xref),
                png_name(root, xref),
                png_quality[0],
                png_quality[1],
            )
            modified.add(xref)

    def pngquant_fn(args):
        # Worker task: run one pngquant quantization.
        pngquant.quantize(*args)

    exec_progress_pool(
        use_threads=True,
        max_workers=options.jobs,
        tqdm_kwargs=dict(
            desc="PNGs",
            total=len(images),
            unit='image',
            disable=not options.progress_bar,
        ),
        task=pngquant_fn,
        task_arguments=pngquant_args(),
    )

    for xref in modified:
        im_obj = pike.get_object(xref, 0)
        try:
            pix = leptonica.Pix.open(png_name(root, xref))
            if pix.mode == '1':
                # Bilevel image: G4 (fax) encoding is far smaller than Flate.
                compdata = pix.generate_pdf_ci_data(leptonica.lept.L_G4_ENCODE, 0)
            else:
                compdata = leptonica.CompressedData.open(png_name(root, xref))
        except leptonica.LeptonicaError as e:
            # Most likely this means file not found, i.e. quantize did not
            # produce an improved version
            log.error(e)
            continue

        # If re-coded image is larger don't use it - we test here because
        # pngquant knows the size of the temporary output file but not the actual
        # object in the PDF
        if len(compdata) > int(im_obj.stream_dict.Length):
            log.debug(
                f"pngquant: pngquant did not improve over original image "
                f"{len(compdata)} > {int(im_obj.stream_dict.Length)}")
            continue
        if compdata.type == leptonica.lept.L_FLATE_ENCODE:
            rewrite_png(pike, im_obj, compdata)
        elif compdata.type == leptonica.lept.L_G4_ENCODE:
            rewrite_png_as_g4(pike, im_obj, compdata)