def rewrite_png(pike: Pdf, im_obj: Object, compdata) -> None:  # pragma: no cover
    """Replace the stream of *im_obj* with the PNG data held in *compdata*.

    When a PNG is inserted into a PDF, the IDAT section is more or less
    copied as-is and the rest of the PNG headers are transferred to PDF image
    metadata (width, height, bits per component, colorspace/palette).

    One thing that has to be communicated to the PDF reader is whether a
    predictor was applied to the image before Flate encoding (typically one
    is). According to the Leptonica source, PDF readers do not need the exact
    predictor, only a value of either:

      * 1      - no predictor
      * 10-14  - there is a predictor

    Leptonica's ``compdata->predictor`` only tells TRUE or FALSE. 10-14 means
    the actual predictor is specified in the data, so for any number >= 10
    the PDF reader will use whatever the PNG data specifies. In practice
    Leptonica should use Paeth, 14, but 15 seems to be the designated value
    for "optimal", so 15 is used here.

    See:
      - PDF RM 7.4.4.4 Table 10
      - https://github.com/DanBloomberg/leptonica/blob/master/src/pdfio2.c#L757

    Args:
        pike: PDF that will own the palette stream, if one is created.
        im_obj: image object whose dictionary entries and stream are replaced.
        compdata: compressed PNG data; this function reads its ``predictor``,
            ``bps``, ``spp``, ``w``, ``h`` and ``ncolors`` attributes and
            calls ``get_palette_pdf_string()`` and ``read()``.
    """
    uses_predictor = compdata.predictor > 0
    decode_parms = Dictionary(Predictor=15 if uses_predictor else 1)
    if uses_predictor:
        # Yes, BitsPerComponent is redundant with the image dictionary entry
        decode_parms.BitsPerComponent = compdata.bps
        decode_parms.Colors = compdata.spp
        decode_parms.Columns = compdata.w

    im_obj.BitsPerComponent = compdata.bps
    im_obj.Width = compdata.w
    im_obj.Height = compdata.h

    log.debug(
        f"PNG {im_obj.objgen}: palette={compdata.ncolors} spp={compdata.spp} bps={compdata.bps}"
    )

    if compdata.ncolors > 0:
        # .ncolors is the number of colors in the palette, not the number of
        # colors used in a true color image. The palette string is always
        # given as RGB tuples even when the image is grayscale; see
        # https://github.com/DanBloomberg/leptonica/blob/master/src/colormap.c#L2067
        parsed_palette = pikepdf.Object.parse(compdata.get_palette_pdf_string())
        palette_stream = pikepdf.Stream(pike, bytes(parsed_palette))
        colorspace = [
            Name.Indexed,
            Name.DeviceRGB,
            compdata.ncolors - 1,
            palette_stream,
        ]
    # ncolors == 0 means we are using a colorspace without a palette
    elif compdata.spp == 1:
        colorspace = Name.DeviceGray
    elif compdata.spp == 4:
        colorspace = Name.DeviceCMYK
    else:
        # spp == 3
        colorspace = Name.DeviceRGB

    im_obj.ColorSpace = colorspace
    im_obj.write(
        compdata.read(), filter=Name.FlateDecode, decode_parms=decode_parms
    )
def transcode_pngs(pike, images, image_name_fn, root, log, options):
    """Recompress extracted PNG images and write them back into the PDF.

    When ``options.optimize >= 2`` every image is first passed through
    ``pngquant.quantize`` on a thread pool, with the result written to
    ``png_name(root, xref)``. Afterwards each PNG at that path is opened with
    Leptonica and, unless it is larger than the stream currently stored in
    the PDF, its compressed data replaces the image object's stream.

    Images that cannot be opened, did not get smaller, or use a sample layout
    this function does not know how to describe are skipped and left
    untouched in the PDF.

    Args:
        pike: PDF providing ``get_object``; also owns any palette stream
            that is created.
        images: xref numbers of the images to process. Iterated more than
            once, so it must be re-iterable (e.g. a list).
        image_name_fn: callable ``(root, xref)`` giving the input file name
            handed to pngquant.
        root: passed through to ``image_name_fn`` and ``png_name``.
        log: logger; ``debug`` and ``error`` are used.
        options: ``optimize``, ``png_quality`` and ``jobs`` are read.

    Returns:
        None. The image objects in *pike* are modified in place.
    """
    if options.optimize >= 2:
        png_quality = (
            max(10, options.png_quality - 10),
            min(100, options.png_quality + 10),
        )
        futures = {}
        with concurrent.futures.ThreadPoolExecutor(
            max_workers=options.jobs
        ) as executor:
            for xref in images:
                input_name = image_name_fn(root, xref)
                log.debug(input_name)
                future = executor.submit(
                    pngquant.quantize,
                    input_name,
                    png_name(root, xref),
                    png_quality[0],
                    png_quality[1],
                )
                futures[future] = xref
        # Leaving the ``with`` block waits for all jobs. Report failures here;
        # otherwise an exception raised inside a worker would vanish together
        # with its discarded future.
        for future, xref in futures.items():
            exc = future.exception()
            if exc is not None:
                log.debug(f"pngquant: quantize failed for image {xref}: {exc!r}")

    # NOTE(review): this loop is taken to run for every optimize level, not
    # only when pngquant ran - confirm against the intended nesting.
    for xref in images:
        im_obj = pike.get_object(xref, 0)
        try:
            compdata = leptonica.CompressedData.open(png_name(root, xref))
        except leptonica.LeptonicaError as e:
            # Most likely this means file not found, i.e. quantize did not
            # produce an improved version
            log.error(e)
            continue

        # If re-coded image is larger don't use it - we test here because
        # pngquant knows the size of the temporary output file but not the
        # actual object in the PDF
        if len(compdata) > int(im_obj.stream_dict.Length):
            log.debug(
                f"pngquant: pngquant did not improve over original image "
                f"{len(compdata)} > {int(im_obj.stream_dict.Length)}"
            )
            continue

        # Resolve the colorspace before modifying im_obj, so that an image
        # we cannot describe is skipped while its PDF object is still intact.
        has_palette = compdata.ncolors > 0
        if has_palette:
            # .ncolors is the number of colors in the palette, not the number
            # of colors used in a true color image
            palette_pdf_string = compdata.get_palette_pdf_string()
            palette_data = pikepdf.Object.parse(palette_pdf_string)
            palette_stream = pikepdf.Stream(pike, bytes(palette_data))
            cs = [
                Name.Indexed,
                Name.DeviceRGB,
                compdata.ncolors - 1,
                palette_stream,
            ]
        elif compdata.spp == 1:
            # PDF interprets binary-1 as black in 1bpp, but PNG sets
            # black to 0 for 1bpp. Create a palette that informs the PDF
            # of the mapping - seems cleaner to go this way but pikepdf
            # needs to be patched to support it:
            #   [Name.Indexed, Name.DeviceGray, 1, b"\xff\x00"]
            cs = Name.DeviceGray
        elif compdata.spp == 3:
            cs = Name.DeviceRGB
        elif compdata.spp == 4:
            cs = Name.DeviceCMYK
        else:
            log.error(
                f"pngquant: unsupported samples per pixel ({compdata.spp}) "
                f"for image {xref}; keeping original image"
            )
            continue

        # When a PNG is inserted into a PDF, we more or less copy the IDAT
        # section and transfer the rest of the PNG headers to PDF image
        # metadata. One thing we have to do is tell the PDF reader whether a
        # predictor was used on the image before Flate encoding. (Typically
        # one is.) According to Leptonica source, PDF readers don't actually
        # need us to specify the correct predictor, they just need a value of
        # either:
        #   1 - no predictor
        #   10-14 - there is a predictor
        # Leptonica's compdata->predictor only tells TRUE or FALSE.
        # From there the PNG decoder can infer the rest from the file.
        # In practice the predictor should be Paeth, 14, so we'll use that.
        # See:
        #   - PDF RM 7.4.4.4 Table 10
        #   - https://github.com/DanBloomberg/leptonica/blob/master/src/pdfio2.c#L757
        predictor = 14 if compdata.predictor > 0 else 1
        dparms = Dictionary(Predictor=predictor)
        if predictor > 1:
            dparms.BitsPerComponent = compdata.bps  # Yes, this is redundant
            dparms.Colors = compdata.spp
            dparms.Columns = compdata.w

        im_obj.BitsPerComponent = compdata.bps
        im_obj.Width = compdata.w
        im_obj.Height = compdata.h
        # NOTE(review): the inversion is applied on the palette-less path
        # only - confirm against the intended nesting.
        if not has_palette and compdata.bps == 1:
            # Bit of a kludge but this inverts photometric too
            im_obj.Decode = [1, 0]
        im_obj.ColorSpace = cs
        im_obj.write(compdata.read(), filter=Name.FlateDecode, decode_parms=dparms)
def transcode_pngs(pike, images, image_name_fn, root, log, options):
    """Recompress extracted PNG images and write them back into the PDF.

    When ``options.optimize >= 2`` every image is first passed through
    ``pngquant.quantize`` on a thread pool, with the result written to
    ``png_name(root, xref)``. Afterwards each PNG at that path is opened with
    Leptonica and, unless it is larger than the stream currently stored in
    the PDF, its compressed data replaces the image object's stream.

    Images that cannot be opened, did not get smaller, or use a sample layout
    this function does not know how to describe are skipped and left
    untouched in the PDF.

    Args:
        pike: PDF providing ``get_object``; also owns any palette stream
            that is created.
        images: xref numbers of the images to process. Iterated more than
            once, so it must be re-iterable (e.g. a list).
        image_name_fn: callable ``(root, xref)`` giving the input file name
            handed to pngquant.
        root: passed through to ``image_name_fn`` and ``png_name``.
        log: logger; ``debug`` and ``error`` are used.
        options: ``optimize``, ``png_quality`` and ``jobs`` are read.

    Returns:
        None. The image objects in *pike* are modified in place.
    """
    if options.optimize >= 2:
        png_quality = (
            max(10, options.png_quality - 10),
            min(100, options.png_quality + 10),
        )
        futures = {}
        with concurrent.futures.ThreadPoolExecutor(
            max_workers=options.jobs
        ) as executor:
            for xref in images:
                input_name = image_name_fn(root, xref)
                log.debug(input_name)
                future = executor.submit(
                    pngquant.quantize,
                    input_name,
                    png_name(root, xref),
                    png_quality[0],
                    png_quality[1],
                )
                futures[future] = xref
        # Leaving the ``with`` block waits for all jobs. Report failures here;
        # otherwise an exception raised inside a worker would vanish together
        # with its discarded future.
        for future, xref in futures.items():
            exc = future.exception()
            if exc is not None:
                log.debug(f"pngquant: quantize failed for image {xref}: {exc!r}")

    # NOTE(review): this loop is taken to run for every optimize level, not
    # only when pngquant ran - confirm against the intended nesting.
    for xref in images:
        im_obj = pike.get_object(xref, 0)
        try:
            compdata = leptonica.CompressedData.open(png_name(root, xref))
        except leptonica.LeptonicaError as e:
            # Most likely this means file not found, i.e. quantize did not
            # produce an improved version
            log.error(e)
            continue

        # If re-coded image is larger don't use it - we test here because
        # pngquant knows the size of the temporary output file but not the
        # actual object in the PDF
        if len(compdata) > int(im_obj.stream_dict.Length):
            log.debug(
                f"pngquant: pngquant did not improve over original image "
                f"{len(compdata)} > {int(im_obj.stream_dict.Length)}"
            )
            continue

        # Resolve the colorspace before modifying im_obj, so that an image
        # we cannot describe is skipped while its PDF object is still intact.
        has_palette = compdata.ncolors > 0
        if has_palette:
            # .ncolors is the number of colors in the palette, not the number
            # of colors used in a true color image
            palette_pdf_string = compdata.get_palette_pdf_string()
            palette_data = pikepdf.Object.parse(palette_pdf_string)
            palette_stream = pikepdf.Stream(pike, bytes(palette_data))
            cs = [
                Name.Indexed,
                Name.DeviceRGB,
                compdata.ncolors - 1,
                palette_stream,
            ]
        elif compdata.spp == 1:
            # PDF interprets binary-1 as black in 1bpp, but PNG sets
            # black to 0 for 1bpp. Create a palette that informs the PDF
            # of the mapping - seems cleaner to go this way but pikepdf
            # needs to be patched to support it:
            #   [Name.Indexed, Name.DeviceGray, 1, b"\xff\x00"]
            cs = Name.DeviceGray
        elif compdata.spp == 3:
            cs = Name.DeviceRGB
        elif compdata.spp == 4:
            cs = Name.DeviceCMYK
        else:
            log.error(
                f"pngquant: unsupported samples per pixel ({compdata.spp}) "
                f"for image {xref}; keeping original image"
            )
            continue

        # When a PNG is inserted into a PDF, we more or less copy the IDAT
        # section and transfer the rest of the PNG headers to PDF image
        # metadata. One thing we have to do is tell the PDF reader whether a
        # predictor was used on the image before Flate encoding. (Typically
        # one is.) According to Leptonica source, PDF readers don't actually
        # need us to specify the correct predictor, they just need a value of
        # either:
        #   1 - no predictor
        #   10-14 - there is a predictor
        # Leptonica's compdata->predictor only tells TRUE or FALSE.
        # From there the PNG decoder can infer the rest from the file.
        # In practice the predictor should be Paeth, 14, so we'll use that.
        # See:
        #   - PDF RM 7.4.4.4 Table 10
        #   - https://github.com/DanBloomberg/leptonica/blob/master/src/pdfio2.c#L757
        predictor = 14 if compdata.predictor > 0 else 1
        dparms = Dictionary(Predictor=predictor)
        if predictor > 1:
            dparms.BitsPerComponent = compdata.bps  # Yes, this is redundant
            dparms.Colors = compdata.spp
            dparms.Columns = compdata.w

        im_obj.BitsPerComponent = compdata.bps
        im_obj.Width = compdata.w
        im_obj.Height = compdata.h
        # NOTE(review): the inversion is applied on the palette-less path
        # only - confirm against the intended nesting.
        if not has_palette and compdata.bps == 1:
            # Bit of a kludge but this inverts photometric too
            im_obj.Decode = [1, 0]
        im_obj.ColorSpace = cs
        im_obj.write(compdata.read(), filter=Name.FlateDecode, decode_parms=dparms)