def set_layer_visibility(pdf, layers_to_show):
    """Show only the optional-content layers named in *layers_to_show*.

    Rebuilds /Root/OCProperties so that the default configuration turns
    every layer OFF except those whose /Name appears in *layers_to_show*.

    Args:
        pdf: an open pikepdf.Pdf.
        layers_to_show: container of layer names to keep visible.

    Exits the process with status 1 if the PDF has no layer (OCG) data.
    """
    try:
        ocgs = pdf.Root.OCProperties.OCGs
    except (AttributeError, KeyError):
        logger.error("Unable to locate layers in PDF.")
        sys.exit(1)
    ocgs_on = []
    for ocg in ocgs:
        if ocg.Name in layers_to_show:
            logger.info("Layer %s will be visible.", ocg.Name)
            ocgs_on.append(ocg)
        else:
            logger.info("Layer %s will be hidden.", ocg.Name)
    # Default state is OFF; only the selected OCGs are switched ON.
    ocgs_config = pikepdf.Dictionary(
        BaseState=pikepdf.Name('/OFF'),
        ON=ocgs_on,
        Order=ocgs,
    )
    pdf.Root.OCProperties = pikepdf.Dictionary(
        D=ocgs_config,
        OCGs=ocgs,
    )
    # Needed for google-chrome (at least): remove per-OCG /Usage hints so the
    # viewer honors the ON/OFF configuration above.
    for ocg in ocgs:
        if '/Usage' not in ocg:
            # Fix: an OCG without a /Usage dictionary previously raised
            # AttributeError on ocg.Usage; /Usage is optional per the spec.
            continue
        if '/View' in ocg.Usage:
            del ocg.Usage.View
        if '/Print' in ocg.Usage:
            del ocg.Usage.Print
def transcode_pngs(pike, pngs, root, log, options):
    """Transcode previously-extracted PNGs back into the PDF as Flate streams.

    Args:
        pike: the open pikepdf.Pdf being optimized.
        pngs: iterable of image xrefs whose PNG files live under *root*.
        root: working directory containing the per-xref PNG files.
        log: logger for per-image diagnostics.
        options: parsed options (uses .optimize, .png_quality, .jobs).
    """
    if options.optimize >= 2:
        # Quality window of +/-10 around the requested quality, clamped to
        # pngquant's valid 10..100 range.
        png_quality = (
            max(10, options.png_quality - 10),
            min(100, options.png_quality + 10)
        )
        # Quantize all PNGs in parallel, overwriting each file in place.
        with concurrent.futures.ThreadPoolExecutor(
                max_workers=options.jobs) as executor:
            for xref in pngs:
                executor.submit(
                    pngquant.quantize,
                    png_name(root, xref), png_name(root, xref),
                    png_quality[0], png_quality[1])
    for xref in pngs:
        im_obj = pike.get_object(xref, 0)
        # Open, transcode (!), package for PDF
        try:
            pix = leptonica.Pix.open(png_name(root, xref))
            if pix.depth == 1:
                pix = pix.invert()  # PDF assumes 1 is black for monochrome
            compdata = pix.generate_pdf_ci_data(
                leptonica.lept.L_FLATE_ENCODE, 0
            )
        except leptonica.LeptonicaError as e:
            # Best effort: skip images leptonica cannot handle.
            log.error(e)
            continue
        # This is what we should be doing: open the compressed data without
        # transcoding. However this shifts each pixel row by one for some
        # reason.
        #compdata = leptonica.CompressedData.open(png_name(root, xref))
        if len(compdata) > int(im_obj.stream_dict.Length):
            continue  # If we produced a larger image, don't use
        # No predictor unless leptonica reports one (Null clears the entry).
        predictor = Null()
        if compdata.predictor > 0:
            predictor = pikepdf.Dictionary({'/Predictor': compdata.predictor})
        im_obj.BitsPerComponent = compdata.bps
        im_obj.Width = compdata.w
        im_obj.Height = compdata.h
        if compdata.ncolors > 0:
            # Palettized image: build an /Indexed color space whose lookup
            # table is stored in a new stream object.
            palette_pdf_string = compdata.get_palette_pdf_string()
            palette_data = pikepdf.Object.parse(palette_pdf_string)
            palette_stream = pikepdf.Stream(pike, bytes(palette_data))
            palette = [pikepdf.Name('/Indexed'), pikepdf.Name('/DeviceRGB'),
                       compdata.ncolors - 1, palette_stream]
            cs = palette
        else:
            # NOTE(review): cs stays unbound if spp is not 1, 3, or 4, which
            # would raise NameError at im_obj.ColorSpace below — confirm
            # leptonica can only report these sample counts here.
            if compdata.spp == 1:
                cs = pikepdf.Name('/DeviceGray')
            elif compdata.spp == 3:
                cs = pikepdf.Name('/DeviceRGB')
            elif compdata.spp == 4:
                cs = pikepdf.Name('/DeviceCMYK')
        im_obj.ColorSpace = cs
        im_obj.write(compdata.read(), pikepdf.Name('/FlateDecode'), predictor)
def merge(pdf_streams, names, outpath, first_page):
    """Concatenate the given PDF streams into a single file at *outpath*.

    The leading run of "contents" files is numbered with lowercase roman
    numerals; all following pages get decimal labels starting at *first_page*.
    Save progress is reported through a tqdm bar.
    """
    output = pikepdf.new()
    page_counts = []
    merge_progress = tqdm.tqdm(zip(pdf_streams, names), total=len(names),
                               desc="Merging PDFs")
    for stream, name in merge_progress:
        page_counts.append(append_pdf(output, stream, name))
    # add page numbering: count pages in the leading contents files only
    front_pages = 0
    for count, name in zip(page_counts, names):
        if not is_contents(name):
            break
        front_pages += count
    if front_pages != 0:
        output.Root.PageLabels = {
            "/Nums": [
                0, {"/S": pikepdf.Name("/r")},
                front_pages, {"/S": pikepdf.Name("/D"), "/St": first_page}
            ]
        }
    with tqdm.tqdm(total=100, desc="Writing PDF") as pbar:
        previous = 0

        def on_progress(percent):
            # pikepdf reports cumulative percent; feed tqdm the delta.
            nonlocal previous
            pbar.update(percent - previous)
            previous = percent

        output.save(outpath, progress=on_progress)
def convert_to_jbig2(pike, jbig2_groups, root, log, options):
    """
    Convert a group of JBIG2 images and insert into PDF.

    We use a group because JBIG2 works best with a symbol dictionary that
    spans multiple pages. When inserted back into the PDF, each JBIG2 must
    reference the symbol dictionary it is associated with. So convert a group
    at a time, and replace their streams with a parameter set that points to
    the appropriate dictionary.

    If too many pages shared the same dictionary JBIG2 encoding becomes more
    expensive and less efficient.

    NOTE(review): a second function with this exact name appears later in
    this file; at import time the later definition shadows this one —
    confirm which is intended to be live.
    """
    # Stage 1: encode every group concurrently. jbig2enc writes one shared
    # '<prefix>.sym' globals file plus one numbered file per input image.
    with concurrent.futures.ThreadPoolExecutor(
            max_workers=options.jobs) as executor:
        futures = []
        for group, xref_exts in jbig2_groups.items():
            prefix = 'group{:08d}'.format(group)
            future = executor.submit(
                jbig2enc.convert_group,
                cwd=fspath(root),
                infiles=(img_name(root, xref, ext) for xref, ext in xref_exts),
                out_prefix=prefix
            )
            futures.append(future)
        for future in concurrent.futures.as_completed(futures):
            proc = future.result()  # .result() re-raises encoder failures
            log.debug(proc.stderr.decode())
    # Stage 2: insert results — one shared /JBIG2Globals stream per group,
    # one rewritten image stream per xref.
    for group, xref_exts in jbig2_groups.items():
        prefix = 'group{:08d}'.format(group)
        jbig2_globals_data = (root / (prefix + '.sym')).read_bytes()
        jbig2_globals = pikepdf.Stream(pike, jbig2_globals_data)
        for n, xref_ext in enumerate(xref_exts):
            xref, _ = xref_ext
            jbig2_im_file = root / (prefix + '.{:04d}'.format(n))
            jbig2_im_data = jbig2_im_file.read_bytes()
            im_obj = pike.get_object(xref, 0)
            im_obj.write(
                jbig2_im_data,
                pikepdf.Name('/JBIG2Decode'),
                pikepdf.Dictionary({
                    '/JBIG2Globals': jbig2_globals
                })
            )
def transcode_jpegs(pike, jpegs, root, log, options):
    """Re-save each extracted JPEG at the configured quality and, when the
    result is smaller, write it back into the PDF as a /DCTDecode stream.

    Images that grow after re-encoding are left untouched.
    """
    for xref in jpegs:
        original = Path(jpg_name(root, xref))
        optimized = original.with_suffix('.opt.jpg')
        # This produces a debug warning from PIL
        # DEBUG:PIL.Image:Error closing: 'NoneType' object has no attribute
        # 'close'. Seems to be mostly harmless
        # https://github.com/python-pillow/Pillow/issues/1144
        with Image.open(fspath(original)) as image:
            image.save(fspath(optimized),
                       optimize=True,
                       quality=options.jpeg_quality)
        # pylint: disable=no-member
        if optimized.stat().st_size > original.stat().st_size:
            log.debug("xref %s, jpeg, made larger - skip", xref)
            continue
        compdata = leptonica.CompressedData.open(optimized)
        pdf_image = pike.get_object(xref, 0)
        pdf_image.write(compdata.read(), filter=pikepdf.Name('/DCTDecode'))
def update_dest(zoom_factor, current):
    """Rewrite a PDF destination as an /XYZ destination with a fixed zoom.

    Args:
        zoom_factor: zoom to apply in the new destination.
        current: existing destination array [page, fit-type, *params].

    Returns:
        A new pikepdf.Array of the form [page, /XYZ, left, top, zoom],
        carrying over whichever coordinates the original fit type provided.
    """
    dest = pikepdf.Array()
    dest.append(current[0])            # target page reference
    dest.append(pikepdf.Name("/XYZ"))
    dest.append(0)                     # left, default 0
    dest.append(0)                     # top, default 0
    dest.append(zoom_factor)
    dest_type = current[1]
    if dest_type == "/XYZ":
        dest[2] = current[2]
        dest[3] = current[3]
    elif dest_type in ("/FitH", "/FitBH"):
        # /FitH top — only a vertical coordinate is supplied
        dest[3] = current[2]
    elif dest_type in ("/FitV", "/FitBV"):
        # /FitV left — only a horizontal coordinate is supplied
        dest[2] = current[2]
    elif dest_type == "/FitR":
        # /FitR left bottom right top (PDF 32000-1, Table 151)
        dest[2] = current[2]
        # Bug fix: top is the fourth /FitR parameter (index 5); index 4 is
        # the right edge, which previously ended up in the /XYZ top slot.
        dest[3] = current[5]
    # ("/Fit","/FitB"): no coordinates — keep the (0, 0) defaults
    return dest
def convert_to_jbig2(pike, jbig2_groups, root, log, options):
    """Convert images to JBIG2 and insert into PDF.

    When the JBIG2 page group size is > 1 we do several JBIG2 images at once
    and build a symbol dictionary that will span several pages. Each JBIG2
    image must reference to its symbol dictionary. If too many pages shared
    the same dictionary JBIG2 encoding becomes more expensive and less
    efficient. The default value of 10 was determined through testing.
    Currently this must be lossy encoding since jbig2enc does not support
    refinement coding.

    When the JBIG2 symbolic coder is not used, each JBIG2 stands on its own
    and needs no dictionary. Currently this is must be lossless JBIG2.
    """
    # Runs the encoder; presumably writes '<prefix>.sym' and numbered image
    # files under *root* — TODO confirm against _produce_jbig2_images.
    _produce_jbig2_images(jbig2_groups, root, log, options)
    for group, xref_exts in jbig2_groups.items():
        prefix = f'group{group:08d}'
        jbig2_symfile = root / (prefix + '.sym')
        if jbig2_symfile.exists():
            # Shared symbol dictionary: wrap it in a stream referenced by
            # every image in this group via /JBIG2Globals.
            jbig2_globals_data = jbig2_symfile.read_bytes()
            jbig2_globals = pikepdf.Stream(pike, jbig2_globals_data)
            jbig2_globals_dict = pikepdf.Dictionary(
                {'/JBIG2Globals': jbig2_globals})
        elif options.jbig2_page_group_size == 1:
            # Generic (non-symbolic) coding: no globals stream is produced.
            jbig2_globals_dict = None
        else:
            # Group mode promised a symbol file; its absence is an error.
            raise FileNotFoundError(jbig2_symfile)
        for n, xref_ext in enumerate(xref_exts):
            xref, _ = xref_ext
            jbig2_im_file = root / (prefix + f'.{n:04d}')
            jbig2_im_data = jbig2_im_file.read_bytes()
            im_obj = pike.get_object(xref, 0)
            im_obj.write(
                jbig2_im_data,
                filter=pikepdf.Name('/JBIG2Decode'),
                decode_parms=jbig2_globals_dict,
            )
def main(tmpdirname, pdf_name):
    """Losslessly recompress the DCTDecode (JPEG) images of a PDF.

    Each qualifying JPEG stream is dumped into *tmpdirname*, run through
    the external `jpgcrush` tool, and written back. The result is saved as
    '<pdf_name without extension>.jpg.pdf'.

    Args:
        tmpdirname: directory for temporary JPEG files.
        pdf_name: path of the input PDF.
    """
    total_savings = 0
    logging.info('Processing %s.', pdf_name)
    img_num = 0
    # total_objs = num_image_objects(mypdf)
    # for image_obj in tqdm(image_objects(my_pdf), total=total_objs):
    with pikepdf.open(pdf_name) as my_pdf:  # fix: guarantee the Pdf is closed
        for image_obj in image_objects(my_pdf):
            if '/Filter' not in image_obj:
                continue
            # FIXME: to improve *a lot* — only plain /DCTDecode (possibly as a
            # one-element filter array) is handled.
            if (image_obj.Filter != '/DCTDecode'
                    and not (isinstance(image_obj.Filter, pikepdf.Array)
                             and len(image_obj.Filter) == 1
                             and image_obj.Filter[0] == '/DCTDecode')):
                continue
            # Only RGB/Gray images (directly or via /DeviceN) are safe here.
            if not (image_obj.ColorSpace in ('/DeviceRGB', '/DeviceGray')
                    or (isinstance(image_obj.ColorSpace, pikepdf.Array)
                        and image_obj.ColorSpace[0] == '/DeviceN'
                        and image_obj.ColorSpace[2] in ('/DeviceRGB',
                                                        '/DeviceGray'))):
                continue
            # FIXME: Enable this code to process more images
            # if not (image_obj.ColorSpace in ('/DeviceRGB', '/DeviceGray') or
            #         (isinstance(image_obj.ColorSpace, pikepdf.Array) and
            #          image_obj.ColorSpace[0] == '/ICCBased' and
            #          str(image_obj.ColorSpace[1].Alternate) in
            #          ('/DeviceRGB', '/DeviceGray'))):
            #     continue
            img_num += 1
            logging.debug('Found a JPEG as %s', image_obj.ColorSpace)
            tempname = os.path.join(tmpdirname, f'img-{img_num:05d}.jpg')
            # Fix: use context managers — the read handle was never closed.
            with open(tempname, 'wb') as source:
                size_before = source.write(image_obj.read_raw_bytes())
            logging.debug('Wrote %d bytes to the tempfile %s.',
                          size_before, tempname)
            subprocess.check_call(['jpgcrush', tempname])
            # # Unfortunately, the -purejpg of jhead is too aggressive and may
            # # strip way too much to the point of modifying the image, in some
            # # cases.
            # subprocess.check_call(['jhead', '-dt', '-dc', '-de', source.name])
            with open(tempname, 'rb') as targetfn:
                target = targetfn.read()
            size_after = len(target)
            logging.debug('Read back %d bytes from the tempfile %s.',
                          size_after, tempname)
            image_obj.write(target, filter=pikepdf.Name('/DCTDecode'))
            logging.debug('The image is back on the PDF file.')
            total_savings += size_before - size_after
        final_filename = os.path.splitext(pdf_name)[0] + '.jpg.pdf'
        logging.info('Saved %d bytes to create %s.',
                     total_savings, final_filename)
        my_pdf.save(final_filename)
# Demo script: add an outline (bookmark) entry whose action runs a
# JavaScript alert when activated.
import pikepdf
from pikepdf import Pdf, OutlineItem

with Pdf.open('TampaFD_TemporalPDF-4.pdf') as pdf:
    with pdf.open_outline() as outline:
        # /S /JavaScript selects a JavaScript action; /JS holds the script.
        alert_action = pikepdf.Dictionary(
            S=pikepdf.Name('/JavaScript'),
            JS="app.alert(\"Hello from Robin\");",
        )
        outline.root.append(
            OutlineItem('Test Alert Robin', action=alert_action))
    pdf.save('output1.pdf')
def run(self, progress_dlg):
    """Produce a copy of the input PDF with only the selected OC layers.

    Filters page content streams (and form XObjects) through
    self.filter_content, then prunes /Root/OCProperties to the kept OCGs.

    Args:
        progress_dlg: a GUI progress dialog (SetRange/Update/WasCancelled).

    Returns:
        The filtered pikepdf.Pdf, or None if nothing would remain or the
        user cancelled.
    """
    # open a new copy of the input
    output = pikepdf.Pdf.open(self.pdf.filename)
    self.colour_type = None
    # Nothing to do: all layers kept and no line-property overrides.
    if self.keep_ocs == 'all' and len(self.line_props) == 0:
        return output
    if self.keep_ocs is None and self.keep_non_oc == False:
        print(_('No layers selected, generated PDF would be blank.'))
        return None
    if len(self.page_range) == 0:
        # human input page range is 1-indexed
        page_range = range(1, len(output.pages) + 1)
    else:
        # get rid of duplicates and zeros in the page range
        page_range = list(set([p for p in self.page_range if p > 0]))
    n_page = len(page_range)
    progress_dlg.SetRange(n_page)
    Yield()
    # change the decimal precision because it's really high
    for p in page_range:
        # print(_('Processing layers in page {}...'.format(p)))
        # apply the filter and reassign the page contents
        newstream = self.filter_content(output.pages[p - 1])
        output.pages[p - 1].Contents = output.make_stream(newstream)
        # check if there are form xobjects, and if so, filter them as well
        if '/XObject' in output.pages[p - 1].Resources.keys():
            for k in output.pages[p - 1].Resources.XObject.keys():
                xobj = output.pages[p - 1].Resources.XObject[k]
                if '/OC' in xobj.keys():
                    # Resolve the layer name this XObject belongs to.
                    oc = None
                    if '/Name' in xobj.OC.keys():
                        oc = str(xobj.OC.Name)
                    elif '/OCGs' in xobj.OC.keys(
                    ) and '/Name' in xobj.OC.OCGs.keys():
                        # NOTE(review): str(...keys()) stringifies the key
                        # view, not the /Name value — looks like it should be
                        # str(xobj.OC.OCGs.Name); confirm intent.
                        oc = str(xobj.OC.OCGs.keys())
                    if oc in self.keep_ocs:
                        # Kept layer: re-filter only if it has line-property
                        # overrides; otherwise leave the stream untouched.
                        if oc in self.line_props.keys():
                            newstream = self.filter_content(xobj, layer=oc)
                            xobj.write(newstream)
                    else:
                        # if we don't want to keep it, just blank it out
                        newstream = b''
                        xobj.write(newstream)
                else:
                    # Non-OC form XObjects still get content filtering.
                    if xobj.Subtype == pikepdf.Name('/Form'):
                        newstream = self.filter_content(xobj)
                        xobj.write(newstream)
        progress_dlg.Update(page_range.index(p))
        Yield()
        if progress_dlg.WasCancelled():
            return None
    # edit the OCG listing in the root
    OCGs = [
        oc for oc in output.Root.OCProperties.OCGs
        if str(oc.Name) in self.keep_ocs
    ]
    output.Root.OCProperties.OCGs = OCGs
    # by default, unlock all layers and show all OCGs
    output.Root.OCProperties.D.Locked = []
    output.Root.OCProperties.D.Order = self.filter_ocg_order(
        output.Root.OCProperties.D.Order)
    output.remove_unreferenced_resources()
    return output