def test_stack_abuse():
    p = pikepdf.Pdf.new()

    stream = pikepdf.Stream(p, b'q ' * 35)
    with pytest.warns(None) as record:
        pdfinfo.info._interpret_contents(stream)
    assert 'overflowed' in str(record[0].message)

    stream = pikepdf.Stream(p, b'q Q Q Q Q')
    with pytest.warns(None) as record:
        pdfinfo.info._interpret_contents(stream)
    assert 'underflowed' in str(record[0].message)

    stream = pikepdf.Stream(p, b'q ' * 135)
    with pytest.warns(None):
        with pytest.raises(RuntimeError):
            pdfinfo.info._interpret_contents(stream)
def rewrite_png(pike: Pdf, im_obj: Object, compdata) -> None:  # pragma: no cover
    # When a PNG is inserted into a PDF, we more or less copy the IDAT section
    # from the PNG and transfer the rest of the PNG headers to PDF image metadata.
    # One thing we have to do is tell the PDF reader whether a predictor was used
    # on the image before Flate encoding. (Typically one is.)
    # According to Leptonica source, PDF readers don't actually need us
    # to specify the correct predictor; they just need a value of either:
    #   1 - no predictor
    #   10-14 - there is a predictor
    # Leptonica's compdata->predictor only tells TRUE or FALSE.
    # 10-14 means the actual predictor is specified in the data, so for any
    # number >= 10 the PDF reader will use whatever the PNG data specifies.
    # In practice Leptonica should use Paeth, 14, but 15 seems to be the
    # designated value for "optimal". So we will use 15.
    # See:
    #   - PDF RM 7.4.4.4 Table 10
    #   - https://github.com/DanBloomberg/leptonica/blob/master/src/pdfio2.c#L757
    predictor = 15 if compdata.predictor > 0 else 1
    dparms = Dictionary(Predictor=predictor)
    if predictor > 1:
        dparms.BitsPerComponent = compdata.bps  # Yes, this is redundant
        dparms.Colors = compdata.spp
        dparms.Columns = compdata.w

    im_obj.BitsPerComponent = compdata.bps
    im_obj.Width = compdata.w
    im_obj.Height = compdata.h

    log.debug(
        f"PNG {im_obj.objgen}: palette={compdata.ncolors} "
        f"spp={compdata.spp} bps={compdata.bps}"
    )
    if compdata.ncolors > 0:
        # .ncolors is the number of colors in the palette, not the number of
        # colors used in a true color image. The palette string is always
        # given as RGB tuples even when the image is grayscale; see
        # https://github.com/DanBloomberg/leptonica/blob/master/src/colormap.c#L2067
        palette_pdf_string = compdata.get_palette_pdf_string()
        palette_data = pikepdf.Object.parse(palette_pdf_string)
        palette_stream = pikepdf.Stream(pike, bytes(palette_data))
        palette = [Name.Indexed, Name.DeviceRGB, compdata.ncolors - 1, palette_stream]
        cs = palette
    else:
        # ncolors == 0 means we are using a colorspace without a palette
        if compdata.spp == 1:
            cs = Name.DeviceGray
        elif compdata.spp == 4:
            cs = Name.DeviceCMYK
        else:  # spp == 3
            cs = Name.DeviceRGB
    im_obj.ColorSpace = cs
    im_obj.write(compdata.read(), filter=Name.FlateDecode, decode_parms=dparms)
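# --- Hedged illustration (not part of the original module) of the Predictor
# rule documented in rewrite_png: per PDF RM 7.4.4.4, Predictor 1 means "no
# prediction" and any value in 10-15 means "PNG prediction, tagged per row in
# the data itself". This sketch stores a tiny unpredicted Flate image using
# the same write() idiom as above and confirms pikepdf decodes it; the object
# names here are illustrative.
import zlib

import pikepdf
from pikepdf import Dictionary, Name

pdf = pikepdf.Pdf.new()
raw = bytes([0x00, 0xFF, 0xFF, 0x00])  # 2x2 pixels, 8-bit grayscale
im = pikepdf.Stream(pdf, b'')
im.Type = Name.XObject
im.Subtype = Name.Image
im.Width = 2
im.Height = 2
im.BitsPerComponent = 8
im.ColorSpace = Name.DeviceGray
im.write(
    zlib.compress(raw),
    filter=Name.FlateDecode,
    decode_parms=Dictionary(Predictor=1),  # 1 = no predictor was applied
)
assert im.read_bytes() == raw  # pikepdf applies FlateDecode on read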
def mergePage(layerPage, mainPage, pdf, name) -> None:
    contentsForName = pdf.copy_foreign(pikepdf.Page(layerPage).as_form_xobject())
    newContents = b'q\n %s Do\nQ\n' % (name.encode())
    if not mainPage.Resources.get("/XObject"):
        mainPage.Resources["/XObject"] = pikepdf.Dictionary({})
    mainPage.Resources["/XObject"][name] = contentsForName
    # Use the MediaBox from the merged page
    mainPage.MediaBox = layerPage.MediaBox
    mainPage.page_contents_add(contents=pikepdf.Stream(pdf, newContents), prepend=True)
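# --- Hypothetical usage of mergePage above (file names and the XObject name
# are illustrative, not from the original source): stamp page 0 of an overlay
# PDF onto every page of a document. Assumes each target page already has a
# /Resources dictionary, as mergePage itself does.
import pikepdf

with pikepdf.open('in.pdf') as pdf, pikepdf.open('overlay.pdf') as overlay:
    for page in pdf.pages:
        mergePage(overlay.pages[0], page, pdf, '/Overlay')
    pdf.save('out.pdf')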
def generate_booklet(pdfqueue, tmp_dir, pages):
    file, filename = make_tmp_file(tmp_dir)
    content_dict = pikepdf.Dictionary({})
    file_indexes = {p.nfile for p in pages}
    source_files = {n: pikepdf.open(pdfqueue[n - 1].copyname) for n in file_indexes}
    for i in range(len(pages) // 2):
        even = i % 2 == 0
        first = pages[-i - 1 if even else i]
        second = pages[i if even else -i - 1]
        second_page_size = second.size_in_points()
        first_page_size = first.size_in_points()
        page_size = [
            max(second_page_size[0], first_page_size[0]) * 2,
            max(second_page_size[1], first_page_size[1]),
        ]
        first_original = source_files[first.nfile].pages[first.npage - 1]
        first_foreign = _apply_geom_transform(
            file, file.copy_foreign(first_original), first)
        second_original = source_files[second.nfile].pages[second.npage - 1]
        second_foreign = _apply_geom_transform(
            file, file.copy_foreign(second_original), second)

        content_dict[f'/Page{i*2}'] = pikepdf.Page(first_foreign).as_form_xobject()
        content_dict[f'/Page{i*2 + 1}'] = pikepdf.Page(second_foreign).as_form_xobject()
        # See PDF reference section 4.2.3 Transformation Matrices
        tx1 = -first_foreign.MediaBox[0]
        ty1 = -first_foreign.MediaBox[1]
        tx2 = first_page_size[0] - float(second_foreign.MediaBox[0])
        ty2 = -second_foreign.MediaBox[1]
        content_txt = (f"q 1 0 0 1 {tx1} {ty1} cm /Page{i*2} Do Q "
                       f"q 1 0 0 1 {tx2} {ty2} cm /Page{i*2 + 1} Do Q ")
        newpage = pikepdf.Dictionary(
            Type=pikepdf.Name.Page,
            MediaBox=[0, 0, *page_size],
            Resources=pikepdf.Dictionary(XObject=content_dict),
            Contents=pikepdf.Stream(file, content_txt.encode()))
        # workaround for pikepdf <= 2.6.0. See https://github.com/pikepdf/pikepdf/issues/174
        if pikepdf.__version__ < '2.7.0':
            newpage = file.make_indirect(newpage)
        file.pages.append(newpage)
    file.save(filename)
    return filename
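# --- Worked illustration (illustrative values, not from the original source)
# of the pairing logic in generate_booklet: for saddle-stitch binding, each
# output sheet pairs a page from the back of the document with one from the
# front, and the alternating `even` flag flips which side each lands on.
# For an 8-page document the sheets come out as (8,1), (2,7), (6,3), (4,5).
pages = list(range(1, 9))
sheets = []
for i in range(len(pages) // 2):
    even = i % 2 == 0
    first = pages[-i - 1 if even else i]
    second = pages[i if even else -i - 1]
    sheets.append((first, second))
assert sheets == [(8, 1), (2, 7), (6, 3), (4, 5)]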
def generate_booklet(pdfqueue, tmp_dir, pages):
    file, filename = make_tmp_file(tmp_dir)
    content_dict = pikepdf.Dictionary({})
    file_indexes = {p.nfile for p in pages}
    source_files = {n: pikepdf.open(pdfqueue[n - 1].copyname) for n in file_indexes}
    for i in range(len(pages) // 2):
        even = i % 2 == 0
        first = pages[-i - 1 if even else i]
        second = pages[i if even else -i - 1]
        second_page_size = second.size_in_points()
        first_page_size = first.size_in_points()
        page_size = [
            max(second_page_size[0], first_page_size[0]) * 2,
            max(second_page_size[1], first_page_size[1]),
        ]
        first_original = source_files[first.nfile].pages[first.npage - 1]
        first_foreign = file.copy_foreign(first_original)
        _update_angle(first, first_original, first_foreign)
        second_original = source_files[second.nfile].pages[second.npage - 1]
        second_foreign = file.copy_foreign(second_original)
        _update_angle(second, second_original, second_foreign)

        content_dict[f'/Page{i*2}'] = pikepdf.Page(first_foreign).as_form_xobject()
        content_dict[f'/Page{i*2 + 1}'] = pikepdf.Page(second_foreign).as_form_xobject()
        content_txt = (f'q 1 0 0 1 0 0 cm /Page{i*2} Do Q'
                       f' q 1 0 0 1 {first_page_size[0]} 0 cm /Page{i*2 + 1} Do Q ')
        newpage = pikepdf.Dictionary(
            Type=pikepdf.Name.Page,
            MediaBox=[0, 0, *page_size],
            Resources=pikepdf.Dictionary(XObject=content_dict),
            Contents=pikepdf.Stream(file, content_txt.encode()))
        # workaround for pikepdf <= 2.6.0. See https://github.com/pikepdf/pikepdf/issues/174
        if pikepdf.__version__ < '2.7.0':
            newpage = file.make_indirect(newpage)
        file.pages.append(newpage)
    file.save(filename)
    return filename
def convert_to_jbig2(pike, jbig2_groups, root, log, options):
    """Convert a group of JBIG2 images and insert into PDF.

    We use a group because JBIG2 works best with a symbol dictionary that
    spans multiple pages. When inserted back into the PDF, each JBIG2 must
    reference the symbol dictionary it is associated with. So convert a group
    at a time, and replace their streams with a parameter set that points to
    the appropriate dictionary.

    If too many pages share the same dictionary, JBIG2 encoding becomes more
    expensive and less efficient.
    """
    with concurrent.futures.ThreadPoolExecutor(max_workers=options.jobs) as executor:
        futures = []
        for group, xref_exts in jbig2_groups.items():
            prefix = 'group{:08d}'.format(group)
            future = executor.submit(
                jbig2enc.convert_group,
                cwd=fspath(root),
                infiles=(img_name(root, xref, ext) for xref, ext in xref_exts),
                out_prefix=prefix,
            )
            futures.append(future)
        for future in concurrent.futures.as_completed(futures):
            proc = future.result()
            log.debug(proc.stderr.decode())

    for group, xref_exts in jbig2_groups.items():
        prefix = 'group{:08d}'.format(group)
        jbig2_globals_data = (root / (prefix + '.sym')).read_bytes()
        jbig2_globals = pikepdf.Stream(pike, jbig2_globals_data)

        for n, xref_ext in enumerate(xref_exts):
            xref, _ = xref_ext
            jbig2_im_file = root / (prefix + '.{:04d}'.format(n))
            jbig2_im_data = jbig2_im_file.read_bytes()
            im_obj = pike.get_object(xref, 0)
            im_obj.write(
                jbig2_im_data,
                pikepdf.Name('/JBIG2Decode'),
                pikepdf.Dictionary({'/JBIG2Globals': jbig2_globals}),
            )
def strip_invisible_text(pdf, page):
    stream = []
    in_text_obj = False
    render_mode = 0
    text_objects = []

    rich_page = pikepdf.Page(page)
    rich_page.contents_coalesce()
    for operands, operator in pikepdf.parse_content_stream(page, ''):
        if not in_text_obj:
            if operator == pikepdf.Operator('BT'):
                in_text_obj = True
                render_mode = 0
                text_objects.append((operands, operator))
            else:
                stream.append((operands, operator))
        else:
            if operator == pikepdf.Operator('Tr'):
                render_mode = operands[0]
            text_objects.append((operands, operator))
            if operator == pikepdf.Operator('ET'):
                in_text_obj = False
                if render_mode != 3:
                    stream.extend(text_objects)
                text_objects.clear()

    def convert(op):
        try:
            return op.unparse()
        except AttributeError:
            return str(op).encode('ascii')

    lines = []
    for operands, operator in stream:
        if operator == pikepdf.Operator('INLINE IMAGE'):
            iim = operands[0]
            line = iim.unparse()
        else:
            line = b' '.join(convert(op) for op in operands) + b' ' + operator.unparse()
        lines.append(line)

    content_stream = b'\n'.join(lines)
    page.Contents = pikepdf.Stream(pdf, content_stream)
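# --- Minimal sketch of what strip_invisible_text removes (assuming a pikepdf
# version contemporary with the helper above; the content stream here is
# illustrative): a BT..ET block whose text rendering mode is 3, i.e.
# invisible text per PDF 32000-1:2008 section 9.3.6.
import pikepdf

pdf = pikepdf.Pdf.new()
pdf.add_blank_page(page_size=(72, 72))
page = pdf.pages[0]
page.Contents = pikepdf.Stream(pdf, b"BT 3 Tr (invisible) Tj ET")
strip_invisible_text(pdf, page)
assert b'Tj' not in page.Contents.read_bytes()  # the text object is gone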
def strip_invisible_text(pdf, page):
    stream = []
    in_text_obj = False
    render_mode = 0
    text_objects = []

    page.page_contents_coalesce()
    for operands, operator in pikepdf.parse_content_stream(page, ""):
        if not in_text_obj:
            if operator == pikepdf.Operator("BT"):
                in_text_obj = True
                render_mode = 0
                text_objects.append((operands, operator))
            else:
                stream.append((operands, operator))
        else:
            if operator == pikepdf.Operator("Tr"):
                render_mode = operands[0]
            text_objects.append((operands, operator))
            if operator == pikepdf.Operator("ET"):
                in_text_obj = False
                if render_mode != 3:
                    stream.extend(text_objects)
                text_objects.clear()

    def convert(op):
        try:
            return op.unparse()
        except AttributeError:
            return str(op).encode("ascii")

    lines = []
    for operands, operator in stream:
        if operator == pikepdf.Operator("INLINE IMAGE"):
            iim = operands[0]
            line = iim.unparse()
        else:
            line = b" ".join(convert(op) for op in operands) + b" " + operator.unparse()
        lines.append(line)

    content_stream = b"\n".join(lines)
    page.Contents = pikepdf.Stream(pdf, content_stream)
def test_image_scale0(resources, outpdf):
    with pikepdf.open(resources / 'cmyk.pdf') as cmyk:
        xobj = pikepdf.Page(cmyk.pages[0]).as_form_xobject()
        p = pikepdf.Pdf.new()
        p.add_blank_page(page_size=(72, 72))
        objname = pikepdf.Page(p.pages[0]).add_resource(
            p.copy_foreign(xobj), pikepdf.Name.XObject, pikepdf.Name.Im0
        )
        print(objname)
        p.pages[0].Contents = pikepdf.Stream(
            p, b"q 0 0 0 0 0 0 cm %s Do Q" % bytes(objname)
        )
        p.save(outpdf)

    pi = pdfinfo.PdfInfo(outpdf, detailed_analysis=True, progbar=False, max_workers=1)
    assert not pi.pages[0]._images[0].dpi.is_finite
    assert pi.pages[0].dpi == Resolution(0, 0)
def convert_to_jbig2(
    pike: Pdf,
    jbig2_groups: Dict[int, List[XrefExt]],
    root: Path,
    options,
    executor: Executor,
) -> None:
    """Convert images to JBIG2 and insert into PDF.

    When the JBIG2 page group size is > 1, we do several JBIG2 images at once
    and build a symbol dictionary that will span several pages. Each JBIG2
    image must reference its symbol dictionary. If too many pages share the
    same dictionary, JBIG2 encoding becomes more expensive and less efficient.
    The default value of 10 was determined through testing. Currently this
    must be lossy encoding since jbig2enc does not support refinement coding.

    When the JBIG2 symbolic coder is not used, each JBIG2 stands on its own
    and needs no dictionary. Currently this must be lossless JBIG2.
    """
    _produce_jbig2_images(jbig2_groups, root, options, executor)

    for group, xref_exts in jbig2_groups.items():
        prefix = f'group{group:08d}'
        jbig2_symfile = root / (prefix + '.sym')
        if jbig2_symfile.exists():
            jbig2_globals_data = jbig2_symfile.read_bytes()
            jbig2_globals = pikepdf.Stream(pike, jbig2_globals_data)
            jbig2_globals_dict = Dictionary(JBIG2Globals=jbig2_globals)
        elif options.jbig2_page_group_size == 1:
            jbig2_globals_dict = None
        else:
            raise FileNotFoundError(jbig2_symfile)

        for n, xref_ext in enumerate(xref_exts):
            xref, _ = xref_ext
            jbig2_im_file = root / (prefix + f'.{n:04d}')
            jbig2_im_data = jbig2_im_file.read_bytes()
            im_obj = pike.get_object(xref, 0)
            im_obj.write(
                jbig2_im_data, filter=Name.JBIG2Decode, decode_parms=jbig2_globals_dict
            )
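# --- Hedged post-condition sketch (not in the original module): when a
# symbol dictionary spans a group (options.jbig2_page_group_size > 1), every
# rewritten image in that group should reference the same /JBIG2Globals
# stream. `pike` and `jbig2_groups` are assumed to be in scope as in
# convert_to_jbig2 above, after it has run.
for xref_exts in jbig2_groups.values():
    globals_refs = {
        pike.get_object(xref, 0).DecodeParms.JBIG2Globals.objgen
        for xref, _ in xref_exts
    }
    assert len(globals_refs) == 1  # one shared symbol dictionary per group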
def test_malformed_docinfo(caplog, resources, outdir):
    generate_pdfa_ps(outdir / 'pdfa.ps')
    # copyfile(resources / 'trivial.pdf', outdir / 'layers.rendered.pdf')
    with pikepdf.open(resources / 'trivial.pdf') as pike:
        pike.trailer.Info = pikepdf.Stream(pike, b"<xml></xml>")
        pike.save(outdir / 'layers.rendered.pdf', fix_metadata_version=False)

    options = parser.parse_args(
        args=['-j', '1', '--output-type', 'pdfa-2', 'a.pdf', 'b.pdf'])
    pdfinfo = PdfInfo(outdir / 'layers.rendered.pdf')
    context = PDFContext(options, outdir, outdir / 'layers.rendered.pdf', pdfinfo)
    convert_to_pdfa(str(outdir / 'layers.rendered.pdf'), str(outdir / 'pdfa.ps'), context)

    print(caplog.records)
    assert any('malformed DocumentInfo block' in record.message
               for record in caplog.records)
def attach_notebook(pdf_in, pdf_out, notebook):
    N = pikepdf.Name
    main_pdf = pikepdf.open(pdf_in)

    the_file = pikepdf.Stream(main_pdf, notebook["contents"])
    the_file[N("/Type")] = N("/EmbeddedFile")

    file_wrapper = pikepdf.Dictionary(F=the_file)

    fname = notebook["file_name"]
    embedded_file = pikepdf.Dictionary(
        Type=N("/Filespec"), UF=fname, F=fname, EF=file_wrapper
    )

    name_tree = pikepdf.Array([pikepdf.String(fname), embedded_file])
    embedded_files = pikepdf.Dictionary(Names=name_tree)
    names = pikepdf.Dictionary(EmbeddedFiles=embedded_files)

    main_pdf.Root[N("/Names")] = names
    main_pdf.save(pdf_out)
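# --- Hypothetical invocation of attach_notebook above (file names are
# illustrative): embed a notebook's bytes into a rendered PDF under its
# original file name.
from pathlib import Path

notebook = {
    "file_name": "analysis.ipynb",
    "contents": Path("analysis.ipynb").read_bytes(),
}
attach_notebook("report.pdf", "report-with-notebook.pdf", notebook)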
def run(self, rows=0, cols=0, actually_trim=0):
    if self.in_doc is None:
        print(_('Input document not loaded'))
        return

    if len(self.page_range) == 0:
        self.set_page_range()

    # initialize a new document and copy over the layer info (OCGs) if it exists
    new_doc = pikepdf.Pdf.new()
    if '/OCProperties' in self.in_doc.Root.keys():
        localRoot = new_doc.copy_foreign(self.in_doc.Root)
        new_doc.Root.OCProperties = localRoot.OCProperties

    content_dict = pikepdf.Dictionary({})
    page_names = []
    pw = None
    ph = None
    page_size_ref = 0
    page_count = len(self.in_doc.pages)
    trim = [self.units_to_px(t) for t in self.trim]

    for p in self.page_range:
        if p > page_count:
            print(_('Only {} pages in document, skipping {}').format(page_count, p))
            continue

        if p > 0:
            pagekey = f'/Page{p}'
            if pagekey not in content_dict.keys():
                # copy the page over as an xobject
                # pikepdf.pages is zero indexed, so subtract one
                localpage = new_doc.copy_foreign(self.in_doc.pages[p - 1])

                # set the trim box to cut off content if requested
                if actually_trim == 1:
                    if '/TrimBox' not in localpage.keys():
                        localpage.TrimBox = copy.copy(localpage.MediaBox)
                    localpage.TrimBox[0] = float(localpage.TrimBox[0]) + trim[0]
                    localpage.TrimBox[1] = float(localpage.TrimBox[1]) + trim[3]
                    localpage.TrimBox[2] = float(localpage.TrimBox[2]) - trim[1]
                    localpage.TrimBox[3] = float(localpage.TrimBox[3]) - trim[2]

                content_dict[pagekey] = pikepdf.Page(localpage).as_form_xobject()

                # only get the width/height for the first page
                if pw is None:
                    pw = float(localpage.MediaBox[2])
                    ph = float(localpage.MediaBox[3])
                    page_size_ref = p
                elif (abs(pw - float(localpage.MediaBox[2])) > 1
                        or abs(ph - float(localpage.MediaBox[3])) > 1):
                    print(_('Warning: page {} is a different size from {}, '
                            'output may be unpredictable').format(p, page_size_ref))
            page_names.append(pagekey)
        else:
            page_names.append(None)

    # take the most common page width/height
    # create a new document with a page big enough to contain all the tiled
    # pages, plus requested margin
    # figure out how big it needs to be based on requested columns/rows
    n_tiles = len(page_names)
    if cols == 0 and rows == 0:
        # try for square
        cols = math.ceil(math.sqrt(n_tiles))
        rows = cols

    # columns take priority if both are specified
    if cols > 0:
        rrows = rows
        rows = math.ceil(n_tiles / cols)
        if rrows != rows and rrows != 0:
            print(_('Warning: requested {} columns and {} rows, but {} rows '
                    'are needed with {} pages').format(cols, rrows, rows, n_tiles))
    else:
        cols = math.ceil(n_tiles / rows)

    # convert the margin and trim options into pixels
    unitstr = 'cm' if self.units else 'in'
    margin = self.units_to_px(self.margin)

    rotstr = _('None')
    if self.rotation == 1:
        rotstr = _('Clockwise')
    if self.rotation == 2:
        rotstr = _('Counterclockwise')

    orderstr = _('Rows then columns')
    if self.col_major:
        orderstr = _('Columns then rows')

    lrstr = _('Left to right')
    if self.right_to_left:
        lrstr = _('Right to left')

    btstr = _('Top to bottom')
    if self.bottom_to_top:
        btstr = _('Bottom to top')

    print(_('Tiling with {} rows and {} columns').format(rows, cols))
    print(_('Options') + ':')
    print(' ' + _('Margins') + ': {} {}'.format(self.margin, unitstr))
    print(' ' + _('Trim') + ': {} {}'.format(self.trim, unitstr))
    print(' ' + _('Rotation') + ': {}'.format(rotstr))
    print(' ' + _('Page order') + ': {}, {}, {}'.format(orderstr, lrstr, btstr))

    # define the media box with the final grid + margins
    # run through the width/height combos to find the maximum required
    # R is the rotation matrix (default to identity)
    R = [1, 0, 0, 1]

    # We need to account for the shift in origin if page rotation is applied
    o_shift = [0, 0]

    if self.rotation != 0:
        # define the rotation transform and swap the trim order
        if self.rotation == 1:
            R = [0, -1, 1, 0]
            o_shift = [0, pw]
            order = [3, 2, 0, 1]

        if self.rotation == 2:
            R = [0, 1, -1, 0]
            o_shift = [ph, 0]
            order = [2, 3, 1, 0]

        # swap width and height of pages
        tmp = ph
        ph = pw
        pw = tmp
        trim = [trim[o] for o in order]

    # define the output page media box
    width = (pw - trim[0] - trim[1]) * cols
    height = (ph - trim[2] - trim[3]) * rows
    media_box = [0, 0, width + 2 * margin, height + 2 * margin]

    content_txt = ''

    for i in range(n_tiles):
        if not page_names[i]:
            continue

        if self.col_major:
            c = math.floor(i / rows)
            r = i % rows
        else:
            r = math.floor(i / cols)
            c = i % cols

        if self.right_to_left:
            c = cols - c - 1

        if not self.bottom_to_top:
            r = rows - r - 1

        x0 = margin - trim[0] + c * (pw - trim[0] - trim[1])
        y0 = margin - trim[3] + r * (ph - trim[2] - trim[3])

        # don't scale, just shift and rotate
        # first shift to origin, then rotate, then shift to final destination
        content_txt += (f'q {R[0]} {R[1]} {R[2]} {R[3]} '
                        f'{x0+o_shift[0]} {y0+o_shift[1]} cm ')
        content_txt += f'{page_names[i]} Do Q '

    newpage = pikepdf.Dictionary(
        Type=pikepdf.Name.Page,
        MediaBox=media_box,
        Resources=pikepdf.Dictionary(XObject=content_dict),
        Contents=pikepdf.Stream(new_doc, content_txt.encode()))

    new_doc.pages.append(newpage)
    return new_doc
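# --- Worked check (illustrative page size) of the rotation bookkeeping in
# run() above: for a clockwise rotation, R = [0, -1, 1, 0] maps (x, y) to
# (y, -x), so o_shift = [0, pw] lifts the rotated page back into the first
# quadrant. The PDF cm matrix [a b c d e f] sends (x, y) to
# (a*x + c*y + e, b*x + d*y + f).
pw, ph = 612, 792  # US Letter, portrait
R, o_shift = [0, -1, 1, 0], [0, pw]
corners = [(0, 0), (pw, 0), (pw, ph), (0, ph)]
rotated = [
    (R[0] * x + R[2] * y + o_shift[0], R[1] * x + R[3] * y + o_shift[1])
    for x, y in corners
]
assert all(x >= 0 and y >= 0 for x, y in rotated)  # back in the first quadrant
assert max(x for x, _ in rotated) == ph  # width and height have swapped
assert max(y for _, y in rotated) == pw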
def transcode_pngs(pike, images, image_name_fn, root, log, options):
    if options.optimize >= 2:
        png_quality = (
            max(10, options.png_quality - 10),
            min(100, options.png_quality + 10),
        )
        with concurrent.futures.ThreadPoolExecutor(max_workers=options.jobs) as executor:
            for xref in images:
                log.debug(image_name_fn(root, xref))
                executor.submit(
                    pngquant.quantize,
                    image_name_fn(root, xref),
                    png_name(root, xref),
                    png_quality[0],
                    png_quality[1],
                )

    for xref in images:
        im_obj = pike.get_object(xref, 0)
        try:
            compdata = leptonica.CompressedData.open(png_name(root, xref))
        except leptonica.LeptonicaError as e:
            # Most likely this means file not found, i.e. quantize did not
            # produce an improved version
            log.error(e)
            continue

        # If the re-coded image is larger, don't use it - we test here because
        # pngquant knows the size of the temporary output file but not the
        # actual object in the PDF
        if len(compdata) > int(im_obj.stream_dict.Length):
            log.debug(
                f"pngquant: did not improve over original image "
                f"{len(compdata)} > {int(im_obj.stream_dict.Length)}"
            )
            continue

        # When a PNG is inserted into a PDF, we more or less copy the IDAT section
        # from the PNG and transfer the rest of the PNG headers to PDF image metadata.
        # One thing we have to do is tell the PDF reader whether a predictor was used
        # on the image before Flate encoding. (Typically one is.)
        # According to Leptonica source, PDF readers don't actually need us
        # to specify the correct predictor; they just need a value of either:
        #   1 - no predictor
        #   10-14 - there is a predictor
        # Leptonica's compdata->predictor only tells TRUE or FALSE.
        # From there the PNG decoder can infer the rest from the file.
        # In practice the predictor should be Paeth, 14, so we'll use that.
        # See:
        #   - PDF RM 7.4.4.4 Table 10
        #   - https://github.com/DanBloomberg/leptonica/blob/master/src/pdfio2.c#L757
        predictor = 14 if compdata.predictor > 0 else 1
        dparms = Dictionary(Predictor=predictor)
        if predictor > 1:
            dparms.BitsPerComponent = compdata.bps  # Yes, this is redundant
            dparms.Colors = compdata.spp
            dparms.Columns = compdata.w

        im_obj.BitsPerComponent = compdata.bps
        im_obj.Width = compdata.w
        im_obj.Height = compdata.h

        if compdata.ncolors > 0:
            # .ncolors is the number of colors in the palette, not the number of
            # colors used in a true color image
            palette_pdf_string = compdata.get_palette_pdf_string()
            palette_data = pikepdf.Object.parse(palette_pdf_string)
            palette_stream = pikepdf.Stream(pike, bytes(palette_data))
            palette = [
                Name.Indexed,
                Name.DeviceRGB,
                compdata.ncolors - 1,
                palette_stream,
            ]
            cs = palette
        else:
            if compdata.spp == 1:
                # PDF interprets binary 1 as black in 1 bpp, but PNG sets
                # black to 0 for 1 bpp. Create a palette that informs the PDF
                # of the mapping - seems cleaner to go this way but pikepdf
                # needs to be patched to support it.
                # palette = [Name.Indexed, Name.DeviceGray, 1, b"\xff\x00"]
                # cs = palette
                cs = Name.DeviceGray
            elif compdata.spp == 3:
                cs = Name.DeviceRGB
            elif compdata.spp == 4:
                cs = Name.DeviceCMYK
        if compdata.bps == 1:
            im_obj.Decode = [1, 0]  # Bit of a kludge, but this inverts photometric too
        im_obj.ColorSpace = cs
        im_obj.write(compdata.read(), filter=Name.FlateDecode, decode_parms=dparms)
def _graft_text_layer(
    self,
    *,
    page_num: int,
    textpdf: Path,
    font: pikepdf.Object,
    font_key: pikepdf.Object,
    procset: pikepdf.Object,
    text_rotation: int,
    strip_old_text: bool,
):
    """Insert the text layer from page 0 of the text PDF into pdf_base at page_num."""
    if Path(textpdf).stat().st_size == 0:
        return

    # This is a pointer indicating a specific page in the base file
    with pikepdf.open(textpdf) as pdf_text:
        pdf_text_contents = pdf_text.pages[0].Contents.read_bytes()

        base_page = self.pdf_base.pages.p(page_num)

        # The text page always will be oriented up by this stage but the original
        # content may have a rotation applied. Wrap the text stream with a rotation
        # so it will be oriented the same way as the rest of the page content.
        # (Previous versions of OCRmyPDF rotated the content layer to match the text.)
        mediabox = [float(pdf_text.pages[0].MediaBox[v]) for v in range(4)]
        wt, ht = mediabox[2] - mediabox[0], mediabox[3] - mediabox[1]

        mediabox = [float(base_page.MediaBox[v]) for v in range(4)]
        wp, hp = mediabox[2] - mediabox[0], mediabox[3] - mediabox[1]

        translate = pikepdf.PdfMatrix().translated(-wt / 2, -ht / 2)
        untranslate = pikepdf.PdfMatrix().translated(wp / 2, hp / 2)
        corner = pikepdf.PdfMatrix().translated(mediabox[0], mediabox[1])
        # -rotation because the input is a clockwise angle and this formula
        # uses CCW
        text_rotation = -text_rotation % 360
        rotate = pikepdf.PdfMatrix().rotated(text_rotation)

        # Because of rounding of DPI, we might get a text layer that is not
        # identically sized to the target page. Scale to adjust. Normally this
        # is within 0.998.
        if text_rotation in (90, 270):
            wt, ht = ht, wt
        scale_x = wp / wt
        scale_y = hp / ht

        # log.debug('%r', scale_x, scale_y)
        scale = pikepdf.PdfMatrix().scaled(scale_x, scale_y)

        # Translate the text so it is centered at (0, 0), rotate it there, adjust
        # for a size difference between the initial and text PDF, then untranslate,
        # and finally move the lower left corner to match the mediabox
        ctm = translate @ rotate @ scale @ untranslate @ corner

        base_resources = _ensure_dictionary(base_page, Name.Resources)
        base_xobjs = _ensure_dictionary(base_resources, Name.XObject)
        text_xobj_name = Name("/" + str(uuid.uuid4()))
        xobj = self.pdf_base.make_stream(pdf_text_contents)
        base_xobjs[text_xobj_name] = xobj
        xobj.Type = Name.XObject
        xobj.Subtype = Name.Form
        xobj.FormType = 1
        xobj.BBox = mediabox
        _update_resources(obj=xobj, font=font, font_key=font_key, procset=[Name.PDF])

        pdf_draw_xobj = (
            (b"q %s cm\n" % ctm.encode()) + (b"%s Do\n" % text_xobj_name) + b"\nQ\n"
        )
        new_text_layer = pikepdf.Stream(self.pdf_base, pdf_draw_xobj)

        if strip_old_text:
            strip_invisible_text(self.pdf_base, base_page)

        base_page.page_contents_add(new_text_layer, prepend=True)

        _update_resources(obj=base_page, font=font, font_key=font_key, procset=procset)
def _graft_text_layer(*, pdf_base, page_num, text, font, font_key, procset,
                      rotation, strip_old_text, log):
    """Insert the text layer from page 0 of the text PDF onto pdf_base at page_num."""
    log.debug("Grafting")
    if Path(text).stat().st_size == 0:
        return

    # This is a pointer indicating a specific page in the base file
    pdf_text = pikepdf.open(text)
    pdf_text_contents = pdf_text.pages[0].Contents.read_bytes()

    base_page = pdf_base.pages.p(page_num)

    # The text page always will be oriented up by this stage but the original
    # content may have a rotation applied. Wrap the text stream with a rotation
    # so it will be oriented the same way as the rest of the page content.
    # (Previous versions of OCRmyPDF rotated the content layer to match the text.)
    mediabox = [float(pdf_text.pages[0].MediaBox[v]) for v in range(4)]
    wt, ht = mediabox[2] - mediabox[0], mediabox[3] - mediabox[1]

    mediabox = [float(base_page.MediaBox[v]) for v in range(4)]
    wp, hp = mediabox[2] - mediabox[0], mediabox[3] - mediabox[1]

    translate = pikepdf.PdfMatrix().translated(-wt / 2, -ht / 2)
    untranslate = pikepdf.PdfMatrix().translated(wp / 2, hp / 2)
    corner = pikepdf.PdfMatrix().translated(mediabox[0], mediabox[1])
    # -rotation because the input is a clockwise angle and this formula
    # uses CCW
    rotation = -rotation % 360
    rotate = pikepdf.PdfMatrix().rotated(rotation)

    # Because of rounding of DPI, we might get a text layer that is not
    # identically sized to the target page. Scale to adjust. Normally this
    # is within 0.998.
    if rotation in (90, 270):
        wt, ht = ht, wt
    scale_x = wp / wt
    scale_y = hp / ht

    # log.debug('%r', scale_x, scale_y)
    scale = pikepdf.PdfMatrix().scaled(scale_x, scale_y)

    # Translate the text so it is centered at (0, 0), rotate it there, adjust
    # for a size difference between the initial and text PDF, then untranslate,
    # and finally move the lower left corner to match the mediabox
    ctm = translate @ rotate @ scale @ untranslate @ corner

    pdf_text_contents = b'q %s cm\n' % ctm.encode() + pdf_text_contents + b'\nQ\n'

    new_text_layer = pikepdf.Stream(pdf_base, pdf_text_contents)

    if strip_old_text:
        strip_invisible_text(pdf_base, base_page)

    base_page.page_contents_add(new_text_layer, prepend=True)

    _update_page_resources(page=base_page, font=font, font_key=font_key, procset=procset)
    pdf_text.close()
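# --- Hedged numeric check (illustrative sizes; PdfMatrix is the API the
# helpers above use, though newer pikepdf prefers pikepdf.Matrix) of the CTM
# composition in _graft_text_layer: with a 612x792 text layer, a 612x792 base
# page whose mediabox origin is (0, 0), and a 90-degree CCW correction, the
# composed matrix should carry the text layer's lower-left corner to (612, 0).
import pikepdf

wt, ht = 612, 792  # text layer size
wp, hp = 612, 792  # base page size
translate = pikepdf.PdfMatrix().translated(-wt / 2, -ht / 2)
untranslate = pikepdf.PdfMatrix().translated(wp / 2, hp / 2)
corner = pikepdf.PdfMatrix().translated(0, 0)  # mediabox origin at (0, 0)
rotate = pikepdf.PdfMatrix().rotated(90)
wt, ht = ht, wt  # the 90/270 swap from the code above
scale = pikepdf.PdfMatrix().scaled(wp / wt, hp / ht)
ctm = translate @ rotate @ scale @ untranslate @ corner
# (0, 0) maps to the matrix's translation component (e, f)
assert (round(ctm.e), round(ctm.f)) == (612, 0)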
def transcode_pngs(pike, pngs, root, log, options):
    if options.optimize >= 2:
        png_quality = (
            max(10, options.png_quality - 10),
            min(100, options.png_quality + 10),
        )
        with concurrent.futures.ThreadPoolExecutor(max_workers=options.jobs) as executor:
            for xref in pngs:
                executor.submit(
                    pngquant.quantize,
                    png_name(root, xref),
                    png_name(root, xref),
                    png_quality[0],
                    png_quality[1],
                )

    for xref in pngs:
        im_obj = pike.get_object(xref, 0)

        # Open, transcode (!), package for PDF
        try:
            pix = leptonica.Pix.open(png_name(root, xref))
            if pix.depth == 1:
                pix = pix.invert()  # PDF assumes 1 is black for monochrome
            compdata = pix.generate_pdf_ci_data(leptonica.lept.L_FLATE_ENCODE, 0)
        except leptonica.LeptonicaError as e:
            log.error(e)
            continue

        # This is what we should be doing: open the compressed data without
        # transcoding. However this shifts each pixel row by one for some
        # reason.
        # compdata = leptonica.CompressedData.open(png_name(root, xref))

        if len(compdata) > int(im_obj.stream_dict.Length):
            continue  # If we produced a larger image, don't use it

        predictor = None
        if compdata.predictor > 0:
            predictor = Dictionary(Predictor=compdata.predictor)

        im_obj.BitsPerComponent = compdata.bps
        im_obj.Width = compdata.w
        im_obj.Height = compdata.h

        if compdata.ncolors > 0:
            palette_pdf_string = compdata.get_palette_pdf_string()
            palette_data = pikepdf.Object.parse(palette_pdf_string)
            palette_stream = pikepdf.Stream(pike, bytes(palette_data))
            palette = [
                Name.Indexed,
                Name.DeviceRGB,
                compdata.ncolors - 1,
                palette_stream,
            ]
            cs = palette
        else:
            if compdata.spp == 1:
                cs = Name.DeviceGray
            elif compdata.spp == 3:
                cs = Name.DeviceRGB
            elif compdata.spp == 4:
                cs = Name.DeviceCMYK
        im_obj.ColorSpace = cs
        im_obj.write(compdata.read(), filter=Name.FlateDecode, decode_parms=predictor)
def _weave_layers_graft(*, pdf_base, page_num, text, font, font_key, procset,
                        rotation, log):
    """Insert the text layer from page 0 of the text PDF onto pdf_base at page_num."""
    log.debug("Grafting")
    if Path(text).stat().st_size == 0:
        return

    # This is a pointer indicating a specific page in the base file
    pdf_text = pikepdf.open(text)
    pdf_text_contents = pdf_text.pages[0].Contents.read_bytes()

    if not tesseract.has_textonly_pdf():
        # If we don't have textonly_pdf, edit the stream to delete the
        # instruction to draw the image Tesseract generated, which we do not
        # use.
        stream = bytearray(pdf_text_contents)
        pattern = b'/Im1 Do'
        idx = stream.find(pattern)
        if idx >= 0:  # guard: only blank the draw instruction if it is present
            stream[idx:(idx + len(pattern))] = b' ' * len(pattern)
        pdf_text_contents = bytes(stream)

    base_page = pdf_base.pages.p(page_num)

    # The text page always will be oriented up by this stage but the original
    # content may have a rotation applied. Wrap the text stream with a rotation
    # so it will be oriented the same way as the rest of the page content.
    # (Previous versions of OCRmyPDF rotated the content layer to match the text.)
    mediabox = [float(pdf_text.pages[0].MediaBox[v]) for v in range(4)]
    wt, ht = mediabox[2] - mediabox[0], mediabox[3] - mediabox[1]

    mediabox = [float(base_page.MediaBox[v]) for v in range(4)]
    wp, hp = mediabox[2] - mediabox[0], mediabox[3] - mediabox[1]

    translate = pikepdf.PdfMatrix().translated(-wt / 2, -ht / 2)
    untranslate = pikepdf.PdfMatrix().translated(wp / 2, hp / 2)
    # -rotation because the input is a clockwise angle and this formula
    # uses CCW
    rotation = -rotation % 360
    rotate = pikepdf.PdfMatrix().rotated(rotation)

    # Because of rounding of DPI, we might get a text layer that is not
    # identically sized to the target page. Scale to adjust. Normally this
    # is within 0.998.
    if rotation in (90, 270):
        wt, ht = ht, wt
    scale_x = wp / wt
    scale_y = hp / ht
    log.debug('%r', (scale_x, scale_y))
    scale = pikepdf.PdfMatrix().scaled(scale_x, scale_y)

    # Translate the text so it is centered at (0, 0), rotate it there, adjust
    # for a size difference between the initial and text PDF, then untranslate
    ctm = translate @ rotate @ scale @ untranslate

    pdf_text_contents = (b'q %s cm\n' % ctm.encode() + pdf_text_contents + b'\nQ\n')

    new_text_layer = pikepdf.Stream(pdf_base, pdf_text_contents)
    base_page.page_contents_add(new_text_layer, prepend=True)

    _update_page_resources(page=base_page, font=font, font_key=font_key, procset=procset)