def test_page_contents_add(resources, outdir):
    """Wrap the first page's content in a rotated graphics state and save it.

    A ``q <matrix> cm`` stream is added with the second argument ``True`` and
    a closing ``Q`` stream with ``False``; the result is written to
    ``outdir / 'out.pdf'``.  The source PDF is opened in a ``with`` block so
    the file is closed even if one of the steps raises.
    """
    with Pdf.open(resources / 'graph.pdf') as pdf:
        mat = PdfMatrix().rotated(45)

        # Opening stream: save the graphics state, then apply the rotation.
        stream1 = Stream(pdf, b'q ' + mat.encode() + b' cm')
        # Closing stream: restore the graphics state saved above.
        stream2 = Stream(pdf, b'Q')

        pdf.pages[0].page_contents_add(stream1, True)
        pdf.pages[0].page_contents_add(stream2, False)
        pdf.save(outdir / 'out.pdf')
def _process_content_streams(
    *, pdf: Pdf, container: Object, shorthand=None
) -> Iterator[Union[VectorMarker, TextMarker, ImageInfo]]:
    """Find all individual instances of images drawn in the container.

    Usually the container is a page, but it may also be a Form XObject.

    On a typical page images are stored inline or as regular images
    in an XObject.

    Form XObjects may include inline images, XObject images,
    and recursively, other Form XObjects; and also vector graphic objects.

    Every instance of an image being drawn somewhere is flattened and
    treated as a unique image, since if the same image is drawn multiple times
    on one page it may be drawn at differing resolutions, and our objective
    is to find the resolution at which the page can be rastered without
    downsampling.

    Args:
        pdf: Passed through to the Form XObject image search.
        container: The page or Form XObject whose content is interpreted.
        shorthand: Matrix shorthand for the CTM in effect when this container
            is drawn.  When falsy, a page starts from ``UNIT_SQUARE`` and a
            Form XObject starts from the identity matrix.

    Yields:
        A ``VectorMarker`` if vector drawing was found, a ``TextMarker`` if
        text was found, then whatever the inline image, regular image and
        Form XObject finders yield.  A container that is neither a page with
        ``/Contents`` nor a Form XObject yields nothing.
    """
    container_type = container.get('/Type')

    if container_type == '/Page' and '/Contents' in container:
        initial_shorthand = shorthand or UNIT_SQUARE
    elif (
        container_type == '/XObject'
        # .get() so an XObject with no /Subtype is skipped like any other
        # unrecognized container instead of raising KeyError.
        and container.get('/Subtype') == '/Form'
    ):
        # Set the CTM to the state it was when the "Do" operator was
        # encountered that is drawing this instance of the Form XObject
        ctm = PdfMatrix(shorthand) if shorthand else PdfMatrix.identity()

        # A Form XObject may provide its own matrix to map form space into
        # user space. Get this if one exists
        form_shorthand = container.get('/Matrix', PdfMatrix.identity())
        form_matrix = PdfMatrix(form_shorthand)

        # Concatenate form matrix with CTM to ensure CTM is correct for
        # drawing this instance of the XObject
        ctm = form_matrix @ ctm
        initial_shorthand = ctm.shorthand
    else:
        return

    contentsinfo = _interpret_contents(container, initial_shorthand)

    if contentsinfo.found_vector:
        yield VectorMarker()
    if contentsinfo.found_text:
        yield TextMarker()
    yield from _find_inline_images(contentsinfo)
    yield from _find_regular_images(container, contentsinfo)
    yield from _find_form_xobject_images(pdf, container, contentsinfo)
def test_page_contents_add(graph, outdir):
    """Add streams around the first page's content; reject non-page objects.

    The ``graph`` fixture supplies the PDF.  After saving, calling
    ``page_contents_add`` on an ``Array`` must raise ``TypeError`` matching
    "Not a Page".
    """
    rotation = PdfMatrix().rotated(45)
    push_state = Stream(graph, b'q ' + rotation.encode() + b' cm')
    pop_state = Stream(graph, b'Q')

    # Second positional argument: True for the opening stream, False for the
    # closing one.
    for stream, flag in ((push_state, True), (pop_state, False)):
        graph.pages[0].page_contents_add(stream, flag)

    graph.save(outdir / 'out.pdf')

    not_a_page = Array([42])
    with pytest.raises(TypeError, match="Not a Page"):
        not_a_page.page_contents_add(push_state)
def test_unparse_cs():
    """Unparsing a ``q`` / ``cm`` / ``Q`` instruction list gives known bytes."""
    identity_operands = list(PdfMatrix.identity().shorthand)
    instructions = [
        ([], Operator('q')),
        (identity_operands, Operator('cm')),
        ([], Operator('Q')),
    ]

    # The space before the final Q is part of the expected output; only the
    # outer whitespace is stripped before comparing.
    expected = b'q\n1 0 0 1 0 0 cm\n Q'
    unparsed = unparse_content_stream(instructions)
    assert unparsed.strip() == expected
def _simple_interpret_content_stream(page: Union[Page, Object]):
    """Yield ``(xobject_name, ctm)`` for every ``Do`` in a content stream.

    Only the ``q``, ``Q``, ``cm`` and ``Do`` operators are requested from the
    parser.  The current transformation matrix starts as the identity, is
    pushed on ``q``, popped on ``Q`` and pre-multiplied by the operands on
    ``cm``.  Inline image instructions are skipped.
    """
    op_save = Operator('q')
    op_restore = Operator('Q')
    op_concat = Operator('cm')
    op_draw = Operator('Do')

    current = PdfMatrix.identity()
    saved: List[PdfMatrix] = []

    for instruction in parse_content_stream(page, operators='q Q cm Do'):
        if isinstance(instruction, ContentStreamInlineImage):
            continue

        operands = instruction.operands
        operator = instruction.operator

        if operator == op_save:
            saved.append(current)
        elif operator == op_restore:
            current = saved.pop()
        elif operator == op_concat:
            current = PdfMatrix(operands) @ current
        elif operator == op_draw:
            # First operand of Do is the name of the XObject being drawn.
            yield (operands[0], current)
def _process_content_streams(*, pdf, container, shorthand=None): """Find all individual instances of images drawn in the container Usually the container is a page, but it may also be a Form XObject. On a typical page images are stored inline or as regular images in an XObject. Form XObjects may include inline images, XObject images, and recursively, other Form XObjects; and also vector graphic objects. Every instance of an image being drawn somewhere is flattened and treated as a unique image, since if the same image is drawn multiple times on one page it may be drawn at differing resolutions, and our objective is to find the resolution at which the page can be rastered without downsampling. """ if container.get('/Type') == '/Page' and '/Contents' in container: initial_shorthand = shorthand or UNIT_SQUARE elif container.get('/Type') == '/XObject' and container['/Subtype'] == '/Form': # Set the CTM to the state it was when the "Do" operator was # encountered that is drawing this instance of the Form XObject ctm = PdfMatrix(shorthand) if shorthand else PdfMatrix.identity() # A Form XObject may provide its own matrix to map form space into # user space. Get this if one exists form_shorthand = container.get('/Matrix', PdfMatrix.identity()) form_matrix = PdfMatrix(form_shorthand) # Concatenate form matrix with CTM to ensure CTM is correct for # drawing this instance of the XObject ctm = form_matrix @ ctm initial_shorthand = ctm.shorthand else: return contentsinfo = _interpret_contents(container, initial_shorthand) if contentsinfo.found_vector: yield VectorInfo() yield from _find_inline_images(contentsinfo) yield from _find_regular_images(container, contentsinfo) yield from _find_form_xobject_images(pdf, container, contentsinfo)
def _interpret_contents(contentstream, initial_shorthand=UNIT_SQUARE):
    """Walk a PDF content stream and record what it draws.

    Only the current transformation matrix (CTM) part of the graphics state
    is tracked; a complete interpreter would have to track much more.  The
    CTM starts as ``PdfMatrix(initial_shorthand)``.  In a viewer or printer
    it would start as the mapping from user space (units of 1/72 inch) to
    device space; PDF units are adequate here.

    Images always occupy (0, 0) -> (1, 1), so a ``cm`` is expected before
    each one to map that unit square onto the intended area of the page.

    The PDF specification limits the graphics stack depth to 32, though other
    viewers tolerate more.  Exceeding 32 produces a warning; exceeding 128
    raises ``RuntimeError`` so memory use stays bounded.  Stack underflow is
    undefined by the spec; here it produces a warning and leaves the CTM
    unchanged.

    Returns:
        A ``ContentsInfo`` carrying the XObject draw settings, the inline
        image settings, the vector/text flags and an index of XObject
        settings by name.
    """
    spec_depth_limit = 32
    hard_depth_limit = 128

    vector_ops = {'S', 's', 'f', 'F', 'f*', 'B', 'B*', 'b', 'b*'}
    text_showing_ops = {'TJ', 'Tj', '"', "'"}
    image_ops = {'BI', 'ID', 'EI', 'q', 'Q', 'Do', 'cm'}
    operator_whitelist = ' '.join(vector_ops | text_showing_ops | image_ops)

    saved_ctms = []
    ctm = PdfMatrix(initial_shorthand)
    xobject_settings = []
    inline_images = []
    name_index = defaultdict(list)
    found_vector = False
    found_text = False

    parsed = pikepdf.parse_content_stream(contentstream, operator_whitelist)
    for n, (operands, operator) in enumerate(_normalize_stack(parsed)):
        if operator == 'q':
            saved_ctms.append(ctm)
            depth = len(saved_ctms)
            if depth > hard_depth_limit:
                raise RuntimeError(
                    "PDF graphics stack overflowed hard limit, operator %i" % n
                )
            if depth > spec_depth_limit:
                warn("PDF graphics stack overflowed spec limit")
        elif operator == 'Q':
            try:
                ctm = saved_ctms.pop()
            except IndexError:
                # Nothing to restore: keep the CTM as it is and carry on.
                warn("PDF graphics stack underflowed - PDF may be malformed")
        elif operator == 'cm':
            ctm = PdfMatrix(operands) @ ctm
        elif operator == 'Do':
            xobject_name = operands[0]
            drawn = XobjectSettings(
                name=xobject_name,
                shorthand=ctm.shorthand,
                stack_depth=len(saved_ctms),
            )
            xobject_settings.append(drawn)
            name_index[xobject_name].append(drawn)
        elif operator == 'INLINE IMAGE':
            # BI/ID/EI are grouped into this single pseudo-operator
            inline_images.append(
                InlineSettings(
                    iimage=operands[0],
                    shorthand=ctm.shorthand,
                    stack_depth=len(saved_ctms),
                )
            )
        elif operator in vector_ops:
            found_vector = True
        elif operator in text_showing_ops:
            found_text = True

    return ContentsInfo(
        xobject_settings=xobject_settings,
        inline_images=inline_images,
        found_vector=found_vector,
        found_text=found_text,
        name_index=name_index,
    )
def _interpret_contents(contentstream, initial_shorthand=UNIT_SQUARE):
    """Walk a PDF content stream and record the images and text it draws.

    Only the current transformation matrix (CTM) part of the graphics state
    is tracked; a complete interpreter would have to track much more.  The
    CTM starts as ``PdfMatrix(initial_shorthand)``.  In a viewer or printer
    it would start as the mapping from user space (units of 1/72 inch) to
    device space; PDF units are adequate here.

    Images always occupy (0, 0) -> (1, 1), so a ``cm`` is expected before
    each one to map that unit square onto the intended area of the page.

    Returns:
        A ``ContentsInfo`` carrying the XObject draw settings, the inline
        image settings and whether any text-showing operator was seen.

    Raises:
        RuntimeError: if the graphics stack grows beyond 32 entries, or if
            ``Q`` is encountered while the stack is empty.
    """
    text_operators = {'Tj', 'TJ', '"', "'"}
    operator_whitelist = """q Q Do cm TJ Tj " ' BI ID EI"""

    saved_ctms = []
    ctm = PdfMatrix(initial_shorthand)
    xobject_settings = []
    inline_images = []
    found_text = False

    parsed = pikepdf.parse_content_stream(contentstream, operator_whitelist)
    for n, (operands, command) in enumerate(_normalize_stack(parsed)):
        if command == 'q':
            saved_ctms.append(ctm)
            if len(saved_ctms) > 32:
                raise RuntimeError(
                    "PDF graphics stack overflow, command %i" % n
                )
        elif command == 'Q':
            try:
                ctm = saved_ctms.pop()
            except IndexError:
                raise RuntimeError(
                    "PDF graphics stack underflow, command %i" % n
                )
        elif command == 'cm':
            ctm = PdfMatrix(operands) @ ctm
        elif command == 'Do':
            xobject_settings.append(
                XobjectSettings(
                    name=operands[0],
                    shorthand=ctm.shorthand,
                    stack_depth=len(saved_ctms),
                )
            )
        elif command == 'INLINE IMAGE':
            inline_images.append(
                InlineSettings(
                    iimage=operands[0],
                    shorthand=ctm.shorthand,
                    stack_depth=len(saved_ctms),
                )
            )
        elif command in text_operators:
            found_text = True

    return ContentsInfo(
        xobject_settings=xobject_settings,
        inline_images=inline_images,
        found_text=found_text,
    )
def _graft_text_layer(
    self,
    *,
    page_num: int,
    textpdf: Path,
    font: Object,
    font_key: Object,
    procset: Object,
    text_rotation: int,
    strip_old_text: bool,
):
    """Insert the text layer from text page 0 on to pdf_base at page_num.

    The content of the first page of ``textpdf`` is copied into a new Form
    XObject on the base page and drawn by a small content stream added to
    the page with ``prepend=True``.  Nothing is done when ``textpdf`` is an
    empty file.

    Args:
        page_num: Page of ``self.pdf_base`` to graft onto (passed to
            ``pages.p``).
        textpdf: Path of the PDF holding the text layer.
        font: Font object registered on both the new XObject and the page.
        font_key: Resource key under which ``font`` is registered.
        procset: Procset registered on the base page.
        text_rotation: Clockwise rotation to apply to the text layer.
        strip_old_text: If true, run ``strip_invisible_text`` on the base
            page before adding the new layer.
    """
    log.debug("Grafting")
    if Path(textpdf).stat().st_size == 0:
        return

    with Pdf.open(textpdf) as pdf_text:
        pdf_text_contents = pdf_text.pages[0].Contents.read_bytes()

        # This is a pointer indicating a specific page in the base file
        base_page = self.pdf_base.pages.p(page_num)

        # The text page always will be oriented up by this stage but the
        # original content may have a rotation applied. Wrap the text stream
        # with a rotation so it will be oriented the same way as the rest of
        # the page content. (Previous versions of OCRmyPDF rotated the
        # content layer to match the text.)
        text_box = [float(pdf_text.pages[0].MediaBox[i]) for i in range(4)]
        text_width = text_box[2] - text_box[0]
        text_height = text_box[3] - text_box[1]

        mediabox = [float(base_page.MediaBox[i]) for i in range(4)]
        page_width = mediabox[2] - mediabox[0]
        page_height = mediabox[3] - mediabox[1]

        translate = PdfMatrix().translated(-text_width / 2, -text_height / 2)
        untranslate = PdfMatrix().translated(page_width / 2, page_height / 2)
        corner = PdfMatrix().translated(mediabox[0], mediabox[1])

        # -rotation because the input is a clockwise angle and this formula
        # uses CCW
        ccw_rotation = (-text_rotation) % 360
        rotate = PdfMatrix().rotated(ccw_rotation)

        # Because of rounding of DPI, we might get a text layer that is not
        # identically sized to the target page. Scale to adjust. Normally this
        # is within 0.998.
        if ccw_rotation in (90, 270):
            text_width, text_height = text_height, text_width
        scale = PdfMatrix().scaled(
            page_width / text_width, page_height / text_height
        )

        # Translate the text so it is centered at (0, 0), rotate it there,
        # adjust for a size different between initial and text PDF, then
        # untranslate, and finally move the lower left corner to match the
        # mediabox
        ctm = translate @ rotate @ scale @ untranslate @ corner

        base_resources = _ensure_dictionary(base_page, Name.Resources)
        base_xobjs = _ensure_dictionary(base_resources, Name.XObject)

        # A fresh UUID is used as the resource name for the new XObject.
        text_xobj_name = Name('/' + str(uuid.uuid4()))
        xobj = self.pdf_base.make_stream(pdf_text_contents)
        base_xobjs[text_xobj_name] = xobj
        xobj.Type = Name.XObject
        xobj.Subtype = Name.Form
        xobj.FormType = 1
        xobj.BBox = mediabox
        _update_resources(
            obj=xobj, font=font, font_key=font_key, procset=[Name.PDF]
        )

        # Content stream that draws the XObject under the computed CTM,
        # bracketed by q/Q.
        pdf_draw_xobj = b''.join(
            [
                b'q %s cm\n' % ctm.encode(),
                b'%s Do\n' % text_xobj_name,
                b'\nQ\n',
            ]
        )
        new_text_layer = Stream(self.pdf_base, pdf_draw_xobj)

        if strip_old_text:
            strip_invisible_text(self.pdf_base, base_page)

        if hasattr(Page, 'contents_add'):
            # pikepdf >= 2.14 adds this method and deprecates the one below
            Page(base_page).contents_add(new_text_layer, prepend=True)
        else:
            # pikepdf < 2.14
            base_page.page_contents_add(
                new_text_layer, prepend=True
            )  # pragma: no cover

        _update_resources(
            obj=base_page, font=font, font_key=font_key, procset=procset
        )