def _process_content_streams(
    *, pdf: Pdf, container: Object, shorthand=None
) -> Iterator[Union[VectorMarker, TextMarker, ImageInfo]]:
    """Yield markers and image records for everything drawn by *container*.

    The container is normally a page, although a Form XObject is accepted
    as well. Pages usually hold their images inline or as image XObjects;
    a Form XObject can hold inline images, image XObjects, vector graphics
    and, recursively, further Form XObjects.

    Each time an image is drawn it is reported as its own instance. The same
    image can be painted several times on one page at different resolutions,
    and the goal is to learn the resolution at which the page can be
    rasterized without downsampling.

    Args:
        pdf: The document, handed on to the Form XObject image search.
        container: The page or Form XObject whose content is interpreted.
        shorthand: Matrix shorthand in effect where the container is drawn.
            When falsy, a page starts from ``UNIT_SQUARE`` and a Form
            XObject starts from the identity matrix.

    Yields:
        A ``VectorMarker`` if vector content was found, a ``TextMarker`` if
        text was found, then whatever the inline-image, regular-image and
        Form XObject image searches produce, in that order. Nothing is
        yielded when the container is neither a page with ``/Contents`` nor
        a Form XObject.
    """
    object_type = container.get('/Type')

    if object_type == '/Page' and '/Contents' in container:
        start_shorthand = shorthand or UNIT_SQUARE
    elif object_type == '/XObject' and container['/Subtype'] == '/Form':
        # Begin from the CTM that was current when the "Do" operator painting
        # this instance of the Form XObject was encountered.
        if shorthand:
            outer_ctm = PdfMatrix(shorthand)
        else:
            outer_ctm = PdfMatrix.identity()

        # The form may carry its own /Matrix mapping form space into user
        # space; fall back to the identity matrix when it does not.
        form_matrix = PdfMatrix(container.get('/Matrix', PdfMatrix.identity()))

        # Concatenating the form matrix with the CTM gives the correct CTM
        # for drawing this particular instance of the XObject.
        start_shorthand = (form_matrix @ outer_ctm).shorthand
    else:
        # Not something whose content stream we know how to interpret.
        return

    interpreted = _interpret_contents(container, start_shorthand)

    if interpreted.found_vector:
        yield VectorMarker()
    if interpreted.found_text:
        yield TextMarker()

    yield from _find_inline_images(interpreted)
    yield from _find_regular_images(container, interpreted)
    yield from _find_form_xobject_images(pdf, container, interpreted)
def parse_content_stream(
    page_or_stream: Object, operators: str = ''
) -> List[ContentStreamInstructions]:
    """
    Parse a PDF content stream into a sequence of instructions.

    A PDF content stream is a list of instructions that describe where to
    render the text and graphics in a PDF. This is the starting point for
    analyzing PDFs.

    If the input is a page and page.Contents is an array, then the content
    stream is automatically treated as one coalesced stream.

    Each instruction contains at least one operator and zero or more operands.

    Args:
        page_or_stream: A page object, or the content stream attached to
            another object such as a Form XObject.
        operators: A space-separated string of operators to whitelist.
            For example 'q Q cm Do' will return only operators
            that pertain to drawing images. Use 'BI ID EI' for inline images.
            All other operators and associated tokens are ignored. If blank,
            all tokens are accepted.

    Returns:
        list: List of ``(operands, command)`` tuples where ``command`` is an
            operator (str) and ``operands`` is a tuple of str; the PDF drawing
            command and the command's operands, respectively.

    Raises:
        TypeError: If ``page_or_stream`` is not an ``Object``, is neither a
            stream nor a page, or if parsing reports that a non-stream
            object was encountered.
        PdfError: Any other parsing error is propagated unchanged.

    Example:

        >>> pdf = pikepdf.Pdf.open(input_pdf)
        >>> page = pdf.pages[0]
        >>> for operands, command in parse_content_stream(page):
        >>>     print(command)

    """
    if not isinstance(page_or_stream, Object):
        raise TypeError("stream must be a PDF object")

    if (
        page_or_stream._type_code != ObjectType.stream
        and page_or_stream.get('/Type') != '/Page'
    ):
        raise TypeError(
            "parse_content_stream must be called on a page or stream object"
        )

    try:
        if page_or_stream.get('/Type') == '/Page':
            page = page_or_stream
            instructions = cast(
                List[ContentStreamInstructions],
                page._parse_page_contents_grouped(operators),
            )
        else:
            stream = page_or_stream
            instructions = cast(
                List[ContentStreamInstructions],
                Object._parse_stream_grouped(stream, operators),
            )
    except PdfError as e:
        if 'ignoring non-stream while parsing' in str(e):
            # Report misuse as a TypeError, keeping the underlying PdfError
            # attached as the explicit cause for debugging.
            raise TypeError(
                "parse_content_stream called on non-stream Object"
            ) from e
        # Re-raise unchanged; a bare raise keeps the original traceback and
        # avoids making the exception its own __cause__.
        raise

    return instructions