def extract_jbig2(im_obj: pikepdf.Object, globals_obj: pikepdf.Object = None) -> Image:
    """Decode a JBIG2 image by running the external ``jbig2dec`` program.

    All intermediate files live in a temporary directory that is removed
    when the function returns.

    Args:
        im_obj: The image stream. Its raw (still encoded) bytes are written
            out for jbig2dec, since the stream cannot be decoded directly.
        globals_obj: Optional JBIG2 globals stream. When supplied, its
            decoded bytes are written out and passed to jbig2dec ahead of
            the image file.

    Returns:
        The decoded image with its pixel data already loaded into memory.
    """
    with TemporaryDirectory(prefix='pikepdf', suffix='.jbig2') as tmpdir:
        workdir = Path(tmpdir)
        image_path = workdir / "image"
        global_path = workdir / "global"
        output_path = workdir / "outfile"

        # The image must be handed over undecoded -- decoding it is the
        # whole reason jbig2dec is being called.
        # (Strictly speaking, any non-JBIG2 filters should be removed first
        # if the stream is double encoded.)
        image_path.write_bytes(im_obj.get_raw_stream_buffer())

        input_files = []
        if globals_obj is not None:
            # Globals are an opaque binary blob that is not marked with
            # /JBIG2Decode, so here any stream encoding is stripped.
            global_path.write_bytes(globals_obj.get_stream_buffer())
            input_files.append(global_path)
        input_files.append(image_path)

        command = ["jbig2dec", "-e", "-o", os.fspath(output_path)]
        command.extend(os.fspath(path) for path in input_files)
        run(command, stdout=DEVNULL, check=True)

        decoded = Image.open(output_path)
        # Pull the pixel data into memory now; the backing file vanishes
        # together with the temporary directory.
        decoded.load()
        return decoded
def test_create_pdf(outdir):
    """Build a one-page PDF with a text run and an RGB image and save it.

    The page references a Helvetica font as /F1 and a 100x100 image as /Im1;
    the result is written to ``outdir / 'hi.pdf'``.
    """
    pdf = Pdf.new()

    # Font resource, stored as an indirect object.
    helvetica = pdf.make_indirect(
        Object.parse(
            b""" << /Type /Font /Subtype /Type1 /Name /F1 /BaseFont /Helvetica /Encoding /WinAnsiEncoding >>"""
        )
    )

    # Image resource: one fixed RGB triple repeated for every pixel.
    cols, rows = 100, 100
    image = Stream(pdf, b"\xff\x7f\x00" * (cols * rows))
    image.stream_dict = Object.parse(
        b""" << /Type /XObject /Subtype /Image /ColorSpace /DeviceRGB /BitsPerComponent 8 /Width 100 /Height 100 >>"""
    )

    resources = {
        '/Font': {'/F1': helvetica},
        '/XObject': {'/Im1': image},
    }

    # Page content: draw the text, then place the image.
    contents = Stream(
        pdf,
        b""" BT /F1 24 Tf 72 720 Td (Hi there) Tj ET q 144 0 0 144 234 324 cm /Im1 Do Q """,
    )

    page = pdf.make_indirect(
        {
            '/Type': Name('/Page'),
            '/MediaBox': [0, 0, 612, 792],
            '/Contents': contents,
            '/Resources': resources,
        }
    )
    pdf.pages.append(page)
    pdf.save(outdir / 'hi.pdf')
def _process_content_streams(
    *, pdf: Pdf, container: Object, shorthand=None
) -> Iterator[Union[VectorMarker, TextMarker, ImageInfo]]:
    """Yield a record for every image drawn inside ``container``.

    ``container`` is normally a page but can also be a Form XObject. Images
    may be inline, regular image XObjects, or nested inside further Form
    XObjects, which can additionally hold vector graphics.

    Each time an image is drawn counts as a separate image: the same image
    can be painted several times at different resolutions, and the goal is
    to learn the resolution at which the page can be rasterized without
    downsampling.

    Args:
        pdf: The PDF that owns ``container``.
        container: A page or Form XObject. Anything else yields nothing.
        shorthand: Matrix shorthand in effect when the container is drawn,
            or None.

    Yields:
        A ``VectorMarker`` if vector content was found, a ``TextMarker`` if
        text was found, followed by the images located inline, as regular
        XObjects, and inside Form XObjects.
    """
    container_type = container.get('/Type')

    if container_type == '/Page' and '/Contents' in container:
        start_shorthand = shorthand or UNIT_SQUARE
    elif container_type == '/XObject' and container['/Subtype'] == '/Form':
        # Start from the CTM that was active when the "Do" operator drawing
        # this particular instance of the Form XObject was encountered.
        outer_ctm = PdfMatrix(shorthand) if shorthand else PdfMatrix.identity()

        # The form may carry its own /Matrix mapping form space into user
        # space; fall back to the identity when it does not.
        form_matrix = PdfMatrix(container.get('/Matrix', PdfMatrix.identity()))

        # Concatenating the two gives the CTM that is correct for drawing
        # this instance of the XObject.
        start_shorthand = (form_matrix @ outer_ctm).shorthand
    else:
        return

    info = _interpret_contents(container, start_shorthand)

    if info.found_vector:
        yield VectorMarker()
    if info.found_text:
        yield TextMarker()
    yield from _find_inline_images(info)
    yield from _find_regular_images(container, info)
    yield from _find_form_xobject_images(pdf, container, info)
def extract_jbig2(im_obj: pikepdf.Object, globals_obj: pikepdf.Object = None) -> Image:
    """Decode a JBIG2 image by running the external ``jbig2dec`` program.

    Args:
        im_obj: The image stream. Its raw (still encoded) bytes are passed
            to jbig2dec.
        globals_obj: Optional JBIG2 globals stream. Its *decoded* bytes are
            passed to jbig2dec: the globals are a plain binary blob, so any
            stream encoding applied to them has to be removed first.

    Returns:
        The decoded image with its pixel data already loaded into memory.
    """
    with NamedTemp() as imgfile, NamedTemp() as globalfile, NamedTemp() as outfile:
        imgfile.write(im_obj.read_raw_bytes())
        # NOTE(review): presumably NamedTemp is a buffered temporary file and
        # the seek pushes the written bytes out before jbig2dec opens the
        # file by name -- confirm.
        imgfile.seek(0)

        args = ['jbig2dec', '-e', '-o', outfile.name]
        if globals_obj is not None:
            globalfile.write(globals_obj.read_bytes())
            globalfile.seek(0)
            args.append(globalfile.name)
        args.append(imgfile.name)

        run(args, check=True)

        im = Image.open(outfile)
        # Image.open is lazy. Read the pixel data while the temporary file
        # is still open; otherwise the returned image would refer to a file
        # that is closed as soon as this ``with`` block exits.
        im.load()
        return im
def __init__(self, *, image_data, image_object: tuple):
    """
    Args:
        image_data: data stream for image, extracted from content stream
        image_object: the metadata for image, also from content stream

    Raises:
        PdfError: the reconstructed image dictionary could not be parsed.
    """
    self._data = image_data
    self._image_object = image_object

    # The metadata arrives as a sequence of pikepdf.Object taken from the
    # content stream. Unparse each one to bytes, expanding inline image
    # abbreviations along the way, so that the pieces together read like
    # the dictionary of an ordinary image XObject.
    unparsed_tokens = [
        self._unparse_obj(obj, remap_names=self.ABBREVS) for obj in image_object
    ]
    reparse = b' '.join(unparsed_tokens)

    # Parse the reconstructed dictionary; the image data is read from it.
    try:
        self.obj = Object.parse(b'<< ' + reparse + b' >>')
    except PdfError as e:
        raise PdfError("parsing inline " + reparse.decode('unicode_escape')) from e

    self.pil = None
def extract_jbig2(im_obj: pikepdf.Object, globals_obj: pikepdf.Object = None) -> Image:
    """Decode a JBIG2 image by running the external ``jbig2dec`` program.

    Args:
        im_obj: The image stream. Its raw (still encoded) bytes are passed
            to jbig2dec.
        globals_obj: Optional JBIG2 globals stream. Its *decoded* bytes are
            passed to jbig2dec: the globals are a plain binary blob, so any
            stream encoding applied to them has to be removed first.

    Returns:
        The decoded image with its pixel data already loaded into memory.
    """
    with NamedTemp() as imgfile, NamedTemp() as globalfile, NamedTemp() as outfile:
        imgfile.write(im_obj.read_raw_bytes())
        # NOTE(review): presumably NamedTemp is a buffered temporary file and
        # the seek pushes the written bytes out before jbig2dec opens the
        # file by name -- confirm.
        imgfile.seek(0)

        args = ['jbig2dec', '-e', '-o', outfile.name]
        if globals_obj is not None:
            # Decoded, not raw: an encoded (e.g. compressed) globals stream
            # would otherwise reach jbig2dec in a form it cannot interpret.
            globalfile.write(globals_obj.read_bytes())
            globalfile.seek(0)
            args.append(globalfile.name)
        args.append(imgfile.name)

        run(args, stdout=DEVNULL, check=True)

        im = Image.open(outfile)
        im.load()  # Load pixel data into memory so file can be closed
        return im
def parse_content_stream(page_or_stream, operators=''):
    """
    Parse a PDF content stream into a sequence of instructions.

    A PDF content stream is list of instructions that describe where to render
    the text and graphics in a PDF. This is the starting point for analyzing
    PDFs.

    If the input is a page and page.Contents is an array, then the content
    stream is automatically treated as one coalesced stream.

    Each instruction contains at least one operator and zero or more operands.

    Args:
        page_or_stream (pikepdf.Object): A page object, or the content
            stream attached to another object such as a Form XObject.
        operators (str): A space-separated string of operators to whitelist.
            For example 'q Q cm Do' will return only operators
            that pertain to drawing images. Use 'BI ID EI' for inline images.
            All other operators and associated tokens are ignored. If blank,
            all tokens are accepted.

    Returns:
        list: List of ``(operands, command)`` tuples where ``command`` is an
            operator (str) and ``operands`` is a tuple of str; the PDF drawing
            command and the command's operands, respectively.

    Raises:
        TypeError: ``page_or_stream`` is not a pikepdf Object, is neither a
            stream nor a page, or was rejected by the parser as a non-stream.
        PdfError: any other parsing failure.

    Example:

        >>> pdf = pikepdf.Pdf.open(input_pdf)
        >>> page = pdf.pages[0]
        >>> for operands, command in parse_content_stream(page):
        >>>     print(command)

    """
    if not isinstance(page_or_stream, Object):
        raise TypeError("stream must be a pikepdf.Object")

    # Reject objects that are neither a stream nor a page dictionary.
    if (
        page_or_stream._type_code != ObjectType.stream
        and page_or_stream.get('/Type') != '/Page'
    ):
        raise TypeError("parse_content_stream called on page or stream object")

    try:
        if page_or_stream.get('/Type') == '/Page':
            page = page_or_stream
            instructions = page._parse_page_contents_grouped(operators)
        else:
            stream = page_or_stream
            instructions = Object._parse_stream_grouped(stream, operators)
    except PdfError as e:
        # This is the error message for qpdf >= 7.0. It was different in 6.x
        # but we no longer support 6.x
        if 'ignoring non-stream while parsing' in str(e):
            # Chain explicitly so the underlying PdfError stays visible as
            # the cause of the TypeError.
            raise TypeError(
                "parse_content_stream called on non-stream Object"
            ) from e
        # Re-raise unchanged; "raise e from e" would make the exception its
        # own cause.
        raise

    return instructions
def test_decimal_from_float(f):
    """Check that a float written as a fixed-point string parses back closely.

    Finite values are quantized to six decimal places, parsed with
    ``Object.parse`` and compared to the exact Decimal within ``1e-5``.
    Non-finite values must be rejected with ``PdfError``.
    """
    d = Decimal(f)
    if isfinite(f) and d.is_finite():
        try:
            # PDF is limited to ~5 sig figs
            decstr = str(d.quantize(Decimal('1.000000')))
        except InvalidOperation:
            return  # PDF doesn't support exponential notation
        try:
            py_d = Object.parse(decstr)
        except RuntimeError as e:
            if 'overflow' in str(e) or 'underflow' in str(e):
                py_d = Object.parse(str(f))
            else:
                # Any other RuntimeError is a real failure. Swallowing it
                # would leave py_d unbound and hide the cause behind an
                # UnboundLocalError at the assertion below.
                raise
        assert isclose(py_d, d, abs_tol=1e-5), (d, f.hex())
    else:
        with pytest.raises(PdfError):
            Object.parse(str(d))
def __init__(self, context):
    """Open the base PDF named by ``context`` and set up grafting state.

    Args:
        context: Supplies ``origin`` (the base PDF path), ``pdfinfo`` and
            ``get_path`` (used to name the output file).
    """
    self.context = context

    # Base document that layers are grafted onto.
    self.path_base = context.origin
    self.pdf_base = Pdf.open(self.path_base)

    # No font has been chosen yet.
    self.font = None
    self.font_key = None

    self.pdfinfo = context.pdfinfo
    self.output_file = context.get_path('graft_layers.pdf')

    # ProcSet array, stored as an indirect object in the base PDF.
    procset_array = Object.parse(b'[ /PDF /Text /ImageB /ImageC /ImageI ]')
    self.procset = self.pdf_base.make_indirect(procset_array)

    self.emplacements = 1
    self.interim_count = 0
def rewrite_png(pike: Pdf, im_obj: Object, compdata) -> None:  # pragma: no cover
    """Rewrite ``im_obj`` in place as a Flate-encoded image built from PNG data.

    Inserting a PNG into a PDF amounts to copying the IDAT section into the
    image stream and moving the remaining PNG header information into the
    PDF image metadata.

    Args:
        pike: The PDF that owns ``im_obj``; a palette stream is created in
            it when the image is indexed.
        im_obj: The image object to overwrite.
        compdata: Compressed image data from Leptonica; provides
            ``predictor``, ``bps``, ``spp``, ``w``, ``h``, ``ncolors``,
            ``get_palette_pdf_string()`` and ``read()``.
    """
    # The PDF reader must be told whether a predictor was applied before
    # Flate encoding (typically one is). According to the Leptonica source,
    # readers do not need the exact predictor, only one of:
    #   1     - no predictor
    #   10-14 - there is a predictor
    # compdata.predictor only says TRUE or FALSE. For any value >= 10 the
    # reader uses whatever the PNG data itself specifies. In practice
    # Leptonica should use Paeth (14), but 15 seems to be the designated
    # value for "optimal", so 15 is used.
    # See:
    #   - PDF RM 7.4.4.4 Table 10
    #   - https://github.com/DanBloomberg/leptonica/blob/master/src/pdfio2.c#L757
    if compdata.predictor > 0:
        dparms = Dictionary(Predictor=15)
        dparms.BitsPerComponent = compdata.bps  # Yes, this is redundant
        dparms.Colors = compdata.spp
        dparms.Columns = compdata.w
    else:
        dparms = Dictionary(Predictor=1)

    im_obj.BitsPerComponent = compdata.bps
    im_obj.Width = compdata.w
    im_obj.Height = compdata.h

    log.debug(
        f"PNG {im_obj.objgen}: palette={compdata.ncolors} spp={compdata.spp} bps={compdata.bps}"
    )

    if compdata.ncolors > 0:
        # .ncolors counts palette entries, not the colors used by a true
        # color image. The palette string always consists of RGB tuples,
        # even for grayscale images; see
        # https://github.com/DanBloomberg/leptonica/blob/master/src/colormap.c#L2067
        palette_pdf_string = compdata.get_palette_pdf_string()
        palette_data = pikepdf.Object.parse(palette_pdf_string)
        palette_stream = pikepdf.Stream(pike, bytes(palette_data))
        cs = [Name.Indexed, Name.DeviceRGB, compdata.ncolors - 1, palette_stream]
    elif compdata.spp == 1:
        # From here on ncolors == 0: a colorspace without a palette.
        cs = Name.DeviceGray
    elif compdata.spp == 4:
        cs = Name.DeviceCMYK
    else:  # spp == 3
        cs = Name.DeviceRGB

    im_obj.ColorSpace = cs
    im_obj.write(compdata.read(), filter=Name.FlateDecode, decode_parms=dparms)
def rewrite_png_as_g4(pike: Pdf, im_obj: Object, compdata) -> None:  # pragma: no cover
    """Rewrite ``im_obj`` in place as a 1-bit CCITT fax encoded image.

    Args:
        pike: The PDF that owns ``im_obj`` (not used by this function).
        im_obj: The image object to overwrite.
        compdata: Compressed image data; provides ``w``, ``h``,
            ``minisblack`` and ``read()``.
    """
    width = compdata.w

    im_obj.BitsPerComponent = 1
    im_obj.Width = width
    im_obj.Height = compdata.h
    im_obj.write(compdata.read())

    log.debug(f"PNG to G4 {im_obj.objgen}")

    # Discard predictor / decode settings left over from the previous
    # encoding before installing the fax parameters.
    if Name.Predictor in im_obj:
        del im_obj.Predictor
    if Name.DecodeParms in im_obj:
        del im_obj.DecodeParms

    fax_parms = Dictionary(
        K=-1,
        BlackIs1=bool(compdata.minisblack),
        Columns=width,
    )
    im_obj.DecodeParms = fax_parms
    im_obj.Filter = Name.CCITTFaxDecode
def test_numbers(self):
    """Run ``self.check`` on parsed real and integer tokens against ints."""
    cases = (
        ('1.0', 1),
        ('42', 42),
    )
    for token, expected in cases:
        self.check(Object.parse(token), expected)
def test_open_pdf(graph):
    """Feed the first page's underlying object through a PrintParser."""
    first_page = graph.pages[0]
    target = first_page.obj
    parser = PrintParser()
    Object._parse_stream(target, parser)
def test_parser_exception(graph):
    """Parsing the first page's contents with ExceptionParser raises ValueError."""
    first_page = graph.pages[0]
    contents = first_page['/Contents']
    with pytest.raises(ValueError):
        parser = ExceptionParser()
        Object._parse_stream(contents, parser)
def test_parser_exception(resources):
    """Parsing graph.pdf's first page contents with ExceptionParser raises ValueError."""
    source_path = resources / 'graph.pdf'
    pdf = Pdf.open(source_path)
    contents = pdf.pages[0]['/Contents']
    with pytest.raises(ValueError):
        parser = ExceptionParser()
        Object._parse_stream(contents, parser)
def test_bool_comparison(self):
    """Run ``self.check`` on a parsed zero vs. False, then on True vs. 1."""
    parsed_zero = Object.parse('0.0')
    self.check(parsed_zero, False)
    self.check(True, 1)
def parse_content_stream(
    page_or_stream: Object, operators: str = ''
) -> List[ContentStreamInstructions]:
    """
    Parse a PDF content stream into a sequence of instructions.

    A PDF content stream is list of instructions that describe where to render
    the text and graphics in a PDF. This is the starting point for analyzing
    PDFs.

    If the input is a page and page.Contents is an array, then the content
    stream is automatically treated as one coalesced stream.

    Each instruction contains at least one operator and zero or more operands.

    This function does not have anything to do with opening a PDF file itself or
    processing data from a whole PDF. It is for processing a specific object inside
    a PDF that is already opened.

    Args:
        page_or_stream: A page object, or the content
            stream attached to another object such as a Form XObject.
        operators: A space-separated string of operators to whitelist.
            For example 'q Q cm Do' will return only operators
            that pertain to drawing images. Use 'BI ID EI' for inline images.
            All other operators and associated tokens are ignored. If blank,
            all tokens are accepted.

    Returns:
        list: List of ``(operands, command)`` tuples where ``command`` is an
            operator (str) and ``operands`` is a tuple of str; the PDF drawing
            command and the command's operands, respectively.

    Raises:
        TypeError: ``page_or_stream`` is not a pikepdf Object, is neither a
            stream nor a page, or was rejected by the parser as a non-stream.
        PdfError: any other parsing failure.

    Example:

        >>> pdf = pikepdf.Pdf.open(input_pdf)
        >>> page = pdf.pages[0]
        >>> for operands, command in parse_content_stream(page):
        >>>     print(command)

    """
    if not isinstance(page_or_stream, Object):
        raise TypeError("stream must be a pikepdf.Object")

    # Reject objects that are neither a stream nor a page dictionary.
    if (
        page_or_stream._type_code != ObjectType.stream
        and page_or_stream.get('/Type') != '/Page'
    ):
        raise TypeError("parse_content_stream called on page or stream object")

    try:
        if page_or_stream.get('/Type') == '/Page':
            page = page_or_stream
            instructions = cast(
                List[ContentStreamInstructions],
                page._parse_page_contents_grouped(operators),
            )
        else:
            stream = page_or_stream
            instructions = cast(
                List[ContentStreamInstructions],
                Object._parse_stream_grouped(stream, operators),
            )
    except PdfError as e:
        if 'supposed to be a stream or an array' in str(e):
            raise TypeError(
                "parse_content_stream called on non-stream Object"
            ) from e
        # Re-raise unchanged; "raise e from e" would make the exception its
        # own cause.
        raise

    return instructions
def test_create_form_xobjects(outdir):
    """Build a page that draws one Form XObject twice and save it.

    The form wraps a 100x100 RGB image (/Im1); the page paints the form
    (/Form1) under two different transformation matrices. The result is
    written to ``outdir / 'formxobj.pdf'``.
    """
    pdf = Pdf.new()

    # Font resource, stored as an indirect object.
    helvetica = pdf.make_indirect(
        Object.parse(
            b""" << /Type /Font /Subtype /Type1 /Name /F1 /BaseFont /Helvetica /Encoding /WinAnsiEncoding >> """
        )
    )

    # Image: one fixed RGB triple repeated for every pixel.
    cols, rows = 100, 100
    image = Stream(pdf, b"\xff\x7f\x00" * (cols * rows))
    image.stream_dict = Object.parse(
        """ << /Type /XObject /Subtype /Image /ColorSpace /DeviceRGB /BitsPerComponent 8 /Width 100 /Height 100 >> """
    )

    # Form XObject whose only content is drawing the image.
    xobj_image = Dictionary({'/Im1': image})
    form_xobj_res = Dictionary({'/XObject': xobj_image})
    form_xobj = Stream(
        pdf,
        b""" /Im1 Do """,
    )
    form_entries = {
        '/Type': Name('/XObject'),
        '/Subtype': Name('/Form'),
        '/FormType': 1,
        '/Matrix': [1, 0, 0, 1, 0, 0],
        '/BBox': [0, 0, 1, 1],
        '/Resources': form_xobj_res,
    }
    for key, value in form_entries.items():
        form_xobj[key] = value

    resources = {
        '/Font': {'/F1': helvetica},
        '/XObject': {'/Form1': form_xobj},
    }

    # Page content: a text run, then the form painted at two scales.
    contents = Stream(
        pdf,
        b""" BT /F1 24 Tf 72 720 Td (Hi there) Tj ET q 144 0 0 144 234 324 cm /Form1 Do Q q 72 0 0 72 378 180 cm /Form1 Do Q """,
    )

    page = pdf.make_indirect(
        {
            '/Type': Name('/Page'),
            '/MediaBox': [0, 0, 612, 792],
            '/Contents': contents,
            '/Resources': resources,
        }
    )
    pdf.pages.append(page)
    pdf.save(outdir / 'formxobj.pdf')
def test_not_constructible():
    """Calling ``Object()`` directly raises a TypeError mentioning "constructor"."""
    expect_type_error = pytest.raises(TypeError, match="constructor")
    with expect_type_error:
        Object()
def test_open_pdf(resources):
    """Open graph.pdf and feed its first page through a PrintParser."""
    source_path = resources / 'graph.pdf'
    pdf = Pdf.open(source_path)
    first_page = pdf.pages[0]
    parser = PrintParser()
    Object._parse_stream(first_page, parser)