Пример #1
0
def extract_jbig2(im_obj: pikepdf.Object, globals_obj: pikepdf.Object = None) -> Image:

    with TemporaryDirectory(prefix='pikepdf', suffix='.jbig2') as tmpdir:
        image_path = Path(tmpdir) / "image"
        global_path = Path(tmpdir) / "global"
        output_path = Path(tmpdir) / "outfile"

        args = ["jbig2dec", "-e", "-o", os.fspath(output_path)]

        # Get the raw stream, because we can't decode im_obj - that is why we are here
        # (Strictly speaking we should remove any non-JBIG2 filters if double encoded)
        image_path.write_bytes(im_obj.get_raw_stream_buffer())

        if globals_obj is not None:
            # For globals, we do want to remove any encoding since it's just a binary
            # blob and won't be marked with /JBIG2Decode
            global_path.write_bytes(globals_obj.get_stream_buffer())
            args.append(os.fspath(global_path))

        args.append(os.fspath(image_path))

        run(args, stdout=DEVNULL, check=True)
        im = Image.open(output_path)
        im.load()  # Load pixel data into memory so file/tempdir can be closed
        return im
Пример #2
0
def test_create_pdf(outdir):
    pdf = Pdf.new()

    font = pdf.make_indirect(
        Object.parse(b"""
            <<
                /Type /Font
                /Subtype /Type1
                /Name /F1
                /BaseFont /Helvetica
                /Encoding /WinAnsiEncoding
            >>"""))

    width, height = 100, 100
    image_data = b"\xff\x7f\x00" * (width * height)

    image = Stream(pdf, image_data)
    image.stream_dict = Object.parse(b"""
            <<
                /Type /XObject
                /Subtype /Image
                /ColorSpace /DeviceRGB
                /BitsPerComponent 8
                /Width 100
                /Height 100
            >>""")

    rfont = {'/F1': font}

    xobj = {'/Im1': image}

    resources = {
        '/Font': rfont,
        '/XObject': xobj
        }

    mediabox = [0, 0, 612, 792]

    stream = b"""
        BT /F1 24 Tf 72 720 Td (Hi there) Tj ET
        q 144 0 0 144 234 324 cm /Im1 Do Q
        """

    contents = Stream(pdf, stream)

    page_dict = {
        '/Type': Name('/Page'),
        '/MediaBox': mediabox,
        '/Contents': contents,
        '/Resources': resources
        }
    qpdf_page_dict = page_dict
    page = pdf.make_indirect(qpdf_page_dict)

    pdf.pages.append(page)
    pdf.save(outdir / 'hi.pdf')
Пример #3
0
def _process_content_streams(
        *,
        pdf: Pdf,
        container: Object,
        shorthand=None
) -> Iterator[Union[VectorMarker, TextMarker, ImageInfo]]:
    """Find all individual instances of images drawn in the container

    Usually the container is a page, but it may also be a Form XObject.

    On a typical page images are stored inline or as regular images
    in an XObject.

    Form XObjects may include inline images, XObject images,
    and recursively, other Form XObjects; and also vector graphic objects.

    Every instance of an image being drawn somewhere is flattened and
    treated as a unique image, since if the same image is drawn multiple times
    on one page it may be drawn at differing resolutions, and our objective
    is to find the resolution at which the page can be rastered without
    downsampling.

    """

    if container.get('/Type') == '/Page' and '/Contents' in container:
        initial_shorthand = shorthand or UNIT_SQUARE
    elif container.get(
            '/Type') == '/XObject' and container['/Subtype'] == '/Form':
        # Set the CTM to the state it was when the "Do" operator was
        # encountered that is drawing this instance of the Form XObject
        ctm = PdfMatrix(shorthand) if shorthand else PdfMatrix.identity()

        # A Form XObject may provide its own matrix to map form space into
        # user space. Get this if one exists
        form_shorthand = container.get('/Matrix', PdfMatrix.identity())
        form_matrix = PdfMatrix(form_shorthand)

        # Concatenate form matrix with CTM to ensure CTM is correct for
        # drawing this instance of the XObject
        ctm = form_matrix @ ctm
        initial_shorthand = ctm.shorthand
    else:
        return

    contentsinfo = _interpret_contents(container, initial_shorthand)

    if contentsinfo.found_vector:
        yield VectorMarker()
    if contentsinfo.found_text:
        yield TextMarker()
    yield from _find_inline_images(contentsinfo)
    yield from _find_regular_images(container, contentsinfo)
    yield from _find_form_xobject_images(pdf, container, contentsinfo)
Пример #4
0
def extract_jbig2(im_obj: pikepdf.Object,
                  globals_obj: pikepdf.Object = None) -> Image:
    with NamedTemp() as imgfile, NamedTemp() as globalfile, NamedTemp(
    ) as outfile:
        imgfile.write(im_obj.read_raw_bytes())
        imgfile.seek(0)

        args = ['jbig2dec', '-e', '-o', outfile.name]
        if globals_obj is not None:
            globalfile.write(globals_obj.read_raw_bytes())
            globalfile.seek(0)
            args.append(globalfile.name)
        args.append(imgfile.name)

        run(args, check=True)
        return Image.open(outfile)
Пример #5
0
    def __init__(self, *, image_data, image_object: tuple):
        """
        Args:
            image_data: data stream for image, extracted from content stream
            image_object: the metadata for image, also from content stream
        """

        # Convert the sequence of pikepdf.Object from the content stream into
        # a dictionary object by unparsing it (to bytes), eliminating inline
        # image abbreviations, and constructing a bytes string equivalent to
        # what an image XObject would look like. Then retrieve data from there

        self._data = image_data
        self._image_object = image_object

        reparse = b' '.join(
            self._unparse_obj(obj, remap_names=self.ABBREVS)
            for obj in image_object)
        try:
            reparsed_obj = Object.parse(b'<< ' + reparse + b' >>')
        except PdfError as e:
            raise PdfError("parsing inline " +
                           reparse.decode('unicode_escape')) from e
        self.obj = reparsed_obj
        self.pil = None
Пример #6
0
def extract_jbig2(im_obj: pikepdf.Object, globals_obj: pikepdf.Object = None) -> Image:
    with NamedTemp() as imgfile, NamedTemp() as globalfile, NamedTemp() as outfile:
        imgfile.write(im_obj.read_raw_bytes())
        imgfile.seek(0)

        args = ['jbig2dec', '-e', '-o', outfile.name]
        if globals_obj is not None:
            globalfile.write(globals_obj.read_raw_bytes())
            globalfile.seek(0)
            args.append(globalfile.name)
        args.append(imgfile.name)

        run(args, stdout=DEVNULL, check=True)
        im = Image.open(outfile)
        im.load()  # Load pixel data into memory so file can be closed
        return im
Пример #7
0
def parse_content_stream(page_or_stream, operators=''):
    """
    Parse a PDF content stream into a sequence of instructions.

    A PDF content stream is list of instructions that describe where to render
    the text and graphics in a PDF. This is the starting point for analyzing
    PDFs.

    If the input is a page and page.Contents is an array, then the content
    stream is automatically treated as one coalesced stream.

    Each instruction contains at least one operator and zero or more operands.

    Args:
        page_or_stream (pikepdf.Object): A page object, or the content
            stream attached to another object such as a Form XObject.
        operators (str): A space-separated string of operators to whitelist.
            For example 'q Q cm Do' will return only operators
            that pertain to drawing images. Use 'BI ID EI' for inline images.
            All other operators and associated tokens are ignored. If blank,
            all tokens are accepted.

    Returns:
        list: List of ``(operands, command)`` tuples where ``command`` is an
            operator (str) and ``operands`` is a tuple of str; the PDF drawing
            command and the command's operands, respectively.

    Example:

        >>> pdf = pikepdf.Pdf.open(input_pdf)
        >>> page = pdf.pages[0]
        >>> for operands, command in parse_content_stream(page):
        >>>     print(command)

    """

    if not isinstance(page_or_stream, Object):
        raise TypeError("stream must a PDF object")

    if (
        page_or_stream._type_code != ObjectType.stream
        and page_or_stream.get('/Type') != '/Page'
    ):
        raise TypeError("parse_content_stream called on page or stream object")

    try:
        if page_or_stream.get('/Type') == '/Page':
            page = page_or_stream
            instructions = page._parse_page_contents_grouped(operators)
        else:
            stream = page_or_stream
            instructions = Object._parse_stream_grouped(stream, operators)
    except PdfError as e:
        # This is the error message for qpdf >= 7.0. It was different in 6.x
        # but we no longer support 6.x
        if 'ignoring non-stream while parsing' in str(e):
            raise TypeError("parse_content_stream called on non-stream Object")
        raise e from e

    return instructions
Пример #8
0
def test_decimal_from_float(f):
    d = Decimal(f)
    if isfinite(f) and d.is_finite():
        try:
            # PDF is limited to ~5 sig figs
            decstr = str(d.quantize(Decimal('1.000000')))
        except InvalidOperation:
            return  # PDF doesn't support exponential notation
        try:
            py_d = Object.parse(decstr)
        except RuntimeError as e:
            if 'overflow' in str(e) or 'underflow' in str(e):
                py_d = Object.parse(str(f))

        assert isclose(py_d, d, abs_tol=1e-5), (d, f.hex())
    else:
        with pytest.raises(PdfError):
            Object.parse(str(d))
Пример #9
0
    def __init__(self, context):
        self.context = context
        self.path_base = context.origin

        self.pdf_base = Pdf.open(self.path_base)
        self.font, self.font_key = None, None

        self.pdfinfo = context.pdfinfo
        self.output_file = context.get_path('graft_layers.pdf')

        self.procset = self.pdf_base.make_indirect(
            Object.parse(b'[ /PDF /Text /ImageB /ImageC /ImageI ]'))

        self.emplacements = 1
        self.interim_count = 0
Пример #10
0
def rewrite_png(pike: Pdf, im_obj: Object,
                compdata) -> None:  # pragma: no cover
    # When a PNG is inserted into a PDF, we more or less copy the IDAT section from
    # the PDF and transfer the rest of the PNG headers to PDF image metadata.
    # One thing we have to do is tell the PDF reader whether a predictor was used
    # on the image before Flate encoding. (Typically one is.)
    # According to Leptonica source, PDF readers don't actually need us
    # to specify the correct predictor, they just need a value of either:
    #   1 - no predictor
    #   10-14 - there is a predictor
    # Leptonica's compdata->predictor only tells TRUE or FALSE
    # 10-14 means the actual predictor is specified in the data, so for any
    # number >= 10 the PDF reader will use whatever the PNG data specifies.
    # In practice Leptonica should use Paeth, 14, but 15 seems to be the
    # designated value for "optimal". So we will use 15.
    # See:
    #   - PDF RM 7.4.4.4 Table 10
    #   - https://github.com/DanBloomberg/leptonica/blob/master/src/pdfio2.c#L757
    predictor = 15 if compdata.predictor > 0 else 1
    dparms = Dictionary(Predictor=predictor)
    if predictor > 1:
        dparms.BitsPerComponent = compdata.bps  # Yes, this is redundant
        dparms.Colors = compdata.spp
        dparms.Columns = compdata.w

    im_obj.BitsPerComponent = compdata.bps
    im_obj.Width = compdata.w
    im_obj.Height = compdata.h

    log.debug(
        f"PNG {im_obj.objgen}: palette={compdata.ncolors} spp={compdata.spp} bps={compdata.bps}"
    )
    if compdata.ncolors > 0:
        # .ncolors is the number of colors in the palette, not the number of
        # colors used in a true color image. The palette string is always
        # given as RGB tuples even when the image is grayscale; see
        # https://github.com/DanBloomberg/leptonica/blob/master/src/colormap.c#L2067
        palette_pdf_string = compdata.get_palette_pdf_string()
        palette_data = pikepdf.Object.parse(palette_pdf_string)
        palette_stream = pikepdf.Stream(pike, bytes(palette_data))
        palette = [
            Name.Indexed, Name.DeviceRGB, compdata.ncolors - 1, palette_stream
        ]
        cs = palette
    else:
        # ncolors == 0 means we are using a colorspace without a palette
        if compdata.spp == 1:
            cs = Name.DeviceGray
        elif compdata.spp == 4:
            cs = Name.DeviceCMYK
        else:  # spp == 3
            cs = Name.DeviceRGB
    im_obj.ColorSpace = cs
    im_obj.write(compdata.read(), filter=Name.FlateDecode, decode_parms=dparms)
Пример #11
0
def rewrite_png_as_g4(pike: Pdf, im_obj: Object, compdata) -> None:  # pragma: no cover
    im_obj.BitsPerComponent = 1
    im_obj.Width = compdata.w
    im_obj.Height = compdata.h

    im_obj.write(compdata.read())

    log.debug(f"PNG to G4 {im_obj.objgen}")
    if Name.Predictor in im_obj:
        del im_obj.Predictor
    if Name.DecodeParms in im_obj:
        del im_obj.DecodeParms
    im_obj.DecodeParms = Dictionary(
        K=-1, BlackIs1=bool(compdata.minisblack), Columns=compdata.w
    )

    im_obj.Filter = Name.CCITTFaxDecode
    return
Пример #12
0
 def test_numbers(self):
     self.check(Object.parse('1.0'), 1)
     self.check(Object.parse('42'), 42)
Пример #13
0
def test_open_pdf(graph):
    page = graph.pages[0]
    Object._parse_stream(page.obj, PrintParser())
Пример #14
0
def test_parser_exception(graph):
    stream = graph.pages[0]['/Contents']
    with pytest.raises(ValueError):
        Object._parse_stream(stream, ExceptionParser())
Пример #15
0
def test_parser_exception(resources):
    pdf = Pdf.open(resources / 'graph.pdf')
    stream = pdf.pages[0]['/Contents']
    with pytest.raises(ValueError):
        Object._parse_stream(stream, ExceptionParser())
Пример #16
0
 def test_bool_comparison(self):
     self.check(Object.parse('0.0'), False)
     self.check(True, 1)
Пример #17
0
def parse_content_stream(
        page_or_stream: Object,
        operators: str = '') -> List[ContentStreamInstructions]:
    """
    Parse a PDF content stream into a sequence of instructions.

    A PDF content stream is list of instructions that describe where to render
    the text and graphics in a PDF. This is the starting point for analyzing
    PDFs.

    If the input is a page and page.Contents is an array, then the content
    stream is automatically treated as one coalesced stream.

    Each instruction contains at least one operator and zero or more operands.

    This function does not have anything to do with opening a PDF file itself or
    processing data from a whole PDF. It is for processing a specific object inside
    a PDF that is already opened.

    Args:
        page_or_stream: A page object, or the content
            stream attached to another object such as a Form XObject.
        operators: A space-separated string of operators to whitelist.
            For example 'q Q cm Do' will return only operators
            that pertain to drawing images. Use 'BI ID EI' for inline images.
            All other operators and associated tokens are ignored. If blank,
            all tokens are accepted.

    Returns:
        list: List of ``(operands, command)`` tuples where ``command`` is an
            operator (str) and ``operands`` is a tuple of str; the PDF drawing
            command and the command's operands, respectively.

    Example:

        >>> pdf = pikepdf.Pdf.open(input_pdf)
        >>> page = pdf.pages[0]
        >>> for operands, command in parse_content_stream(page):
        >>>     print(command)

    """

    if not isinstance(page_or_stream, Object):
        raise TypeError("stream must be a pikepdf.Object")

    if (page_or_stream._type_code != ObjectType.stream
            and page_or_stream.get('/Type') != '/Page'):
        raise TypeError("parse_content_stream called on page or stream object")

    try:
        if page_or_stream.get('/Type') == '/Page':
            page = page_or_stream
            instructions = cast(
                List[ContentStreamInstructions],
                page._parse_page_contents_grouped(operators),
            )
        else:
            stream = page_or_stream
            instructions = cast(
                List[ContentStreamInstructions],
                Object._parse_stream_grouped(stream, operators),
            )
    except PdfError as e:
        if 'supposed to be a stream or an array' in str(e):
            raise TypeError(
                "parse_content_stream called on non-stream Object") from e
        else:
            raise e from e

    return instructions
Пример #18
0
def test_create_form_xobjects(outdir):
    pdf = Pdf.new()

    font = pdf.make_indirect(
        Object.parse(b"""
            <<
                /Type /Font
                /Subtype /Type1
                /Name /F1
                /BaseFont /Helvetica
                /Encoding /WinAnsiEncoding
            >>
        """))

    width, height = 100, 100
    image_data = b"\xff\x7f\x00" * (width * height)

    image = Stream(pdf, image_data)
    image.stream_dict = Object.parse("""
            <<
                /Type /XObject
                /Subtype /Image
                /ColorSpace /DeviceRGB
                /BitsPerComponent 8
                /Width 100
                /Height 100
            >>
    """)
    xobj_image = Dictionary({'/Im1': image})

    form_xobj_res = Dictionary({'/XObject': xobj_image})
    form_xobj = Stream(
        pdf,
        b"""
        /Im1 Do
    """,
    )
    form_xobj['/Type'] = Name('/XObject')
    form_xobj['/Subtype'] = Name('/Form')
    form_xobj['/FormType'] = 1
    form_xobj['/Matrix'] = [1, 0, 0, 1, 0, 0]
    form_xobj['/BBox'] = [0, 0, 1, 1]
    form_xobj['/Resources'] = form_xobj_res

    rfont = {'/F1': font}

    resources = {'/Font': rfont, '/XObject': {'/Form1': form_xobj}}

    mediabox = [0, 0, 612, 792]

    stream = b"""
        BT /F1 24 Tf 72 720 Td (Hi there) Tj ET
        q 144 0 0 144 234 324 cm /Form1 Do Q
        q 72 0 0 72 378 180 cm /Form1 Do Q
        """

    contents = Stream(pdf, stream)

    page = pdf.make_indirect({
        '/Type': Name('/Page'),
        '/MediaBox': mediabox,
        '/Contents': contents,
        '/Resources': resources,
    })

    pdf.pages.append(page)
    pdf.save(outdir / 'formxobj.pdf')
Пример #19
0
def test_not_constructible():
    with pytest.raises(TypeError, match="constructor"):
        Object()
Пример #20
0
def test_open_pdf(resources):
    pdf = Pdf.open(resources / 'graph.pdf')
    page = pdf.pages[0]
    Object._parse_stream(page, PrintParser())