Пример #1
0
def test_objectlist_repr(pal):
    """The repr of a parsed operand list spells out every operand value."""
    instructions = pikepdf.parse_content_stream(pal.pages[0].Contents)
    operand_list = instructions[1][0]
    assert isinstance(operand_list, pikepdf._qpdf._ObjectList)

    expected_fragment = "[Decimal('144.0000'), 0, 0, Decimal('144.0000'), Decimal('0.0000'), Decimal('0.0000')]"
    assert expected_fragment in repr(operand_list)
Пример #2
0
def test_text_filter(resources, outdir):
    """Dropping every ``Tj`` instruction leaves a page with no extractable text."""
    input_pdf = resources / 'veraPDF test suite 6-2-10-t02-pass-a.pdf'
    output_pdf = outdir / 'notext.pdf'

    def extracted_text(path):
        # pdftotext acts as an independent judge of what text a PDF contains.
        result = run(
            ['pdftotext', str(path), '-'],
            check=True,
            stdout=PIPE,
            encoding='utf-8',
        )
        return result.stdout.strip()

    # Ensure the test PDF has text we can find
    assert extracted_text(input_pdf) != '', "Need input test file that contains text"

    pdf = Pdf.open(input_pdf)
    page = pdf.pages[0]

    show_text = Operator('Tj')
    keep = []
    for operands, command in parse_content_stream(page):
        if command == show_text:
            print("skipping Tj")
        else:
            keep.append((operands, command))

    # Replace the page contents with the filtered instruction list.
    new_stream = Stream(pdf, keep)
    print(new_stream.read_bytes())  # pylint: disable=no-member
    page['/Contents'] = new_stream
    page['/Rotate'] = 90

    pdf.save(output_pdf, True)

    assert extracted_text(output_pdf) == '', "Expected text to be removed"
Пример #3
0
def strip_invisible_text(pdf, page):
    """Replace ``page.Contents`` with a stream lacking render-mode-3 text objects.

    A text object is the run of instructions from ``BT`` through ``ET``.  Its
    instructions are buffered while it is being read and copied to the output
    only if the render mode in effect at ``ET`` is not 3.  The render mode is
    reset to 0 at every ``BT`` that opens a text object and updated by each
    ``Tr`` seen inside one.  Instructions outside text objects are copied
    unchanged.
    """
    begin_text = Operator('BT')
    end_text = Operator('ET')
    set_render_mode = Operator('Tr')

    kept = []
    pending = []  # instructions of the text object currently being read
    inside_text = False
    mode = 0

    Page(page).contents_coalesce()
    for operands, operator in parse_content_stream(page, ''):
        instruction = (operands, operator)
        if inside_text:
            if operator == set_render_mode:
                mode = operands[0]
            pending.append(instruction)
            if operator == end_text:
                inside_text = False
                if mode != 3:
                    kept.extend(pending)
                pending.clear()
        elif operator == begin_text:
            inside_text = True
            mode = 0
            pending.append(instruction)
        else:
            kept.append(instruction)

    page.Contents = Stream(pdf, unparse_content_stream(kept))
Пример #4
0
def test_inline_copy(inline):
    """Inline image instructions can be copy-constructed and rebuilt from their image."""
    instructions = parse_content_stream(inline.pages[0].Contents)
    inline_instructions = (
        instr for instr in instructions if isinstance(instr, ContentStreamInlineImage)
    )
    for original in inline_instructions:
        # Copy construction only has to succeed; the copy itself is not inspected.
        _duplicate = ContentStreamInlineImage(original)  # noqa: F841
        rebuilt = ContentStreamInlineImage(original.iimage)
        assert unparse_content_stream([rebuilt]).startswith(b'BI')
Пример #5
0
def test_inline(inline):
    """An inline image exposes its properties and survives an unparse/parse round trip."""
    iimage, pdf = inline

    assert iimage.width == 8
    assert iimage.image_mask == False
    assert iimage.mode == 'RGB'
    assert iimage.is_inline
    assert iimage.colorspace == '/DeviceRGB'

    # Serialise the image into a fresh stream and parse it back.
    roundtrip_stream = pdf.make_stream(iimage.unparse())
    for operands, _command in parse_content_stream(roundtrip_stream):
        if not operands or not isinstance(operands[0], PdfInlineImage):
            continue
        assert operands[0] == iimage
Пример #6
0
def test_invalid_stream_object():
    """Non-stream inputs are rejected: TypeError for plain values, PdfError for a fake page."""
    with pytest.raises(TypeError):
        parse_content_stream(42)

    with pytest.raises(TypeError):
        plain_dictionary = Dictionary({"/Hi": 3})
        parse_content_stream(plain_dictionary)

    with pytest.raises(PdfError):
        # Looks like a page, but its Contents entry is not a stream.
        parse_content_stream(Dictionary(Type=Name.Page, Contents=42))
Пример #7
0
def _simple_interpret_content_stream(page: Union[Page, Object]):
    """Yield ``(xobject_name, ctm)`` for every ``Do`` instruction on ``page``.

    Only ``q``, ``Q``, ``cm`` and ``Do`` are requested from the parser.  ``q``
    pushes the current transformation matrix, ``Q`` pops it back, and ``cm``
    pre-multiplies the current matrix by its operands.  Inline images are
    skipped.
    """
    save_state = Operator('q')
    restore_state = Operator('Q')
    concat_matrix = Operator('cm')
    draw_xobject = Operator('Do')

    current = PdfMatrix.identity()
    saved: List[PdfMatrix] = []
    for instr in parse_content_stream(page, operators='q Q cm Do'):
        if isinstance(instr, ContentStreamInlineImage):
            continue
        operands = instr.operands
        op = instr.operator
        if op == save_state:
            saved.append(current)
        elif op == restore_state:
            current = saved.pop()
        elif op == concat_matrix:
            current = PdfMatrix(operands) @ current
        elif op == draw_xobject:
            yield (operands[0], current)
Пример #8
0
def test_parse_results(inline):
    """Parsed instructions expose typed operands/operators; inline images are wrapped."""
    instructions = parse_content_stream(inline.pages[0])

    first = instructions[0]
    assert isinstance(first, ContentStreamInstruction)
    assert isinstance(first.operands, _qpdf._ObjectList)
    assert isinstance(first.operator, Operator)
    assert 'Operator' in repr(first)

    # Copy construction preserves the operator.
    assert ContentStreamInstruction(first).operator == first.operator

    for instr in instructions:
        if not isinstance(instr, ContentStreamInlineImage):
            continue
        assert instr.operator == Operator("INLINE IMAGE")
        assert isinstance(instr.operands[0], PdfInlineImage)
        assert 'INLINE' in repr(instr)
        assert instr.operands[0] == instr.iimage
Пример #9
0
def test_invalid_stream_object():
    """Each kind of non-stream input is rejected with its own TypeError message."""
    with pytest.raises(TypeError, match="must be a pikepdf.Object"):
        parse_content_stream(42)

    with pytest.raises(TypeError, match="called on page or stream"):
        plain_dictionary = Dictionary({"/Hi": 3})
        parse_content_stream(plain_dictionary)

    non_stream_message = "parse_content_stream called on non-stream Object"
    with pytest.raises(TypeError, match=non_stream_message):
        # Looks like a page, but its Contents entry is not a stream.
        parse_content_stream(Dictionary(Type=Name.Page, Contents=42))
Пример #10
0
def strip_invisible_text(pdf, page):
    """Replace ``page.Contents`` with a stream lacking render-mode-3 text objects.

    Instructions between ``BT`` and ``ET`` are buffered as one text object and
    copied to the output only if the render mode in effect at ``ET`` is not 3.
    The render mode is reset to 0 at every ``BT`` that opens a text object and
    updated by each ``Tr`` seen inside one.  All other instructions are copied
    unchanged.  The surviving instructions are serialised one per line.
    """
    begin_text = pikepdf.Operator('BT')
    end_text = pikepdf.Operator('ET')
    set_render_mode = pikepdf.Operator('Tr')
    inline_image = pikepdf.Operator('INLINE IMAGE')

    kept = []
    pending = []  # instructions of the text object currently being read
    inside_text = False
    mode = 0

    pikepdf.Page(page).contents_coalesce()
    for operands, operator in pikepdf.parse_content_stream(page, ''):
        instruction = (operands, operator)
        if inside_text:
            if operator == set_render_mode:
                mode = operands[0]
            pending.append(instruction)
            if operator == end_text:
                inside_text = False
                if mode != 3:
                    kept.extend(pending)
                pending.clear()
        elif operator == begin_text:
            inside_text = True
            mode = 0
            pending.append(instruction)
        else:
            kept.append(instruction)

    def operand_bytes(operand):
        # Operands lacking unparse() fall back to the ASCII form of str().
        try:
            return operand.unparse()
        except AttributeError:
            return str(operand).encode('ascii')

    def instruction_bytes(operands, operator):
        # An inline image carries its own complete serialisation.
        if operator == inline_image:
            return operands[0].unparse()
        joined = b' '.join(operand_bytes(item) for item in operands)
        return joined + b' ' + operator.unparse()

    content_stream = b'\n'.join(instruction_bytes(*instr) for instr in kept)
    page.Contents = pikepdf.Stream(pdf, content_stream)
Пример #11
0
def test_inline(inline):
    """An inline image reports its properties, unparses abbreviated, and round-trips."""
    iimage, pdf = inline

    assert iimage.width == 8
    assert iimage.image_mask == False
    assert iimage.mode == 'RGB'
    assert iimage.is_inline
    assert iimage.colorspace == '/DeviceRGB'
    assert 'PdfInlineImage' in repr(iimage)

    serialised = iimage.unparse()
    assert b'/W 8' in serialised, "inline images should have abbreviated metadata"
    assert b'/Width 8' not in serialised, "abbreviations expanded in inline image"

    # Parse the serialised bytes back and compare with the original image.
    roundtrip_stream = pdf.make_stream(serialised)
    for operands, _command in parse_content_stream(roundtrip_stream):
        if not operands or not isinstance(operands[0], PdfInlineImage):
            continue
        assert operands[0] == iimage
Пример #12
0
def strip_invisible_text(pdf, page):
    """Replace ``page.Contents`` with a stream lacking render-mode-3 text objects.

    Instructions between ``BT`` and ``ET`` are buffered as one text object and
    copied to the output only if the render mode in effect at ``ET`` is not 3.
    The render mode is reset to 0 at every ``BT`` that opens a text object and
    updated by each ``Tr`` seen inside one.  All other instructions are copied
    unchanged.  The surviving instructions are serialised one per line.
    """
    begin_text = pikepdf.Operator("BT")
    end_text = pikepdf.Operator("ET")
    set_render_mode = pikepdf.Operator("Tr")
    inline_image = pikepdf.Operator("INLINE IMAGE")

    kept = []
    pending = []  # instructions of the text object currently being read
    inside_text = False
    mode = 0

    page.page_contents_coalesce()
    for operands, operator in pikepdf.parse_content_stream(page, ""):
        instruction = (operands, operator)
        if inside_text:
            if operator == set_render_mode:
                mode = operands[0]
            pending.append(instruction)
            if operator == end_text:
                inside_text = False
                if mode != 3:
                    kept.extend(pending)
                pending.clear()
        elif operator == begin_text:
            inside_text = True
            mode = 0
            pending.append(instruction)
        else:
            kept.append(instruction)

    def operand_bytes(operand):
        # Operands lacking unparse() fall back to the ASCII form of str().
        try:
            return operand.unparse()
        except AttributeError:
            return str(operand).encode("ascii")

    def instruction_bytes(operands, operator):
        # An inline image carries its own complete serialisation.
        if operator == inline_image:
            return operands[0].unparse()
        joined = b" ".join(operand_bytes(item) for item in operands)
        return joined + b" " + operator.unparse()

    content_stream = b"\n".join(instruction_bytes(*instr) for instr in kept)
    page.Contents = pikepdf.Stream(pdf, content_stream)
Пример #13
0
def strip_invisible_text(pdf, page, log):
    """Replace ``page.Contents`` with a stream lacking render-mode-3 text objects.

    Instructions between ``BT`` and ``ET`` are buffered as one text object and
    copied to the output only if the render mode in effect at ``ET`` is not 3.
    The render mode is reset to 0 at every ``BT`` that opens a text object and
    updated by each ``Tr`` seen inside one.  All other instructions are copied
    unchanged.  The surviving instructions are serialised one per line.

    ``log`` is accepted but not used by this function.
    """
    begin_text = pikepdf.Operator('BT')
    end_text = pikepdf.Operator('ET')
    set_render_mode = pikepdf.Operator('Tr')
    inline_image = pikepdf.Operator('INLINE IMAGE')

    kept = []
    pending = []  # instructions of the text object currently being read
    inside_text = False
    mode = 0

    page.page_contents_coalesce()
    for operands, operator in pikepdf.parse_content_stream(page, ''):
        instruction = (operands, operator)
        if inside_text:
            if operator == set_render_mode:
                mode = operands[0]
            pending.append(instruction)
            if operator == end_text:
                inside_text = False
                if mode != 3:
                    kept.extend(pending)
                pending.clear()
        elif operator == begin_text:
            inside_text = True
            mode = 0
            pending.append(instruction)
        else:
            kept.append(instruction)

    def operand_bytes(operand):
        # Operands lacking unparse() fall back to the ASCII form of str().
        try:
            return operand.unparse()
        except AttributeError:
            return str(operand).encode('ascii')

    def instruction_bytes(operands, operator):
        # An inline image carries its own complete serialisation.
        if operator == inline_image:
            return operands[0].unparse()
        joined = b' '.join(operand_bytes(item) for item in operands)
        return joined + b' ' + operator.unparse()

    content_stream = b'\n'.join(instruction_bytes(*instr) for instr in kept)
    page.Contents = pikepdf.Stream(pdf, content_stream)
Пример #14
0
def parse_text(qpdf_page: pikepdf.Page, font_map, synthesizer: PdfSynthesizer):
    """Rebuild a page's content stream with every text run passed through ``synthesizer``.

    The page's instructions are read from a single shared iterator.  When a
    ``BT`` is met, ``_parse_text_block`` is handed that same iterator so it
    can consume the rest of the text block; each text it reports is replaced
    by ``synthesizer.modify_text(text, font=font)``.  Every instruction that
    is not a ``BT`` is copied through unchanged.

    Args:
        qpdf_page: Page whose content stream is parsed.
        font_map: Passed through to ``_parse_font`` and ``_parse_text_block``.
        synthesizer: Object providing ``modify_text(text, font=...)``.

    Returns:
        A list of ``(operands, operator)`` instructions for the new stream.

    Raises:
        HasFormException: A ``Do`` instruction for which ``has_form`` is true.
        TooManySingleChars: More than 90% of the text runs are one character
            long.
    """
    op_do = pikepdf.Operator('Do')
    op_set_font = pikepdf.Operator('Tf')
    op_begin_text = pikepdf.Operator('BT')

    content_stream = iter(pikepdf.parse_content_stream(qpdf_page))
    new_content_stream = []
    last_used_font = None
    text_lengths = collections.Counter()

    for operands, operator in content_stream:
        if operator == op_do and has_form(qpdf_page, operands):
            raise HasFormException

        if operator == op_set_font:
            last_used_font = _parse_font(operands, font_map)

        if operator == op_begin_text:
            # The helper advances the shared iterator past the text block.
            text_block, last_used_font = _parse_text_block(
                font_map=font_map,
                start=(operands, operator),
                content_stream=content_stream,
                current_font=last_used_font,
            )
            for text_id, text, font in text_block:
                text_lengths[len(text)] += 1
                modified_text = synthesizer.modify_text(text, font=font)
                text_block.set_unicode_text(text_id, modified_text)
            new_content_stream.extend(text_block.content_stream)
        else:
            new_content_stream.append((operands, operator))

    # A page with no text runs has no ratio to compute; dividing here used to
    # raise ZeroDivisionError for such pages.
    total_texts = sum(text_lengths.values())
    if total_texts:
        single_chars = text_lengths[1] / total_texts
        if single_chars > 0.9:
            raise TooManySingleChars(
                f'Too many single characters in document ({single_chars * 100:.2f}%)'
            )

    return new_content_stream
Пример #15
0
def _interpret_contents(contentstream, initial_shorthand=UNIT_SQUARE):
    """Walk a PDF content stream and record the images and text it draws.

    Only the current transformation matrix (CTM) is tracked, so the saved
    state stack holds matrices and nothing else; a complete interpreter would
    have to track far more graphics state.

    The CTM starts out as the mapping from user space to device space.  PDF
    units are 1/72", and a viewer or printer would initialise the matrix to
    its own device transformation -- for instance (1/72, 0, 0, 1/72, 0, 0)
    would make every unit come out in inches.  PDF units are fine for our
    purposes, so the CTM is initialised to the identity matrix.

    An image always occupies (0, 0) -> (1, 1).  A 'cm' placed before the
    image establishes a coordinate system in which drawing that unit square
    covers the intended area of the page.

    Raises RuntimeError if the graphics stack grows beyond 32 entries or is
    popped while empty.
    """
    text_operators = {'Tj', 'TJ', '"', "'"}
    operator_whitelist = """q Q Do cm TJ Tj " ' BI ID EI"""

    ctm = PdfMatrix(initial_shorthand)
    saved_ctms = []
    xobject_settings = []
    inline_images = []
    found_text = False

    instructions = _normalize_stack(
        pikepdf.parse_content_stream(contentstream, operator_whitelist)
    )
    for n, (operands, command) in enumerate(instructions):
        if command == 'q':
            saved_ctms.append(ctm)
            if len(saved_ctms) > 32:
                raise RuntimeError("PDF graphics stack overflow, command %i" %
                                   n)
        elif command == 'Q':
            try:
                ctm = saved_ctms.pop()
            except IndexError:
                raise RuntimeError("PDF graphics stack underflow, command %i" %
                                   n)
        elif command == 'cm':
            ctm = PdfMatrix(operands) @ ctm
        elif command == 'Do':
            xobject_settings.append(
                XobjectSettings(
                    name=operands[0],
                    shorthand=ctm.shorthand,
                    stack_depth=len(saved_ctms),
                )
            )
        elif command == 'INLINE IMAGE':
            inline_images.append(
                InlineSettings(
                    iimage=operands[0],
                    shorthand=ctm.shorthand,
                    stack_depth=len(saved_ctms),
                )
            )
        elif command in text_operators:
            found_text = True

    return ContentsInfo(
        xobject_settings=xobject_settings,
        inline_images=inline_images,
        found_text=found_text,
    )
Пример #16
0
def test_unparse_inline(inline):
    """Unparsing a page with an inline image emits ``BI`` and matches the slow path."""
    instructions = parse_content_stream(inline.pages[0])
    fast_result = unparse_content_stream(instructions)

    assert b'BI' in fast_result
    assert fast_result == slow_unparse_content_stream(instructions)
Пример #17
0
def test_parse_xobject(resources):
    """A form XObject can be parsed directly; its first operator is ``cm``."""
    source_path = resources / 'formxobject.pdf'
    with Pdf.open(source_path) as pdf:
        form_xobject = pdf.pages[0].Resources.XObject.Form1
        parsed = parse_content_stream(form_xobject)
        first_instruction = parsed[0]
        assert first_instruction[1] == Operator('cm')
Пример #18
0
    def filter_content(self, content, layer=None):
        """Return a rewritten content stream for ``content`` as unparsed bytes.

        The instructions of ``content`` are parsed and copied into a new list,
        which is finally passed to ``pikepdf.unparse_content_stream``.  While
        not copying, instructions whose operator is in ``show_ops`` are
        dropped; every other instruction is always copied.  Copying starts as
        ``self.keep_non_oc`` and is switched by ``BDC`` instructions whose
        first operand is ``/OC``: on when the second operand is a key of
        ``page_keep``, off otherwise.  A later ``EMC`` restores
        ``self.keep_non_oc``.

        When ``layer`` is given, ``BDC`` detection is skipped, copying starts
        switched on, and the line property modifications come from
        ``self.line_props[layer]``.

        :param content: a page or an xobject (anything accepted by
            ``pikepdf.parse_content_stream`` that offers ``keys()``).
        :param layer: optional key into ``self.line_props``.
        :return: the result of ``pikepdf.unparse_content_stream``.
        """
        # content can be either a page or an xobject
        if '/Resources' in content.keys():
            page_keep = self.find_page_keep(content.Resources)
        else:
            page_keep = {}

        commands = pikepdf.parse_content_stream(content)
        # Operators that pdf_ops.ops classifies as 'show' in their first field.
        show_ops = [
            pikepdf.Operator(k) for k, v in pdf_ops.ops.items()
            if v[0] == 'show'
        ]
        # The subset of show operators whose second field is 'stroke'.
        stroke_ops = [
            pikepdf.Operator(k) for k, v in pdf_ops.ops.items()
            if v[0] == 'show' and v[1] == 'stroke'
        ]
        new_content = []
        in_oc = False
        currently_copying = self.keep_non_oc
        gs_mod = []
        # NOTE(review): new_q is never set to True anywhere in this method, so
        # the `if new_q:` branch further down cannot run as written.
        new_q = False

        if layer is not None:
            # An explicit layer was requested: treat the whole stream as being
            # inside optional content and copy everything.
            layer_mod, mod_applied = self.convert_layer_props(
                self.line_props[layer])
            in_oc = True
            currently_copying = True
        else:
            layer_mod = None
            mod_applied = None

        for operands, operator in commands:
            # check to see if this pdf has CMYK or RGB colour definitions
            if not self.colour_type:
                self.check_colour(operator, operands)

            # look for optional content
            if layer is None and operator == pikepdf.Operator('BDC'):
                # BDC/BMC doesn't necessarily mean optional content block
                # check the operands for the /OC flag
                if len(operands) > 1 and operands[0] == '/OC':
                    in_oc = True
                    # presumably page_keep maps the OC property name used in
                    # the stream to a layer name -- confirm in find_page_keep
                    if operands[1] in page_keep.keys():
                        currently_copying = True

                        # get a link to the current line property modifications requested
                        if page_keep[operands[1]] in self.line_props.keys():
                            layer_mod, mod_applied = self.convert_layer_props(
                                self.line_props[page_keep[operands[1]]])
                    else:
                        currently_copying = False

            # all kinds of crazy stuff going on behind the scenes, so to select layers we can't just delete everything.
            # Just copy the non-showing operations
            if currently_copying or operator not in show_ops:
                # A list rather than a tuple so the operands can be swapped below.
                new_command = [operands, operator]

                if in_oc and layer_mod is not None:
                    op_string = str(operator)

                    # if we need to modify graphics state dictionaries, we need to retrieve that from the resources
                    # NOTE(review): the membership test uses str(operands) but
                    # the list stores the operands objects themselves --
                    # confirm the de-duplication behaves as intended.
                    if op_string == 'gs' and str(operands) not in gs_mod:
                        gs_mod.append(operands)

                    # check for one of the line property modification operators
                    if op_string in layer_mod.keys():
                        new_command[0] = layer_mod[op_string]
                        mod_applied[op_string] = True

                    # check if we're drawing but haven't applied all mods yet
                    if operator in stroke_ops and not all(
                            mod_applied.values()):
                        needs_mod = [
                            k for k, v in mod_applied.items() if not v
                        ]
                        # Emit the missing modifications ahead of the stroking
                        # instruction, which is appended after this block.
                        for key in needs_mod:
                            new_content.append(
                                [layer_mod[key],
                                 pikepdf.Operator(key)])
                            mod_applied[key] = True

                    if op_string == 'Q':
                        # reset the dictionary if we're in a new q/Q block
                        if all(mod_applied.values()):
                            mod_applied = {
                                key: False
                                for key in mod_applied.keys()
                            }

                new_content.append(new_command)

                # q is the only command that needs to go after the current command
                if new_q:
                    new_content.append([[], pikepdf.Operator('q')])
                    new_q = False

            # Any EMC seen while in_oc is set ends the block, including when
            # in_oc was forced on by the `layer` argument.
            if in_oc and operator == pikepdf.Operator('EMC'):
                currently_copying = self.keep_non_oc
                in_oc = False
                layer_mod = None

        # gs operands were only collected, not rewritten, so warn the user.
        if len(gs_mod) > 0:
            print(
                'Found graphics state dictionary, layer modification may not work as expected'
            )

        return pikepdf.unparse_content_stream(new_content)
Пример #19
0
def test_unparse_inline(resources):
    """Unparsing a page that holds an inline image emits a ``BI`` marker."""
    source_path = resources / 'image-mono-inline.pdf'
    with Pdf.open(source_path) as pdf:
        instructions = parse_content_stream(pdf.pages[0])
        serialised = unparse_content_stream(instructions)
        assert b'BI' in serialised
Пример #20
0
def test_invalid_stream_object():
    """A plain dictionary is not a content stream and is rejected with TypeError."""
    with pytest.raises(TypeError):
        plain_dictionary = Dictionary({"/Hi": 3})
        parse_content_stream(plain_dictionary)
Пример #21
0
def inline(resources):
    """Return ``(inline_image, pdf)`` for the first inline image on page 0.

    The Pdf is returned still open so the caller can keep using the image.
    Returns None when the page holds no inline image.
    """
    pdf = Pdf.open(resources / 'image-mono-inline.pdf')
    for operands, _command in parse_content_stream(pdf.pages[0]):
        if not operands:
            continue
        if isinstance(operands[0], PdfInlineImage):
            return operands[0], pdf
    return None
Пример #22
0
def _interpret_contents(contentstream, initial_shorthand=UNIT_SQUARE):
    """Walk a PDF content stream and record the images, vectors and text it draws.

    Only the current transformation matrix (CTM) is tracked, so the saved
    state stack holds matrices and nothing else; a complete interpreter would
    have to track far more graphics state.

    The CTM starts out as the mapping from user space to device space.  PDF
    units are 1/72", and a viewer or printer would initialise the matrix to
    its own device transformation -- for instance (1/72, 0, 0, 1/72, 0, 0)
    would make every unit come out in inches.  PDF units are fine for our
    purposes, so the CTM is initialised to the identity matrix.

    An image always occupies (0, 0) -> (1, 1).  A 'cm' placed before the
    image establishes a coordinate system in which drawing that unit square
    covers the intended area of the page.

    The PDF specification caps the stack depth at 32, but other viewers
    tolerate somewhat more.  Exceeding the spec limit therefore only produces
    a warning, while a hard limit of 128 bounds memory use and raises
    RuntimeError.  The spec leaves stack underflow undefined; here it
    produces a warning and the CTM is left as it was.
    """
    vector_ops = set('S s f F f* B B* b b*'.split())
    text_showing_ops = set("""TJ Tj " '""".split())
    image_ops = set('BI ID EI q Q Do cm'.split())
    operator_whitelist = ' '.join(vector_ops | text_showing_ops | image_ops)

    ctm = PdfMatrix(initial_shorthand)
    saved_ctms = []
    xobject_settings = []
    inline_images = []
    name_index = defaultdict(lambda: [])
    found_vector = False
    found_text = False

    instructions = _normalize_stack(
        pikepdf.parse_content_stream(contentstream, operator_whitelist)
    )
    for n, (operands, operator) in enumerate(instructions):
        if operator == 'q':
            saved_ctms.append(ctm)
            depth = len(saved_ctms)
            # Hard limit first, then the softer spec limit (see docstring).
            if depth > 128:
                raise RuntimeError(
                    "PDF graphics stack overflowed hard limit, operator %i" % n
                )
            if depth > 32:
                warn("PDF graphics stack overflowed spec limit")
        elif operator == 'Q':
            try:
                ctm = saved_ctms.pop()
            except IndexError:
                # Nothing to restore: leave the CTM alone and carry on.
                warn("PDF graphics stack underflowed - PDF may be malformed")
        elif operator == 'cm':
            ctm = PdfMatrix(operands) @ ctm
        elif operator == 'Do':
            xobject_name = operands[0]
            placement = XobjectSettings(
                name=xobject_name,
                shorthand=ctm.shorthand,
                stack_depth=len(saved_ctms),
            )
            xobject_settings.append(placement)
            name_index[xobject_name].append(placement)
        elif operator == 'INLINE IMAGE':  # BI/ID/EI are grouped into this
            inline_images.append(
                InlineSettings(
                    iimage=operands[0],
                    shorthand=ctm.shorthand,
                    stack_depth=len(saved_ctms),
                )
            )
        elif operator in vector_ops:
            found_vector = True
        elif operator in text_showing_ops:
            found_text = True

    return ContentsInfo(
        xobject_settings=xobject_settings,
        inline_images=inline_images,
        found_vector=found_vector,
        found_text=found_text,
        name_index=name_index,
    )
Пример #23
0
def _interpret_contents(contentstream, initial_shorthand=UNIT_SQUARE):
    """Walk a PDF content stream and record the images and vectors it draws.

    Only the current transformation matrix (CTM) is tracked, so the saved
    state stack holds matrices and nothing else; a complete interpreter would
    have to track far more graphics state.

    The CTM starts out as the mapping from user space to device space.  PDF
    units are 1/72", and a viewer or printer would initialise the matrix to
    its own device transformation -- for instance (1/72, 0, 0, 1/72, 0, 0)
    would make every unit come out in inches.  PDF units are fine for our
    purposes, so the CTM is initialised to the identity matrix.

    An image always occupies (0, 0) -> (1, 1).  A 'cm' placed before the
    image establishes a coordinate system in which drawing that unit square
    covers the intended area of the page.

    The PDF specification caps the stack depth at 32, but other viewers
    tolerate somewhat more.  Exceeding the spec limit therefore only produces
    a warning, while a hard limit of 128 bounds memory use and raises
    RuntimeError.  The spec leaves stack underflow undefined; here it
    produces a warning and the CTM is left as it was.
    """
    vector_ops = set('S s f F f* B B* b b*'.split())
    image_ops = set('BI ID EI q Q Do cm'.split())
    operator_whitelist = ' '.join(vector_ops | image_ops)

    ctm = PdfMatrix(initial_shorthand)
    saved_ctms = []
    xobject_settings = []
    inline_images = []
    found_vector = False

    instructions = _normalize_stack(
        pikepdf.parse_content_stream(contentstream, operator_whitelist)
    )
    for n, (operands, operator) in enumerate(instructions):
        if operator == 'q':
            saved_ctms.append(ctm)
            depth = len(saved_ctms)
            # Hard limit first, then the softer spec limit (see docstring).
            if depth > 128:
                raise RuntimeError(
                    "PDF graphics stack overflowed hard limit, operator %i" % n
                )
            if depth > 32:
                warn("PDF graphics stack overflowed spec limit")
        elif operator == 'Q':
            try:
                ctm = saved_ctms.pop()
            except IndexError:
                # Nothing to restore: leave the CTM alone and carry on.
                warn("PDF graphics stack underflowed - PDF may be malformed")
        elif operator == 'cm':
            ctm = PdfMatrix(operands) @ ctm
        elif operator == 'Do':
            xobject_settings.append(
                XobjectSettings(
                    name=operands[0],
                    shorthand=ctm.shorthand,
                    stack_depth=len(saved_ctms),
                )
            )
        elif operator == 'INLINE IMAGE':  # BI/ID/EI are grouped into this
            inline_images.append(
                InlineSettings(
                    iimage=operands[0],
                    shorthand=ctm.shorthand,
                    stack_depth=len(saved_ctms),
                )
            )
        elif operator in vector_ops:
            found_vector = True

    return ContentsInfo(
        xobject_settings=xobject_settings,
        inline_images=inline_images,
        found_vector=found_vector,
    )