Exemplo n.º 1
0
def strip_invisible_text(pdf, page):
    """Replace *page*'s contents with a copy lacking render-mode-3 text objects.

    The page's content streams are coalesced and parsed.  Instructions outside
    ``BT`` ... ``ET`` are always kept.  Instructions inside a text object are
    buffered and only kept if the render mode in effect when ``ET`` is reached
    (0 at ``BT``, otherwise the operand of the last ``Tr`` seen in that text
    object) is not 3.  A text object that never reaches ``ET`` is not written
    out at all.

    Args:
        pdf: Passed to ``pikepdf.Stream`` when creating the replacement
            content stream.
        page: The page whose ``Contents`` is parsed and then overwritten.
    """
    begin_text = pikepdf.Operator('BT')
    end_text = pikepdf.Operator('ET')
    set_render_mode = pikepdf.Operator('Tr')
    inline_image = pikepdf.Operator('INLINE IMAGE')

    kept = []
    pending = []  # instructions of the text object currently being read
    inside_text = False
    mode = 0

    pikepdf.Page(page).contents_coalesce()
    for operands, operator in pikepdf.parse_content_stream(page, ''):
        instruction = (operands, operator)

        if not inside_text:
            if operator == begin_text:
                inside_text = True
                mode = 0
                pending.append(instruction)
            else:
                kept.append(instruction)
            continue

        if operator == set_render_mode:
            mode = operands[0]
        pending.append(instruction)
        if operator == end_text:
            inside_text = False
            if mode != 3:
                kept.extend(pending)
            pending.clear()

    def to_bytes(operand):
        # Operands without an ``unparse`` method are serialised through str().
        try:
            return operand.unparse()
        except AttributeError:
            return str(operand).encode('ascii')

    def serialize(operands, operator):
        # Inline images carry their own complete serialisation.
        if operator == inline_image:
            return operands[0].unparse()
        return b' '.join(map(to_bytes, operands)) + b' ' + operator.unparse()

    data = b'\n'.join(
        serialize(operands, operator) for operands, operator in kept
    )
    page.Contents = pikepdf.Stream(pdf, data)
Exemplo n.º 2
0
    def content_stream(self):
        """Return a new instruction list derived from the original stream.

        ``Tj`` operands are replaced by ``self._get_objects(operands)``.  For
        ``TJ``, each operand accepted by ``_is_array_op`` is replaced by a
        ``pikepdf.Array`` built from ``self._get_objects`` and other operands
        are passed through.  Every other instruction gets a shallow copy of
        its operands.
        """
        show_text = pikepdf.Operator('Tj')
        show_text_array = pikepdf.Operator('TJ')

        rebuilt = []
        for operands, operator in self.original_content_stream:
            if operator == show_text:
                replaced = self._get_objects(operands)
            elif operator == show_text_array:
                replaced = [
                    pikepdf.Array(self._get_objects(item))
                    if _is_array_op(item)
                    else item
                    for item in operands
                ]
            else:
                replaced = operands[:]
            rebuilt.append((replaced, operator))
        return rebuilt
Exemplo n.º 3
0
def strip_invisible_text(pdf, page):
    """Rewrite *page*'s content stream, omitting text drawn in render mode 3.

    Everything outside ``BT`` ... ``ET`` is copied unchanged.  Each text
    object is buffered until its ``ET``; it is copied only when the render
    mode at that point (reset to 0 on ``BT``, updated by every ``Tr`` inside
    the object) differs from 3.  Buffered instructions of a text object that
    has no ``ET`` are never emitted.

    Args:
        pdf: Handed to ``pikepdf.Stream`` to build the new content stream.
        page: Page object that is parsed and whose ``Contents`` is replaced.
    """
    op_bt, op_et, op_tr = (
        pikepdf.Operator(name) for name in ("BT", "ET", "Tr")
    )

    retained = []
    buffered = []
    rendering_mode = 0
    within_text = False

    # NOTE(review): newer pikepdf releases appear to expose this as
    # Page.contents_coalesce() - confirm against the installed version.
    page.page_contents_coalesce()
    for operands, operator in pikepdf.parse_content_stream(page, ""):
        command = (operands, operator)
        if within_text:
            if operator == op_tr:
                rendering_mode = operands[0]
            buffered.append(command)
            if operator == op_et:
                within_text = False
                if rendering_mode != 3:
                    retained.extend(buffered)
                buffered.clear()
        elif operator == op_bt:
            within_text = True
            rendering_mode = 0
            buffered.append(command)
        else:
            retained.append(command)

    def as_bytes(operand):
        # Fall back to the str() form for operands lacking ``unparse``.
        try:
            return operand.unparse()
        except AttributeError:
            return str(operand).encode("ascii")

    op_inline_image = pikepdf.Operator("INLINE IMAGE")
    serialized = []
    for operands, operator in retained:
        if operator == op_inline_image:
            # The inline image object serialises itself in full.
            chunk = operands[0].unparse()
        else:
            joined = b" ".join(as_bytes(operand) for operand in operands)
            chunk = joined + b" " + operator.unparse()
        serialized.append(chunk)

    page.Contents = pikepdf.Stream(pdf, b"\n".join(serialized))
Exemplo n.º 4
0
def _parse_text_block(font_map, start, content_stream, current_font):
    """Consume instructions from *content_stream* up to and including ``ET``.

    Args:
        font_map: Forwarded to ``_parse_font`` when a ``Tf`` is encountered.
        start: The ``(operands, operator)`` pair that opens the block; it is
            recorded first and processed like any other instruction.
        content_stream: Iterator yielding ``(operands, operator)`` pairs; it
            is advanced with ``next()`` until ``ET`` has been read.
        current_font: Font in effect on entry, used for strings seen before
            any ``Tf`` in this block.

    Returns:
        A ``(text_block, current_font)`` tuple, where ``current_font`` is the
        font in effect after the block.

    Raises:
        StopIteration: If *content_stream* is exhausted before ``ET``.
    """
    end_text = pikepdf.Operator('ET')
    set_font = pikepdf.Operator('Tf')
    show_string = pikepdf.Operator('Tj')
    show_array = pikepdf.Operator('TJ')

    block = TextBlock()
    block.original_content_stream.append(start)
    operands, operator = start

    while operator != end_text:
        if operator == set_font:
            current_font = _parse_font(operands, font_map)
        elif operator == show_string:
            for candidate in operands:
                if _is_string_op(candidate):
                    block.add_text_object(candidate, current_font)
        elif operator == show_array:
            for array in operands:
                if not _is_array_op(array):
                    continue
                for candidate in array:
                    if _is_string_op(candidate):
                        block.add_text_object(candidate, current_font)

        operands, operator = next(content_stream)
        block.original_content_stream.append((operands, operator))

    return block, current_font
Exemplo n.º 5
0
def parse_text(qpdf_page: pikepdf.Page, font_map, synthesizer: PdfSynthesizer):
    """Build a new instruction list for *qpdf_page* with synthesized text.

    The page's content stream is parsed.  Every ``BT`` ... ``ET`` text block
    is read with ``_parse_text_block``; each text it yields is passed through
    ``synthesizer.modify_text`` and written back with ``set_unicode_text``,
    and the block's ``content_stream`` is appended to the result.  All other
    instructions are copied through unchanged.

    Args:
        qpdf_page: Page whose content stream is parsed.
        font_map: Forwarded to ``_parse_font`` / ``_parse_text_block``.
        synthesizer: Object providing ``modify_text(text, font=...)``.

    Returns:
        A list of ``(operands, operator)`` instructions.

    Raises:
        HasFormException: If ``has_form`` is true for a ``Do`` instruction.
        TooManySingleChars: If more than 90% of the texts found have
            length 1.
    """
    op_do = pikepdf.Operator('Do')
    op_tf = pikepdf.Operator('Tf')
    op_bt = pikepdf.Operator('BT')

    content_stream = iter(pikepdf.parse_content_stream(qpdf_page))
    new_content_stream = []
    last_used_font = None
    text_lengths = collections.Counter()

    for operands, operator in content_stream:
        if operator == op_do and has_form(qpdf_page, operands):
            raise HasFormException

        if operator == op_tf:
            last_used_font = _parse_font(operands, font_map)

        if operator == op_bt:
            # _parse_text_block advances the shared iterator past the
            # closing ET, so this loop resumes after the text block.
            text_block, last_used_font = _parse_text_block(
                font_map=font_map,
                start=(operands, operator),
                content_stream=content_stream,
                current_font=last_used_font,
            )
            for text_id, text, font in text_block:
                text_lengths[len(text)] += 1
                modified_text = synthesizer.modify_text(text, font=font)
                text_block.set_unicode_text(text_id, modified_text)
            new_content_stream.extend(text_block.content_stream)
        else:
            new_content_stream.append((operands, operator))

    # A page with no text at all has no ratio to check; without this guard
    # the division below raised ZeroDivisionError.
    total_texts = sum(text_lengths.values())
    if total_texts:
        single_chars = text_lengths[1] / total_texts
        if single_chars > 0.9:
            raise TooManySingleChars(
                f'Too many single characters in document ({single_chars * 100:.2f}%)'
            )

    return new_content_stream
Exemplo n.º 6
0
def test_operator_inline(resources):
    """Parsing with the ``BI ID EI`` filter yields one INLINE IMAGE instruction."""
    pdf_path = resources / 'image-mono-inline.pdf'
    with pikepdf.open(pdf_path) as pdf:
        first_page = pdf.pages[0]
        instructions = parse_content_stream(first_page, operators='BI ID EI')
        assert len(instructions) == 1
        _operands, operator = instructions[0]
        assert operator == pikepdf.Operator("INLINE IMAGE")
Exemplo n.º 7
0
    def filter_content(self, content, layer=None):
        """Filter, and optionally restyle, the content stream of ``content``.

        ``content`` is parsed with ``pikepdf.parse_content_stream`` and a new
        instruction list is built from it.  An instruction is copied when
        copying is currently enabled, or when its operator is not one of the
        'show' operators listed in ``pdf_ops.ops``.  Copying starts out as
        ``self.keep_non_oc``.  When ``layer`` is None, a ``BDC`` instruction
        whose first operand is ``/OC`` enables or disables copying depending
        on whether its second operand is a key of the mapping returned by
        ``self.find_page_keep``; the next ``EMC`` seen while inside such a
        block restores ``self.keep_non_oc``.

        While inside an optional-content block with layer modifications
        available, operands of operators named in the modification mapping
        are replaced, and modifications not yet applied are inserted ahead of
        the first stroking operator.

        Args:
            content: A page or an XObject.
            layer: Optional key into ``self.line_props``.  When given, ``BDC``
                detection is skipped and the content is treated as being
                inside an optional-content block with copying enabled from
                the start.

        Returns:
            The result of ``pikepdf.unparse_content_stream`` applied to the
            rebuilt instruction list.
        """
        # content can be either a page or an xobject
        if '/Resources' in content.keys():
            page_keep = self.find_page_keep(content.Resources)
        else:
            page_keep = {}

        commands = pikepdf.parse_content_stream(content)
        # operators that pdf_ops.ops classifies as 'show'
        show_ops = [
            pikepdf.Operator(k) for k, v in pdf_ops.ops.items()
            if v[0] == 'show'
        ]
        # the subset of 'show' operators whose second field is 'stroke'
        stroke_ops = [
            pikepdf.Operator(k) for k, v in pdf_ops.ops.items()
            if v[0] == 'show' and v[1] == 'stroke'
        ]
        new_content = []
        in_oc = False
        currently_copying = self.keep_non_oc
        gs_mod = []
        # NOTE(review): nothing in this method ever sets new_q to True, so
        # the 'q' insertion further down does not run as written.
        new_q = False

        if layer is not None:
            layer_mod, mod_applied = self.convert_layer_props(
                self.line_props[layer])
            in_oc = True
            currently_copying = True
        else:
            layer_mod = None
            mod_applied = None

        for operands, operator in commands:
            # check to see if this pdf has CMYK or RGB colour definitions
            if not self.colour_type:
                self.check_colour(operator, operands)

            # look for optional content
            if layer is None and operator == pikepdf.Operator('BDC'):
                # BDC/BMC doesn't necessarily mean optional content block
                # check the operands for the /OC flag
                if len(operands) > 1 and operands[0] == '/OC':
                    in_oc = True
                    if operands[1] in page_keep.keys():
                        currently_copying = True

                        # get a link to the current line property modifications requested
                        if page_keep[operands[1]] in self.line_props.keys():
                            layer_mod, mod_applied = self.convert_layer_props(
                                self.line_props[page_keep[operands[1]]])
                    else:
                        currently_copying = False

            # all kinds of crazy stuff going on behind the scenes, so to select layers we can't just delete everything.
            # Just copy the non-showing operations
            if currently_copying or operator not in show_ops:
                new_command = [operands, operator]

                if in_oc and layer_mod is not None:
                    op_string = str(operator)

                    # if we need to modify graphics state dictionaries, we need to retrieve that from the resources
                    # NOTE(review): the membership test compares str(operands)
                    # against the stored operands objects rather than strings -
                    # confirm this de-duplicates as intended.
                    if op_string == 'gs' and str(operands) not in gs_mod:
                        gs_mod.append(operands)

                    # check for one of the line property modification operators
                    if op_string in layer_mod.keys():
                        new_command[0] = layer_mod[op_string]
                        mod_applied[op_string] = True

                    # check if we're drawing but haven't applied all mods yet
                    if operator in stroke_ops and not all(
                            mod_applied.values()):
                        needs_mod = [
                            k for k, v in mod_applied.items() if not v
                        ]
                        # the missing modifications are emitted before the
                        # drawing command, which is appended further below
                        for key in needs_mod:
                            new_content.append(
                                [layer_mod[key],
                                 pikepdf.Operator(key)])
                            mod_applied[key] = True

                    if op_string == 'Q':
                        # reset the dictionary if we're in a new q/Q block
                        if all(mod_applied.values()):
                            mod_applied = {
                                key: False
                                for key in mod_applied.keys()
                            }

                new_content.append(new_command)

                # q is the only command that needs to go after the current command
                if new_q:
                    new_content.append([[], pikepdf.Operator('q')])
                    new_q = False

            # leaving the optional-content block: restore the default copy
            # behaviour and drop the active layer modifications
            if in_oc and operator == pikepdf.Operator('EMC'):
                currently_copying = self.keep_non_oc
                in_oc = False
                layer_mod = None

        # only a warning is printed; the collected gs operands are not used
        # for anything else in this method
        if len(gs_mod) > 0:
            print(
                'Found graphics state dictionary, layer modification may not work as expected'
            )

        return pikepdf.unparse_content_stream(new_content)