def strip_invisible_text(pdf, page):
    """Rewrite ``page.Contents`` without invisible (render mode 3) text.

    The page's content streams are coalesced and parsed into instructions.
    Instructions outside a text object are always kept.  Every ``BT`` ...
    ``ET`` text object is buffered as a whole and kept only when the last
    ``Tr`` operator seen inside it did not select render mode 3; the render
    mode is reset to 0 at each ``BT``.  The surviving instructions are
    serialized and installed as the page's new content stream.

    Args:
        pdf: Document in which the replacement stream is created.
        page: Page object whose ``Contents`` entry is replaced in place.
    """
    from decimal import Decimal  # local import: only needed for serialization

    begin_text = pikepdf.Operator('BT')
    end_text = pikepdf.Operator('ET')
    set_render_mode = pikepdf.Operator('Tr')
    inline_image = pikepdf.Operator('INLINE IMAGE')

    stream = []
    in_text_obj = False
    render_mode = 0
    text_objects = []

    rich_page = pikepdf.Page(page)
    rich_page.contents_coalesce()
    for operands, operator in pikepdf.parse_content_stream(page, ''):
        if not in_text_obj:
            if operator == begin_text:
                in_text_obj = True
                render_mode = 0
                text_objects.append((operands, operator))
            else:
                stream.append((operands, operator))
        else:
            if operator == set_render_mode:
                render_mode = operands[0]
            text_objects.append((operands, operator))
            if operator == end_text:
                in_text_obj = False
                if render_mode != 3:
                    stream.extend(text_objects)
                text_objects.clear()

    # A text object left open at the end of the stream (missing ET) would
    # otherwise vanish even when it is visible; keep it unless it is invisible.
    if in_text_obj and render_mode != 3:
        stream.extend(text_objects)

    def convert(op):
        """Serialize one operand to PDF syntax as ASCII bytes."""
        try:
            return op.unparse()
        except AttributeError:
            pass
        # Native Python values: str() is not always valid PDF syntax.
        if isinstance(op, bool):
            return b'true' if op else b'false'
        if isinstance(op, float):
            op = Decimal(repr(op))
        if isinstance(op, Decimal) and op.is_finite():
            # PDF real numbers may not use exponent notation (e.g. "1E-7"),
            # which str(Decimal) produces for small magnitudes.
            return format(op, 'f').encode('ascii')
        return str(op).encode('ascii')

    lines = []
    for operands, operator in stream:
        if operator == inline_image:
            # The inline image object serializes its own BI/ID/EI framing.
            line = operands[0].unparse()
        else:
            line = b' '.join(convert(op) for op in operands) + b' ' + operator.unparse()
        lines.append(line)

    content_stream = b'\n'.join(lines)
    page.Contents = pikepdf.Stream(pdf, content_stream)
def content_stream(self):
    """Return the recorded instructions with text operands rebuilt.

    Walks ``self.original_content_stream`` and produces a new list of
    ``(operands, operator)`` pairs:

    * ``Tj``: the operands are replaced by ``self._get_objects(operands)``.
    * ``TJ``: every operand accepted by ``_is_array_op`` is replaced by a
      ``pikepdf.Array`` built from ``self._get_objects(...)``; other operands
      are passed through unchanged.
    * anything else: the operands are shallow-copied.
    """
    rebuilt = []
    for operands, operator in self.original_content_stream:
        if operator == pikepdf.Operator('Tj'):
            replaced = self._get_objects(operands)
        elif operator == pikepdf.Operator('TJ'):
            replaced = [
                pikepdf.Array(self._get_objects(item)) if _is_array_op(item) else item
                for item in operands
            ]
        else:
            # Copy so callers cannot mutate the recorded original operands.
            replaced = operands[:]
        rebuilt.append((replaced, operator))
    return rebuilt
def strip_invisible_text(pdf, page):
    """Rewrite ``page.Contents`` without invisible (render mode 3) text.

    The page's content streams are coalesced and parsed into instructions.
    Instructions outside a text object are always kept.  Every ``BT`` ...
    ``ET`` text object is buffered as a whole and kept only when the last
    ``Tr`` operator seen inside it did not select render mode 3; the render
    mode is reset to 0 at each ``BT``.  The surviving instructions are
    serialized and installed as the page's new content stream.

    Args:
        pdf: Document in which the replacement stream is created.
        page: Page object whose ``Contents`` entry is replaced in place.
    """
    from decimal import Decimal  # local import: only needed for serialization

    begin_text = pikepdf.Operator("BT")
    end_text = pikepdf.Operator("ET")
    set_render_mode = pikepdf.Operator("Tr")
    inline_image = pikepdf.Operator("INLINE IMAGE")

    stream = []
    in_text_obj = False
    render_mode = 0
    text_objects = []

    page.page_contents_coalesce()
    for operands, operator in pikepdf.parse_content_stream(page, ""):
        if not in_text_obj:
            if operator == begin_text:
                in_text_obj = True
                render_mode = 0
                text_objects.append((operands, operator))
            else:
                stream.append((operands, operator))
        else:
            if operator == set_render_mode:
                render_mode = operands[0]
            text_objects.append((operands, operator))
            if operator == end_text:
                in_text_obj = False
                if render_mode != 3:
                    stream.extend(text_objects)
                text_objects.clear()

    # A text object left open at the end of the stream (missing ET) would
    # otherwise vanish even when it is visible; keep it unless it is invisible.
    if in_text_obj and render_mode != 3:
        stream.extend(text_objects)

    def convert(op):
        """Serialize one operand to PDF syntax as ASCII bytes."""
        try:
            return op.unparse()
        except AttributeError:
            pass
        # Native Python values: str() is not always valid PDF syntax.
        if isinstance(op, bool):
            return b"true" if op else b"false"
        if isinstance(op, float):
            op = Decimal(repr(op))
        if isinstance(op, Decimal) and op.is_finite():
            # PDF real numbers may not use exponent notation (e.g. "1E-7"),
            # which str(Decimal) produces for small magnitudes.
            return format(op, "f").encode("ascii")
        return str(op).encode("ascii")

    lines = []
    for operands, operator in stream:
        if operator == inline_image:
            # The inline image object serializes its own BI/ID/EI framing.
            line = operands[0].unparse()
        else:
            line = b" ".join(convert(op) for op in operands) + b" " + operator.unparse()
        lines.append(line)

    content_stream = b"\n".join(lines)
    page.Contents = pikepdf.Stream(pdf, content_stream)
def _parse_text_block(font_map, start, content_stream, current_font):
    """Consume one text object from ``content_stream`` into a ``TextBlock``.

    Starting from the ``start`` instruction, instructions are pulled from the
    shared iterator until an ``ET`` operator has been recorded.  Every
    instruction read (including ``start`` and the closing ``ET``) is appended
    to the block's ``original_content_stream``.  ``Tf`` updates the current
    font; string operands of ``Tj`` and strings nested in the array operands
    of ``TJ`` are registered on the block together with the font in effect.

    If the iterator is exhausted before ``ET`` is seen, the block collected so
    far is returned instead of letting ``StopIteration`` escape to the caller.

    Args:
        font_map: Passed through to ``_parse_font`` to resolve ``Tf`` operands.
        start: The ``(operands, operator)`` instruction that opened the block.
        content_stream: Iterator of the remaining instructions; it is advanced
            in place, so the caller resumes after the text object.
        current_font: Font in effect when the block starts.

    Returns:
        A ``(text_block, current_font)`` tuple, where ``current_font`` reflects
        any ``Tf`` seen inside the block.
    """
    end_text = pikepdf.Operator('ET')
    set_font = pikepdf.Operator('Tf')
    show_text = pikepdf.Operator('Tj')
    show_text_array = pikepdf.Operator('TJ')

    text_block = TextBlock()
    operands, operator = start
    text_block.original_content_stream.append(start)
    while operator != end_text:
        if operator == set_font:
            current_font = _parse_font(operands, font_map)
        elif operator == show_text:
            for tj in filter(_is_string_op, operands):
                text_block.add_text_object(tj, current_font)
        elif operator == show_text_array:
            for tja in filter(_is_array_op, operands):
                for tj in filter(_is_string_op, tja):
                    text_block.add_text_object(tj, current_font)

        instruction = next(content_stream, None)
        if instruction is None:
            # Unterminated text object: the stream ended without ET.
            break
        operands, operator = instruction
        text_block.original_content_stream.append((operands, operator))
    return text_block, current_font
def parse_text(qpdf_page: pikepdf.Page, font_map, synthesizer: PdfSynthesizer):
    """Build a new instruction list for a page with its text replaced.

    The page's content stream is parsed and walked once.  Each ``BT`` text
    object is collected by ``_parse_text_block``; every text it yields is
    passed through ``synthesizer.modify_text`` and written back with
    ``set_unicode_text``, and the block's regenerated instructions are added
    to the result.  All other instructions are copied unchanged.

    Args:
        qpdf_page: Page whose content stream is parsed.
        font_map: Passed to ``_parse_font`` to resolve ``Tf`` operands.
        synthesizer: Provides ``modify_text(text, font=...)``.

    Returns:
        A list of ``(operands, operator)`` instructions.

    Raises:
        HasFormException: A ``Do`` operator for which ``has_form`` is true.
        TooManySingleChars: More than 90% of the collected texts have
            length 1.
    """
    invoke_xobject = pikepdf.Operator('Do')
    set_font = pikepdf.Operator('Tf')
    begin_text = pikepdf.Operator('BT')

    content_stream = iter(pikepdf.parse_content_stream(qpdf_page))
    new_content_stream = []
    last_used_font = None
    text_lengths = collections.Counter()
    for operands, operator in content_stream:
        if operator == invoke_xobject and has_form(qpdf_page, operands):
            raise HasFormException
        if operator == set_font:
            last_used_font = _parse_font(operands, font_map)
        if operator == begin_text:
            # The helper advances the shared iterator past the text object.
            text_block, last_used_font = _parse_text_block(
                font_map=font_map,
                start=(operands, operator),
                content_stream=content_stream,
                current_font=last_used_font,
            )
            for text_id, text, font in text_block:
                text_lengths[len(text)] += 1
                modified_text = synthesizer.modify_text(text, font=font)
                text_block.set_unicode_text(text_id, modified_text)
            new_content_stream.extend(text_block.content_stream)
        else:
            new_content_stream.append((operands, operator))

    # A page without any text has nothing to measure; skip the ratio check
    # rather than dividing by zero.
    total_texts = sum(text_lengths.values())
    if total_texts:
        single_chars = text_lengths[1] / total_texts
        if single_chars > 0.9:
            raise TooManySingleChars(
                f'Too many single characters in document ({single_chars * 100:.2f}%)'
            )
    return new_content_stream
def test_operator_inline(resources):
    """Filtering on the inline-image operators yields one INLINE IMAGE instruction."""
    pdf_path = resources / 'image-mono-inline.pdf'
    with pikepdf.open(pdf_path) as pdf:
        first_page = pdf.pages[0]
        instructions = parse_content_stream(first_page, operators='BI ID EI')

        assert len(instructions) == 1
        # Unpacking also checks the instruction is an (operands, operator) pair.
        _operands, operator = instructions[0]
        assert operator == pikepdf.Operator("INLINE IMAGE")
def filter_content(self, content, layer=None):
    """Return a serialized content stream filtered by optional-content layer.

    ``content`` may be a page or an XObject.  Instructions are copied to the
    output when copying is currently enabled or when the operator is not a
    "show" operator according to ``pdf_ops.ops``.  Inside an optional-content
    block with line-property modifications, matching operators get their
    operands replaced, and any modification not yet emitted is inserted just
    before the next stroking operator.

    Args:
        content: Object to filter; its ``Resources`` (if present) are given to
            ``self.find_page_keep``.
        layer: When given, the whole stream is treated as belonging to that
            layer and ``BDC`` markers are not inspected.

    Returns:
        The result of ``pikepdf.unparse_content_stream`` on the kept
        instructions.
    """
    # A page or an XObject may or may not carry its own resources.
    has_resources = '/Resources' in content.keys()
    page_keep = self.find_page_keep(content.Resources) if has_resources else {}

    commands = pikepdf.parse_content_stream(content)

    # Classify operators once: everything that paints, and the stroking subset.
    show_ops = []
    stroke_ops = []
    for op_name, op_info in pdf_ops.ops.items():
        if op_info[0] != 'show':
            continue
        show_ops.append(pikepdf.Operator(op_name))
        if op_info[1] == 'stroke':
            stroke_ops.append(pikepdf.Operator(op_name))

    begin_marked = pikepdf.Operator('BDC')
    end_marked = pikepdf.Operator('EMC')

    new_content = []
    in_oc = False
    currently_copying = self.keep_non_oc
    gs_mod = []
    # NOTE(review): new_q is never set to True inside this function, so the
    # trailing 'q' insertion below looks unreachable - confirm before removing.
    new_q = False

    if layer is None:
        layer_mod = None
        mod_applied = None
    else:
        layer_mod, mod_applied = self.convert_layer_props(self.line_props[layer])
        in_oc = True
        currently_copying = True

    for operands, operator in commands:
        # Detect whether the document uses CMYK or RGB colour definitions.
        if not self.colour_type:
            self.check_colour(operator, operands)

        # A BDC only opens an optional-content block when tagged with /OC.
        if layer is None and operator == begin_marked:
            if len(operands) > 1 and operands[0] == '/OC':
                in_oc = True
                oc_name = operands[1]
                if oc_name in page_keep.keys():
                    currently_copying = True
                    # Pick up the line-property changes requested for it.
                    kept_layer = page_keep[oc_name]
                    if kept_layer in self.line_props.keys():
                        layer_mod, mod_applied = self.convert_layer_props(
                            self.line_props[kept_layer])
                else:
                    currently_copying = False

        # Non-showing operators are always copied so that state set up for
        # later content survives even when a layer is being dropped.
        if currently_copying or operator not in show_ops:
            new_command = [operands, operator]

            if in_oc and layer_mod is not None:
                op_string = str(operator)

                # Graphics state dictionaries live in the resources and are
                # not rewritten here; remember them for the warning below.
                if op_string == 'gs' and str(operands) not in gs_mod:
                    gs_mod.append(operands)

                # Override the operands of a line-property operator.
                if op_string in layer_mod.keys():
                    new_command[0] = layer_mod[op_string]
                    mod_applied[op_string] = True

                # About to stroke: emit any modification not yet applied.
                if operator in stroke_ops and not all(mod_applied.values()):
                    pending = [name for name, done in mod_applied.items() if not done]
                    for name in pending:
                        new_content.append([layer_mod[name], pikepdf.Operator(name)])
                        mod_applied[name] = True

                # Leaving a q/Q block: start tracking modifications afresh.
                if op_string == 'Q' and all(mod_applied.values()):
                    mod_applied = dict.fromkeys(mod_applied.keys(), False)

            new_content.append(new_command)

            # 'q' is the one command that must follow the current command.
            if new_q:
                new_content.append([[], pikepdf.Operator('q')])
                new_q = False

        if in_oc and operator == end_marked:
            currently_copying = self.keep_non_oc
            in_oc = False
            layer_mod = None

    if gs_mod:
        print(
            'Found graphics state dictionary, layer modification may not work as expected'
        )

    return pikepdf.unparse_content_stream(new_content)