def test_unparse_cs():
    """Unparsing a q / cm / Q sequence yields the expected byte stream."""
    ops = [
        ([], Operator('q')),
        ([*PdfMatrix.identity().shorthand], Operator('cm')),
        ([], Operator('Q')),
    ]
    assert unparse_content_stream(ops).strip() == b'q\n1 0 0 1 0 0 cm\n Q'
def strip_invisible_text(pdf, page):
    """Rewrite *page*'s content stream with invisible text objects removed.

    Text drawn with render mode 3 (Tr 3) is invisible; every BT..ET object
    whose final render mode is 3 is dropped, everything else is kept.
    """
    kept = []            # instructions that survive the filter
    buffered_text = []   # instructions of the BT..ET object being scanned
    inside_text = False
    render_mode = 0
    rich_page = Page(page)
    rich_page.contents_coalesce()
    for operands, operator in parse_content_stream(page, ''):
        if not inside_text:
            if operator == Operator('BT'):
                # Entering a text object: buffer until we know its mode.
                inside_text = True
                render_mode = 0
                buffered_text.append((operands, operator))
            else:
                kept.append((operands, operator))
            continue
        if operator == Operator('Tr'):
            render_mode = operands[0]
        buffered_text.append((operands, operator))
        if operator == Operator('ET'):
            inside_text = False
            # Mode 3 means invisible text: discard the whole object.
            if render_mode != 3:
                kept.extend(buffered_text)
            buffered_text.clear()
    page.Contents = Stream(pdf, unparse_content_stream(kept))
def test_inline_copy(inline):
    """Copying inline-image instructions still unparses to a BI block."""
    for instruction in parse_content_stream(inline.pages[0].Contents):
        if not isinstance(instruction, ContentStreamInlineImage):
            continue
        _copy_of_csiimage = ContentStreamInlineImage(instruction)  # noqa: F841
        rebuilt = ContentStreamInlineImage(instruction.iimage)
        assert unparse_content_stream([rebuilt]).startswith(b'BI')
def test_unparse_interpret_operator():
    """Operator, bytes and str spellings of 'cm' all unparse identically."""
    matrix = [2, 0, 0, 2, 0, 0]
    commands = [(matrix, op) for op in (Operator('cm'), b'cm', 'cm')]
    expected = b'\n'.join([b'2 0 0 2 0 0 cm'] * 3)
    assert unparse_content_stream(commands) == expected
def test_text_filter(resources, outdir):
    """Dropping every Tj instruction removes all extractable text."""
    input_pdf = resources / 'veraPDF test suite 6-2-10-t02-pass-a.pdf'

    # Sanity check: the input PDF really does contain extractable text.
    before = run(
        ['pdftotext', str(input_pdf), '-'],
        check=True, stdout=PIPE, encoding='utf-8',
    )
    assert before.stdout.strip() != '', "Need input test file that contains text"

    pdf = Pdf.open(input_pdf)
    page = pdf.pages[0]
    text_operators = """TJ Tj ' " BT ET Td TD Tm T* Tc Tw Tz TL Tf Tr Ts"""
    kept = []
    for operands, command in parse_content_stream(page, text_operators):
        if command == Operator('Tj'):
            print("skipping Tj")
        else:
            kept.append((operands, command))

    new_stream = Stream(pdf, pikepdf.unparse_content_stream(kept))
    print(new_stream.read_bytes())  # pylint: disable=no-member

    page['/Contents'] = new_stream
    page['/Rotate'] = 90
    pdf.save(outdir / 'notext.pdf', True)
    pdf.close()

    after = run(
        ['pdftotext', str(outdir / 'notext.pdf'), '-'],
        check=True, stdout=PIPE, encoding='utf-8',
    )
    assert after.stdout.strip() == '', "Expected text to be removed"
def test_rejects_inline_image_missing(self):
    """A bare string where a PdfInlineImage belongs raises PdfParsingError."""
    bad = [('should be a PdfInlineImage but is not', b'INLINE IMAGE')]
    with pytest.raises(PdfParsingError):
        unparse_content_stream(bad)
def test_rejects_not_operator(self):
    """A Name in the operator position is rejected — Name is not an operator."""
    instructions = [(['one', 'two'], Name.FortyTwo)]
    with pytest.raises(PdfParsingError, match="While unparsing"):
        unparse_content_stream(instructions)
def test_rejects_not_castable_to_object(self):
    """An int in the operator position is rejected — 42 is not an operator."""
    instructions = [(['one', 'two'], 42)]
    with pytest.raises(PdfParsingError, match="While unparsing"):
        unparse_content_stream(instructions)
def test_rejects_not_list_of_pairs(self):
    """A 3-tuple is not an (operands, operator) pair and must be rejected."""
    with pytest.raises(PdfParsingError):
        unparse_content_stream([(1, 2, 3)])
def test_unparse_invalid_inline_image():
    """Non-image operands with the INLINE IMAGE operator raise PdfParsingError."""
    bogus = [((42,), Operator(b'INLINE IMAGE'))]
    with pytest.raises(PdfParsingError):
        unparse_content_stream(bogus)
def synthesize_pdf(
    pdf_file,
    json_file,
    dst_dir,
    max_fonts,
    max_pages,
    num_outputs_per_document,
    synthesizer_class,
):
    """Generate synthesized PDF/JSON output pairs from a source PDF.

    Reads ground truth from ``json_file``, analyses ``pdf_file`` with
    pdfminer to collect its fonts and text, then uses ``synthesizer_class``
    to build ``num_outputs_per_document`` new content streams, saving each
    as ``<stem>-<i>.pdf`` plus a matching ``<stem>-<i>.json`` in ``dst_dir``.

    Raises:
        AlreadyProcessed: every requested output already exists on disk.
        TooManyPagesException: page count exceeds ``max_pages`` (if truthy).
        TooManyFontsException: font count exceeds ``max_fonts`` (if truthy).
        NoTextException: pdfminer extracted no non-whitespace text.
    """
    ground_truth = json.loads(json_file.read_text())
    pdf_io = BytesIO(pdf_file.read_bytes())
    # pdfminer pipeline: text extracted while interpreting pages lands in
    # output_string; fontmap is harvested per page below.
    output_string = StringIO()
    rsrcmgr = PDFResourceManager(caching=True)
    device = TextConverter(rsrcmgr, output_string, codec='utf-8', laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    interpreter_fonts = {}

    def _out_path(_i, suffix):
        # Output file name for the _i-th synthesized variant.
        return dst_dir / f'{json_file.stem}-{_i}{suffix}'

    # Only synthesize variants whose .pdf/.json pair is not already on disk.
    k_to_process = []
    for i in range(num_outputs_per_document):
        if not (_out_path(i, '.pdf').exists() and _out_path(i, '.json').exists()):
            k_to_process.append(i)
    if not k_to_process:
        raise AlreadyProcessed(f'Already processed {pdf_file} {json_file}')

    # First pass: validate the document and collect fonts/text via pdfminer.
    with pikepdf.Pdf.open(pdf_file) as pdf:
        if max_pages and len(pdf.pages) > max_pages:
            raise TooManyPagesException(
                f'Too many pages {len(pdf.pages)} > {max_pages} in PDF, skipping!'
            )
        # Walk pikepdf pages and pdfminer pages in lockstep; only the miner
        # page is interpreted here (the pikepdf page is unused in this loop).
        for page_number, (page, miner) in enumerate(
                zip(pdf.pages, PDFPage.get_pages(pdf_io))):
            interpreter.process_page(miner)
            interpreter_fonts.update(interpreter.fontmap)
            if max_fonts and len(interpreter_fonts) > max_fonts:
                raise TooManyFontsException(
                    f'Too many fonts {len(interpreter_fonts)} > {max_fonts} in PDF, skipping!'
                )
    # Reject documents with no extractable (non-whitespace) text at all.
    if not re.sub(f'[{re.escape(string.whitespace)}]', '', output_string.getvalue()):
        raise NoTextException('PDF does not have any text! Skipping')

    # Map pdfminer font keys to PDF name form ('/F1', ...) for the synthesizer.
    font_map = {f'/{k}': Font(f'/{k}', v) for k, v in interpreter_fonts.items()}
    synthesizer = synthesizer_class(ground_truth, font_map)

    # Second pass: build all replacement content streams first, then write
    # each variant's pages and save.
    with pikepdf.Pdf.open(pdf_file) as pdf:
        new_contents = collections.defaultdict(list)
        new_ground_truths = {}
        for i in k_to_process:
            for page_number, page in enumerate(pdf.pages):
                new_content_stream = parse_text(page, font_map, synthesizer)
                new_contents[i].append(
                    pdf.make_stream(
                        pikepdf.unparse_content_stream(new_content_stream)))
            new_ground_truths[i] = synthesizer.create_new_ground_truth()
            # Reset per-variant state so each output is independent.
            synthesizer.reset()
        for i in k_to_process:
            for page_number, page in enumerate(pdf.pages):
                page.Contents = new_contents[i][page_number]
            pdf.save(_out_path(i, '.pdf'))
            _out_path(i, '.json').write_text(
                json.dumps(new_ground_truths[i], indent=2))
def test_build_instructions():
    """ContentStreamInstruction repr shows its operator and it unparses."""
    instruction = ContentStreamInstruction([1, 0, 0, 1, 0, 0], Operator('cm'))
    assert 'cm' in repr(instruction)
    assert unparse_content_stream([instruction]) == b'1 0 0 1 0 0 cm'
def test_unparse_failure():
    """NaN operands cannot be serialized and raise PdfParsingError."""
    bad = [([float('nan')], Operator('cm'))]
    with pytest.raises(PdfParsingError):
        unparse_content_stream(bad)
def test_unparse_inline(resources):
    """Round-tripping a page with an inline image preserves the BI marker."""
    with Pdf.open(resources / 'image-mono-inline.pdf') as pdf:
        first_page = pdf.pages[0]
        parsed = parse_content_stream(first_page)
        assert b'BI' in unparse_content_stream(parsed)
def filter_content(self, content, layer=None):
    """Filter a content stream, keeping/dropping optional-content blocks.

    Args:
        content: a page or form xobject whose content stream is rewritten.
        layer: when not None, treat the whole stream as belonging to this
            layer and apply that layer's line-property modifications.

    Returns:
        The unparsed (bytes) filtered content stream.
    """
    # content can be either a page or an xobject
    if '/Resources' in content.keys():
        page_keep = self.find_page_keep(content.Resources)
    else:
        page_keep = {}

    commands = pikepdf.parse_content_stream(content)
    show_ops = [
        pikepdf.Operator(k) for k, v in pdf_ops.ops.items() if v[0] == 'show'
    ]
    stroke_ops = [
        pikepdf.Operator(k) for k, v in pdf_ops.ops.items()
        if v[0] == 'show' and v[1] == 'stroke'
    ]
    new_content = []
    in_oc = False
    currently_copying = self.keep_non_oc
    gs_mod = []  # stringified operands of 'gs' ops seen (dedup + warning)
    new_q = False
    if layer is not None:
        layer_mod, mod_applied = self.convert_layer_props(
            self.line_props[layer])
        in_oc = True
        currently_copying = True
    else:
        layer_mod = None
        mod_applied = None

    for operands, operator in commands:
        # check to see if this pdf has CMYK or RGB colour definitions
        if not self.colour_type:
            self.check_colour(operator, operands)

        # look for optional content
        if layer is None and operator == pikepdf.Operator('BDC'):
            # BDC/BMC doesn't necessarily mean optional content block
            # check the operands for the /OC flag
            if len(operands) > 1 and operands[0] == '/OC':
                in_oc = True
                if operands[1] in page_keep.keys():
                    currently_copying = True
                    # get a link to the current line property modifications requested
                    if page_keep[operands[1]] in self.line_props.keys():
                        layer_mod, mod_applied = self.convert_layer_props(
                            self.line_props[page_keep[operands[1]]])
                else:
                    currently_copying = False

        # all kinds of crazy stuff going on behind the scenes, so to select
        # layers we can't just delete everything.
        # Just copy the non-showing operations
        if currently_copying or operator not in show_ops:
            new_command = [operands, operator]
            if in_oc and layer_mod is not None:
                op_string = str(operator)
                # if we need to modify graphics state dictionaries, we need
                # to retrieve that from the resources.
                # BUGFIX: store the stringified operands so the membership
                # test actually deduplicates (previously the raw operands
                # were appended but str(operands) was tested, so every 'gs'
                # was appended again).
                if op_string == 'gs' and str(operands) not in gs_mod:
                    gs_mod.append(str(operands))
                # check for one of the line property modification operators
                if op_string in layer_mod.keys():
                    new_command[0] = layer_mod[op_string]
                    mod_applied[op_string] = True
                # check if we're drawing but haven't applied all mods yet
                if operator in stroke_ops and not all(mod_applied.values()):
                    needs_mod = [k for k, v in mod_applied.items() if not v]
                    for key in needs_mod:
                        new_content.append(
                            [layer_mod[key], pikepdf.Operator(key)])
                        mod_applied[key] = True
                if op_string == 'Q':
                    # reset the dictionary if we're in a new q/Q block
                    if all(mod_applied.values()):
                        mod_applied = {
                            key: False for key in mod_applied.keys()
                        }
            new_content.append(new_command)
            # q is the only command that needs to go after the current command
            if new_q:
                new_content.append([[], pikepdf.Operator('q')])
                new_q = False

        if in_oc and operator == pikepdf.Operator('EMC'):
            currently_copying = self.keep_non_oc
            in_oc = False
            layer_mod = None

    if len(gs_mod) > 0:
        print(
            'Found graphics state dictionary, layer modification may not work as expected'
        )
    return pikepdf.unparse_content_stream(new_content)
def test_accepts_all_lists(self):
    """Instructions given as plain lists (not tuples) are accepted."""
    instructions = [[[], b'Q']]
    unparse_content_stream(instructions)
def test_accepts_all_tuples(self):
    """Instructions given entirely as nested tuples are accepted."""
    instructions = (((Name.Foo,), b'/Do'),)
    unparse_content_stream(instructions)
def test_unparse_inline(inline):
    """Fast unparser keeps BI blocks and agrees with the slow reference."""
    first_page = inline.pages[0]
    parsed = parse_content_stream(first_page)
    result = unparse_content_stream(parsed)
    assert b'BI' in result
    assert result == slow_unparse_content_stream(parsed)