def test_creation_date_preserved(
    spoof_tesseract_noop, output_type, resources, infile, outpdf
):
    """OCR must preserve the input's CreationDate and set a recent ModDate."""
    input_file = resources / infile
    check_ocrmypdf(
        input_file, outpdf, '--output-type', output_type, env=spoof_tesseract_noop
    )

    pdf_before = pikepdf.open(input_file)
    pdf_after = pikepdf.open(outpdf)

    before = pdf_before.trailer.get('/Info', {})
    after = pdf_after.trailer.get('/Info', {})

    if not before:
        # Input had no DocumentInfo at all; the output must still gain a
        # creation date
        assert after.get('/CreationDate', '') != ''
    else:
        # We expect that the creation date stayed the same
        date_before = decode_pdf_date(str(before['/CreationDate']))
        date_after = decode_pdf_date(str(after['/CreationDate']))
        # Tolerance absorbs timezone normalization done during conversion
        assert seconds_between_dates(date_before, date_after) < 1000

    # We expect that the modified date is quite recent
    date_after = decode_pdf_date(str(after['/ModDate']))
    assert (
        seconds_between_dates(date_after, datetime.datetime.now(timezone.utc)) < 1000
    )
def test_override_metadata(spoof_tesseract_noop, output_type, resources, outpdf):
    """--title/--author override docinfo while CreationDate is preserved.

    Uses non-ASCII strings to exercise Unicode handling in metadata.
    """
    input_file = resources / 'c02-22.pdf'
    german = 'Du siehst den Wald vor lauter Bäumen nicht.'
    chinese = '孔子'

    p, out, err = run_ocrmypdf(
        input_file,
        outpdf,
        '--title',
        german,
        '--author',
        chinese,
        '--output-type',
        output_type,
        env=spoof_tesseract_noop,
    )

    assert p.returncode == ExitCode.ok, err

    before = pikepdf.open(input_file)
    after = pikepdf.open(outpdf)

    assert after.docinfo.Title == german, after.docinfo
    assert after.docinfo.Author == chinese, after.docinfo
    # No keywords were supplied, so none should appear in the output
    assert after.docinfo.get('/Keywords', '') == ''

    # Overriding title/author must not disturb the creation date
    before_date = decode_pdf_date(str(before.docinfo.CreationDate))
    after_date = decode_pdf_date(str(after.docinfo.CreationDate))
    assert before_date == after_date

    pdfa_info = file_claims_pdfa(outpdf)
    assert pdfa_info['output'] == output_type
def test_jbig2_lossy(lossy, resources, outpdf, spoof_tesseract_noop):
    """Lossy JBIG2 should produce /JBIG2Globals; lossless should not."""
    cmdline = [
        resources / 'ccitt.pdf',
        outpdf,
        '--image-dpi',
        '200',
        '--optimize',
        3,
        '--jpg-quality',
        '50',
        '--png-quality',
        '20',
    ]
    if lossy:
        cmdline += ['--jbig2-lossy']
    check_ocrmypdf(*cmdline, env=spoof_tesseract_noop)

    pdf = pikepdf.open(outpdf)
    first_image = next(iter(pdf.pages[0].images.values()))
    pim = pikepdf.PdfImage(first_image)
    assert pim.filters[0] == '/JBIG2Decode'
    if not lossy:
        assert len(pim.decode_parms) == 0
    else:
        assert '/JBIG2Globals' in pim.decode_parms[0]
def convert_to_pdfa(input_files_groups, output_file, log, context):
    """Convert the rendered layers PDF to PDF/A via Ghostscript.

    Locates 'layers.rendered.pdf' and the '.ps' pdfmark stub among the
    incoming file groups and passes both to ghostscript.generate_pdfa.
    """
    options = context.get_options()
    input_pdfinfo = context.get_pdfinfo()

    input_files = list(f for f in flatten_groups(input_files_groups))
    layers_file = next(
        (ii for ii in input_files if ii.endswith('layers.rendered.pdf')), None
    )

    # If the DocumentInfo record contains NUL characters, Ghostscript will
    # produce XMP metadata which contains invalid XML entities (&#0;).
    # NULs in DocumentInfo seem to be common since older Acrobats included them.
    # pikepdf can deal with this, but we make the world a better place by
    # stamping them out as soon as possible.
    with pikepdf.open(layers_file) as pdf_layers_file:
        if pdf_layers_file.docinfo:
            modified = False
            for k, v in pdf_layers_file.docinfo.items():
                if b'\x00' in bytes(v):
                    pdf_layers_file.docinfo[k] = bytes(v).replace(b'\x00', b'')
                    modified = True
            # Only rewrite the file if something was actually sanitized
            if modified:
                pdf_layers_file.save(layers_file)

    ps = next((ii for ii in input_files if ii.endswith('.ps')), None)

    ghostscript.generate_pdfa(
        pdf_version=input_pdfinfo.min_version,
        pdf_pages=[layers_file, ps],
        output_file=output_file,
        compression=options.pdfa_image_compression,
        log=log,
        threads=options.jobs or 1,
        pdfa_part=options.output_type[-1],  # is pdfa-1, pdfa-2, or pdfa-3
    )
def test_preserve_metadata(spoof_tesseract_noop, output_type, resources, outpdf):
    """Title and author must survive OCR, and the PDF/A claim must match."""
    source = resources / 'graph.pdf'
    pdf_before = pikepdf.open(source)

    output = check_ocrmypdf(
        source,
        outpdf,
        '--output-type',
        output_type,
        env=spoof_tesseract_noop,
    )
    pdf_after = pikepdf.open(output)

    for field in ('/Title', '/Author'):
        assert pdf_before.docinfo[field] == pdf_after.docinfo[field]

    pdfa_info = file_claims_pdfa(str(output))
    assert pdfa_info['output'] == output_type
def test_kodak_toc(resources, outpdf, spoof_tesseract_noop):
    """A PDF with an outline (table of contents) should survive OCR intact."""
    check_ocrmypdf(
        resources / 'kcs.pdf',
        outpdf,
        '--output-type',
        'pdf',
        env=spoof_tesseract_noop,
    )
    result = pikepdf.open(outpdf)
    outlines = result.root.Outlines
    if pikepdf.Name.First in outlines:
        # The outline's first entry must still be a proper dictionary
        assert isinstance(outlines.First, pikepdf.Dictionary)
def test_no_glyphless_weave(resources, outdir):
    """Weave three PDFs together and OCR with a low page-replacement limit."""
    # Keep every source Pdf object alive until save(): copied pages still
    # reference their originating file.
    main_pdf = pikepdf.open(resources / 'francais.pdf')
    aspect_pdf = pikepdf.open(resources / 'aspect.pdf')
    cmyk_pdf = pikepdf.open(resources / 'cmyk.pdf')
    main_pdf.pages.extend(aspect_pdf.pages)
    main_pdf.pages.extend(cmyk_pdf.pages)
    main_pdf.save(outdir / 'test.pdf')

    env = dict(os.environ)
    env['_OCRMYPDF_MAX_REPLACE_PAGES'] = '2'
    check_ocrmypdf(
        outdir / 'test.pdf',
        outdir / 'out.pdf',
        '--deskew',
        '--tesseract-timeout',
        '0',
        env=env,
    )
def metadata_fixup(input_files_groups, output_file, log, context):
    """Copy document metadata from the original PDF onto the working PDF.

    Transfers DocumentInfo/XMP from the '.repaired.pdf' original onto the
    PDF/A output (preferred) or the rendered-layers file, then saves the
    combined result to output_file.
    """
    options = context.get_options()

    input_files = list(f for f in flatten_groups(input_files_groups))
    original_file = next(
        (ii for ii in input_files if ii.endswith('.repaired.pdf')), None
    )
    layers_file = next(
        (ii for ii in input_files if ii.endswith('layers.rendered.pdf')), None
    )
    pdfa_file = next((ii for ii in input_files if ii.endswith('pdfa.pdf')), None)

    original = pikepdf.open(original_file)
    docinfo = get_docinfo(original, options)

    # Prefer the PDF/A conversion when it exists
    working_file = pdfa_file if pdfa_file else layers_file

    pdf = pikepdf.open(working_file)
    with pdf.open_metadata() as meta:
        meta.load_from_docinfo(docinfo, delete_missing=False)
        # If xmp:CreateDate is missing, set it to the modify date to
        # match Ghostscript, for consistency
        if 'xmp:CreateDate' not in meta:
            meta['xmp:CreateDate'] = meta.get('xmp:ModifyDate', '')
        if pdfa_file:
            # PDF/A restricts which XMP properties may appear; warn if some
            # original metadata could not be carried across
            meta_original = original.open_metadata()
            not_copied = set(meta_original.keys()) - set(meta.keys())
            if not_copied:
                log.warning(
                    "Some input metadata could not be copied because it is not "
                    "permitted in PDF/A. You may wish to examine the output "
                    "PDF's XMP metadata."
                )
                log.debug(
                    "The following metadata fields were not copied: %r", not_copied
                )

    pdf.save(
        output_file,
        compress_streams=True,
        object_stream_mode=pikepdf.ObjectStreamMode.generate,
    )
    original.close()
    pdf.close()
def test_links(resources, outpdf):
    """Cross-page link annotations must still target the correct pages."""
    check_ocrmypdf(
        resources / 'link.pdf',
        outpdf,
        '--redo-ocr',
        '--oversample',
        '200',
        '--output-type',
        'pdf',
    )
    pdf = pikepdf.open(outpdf)
    first, second = pdf.pages[0], pdf.pages[1]
    # Each page's first link annotation points at the other page
    assert first.Annots[0].A.D[0].objgen == second.objgen
    assert second.Annots[0].A.D[0].objgen == first.objgen
def unlock_PDf(source_folder, destination_folder, password):
    """Decrypt every PDF in source_folder into destination_folder.

    :param source_folder: directory containing password-protected PDFs
    :param destination_folder: directory that receives decrypted copies
    :param password: password used to open each PDF
    :return: number of files successfully unlocked
    """
    count = 0
    for item in os.scandir(source_folder):
        if ".pdf" in item.name:
            file_name = item.name
            try:
                # Context manager closes the Pdf promptly (the original
                # leaked every handle)
                with pikepdf.open(
                    os.path.join(source_folder, file_name), password=password
                ) as mypdf:  # open the locked pdf in source folder
                    # save the unlocked pdf in destination folder
                    mypdf.save(os.path.join(destination_folder, file_name))
            except pikepdf.PasswordError:
                print("The password failed to open the file!")
            else:
                # The original printed "unlocked" and incremented the count
                # even when the password failed; only do so on success.
                print("\t\"" + file_name + "\"" + " unlocked")
                count = count + 1
    return count
def export(input_files, pages, file_out, metadata):
    """Assemble the selected pages into a new PDF and write it to file_out.

    Each ``row`` in *pages* appears to encode: row[2] = 1-based source file
    index, row[3] = 1-based page number, row[6] = rotation angle to apply
    (inferred from the indexing below -- TODO confirm against caller).
    """
    pdf_output = pikepdf.Pdf.new()
    pdf_input = [pikepdf.open(p.copyname) for p in input_files]
    for row in pages:
        current_page = pdf_input[row[2] - 1].pages[row[3] - 1]
        angle = row[6]
        # Combine the requested rotation with any rotation already on the page
        angle0 = current_page.Rotate if '/Rotate' in current_page else 0
        if angle != 0:
            current_page.Rotate = angle + angle0
        cropped = _mediabox(row, angle, angle0, current_page.MediaBox)
        if cropped:
            current_page.MediaBox = cropped
        pdf_output.pages.append(current_page)
    with pdf_output.open_metadata() as outmeta:
        # Seed metadata from the first input's docinfo, then apply overrides
        outmeta.load_from_docinfo(pdf_input[0].docinfo)
        for k, v in metadata.items():
            outmeta[k] = v
    pdf_output.save(file_out)
def del_page(path):
    """Walk *path* and re-save any PDF whose name contains 'Unlock'.

    The copy is renamed to the portion of the filename after the first dot
    (e.g. 'Unlock.report.pdf' -> 'report.pdf').
    NOTE(review): assumes names shaped like 'Unlock.<name>.pdf' -- verify.
    """
    for root, dirs, files in os.walk(path):
        for file in files:
            if os.path.splitext(file)[1] != '.pdf':
                continue
            filepath = os.path.join(path, file)
            print(filepath)
            # pikepdf.open() takes no file mode; the original passed 'wb'
            # positionally, which raises TypeError (all options after the
            # filename are keyword-only).
            with pikepdf.open(filepath, allow_overwriting_input=True) as pdf:
                if 'Unlock' in str(file):
                    file = file.split('.')[1]
                    filepath = os.path.join(path, file + '.pdf')
                    pdf.save(filepath)
def split_pdf_to_page_blocks(
    src_pdf_fn: str,
    pages_per_block: int = 1,
    page_block_base_name: str = None,
) -> Generator[List[str], None, None]:
    """Split *src_pdf_fn* into temporary PDFs of *pages_per_block* pages.

    Yields one list with the generated file paths, then deletes the temp
    directory when the generator is resumed or closed -- consumers must
    finish with the files before advancing the generator. If the source has
    fewer pages than one block, the source path itself is yielded unchanged.
    """
    with pikepdf.open(src_pdf_fn) as pdf:
        if len(pdf.pages) < 1:
            yield []
            return
        if len(pdf.pages) < pages_per_block:
            # Nothing to split -- hand back the original file
            yield [src_pdf_fn]
            return
        if not page_block_base_name:
            page_block_base_name = os.path.basename(src_pdf_fn)
        temp_dir = mkdtemp()
        try:
            res: List[str] = list()
            page_start: int = 0
            out_pdf: Optional[pikepdf.Pdf] = None
            for n, page in enumerate(pdf.pages):
                if n % pages_per_block == 0:
                    # Flush the previous block before starting a new one
                    if out_pdf is not None:
                        out_fn = build_block_fn(
                            str(page_block_base_name), page_start, n - 1
                        )
                        out_pdf.save(os.path.join(temp_dir, out_fn))
                        out_pdf.close()
                        res.append(os.path.join(temp_dir, out_fn))
                    page_start = n
                    out_pdf = pikepdf.new()
                out_pdf.pages.append(page)
            # Flush the final (possibly short) block
            if out_pdf is not None and len(out_pdf.pages) > 0:
                out_fn = build_block_fn(str(page_block_base_name), page_start, n)
                out_pdf.save(os.path.join(temp_dir, out_fn))
                out_pdf.close()
                res.append(os.path.join(temp_dir, out_fn))
            yield res
        finally:
            shutil.rmtree(temp_dir)
def extract_columns(file: str, start: int, end: int, out_dir: str):
    """Detect and save column segments for pages start..end of *file*.

    Pages whose detected column count is not 5 or 10 are skipped with a
    warning.
    """
    start_time = time.time()
    with pikepdf.open(file) as pdf:
        for page_num in range(start, end + 1):
            page = get_page(pdf, page_num)
            segmenter = ColumnSegmenter(page)
            columns = segmenter.detect_columns()
            if len(columns) in [5, 10]:
                page_out_dir = os.path.join(out_dir, f"page-{page_num:04}")
                logging.info(
                    f"saving {len(columns)} columns for page {page_num} to {page_out_dir}"
                )
                save_segments(page, columns, page_out_dir)
            else:
                # Bug fix: the original nested logging.warning() inside
                # itself, so the outer call logged "None".
                logging.warning(
                    f"page {page_num} has {len(columns)} columns, not saving"
                )
    logging.info(f"{time.time() - start_time:.2f} seconds elapsed")
def decrypt_pdf(file_pdf):
    """Brute-force the password of *file_pdf* (alphanumeric, length 1-5).

    On success, saves the decrypted document as 'decrypted.pdf' and returns
    True; returns False if no password was found.
    """
    chars = string.ascii_letters + string.digits
    attempts = 0
    print("Searching for password!\nThis may take long time...")
    for plen in range(1, 6):
        for guess in itertools.product(chars, repeat=plen):
            attempts += 1
            guess = ''.join(guess)
            try:
                # Catch only PasswordError: the original bare `except:` also
                # swallowed KeyboardInterrupt and real I/O errors.
                with pikepdf.open(file_pdf, password=guess) as pdf:
                    # save opened pdf-file decrypted in new file
                    pdf.save('decrypted.pdf')
                print("[PDF BRUTE-FORCE]: found password! "
                      "password: {} with {} attempts".format(guess, attempts))
                return True
            except pikepdf.PasswordError:
                continue  # if open failed, continue with next password
    return False
def pdf_cracker(language, mode):
    """Dictionary attack on a PDF password; prompts in English or Italian.

    Returns the found password, or None if the word list is exhausted.
    """
    if language == "English":
        pdf_file = input(
            "Insert the name of the PDF file you want to crack --> ")
        password_file = input("Insert the name of the password list file --> ")
    else:
        pdf_file = input("Inserisci il nome del file PDF da crackare --> ")
        password_file = input(
            "Inserisci il nome del file di elenco delle password --> ")
    # Close the word list when done (the original leaked the handle)
    with open(password_file) as f:
        passwords = [line.strip() for line in f]
    for password in tqdm(passwords, "Decrypting PDF"):
        try:
            # Do not rebind pdf_file here: the original's `as pdf_file`
            # shadowed the filename, breaking every later iteration.
            with pikepdf.open(pdf_file, password=password):
                if language == "English":
                    print(f"[+] Password found: {password}")
                else:
                    print(f"[+] Password trovata: {password}")
                # Stop once found (the original kept trying passwords)
                return password
        except pikepdf.PasswordError:
            # Public alias of pikepdf._qpdf.PasswordError
            continue
    return None
def test_image_scale0(resources, outpdf):
    """An image drawn with an all-zero CTM must report 0 DPI, not crash."""
    with pikepdf.open(resources / 'cmyk.pdf') as cmyk:
        form_xobj = pikepdf.Page(cmyk.pages[0]).as_form_xobject()
        doc = pikepdf.Pdf.new()
        doc.add_blank_page(page_size=(72, 72))
        objname = pikepdf.Page(doc.pages[0]).add_resource(
            doc.copy_foreign(form_xobj), pikepdf.Name.XObject, pikepdf.Name.Im0
        )
        print(objname)
        # Zero scale matrix: the XObject is drawn with no extent at all
        doc.pages[0].Contents = pikepdf.Stream(
            doc, b"q 0 0 0 0 0 0 cm %s Do Q" % bytes(objname)
        )
        doc.save(outpdf)
    info = pdfinfo.PdfInfo(
        outpdf, detailed_analysis=True, progbar=False, max_workers=1
    )
    first_page = info.pages[0]
    assert not first_page._images[0].dpi.is_finite
    assert first_page.dpi == Resolution(0, 0)
def _find_font(text, pdf_base):
    """Copy a font from the filename text into pdf_base.

    Returns (font, font_key); both None when the text PDF is missing,
    unreadable, or has no recognizable font resource.
    """
    font, font_key = None, None
    # Font resource names produced by the supported text renderers
    possible_font_names = ('/f-0-0', '/F1')
    try:
        # Context manager closes the file (the original leaked the handle);
        # exception types match the sibling implementation of this helper.
        with pikepdf.open(text) as pdf_text:
            try:
                pdf_text_fonts = pdf_text.pages[0].Resources.get('/Font', {})
            except (AttributeError, IndexError, KeyError):
                return None, None
            for f in possible_font_names:
                pdf_text_font = pdf_text_fonts.get(f, None)
                if pdf_text_font is not None:
                    font_key = f
                    break
            if pdf_text_font:
                font = pdf_base.copy_foreign(pdf_text_font)
            return font, font_key
    except (FileNotFoundError, pikepdf.PdfError):
        # PdfError occurs if a 0-length file is written e.g. due to OCR timeout
        return None, None
def generate_booklet(pdfqueue, tmp_dir, pages):
    """Impose *pages* two-up in booklet order and save a temporary PDF.

    Pairs outer pages with inner pages (last/first, second/second-last, ...)
    and draws each pair side by side on a sheet twice the width. Returns the
    generated file's name.
    """
    file, filename = make_tmp_file(tmp_dir)
    content_dict = pikepdf.Dictionary({})
    file_indexes = {p.nfile for p in pages}
    source_files = {n: pikepdf.open(pdfqueue[n - 1].copyname) for n in file_indexes}
    for i in range(len(pages) // 2):
        # Alternate which side the outer page lands on so the folded stack
        # reads in order
        even = i % 2 == 0
        first = pages[-i - 1 if even else i]
        second = pages[i if even else -i - 1]

        second_page_size = second.size_in_points()
        first_page_size = first.size_in_points()
        # Sheet is wide enough for the two pages side by side
        page_size = [
            max(second_page_size[0], first_page_size[0]) * 2,
            max(second_page_size[1], first_page_size[1]),
        ]

        first_original = source_files[first.nfile].pages[first.npage - 1]
        first_foreign = file.copy_foreign(first_original)
        _update_angle(first, first_original, first_foreign)

        second_original = source_files[second.nfile].pages[second.npage - 1]
        second_foreign = file.copy_foreign(second_original)
        _update_angle(second, second_original, second_foreign)

        # Embed both pages as Form XObjects; the second is shifted right by
        # the first page's width
        content_dict[f'/Page{i*2}'] = pikepdf.Page(first_foreign).as_form_xobject()
        content_dict[f'/Page{i*2 + 1}'] = pikepdf.Page(second_foreign).as_form_xobject()
        content_txt = (
            f'q 1 0 0 1 0 0 cm /Page{i*2} Do Q'
            f' q 1 0 0 1 {first_page_size[0]} 0 cm /Page{i*2 + 1} Do Q '
        )

        newpage = pikepdf.Dictionary(
            Type=pikepdf.Name.Page,
            MediaBox=[0, 0, *page_size],
            Resources=pikepdf.Dictionary(XObject=content_dict),
            Contents=pikepdf.Stream(file, content_txt.encode()),
        )
        # workaround for pikepdf <= 2.6.0. See https://github.com/pikepdf/pikepdf/issues/174
        if pikepdf.__version__ < '2.7.0':
            newpage = file.make_indirect(newpage)
        file.pages.append(newpage)

    file.save(filename)
    return filename
def check_pdf(input_file: Path) -> bool:
    """Check if a PDF complies with the PDF specification.

    Checks for proper formatting and proper linearization.
    """
    pdf = None
    try:
        pdf = pikepdf.open(input_file)
    except pikepdf.PdfError as e:
        log.error(e)
        return False
    else:
        messages = pdf.check()
        for msg in messages:
            if 'error' in msg.lower():
                log.error(msg)
            else:
                log.warning(msg)

        sio = StringIO()
        linearize_msgs = ''
        try:
            # If linearization is missing entirely, we do not complain. We do
            # complain if linearization is present but incorrect.
            pdf.check_linearization(sio)
        except RuntimeError:
            pass
        except (
            # pikepdf 2.1.0 may raise the wrong exception type here; on any
            # other version this except clause matches nothing (NeverRaise)
            getattr(pikepdf, 'ForeignObjectError')
            if pikepdf.__version__ == '2.1.0'  # This version may throw wrong exception
            else NeverRaise
        ):
            pass
        else:
            linearize_msgs = sio.getvalue()
            if linearize_msgs:
                log.warning(linearize_msgs)

        # Clean only if both the structural check and linearization check
        # produced no messages
        if not messages and not linearize_msgs:
            return True
        return False
    finally:
        if pdf:
            pdf.close()
def convert_to_pdfa(input_pdf: Path, input_ps_stub: Path, context: PdfContext):
    """Convert input_pdf to PDF/A using the plugin's generate_pdfa hook.

    Returns the path of the generated PDF/A file.
    """
    options = context.options
    input_pdfinfo = context.pdfinfo
    fix_docinfo_file = context.get_path('fix_docinfo.pdf')
    output_file = context.get_path('pdfa.pdf')

    # If the DocumentInfo record contains NUL characters, Ghostscript will
    # produce XMP metadata which contains invalid XML entities (&#0;).
    # NULs in DocumentInfo seem to be common since older Acrobats included them.
    # pikepdf can deal with this, but we make the world a better place by
    # stamping them out as soon as possible.
    modified = False
    with pikepdf.open(input_pdf) as pdf_file:
        try:
            # A malformed docinfo (e.g. a stream instead of a dictionary)
            # raises TypeError here
            len(pdf_file.docinfo)
        except TypeError:
            log.error(
                "File contains a malformed DocumentInfo block - continuing anyway"
            )
        else:
            if pdf_file.docinfo:
                for k, v in pdf_file.docinfo.items():
                    if b'\x00' in bytes(v):
                        pdf_file.docinfo[k] = bytes(v).replace(b'\x00', b'')
                        modified = True
        if modified:
            pdf_file.save(fix_docinfo_file)
        else:
            # Nothing changed: link the input into place instead of copying
            safe_symlink(input_pdf, fix_docinfo_file)

    context.plugin_manager.hook.generate_pdfa(
        pdf_version=input_pdfinfo.min_version,
        pdf_pages=[fix_docinfo_file],
        pdfmark=input_ps_stub,
        output_file=output_file,
        compression=options.pdfa_image_compression,
        pdfa_part=options.output_type[-1],  # is pdfa-1, pdfa-2, or pdfa-3
        progressbar_class=(
            context.plugin_manager.hook.get_progressbar_class()
            if options.progress_bar
            else None
        ),
    )

    return output_file
def check_pdf(input_file: Path) -> bool:
    """Check if a PDF complies with the PDF specification.

    Checks for proper formatting and proper linearization. Uses pikepdf
    (which in turn, uses QPDF) to perform the checks.
    """
    try:
        pdf = pikepdf.open(input_file)
    except pikepdf.PdfError as e:
        log.error(e)
        return False
    else:
        with pdf:
            messages = pdf.check()
            for msg in messages:
                if 'error' in msg.lower():
                    log.error(msg)
                else:
                    log.warning(msg)

            sio = StringIO()
            linearize_msgs = ''
            try:
                # If linearization is missing entirely, we do not complain. We do
                # complain if linearization is present but incorrect.
                pdf.check_linearization(sio)
            except RuntimeError:
                pass
            except (
                # Workaround for a problematic pikepdf version  # pragma: no cover
                getattr(pikepdf, 'ForeignObjectError')
                if pikepdf.__version__ == '2.1.0'
                else NeverRaise
            ):
                pass
            else:
                linearize_msgs = sio.getvalue()
                if linearize_msgs:
                    log.warning(linearize_msgs)

            # Clean only if neither check produced any messages
            if not messages and not linearize_msgs:
                return True
            return False
def wait_for_file_ready(file_path):
    # Docker can deliver the watchdog event before the file has fully landed
    # on disk, which makes pikepdf choke on a partial file. Retry a few
    # times, sleeping between attempts, before giving up.
    for _attempt in range(5):
        try:
            pdf = pikepdf.open(file_path)
        except (FileNotFoundError, pikepdf.PdfError) as e:
            log.info(f"File {file_path} is not ready yet")
            log.debug("Exception was", exc_info=e)
            time.sleep(POLL_NEW_FILE_SECONDS)
        else:
            pdf.close()
            return True
    return False
def extractContent(content=""):
    """Extract the text of page 100 from every PDF under Resource/.

    Encrypted PDFs are first decrypted in place with pikepdf so that PyPDF
    can read them. Prints the collected text.
    """
    fileNames = []
    pdfFileText = []
    for files in glob.glob("Resource/*.pdf"):
        fileNames.append(files)
    for name in fileNames:
        pdfFileReader = pyPDF.PdfFileReader(name)
        if pdfFileReader.isEncrypted:
            # Bug fix: the original never saved the decrypted copy, so the
            # PyPDF re-read still saw an encrypted file.
            with pikepdf.open(name, allow_overwriting_input=True) as pdfFile:
                pdfFile.save(name)
            print("%s decrypted!" % name)
            pdfFileReader = pyPDF.PdfFileReader(name)
        pdfText = pdfFileReader.getPage(100)
        pdfText = pdfText.extractText()
        pdfFileText.append(pdfText)
    print(pdfFileText)
def test_pdfa(resources, outpdf, optimize, pdfa_level):
    """Verify ObjStm usage per PDF/A level and the claimed conformance."""
    check_ocrmypdf(
        resources / 'francais.pdf',
        outpdf,
        '--plugin',
        'tests/plugins/tesseract_noop.py',
        f'--output-type=pdfa-{pdfa_level}',
        f'--optimize={optimize}',
    )
    raw = outpdf.read_bytes()
    if pdfa_level == 1:
        # PDF/A-1 might allow ObjStm, but Acrobat does not approve it, so
        # we don't use it
        assert b'/ObjStm' not in raw
    elif pdfa_level in (2, 3):
        # PDF/A-2 allows ObjStm
        assert b'/ObjStm' in raw
    with pikepdf.open(outpdf) as pdf:
        with pdf.open_metadata() as m:
            assert m.pdfa_status == f'{pdfa_level}B'
def _pdf_get_all_pageinfo(infile, detailed_page_analysis, log=None):
    """Open *infile* and build a PageInfo for every page.

    Returns (pages, pdf); the caller owns the open pikepdf.Pdf.
    """
    if not log:
        log = Mock()  # null logger that silently accepts any call
    pdf = pikepdf.open(infile)
    if not detailed_page_analysis:
        pages_xml = None  # skip the costly per-page text extraction
    else:
        pages_xml = ghosttext.extract_text_xml(infile, pdf, pageno=None, log=log)

    pages = []
    for n in range(len(pdf.pages)):
        page_xml = pages_xml[n] if pages_xml else None
        page = PageInfo(pdf, n, infile, page_xml)
        pages.append(page)

    return pages, pdf
def brute_force_pdf(plock_file, password_length, asci):
    """
    function uses brute-force techniques to gain access to password protected pdf files
    :param plock_file: file with password protection
    :param password_length: possible length of password
    :param asci: string combination of ascii values
    :return: the recovered password, or None if no candidate matched
    """
    # iterates through all asci values and generates a progress bar showing completion process
    for i in tqdm(range(1, (password_length + 1))):
        for letter in itertools.product(asci, repeat=i):
            password = ''.join(letter)
            # uses pikepdf to try and open password protected pdf file using brute force method
            try:
                with pikepdf.open(plock_file, password=password):
                    print(f"Password found: {password}")
                    # Bug fix: `break` only exited the inner loop, so the
                    # search continued with longer passwords. Return instead.
                    return password
            except pikepdf.PasswordError:
                # public alias of pikepdf._qpdf.PasswordError; if password
                # fail continue
                continue
    return None
def make_rotate_test(prefix, image_angle, page_angle):
    """Build a one-page PDF of typewriter.png with image and page rotation."""
    im = Image.open(fspath(resources / 'typewriter.png'))
    if image_angle != 0:
        # PIL transposes counterclockwise; convert from clockwise degrees
        ccw_angle = -image_angle % 360
        im = im.transpose(getattr(Image, f'ROTATE_{ccw_angle}'))

    image_buffer = BytesIO()
    im.save(image_buffer, format='PNG')
    image_buffer.seek(0)

    pdf_buffer = BytesIO()
    img2pdf.convert(
        image_buffer.read(),
        layout_fun=img2pdf.get_fixed_dpi_layout_fun((200, 200)),
        outputstream=pdf_buffer,
    )
    pdf_buffer.seek(0)

    pike = pikepdf.open(pdf_buffer)
    pike.pages[0].Rotate = page_angle

    target = outdir / f'{prefix}_{image_angle}_{page_angle}.pdf'
    pike.save(target)
    return target
def make_rotate_test(prefix, image_angle, page_angle):
    """Create a one-page PDF of typewriter.png rotated at image/page level."""
    source = Image.open(fspath(resources / 'typewriter.png'))
    if image_angle != 0:
        # PIL rotates counterclockwise, so negate the clockwise angle
        ccw_angle = -image_angle % 360
        source = source.transpose(getattr(Image, 'ROTATE_{}'.format(ccw_angle)))

    png_stream = BytesIO()
    source.save(png_stream, format='PNG')
    png_stream.seek(0)

    pdf_stream = BytesIO()
    img2pdf.convert(
        png_stream.read(),
        layout_fun=img2pdf.get_fixed_dpi_layout_fun((200, 200)),
        outputstream=pdf_stream,
    )
    pdf_stream.seek(0)

    pike = pikepdf.open(pdf_stream)
    pike.pages[0].Rotate = page_angle

    target = outdir / '{}_{}_{}.pdf'.format(prefix, image_angle, page_angle)
    pike.save(target)
    return target
def _pdf_get_all_pageinfo(infile, detailed_analysis=False, log=None):
    """Open *infile* and build a PageInfo for every page.

    Raises EncryptedPdfError for encrypted input. Returns (pages, pdf);
    the caller is responsible for closing the returned pdf.
    """
    if not log:
        log = Mock()
    pdf = pikepdf.open(infile)  # Do not close in this function
    if pdf.is_encrypted:
        pdf.close()
        raise EncryptedPdfError()  # Triggered by encryption with empty passwd
    if detailed_analysis:
        # NOTE(review): looks inverted at first glance, but detailed_analysis
        # is forwarded to PageInfo below, which presumably gathers its own
        # text data -- the Ghostscript XML is only needed otherwise. Confirm.
        pages_xml = None
    else:
        pages_xml = ghosttext.extract_text_xml(infile, pdf, pageno=None, log=log)

    pages = []
    for n in range(len(pdf.pages)):
        page_xml = pages_xml[n] if pages_xml else None
        page = PageInfo(pdf, n, infile, page_xml, detailed_analysis)
        pages.append(page)

    return pages, pdf
def test_malformed_docinfo(caplog, resources, outdir):
    """convert_to_pdfa must tolerate an /Info entry that is not a dictionary."""
    generate_pdfa_ps(outdir / 'pdfa.ps')
    # Construct an input whose trailer /Info is (bogusly) a stream
    with pikepdf.open(resources / 'trivial.pdf') as pike:
        pike.trailer.Info = pikepdf.Stream(pike, b"<xml></xml>")
        pike.save(outdir / 'layers.rendered.pdf', fix_metadata_version=False)

    options = parser.parse_args(
        args=['-j', '1', '--output-type', 'pdfa-2', 'a.pdf', 'b.pdf']
    )
    pdfinfo = PdfInfo(outdir / 'layers.rendered.pdf')
    context = PDFContext(options, outdir, outdir / 'layers.rendered.pdf', pdfinfo)

    convert_to_pdfa(
        str(outdir / 'layers.rendered.pdf'), str(outdir / 'pdfa.ps'), context
    )

    print(caplog.records)
    assert any(
        'malformed DocumentInfo block' in record.message
        for record in caplog.records
    )
def test_simulated_scan(outdir):
    """Render a fake skewed scan and verify page orientation is corrected."""
    canvas = Canvas(fspath(outdir / 'fakescan.pdf'), pagesize=(209.8, 297.6))
    # One (angle, x, y) tuple per generated page
    page_vars = [(2, 36, 250), (91, 170, 240), (179, 190, 36), (271, 36, 36)]
    for n, (angle, x, y) in enumerate(page_vars):
        text = canvas.beginText()
        text.setFont('Helvetica', 20)
        cos_a = cos(angle / 180.0 * pi)
        sin_a = sin(angle / 180.0 * pi)
        text.setTextTransform(cos_a, -sin_a, sin_a, cos_a, x, y)
        text.textOut(f'Page {n + 1}')
        canvas.drawText(text)
        canvas.showPage()
    canvas.save()

    check_ocrmypdf(
        outdir / 'fakescan.pdf',
        outdir / 'out.pdf',
        '--force-ocr',
        '--deskew',
        '--rotate-pages',
        '--plugin',
        'tests/plugins/tesseract_debug_rotate.py',
    )

    with pikepdf.open(outdir / 'out.pdf') as pdf:
        boxes = [page.MediaBox for page in pdf.pages]
        assert boxes[1][2] > boxes[1][3], "Wrong orientation: not landscape"
        assert boxes[3][2] > boxes[3][3], "Wrong orientation: Not landscape"
        assert boxes[0][2] < boxes[0][3], "Wrong orientation: Not portrait"
        assert boxes[2][2] < boxes[2][3], "Wrong orientation: Not portrait"
def _page_extract_cb(self, filename):
    """Extract the images of each page of *filename* into self.pages_dir.

    Output files are named '<pdf stem>_NNN' under a pages directory created
    beside the source file.
    """
    destination = os.path.join(os.path.split(filename)[0], self.pages_dir)
    if not os.path.exists(destination):
        os.mkdir(destination)
    prefix = os.path.splitext(os.path.basename(filename))[0]
    try:
        pdfobject = pikepdf.open(filename)
        num_pages = len(pdfobject.pages)
        count = 0
        while count < num_pages:
            pageObj = pdfobject.pages[count]
            count += 1
            decodeImage(
                pageObj,
                os.path.join(destination, prefix + '_%03d' % count))
        # Removed: unused `text` accumulator and dead commented-out
        # extractText() call from the original.
    except Exception as ex:
        # Broad catch is deliberate: this is a per-file callback boundary
        # and any failure is reported rather than aborting the batch
        log(ERROR, 'Cannot extract %s due to %s' % (filename, str(ex)))
def test_flate_to_jbig2(resources, outdir, spoof_tesseract_noop):
    # This test requires an image that pngquant is capable of converting to
    # to 1bpp - so use an existing 1bpp image, convert up, confirm it can
    # convert down
    source = Image.open(fspath(resources / 'typewriter.png'))
    assert source.mode in ('1', 'P')
    source.convert('L').save(fspath(outdir / 'type8.png'))

    check_ocrmypdf(
        outdir / 'type8.png',
        outdir / 'out.pdf',
        '--image-dpi',
        '100',
        '--png-quality',
        '10',
        '--optimize',
        '3',
        env=spoof_tesseract_noop,
    )

    result = pikepdf.open(outdir / 'out.pdf')
    first_image = next(iter(result.pages[0].images.values()))
    assert pikepdf.PdfImage(first_image).filters[0] == '/JBIG2Decode'
def _find_font(text, pdf_base):
    """Copy a font from the filename text into pdf_base"""
    font, font_key = None, None
    # Font resource names produced by the supported text renderers
    possible_font_names = ('/f-0-0', '/F1')
    try:
        with pikepdf.open(text) as pdf_text:
            try:
                pdf_text_fonts = pdf_text.pages[0].Resources.get('/Font', {})
            except (AttributeError, IndexError, KeyError):
                # No pages, or no Resources dictionary: nothing to copy
                return None, None
            for f in possible_font_names:
                pdf_text_font = pdf_text_fonts.get(f, None)
                if pdf_text_font is not None:
                    font_key = f
                    break
            if pdf_text_font:
                font = pdf_base.copy_foreign(pdf_text_font)
            return font, font_key
    except (FileNotFoundError, pikepdf.PdfError):
        # PdfError occurs if a 0-length file is written e.g. due to OCR timeout
        return None, None
def _transcode_png(pike: Pdf, filename: Path, xref: Xref) -> bool:
    """Replace image *xref* in *pike* with the PNG in *filename*.

    Converts the PNG to a one-image PDF with img2pdf, then rewrites the
    existing image object's stream and keys from it. Returns True.
    """
    output = filename.with_suffix('.png.pdf')
    with output.open('wb') as f:
        img2pdf.convert(fspath(filename), outputstream=f)

    with pikepdf.open(output) as pdf_image:
        # Bug fix: .values() returns a view, not an iterator; calling
        # next() on it raises TypeError. Wrap in iter() first (matches the
        # next(iter(...)) pattern used elsewhere in this file).
        foreign_image = next(iter(pdf_image.pages[0].images.values()))
        local_image = pike.copy_foreign(foreign_image)

        im_obj = pike.get_object(xref, 0)
        im_obj.write(
            local_image.read_raw_bytes(),
            filter=local_image.Filter,
            decode_parms=local_image.DecodeParms,
        )

        # Don't copy keys from the new image...
        del_keys = set(im_obj.keys()) - set(local_image.keys())
        # ...except for the keep_fields, which are essential to displaying
        # the image correctly and preserving its metadata. (/Decode arrays
        # and /SMaskInData are implicitly discarded prior to this point.)
        keep_fields = {
            '/ID',
            '/Intent',
            '/Interpolate',
            '/Mask',
            '/Metadata',
            '/OC',
            '/OPI',
            '/SMask',
            '/StructParent',
        }
        del_keys -= keep_fields
        for key in local_image.keys():
            if key != Name.Length and str(key) not in keep_fields:
                im_obj[key] = local_image[key]
        for key in del_keys:
            del im_obj[key]
    return True
def test_metadata_fixup_warning(resources, outdir, caplog):
    """metadata_fixup should warn only when XMP metadata cannot be copied."""
    options = get_parser().parse_args(
        args=['--output-type', 'pdfa-2', 'graph.pdf', 'out.pdf']
    )

    copyfile(resources / 'graph.pdf', outdir / 'graph.pdf')

    context = PdfContext(
        options, outdir, outdir / 'graph.pdf', None, get_plugin_manager([])
    )
    metadata_fixup(working_file=outdir / 'graph.pdf', context=context)
    # Clean input: no warnings are expected
    for record in caplog.records:
        assert record.levelname != 'WARNING'

    # Now add some metadata that will not be copyable
    graph = pikepdf.open(outdir / 'graph.pdf')
    with graph.open_metadata() as meta:
        # prism2 properties are not permitted in PDF/A XMP
        meta['prism2:publicationName'] = 'OCRmyPDF Test'
    graph.save(outdir / 'graph_mod.pdf')

    context = PdfContext(
        options, outdir, outdir / 'graph_mod.pdf', None, get_plugin_manager([])
    )
    metadata_fixup(working_file=outdir / 'graph.pdf', context=context)
    assert any(record.levelname == 'WARNING' for record in caplog.records)
def _find_font(self, text):
    """Copy a font from the filename text into pdf_base"""
    font, font_key = None, None
    # Font resource names produced by the supported text renderers
    possible_font_names = ('/f-0-0', '/F1')
    try:
        with pikepdf.open(text) as pdf_text:
            try:
                pdf_text_fonts = pdf_text.pages[0].Resources.get('/Font', {})
            except (AttributeError, IndexError, KeyError):
                # No pages, or no Resources dictionary: nothing to copy
                return None, None
            for f in possible_font_names:
                pdf_text_font = pdf_text_fonts.get(f, None)
                if pdf_text_font is not None:
                    font_key = f
                    break
            if pdf_text_font:
                font = self.pdf_base.copy_foreign(pdf_text_font)
            return font, font_key
    except (FileNotFoundError, pikepdf.PdfError):
        # PdfError occurs if a 0-length file is written e.g. due to OCR timeout
        return None, None
def set_all_bookmark_zooms(in_file, out_file=None, zoom_factor=None, only_bookmarks=False):
    """Apply *zoom_factor* to every bookmark destination in a PDF.

    Unless only_bookmarks is True, the document's named destinations are
    updated as well. Saves to *out_file*, or to '<stem>-zoomed.pdf' derived
    from the input name.
    """
    def _apply_set_all_bookmark_zoom(names, depth, children):
        # Depth-first walk over the outline tree, fixing each bookmark
        for child in children:
            set_zoom_factor(names, depth, child, zoom_factor, only_bookmarks)
            _apply_set_all_bookmark_zoom(names, depth + 1, child.children)

    pdf = pikepdf.open(in_file)
    if not only_bookmarks:
        # Also rewrite the document's named destinations
        names = get_names(pdf.root)
        for key, values in names.items():
            index, array = values
            array[index] = update_dest(zoom_factor, array[index])
    with pdf.open_outline() as outline:
        _apply_set_all_bookmark_zoom(pdf.root, 0, outline.root)
    save_name = out_file if out_file else in_file.split(".")[0] + "-zoomed.pdf"
    print("saving %s ..." % save_name)
    pdf.save(save_name)
def strip(file, password):
    # Removes the password of pdf using pikepdf module.
    # Saves the result as '<name>_decrypted.pdf'.
    try:
        name_pdf = file.replace('.pdf', '_decrypted.pdf')
        print('%s Working on %s file..' % (WORKING, file))
        # Bug fix: pikepdf.open()'s password parameter is keyword-only;
        # passing it positionally raises TypeError before any decryption.
        with pikepdf.open(file, password=password) as pdf_clean:
            pdf_clean.save(name_pdf)
        # The with-block closes the Pdf; no explicit close() needed
        print('%s Successfully removed password. Saved as %s' % (SUCCESS, name_pdf))
    except pikepdf.PasswordError:
        # Public alias of pikepdf._qpdf.PasswordError
        print('%s Unable to remove protection. Invalid password.' % (FAILED))
    except FileNotFoundError:
        print('%s %s not found.' % (FAILED, file))
def _transcode_png(pike: Pdf, filename: Path, xref: Xref) -> bool:
    """Replace image *xref* in *pike* with the PNG in *filename*.

    Converts the PNG to a one-image PDF with img2pdf, then rewrites the
    existing image object's stream and keys from it. Returns True.
    """
    output = filename.with_suffix('.png.pdf')
    with output.open('wb') as f:
        img2pdf.convert(fspath(filename), outputstream=f)

    with pikepdf.open(output) as pdf_image:
        # Bug fix: .values() returns a view, not an iterator; next() on it
        # raises TypeError. Wrap in iter() as the sibling version does.
        foreign_image = next(iter(pdf_image.pages[0].images.values()))
        local_image = pike.copy_foreign(foreign_image)

        im_obj = pike.get_object(xref, 0)
        im_obj.write(
            local_image.read_raw_bytes(),
            filter=local_image.Filter,
            decode_parms=local_image.DecodeParms,
        )

        # Drop keys the replacement image does not carry
        del_keys = set(im_obj.keys()) - set(local_image.keys())
        for key in local_image.keys():
            if key != Name.Length:
                im_obj[key] = local_image[key]
        for key in del_keys:
            del im_obj[key]
    # The signature promises bool; the original fell off the end and
    # returned None (the sibling version returns True on success).
    return True
def test_metadata_fixup_warning(resources, outdir, caplog):
    """metadata_fixup should warn only when XMP metadata cannot be copied."""
    from ocrmypdf._pipeline import metadata_fixup

    input_files = [
        str(outdir / 'graph.repaired.pdf'),
        str(outdir / 'layers.rendered.pdf'),
        str(outdir / 'pdfa.pdf'),  # It is okay that this is not a PDF/A
    ]
    for f in input_files:
        copyfile(resources / 'graph.pdf', f)

    log = logging.getLogger()
    context = MagicMock()
    metadata_fixup(
        input_files_groups=input_files,
        output_file=outdir / 'out.pdf',
        log=log,
        context=context,
    )
    # Clean input: no warnings are expected
    for record in caplog.records:
        assert record.levelname != 'WARNING'

    # Now add some metadata that will not be copyable
    graph = pikepdf.open(outdir / 'graph.repaired.pdf')
    with graph.open_metadata() as meta:
        # prism2 properties are not permitted in PDF/A XMP
        meta['prism2:publicationName'] = 'OCRmyPDF Test'
    graph.save(outdir / 'graph.repaired.pdf')

    log = logging.getLogger()
    context = MagicMock()
    metadata_fixup(
        input_files_groups=input_files,
        output_file=outdir / 'out.pdf',
        log=log,
        context=context,
    )
    assert any(record.levelname == 'WARNING' for record in caplog.records)
def test_flate_to_jbig2(resources, outdir, spoof_tesseract_noop):
    # This test requires an image that pngquant is capable of converting to
    # 1bpp - so use an existing 1bpp image, convert up, and confirm the
    # optimizer can convert it back down to JBIG2.
    source = Image.open(fspath(resources / 'typewriter.png'))
    assert source.mode in ('1', 'P')
    source.convert('L').save(fspath(outdir / 'type8.png'))

    check_ocrmypdf(
        outdir / 'type8.png',
        outdir / 'out.pdf',
        '--image-dpi', '100',
        '--png-quality', '50',
        '--optimize', '3',
        env=spoof_tesseract_noop,
    )

    pdf = pikepdf.open(outdir / 'out.pdf')
    first_image = next(iter(pdf.pages[0].images.values()))
    assert pikepdf.PdfImage(first_image).filters[0] == '/JBIG2Decode'
def _weave_layers_graft(
    *, pdf_base, page_num, text, font, font_key, procset, rotation, strip_old_text, log
):
    """Insert the text layer from text page 0 on to pdf_base at page_num.

    The text content stream is wrapped in a transformation matrix so that it
    lands rotated, scaled, and centered to match the base page's content,
    then prepended to the base page's content streams.
    """
    log.debug("Grafting")
    # An empty text file means no OCR text for this page: nothing to graft.
    if Path(text).stat().st_size == 0:
        return

    # This is a pointer indicating a specific page in the base file
    pdf_text = pikepdf.open(text)
    pdf_text_contents = pdf_text.pages[0].Contents.read_bytes()

    if not tesseract.has_textonly_pdf():
        # If we don't have textonly_pdf, edit the stream to delete the
        # instruction to draw the image Tesseract generated, which we do not
        # use. Overwrite in place with spaces so stream length is unchanged.
        # NOTE(review): if the pattern is absent, find() returns -1 and the
        # slice would blank the wrong bytes — presumably Tesseract always
        # emits '/Im1 Do' in this mode; confirm.
        stream = bytearray(pdf_text_contents)
        pattern = b'/Im1 Do'
        idx = stream.find(pattern)
        stream[idx : (idx + len(pattern))] = b' ' * len(pattern)
        pdf_text_contents = bytes(stream)

    base_page = pdf_base.pages.p(page_num)

    # The text page always will be oriented up by this stage but the original
    # content may have a rotation applied. Wrap the text stream with a rotation
    # so it will be oriented the same way as the rest of the page content.
    # (Previous versions OCRmyPDF rotated the content layer to match the text.)

    # Width/height of the text page, from its MediaBox
    mediabox = [float(pdf_text.pages[0].MediaBox[v]) for v in range(4)]
    wt, ht = mediabox[2] - mediabox[0], mediabox[3] - mediabox[1]

    # Width/height of the base (target) page, from its MediaBox
    mediabox = [float(base_page.MediaBox[v]) for v in range(4)]
    wp, hp = mediabox[2] - mediabox[0], mediabox[3] - mediabox[1]

    # Move the text page's center to the origin, and the transformed result
    # back to the base page's center, so rotation/scaling happen about (0, 0)
    translate = pikepdf.PdfMatrix().translated(-wt / 2, -ht / 2)
    untranslate = pikepdf.PdfMatrix().translated(wp / 2, hp / 2)
    # -rotation because the input is a clockwise angle and this formula
    # uses CCW
    rotation = -rotation % 360
    rotate = pikepdf.PdfMatrix().rotated(rotation)

    # Because of rounding of DPI, we might get a text layer that is not
    # identically sized to the target page. Scale to adjust. Normally this
    # is within 0.998.
    if rotation in (90, 270):
        # After a quarter turn the text page's width/height are swapped
        wt, ht = ht, wt
    scale_x = wp / wt
    scale_y = hp / ht

    log.debug('%r', (scale_x, scale_y))
    scale = pikepdf.PdfMatrix().scaled(scale_x, scale_y)

    # Translate the text so it is centered at (0, 0), rotate it there, adjust
    # for a size different between initial and text PDF, then untranslate
    ctm = translate @ rotate @ scale @ untranslate

    # Wrap the text stream in q/Q so the CTM change is scoped to this layer
    pdf_text_contents = b'q %s cm\n' % ctm.encode() + pdf_text_contents + b'\nQ\n'

    new_text_layer = pikepdf.Stream(pdf_base, pdf_text_contents)

    if strip_old_text:
        strip_invisible_text(pdf_base, base_page, log)

    # Prepend so the text layer renders beneath the page's visible content
    base_page.page_contents_add(new_text_layer, prepend=True)

    _update_page_resources(
        page=base_page, font=font, font_key=font_key, procset=procset
    )
    pdf_text.close()
def weave_layers(infiles, output_file, log, context):
    """Apply text layer and/or image layer changes to baseline file

    This is where the magic happens. infiles will be the main PDF to modify,
    and optional .text.pdf and .image-layer.pdf files, organized however ruffus
    organizes them.

    From .text.pdf, we copy the content stream (which contains the Tesseract
    OCR results), and rotate it into place. The first time we do this, we also
    copy the GlyphlessFont, and then reference that font again.

    For .image-layer.pdf, we check if this is a "pointer" to the original file,
    or a new file. If a new file, we replace the page and remember that we
    replaced this page.

    Every 100 open files, we save intermediate results, to avoid any resource
    limits, since pikepdf/qpdf need to keep a lot of open file handles in the
    background. When objects are copied from one file to another qpdf, qpdf
    doesn't actually copy the data until asked to write, so all the resources
    need to remain available.

    For completeness, we set up a /ProcSet on every page, although it's
    unlikely any PDF viewer cares about this anymore.
    """

    def input_sorter(key):
        # Sort inputs by their page number; files with no page number
        # (the base PDF) sort first with key -1.
        try:
            return page_number(key)
        except ValueError:
            return -1

    flat_inputs = sorted(flatten_groups(infiles), key=input_sorter)
    # groupby requires its input pre-sorted by the same key (it is, above)
    groups = groupby(flat_inputs, key=input_sorter)

    # Extract first item: the base PDF (key -1 group)
    _, basegroup = next(groups)
    base = list(basegroup)[0]
    path_base = Path(base).resolve()
    pdf_base = pikepdf.open(path_base)
    font, font_key, procset = None, None, None
    pdfinfo = context.get_pdfinfo()
    procset = pdf_base.make_indirect(
        pikepdf.Object.parse(b'[ /PDF /Text /ImageB /ImageC /ImageI ]')
    )

    emplacements = 1
    interim_count = 0

    # Iterate rest: one group of layer files per page number
    for page_num, layers in groups:
        layers = list(layers)
        log.debug(page_num)
        log.debug(layers)

        text = next((ii for ii in layers if ii.endswith('.text.pdf')), None)
        image = next((ii for ii in layers if ii.endswith('.image-layer.pdf')), None)

        if text and not font:
            # First text layer seen: copy the font into pdf_base once,
            # then reuse it for all later pages.
            font, font_key = _find_font(text, pdf_base)

        emplaced_page = False
        content_rotation = pdfinfo[page_num - 1].rotation

        path_image = Path(image).resolve() if image else None
        if path_image is not None and path_image != path_base:
            # We are updating the old page with a rasterized PDF of the new
            # page (without changing objgen, to preserve references)
            log.debug("Emplacement update")
            with pikepdf.open(image) as pdf_image:
                emplacements += 1
                foreign_image_page = pdf_image.pages[0]
                # Append to copy the foreign page in, emplace over the old
                # page, then drop the temporary appended copy.
                pdf_base.pages.append(foreign_image_page)
                local_image_page = pdf_base.pages[-1]
                pdf_base.pages[page_num - 1].emplace(local_image_page)
                del pdf_base.pages[-1]
            emplaced_page = True

        autorotate_correction = context.get_rotation(page_num - 1)
        if emplaced_page:
            # A rasterized replacement page is already upright
            content_rotation = autorotate_correction
        text_rotation = autorotate_correction
        # Angle by which the text layer must be rotated to line up with content
        text_misaligned = (text_rotation - content_rotation) % 360
        log.debug(
            '%r',
            [text_rotation, autorotate_correction, text_misaligned, content_rotation],
        )

        if text and font:
            # Graft the text layer onto this page, whether new or old
            strip_old = context.get_options().redo_ocr
            _weave_layers_graft(
                pdf_base=pdf_base,
                page_num=page_num,
                text=text,
                font=font,
                font_key=font_key,
                rotation=text_misaligned,
                procset=procset,
                strip_old_text=strip_old,
                log=log,
            )

        # Correct the rotation if applicable
        pdf_base.pages[page_num - 1].Rotate = (
            content_rotation - autorotate_correction
        ) % 360

        if emplacements % MAX_REPLACE_PAGES == 0:
            # Periodically save and reload the Pdf object. This will keep a
            # lid on our memory usage for very large files. Attach the font to
            # page 1 even if page 1 doesn't use it, so we have a way to get it
            # back.
            # TODO refactor this to outside the loop
            page0 = pdf_base.pages[0]
            _update_page_resources(
                page=page0, font=font, font_key=font_key, procset=procset
            )

            # We cannot read and write the same file, that will corrupt it
            # but we don't want to keep more copies than needed. Delete intermediates.
            # {interim_count} is the opened file we were updating
            # {interim_count - 1} can be deleted
            # {interim_count + 1} is the new file we will produce and open
            old_file = output_file + f'_working{interim_count - 1}.pdf'
            if not context.get_options().keep_temporary_files:
                with suppress(FileNotFoundError):
                    os.unlink(old_file)

            next_file = output_file + f'_working{interim_count + 1}.pdf'
            pdf_base.save(next_file)
            pdf_base.close()

            pdf_base = pikepdf.open(next_file)
            # Re-fetch the procset from the reloaded file; its old object
            # handle belonged to the closed Pdf.
            procset = pdf_base.pages[0].Resources.ProcSet
            font, font_key = None, None  # Ensure we reacquire this information
            interim_count += 1

    pdf_base.save(output_file)
    pdf_base.close()
def linn(resources):
    """Return the linn.pdf path together with an opened pikepdf handle to it."""
    pdf_path = resources / 'linn.pdf'
    return pdf_path, pikepdf.open(pdf_path)