def validate_pdfinfo_options(context: PdfContext):
    """Check the analysed PDF against the requested options.

    Reads ``context.pdfinfo`` and ``context.options``. Unsupported
    combinations are logged and rejected; otherwise both objects are passed
    on to the plugin manager's ``validate`` hook.

    Args:
        context: Supplies ``pdfinfo``, ``options`` and ``plugin_manager``.

    Raises:
        InputFileError: If the PDF needs rendering (dynamic XFA form), if it
            uses /UserUnit while the output type starts with ``pdfa``, or if
            it has an AcroForm and ``redo_ocr`` is set.
    """
    info = context.pdfinfo
    opts = context.options

    if info.needs_rendering:
        xfa_message = (
            "This PDF contains dynamic XFA forms created by Adobe LiveCycle "
            "Designer and can only be read by Adobe Acrobat or Adobe Reader."
        )
        log.error(xfa_message)
        raise InputFileError()

    if info.has_userunit and opts.output_type.startswith('pdfa'):
        userunit_message = (
            "This input file uses a PDF feature that is not supported "
            "by Ghostscript, so you cannot use --output-type=pdfa for this "
            "file. (Specifically, it uses the PDF-1.6 /UserUnit feature to "
            "support very large or small page sizes, and Ghostscript cannot "
            "output these files.)  Use --output-type=pdf instead."
        )
        log.error(userunit_message)
        raise InputFileError()

    if info.has_acroform:
        # A fillable form is fatal only together with --redo-ocr; otherwise
        # the user just gets a warning and, unless forcing OCR, a hint.
        if opts.redo_ocr:
            log.error(
                "This PDF has a user fillable form. --redo-ocr is not "
                "currently possible on such files."
            )
            raise InputFileError()
        log.warning(
            "This PDF has a fillable form. "
            "Chances are it is a pure digital "
            "document that does not need OCR."
        )
        if not opts.force_ocr:
            log.info(
                "Use the option --force-ocr to produce an image of the "
                "form and all filled form fields. The output PDF will be "
                "'flattened' and will no longer be fillable."
            )

    context.plugin_manager.hook.validate(pdfinfo=info, options=opts)
def create_input_file(options, work_folder: Path) -> Tuple[Path, str]:
    """Stage the input inside *work_folder* and report where it came from.

    Three kinds of ``options.input_file`` are handled:

    * ``'-'``: standard input is copied to ``work_folder / 'stdin'``.
    * an object with a ``readable`` attribute: the stream is copied to
      ``work_folder / 'stream'``.
    * anything else: treated as a path and linked to
      ``work_folder / 'origin'`` via ``safe_symlink``.

    Args:
        options: Object whose ``input_file`` attribute names the input.
        work_folder: Directory that receives the staged file.

    Returns:
        ``(staged_path, origin_label)`` where the label is ``"stdin"``,
        ``"stream"`` or the original path as a string.

    Raises:
        InputFileError: If the stream is not readable or the path does not
            exist.
    """
    if options.input_file == '-':
        # stdin
        log.info('reading file from standard input')
        target = work_folder / 'stdin'
        with open(target, 'wb') as stream_buffer:
            copyfileobj(sys.stdin.buffer, stream_buffer)
        return target, "stdin"

    if hasattr(options.input_file, 'readable'):
        if not options.input_file.readable():
            raise InputFileError("Input file stream is not readable")
        log.info('reading file from input stream')
        target = work_folder / 'stream'
        with open(target, 'wb') as stream_buffer:
            copyfileobj(options.input_file, stream_buffer)
        return target, "stream"

    target = work_folder / 'origin'
    try:
        safe_symlink(options.input_file, target)
    except FileNotFoundError as e:
        msg = f"File not found - {options.input_file}"
        if Path('/.dockerenv').exists():  # pragma: no cover
            # Inside a container the most likely cause is an unshared
            # host directory, so point the user at the stdin/stdout route.
            msg += (
                "\nDocker cannot access your working directory unless you "
                "explicitly share it with the Docker container and set up "
                "permissions correctly.\n"
                "You may find it easier to use stdin/stdout:"
                "\n"
                "\tdocker run -i --rm jbarlow83/ocrmypdf - - <input.pdf >output.pdf\n"
            )
        # Chain explicitly so the traceback shows the missing file as the cause.
        raise InputFileError(msg) from e
    return target, os.fspath(options.input_file)
def create_input_file(options, work_folder: Path) -> Tuple[Path, str]:
    """Stage the input inside *work_folder* and report where it came from.

    ``options.input_file`` may be ``'-'`` (standard input is copied to
    ``work_folder / 'stdin'``), an object with a ``readable`` attribute (the
    stream is copied to ``work_folder / 'stream'``), or anything else, which
    is treated as a path and linked to ``work_folder / 'origin'`` via
    ``safe_symlink``.

    Args:
        options: Object whose ``input_file`` attribute names the input.
        work_folder: Directory that receives the staged file.

    Returns:
        ``(staged_path, origin_label)`` where the label is ``"stdin"``,
        ``"stream"`` or the original path as a string.

    Raises:
        InputFileError: If the stream is not readable or the path does not
            exist.
    """
    if options.input_file == '-':
        # stdin
        log.info('reading file from standard input')
        target = work_folder / 'stdin'
        with open(target, 'wb') as stream_buffer:
            copyfileobj(sys.stdin.buffer, stream_buffer)
        return target, "stdin"

    if hasattr(options.input_file, 'readable'):
        if not options.input_file.readable():
            raise InputFileError("Input file stream is not readable")
        log.info('reading file from input stream')
        target = work_folder / 'stream'
        with open(target, 'wb') as stream_buffer:
            copyfileobj(options.input_file, stream_buffer)
        return target, "stream"

    target = work_folder / 'origin'
    try:
        safe_symlink(options.input_file, target)
    except FileNotFoundError as e:
        # Chain explicitly so the traceback shows the missing file as the cause.
        raise InputFileError(f"File not found - {options.input_file}") from e
    return target, os.fspath(options.input_file)
def create_input_file(options, work_folder: Path) -> Tuple[Path, str]:
    """Stage the input inside *work_folder* and report where it came from.

    When ``options.input_file`` is ``'-'`` standard input is copied to
    ``work_folder / 'stdin'``; otherwise the value is treated as a path and
    linked to ``work_folder / 'origin'`` via ``safe_symlink``.

    Args:
        options: Object whose ``input_file`` attribute names the input.
        work_folder: Directory that receives the staged file.

    Returns:
        ``(staged_path, origin_label)`` where the label is ``"<stdin>"`` or
        the original path as a string.

    Raises:
        InputFileError: If the input path does not exist.
    """
    if options.input_file == '-':
        # stdin
        log.info('reading file from standard input')
        target = work_folder / 'stdin'
        with open(target, 'wb') as stream_buffer:
            copyfileobj(sys.stdin.buffer, stream_buffer)
        return target, "<stdin>"

    target = work_folder / 'origin'
    try:
        safe_symlink(options.input_file, target)
    except FileNotFoundError as e:
        # Chain explicitly so the traceback shows the missing file as the cause.
        raise InputFileError(f"File not found - {options.input_file}") from e
    return target, os.fspath(options.input_file)
def triage(original_filename, input_file, output_file, options):
    """Route the staged input to *output_file* as a PDF or as an image.

    If ``_pdf_guess_version`` returns a truthy value for *input_file*, the
    file is treated as a PDF and linked to *output_file*; otherwise it is
    handed to ``triage_image_file``.

    Args:
        original_filename: Name shown to the user in error messages in place
            of the temporary *input_file* path.
        input_file: Staged input file.
        output_file: Destination path.
        options: Only ``image_dpi`` is read here; it is also passed through
            to ``triage_image_file``.

    Returns:
        *output_file*.

    Raises:
        InputFileError: If probing or linking the PDF fails with an OS error.
    """
    try:
        treat_as_pdf = bool(_pdf_guess_version(input_file))
        if treat_as_pdf:
            if options.image_dpi:
                log.warning(
                    "Argument --image-dpi is being ignored because the "
                    "input file is a PDF, not an image."
                )
            # Origin file is a pdf create a symlink with pdf extension
            safe_symlink(input_file, output_file)
    except OSError as err:
        log.debug(f"Temporary file was at: {input_file}")
        # Report the user's own filename rather than the temporary path.
        user_message = str(err).replace(str(input_file), original_filename)
        raise InputFileError(user_message) from err

    if not treat_as_pdf:
        triage_image_file(input_file, output_file, options)
    return output_file
def get_pdfinfo(
    input_file,
    detailed_analysis=False,
    progbar=False,
    max_workers=None,
    check_pages=None,
):
    """Build a ``PdfInfo`` for *input_file*, translating pikepdf failures.

    All keyword arguments are forwarded unchanged to ``PdfInfo``.

    Returns:
        The constructed ``PdfInfo``.

    Raises:
        EncryptedPdfError: If pikepdf reports a password error.
        InputFileError: If pikepdf reports any other PDF error.
    """
    try:
        return PdfInfo(
            input_file,
            detailed_analysis=detailed_analysis,
            progbar=progbar,
            max_workers=max_workers,
            check_pages=check_pages,
        )
    # PasswordError is handled first so it is not swallowed by the broader
    # PdfError clause below.
    except pikepdf.PasswordError as e:
        raise EncryptedPdfError() from e
    except pikepdf.PdfError as e:
        raise InputFileError() from e
def get_page_analysis(infile, pageno, pscript5_mode):
    """Interpret one page with pdfminer and return the tracker's result.

    Args:
        infile: Path of the PDF to open.
        pageno: Page index passed to pdfminer (counting from 0).
        pscript5_mode: When truthy, ``PDFType3Font``'s ``get_ascent``,
            ``get_descent`` and ``get_height`` are patched with the PScript5
            replacements while the page is processed.

    Returns:
        Whatever ``TextPositionTracker.get_result()`` returns for the page.

    Raises:
        InputFileError: If pdfminer yields no page for *pageno*.
        EncryptedPdfError: If pdfminer refuses text extraction.
    """
    resource_manager = pdfminer.pdfinterp.PDFResourceManager(caching=True)

    # Workaround for https://github.com/pdfminer/pdfminer.six/issues/395
    boxes_flow = 2 if pdfminer.__version__ < '20200402' else None

    layout_params = LAParams(
        all_texts=True, detect_vertical=True, boxes_flow=boxes_flow
    )
    tracker = TextPositionTracker(resource_manager, laparams=layout_params)
    interpreter = pdfminer.pdfinterp.PDFPageInterpreter(resource_manager, tracker)

    font_patch = None
    if pscript5_mode:
        font_patch = patch.multiple(
            'pdfminer.pdffont.PDFType3Font',
            spec=True,
            get_ascent=PDFType3Font__PScript5_get_ascent,
            get_descent=PDFType3Font__PScript5_get_descent,
            get_height=PDFType3Font__PScript5_get_height,
        )
        font_patch.start()

    try:
        with Path(infile).open('rb') as pdf_stream:
            page = next(
                PDFPage.get_pages(pdf_stream, pagenos=[pageno], maxpages=0),
                None,
            )
            if page is None:
                raise InputFileError(
                    f"pdfminer could not process page {pageno} (counting from 0)."
                )
            interpreter.process_page(page)
    except PDFTextExtractionNotAllowed as err:
        raise EncryptedPdfError() from err
    finally:
        # Always undo the font patch, whether or not processing succeeded.
        if font_patch is not None:
            font_patch.stop()

    return tracker.get_result()
def get_pdfinfo(
    input_file,
    *,
    executor: Executor,
    detailed_analysis=False,
    progbar=False,
    max_workers=None,
    check_pages=None,
) -> PdfInfo:
    """Build a ``PdfInfo`` for *input_file*, translating pikepdf failures.

    Every keyword-only argument is forwarded unchanged to ``PdfInfo``.

    Returns:
        The constructed ``PdfInfo``.

    Raises:
        EncryptedPdfError: If pikepdf reports a password error.
        InputFileError: If pikepdf reports any other PDF error.
    """
    pdfinfo_kwargs = {
        'detailed_analysis': detailed_analysis,
        'progbar': progbar,
        'max_workers': max_workers,
        'check_pages': check_pages,
        'executor': executor,
    }
    try:
        info = PdfInfo(input_file, **pdfinfo_kwargs)
    # The password case must be tested before the general PDF error clause.
    except pikepdf.PasswordError as err:
        raise EncryptedPdfError() from err
    except pikepdf.PdfError as err:
        raise InputFileError() from err
    return info
def process_sigbus(*args):
    """Raise ``InputFileError`` reporting that a worker lost its input file.

    All positional arguments are accepted and ignored.
    NOTE(review): presumably installed as a SIGBUS signal handler in worker
    processes; confirm at the registration site.

    Raises:
        InputFileError: Always.
    """
    message = "A worker process lost access to an input file"
    raise InputFileError(message)
def sigbus(*args):
    """Raise ``InputFileError`` reporting that the input file became unavailable.

    All positional arguments are accepted and ignored.
    NOTE(review): presumably installed as a SIGBUS signal handler; confirm
    at the registration site.

    Raises:
        InputFileError: Always.
    """
    message = "Lost access to the input file"
    raise InputFileError(message)
def update_pageinfo(result, pbar):
    """Store one page result and advance the progress bar.

    Args:
        result: Page result; it is stored in ``pages`` (defined outside this
            function) under its ``pageno``. A falsy value means the page
            could not be read.
        pbar: Progress bar; ``update()`` is called once after storing.

    Raises:
        InputFileError: If *result* is falsy.
    """
    if not result:
        raise InputFileError("Could not read a page in the PDF")
    pages[result.pageno] = result
    pbar.update()