Exemplo n.º 1
0
def validate_pdfinfo_options(context: PdfContext):
    """Check the analysed input PDF against the requested options.

    The checks run in this order:

    1. ``pdfinfo.needs_rendering`` -- logs an error about dynamic XFA forms
       and aborts.
    2. ``pdfinfo.has_userunit`` together with an ``output_type`` starting
       with ``'pdfa'`` -- logs an error and aborts.
    3. ``pdfinfo.has_acroform`` -- aborts when ``redo_ocr`` is set, otherwise
       warns and, unless ``force_ocr`` is set, logs a hint about it.

    Afterwards the plugin manager's ``validate`` hook is called with the
    same ``pdfinfo`` and ``options`` objects.

    Raises:
        InputFileError: (without a message; the explanation is logged
            first) for any of the aborting conditions above.
    """
    info = context.pdfinfo
    opts = context.options

    if info.needs_rendering:
        log.error(
            "This PDF contains dynamic XFA forms created by Adobe LiveCycle "
            "Designer and can only be read by Adobe Acrobat or Adobe Reader.")
        raise InputFileError()

    if info.has_userunit:
        # Only inspect the output type once /UserUnit is known to be in use.
        if opts.output_type.startswith('pdfa'):
            log.error(
                "This input file uses a PDF feature that is not supported "
                "by Ghostscript, so you cannot use --output-type=pdfa for this "
                "file. (Specifically, it uses the PDF-1.6 /UserUnit feature to "
                "support very large or small page sizes, and Ghostscript cannot "
                "output these files.)  Use --output-type=pdf instead.")
            raise InputFileError()

    if info.has_acroform:
        if opts.redo_ocr:
            log.error("This PDF has a user fillable form. --redo-ocr is not "
                      "currently possible on such files.")
            raise InputFileError()

        # Not fatal: tell the user the file probably does not need OCR.
        log.warning("This PDF has a fillable form. "
                    "Chances are it is a pure digital "
                    "document that does not need OCR.")
        if not opts.force_ocr:
            log.info(
                "Use the option --force-ocr to produce an image of the "
                "form and all filled form fields. The output PDF will be "
                "'flattened' and will no longer be fillable.")

    context.plugin_manager.hook.validate(pdfinfo=info, options=opts)
Exemplo n.º 2
0
def create_input_file(options, work_folder: Path) -> Tuple[Path, str]:
    """Make the input available inside ``work_folder``.

    Three kinds of ``options.input_file`` are handled:

    * ``'-'``: standard input is copied to ``work_folder / 'stdin'``.
    * an object with a ``readable`` attribute: the stream is copied to
      ``work_folder / 'stream'``.
    * anything else: passed to ``safe_symlink`` with
      ``work_folder / 'origin'`` as the target.

    Returns:
        A ``(path, label)`` pair: the file inside ``work_folder`` and a
        display name (``"stdin"``, ``"stream"`` or the original path).

    Raises:
        InputFileError: if the stream reports it is not readable, or if
            ``safe_symlink`` raises ``FileNotFoundError``.
    """
    if options.input_file == '-':
        # stdin
        log.info('reading file from standard input')
        target = work_folder / 'stdin'
        with open(target, 'wb') as stream_buffer:
            copyfileobj(sys.stdin.buffer, stream_buffer)
        return target, "stdin"
    elif hasattr(options.input_file, 'readable'):
        if not options.input_file.readable():
            raise InputFileError("Input file stream is not readable")
        log.info('reading file from input stream')
        target = work_folder / 'stream'
        with open(target, 'wb') as stream_buffer:
            copyfileobj(options.input_file, stream_buffer)
        return target, "stream"
    else:
        try:
            target = work_folder / 'origin'
            safe_symlink(options.input_file, target)
            return target, os.fspath(options.input_file)
        except FileNotFoundError as e:
            msg = f"File not found - {options.input_file}"
            # When /.dockerenv exists, append a hint about sharing the
            # working directory with the container.
            if Path('/.dockerenv').exists():  # pragma: no cover
                msg += (
                    "\nDocker cannot access your working directory unless you "
                    "explicitly share it with the Docker container and set up "
                    "permissions correctly.\n"
                    "You may find it easier to use stdin/stdout:"
                    "\n"
                    "\tdocker run -i --rm jbarlow83/ocrmypdf - - <input.pdf >output.pdf\n"
                )
            # Chain explicitly so the original OS error stays attached.
            raise InputFileError(msg) from e
Exemplo n.º 3
0
def create_input_file(options, work_folder: Path) -> Tuple[Path, str]:
    """Make the input available inside ``work_folder``.

    ``options.input_file`` may be ``'-'`` (standard input), an object with
    a ``readable`` attribute (treated as a binary stream), or anything
    else, which is handed to ``safe_symlink``.

    Returns:
        A ``(path, label)`` pair: the file inside ``work_folder`` and a
        display name (``"stdin"``, ``"stream"`` or the original path).

    Raises:
        InputFileError: if the stream reports it is not readable, or if
            ``safe_symlink`` raises ``FileNotFoundError``.
    """
    if options.input_file == '-':
        # stdin
        log.info('reading file from standard input')
        target = work_folder / 'stdin'
        with open(target, 'wb') as stream_buffer:
            copyfileobj(sys.stdin.buffer, stream_buffer)
        return target, "stdin"
    elif hasattr(options.input_file, 'readable'):
        if not options.input_file.readable():
            raise InputFileError("Input file stream is not readable")
        log.info('reading file from input stream')
        target = work_folder / 'stream'
        with open(target, 'wb') as stream_buffer:
            copyfileobj(options.input_file, stream_buffer)
        return target, "stream"
    else:
        try:
            target = work_folder / 'origin'
            safe_symlink(options.input_file, target)
            return target, os.fspath(options.input_file)
        except FileNotFoundError as e:
            # Chain explicitly so the original OS error stays attached.
            raise InputFileError(
                f"File not found - {options.input_file}"
            ) from e
Exemplo n.º 4
0
def create_input_file(options, work_folder: Path) -> (Path, str):
    """Make the input available inside ``work_folder``.

    When ``options.input_file`` is ``'-'`` standard input is copied to
    ``work_folder / 'stdin'``; otherwise the path is handed to
    ``safe_symlink`` with ``work_folder / 'origin'`` as the target.

    Returns:
        A ``(path, label)`` pair: the file inside ``work_folder`` and a
        display name (``"<stdin>"`` or the original path).

    Raises:
        InputFileError: if ``safe_symlink`` raises ``FileNotFoundError``.
    """
    # NOTE(review): the return annotation is a tuple literal rather than a
    # typing construct such as Tuple[Path, str]; left as-is here because
    # that needs a typing import at file level -- confirm and fix there.
    if options.input_file == '-':
        # stdin
        log.info('reading file from standard input')
        target = work_folder / 'stdin'
        with open(target, 'wb') as stream_buffer:
            copyfileobj(sys.stdin.buffer, stream_buffer)
        return target, "<stdin>"
    else:
        try:
            target = work_folder / 'origin'
            safe_symlink(options.input_file, target)
            return target, os.fspath(options.input_file)
        except FileNotFoundError as e:
            # Chain explicitly so the original OS error stays attached.
            raise InputFileError(
                f"File not found - {options.input_file}"
            ) from e
Exemplo n.º 5
0
def triage(original_filename, input_file, output_file, options):
    """Route the input to ``output_file`` as a PDF link or a converted image.

    If ``_pdf_guess_version`` returns a truthy value for ``input_file`` the
    file is passed to ``safe_symlink`` (warning first if ``--image-dpi``
    was given, since it is ignored for PDFs). Otherwise the file is handed
    to ``triage_image_file``.

    Returns:
        ``output_file`` in both cases.

    Raises:
        InputFileError: if the PDF probe or the symlink step raises an
            OS-level error. The temporary path in the message is replaced
            by ``original_filename``.
    """
    try:
        is_pdf = bool(_pdf_guess_version(input_file))
        if is_pdf:
            if options.image_dpi:
                log.warning(
                    "Argument --image-dpi is being ignored because the "
                    "input file is a PDF, not an image.")
            # Origin file is a pdf create a symlink with pdf extension
            safe_symlink(input_file, output_file)
    except OSError as err:
        log.debug(f"Temporary file was at: {input_file}")
        # Show the user the name they supplied, not the temporary path.
        user_msg = str(err).replace(str(input_file), original_filename)
        raise InputFileError(user_msg) from err

    if not is_pdf:
        triage_image_file(input_file, output_file, options)
    return output_file
Exemplo n.º 6
0
def get_pdfinfo(
    input_file,
    detailed_analysis=False,
    progbar=False,
    max_workers=None,
    check_pages=None,
):
    """Build a ``PdfInfo`` for ``input_file``, translating pikepdf errors.

    All keyword arguments are forwarded unchanged to ``PdfInfo``.

    Returns:
        The constructed ``PdfInfo`` object.

    Raises:
        EncryptedPdfError: if pikepdf raises ``PasswordError``.
        InputFileError: if pikepdf raises any other ``PdfError``.
    """
    try:
        return PdfInfo(
            input_file,
            detailed_analysis=detailed_analysis,
            progbar=progbar,
            max_workers=max_workers,
            check_pages=check_pages,
        )
    # PasswordError is handled first so it is not absorbed by the more
    # general PdfError handler below. Both re-raises chain the pikepdf
    # exception so the underlying cause stays visible in tracebacks.
    except pikepdf.PasswordError as e:
        raise EncryptedPdfError() from e
    except pikepdf.PdfError as e:
        raise InputFileError() from e
Exemplo n.º 7
0
def get_page_analysis(infile, pageno, pscript5_mode):
    """Run pdfminer over one page of ``infile`` and return the tracker result.

    Args:
        infile: path of the PDF to open.
        pageno: page index passed to pdfminer (counted from 0, per the
            error message below).
        pscript5_mode: when truthy, ``PDFType3Font``'s ascent/descent/height
            getters are temporarily replaced by the PScript5 variants for
            the duration of the call.

    Returns:
        Whatever ``TextPositionTracker.get_result()`` produces.

    Raises:
        InputFileError: if pdfminer yields no page for ``pageno``.
        EncryptedPdfError: if pdfminer raises
            ``PDFTextExtractionNotAllowed``.
    """
    resource_manager = pdfminer.pdfinterp.PDFResourceManager(caching=True)

    # Workaround for https://github.com/pdfminer/pdfminer.six/issues/395:
    # releases before 20200402 get boxes_flow=2, later ones get None.
    boxes_flow = 2 if pdfminer.__version__ < '20200402' else None
    layout_params = LAParams(
        all_texts=True,
        detect_vertical=True,
        boxes_flow=boxes_flow,
    )
    tracker = TextPositionTracker(resource_manager, laparams=layout_params)
    interpreter = pdfminer.pdfinterp.PDFPageInterpreter(
        resource_manager, tracker
    )

    font_patch = None
    if pscript5_mode:
        font_patch = patch.multiple(
            'pdfminer.pdffont.PDFType3Font',
            spec=True,
            get_ascent=PDFType3Font__PScript5_get_ascent,
            get_descent=PDFType3Font__PScript5_get_descent,
            get_height=PDFType3Font__PScript5_get_height,
        )
        font_patch.start()

    try:
        with Path(infile).open('rb') as pdf_stream:
            pages = PDFPage.get_pages(
                pdf_stream, pagenos=[pageno], maxpages=0
            )
            wanted_page = next(pages, None)
            if wanted_page is None:
                raise InputFileError(
                    f"pdfminer could not process page {pageno} (counting from 0)."
                )
            interpreter.process_page(wanted_page)
    except PDFTextExtractionNotAllowed as err:
        raise EncryptedPdfError() from err
    finally:
        # Always undo the font patch, whether or not processing succeeded.
        if font_patch is not None:
            font_patch.stop()

    return tracker.get_result()
Exemplo n.º 8
0
def get_pdfinfo(
    input_file,
    *,
    executor: Executor,
    detailed_analysis=False,
    progbar=False,
    max_workers=None,
    check_pages=None,
) -> PdfInfo:
    """Build a ``PdfInfo`` for ``input_file``, translating pikepdf errors.

    Every keyword argument, including ``executor``, is forwarded unchanged
    to ``PdfInfo``.

    Raises:
        EncryptedPdfError: chained from ``pikepdf.PasswordError``.
        InputFileError: chained from any other ``pikepdf.PdfError``.
    """
    try:
        info = PdfInfo(
            input_file,
            detailed_analysis=detailed_analysis,
            progbar=progbar,
            max_workers=max_workers,
            check_pages=check_pages,
            executor=executor,
        )
    # The password case must be tested before the general PdfError case.
    except pikepdf.PasswordError as password_err:
        raise EncryptedPdfError() from password_err
    except pikepdf.PdfError as pdf_err:
        raise InputFileError() from pdf_err
    return info
Exemplo n.º 9
0
def process_sigbus(*args):
    """Raise ``InputFileError`` unconditionally; all arguments are ignored.

    NOTE(review): the name suggests this is installed as a SIGBUS handler
    in worker processes -- confirm at the registration site.
    """
    message = "A worker process lost access to an input file"
    raise InputFileError(message)
Exemplo n.º 10
0
def sigbus(*args):
    """Raise ``InputFileError`` unconditionally; all arguments are ignored.

    NOTE(review): the name suggests this is installed as a SIGBUS handler
    -- confirm at the registration site.
    """
    message = "Lost access to the input file"
    raise InputFileError(message)
Exemplo n.º 11
0
 def update_pageinfo(result, pbar):
     """Store one page result by page number and advance the progress bar.

     ``result`` is stored in ``pages`` (a name from the surrounding scope,
     not visible in this excerpt) under ``result.pageno``.

     Raises:
         InputFileError: if ``result`` is falsy, i.e. no page was produced.
     """
     page = result
     if not page:
         # The message previously read "Could read ...", the opposite of
         # the failure being reported.
         raise InputFileError("Could not read a page in the PDF")
     pages[page.pageno] = page
     pbar.update()