예제 #1
0
파일: info.py 프로젝트: peduallex/OCRmyPDF
def _pdf_pageinfo_sync_init(infile):
    """Worker initializer: open ``infile`` into the ``worker_pdf`` global.

    Calls ``pikepdf_enable_mmap()`` first, then stores the result of
    ``pikepdf.open(infile)`` in the module-level ``worker_pdf``. Nothing in
    this function closes the opened PDF.

    Args:
        infile: The PDF to open; passed unchanged to ``pikepdf.open``.
    """
    global worker_pdf  # pylint: disable=global-statement
    pikepdf_enable_mmap()
    # If this function is called as a thread initializer, we need a messy hack
    # to close worker_pdf. If called as a process, it will be released when the
    # process is terminated.
    worker_pdf = pikepdf.open(infile)
예제 #2
0
def _pdf_pageinfo_sync_init(infile: Path, pdfminer_loglevel):
    """Prepare a pageinfo worker: set pdfminer's log level and open the PDF.

    Args:
        infile: The PDF to open; passed unchanged to ``pikepdf.open``.
        pdfminer_loglevel: Level applied to the ``'pdfminer'`` logger.

    The opened PDF is stored in the module-level ``worker_pdf`` and is not
    closed by this function.
    """
    global worker_pdf  # pylint: disable=global-statement
    pikepdf_enable_mmap()

    pdfminer_logger = logging.getLogger('pdfminer')
    pdfminer_logger.setLevel(pdfminer_loglevel)

    # No close is scheduled here. When this runs as a thread initializer, a
    # separate hack is needed to close worker_pdf; when it runs as a process
    # initializer, the handle is released when the process terminates.
    worker_pdf = pikepdf.open(infile)
예제 #3
0
def _pdf_pageinfo_sync_init(pdf: Pdf, infile: Path, pdfminer_loglevel):
    """Prepare a pageinfo worker, opening a private PDF copy only if needed.

    Args:
        pdf: An already-open PDF, or ``None``. When it is not ``None`` this
            function leaves ``worker_pdf`` untouched.
        infile: The PDF to open when ``pdf`` is ``None``; passed unchanged to
            ``pikepdf.open``.
        pdfminer_loglevel: Level applied to the ``'pdfminer'`` logger.
    """
    global worker_pdf  # pylint: disable=global-statement
    pikepdf_enable_mmap()

    pdfminer_logger = logging.getLogger('pdfminer')
    pdfminer_logger.setLevel(pdfminer_loglevel)

    if pdf is not None:
        # An open pdf was supplied, so there is nothing to open or to close.
        return

    # The pdf is not opened: open a copy for our worker process to use.
    worker_pdf = pikepdf.open(infile)

    def _close_worker_pdf():
        # Resolves the global when the hook runs, not when it is registered.
        worker_pdf.close()

    # Close when this process exits.
    atexit.register(_close_worker_pdf)
예제 #4
0
파일: _sync.py 프로젝트: knobix/OCRmyPDF
def run_pipeline(options, *, plugin_manager, api=False):
    """Run the whole pipeline for ``options`` and return an ``ExitCode``.

    Creates a temporary work folder, sets up an executor, builds the input
    file, triages it into ``origin.pdf``, gathers PDF info, validates the
    options against it, executes the pipeline, and finally inspects the
    output file. The work folder is always passed to
    ``cleanup_working_files`` on the way out.

    Args:
        options: Options object. ``options.jobs`` is replaced with
            ``available_cpu_count()`` when it is falsy.
        plugin_manager: Plugin manager to use. When falsy, one is obtained
            from ``get_plugin_manager(options.plugins)``.
        api: When true, no debug log file is configured and every ``except``
            clause below catches ``NeverRaise`` instead of the real exception
            type. NOTE(review): presumably ``NeverRaise`` is never raised, so
            exceptions propagate to the API caller; confirm against its
            definition.

    Returns:
        ``ExitCode.ok`` when the pipeline finishes and the output checks pass
        (or are skipped for stdout, writable streams and ``os.devnull``);
        ``ExitCode.pdfa_conversion_failed`` when a ``pdfa*`` output type was
        requested but the output does not claim PDF/A;
        ``ExitCode.invalid_output_pdf`` when ``check_pdf`` rejects the output;
        ``ExitCode.ctrl_c``, the exception's own ``exit_code``, or
        ``ExitCode.other_error`` when the corresponding exception is caught
        (only when ``api`` is false).
    """
    # Any changes to options will not take effect for options that are already
    # bound to function parameters in the pipeline. (For example
    # options.input_file, options.pdf_renderer are already bound.)
    if not options.jobs:
        options.jobs = available_cpu_count()
    if not plugin_manager:
        plugin_manager = get_plugin_manager(options.plugins)

    work_folder = Path(mkdtemp(prefix="ocrmypdf.io."))
    debug_log_handler = None
    if (
        (options.keep_temporary_files or options.verbose >= 1)
        and not os.environ.get('PYTEST_CURRENT_TEST', '')
        and not api
    ):
        # Debug log for command line interface only with verbose output
        # See https://github.com/pytest-dev/pytest/issues/5502 for why we skip this
        # when pytest is running
        debug_log_handler = configure_debug_logging(
            Path(work_folder) / "debug.log"
        )  # pragma: no cover

    pikepdf_enable_mmap()

    # NOTE: the executor is created outside the try block, so a failure here
    # skips both the exception handlers and the cleanup in ``finally``.
    executor = setup_executor(plugin_manager)
    try:
        check_requested_output_file(options)
        start_input_file, original_filename = create_input_file(options, work_folder)

        # Triage image or pdf
        origin_pdf = triage(
            original_filename, start_input_file, work_folder / 'origin.pdf', options
        )

        # Gather pdfinfo and create context
        pdfinfo = get_pdfinfo(
            origin_pdf,
            executor=executor,
            detailed_analysis=options.redo_ocr,
            progbar=options.progress_bar,
            max_workers=options.jobs if not options.use_threads else 1,  # To help debug
            check_pages=options.pages,
        )

        context = PdfContext(options, work_folder, origin_pdf, pdfinfo, plugin_manager)

        # Validate options are okay for this pdf
        validate_pdfinfo_options(context)

        # Execute the pipeline
        exec_concurrent(context, executor)

        # Report on the output. Only a regular output file (not stdout, a
        # writable stream, or the null device) is inspected further.
        if options.output_file == '-':
            log.info("Output sent to stdout")
        elif (
            hasattr(options.output_file, 'writable') and options.output_file.writable()
        ):
            log.info("Output written to stream")
        elif samefile(options.output_file, os.devnull):
            pass  # Say nothing when sending to dev null
        else:
            if options.output_type.startswith('pdfa'):
                pdfa_info = file_claims_pdfa(options.output_file)
                if pdfa_info['pass']:
                    log.info(
                        "Output file is a %s (as expected)", pdfa_info['conformance']
                    )
                else:
                    log.warning(
                        "Output file is okay but is not PDF/A (seems to be %s)",
                        pdfa_info['conformance'],
                    )
                    # Early return: the validity check and size report below
                    # are skipped, but ``finally`` still runs.
                    return ExitCode.pdfa_conversion_failed
            if not check_pdf(options.output_file):
                log.warning('Output file: The generated PDF is INVALID')
                return ExitCode.invalid_output_pdf
            report_output_file_size(options, start_input_file, options.output_file)

    # Each handler below is active only when ``api`` is false; otherwise the
    # clause names ``NeverRaise`` in place of the real exception type.
    except (KeyboardInterrupt if not api else NeverRaise) as e:
        if options.verbose >= 1:
            log.exception("KeyboardInterrupt")
        else:
            log.error("KeyboardInterrupt")
        return ExitCode.ctrl_c
    except (ExitCodeException if not api else NeverRaise) as e:
        if options.verbose >= 1:
            log.exception("ExitCodeException")
        elif str(e):
            log.error("%s: %s", type(e).__name__, str(e))
        else:
            log.error(type(e).__name__)
        return e.exit_code
    except (Exception if not api else NeverRaise) as e:  # pylint: disable=broad-except
        log.exception("An exception occurred while executing the pipeline")
        return ExitCode.other_error
    finally:
        # Runs on every exit path out of the try block, including the early
        # returns above.
        if debug_log_handler:
            try:
                debug_log_handler.close()
                log.removeHandler(debug_log_handler)
            except EnvironmentError as e:
                # Best effort: report the problem on stderr and carry on to
                # the work folder cleanup.
                print(e, file=sys.stderr)
        cleanup_working_files(work_folder, options)

    return ExitCode.ok
예제 #5
0
파일: _sync.py 프로젝트: knobix/OCRmyPDF
def worker_init(max_pixels: int):
    """Initialize a worker: set Pillow's pixel limit and enable pikepdf mmap.

    Args:
        max_pixels: Value assigned to ``PIL.Image.MAX_IMAGE_PIXELS``.
    """
    # In Windows, child process will not inherit our change to this value in
    # the parent process, so ensure workers get it set. Not needed when running
    # threaded, but harmless to set again.
    PIL.Image.MAX_IMAGE_PIXELS = max_pixels
    pikepdf_enable_mmap()
예제 #6
0
파일: info.py 프로젝트: dffgdfgdfg/ocr-test
def _pdf_pageinfo_sync_init(infile):
    """Open ``infile`` with pikepdf and keep it in the ``worker_pdf`` global.

    Calls ``pikepdf_enable_mmap()`` before opening. Nothing in this function
    closes the opened PDF.
    NOTE(review): the name suggests this is used as a worker initializer;
    confirm against the caller.

    Args:
        infile: The PDF to open; passed unchanged to ``pikepdf.open``.
    """
    global worker_pdf  # pylint: disable=global-statement
    pikepdf_enable_mmap()
    worker_pdf = pikepdf.open(infile)