def test_no_cpu_count(monkeypatch):
    """available_cpu_count() must warn and fall back to 1 when
    multiprocessing.cpu_count() raises NotImplementedError.

    Also asserts the patched function was actually invoked, so the test
    cannot silently pass if available_cpu_count() stops consulting
    multiprocessing.cpu_count(). (Matches the stronger variant of this
    test elsewhere in the suite.)
    """
    invoked = False

    def cpu_count_raises():
        # Record the call so we can prove the monkeypatch took effect.
        nonlocal invoked
        invoked = True
        raise NotImplementedError()

    monkeypatch.setattr(multiprocessing, 'cpu_count', cpu_count_raises)
    with pytest.warns(expected_warning=UserWarning):
        assert helpers.available_cpu_count() == 1
    assert invoked, "Patched cpu_count was never called during test"
def _pdf_pageinfo_concurrent(
    pdf,
    executor: Executor,
    infile,
    progbar,
    max_workers,
    check_pages,
    detailed_analysis=False,
):
    """Scan all pages of a PDF concurrently and collect per-page info.

    Args:
        pdf: an already-open Pdf object.
        executor: Executor that fans out the per-page work.
        infile: path to the PDF on disk (workers in separate processes
            reopen it from here).
        progbar: whether to display a progress bar.
        max_workers: worker-count cap; ``None`` means use
            available_cpu_count().
        check_pages: which pages to check.
        detailed_analysis: forwarded to the per-page worker.

    Returns:
        A list of page-info results indexed by page number.

    Raises:
        InputFileError: if any page could not be read.
    """
    pages = [None] * len(pdf.pages)

    def update_pageinfo(result, pbar):
        page = result
        if not page:
            # Bug fix: message previously read "Could read a page in the PDF"
            raise InputFileError("Could not read a page in the PDF")
        pages[page.pageno] = page
        pbar.update()

    if max_workers is None:
        max_workers = available_cpu_count()

    total = len(pdf.pages)
    use_threads = False  # No performance gain if threaded due to GIL
    n_workers = min(1 + len(pages) // 4, max_workers)
    if n_workers == 1:
        # But if we decided on only one worker, there is no point in using
        # a separate process.
        use_threads = True

    # If we use a thread, we can pass the already-open Pdf for them to use
    # If we use processes, we pass a None which tells the init function to open its
    # own
    initial_pdf = pdf if use_threads else None

    contexts = (
        (n, initial_pdf, infile, check_pages, detailed_analysis)
        for n in range(total)
    )
    assert n_workers == 1 if use_threads else n_workers >= 1, "Not multithreadable"
    executor(
        use_threads=use_threads,
        max_workers=n_workers,
        tqdm_kwargs=dict(
            total=total, desc="Scanning contents", unit='page', disable=not progbar
        ),
        worker_initializer=partial(
            _pdf_pageinfo_sync_init,
            initial_pdf,
            infile,
            logging.getLogger('pdfminer').level,
        ),
        task=_pdf_pageinfo_sync,
        task_arguments=contexts,
        task_finished=update_pageinfo,
    )
    return pages
def test_no_cpu_count(monkeypatch):
    """Check that available_cpu_count() warns and returns 1 when
    multiprocessing.cpu_count() raises NotImplementedError, and that the
    patched cpu_count() is genuinely exercised by the call."""
    calls = []

    def cpu_count_raises():
        # Track the invocation so the final assert can confirm the patch
        # was actually hit.
        calls.append(True)
        raise NotImplementedError()

    monkeypatch.setattr(multiprocessing, 'cpu_count', cpu_count_raises)
    with pytest.warns(expected_warning=UserWarning):
        assert helpers.available_cpu_count() == 1
    assert len(calls) > 0, "Patched function called during test"
def _pdf_pageinfo_concurrent(
    pdf, infile, progbar, max_workers, check_pages, detailed_analysis=False
):
    """Scan all pages of a PDF concurrently and collect per-page info.

    Args:
        pdf: an already-open Pdf object.
        infile: path to the PDF on disk (process workers reopen it).
        progbar: whether to display a progress bar.
        max_workers: worker-count cap; ``None`` means use
            available_cpu_count().
        check_pages: which pages to check.
        detailed_analysis: forwarded to the per-page worker.

    Returns:
        A list of page-info results indexed by page number.

    Raises:
        InputFileError: if any page could not be read.
    """
    global worker_pdf  # pylint: disable=global-statement
    pages = [None] * len(pdf.pages)

    def update_pageinfo(result, pbar):
        page = result
        if not page:
            # Bug fix: message previously read "Could read a page in the PDF"
            raise InputFileError("Could not read a page in the PDF")
        pages[page.pageno] = page
        pbar.update()

    if max_workers is None:
        max_workers = available_cpu_count()

    total = len(pdf.pages)
    contexts = ((n, infile, check_pages, detailed_analysis) for n in range(total))

    use_threads = False  # No performance gain if threaded due to GIL
    n_workers = min(1 + len(pages) // 4, max_workers)
    if n_workers == 1:
        # But if we decided on only one worker, there is no point in using
        # a separate process.
        use_threads = True

    try:
        exec_progress_pool(
            use_threads=use_threads,
            max_workers=n_workers,
            tqdm_kwargs=dict(
                total=total, desc="Scanning contents", unit='page',
                disable=not progbar,
            ),
            task_initializer=partial(
                _pdf_pageinfo_sync_init, infile, logging.getLogger('pdfminer').level
            ),
            task=_pdf_pageinfo_sync,
            task_arguments=contexts,
            task_finished=update_pageinfo,
        )
    finally:
        if worker_pdf and use_threads:
            assert n_workers == 1, "Should have only one worker when threaded"
            # This is messy, but if we ran in thread, close worker_pdf
            worker_pdf.close()
    return pages
def run_pipeline(options, *, plugin_manager, api=False):
    """Run the OCR pipeline end to end and return an ExitCode.

    Args:
        options: parsed options namespace; ``options.jobs`` is defaulted to
            available_cpu_count() if unset.
        plugin_manager: plugin manager; created from ``options.plugins`` if
            falsy.
        api: when True, exceptions propagate to the caller instead of being
            converted to exit codes (the ``NeverRaise`` trick below disables
            each handler).

    Returns:
        ExitCode.ok on success, or a specific ExitCode on failure.
    """
    # Any changes to options will not take effect for options that are already
    # bound to function parameters in the pipeline. (For example
    # options.input_file, options.pdf_renderer are already bound.)
    if not options.jobs:
        options.jobs = available_cpu_count()
    if not plugin_manager:
        plugin_manager = get_plugin_manager(options.plugins)

    work_folder = Path(mkdtemp(prefix="ocrmypdf.io."))
    debug_log_handler = None
    if (
        (options.keep_temporary_files or options.verbose >= 1)
        and not os.environ.get('PYTEST_CURRENT_TEST', '')
        and not api
    ):
        # Debug log for command line interface only with verbose output
        # See https://github.com/pytest-dev/pytest/issues/5502 for why we skip this
        # when pytest is running
        debug_log_handler = configure_debug_logging(
            Path(work_folder) / "debug.log"
        )  # pragma: no cover

    pikepdf_enable_mmap()
    executor = setup_executor(plugin_manager)
    try:
        check_requested_output_file(options)
        start_input_file, original_filename = create_input_file(options, work_folder)

        # Triage image or pdf
        origin_pdf = triage(
            original_filename, start_input_file, work_folder / 'origin.pdf', options
        )

        # Gather pdfinfo and create context
        pdfinfo = get_pdfinfo(
            origin_pdf,
            executor=executor,
            detailed_analysis=options.redo_ocr,
            progbar=options.progress_bar,
            max_workers=options.jobs if not options.use_threads else 1,  # To help debug
            check_pages=options.pages,
        )

        context = PdfContext(options, work_folder, origin_pdf, pdfinfo, plugin_manager)

        # Validate options are okay for this pdf
        validate_pdfinfo_options(context)

        # Execute the pipeline
        exec_concurrent(context, executor)

        if options.output_file == '-':
            log.info("Output sent to stdout")
        elif (
            hasattr(options.output_file, 'writable') and options.output_file.writable()
        ):
            log.info("Output written to stream")
        elif samefile(options.output_file, os.devnull):
            pass  # Say nothing when sending to dev null
        else:
            if options.output_type.startswith('pdfa'):
                pdfa_info = file_claims_pdfa(options.output_file)
                if pdfa_info['pass']:
                    log.info(
                        "Output file is a %s (as expected)", pdfa_info['conformance']
                    )
                else:
                    log.warning(
                        "Output file is okay but is not PDF/A (seems to be %s)",
                        pdfa_info['conformance'],
                    )
                    return ExitCode.pdfa_conversion_failed
            if not check_pdf(options.output_file):
                log.warning('Output file: The generated PDF is INVALID')
                return ExitCode.invalid_output_pdf
            report_output_file_size(options, start_input_file, options.output_file)
    except (KeyboardInterrupt if not api else NeverRaise):
        # Cleanup: the bound exception variable was unused here.
        if options.verbose >= 1:
            log.exception("KeyboardInterrupt")
        else:
            log.error("KeyboardInterrupt")
        return ExitCode.ctrl_c
    except (ExitCodeException if not api else NeverRaise) as e:
        if options.verbose >= 1:
            log.exception("ExitCodeException")
        elif str(e):
            log.error("%s: %s", type(e).__name__, str(e))
        else:
            log.error(type(e).__name__)
        return e.exit_code
    except (Exception if not api else NeverRaise):  # pylint: disable=broad-except
        # Cleanup: the bound exception variable was unused here.
        log.exception("An exception occurred while executing the pipeline")
        return ExitCode.other_error
    finally:
        if debug_log_handler:
            try:
                debug_log_handler.close()
                log.removeHandler(debug_log_handler)
            except EnvironmentError as e:
                print(e, file=sys.stderr)
        cleanup_working_files(work_folder, options)

    return ExitCode.ok