def get_orientation(input_file, options):
    """Return ``TesseractOcrEngine.get_orientation`` under ``patch_tesseract_run``.

    The call is made while the context manager returned by
    ``patch_tesseract_run()`` is active; its result is returned unchanged.
    """
    patched = patch_tesseract_run()
    with patched:
        return TesseractOcrEngine.get_orientation(input_file, options)
def generate_pdf(input_file, output_pdf, output_text, options):
    """Call ``TesseractOcrEngine.generate_pdf`` under ``patch_tesseract_run``.

    The call is made while the context manager returned by
    ``patch_tesseract_run()`` is active. Nothing is returned.
    """
    patched = patch_tesseract_run()
    with patched:
        TesseractOcrEngine.generate_pdf(
            input_file,
            output_pdf,
            output_text,
            options,
        )
def generate_pdf(input_file, output_pdf, output_text, options):
    """Call ``TesseractOcrEngine.generate_pdf`` with ``run`` routed via the cache.

    While the call is in progress, ``ocrmypdf._exec.tesseract.run`` is
    replaced by ``cached_run`` with ``options`` bound as its first argument.
    Nothing is returned.
    """
    run_via_cache = partial(cached_run, options)
    with patch('ocrmypdf._exec.tesseract.run', new=run_via_cache):
        TesseractOcrEngine.generate_pdf(
            input_file,
            output_pdf,
            output_text,
            options,
        )
def cached_run(options, run_args, **run_kwargs):
    """Run a Tesseract command line through an on-disk result cache.

    Intended as a drop-in replacement for ``run``. The command line is parsed
    with the module's ``parser``; the cache folder is chosen by
    ``get_cache_folder`` from the source file and the arguments. A folder that
    already contains ``stderr.bin`` counts as a cache hit.

    Args:
        options: Object whose ``input_file`` attribute names the source file.
        run_args: Command line, program name first. Each item is converted
            with ``str`` before use.
        **run_kwargs: Keyword arguments for ``run``. On a cache miss,
            ``stdout``/``stderr`` are dropped (both are forced to ``PIPE``)
            and ``check`` must be present and truthy.

    Returns:
        On a hit, a ``CompletedProcess`` with ``returncode=0`` and the cached
        stdout/stderr bytes. Otherwise, whatever ``run`` returned.

    Raises:
        CalledProcessError: Logged and re-raised when the real run fails.
    """
    # Only these output kinds are ever stored in the cache, so only these can
    # be restored from it.
    cacheable_outputs = ('hocr', 'pdf', 'txt')

    run_args = [str(arg) for arg in run_args]  # flatten PosixPaths
    args = parser.parse_args(run_args[1:])

    # Bypass the cache entirely when the image name is 'stdin' or '-'.
    if args.imagename in ('stdin', '-'):
        return run(run_args, **run_kwargs)

    source_file = options.input_file
    cache_folder = get_cache_folder(source_file, run_args, args)
    cache_folder.mkdir(parents=True, exist_ok=True)

    log.debug(f"Using Tesseract cache {cache_folder}")

    stdout_file = cache_folder / 'stdout.bin'
    stderr_file = cache_folder / 'stderr.bin'

    # (tesseract output file, cached copy) pairs shared by the hit and miss
    # paths. With no config files given, 'txt' is assumed. The parsed args are
    # not mutated.
    if args.outputbase == 'stdout':
        output_pairs = []
    else:
        output_pairs = [
            (
                args.outputbase + '.' + configfile,
                str(cache_folder / configfile) + '.bin',
            )
            for configfile in (args.configfiles or ['txt'])
            if configfile in cacheable_outputs
        ]

    if stderr_file.exists():
        log.debug("Cache HIT")

        # Replicate the output files: cp cache -> output
        for tessfile, cached_copy in output_pairs:
            shutil.copy(cached_copy, tessfile)
        return CompletedProcess(
            args=run_args,
            returncode=0,
            stdout=stdout_file.read_bytes(),
            stderr=stderr_file.read_bytes(),
        )

    log.debug("Cache MISS")

    cache_kwargs = {
        k: v for k, v in run_kwargs.items() if k not in ('stdout', 'stderr')
    }
    assert cache_kwargs['check']
    try:
        p = run(run_args, stdout=PIPE, stderr=PIPE, **cache_kwargs)
    except CalledProcessError as e:
        log.exception(e)
        raise  # Pass exception onward

    # Update cache: cp {outputbase}.{configfile} -> {cache}/{configfile}.bin
    for tessfile, cached_copy in output_pairs:
        shutil.copy(tessfile, cached_copy)
    stdout_file.write_bytes(p.stdout)
    # stderr.bin is the cache-hit marker, so write it last: a failure above
    # must not leave behind an entry that looks complete.
    stderr_file.write_bytes(p.stderr)

    manifest = {
        'tesseract_version': TesseractOcrEngine.version().replace('\n', ' '),
        'platform': platform.platform(),
        'python': platform.python_version(),
        'argv_slug': cache_folder.name,
        'sourcefile': str(Path(source_file).relative_to(TESTS_ROOT)),
        # Rewrite paths under an "ocrmypdf.io.*" directory to "$TMPDIR/...".
        'args': [
            re.sub(r'.*/ocrmypdf[.]io[.][^/]+[/](.*)', r'$TMPDIR/\1', arg)
            for arg in run_args[1:]
        ],
    }

    # Append one JSON object per line to the shared manifest.
    with (Path(CACHE_ROOT) / 'manifest.jsonl').open('a') as f:
        json.dump(manifest, f)
        f.write('\n')
        f.flush()
    return p
def get_orientation(input_file, options):
    """Return ``TesseractOcrEngine.get_orientation`` with ``run`` routed via the cache.

    While the call is in progress, ``ocrmypdf._exec.tesseract.run`` is
    replaced by ``cached_run`` with ``options`` bound as its first argument.
    The engine's result is returned unchanged.
    """
    run_via_cache = partial(cached_run, options)
    with patch('ocrmypdf._exec.tesseract.run', new=run_via_cache):
        return TesseractOcrEngine.get_orientation(input_file, options)
def generate_pdf(input_file, output_pdf, output_text, options):
    """Call ``TesseractOcrEngine.generate_pdf`` with ``run`` swapped out.

    While the call is in progress, ``ocrmypdf._exec.tesseract.run`` is
    replaced by ``raise_size_exception``. Nothing is returned.
    """
    replacement = raise_size_exception
    with patch('ocrmypdf._exec.tesseract.run', new=replacement):
        TesseractOcrEngine.generate_pdf(
            input_file,
            output_pdf,
            output_text,
            options,
        )
def get_orientation(input_file, options):
    """Return ``TesseractOcrEngine.get_orientation`` with ``run`` swapped out.

    While the call is in progress, ``ocrmypdf._exec.tesseract.run`` is
    replaced by ``raise_size_exception``. The engine's result is returned
    unchanged.
    """
    replacement = raise_size_exception
    with patch('ocrmypdf._exec.tesseract.run', new=replacement):
        return TesseractOcrEngine.get_orientation(input_file, options)
def generate_pdf(input_file, output_pdf, output_text, options):
    """Call ``TesseractOcrEngine.generate_pdf`` with ``run`` swapped out.

    While the call is in progress, ``ocrmypdf._exec.tesseract.run`` is
    replaced by ``bad_utf8``. Nothing is returned.
    """
    replacement = bad_utf8
    with patch('ocrmypdf._exec.tesseract.run', new=replacement):
        TesseractOcrEngine.generate_pdf(
            input_file,
            output_pdf,
            output_text,
            options,
        )
def generate_hocr(input_file, output_hocr, output_text, options):
    """Call ``TesseractOcrEngine.generate_hocr`` with ``run`` swapped out.

    While the call is in progress, ``ocrmypdf._exec.tesseract.run`` is
    replaced by ``raise_crash``. Nothing is returned.
    """
    replacement = raise_crash
    with patch('ocrmypdf._exec.tesseract.run', new=replacement):
        TesseractOcrEngine.generate_hocr(
            input_file,
            output_hocr,
            output_text,
            options,
        )