def get_orientation(input_file, options):
    """Return ``TesseractOcrEngine.get_orientation`` run under the test patch.

    The call is made while the ``patch_tesseract_run()`` context manager is
    active (defined elsewhere; presumably it swaps in a test double for the
    Tesseract runner -- confirm against its definition).

    Args:
        input_file: Forwarded unchanged to the engine.
        options: Forwarded unchanged to the engine.

    Returns:
        Whatever ``TesseractOcrEngine.get_orientation`` returns.
    """
    with patch_tesseract_run():
        orientation = TesseractOcrEngine.get_orientation(input_file, options)
        return orientation
 def generate_pdf(input_file, output_pdf, output_text, options):
     """Call ``TesseractOcrEngine.generate_pdf`` under the test patch.

     The engine is invoked while the ``patch_tesseract_run()`` context manager
     is active (defined elsewhere; presumably it swaps in a test double for
     the Tesseract runner -- confirm against its definition).

     Args:
         input_file: Forwarded unchanged to the engine.
         output_pdf: Forwarded unchanged to the engine.
         output_text: Forwarded unchanged to the engine.
         options: Forwarded unchanged to the engine.
     """
     engine_args = (input_file, output_pdf, output_text, options)
     with patch_tesseract_run():
         TesseractOcrEngine.generate_pdf(*engine_args)
예제 #3
0
 def generate_pdf(input_file, output_pdf, output_text, options):
     """Call ``TesseractOcrEngine.generate_pdf`` with the caching runner patched in.

     For the duration of the call, ``ocrmypdf._exec.tesseract.run`` is replaced
     by ``cached_run`` with ``options`` pre-bound as its first argument.

     Args:
         input_file: Forwarded unchanged to the engine.
         output_pdf: Forwarded unchanged to the engine.
         output_text: Forwarded unchanged to the engine.
         options: Forwarded to the engine and bound into the replacement runner.
     """
     caching_runner = partial(cached_run, options)
     with patch('ocrmypdf._exec.tesseract.run', new=caching_runner):
         TesseractOcrEngine.generate_pdf(
             input_file, output_pdf, output_text, options
         )
예제 #4
0
def cached_run(options, run_args, **run_kwargs):
    """Run a Tesseract command line, replaying a cached result when one exists.

    The command line is parsed with the module-level ``parser`` and the cache
    folder is chosen by ``get_cache_folder``. A cache entry consists of
    ``stdout.bin``, ``stderr.bin`` and one ``<configfile>.bin`` per cached
    output file; the presence of ``stderr.bin`` marks the entry as usable.
    Every cache miss also appends one JSON line describing the run to
    ``manifest.jsonl`` under ``CACHE_ROOT``.

    Args:
        options: Object whose ``input_file`` attribute names the source file;
            it is passed to ``get_cache_folder`` and recorded in the manifest.
        run_args: Command line (program followed by its arguments). Every item
            is converted with ``str`` before use.
        **run_kwargs: Keyword arguments for ``run``. On a cache miss ``stdout``
            and ``stderr`` are replaced by ``PIPE`` and ``check`` must be
            present and truthy.

    Returns:
        On a cache hit, a ``CompletedProcess`` with return code 0 built from
        the cached streams; otherwise the object returned by ``run``.

    Raises:
        CalledProcessError: Logged and re-raised when the uncached run fails.
    """
    run_args = [str(arg) for arg in run_args]  # flatten PosixPaths
    args = parser.parse_args(run_args[1:])

    # Image comes from standard input: run directly, without caching.
    if args.imagename in ('stdin', '-'):
        return run(run_args, **run_kwargs)

    source_file = options.input_file
    cache_folder = get_cache_folder(source_file, run_args, args)
    cache_folder.mkdir(parents=True, exist_ok=True)

    log.debug(f"Using Tesseract cache {cache_folder}")

    # Output files mirrored between the working directory and the cache.
    # The hit and the miss path share this single list, so a hit never asks
    # for a file that a miss would not have stored.
    if args.outputbase == 'stdout':
        cached_outputs = []
    else:
        cached_outputs = [
            configfile
            for configfile in (args.configfiles or ['txt'])
            if configfile in ('hocr', 'pdf', 'txt')
        ]

    stdout_cache = cache_folder / 'stdout.bin'
    stderr_cache = cache_folder / 'stderr.bin'

    if stderr_cache.exists():
        log.debug("Cache HIT")

        for configfile in cached_outputs:
            # cp {cache}/{configfile}.bin -> {outputbase}.{configfile}
            shutil.copy(
                str(cache_folder / configfile) + '.bin',
                args.outputbase + '.' + configfile,
            )
        return CompletedProcess(
            args=run_args,
            returncode=0,
            stdout=stdout_cache.read_bytes(),
            stderr=stderr_cache.read_bytes(),
        )

    log.debug("Cache MISS")

    cache_kwargs = {
        k: v
        for k, v in run_kwargs.items() if k not in ('stdout', 'stderr')
    }
    # A hit always reports returncode 0, so only runs that raise on failure
    # may be cached.
    assert cache_kwargs['check']
    try:
        p = run(run_args, stdout=PIPE, stderr=PIPE, **cache_kwargs)
    except CalledProcessError as e:
        log.exception(e)
        raise  # Pass exception onward

    # Update cache. stderr.bin is written last because its presence is what
    # marks the entry as a hit; an interrupted update is then retried on the
    # next call instead of being replayed half-filled.
    for configfile in cached_outputs:
        # cp {outputbase}.{configfile} -> {cache}/{configfile}.bin
        shutil.copy(
            args.outputbase + '.' + configfile,
            str(cache_folder / configfile) + '.bin',
        )
    stdout_cache.write_bytes(p.stdout)
    stderr_cache.write_bytes(p.stderr)

    manifest = {
        'tesseract_version': TesseractOcrEngine.version().replace('\n', ' '),
        'platform': platform.platform(),
        'python': platform.python_version(),
        'argv_slug': cache_folder.name,
        'sourcefile': str(Path(source_file).relative_to(TESTS_ROOT)),
        # Rewrite per-run temporary directories to a stable placeholder.
        'args': [
            re.sub(r'.*/ocrmypdf[.]io[.][^/]+[/](.*)', r'$TMPDIR/\1', arg)
            for arg in run_args[1:]
        ],
    }
    with (Path(CACHE_ROOT) / 'manifest.jsonl').open('a') as f:
        json.dump(manifest, f)
        f.write('\n')
    return p
예제 #5
0
 def get_orientation(input_file, options):
     """Return ``TesseractOcrEngine.get_orientation`` run with the caching runner.

     For the duration of the call, ``ocrmypdf._exec.tesseract.run`` is replaced
     by ``cached_run`` with ``options`` pre-bound as its first argument.

     Args:
         input_file: Forwarded unchanged to the engine.
         options: Forwarded to the engine and bound into the replacement runner.

     Returns:
         Whatever ``TesseractOcrEngine.get_orientation`` returns.
     """
     caching_runner = partial(cached_run, options)
     with patch('ocrmypdf._exec.tesseract.run', new=caching_runner):
         orientation = TesseractOcrEngine.get_orientation(input_file, options)
         return orientation
예제 #6
0
 def generate_pdf(input_file, output_pdf, output_text, options):
     """Call ``TesseractOcrEngine.generate_pdf`` with ``run`` replaced.

     For the duration of the call, ``ocrmypdf._exec.tesseract.run`` is replaced
     by ``raise_size_exception`` (defined elsewhere; judging by its name it
     simulates a size-related failure -- confirm against its definition).

     Args:
         input_file: Forwarded unchanged to the engine.
         output_pdf: Forwarded unchanged to the engine.
         output_text: Forwarded unchanged to the engine.
         options: Forwarded unchanged to the engine.
     """
     engine_args = (input_file, output_pdf, output_text, options)
     with patch('ocrmypdf._exec.tesseract.run', new=raise_size_exception):
         TesseractOcrEngine.generate_pdf(*engine_args)
예제 #7
0
 def get_orientation(input_file, options):
     """Return ``TesseractOcrEngine.get_orientation`` run with ``run`` replaced.

     For the duration of the call, ``ocrmypdf._exec.tesseract.run`` is replaced
     by ``raise_size_exception`` (defined elsewhere; judging by its name it
     simulates a size-related failure -- confirm against its definition).

     Args:
         input_file: Forwarded unchanged to the engine.
         options: Forwarded unchanged to the engine.

     Returns:
         Whatever ``TesseractOcrEngine.get_orientation`` returns.
     """
     with patch('ocrmypdf._exec.tesseract.run', new=raise_size_exception):
         orientation = TesseractOcrEngine.get_orientation(input_file, options)
         return orientation
예제 #8
0
 def generate_pdf(input_file, output_pdf, output_text, options):
     """Call ``TesseractOcrEngine.generate_pdf`` with ``run`` replaced.

     For the duration of the call, ``ocrmypdf._exec.tesseract.run`` is replaced
     by ``bad_utf8`` (defined elsewhere; judging by its name it produces
     invalid UTF-8 output -- confirm against its definition).

     Args:
         input_file: Forwarded unchanged to the engine.
         output_pdf: Forwarded unchanged to the engine.
         output_text: Forwarded unchanged to the engine.
         options: Forwarded unchanged to the engine.
     """
     engine_args = (input_file, output_pdf, output_text, options)
     with patch('ocrmypdf._exec.tesseract.run', new=bad_utf8):
         TesseractOcrEngine.generate_pdf(*engine_args)
예제 #9
0
 def generate_hocr(input_file, output_hocr, output_text, options):
     """Call ``TesseractOcrEngine.generate_hocr`` with ``run`` replaced.

     For the duration of the call, ``ocrmypdf._exec.tesseract.run`` is replaced
     by ``raise_crash`` (defined elsewhere; judging by its name it simulates a
     Tesseract crash -- confirm against its definition).

     Args:
         input_file: Forwarded unchanged to the engine.
         output_hocr: Forwarded unchanged to the engine.
         output_text: Forwarded unchanged to the engine.
         options: Forwarded unchanged to the engine.
     """
     engine_args = (input_file, output_hocr, output_text, options)
     with patch('ocrmypdf._exec.tesseract.run', new=raise_crash):
         TesseractOcrEngine.generate_hocr(*engine_args)