예제 #1
0
def get_orientation(input_file: Path, engine_mode: Optional[int], timeout: float):
    """Ask Tesseract (``--psm 0``) for the orientation of an image.

    input_file -- image to analyze
    engine_mode -- engine mode, passed through to tess_base_args
    timeout -- timeout (seconds) for the Tesseract subprocess

    Returns an OrientationConfidence built from the 'Orientation in degrees'
    and 'Orientation confidence' lines of Tesseract's output (0 for either
    one that is missing). A zero angle with zero confidence is returned when
    Tesseract times out, or when it fails while reporting 'Too few
    characters' or 'Image too large'.

    Raises SubprocessOutputError for any other Tesseract failure.
    """
    command = tess_base_args(['osd'], engine_mode) + [
        '--psm', '0', fspath(input_file), 'stdout'
    ]

    try:
        completed = run(
            command, stdout=PIPE, stderr=STDOUT, timeout=timeout, check=True
        )
    except TimeoutExpired:
        return OrientationConfidence(angle=0, confidence=0.0)
    except CalledProcessError as e:
        tesseract_log_output(e.stdout)
        tesseract_log_output(e.stderr)
        # These failures just mean the page could not be analyzed
        tolerated = (
            b'Too few characters. Skipping this page',
            b'Image too large',
        )
        if any(marker in e.output for marker in tolerated):
            return OrientationConfidence(0, 0)
        raise SubprocessOutputError() from e

    # Only lines containing exactly one colon are treated as "key: value"
    fields = {}
    for raw_line in completed.stdout.decode().splitlines():
        key, colon, value = raw_line.strip().partition(':')
        if colon and ':' not in value:
            fields[key.strip()] = value.strip()

    return OrientationConfidence(
        angle=int(fields.get('Orientation in degrees', 0)),
        confidence=float(fields.get('Orientation confidence', 0)),
    )
예제 #2
0
def get_languages():
    """Return the set of languages listed by ``tesseract --list-langs``.

    The first line of Tesseract's output is treated as a header and dropped;
    each remaining line, stripped of surrounding whitespace, is one entry.

    Raises MissingDependencyError if Tesseract exits with an error, prints a
    line starting with 'Error', or prints nothing at all.
    """

    def lang_error(output):
        msg = (
            "Tesseract failed to report available languages.\n"
            "Output from Tesseract:\n"
            "-----------\n"
        )
        msg += output
        return msg

    args_tess = ['tesseract', '--list-langs']
    try:
        proc = run(
            args_tess,
            text=True,
            stdout=PIPE,
            stderr=STDOUT,
            logs_errors_to_stdout=True,
            check=True,
        )
    except CalledProcessError as e:
        raise MissingDependencyError(lang_error(e.output)) from e

    output = proc.stdout
    lines = output.splitlines()
    # Empty output has no header line to unpack; report it as a failed
    # language query instead of letting a bare ValueError escape.
    if not lines or any(line.startswith('Error') for line in lines):
        raise MissingDependencyError(lang_error(output))
    _header, *rest = lines
    return {lang.strip() for lang in rest}
예제 #3
0
def generate_pdf(
    *,
    input_file: Path,
    output_pdf: Path,
    output_text: Path,
    languages: List[str],
    engine_mode: int,
    tessconfig: List[str],
    timeout: float,
    pagesegmode: int,
    user_words,
    user_patterns,
):
    """Use Tesseract to render a text-only PDF plus a sidecar text file.

    input_file -- image to analyze
    output_pdf -- file to generate
    output_text -- OCR text file
    languages -- list of languages to consider
    engine_mode -- engine mode, passed through to tess_base_args
    tessconfig -- extra tesseract configuration, appended to the command line
    timeout -- timeout (seconds)
    pagesegmode -- value for ``--psm``; omitted when None
    user_words -- value for ``--user-words``; omitted when falsy
    user_patterns -- value for ``--user-patterns``; omitted when falsy

    On timeout, or when Tesseract fails with 'Image too large', the page is
    handed to use_skip_page instead. Any other Tesseract failure raises
    SubprocessOutputError.
    """
    command = tess_base_args(languages, engine_mode)

    if pagesegmode is not None:
        command += ['--psm', str(pagesegmode)]
    command += ['-c', 'textonly_pdf=1']
    if user_words:
        command += ['--user-words', user_words]
    if user_patterns:
        command += ['--user-patterns', user_patterns]

    # Tesseract appends the suffixes itself, so it gets the extensionless path
    prefix, _extension = os.path.splitext(output_pdf)
    sidecar = prefix + '.txt'

    # Reminder: test suite tesseract test plugins might break after any
    # changes to the number or order of parameters here
    command += [input_file, prefix, 'pdf', 'txt'] + tessconfig

    try:
        completed = run(
            command, stdout=PIPE, stderr=STDOUT, timeout=timeout, check=True
        )
        if os.path.exists(sidecar):
            shutil.move(sidecar, output_text)
    except TimeoutExpired:
        page_timedout(timeout)
        use_skip_page(output_pdf, output_text)
    except CalledProcessError as e:
        tesseract_log_output(e.output)
        if b'Image too large' not in e.output:
            raise SubprocessOutputError() from e
        use_skip_page(output_pdf, output_text)
    else:
        tesseract_log_output(completed.stdout)
예제 #4
0
def rasterize_pdf(
    input_file: os.PathLike,
    output_file: os.PathLike,
    *,
    raster_device: str,
    raster_dpi: Resolution,
    pageno: int = 1,
    page_dpi: Optional[Resolution] = None,
    rotation: Optional[int] = None,
    filter_vector: bool = False,
):
    """Rasterize one page of a PDF at resolution raster_dpi in canvas units.

    input_file -- PDF handed to Ghostscript
    output_file -- where the resulting image is saved
    raster_device -- Ghostscript output device (``-sDEVICE``)
    raster_dpi -- resolution to render at (``.round(6)`` is applied first)
    pageno -- page to render; used as both first and last page
    page_dpi -- DPI recorded in the saved image; falls back to raster_dpi
    rotation -- clockwise angle; 90, 180 and 270 transpose the image
    filter_vector -- when true, ``-dFILTERVECTOR`` is passed to Ghostscript

    Raises SubprocessOutputError if Ghostscript exits with an error.
    """
    raster_dpi = raster_dpi.round(6)
    if not page_dpi:
        page_dpi = raster_dpi

    # The image is written to Ghostscript's stdout ('-o -'), so Ghostscript's
    # own messages are redirected to stderr ('-sstdout=%stderr').
    args_gs = ([
        GS,
        '-dQUIET',
        '-dSAFER',
        '-dBATCH',
        '-dNOPAUSE',
        f'-sDEVICE={raster_device}',
        f'-dFirstPage={pageno}',
        f'-dLastPage={pageno}',
        f'-r{raster_dpi.x:f}x{raster_dpi.y:f}',
    ] + (['-dFILTERVECTOR'] if filter_vector else []) + [
        '-o',
        '-',
        '-sstdout=%stderr',
        '-dAutoRotatePages=/None',  # Probably has no effect on raster
        '-f',
        fspath(input_file),
    ])

    try:
        p = run(args_gs, stdout=PIPE, stderr=PIPE, check=True)
    except CalledProcessError as e:
        log.error(e.stderr.decode(errors='replace'))
        # Chain explicitly so the Ghostscript failure is recorded as the cause
        raise SubprocessOutputError('Ghostscript rasterizing failed') from e
    else:
        stderr = p.stderr.decode(errors='replace')
        if _gs_error_reported(stderr):
            log.error(stderr)

    with Image.open(BytesIO(p.stdout)) as im:
        if rotation is not None:
            log.debug("Rotating output by %i", rotation)
            # rotation is a clockwise angle and Image.ROTATE_* is
            # counterclockwise so this cancels out the rotation
            if rotation == 90:
                im = im.transpose(Image.ROTATE_90)
            elif rotation == 180:
                im = im.transpose(Image.ROTATE_180)
            elif rotation == 270:
                im = im.transpose(Image.ROTATE_270)
            # A quarter turn swaps the axes, so the recorded DPI must swap too
            if rotation % 180 == 90:
                page_dpi = page_dpi.flip_axis()
        im.save(fspath(output_file), dpi=page_dpi)
예제 #5
0
def generate_hocr(
    *,
    input_file: Path,
    output_hocr: Path,
    output_text: Path,
    languages: List[str],
    engine_mode: int,
    tessconfig: List[str],
    timeout: float,
    pagesegmode: int,
    user_words,
    user_patterns,
):
    """Use Tesseract to produce an hOCR file plus a sidecar text file.

    input_file -- image to analyze
    output_hocr -- hOCR file to generate
    output_text -- OCR text file
    languages -- list of languages to consider
    engine_mode -- engine mode, passed through to tess_base_args
    tessconfig -- extra tesseract configuration, appended to the command line
    timeout -- timeout (seconds)
    pagesegmode -- value for ``--psm``; omitted when None
    user_words -- value for ``--user-words``; omitted when falsy
    user_patterns -- value for ``--user-patterns``; omitted when falsy

    On timeout, or when Tesseract fails with 'Image too large', the outputs
    are produced by _generate_null_hocr instead. Any other Tesseract failure
    raises SubprocessOutputError.
    """
    prefix = output_hocr.with_suffix('')

    args_tesseract = tess_base_args(languages, engine_mode)

    if pagesegmode is not None:
        args_tesseract.extend(['--psm', str(pagesegmode)])

    if user_words:
        args_tesseract.extend(['--user-words', user_words])

    if user_patterns:
        args_tesseract.extend(['--user-patterns', user_patterns])

    # Reminder: test suite tesseract test plugins will break after any changes
    # to the number or order of parameters here
    args_tesseract.extend([input_file, prefix, 'hocr', 'txt'] + tessconfig)
    try:
        p = run(args_tesseract, stdout=PIPE, stderr=STDOUT, timeout=timeout, check=True)
        stdout = p.stdout
    except TimeoutExpired:
        # Generate a HOCR file with no recognized text if tesseract times out
        # Temporary workaround to hocrTransform not being able to function if
        # it does not have a valid hOCR file.
        page_timedout(timeout)
        _generate_null_hocr(output_hocr, output_text, input_file)
    except CalledProcessError as e:
        tesseract_log_output(e.output)
        if b'Image too large' in e.output:
            _generate_null_hocr(output_hocr, output_text, input_file)
            return

        raise SubprocessOutputError() from e
    else:
        tesseract_log_output(stdout)
        # The sidecar text file will get the suffix .txt; rename it to
        # whatever caller wants it named. The suffix is appended rather than
        # set with with_suffix(), which would replace the tail of a prefix
        # that still contains a dot (e.g. 'page.1' -> 'page.txt').
        sidecar = prefix.with_name(prefix.name + '.txt')
        if sidecar.exists():
            shutil.move(sidecar, output_text)
예제 #6
0
def convert_group(*, cwd, infiles, out_prefix):
    """Run ``jbig2`` in symbol mode over a group of input files.

    cwd -- working directory for the subprocess
    infiles -- input file arguments, appended to the command line
    out_prefix -- value for jbig2's ``-b`` option

    Returns the completed process; check_returncode() is called on it first,
    so a non-zero exit status raises instead of returning.
    """
    # '-s' selects symbol mode (lossy). '-r', refinement mode (lossless symbol
    # mode), is left out because it is currently disabled in jbig2.
    command = ['jbig2', '-b', out_prefix, '-s', '-p', *infiles]
    completed = run(command, cwd=cwd, stdout=PIPE, stderr=PIPE)
    completed.check_returncode()
    return completed
예제 #7
0
def quantize(input_file: Path, output_file: Path, quality_min: int,
             quality_max: int):
    """Quantize an image with ``pngquant``, streaming it through stdin/stdout.

    input_file -- image to quantize; opened via input_as_png
    output_file -- receives pngquant's output when pngquant exits with 0
    quality_min, quality_max -- bounds for pngquant's ``--quality`` range

    Nothing is written when pngquant exits with a non-zero status.
    """
    command = [
        'pngquant',
        '--force',
        '--skip-if-larger',
        '--quality',
        f'{quality_min}-{quality_max}',
        '--',  # pngquant: stop processing arguments
        '-',  # pngquant: stream input and output
    ]
    with input_as_png(input_file) as png_stream:
        completed = run(
            command, stdin=png_stream, stdout=PIPE, stderr=PIPE, check=False
        )

    if completed.returncode != 0:
        return

    # input_file could be the same as output_file, so the write is deferred
    # until the input stream has been closed
    output_file.write_bytes(completed.stdout)
예제 #8
0
def convert_single(*, cwd, infile, outfile):
    """Run ``jbig2 -p`` on one file, writing its stdout to ``outfile``.

    cwd -- working directory for the subprocess
    infile -- input file argument for jbig2
    outfile -- path opened for binary writing to receive jbig2's stdout

    Returns the completed process; check_returncode() is called on it first,
    so a non-zero exit status raises instead of returning.
    """
    command = ['jbig2', '-p', infile]
    with open(outfile, 'wb') as sink:
        completed = run(command, cwd=cwd, stdout=sink, stderr=PIPE)
    # Checked only after the output file has been closed
    completed.check_returncode()
    return completed
예제 #9
0
def cached_run(options, run_args, **run_kwargs):
    """Run a Tesseract command line, replaying a cached result when one exists.

    options -- object whose ``input_file`` attribute selects the cache folder
    run_args -- full command line; everything after element 0 is parsed with
        the module-level ``parser``
    run_kwargs -- keyword arguments for ``run``; on a cache miss ``stdout``
        and ``stderr`` are replaced with PIPE and ``check`` must be truthy

    When the image name is 'stdin' or '-', the command is run directly and
    nothing is cached.

    A cache folder containing 'stderr.bin' counts as a hit: the cached hocr,
    pdf and txt outputs are copied to ``{outputbase}.{configfile}`` and a
    CompletedProcess with return code 0 and the cached stdout/stderr is
    returned. On a miss the command is run, its output is stored in the cache
    folder, a record is appended to 'manifest.jsonl' under CACHE_ROOT, and
    the real result is returned.

    Raises CalledProcessError (after logging it) if the command fails on a
    cache miss.
    """
    # Only these output kinds are ever stored in, or restored from, the cache
    cacheable = ('hocr', 'pdf', 'txt')

    run_args = [str(arg) for arg in run_args]  # flatten PosixPaths
    args = parser.parse_args(run_args[1:])

    if args.imagename in ('stdin', '-'):
        return run(run_args, **run_kwargs)

    source_file = options.input_file
    cache_folder = get_cache_folder(source_file, run_args, args)
    cache_folder.mkdir(parents=True, exist_ok=True)

    log.debug(f"Using Tesseract cache {cache_folder}")

    if (cache_folder / 'stderr.bin').exists():
        log.debug("Cache HIT")

        # Replicate stdout/err
        if args.outputbase != 'stdout':
            if not args.configfiles:
                args.configfiles.append('txt')
            for configfile in args.configfiles:
                # Anything else was never cached on the miss path, so trying
                # to copy it here would fail with FileNotFoundError.
                if configfile not in cacheable:
                    continue
                # cp cache -> output
                tessfile = args.outputbase + '.' + configfile
                shutil.copy(str(cache_folder / configfile) + '.bin', tessfile)
        return CompletedProcess(
            args=run_args,
            returncode=0,
            stdout=(cache_folder / 'stdout.bin').read_bytes(),
            stderr=(cache_folder / 'stderr.bin').read_bytes(),
        )

    log.debug("Cache MISS")

    # stdout/stderr are always captured here so they can be written to cache
    cache_kwargs = {
        k: v
        for k, v in run_kwargs.items() if k not in ('stdout', 'stderr')
    }
    assert cache_kwargs['check']
    try:
        p = run(run_args, stdout=PIPE, stderr=PIPE, **cache_kwargs)
    except CalledProcessError as e:
        log.exception(e)
        raise  # Pass exception onward

    # Update cache
    (cache_folder / 'stdout.bin').write_bytes(p.stdout)
    (cache_folder / 'stderr.bin').write_bytes(p.stderr)

    if args.outputbase != 'stdout':
        if not args.configfiles:
            args.configfiles.append('txt')

        for configfile in args.configfiles:
            if configfile not in cacheable:
                continue
            # cp pwd/{outputbase}.{configfile} -> {cache}/{configfile}
            tessfile = args.outputbase + '.' + configfile
            shutil.copy(tessfile, str(cache_folder / configfile) + '.bin')

    manifest = {}
    manifest['tesseract_version'] = TesseractOcrEngine.version().replace(
        '\n', ' ')
    manifest['platform'] = platform.platform()
    manifest['python'] = platform.python_version()
    manifest['argv_slug'] = cache_folder.name
    manifest['sourcefile'] = str(Path(source_file).relative_to(TESTS_ROOT))

    def clean_sys_argv():
        # Rewrite temporary-folder paths to a stable '$TMPDIR/...' form
        for arg in run_args[1:]:
            yield re.sub(r'.*/ocrmypdf[.]io[.][^/]+[/](.*)', r'$TMPDIR/\1',
                         arg)

    manifest['args'] = list(clean_sys_argv())
    with (Path(CACHE_ROOT) / 'manifest.jsonl').open('a') as f:
        json.dump(manifest, f)
        f.write('\n')
        f.flush()
    return p