Exemplo n.º 1
0
def check_options(options):
    """Validate the installed Ghostscript against the requested options.

    Delegates the minimum-version check (9.15) to ``check_external_program``,
    rejects Ghostscript 9.24 and 9.51, warns when a Ghostscript older than
    9.20 is combined with non-Latin OCR languages and an output type other
    than ``pdf``, rewrites the ``pdfa`` output type to ``pdfa-2``, and refuses
    ``pdfa-3`` on a Ghostscript older than 9.19.

    Args:
        options: Options object. ``languages`` (a set) and ``output_type`` are
            read; ``output_type`` may be rewritten in place.

    Raises:
        MissingDependencyError: If the installed Ghostscript is one of the
            rejected releases, or is too old for ``--output-type pdfa-3``.
    """

    def version_key(version):
        # Compare versions numerically. Plain string comparison sorts
        # '10.0' before '9.20', which would misreport new releases as old.
        key = []
        for piece in version.split('.'):
            leading = piece[:len(piece) - len(piece.lstrip('0123456789'))]
            if not leading:
                break
            key.append(int(leading))
            if leading != piece:
                # Stop at the first component with a non-numeric suffix
                break
        return tuple(key)

    gs_version = ghostscript.version()
    check_external_program(
        program='gs',
        package='ghostscript',
        version_checker=gs_version,
        need_version='9.15',  # limited by Travis CI / Ubuntu 14.04 backports
    )
    if gs_version in ('9.24', '9.51'):
        raise MissingDependencyError(
            f"Ghostscript {gs_version} contains serious regressions and is not "
            "supported. Please upgrade to a newer version, or downgrade to the "
            "previous version.")

    gs_key = version_key(gs_version)

    # We have these constraints to check for.
    # 1. Ghostscript < 9.20 mangles multibyte Unicode
    # 2. hocr doesn't work on non-Latin languages (so don't select it)
    is_latin = options.languages.issubset(HOCR_OK_LANGS)
    if (gs_key < version_key('9.20') and options.output_type != 'pdf'
            and not is_latin):
        # https://bugs.ghostscript.com/show_bug.cgi?id=696874
        # Ghostscript < 9.20 fails to encode multibyte characters properly
        log.warning(
            f"The installed version of Ghostscript ({gs_version}) does not work "
            "correctly with the OCR languages you specified. Use --output-type pdf or "
            "upgrade to Ghostscript 9.20 or later to avoid this issue.")

    if options.output_type == 'pdfa':
        options.output_type = 'pdfa-2'

    # Reuse the version obtained above instead of querying Ghostscript again
    if options.output_type == 'pdfa-3' and gs_key < version_key('9.19'):
        raise MissingDependencyError(
            "--output-type pdfa-3 requires Ghostscript 9.19 or later")
Exemplo n.º 2
0
def get_languages():
    """Return the set of languages listed by ``tesseract --list-langs``.

    The first line of Tesseract's output is treated as a header and
    discarded; every remaining line, stripped of surrounding whitespace,
    becomes one entry of the result.

    Returns:
        A set of language names.

    Raises:
        MissingDependencyError: If Tesseract exits with an error, prints a
            line starting with ``Error``, or prints nothing at all.
    """

    def lang_error(output):
        return (
            "Tesseract failed to report available languages.\n"
            "Output from Tesseract:\n"
            "-----------\n"
        ) + output

    args_tess = ['tesseract', '--list-langs']
    try:
        proc = run(
            args_tess,
            text=True,
            stdout=PIPE,
            stderr=STDOUT,
            logs_errors_to_stdout=True,
            check=True,
        )
    except CalledProcessError as e:
        raise MissingDependencyError(lang_error(e.output)) from e

    output = proc.stdout
    lines = output.splitlines()
    # Empty output has no header line to unpack; report it as a failure
    # rather than crashing with ValueError.
    if not lines or any(line.startswith('Error') for line in lines):
        raise MissingDependencyError(lang_error(output))
    _header, *rest = lines
    return {lang.strip() for lang in rest}
Exemplo n.º 3
0
def _setup_unpaper_io(tmpdir: Path, input_file: Path) -> Tuple[Path, Path]:
    """Choose the input and output file paths for an unpaper run.

    Images whose mode is not ``1``, ``L`` or ``RGB`` are converted: a
    two-color palette image becomes mode ``1``, anything else becomes ``RGB``.
    If the image was converted, or the input file does not have a ``.png``
    suffix, the image is saved as ``input.png`` in ``tmpdir``; otherwise the
    original file is used as-is.

    Args:
        tmpdir: Directory in which to place the generated files.
        input_file: Path of the image to be processed.

    Returns:
        ``(input_png, output_pnm)``, where ``output_pnm`` is a path in
        ``tmpdir`` whose suffix matches the image mode.

    Raises:
        MissingDependencyError: If the image cannot be converted to a
            supported mode.
    """
    SUFFIXES = {'1': '.pbm', 'L': '.pgm', 'RGB': '.ppm'}
    with Image.open(input_file) as im:
        im_modified = False
        if im.mode not in SUFFIXES:
            log.info("Converting image to other colorspace")
            try:
                if im.mode == 'P' and len(im.getcolors()) == 2:
                    im = im.convert(mode='1')
                else:
                    im = im.convert(mode='RGB')
            except IOError as e:
                raise MissingDependencyError(
                    "Could not convert image with type " + im.mode) from e
            else:
                im_modified = True
        try:
            suffix = SUFFIXES[im.mode]
        except KeyError as e:
            # Bind the KeyError here: the name from the IOError handler above
            # is not available in this handler.
            raise MissingDependencyError(
                "Failed to convert image to a supported format.") from e

        if im_modified or input_file.suffix != '.png':
            input_png = tmpdir / 'input.png'
            im.save(input_png, format='PNG', compress_level=1)
        else:
            # No changes, PNG input, just use the file we already have
            input_png = input_file
        output_pnm = tmpdir / f'output{suffix}'
    return input_png, output_pnm
Exemplo n.º 4
0
def run(input_file, output_file, dpi, mode_args):
    """Run unpaper on ``input_file`` and write the result to ``output_file``.

    The image is converted to mode ``1`` or ``RGB`` if its mode is not one of
    ``1``, ``L`` or ``RGB``, saved into a temporary directory, and processed
    by unpaper with that directory as the working directory. The result is
    re-saved to ``output_file`` with the requested DPI.

    Args:
        input_file: Path of the image to process.
        output_file: Path where the processed image is written.
        dpi: Resolution passed to unpaper and stamped on the output image.
        mode_args: Additional command line arguments for unpaper.

    Raises:
        MissingDependencyError: If the image cannot be converted to a
            supported mode.
        CalledProcessError: If unpaper exits with an error.
        SubprocessOutputError: If unpaper's output file cannot be opened or
            re-saved.
    """
    args_unpaper = ['unpaper', '-v', '--dpi', str(dpi)] + mode_args

    SUFFIXES = {'1': '.pbm', 'L': '.pgm', 'RGB': '.ppm'}

    with TemporaryDirectory() as tmpdir, Image.open(input_file) as im:
        if im.mode not in SUFFIXES:
            log.info("Converting image to other colorspace")
            try:
                if im.mode == 'P' and len(im.getcolors()) == 2:
                    im = im.convert(mode='1')
                else:
                    im = im.convert(mode='RGB')
            except IOError as e:
                # No explicit close: the enclosing ``with`` closes the image
                raise MissingDependencyError(
                    "Could not convert image with type " + im.mode) from e

        try:
            suffix = SUFFIXES[im.mode]
        except KeyError as e:
            # Bind the KeyError here: the name from the IOError handler above
            # is not available in this handler.
            raise MissingDependencyError(
                "Failed to convert image to a supported format.") from e

        input_pnm = Path(tmpdir) / f'input{suffix}'
        output_pnm = Path(tmpdir) / f'output{suffix}'
        im.save(input_pnm, format='PPM')

        # To prevent any shenanigans from accepting arbitrary parameters in
        # --unpaper-args, we:
        # 1) run with cwd set to a tmpdir with only unpaper's files
        # 2) forbid the use of '/' in arguments, to prevent changing paths
        # 3) append absolute paths for the input and output file
        # This should ensure that a user cannot clobber some other file with
        # their unpaper arguments (whether intentionally or otherwise)
        args_unpaper.extend([os.fspath(input_pnm), os.fspath(output_pnm)])
        try:
            proc = external_run(
                args_unpaper,
                check=True,
                close_fds=True,
                universal_newlines=True,
                stderr=STDOUT,
                cwd=tmpdir,
                stdout=PIPE,
            )
        except CalledProcessError as e:
            log.debug(e.output)
            # Re-raise unchanged; ``raise e from e`` would make the exception
            # its own cause.
            raise
        else:
            log.debug(proc.stdout)
            # unpaper sets dpi to 72; fix this
            try:
                with Image.open(output_pnm) as imout:
                    imout.save(output_file, dpi=(dpi, dpi))
            except OSError:  # FileNotFoundError is a subclass of OSError
                raise SubprocessOutputError(
                    "unpaper: failed to produce the expected output file. " +
                    " Called with: " + str(args_unpaper)) from None
Exemplo n.º 5
0
def check_external_program(
    *,
    program: str,
    package: str,
    version_checker: Union[str, Callable],
    need_version: str,
    required_for: Optional[str] = None,
    recommended=False,
    version_parser: Type[Version] = LooseVersion,
):
    """Verify that an external program is installed and recent enough.

    Args:
        program: The name of the program to test.
        package: The name of a software package that typically supplies this
            program. Usually the same as program.
        version_checker: Either the installed version as a string, or a
            callable without arguments that retrieves it.
        need_version: The minimum required version.
        required_for: The name of an argument or feature that requires this
            program.
        recommended: If true, a missing or outdated program is reported but
            no exception is raised, so execution can continue.
        version_parser: A class used to parse and compare version numbers.
            Useful when version numbers do not follow standard conventions.

    Raises:
        MissingDependencyError: If the program is missing or older than
            ``need_version`` and ``recommended`` is false.
    """

    def strip_v(text):
        # Versions may be written as "v1.2.3"; drop that single prefix
        return text[1:] if text.startswith('v') else text

    try:
        found_version = (
            version_checker() if callable(version_checker) else version_checker
        )
    except (CalledProcessError, FileNotFoundError, MissingDependencyError):
        _error_missing_program(program, package, required_for, recommended)
        if recommended:
            return
        raise MissingDependencyError(program)

    found_version = strip_v(found_version)
    need_version = strip_v(need_version)

    # An empty version string skips the comparison entirely
    if found_version and version_parser(found_version) < version_parser(
            need_version):
        _error_old_version(program, package, need_version, found_version,
                           required_for)
        if not recommended:
            raise MissingDependencyError(program)

    log.debug('Found %s %s', program, found_version)
Exemplo n.º 6
0
def check_options(options):
    """Validate the installed Tesseract against the requested options.

    Requires Tesseract 4.0.0 or later, resolves the ``auto`` PDF renderer to
    ``sandwich``, and checks the features the chosen options depend on.

    Args:
        options: Options object. ``pdf_renderer`` may be rewritten in place;
            ``languages``, ``user_words``, ``user_patterns`` and
            ``tesseract_pagesegmode`` are read.

    Raises:
        MissingDependencyError: If the sandwich renderer is selected but
            Tesseract does not report the ``textonly_pdf`` capability.
    """
    check_external_program(
        program='tesseract',
        package={'linux': 'tesseract-ocr'},
        version_checker=tesseract.version,
        need_version='4.0.0',  # using backport for Travis CI
    )

    # "auto" currently always resolves to the sandwich renderer
    if options.pdf_renderer == 'auto':
        options.pdf_renderer = 'sandwich'

    if options.pdf_renderer == 'sandwich':
        requested_langs = set(options.languages)
        if not tesseract.has_textonly_pdf(requested_langs):
            raise MissingDependencyError(
                "You are using an alpha version of Tesseract 4.0 that does not support "
                "the textonly_pdf parameter. We don't support versions this old.")

    user_words_unsupported = not tesseract.has_user_words()
    if user_words_unsupported and (options.user_words
                                   or options.user_patterns):
        log.warning(
            "Tesseract 4.0 ignores --user-words and --user-patterns, so these "
            "arguments have no effect.")

    if options.tesseract_pagesegmode in (0, 2):
        log.warning(
            "The --tesseract-pagesegmode argument you select will disable OCR. "
            "This may cause processing to fail.")
Exemplo n.º 7
0
def get_version(program: str,
                *,
                version_arg: str = '--version',
                regex=r'(\d+(\.\d+)*)',
                env=None):
    """Run a program and extract the version number it reports.

    Arguments:
        program: The program to version check.
        version_arg: The argument that makes it print its version,
            e.g. ``--version``.
        regex: A regular expression matched against the start of the
            program's stripped output; group 1 is the version.
        env: Custom ``os.environ`` in which to run program.

    Returns:
        The text captured by the first group of ``regex``.

    Raises:
        MissingDependencyError: If the program cannot be found, exits with
            an error, or prints output that ``regex`` does not match.
    """
    command = [program, version_arg]
    try:
        completed = run(
            command,
            close_fds=True,
            text=True,
            stdout=PIPE,
            stderr=STDOUT,
            check=True,
            env=env,
        )
    except FileNotFoundError as e:
        raise MissingDependencyError(
            f"Could not find program '{program}' on the PATH") from e
    except CalledProcessError as e:
        if e.returncode == 0:
            raise MissingDependencyError(
                f"Could not find program '{program}' on the PATH") from e
        raise MissingDependencyError(
            f"Ran program '{program}' but it exited with an error:\n{e.output}"
        ) from e

    output = completed.stdout
    found = re.match(regex, output.strip())
    if found is None:
        raise MissingDependencyError(
            f"The program '{program}' did not report its version. "
            f"Message was:\n{output}")
    return found.group(1)
Exemplo n.º 8
0
def check_options_languages(options, ocr_engine_languages):
    """Ensure every requested OCR language is available in the OCR engine.

    When no language was requested, ``options.languages`` is set to the
    default language. If ``ocr_engine_languages`` is empty, no availability
    check is performed.

    Args:
        options: Options object whose ``languages`` set is read and possibly
            filled in with the default.
        ocr_engine_languages: Set of languages the OCR engine supports.

    Raises:
        MissingDependencyError: If any requested language is not in
            ``ocr_engine_languages``.
    """
    if not options.languages:
        options.languages = {DEFAULT_LANGUAGE}
        system_lang = locale.getlocale()[0]
        # Only mention the assumption when the system locale is not English
        if system_lang and not system_lang.startswith('en'):
            log.debug("No language specified; assuming --language %s",
                      DEFAULT_LANGUAGE)

    if not ocr_engine_languages:
        return
    if options.languages.issubset(ocr_engine_languages):
        return

    unavailable = options.languages - ocr_engine_languages
    header = ("OCR engine does not have language data for the following "
              "requested languages: \n")
    listing = ''.join(lang + '\n' for lang in unavailable)
    raise MissingDependencyError(header + listing)
Exemplo n.º 9
0
def check_external_program(
    *,
    program,
    package,
    version_checker,
    need_version,
    required_for=None,
    recommended=False,
):
    """Check that an external program is installed and recent enough.

    Args:
        program: The name of the program to test.
        package: The name of a software package that supplies this program.
        version_checker: Either the installed version as a string, or a
            callable without arguments that retrieves it.
        need_version: The minimum required version.
        required_for: The name of an argument or feature that requires this
            program.
        recommended: If true, a missing or outdated program is reported but
            no exception is raised.

    Raises:
        MissingDependencyError: If the program is missing or older than
            ``need_version`` and ``recommended`` is false. The exception
            carries the program name.
    """
    try:
        if callable(version_checker):
            found_version = version_checker()
        else:
            found_version = version_checker
    except (CalledProcessError, FileNotFoundError, MissingDependencyError):
        _error_missing_program(program, package, required_for, recommended)
        if not recommended:
            # Name the program so the exception is not raised with no message
            raise MissingDependencyError(program)
        return

    def remove_leading_v(s):
        if s.startswith('v'):
            return s[1:]
        return s

    found_version = remove_leading_v(found_version)
    need_version = remove_leading_v(need_version)

    # An empty version string skips the comparison entirely
    if found_version and LooseVersion(found_version) < LooseVersion(
            need_version):
        _error_old_version(program, package, need_version, found_version,
                           required_for)
        if not recommended:
            raise MissingDependencyError(program)

    log.debug('Found %s %s', program, found_version)
Exemplo n.º 10
0
def get_version(program,
                *,
                version_arg='--version',
                regex=r'(\d+(\.\d+)*)',
                env=None):
    """Get the version of the specified program.

    Arguments:
        program: The program to version check.
        version_arg: The argument that makes it print its version.
        regex: A regular expression matched against the start of the
            program's stripped output; group 1 is the version.
        env: Custom environment in which to run program.

    Returns:
        The text captured by the first group of ``regex``.

    Raises:
        MissingDependencyError: If the program cannot be found, exits with
            an error, or prints output that ``regex`` does not match.
    """
    args_prog = [program, version_arg]
    try:
        proc = run(
            args_prog,
            close_fds=True,
            universal_newlines=True,
            stdout=PIPE,
            stderr=STDOUT,
            check=True,
            env=env,
        )
        output = proc.stdout
    except FileNotFoundError as e:
        raise MissingDependencyError(
            f"Could not find program '{program}' on the PATH") from e
    except CalledProcessError as e:
        if e.returncode != 0:
            raise MissingDependencyError(
                f"Ran program '{program}' but it exited with an error:\n{e.output}"
            ) from e
        raise MissingDependencyError(
            f"Could not find program '{program}' on the PATH") from e

    # Test the match explicitly instead of catching AttributeError from
    # ``None.group``, which could also hide unrelated attribute errors.
    match = re.match(regex, output.strip())
    if not match:
        raise MissingDependencyError(
            f"The program '{program}' did not report its version. "
            f"Message was:\n{output}")

    return match.group(1)
Exemplo n.º 11
0
def check_options_languages(options, ocr_engine_languages):
    """Ensure every requested OCR language is available in the OCR engine.

    When no language was requested, ``options.languages`` is set to the
    default language. If ``ocr_engine_languages`` is empty, no availability
    check is performed.

    Args:
        options: Options object whose ``languages`` set is read and possibly
            filled in with the default.
        ocr_engine_languages: Set of languages the OCR engine supports.

    Raises:
        MissingDependencyError: If any requested language is not in
            ``ocr_engine_languages``. The message lists the missing
            languages in sorted order.
    """
    if not options.languages:
        options.languages = {DEFAULT_LANGUAGE}
        system_lang = locale.getlocale()[0]
        if system_lang and not system_lang.startswith('en'):
            log.debug("No language specified; assuming --language %s",
                      DEFAULT_LANGUAGE)
    if not ocr_engine_languages:
        return
    missing_languages = options.languages - ocr_engine_languages
    if missing_languages:
        msg = ("OCR engine does not have language data for the following "
               "requested languages: \n")
        # Sort so the message does not depend on set iteration order
        msg += '\n'.join(sorted(missing_languages))
        # ISO 639-2 codes consist of three letters, not digits
        msg += '\nNote: most languages are identified by a 3-letter ISO 639-2 Code'
        raise MissingDependencyError(msg)
Exemplo n.º 12
0
def has_textonly_pdf(langs=None):
    """Report whether Tesseract lists the ``textonly_pdf`` parameter.

    Available in v4.00.00alpha since January 2017. The check parses the
    output of ``--print-parameters`` rather than relying on version numbers.

    Args:
        langs: Languages passed on to ``tess_base_args``.

    Returns:
        True if ``textonly_pdf`` appears in the parameter listing, else False.

    Raises:
        MissingDependencyError: If Tesseract fails to print its parameters.
    """
    command = tess_base_args(langs, engine_mode=None)
    command = command + ['--print-parameters', 'pdf']
    try:
        completed = run(command, check=True, stdout=PIPE, stderr=STDOUT)
    except CalledProcessError as e:
        raise MissingDependencyError(
            "Could not --print-parameters from tesseract. This can happen if the "
            "TESSDATA_PREFIX environment is not set to a valid tessdata folder. "
        ) from e
    # Output is captured as bytes, hence the bytes literal
    return b'textonly_pdf' in completed.stdout
Exemplo n.º 13
0
    from ocrmypdf.subprocess._windows import shim_env_path

    libname = 'liblept-5'
    os.environ['PATH'] = shim_env_path()
else:
    libname = 'lept'
_libpath = find_library(libname)
if not _libpath:
    raise MissingDependencyError("""
        ---------------------------------------------------------------------
        This error normally occurs when ocrmypdf can't find the Leptonica
        library, which is usually installed with Tesseract OCR. It could be that
        Tesseract is not installed properly, we can't find the installation
        on your system PATH environment variable.

        The library we are looking for is usually called:
            liblept-5.dll   (Windows)
            liblept*.dylib  (macOS)
            liblept*.so     (Linux/BSD)

        Please review our installation procedures to find a solution:
            https://ocrmypdf.readthedocs.io/en/latest/installation.html
        ---------------------------------------------------------------------
        """)
if os.name == 'nt':
    # On Windows, recent versions of libpng require zlib. We have to make sure
    # the zlib version being loaded is the same one that libpng was built with.
    # This tries to import zlib from Tesseract's installation folder, falling back
    # to find_library() if liblept is being loaded from somewhere else.
    # Loading zlib from other places could cause a version mismatch
    _zlib_path = os.path.join(os.path.dirname(_libpath), 'zlib1.dll')
    if not os.path.exists(_zlib_path):
Exemplo n.º 14
0
from ocrmypdf.helpers import Resolution
from ocrmypdf.subprocess import get_version, run

log = logging.getLogger(__name__)

# Name of the Ghostscript executable. On Windows the console executables
# gswin64c / gswin32c are looked up on the PATH (64-bit first); elsewhere
# the name is simply 'gs'.
_gswin = None
if os.name == 'nt':
    _gswin = which('gswin64c') or which('gswin32c')
    if not _gswin:
        raise MissingDependencyError("""
                ---------------------------------------------------------------------
                This error normally occurs when ocrmypdf can't find Ghostscript.  Please
                ensure Ghostscript is installed and its location is added to the
                system PATH environment variable.

                For details see:
                    https://ocrmypdf.readthedocs.io/en/latest/installation.html
                ---------------------------------------------------------------------
                """)
    # Keep only the executable's base name, without directory or extension
    _gswin = Path(_gswin).stem

GS = _gswin if _gswin else 'gs'
del _gswin


def version():
    """Return what ``get_version`` reports for the Ghostscript executable."""
    gs_version = get_version(GS)
    return gs_version


def jpeg_passthrough_available() -> bool:
Exemplo n.º 15
0
This error normally occurs when ocrmypdf find can't Ghostscript.
Please ensure Ghostscript is installed and its location is added to
the system PATH environment variable.

For details see:
    https://ocrmypdf.readthedocs.io/en/latest/installation.html
---------------------------------------------------------------------
"""

# Name of the Ghostscript executable: on Windows, whichever of the console
# executables gswin64c / gswin32c is found on the PATH (64-bit first);
# elsewhere simply 'gs'.
_gswin = None
if os.name == 'nt':
    _gswin = which('gswin64c') or which('gswin32c')
    if not _gswin:
        raise MissingDependencyError(missing_gs_error)
    # Keep only the executable's base name, without directory or extension
    _gswin = Path(_gswin).stem

GS = _gswin or 'gs'
del _gswin


def version():
    """Query the Ghostscript executable named by ``GS`` for its version."""
    reported = get_version(GS)
    return reported


def jpeg_passthrough_available() -> bool:
    """Returns True if the installed version of Ghostscript supports JPEG passthru

    Prior to 9.23, Ghostscript decoded and re-encoded JPEGs internally. In 9.23
    it gained the ability to keep JPEGs unmodified. However, the 9.23