Пример #1
0
def run_ocrmypdf_api(input_file, output_file, *args):
    """Invoke the ocrmypdf pipeline through its API and hand back the result.

    The argument list is built from ``input_file`` and ``output_file``
    followed by every extra argument that is not None, each converted with
    ``str``. Nothing is asserted about the outcome: whatever
    ``api.run_pipeline`` returns is returned to the caller.

    This helper takes no parameter for adjusting the environment.
    """
    argv = [str(input_file), str(output_file)]
    argv.extend(str(arg) for arg in args if arg is not None)

    _parser, options, plugin_manager = get_parser_options_plugins(args=argv)
    api.check_options(options, plugin_manager)

    # NOTE(review): the plugin manager used for validation above is not
    # forwarded to the pipeline (None is passed) -- confirm this is intended.
    return api.run_pipeline(options, plugin_manager=None, api=False)
Пример #2
0
def check_ocrmypdf(input_file, output_file, *args):
    """Run the ocrmypdf pipeline and assert that it produced an output file.

    The argument list is built from ``input_file`` and ``output_file``
    followed by every extra argument that is not None, each converted with
    ``str``. The helper then asserts that the pipeline returned 0, that
    ``output_file`` exists, and that it is larger than 100 bytes.

    Returns ``output_file`` unchanged so callers can keep working with it.
    """
    argv = [str(input_file), str(output_file)]
    argv.extend(str(arg) for arg in args if arg is not None)

    _parser, options, plugin_manager = get_parser_options_plugins(args=argv)
    api.check_options(options, plugin_manager)

    exit_code = api.run_pipeline(options, plugin_manager=plugin_manager, api=True)
    assert exit_code == 0

    assert output_file.exists(), "Output file not created"
    # Anything of 100 bytes or fewer is treated as an empty/invalid PDF.
    output_size = output_file.stat().st_size
    assert output_size > 100, "PDF too small or empty"

    return output_file
Пример #3
0
def run_ocrmypdf_api(input_file, output_file, *args, env=None):
    """Run the ocrmypdf pipeline via the API and return its result as-is.

    The argument list is ``input_file``, ``output_file`` and every extra
    argument that is not None, each converted with ``str``.

    When ``env`` is truthy, a copy of it is stored on
    ``options.tesseract_env`` and the input file path is recorded in that
    copy under ``_OCRMYPDF_TEST_INFILE``. If ``options.tesseract_env`` is
    truthy afterwards, every value in it is asserted to be ``str`` or
    ``bytes``.
    """
    argv = [str(input_file), str(output_file)]
    argv.extend(str(arg) for arg in args if arg is not None)
    options = cli.parser.parse_args(argv)
    api.check_options(options)

    if env:
        # Copy so the additions below do not touch the caller's mapping.
        test_env = env.copy()
        options.tesseract_env = test_env
        test_env['_OCRMYPDF_TEST_INFILE'] = os.fspath(input_file)

    if options.tesseract_env:
        for value in options.tesseract_env.values():
            assert isinstance(value, (str, bytes))

    return api.run_pipeline(options, api=False)
Пример #4
0
def check_ocrmypdf(input_file, output_file, *args, env=None):
    """Run ocrmypdf and confirm that an output file was created.

    The argument list is ``input_file``, ``output_file`` and every extra
    argument that is not None, each converted with ``str``.

    When ``env`` is truthy, a copy of it is stored on
    ``options.tesseract_env`` and the input file path is added to the copy
    under ``_OCRMYPDF_TEST_INFILE``. The mapping passed by the caller is not
    modified.

    Asserts that the pipeline returned 0, that the output file exists, and
    that it is larger than 100 bytes. Returns ``output_file`` unchanged.
    """

    options = cli.parser.parse_args(
        [str(input_file), str(output_file)] +
        [str(arg) for arg in args if arg is not None])
    api.check_options(options)
    if env:
        # Work on a copy: inserting the test key into the caller's own
        # mapping would leak it into whatever else uses that mapping.
        options.tesseract_env = env.copy()
        # os.fspath turns a path-like input_file into a plain str/bytes path
        # so the environment never holds a path object.
        options.tesseract_env['_OCRMYPDF_TEST_INFILE'] = os.fspath(input_file)
    result = api.run_pipeline(options, api=True)

    assert result == 0
    assert os.path.exists(str(output_file)), "Output file not created"
    # Anything of 100 bytes or fewer is treated as an empty/invalid PDF.
    assert os.stat(str(output_file)).st_size > 100, "PDF too small or empty"

    return output_file
Пример #5
0
def run_ocrmypdf_api(input_file, output_file, *args, env=None):
    """Run the ocrmypdf pipeline via the API and return its result as-is.

    The argument list is ``input_file``, ``output_file`` and every extra
    argument that is not None, each converted with ``str``.

    When ``env`` is truthy, a copy of it is stored on
    ``options.tesseract_env`` with the input file path recorded under
    ``_OCRMYPDF_TEST_INFILE``. The first entry of ``_OCRMYPDF_TEST_PATH``
    in ``env`` is then inspected: if it contains ``spoof`` it must not
    contain ``gs`` and must contain ``tesseract``.

    If ``options.tesseract_env`` is truthy afterwards, every value in it is
    asserted to be ``str`` or ``bytes``.
    """
    argv = [str(input_file), str(output_file)]
    argv.extend(str(arg) for arg in args if arg is not None)
    options = cli.parser.parse_args(argv)
    api.check_options(options)

    if env:
        # Copy so the additions below do not touch the caller's mapping.
        options.tesseract_env = env.copy()
        options.tesseract_env['_OCRMYPDF_TEST_INFILE'] = os.fspath(input_file)

        # Only the leading entry of the test search path is examined.
        search_path = env.get('_OCRMYPDF_TEST_PATH', '')
        leading_dir, _, _ = search_path.partition(os.pathsep)
        if 'spoof' in leading_dir:
            assert 'gs' not in leading_dir, "use run_ocrmypdf() for gs"
            assert 'tesseract' in leading_dir

    if options.tesseract_env:
        for value in options.tesseract_env.values():
            assert isinstance(value, (str, bytes))

    return api.run_pipeline(options, api=False)