예제 #1
0
def check_options(options):
    """Validate the Tesseract-related settings carried by *options*.

    Passes the tesseract program requirement (``need_version`` of
    ``'4.0.0'``) to ``check_external_program``, resolves the ``'auto'``
    PDF renderer to ``'sandwich'`` by rewriting ``options.pdf_renderer``
    in place, and logs warnings for settings that will not behave as
    requested.

    Args:
        options: Options object. Reads ``pdf_renderer``, ``languages``,
            ``user_words``, ``user_patterns`` and
            ``tesseract_pagesegmode``; ``pdf_renderer`` may be rewritten.

    Raises:
        MissingDependencyError: The sandwich renderer is selected and
            ``tesseract.has_textonly_pdf`` returns a false value for the
            set of requested languages.
    """
    requirement = {
        'program': 'tesseract',
        'package': {'linux': 'tesseract-ocr'},
        'version_checker': tesseract.version,
        'need_version': '4.0.0',  # using backport for Travis CI
    }
    check_external_program(**requirement)

    # 'auto' has exactly one outcome here: the sandwich renderer.
    if options.pdf_renderer == 'auto':
        options.pdf_renderer = 'sandwich'

    if options.pdf_renderer == 'sandwich':
        requested_languages = set(options.languages)
        if not tesseract.has_textonly_pdf(requested_languages):
            raise MissingDependencyError(
                "You are using an alpha version of Tesseract 4.0 that does not support "
                "the textonly_pdf parameter. We don't support versions this old."
            )

    if not tesseract.has_user_words():
        # Only worth a warning when the user actually passed one of the lists.
        if options.user_words or options.user_patterns:
            log.warning(
                "Tesseract 4.0 ignores --user-words and --user-patterns, so these "
                "arguments have no effect."
            )

    # Page segmentation modes that the warning below describes as disabling OCR.
    ocr_disabling_modes = (0, 2)
    if options.tesseract_pagesegmode in ocr_disabling_modes:
        log.warning(
            "The --tesseract-pagesegmode argument you select will disable OCR. "
            "This may cause processing to fail."
        )
예제 #2
0
def check_options(options):
    """Validate the Tesseract-related settings carried by *options*.

    Passes the tesseract program requirement (``need_version`` of
    ``'4.0.0-beta.1'``, parsed with ``tesseract.TesseractVersion``) to
    ``check_external_program``, resolves the ``'auto'`` PDF renderer to
    ``'sandwich'`` by rewriting ``options.pdf_renderer`` in place, and
    logs warnings for settings that will not behave as requested.

    Args:
        options: Options object. Reads ``pdf_renderer``, ``user_words``,
            ``user_patterns`` and ``tesseract_pagesegmode``;
            ``pdf_renderer`` may be rewritten.
    """
    requirement = {
        'program': 'tesseract',
        'package': {'linux': 'tesseract-ocr'},
        'version_checker': tesseract.version,
        'need_version': '4.0.0-beta.1',  # using backport for Travis CI
        'version_parser': tesseract.TesseractVersion,
    }
    check_external_program(**requirement)

    # 'auto' has exactly one outcome here: the sandwich renderer.
    if options.pdf_renderer == 'auto':
        options.pdf_renderer = 'sandwich'

    if not tesseract.has_user_words():
        # Only worth a warning when the user actually passed one of the lists.
        if options.user_words or options.user_patterns:
            log.warning(
                "Tesseract 4.0 ignores --user-words and --user-patterns, so these "
                "arguments have no effect."
            )

    # Page segmentation modes that the warning below describes as disabling OCR.
    ocr_disabling_modes = (0, 2)
    if options.tesseract_pagesegmode in ocr_disabling_modes:
        log.warning(
            "The --tesseract-pagesegmode argument you select will disable OCR. "
            "This may cause processing to fail."
        )
예제 #3
0
    p, _out, err = run_ocrmypdf(
        resources / 'ccitt.pdf',
        outdir / 'out.pdf',
        '--pdf-renderer',
        renderer,
        '--tesseract-config',
        cfg_file,
    )
    assert (
        "parameter not found" in err.lower()
        or "error occurred while parsing" in err.lower()
    ), "No error message"
    assert p.returncode == ExitCode.invalid_config


@pytest.mark.skipif(not tesseract.has_user_words(), reason='not functional until 4.1.0')
def test_user_words_ocr(resources, outdir):
    # Does not actually test if --user-words causes output to differ
    word_list = outdir / 'wordlist.txt'
    sidecar_after = outdir / 'sidecar.txt'

    with word_list.open('w') as f:
        f.write('cromulent\n')  # a perfectly cromulent word

    check_ocrmypdf(
        resources / 'crom.png',
        outdir / 'out.pdf',
        '--image-dpi',
        150,
        '--sidecar',
        sidecar_after,
예제 #4
0
''')

    p, _out, err = run_ocrmypdf(
        resources / 'ccitt.pdf',
        outdir / 'out.pdf',
        '--pdf-renderer',
        renderer,
        '--tesseract-config',
        cfg_file,
    )
    assert ("parameter not found" in err.lower() or
            "error occurred while parsing" in err.lower()), "No error message"
    assert p.returncode == ExitCode.invalid_config


@pytest.mark.skipif(not tesseract.has_user_words(),
                    reason='not functional until 4.1.0')
def test_user_words_ocr(resources, outdir):
    # Does not actually test if --user-words causes output to differ
    word_list = outdir / 'wordlist.txt'
    sidecar_after = outdir / 'sidecar.txt'

    with word_list.open('w') as f:
        f.write('cromulent\n')  # a perfectly cromulent word

    check_ocrmypdf(
        resources / 'crom.png',
        outdir / 'out.pdf',
        '--image-dpi',
        150,
        '--sidecar',