Exemplo n.º 1
0
def extract_image_jbig2(*, pike: pikepdf.Pdf, root: Path, image: Object,
                        xref: Xref, options) -> Optional[XrefExt]:
    """Write a 1-bit-per-component image to a file under ``root``.

    The image is first passed through ``extract_image_filter``. It is only
    written out when it has 1 bit per component, its first filter entry is
    not already ``JBIG2Decode``, ``jbig2enc`` is available, and it either
    carries a ``ColorSpace`` entry or is an image mask.

    While the image is being written, its ``ColorSpace`` is temporarily
    replaced with ``DeviceGray`` and is put back (or removed again, if there
    was none) afterwards, whether or not extraction succeeded.

    Args:
        pike: Forwarded to ``extract_image_filter``.
        root: Directory in which the image file is created.
        image: Forwarded to ``extract_image_filter``.
        xref: Used, zero-padded to 8 digits, as the file's base name.
        options: Not used by this function.

    Returns:
        ``XrefExt(xref, ext)``, where ``ext`` is the value returned by
        ``extract_to``, or ``None`` if any precondition above fails or
        pikepdf raises ``UnsupportedImageTypeError``.
    """
    extracted = extract_image_filter(pike, root, image, xref)
    if extracted is None:
        return None
    pim, filtdp = extracted

    # Guard clauses, evaluated in the same order as a single chained `and`.
    if pim.bits_per_component != 1:
        return None
    if filtdp[0] == Name.JBIG2Decode:
        return None
    if not jbig2enc.available():
        return None

    # Remember the image's own colorspace so that a plain 1-bit image with
    # no palette or ICC profile is exported. Exposing a palette or ICC
    # profile to jbig2enc would make it perform its own colorspace transform
    # to 1 bpp, which would conflict with that palette or profile.
    saved_colorspace = pim.obj.get(pikepdf.Name.ColorSpace, None)
    if saved_colorspace is None and not pim.image_mask:
        return None

    try:
        # DeviceGray is only a temporary stand-in; the data is already 1 bpc.
        pim.obj.ColorSpace = pikepdf.Name.DeviceGray
        target = root / f'{xref:08d}'
        with target.open('wb') as stream:
            ext = pim.extract_to(stream=stream)
        target.rename(target.with_suffix(ext))
    except pikepdf.UnsupportedImageTypeError:
        return None
    finally:
        # Undo the temporary DeviceGray override on every exit path.
        if saved_colorspace is None:
            del pim.obj.ColorSpace
        else:
            pim.obj.ColorSpace = saved_colorspace
    return XrefExt(xref, ext)
Exemplo n.º 2
0
def report_output_file_size(options, input_file, output_file,
                            file_overhead=4000, page_overhead=3000):
    """Warn when the output file is much larger than the input file.

    Nothing is reported when either file does not exist, when the input is
    smaller than 25000 bytes, or when the output size divided by the input
    size plus an allowed overhead is below 1.35. Otherwise a warning is
    logged that lists the options and missing optional dependencies that
    may explain the growth.

    Args:
        options: Object carrying the run's settings. The attributes
            ``optimize``, ``output_type`` and ``plugins`` are read directly;
            the image preprocessing flags are read with ``getattr`` and
            default to False when absent.
        input_file: Path of the input file.
        output_file: Path of the output file; opened with pikepdf to count
            its pages.
        file_overhead: Bytes of growth allowed per file. The default was
            obtained by estimating the amount of data added by OCR, PDF/A
            conversion and possible XMP metadata addition, with compression.
        page_overhead: Bytes of growth allowed per page of the output.

    Returns:
        None.
    """
    try:
        output_size = Path(output_file).stat().st_size
        input_size = Path(input_file).stat().st_size
    except FileNotFoundError:
        return  # Outputting to stream or something

    # Small inputs are never reported. Testing this first avoids parsing the
    # output PDF for nothing and guarantees a non-zero divisor below (an
    # empty input file would otherwise raise ZeroDivisionError).
    if input_size < 25000:
        return

    with pikepdf.open(output_file) as p:
        reasonable_overhead = file_overhead + page_overhead * len(p.pages)
    ratio = output_size / input_size
    reasonable_ratio = output_size / (input_size + reasonable_overhead)
    if reasonable_ratio < 1.35:
        return  # Seems fine

    # A tuple rather than a set, so that the reasons are always listed in
    # the same order regardless of string hash randomization.
    image_preproc = (
        'deskew',
        'clean_final',
        'remove_background',
        'oversample',
        'force_ocr',
    )
    reasons = [
        f"The argument --{arg.replace('_', '-')} was issued, causing transcoding."
        for arg in image_preproc
        if getattr(options, arg, False)
    ]

    if options.optimize == 0:
        reasons.append("Optimization was disabled.")
    else:
        image_optimizers = {
            'jbig2': jbig2enc.available(),
            'pngquant': pngquant.available(),
        }
        reasons.extend(
            f"The optional dependency '{name}' was not found, so some image "
            f"optimizations could not be attempted."
            for name, available in image_optimizers.items()
            if not available
        )
    if options.output_type.startswith('pdfa'):
        reasons.append(
            "PDF/A conversion was enabled. (Try `--output-type pdf`.)")
    if options.plugins:
        reasons.append("Plugins were used.")

    if reasons:
        explanation = "Possible reasons for this include:\n" + '\n'.join(
            reasons) + "\n"
    else:
        explanation = "No reason for this increase is known.  Please report this issue."

    log.warning(
        f"The output file size is {ratio:.2f}× larger than the input file.\n"
        f"{explanation}")
Exemplo n.º 3
0
def extract_image_jbig2(*, pike: pikepdf.Pdf, root: Path, image: Object,
                        xref: Xref, options) -> Optional[XrefExt]:
    """Write a 1-bit-per-component image to a file under ``root``.

    The image is first passed through ``extract_image_filter``. It is only
    written out when it has 1 bit per component, its first filter entry is
    not already ``JBIG2Decode``, and ``jbig2enc`` is available.

    Args:
        pike: Forwarded to ``extract_image_filter``.
        root: Directory in which the image file is created.
        image: Forwarded to ``extract_image_filter``.
        xref: Used, zero-padded to 8 digits, as the file's base name.
        options: Not used by this function.

    Returns:
        ``XrefExt(xref, ext)``, where ``ext`` is the value returned by
        ``extract_to``, or ``None`` if any precondition above fails or
        pikepdf raises ``UnsupportedImageTypeError``.
    """
    extracted = extract_image_filter(pike, root, image, xref)
    if extracted is None:
        return None
    pim, filtdp = extracted

    # Guard clauses, evaluated in the same order as a single chained `and`.
    if pim.bits_per_component != 1:
        return None
    if filtdp[0] == Name.JBIG2Decode:
        return None
    if not jbig2enc.available():
        return None

    try:
        # Write under the bare xref name first, then rename once the
        # extension reported by extract_to is known.
        target = root / f'{xref:08d}'
        with target.open('wb') as stream:
            ext = pim.extract_to(stream=stream)
        target.rename(target.with_suffix(ext))
    except pikepdf.UnsupportedImageTypeError:
        return None
    else:
        return XrefExt(xref, ext)
Exemplo n.º 4
0
        resources / 'crom.png',
        outpdf,
        '--image-dpi',
        '200',
        '--optimize',
        '3',
        '--jpg-quality',
        '50',
        '--png-quality',
        '20',
        '--plugin',
        'tests/plugins/tesseract_noop.py',
    )


@pytest.mark.skipif(not jbig2enc.available(), reason='need jbig2enc')
@pytest.mark.parametrize('lossy', [False, True])
def test_jbig2_lossy(lossy, resources, outpdf):
    args = [
        resources / 'ccitt.pdf',
        outpdf,
        '--image-dpi',
        '200',
        '--optimize',
        3,
        '--jpg-quality',
        '50',
        '--png-quality',
        '20',
        '--plugin',
        'tests/plugins/tesseract_noop.py',
Exemplo n.º 5
0
import img2pdf
import pikepdf
import pytest
from PIL import Image, ImageDraw

from ocrmypdf import optimize as opt
from ocrmypdf._exec import jbig2enc, pngquant
from ocrmypdf._exec.ghostscript import rasterize_pdf
from ocrmypdf.helpers import Resolution

from .conftest import check_ocrmypdf

# Skip markers for tests that depend on optional external tools: a test
# decorated with one of these is skipped when the tool reports itself as
# unavailable.
needs_pngquant = pytest.mark.skipif(
    not pngquant.available(),
    reason="pngquant not installed",
)
needs_jbig2enc = pytest.mark.skipif(
    not jbig2enc.available(),
    reason="jbig2enc not installed",
)


@needs_pngquant
@pytest.mark.parametrize('pdf', ['multipage.pdf', 'palette.pdf'])
def test_basic(resources, pdf, outpdf):
    """Run the optimizer at level 3 and check the output did not grow.

    A 2% tolerance is allowed: 98% of the output size must not exceed the
    input size.
    """
    source = resources / pdf
    opt.main(source, outpdf, level=3)

    output_size = Path(outpdf).stat().st_size
    input_size = Path(source).stat().st_size
    assert 0.98 * output_size <= input_size


@needs_pngquant
def test_mono_not_inverted(resources, outdir):
    infile = resources / '2400dpi.pdf'