示例#1
0
def test_rasterize_rotates(resources, tmp_path):
    """Rasterize graph.pdf through the plugin hook at 90 and 180 degrees.

    Each case checks the pixel size of the PNG that the hook wrote.
    """
    pm = get_plugin_manager([])

    # (rotation in degrees, output file name, expected pixel size)
    cases = (
        (90, 'img90.png', (123, 151)),
        (180, 'img180.png', (151, 123)),
    )

    for angle, filename, expected_size in cases:
        img = tmp_path / filename
        pm.hook.rasterize_pdf_page(
            input_file=resources / 'graph.pdf',
            output_file=img,
            raster_device='pngmono',
            raster_dpi=Resolution(20, 20),
            page_dpi=Resolution(20, 20),
            pageno=1,
            rotation=angle,
            filter_vector=False,
        )
        assert Image.open(img).size == expected_size, "Image not rotated"
示例#2
0
def _get_dpi(ctm_shorthand, image_size) -> Resolution:
    """Compute an image's effective DPI from its transformation matrix.

    A PDF image carries no resolution of its own. The page content stream
    says where the image is painted, and the effective resolution is the
    ratio of the image's pixel dimensions to the size of the area it is
    painted onto.

    The page size cannot be used as a stand-in for the painted area. A
    scanned PDF usually has a matching paper size, but a crop may change the
    page size (/CropBox) without touching the content stream, so the image
    does not necessarily fill the page.

    An image may be scaled (always), cropped, translated, rotated in place
    to an arbitrary angle (rarely) and skewed. Only equal area mappings can
    be expressed, so distortions in which the DPI varies with position do
    not need to be considered.

    Derivation: transform the offset vector v0 (0, 0), the width-axis vector
    vw (1, 0) and the height-axis vector vh (0, 1) with the matrix to get
    the image's extent in PDF units, then compare with the pixel dimensions.
    PDF uses row vector * matrix_transposed rather than the traditional
    matrix * column vector. The quantities of interest are

        magnitude(width_vector - offset_vector)
        magnitude(height_vector - offset_vector)

    Worked out algebraically, the translation terms cancel and each
    magnitude depends only on the first four matrix entries; the code below
    uses that result directly.

    pdfimages -list computes DPI in a way that is not completely naive but
    gets rotated images wrong, so it cannot be used to validate this.
    Photoshop works, as does rotating the image back to normal in Acrobat.

    It does not matter whether the image is partially cropped or even lies
    outside the /MediaBox.

    """

    a, b, c, d, _, _ = ctm_shorthand

    # Drawn width and height of the image in PDF units
    drawn_width = hypot(a, b)
    drawn_height = hypot(c, d)

    def calc(drawn, pixels, inches_per_pt=72.0):
        # Pixels per unit of default user space (1/72"), multiplied by the
        # conversion factor. A zero drawn size yields an infinite scale.
        if drawn != 0:
            scale = pixels / drawn
        else:
            scale = inf
        return scale * inches_per_pt

    dpi_w = calc(drawn_width, image_size[0])
    dpi_h = calc(drawn_height, image_size[1])
    return Resolution(dpi_w, dpi_h)
示例#3
0
def test_remove_background(resources, outdir):
    """Run OCRmyPDF with --remove-background and compare channel extrema.

    The input must not already span the full 0..255 range in every
    channel; the rasterized output must.
    """
    full_range = ((0, 255), (0, 255), (0, 255))
    source_jpg = resources / 'congress.jpg'

    # Ensure the input image does not contain pure white/black
    with Image.open(source_jpg) as im:
        assert im.getextrema() != full_range

    cli_args = (
        '--remove-background',
        '--image-dpi',
        '150',
        '--plugin',
        'tests/plugins/tesseract_noop.py',
    )
    output_pdf = check_ocrmypdf(
        source_jpg, outdir / 'test_remove_bg.pdf', *cli_args
    )

    output_png = outdir / 'remove_bg.png'
    ghostscript.rasterize_pdf(
        output_pdf,
        output_png,
        raster_device='png16m',
        raster_dpi=Resolution(100, 100),
        pageno=1,
    )

    # The output image should contain pure white and black
    with Image.open(output_png) as im:
        assert im.getextrema() == full_range
示例#4
0
def test_deskew(resources, outdir):
    """Deskew skew.pdf, re-rasterize it and check the remaining skew angle."""
    cli_args = ('-d', '--plugin', 'tests/plugins/tesseract_noop.py')

    # Run with deskew
    deskewed_pdf = check_ocrmypdf(
        resources / 'skew.pdf', outdir / 'skew.pdf', *cli_args
    )

    # Render the result as an image again so that Leptonica can measure
    # the skew angle and confirm the page was deskewed
    deskewed_png = outdir / 'deskewed.png'
    ghostscript.rasterize_pdf(
        deskewed_pdf,
        deskewed_png,
        raster_device='pngmono',
        raster_dpi=Resolution(150, 150),
        pageno=1,
    )

    skew_angle, _skew_confidence = Pix.open(deskewed_png).find_skew()

    print(skew_angle)
    assert abs(skew_angle) < 0.5, "Deskewing failed"
示例#5
0
def rasterize_pdf(
    input_file: os.PathLike,
    output_file: os.PathLike,
    *,
    raster_device: str,
    raster_dpi: Resolution,
    pageno: int = 1,
    page_dpi: Optional[Resolution] = None,
    rotation: Optional[int] = None,
    filter_vector: bool = False,
):
    """Rasterize one page of a PDF at resolution raster_dpi in canvas units.

    Ghostscript renders the page to its stdout. The image is then read with
    Pillow, optionally transposed, and saved to ``output_file``.

    Args:
        input_file: PDF to render.
        output_file: Path the resulting image is saved to.
        raster_device: Ghostscript output device name (``-sDEVICE``).
        raster_dpi: Rendering resolution (``-r``), rounded to 6 places.
        pageno: Page passed as both ``-dFirstPage`` and ``-dLastPage``.
        page_dpi: DPI recorded in the saved image. Falls back to the
            rounded ``raster_dpi`` when falsy.
        rotation: Clockwise angle to cancel out. Only 90, 180 and 270
            transpose the image; any other value leaves the pixels as
            rendered.
        filter_vector: When true, ``-dFILTERVECTOR`` is passed.

    Raises:
        SubprocessOutputError: If the Ghostscript process exits with a
            nonzero status. The original ``CalledProcessError`` is attached
            as the cause.
    """
    raster_dpi = raster_dpi.round(6)
    if not page_dpi:
        page_dpi = raster_dpi

    args_gs = ([
        GS,
        '-dQUIET',
        '-dSAFER',
        '-dBATCH',
        '-dNOPAUSE',
        f'-sDEVICE={raster_device}',
        f'-dFirstPage={pageno}',
        f'-dLastPage={pageno}',
        f'-r{raster_dpi.x:f}x{raster_dpi.y:f}',
    ] + (['-dFILTERVECTOR'] if filter_vector else []) + [
        '-o',
        '-',  # image data goes to stdout ...
        '-sstdout=%stderr',  # ... and Ghostscript's own messages to stderr
        '-dAutoRotatePages=/None',  # Probably has no effect on raster
        '-f',
        fspath(input_file),
    ])

    try:
        p = run(args_gs, stdout=PIPE, stderr=PIPE, check=True)
    except CalledProcessError as e:
        log.error(e.stderr.decode(errors='replace'))
        # Chain explicitly so the traceback shows the failing subprocess
        # call as the direct cause.
        raise SubprocessOutputError('Ghostscript rasterizing failed') from e
    else:
        # A zero exit status may still come with errors on stderr; these
        # are logged and processing continues.
        stderr = p.stderr.decode(errors='replace')
        if _gs_error_reported(stderr):
            log.error(stderr)

    with Image.open(BytesIO(p.stdout)) as im:
        if rotation is not None:
            log.debug("Rotating output by %i", rotation)
            # rotation is a clockwise angle and Image.ROTATE_* is
            # counterclockwise so this cancels out the rotation
            if rotation == 90:
                im = im.transpose(Image.ROTATE_90)
            elif rotation == 180:
                im = im.transpose(Image.ROTATE_180)
            elif rotation == 270:
                im = im.transpose(Image.ROTATE_270)
            # A quarter turn swaps the axes, so the recorded DPI is
            # swapped to match.
            if rotation % 180 == 90:
                page_dpi = page_dpi.flip_axis()
        im.save(fspath(output_file), dpi=page_dpi)
示例#6
0
def create_visible_page_jpg(image: Path, page_context: PageContext) -> Path:
    """Save ``image`` as a JPEG at the page context's 'visible.jpg' path.

    Returns the path of the JPEG that was written.
    """
    output_file = page_context.get_path('visible.jpg')
    with Image.open(image) as im:
        # The image should be a .png by now, but deskew or unpaper may have
        # dropped its DPI information. In that case use the square DPI the
        # page was rasterized at. The preview image was converted to square
        # resolution when rasterized, which is what the OCR engine should
        # receive, so it stays square.
        if 'dpi' not in im.info:
            # Fallback to page-implied DPI
            dpi = get_page_square_dpi(
                page_context.pageinfo, page_context.options
            )
        else:
            dpi = Resolution(*im.info['dpi'])

        # Pillow requires integer DPI
        im.save(output_file, format='JPEG', dpi=dpi.to_int())
    return output_file
示例#7
0
def triage_image_file(input_file, output_file, options):
    """Validate an image input and convert it to a single PDF with img2pdf.

    Args:
        input_file: Path of the candidate image.
        output_file: Path the converted PDF is written to.
        options: Provides ``input_file`` (the name shown in error messages)
            and ``image_dpi`` (a user-supplied resolution override).

    Raises:
        UnsupportedImageFormatError: If Pillow cannot open the file, the
            image has an alpha channel, it is CMYK without an ICC profile,
            or img2pdf cannot open it.
        DpiError: If the image has no DPI metadata, or a DPI at or below
            (96, 96), and ``options.image_dpi`` is not set.
    """
    log.info("Input file is not a PDF, checking if it is an image...")
    try:
        im = Image.open(input_file)
    except EnvironmentError as e:
        # Recover the original filename
        log.error(str(e).replace(str(input_file), str(options.input_file)))
        raise UnsupportedImageFormatError() from e

    with im:
        log.info("Input file is an image")
        if 'dpi' in im.info:
            # NOTE(review): this is a lexicographic tuple comparison, so the
            # y resolution only matters when x is exactly 96 - confirm that
            # is the intended rule.
            if im.info['dpi'] <= (96, 96) and not options.image_dpi:
                log.info("Image size: (%d, %d)", *im.size)
                log.info("Image resolution: (%d, %d)", *im.info['dpi'])
                log.error(
                    "Input file is an image, but the resolution (DPI) is "
                    "not credible.  Estimate the resolution at which the "
                    "image was scanned and specify it using --image-dpi.")
                raise DpiError()
        elif not options.image_dpi:
            log.info("Image size: (%d, %d)", *im.size)
            log.error("Input file is an image, but has no resolution (DPI) "
                      "in its metadata.  Estimate the resolution at which "
                      "image was scanned and specify it using --image-dpi.")
            raise DpiError()

        if im.mode in ('RGBA', 'LA'):
            log.error("The input image has an alpha channel. Remove the alpha "
                      "channel first.")
            raise UnsupportedImageFormatError()

        # Pillow exposes an embedded profile under the 'icc_profile' key.
        # The former 'iccprofile' spelling never matched, so every CMYK
        # image was rejected. A missing or empty value counts as no profile.
        if not im.info.get('icc_profile'):
            if im.mode == 'RGB':
                log.info("Input image has no ICC profile, assuming sRGB")
            elif im.mode == 'CMYK':
                log.error("Input CMYK image has no ICC profile, not usable")
                raise UnsupportedImageFormatError()

    try:
        log.info("Image seems valid. Try converting to PDF...")
        layout_fun = img2pdf.default_layout_fun
        if options.image_dpi:
            # A user-supplied DPI overrides whatever the image metadata says
            layout_fun = img2pdf.get_fixed_dpi_layout_fun(
                Resolution(options.image_dpi, options.image_dpi))
        with open(output_file, 'wb') as outf:
            img2pdf.convert(
                os.fspath(input_file),
                layout_fun=layout_fun,
                with_pdfrw=False,
                outputstream=outf,
            )
        log.info("Successfully converted to PDF, processing...")
    except img2pdf.ImageOpenError as e:
        log.error(e)
        raise UnsupportedImageFormatError() from e
示例#8
0
def get_canvas_square_dpi(pageinfo, options) -> Resolution:
    """Get the DPI when we require xres == yres, in Postscript units.

    The result is the largest of the page's x and y DPI (each replaced by
    VECTOR_PAGE_DPI when falsy), VECTOR_PAGE_DPI if the page has vector
    content, and the oversample option.
    """
    candidates = (
        pageinfo.dpi.x or VECTOR_PAGE_DPI,
        pageinfo.dpi.y or VECTOR_PAGE_DPI,
        VECTOR_PAGE_DPI if pageinfo.has_vector else 0.0,
        options.oversample or 0.0,
    )
    square = float(max(candidates))
    return Resolution(square, square)
示例#9
0
 def rasterize(pdf, pageno, png):
     """Rasterize page ``pageno`` of ``pdf`` to ``png`` unless it exists.

     If ``png`` already exists its path is printed and nothing is rendered.
     """
     if not png.exists():
         ghostscript.rasterize_pdf(
             pdf,
             png,
             raster_device='pngmono',
             raster_dpi=Resolution(100, 100),
             pageno=pageno,
             rotation=0,
         )
         return
     print(png)
示例#10
0
def test_rasterize_size(francais, outdir):
    """Rasterize to a chosen pixel size and check the size and stored DPI."""
    path, pdf = francais
    first_page = pdf.pages[0]
    page_size_pts = (first_page.MediaBox[2], first_page.MediaBox[3])
    assert first_page.MediaBox[0] == first_page.MediaBox[1] == 0

    # Page size in inches, at 72 points per inch
    points_per_inch = Decimal(72)
    page_size = (
        page_size_pts[0] / points_per_inch,
        page_size_pts[1] / points_per_inch,
    )
    target_size = Decimal('50.0'), Decimal('30.0')
    forced_dpi = Resolution(42.0, 4242.0)

    # Pick the per-axis DPI that produces exactly target_size pixels
    render_dpi = Resolution(
        target_size[0] / page_size[0], target_size[1] / page_size[1]
    )
    out_png = outdir / 'out.png'
    rasterize_pdf(
        path,
        out_png,
        raster_device='pngmono',
        raster_dpi=render_dpi,
        page_dpi=forced_dpi,
    )

    with Image.open(out_png) as im:
        assert im.size == target_size
        assert im.info['dpi'] == forced_dpi
示例#11
0
def test_mono_not_inverted(resources, outdir):
    """Optimize 2400dpi.pdf at level 3 and check the top-left pixel is white."""
    optimized_pdf = outdir / 'out.pdf'
    rendered_png = outdir / 'im.png'

    infile = resources / '2400dpi.pdf'
    opt.main(infile, optimized_pdf, level=3)

    rasterize_pdf(
        optimized_pdf,
        rendered_png,
        raster_device='pnggray',
        raster_dpi=Resolution(10, 10),
    )

    with Image.open(fspath(rendered_png)) as im:
        corner = im.getpixel((0, 0))
        assert corner == 255, "Expected white background"
示例#12
0
def get_page_square_dpi(pageinfo, options) -> Resolution:
    """Get the DPI when we require xres == yres, scaled to physical units.

    The page's x and y DPI are multiplied by its userunit (1.0 when the
    userunit is zero). The result is the largest of those two values (each
    replaced by VECTOR_PAGE_DPI when falsy), VECTOR_PAGE_DPI if the page
    has vector content, and the oversample option.
    """
    x_dpi = pageinfo.dpi.x or 0.0
    y_dpi = pageinfo.dpi.y or 0.0
    scale = float(pageinfo.userunit) or 1.0

    candidates = (
        (x_dpi * scale) or VECTOR_PAGE_DPI,
        (y_dpi * scale) or VECTOR_PAGE_DPI,
        VECTOR_PAGE_DPI if pageinfo.has_vector else 0.0,
        options.oversample or 0.0,
    )
    square = float(max(candidates))
    return Resolution(square, square)
示例#13
0
def get_page_dpi(pageinfo, options):
    """Get the DPI when nonsquare DPI is tolerable.

    Each axis is resolved independently as the largest of the page's DPI on
    that axis (VECTOR_PAGE_DPI when falsy), the oversample option, and
    VECTOR_PAGE_DPI if the page has vector content.
    """

    def resolve_axis(native_dpi):
        return max(
            native_dpi or VECTOR_PAGE_DPI,
            options.oversample or 0.0,
            VECTOR_PAGE_DPI if pageinfo.has_vector else 0.0,
        )

    xres = resolve_axis(pageinfo.dpi.x)
    yres = resolve_axis(pageinfo.dpi.y)
    return Resolution(float(xres), float(yres))
示例#14
0
def test_rasterize_rotated(francais, outdir, caplog):
    """Rasterize with rotation=90 and expect swapped size and swapped DPI."""
    path, pdf = francais
    first_page = pdf.pages[0]
    page_size_pts = (first_page.MediaBox[2], first_page.MediaBox[3])
    assert first_page.MediaBox[0] == first_page.MediaBox[1] == 0

    # Page size in inches, at 72 points per inch
    points_per_inch = Decimal(72)
    page_size = (
        page_size_pts[0] / points_per_inch,
        page_size_pts[1] / points_per_inch,
    )
    target_size = Decimal('50.0'), Decimal('30.0')
    forced_dpi = Resolution(42.0, 4242.0)

    caplog.set_level(logging.DEBUG)

    render_dpi = Resolution(
        target_size[0] / page_size[0], target_size[1] / page_size[1]
    )
    out_png = outdir / 'out.png'
    rasterize_pdf(
        path,
        out_png,
        raster_device='pngmono',
        raster_dpi=render_dpi,
        page_dpi=forced_dpi,
        rotation=90,
    )

    with Image.open(out_png) as im:
        assert im.size == (target_size[1], target_size[0])
        assert im.info['dpi'] == (forced_dpi[1], forced_dpi[0])
示例#15
0
def test_image_scale0(resources, outpdf):
    """Draw a form XObject through an all-zero matrix and inspect the DPI.

    The image's own DPI must be non-finite and the page DPI must be (0, 0).
    """
    with pikepdf.open(resources / 'cmyk.pdf') as cmyk:
        xobj = pikepdf.Page(cmyk.pages[0]).as_form_xobject()

        p = pikepdf.Pdf.new()
        p.add_blank_page(page_size=(72, 72))
        target_page = pikepdf.Page(p.pages[0])
        objname = target_page.add_resource(
            p.copy_foreign(xobj), pikepdf.Name.XObject, pikepdf.Name.Im0
        )
        print(objname)

        # "0 0 0 0 0 0 cm" sets an all-zero transformation matrix
        content = b"q 0 0 0 0 0 0 cm %s Do Q" % bytes(objname)
        p.pages[0].Contents = pikepdf.Stream(p, content)
        p.save(outpdf)

    pi = pdfinfo.PdfInfo(
        outpdf, detailed_analysis=True, progbar=False, max_workers=1
    )
    first_page = pi.pages[0]
    assert not first_page._images[0].dpi.is_finite
    assert first_page.dpi == Resolution(0, 0)
示例#16
0
文件: info.py 项目: masixian/OCRmyPDF
 def dpi(self) -> Resolution:
     """Return the stored 'dpi' entry, or Resolution(0.0, 0.0) if absent."""
     fallback = Resolution(0.0, 0.0)
     return self._pageinfo.get('dpi', fallback)
示例#17
0
文件: info.py 项目: masixian/OCRmyPDF
def _pdf_get_pageinfo(
    pdf, pageno: int, infile: PathLike, check_pages, detailed_analysis: bool
):
    """Collect information about one page into a dict.

    Args:
        pdf: Opened PDF; ``pdf.pages[pageno]`` is the page examined.
        pageno: Index into ``pdf.pages``.
        infile: Path handed to ``get_page_analysis`` for text analysis.
        check_pages: Container of page numbers whose content streams
            should be examined.
        detailed_analysis: When true (and the page is in ``check_pages``),
            text boxes are extracted via ``get_page_analysis``.

    Returns:
        A dict with the keys ``pageno``, ``images``, ``textboxes``,
        ``has_text``, ``userunit``, ``width_inches``, ``height_inches``,
        ``rotate`` and ``has_vector``. When at least one image was found it
        also has ``dpi``, ``width_pixels`` and ``height_pixels``.

    Raises:
        NotImplementedError: If ``_process_content_streams`` yields an item
            that is not a VectorMarker, TextMarker or ImageInfo.
    """
    pageinfo: Dict[str, Any] = {}
    pageinfo['pageno'] = pageno
    pageinfo['images'] = []

    page = pdf.pages[pageno]
    # Page extent in PDF units, from the MediaBox corners
    mediabox = [Decimal(d) for d in page.MediaBox.as_list()]
    width_pt = mediabox[2] - mediabox[0]
    height_pt = mediabox[3] - mediabox[1]

    check_this_page = pageno in check_pages

    if check_this_page and detailed_analysis:
        pscript5_mode = str(pdf.docinfo.get('/Creator')).startswith('PScript5')
        miner = get_page_analysis(infile, pageno, pscript5_mode)
        pageinfo['textboxes'] = list(simplify_textboxes(miner, get_text_boxes))
        bboxes = (box.bbox for box in pageinfo['textboxes'])

        pageinfo['has_text'] = _page_has_text(bboxes, width_pt, height_pt)
    else:
        pageinfo['textboxes'] = []
        pageinfo['has_text'] = None  # i.e. "no information"

    # /UserUnit defaults to 1 when the page does not define it
    userunit = page.get('/UserUnit', Decimal(1.0))
    if not isinstance(userunit, Decimal):
        userunit = Decimal(userunit)
    pageinfo['userunit'] = userunit
    # Physical size: PDF units scaled by userunit, at 72 units per inch
    pageinfo['width_inches'] = width_pt * userunit / Decimal(72.0)
    pageinfo['height_inches'] = height_pt * userunit / Decimal(72.0)

    try:
        pageinfo['rotate'] = int(page['/Rotate'])
    except KeyError:
        # No /Rotate entry on the page
        pageinfo['rotate'] = 0

    # Starting matrix for content stream processing: a uniform scale by
    # userunit with no translation
    userunit_shorthand = (userunit, 0, 0, userunit, 0, 0)

    if check_this_page:
        pageinfo['has_vector'] = False
        # NOTE(review): this resets the has_text value computed by the
        # detailed analysis branch above - confirm that is intended.
        pageinfo['has_text'] = False
        pageinfo['images'] = []
        for ci in _process_content_streams(
            pdf=pdf, container=page, shorthand=userunit_shorthand
        ):
            if isinstance(ci, VectorMarker):
                pageinfo['has_vector'] = True
            elif isinstance(ci, TextMarker):
                pageinfo['has_text'] = True
            elif isinstance(ci, ImageInfo):
                pageinfo['images'].append(ci)
            else:
                raise NotImplementedError()
    else:
        pageinfo['has_vector'] = None  # i.e. "no information"
        pageinfo['has_text'] = None
        pageinfo['images'] = None

    # Skipped when images is None or empty, so 'dpi', 'width_pixels' and
    # 'height_pixels' are then absent from the result
    if pageinfo['images']:
        dpi = Resolution(0.0, 0.0).take_max(image.dpi for image in pageinfo['images'])
        pageinfo['dpi'] = dpi
        pageinfo['width_pixels'] = int(round(dpi.x * float(pageinfo['width_inches'])))
        pageinfo['height_pixels'] = int(round(dpi.y * float(pageinfo['height_inches'])))

    return pageinfo
示例#18
0
from reportlab.pdfgen.canvas import Canvas

from ocrmypdf import _pipeline, pdfinfo
from ocrmypdf.helpers import Resolution


@pytest.fixture(scope='session')
def rgb_image():
    """Session fixture: an 8x8 RGB image with three coloured pixels.

    Pure red, green and blue pixels are set at (4, 4), (5, 5) and (6, 6).
    The image is returned wrapped in an ImageReader.
    """
    im = Image.new('RGB', (8, 8))
    marked_pixels = (
        ((4, 4), (255, 0, 0)),
        ((5, 5), (0, 255, 0)),
        ((6, 6), (0, 0, 255)),
    )
    for position, colour in marked_pixels:
        im.putpixel(position, colour)
    return ImageReader(im)


# Square 42 DPI resolution; appears as an expected result in the
# parametrized cases below.
DUMMY_OVERSAMPLE_RESOLUTION = Resolution(42.0, 42.0)
# Square resolution built from the pipeline's VECTOR_PAGE_DPI constant.
VECTOR_RESOLUTION = Resolution(_pipeline.VECTOR_PAGE_DPI,
                               _pipeline.VECTOR_PAGE_DPI)


@pytest.mark.parametrize(
    'image, text, vector, result',
    [
        (False, False, False, VECTOR_RESOLUTION),
        (False, True, False, VECTOR_RESOLUTION),
        (True, False, False, DUMMY_OVERSAMPLE_RESOLUTION),
        (True, True, False, VECTOR_RESOLUTION),
        (False, False, True, VECTOR_RESOLUTION),
        (False, True, True, VECTOR_RESOLUTION),
        (True, False, True, VECTOR_RESOLUTION),
        (True, True, True, VECTOR_RESOLUTION),
示例#19
0
 def dpi(self) -> Resolution:
     """Return the stored DPI, or Resolution(0.0, 0.0) when it is None."""
     return Resolution(0.0, 0.0) if self._dpi is None else self._dpi
示例#20
0
    def _gather_pageinfo(
        self,
        pdf: Pdf,
        pageno: int,
        infile: PathLike,
        check_pages: Container[int],
        detailed_analysis: bool,
    ):
        """Examine one page and store the findings on this object.

        Sets ``_textboxes``, ``_has_text``, ``_userunit``, ``_width_inches``,
        ``_height_inches``, ``_rotate``, ``_has_vector``, ``_images`` and
        ``_dpi``. ``_width_pixels`` and ``_height_pixels`` are set only when
        at least one image was found.

        Args:
            pdf: Opened PDF; ``pdf.pages[pageno]`` is the page examined.
            pageno: Index into ``pdf.pages``.
            infile: Path handed to ``get_page_analysis`` for text analysis.
            check_pages: Page numbers whose content streams should be
                examined.
            detailed_analysis: When true (and the page is in
                ``check_pages``), text boxes are extracted via
                ``get_page_analysis``.

        Raises:
            NotImplementedError: If ``_process_content_streams`` yields an
                item that is not a VectorMarker, TextMarker or ImageInfo.
        """
        page = pdf.pages[pageno]
        # Page extent in PDF units, from the MediaBox corners
        mediabox = [Decimal(d) for d in page.MediaBox.as_list()]
        width_pt = mediabox[2] - mediabox[0]
        height_pt = mediabox[3] - mediabox[1]

        check_this_page = pageno in check_pages

        if check_this_page and detailed_analysis:
            pscript5_mode = str(pdf.docinfo.get('/Creator')).startswith('PScript5')
            miner = get_page_analysis(infile, pageno, pscript5_mode)
            self._textboxes = list(simplify_textboxes(miner, get_text_boxes))
            bboxes = (box.bbox for box in self._textboxes)

            self._has_text = _page_has_text(bboxes, width_pt, height_pt)
        else:
            self._textboxes = []
            self._has_text = None  # i.e. "no information"

        # /UserUnit defaults to 1 when the page does not define it
        userunit = page.get('/UserUnit', Decimal(1.0))
        if not isinstance(userunit, Decimal):
            userunit = Decimal(userunit)
        self._userunit = userunit
        # Physical size: PDF units scaled by userunit, at 72 units per inch
        self._width_inches = width_pt * userunit / Decimal(72.0)
        self._height_inches = height_pt * userunit / Decimal(72.0)

        try:
            self._rotate = int(page['/Rotate'])
        except KeyError:
            # No /Rotate entry on the page
            self._rotate = 0

        # Starting matrix for content stream processing: a uniform scale by
        # userunit with no translation
        userunit_shorthand = (userunit, 0, 0, userunit, 0, 0)

        if check_this_page:
            self._has_vector = False
            # NOTE(review): this resets the _has_text value computed by the
            # detailed analysis branch above - confirm that is intended.
            self._has_text = False
            self._images = []
            for ci in _process_content_streams(
                pdf=pdf, container=page, shorthand=userunit_shorthand
            ):
                if isinstance(ci, VectorMarker):
                    self._has_vector = True
                elif isinstance(ci, TextMarker):
                    self._has_text = True
                elif isinstance(ci, ImageInfo):
                    self._images.append(ci)
                else:
                    raise NotImplementedError()
        else:
            self._has_vector = None  # i.e. "no information"
            self._has_text = None
            self._images = None

        # _dpi stays None when no images were found (or the page was not
        # checked); the pixel dimensions are then not set at all
        self._dpi = None
        if self._images:
            dpi = Resolution(0.0, 0.0).take_max(image.dpi for image in self._images)
            self._dpi = dpi
            self._width_pixels = int(round(dpi.x * float(self._width_inches)))
            self._height_pixels = int(round(dpi.y * float(self._height_inches)))