Exemplo n.º 1
0
def test_extract_filepath(congress, outdir):
    """extract_to(fileprefix=...) must return an existing path and create image.jpg."""
    image_obj, _owner = congress
    prefix = outdir / 'image'

    written = PdfImage(image_obj).extract_to(fileprefix=prefix)

    assert Path(written).exists()
    assert (outdir / 'image.jpg').exists()
Exemplo n.º 2
0
def test_ccitt_photometry(sandwich):
    """The top-left pixel of the decoded sandwich image must be white in mode 'L'."""
    image_obj, _owner = sandwich

    grayscale = PdfImage(image_obj).as_pil_image().convert('L')
    top_left = grayscale.getpixel((0, 0))
    assert top_left == 255, "Expected white background"
Exemplo n.º 3
0
def test_oddwidth_grayscale(bits, check_pixels):
    """Extract a hand-built 3x2 DeviceGray image and verify selected pixels.

    ``bits`` is used as the image's BitsPerComponent; ``check_pixels`` yields
    ``(x, y, expected_value)`` triples compared against the extracted image.
    """
    pdf = pikepdf.new()
    pdf.add_blank_page(page_size=(108, 72))
    page = pdf.pages[0]

    packed_pixels = bytes([0b00011011, 0b11011000, 0b00000001])
    imobj = Stream(
        pdf,
        packed_pixels,
        BitsPerComponent=bits,
        ColorSpace=Name.DeviceGray,
        Width=3,
        Height=2,
        Type=Name.XObject,
        Subtype=Name.Image,
    )

    # Paint the image over the whole page and register it as /Im0.
    page.Contents = Stream(pdf, b'108 0 0 72 0 0 cm /Im0 Do')
    page.Resources = Dictionary(XObject=Dictionary(Im0=imobj))

    pim = PdfImage(page.Resources.XObject.Im0)
    assert pim.mode == 'L'
    assert pim.bits_per_component == bits

    sink = BytesIO()
    pim.extract_to(stream=sink)
    sink.seek(0)
    extracted = Image.open(sink)
    assert extracted.mode == 'L'
    assert extracted.size == (3, 2)

    for x, y, expected in check_pixels:
        assert extracted.getpixel((x, y)) == expected
Exemplo n.º 4
0
def test_jbig2_not_available(jbig2, monkeypatch):
    """Decoding must raise DependencyError when jbig2dec is reported unavailable."""
    image_obj, _owner = jbig2
    pim = PdfImage(image_obj)

    def _unavailable():
        return False

    monkeypatch.setattr(pikepdf.jbig2, 'jbig2dec_available', _unavailable)

    with pytest.raises(DependencyError):
        pim.as_pil_image()
Exemplo n.º 5
0
def test_jbig2_global_palette(resources):
    """The jbig2global image decodes at full size after an Indexed palette is assigned."""
    image_obj, _owner = first_image_in(resources / 'jbig2global.pdf')

    # Two-entry RGB lookup table: black, then white.
    lookup = b'\x00\x00\x00\xff\xff\xff'
    image_obj.ColorSpace = pikepdf.Array([Name.Indexed, Name.DeviceRGB, 1, lookup])

    decoded = PdfImage(image_obj).as_pil_image()
    assert decoded.size == (4000, 2864)
Exemplo n.º 6
0
def test_image_ccitt(sandwich):
    """The sandwich image is 1-bit CCITTFaxDecode data and extracts with a .tif extension."""
    image_obj = sandwich[0]
    pim = PdfImage(image_obj)

    assert pim.bits_per_component == 1
    assert pim.filters == ['/CCITTFaxDecode']

    sink = BytesIO()
    extension = pim.extract_to(stream=sink)
    assert extension == '.tif'
Exemplo n.º 7
0
def test_image_roundtrip(outdir, w, h, pixeldata, cs, bpc):
    """Save a one-page PDF around a raw image, reopen it and check the image.

    The image stream holds ``pixeldata`` repeated ``w * h`` times and is tagged
    with colorspace ``cs`` and ``bpc`` bits per component. After saving without
    stream compression, the reopened image's metadata, raw bytes and extracted
    mode are verified.
    """
    pdf = Pdf.new()

    image = Stream(pdf, pixeldata * (w * h))
    image.Type = Name('/XObject')
    image.Subtype = Name('/Image')
    image.ColorSpace = Name(cs)
    image.BitsPerComponent = bpc
    image.Width = w
    image.Height = h

    page_dict = {
        '/Type': Name('/Page'),
        '/MediaBox': [0, 0, 100, 100],
        '/Contents': Stream(pdf, b'q 100 0 0 100 0 0 cm /Im1 Do Q'),
        '/Resources': {'/XObject': {'/Im1': image}},
    }
    pdf.pages.append(pdf.make_indirect(page_dict))

    outfile = outdir / f'test{w}{h}{cs[1:]}{bpc}.pdf'
    pdf.save(
        outfile, compress_streams=False, stream_decode_level=StreamDecodeLevel.none
    )

    with Pdf.open(outfile) as reopened:
        pim = PdfImage(reopened.pages[0].Resources.XObject['/Im1'])

        assert pim.bits_per_component == bpc
        assert pim.colorspace == cs
        assert pim.width == w
        assert pim.height == h

        # Work out which mode this colorspace/depth pair should report, if any.
        if cs == '/DeviceRGB':
            expected_mode = 'RGB'
        elif cs == '/DeviceGray' and bpc == 8:
            expected_mode = 'L'
        elif cs == '/DeviceCMYK':
            expected_mode = 'CMYK'
        elif bpc == 1:
            expected_mode = '1'
        else:
            expected_mode = None
        if expected_mode is not None:
            assert pim.mode == expected_mode
        assert not pim.palette

        assert pim.filters == []
        assert pim.read_bytes() == pixeldata

        sink = BytesIO()
        pim.extract_to(stream=sink)
        sink.seek(0)
        extracted = Image.open(sink)
        assert pim.mode == extracted.mode
Exemplo n.º 8
0
def test_ccitt_encodedbytealign(sandwich):
    """Decoding an image flagged EncodedByteAlign must raise UnsupportedImageTypeError."""
    image_obj, _owner = sandwich

    # No FOSS sample of an "EncodedByteAlign" image is available, so fake one
    # by setting the flag on the sandwich image's decode parameters.
    image_obj.DecodeParms.EncodedByteAlign = True

    pim = PdfImage(image_obj)
    with pytest.raises(UnsupportedImageTypeError):
        pim.as_pil_image()
Exemplo n.º 9
0
def test_jbig2_global_palette(first_image_in):
    """The jbig2global image decodes and loads after an Indexed palette is assigned."""
    image_obj, _owner = first_image_in('jbig2global.pdf')

    # Two-entry RGB lookup table: black, then white.
    lookup = b'\x00\x00\x00\xff\xff\xff'
    image_obj.ColorSpace = pikepdf.Array([Name.Indexed, Name.DeviceRGB, 1, lookup])

    decoded = PdfImage(image_obj).as_pil_image()
    assert decoded.size == (4000, 2864)
    assert decoded.getpixel((0, 0)) == 255  # Ensure loaded
Exemplo n.º 10
0
def test_image_replace(congress, outdir):
    """Overwrite the congress image with Flate-compressed grayscale data and save the PDF."""
    image_obj = congress[0]
    grayscale = PdfImage(image_obj).as_pil_image().convert('L')

    compressed = zlib.compress(grayscale.tobytes())
    image_obj.write(compressed, Name("/FlateDecode"), Null())
    image_obj.ColorSpace = Name("/DeviceGray")

    congress[1].save(outdir / 'congress_gray.pdf')
Exemplo n.º 11
0
def test_image_replace(congress, outdir):
    """Overwrite the congress image with a 4x4 Flate-compressed grayscale copy and save."""
    image_obj = congress[0]
    decoded = PdfImage(image_obj).as_pil_image()

    # Shrink to 4x4 so a failure does not dump a huge image.
    thumbnail = decoded.convert('L').resize((4, 4))

    compressed = zlib.compress(thumbnail.tobytes())
    image_obj.write(compressed, filter=Name("/FlateDecode"))
    image_obj.ColorSpace = Name("/DeviceGray")

    congress[1].save(outdir / 'congress_gray.pdf')
Exemplo n.º 12
0
def test_image_palette(resources, filename, bpc):
    """Check palette metadata of the first image on page 1 and extract it.

    ``filename`` is resolved relative to ``resources``; ``bpc`` is the
    expected bits per component of the image.
    """
    # Open via the context manager so the file is closed even when an
    # assertion fails.
    with Pdf.open(resources / filename) as pdf:
        pim = PdfImage(next(iter(pdf.pages[0].images.values())))

        assert pim.palette[0] == 'RGB'
        assert pim.colorspace == '/DeviceRGB'
        assert not pim.is_inline
        assert pim.mode == 'P'
        assert pim.bits_per_component == bpc

        outstream = BytesIO()
        pim.extract_to(stream=outstream)
Exemplo n.º 13
0
def test_jbig2_not_available(jbig2, monkeypatch):
    """When launching jbig2dec raises FileNotFoundError, decoding raises DependencyError."""
    image_obj, _owner = jbig2
    pim = PdfImage(image_obj)

    def _missing_executable(*args, **kwargs):
        raise FileNotFoundError('jbig2dec')

    # Replace the runner used by pikepdf.jbig2 so every invocation fails.
    monkeypatch.setattr(pikepdf.jbig2, 'run', _missing_executable)
    assert not pikepdf.jbig2.jbig2dec_available()

    with pytest.raises(DependencyError):
        pim.as_pil_image()
Exemplo n.º 14
0
def test_jp2(resources):
    """Check metadata, equality and extraction of the image in pike-jp2.pdf.

    After the explicit /ColorSpace entry is deleted, the colorspace and bit
    depth must still be reported.
    """
    # Open via the context manager so the file is closed even when an
    # assertion fails.
    with Pdf.open(resources / 'pike-jp2.pdf') as pdf:
        xobj = next(iter(pdf.pages[0].images.values()))
        pim = PdfImage(xobj)
        assert isinstance(pim, PdfJpxImage)

        assert '/JPXDecode' in pim.filters
        assert pim.colorspace == '/DeviceRGB'
        assert not pim.is_inline
        assert not pim.indexed
        assert pim.mode == 'RGB'
        assert pim.bits_per_component == 8
        assert pim.__eq__(42) is NotImplemented
        assert pim == PdfImage(xobj)

        outstream = BytesIO()
        pim.extract_to(stream=outstream)
        del pim
        del xobj.ColorSpace

        # If there is no explicit ColorSpace metadata we should get it from the
        # compressed data stream
        pim = PdfImage(xobj)
        assert pim.colorspace == '/DeviceRGB'
        assert pim.bits_per_component == 8
Exemplo n.º 15
0
def test_lowlevel_jpeg(congress, outdir):
    """Raw stream bytes are a JPEG; decoding via read_bytes() fails; extraction works.

    ``outdir`` is not used by the body; it is kept so the fixture signature
    stays unchanged.
    """
    raw_bytes = congress[0].read_raw_bytes()
    with pytest.raises(PdfError):
        congress[0].read_bytes()

    # imghdr was deprecated in Python 3.11 and removed in 3.13, so let Pillow
    # identify the raw stream format instead.
    raw_im = Image.open(BytesIO(raw_bytes))
    assert raw_im.format == 'JPEG'

    pim = PdfImage(congress[0])
    b = BytesIO()
    pim.extract_to(stream=b)
    b.seek(0)
    im = Image.open(b)
    assert im.size == (congress[0].Width, congress[0].Height)
    assert im.mode == 'RGB'
Exemplo n.º 16
0
def test_lowlevel_jpeg(congress):
    """Raw stream bytes are a JPEG; decoding via read_bytes() fails; extraction works."""
    image_obj = congress[0]

    raw_bytes = image_obj.read_raw_bytes()
    with pytest.raises(PdfError):
        image_obj.read_bytes()

    # Pillow must recognise the undecoded stream as a JPEG file.
    sniffed = Image.open(BytesIO(raw_bytes))
    assert sniffed.format == 'JPEG'

    sink = BytesIO()
    PdfImage(image_obj).extract_to(stream=sink)
    sink.seek(0)
    extracted = Image.open(sink)
    assert extracted.size == (image_obj.Width, image_obj.Height)
    assert extracted.mode == 'RGB'
Exemplo n.º 17
0
def test_direct_extract(resources, filename, bpc, filters, ext, mode, format_):
    """Extract the first image of ``filename`` and check extension, mode and format.

    ``bpc`` and ``filters`` are the expected image metadata; ``ext`` is the
    expected return value of extract_to(); ``mode`` and ``format_`` are the
    expected Pillow mode and format of the extracted data.
    """
    image_obj, _owner = first_image_in(resources / filename)
    pim = PdfImage(image_obj)

    assert pim.bits_per_component == bpc
    assert pim.filters == filters

    sink = BytesIO()
    assert pim.extract_to(stream=sink) == ext, 'unexpected output file'
    sink.seek(0)

    extracted = Image.open(sink)
    assert extracted.mode == mode
    assert extracted.format == format_
Exemplo n.º 18
0
def test_icc_palette(resources):
    """The ICC profile of a palette image must be carried into the extracted file."""
    image_obj, _owner = first_image_in(resources / 'pink-palette-icc.pdf')
    pim = PdfImage(image_obj)
    assert pim.icc.profile.xcolor_space == 'RGB '  # with trailing space

    sink = BytesIO()
    pim.extract_to(stream=sink)
    sink.seek(0)

    extracted = Image.open(sink)
    assert extracted.size == (image_obj.Width, image_obj.Height)
    assert extracted.mode == 'P'

    # Rebuild a profile from the bytes Pillow found in the extracted file.
    embedded_bytes = extracted.info.get('icc_profile')
    embedded_profile = ImageCms.ImageCmsProfile(BytesIO(embedded_bytes))

    assert embedded_profile.tobytes() == pim.icc.tobytes()
Exemplo n.º 19
0
def extract_image_filter(
        pike: Pdf, root: Path, image: Object,
        xref: Xref) -> Optional[Tuple[PdfImage, Tuple[Name, Object]]]:
    """Decide whether an image object should be processed further.

    Args:
        pike: The PDF being processed (not used by this function's body).
        root: Working directory (not used by this function's body).
        image: The candidate object; only image XObjects are considered.
        xref: Identifier of ``image``, used only in debug log messages.

    Returns:
        ``(pim, filtdp)`` where ``pim`` wraps ``image`` and ``filtdp`` is the
        first entry of ``pim.filter_decodeparms``, or ``None`` when the image
        is skipped. Skipped: non-image objects, streams shorter than 100
        bytes, images under 8 pixels on a side, images with more than one
        filter, more than 8 bits per component, JPXDecode images, and images
        with a /Decode entry.
    """
    if image.Subtype != Name.Image:
        return None
    if image.Length < 100:
        log.debug(f"Skipping small image, xref {xref}")
        return None
    if image.Width < 8 or image.Height < 8:  # Issue 732
        log.debug(f"Skipping oddly sized image, xref {xref}")
        return None

    pim = PdfImage(image)

    if len(pim.filter_decodeparms) > 1:
        log.debug(f"Skipping multiply filtered image, xref {xref}")
        return None
    # NOTE(review): assumes filter_decodeparms has at least one entry here;
    # confirm that unfiltered images cannot reach this point.
    filtdp = pim.filter_decodeparms[0]

    if pim.bits_per_component > 8:
        log.debug(f"Skipping wide gamut image, xref {xref}")
        return None  # Don't mess with wide gamut images

    if filtdp[0] == Name.JPXDecode:
        log.debug(f"Skipping JPEG2000 image, xref {xref}")
        return None  # Don't do JPEG2000

    if Name.Decode in image:
        log.debug(f"Skipping image with Decode table, xref {xref}")
        return None  # Don't mess with custom Decode tables

    return pim, filtdp
Exemplo n.º 20
0
def test_stacked_compression(resources):
    """An image with stacked Flate and JPX filters reports RGB metadata and both filters."""
    image_obj, _owner = first_image_in(resources / 'pike-flate-jp2.pdf')
    pim = PdfImage(image_obj)

    expected_filters = ['/FlateDecode', '/JPXDecode']
    assert pim.mode == 'RGB'
    assert pim.colorspace == '/DeviceRGB'
    assert pim.bits_per_component == 8
    assert pim.filters == expected_filters
Exemplo n.º 21
0
def test_invalid_icc(resources):
    """Reading .icc after the profile stream is overwritten must raise UnsupportedImageTypeError."""
    image_obj, _owner = first_image_in(resources / 'pink-palette-icc.pdf')

    # Colorspace layout: [/Indexed [/ICCBased <stream>]]
    icc_stream = image_obj.ColorSpace[1][1]
    icc_stream.write(b'foobar')  # corrupt the ICC profile

    expect_failure = pytest.raises(
        UnsupportedImageTypeError, match="ICC profile corrupt or not readable"
    )
    with expect_failure:
        pim = PdfImage(image_obj)
        _icc = pim.icc
Exemplo n.º 22
0
def test_icc_use(first_image_in):
    """The 1-bit ICC-based image reports mode 'L' and a GRAY profile."""
    image_obj, _owner = first_image_in('1biticc.pdf')
    pim = PdfImage(image_obj)

    # It may be 1 bit per pixel but it's more complex than that
    assert pim.mode == 'L'
    assert pim.colorspace == '/ICCBased'
    assert pim.bits_per_component == 1

    profile = pim.icc.profile
    assert profile.xcolor_space == 'GRAY'
Exemplo n.º 23
0
def test_icc_use(resources):
    """The 1-bit ICC-based image reports mode '1' and a GRAY profile."""
    image_obj, _owner = first_image_in(resources / '1biticc.pdf')
    pim = PdfImage(image_obj)

    assert pim.mode == '1'
    assert pim.colorspace == '/ICCBased'
    assert pim.bits_per_component == 1

    profile = pim.icc.profile
    assert profile.xcolor_space == 'GRAY'
Exemplo n.º 24
0
def extract_image_pikepdf(page):
    """Extract the single embedded image of a page as a PIL Image.

    This function uses PikePDF to extract the image. It works on the
    assumption that the scan is included as a single embedded image whose
    aspect ratio matches that of the complete page (up to a 3% relative
    error).

    Parameters
    ----------
    page: pikepdf.Page
        Page from which to extract the image

    Returns
    -------
    image : PIL Image
        The extracted image data

    Raises
    ------
    ValueError
        if not exactly one image is found on the page, if the page or the
        image has zero height, or if the image does not have the same aspect
        ratio as the page
    AttributeError
        if the MediaBox of a page is not defined
    """
    images = page.images

    # Check whether only one image is embedded within the page.
    if len(images) != 1:
        raise ValueError('Not exactly 1 image present on the page.')

    pdf_image = PdfImage(next(iter(images.values())))
    media_box = page.MediaBox
    pdf_width = float(media_box[2] - media_box[0])
    pdf_height = float(media_box[3] - media_box[1])

    # A degenerate page or image would otherwise surface as an undocumented
    # ZeroDivisionError in the ratio computations below.
    if pdf_height == 0 or pdf_image.height == 0:
        raise ValueError('Image has incorrect dimensions')

    pdf_ratio = pdf_width / pdf_height
    image_ratio = pdf_image.width / pdf_image.height

    # Check if the aspect ratio of the image is the same as the aspect ratio
    # of the page up to a 3% relative error.
    if abs(pdf_ratio - image_ratio) > 0.03 * pdf_ratio:
        raise ValueError('Image has incorrect dimensions')
    return pdf_image.as_pil_image()
Exemplo n.º 25
0
    def read_document(self, file):
        """OCR the first image on the first page of a PDF and process the text.

        Opens ``file`` with pikepdf, converts the first image of page 1 to a
        PIL image, runs pytesseract on it and returns
        ``self.process_text(text)``.

        Raises StopIteration if the first page contains no images.
        """
        # The context manager closes the PDF even if extraction or OCR fails.
        with Pdf.open(file) as pdf:
            print(pdf)
            document = pdf.pages[0]

            # items() returns a view, not an iterator, so it must be wrapped
            # in iter() before next() can be applied.
            _name, raw_image = next(iter(document.images.items()))

            image = PdfImage(raw_image).as_pil_image()

            text = pytesseract.image_to_string(image)

        return self.process_text(text)
Exemplo n.º 26
0
def test_ccitt_icc(resources):
    """Extracted TIFF data contains the ICC marker only after a profile is attached."""
    image_obj, pdf = first_image_in(resources / 'sandwich.pdf')

    # First pass: the image has no ICC profile.
    plain = PdfImage(image_obj)
    assert plain.icc is None
    sink = BytesIO()
    extension = plain.extract_to(stream=sink)
    assert extension == '.tif'
    sink.seek(0)
    assert b'GRAYXYZ' not in sink.read(1000)
    sink.seek(0)
    assert Image.open(sink)

    # Attach Gray.icc as a one-component ICCBased colorspace.
    profile_stream = pdf.make_stream((resources / 'Gray.icc').read_bytes())
    profile_stream.N = 1
    image_obj.ColorSpace = pikepdf.Array([Name.ICCBased, profile_stream])

    # Second pass: the profile must show up in the extracted file.
    profiled = PdfImage(image_obj)
    assert profiled.icc.profile.xcolor_space == 'GRAY'
    sink = BytesIO()
    extension = profiled.extract_to(stream=sink)
    assert extension == '.tif'
    sink.seek(0)
    assert b'GRAYXYZ' in sink.read(1000)
    sink.seek(0)
    assert Image.open(sink)
Exemplo n.º 27
0
def test_image_eq(trivial, congress, inline):
    """Equality between PdfImage wrappers, inline images and unrelated objects."""
    # Note: JPX equality is tested in test_jp2 (if we have a jpeg2000 codec)
    trivial_obj = trivial[0]
    congress_obj = congress[0]

    assert PdfImage(trivial_obj) == PdfImage(trivial_obj)
    assert PdfImage(trivial_obj).__eq__(42) is NotImplemented
    assert PdfImage(trivial_obj) != PdfImage(congress_obj)

    assert inline != PdfImage(congress_obj)
    assert inline.__eq__(42) is NotImplemented
Exemplo n.º 28
0
def test_extract_direct_fails_nondefault_colortransform(congress):
    """_extract_direct must refuse an image whose ColorTransform is 42.

    Checked with the image's existing colorspace and again after switching it
    to DeviceCMYK.
    """
    image_obj, _owner = congress

    # Non standard (or allowed in the spec)
    image_obj.DecodeParms = Dictionary(ColorTransform=42)

    sink = BytesIO()
    with pytest.raises(UnsupportedImageTypeError):
        PdfImage(image_obj)._extract_direct(stream=sink)

    image_obj.ColorSpace = Name.DeviceCMYK
    with pytest.raises(UnsupportedImageTypeError):
        PdfImage(image_obj)._extract_direct(stream=sink)
Exemplo n.º 29
0
def test_jbig2_error(resources, monkeypatch):
    """A CalledProcessError from the jbig2dec runner must propagate out of as_pil_image()."""
    image_obj, _owner = first_image_in(resources / 'jbig2global.pdf')
    pim = PdfImage(image_obj)

    def _always_available():
        return True

    def _failing_run(*args, **kwargs):
        raise subprocess.CalledProcessError(1, 'jbig2dec')

    monkeypatch.setattr(pikepdf.jbig2, 'jbig2dec_available', _always_available)
    monkeypatch.setattr(pikepdf.jbig2, 'run', _failing_run)

    # Wrap the image again now that both patches are in place.
    pim = PdfImage(image_obj)
    with pytest.raises(subprocess.CalledProcessError):
        pim.as_pil_image()
Exemplo n.º 30
0
def test_jbig2_too_old(resources, monkeypatch):
    """A reported jbig2dec version of 0.12 must raise DependencyError matching 'too old'."""
    image_obj, _owner = first_image_in(resources / 'jbig2global.pdf')
    pim = PdfImage(image_obj)

    def _fake_old_version(subprocargs, *args, **kwargs):
        # Answer only the version query; pass every other call through.
        if '--version' not in subprocargs:
            return subprocess.run(subprocargs, *args, **kwargs)
        return subprocess.CompletedProcess(subprocargs, 0, 'jbig2dec 0.12\n')

    monkeypatch.setattr(pikepdf.jbig2, 'run', _fake_old_version)

    # Wrap the image again now that the patch is in place.
    pim = PdfImage(image_obj)
    with pytest.raises(DependencyError, match='too old'):
        pim.as_pil_image()