def test_extract_filepath(congress, outdir):
    """Extracting with a file prefix returns an existing path and creates ``image.jpg``."""
    image_obj, _pdf = congress
    target_prefix = outdir / 'image'
    written = PdfImage(image_obj).extract_to(fileprefix=target_prefix)
    assert Path(written).exists()
    assert (outdir / 'image.jpg').exists()
def test_ccitt_photometry(sandwich):
    """The top-left pixel of the decoded image is white after conversion to mode ``L``."""
    image_obj, _pdf = sandwich
    grayscale = PdfImage(image_obj).as_pil_image().convert('L')
    top_left = grayscale.getpixel((0, 0))
    assert top_left == 255, "Expected white background"
def test_oddwidth_grayscale(bits, check_pixels):
    """Extract a 3x2 DeviceGray image stored at ``bits`` bits per component.

    A one-page PDF is built in memory around the image, the image is extracted
    to a stream, and every ``(x, y, value)`` triple in ``check_pixels`` is
    compared against the decoded result.
    """
    doc = pikepdf.new()
    doc.add_blank_page(page_size=(108, 72))

    packed_pixels = bytes([0b00011011, 0b11011000, 0b00000001])
    image_stream = Stream(
        doc,
        packed_pixels,
        BitsPerComponent=bits,
        ColorSpace=Name.DeviceGray,
        Width=3,
        Height=2,
        Type=Name.XObject,
        Subtype=Name.Image,
    )
    doc.pages[0].Contents = Stream(doc, b'108 0 0 72 0 0 cm /Im0 Do')
    doc.pages[0].Resources = Dictionary(XObject=Dictionary(Im0=image_stream))

    wrapped = PdfImage(doc.pages[0].Resources.XObject.Im0)
    assert wrapped.mode == 'L'
    assert wrapped.bits_per_component == bits

    buffer = BytesIO()
    wrapped.extract_to(stream=buffer)
    buffer.seek(0)
    decoded = Image.open(buffer)
    assert decoded.mode == 'L'
    assert decoded.size == (3, 2)

    for x, y, expected in check_pixels:
        assert decoded.getpixel((x, y)) == expected
def test_jbig2_not_available(jbig2, monkeypatch):
    """Decoding raises ``DependencyError`` when jbig2dec is reported as unavailable."""
    image_obj, _pdf = jbig2
    image = PdfImage(image_obj)

    def report_unavailable():
        return False

    # Make the availability probe claim that the decoder is missing.
    monkeypatch.setattr(pikepdf.jbig2, 'jbig2dec_available', report_unavailable)

    with pytest.raises(DependencyError):
        image.as_pil_image()
def test_jbig2_global_palette(resources):
    """An image given a two-entry Indexed/DeviceRGB colour space decodes at full size."""
    image_obj, _pdf = first_image_in(resources / 'jbig2global.pdf')
    palette_bytes = b'\x00\x00\x00\xff\xff\xff'
    image_obj.ColorSpace = pikepdf.Array(
        [Name.Indexed, Name.DeviceRGB, 1, palette_bytes]
    )
    decoded = PdfImage(image_obj).as_pil_image()
    assert decoded.size == (4000, 2864)
def test_image_ccitt(sandwich):
    """A 1-bit CCITT image reports its filter and is extracted with a ``.tif`` extension."""
    image = PdfImage(sandwich[0])
    assert image.bits_per_component == 1
    assert image.filters == ['/CCITTFaxDecode']

    sink = BytesIO()
    extension = image.extract_to(stream=sink)
    assert extension == '.tif'
def test_image_roundtrip(outdir, w, h, pixeldata, cs, bpc):
    """Write a single-image PDF, reopen it, and check the image survives unchanged.

    The PDF is saved with stream compression and decoding disabled, then the
    reopened image's metadata, raw bytes and extracted mode are verified.
    """
    pdf = Pdf.new()

    # Build the image XObject from the repeated pixel pattern.
    image = Stream(pdf, pixeldata * (w * h))
    image.Type = Name('/XObject')
    image.Subtype = Name('/Image')
    image.Width = w
    image.Height = h
    image.BitsPerComponent = bpc
    image.ColorSpace = Name(cs)

    # A page that simply paints the image.
    page = pdf.make_indirect(
        {
            '/Type': Name('/Page'),
            '/MediaBox': [0, 0, 100, 100],
            '/Contents': Stream(pdf, b'q 100 0 0 100 0 0 cm /Im1 Do Q'),
            '/Resources': {'/XObject': {'/Im1': image}},
        }
    )
    pdf.pages.append(page)

    saved_path = outdir / f'test{w}{h}{cs[1:]}{bpc}.pdf'
    pdf.save(
        saved_path,
        compress_streams=False,
        stream_decode_level=StreamDecodeLevel.none,
    )

    with Pdf.open(saved_path) as reopened:
        wrapped = PdfImage(reopened.pages[0].Resources.XObject['/Im1'])

        assert wrapped.bits_per_component == bpc
        assert wrapped.colorspace == cs
        assert wrapped.width == w
        assert wrapped.height == h

        # The expected Pillow mode depends on colour space and bit depth.
        if cs == '/DeviceRGB':
            assert wrapped.mode == 'RGB'
        elif cs == '/DeviceGray' and bpc == 8:
            assert wrapped.mode == 'L'
        elif cs == '/DeviceCMYK':
            assert wrapped.mode == 'CMYK'
        elif bpc == 1:
            assert wrapped.mode == '1'

        assert not wrapped.palette
        assert wrapped.filters == []
        assert wrapped.read_bytes() == pixeldata

        sink = BytesIO()
        wrapped.extract_to(stream=sink)
        sink.seek(0)
        extracted = Image.open(sink)
        assert wrapped.mode == extracted.mode
def test_ccitt_encodedbytealign(sandwich):
    """Decoding an image flagged ``EncodedByteAlign`` raises ``UnsupportedImageTypeError``."""
    image_obj, _pdf = sandwich
    # Pretend this image is "EncodedByteAlign". We don't have a FOSS
    # example of such an image.
    image_obj.DecodeParms.EncodedByteAlign = True
    image = PdfImage(image_obj)
    with pytest.raises(UnsupportedImageTypeError):
        image.as_pil_image()
def test_jbig2_global_palette(first_image_in):
    """An image given a two-entry Indexed palette decodes at full size with pixel data loaded."""
    image_obj, _pdf = first_image_in('jbig2global.pdf')
    palette_bytes = b'\x00\x00\x00\xff\xff\xff'
    image_obj.ColorSpace = pikepdf.Array(
        [Name.Indexed, Name.DeviceRGB, 1, palette_bytes]
    )
    decoded = PdfImage(image_obj).as_pil_image()
    assert decoded.size == (4000, 2864)
    # Reading a pixel ensures the image data is actually loaded.
    assert decoded.getpixel((0, 0)) == 255
def test_image_replace(congress, outdir):
    """Replace the image's data with a grayscale, Flate-compressed copy and save the PDF."""
    image_obj = congress[0]
    grayscale = PdfImage(image_obj).as_pil_image().convert('L')

    compressed = zlib.compress(grayscale.tobytes())
    image_obj.write(compressed, Name("/FlateDecode"), Null())
    image_obj.ColorSpace = Name("/DeviceGray")

    owning_pdf = congress[1]
    owning_pdf.save(outdir / 'congress_gray.pdf')
def test_image_replace(congress, outdir):
    """Replace the image's data with a small grayscale, Flate-compressed copy and save the PDF."""
    image_obj = congress[0]
    grayscale = PdfImage(image_obj).as_pil_image().convert('L')
    # So it is not obnoxious on error
    thumbnail = grayscale.resize((4, 4))

    compressed = zlib.compress(thumbnail.tobytes())
    image_obj.write(compressed, filter=Name("/FlateDecode"))
    image_obj.ColorSpace = Name("/DeviceGray")

    owning_pdf = congress[1]
    owning_pdf.save(outdir / 'congress_gray.pdf')
def test_image_palette(resources, filename, bpc):
    """Check the metadata of a palette image and that it can be extracted.

    The PDF is opened with a context manager so the file is closed even when
    an assertion fails.
    """
    with Pdf.open(resources / filename) as pdf:
        pim = PdfImage(next(iter(pdf.pages[0].images.values())))

        assert pim.palette[0] == 'RGB'
        assert pim.colorspace == '/DeviceRGB'
        assert not pim.is_inline
        assert pim.mode == 'P'
        assert pim.bits_per_component == bpc

        outstream = BytesIO()
        pim.extract_to(stream=outstream)
def test_jbig2_not_available(jbig2, monkeypatch):
    """When running jbig2dec fails with ``FileNotFoundError``, decoding raises ``DependencyError``."""
    image_obj, _pdf = jbig2
    image = PdfImage(image_obj)

    def executable_missing(*args, **kwargs):
        raise FileNotFoundError('jbig2dec')

    # Every attempt to launch the decoder now behaves as if it is not installed.
    monkeypatch.setattr(pikepdf.jbig2, 'run', executable_missing)

    assert not pikepdf.jbig2.jbig2dec_available()
    with pytest.raises(DependencyError):
        image.as_pil_image()
def test_jp2(resources):
    """Check metadata, equality and extraction of a JPEG 2000 image.

    The second half removes the explicit ``ColorSpace`` entry and verifies the
    colour space and bit depth are still reported. The PDF is opened with a
    context manager so the file is closed even when an assertion fails.
    """
    with Pdf.open(resources / 'pike-jp2.pdf') as pdf:
        xobj = next(iter(pdf.pages[0].images.values()))
        pim = PdfImage(xobj)
        assert isinstance(pim, PdfJpxImage)

        assert '/JPXDecode' in pim.filters
        assert pim.colorspace == '/DeviceRGB'
        assert not pim.is_inline
        assert not pim.indexed
        assert pim.mode == 'RGB'
        assert pim.bits_per_component == 8
        assert pim.__eq__(42) is NotImplemented
        assert pim == PdfImage(xobj)

        outstream = BytesIO()
        pim.extract_to(stream=outstream)
        del pim
        del xobj.ColorSpace

        # If there is no explicit ColorSpace metadata we should get it from the
        # compressed data stream
        pim = PdfImage(xobj)
        assert pim.colorspace == '/DeviceRGB'
        assert pim.bits_per_component == 8
def test_lowlevel_jpeg(congress, outdir):
    """Check that the raw stream is a JPEG and that extraction yields an RGB image.

    The raw bytes are identified with Pillow rather than ``imghdr``, because
    ``imghdr`` was removed from the standard library in Python 3.13.

    Args:
        congress: Fixture whose first element is the image object under test.
        outdir: Unused; kept so the test's signature is unchanged.
    """
    image_obj = congress[0]
    raw_bytes = image_obj.read_raw_bytes()
    with pytest.raises(PdfError):
        image_obj.read_bytes()

    # Pillow reports the container format from the file header.
    assert Image.open(BytesIO(raw_bytes)).format == 'JPEG'

    pim = PdfImage(image_obj)
    b = BytesIO()
    pim.extract_to(stream=b)
    b.seek(0)
    im = Image.open(b)
    assert im.size == (image_obj.Width, image_obj.Height)
    assert im.mode == 'RGB'
def test_lowlevel_jpeg(congress):
    """Check that the raw stream is a JPEG and that extraction yields an RGB image of matching size."""
    image_obj = congress[0]
    encoded = image_obj.read_raw_bytes()
    with pytest.raises(PdfError):
        image_obj.read_bytes()

    sniffed = Image.open(BytesIO(encoded))
    assert sniffed.format == 'JPEG'

    sink = BytesIO()
    PdfImage(image_obj).extract_to(stream=sink)
    sink.seek(0)
    extracted = Image.open(sink)
    assert extracted.size == (image_obj.Width, image_obj.Height)
    assert extracted.mode == 'RGB'
def test_direct_extract(resources, filename, bpc, filters, ext, mode, format_):
    """Extract the first image of ``filename`` and verify its extension, mode and format."""
    image_obj, _pdf = first_image_in(resources / filename)
    image = PdfImage(image_obj)

    assert image.bits_per_component == bpc
    assert image.filters == filters

    sink = BytesIO()
    produced_ext = image.extract_to(stream=sink)
    assert produced_ext == ext, 'unexpected output file'

    sink.seek(0)
    extracted = Image.open(sink)
    assert extracted.mode == mode
    assert extracted.format == format_
def test_icc_palette(resources):
    """The ICC profile of a palette image is carried over unchanged into the extracted file."""
    image_obj, _pdf = first_image_in(resources / 'pink-palette-icc.pdf')
    image = PdfImage(image_obj)
    assert image.icc.profile.xcolor_space == 'RGB '  # with trailing space

    sink = BytesIO()
    image.extract_to(stream=sink)
    sink.seek(0)
    extracted = Image.open(sink)
    assert extracted.size == (image_obj.Width, image_obj.Height)
    assert extracted.mode == 'P'

    # Rebuild a profile from the bytes Pillow found and compare with the source profile.
    embedded_icc = extracted.info.get('icc_profile')
    extracted_profile = ImageCms.ImageCmsProfile(BytesIO(embedded_icc))
    assert extracted_profile.tobytes() == image.icc.tobytes()
def extract_image_filter(
    pike: Pdf, root: Path, image: Object, xref: Xref
) -> Optional[Tuple[PdfImage, Tuple[Name, Object]]]:
    """Decide whether an image should be handled and return its filter information.

    Args:
        pike: Not used by this function.
        root: Not used by this function.
        image: The object to inspect.
        xref: Identifier of ``image``; only used in debug log messages.

    Returns:
        ``None`` if the object is skipped for any of the reasons logged below.
        Otherwise a tuple ``(pim, filtdp)`` where ``pim`` is the ``PdfImage``
        wrapping ``image`` and ``filtdp`` is the first entry of
        ``pim.filter_decodeparms``.
    """
    if image.Subtype != Name.Image:
        return None
    if image.Length < 100:
        log.debug(f"Skipping small image, xref {xref}")
        return None
    if image.Width < 8 or image.Height < 8:  # Issue 732
        log.debug(f"Skipping oddly sized image, xref {xref}")
        return None

    pim = PdfImage(image)

    if len(pim.filter_decodeparms) > 1:
        log.debug(f"Skipping multiply filtered image, xref {xref}")
        return None
    filtdp = pim.filter_decodeparms[0]

    if pim.bits_per_component > 8:
        log.debug(f"Skipping wide gamut image, xref {xref}")
        return None  # Don't mess with wide gamut images

    if filtdp[0] == Name.JPXDecode:
        log.debug(f"Skipping JPEG2000 image, xref {xref}")
        return None  # Don't do JPEG2000

    if Name.Decode in image:
        log.debug(f"Skipping image with Decode table, xref {xref}")
        return None  # Don't mess with custom Decode tables

    return pim, filtdp
def test_stacked_compression(resources):
    """An image with two stacked filters reports both filters and its RGB metadata."""
    image_obj, _pdf = first_image_in(resources / 'pike-flate-jp2.pdf')
    image = PdfImage(image_obj)

    assert image.mode == 'RGB'
    assert image.colorspace == '/DeviceRGB'
    assert image.bits_per_component == 8

    expected_filters = ['/FlateDecode', '/JPXDecode']
    assert image.filters == expected_filters
def test_invalid_icc(resources):
    """Accessing the ICC profile after overwriting it with junk raises ``UnsupportedImageTypeError``."""
    image_obj, _pdf = first_image_in(resources / 'pink-palette-icc.pdf')

    # [/Indexed [/ICCBased <stream>]]
    icc_stream = image_obj.ColorSpace[1][1]
    icc_stream.write(b'foobar')  # corrupt the ICC profile

    expected_message = "ICC profile corrupt or not readable"
    with pytest.raises(UnsupportedImageTypeError, match=expected_message):
        image = PdfImage(image_obj)
        _icc = image.icc
def test_icc_use(first_image_in):
    """A 1-bit ICC-based image reports mode ``L`` and a GRAY profile."""
    image_obj, _pdf = first_image_in('1biticc.pdf')
    image = PdfImage(image_obj)

    # It may be 1 bit per pixel but it's more complex than that
    assert image.mode == 'L'
    assert image.colorspace == '/ICCBased'
    assert image.bits_per_component == 1
    assert image.icc.profile.xcolor_space == 'GRAY'
def test_icc_use(resources):
    """A 1-bit ICC-based image reports mode ``1`` and a GRAY profile."""
    image_obj, _pdf = first_image_in(resources / '1biticc.pdf')
    image = PdfImage(image_obj)

    assert image.mode == '1'
    assert image.colorspace == '/ICCBased'
    assert image.bits_per_component == 1
    assert image.icc.profile.xcolor_space == 'GRAY'
def extract_image_pikepdf(page):
    """Return the single image embedded in a page as a PIL Image.

    The page is expected to be a scan: exactly one embedded image whose
    aspect ratio matches the aspect ratio of the page's MediaBox. Any other
    layout is rejected.

    Parameters
    ----------
    page: pikepdf.Page
        Page from which to extract the image

    Returns
    -------
    img_array : PIL Image
        The extracted image data

    Raises
    ------
    ValueError
        if not exactly one image is found on the page or the image does not
        have the same aspect ratio as the page (within 3% relative error)
    AttributeError
        if the MediaBox of a page is not defined
    """
    embedded = page.images

    # A scanned page must carry exactly one image.
    if len(embedded) != 1:
        raise ValueError('Not exactly 1 image present on the page.')

    only_key = list(embedded.keys())[0]
    scan = PdfImage(embedded[only_key])

    box = page.MediaBox
    page_width = float(box[2] - box[0])
    page_height = float(box[3] - box[1])
    page_ratio = page_width / page_height
    scan_ratio = scan.width / scan.height

    # Accept the image only if its aspect ratio is within 3% (relative to
    # the page ratio) of the page's aspect ratio.
    tolerance = 0.03 * page_ratio
    if abs(page_ratio - scan_ratio) > tolerance:
        raise ValueError('Image has incorrect dimensions')

    return scan.as_pil_image()
def read_document(self, file):
    """OCR the first image on the first page of a PDF and process the text.

    Args:
        file: Passed to ``Pdf.open``.

    Returns:
        Whatever ``self.process_text`` returns for the recognised text.

    Raises:
        StopIteration: If the first page has no images.
    """
    # The context manager closes the PDF even if decoding or OCR fails.
    with Pdf.open(file) as pdf:
        print(pdf)
        document = pdf.pages[0]
        # ``next`` needs an iterator; the mapping's views are wrapped in
        # ``iter`` so this works whether or not they are iterators themselves.
        raw_image = next(iter(document.images.values()))
        image = PdfImage(raw_image).as_pil_image()
        text = pytesseract.image_to_string(image)
    return self.process_text(text)
def test_ccitt_icc(resources):
    """Extracting to TIFF embeds the gray ICC profile only once one is attached."""
    image_obj, pdf = first_image_in(resources / 'sandwich.pdf')

    # First pass: no ICC profile, so the marker must be absent from the output.
    plain = PdfImage(image_obj)
    assert plain.icc is None
    plain_out = BytesIO()
    plain_ext = plain.extract_to(stream=plain_out)
    assert plain_ext == '.tif'
    plain_out.seek(0)
    assert b'GRAYXYZ' not in plain_out.read(1000)
    plain_out.seek(0)
    assert Image.open(plain_out)

    # Attach a one-component ICC profile as the image's colour space.
    profile_bytes = (resources / 'Gray.icc').read_bytes()
    profile_stream = pdf.make_stream(profile_bytes)
    profile_stream.N = 1
    image_obj.ColorSpace = pikepdf.Array([Name.ICCBased, profile_stream])

    # Second pass: the profile marker must now appear in the output.
    tagged = PdfImage(image_obj)
    assert tagged.icc.profile.xcolor_space == 'GRAY'
    tagged_out = BytesIO()
    tagged_ext = tagged.extract_to(stream=tagged_out)
    assert tagged_ext == '.tif'
    tagged_out.seek(0)
    assert b'GRAYXYZ' in tagged_out.read(1000)
    tagged_out.seek(0)
    assert Image.open(tagged_out)
def test_image_eq(trivial, congress, inline):
    """Images compare equal to themselves, unequal to others, and decline foreign types."""
    # Note: JPX equality is tested in test_jp2 (if we have a jpeg2000 codec)
    same_left = PdfImage(trivial[0])
    same_right = PdfImage(trivial[0])
    assert same_left == same_right

    foreign_result = PdfImage(trivial[0]).__eq__(42)
    assert foreign_result is NotImplemented

    assert PdfImage(trivial[0]) != PdfImage(congress[0])
    assert inline != PdfImage(congress[0])

    inline_foreign_result = inline.__eq__(42)
    assert inline_foreign_result is NotImplemented
def test_extract_direct_fails_nondefault_colortransform(congress):
    """Direct extraction is refused for a nonstandard ``ColorTransform``, also under CMYK."""
    image_obj, _pdf = congress

    # Non standard (or allowed in the spec)
    image_obj.DecodeParms = Dictionary(ColorTransform=42)
    sink = BytesIO()

    image = PdfImage(image_obj)
    with pytest.raises(UnsupportedImageTypeError):
        image._extract_direct(stream=sink)

    # Same expectation once the colour space is switched to CMYK.
    image_obj.ColorSpace = Name.DeviceCMYK
    image = PdfImage(image_obj)
    with pytest.raises(UnsupportedImageTypeError):
        image._extract_direct(stream=sink)
def test_jbig2_error(resources, monkeypatch):
    """A failing jbig2dec process surfaces as ``CalledProcessError`` when decoding."""
    image_obj, _pdf = first_image_in(resources / 'jbig2global.pdf')
    image = PdfImage(image_obj)

    def report_available():
        return True

    def decoder_fails(*args, **kwargs):
        raise subprocess.CalledProcessError(1, 'jbig2dec')

    # Claim the decoder exists, but make every invocation fail.
    monkeypatch.setattr(pikepdf.jbig2, 'jbig2dec_available', report_available)
    monkeypatch.setattr(pikepdf.jbig2, 'run', decoder_fails)

    image = PdfImage(image_obj)
    with pytest.raises(subprocess.CalledProcessError):
        image.as_pil_image()
def test_jbig2_too_old(resources, monkeypatch):
    """A jbig2dec reporting version 0.12 is rejected with a 'too old' ``DependencyError``."""
    image_obj, _pdf = first_image_in(resources / 'jbig2global.pdf')
    image = PdfImage(image_obj)

    def run_version_override(subprocargs, *args, **kwargs):
        # Anything other than the version query is passed through to the real runner.
        if '--version' not in subprocargs:
            return subprocess.run(subprocargs, *args, **kwargs)
        return subprocess.CompletedProcess(
            args=subprocargs, returncode=0, stdout='jbig2dec 0.12\n'
        )

    monkeypatch.setattr(pikepdf.jbig2, 'run', run_version_override)

    image = PdfImage(image_obj)
    with pytest.raises(DependencyError, match='too old'):
        image.as_pil_image()