def img_extract():
    """Interactively extract the images embedded in a PDF document.

    Prompts for a mode (1 = only the pages the user lists, 2 = every page),
    obtains the document location from ``get_path()`` and writes each image
    found into the current working directory.  The output file prefix is
    ``<page>_<image name without its first character>_<filename>``.

    Raises:
        ValueError: if the option or a page number is not an integer.
        Exception: if the option is an integer other than 1 or 2.
    """
    print(Style.BRIGHT + Fore.CYAN + '\nEnter Your Option:\n')
    print(Style.BRIGHT + Fore.YELLOW +
          '1 - Extracting the Images in Specific Pages.')
    print(Style.BRIGHT + Fore.YELLOW +
          '2 - Extracting Every Images of a Document.')
    print(Style.BRIGHT + Fore.RED + '\nYour Option >>>', end=' ')
    n = int(input())
    if n not in (1, 2):
        raise Exception(
            'Please enter only the above given numerical values...')

    input_path, filename = get_path()

    # The context manager closes the document even when extraction fails
    # part-way through (bad page number, unsupported image, ...).
    with pikepdf.Pdf.open(os.path.join(input_path, filename)) as pdf:
        if n == 1:
            print(
                Style.BRIGHT + Fore.BLUE +
                '\nEnter the Page Numbers of the Images to be Extracted '
                '(Each number should be separated by Comma(,)): ',
                end='')
            # The user types 1-based page numbers; pdf.pages is 0-based.
            targets = [(int(text), int(text) - 1)
                       for text in input().split(',')]
            empty_message = 'There is No Image present in the Given Page:{}'
        else:
            # NOTE(review): in this mode the label used in messages and file
            # names is the 0-based index, unlike mode 1.  Kept as is so that
            # existing output file names do not change.
            targets = [(index, index) for index in range(len(pdf.pages))]
            empty_message = 'There is No Image present in the Page:{}'

        for label, index in targets:
            page = pdf.pages[index]
            image_names = list(page.images.keys())
            if not image_names:
                print(Style.DIM + Fore.LIGHTRED_EX +
                      empty_message.format(label))
                continue
            for image_name in image_names:
                pdfimage = pikepdf.PdfImage(page.images[image_name])
                # image_name[1:] drops the first character of the key
                # (presumably the leading '/' of a PDF name).
                pdfimage.extract_to(fileprefix=os.path.join(
                    os.getcwd(),
                    str(label) + '_' + image_name[1:] + '_' + filename))

        print(Style.BRIGHT + Fore.RED + '\n----------FINISHED------------')
def test_jbig2_lossy(lossy, resources, outpdf):
    """Optimizing ccitt.pdf at level 3 must yield a JBIG2 image.

    With ``lossy`` set, ``--jbig2-lossy`` is passed and the image's first
    decode-parameter entry must contain ``/JBIG2Globals``; otherwise the
    image must carry no decode parameters at all.
    """
    args = [
        resources / 'ccitt.pdf',
        outpdf,
        '--image-dpi',
        '200',
        '--optimize',
        3,
        '--jpg-quality',
        '50',
        '--png-quality',
        '20',
        '--plugin',
        'tests/plugins/tesseract_noop.py',
    ]
    if lossy:
        args.append('--jbig2-lossy')

    check_ocrmypdf(*args)

    # Open in a with-block so the output file is released even when an
    # assertion below fails.
    with pikepdf.open(outpdf) as pdf:
        pim = pikepdf.PdfImage(next(iter(pdf.pages[0].images.values())))
        assert pim.filters[0] == '/JBIG2Decode'

        if lossy:
            assert '/JBIG2Globals' in pim.decode_parms[0]
        else:
            assert len(pim.decode_parms) == 0
def test_flate_to_jbig2(resources, outdir):
    """An 8-bit grayscale PNG of 1-bit content must be optimized to JBIG2."""
    # This test requires an image that pngquant is capable of converting
    # to 1bpp - so use an existing 1bpp image, convert up, confirm it can
    # convert down
    with Image.open(fspath(resources / 'typewriter.png')) as im:
        assert im.mode in ('1', 'P')
        # Use a separate name so the with-target keeps pointing at the
        # image that the context manager has to close.
        gray = im.convert('L')
        gray.save(fspath(outdir / 'type8.png'))

    check_ocrmypdf(
        outdir / 'type8.png',
        outdir / 'out.pdf',
        '--image-dpi',
        '100',
        '--png-quality',
        '50',
        '--optimize',
        '3',
        '--plugin',
        'tests/plugins/tesseract_noop.py',
    )

    # Open in a with-block so the output file is released even when the
    # assertion fails.
    with pikepdf.open(outdir / 'out.pdf') as pdf:
        pim = pikepdf.PdfImage(next(iter(pdf.pages[0].images.values())))
        assert pim.filters[0] == '/JBIG2Decode'
def __init__(
    self,
    *,
    name='',
    pdfimage: Optional[Object] = None,
    inline: Optional[Object] = None,
    shorthand=None,
):
    """Record the basic properties of one image found in a PDF.

    Exactly one source is used: ``inline`` takes precedence (its
    ``iimage`` attribute is read), otherwise ``pdfimage`` is wrapped in
    ``pikepdf.PdfImage``.

    Raises:
        ValueError: if neither ``pdfimage`` nor ``inline`` is given.
    """
    self._name = str(name)
    self._shorthand = shorthand

    if inline is not None:
        self._origin = 'inline'
        pim = inline.iimage
    elif pdfimage is not None:
        self._origin = 'xobject'
        pim = pikepdf.PdfImage(pdfimage)
    else:
        raise ValueError("Either pdfimage or inline must be set")

    self._width = pim.width
    self._height = pim.height

    # An image whose /ImageMask is true is a stencil mask.  Images drawn
    # through such a mask refer to it from their /Mask entry, but that
    # link is not needed here.
    self._type = 'stencil' if pim.image_mask else 'image'

    self._bpc = int(pim.bits_per_component)

    try:
        self._enc = FRIENDLY_ENCODING.get(pim.filters[0], 'image')
    except IndexError:
        # No filter listed at all.
        self._enc = '?'

    try:
        self._color = FRIENDLY_COLORSPACE.get(pim.colorspace, '?')
    except NotImplementedError:
        self._color = '?'
    if self._enc == Encoding.jpeg2000:
        self._color = Colorspace.jpeg2000

    if self._color != Colorspace.icc:
        self._comp = FRIENDLY_COMP.get(self._color, '?')
    else:
        # The component count comes from the colour space declared by the
        # ICC profile; anything other than GRAY or CMYK counts as three.
        icc_space = pim.icc.profile.xcolor_space
        self._comp = {'GRAY': 1, 'CMYK': 4}.get(icc_space, 3)

    # Bit of a hack: when the component count is still unknown but the
    # encoding can only hold monochrome data, assume grayscale.
    monochrome_only = (Encoding.ccitt, Encoding.jbig2)
    if self._comp == '?' and self._enc in monochrome_only:
        self._comp = FRIENDLY_COMP[Colorspace.gray]
def check_pim(imobj, idx):
    """Assert that *imobj* is a DeviceN image that cannot be extracted.

    ``idx`` is the expected value of the image's ``indexed`` property.
    """
    image = pikepdf.PdfImage(imobj)

    assert image.mode == 'DeviceN'
    assert image.is_device_n
    assert not image.is_separation
    assert image.indexed == idx
    assert repr(image)

    expected_error = pikepdf.models.image.HifiPrintImageNotTranscodableError
    with pytest.raises(expected_error):
        image.extract_to(stream=BytesIO())
def test_palette_nonrgb(base, hival, palette, expect_type):
    """PdfImage.palette must report (expect_type, palette) for an Indexed image."""
    pdf = pikepdf.new()
    indexed_space = Array([Name.Indexed, base, hival, palette])
    pixel_data = b'\x00\x01\x02\x03' * 4

    # A 16x1, 8-bit image XObject using the Indexed colorspace above.
    image_stream = Stream(
        pdf,
        pixel_data,
        BitsPerComponent=8,
        ColorSpace=indexed_space,
        Width=16,
        Height=1,
        Type=Name.XObject,
        Subtype=Name.Image,
    )

    wrapped = pikepdf.PdfImage(image_stream)
    assert wrapped.palette == (expect_type, palette)
def test_palette_nonrgb(base, hival, bits, palette, expect_type, expect_mode):
    """Check palette, extraction and mode of an Indexed image.

    Builds a 16x4 image with ``bits`` bits per component, then checks the
    reported palette, that extraction to a stream does not raise, and that
    ``mode`` equals ``expect_mode``.
    """
    pdf = pikepdf.new()
    indexed_space = Array([Name.Indexed, base, hival, palette])
    pixel_data = b'\x00\x01\x02\x03' * 16

    image_stream = Stream(
        pdf,
        pixel_data,
        BitsPerComponent=bits,
        ColorSpace=indexed_space,
        Width=16,
        Height=4,
        Type=Name.XObject,
        Subtype=Name.Image,
    )

    wrapped = pikepdf.PdfImage(image_stream)
    assert wrapped.palette == (expect_type, palette)

    wrapped.extract_to(stream=BytesIO())
    # To view images:
    # wrapped.extract_to(fileprefix=f'palette_nonrgb_{expect_type}_{bits}')

    assert wrapped.mode == expect_mode
def __init__(self, *, name='', pdfimage=None, inline=None, shorthand=None):
    """Record the basic properties of one image found in a PDF.

    Exactly one source is used: ``inline`` takes precedence (its
    ``iimage`` attribute is read), otherwise ``pdfimage`` is wrapped in
    ``pikepdf.PdfImage``.

    Raises:
        ValueError: if neither ``pdfimage`` nor ``inline`` is given.
    """
    self._name = str(name)
    self._shorthand = shorthand

    if inline is not None:
        self._origin = 'inline'
        pim = inline.iimage
    elif pdfimage is not None:
        self._origin = 'xobject'
        pim = pikepdf.PdfImage(pdfimage)
    else:
        # Without this branch ``pim`` stays unbound and the next statement
        # fails with an UnboundLocalError that hides the real mistake.
        raise ValueError("Either pdfimage or inline must be set")

    self._width = pim.width
    self._height = pim.height

    # If /ImageMask is true, then this image is a stencil mask
    # (Images that draw with this stencil mask will have a reference to
    # it in their /Mask, but we don't actually need that information)
    if pim.image_mask:
        self._type = 'stencil'
    else:
        self._type = 'image'

    self._bpc = int(pim.bits_per_component)
    try:
        self._enc = FRIENDLY_ENCODING.get(pim.filters[0], 'image')
    except IndexError:
        # No filter listed at all.
        self._enc = '?'

    try:
        self._color = FRIENDLY_COLORSPACE.get(pim.colorspace, '?')
    except NotImplementedError:
        self._color = '?'
    if self._enc == Encoding.jpeg2000:
        self._color = Colorspace.jpeg2000

    self._comp = FRIENDLY_COMP.get(self._color, '?')

    # Bit of a hack... infer grayscale if component count is uncertain
    # but encoding must be monochrome. This happens if a monochrome image
    # has an ICC profile attached. Better solution would be to examine
    # the ICC profile.
    # NOTE(review): 'jbig2' is a plain string while the other member is an
    # Encoding value; if FRIENDLY_ENCODING only yields Encoding members the
    # string can never match - confirm and replace with the enum member.
    if self._comp == '?' and self._enc in (Encoding.ccitt, 'jbig2'):
        self._comp = FRIENDLY_COMP[Colorspace.gray]
def extract_image_filter(pike, root, log, image, xref):
    """Decide whether *image* is a candidate for further processing.

    Returns ``None`` when the object is not an image, is shorter than 100
    bytes, has more than one filter, has more than 8 bits per component or
    is JPEG 2000 encoded.  Otherwise returns ``(pim, filtdp)`` where
    ``pim`` is the ``pikepdf.PdfImage`` wrapper and ``filtdp`` is the first
    entry of its ``filter_decodeparms``.

    ``pike`` and ``root`` are accepted but not used here.
    """
    if image.Subtype != Name.Image:
        return None

    if image.Length < 100:
        log.debug("Skipping small image, xref %s", xref)
        return None

    pim = pikepdf.PdfImage(image)
    filter_chain = pim.filter_decodeparms

    if len(filter_chain) > 1:
        log.debug("Skipping multiply filtered, xref %s", xref)
        return None
    filtdp = filter_chain[0]

    # Leave wide gamut (> 8 bits per component) and JPEG 2000 images alone.
    if pim.bits_per_component > 8 or filtdp[0] == Name.JPXDecode:
        return None

    return (pim, filtdp)
def test_dict_or_array_dict():
    """/DecodeParms given as an array holding one dictionary must be unpacked."""
    pdf = pikepdf.new()
    fax_parms = Dictionary(
        BlackIs1=False,
        Columns=16,
        K=-1,
    )

    # Both /DecodeParms and /Filter are written in their array form.
    image_stream = Stream(
        pdf,
        b'dummy',
        BitsPerComponent=1,
        ColorSpace=Name.DeviceGray,
        DecodeParms=Array([fax_parms]),
        Filter=Array([Name.CCITTFaxDecode]),
        Height=16,
        Width=16,
        Type=Name.XObject,
        Subtype=Name.Image,
    )

    wrapped = pikepdf.PdfImage(image_stream)
    first_parms = wrapped.decode_parms[0]
    assert first_parms.K == -1  # Check that array of dict is unpacked properly
def decodeImage(page, prefix):
    """Decode the images of *page* and write them next to *prefix*.

    Every entry of ``page.images`` whose ``/Filter`` is one of the handled
    names is decoded with ``imdecode`` from its raw stream bytes and
    written with ``imwrite`` to ``<prefix>_<NNN>.<ext>``.  ``NNN`` is the
    1-based position of the image in ``page.images`` (skipped images still
    consume a number).  Failures for a single image are logged and do not
    stop the remaining images.
    """
    # NOTE(review): '/FlatDecode' is not a standard PDF filter name (the
    # standard one is '/FlateDecode'), so this entry presumably never
    # matches.  Left unchanged: raw Flate data is not a PNG file, so simply
    # correcting the name would likely just produce decode errors.
    suffixes = {
        '/JPXDecode': 'jp2',
        '/DCTDecode': 'jpg',
        '/FlatDecode': 'png',
    }
    handled_filters = ('/DCTDecode', '/JPXDecode', '/FlatDecode')

    for count, (name, img) in enumerate(page.images.items(), start=1):
        # .get() so that an image without a /Filter entry is skipped
        # instead of raising KeyError and aborting the whole page.
        filter_name = img.get('/Filter')
        if filter_name not in handled_filters:
            continue
        try:
            # The wrapper itself is unused; the call is kept because
            # construction may raise for objects that are not usable
            # images - TODO confirm that this validation is intended.
            pikepdf.PdfImage(img)
            decoded = imdecode(
                np.asarray(bytearray(img.get_raw_stream_buffer())),
                IMREAD_COLOR)
            imwrite(
                prefix + '_%03d.%s' % (count, suffixes[filter_name]),
                decoded)
        except Exception as ex:
            # Best effort: report the failure and carry on.
            log(ERROR, 'Cannot extract %s:%s due to %s' %
                (prefix, name, str(ex)))
def _extract_image_to_file(pim, root, xref):
    """Write *pim* to ``root/<xref as 8 digits><ext>`` and return ``ext``."""
    imgname = Path(root / '{:08d}'.format(xref))
    with imgname.open('wb') as f:
        ext = pim.extract_to(stream=f)
    # The extension is only known after extraction, so rename afterwards.
    imgname.rename(imgname.with_suffix(ext))
    return ext


def extract_image(*, pike, root, log, image, xref, jbig2s, pngs, jpegs, options):
    """Extract one image to *root* and register it for recompression.

    Depending on the image, its xref is appended to ``jbig2s`` (as an
    ``(xref, ext)`` pair), ``jpegs`` or ``pngs``.

    Returns:
        True if the image was extracted and registered; False if it was
        skipped (not an image, shorter than 100 bytes, more than one
        filter, more than 8 bits per component, JPEG 2000, unsupported by
        ``extract_to``, or matching none of the handled cases).

    ``pike`` is accepted but not used here.
    """
    if image.Subtype != '/Image':
        return False
    if image.Length < 100:
        log.debug("Skipping small image, xref {}".format(xref))
        return False

    pim = pikepdf.PdfImage(image)

    if len(pim.filter_decodeparms) > 1:
        log.debug("Skipping multiply filtered, xref {}".format(xref))
        return False
    filtdp = pim.filter_decodeparms[0]

    if pim.bits_per_component > 8:
        return False  # Don't mess with wide gamut images

    if filtdp[0] == '/JPXDecode':
        return False  # Don't do JPEG2000

    # filtdp is a (filter, decodeparms) pair, so the filter name is
    # filtdp[0].  Comparing the whole pair with a string is always unequal
    # and used to send images that are already JBIG2 into this branch.
    # NOTE(review): already-JBIG2 1-bit images now fall through to the
    # branches below instead - confirm that is the intended handling.
    if pim.bits_per_component == 1 \
            and filtdp[0] != '/JBIG2Decode' \
            and jbig2enc.available():
        try:
            ext = _extract_image_to_file(pim, root, xref)
        except pikepdf.UnsupportedImageTypeError:
            return False
        jbig2s.append((xref, ext))
    elif filtdp[0] == '/DCTDecode' \
            and options.optimize >= 2:
        # This is a simple heuristic derived from some training data, that has
        # about a 70% chance of guessing whether the JPEG is high quality,
        # and possibly recompressible, or not. The number itself doesn't mean
        # anything.
        # bytes_per_pixel = int(raw_jpeg.Length) / (w * h)
        # jpeg_quality_estimate = 117.0 * (bytes_per_pixel ** 0.213)
        # if jpeg_quality_estimate < 65:
        #     return False

        # We could get the ICC profile here, but there's no need to look at it
        # for quality transcoding
        # if icc:
        #     stream = BytesIO(raw_jpeg.read_raw_bytes())
        #     iccbytes = icc.read_bytes()
        #     with Image.open(stream) as im:
        #         im.save(jpg_name(root, xref), icc_profile=iccbytes)
        try:
            _extract_image_to_file(pim, root, xref)
        except pikepdf.UnsupportedImageTypeError:
            return False
        jpegs.append(xref)
    elif pim.indexed \
            and pim.colorspace in pim.SIMPLE_COLORSPACES \
            and options.optimize >= 3:
        # Try to improve on indexed images - these are far from low hanging
        # fruit in most cases
        pim.as_pil_image().save(png_name(root, xref))
        pngs.append(xref)
    elif not pim.indexed and pim.colorspace in pim.SIMPLE_COLORSPACES:
        # An optimization opportunity here, not currently taken, is directly
        # generating a PNG from compressed data
        pim.as_pil_image().save(png_name(root, xref))
        pngs.append(xref)
    else:
        return False

    return True
def test_display_image(pal):
    """The PNG repr of image /Im0 on the first page must carry the PNG signature."""
    xobjects = pal.pages[0].Resources.XObject
    wrapped = pikepdf.PdfImage(xobjects['/Im0'])

    png_bytes = wrapped._repr_png_()

    # Bytes 1..3 of a PNG file spell "PNG".
    assert png_bytes[1:4] == b'PNG'