示例#1
0
def img_extract():
    """Interactively extract the images embedded in a PDF document.

    Prompts for a mode (1 = only the pages the user lists, 2 = every page),
    obtains the document location from ``get_path()`` and writes every image
    found to the current working directory, using the file prefix
    ``<page>_<image key without its first character>_<filename>``.

    Raises:
        ValueError: if the option or a page number is not an integer.
        Exception: if the option is neither 1 nor 2.
    """
    print(Style.BRIGHT + Fore.CYAN + '\nEnter Your Option:\n')
    print(Style.BRIGHT + Fore.YELLOW +
          '1 - Extracting the Images in Specific Pages.')
    print(Style.BRIGHT + Fore.YELLOW +
          '2 - Extracting Every Images of a Document.')
    print(Style.BRIGHT + Fore.RED + '\nYour Option >>>', end=' ')
    n = int(input())

    if n not in (1, 2):
        raise Exception(
            'Please enter only the above given numerical values...')

    input_path, filename = get_path()
    pdf = pikepdf.Pdf.open(os.path.join(input_path, filename))

    # try/finally so the document is closed even when extraction fails.
    try:
        page_count = len(pdf.pages)

        # Each target is (label, zero-based page index). The label is what is
        # shown to the user and used in the output file name.
        if n == 1:
            print(
                Style.BRIGHT + Fore.BLUE +
                '\nEnter the Page Numbers of the Images to be Extracted (Each number should be separated by Comma(,)): ',
                end='')
            targets = [(page, page - 1)
                       for page in map(int, input().split(','))]
            no_image_msg = 'There is No Image present in the Given Page:{}'
        else:
            # NOTE(review): this mode labels pages with their zero-based
            # index (in messages and file names), unlike mode 1. Kept as-is
            # so existing output file names do not change.
            targets = [(index, index) for index in range(page_count)]
            no_image_msg = 'There is No Image present in the Page:{}'

        for label, index in targets:
            # Reject out-of-range pages explicitly; a page number of 0 would
            # otherwise become index -1 and silently pick the last page.
            if not 0 <= index < page_count:
                print(Style.DIM + Fore.LIGHTRED_EX +
                      'The Given Page:{} does not exist in the Document'.
                      format(label))
                continue

            page_images = pdf.pages[index].images
            image_names = list(page_images.keys())

            if not image_names:
                print(Style.DIM + Fore.LIGHTRED_EX +
                      no_image_msg.format(label))
                continue

            for image in image_names:
                pdfimage = pikepdf.PdfImage(page_images[image])
                pdfimage.extract_to(fileprefix=os.path.join(
                    os.getcwd(),
                    str(label) + '_' + image[1:] + '_' + filename))

        # Printed exactly once, after all requested pages were processed.
        print(Style.BRIGHT + Fore.RED + '\n----------FINISHED------------')
    finally:
        pdf.close()
示例#2
0
def test_jbig2_lossy(lossy, resources, outpdf):
    """Check the JBIG2 output of ``--optimize 3`` with and without lossy mode.

    The first image on the first output page must use /JBIG2Decode. With
    ``--jbig2-lossy`` its decode parameters must contain /JBIG2Globals;
    without it the image must have no decode parameters at all.
    """
    args = [
        resources / 'ccitt.pdf',
        outpdf,
        '--image-dpi',
        '200',
        '--optimize',
        3,
        '--jpg-quality',
        '50',
        '--png-quality',
        '20',
        '--plugin',
        'tests/plugins/tesseract_noop.py',
    ]
    if lossy:
        args.append('--jbig2-lossy')

    check_ocrmypdf(*args)

    # Context manager so the output file is closed when the test finishes,
    # including when an assertion fails.
    with pikepdf.open(outpdf) as pdf:
        pim = pikepdf.PdfImage(next(iter(pdf.pages[0].images.values())))
        assert pim.filters[0] == '/JBIG2Decode'

        if lossy:
            assert '/JBIG2Globals' in pim.decode_parms[0]
        else:
            assert len(pim.decode_parms) == 0
示例#3
0
def test_flate_to_jbig2(resources, outdir):
    """Check that an 8-bit grayscale PNG input ends up as a JBIG2 image.

    The source image is converted to mode 'L' and saved as ``type8.png``;
    after running with ``--optimize 3`` the first image on the first output
    page must use /JBIG2Decode.
    """
    # This test requires an image that pngquant is capable of converting
    # to 1bpp - so use an existing 1bpp image, convert up, confirm it can
    # convert down
    with Image.open(fspath(resources / 'typewriter.png')) as im:
        assert im.mode in ('1', 'P')
        # Save the converted copy directly instead of rebinding the name
        # managed by the ``with`` statement.
        im.convert('L').save(fspath(outdir / 'type8.png'))

    check_ocrmypdf(
        outdir / 'type8.png',
        outdir / 'out.pdf',
        '--image-dpi',
        '100',
        '--png-quality',
        '50',
        '--optimize',
        '3',
        '--plugin',
        'tests/plugins/tesseract_noop.py',
    )

    # Context manager so the output file is closed when the test finishes,
    # including when the assertion fails.
    with pikepdf.open(outdir / 'out.pdf') as pdf:
        pim = pikepdf.PdfImage(next(iter(pdf.pages[0].images.values())))
        assert pim.filters[0] == '/JBIG2Decode'
示例#4
0
    def __init__(
        self,
        *,
        name='',
        pdfimage: Optional[Object] = None,
        inline: Optional[Object] = None,
        shorthand=None,
    ):
        """Record the basic properties of an inline or XObject image.

        Args:
            name: label for the image; stored as ``str(name)``.
            pdfimage: image object to wrap in ``pikepdf.PdfImage``; used only
                when ``inline`` is None.
            inline: object whose ``iimage`` attribute supplies the image;
                takes precedence over ``pdfimage``.
            shorthand: stored unchanged.

        Raises:
            ValueError: if both ``pdfimage`` and ``inline`` are None.
        """
        self._name = str(name)
        self._shorthand = shorthand

        if inline is None and pdfimage is None:
            raise ValueError("Either pdfimage or inline must be set")
        if inline is not None:
            self._origin = 'inline'
            pim = inline.iimage
        else:
            self._origin = 'xobject'
            pim = pikepdf.PdfImage(pdfimage)

        self._width = pim.width
        self._height = pim.height

        # An image with /ImageMask set is a stencil mask. (Images drawn
        # through the stencil refer to it from their /Mask, but that link is
        # not needed here.)
        self._type = 'stencil' if pim.image_mask else 'image'

        self._bpc = int(pim.bits_per_component)

        # No filters at all -> encoding unknown.
        try:
            first_filter = pim.filters[0]
        except IndexError:
            self._enc = '?'
        else:
            self._enc = FRIENDLY_ENCODING.get(first_filter, 'image')

        try:
            raw_colorspace = pim.colorspace
        except NotImplementedError:
            self._color = '?'
        else:
            self._color = FRIENDLY_COLORSPACE.get(raw_colorspace, '?')
        if self._enc == Encoding.jpeg2000:
            self._color = Colorspace.jpeg2000

        if self._color != Colorspace.icc:
            self._comp = FRIENDLY_COMP.get(self._color, '?')

            # Bit of a hack: when the component count is still unknown but
            # the encoding can only hold monochrome data, assume grayscale.
            monochrome_encodings = (Encoding.ccitt, Encoding.jbig2)
            if self._comp == '?' and self._enc in monochrome_encodings:
                self._comp = FRIENDLY_COMP[Colorspace.gray]
        else:
            # The ICC profile tells us the actual colorspace: GRAY has one
            # component, CMYK four, anything else is treated as three.
            icc_space = pim.icc.profile.xcolor_space
            if icc_space == 'GRAY':
                self._comp = 1
            elif icc_space == 'CMYK':
                self._comp = 4
            else:
                self._comp = 3
示例#5
0
 def check_pim(imobj, idx):
     """Assert that *imobj* is a DeviceN image that cannot be extracted.

     ``idx`` is the expected value of the image's ``indexed`` property.
     """
     image = pikepdf.PdfImage(imobj)
     assert image.mode == 'DeviceN'
     assert image.is_device_n
     assert not image.is_separation
     assert image.indexed == idx
     assert repr(image)

     # Extraction is expected to be refused with this specific error.
     expected_error = pikepdf.models.image.HifiPrintImageNotTranscodableError
     with pytest.raises(expected_error):
         image.extract_to(stream=BytesIO())
示例#6
0
def test_palette_nonrgb(base, hival, palette, expect_type):
    """Check that ``PdfImage.palette`` reports the expected type and data.

    Builds a 16x1, 8 bits-per-component image stream whose colorspace is
    ``[/Indexed base hival palette]`` and compares its palette against
    ``(expect_type, palette)``.
    """
    doc = pikepdf.new()
    indexed_colorspace = Array([Name.Indexed, base, hival, palette])
    sample_data = b'\x00\x01\x02\x03' * 4

    image_stream = Stream(
        doc,
        sample_data,
        BitsPerComponent=8,
        ColorSpace=indexed_colorspace,
        Width=16,
        Height=1,
        Type=Name.XObject,
        Subtype=Name.Image,
    )

    expected_palette = (expect_type, palette)
    assert pikepdf.PdfImage(image_stream).palette == expected_palette
示例#7
0
def test_palette_nonrgb(base, hival, bits, palette, expect_type, expect_mode):
    """Check palette, extraction and mode of an indexed image.

    Builds a 16x4 image stream with ``bits`` bits per component and the
    colorspace ``[/Indexed base hival palette]``; its palette must equal
    ``(expect_type, palette)``, extraction to a stream must succeed, and its
    mode must equal ``expect_mode``.
    """
    doc = pikepdf.new()
    indexed_colorspace = Array([Name.Indexed, base, hival, palette])
    sample_data = b'\x00\x01\x02\x03' * 16

    image_stream = Stream(
        doc,
        sample_data,
        BitsPerComponent=bits,
        ColorSpace=indexed_colorspace,
        Width=16,
        Height=4,
        Type=Name.XObject,
        Subtype=Name.Image,
    )
    image = pikepdf.PdfImage(image_stream)

    expected_palette = (expect_type, palette)
    assert image.palette == expected_palette

    sink = BytesIO()
    image.extract_to(stream=sink)
    # To view images:
    # image.extract_to(fileprefix=f'palette_nonrgb_{expect_type}_{bits}')
    assert image.mode == expect_mode
示例#8
0
    def __init__(self, *, name='', pdfimage=None, inline=None,
                 shorthand=None):
        """Record the basic properties of an inline or XObject image.

        Args:
            name: label for the image; stored as ``str(name)``.
            pdfimage: image object to wrap in ``pikepdf.PdfImage``; used only
                when ``inline`` is None.
            inline: object whose ``iimage`` attribute supplies the image;
                takes precedence over ``pdfimage``.
            shorthand: stored unchanged.

        Raises:
            ValueError: if both ``pdfimage`` and ``inline`` are None.
        """
        self._name = str(name)
        self._shorthand = shorthand

        if inline is not None:
            self._origin = 'inline'
            pim = inline.iimage
        elif pdfimage is not None:
            self._origin = 'xobject'
            pim = pikepdf.PdfImage(pdfimage)
        else:
            # Without this branch ``pim`` would be unbound below and the
            # caller would get an unhelpful UnboundLocalError.
            raise ValueError("Either pdfimage or inline must be set")
        self._width = pim.width
        self._height = pim.height

        # If /ImageMask is true, then this image is a stencil mask
        # (Images that draw with this stencil mask will have a reference to
        # it in their /Mask, but we don't actually need that information)
        if pim.image_mask:
            self._type = 'stencil'
        else:
            self._type = 'image'

        self._bpc = int(pim.bits_per_component)
        # No filters at all -> encoding unknown.
        try:
            self._enc = FRIENDLY_ENCODING.get(pim.filters[0], 'image')
        except IndexError:
            self._enc = '?'

        try:
            self._color = FRIENDLY_COLORSPACE.get(pim.colorspace, '?')
        except NotImplementedError:
            self._color = '?'
        if self._enc == Encoding.jpeg2000:
            self._color = Colorspace.jpeg2000

        self._comp = FRIENDLY_COMP.get(self._color, '?')

        # Bit of a hack... infer grayscale if component count is uncertain
        # but encoding must be monochrome. This happens if a monochrome image
        # has an ICC profile attached. Better solution would be to examine
        # the ICC profile.
        if self._comp == '?' and self._enc in (Encoding.ccitt, 'jbig2'):
            self._comp = FRIENDLY_COMP[Colorspace.gray]
示例#9
0
def extract_image_filter(pike, root, log, image, xref):
    """Decide whether *image* should be processed further.

    Returns:
        ``(pim, filtdp)`` - the ``pikepdf.PdfImage`` wrapper and the first
        entry of its ``filter_decodeparms`` - or None when the object is not
        an image, has a Length below 100, has more than one filter, has more
        than 8 bits per component, or uses /JPXDecode.
    """
    if image.Subtype != Name.Image:
        return None
    if image.Length < 100:
        log.debug("Skipping small image, xref %s", xref)
        return None

    pdf_image = pikepdf.PdfImage(image)

    if len(pdf_image.filter_decodeparms) > 1:
        log.debug("Skipping multiply filtered, xref %s", xref)
        return None
    first_stage = pdf_image.filter_decodeparms[0]

    # Leave wide gamut images (more than 8 bits per component) and JPEG2000
    # images alone.
    if pdf_image.bits_per_component > 8 or first_stage[0] == Name.JPXDecode:
        return None

    return pdf_image, first_stage
示例#10
0
def test_dict_or_array_dict():
    """Check that /DecodeParms given as an array of one dict is unpacked."""
    doc = pikepdf.new()

    ccitt_parms = Dictionary(
        BlackIs1=False,
        Columns=16,
        K=-1,
    )
    image_stream = Stream(
        doc,
        b'dummy',
        BitsPerComponent=1,
        ColorSpace=Name.DeviceGray,
        DecodeParms=Array([ccitt_parms]),
        Filter=Array([Name.CCITTFaxDecode]),
        Height=16,
        Width=16,
        Type=Name.XObject,
        Subtype=Name.Image,
    )

    first_parms = pikepdf.PdfImage(image_stream).decode_parms[0]
    # The dictionary wrapped in the array must be reachable directly.
    assert first_parms.K == -1
示例#11
0
def decodeImage(page, prefix):
    """Decode the images of *page* and write them next to *prefix*.

    Every entry of ``page.images`` is numbered from 1. Entries whose /Filter
    matches one of the known filter names are decoded from their raw stream
    buffer with ``imdecode`` and written with ``imwrite`` to
    ``<prefix>_<NNN>.<suffix>``. Entries with another filter, or with no
    /Filter at all, are skipped. A failure on one image is logged and does
    not stop the remaining images.
    """
    # NOTE(review): '/FlatDecode' is not a standard PDF filter name (the
    # standard spelling is '/FlateDecode'), so this entry presumably never
    # matches. Kept unchanged because fixing the spelling would change which
    # images are attempted - confirm the intent before changing it.
    suffixes = {
        '/JPXDecode': 'jp2',
        '/DCTDecode': 'jpg',
        '/FlatDecode': 'png',
    }

    for count, (name, img) in enumerate(page.images.items(), start=1):
        # .get() so an image without /Filter is skipped instead of raising
        # KeyError and aborting the whole page.
        filter_name = img.get('/Filter')
        # Match by equality (as the original membership test did) rather
        # than by hashing the filter object.
        suffix = next(
            (ext for known, ext in suffixes.items() if filter_name == known),
            None)
        if suffix is None:
            continue

        try:
            decoded = imdecode(
                np.asarray(bytearray(img.get_raw_stream_buffer())),
                IMREAD_COLOR)
            imwrite(prefix + '_%03d.%s' % (count, suffix), decoded)
        except Exception as ex:
            # Best effort: report the failure and continue with the next
            # image.
            log(ERROR,
                'Cannot extract %s:%s due to %s' % (prefix, name, str(ex)))
示例#12
0
def _extract_image_to_file(pim, root, xref):
    """Write *pim* to ``root/<xref as 8 digits>`` and add its extension.

    Returns the extension reported by ``pim.extract_to``, or None when
    pikepdf raises ``UnsupportedImageTypeError``.
    """
    imgname = Path(root / '{:08d}'.format(xref))
    try:
        with imgname.open('wb') as f:
            ext = pim.extract_to(stream=f)
        imgname.rename(imgname.with_suffix(ext))
    except pikepdf.UnsupportedImageTypeError:
        return None
    return ext


def extract_image(*, pike, root, log, image, xref, jbig2s,
                  pngs, jpegs, options):
    """Extract *image* into *root* and register it in the matching list.

    Depending on the image, its xref is appended to ``jbig2s`` (as an
    ``(xref, ext)`` tuple), ``jpegs`` or ``pngs``.

    Returns:
        True if the image was extracted and registered. False if it was
        skipped: not an image, Length below 100, more than one filter, more
        than 8 bits per component, JPEG2000, unsupported by pikepdf, or not
        matching any of the handled cases.
    """
    if image.Subtype != '/Image':
        return False
    if image.Length < 100:
        log.debug("Skipping small image, xref %s", xref)
        return False

    pim = pikepdf.PdfImage(image)

    if len(pim.filter_decodeparms) > 1:
        log.debug("Skipping multiply filtered, xref %s", xref)
        return False
    filtdp = pim.filter_decodeparms[0]

    if pim.bits_per_component > 8:
        return False  # Don't mess with wide gamut images

    if filtdp[0] == '/JPXDecode':
        return False  # Don't do JPEG2000

    # filtdp is a (filter, decodeparms) pair, so the filter name is
    # filtdp[0]; comparing the whole pair with a string was always unequal
    # and let images that are already JBIG2 into this branch.
    if pim.bits_per_component == 1 \
            and filtdp[0] != '/JBIG2Decode' \
            and jbig2enc.available():
        ext = _extract_image_to_file(pim, root, xref)
        if ext is None:
            return False
        jbig2s.append((xref, ext))
    elif filtdp[0] == '/DCTDecode' \
            and options.optimize >= 2:
        # A JPEG quality heuristic and ICC profile handling were considered
        # here but are not applied: every JPEG is extracted as-is.
        if _extract_image_to_file(pim, root, xref) is None:
            return False
        jpegs.append(xref)
    elif pim.indexed \
            and pim.colorspace in pim.SIMPLE_COLORSPACES \
            and options.optimize >= 3:
        # Try to improve on indexed images - these are far from low hanging
        # fruit in most cases
        pim.as_pil_image().save(png_name(root, xref))
        pngs.append(xref)
    elif not pim.indexed and pim.colorspace in pim.SIMPLE_COLORSPACES:
        # An optimization opportunity here, not currently taken, is directly
        # generating a PNG from compressed data
        pim.as_pil_image().save(png_name(root, xref))
        pngs.append(xref)
    else:
        return False

    return True
示例#13
0
def test_display_image(pal):
    """Check that ``_repr_png_`` of the /Im0 image yields PNG data."""
    xobjects = pal.pages[0].Resources.XObject
    png_bytes = pikepdf.PdfImage(xobjects['/Im0'])._repr_png_()
    # Bytes 1-3 of the output must spell "PNG".
    assert png_bytes[1:4] == b'PNG'