Пример #1
2
def select_image_layer(infiles, output_file, log, context):
    """Produce the visible image layer for one output page.

    With lossless reconstruction enabled, the orientation-corrected page PDF
    is handed to ``re_symlink`` so that ``output_file`` refers to it.
    Otherwise the page image is wrapped in a new PDF written to
    ``output_file``.
    """
    options = context.get_options()
    page_pdf = next(
        path for path in infiles if path.endswith('.ocr.oriented.pdf')
    )
    image = next(path for path in infiles if path.endswith('.image'))

    if options.lossless_reconstruction:
        pageno = page_number(page_pdf)
        log.debug(f"{pageno:4d}: page eligible for lossless reconstruction")
        # The linked file still points to the multipage document.
        re_symlink(page_pdf, output_file, log)
    else:
        pageinfo = get_pageinfo(image, context)

        # Pages are rasterized at a square DPI because most image processing
        # tools cannot handle rectangular DPI, and the square value describes
        # the image accurately. Resampling back to a non-square DPI here would
        # resemble the input more closely, but the hocr renderer does not
        # understand non-square DPI (the sandwich renderer would be fine).
        square_dpi = get_page_square_dpi(pageinfo, options)
        fixed_layout = img2pdf.get_fixed_dpi_layout_fun(
            (square_dpi, square_dpi)
        )

        # This creates a single-page PDF.
        with open(image, 'rb') as imfile:
            with open(output_file, 'wb') as pdf:
                log.debug(f'{page_number(page_pdf):4d}: convert')
                img2pdf.convert(
                    imfile,
                    with_pdfrw=False,
                    layout_fun=fixed_layout,
                    outputstream=pdf,
                )
                log.debug(f'{page_number(page_pdf):4d}: convert done')
Пример #2
1
def triage_image_file(input_file, output_file, log, options):
    """Check that ``input_file`` is a usable image and convert it to a PDF.

    The file is opened with Pillow and validated: it must carry a credible
    resolution (unless ``options.image_dpi`` is set), and a CMYK image must
    carry an embedded ICC profile. If validation passes, img2pdf converts
    the file and the result is written to ``output_file``.

    Args:
        input_file: Path of the candidate image file.
        output_file: Path the generated PDF is written to.
        log: Logger used for progress and error messages.
        options: Options object; only ``image_dpi`` is read here.

    Raises:
        UnsupportedImageFormatError: The file cannot be opened as an image,
            is a CMYK image without an ICC profile, or img2pdf cannot open
            it.
        DpiError: The resolution is missing or not credible and
            ``options.image_dpi`` was not given.
    """
    log.info("Input file is not a PDF, checking if it is an image...")
    try:
        im = Image.open(input_file)
    except EnvironmentError as e:
        msg = str(e)

        # Recover the original filename: a symlink is reported by its real
        # path, a regular file as '<stdin>'.
        # NOTE(review): presumably a regular file here is a temporary copy
        # of standard input -- confirm against the caller.
        realpath = ''
        if os.path.islink(input_file):
            realpath = os.path.realpath(input_file)
        elif os.path.isfile(input_file):
            realpath = '<stdin>'
        msg = msg.replace(input_file, realpath)
        log.error(msg)
        raise UnsupportedImageFormatError() from e

    # Use the image as a context manager so it is closed on every exit path,
    # including the validation failures that raise below.
    with im:
        log.info("Input file is an image")

        if 'dpi' in im.info:
            # NOTE(review): tuple comparison is lexicographic, so the second
            # component only matters when the first equals 96 -- confirm
            # this is the intended credibility test.
            if im.info['dpi'] <= (96, 96) and not options.image_dpi:
                log.info("Image size: (%d, %d)" % im.size)
                log.info("Image resolution: (%d, %d)" % im.info['dpi'])
                log.error(
                    "Input file is an image, but the resolution (DPI) is "
                    "not credible.  Estimate the resolution at which the "
                    "image was scanned and specify it using --image-dpi.")
                raise DpiError()
        elif not options.image_dpi:
            log.info("Image size: (%d, %d)" % im.size)
            log.error(
                "Input file is an image, but has no resolution (DPI) "
                "in its metadata.  Estimate the resolution at which "
                "image was scanned and specify it using --image-dpi.")
            raise DpiError()

        # Pillow exposes an embedded profile under the 'icc_profile' key.
        if 'icc_profile' not in im.info:
            if im.mode == 'RGB':
                log.info('Input image has no ICC profile, assuming sRGB')
            elif im.mode == 'CMYK':
                log.error('Input CMYK image has no ICC profile, not usable')
                raise UnsupportedImageFormatError()

    try:
        log.info("Image seems valid. Try converting to PDF...")
        layout_fun = img2pdf.default_layout_fun
        if options.image_dpi:
            # A user-supplied DPI overrides whatever the image metadata says.
            layout_fun = img2pdf.get_fixed_dpi_layout_fun(
                (options.image_dpi, options.image_dpi))
        with open(output_file, 'wb') as outf:
            img2pdf.convert(
                input_file,
                layout_fun=layout_fun,
                with_pdfrw=False,
                outputstream=outf)
        log.info("Successfully converted to PDF, processing...")
    except img2pdf.ImageOpenError as e:
        log.error(e)
        raise UnsupportedImageFormatError() from e
Пример #3
0
def select_image_layer(infiles, output_file, log, context):
    """Choose the image layer that becomes the visible part of a page.

    If lossless reconstruction is enabled, the orientation-corrected input
    page is handed to ``re_symlink`` for ``output_file``. Otherwise the page
    image is converted to a PDF written to ``output_file``.
    """
    options = context.get_options()
    page_pdf = next(
        name for name in infiles if name.endswith('.ocr.oriented.pdf')
    )
    image = next(name for name in infiles if name.endswith('.image'))

    if options.lossless_reconstruction:
        log.debug(
            f"{page_number(page_pdf):4d}: "
            "page eligible for lossless reconstruction"
        )
        re_symlink(page_pdf, output_file, log)
        return

    pageinfo = get_pageinfo(image, context)

    # Each page is rasterized at a square DPI since most image processing
    # tools do not support rectangular DPI, and the square DPI describes the
    # image accurately. The image could be resampled back to non-square DPI
    # here to resemble the input more closely, but the hocr renderer does not
    # understand non-square DPI. The tess4 renderer would be fine.
    square_dpi = get_page_square_dpi(pageinfo, options)
    fixed_layout = img2pdf.get_fixed_dpi_layout_fun((square_dpi, square_dpi))

    with open(image, 'rb') as imfile, open(output_file, 'wb') as pdf:
        log.debug(f'{page_number(page_pdf):4d}: convert')
        img2pdf.convert(
            imfile,
            with_pdfrw=False,
            layout_fun=fixed_layout,
            outputstream=pdf,
        )
        log.debug(f'{page_number(page_pdf):4d}: convert done')
Пример #4
0
def select_image_layer(
        infiles,
        output_file,
        log,
        pdfinfo,
        pdfinfo_lock):
    """Produce the image layer for one output page.

    When ``lossless_reconstruction`` (a name resolved outside this function)
    is truthy, the oriented page PDF is handed to ``re_symlink``. Otherwise
    the page image is converted to a PDF at the page's own DPI and written
    to ``output_file``.
    """
    page_pdf = next(
        path for path in infiles if path.endswith('.ocr.oriented.pdf')
    )
    image = next(path for path in infiles if path.endswith('.image'))

    if lossless_reconstruction:
        pageno = page_number(page_pdf)
        log.debug(
            "{:4d}: page eligible for lossless reconstruction".format(pageno)
        )
        re_symlink(page_pdf, output_file)
        return

    pageinfo = get_pageinfo(image, pdfinfo, pdfinfo_lock)
    raw_dpi = get_page_dpi(pageinfo)
    # img2pdf is given plain floats for the horizontal and vertical DPI.
    xres, yres = float(raw_dpi[0]), float(raw_dpi[1])
    fixed_layout = img2pdf.get_fixed_dpi_layout_fun((xres, yres))

    with open(image, 'rb') as imfile:
        with open(output_file, 'wb') as pdf:
            # The whole image is read into memory and passed as bytes.
            img2pdf.convert(
                imfile.read(),
                with_pdfrw=False,
                layout_fun=fixed_layout,
                outputstream=pdf,
            )
Пример #5
0
def triage_image_file(input_file, output_file, log):
    """Check that ``input_file`` is a usable image and convert it to a PDF.

    The file is opened with Pillow and validated: it must carry a credible
    resolution (unless ``options.image_dpi`` is set), and a CMYK image must
    carry an embedded ICC profile. If validation passes, img2pdf converts
    the file and the result is written to ``output_file``.

    ``options`` is not a parameter; it is resolved from the enclosing scope.

    Args:
        input_file: Path of the candidate image file.
        output_file: Path the generated PDF is written to.
        log: Logger used for progress and error messages.

    Raises:
        SystemExit: With ``ExitCode.input_file`` whenever the file cannot be
            opened, fails validation, or cannot be converted by img2pdf.
    """
    log.info("Input file is not a PDF, checking if it is an image...")
    try:
        im = Image.open(input_file)
    except EnvironmentError as e:
        msg = str(e)

        # Recover the original filename: a symlink is reported by its real
        # path, a regular file as '<stdin>'.
        # NOTE(review): presumably a regular file here is a temporary copy
        # of standard input -- confirm against the caller.
        realpath = ''
        if os.path.islink(input_file):
            realpath = os.path.realpath(input_file)
        elif os.path.isfile(input_file):
            realpath = '<stdin>'
        msg = msg.replace(input_file, realpath)
        log.error(msg)
        sys.exit(ExitCode.input_file)

    # The context manager closes the image on every exit path, including
    # the SystemExit raised by the validation failures below.
    with im:
        log.info("Input file is an image")

        if 'dpi' in im.info:
            # NOTE(review): tuple comparison is lexicographic, so the second
            # component only matters when the first equals 96 -- confirm
            # this is the intended credibility test.
            if im.info['dpi'] <= (96, 96) and not options.image_dpi:
                log.info("Image size: (%d, %d)" % im.size)
                log.info("Image resolution: (%d, %d)" % im.info['dpi'])
                log.error(
                    "Input file is an image, but the resolution (DPI) is "
                    "not credible.  Estimate the resolution at which the "
                    "image was scanned and specify it using --image-dpi.")
                sys.exit(ExitCode.input_file)
        elif not options.image_dpi:
            log.info("Image size: (%d, %d)" % im.size)
            log.error("Input file is an image, but has no resolution (DPI) "
                      "in its metadata.  Estimate the resolution at which "
                      "image was scanned and specify it using --image-dpi.")
            sys.exit(ExitCode.input_file)

        # Pillow exposes an embedded profile under the 'icc_profile' key.
        if 'icc_profile' not in im.info:
            if im.mode == 'RGB':
                log.info('Input image has no ICC profile, assuming sRGB')
            elif im.mode == 'CMYK':
                log.error('Input CMYK image has no ICC profile, not usable')
                sys.exit(ExitCode.input_file)

    try:
        log.info("Image seems valid. Try converting to PDF...")
        layout_fun = img2pdf.default_layout_fun
        if options.image_dpi:
            # A user-supplied DPI overrides whatever the image metadata says.
            layout_fun = img2pdf.get_fixed_dpi_layout_fun(
                (options.image_dpi, options.image_dpi))
        with open(output_file, 'wb') as outf:
            img2pdf.convert(input_file,
                            layout_fun=layout_fun,
                            with_pdfrw=False,
                            outputstream=outf)
        log.info("Successfully converted to PDF, processing...")
    except img2pdf.ImageOpenError as e:
        log.error(e)
        sys.exit(ExitCode.input_file)
Пример #6
0
def triage_image_file(input_file, output_file, options, log):
    """Check that ``input_file`` is a usable image and convert it to a PDF.

    The file is opened with Pillow and validated: it must carry a credible
    resolution (unless ``options.image_dpi`` is set), must not have an alpha
    channel, and a CMYK image must carry an embedded ICC profile. If
    validation passes, img2pdf converts the file and the result is written
    to ``output_file``.

    Args:
        input_file: Path of the candidate image file.
        output_file: Path the generated PDF is written to.
        options: Options object; ``input_file`` and ``image_dpi`` are read.
        log: Logger used for progress and error messages.

    Raises:
        UnsupportedImageFormatError: The file cannot be opened as an image,
            has an alpha channel, is a CMYK image without an ICC profile, or
            img2pdf cannot open it.
        DpiError: The resolution is missing or not credible and
            ``options.image_dpi`` was not given.
    """
    log.info("Input file is not a PDF, checking if it is an image...")
    try:
        im = Image.open(input_file)
    except EnvironmentError as e:
        # Recover the original filename
        log.error(str(e).replace(input_file, options.input_file))
        raise UnsupportedImageFormatError() from e

    with im:
        log.info("Input file is an image")
        if 'dpi' in im.info:
            # NOTE(review): tuple comparison is lexicographic, so the second
            # component only matters when the first equals 96 -- confirm
            # this is the intended credibility test.
            if im.info['dpi'] <= (96, 96) and not options.image_dpi:
                log.info("Image size: (%d, %d)" % im.size)
                log.info("Image resolution: (%d, %d)" % im.info['dpi'])
                log.error(
                    "Input file is an image, but the resolution (DPI) is "
                    "not credible.  Estimate the resolution at which the "
                    "image was scanned and specify it using --image-dpi."
                )
                raise DpiError()
        elif not options.image_dpi:
            log.info("Image size: (%d, %d)" % im.size)
            log.error(
                "Input file is an image, but has no resolution (DPI) "
                "in its metadata.  Estimate the resolution at which "
                "image was scanned and specify it using --image-dpi."
            )
            raise DpiError()

        if im.mode in ('RGBA', 'LA'):
            log.error(
                "The input image has an alpha channel. Remove the alpha "
                "channel first."
            )
            raise UnsupportedImageFormatError()

        # Pillow exposes an embedded profile under the 'icc_profile' key.
        if 'icc_profile' not in im.info:
            if im.mode == 'RGB':
                log.info("Input image has no ICC profile, assuming sRGB")
            elif im.mode == 'CMYK':
                log.error("Input CMYK image has no ICC profile, not usable")
                raise UnsupportedImageFormatError()

    try:
        log.info("Image seems valid. Try converting to PDF...")
        layout_fun = img2pdf.default_layout_fun
        if options.image_dpi:
            # A user-supplied DPI overrides whatever the image metadata says.
            layout_fun = img2pdf.get_fixed_dpi_layout_fun(
                (options.image_dpi, options.image_dpi)
            )
        with open(output_file, 'wb') as outf:
            img2pdf.convert(
                input_file, layout_fun=layout_fun, with_pdfrw=False, outputstream=outf
            )
        log.info("Successfully converted to PDF, processing...")
    except img2pdf.ImageOpenError as e:
        log.error(e)
        raise UnsupportedImageFormatError() from e
Пример #7
0
def triage_image_file(input_file, output_file, log):
    """Check that ``input_file`` is a usable image and convert it to a PDF.

    The file is opened with Pillow and validated: it must carry a credible
    resolution (unless ``options.image_dpi`` is set), and a CMYK image must
    carry an embedded ICC profile. If validation passes, img2pdf converts
    the file and the result is written to ``output_file``.

    ``options`` is not a parameter; it is resolved from the enclosing scope.

    Args:
        input_file: Path of the candidate image file.
        output_file: Path the generated PDF is written to.
        log: Logger used for progress and error messages.

    Raises:
        SystemExit: With ``ExitCode.input_file`` whenever the file cannot be
            opened, fails validation, or cannot be converted by img2pdf.
    """
    log.info("Input file is not a PDF, checking if it is an image...")
    try:
        im = Image.open(input_file)
    except EnvironmentError as e:
        log.error(e)
        sys.exit(ExitCode.input_file)

    # The context manager closes the image on every exit path, including
    # the SystemExit raised by the validation failures below.
    with im:
        log.info("Input file is an image")

        if 'dpi' in im.info:
            # NOTE(review): tuple comparison is lexicographic, so the second
            # component only matters when the first equals 96 -- confirm
            # this is the intended credibility test.
            if im.info['dpi'] <= (96, 96) and not options.image_dpi:
                log.info("Image size: (%d, %d)" % im.size)
                log.info("Image resolution: (%d, %d)" % im.info['dpi'])
                log.error(
                    "Input file is an image, but the resolution (DPI) is "
                    "not credible.  Estimate the resolution at which the "
                    "image was scanned and specify it using --image-dpi.")
                sys.exit(ExitCode.input_file)
        elif not options.image_dpi:
            log.info("Image size: (%d, %d)" % im.size)
            log.error(
                "Input file is an image, but has no resolution (DPI) "
                "in its metadata.  Estimate the resolution at which "
                "image was scanned and specify it using --image-dpi.")
            sys.exit(ExitCode.input_file)

        # Pillow exposes an embedded profile under the 'icc_profile' key.
        if 'icc_profile' not in im.info:
            if im.mode == 'RGB':
                log.info('Input image has no ICC profile, assuming sRGB')
            elif im.mode == 'CMYK':
                log.error('Input CMYK image has no ICC profile, not usable')
                sys.exit(ExitCode.input_file)

    try:
        log.info("Image seems valid. Try converting to PDF...")
        layout_fun = img2pdf.default_layout_fun
        if options.image_dpi:
            # A user-supplied DPI overrides whatever the image metadata says.
            layout_fun = img2pdf.get_fixed_dpi_layout_fun(
                (options.image_dpi, options.image_dpi))
        with open(output_file, 'wb') as outf:
            img2pdf.convert(
                input_file,
                layout_fun=layout_fun,
                with_pdfrw=False,
                outputstream=outf)
        log.info("Successfully converted to PDF, processing...")
    except img2pdf.ImageOpenError as e:
        log.error(e)
        sys.exit(ExitCode.input_file)
Пример #8
0
def create_pdf_page_from_image(image, page_context):
    """Wrap the page image in a PDF and return the path of that PDF.

    The output path is obtained from ``page_context.get_path('visible.pdf')``
    and the layout uses the page's square DPI.
    """
    visible_pdf = page_context.get_path('visible.pdf')

    # Pages are rasterized at a square DPI because most image processing
    # tools cannot handle rectangular DPI, and the square value describes the
    # image accurately. Resampling back to a non-square DPI here would
    # resemble the input more closely, but the hocr renderer does not
    # understand non-square DPI (the sandwich renderer would be fine).
    square_dpi = get_page_square_dpi(
        page_context.pageinfo, page_context.options
    )
    fixed_layout = img2pdf.get_fixed_dpi_layout_fun((square_dpi, square_dpi))

    # This creates a single-page PDF.
    with open(image, 'rb') as imfile:
        with open(visible_pdf, 'wb') as pdf:
            page_context.log.debug('convert')
            img2pdf.convert(
                imfile,
                with_pdfrw=False,
                layout_fun=fixed_layout,
                outputstream=pdf,
            )
            page_context.log.debug('convert done')

    return visible_pdf
Пример #9
0
 def make_rotate_test(prefix, image_angle, page_angle):
     """Build a test PDF from 'typewriter.png' and return its path.

     The image is transposed according to ``image_angle`` (when non-zero),
     converted to a PDF at a fixed 200x200 DPI, and the first page's
     ``Rotate`` entry is set to ``page_angle``. ``resources`` and ``outdir``
     come from the enclosing scope.
     """
     picture = Image.open(fspath(resources / 'typewriter.png'))
     if image_angle != 0:
         ccw_angle = -image_angle % 360
         # Look up the matching Image.ROTATE_<angle> transpose constant.
         picture = picture.transpose(getattr(Image, f'ROTATE_{ccw_angle}'))

     png_buffer = BytesIO()
     picture.save(png_buffer, format='PNG')

     pdf_buffer = BytesIO()
     fixed_layout = img2pdf.get_fixed_dpi_layout_fun((200, 200))
     img2pdf.convert(
         png_buffer.getvalue(),
         layout_fun=fixed_layout,
         outputstream=pdf_buffer,
     )
     pdf_buffer.seek(0)

     pike = pikepdf.open(pdf_buffer)
     pike.pages[0].Rotate = page_angle
     target = outdir / f'{prefix}_{image_angle}_{page_angle}.pdf'
     pike.save(target)
     return target
Пример #10
0
 def make_rotate_test(prefix, image_angle, page_angle):
     """Create a rotated test PDF from 'typewriter.png' and return its path.

     A non-zero ``image_angle`` transposes the image before it is converted
     to a PDF at a fixed 200x200 DPI; ``page_angle`` is then stored as the
     first page's ``Rotate`` entry. ``resources`` and ``outdir`` come from
     the enclosing scope.
     """
     source_path = fspath(resources / 'typewriter.png')
     im = Image.open(source_path)
     if image_angle != 0:
         ccw_angle = -image_angle % 360
         # Look up the matching Image.ROTATE_<angle> transpose constant.
         transpose_op = getattr(Image, f'ROTATE_{ccw_angle}')
         im = im.transpose(transpose_op)

     png_stream = BytesIO()
     im.save(png_stream, format='PNG')
     png_bytes = png_stream.getvalue()

     pdf_stream = BytesIO()
     img2pdf.convert(
         png_bytes,
         layout_fun=img2pdf.get_fixed_dpi_layout_fun((200, 200)),
         outputstream=pdf_stream,
     )
     pdf_stream.seek(0)

     pike = pikepdf.open(pdf_stream)
     pike.pages[0].Rotate = page_angle
     filename = f'{prefix}_{image_angle}_{page_angle}.pdf'
     target = outdir / filename
     pike.save(target)
     return target
Пример #11
0
def select_image_layer(infiles, output_file, log, pdfinfo, pdfinfo_lock):
    """Select the image layer for one output page.

    ``lossless_reconstruction`` is resolved from outside this function. When
    it is falsy, the page image is converted to a PDF at the page's own DPI
    and written to ``output_file``; otherwise the oriented page PDF is handed
    to ``re_symlink``.
    """
    page_pdf = next(
        name for name in infiles if name.endswith('.ocr.oriented.pdf')
    )
    image = next(name for name in infiles if name.endswith('.image'))

    if not lossless_reconstruction:
        pageinfo = get_pageinfo(image, pdfinfo, pdfinfo_lock)
        page_dpi = get_page_dpi(pageinfo)
        # img2pdf is given plain floats for the horizontal and vertical DPI.
        float_dpi = (float(page_dpi[0]), float(page_dpi[1]))
        fixed_layout = img2pdf.get_fixed_dpi_layout_fun(float_dpi)

        with open(image, 'rb') as imfile:
            with open(output_file, 'wb') as pdf:
                image_bytes = imfile.read()
                img2pdf.convert(
                    image_bytes,
                    with_pdfrw=False,
                    layout_fun=fixed_layout,
                    outputstream=pdf,
                )
    else:
        pageno = page_number(page_pdf)
        log.debug(
            "{:4d}: page eligible for lossless reconstruction".format(pageno)
        )
        re_symlink(page_pdf, output_file)
Пример #12
0
            elif len(
                    response.content
            ) < 1024:  # 图书馆如果页码不存在,也会响应 HTTP 200,并返回一个 142 Bytes 的 content
                print(pic_name + ' 超出页码范围')
                pic_path_list.remove(pic_full_path)
                PAGE_TYPE[type_i][2] = i - 1  # 设置页码数,合并 pdf 时用
                break
            else:
                with open(pic_full_path, 'wb') as f:
                    f.write(response.content)  # 写入图片
                print(pic_name + ' 下载成功')

# Combine the downloaded images into a single PDF
print('开始生成 PDF……')
with open(PDF_PATH, "wb+") as f:  # "w+" overwrites any existing file
    layout_fun = img2pdf.get_fixed_dpi_layout_fun(
        (300, 300))  # Fixed DPI, so that differing cover/body DPI does not yield differing page sizes
    f.write(img2pdf.convert(pic_path_list, layout_fun=layout_fun))

# Add simple bookmarks to the PDF
# from PyPDF2 import PdfFileReader, PdfFileWriter
# def _get_parent_bookmark(current_indent, history_indent, bookmarks):
#     '''The parent of A is the nearest bookmark whose indent is smaller than A's
#     '''
#     assert len(history_indent) == len(bookmarks)
#     if current_indent == 0:
#         return None
#     for i in range(len(history_indent) - 1, -1, -1):
#         # len(history_indent) - 1   ===>   0
#         if history_indent[i] < current_indent:
#             return bookmarks[i]
#     return None