Пример #1
0
def ocr_engine_hocr(input_file: Path, page_context: PageContext):
    """Run the plugin-provided OCR engine to produce hOCR and text output.

    The engine object is obtained from the ``get_ocr_engine`` plugin hook and
    its ``generate_hocr`` method is invoked with the page's options.

    Returns:
        A ``(hocr_path, text_path)`` tuple; both paths come from
        ``page_context.get_path``.
    """
    outputs = (
        page_context.get_path('ocr_hocr.hocr'),
        page_context.get_path('ocr_hocr.txt'),
    )
    hocr_path, text_path = outputs
    page_options = page_context.options

    engine = page_context.plugin_manager.hook.get_ocr_engine()
    engine.generate_hocr(
        input_file=input_file,
        output_hocr=hocr_path,
        output_text=text_path,
        options=page_options,
    )
    return outputs
Пример #2
0
def ocr_engine_textonly_pdf(input_image: Path, page_context: PageContext):
    """Run the plugin-provided OCR engine to produce a PDF and a text file.

    The engine object is obtained from the ``get_ocr_engine`` plugin hook and
    its ``generate_pdf`` method is invoked with the page's options. Judging by
    the function name, the PDF is presumably text-only; that is decided by the
    engine, not by this function.

    Returns:
        A ``(pdf_path, text_path)`` tuple; both paths come from
        ``page_context.get_path``.
    """
    outputs = (
        page_context.get_path('ocr_tess.pdf'),
        page_context.get_path('ocr_tess.txt'),
    )
    pdf_path, text_path = outputs
    page_options = page_context.options

    engine = page_context.plugin_manager.hook.get_ocr_engine()
    engine.generate_pdf(
        input_file=input_image,
        output_pdf=pdf_path,
        output_text=text_path,
        options=page_options,
    )
    return outputs
Пример #3
0
def create_pdf_page_from_image(image: Path, page_context: PageContext,
                               orientation_correction):
    """Wrap a page image in a single-page PDF via img2pdf.

    The page size handed to ``img2pdf.get_layout_fun`` is
    ``pageinfo.width_inches`` and ``pageinfo.height_inches`` multiplied by
    72.0. Width and height are swapped when the page rotation, less
    ``orientation_correction``, amounts to 90 or 270 degrees.

    Returns:
        The path of the written PDF (``visible.pdf`` from
        ``page_context.get_path``).
    """
    # We rasterize a square DPI version of each page because most image
    # processing tools don't support rectangular DPI. Use the square DPI as it
    # accurately describes the image. It would be possible to resample the
    # image at this stage back to non-square DPI to more closely resemble the
    # input, except that the hocr renderer does not understand non-square DPI.
    # The sandwich renderer would be fine.
    output_file = page_context.get_path('visible.pdf')
    pageinfo = page_context.pageinfo

    width = 72.0 * float(pageinfo.width_inches)
    height = 72.0 * float(pageinfo.height_inches)
    net_rotation = (pageinfo.rotation - orientation_correction) % 360
    # A quarter turn (90 or 270 degrees) exchanges width and height.
    turned_sideways = net_rotation % 180 == 90
    pagesize = (height, width) if turned_sideways else (width, height)

    # This creates a single page PDF
    with open(image, 'rb') as imfile, open(output_file, 'wb') as pdf:
        log.debug('convert')
        img2pdf.convert(
            imfile,
            with_pdfrw=False,
            layout_fun=img2pdf.get_layout_fun(pagesize),
            outputstream=pdf,
        )
        log.debug('convert done')

    return output_file
Пример #4
0
def preprocess_remove_background(input_file: Path, page_context: PageContext):
    """Remove the page background unless no image on the page has bpc > 1.

    Returns:
        The path of the background-removed image (``pp_rm_bg.png``), or
        ``input_file`` unchanged when the step is skipped.
    """
    page_images = page_context.pageinfo.images
    if not any(img.bpc > 1 for img in page_images):
        # Nothing deeper than 1 bpc on this page, so there is no background
        # to remove; pass the input through untouched.
        log.info("background removal skipped on mono page")
        return input_file

    output_file = page_context.get_path('pp_rm_bg.png')
    leptonica.remove_background(input_file, output_file)
    return output_file
Пример #5
0
def create_ocr_image(image: Path, page_context: PageContext):
    """Create the image we send for OCR.

    May not be the same as the display image depending on preprocessing. This
    image will never be shown to the user.

    Unless ``options.force_ocr`` is set, the text areas reported by
    ``pageinfo.get_textareas`` are painted white first. If
    ``options.threshold`` is set, the image is then passed through leptonica's
    ``masked_threshold_on_background_norm``. Finally the ``filter_ocr_image``
    plugin hook may substitute its own image.

    Returns:
        The path of the saved image (``ocr.png`` from
        ``page_context.get_path``).
    """
    output_file = page_context.get_path('ocr.png')
    options = page_context.options
    with Image.open(image) as im:
        white = ImageColor.getcolor('#ffffff', im.mode)
        draw = ImageDraw.ImageDraw(im)

        log.debug('resolution %r', im.info['dpi'])

        # Do not mask text areas when forcing OCR, because we need to OCR
        # all text areas
        if not options.force_ocr:
            # None: exclude both visible and invisible text from OCR.
            # True (redo_ocr): mask visible text, but not invisible text.
            visible = True if options.redo_ocr else None
            textareas = page_context.pageinfo.get_textareas(visible=visible,
                                                            corrupt=None)
            for textarea in textareas:
                # Calculate resolution based on the image size and page
                # dimensions without regard whatever resolution is in
                # pageinfo (may differ or be None)
                area = [float(v) for v in textarea]
                scale = tuple(float(d) / 72.0 for d in im.info['dpi'])
                # The vertical coordinates are measured from the opposite
                # edge in the image, hence the subtraction from im.height.
                box = (
                    area[0] * scale[0],
                    im.height - area[3] * scale[1],
                    area[2] * scale[0],
                    im.height - area[1] * scale[1],
                )
                pixcoords = [int(round(c)) for c in box]
                log.debug('blanking %r', pixcoords)
                draw.rectangle(pixcoords, fill=white)

        if options.threshold:
            source_dpi = im.info['dpi']
            thresholded = leptonica.Pix.frompil(im)
            thresholded = thresholded.masked_threshold_on_background_norm()
            im = thresholded.topil()
            # Carry the DPI over explicitly onto the converted image.
            im.info['dpi'] = source_dpi

        del draw

        replacement = page_context.plugin_manager.hook.filter_ocr_image(
            page=page_context, image=im)
        im = replacement if replacement is not None else im

        # Pillow requires integer DPI
        integer_dpi = tuple(round(d) for d in im.info['dpi'])
        im.save(output_file, dpi=integer_dpi)
    return output_file
Пример #6
0
def preprocess_clean(input_file: Path, page_context: PageContext):
    """Clean the page image with ``unpaper.clean``.

    The x component of the page's square DPI and the user's
    ``options.unpaper_args`` are forwarded to unpaper.

    Returns:
        The path of the cleaned image (``pp_clean.png``).
    """
    cleaned = page_context.get_path('pp_clean.png')
    square_dpi = get_page_square_dpi(page_context.pageinfo,
                                     page_context.options)
    extra_args = page_context.options.unpaper_args
    unpaper.clean(input_file, cleaned, dpi=square_dpi.x,
                  unpaper_args=extra_args)
    return cleaned
Пример #7
0
def rasterize(
    input_file: Path,
    page_context: PageContext,
    correction: int = 0,
    output_tag: str = '',
    remove_vectors=None,
):
    """Rasterize a PDF page to PNG through the ``rasterize_pdf_page`` hook.

    The raster device is the highest-ranked entry of
    ``pngmono < pnggray < png256 < png16m`` required by any image on the page
    (masks and images with bpc <= 1 are ignored); a page with vector content
    always uses ``png16m``.

    Args:
        input_file: Passed to the hook as ``input_file``.
        page_context: Supplies pageinfo, options, output paths and plugins.
        correction: Passed to the hook as ``rotation``.
        output_tag: Inserted into the output file name
            (``rasterize{output_tag}.png``).
        remove_vectors: Passed to the hook as ``filter_vector``; when None,
            ``options.remove_vectors`` is used.

    Returns:
        The path of the rasterized PNG.
    """
    # Ordered by rank: when several devices are required, the one with the
    # highest index wins.
    colorspaces = ['pngmono', 'pnggray', 'png256', 'png16m']

    if remove_vectors is None:
        remove_vectors = page_context.options.remove_vectors

    output_file = page_context.get_path(f'rasterize{output_tag}.png')
    pageinfo = page_context.pageinfo

    def device_for(image):
        # Device needed to represent one image with bpc > 1.
        if image.color == Colorspace.index:
            return 'png256'
        if image.color == Colorspace.gray:
            return 'pnggray'
        return 'png16m'

    required = [colorspaces[0]]
    required.extend(
        device_for(image)
        for image in pageinfo.images
        # ignore masks, and images that the baseline device already covers
        if image.type_ == 'image' and image.bpc > 1
    )
    if pageinfo.has_vector:
        required.append('png16m')

    device = max(required, key=colorspaces.index)

    log.debug(f"Rasterize with {device}, rotation {correction}")

    # Produce the page image with square resolution or else deskew and OCR
    # will not work properly.
    canvas_dpi = get_canvas_square_dpi(pageinfo, page_context.options)
    page_dpi = get_page_square_dpi(pageinfo, page_context.options)

    page_context.plugin_manager.hook.rasterize_pdf_page(
        input_file=input_file,
        output_file=output_file,
        raster_device=device,
        raster_dpi=canvas_dpi,
        page_dpi=page_dpi,
        # NOTE(review): presumably pageinfo.pageno is zero-based and the hook
        # expects a one-based page number -- confirm against the hook spec.
        pageno=pageinfo.pageno + 1,
        rotation=correction,
        filter_vector=remove_vectors,
    )
    return output_file
Пример #8
0
def render_hocr_page(hocr: Path, page_context: PageContext):
    """Render an hOCR file to a PDF with ``HocrTransform``.

    The PDF is rendered with invisible text, no bounding boxes, no image and
    inter-word spaces enabled.

    Returns:
        The path of the rendered PDF (``ocr_hocr.pdf``).
    """
    rendered_pdf = page_context.get_path('ocr_hocr.pdf')
    square_dpi = get_page_square_dpi(page_context.pageinfo,
                                     page_context.options)
    render_options = dict(
        image_filename=None,
        show_bounding_boxes=False,
        invisible_text=True,
        interword_spaces=True,
    )
    # The DPI is square, so the x component alone describes it.
    HocrTransform(hocr, square_dpi.x).to_pdf(rendered_pdf, **render_options)
    return rendered_pdf
Пример #9
0
def rasterize_preview(input_file: Path, page_context: PageContext):
    """Rasterize a PDF page to a preview JPEG with the ``jpeggray`` device.

    The work is delegated to the ``rasterize_pdf_page`` plugin hook using the
    canvas and page square DPI computed for this page.

    Returns:
        The path of the preview image (``rasterize_preview.jpg``).
    """
    pageinfo = page_context.pageinfo
    options = page_context.options

    preview = page_context.get_path('rasterize_preview.jpg')
    canvas_dpi = get_canvas_square_dpi(pageinfo, options)
    page_dpi = get_page_square_dpi(pageinfo, options)

    hook_args = dict(
        input_file=input_file,
        output_file=preview,
        raster_device='jpeggray',
        raster_dpi=canvas_dpi,
        page_dpi=page_dpi,
        # NOTE(review): presumably pageinfo.pageno is zero-based and the hook
        # expects a one-based page number -- confirm against the hook spec.
        pageno=pageinfo.pageno + 1,
    )
    page_context.plugin_manager.hook.rasterize_pdf_page(**hook_args)
    return preview
Пример #10
0
def render_hocr_page(hocr: Path, page_context: PageContext):
    """Render an hOCR file to a PDF with ``HocrTransform``.

    When ``options.pdf_renderer`` is ``'hocrdebug'``, bounding boxes are shown
    and the text is rendered visibly; otherwise the text is invisible and no
    bounding boxes are drawn. No image is included and inter-word spaces are
    always enabled.

    Args:
        hocr: Path of the hOCR file to render.
        page_context: Supplies the options, pageinfo and output path.

    Returns:
        The path of the rendered PDF (``ocr_hocr.pdf``).
    """
    options = page_context.options
    output_file = page_context.get_path('ocr_hocr.pdf')
    dpi = get_page_square_dpi(page_context.pageinfo, options)
    # The comparison already yields a bool, so it is passed straight through
    # (and negated) rather than wrapped in conditional expressions.
    debug_mode = options.pdf_renderer == 'hocrdebug'

    hocrtransform = HocrTransform(hocr_filename=hocr, dpi=dpi.x)  # square
    hocrtransform.to_pdf(
        out_filename=output_file,
        image_filename=None,
        show_bounding_boxes=debug_mode,
        invisible_text=not debug_mode,
        interword_spaces=True,
    )
    return output_file
Пример #11
0
def create_visible_page_jpg(image: Path, page_context: PageContext) -> Path:
    """Re-save the page image as a JPEG, keeping (or reconstructing) its DPI.

    Returns:
        The path of the written JPEG (``visible.jpg``).
    """
    visible_jpg = page_context.get_path('visible.jpg')
    with Image.open(image) as im:
        # At this point the image should be a .png, but deskew, unpaper
        # might have removed the DPI information. In this case, fall back to
        # square DPI used to rasterize. When the preview image was
        # rasterized, it was also converted to square resolution, which is
        # what we want to give to the OCR engine, so keep it square.
        if 'dpi' not in im.info:
            # Fallback to page-implied DPI
            resolution = get_page_square_dpi(page_context.pageinfo,
                                             page_context.options)
        else:
            resolution = Resolution(*im.info['dpi'])

        # Pillow requires integer DPI
        im.save(visible_jpg, format='JPEG', dpi=resolution.to_int())
    return visible_jpg
Пример #12
0
def create_pdf_page_from_image(image: Path, page_context: PageContext):
    """Wrap a page image in a single-page PDF via img2pdf at a fixed DPI.

    The layout function comes from ``img2pdf.get_fixed_dpi_layout_fun`` and is
    given the page's square DPI.

    Returns:
        The path of the written PDF (``visible.pdf`` from
        ``page_context.get_path``).
    """
    # We rasterize a square DPI version of each page because most image
    # processing tools don't support rectangular DPI. Use the square DPI as it
    # accurately describes the image. It would be possible to resample the
    # image at this stage back to non-square DPI to more closely resemble the
    # input, except that the hocr renderer does not understand non-square DPI.
    # The sandwich renderer would be fine.
    visible_pdf = page_context.get_path('visible.pdf')
    square_dpi = get_page_square_dpi(page_context.pageinfo,
                                     page_context.options)
    fixed_dpi_layout = img2pdf.get_fixed_dpi_layout_fun(square_dpi)

    # This creates a single page PDF
    with open(image, 'rb') as imfile, open(visible_pdf, 'wb') as pdf:
        log.debug('convert')
        img2pdf.convert(
            imfile,
            with_pdfrw=False,
            layout_fun=fixed_dpi_layout,
            outputstream=pdf,
        )
        log.debug('convert done')
    return visible_pdf
Пример #13
0
def preprocess_deskew(input_file: Path, page_context: PageContext):
    """Deskew the page image with ``leptonica.deskew``.

    The x component of the page's square DPI is passed to leptonica.

    Returns:
        The path of the deskewed image (``pp_deskew.png``).
    """
    deskewed = page_context.get_path('pp_deskew.png')
    square_dpi = get_page_square_dpi(page_context.pageinfo,
                                     page_context.options)
    leptonica.deskew(input_file, deskewed, square_dpi.x)
    return deskewed