def ocr_engine_hocr(input_file: Path, page_context: PageContext):
    """Ask the OCR engine plugin to generate hOCR and plain text for a page.

    Args:
        input_file: File handed to the OCR engine as its input.
        page_context: Supplies the output paths, the options passed through
            to the engine, and the plugin manager that provides the engine.

    Returns:
        A ``(hocr_path, text_path)`` tuple naming the two files the engine
        was asked to write.
    """
    hocr_path = page_context.get_path('ocr_hocr.hocr')
    text_path = page_context.get_path('ocr_hocr.txt')
    engine_options = page_context.options

    engine = page_context.plugin_manager.hook.get_ocr_engine()
    engine.generate_hocr(
        input_file=input_file,
        output_hocr=hocr_path,
        output_text=text_path,
        options=engine_options,
    )
    return hocr_path, text_path
def ocr_engine_textonly_pdf(input_image: Path, page_context: PageContext):
    """Ask the OCR engine plugin to generate a PDF and plain text for a page.

    Args:
        input_image: Image handed to the OCR engine as its input.
        page_context: Supplies the output paths, the options passed through
            to the engine, and the plugin manager that provides the engine.

    Returns:
        A ``(pdf_path, text_path)`` tuple naming the two files the engine
        was asked to write.
    """
    pdf_path = page_context.get_path('ocr_tess.pdf')
    text_path = page_context.get_path('ocr_tess.txt')
    engine_options = page_context.options

    engine = page_context.plugin_manager.hook.get_ocr_engine()
    engine.generate_pdf(
        input_file=input_image,
        output_pdf=pdf_path,
        output_text=text_path,
        options=engine_options,
    )
    return pdf_path, text_path
def create_pdf_page_from_image(
    image: Path, page_context: PageContext, orientation_correction
):
    """Wrap a page image in a single-page PDF sized from the page info.

    The page size is derived from the page's width and height in inches;
    the two dimensions are swapped when the page rotation minus
    ``orientation_correction`` amounts to a quarter turn.

    Args:
        image: Path of the image to embed.
        page_context: Supplies the output path and the page info.
        orientation_correction: Rotation in degrees subtracted from the
            page's own rotation before deciding whether to swap dimensions.

    Returns:
        Path of the written ``visible.pdf``.
    """
    # We rasterize a square DPI version of each page because most image
    # processing tools don't support rectangular DPI. Use the square DPI as it
    # accurately describes the image. It would be possible to resample the image
    # at this stage back to non-square DPI to more closely resemble the input,
    # except that the hocr renderer does not understand non-square DPI. The
    # sandwich renderer would be fine.
    output_file = page_context.get_path('visible.pdf')
    pageinfo = page_context.pageinfo

    # Inches scaled by 72, i.e. PDF points.
    width_pt = 72.0 * float(pageinfo.width_inches)
    height_pt = 72.0 * float(pageinfo.height_inches)

    # A net rotation of 90 or 270 degrees turns the page on its side.
    effective_rotation = (pageinfo.rotation - orientation_correction) % 360
    if effective_rotation % 180 == 90:
        width_pt, height_pt = height_pt, width_pt
    pagesize = (width_pt, height_pt)

    # This creates a single page PDF
    with open(image, 'rb') as imfile, open(output_file, 'wb') as pdf:
        log.debug('convert')
        layout_fun = img2pdf.get_layout_fun(pagesize)
        img2pdf.convert(
            imfile,
            with_pdfrw=False,
            layout_fun=layout_fun,
            outputstream=pdf,
        )
        log.debug('convert done')
    return output_file
def preprocess_remove_background(input_file: Path, page_context: PageContext):
    """Remove the background from a page image unless the page is mono.

    A page counts as mono when none of its images has ``bpc`` greater
    than 1 (presumably bits per component); in that case the input is
    returned untouched.

    Args:
        input_file: Path of the image to process.
        page_context: Supplies the page info and the output path.

    Returns:
        Path of ``pp_rm_bg.png`` when background removal ran, otherwise
        ``input_file`` itself.
    """
    has_multibit_image = any(
        image.bpc > 1 for image in page_context.pageinfo.images
    )
    if not has_multibit_image:
        log.info("background removal skipped on mono page")
        return input_file

    output_file = page_context.get_path('pp_rm_bg.png')
    leptonica.remove_background(input_file, output_file)
    return output_file
def create_ocr_image(image: Path, page_context: PageContext):
    """Create the image we send for OCR.

    May not be the same as the display image depending on preprocessing.
    This image will never be shown to the user.

    Processing steps, in order:

    1. Unless ``options.force_ocr`` is set, every text area returned by
       ``pageinfo.get_textareas`` is painted white so it is excluded from
       OCR.
    2. If ``options.threshold`` is set, leptonica's
       ``masked_threshold_on_background_norm`` is applied.
    3. The ``filter_ocr_image`` plugin hook may replace the image.
    4. The result is saved as ``ocr.png`` with integer DPI.

    Args:
        image: Path of the page image to start from. Its ``info['dpi']``
            entry is read unconditionally, so a missing entry raises
            ``KeyError``.
        page_context: Supplies the output path, options, page info and
            plugin manager.

    Returns:
        Path of the written OCR image.
    """
    output_file = page_context.get_path('ocr.png')
    options = page_context.options
    with Image.open(image) as im:
        white = ImageColor.getcolor('#ffffff', im.mode)
        draw = ImageDraw.ImageDraw(im)

        log.debug('resolution %r', im.info['dpi'])

        # Text areas are not masked when forcing OCR, because then all of
        # them need to be OCRed.
        if not options.force_ocr:
            # visible=None excludes both visible and invisible text from
            # OCR; with redo_ocr only visible text is masked, invisible
            # text is left in place.
            visible = True if options.redo_ocr else None

            # Calculate resolution based on the image size and page dimensions
            # without regard whatever resolution is in pageinfo (may differ or
            # be None). The scale depends only on the image, so compute it
            # once rather than for every text area.
            xyscale = tuple(float(coord) / 72.0 for coord in im.info['dpi'])

            for textarea in page_context.pageinfo.get_textareas(
                visible=visible, corrupt=None
            ):
                bbox = [float(v) for v in textarea]
                # The y coordinates are subtracted from the image height:
                # the box is measured from the bottom edge while pixel rows
                # count from the top.
                pixcoords = [
                    bbox[0] * xyscale[0],
                    im.height - bbox[3] * xyscale[1],
                    bbox[2] * xyscale[0],
                    im.height - bbox[1] * xyscale[1],
                ]
                pixcoords = [int(round(c)) for c in pixcoords]
                log.debug('blanking %r', pixcoords)
                draw.rectangle(pixcoords, fill=white)

        if options.threshold:
            pix = leptonica.Pix.frompil(im)
            pix = pix.masked_threshold_on_background_norm()
            im_pix = pix.topil()
            # Carry the DPI over to the converted image explicitly.
            im_pix.info['dpi'] = im.info['dpi']
            im = im_pix

        # Drawing is finished; drop the draw object before handing the image
        # to plugins.
        del draw

        filter_im = page_context.plugin_manager.hook.filter_ocr_image(
            page=page_context, image=im
        )
        if filter_im is not None:
            im = filter_im

        # Pillow requires integer DPI
        dpi = tuple(round(coord) for coord in im.info['dpi'])
        im.save(output_file, dpi=dpi)
    return output_file
def preprocess_clean(input_file: Path, page_context: PageContext):
    """Clean a page image with unpaper.

    Args:
        input_file: Path of the image to clean.
        page_context: Supplies the output path, the page info and options
            used to obtain the page's square DPI, and ``unpaper_args``.

    Returns:
        Path of the written ``pp_clean.png``.
    """
    cleaned_file = page_context.get_path('pp_clean.png')
    square_dpi = get_page_square_dpi(
        page_context.pageinfo, page_context.options
    )
    # Only the x component of the square DPI is passed on.
    unpaper.clean(
        input_file,
        cleaned_file,
        dpi=square_dpi.x,
        unpaper_args=page_context.options.unpaper_args,
    )
    return cleaned_file
def rasterize(
    input_file: Path,
    page_context: PageContext,
    correction: int = 0,
    output_tag: str = '',
    remove_vectors=None,
):
    """Rasterize a PDF page to PNG through the ``rasterize_pdf_page`` hook.

    The raster device is the most capable one demanded by any image on the
    page, starting from ``pngmono``; a page with vector content always gets
    ``png16m``.

    Args:
        input_file: PDF file to rasterize.
        page_context: Supplies the page info, options, output path and
            plugin manager.
        correction: Rotation passed to the hook as ``rotation``.
        output_tag: Suffix inserted into the output file name.
        remove_vectors: Passed to the hook as ``filter_vector``; when
            ``None``, ``options.remove_vectors`` is used instead.

    Returns:
        Path of the written ``rasterize{output_tag}.png``.
    """
    if remove_vectors is None:
        remove_vectors = page_context.options.remove_vectors

    output_file = page_context.get_path(f'rasterize{output_tag}.png')
    pageinfo = page_context.pageinfo

    # Ordered from least to most capable device.
    colorspaces = ['pngmono', 'pnggray', 'png256', 'png16m']

    # Collect every device some part of the page demands, then take the most
    # capable one. 'pngmono' is the floor.
    demanded = ['pngmono']
    for image in pageinfo.images:
        if image.type_ != 'image':
            continue  # ignore masks
        if not image.bpc > 1:
            continue
        if image.color == Colorspace.index:
            demanded.append('png256')
        elif image.color == Colorspace.gray:
            demanded.append('pnggray')
        else:
            demanded.append('png16m')

    if pageinfo.has_vector:
        demanded.append('png16m')

    device = max(demanded, key=colorspaces.index)

    log.debug(f"Rasterize with {device}, rotation {correction}")

    # Produce the page image with square resolution or else deskew and OCR
    # will not work properly.
    canvas_dpi = get_canvas_square_dpi(pageinfo, page_context.options)
    page_dpi = get_page_square_dpi(pageinfo, page_context.options)

    page_context.plugin_manager.hook.rasterize_pdf_page(
        input_file=input_file,
        output_file=output_file,
        raster_device=device,
        raster_dpi=canvas_dpi,
        page_dpi=page_dpi,
        pageno=pageinfo.pageno + 1,
        rotation=correction,
        filter_vector=remove_vectors,
    )
    return output_file
def render_hocr_page(hocr: Path, page_context: PageContext):
    """Render an hOCR file to a PDF without an image and with invisible text.

    Args:
        hocr: Path of the hOCR file to render.
        page_context: Supplies the output path and the page info and options
            used to obtain the page's square DPI.

    Returns:
        Path of the written ``ocr_hocr.pdf``.
    """
    pdf_path = page_context.get_path('ocr_hocr.pdf')
    square_dpi = get_page_square_dpi(
        page_context.pageinfo, page_context.options
    )

    # The DPI is square, so only its x component is passed.
    renderer = HocrTransform(hocr, square_dpi.x)
    renderer.to_pdf(
        pdf_path,
        image_filename=None,
        show_bounding_boxes=False,
        invisible_text=True,
        interword_spaces=True,
    )
    return pdf_path
def rasterize_preview(input_file: Path, page_context: PageContext):
    """Rasterize a PDF page to a grayscale JPEG preview.

    Uses the ``rasterize_pdf_page`` plugin hook with the ``jpeggray``
    device and the page's square canvas and page DPI.

    Args:
        input_file: PDF file to rasterize.
        page_context: Supplies the page info, options, output path and
            plugin manager.

    Returns:
        Path of the written ``rasterize_preview.jpg``.
    """
    preview_file = page_context.get_path('rasterize_preview.jpg')

    canvas_dpi = get_canvas_square_dpi(
        page_context.pageinfo, page_context.options
    )
    page_dpi = get_page_square_dpi(
        page_context.pageinfo, page_context.options
    )

    rasterize_hook = page_context.plugin_manager.hook.rasterize_pdf_page
    rasterize_hook(
        input_file=input_file,
        output_file=preview_file,
        raster_device='jpeggray',
        raster_dpi=canvas_dpi,
        page_dpi=page_dpi,
        pageno=page_context.pageinfo.pageno + 1,
    )
    return preview_file
def render_hocr_page(hocr: Path, page_context: PageContext):
    """Render an hOCR file to a PDF containing no image.

    Normally the text is invisible and no bounding boxes are drawn. When
    ``options.pdf_renderer`` equals ``'hocrdebug'`` this is reversed: the
    text is visible and bounding boxes are shown.

    Args:
        hocr: Path of the hOCR file to render.
        page_context: Supplies the options, the output path and the page
            info used to obtain the page's square DPI.

    Returns:
        Path of the written ``ocr_hocr.pdf``.
    """
    options = page_context.options
    pdf_path = page_context.get_path('ocr_hocr.pdf')
    square_dpi = get_page_square_dpi(page_context.pageinfo, options)
    debug_mode = options.pdf_renderer == 'hocrdebug'

    # The DPI is square, so only its x component is passed.
    renderer = HocrTransform(hocr_filename=hocr, dpi=square_dpi.x)
    renderer.to_pdf(
        out_filename=pdf_path,
        image_filename=None,
        show_bounding_boxes=debug_mode,
        invisible_text=not debug_mode,
        interword_spaces=True,
    )
    return pdf_path
def create_visible_page_jpg(image: Path, page_context: PageContext) -> Path:
    """Re-save a page image as a JPEG with DPI metadata attached.

    Args:
        image: Path of the image to convert.
        page_context: Supplies the output path and, when the image carries
            no DPI, the page info and options used for the fallback DPI.

    Returns:
        Path of the written ``visible.jpg``.
    """
    output_file = page_context.get_path('visible.jpg')
    with Image.open(image) as im:
        # At this point the image should be a .png, but deskew, unpaper
        # might have removed the DPI information. In this case, fall back to
        # square DPI used to rasterize. When the preview image was
        # rasterized, it was also converted to square resolution, which is
        # what we want to give to the OCR engine, so keep it square.
        dpi = (
            Resolution(*im.info['dpi'])
            if 'dpi' in im.info
            # Fallback to page-implied DPI
            else get_page_square_dpi(
                page_context.pageinfo, page_context.options
            )
        )

        # Pillow requires integer DPI
        im.save(output_file, format='JPEG', dpi=dpi.to_int())
    return output_file
def create_pdf_page_from_image(image: Path, page_context: PageContext):
    """Wrap a page image in a single-page PDF laid out at the page's square DPI.

    Args:
        image: Path of the image to embed.
        page_context: Supplies the output path and the page info and options
            used to obtain the page's square DPI.

    Returns:
        Path of the written ``visible.pdf``.
    """
    # We rasterize a square DPI version of each page because most image
    # processing tools don't support rectangular DPI. Use the square DPI as it
    # accurately describes the image. It would be possible to resample the image
    # at this stage back to non-square DPI to more closely resemble the input,
    # except that the hocr renderer does not understand non-square DPI. The
    # sandwich renderer would be fine.
    pdf_path = page_context.get_path('visible.pdf')
    square_dpi = get_page_square_dpi(
        page_context.pageinfo, page_context.options
    )
    layout_fun = img2pdf.get_fixed_dpi_layout_fun(square_dpi)

    # This creates a single page PDF
    with open(image, 'rb') as imfile, open(pdf_path, 'wb') as pdf:
        log.debug('convert')
        img2pdf.convert(
            imfile,
            with_pdfrw=False,
            layout_fun=layout_fun,
            outputstream=pdf,
        )
        log.debug('convert done')
    return pdf_path
def preprocess_deskew(input_file: Path, page_context: PageContext):
    """Deskew a page image with leptonica.

    Args:
        input_file: Path of the image to deskew.
        page_context: Supplies the output path and the page info and options
            used to obtain the page's square DPI.

    Returns:
        Path of the written ``pp_deskew.png``.
    """
    deskewed_file = page_context.get_path('pp_deskew.png')
    square_dpi = get_page_square_dpi(
        page_context.pageinfo, page_context.options
    )
    # Only the x component of the square DPI is passed on.
    leptonica.deskew(input_file, deskewed_file, square_dpi.x)
    return deskewed_file