def estimate_bounds(self, page, page_image, tessapi, zoom=1.0):
    """Get outer bounds of all (existing or detected) regions.

    Let Tesseract detect sparse text blocks on ``page_image`` and fold
    their bounding boxes into a running extent, then widen that extent
    further by any Text regions already annotated on ``page``.

    Arguments:
        page: PAGE-XML Page object providing existing region coordinates.
        page_image: PIL image of the page (used both for detection and
            as the upper limit of the extent).
        tessapi: an initialized ``tesserocr.PyTessBaseAPI`` instance.
        zoom (float): scale factor (300/DPI) used to adapt the minimum
            region size thresholds to the image resolution.

    Returns:
        A 4-tuple ``(left, top, right, bottom)`` of the combined extent
        in page image coordinates. If nothing was found, left/top remain
        at the image dimensions and right/bottom at 0 (an empty extent).
    """
    LOG = getLogger('processor.TesserocrCrop')
    # running extremes, initialized to the "empty" extent
    all_left = page_image.width
    all_top = page_image.height
    all_right = 0
    all_bottom = 0
    LOG.info("Cropping with Tesseract")
    tessapi.SetImage(page_image)
    # PSM.SPARSE_TEXT: get as much text as possible in no particular order
    # PSM.AUTO (default): includes tables (dangerous)
    # PSM.SPARSE_TEXT_OSD: sparse but all orientations
    tessapi.SetPageSegMode(tesserocr.PSM.SPARSE_TEXT)
    # iterate over all text blocks and compare their
    # bbox extent to the running min and max values
    for component in tessapi.GetComponentImages(tesserocr.RIL.BLOCK, True):
        image, xywh, index, _ = component
        # the region reference in the reading order element
        ID = "region%04d" % index
        left, top, right, bottom = bbox_from_xywh(xywh)
        LOG.debug("Detected text region '%s': %i:%i,%i:%i",
                  ID, left, right, top, bottom)
        # filter region results:
        bin_bbox = image.getbbox()
        if not bin_bbox:
            # this does happen!
            LOG.info("Ignoring region '%s' because its binarization is empty", ID)
            continue
        width = bin_bbox[2] - bin_bbox[0]
        if width < 25 / zoom:
            # we must be conservative here: page numbers are tiny regions, too!
            LOG.info("Ignoring region '%s' because its width is too small (%d)", ID, width)
            continue
        height = bin_bbox[3] - bin_bbox[1]
        if height < 25 / zoom:
            # we must be conservative here: page numbers are tiny regions, too!
            # (same severity as the width filter above, for consistent logging)
            LOG.info("Ignoring region '%s' because its height is too small (%d)", ID, height)
            continue
        all_left = min(all_left, left)
        all_top = min(all_top, top)
        all_right = max(all_right, right)
        all_bottom = max(all_bottom, bottom)
    # use existing segmentation as "upper bound"
    regions = page.get_AllRegions(classes=['Text'])
    for region in regions:
        left, top, right, bottom = bbox_from_points(region.get_Coords().points)
        LOG.debug("Found existing text region '%s': %i:%i,%i:%i",
                  region.id, left, right, top, bottom)
        all_left = min(all_left, left)
        all_top = min(all_top, top)
        all_right = max(all_right, right)
        all_bottom = max(all_bottom, bottom)
    LOG.info("Combined page bounds from text regions: %i:%i,%i:%i",
             all_left, all_right, all_top, all_bottom)
    return all_left, all_top, all_right, all_bottom
def convert_text(self):
    """Convert all top-level PAGE-XML regions to ALTO block elements.

    Iterates the page's top-level regions in the configured reading
    order, maps each PAGE region type to its ALTO counterpart via
    ``REGION_PAGE_TO_ALTO``, attaches the new ALTO element either to
    the PrintSpace or to the first Margin that contains its bbox, and
    copies ID, position, shape, language, text/paragraph styles and
    layout tags over (the latter gated by the target ALTO version).
    Text and Table regions additionally get their content converted.

    Raises:
        ValueError: if a PAGE region type has no ALTO counterpart.
    """
    for reg_page in self.page_page.get_AllRegions(depth=0, order=self.region_order):
        # strip the 'RegionType' class-name suffix, e.g. 'TextRegionType' -> 'Text'
        reg_page_type = reg_page.__class__.__name__[0:-10]  # len('RegionType') == 10
        reg_alto_type = REGION_PAGE_TO_ALTO[reg_page_type]
        if not reg_alto_type:
            raise ValueError("Cannot handle PAGE-XML %sRegion" % reg_page_type)
        # determine if the region belongs to PrintSpace or to any of the Margins
        reg_bbox = bbox_from_points(reg_page.get_Coords().points)
        if contains(self.alto_printspace, reg_bbox):
            parent = self.alto_printspace
        else:
            parent = None
            for margin in ['LeftMargin', 'RightMargin', 'TopMargin', 'BottomMargin']:
                if not hasattr(self.alto_page, margin):
                    continue
                margin = getattr(self.alto_page, margin)
                if contains(margin, reg_bbox):
                    parent = margin
                    break  # pick first match only
        if not parent:
            # neither PrintSpace nor any Margin fully contains the region:
            # fall back to PrintSpace, but warn about the geometry mismatch
            parent = self.alto_printspace
            self.logger.warning(
                "region '%s' not properly contained in PrintSpace or Margins",
                reg_page.id)
        reg_alto = ET.SubElement(parent, reg_alto_type)
        set_alto_id_from_page_id(reg_alto, reg_page)
        set_alto_xywh_from_coords(reg_alto, reg_page)
        # Shape polygons only exist since ALTO 3.1
        if version.parse(self.alto_version) >= version.parse('3.1'):
            set_alto_shape_from_coords(reg_alto, reg_page)
        # language attributes only exist since ALTO 2.1
        if version.parse(self.alto_version) >= version.parse('2.1'):
            set_alto_lang_from_page_lang(reg_alto, reg_page)
        self.textstyle_mgr.set_alto_styleref_from_textstyle(reg_alto, reg_page)
        self.parastyle_mgr.set_alto_styleref_from_textstyle(reg_alto, reg_page)
        # layout tags only exist since ALTO 2.1
        if version.parse(self.alto_version) >= version.parse('2.1'):
            self.layouttag_mgr.set_alto_tag_from_type(reg_alto, reg_page)
        if reg_page_type == 'Text':
            self._convert_textlines(reg_alto, reg_page)
        elif reg_page_type == 'Table':
            self._convert_table(reg_alto, reg_page)
def process(self):
    """Performs border detection on the workspace.

    For each input PAGE file: download and deserialize it, warn about
    any pre-existing Border, load the binarized (and not yet cropped)
    page image, run the page-level segment processor, and serialize
    the result to the output file group.

    Raises:
        Exception: if ``operation_level`` is anything other than "page".
    """
    assert_file_grp_cardinality(self.input_file_grp, 1)
    assert_file_grp_cardinality(self.output_file_grp, 1)
    LOG = getLogger('OcrdAnybaseocrCropper')
    oplevel = self.parameter['operation_level']
    for (n, input_file) in enumerate(self.input_files):
        page_id = input_file.pageId or input_file.ID
        LOG.info("INPUT FILE %i / %s", n, page_id)
        pcgts = page_from_file(self.workspace.download_file(input_file))
        self.add_metadata(pcgts)
        page = pcgts.get_Page()
        # Check for existing Border --> already cropped
        border = page.get_Border()
        if border:
            left, top, right, bottom = bbox_from_points(
                border.get_Coords().points)
            LOG.warning('Overwriting existing Border: %i:%i,%i:%i',
                        left, top, right, bottom)
        page_image, page_coords, page_image_info = self.workspace.image_from_page(
            page, page_id,
            feature_filter='cropped',
            feature_selector='binarized')  # should also be deskewed
        if oplevel == "page":
            self._process_segment(
                page_image, page, page_coords, page_id, input_file, n)
        else:
            # format the message here — passing (template, arg) to Exception
            # would stringify as a tuple instead of interpolating
            raise Exception(
                'Operation level %s, but should be "page".' % oplevel)
        file_id = make_file_id(input_file, self.output_file_grp)
        pcgts.set_pcGtsId(file_id)
        self.workspace.add_file(
            ID=file_id,
            file_grp=self.output_file_grp,
            pageId=input_file.pageId,
            mimetype=MIMETYPE_PAGE,
            local_filename=os.path.join(self.output_file_grp, file_id + '.xml'),
            content=to_xml(pcgts).encode('utf-8')
        )
def test_bbox_from_points(self):
    """bbox_from_points yields (min_x, min_y, max_x, max_y) of the polygon."""
    points = '100,100 200,100 200,200 100,200'
    expected = (100, 100, 200, 200)
    self.assertEqual(bbox_from_points(points), expected)
def convert_border(self):
    """Convert the PAGE-XML Border/PrintSpace into ALTO PrintSpace and Margins.

    Sets the page WIDTH/HEIGHT, then creates an ALTO PrintSpace from the
    PAGE PrintSpace (or Border, or the full image, whichever exists) and,
    when Border and PrintSpace differ, derives the four rectangular
    Margin elements from their bounding-box difference.

    Returns:
        the newly created ALTO ``PrintSpace`` element.
    """
    page_width = self.page_page.imageWidth
    page_height = self.page_page.imageHeight
    setxml(self.alto_page, 'WIDTH', page_width)
    setxml(self.alto_page, 'HEIGHT', page_height)
    page_border = self.page_page.get_Border()
    page_pspace = self.page_page.get_PrintSpace()
    # fall back between Border and PrintSpace if only one is annotated;
    # when both are missing, PrintSpace simply spans the full image
    if page_pspace is None and page_border is not None:
        self.logger.warning(
            "PAGE-XML has Border but no PrintSpace - Margins will be empty"
        )
        page_pspace = page_border
    elif page_border is None and page_pspace is not None:
        self.logger.warning(
            "PAGE-XML has PrintSpace but no Border - Margins will be empty"
        )
        page_border = page_pspace
    elif page_border is None and page_pspace is None:
        self.logger.warning(
            "PAGE-XML has neither Border nor PrintSpace - PrintSpace will fill the image"
        )
        alto_pspace = ET.SubElement(self.alto_page, 'PrintSpace')
        setxml(alto_pspace, 'VPOS', 0)
        setxml(alto_pspace, 'HPOS', 0)
        setxml(alto_pspace, 'HEIGHT', page_height)
        setxml(alto_pspace, 'WIDTH', page_width)
        return alto_pspace
    alto_pspace = ET.SubElement(self.alto_page, 'PrintSpace')
    set_alto_xywh_from_coords(alto_pspace, page_pspace)
    set_alto_shape_from_coords(alto_pspace, page_pspace)
    if page_border is not page_pspace:
        # distinct Border and PrintSpace: the area between their
        # bounding boxes is carved into four margins as sketched below
        bmin_x, bmin_y, bmax_x, bmax_y = bbox_from_points(
            page_border.get_Coords().points)
        pmin_x, pmin_y, pmax_x, pmax_y = bbox_from_points(
            page_pspace.get_Coords().points)
        #
        # ╔═══════╗      ╔═══════╗   ╔╗ ╔══╗
        # ║┌───┐  ║      ╚═══════╝   ║║ ║  ║             ┌───┐
        # ║│   │  ║  →       +       ║║ ║  ║ (margins) + │   │ (pspace)
        # ║└───┘  ║      ╔═══════╗   ║║ ║  ║             └───┘
        # ║       ║      ║       ║   ║║ ║  ║
        # ╚═══════╝      ╚═══════╝   ╚╝ ╚══╝
        #
        # top margin: full Border width, down to the PrintSpace top
        alto_topmargin = ET.SubElement(self.alto_page, 'TopMargin')
        setxml(alto_topmargin, 'VPOS', bmin_y)
        setxml(alto_topmargin, 'HPOS', bmin_x)
        setxml(alto_topmargin, 'HEIGHT', pmin_y - bmin_y)
        setxml(alto_topmargin, 'WIDTH', bmax_x - bmin_x)
        # left margin: full Border height, up to the PrintSpace left edge
        alto_leftmargin = ET.SubElement(self.alto_page, 'LeftMargin')
        setxml(alto_leftmargin, 'VPOS', bmin_y)
        setxml(alto_leftmargin, 'HPOS', bmin_x)
        setxml(alto_leftmargin, 'HEIGHT', bmax_y - bmin_y)
        setxml(alto_leftmargin, 'WIDTH', pmin_x - bmin_x)
        # right margin: full Border height, from the PrintSpace right edge
        alto_rightmargin = ET.SubElement(self.alto_page, 'RightMargin')
        setxml(alto_rightmargin, 'VPOS', bmin_y)
        setxml(alto_rightmargin, 'HPOS', pmax_x)
        setxml(alto_rightmargin, 'HEIGHT', bmax_y - bmin_y)
        setxml(alto_rightmargin, 'WIDTH', bmax_x - pmax_x)
        # bottom margin: full Border width, below the PrintSpace bottom
        alto_bottommargin = ET.SubElement(self.alto_page, 'BottomMargin')
        setxml(alto_bottommargin, 'VPOS', pmax_y)
        setxml(alto_bottommargin, 'HPOS', bmin_x)
        setxml(alto_bottommargin, 'HEIGHT', bmax_y - pmax_y)
        setxml(alto_bottommargin, 'WIDTH', bmax_x - bmin_x)
    return alto_pspace
def process(self):
    """Performs heuristic page frame detection (cropping) on the workspace.

    Open and deserialize PAGE input files and their respective images.
    (Input should be deskewed already.) Retrieve the raw (non-binarized,
    uncropped) page image.

    Detect line segments via edge gradients, and cluster them into
    contiguous horizontal and vertical lines if possible. If candidates
    which are located at the margin and long enough (covering a large
    fraction of the page) exist on all four sides, then pick the best
    (i.e. thickest, longest and inner-most) one on each side and use
    their intersections as border points.

    Otherwise, first try to detect a ruler (i.e. image segment depicting
    a rule placed on the scan/photo for scale references) via
    thresholding and contour detection, identifying a single large
    rectangular region with a certain aspect ratio. Suppress (mask) any
    such segment during further calculations.

    Next in that line, try to detect text segments on the page. For that
    purpose, get the gradient of grayscale image, threshold and
    morphologically close it, then determine contours to define
    approximate text boxes. Merge these into columns, filtering
    candidates too small or entirely in the margin areas. Finally, merge
    the remaining columns across short gaps. If only one column remains,
    and it covers a significant fraction of the page, pick that segment
    as solution.

    Otherwise, keep the border points derived from line segments
    (intersecting with the full image on each side without line
    candidates).

    Lastly, map coordinates to the original (undeskewed) image and
    intersect the border polygon with the full image frame. Use that to
    define the page's Border.

    Moreover, crop (and mask) the image accordingly, and reference the
    resulting image file as AlternativeImage in the Page element.

    Add the new image file to the workspace along with the output
    fileGrp, and using a file ID with suffix ``.IMG-CROP`` along with
    further identification of the input element.

    Produce new output files by serialising the resulting hierarchy.
    """
    assert_file_grp_cardinality(self.input_file_grp, 1)
    assert_file_grp_cardinality(self.output_file_grp, 1)
    self.logger = getLogger('processor.AnybaseocrCropper')
    for (n, input_file) in enumerate(self.input_files):
        page_id = input_file.pageId or input_file.ID
        self.logger.info("INPUT FILE %i / %s", n, page_id)
        pcgts = page_from_file(self.workspace.download_file(input_file))
        self.add_metadata(pcgts)
        page = pcgts.get_Page()
        # Check for existing Border --> already cropped
        border = page.get_Border()
        if border:
            left, top, right, bottom = bbox_from_points(
                border.get_Coords().points)
            self.logger.warning('Overwriting existing Border: %i:%i,%i:%i',
                                left, top, right, bottom)
        # (no need to re-fetch the Page here - `page` is still current)
        page_image, page_coords, page_image_info = self.workspace.image_from_page(
            page, page_id,  # should be deskewed already
            feature_filter='cropped,binarized,grayscale_normalized')
        # derive a zoom factor (300 DPI reference) from parameter
        # override, image metadata, or default to 1
        if self.parameter['dpi'] > 0:
            zoom = 300.0 / self.parameter['dpi']
        elif page_image_info.resolution != 1:
            dpi = page_image_info.resolution
            if page_image_info.resolutionUnit == 'cm':
                dpi *= 2.54
            self.logger.info('Page "%s" uses %f DPI', page_id, dpi)
            zoom = 300.0 / dpi
        else:
            zoom = 1
        self._process_page(page, page_image, page_coords, input_file, zoom)
        file_id = make_file_id(input_file, self.output_file_grp)
        pcgts.set_pcGtsId(file_id)
        self.workspace.add_file(ID=file_id,
                                file_grp=self.output_file_grp,
                                pageId=input_file.pageId,
                                mimetype=MIMETYPE_PAGE,
                                local_filename=os.path.join(
                                    self.output_file_grp, file_id + '.xml'),
                                content=to_xml(pcgts).encode('utf-8'))
def process(self):
    """Performs page cropping with Tesseract on the workspace.

    Open and deserialize PAGE input files and their respective images.
    Set up Tesseract to detect text blocks on each page, and find
    the largest coordinate extent spanning all of them. Use this
    extent in defining a Border, and add that to the page.

    Moreover, crop the original image accordingly, and reference the
    resulting image file as AlternativeImage in the Page element.

    Add the new image file to the workspace along with the output
    fileGrp, and using a file ID with suffix ``.IMG-CROP`` along with
    further identification of the input element.

    Produce new output files by serialising the resulting hierarchy.
    """
    LOG = getLogger('processor.TesserocrCrop')
    assert_file_grp_cardinality(self.input_file_grp, 1)
    assert_file_grp_cardinality(self.output_file_grp, 1)
    padding = self.parameter['padding']
    with tesserocr.PyTessBaseAPI(path=TESSDATA_PREFIX) as tessapi:
        # disable table detection here (tables count as text blocks),
        # because we do not want to risk confusing the spine with
        # a column separator and thus creeping into a neighbouring
        # page:
        tessapi.SetVariable("textord_tabfind_find_tables", "0")
        for (n, input_file) in enumerate(self.input_files):
            page_id = input_file.pageId or input_file.ID
            LOG.info("INPUT FILE %i / %s", n, page_id)
            # compute the output file ID up-front: it is needed for the
            # PAGE result below even when no valid extent is found
            # (previously it was only assigned in the success branch,
            # so the error path raised NameError)
            file_id = make_file_id(input_file, self.output_file_grp)
            pcgts = page_from_file(
                self.workspace.download_file(input_file))
            self.add_metadata(pcgts)
            page = pcgts.get_Page()
            # warn of existing Border:
            border = page.get_Border()
            if border:
                left, top, right, bottom = bbox_from_points(
                    border.get_Coords().points)
                LOG.warning('Overwriting existing Border: %i:%i,%i:%i',
                            left, top, right, bottom)
            page_image, page_xywh, page_image_info = self.workspace.image_from_page(
                page, page_id,
                # image must not have been cropped already,
                # abort if no such image can be produced:
                feature_filter='cropped')
            # determine the effective DPI: parameter override wins,
            # then image metadata, else let Tesseract estimate it
            if self.parameter['dpi'] > 0:
                dpi = self.parameter['dpi']
                LOG.info(
                    "Page '%s' images will use %d DPI from parameter override",
                    page_id, dpi)
            elif page_image_info.resolution != 1:
                dpi = page_image_info.resolution
                if page_image_info.resolutionUnit == 'cm':
                    dpi = round(dpi * 2.54)
                LOG.info(
                    "Page '%s' images will use %d DPI from image meta-data",
                    page_id, dpi)
            else:
                dpi = 0
                LOG.info(
                    "Page '%s' images will use DPI estimated from segmentation",
                    page_id)
            if dpi:
                tessapi.SetVariable('user_defined_dpi', str(dpi))
                zoom = 300 / dpi
            else:
                zoom = 1
            # warn of existing segmentation:
            regions = page.get_TextRegion()
            if regions:
                min_x = page_image.width
                min_y = page_image.height
                max_x = 0
                max_y = 0
                for region in regions:
                    left, top, right, bottom = bbox_from_points(
                        region.get_Coords().points)
                    min_x = min(min_x, left)
                    min_y = min(min_y, top)
                    max_x = max(max_x, right)
                    max_y = max(max_y, bottom)
                LOG.warning(
                    'Ignoring extent from existing TextRegions: %i:%i,%i:%i',
                    min_x, max_x, min_y, max_y)
            LOG.debug("Cropping with Tesseract")
            tessapi.SetImage(page_image)
            # PSM.SPARSE_TEXT: get as much text as possible in no particular order
            # PSM.AUTO (default): includes tables (dangerous)
            tessapi.SetPageSegMode(tesserocr.PSM.SPARSE_TEXT)
            #
            # helper variables for saving the box coordinates
            #
            min_x = page_image.width
            min_y = page_image.height
            max_x = 0
            max_y = 0
            # iterate over all text blocks and compare their
            # bbox extent to the running min and max values
            for component in tessapi.GetComponentImages(
                    tesserocr.RIL.BLOCK, True):
                image, xywh, index, _ = component
                #
                # the region reference in the reading order element
                #
                ID = "region%04d" % index
                left, top, right, bottom = bbox_from_xywh(xywh)
                LOG.debug("Detected text region '%s': %i:%i,%i:%i",
                          ID, left, right, top, bottom)
                # filter region results:
                bin_bbox = image.getbbox()
                if not bin_bbox:
                    # this does happen!
                    LOG.info(
                        "Ignoring region '%s' because its binarization is empty", ID)
                    continue
                width = bin_bbox[2] - bin_bbox[0]
                if width < 25 / zoom:
                    # we must be conservative here: page numbers are tiny regions, too!
                    LOG.info(
                        "Ignoring region '%s' because its width is too small (%d)",
                        ID, width)
                    continue
                height = bin_bbox[3] - bin_bbox[1]
                if height < 25 / zoom:
                    # we must be conservative here: page numbers are tiny regions, too!
                    LOG.debug(
                        "Ignoring region '%s' because its height is too small (%d)",
                        ID, height)
                    continue
                min_x = min(min_x, left)
                min_y = min(min_y, top)
                max_x = max(max_x, right)
                max_y = max(max_y, bottom)
            LOG.info("Updated page border: %i:%i,%i:%i",
                     min_x, max_x, min_y, max_y)
            #
            # set the identified page border
            #
            if min_x < max_x and min_y < max_y:
                # add padding:
                min_x = max(min_x - padding, 0)
                max_x = min(max_x + padding, page_image.width)
                min_y = max(min_y - padding, 0)
                max_y = min(max_y + padding, page_image.height)
                LOG.info("Padded page border: %i:%i,%i:%i",
                         min_x, max_x, min_y, max_y)
                polygon = polygon_from_bbox(min_x, min_y, max_x, max_y)
                polygon = coordinates_for_segment(polygon, page_image,
                                                  page_xywh)
                polygon = polygon_for_parent(polygon, page)
                border = BorderType(
                    Coords=CoordsType(points_from_polygon(polygon)))
                # intersection with parent could have changed bbox,
                # so recalculate:
                bbox = bbox_from_polygon(
                    coordinates_of_segment(border, page_image, page_xywh))
                # update PAGE (annotate border):
                page.set_Border(border)
                # update METS (add the image file):
                page_image = crop_image(page_image, box=bbox)
                page_xywh['features'] += ',cropped'
                file_path = self.workspace.save_image_file(
                    page_image, file_id + '.IMG-CROP',
                    page_id=input_file.pageId,
                    file_grp=self.output_file_grp)
                # update PAGE (reference the image file):
                page.add_AlternativeImage(
                    AlternativeImageType(filename=file_path,
                                         comments=page_xywh['features']))
            else:
                LOG.error("Cannot find valid extent for page '%s'", page_id)
            pcgts.set_pcGtsId(file_id)
            self.workspace.add_file(ID=file_id,
                                    file_grp=self.output_file_grp,
                                    pageId=input_file.pageId,
                                    mimetype=MIMETYPE_PAGE,
                                    local_filename=os.path.join(
                                        self.output_file_grp,
                                        file_id + '.xml'),
                                    content=to_xml(pcgts))
def process(self):
    """Performs border detection on the workspace.

    Splits the output file group into a PAGE group and an image group
    (falling back to a default image group if only one is given), then
    crops each input page at the configured operation level and
    serializes the results.
    """
    try:
        # report via the logger instead of a stray debugging print()
        LOG.info("OUTPUT FILE %s", self.output_file_grp)
        self.page_grp, self.image_grp = self.output_file_grp.split(',')
    except ValueError:
        self.page_grp = self.output_file_grp
        self.image_grp = FALLBACK_IMAGE_GRP
        LOG.info(
            "No output file group for images specified, falling back to '%s'",
            FALLBACK_IMAGE_GRP)
    oplevel = self.parameter['operation_level']
    for (n, input_file) in enumerate(self.input_files):
        page_id = input_file.pageId or input_file.ID
        LOG.info("INPUT FILE %i / %s", n, page_id)
        pcgts = page_from_file(self.workspace.download_file(input_file))
        page = pcgts.get_Page()
        # Check for existing Border --> already cropped
        border = page.get_Border()
        if border:
            left, top, right, bottom = bbox_from_points(
                border.get_Coords().points)
            LOG.warning('Overwriting existing Border: %i:%i,%i:%i',
                        left, top, right, bottom)
        # record this processing step (with its parameters) in the PAGE metadata
        metadata = pcgts.get_Metadata()
        metadata.add_MetadataItem(
            MetadataItemType(
                type_="processingStep",
                name=self.ocrd_tool['steps'][0],
                value=TOOL,
                Labels=[
                    LabelsType(
                        # externalRef="parameters",
                        Label=[
                            LabelType(type_=name, value=self.parameter[name])
                            for name in self.parameter.keys()
                        ])
                ]))
        # (no need to re-fetch the Page here - `page` is still current)
        page_image, page_xywh, page_image_info = self.workspace.image_from_page(
            page, page_id, feature_filter='cropped')
        if oplevel == "page":
            self._process_segment(page_image, page, page_xywh, page_id,
                                  input_file, n)
        else:
            LOG.warning('Operation level %s, but should be "page".', oplevel)
            break
        file_id = input_file.ID.replace(self.input_file_grp, self.page_grp)
        # Use input_file's basename for the new file -
        # this way the files retain the same basenames:
        if file_id == input_file.ID:
            file_id = concat_padded(self.page_grp, n)
        self.workspace.add_file(ID=file_id,
                                file_grp=self.page_grp,
                                pageId=input_file.pageId,
                                mimetype=MIMETYPE_PAGE,
                                local_filename=os.path.join(
                                    self.page_grp, file_id + '.xml'),
                                content=to_xml(pcgts).encode('utf-8'))
def page2tsv(page_xml_file, tsv_out_file, purpose, image_url, ner_rest_endpoint, ned_rest_endpoint,
             noproxy, scale_factor, ned_threshold, min_confidence, max_confidence, ned_priority):
    """Convert a PAGE-XML file into (or append to) a TSV file, optionally running NER/NED.

    Collects all word (or line-level) text tokens with their scaled
    coordinates, sorts them into visual reading order per region, and
    appends them to ``tsv_out_file``. For purpose "NERD", tokens are
    additionally sent to the NER (and optionally NED) REST endpoints;
    for purpose "OCR", tokens are joined back into per-line text.

    Raises:
        RuntimeError: if ``purpose`` is neither "NERD" nor "OCR".
    """
    if purpose == "NERD":
        out_columns = ['No.', 'TOKEN', 'NE-TAG', 'NE-EMB', 'ID', 'url_id', 'left', 'right', 'top', 'bottom', 'conf']
    elif purpose == "OCR":
        out_columns = ['TEXT', 'url_id', 'left', 'right', 'top', 'bottom', 'conf', 'line_id']
        if min_confidence is not None and max_confidence is not None:
            out_columns += ['ocrconf']
    else:
        raise RuntimeError("Unknown purpose.")

    if noproxy:
        os.environ['no_proxy'] = '*'

    # if the TSV already exists we append to it, numbering this document
    # by the count of URLs already recorded; otherwise write the header
    urls = []
    if os.path.exists(tsv_out_file):
        parts = extract_doc_links(tsv_out_file)
        urls = [part['url'] for part in parts]
    else:
        pd.DataFrame([], columns=out_columns).to_csv(tsv_out_file, sep="\t", quoting=3, index=False)

    pcgts = parse(page_xml_file)
    tsv = []
    line_info = []
    for region_idx, region in enumerate(pcgts.get_Page().get_AllRegions(classes=['Text'], order='reading-order')):
        for text_line in region.get_TextLine():
            left, top, right, bottom = [int(scale_factor * x) for x in bbox_from_points(text_line.get_Coords().points)]

            if min_confidence is not None and max_confidence is not None:
                conf = np.max([textequiv.conf for textequiv in text_line.get_TextEquiv()])
            else:
                conf = np.nan

            line_info.append((len(urls), left, right, top, bottom, conf, text_line.id))

            words = [word for word in text_line.get_Word()]

            if len(words) <= 0:
                # no Word segmentation: fall back to line-level text
                for text_equiv in text_line.get_TextEquiv():
                    # transform OCR coordinates using `scale_factor` to derive
                    # correct coordinates for the web presentation image
                    left, top, right, bottom = [int(scale_factor * x) for x in
                                                bbox_from_points(text_line.get_Coords().points)]

                    tsv.append((region_idx, len(line_info) - 1, left + (right - left) / 2.0,
                                text_equiv.get_Unicode(), len(urls), left, right, top, bottom, text_line.id))
            else:
                for word in words:
                    for text_equiv in word.get_TextEquiv():
                        # transform OCR coordinates using `scale_factor` to derive
                        # correct coordinates for the web presentation image
                        left, top, right, bottom = [int(scale_factor * x) for x in
                                                    bbox_from_points(word.get_Coords().points)]

                        tsv.append((region_idx, len(line_info) - 1, left + (right - left) / 2.0,
                                    text_equiv.get_Unicode(), len(urls), left, right, top, bottom, text_line.id))

    line_info = pd.DataFrame(line_info, columns=['url_id', 'left', 'right', 'top', 'bottom', 'conf', 'line_id'])

    if min_confidence is not None and max_confidence is not None:
        line_info['ocrconf'] = line_info.conf.map(lambda x: get_conf_color(x, min_confidence, max_confidence))

    tsv = pd.DataFrame(tsv, columns=['rid', 'line', 'hcenter'] +
                       ['TEXT', 'url_id', 'left', 'right', 'top', 'bottom', 'line_id'])

    if len(tsv) == 0:
        return

    with open(tsv_out_file, 'a') as f:
        f.write('# ' + image_url + '\n')

    # compute each line's vertical center, then sort tokens within each
    # region top-to-bottom (by line center) and left-to-right (by token center)
    vlinecenter = pd.DataFrame(tsv[['line', 'top']].groupby('line', sort=False).mean().top +
                               (tsv[['line', 'bottom']].groupby('line', sort=False).mean().bottom -
                                tsv[['line', 'top']].groupby('line', sort=False).mean().top) / 2,
                               columns=['vlinecenter'])

    tsv = tsv.merge(vlinecenter, left_on='line', right_index=True)

    regions = [region.sort_values(['vlinecenter', 'hcenter']) for rid, region in tsv.groupby('rid', sort=False)]

    tsv = pd.concat(regions)

    if purpose == 'NERD':
        tsv['No.'] = 0
        tsv['NE-TAG'] = 'O'
        tsv['NE-EMB'] = 'O'
        tsv['ID'] = '-'
        tsv['conf'] = '-'

        tsv = tsv.rename(columns={'TEXT': 'TOKEN'})
    elif purpose == 'OCR':
        # re-join tokens into one text per line
        tsv = pd.DataFrame([(line, " ".join(part.TEXT.to_list())) for line, part in tsv.groupby('line')],
                           columns=['line', 'TEXT'])
        tsv = tsv.merge(line_info, left_on='line', right_index=True)

    tsv = tsv[out_columns].reset_index(drop=True)

    try:
        if purpose == 'NERD' and ner_rest_endpoint is not None:
            tsv, ner_result = ner(tsv, ner_rest_endpoint)

            if ned_rest_endpoint is not None:
                tsv, _ = ned(tsv, ner_result, ned_rest_endpoint, threshold=ned_threshold, priority=ned_priority)

        tsv.to_csv(tsv_out_file, sep="\t", quoting=3, index=False, mode='a', header=False)
    except requests.HTTPError as e:
        print(e)
def process(self):
    """Performs page cropping with Tesseract on the workspace.

    Open and deserialize PAGE input files and their respective images.
    Set up Tesseract to detect text blocks on each page, and find
    the largest coordinate extent spanning all of them. Use this
    extent in defining a Border, and add that to the page.

    Moreover, crop the original image accordingly, and reference the
    resulting image file as AlternativeImage in the Page element.

    Add the new image file to the workspace along with the output
    fileGrp, and using a file ID with suffix ``.IMG-CROP`` along with
    further identification of the input element.

    Produce new output files by serialising the resulting hierarchy.
    """
    LOG = getLogger('processor.TesserocrCrop')
    assert_file_grp_cardinality(self.input_file_grp, 1)
    assert_file_grp_cardinality(self.output_file_grp, 1)
    with tesserocr.PyTessBaseAPI(path=TESSDATA_PREFIX) as tessapi:
        # disable table detection here (tables count as text blocks),
        # because we do not want to risk confusing the spine with
        # a column separator and thus creeping into a neighbouring
        # page:
        tessapi.SetVariable("textord_tabfind_find_tables", "0")
        for (n, input_file) in enumerate(self.input_files):
            # compute the output file ID up-front: it is passed to
            # process_page and reused for the PAGE result below
            file_id = make_file_id(input_file, self.output_file_grp)
            page_id = input_file.pageId or input_file.ID
            LOG.info("INPUT FILE %i / %s", n, page_id)
            pcgts = page_from_file(self.workspace.download_file(input_file))
            self.add_metadata(pcgts)
            page = pcgts.get_Page()
            # warn of existing Border:
            border = page.get_Border()
            if border:
                left, top, right, bottom = bbox_from_points(border.get_Coords().points)
                LOG.warning('Overwriting existing Border: %i:%i,%i:%i',
                            left, top, right, bottom)
            page_image, page_xywh, page_image_info = self.workspace.image_from_page(
                page, page_id,
                # image must not have been cropped already,
                # abort if no such image can be produced:
                feature_filter='cropped')
            # determine the effective DPI: parameter override wins,
            # then image metadata, else let Tesseract estimate it
            if self.parameter['dpi'] > 0:
                dpi = self.parameter['dpi']
                LOG.info("Page '%s' images will use %d DPI from parameter override", page_id, dpi)
            elif page_image_info.resolution != 1:
                dpi = page_image_info.resolution
                if page_image_info.resolutionUnit == 'cm':
                    dpi = round(dpi * 2.54)
                LOG.info("Page '%s' images will use %d DPI from image meta-data", page_id, dpi)
            else:
                dpi = 0
                LOG.info("Page '%s' images will use DPI estimated from segmentation", page_id)
            if dpi:
                tessapi.SetVariable('user_defined_dpi', str(dpi))
                # zoom is the scale relative to the 300 DPI reference
                zoom = 300 / dpi
            else:
                zoom = 1
            # detect/combine the crop extent, then annotate Border and image
            bounds = self.estimate_bounds(page, page_image, tessapi, zoom)
            self.process_page(page, page_image, page_xywh, bounds, file_id, input_file.pageId)
            pcgts.set_pcGtsId(file_id)
            self.workspace.add_file(
                ID=file_id,
                file_grp=self.output_file_grp,
                pageId=input_file.pageId,
                mimetype=MIMETYPE_PAGE,
                local_filename=os.path.join(self.output_file_grp, file_id + '.xml'),
                content=to_xml(pcgts))