def process(self):
    """Evaluate OCR text against ground truth with dinglehopper.

    Pairs up the GT and OCR PAGE files of the two input file groups page
    by page (via ``zip_input_files``), runs dinglehopper's ``cli_process``
    on each pair, and adds the resulting HTML and JSON reports to the
    output file group.
    """
    assert_file_grp_cardinality(self.input_file_grp, 2, "GT and OCR")
    assert_file_grp_cardinality(self.output_file_grp, 1)
    log = getLogger("processor.OcrdDinglehopperEvaluate")

    metrics = self.parameter["metrics"]
    textequiv_level = self.parameter["textequiv_level"]

    # NOTE: the former `gt_grp, ocr_grp = self.input_file_grp.split(",")`
    # was removed - both names were unused (pairing is done by
    # zip_input_files), and cardinality is already asserted above.
    input_file_tuples = self.zip_input_files(on_error='abort')
    for n, (gt_file, ocr_file) in enumerate(input_file_tuples):
        if not gt_file or not ocr_file:
            # file/page was not found in this group
            continue
        gt_file = self.workspace.download_file(gt_file)
        ocr_file = self.workspace.download_file(ocr_file)
        page_id = gt_file.pageId

        log.info("INPUT FILES %i / %s↔ %s", n, gt_file, ocr_file)

        file_id = make_file_id(ocr_file, self.output_file_grp)
        report_prefix = os.path.join(self.output_file_grp, file_id)

        # Process the files
        try:
            os.mkdir(self.output_file_grp)
        except FileExistsError:
            pass
        cli_process(
            gt_file.local_filename,
            ocr_file.local_filename,
            report_prefix,
            metrics=metrics,
            textequiv_level=textequiv_level,
        )

        # Add reports to the workspace
        for report_suffix, mimetype in [
            [".html", "text/html"],
            [".json", "application/json"],
        ]:
            self.workspace.add_file(
                ID=file_id + report_suffix,
                file_grp=self.output_file_grp,
                pageId=page_id,
                mimetype=mimetype,
                local_filename=report_prefix + report_suffix,
            )

        # Clear cache between files
        levenshtein_matrix_cache_clear()
def process(self):
    """Segment pages into regions using a Mask R-CNN model.

    For each input PAGE file, fetch the raw page image (filtering out
    derived variants), optionally fetch a previously clipped pixel mask,
    read the DPI from the image meta-data if available, and delegate the
    actual segmentation to ``_process_segment``. Finally, add the updated
    PAGE-XML to the output file group.
    """
    assert_file_grp_cardinality(self.input_file_grp, 1)
    assert_file_grp_cardinality(self.output_file_grp, 1)
    LOG = getLogger('processor.AnybaseocrBlockSegmenter')
    if not tf.test.is_gpu_available():
        LOG.warning("Tensorflow cannot detect CUDA installation. Running without GPU will be slow.")

    for input_file in self.input_files:
        pcgts = page_from_file(self.workspace.download_file(input_file))
        self.add_metadata(pcgts)
        page = pcgts.get_Page()
        page_id = input_file.pageId or input_file.ID

        # todo rs: why not cropped?
        page_image, page_xywh, page_image_info = self.workspace.image_from_page(
            page, page_id,
            feature_filter='binarized,deskewed,cropped,clipped,non_text')
        # try to load pixel masks
        try:
            # todo rs: this combination only works for tiseg with use_deeplr=true
            mask_image, _, _ = self.workspace.image_from_page(
                page, page_id,
                feature_selector='clipped',
                feature_filter='binarized,deskewed,cropped,non_text')
        except Exception:
            # fix: was a bare `except:`, which also swallows
            # KeyboardInterrupt/SystemExit; a missing mask is not fatal
            mask_image = None
        if page_image_info.resolution != 1:
            dpi = page_image_info.resolution
            if page_image_info.resolutionUnit == 'cm':
                dpi = round(dpi * 2.54)
        else:
            dpi = None
        self._process_segment(page_image, page, page_xywh, page_id,
                              input_file, mask_image, dpi)

        file_id = make_file_id(input_file, self.output_file_grp)
        pcgts.set_pcGtsId(file_id)
        self.workspace.add_file(
            ID=file_id,
            file_grp=self.output_file_grp,
            pageId=input_file.pageId,
            mimetype=MIMETYPE_PAGE,
            local_filename=os.path.join(self.output_file_grp, file_id + '.xml'),
            content=to_xml(pcgts).encode('utf-8'))
def process(self):
    """Classify the layout type of each page image with a Keras model.

    Loads the model and its class mapping, resizes each binarized page
    image to the model's input size, predicts via ``start_test``, and
    records the result in the METS structural links.
    """
    LOG = getLogger('OcrdAnybaseocrLayoutAnalyser')
    if not tf.test.is_gpu_available():
        LOG.error("Your system has no CUDA installed. No GPU detected.")
        # sys.exit(1)
    assert_file_grp_cardinality(self.input_file_grp, 1)
    assert_file_grp_cardinality(self.output_file_grp, 1)
    model_path = Path(self.parameter['model_path'])
    class_mapper_path = Path(self.parameter['class_mapping_path'])
    if not Path(model_path).is_file():
        LOG.error("""\
Layout Classfication model was not found at '%s'. Make sure the `model_path` parameter
points to the local model path.
model can be downloaded from http://url
""" % model_path)
        sys.exit(1)
    else:
        LOG.info('Loading model from file %s', model_path)
        model = self.create_model(str(model_path))

    # load the mapping (fix: close the file deterministically via `with`
    # instead of leaking the open handle)
    with open(str(class_mapper_path), "rb") as pickle_in:
        class_indices = pickle.load(pickle_in)
    label_mapping = dict((v, k) for k, v in class_indices.items())

    # print("INPUT FILE HERE",self.input_files)
    for (n, input_file) in enumerate(self.input_files):
        pcgts = page_from_file(self.workspace.download_file(input_file))
        fname = pcgts.get_Page().imageFilename
        page_id = input_file.pageId or input_file.ID
        self.add_metadata(pcgts)
        page = pcgts.get_Page()
        LOG.info("INPUT FILE %s", input_file.pageId or input_file.ID)
        page_image, page_xywh, page_image_info = self.workspace.image_from_page(
            page, page_id, feature_selector='binarized')
        # model expects a (1, 600, 500, 1) float array scaled to [0, 1]
        img_array = ocrolib.pil2array(
            page_image.resize((500, 600), Image.ANTIALIAS))
        img_array = img_array * 1. / 255.
        img_array = img_array[np.newaxis, :, :, np.newaxis]
        results = self.start_test(model, img_array, fname, label_mapping)
        LOG.info(results)
        self.workspace.mets.set_physical_page_for_file(
            "PHYS_000" + str(n), input_file)
        self.create_logmap_smlink(pcgts)
        self.write_to_mets(results, "PHYS_000" + str(n))
def test_assert_file_grp_cardinality(self):
    """Check assert_file_grp_cardinality for mismatching and matching counts."""
    # two groups given but five expected: must raise with a plural message
    with self.assertRaisesRegex(AssertionError, "Expected exactly 5 output file groups, but '.'FOO', 'BAR'.' has 2"):
        assert_file_grp_cardinality('FOO,BAR', 5)
    # two groups given but one expected: must raise with a singular message
    with self.assertRaisesRegex(AssertionError, "Expected exactly 1 output file group, but '.'FOO', 'BAR'.' has 2"):
        assert_file_grp_cardinality('FOO,BAR', 1)
    # two groups given, two expected: must not raise
    assert_file_grp_cardinality('FOO,BAR', 2)
    # the optional third argument adds a qualifier ("foo bar") to the message
    with self.assertRaisesRegex(AssertionError, r"Expected exactly 1 output file group .foo bar., but '.'FOO', 'BAR'.' has 2"):
        assert_file_grp_cardinality('FOO,BAR', 1, 'foo bar')
def process(self):
    """Copy each input file to the output file group unchanged.

    PAGE-XML input is re-identified and serialised from the in-memory
    tree; any other MIME type is copied byte-by-byte. For image input,
    an additional PAGE-XML file referencing the copied image is written.
    """
    LOG = getLogger('ocrd.dummy')
    assert_file_grp_cardinality(self.input_file_grp, 1)
    assert_file_grp_cardinality(self.output_file_grp, 1)
    for input_file in self.input_files:
        input_file = self.workspace.download_file(input_file)
        file_id = make_file_id(input_file, self.output_file_grp)
        ext = MIME_TO_EXT.get(input_file.mimetype, '')
        local_filename = join(self.output_file_grp, file_id + ext)
        # fix: the file was already downloaded (and rebound) above -
        # no need for a second workspace.download_file() round-trip
        pcgts = page_from_file(input_file)
        pcgts.set_pcGtsId(file_id)
        self.add_metadata(pcgts)
        LOG.info("cp %s %s # %s -> %s", input_file.url, local_filename, input_file.ID, file_id)
        if input_file.mimetype == MIMETYPE_PAGE:
            # Source file is PAGE-XML: Write out in-memory PcGtsType
            self.workspace.add_file(
                ID=file_id,
                file_grp=self.output_file_grp,
                pageId=input_file.pageId,
                mimetype=input_file.mimetype,
                local_filename=local_filename,
                content=to_xml(pcgts).encode('utf-8'))
        else:
            # Source file is not PAGE-XML: Copy byte-by-byte
            with open(input_file.local_filename, 'rb') as f:
                content = f.read()
            self.workspace.add_file(
                ID=file_id,
                file_grp=self.output_file_grp,
                pageId=input_file.pageId,
                mimetype=input_file.mimetype,
                local_filename=local_filename,
                content=content)
            if input_file.mimetype.startswith('image/'):
                # write out the PAGE-XML representation for this image
                page_file_id = file_id + '_PAGE'
                pcgts.set_pcGtsId(page_file_id)
                pcgts.get_Page().set_imageFilename(local_filename)
                page_filename = join(self.output_file_grp, file_id + '.xml')
                LOG.info("Add PAGE-XML %s generated for %s at %s",
                         page_file_id, file_id, page_filename)
                self.workspace.add_file(
                    ID=page_file_id,
                    file_grp=self.output_file_grp,
                    pageId=input_file.pageId,
                    mimetype=MIMETYPE_PAGE,
                    local_filename=page_filename,
                    content=to_xml(pcgts).encode('utf-8'))
def process(self):
    """Detect text lines on each page, or within each text region.

    With ``operation_level=page`` the whole (binarized, deskewed) page
    image is processed at once (with a warning); otherwise each
    TextRegion is cut out and processed separately. The resulting
    PAGE-XML is added to the output file group.
    """
    LOG = getLogger('OcrdAnybaseocrTextline')
    assert_file_grp_cardinality(self.input_file_grp, 1)
    assert_file_grp_cardinality(self.output_file_grp, 1)
    oplevel = self.parameter['operation_level']
    for idx, input_file in enumerate(self.input_files):
        page_id = input_file.pageId or input_file.ID
        pcgts = page_from_file(self.workspace.download_file(input_file))
        self.add_metadata(pcgts)
        page = pcgts.get_Page()
        LOG.info("INPUT FILE %s", input_file.pageId or input_file.ID)
        # work on the binarized and deskewed page image only
        page_image, page_xywh, _ = self.workspace.image_from_page(
            page, page_id, feature_selector='binarized,deskewed')

        if oplevel == 'page':
            LOG.warning("Operation level should be region.")
            self._process_segment(page_image, page, None, page_xywh,
                                  page_id, input_file, idx)
        else:
            text_regions = page.get_TextRegion()
            if not text_regions:
                LOG.warning("Page '%s' contains no text regions", page_id)
                # nothing to segment on this page; no output is written
                continue
            for region_no, region in enumerate(text_regions):
                region_image, region_xywh = self.workspace.image_from_segment(
                    region, page_image, page_xywh)
                self._process_segment(region_image, page, region, region_xywh,
                                      region.id, input_file, region_no)

        out_id = make_file_id(input_file, self.output_file_grp)
        pcgts.set_pcGtsId(out_id)
        self.workspace.add_file(
            ID=out_id,
            file_grp=self.output_file_grp,
            pageId=input_file.pageId,
            mimetype=MIMETYPE_PAGE,
            local_filename=os.path.join(self.output_file_grp, out_id + '.xml'),
            content=to_xml(pcgts).encode('utf-8'))
def process(self):
    """Performs border detection on the workspace.

    For each page, warn about (and effectively replace) an existing
    Border, fetch the binarized uncropped page image, and delegate
    cropping to ``_process_segment``. Only ``operation_level=page`` is
    supported; anything else raises.
    """
    assert_file_grp_cardinality(self.input_file_grp, 1)
    assert_file_grp_cardinality(self.output_file_grp, 1)
    LOG = getLogger('OcrdAnybaseocrCropper')
    oplevel = self.parameter['operation_level']

    for (n, input_file) in enumerate(self.input_files):
        page_id = input_file.pageId or input_file.ID
        LOG.info("INPUT FILE %i / %s", n, page_id)
        pcgts = page_from_file(self.workspace.download_file(input_file))
        self.add_metadata(pcgts)
        page = pcgts.get_Page()

        # Check for existing Border --> already cropped
        border = page.get_Border()
        if border:
            left, top, right, bottom = bbox_from_points(
                border.get_Coords().points)
            LOG.warning('Overwriting existing Border: %i:%i,%i:%i',
                        left, top, right, bottom)
        page = pcgts.get_Page()
        page_image, page_coords, page_image_info = self.workspace.image_from_page(
            page, page_id,
            feature_filter='cropped',
            feature_selector='binarized')  # should also be deskewed
        if oplevel == "page":
            self._process_segment(
                page_image, page, page_coords, page_id, input_file, n)
        else:
            # fix: the message was passed as (format, arg) tuple to the
            # Exception constructor instead of being interpolated
            raise Exception('Operation level %s, but should be "page".' % oplevel)

        file_id = make_file_id(input_file, self.output_file_grp)
        pcgts.set_pcGtsId(file_id)
        self.workspace.add_file(
            ID=file_id,
            file_grp=self.output_file_grp,
            pageId=input_file.pageId,
            mimetype=MIMETYPE_PAGE,
            local_filename=os.path.join(self.output_file_grp, file_id + '.xml'),
            content=to_xml(pcgts).encode('utf-8')
        )
def process(self):
    """Binarize page (or region) images and write back PAGE-XML results.

    On ``operation_level=page`` the whole page image is binarized;
    otherwise each Text/Table region image is processed separately
    (untested, see TODO below). The output PAGE file is added in any case.
    """
    assert_file_grp_cardinality(self.input_file_grp, 1)
    assert_file_grp_cardinality(self.output_file_grp, 1)
    oplevel = self.parameter['operation_level']
    LOG = getLogger('OcrdAnybaseocrBinarizer')
    for (n, input_file) in enumerate(self.input_files):
        file_id = make_file_id(input_file, self.output_file_grp)
        page_id = input_file.pageId or input_file.ID
        LOG.info("INPUT FILE %i / %s", n, page_id)
        pcgts = page_from_file(self.workspace.download_file(input_file))
        pcgts.set_pcGtsId(file_id)
        self.add_metadata(pcgts)
        page = pcgts.get_Page()
        # fetch the raw (not yet binarized) page image
        page_image, page_xywh, page_image_info = self.workspace.image_from_page(
            page, page_id, feature_filter="binarized")
        LOG.info("Binarizing on '%s' level in page '%s'", oplevel, page_id)
        if oplevel == "page":
            self._process_segment(page_image, page, page_xywh, page_id,
                                  input_file, n)
        else:
            regions = page.get_TextRegion() + page.get_TableRegion()
            if not regions:
                # NOTE: no `continue` here - the (unmodified) PAGE file
                # is still written below even without any regions
                LOG.warning("Page '%s' contains no text regions", page_id)
            for (k, region) in enumerate(regions):
                region_image, region_xywh = self.workspace.image_from_segment(
                    region, page_image, page_xywh)
                # TODO: not tested on regions
                self._process_segment(region_image, page, region_xywh,
                                      region.id, input_file,
                                      str(n) + "_" + str(k))
        self.workspace.add_file(ID=file_id,
                                file_grp=self.output_file_grp,
                                pageId=input_file.pageId,
                                mimetype=MIMETYPE_PAGE,
                                local_filename=os.path.join(
                                    self.output_file_grp, file_id + '.xml'),
                                content=to_xml(pcgts).encode('utf-8'))
def process(self):
    """Evaluate OCR text against ground truth with dinglehopper.

    Looks up the GT and OCR file for each physical page directly in the
    METS, runs dinglehopper's ``cli_process`` on the pair, and adds the
    resulting HTML and JSON reports to the output file group.
    """
    assert_file_grp_cardinality(self.input_file_grp, 2, 'GT and OCR')
    assert_file_grp_cardinality(self.output_file_grp, 1)
    # fix: `log` was used below but never defined (NameError at runtime)
    log = getLogger('processor.OcrdDinglehopperEvaluate')
    metrics = self.parameter['metrics']
    gt_grp, ocr_grp = self.input_file_grp.split(',')
    for n, page_id in enumerate(self.workspace.mets.physical_pages):
        # fix: guard against pages missing in either group instead of
        # raising IndexError on an empty find_files() result
        gt_files = self.workspace.mets.find_files(fileGrp=gt_grp, pageId=page_id)
        ocr_files = self.workspace.mets.find_files(fileGrp=ocr_grp, pageId=page_id)
        if not gt_files or not ocr_files:
            log.warning("Page '%s' is missing from group '%s', skipping",
                        page_id, gt_grp if not gt_files else ocr_grp)
            continue
        gt_file = self.workspace.download_file(gt_files[0])
        ocr_file = self.workspace.download_file(ocr_files[0])
        log.info("INPUT FILES %i / %s↔ %s", n, gt_file, ocr_file)
        file_id = make_file_id(ocr_file, self.output_file_grp)
        report_prefix = os.path.join(self.output_file_grp, file_id)

        # Process the files
        try:
            os.mkdir(self.output_file_grp)
        except FileExistsError:
            pass
        cli_process(
            gt_file.local_filename,
            ocr_file.local_filename,
            report_prefix,
            metrics=metrics)

        # Add reports to the workspace
        for report_suffix, mimetype in \
                [
                    ['.html', 'text/html'],
                    ['.json', 'application/json']
                ]:
            self.workspace.add_file(
                ID=file_id + report_suffix,
                file_grp=self.output_file_grp,
                pageId=page_id,
                mimetype=mimetype,
                local_filename=report_prefix + report_suffix)

        # Clear cache between files
        levenshtein_matrix_cache_clear()
def process(self):
    """Estimate and annotate the deskewing angle of each page.

    Works on the binarized, not yet deskewed page image; only
    ``operation_level=page`` is supported.
    """
    assert_file_grp_cardinality(self.input_file_grp, 1)
    assert_file_grp_cardinality(self.output_file_grp, 1)
    oplevel = self.parameter['operation_level']
    LOG = getLogger('OcrdAnybaseocrDeskewer')
    for (n, input_file) in enumerate(self.input_files):
        page_id = input_file.pageId or input_file.ID
        LOG.info("INPUT FILE %i / %s", n, page_id)
        pcgts = page_from_file(self.workspace.download_file(input_file))
        self.add_metadata(pcgts)
        page = pcgts.get_Page()
        angle = page.get_orientation()
        if angle:
            # presumably _process_segment() re-annotates the orientation
            # below - TODO confirm; also note %i truncates a float angle
            LOG.warning('Overwriting existing deskewing angle: %i', angle)
        page_image, page_xywh, page_image_info = self.workspace.image_from_page(
            page, page_id,
            feature_filter='deskewed',
            feature_selector='binarized')
        if oplevel == "page":
            self._process_segment(page_image, page, page_xywh, page_id,
                                  input_file, n)
        else:
            LOG.warning('Operation level %s, but should be "page".', oplevel)
            # NOTE: aborts the whole loop; no output is written for this
            # or any remaining page
            break
        file_id = make_file_id(input_file, self.output_file_grp)
        pcgts.set_pcGtsId(file_id)
        self.workspace.add_file(ID=file_id,
                                file_grp=self.output_file_grp,
                                pageId=input_file.pageId,
                                mimetype=MIMETYPE_PAGE,
                                local_filename=os.path.join(
                                    self.output_file_grp, file_id + '.xml'),
                                content=to_xml(pcgts).encode('utf-8'))
def process(self):
    """Score the text of each page and write back rated PAGE files.

    Delegates the actual rating to ``_process_page`` and serialises the
    updated PCGTS tree into the output file group.
    """
    # fix: LOG was referenced below but never defined in this method;
    # use the same logger name as the sibling KerasRate processor
    LOG = getLogger('processor.KerasRate')
    assert_file_grp_cardinality(self.input_file_grp, 1)
    assert_file_grp_cardinality(self.output_file_grp, 1)
    for (n, input_file) in enumerate(self.input_files):
        LOG.info("INPUT FILE %i / %s", n, input_file)
        pcgts = page_from_file(self.workspace.download_file(input_file))
        LOG.info("Scoring text in page '%s' at the %s level",
                 pcgts.get_pcGtsId(), self.parameter['textequiv_level'])
        self._process_page(pcgts)
        # write back result
        file_id = make_file_id(input_file, self.output_file_grp)
        self.workspace.add_file(
            ID=file_id,
            file_grp=self.output_file_grp,
            pageId=input_file.pageId,
            local_filename=os.path.join(self.output_file_grp, file_id + '.xml'),
            mimetype=MIMETYPE_PAGE,
            content=to_xml(pcgts),
        )
def process(self):
    """Run text/non-text segmentation on every page of the workspace.

    Depending on ``use_deeplr``, either the raw page image (for the deep
    model) or the binarized variant is selected; the actual separation is
    delegated to ``_process_segment``, and the updated PAGE-XML is added
    to the output file group.
    """
    LOG = getLogger('OcrdAnybaseocrTiseg')
    assert_file_grp_cardinality(self.input_file_grp, 1)
    assert_file_grp_cardinality(self.output_file_grp, 1)

    for input_file in self.input_files:
        page_id = input_file.pageId or input_file.ID
        pcgts = page_from_file(self.workspace.download_file(input_file))
        self.add_metadata(pcgts)
        page = pcgts.get_Page()
        LOG.info("INPUT FILE %s", input_file.pageId or input_file.ID)

        # pick the image variant matching the model type
        if self.parameter['use_deeplr']:
            selection = {'feature_filter': 'binarized,deskewed,cropped'}
        else:
            # _should_ also be deskewed and cropped, but no need to enforce that here
            selection = {'feature_selector': 'binarized'}
        page_image, page_coords, _ = self.workspace.image_from_page(
            page, page_id, **selection)

        self._process_segment(page, page_image, page_coords, page_id, input_file)

        out_id = make_file_id(input_file, self.output_file_grp)
        pcgts.set_pcGtsId(out_id)
        self.workspace.add_file(
            ID=out_id,
            file_grp=self.output_file_grp,
            pageId=input_file.pageId,
            mimetype=MIMETYPE_PAGE,
            local_filename=os.path.join(self.output_file_grp, out_id + '.xml'),
            content=to_xml(pcgts).encode('utf-8'),
        )
def process(self):
    """Run Eynollah layout analysis on each page of the workspace.

    Eynollah works on the original page image file (not a derived
    variant) and annotates its results directly into the passed-in
    ``pcgts`` tree, which is then serialised to the output file group.
    """
    LOG = getLogger('eynollah')
    assert_file_grp_cardinality(self.input_file_grp, 1)
    assert_file_grp_cardinality(self.output_file_grp, 1)
    for n, input_file in enumerate(self.input_files):
        page_id = input_file.pageId or input_file.ID
        LOG.info("INPUT FILE %s (%d/%d) ", page_id, n + 1, len(self.input_files))
        pcgts = page_from_file(self.workspace.download_file(input_file))
        LOG.debug('width %s height %s', pcgts.get_Page().imageWidth, pcgts.get_Page().imageHeight)
        self.add_metadata(pcgts)
        page = pcgts.get_Page()
        # XXX loses DPI information
        # page_image, _, _ = self.workspace.image_from_page(page, page_id, feature_filter='binarized')
        # resolve the original image file referenced by the PAGE tree
        image_filename = self.workspace.download_file(next(self.workspace.mets.find_files(url=page.imageFilename))).local_filename
        eynollah_kwargs = {
            'dir_models': self.resolve_resource(self.parameter['models']),
            'allow_enhancement': False,
            'curved_line': self.parameter['curved_line'],
            'full_layout': self.parameter['full_layout'],
            'allow_scaling': self.parameter['allow_scaling'],
            'headers_off': self.parameter['headers_off'],
            'override_dpi': self.parameter['dpi'],
            'logger': LOG,
            'pcgts': pcgts,  # results are annotated into this tree in-place
            'image_filename': image_filename
        }
        Eynollah(**eynollah_kwargs).run()
        file_id = make_file_id(input_file, self.output_file_grp)
        pcgts.set_pcGtsId(file_id)
        self.workspace.add_file(
            ID=file_id,
            file_grp=self.output_file_grp,
            pageId=page_id,
            mimetype=MIMETYPE_PAGE,
            local_filename=join(self.output_file_grp, file_id) + '.xml',
            content=to_xml(pcgts))
def process(self):
    """Rates textual annotation of PAGE input files, producing output files with LM scores (and choices).

    Without ``alternative_decoding``, each page's single best TextEquiv
    path is scored in one pass and the output file is written immediately.

    With ``alternative_decoding``, pages are processed incrementally:
    each page's beam-search traceback is carried over as the start state
    of the next page, so page N's output can only be written once page
    N+1 has been rated; the final page is flushed after the loop.
    """
    LOG = getLogger('processor.KerasRate')
    assert_file_grp_cardinality(self.input_file_grp, 1)
    assert_file_grp_cardinality(self.output_file_grp, 1)
    level = self.parameter['textequiv_level']
    beam_width = self.parameter['beam_width']
    lm_weight = self.parameter['lm_weight']
    # carry-over state for the incremental (alternative_decoding) mode
    prev_traceback = None
    prev_pcgts = None
    prev_file = None
    for (n, input_file) in enumerate(self.input_files):
        page_id = input_file.pageId or input_file.ID
        LOG.info("INPUT FILE %i / %s", n, page_id)
        pcgts = page_from_file(self.workspace.download_file(input_file))
        LOG.info("Scoring text in page '%s' at the %s level",
                 pcgts.get_pcGtsId(), level)
        # annotate processing metadata:
        metadata = pcgts.get_Metadata()  # ensured by page_from_file()
        metadata.add_MetadataItem(
            MetadataItemType(
                type_="processingStep",
                name=OCRD_TOOL['tools']['ocrd-keraslm-rate']['steps'][0],
                value='ocrd-keraslm-rate',
                Labels=[
                    LabelsType(externalRef="parameters",
                               Label=[
                                   LabelType(type_=name, value=self.parameter[name])
                                   for name in self.parameter.keys()
                               ])
                ]))
        # context preprocessing:
        # todo: as soon as we have true MODS meta-data in METS (dmdSec/mdWrap/xmlData/mods),
        # get global context variables from there (e.g. originInfo/dateIssued/@text for year)
        ident = self.workspace.mets.unique_identifier  # at least try to get purl
        context = [0]
        if ident:
            # heuristically derive a decade number from the identifier's
            # trailing "_<year>" suffix, if numeric
            name = ident.split('/')[-1]
            year = name.split('_')[-1]
            if year.isnumeric():
                year = ceil(int(year) / 10)
                context = [year]
                # todo: author etc
        # create a graph for the linear sequence of elements at the given level:
        graph, start_node, end_node = page_get_linear_graph_at(level, pcgts)
        # apply language model to (TextEquiv path in) graph,
        # remove non-path TextEquivs, modify confidences:
        if not self.parameter['alternative_decoding']:
            text = [(edge['element'], edge['alternatives'])
                    for edge in _get_edges(graph, 0)]  # graph's path
            textstring = u''.join(
                textequivs[0].Unicode
                for element, textequivs in text)  # same length as text
            LOG.info("Rating %d elements with a total of %d characters",
                     len(text), len(textstring))
            confidences = self.rater.rate(textstring, context)  # much faster
            i = 0
            for element, textequivs in text:
                textequiv = textequivs[0]  # 1st choice only
                if element:
                    element.set_TextEquiv([textequiv])  # delete others
                textequiv_len = len(textequiv.Unicode)
                conf = sum(confidences[i:i + textequiv_len]
                           ) / textequiv_len  # mean probability
                conf2 = textequiv.conf
                # blend LM confidence with the original OCR confidence
                textequiv.set_conf(conf * lm_weight +
                                   conf2 * (1. - lm_weight))
                i += textequiv_len
            if i != len(confidences):
                LOG.critical(
                    "Input text length and output scores length are off by %d characters",
                    i - len(confidences))
            avg = sum(confidences) / len(confidences)
            ent = sum([-log(max(p, 1e-99), 2)
                       for p in confidences]) / len(confidences)
            ppl = pow(2.0, ent)  # character level
            ppll = pow(
                2.0, ent * len(confidences) /
                len(text))  # textequiv level (including spaces/newlines)
            LOG.info("avg: %.3f, char ppl: %.3f, %s ppl: %.3f",
                     avg, ppl, level, ppll)  # character need not always equal glyph!
            # ensure parent textequivs are up to date:
            page_update_higher_textequiv_levels(level, pcgts)
            # write back result
            file_id = make_file_id(input_file, self.output_file_grp)
            pcgts.set_pcGtsId(file_id)
            self.workspace.add_file(
                ID=file_id,
                pageId=input_file.pageId,
                file_grp=self.output_file_grp,
                local_filename=os.path.join(self.output_file_grp, file_id + '.xml'),
                mimetype=MIMETYPE_PAGE,
                content=to_xml(pcgts),
            )
        else:
            LOG.info("Rating %d elements including its alternatives",
                     end_node - start_node)
            path, entropy, traceback = self.rater.rate_best(
                graph, start_node, end_node,
                start_traceback=prev_traceback,
                context=context,
                lm_weight=lm_weight,
                beam_width=beam_width,
                beam_clustering_dist=BEAM_CLUSTERING_DIST
                if BEAM_CLUSTERING_ENABLE else 0)
            if prev_pcgts:
                # the returned path finalizes the *previous* page, which
                # can now be annotated and written out
                _page_update_from_path(level, path, entropy)
                # ensure parent textequivs are up to date:
                page_update_higher_textequiv_levels(level, prev_pcgts)
                # write back result
                file_id = make_file_id(prev_file, self.output_file_grp)
                prev_pcgts.set_pcGtsId(file_id)
                self.workspace.add_file(
                    ID=file_id,
                    pageId=prev_file.pageId,
                    file_grp=self.output_file_grp,
                    local_filename=os.path.join(self.output_file_grp, file_id + '.xml'),
                    mimetype=MIMETYPE_PAGE,
                    content=to_xml(prev_pcgts),
                )
            prev_file = input_file
            prev_pcgts = pcgts
            prev_traceback = traceback
    if prev_pcgts:
        # flush the last page (alternative_decoding mode only)
        path, entropy, _ = self.rater.next_path(prev_traceback[0],
                                                ([], prev_traceback[1]))
        _page_update_from_path(level, path, entropy)
        # ensure parent textequivs are up to date:
        page_update_higher_textequiv_levels(level, prev_pcgts)
        # write back result
        # NOTE(review): uses `input_file` (the last iterated file), which at
        # this point equals `prev_file` - confirm they cannot diverge
        file_id = make_file_id(input_file, self.output_file_grp)
        prev_pcgts.set_pcGtsId(file_id)
        self.workspace.add_file(
            ID=file_id,
            pageId=input_file.pageId,
            file_grp=self.output_file_grp,
            local_filename=os.path.join(self.output_file_grp, file_id + '.xml'),
            mimetype=MIMETYPE_PAGE,
            content=to_xml(prev_pcgts),
        )
def process(self):
    """Despeckle the pages / regions / lines of the workspace.

    Open and deserialise PAGE input files and their respective images,
    then iterate over the element hierarchy down to the requested
    ``level-of-operation``.

    Next, for each file, crop each segment image according to the layout
    annotation (via coordinates into the higher-level image, or from the
    alternative image). Then despeckle by removing connected components
    smaller than ``noise_maxsize``. Apply results to the image and export
    it as an image file.

    Add the new image file to the workspace along with the output fileGrp,
    and using a file ID with suffix ``.IMG-DESPECK`` along with further
    identification of the input element.

    Reference each new image in the AlternativeImage of the element.

    Produce a new output file by serialising the resulting hierarchy.
    """
    LOG = getLogger('processor.OcropyDenoise')
    level = self.parameter['level-of-operation']
    assert_file_grp_cardinality(self.input_file_grp, 1)
    assert_file_grp_cardinality(self.output_file_grp, 1)
    for (n, input_file) in enumerate(self.input_files):
        LOG.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID)
        file_id = make_file_id(input_file, self.output_file_grp)
        pcgts = page_from_file(self.workspace.download_file(input_file))
        self.add_metadata(pcgts)
        page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID  # (PageType has no id)
        page = pcgts.get_Page()
        # page level requires a binarized image; deeper levels select
        # binarized variants per segment below
        page_image, page_xywh, page_image_info = self.workspace.image_from_page(
            page, page_id,
            feature_selector='binarized' if level == 'page' else '')
        # derive a zoom factor relative to 300 DPI for noise-size scaling
        if self.parameter['dpi'] > 0:
            zoom = 300.0 / self.parameter['dpi']
        elif page_image_info.resolution != 1:
            dpi = page_image_info.resolution
            if page_image_info.resolutionUnit == 'cm':
                dpi *= 2.54
            LOG.info('Page "%s" uses %f DPI', page_id, dpi)
            zoom = 300.0 / dpi
        else:
            zoom = 1
        if level == 'page':
            self.process_segment(page, page_image, page_xywh, zoom,
                                 input_file.pageId, file_id)
        else:
            regions = page.get_AllRegions(classes=['Text'],
                                          order='reading-order')
            if not regions:
                LOG.warning('Page "%s" contains no text regions', page_id)
            for region in regions:
                region_image, region_xywh = self.workspace.image_from_segment(
                    region, page_image, page_xywh,
                    feature_selector='binarized' if level == 'region' else '')
                if level == 'region':
                    self.process_segment(region, region_image, region_xywh,
                                         zoom, input_file.pageId,
                                         file_id + '_' + region.id)
                    continue
                lines = region.get_TextLine()
                if not lines:
                    LOG.warning(
                        'Page "%s" region "%s" contains no text lines',
                        page_id, region.id)
                for line in lines:
                    line_image, line_xywh = self.workspace.image_from_segment(
                        line, region_image, region_xywh,
                        feature_selector='binarized')
                    self.process_segment(
                        line, line_image, line_xywh, zoom,
                        input_file.pageId,
                        file_id + '_' + region.id + '_' + line.id)
        # update METS (add the PAGE file):
        file_path = os.path.join(self.output_file_grp, file_id + '.xml')
        pcgts.set_pcGtsId(file_id)
        out = self.workspace.add_file(ID=file_id,
                                      file_grp=self.output_file_grp,
                                      pageId=input_file.pageId,
                                      local_filename=file_path,
                                      mimetype=MIMETYPE_PAGE,
                                      content=to_xml(pcgts))
        LOG.info('created file ID: %s, file_grp: %s, path: %s',
                 file_id, self.output_file_grp, out.local_filename)
def process(self):
    """Perform OCR recognition with Tesseract on the workspace.

    Open and deserialise PAGE input files and their respective images,
    then iterate over the element hierarchy down to the requested
    ``textequiv_level`` if it exists and ``overwrite_words`` is disabled,
    or to the line level otherwise. In the latter case, (remove any
    existing segmentation below the line level, and) create new
    segmentation below the line level if necessary.

    Set up Tesseract to recognise each segment's image (either from
    AlternativeImage or cropping the bounding box rectangle and masking
    it from the polygon outline) with the appropriate mode and ``model``.

    Put text and confidence results into the TextEquiv at
    ``textequiv_level``, removing any existing TextEquiv.

    Finally, make the higher levels consistent with these results by
    concatenation, ordered as appropriate for its readingDirection,
    textLineOrder, and ReadingOrder, and joined by whitespace, as
    appropriate for the respective level and Relation/join status.

    Produce new output files by serialising the resulting hierarchy.
    """
    LOG.debug("TESSDATA: %s, installed Tesseract models: %s", *get_languages())
    assert_file_grp_cardinality(self.input_file_grp, 1)
    assert_file_grp_cardinality(self.output_file_grp, 1)
    maxlevel = self.parameter['textequiv_level']
    model = get_languages()[1][-1]  # last installed model
    if 'model' in self.parameter:
        model = self.parameter['model']
        for sub_model in model.split('+'):
            if sub_model not in get_languages()[1]:
                raise Exception("configured model " + sub_model + " is not installed")
    with PyTessBaseAPI(path=TESSDATA_PREFIX, lang=model) as tessapi:
        LOG.info("Using model '%s' in %s for recognition at the %s level",
                 model, get_languages()[0], maxlevel)
        if maxlevel == 'glyph':
            # populate GetChoiceIterator() with LSTM models, too:
            tessapi.SetVariable("lstm_choice_mode", "2")  # aggregate symbols
            tessapi.SetVariable("lstm_choice_iterations", "15")  # squeeze out more best paths
        # TODO: maybe warn/raise when illegal combinations or characters not in the model unicharset?
        if self.parameter['char_whitelist']:
            tessapi.SetVariable("tessedit_char_whitelist", self.parameter['char_whitelist'])
        if self.parameter['char_blacklist']:
            tessapi.SetVariable("tessedit_char_blacklist", self.parameter['char_blacklist'])
        if self.parameter['char_unblacklist']:
            tessapi.SetVariable("tessedit_char_unblacklist", self.parameter['char_unblacklist'])
        # todo: determine relevancy of these variables:
        # tessapi.SetVariable("tessedit_single_match", "0")
        #
        # tessedit_load_sublangs
        # tessedit_preserve_min_wd_len 2
        # tessedit_prefer_joined_punct 0
        # tessedit_write_rep_codes 0
        # tessedit_parallelize 0
        # tessedit_zero_rejection 0
        # tessedit_zero_kelvin_rejection 0
        # tessedit_reject_mode 0
        # tessedit_use_reject_spaces 1
        # tessedit_fix_fuzzy_spaces 1
        # tessedit_char_blacklist
        # tessedit_char_whitelist
        # chs_leading_punct ('`"
        # chs_trailing_punct1 ).,;:?!
        # chs_trailing_punct2 )'`"
        # numeric_punctuation .,
        # unrecognised_char |
        # ok_repeated_ch_non_alphanum_wds -?*=
        # conflict_set_I_l_1 Il1[]
        # preserve_interword_spaces 0
        # tessedit_enable_dict_correction 0
        # tessedit_enable_bigram_correction 1
        # stopper_smallword_size 2
        # wordrec_max_join_chunks 4
        # suspect_space_level 100
        # suspect_short_words 2
        # language_model_ngram_on 0
        # language_model_ngram_order 8
        # language_model_min_compound_length 3
        # language_model_penalty_non_freq_dict_word 0.1
        # language_model_penalty_non_dict_word 0.15
        # language_model_penalty_punc 0.2
        # language_model_penalty_case 0.1
        # language_model_penalty_script 0.5
        # language_model_penalty_chartype 0.3
        # language_model_penalty_spacing 0.05
        # textord_max_noise_size 7
        # enable_noise_removal 1
        # classify_bln_numeric_mode 0
        # lstm_use_matrix 1
        # user_words_file
        # user_patterns_file
        for (n, input_file) in enumerate(self.input_files):
            page_id = input_file.pageId or input_file.ID
            LOG.info("INPUT FILE %i / %s", n, page_id)
            pcgts = page_from_file(
                self.workspace.download_file(input_file))
            page = pcgts.get_Page()

            # add metadata about this operation and its runtime parameters:
            metadata = pcgts.get_Metadata()  # ensured by from_file()
            metadata.add_MetadataItem(
                MetadataItemType(
                    type_="processingStep",
                    name=self.ocrd_tool['steps'][0],
                    value=TOOL,
                    Labels=[
                        LabelsType(externalModel="ocrd-tool",
                                   externalId="parameters",
                                   Label=[
                                       LabelType(
                                           type_=name,
                                           value=self.parameter[name])
                                       for name in self.parameter.keys()
                                   ])
                    ]))
            page_image, page_xywh, page_image_info = self.workspace.image_from_page(
                page, page_id)
            if self.parameter['dpi'] > 0:
                dpi = self.parameter['dpi']
                LOG.info(
                    "Page '%s' images will use %d DPI from parameter override",
                    page_id, dpi)
            elif page_image_info.resolution != 1:
                dpi = page_image_info.resolution
                if page_image_info.resolutionUnit == 'cm':
                    dpi = round(dpi * 2.54)
                LOG.info(
                    "Page '%s' images will use %d DPI from image meta-data",
                    page_id, dpi)
            else:
                dpi = 0
                LOG.info(
                    "Page '%s' images will use DPI estimated from segmentation",
                    page_id)
            if dpi:
                tessapi.SetVariable('user_defined_dpi', str(dpi))

            LOG.info("Processing page '%s'", page_id)
            # fix: materialize the chain into a list - an itertools.chain
            # object is always truthy, so the "no text regions" warning
            # below was unreachable before
            regions = list(itertools.chain.from_iterable(
                [page.get_TextRegion()] + [
                    subregion.get_TextRegion()
                    for subregion in page.get_TableRegion()
                ]))
            if not regions:
                LOG.warning("Page '%s' contains no text regions", page_id)
            else:
                self._process_regions(tessapi, regions, page_image, page_xywh)
            page_update_higher_textequiv_levels(maxlevel, pcgts)

            file_id = make_file_id(input_file, self.output_file_grp)
            pcgts.set_pcGtsId(file_id)
            self.workspace.add_file(ID=file_id,
                                    file_grp=self.output_file_grp,
                                    pageId=input_file.pageId,
                                    mimetype=MIMETYPE_PAGE,
                                    local_filename=os.path.join(
                                        self.output_file_grp, file_id + '.xml'),
                                    content=to_xml(pcgts))
def process(self):
    """Performs heuristic page frame detection (cropping) on the workspace.

    Open and deserialize PAGE input files and their respective images.
    (Input should be deskewed already.) Retrieve the raw (non-binarized,
    uncropped) page image.

    Detect line segments via edge gradients, and cluster them into contiguous
    horizontal and vertical lines if possible. If candidates which are located
    at the margin and long enough (covering a large fraction of the page) exist
    on all four sides, then pick the best (i.e. thickest, longest and inner-most)
    one on each side and use their intersections as border points.

    Otherwise, first try to detect a ruler (i.e. image segment depicting a rule
    placed on the scan/photo for scale references) via thresholding and contour
    detection, identifying a single large rectangular region with a certain aspect
    ratio. Suppress (mask) any such segment during further calculations.

    Next in that line, try to detect text segments on the page. For that purpose,
    get the gradient of grayscale image, threshold and morphologically close it,
    then determine contours to define approximate text boxes. Merge these into
    columns, filtering candidates too small or entirely in the margin areas.
    Finally, merge the remaining columns across short gaps. If only one column
    remains, and it covers a significant fraction of the page, pick that segment
    as solution.

    Otherwise, keep the border points derived from line segments (intersecting
    with the full image on each side without line candidates).

    Lastly, map coordinates to the original (undeskewed) image and intersect
    the border polygon with the full image frame. Use that to define the page's
    Border.

    Moreover, crop (and mask) the image accordingly, and reference the
    resulting image file as AlternativeImage in the Page element.

    Add the new image file to the workspace along with the output fileGrp,
    and using a file ID with suffix ``.IMG-CROP`` along with further
    identification of the input element.

    Produce new output files by serialising the resulting hierarchy.
    """
    assert_file_grp_cardinality(self.input_file_grp, 1)
    assert_file_grp_cardinality(self.output_file_grp, 1)
    self.logger = getLogger('processor.AnybaseocrCropper')

    for (n, input_file) in enumerate(self.input_files):
        page_id = input_file.pageId or input_file.ID
        self.logger.info("INPUT FILE %i / %s", n, page_id)
        pcgts = page_from_file(self.workspace.download_file(input_file))
        self.add_metadata(pcgts)
        page = pcgts.get_Page()

        # Check for existing Border --> already cropped; we proceed anyway,
        # but warn about the coordinates being overwritten:
        border = page.get_Border()
        if border:
            left, top, right, bottom = bbox_from_points(
                border.get_Coords().points)
            self.logger.warning('Overwriting existing Border: %i:%i,%i:%i',
                                left, top, right, bottom)

        # NOTE: the original fetched pcgts.get_Page() a second time here,
        # which was redundant (same object) and has been removed.
        page_image, page_coords, page_image_info = self.workspace.image_from_page(
            page, page_id,
            # should be deskewed already
            feature_filter='cropped,binarized,grayscale_normalized')
        # Derive a zoom factor normalizing to 300 DPI; parameter override
        # wins, then image metadata (converting cm resolution to inch):
        if self.parameter['dpi'] > 0:
            zoom = 300.0 / self.parameter['dpi']
        elif page_image_info.resolution != 1:
            dpi = page_image_info.resolution
            if page_image_info.resolutionUnit == 'cm':
                dpi *= 2.54
            self.logger.info('Page "%s" uses %f DPI', page_id, dpi)
            zoom = 300.0 / dpi
        else:
            zoom = 1

        self._process_page(page, page_image, page_coords, input_file, zoom)

        file_id = make_file_id(input_file, self.output_file_grp)
        pcgts.set_pcGtsId(file_id)
        self.workspace.add_file(ID=file_id,
                                file_grp=self.output_file_grp,
                                pageId=input_file.pageId,
                                mimetype=MIMETYPE_PAGE,
                                local_filename=os.path.join(
                                    self.output_file_grp, file_id + '.xml'),
                                content=to_xml(pcgts).encode('utf-8'))
def process(self):
    """Detect font shapes via rule-based OCR with Tesseract on the workspace.

    Open and deserialise PAGE input files and their respective images,
    then iterate over the element hierarchy down to the line level.

    Set up Tesseract to recognise each word's image (either from
    AlternativeImage or cropping the bounding box rectangle and masking
    it from the polygon outline) in word mode and with the ``osd`` model.

    Query the result's font attributes and write them into the word element's
    ``TextStyle``.

    Produce new output files by serialising the resulting hierarchy.
    """
    LOG = getLogger('processor.TesserocrFontShape')
    LOG.debug("TESSDATA: %s, installed Tesseract models: %s",
              *get_languages())
    assert_file_grp_cardinality(self.input_file_grp, 1)
    assert_file_grp_cardinality(self.output_file_grp, 1)
    # Fail early if the configured model is not among the installed languages:
    model = self.parameter['model']
    if model not in get_languages()[1]:
        raise Exception(
            "model " + model +
            " (needed for font style detection) is not installed")
    with PyTessBaseAPI(
            path=get_tessdata_path(),
            #oem=OEM.TESSERACT_LSTM_COMBINED, # legacy required for OSD or WordFontAttributes!
            oem=OEM.TESSERACT_ONLY,  # legacy required for OSD or WordFontAttributes!
            lang=model) as tessapi:
        LOG.info(
            "Using model '%s' in %s for recognition at the word level",
            model, get_languages()[0])
        for (n, input_file) in enumerate(self.input_files):
            page_id = input_file.pageId or input_file.ID
            LOG.info("INPUT FILE %i / %s", n, page_id)
            pcgts = page_from_file(
                self.workspace.download_file(input_file))
            self.add_metadata(pcgts)
            page = pcgts.get_Page()
            page_image, page_coords, page_image_info = self.workspace.image_from_page(
                page, page_id)
            # DPI resolution: parameter override wins, then image meta-data
            # (converting cm to inch), else let Tesseract estimate (dpi=0):
            if self.parameter['dpi'] > 0:
                dpi = self.parameter['dpi']
                LOG.info(
                    "Page '%s' images will use %d DPI from parameter override",
                    page_id, dpi)
            elif page_image_info.resolution != 1:
                dpi = page_image_info.resolution
                if page_image_info.resolutionUnit == 'cm':
                    dpi = round(dpi * 2.54)
                LOG.info(
                    "Page '%s' images will use %d DPI from image meta-data",
                    page_id, dpi)
            else:
                dpi = 0
                LOG.info(
                    "Page '%s' images will use DPI estimated from segmentation",
                    page_id)
            if dpi:
                tessapi.SetVariable('user_defined_dpi', str(dpi))
            LOG.info("Processing page '%s'", page_id)
            regions = page.get_AllRegions(classes=['Text'])
            if not regions:
                LOG.warning("Page '%s' contains no text regions", page_id)
            else:
                self._process_regions(tessapi, regions, page_image,
                                      page_coords)
            file_id = make_file_id(input_file, self.output_file_grp)
            pcgts.set_pcGtsId(file_id)
            self.workspace.add_file(ID=file_id,
                                    file_grp=self.output_file_grp,
                                    pageId=input_file.pageId,
                                    mimetype=MIMETYPE_PAGE,
                                    local_filename=os.path.join(
                                        self.output_file_grp,
                                        file_id + '.xml'),
                                    content=to_xml(pcgts))
def process(self):
    """Performs deskewing of the page / region with Tesseract on the workspace.

    Open and deserialise PAGE input files and their respective images,
    then iterate over the element hierarchy down to the region level
    for all text and table regions.

    Set up Tesseract to recognise the region image's orientation, skew
    and script (with both OSD and AnalyseLayout). Rotate the image
    accordingly, and annotate the angle, readingDirection and textlineOrder.

    Create a corresponding image file, and reference it as AlternativeImage
    in the element. Add the new image file to the workspace with the fileGrp USE
    given in the second position of the output fileGrp, or ``OCR-D-IMG-DESKEW``,
    and an ID based on input file and input element.

    Produce a new output file by serialising the resulting hierarchy.
    """
    LOG = getLogger('processor.TesserocrDeskew')
    assert_file_grp_cardinality(self.input_file_grp, 1)
    assert_file_grp_cardinality(self.output_file_grp, 1)
    oplevel = self.parameter['operation_level']
    with PyTessBaseAPI(
            path=TESSDATA_PREFIX,
            lang="osd",  # osd required for legacy init!
            oem=OEM.TESSERACT_LSTM_COMBINED,  # legacy required for OSD!
            psm=PSM.AUTO_OSD) as tessapi:
        for n, input_file in enumerate(self.input_files):
            # file_id is computed up-front because it doubles as the
            # pcGtsId and as the prefix for per-region derived image IDs:
            file_id = make_file_id(input_file, self.output_file_grp)
            page_id = input_file.pageId or input_file.ID
            LOG.info("INPUT FILE %i / %s", n, page_id)
            pcgts = page_from_file(
                self.workspace.download_file(input_file))
            pcgts.set_pcGtsId(file_id)
            self.add_metadata(pcgts)
            page = pcgts.get_Page()
            page_image, page_xywh, page_image_info = self.workspace.image_from_page(
                page, page_id,
                # image must not have been rotated already,
                # (we will overwrite @orientation anyway,)
                # abort if no such image can be produced:
                feature_filter='deskewed' if oplevel == 'page' else '')
            # DPI resolution: parameter override wins, then image meta-data
            # (converting cm to inch), else let Tesseract estimate (dpi=0):
            if self.parameter['dpi'] > 0:
                dpi = self.parameter['dpi']
                LOG.info(
                    "Page '%s' images will use %d DPI from parameter override",
                    page_id, dpi)
            elif page_image_info.resolution != 1:
                dpi = page_image_info.resolution
                if page_image_info.resolutionUnit == 'cm':
                    dpi = round(dpi * 2.54)
                LOG.info(
                    "Page '%s' images will use %d DPI from image meta-data",
                    page_id, dpi)
            else:
                dpi = 0
                LOG.info(
                    "Page '%s' images will use DPI estimated from segmentation",
                    page_id)
            if dpi:
                tessapi.SetVariable('user_defined_dpi', str(dpi))
            LOG.info("Deskewing on '%s' level in page '%s'", oplevel, page_id)
            if oplevel == 'page':
                self._process_segment(tessapi, page, page_image, page_xywh,
                                      "page '%s'" % page_id,
                                      input_file.pageId, file_id)
            else:
                regions = page.get_TextRegion() + page.get_TableRegion()
                if not regions:
                    LOG.warning("Page '%s' contains no text regions", page_id)
                for region in regions:
                    region_image, region_xywh = self.workspace.image_from_segment(
                        region, page_image, page_xywh,
                        # image must not have been rotated already,
                        # (we will overwrite @orientation anyway,)
                        # abort if no such image can be produced:
                        feature_filter='deskewed')
                    self._process_segment(tessapi, region, region_image,
                                          region_xywh,
                                          "region '%s'" % region.id,
                                          input_file.pageId,
                                          file_id + '_' + region.id)
            self.workspace.add_file(ID=file_id,
                                    file_grp=self.output_file_grp,
                                    pageId=input_file.pageId,
                                    mimetype=MIMETYPE_PAGE,
                                    local_filename=os.path.join(
                                        self.output_file_grp,
                                        file_id + '.xml'),
                                    content=to_xml(pcgts))
def process(self):
    """Repair segment-order inconsistencies on the workspace.

    Open and deserialize PAGE input files, then iterate regions, lines
    and words, fixing each segment's child order (via ``_fix_segment``)
    according to the effective ``textLineOrder`` / ``readingDirection``
    (inherited from the innermost element that sets it, falling back to
    top-to-bottom / left-to-right). Segments with unsupported order
    values are skipped with a log message.

    Produce a new output file by serialising the resulting hierarchy.
    """
    LOG = getLogger('processor.RepairInconsistencies')
    assert_file_grp_cardinality(self.input_file_grp, 1)
    assert_file_grp_cardinality(self.output_file_grp, 1)

    def _inherited(attr, default, *segments):
        # Return the first non-None `attr` among `segments`
        # (innermost first), falling back to `default`.
        # (Deduplicates the three identical lookup loops of the
        # original implementation.)
        for segment in segments:
            value = getattr(segment, attr)
            if value is not None:
                return value
        return default

    for (n, input_file) in enumerate(self.input_files):
        page_id = input_file.pageId or input_file.ID
        LOG.info("INPUT FILE %i / %s", n, page_id)
        pcgts = page_from_file(self.workspace.download_file(input_file))
        page = pcgts.get_Page()
        # add metadata about this operation and its runtime parameters:
        metadata = pcgts.get_Metadata()  # ensured by from_file()
        metadata.add_MetadataItem(
            MetadataItemType(
                type_="processingStep",
                name=self.ocrd_tool['steps'][0],
                value=TOOL,
                Labels=[
                    LabelsType(externalModel="ocrd-tool",
                               externalId="parameters",
                               Label=[
                                   LabelType(type_=name,
                                             value=self.parameter[name])
                                   for name in self.parameter.keys()
                               ])
                ]))

        # Text regions may also be nested inside tables and graphics:
        regions = []
        regions.extend(page.get_TextRegion())
        for special_region in page.get_TableRegion() + page.get_GraphicRegion():
            regions.extend(special_region.get_TextRegion())

        for region in regions:
            textLineOrder = _inherited('textLineOrder', 'top-to-bottom',
                                       region, page)
            if textLineOrder not in ['top-to-bottom', 'bottom-to-top']:
                LOG.info(
                    'Not processing page "%s" region "%s" (textLineOrder=%s)',
                    page_id, region.id, textLineOrder)
                continue
            _fix_segment(region, page_id,
                         reverse=(textLineOrder == 'bottom-to-top'))

            for line in region.get_TextLine():
                readingDirection = _inherited('readingDirection',
                                              'left-to-right',
                                              line, region, page)
                if readingDirection not in ['left-to-right', 'right-to-left']:
                    LOG.info(
                        'Not processing page "%s" line "%s" (readingDirection=%s)',
                        page_id, line.id, readingDirection)
                    continue
                _fix_segment(line, page_id,
                             reverse=(readingDirection == 'right-to-left'))

                for word in line.get_Word():
                    readingDirection = _inherited('readingDirection',
                                                  'left-to-right',
                                                  word, line, region, page)
                    if readingDirection not in ['left-to-right', 'right-to-left']:
                        LOG.info(
                            'Not processing page "%s" word "%s" (readingDirection=%s)',
                            page_id, word.id, readingDirection)
                        continue
                    _fix_segment(word, page_id,
                                 reverse=(readingDirection == 'right-to-left'))

        file_id = make_file_id(input_file, self.output_file_grp)
        self.workspace.add_file(ID=file_id,
                                file_grp=self.output_file_grp,
                                pageId=input_file.pageId,
                                mimetype=MIMETYPE_PAGE,
                                local_filename=os.path.join(
                                    self.output_file_grp, file_id + '.xml'),
                                content=to_xml(pcgts))
def process(self):
    """Segment text lines with sbb_textline_detector on the workspace.

    For each input page image, run the external textline detector in a
    temporary directory, parse the intermediate PAGE result it writes,
    and merge Border, ReadingOrder and TextRegions (with coordinates
    translated back into the original page frame) into the input PAGE.

    Produce a new output file by serialising the resulting hierarchy.
    """
    log = getLogger('processor.OcrdSbbTextlineDetectorRecognize')
    assert_file_grp_cardinality(self.input_file_grp, 1)
    assert_file_grp_cardinality(self.output_file_grp, 1)

    for (n, input_file) in enumerate(self.input_files):
        page_id = input_file.pageId or input_file.ID
        log.info("INPUT FILE %i / %s", n, input_file)
        file_id = make_file_id(input_file, self.output_file_grp)

        # Process the files
        try:
            # ensure the output directory exists before writing into it
            os.mkdir(self.output_file_grp)
        except FileExistsError:
            pass

        pcgts = page_from_file(self.workspace.download_file(input_file))
        page = pcgts.get_Page()
        page_image, page_coords, page_image_info = \
            self.workspace.image_from_page(
                page, page_id,
                feature_filter='cropped,binarized,grayscale_normalized'
            )

        with tempfile.TemporaryDirectory() as tmp_dirname:
            # Save the image
            image_file = tempfile.mkstemp(dir=tmp_dirname,
                                          suffix='.png')[1]
            page_image.save(image_file)

            # Segment the image
            model = self.parameter['model']
            x = textline_detector(image_file, tmp_dirname, file_id, model)
            x.run()

            # Read segmentation results (the detector writes
            # <tmp_dirname>/<file_id>.xml)
            tmp_filename = os.path.join(tmp_dirname, file_id) + '.xml'
            tmp_pcgts = ocrd_models.ocrd_page.parse(tmp_filename,
                                                    silence=True)
            tmp_page = tmp_pcgts.get_Page()

            # Create a new PAGE file from the input file
            pcgts.set_pcGtsId(file_id)
            page = pcgts.get_Page()

            # Merge results → PAGE file

            # 1. Border
            if page.get_Border():
                log.warning("Page already contained a border")
            # We need to translate the coordinates:
            text_border = tmp_page.get_Border()
            coords = text_border.get_Coords().get_points()
            polygon = polygon_from_points(coords)
            polygon_new = coordinates_for_segment(polygon, page_image,
                                                  page_coords)
            points_new = points_from_polygon(polygon_new)
            coords_new = CoordsType(points=points_new)
            text_border.set_Coords(coords_new)
            page.set_Border(text_border)

            # 2. ReadingOrder
            if page.get_ReadingOrder():
                log.warning("Page already contained a reading order")
            page.set_ReadingOrder(tmp_page.get_ReadingOrder())

            # 3. TextRegion
            if page.get_TextRegion():
                log.warning("Page already contained text regions")
            # We need to translate the coordinates:
            text_regions_new = []
            for text_region in tmp_page.get_TextRegion():
                coords = text_region.get_Coords().get_points()
                polygon = polygon_from_points(coords)
                polygon_new = coordinates_for_segment(polygon, page_image,
                                                      page_coords)
                points_new = points_from_polygon(polygon_new)
                coords_new = CoordsType(points=points_new)
                text_region.set_Coords(coords_new)
                text_regions_new.append(text_region)
            page.set_TextRegion(text_regions_new)

            # Save metadata about this operation
            metadata = pcgts.get_Metadata()
            metadata.add_MetadataItem(
                MetadataItemType(
                    type_="processingStep",
                    name=self.ocrd_tool['steps'][0],
                    value=TOOL,
                    Labels=[
                        LabelsType(externalModel="ocrd-tool",
                                   externalId="parameters",
                                   Label=[
                                       LabelType(type_=name,
                                                 value=self.parameter[name])
                                       for name in self.parameter.keys()
                                   ])
                    ]))

            self.workspace.add_file(
                ID=file_id,
                file_grp=self.output_file_grp,
                pageId=page_id,
                mimetype='application/vnd.prima.page+xml',
                local_filename=os.path.join(self.output_file_grp,
                                            file_id) + '.xml',
                content=ocrd_models.ocrd_page.to_xml(pcgts))
def process(self):
    """Performs word segmentation with Tesseract on the workspace.

    Open and deserialize PAGE input files and their respective images,
    then iterate over the element hierarchy down to the textline level,
    and remove any existing Word elements (unless ``overwrite_words``
    is False).

    Set up Tesseract to detect words, and add each one to the line
    at the detected coordinates.

    Produce a new output file by serialising the resulting hierarchy.
    """
    LOG = getLogger('processor.TesserocrSegmentWord')
    assert_file_grp_cardinality(self.input_file_grp, 1)
    assert_file_grp_cardinality(self.output_file_grp, 1)
    overwrite_words = self.parameter['overwrite_words']
    with PyTessBaseAPI(
            psm=PSM.SINGLE_LINE,
            path=TESSDATA_PREFIX
    ) as tessapi:
        for (n, input_file) in enumerate(self.input_files):
            page_id = input_file.pageId or input_file.ID
            LOG.info("INPUT FILE %i / %s", n, page_id)
            pcgts = page_from_file(self.workspace.download_file(input_file))
            self.add_metadata(pcgts)
            page = pcgts.get_Page()
            page_image, page_coords, page_image_info = self.workspace.image_from_page(
                page, page_id)
            # DPI resolution: parameter override wins, then image meta-data
            # (converting cm to inch), else let Tesseract estimate (dpi=0):
            if self.parameter['dpi'] > 0:
                dpi = self.parameter['dpi']
                LOG.info("Page '%s' images will use %d DPI from parameter override", page_id, dpi)
            elif page_image_info.resolution != 1:
                dpi = page_image_info.resolution
                if page_image_info.resolutionUnit == 'cm':
                    dpi = round(dpi * 2.54)
                LOG.info("Page '%s' images will use %d DPI from image meta-data", page_id, dpi)
            else:
                dpi = 0
                LOG.info("Page '%s' images will use DPI estimated from segmentation", page_id)
            if dpi:
                tessapi.SetVariable('user_defined_dpi', str(dpi))
            for region in page.get_TextRegion():
                region_image, region_coords = self.workspace.image_from_segment(
                    region, page_image, page_coords)
                for line in region.get_TextLine():
                    if line.get_Word():
                        if overwrite_words:
                            LOG.info('removing existing Words in line "%s"', line.id)
                            line.set_Word([])
                        else:
                            LOG.warning('keeping existing Words in line "%s"', line.id)
                    LOG.debug("Detecting words in line '%s'", line.id)
                    line_image, line_coords = self.workspace.image_from_segment(
                        line, region_image, region_coords)
                    tessapi.SetImage(line_image)
                    for word_no, component in enumerate(tessapi.GetComponentImages(RIL.WORD, True, raw_image=True)):
                        word_id = '%s_word%04d' % (line.id, word_no)
                        # component[1] is the word's bounding box in
                        # line-image coordinates; map it back to the page:
                        word_polygon = polygon_from_xywh(component[1])
                        word_polygon = coordinates_for_segment(word_polygon, line_image, line_coords)
                        word_polygon2 = polygon_for_parent(word_polygon, line)
                        if word_polygon2 is not None:
                            word_polygon = word_polygon2
                        # word_points is computed before the None check on
                        # purpose, so the skipped polygon can be logged:
                        word_points = points_from_polygon(word_polygon)
                        if word_polygon2 is None:
                            # could happen due to rotation
                            LOG.info('Ignoring extant word: %s', word_points)
                            continue
                        line.add_Word(WordType(
                            id=word_id, Coords=CoordsType(word_points)))
            file_id = make_file_id(input_file, self.output_file_grp)
            pcgts.set_pcGtsId(file_id)
            self.workspace.add_file(
                ID=file_id,
                file_grp=self.output_file_grp,
                pageId=input_file.pageId,
                mimetype=MIMETYPE_PAGE,
                local_filename=os.path.join(self.output_file_grp,
                                            file_id + '.xml'),
                content=to_xml(pcgts))
def process(self):
    """
    Perform text recognition with Calamari on the workspace.

    If ``texequiv_level`` is ``word`` or ``glyph``, then additionally
    create word / glyph level segments by splitting at white space
    characters / glyph boundaries. In the case of ``glyph``, add all
    alternative character hypotheses down to ``glyph_conf_cutoff``
    confidence threshold.
    """
    log = getLogger('processor.CalamariRecognize')
    assert_file_grp_cardinality(self.input_file_grp, 1)
    assert_file_grp_cardinality(self.output_file_grp, 1)

    for (n, input_file) in enumerate(self.input_files):
        page_id = input_file.pageId or input_file.ID
        log.info("INPUT FILE %i / %s", n, page_id)
        pcgts = page_from_file(self.workspace.download_file(input_file))
        page = pcgts.get_Page()
        page_image, page_coords, page_image_info = self.workspace.image_from_page(
            page, page_id, feature_selector=self.features)

        for region in page.get_AllRegions(classes=['Text']):
            region_image, region_coords = self.workspace.image_from_segment(
                region, page_image, page_coords,
                feature_selector=self.features)

            textlines = region.get_TextLine()
            log.info("About to recognize %i lines of region '%s'",
                     len(textlines), region.id)
            # Collect all line images of the region first, so they can be
            # predicted in one batch below:
            line_images_np = []
            line_coordss = []
            for line in textlines:
                log.debug("Recognizing line '%s' in region '%s'", line.id,
                          region.id)

                line_image, line_coords = self.workspace.image_from_segment(
                    line, region_image, region_coords,
                    feature_selector=self.features)
                if ('binarized' not in line_coords['features']
                        and 'grayscale_normalized' not in line_coords['features']
                        and self.network_input_channels == 1):
                    # We cannot use a feature selector for this since we don't
                    # know whether the model expects (has been trained on)
                    # binarized or grayscale images; but raw images are likely
                    # always inadequate:
                    log.warning(
                        "Using raw image for line '%s' in region '%s'",
                        line.id, region.id)

                # guard against zero-size line images (degenerate segments):
                line_image = line_image if all(line_image.size) else [[0]]
                line_image_np = np.array(line_image, dtype=np.uint8)
                line_images_np.append(line_image_np)
                line_coordss.append(line_coords)
            raw_results_all = self.predictor.predict_raw(
                line_images_np, progress_bar=False)

            for line, line_coords, raw_results in zip(
                    textlines, line_coordss, raw_results_all):

                for i, p in enumerate(raw_results):
                    p.prediction.id = "fold_{}".format(i)

                prediction = self.voter.vote_prediction_result(raw_results)
                prediction.id = "voted"

                # Build line text on our own
                #
                # Calamari does whitespace post-processing on prediction.sentence, while it does not do the same
                # on prediction.positions. Do it on our own to have consistency.
                #
                # XXX Check Calamari's built-in post-processing on prediction.sentence

                def _sort_chars(p):
                    """Filter and sort chars of prediction p"""
                    chars = p.chars
                    chars = [
                        c for c in chars if c.char
                    ]  # XXX Note that omission probabilities are not normalized?!
                    chars = [
                        c for c in chars
                        if c.probability >= self.parameter['glyph_conf_cutoff']
                    ]
                    chars = sorted(chars, key=lambda k: k.probability,
                                   reverse=True)
                    return chars

                def _drop_leading_spaces(positions):
                    return list(
                        itertools.dropwhile(
                            lambda p: _sort_chars(p)[0].char == " ",
                            positions))

                def _drop_trailing_spaces(positions):
                    # same as leading, applied to the reversed sequence
                    return list(
                        reversed(_drop_leading_spaces(
                            reversed(positions))))

                def _drop_double_spaces(positions):
                    def _drop_double_spaces_generator(positions):
                        last_was_space = False
                        for p in positions:
                            if p.chars[0].char == " ":
                                if not last_was_space:
                                    yield p
                                last_was_space = True
                            else:
                                yield p
                                last_was_space = False
                    return list(_drop_double_spaces_generator(positions))

                positions = prediction.positions
                positions = _drop_leading_spaces(positions)
                positions = _drop_trailing_spaces(positions)
                positions = _drop_double_spaces(positions)
                positions = list(positions)

                line_text = ''.join(
                    _sort_chars(p)[0].char for p in positions)
                if line_text != prediction.sentence:
                    log.warning(
                        "Our own line text is not the same as Calamari's: '%s' != '%s'",
                        line_text, prediction.sentence)

                # Delete existing results
                if line.get_TextEquiv():
                    log.warning("Line '%s' already contained text results",
                                line.id)
                line.set_TextEquiv([])
                if line.get_Word():
                    log.warning(
                        "Line '%s' already contained word segmentation",
                        line.id)
                line.set_Word([])

                # Save line results
                line_conf = prediction.avg_char_probability
                line.set_TextEquiv(
                    [TextEquivType(Unicode=line_text, conf=line_conf)])

                # Save word results
                #
                # Calamari OCR does not provide word positions, so we infer word positions from a. text segmentation
                # and b. the glyph positions. This is necessary because the PAGE XML format enforces a strict
                # hierarchy of lines > words > glyphs.

                def _words(s):
                    """Split words based on spaces and include spaces as 'words'"""
                    spaces = None
                    word = ''
                    for c in s:
                        if c == ' ' and spaces is True:
                            word += c
                        elif c != ' ' and spaces is False:
                            word += c
                        else:
                            if word:
                                yield word
                            word = c
                        spaces = (c == ' ')
                    yield word

                if self.parameter['textequiv_level'] in ['word', 'glyph']:
                    word_no = 0
                    # i tracks the character offset into positions, so that
                    # each word's slice of glyph positions can be taken:
                    i = 0

                    for word_text in _words(line_text):
                        word_length = len(word_text)
                        # space runs are skipped (not annotated as words):
                        if not all(c == ' ' for c in word_text):
                            word_positions = positions[i:i + word_length]
                            word_start = word_positions[0].global_start
                            word_end = word_positions[-1].global_end

                            polygon = polygon_from_x0y0x1y1([
                                word_start, 0, word_end, line_image.height
                            ])
                            points = points_from_polygon(
                                coordinates_for_segment(
                                    polygon, None, line_coords))
                            # XXX Crop to line polygon?

                            word = WordType(id='%s_word%04d' % (line.id,
                                                                word_no),
                                            Coords=CoordsType(points))
                            word.add_TextEquiv(
                                TextEquivType(Unicode=word_text))

                            if self.parameter[
                                    'textequiv_level'] == 'glyph':
                                for glyph_no, p in enumerate(
                                        word_positions):
                                    glyph_start = p.global_start
                                    glyph_end = p.global_end

                                    polygon = polygon_from_x0y0x1y1([
                                        glyph_start, 0, glyph_end,
                                        line_image.height
                                    ])
                                    points = points_from_polygon(
                                        coordinates_for_segment(
                                            polygon, None, line_coords))

                                    glyph = GlyphType(
                                        id='%s_glyph%04d' % (word.id,
                                                             glyph_no),
                                        Coords=CoordsType(points))

                                    # Add predictions (= TextEquivs)
                                    char_index_start = 1  # Must start with 1, see https://ocr-d.github.io/page#multiple-textequivs
                                    for char_index, char in enumerate(
                                            _sort_chars(p),
                                            start=char_index_start):
                                        glyph.add_TextEquiv(
                                            TextEquivType(
                                                Unicode=char.char,
                                                index=char_index,
                                                conf=char.probability))

                                    word.add_Glyph(glyph)

                            line.add_Word(word)
                            word_no += 1

                        i += word_length

        _page_update_higher_textequiv_levels('line', pcgts)

        # Add metadata about this operation and its runtime parameters:
        self.add_metadata(pcgts)
        file_id = make_file_id(input_file, self.output_file_grp)
        pcgts.set_pcGtsId(file_id)
        self.workspace.add_file(ID=file_id,
                                file_grp=self.output_file_grp,
                                pageId=input_file.pageId,
                                mimetype=MIMETYPE_PAGE,
                                local_filename=os.path.join(
                                    self.output_file_grp,
                                    file_id + '.xml'),
                                content=to_xml(pcgts))
def process(self):
    """Performs table cell segmentation with Tesseract on the workspace.

    Open and deserialize PAGE input files and their respective images,
    then iterate over the element hierarchy down to the block level
    for table regions. If ``overwrite_regions`` is enabled and any
    layout annotation already exists inside, then remove it.

    Set up Tesseract to detect text blocks (as table cells).
    (This is not Tesseract's internal table structure recognition,
    but the general page segmentation.)
    Add each to the block at the detected coordinates.

    Produce a new output file by serialising the resulting hierarchy.
    """
    LOG = getLogger('processor.TesserocrSegmentTable')
    assert_file_grp_cardinality(self.input_file_grp, 1)
    assert_file_grp_cardinality(self.output_file_grp, 1)
    overwrite_regions = self.parameter['overwrite_regions']
    with PyTessBaseAPI(path=TESSDATA_PREFIX) as tessapi:
        # disable table detection here, so we won't get
        # tables inside tables, but try to analyse them as
        # independent text/line blocks:
        tessapi.SetVariable("textord_tabfind_find_tables", "0")
        for (n, input_file) in enumerate(self.input_files):
            page_id = input_file.pageId or input_file.ID
            LOG.info("INPUT FILE %i / %s", n, page_id)
            pcgts = page_from_file(
                self.workspace.download_file(input_file))
            self.add_metadata(pcgts)
            page = pcgts.get_Page()
            page_image, page_coords, page_image_info = self.workspace.image_from_page(
                page, page_id)
            # DPI resolution: parameter override wins, then image meta-data
            # (converting cm to inch), else let Tesseract estimate (dpi=0):
            if self.parameter['dpi'] > 0:
                dpi = self.parameter['dpi']
                LOG.info(
                    "Page '%s' images will use %d DPI from parameter override",
                    page_id, dpi)
            elif page_image_info.resolution != 1:
                dpi = page_image_info.resolution
                if page_image_info.resolutionUnit == 'cm':
                    dpi = round(dpi * 2.54)
                LOG.info(
                    "Page '%s' images will use %d DPI from image meta-data",
                    page_id, dpi)
            else:
                dpi = 0
                LOG.info(
                    "Page '%s' images will use DPI estimated from segmentation",
                    page_id)
            if dpi:
                tessapi.SetVariable('user_defined_dpi', str(dpi))
            #
            # prepare dict of reading order (region id → its reference
            # element in the ReadingOrder group tree)
            reading_order = dict()
            ro = page.get_ReadingOrder()
            if not ro:
                LOG.warning("Page '%s' contains no ReadingOrder", page_id)
                rogroup = None
            else:
                rogroup = ro.get_OrderedGroup() or ro.get_UnorderedGroup()
                page_get_reading_order(reading_order, rogroup)
            #
            # dive into regions
            regions = page.get_TableRegion()
            for region in regions:
                # delete or warn of existing regions:
                if region.get_TextRegion():
                    if overwrite_regions:
                        LOG.info(
                            'removing existing TextRegions in block "%s" of page "%s"',
                            region.id, page_id)
                        for subregion in region.get_TextRegion():
                            if subregion.id in reading_order:
                                regionref = reading_order[subregion.id]
                                # could be any of the 6 types above:
                                regionrefs = rogroup.__getattribute__(
                                    regionref.__class__.__name__.replace(
                                        'Type', ''))
                                # remove in-place
                                regionrefs.remove(regionref)
                                # TODO: adjust index to make contiguous again?
                        region.set_TextRegion([])
                    else:
                        LOG.warning(
                            'keeping existing TextRegions in block "%s" of page "%s"',
                            region.id, page_id)
                # get region image
                region_image, region_coords = self.workspace.image_from_segment(
                    region, page_image, page_coords)
                tessapi.SetImage(region_image)
                LOG.info("Detecting table cells in region '%s'", region.id)
                #
                # detect the region segments:
                tessapi.SetPageSegMode(PSM.SPARSE_TEXT)  # retrieve "cells"
                # TODO: we should XY-cut the sparse cells in regroup them into consistent cells
                layout = tessapi.AnalyseLayout()
                # find the reading-order element the new cells belong under,
                # converting plain region refs into (ordered) groups:
                roelem = reading_order.get(region.id)
                if not roelem:
                    LOG.warning(
                        "Page '%s' table region '%s' is not referenced in reading order (%s)",
                        page_id, region.id, "no target to add cells into")
                elif isinstance(
                        roelem,
                        (OrderedGroupType, OrderedGroupIndexedType)):
                    LOG.warning(
                        "Page '%s' table region '%s' already has an ordered group (%s)",
                        page_id, region.id, "cells will be appended")
                elif isinstance(
                        roelem,
                        (UnorderedGroupType, UnorderedGroupIndexedType)):
                    LOG.warning(
                        "Page '%s' table region '%s' already has an unordered group (%s)",
                        page_id, region.id, "cells will not be appended")
                    roelem = None
                elif isinstance(roelem, RegionRefIndexedType):
                    # replace regionref by group with same index and ref
                    # (which can then take the cells as subregions)
                    roelem2 = OrderedGroupIndexedType(
                        id=region.id + '_order',
                        index=roelem.index,
                        regionRef=roelem.regionRef)
                    roelem.parent_object_.add_OrderedGroupIndexed(roelem2)
                    roelem.parent_object_.get_RegionRefIndexed().remove(
                        roelem)
                    roelem = roelem2
                elif isinstance(roelem, RegionRefType):
                    # replace regionref by group with same ref
                    # (which can then take the cells as subregions)
                    roelem2 = OrderedGroupType(id=region.id + '_order',
                                               regionRef=roelem.regionRef)
                    roelem.parent_object_.add_OrderedGroup(roelem2)
                    roelem.parent_object_.get_RegionRef().remove(roelem)
                    roelem = roelem2
                self._process_region(layout, region, roelem, region_image,
                                     region_coords)
            file_id = make_file_id(input_file, self.output_file_grp)
            pcgts.set_pcGtsId(file_id)
            self.workspace.add_file(force=True,
                                    ID=file_id,
                                    file_grp=self.output_file_grp,
                                    pageId=input_file.pageId,
                                    mimetype=MIMETYPE_PAGE,
                                    local_filename=os.path.join(
                                        self.output_file_grp,
                                        file_id + '.xml'),
                                    content=to_xml(pcgts))
def process(self):
    """Dewarp pages (or regions) with a pix2pixHD model on the workspace.

    Open and deserialize PAGE input files and their respective images
    (preferring binarized, not-yet-dewarped variants), run the pix2pixHD
    dewarping model on the page (or on each text/table region, depending
    on ``operation_level``), and delegate annotation of the result to
    ``_process_segment``.

    Produce a new output file by serialising the resulting hierarchy.
    """
    # consistency fix: all sibling processors register their logger
    # under the 'processor.' namespace
    LOG = getLogger('processor.OcrdAnybaseocrDewarper')
    assert_file_grp_cardinality(self.input_file_grp, 1)
    assert_file_grp_cardinality(self.output_file_grp, 1)
    if self.parameter['gpu_id'] > -1 and not torch.cuda.is_available():
        LOG.warning("torch cannot detect CUDA installation.")
        self.parameter['gpu_id'] = -1

    model_path = Path(self.resolve_resource(self.parameter['model_path']))
    if not model_path.is_file():
        LOG.error("""\
            pix2pixHD model file was not found at '%s'. Make sure this file exists.
        """ % model_path)
        sys.exit(1)
    opt, model = prepare_options(
        gpu_id=self.parameter['gpu_id'],
        dataroot=str(Path(self.workspace.directory, self.input_file_grp)),
        model_path=model_path,
        resize_or_crop=self.parameter['imgresize'],
        loadSize=self.parameter['resizeHeight'],
        fineSize=self.parameter['resizeWidth'],
    )

    oplevel = self.parameter['operation_level']
    for (n, input_file) in enumerate(self.input_files):
        page_id = input_file.pageId or input_file.ID
        # consistency fix: include the running index like the other processors
        LOG.info("INPUT FILE %i / %s", n, page_id)
        pcgts = page_from_file(self.workspace.download_file(input_file))
        self.add_metadata(pcgts)
        page = pcgts.get_Page()
        try:
            # prefer a binarized, not-yet-dewarped image variant
            page_image, page_xywh, _ = self.workspace.image_from_page(
                page, page_id, feature_filter='dewarped',
                feature_selector='binarized'
            )  # images should be deskewed and cropped
        except Exception:
            # deliberate best-effort fallback: accept non-binarized images
            page_image, page_xywh, _ = self.workspace.image_from_page(
                page, page_id, feature_filter='dewarped'
            )  # images should be deskewed and cropped
        if oplevel == 'page':
            dataset = prepare_data(opt, page_image)
            orig_img_size = page_image.size
            self._process_segment(model, dataset, page, page_xywh, page_id,
                                  input_file, orig_img_size, n)
        else:
            regions = page.get_TextRegion() + page.get_TableRegion(
            )  # get all regions?
            if not regions:
                LOG.warning("Page '%s' contains no text regions", page_id)
            for region in regions:
                region_image, region_xywh = self.workspace.image_from_segment(
                    region, page_image, page_xywh)
                # TODO: not tested on regions
                # TODO: region has to exist as a physical file to be processed by pix2pixHD
                dataset = prepare_data(opt, region_image)
                orig_img_size = region_image.size
                self._process_segment(model, dataset, page, region_xywh,
                                      region.id, input_file, orig_img_size,
                                      n)

        file_id = make_file_id(input_file, self.output_file_grp)
        pcgts.set_pcGtsId(file_id)
        self.workspace.add_file(ID=file_id,
                                file_grp=self.output_file_grp,
                                pageId=input_file.pageId,
                                mimetype=MIMETYPE_PAGE,
                                local_filename=os.path.join(
                                    self.output_file_grp, file_id + '.xml'),
                                content=to_xml(pcgts).encode('utf-8'))
def process(self):
    """Performs page cropping with Tesseract on the workspace.

    Open and deserialize PAGE input files and their respective images.
    Set up Tesseract to detect text blocks on each page, and find
    the largest coordinate extent spanning all of them. Use this
    extent in defining a Border, and add that to the page.

    Moreover, crop the original image accordingly, and reference the
    resulting image file as AlternativeImage in the Page element.

    Add the new image file to the workspace along with the output fileGrp,
    and using a file ID with suffix ``.IMG-CROP`` along with further
    identification of the input element.

    Produce new output files by serialising the resulting hierarchy.
    """
    LOG = getLogger('processor.TesserocrCrop')
    assert_file_grp_cardinality(self.input_file_grp, 1)
    assert_file_grp_cardinality(self.output_file_grp, 1)
    padding = self.parameter['padding']
    with tesserocr.PyTessBaseAPI(path=TESSDATA_PREFIX) as tessapi:
        # disable table detection here (tables count as text blocks),
        # because we do not want to risk confusing the spine with
        # a column separator and thus creeping into a neighbouring
        # page:
        tessapi.SetVariable("textord_tabfind_find_tables", "0")
        for (n, input_file) in enumerate(self.input_files):
            page_id = input_file.pageId or input_file.ID
            LOG.info("INPUT FILE %i / %s", n, page_id)
            pcgts = page_from_file(self.workspace.download_file(input_file))
            self.add_metadata(pcgts)
            page = pcgts.get_Page()
            # warn of existing Border:
            border = page.get_Border()
            if border:
                left, top, right, bottom = bbox_from_points(
                    border.get_Coords().points)
                LOG.warning('Overwriting existing Border: %i:%i,%i:%i',
                            left, top, right, bottom)
            page_image, page_xywh, page_image_info = self.workspace.image_from_page(
                page, page_id,
                # image must not have been cropped already,
                # abort if no such image can be produced:
                feature_filter='cropped')
            # determine the DPI to tell Tesseract: parameter override,
            # then image metadata, then Tesseract's own estimate
            if self.parameter['dpi'] > 0:
                dpi = self.parameter['dpi']
                LOG.info("Page '%s' images will use %d DPI from parameter override",
                         page_id, dpi)
            elif page_image_info.resolution != 1:
                dpi = page_image_info.resolution
                if page_image_info.resolutionUnit == 'cm':
                    # convert pixels-per-cm to pixels-per-inch
                    dpi = round(dpi * 2.54)
                LOG.info("Page '%s' images will use %d DPI from image meta-data",
                         page_id, dpi)
            else:
                dpi = 0
                LOG.info("Page '%s' images will use DPI estimated from segmentation",
                         page_id)
            if dpi:
                tessapi.SetVariable('user_defined_dpi', str(dpi))
                # scale size thresholds below relative to 300 DPI
                zoom = 300 / dpi
            else:
                zoom = 1
            # warn of existing segmentation:
            regions = page.get_TextRegion()
            if regions:
                min_x = page_image.width
                min_y = page_image.height
                max_x = 0
                max_y = 0
                for region in regions:
                    left, top, right, bottom = bbox_from_points(
                        region.get_Coords().points)
                    min_x = min(min_x, left)
                    min_y = min(min_y, top)
                    max_x = max(max_x, right)
                    max_y = max(max_y, bottom)
                LOG.warning('Ignoring extent from existing TextRegions: %i:%i,%i:%i',
                            min_x, max_x, min_y, max_y)
            LOG.debug("Cropping with Tesseract")
            tessapi.SetImage(page_image)
            # PSM.SPARSE_TEXT: get as much text as possible in no particular order
            # PSM.AUTO (default): includes tables (dangerous)
            tessapi.SetPageSegMode(tesserocr.PSM.SPARSE_TEXT)
            #
            # helper variables for saving the box coordinates
            #
            min_x = page_image.width
            min_y = page_image.height
            max_x = 0
            max_y = 0
            # iterate over all text blocks and compare their
            # bbox extent to the running min and max values
            for component in tessapi.GetComponentImages(tesserocr.RIL.BLOCK, True):
                image, xywh, index, _ = component
                #
                # the region reference in the reading order element
                #
                ID = "region%04d" % index
                left, top, right, bottom = bbox_from_xywh(xywh)
                LOG.debug("Detected text region '%s': %i:%i,%i:%i",
                          ID, left, right, top, bottom)
                # filter region results:
                bin_bbox = image.getbbox()
                if not bin_bbox:
                    # this does happen!
                    LOG.info("Ignoring region '%s' because its binarization is empty", ID)
                    continue
                width = bin_bbox[2] - bin_bbox[0]
                if width < 25 / zoom:
                    # we must be conservative here: page numbers are tiny regions, too!
                    LOG.info("Ignoring region '%s' because its width is too small (%d)",
                             ID, width)
                    continue
                height = bin_bbox[3] - bin_bbox[1]
                if height < 25 / zoom:
                    # we must be conservative here: page numbers are tiny regions, too!
                    LOG.debug("Ignoring region '%s' because its height is too small (%d)",
                              ID, height)
                    continue
                min_x = min(min_x, left)
                min_y = min(min_y, top)
                max_x = max(max_x, right)
                max_y = max(max_y, bottom)
                LOG.info("Updated page border: %i:%i,%i:%i", min_x, max_x, min_y, max_y)
            # FIX: compute file_id before the branch — the original assigned it
            # only on the success path, so set_pcGtsId/add_file below raised
            # NameError whenever no valid extent was found.
            file_id = make_file_id(input_file, self.output_file_grp)
            #
            # set the identified page border
            #
            if min_x < max_x and min_y < max_y:
                # add padding:
                min_x = max(min_x - padding, 0)
                max_x = min(max_x + padding, page_image.width)
                min_y = max(min_y - padding, 0)
                max_y = min(max_y + padding, page_image.height)
                LOG.info("Padded page border: %i:%i,%i:%i", min_x, max_x, min_y, max_y)
                polygon = polygon_from_bbox(min_x, min_y, max_x, max_y)
                polygon = coordinates_for_segment(polygon, page_image, page_xywh)
                polygon = polygon_for_parent(polygon, page)
                border = BorderType(Coords=CoordsType(points_from_polygon(polygon)))
                # intersection with parent could have changed bbox,
                # so recalculate:
                bbox = bbox_from_polygon(
                    coordinates_of_segment(border, page_image, page_xywh))
                # update PAGE (annotate border):
                page.set_Border(border)
                # update METS (add the image file):
                page_image = crop_image(page_image, box=bbox)
                page_xywh['features'] += ',cropped'
                file_path = self.workspace.save_image_file(
                    page_image, file_id + '.IMG-CROP',
                    page_id=input_file.pageId,
                    file_grp=self.output_file_grp)
                # update PAGE (reference the image file):
                page.add_AlternativeImage(AlternativeImageType(
                    filename=file_path, comments=page_xywh['features']))
            else:
                LOG.error("Cannot find valid extent for page '%s'", page_id)
            pcgts.set_pcGtsId(file_id)
            self.workspace.add_file(ID=file_id,
                                    file_grp=self.output_file_grp,
                                    pageId=input_file.pageId,
                                    mimetype=MIMETYPE_PAGE,
                                    local_filename=os.path.join(
                                        self.output_file_grp, file_id + '.xml'),
                                    content=to_xml(pcgts))
def process(self):
    """Perform text recognition with Calamari on the workspace.

    For each input PAGE file, iterate over text regions and their lines,
    predict every line image with all model folds, vote the fold results,
    and annotate the voted text (and, depending on the ``textequiv_level``
    parameter, word and glyph segmentation with per-glyph alternatives)
    back into the PAGE hierarchy. Produce one output PAGE file per input.
    """
    assert_file_grp_cardinality(self.input_file_grp, 1)
    assert_file_grp_cardinality(self.output_file_grp, 1)
    self._init_calamari()
    for (n, input_file) in enumerate(self.input_files):
        page_id = input_file.pageId or input_file.ID
        log.info("INPUT FILE %i / %s", n, page_id)
        pcgts = page_from_file(self.workspace.download_file(input_file))
        page = pcgts.get_Page()
        page_image, page_xywh, page_image_info = self.workspace.image_from_page(
            page, page_id)
        for region in pcgts.get_Page().get_TextRegion():
            region_image, region_xywh = self.workspace.image_from_segment(
                region, page_image, page_xywh)
            textlines = region.get_TextLine()
            log.info("About to recognize %i lines of region '%s'",
                     len(textlines), region.id)
            for (line_no, line) in enumerate(textlines):
                log.debug("Recognizing line '%s' in region '%s'",
                          line.id, region.id)
                line_image, line_coords = self.workspace.image_from_segment(
                    line, region_image, region_xywh)
                line_image_np = np.array(line_image, dtype=np.uint8)
                # one raw prediction per model fold for this line
                raw_results = list(
                    self.predictor.predict_raw([line_image_np],
                                               progress_bar=False))[0]
                for i, p in enumerate(raw_results):
                    p.prediction.id = "fold_{}".format(i)
                # combine the fold predictions into a single voted result
                prediction = self.voter.vote_prediction_result(raw_results)
                prediction.id = "voted"

                # Build line text on our own
                #
                # Calamari does whitespace post-processing on prediction.sentence, while it does not do the same
                # on prediction.positions. Do it on our own to have consistency.
                #
                # XXX Check Calamari's built-in post-processing on prediction.sentence

                def _sort_chars(p):
                    """Filter and sort chars of prediction p"""
                    chars = p.chars
                    # drop omission pseudo-characters (empty char)
                    chars = [c for c in chars if c.char]  # XXX Note that omission probabilities are not normalized?!
                    # drop low-confidence alternatives below the cutoff
                    chars = [c for c in chars
                             if c.probability >= self.parameter['glyph_conf_cutoff']]
                    # best (most probable) candidate first
                    chars = sorted(chars, key=lambda k: k.probability, reverse=True)
                    return chars

                def _drop_leading_spaces(positions):
                    # strip positions whose best char is a space, from the left
                    return list(
                        itertools.dropwhile(
                            lambda p: _sort_chars(p)[0].char == " ", positions))

                def _drop_trailing_spaces(positions):
                    # strip from the right by reversing twice
                    return list(reversed(_drop_leading_spaces(reversed(positions))))

                def _drop_double_spaces(positions):
                    # collapse runs of space positions into a single one
                    def _drop_double_spaces_generator(positions):
                        last_was_space = False
                        for p in positions:
                            if p.chars[0].char == " ":
                                if not last_was_space:
                                    yield p
                                last_was_space = True
                            else:
                                yield p
                                last_was_space = False
                    return list(_drop_double_spaces_generator(positions))

                positions = prediction.positions
                positions = _drop_leading_spaces(positions)
                positions = _drop_trailing_spaces(positions)
                positions = _drop_double_spaces(positions)
                positions = list(positions)

                # line text = best candidate of each remaining position
                line_text = ''.join(_sort_chars(p)[0].char for p in positions)
                if line_text != prediction.sentence:
                    log.warning(
                        "Our own line text is not the same as Calamari's: '%s' != '%s'",
                        line_text, prediction.sentence)

                # Delete existing results
                if line.get_TextEquiv():
                    log.warning("Line '%s' already contained text results", line.id)
                line.set_TextEquiv([])
                if line.get_Word():
                    log.warning("Line '%s' already contained word segmentation",
                                line.id)
                line.set_Word([])

                # Save line results
                line_conf = prediction.avg_char_probability
                line.set_TextEquiv(
                    [TextEquivType(Unicode=line_text, conf=line_conf)])

                # Save word results
                #
                # Calamari OCR does not provide word positions, so we infer word positions from a. text segmentation
                # and b. the glyph positions. This is necessary because the PAGE XML format enforces a strict
                # hierarchy of lines > words > glyphs.

                def _words(s):
                    """Split words based on spaces and include spaces as 'words'"""
                    spaces = None
                    word = ''
                    for c in s:
                        if c == ' ' and spaces is True:
                            word += c
                        elif c != ' ' and spaces is False:
                            word += c
                        else:
                            if word:
                                yield word
                            word = c
                            spaces = (c == ' ')
                    yield word

                if self.parameter['textequiv_level'] in ['word', 'glyph']:
                    word_no = 0
                    # i indexes into positions in lockstep with line_text,
                    # so each word's slice of glyph positions can be taken
                    i = 0
                    for word_text in _words(line_text):
                        word_length = len(word_text)
                        if not all(c == ' ' for c in word_text):
                            word_positions = positions[i:i + word_length]
                            word_start = word_positions[0].global_start
                            word_end = word_positions[-1].global_end
                            polygon = polygon_from_x0y0x1y1(
                                [word_start, 0, word_end, line_image.height])
                            points = points_from_polygon(
                                coordinates_for_segment(polygon, None, line_coords))
                            # XXX Crop to line polygon?
                            word = WordType(id='%s_word%04d' % (line.id, word_no),
                                            Coords=CoordsType(points))
                            word.add_TextEquiv(TextEquivType(Unicode=word_text))
                            if self.parameter['textequiv_level'] == 'glyph':
                                for glyph_no, p in enumerate(word_positions):
                                    glyph_start = p.global_start
                                    glyph_end = p.global_end
                                    polygon = polygon_from_x0y0x1y1(
                                        [glyph_start, 0,
                                         glyph_end, line_image.height])
                                    points = points_from_polygon(
                                        coordinates_for_segment(
                                            polygon, None, line_coords))
                                    glyph = GlyphType(
                                        id='%s_glyph%04d' % (word.id, glyph_no),
                                        Coords=CoordsType(points))
                                    # Add predictions (= TextEquivs)
                                    char_index_start = 1  # Must start with 1, see https://ocr-d.github.io/page#multiple-textequivs
                                    for char_index, char in enumerate(
                                            _sort_chars(p),
                                            start=char_index_start):
                                        glyph.add_TextEquiv(
                                            TextEquivType(
                                                Unicode=char.char,
                                                index=char_index,
                                                conf=char.probability))
                                    word.add_Glyph(glyph)
                            line.add_Word(word)
                            word_no += 1
                        i += word_length
        # propagate line text up to region/page level
        _page_update_higher_textequiv_levels('line', pcgts)
        # Add metadata about this operation and its runtime parameters:
        metadata = pcgts.get_Metadata()  # ensured by from_file()
        metadata.add_MetadataItem(
            MetadataItemType(
                type_="processingStep",
                name=self.ocrd_tool['steps'][0],
                value=TOOL,
                Labels=[
                    LabelsType(externalModel="ocrd-tool",
                               externalId="parameters",
                               Label=[LabelType(type_=name,
                                                value=self.parameter[name])
                                      for name in self.parameter.keys()])
                ]))
        # update METS (add the PAGE file):
        file_id = make_file_id(input_file, self.output_file_grp)
        pcgts.set_pcGtsId(file_id)
        self.workspace.add_file(ID=file_id,
                                file_grp=self.output_file_grp,
                                pageId=input_file.pageId,
                                mimetype=MIMETYPE_PAGE,
                                local_filename=os.path.join(
                                    self.output_file_grp, file_id + '.xml'),
                                content=to_xml(pcgts))
def process(self):
    """Recognize lines / words / glyphs of the workspace.

    Open and deserialise each PAGE input file and its respective image,
    then iterate over the element hierarchy down to the requested
    ``textequiv_level``. If any layout annotation below the line level
    already exists, then remove it (regardless of ``textequiv_level``).

    Set up Ocropy to recognise each text line (via coordinates into
    the higher-level image, or from the alternative image; the image
    must have been binarised/grayscale-normalised, deskewed and dewarped
    already). Rescale and pad the image, then recognize.

    Create new elements below the line level, if necessary.
    Put text results and confidence values into new TextEquiv at
    ``textequiv_level``, and make the higher levels consistent with that
    up to the line level (by concatenation joined by whitespace).

    If a TextLine contained any previous text annotation, then compare
    that with the new result by aligning characters and computing the
    Levenshtein distance. Aggregate these scores for each file and print
    the line-wise and the total character error rates (CER).

    Produce a new output file by serialising the resulting hierarchy.
    """
    assert_file_grp_cardinality(self.input_file_grp, 1)
    assert_file_grp_cardinality(self.output_file_grp, 1)
    maxlevel = self.parameter['textequiv_level']
    for (n, input_file) in enumerate(self.input_files):
        self.logger.info("INPUT FILE %i / %s", n,
                         input_file.pageId or input_file.ID)
        pcgts = page_from_file(self.workspace.download_file(input_file))
        self.add_metadata(pcgts)
        page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID  # (PageType has no id)
        page = pcgts.get_Page()
        page_image, page_coords, _ = self.workspace.image_from_page(
            page, page_id)
        self.logger.info("Recognizing text in page '%s'", page_id)
        # region, line, word, or glyph level:
        regions = page.get_AllRegions(classes=['Text'])
        if not regions:
            self.logger.warning("Page '%s' contains no text regions", page_id)
        self.process_regions(regions, maxlevel, page_image, page_coords)
        # update METS (add the PAGE file):
        # FIX: make_file_id expects the OcrdFile object, not its ID string —
        # the original passed input_file.ID, which fails inside make_file_id
        # (it reads .ID/.fileGrp attributes). Every other processor in this
        # file passes the file object.
        file_id = make_file_id(input_file, self.output_file_grp)
        file_path = os.path.join(self.output_file_grp, file_id + '.xml')
        pcgts.set_pcGtsId(file_id)
        out = self.workspace.add_file(ID=file_id,
                                      file_grp=self.output_file_grp,
                                      pageId=input_file.pageId,
                                      local_filename=file_path,
                                      mimetype=MIMETYPE_PAGE,
                                      content=to_xml(pcgts))
        self.logger.info('created file ID: %s, file_grp: %s, path: %s',
                         file_id, self.output_file_grp, out.local_filename)
def process(self):
    """Segment page images into regions with a Mask R-CNN block model.

    Load the block-segmentation model and weights, then for each input
    PAGE file fetch a raw (non-binarized, non-cropped) page image and an
    optional clipped pixel mask, and delegate detection and annotation to
    ``_process_segment``. Only ``operation_level == "page"`` is supported.
    """
    assert_file_grp_cardinality(self.input_file_grp, 1)
    assert_file_grp_cardinality(self.output_file_grp, 1)
    LOG = getLogger('OcrdAnybaseocrBlockSegmenter')
    if not tf.test.is_gpu_available():
        LOG.warning(
            "Tensorflow cannot detect CUDA installation. Running without GPU will be slow."
        )
    model_path = Path(self.parameter['block_segmentation_model'])
    model_weights = Path(self.parameter['block_segmentation_weights'])
    confidence = self.parameter['DETECTION_MIN_CONFIDENCE']
    # Mask R-CNN class index -> region type label ('BG' = background)
    class_names = [
        'BG', 'page-number', 'paragraph', 'catch-word', 'heading',
        'drop-capital', 'signature-mark', 'header', 'marginalia', 'footnote',
        'footnote-continued', 'caption', 'endnote', 'footer', 'keynote',
        'image', 'table', 'graphics'
    ]
    if not Path(model_weights).is_file():
        LOG.error(
            """\
Block Segmentation model weights file was not found at '%s'. Make sure the `model_weights` parameter points to the local model weights path.
""", model_weights)
        sys.exit(1)
    config = InferenceConfig(confidence)
    mrcnn_model = model.MaskRCNN(mode="inference",
                                 model_dir=str(model_path),
                                 config=config)
    mrcnn_model.load_weights(str(model_weights), by_name=True)
    oplevel = self.parameter['operation_level']
    for (n, input_file) in enumerate(self.input_files):
        pcgts = page_from_file(self.workspace.download_file(input_file))
        self.add_metadata(pcgts)
        page = pcgts.get_Page()
        page_id = input_file.pageId or input_file.ID
        page_image, page_xywh, page_image_info = self.workspace.image_from_page(
            page, page_id,
            feature_filter='binarized,deskewed,cropped,clipped,non_text')
        # try to load pixel masks
        try:
            # FIX: only the image is used — discard the unused coords/info
            mask_image, _, _ = self.workspace.image_from_page(
                page, page_id,
                feature_selector='clipped',
                feature_filter='binarized,deskewed,cropped,non_text')
        except Exception:
            # FIX: narrowed from a bare `except:` which would also swallow
            # KeyboardInterrupt/SystemExit; a missing mask is non-fatal
            mask_image = None
        # Display Warning If image segment results already exist or not in StructMap?
        regions = page.get_TextRegion() + page.get_TableRegion()
        if regions:
            LOG.warning("Image already has text segments!")
        if oplevel == "page":
            self._process_segment(page_image, page, page_xywh, page_id,
                                  input_file, n, mrcnn_model, class_names,
                                  mask_image)
        else:
            LOG.warning('Operation level %s, but should be "page".', oplevel)
            break
        # update METS (add the PAGE file):
        file_id = make_file_id(input_file, self.output_file_grp)
        pcgts.set_pcGtsId(file_id)
        self.workspace.add_file(ID=file_id,
                                file_grp=self.output_file_grp,
                                pageId=input_file.pageId,
                                mimetype=MIMETYPE_PAGE,
                                local_filename=os.path.join(
                                    self.output_file_grp, file_id + '.xml'),
                                content=to_xml(pcgts).encode('utf-8'))
def process(self):
    """Perform OCR post-correction with encoder-attention-decoder ANN on the workspace.

    Open and deserialise PAGE input files, then iterate over the element hierarchy
    down to the requested `textequiv_level`, making sequences of TextEquiv objects
    as lists of lines. Concatenate their string values, obeying rules of implicit
    whitespace, and map the string positions where the objects start.

    Next, transcode the input lines into output lines in parallel, and use
    the retrieved soft alignment scores to calculate hard alignment paths
    between input and output string via Viterbi decoding. Then use those
    to map back the start positions and overwrite each TextEquiv with its
    new content, paying special attention to whitespace:

    Distribute edits such that whitespace objects cannot become more than
    whitespace (or be deleted) and that non-whitespace objects must not
    start or end with whitespace (but may contain new whitespace in the
    middle).

    Subsequently, unless processing on the `line` level, make the Word
    segmentation consistent with that result again: merge around deleted
    whitespace tokens and split at whitespace inside non-whitespace tokens.

    Finally, make the levels above `textequiv_level` consistent with that
    textual result (via concatenation joined by whitespace).

    Produce new output files by serialising the resulting hierarchy.
    """
    assert_file_grp_cardinality(self.input_file_grp, 1)
    assert_file_grp_cardinality(self.output_file_grp, 1)
    # Dragging Word/TextLine references along in all lists besides TextEquiv
    # is necessary because the generateDS version of the PAGE-XML model
    # has no references upwards in the hierarchy (from TextEquiv to containing
    # elements, from Glyph/Word/TextLine to Word/TextLine/TextRegion), and
    # its classes are not hashable.
    level = self.parameter['textequiv_level']
    for n, input_file in enumerate(self.input_files):
        self.logger.info("INPUT FILE %i / %s", n,
                         input_file.pageId or input_file.ID)
        pcgts = page_from_file(self.workspace.download_file(input_file))
        page_id = input_file.pageId or input_file.ID  # (PageType has no id)
        self.logger.info("Correcting text in page '%s' at the %s level",
                         page_id, level)
        # annotate processing metadata:
        self.add_metadata(pcgts)
        # get textequiv references for all lines:
        # FIXME: conf with TextEquiv alternatives
        line_sequences = _page_get_line_sequences_at(level, pcgts)
        # concatenate to strings and get dict of start positions to refs:
        input_lines, conf, textequiv_starts, word_starts, textline_starts = (
            _line_sequences2string_sequences(
                self.s2s.mapping[0], line_sequences,
                charmap=self.parameter['charmap']))
        # correct string and get input-output alignment:
        # FIXME: split into self.batch_size chunks
        output_lines, output_probs, output_scores, alignments = (
            self.s2s.correct_lines(input_lines, conf,
                                   fast=self.parameter['fast_mode'],
                                   greedy=self.parameter['fast_mode']))
        # re-align (from alignment scores) and overwrite the textequiv references:
        # NOTE: the eight sequences below are parallel — one entry per line
        for (input_line, output_line, output_prob, output_score, alignment,
             textequivs, words, textlines) in zip(
                 input_lines, output_lines, output_probs, output_scores,
                 alignments, textequiv_starts, word_starts, textline_starts):
            self.logger.debug('"%s" -> "%s"', input_line.rstrip('\n'),
                              output_line.rstrip('\n'))
            # convert soft scores (seen from output) to hard path (seen from input):
            # 1/voc_size serves as the alignment score floor here —
            # presumably to smooth unseen output positions; TODO confirm
            realignment = _alignment2path(alignment, len(input_line),
                                          len(output_line),
                                          1. / self.s2s.voc_size)
            # overwrite TextEquiv references:
            new_sequence = _update_sequence(
                input_line, output_line, output_prob,
                output_score, realignment,
                textequivs, words, textlines)
            # update Word segmentation:
            if level != 'line':
                _resegment_sequence(new_sequence, level)
            self.logger.info('corrected line with %d elements, ppl: %.3f',
                             len(new_sequence), np.exp(output_score))
        # make higher levels consistent again:
        page_update_higher_textequiv_levels(level, pcgts)
        # write back result to new annotation:
        file_id = make_file_id(input_file, self.output_file_grp)
        pcgts.set_pcGtsId(file_id)
        file_path = os.path.join(self.output_file_grp, file_id + '.xml')
        self.workspace.add_file(ID=file_id,
                                file_grp=self.output_file_grp,
                                pageId=input_file.pageId,
                                local_filename=file_path,
                                mimetype=MIMETYPE_PAGE,
                                content=to_xml(pcgts))