def add_metadata(self, pcgts):
    """
    Add a PAGE-XML :py:class:`~ocrd_models.ocrd_page.MetadataItemType`
    ``MetadataItem`` describing the processing step and runtime parameters
    to :py:class:`~ocrd_models.ocrd_page.PcGtsType` ``pcgts``.
    """
    # one label per runtime parameter
    parameter_labels = LabelsType(
        externalModel="ocrd-tool",
        externalId="parameters",
        Label=[LabelType(type_=key, value=val)
               for key, val in self.parameter.items()])
    # tool version plus the ocrd/core version it ran under
    version_labels = LabelsType(
        externalModel="ocrd-tool",
        externalId="version",
        Label=[LabelType(type_=self.ocrd_tool['executable'],
                         value=self.version),
               LabelType(type_='ocrd/core', value=OCRD_VERSION)])
    step = MetadataItemType(type_="processingStep",
                            name=self.ocrd_tool['steps'][0],
                            value=self.ocrd_tool['executable'],
                            Labels=[parameter_labels, version_labels])
    pcgts.get_Metadata().add_MetadataItem(step)
def process(self):
    """Deskew pages of the workspace at page level.

    For each input file: annotate this processing step in the PAGE
    metadata, fetch the page image (filtering out already-deskewed
    variants), delegate to :py:meth:`_process_segment`, and write the
    updated PAGE-XML into the page output file group.
    """
    try:
        # output_file_grp may be a comma-joined "PAGE_GRP,IMAGE_GRP" spec
        self.page_grp, self.image_grp = self.output_file_grp.split(',')
    except ValueError:
        self.page_grp = self.output_file_grp
        self.image_grp = FALLBACK_IMAGE_GRP
        LOG.info(
            "No output file group for images specified, falling back to '%s'",
            FALLBACK_IMAGE_GRP)
    oplevel = self.parameter['operation_level']
    for (n, input_file) in enumerate(self.input_files):
        page_id = input_file.pageId or input_file.ID
        LOG.info("INPUT FILE %i / %s", n, page_id)
        pcgts = page_from_file(self.workspace.download_file(input_file))
        # annotate this processing step and its runtime parameters:
        metadata = pcgts.get_Metadata()
        metadata.add_MetadataItem(
            MetadataItemType(
                type_="processingStep",
                name=self.ocrd_tool['steps'][0],
                value=TOOL,
                Labels=[
                    LabelsType(
                        #externalRef="parameters",
                        Label=[LabelType(type_=name, value=self.parameter[name])
                               for name in self.parameter.keys()])
                ]))
        page = pcgts.get_Page()
        angle = page.get_orientation()
        if angle:
            LOG.warning('Overwriting existing deskewing angle: %i', angle)
        page_image, page_xywh, page_image_info = self.workspace.image_from_page(
            page, page_id, feature_filter='deskewed')
        if oplevel == "page":
            self._process_segment(page_image, page, page_xywh, page_id,
                                  input_file, n)
        else:
            LOG.warning('Operation level %s, but should be "page".', oplevel)
            break
        # Use input_file's basename for the new file -
        # this way the files retain the same basenames.
        # BUGFIX: derive ID/filename from self.page_grp (the split-off PAGE
        # group) instead of self.output_file_grp, which may still contain the
        # comma-joined "PAGE,IMAGE" specification.
        file_id = input_file.ID.replace(self.input_file_grp, self.page_grp)
        if file_id == input_file.ID:
            file_id = concat_padded(self.page_grp, n)
        self.workspace.add_file(ID=file_id,
                                file_grp=self.page_grp,
                                pageId=input_file.pageId,
                                mimetype=MIMETYPE_PAGE,
                                local_filename=os.path.join(
                                    self.page_grp, file_id + '.xml'),
                                content=to_xml(pcgts).encode('utf-8'))
def process(self):
    """Classify page layout with a CNN model (GPU required).

    Loads the Keras model from ``model_path`` and the class-index
    mapping from ``class_mapping_path``, then for each input page
    fetches the binarized page image, resizes it to the model's fixed
    500x600 input, predicts the layout class, and records the result
    in the METS structural map.
    """
    if not tf.test.is_gpu_available():
        LOG.error("Your system has no CUDA installed. No GPU detected.")
        sys.exit(1)
    model_path = Path(self.parameter['model_path'])
    class_mapper_path = Path(self.parameter['class_mapping_path'])
    # model_path is already a Path; no need to re-wrap it
    if not model_path.is_file():
        LOG.error("""\
Layout Classfication model was not found at '%s'. Make sure the `model_path` parameter
points to the local model path.
model can be downloaded from http://url
""" % model_path)
        sys.exit(1)
    LOG.info('Loading model from file %s', model_path)
    model = self.create_model(str(model_path))
    # load the class-index mapping; use a context manager so the file
    # handle is not leaked (the original left it open)
    with open(str(class_mapper_path), "rb") as pickle_in:
        class_indices = pickle.load(pickle_in)
    # invert mapping: predicted index -> class label
    label_mapping = dict((v, k) for k, v in class_indices.items())
    for (n, input_file) in enumerate(self.input_files):
        pcgts = page_from_file(self.workspace.download_file(input_file))
        fname = pcgts.get_Page().imageFilename
        page_id = input_file.pageId or input_file.ID
        metadata = pcgts.get_Metadata()
        metadata.add_MetadataItem(
            MetadataItemType(
                type_="processingStep",
                name=self.ocrd_tool['steps'][0],
                value=TOOL,
                Labels=[
                    LabelsType(
                        #externalRef="parameter",
                        Label=[LabelType(type_=name, value=self.parameter[name])
                               for name in self.parameter.keys()])
                ]))
        page = pcgts.get_Page()
        LOG.info("INPUT FILE %s", page_id)
        page_image, page_xywh, page_image_info = self.workspace.image_from_page(
            page, page_id, feature_selector='binarized')
        # scale to the fixed model input size, normalize to [0, 1], and
        # add batch and channel axes
        img_array = ocrolib.pil2array(
            page_image.resize((500, 600), Image.ANTIALIAS))
        img_array = img_array * 1. / 255.
        img_array = img_array[np.newaxis, :, :, np.newaxis]
        results = self.start_test(model, img_array, fname, label_mapping)
        LOG.info(results)
        self.workspace.mets.set_physical_page_for_file(
            "PHYS_000" + str(n), input_file)
        self.create_logmap_smlink(pcgts)
        self.write_to_mets(results, "PHYS_000" + str(n))
def process(self):
    """Process every page of the workspace at page level.

    Downloads each input file, annotates this processing step in the
    PAGE metadata, renders the page image, hands it to
    :py:meth:`_process_segment`, and serializes the resulting PAGE-XML.
    """
    try:
        self.page_grp, self.image_grp = self.output_file_grp.split(',')
    except ValueError:
        self.page_grp = self.output_file_grp
        self.image_grp = FALLBACK_IMAGE_GRP
        LOG.info(
            "No output file group for images specified, falling back to '%s'",
            FALLBACK_IMAGE_GRP)
    oplevel = self.parameter['operation_level']
    for n, input_file in enumerate(self.input_files):
        # ID used for the derived image artifact
        image_file_id = input_file.ID.replace(self.input_file_grp,
                                              self.image_grp)
        page_id = input_file.pageId or input_file.ID
        LOG.info("INPUT FILE %i / %s", n, page_id)
        pcgts = page_from_file(self.workspace.download_file(input_file))
        # record this processing step with its runtime parameters
        pcgts.get_Metadata().add_MetadataItem(
            MetadataItemType(
                type_="processingStep",
                name=self.ocrd_tool['steps'][0],
                value=TOOL,
                Labels=[
                    LabelsType(
                        #externalRef="parameters",
                        Label=[LabelType(type_=key, value=self.parameter[key])
                               for key in self.parameter])
                ]))
        page = pcgts.get_Page()
        page_image, page_xywh, page_image_info = self.workspace.image_from_page(
            page, page_id)
        if oplevel == "page":
            self._process_segment(page, page_image.filename, page_id,
                                  image_file_id + ".ds")
        file_id = input_file.ID.replace(self.input_file_grp,
                                        self.output_file_grp)
        if file_id == input_file.ID:
            file_id = concat_padded(self.output_file_grp, n)
        self.workspace.add_file(ID=file_id,
                                file_grp=self.output_file_grp,
                                pageId=input_file.pageId,
                                mimetype=MIMETYPE_PAGE,
                                local_filename=os.path.join(
                                    self.output_file_grp, file_id + '.xml'),
                                content=to_xml(pcgts).encode('utf-8'))
def add_metadata(self, pcgts):
    """Record this processing step and its parameters on ``pcgts``.

    Appends a ``processingStep`` ``MetadataItem`` carrying every runtime
    parameter as a label.
    """
    parameter_labels = LabelsType(
        externalModel="ocrd-tool",
        externalId="parameters",
        Label=[LabelType(type_=key, value=val)
               for key, val in self.parameter.items()])
    pcgts.get_Metadata().add_MetadataItem(
        MetadataItemType(type_="processingStep",
                         name=self.ocrd_tool['steps'][0],
                         value=self.ocrd_tool['executable'],
                         Labels=[parameter_labels]))
def _add_my_metadata_to_page(self, pcgts):
    # Annotate this tool's processing step (including all runtime
    # parameters) in the PAGE metadata of ``pcgts``.
    step_name = OCRD_TOOL['tools']['ocrd-cor-asv-fst-process']['steps'][0]
    parameter_labels = LabelsType(
        externalRef='parameters',
        Label=[LabelType(type_=key, value=self.parameter[key])
               for key in self.parameter])
    pcgts.get_Metadata().add_MetadataItem(
        MetadataItemType(type_='processingStep',
                         name=step_name,
                         value='ocrd-cor-asv-fst-process',
                         Labels=[parameter_labels]))
def process(self):
    """Repair segment-order inconsistencies on each page.

    For every text region (including those nested inside table and
    graphic regions), determine the effective ``textLineOrder`` resp.
    ``readingDirection`` — inherited from the closest annotating
    ancestor — and re-order the region's lines and each line's words
    via ``_fix_segment`` accordingly. Produce one output PAGE-XML file
    per input file.
    """
    LOG = getLogger('processor.RepairInconsistencies')
    assert_file_grp_cardinality(self.input_file_grp, 1)
    assert_file_grp_cardinality(self.output_file_grp, 1)

    def inherited(attribute, segments, default):
        # Return `attribute` from the first segment in `segments` that
        # annotates it (innermost first), else `default`. Replaces the
        # triplicated lookup loops of the original.
        for segment in segments:
            value = getattr(segment, attribute)
            if value is not None:
                return value
        return default

    for (n, input_file) in enumerate(self.input_files):
        page_id = input_file.pageId or input_file.ID
        LOG.info("INPUT FILE %i / %s", n, page_id)
        pcgts = page_from_file(self.workspace.download_file(input_file))
        page = pcgts.get_Page()
        # add metadata about this operation and its runtime parameters:
        metadata = pcgts.get_Metadata()  # ensured by from_file()
        metadata.add_MetadataItem(
            MetadataItemType(
                type_="processingStep",
                name=self.ocrd_tool['steps'][0],
                value=TOOL,
                Labels=[
                    LabelsType(externalModel="ocrd-tool",
                               externalId="parameters",
                               Label=[LabelType(type_=name,
                                                value=self.parameter[name])
                                      for name in self.parameter.keys()])
                ]))
        regions = []
        regions.extend(page.get_TextRegion())
        # also descend into table and graphic regions:
        for special_region in page.get_TableRegion() + page.get_GraphicRegion():
            regions.extend(special_region.get_TextRegion())
        for region in regions:
            textLineOrder = inherited('textLineOrder', [region, page],
                                      'top-to-bottom')
            if textLineOrder not in ['top-to-bottom', 'bottom-to-top']:
                LOG.info(
                    'Not processing page "%s" region "%s" (textLineOrder=%s)',
                    page_id, region.id, textLineOrder)
                continue
            _fix_segment(region, page_id,
                         reverse=(textLineOrder == 'bottom-to-top'))
            for line in region.get_TextLine():
                readingDirection = inherited('readingDirection',
                                             [line, region, page],
                                             'left-to-right')
                if readingDirection not in ['left-to-right', 'right-to-left']:
                    LOG.info(
                        'Not processing page "%s" line "%s" (readingDirection=%s)',
                        page_id, line.id, readingDirection)
                    continue
                _fix_segment(line, page_id,
                             reverse=(readingDirection == 'right-to-left'))
                for word in line.get_Word():
                    readingDirection = inherited('readingDirection',
                                                 [word, line, region, page],
                                                 'left-to-right')
                    if readingDirection not in ['left-to-right',
                                                'right-to-left']:
                        LOG.info(
                            'Not processing page "%s" word "%s" (readingDirection=%s)',
                            page_id, word.id, readingDirection)
                        continue
                    _fix_segment(word, page_id,
                                 reverse=(readingDirection == 'right-to-left'))
        file_id = make_file_id(input_file, self.output_file_grp)
        self.workspace.add_file(ID=file_id,
                                file_grp=self.output_file_grp,
                                pageId=input_file.pageId,
                                mimetype=MIMETYPE_PAGE,
                                local_filename=os.path.join(
                                    self.output_file_grp, file_id + '.xml'),
                                content=to_xml(pcgts))
def process(self):
    """Segment pages of the workspace, optionally with a deep model.

    If ``use_deeplr`` is set, load resnet50-unet weights from
    ``seg_weights`` and feed raw (non-binarized) page images; otherwise
    use the binarized/deskewed/cropped images. Page-level work is
    delegated to :py:meth:`_process_segment`; the resulting PAGE-XML is
    written into the page output file group.
    """
    try:
        # output_file_grp may be a comma-joined "PAGE_GRP,IMAGE_GRP" spec
        page_grp, self.image_grp = self.output_file_grp.split(',')
    except ValueError:
        page_grp = self.output_file_grp
        self.image_grp = FALLBACK_IMAGE_GRP
        LOG.info(
            "No output file group for images specified, falling back to '%s'",
            FALLBACK_IMAGE_GRP)
    oplevel = self.parameter['operation_level']
    model = None
    if self.parameter['use_deeplr']:
        model_weights = self.parameter['seg_weights']
        if not Path(model_weights).is_file():
            # (typo "Segementation" fixed)
            LOG.error("""\
Segmentation model weights file was not found at '%s'. Make sure the `seg_weights` parameter
points to the local model weights path.
""" % model_weights)
            sys.exit(1)
        model = resnet50_unet(n_classes=self.parameter['classes'],
                              input_height=self.parameter['height'],
                              input_width=self.parameter['width'])
        model.load_weights(model_weights)
        LOG.info('Segmentation Model loaded')
    for (n, input_file) in enumerate(self.input_files):
        page_id = input_file.pageId or input_file.ID
        pcgts = page_from_file(self.workspace.download_file(input_file))
        metadata = pcgts.get_Metadata()
        metadata.add_MetadataItem(
            MetadataItemType(
                type_="processingStep",
                name=self.ocrd_tool['steps'][0],
                value=TOOL,
                Labels=[
                    LabelsType(
                        #externalRef="parameters",
                        Label=[LabelType(type_=name, value=self.parameter[name])
                               for name in self.parameter.keys()])
                ]))
        page = pcgts.get_Page()
        LOG.info("INPUT FILE %s", page_id)
        if self.parameter['use_deeplr']:
            # the deep model expects the raw image, so filter out
            # binarized/deskewed/cropped variants
            page_image, page_xywh, page_image_info = self.workspace.image_from_page(
                page, page_id, feature_filter='binarized,deskewed,cropped')
        else:
            page_image, page_xywh, page_image_info = self.workspace.image_from_page(
                page, page_id, feature_selector='binarized,deskewed,cropped')
        if oplevel == 'page':
            self._process_segment(page_image, page, page_xywh, page_id,
                                  input_file, n, model)
        else:
            LOG.warning('Operation level %s, but should be "page".', oplevel)
            break
        # Use input_file's basename for the new file -
        # this way the files retain the same basenames.
        # BUGFIX: derive ID and local filename from page_grp, matching the
        # file_grp the file is actually registered under (the original mixed
        # page_grp with self.output_file_grp, which may still contain the
        # comma-joined specification).
        file_id = input_file.ID.replace(self.input_file_grp, page_grp)
        if file_id == input_file.ID:
            file_id = concat_padded(page_grp, n)
        self.workspace.add_file(
            ID=file_id,
            file_grp=page_grp,
            pageId=input_file.pageId,
            mimetype=MIMETYPE_PAGE,
            local_filename=os.path.join(page_grp, file_id + '.xml'),
            content=to_xml(pcgts).encode('utf-8'),
            force=self.parameter['force'])
def process(self):
    """Performs border detection on the workspace.
    """
    try:
        LOG.info("OUTPUT FILE %s", self.output_file_grp)
        page_grp, self.image_grp = self.output_file_grp.split(',')
    except ValueError:
        page_grp = self.output_file_grp
        self.image_grp = FALLBACK_IMAGE_GRP
        LOG.info(
            "No output file group for images specified, falling back to '%s'",
            FALLBACK_IMAGE_GRP)
    oplevel = self.parameter['operation_level']
    for n, input_file in enumerate(self.input_files):
        page_id = input_file.pageId or input_file.ID
        LOG.info("INPUT FILE %i / %s", n, page_id)
        pcgts = page_from_file(self.workspace.download_file(input_file))
        page = pcgts.get_Page()
        # an existing Border means the page was already cropped
        border = page.get_Border()
        if border:
            left, top, right, bottom = bbox_from_points(
                border.get_Coords().points)
            LOG.warning('Overwriting existing Border: %i:%i,%i:%i',
                        left, top, right, bottom)
        # record this processing step with its runtime parameters
        pcgts.get_Metadata().add_MetadataItem(
            MetadataItemType(
                type_="processingStep",
                name=self.ocrd_tool['steps'][0],
                value=TOOL,
                Labels=[
                    LabelsType(
                        # externalRef="parameters",
                        Label=[LabelType(type_=key, value=self.parameter[key])
                               for key in self.parameter])
                ]))
        # binarized but not yet cropped image; should also be deskewed
        page_image, page_xywh, page_image_info = self.workspace.image_from_page(
            page, page_id, feature_filter='cropped',
            feature_selector='binarized')
        #page_image, page_xywh, page_image_info = self.workspace.image_from_page(
        #    page, page_id, feature_filter='cropped')
        if oplevel == "page":
            self._process_segment(page_image, page, page_xywh, page_id,
                                  input_file, n)
        else:
            LOG.warning('Operation level %s, but should be "page".', oplevel)
            break
        # Use input_file's basename for the new file -
        # this way the files retain the same basenames:
        file_id = input_file.ID.replace(self.input_file_grp, page_grp)
        if file_id == input_file.ID:
            file_id = concat_padded(page_grp, n)
        self.workspace.add_file(
            ID=file_id,
            file_grp=page_grp,
            pageId=input_file.pageId,
            mimetype=MIMETYPE_PAGE,
            local_filename=os.path.join(page_grp, file_id + '.xml'),
            content=to_xml(pcgts).encode('utf-8'),
            force=self.parameter['force'])
def process(self):
    """Binarize the workspace at the configured operation level.

    At ``page`` level the whole page image is handed to
    :py:meth:`_process_segment`; region-level handling only extracts the
    region images so far (the actual call is still TODO).
    """
    try:
        self.page_grp, self.image_grp = self.output_file_grp.split(',')
    except ValueError:
        self.page_grp = self.output_file_grp
        self.image_grp = FALLBACK_IMAGE_GRP
        LOG.info(
            "No output file group for images specified, falling back to '%s'",
            FALLBACK_IMAGE_GRP)
    oplevel = self.parameter['operation_level']
    for n, input_file in enumerate(self.input_files):
        # ID used for the derived (binarized) image artifact
        image_file_id = input_file.ID.replace(self.input_file_grp,
                                              self.image_grp)
        page_id = input_file.pageId or input_file.ID
        LOG.info("INPUT FILE %i / %s", n, page_id)
        pcgts = page_from_file(self.workspace.download_file(input_file))
        # record this processing step with its runtime parameters
        pcgts.get_Metadata().add_MetadataItem(
            MetadataItemType(
                type_="processingStep",
                name=self.ocrd_tool['steps'][0],
                value=TOOL,
                Labels=[
                    LabelsType(
                        #externalRef="parameters",
                        Label=[LabelType(type_=key, value=self.parameter[key])
                               for key in self.parameter])
                ]))
        page = pcgts.get_Page()
        page_image, page_xywh, page_image_info = self.workspace.image_from_page(
            page, page_id)
        LOG.info("Binarizing on '%s' level in page '%s'", oplevel, page_id)
        if oplevel == "page":
            self._process_segment(page, page_image.filename, page_id,
                                  image_file_id + ".bin")
        else:
            regions = page.get_TextRegion() + page.get_TableRegion()
            if not regions:
                LOG.warning("Page '%s' contains no text regions", page_id)
            for region in regions:
                region_image, region_xywh = self.workspace.image_from_segment(
                    region, page_image, page_xywh)
                # strange TODO at the moment
                #self._process_segment(region.filename, region.id)
        # To retain the basenames of files and their respective dir:
        file_id = input_file.ID.replace(self.input_file_grp,
                                        self.output_file_grp)
        if file_id == input_file.ID:
            file_id = concat_padded(self.output_file_grp, n)
        self.workspace.add_file(ID=file_id,
                                file_grp=self.output_file_grp,
                                pageId=input_file.pageId,
                                mimetype=MIMETYPE_PAGE,
                                local_filename=os.path.join(
                                    self.output_file_grp, file_id + '.xml'),
                                content=to_xml(pcgts).encode('utf-8'))
def process(self):
    """Performs page cropping with Tesseract on the workspace.

    Open and deserialize PAGE input files and their respective images.
    Set up Tesseract to detect text blocks on each page, and find the
    largest coordinate extent spanning all of them. Use this extent in
    defining a Border, and add that to the page.

    Moreover, crop the original image accordingly, and reference the
    resulting image file as AlternativeImage in the Page element.

    Add the new image file to the workspace along with the output fileGrp,
    and using a file ID with suffix ``.IMG-CROP`` along with further
    identification of the input element.

    Produce new output files by serialising the resulting hierarchy.
    """
    assert_file_grp_cardinality(self.input_file_grp, 1)
    assert_file_grp_cardinality(self.output_file_grp, 1)
    padding = self.parameter['padding']
    with tesserocr.PyTessBaseAPI(path=TESSDATA_PREFIX) as tessapi:
        # disable table detection here (tables count as text blocks),
        # because we do not want to risk confusing the spine with
        # a column separator and thus creeping into a neighbouring
        # page:
        tessapi.SetVariable("textord_tabfind_find_tables", "0")
        for (n, input_file) in enumerate(self.input_files):
            page_id = input_file.pageId or input_file.ID
            LOG.info("INPUT FILE %i / %s", n, page_id)
            pcgts = page_from_file(self.workspace.download_file(input_file))
            page = pcgts.get_Page()
            # add metadata about this operation and its runtime parameters:
            metadata = pcgts.get_Metadata()  # ensured by from_file()
            metadata.add_MetadataItem(
                MetadataItemType(
                    type_="processingStep",
                    name=self.ocrd_tool['steps'][0],
                    value=TOOL,
                    Labels=[
                        LabelsType(externalModel="ocrd-tool",
                                   externalId="parameters",
                                   Label=[LabelType(type_=name,
                                                    value=self.parameter[name])
                                          for name in self.parameter.keys()])
                    ]))
            # warn of existing Border:
            border = page.get_Border()
            if border:
                left, top, right, bottom = bbox_from_points(
                    border.get_Coords().points)
                LOG.warning('Overwriting existing Border: %i:%i,%i:%i',
                            left, top, right, bottom)
            page_image, page_xywh, page_image_info = self.workspace.image_from_page(
                page, page_id,
                # image must not have been cropped already,
                # abort if no such image can be produced:
                feature_filter='cropped')
            if self.parameter['dpi'] > 0:
                dpi = self.parameter['dpi']
                LOG.info(
                    "Page '%s' images will use %d DPI from parameter override",
                    page_id, dpi)
            elif page_image_info.resolution != 1:
                dpi = page_image_info.resolution
                if page_image_info.resolutionUnit == 'cm':
                    dpi = round(dpi * 2.54)
                LOG.info(
                    "Page '%s' images will use %d DPI from image meta-data",
                    page_id, dpi)
            else:
                dpi = 0
                LOG.info(
                    "Page '%s' images will use DPI estimated from segmentation",
                    page_id)
            if dpi:
                tessapi.SetVariable('user_defined_dpi', str(dpi))
                zoom = 300 / dpi
            else:
                zoom = 1
            # warn of existing segmentation:
            regions = page.get_TextRegion()
            if regions:
                min_x = page_image.width
                min_y = page_image.height
                max_x = 0
                max_y = 0
                for region in regions:
                    left, top, right, bottom = bbox_from_points(
                        region.get_Coords().points)
                    min_x = min(min_x, left)
                    min_y = min(min_y, top)
                    max_x = max(max_x, right)
                    max_y = max(max_y, bottom)
                LOG.warning(
                    'Ignoring extent from existing TextRegions: %i:%i,%i:%i',
                    min_x, max_x, min_y, max_y)
            LOG.debug("Cropping with Tesseract")
            tessapi.SetImage(page_image)
            # PSM.SPARSE_TEXT: get as much text as possible in no particular order
            # PSM.AUTO (default): includes tables (dangerous)
            tessapi.SetPageSegMode(tesserocr.PSM.SPARSE_TEXT)
            #
            # helper variables for saving the box coordinates
            #
            min_x = page_image.width
            min_y = page_image.height
            max_x = 0
            max_y = 0
            # iterate over all text blocks and compare their
            # bbox extent to the running min and max values
            for component in tessapi.GetComponentImages(
                    tesserocr.RIL.BLOCK, True):
                image, xywh, index, _ = component
                #
                # the region reference in the reading order element
                #
                ID = "region%04d" % index
                left, top, right, bottom = bbox_from_xywh(xywh)
                LOG.debug("Detected text region '%s': %i:%i,%i:%i",
                          ID, left, right, top, bottom)
                # filter region results:
                bin_bbox = image.getbbox()
                if not bin_bbox:
                    # this does happen!
                    LOG.info(
                        "Ignoring region '%s' because its binarization is empty",
                        ID)
                    continue
                width = bin_bbox[2] - bin_bbox[0]
                if width < 25 / zoom:
                    # we must be conservative here: page numbers are tiny regions, too!
                    LOG.info(
                        "Ignoring region '%s' because its width is too small (%d)",
                        ID, width)
                    continue
                height = bin_bbox[3] - bin_bbox[1]
                if height < 25 / zoom:
                    # we must be conservative here: page numbers are tiny regions, too!
                    LOG.debug(
                        "Ignoring region '%s' because its height is too small (%d)",
                        ID, height)
                    continue
                min_x = min(min_x, left)
                min_y = min(min_y, top)
                max_x = max(max_x, right)
                max_y = max(max_y, bottom)
                LOG.info("Updated page border: %i:%i,%i:%i",
                         min_x, max_x, min_y, max_y)
            # BUGFIX: compute the output file ID unconditionally. It used to
            # be assigned only inside the success branch below, so the
            # "cannot find valid extent" path crashed with UnboundLocalError
            # (or silently reused the previous iteration's ID) at
            # set_pcGtsId()/add_file().
            file_id = make_file_id(input_file, self.output_file_grp)
            #
            # set the identified page border
            #
            if min_x < max_x and min_y < max_y:
                # add padding:
                min_x = max(min_x - padding, 0)
                max_x = min(max_x + padding, page_image.width)
                min_y = max(min_y - padding, 0)
                max_y = min(max_y + padding, page_image.height)
                LOG.info("Padded page border: %i:%i,%i:%i",
                         min_x, max_x, min_y, max_y)
                polygon = polygon_from_bbox(min_x, min_y, max_x, max_y)
                polygon = coordinates_for_segment(polygon, page_image,
                                                  page_xywh)
                polygon = polygon_for_parent(polygon, page)
                border = BorderType(
                    Coords=CoordsType(points_from_polygon(polygon)))
                # intersection with parent could have changed bbox,
                # so recalculate:
                bbox = bbox_from_polygon(
                    coordinates_of_segment(border, page_image, page_xywh))
                # update PAGE (annotate border):
                page.set_Border(border)
                # update METS (add the image file):
                page_image = crop_image(page_image, box=bbox)
                page_xywh['features'] += ',cropped'
                file_path = self.workspace.save_image_file(
                    page_image,
                    file_id + '.IMG-CROP',
                    page_id=input_file.pageId,
                    file_grp=self.output_file_grp)
                # update PAGE (reference the image file):
                page.add_AlternativeImage(
                    AlternativeImageType(filename=file_path,
                                         comments=page_xywh['features']))
            else:
                LOG.error("Cannot find valid extent for page '%s'", page_id)
            pcgts.set_pcGtsId(file_id)
            self.workspace.add_file(ID=file_id,
                                    file_grp=self.output_file_grp,
                                    pageId=input_file.pageId,
                                    mimetype=MIMETYPE_PAGE,
                                    local_filename=os.path.join(
                                        self.output_file_grp,
                                        file_id + '.xml'),
                                    content=to_xml(pcgts))
def process(self):
    """Performs table cell segmentation with Tesseract on the workspace.

    Open and deserialize PAGE input files and their respective images,
    then iterate over the element hierarchy down to the block level
    for table regions. If ``overwrite_regions`` is enabled and any
    layout annotation already exists inside, then remove it.

    Set up Tesseract to detect text blocks (as table cells).
    (This is not Tesseract's internal table structure recognition,
    but the general page segmentation.)
    Add each to the block at the detected coordinates.

    Produce a new output file by serialising the resulting hierarchy.
    """
    assert_file_grp_cardinality(self.input_file_grp, 1)
    assert_file_grp_cardinality(self.output_file_grp, 1)
    overwrite_regions = self.parameter['overwrite_regions']
    with PyTessBaseAPI(path=TESSDATA_PREFIX) as tessapi:
        # disable table detection here, so we won't get
        # tables inside tables, but try to analyse them as
        # independent text/line blocks:
        tessapi.SetVariable("textord_tabfind_find_tables", "0")
        for (n, input_file) in enumerate(self.input_files):
            page_id = input_file.pageId or input_file.ID
            LOG.info("INPUT FILE %i / %s", n, page_id)
            pcgts = page_from_file(self.workspace.download_file(input_file))
            page = pcgts.get_Page()
            # add metadata about this operation and its runtime parameters:
            metadata = pcgts.get_Metadata()  # ensured by from_file()
            metadata.add_MetadataItem(
                MetadataItemType(
                    type_="processingStep",
                    name=self.ocrd_tool['steps'][0],
                    value=TOOL,
                    Labels=[
                        LabelsType(externalModel="ocrd-tool",
                                   externalId="parameters",
                                   Label=[LabelType(type_=name,
                                                    value=self.parameter[name])
                                          for name in self.parameter.keys()])
                    ]))
            page_image, page_coords, page_image_info = self.workspace.image_from_page(
                page, page_id)
            # determine the DPI to tell Tesseract: parameter override wins,
            # then the image's own metadata, else let Tesseract estimate
            if self.parameter['dpi'] > 0:
                dpi = self.parameter['dpi']
                LOG.info(
                    "Page '%s' images will use %d DPI from parameter override",
                    page_id, dpi)
            elif page_image_info.resolution != 1:
                dpi = page_image_info.resolution
                if page_image_info.resolutionUnit == 'cm':
                    # convert pixels-per-cm to pixels-per-inch
                    dpi = round(dpi * 2.54)
                LOG.info(
                    "Page '%s' images will use %d DPI from image meta-data",
                    page_id, dpi)
            else:
                dpi = 0
                LOG.info(
                    "Page '%s' images will use DPI estimated from segmentation",
                    page_id)
            if dpi:
                tessapi.SetVariable('user_defined_dpi', str(dpi))
            #
            # prepare dict of reading order
            reading_order = dict()
            ro = page.get_ReadingOrder()
            if not ro:
                LOG.warning("Page '%s' contains no ReadingOrder", page_id)
                rogroup = None
            else:
                rogroup = ro.get_OrderedGroup() or ro.get_UnorderedGroup()
                page_get_reading_order(reading_order, rogroup)
            #
            # dive into regions
            regions = page.get_TableRegion()
            for region in regions:
                # delete or warn of existing regions:
                if region.get_TextRegion():
                    if overwrite_regions:
                        LOG.info(
                            'removing existing TextRegions in block "%s" of page "%s"',
                            region.id, page_id)
                        for subregion in region.get_TextRegion():
                            if subregion.id in reading_order:
                                regionref = reading_order[subregion.id]
                                # could be any of the 6 types above:
                                regionrefs = rogroup.__getattribute__(
                                    regionref.__class__.__name__.replace(
                                        'Type', ''))
                                # remove in-place
                                regionrefs.remove(regionref)
                                # TODO: adjust index to make contiguous again?
                        region.set_TextRegion([])
                    else:
                        LOG.warning(
                            'keeping existing TextRegions in block "%s" of page "%s"',
                            region.id, page_id)
                # get region image
                region_image, region_coords = self.workspace.image_from_segment(
                    region, page_image, page_coords)
                tessapi.SetImage(region_image)
                LOG.info("Detecting table cells in region '%s'", region.id)
                #
                # detect the region segments:
                tessapi.SetPageSegMode(PSM.SPARSE_TEXT)  # retrieve "cells"
                # TODO: we should XY-cut the sparse cells in regroup them into consistent cells
                layout = tessapi.AnalyseLayout()
                # find (or create) the reading-order group the new cells
                # will be appended into:
                roelem = reading_order.get(region.id)
                if not roelem:
                    LOG.warning(
                        "Page '%s' table region '%s' is not referenced in reading order (%s)",
                        page_id, region.id, "no target to add cells into")
                elif isinstance(roelem,
                                (OrderedGroupType, OrderedGroupIndexedType)):
                    LOG.warning(
                        "Page '%s' table region '%s' already has an ordered group (%s)",
                        page_id, region.id, "cells will be appended")
                elif isinstance(roelem,
                                (UnorderedGroupType, UnorderedGroupIndexedType)):
                    LOG.warning(
                        "Page '%s' table region '%s' already has an unordered group (%s)",
                        page_id, region.id, "cells will not be appended")
                    roelem = None
                elif isinstance(roelem, RegionRefIndexedType):
                    # replace regionref by group with same index and ref
                    # (which can then take the cells as subregions)
                    roelem2 = OrderedGroupIndexedType(
                        id=region.id + '_order',
                        index=roelem.index,
                        regionRef=roelem.regionRef)
                    roelem.parent_object_.add_OrderedGroupIndexed(roelem2)
                    roelem.parent_object_.get_RegionRefIndexed().remove(
                        roelem)
                    roelem = roelem2
                elif isinstance(roelem, RegionRefType):
                    # replace regionref by group with same ref
                    # (which can then take the cells as subregions)
                    roelem2 = OrderedGroupType(id=region.id + '_order',
                                               regionRef=roelem.regionRef)
                    roelem.parent_object_.add_OrderedGroup(roelem2)
                    roelem.parent_object_.get_RegionRef().remove(roelem)
                    roelem = roelem2
                self._process_region(layout, region, roelem, region_image,
                                     region_coords)
            file_id = make_file_id(input_file, self.output_file_grp)
            pcgts.set_pcGtsId(file_id)
            self.workspace.add_file(force=True,
                                    ID=file_id,
                                    file_grp=self.output_file_grp,
                                    pageId=input_file.pageId,
                                    mimetype=MIMETYPE_PAGE,
                                    local_filename=os.path.join(
                                        self.output_file_grp,
                                        file_id + '.xml'),
                                    content=to_xml(pcgts))
def process(self):
    """Rates textual annotation of PAGE input files, producing output files with LM scores (and choices).

    ... explain incremental page-wise processing here ...
    """
    level = self.parameter['textequiv_level']
    beam_width = self.parameter['beam_width']
    lm_weight = self.parameter['lm_weight']
    # state carried across pages for incremental (windowed) decoding:
    # the previous page is only finalized and written once the next
    # page's beam search has consumed its traceback
    prev_traceback = None
    prev_pcgts = None
    prev_file_id = None
    prev_page_id = None
    for (n, input_file) in enumerate(self.input_files):
        page_id = input_file.pageId or input_file.ID
        LOG.info("INPUT FILE %i / %s", n, page_id)
        pcgts = page_from_file(self.workspace.download_file(input_file))
        LOG.info("Scoring text in page '%s' at the %s level",
                 pcgts.get_pcGtsId(), level)
        # annotate processing metadata:
        metadata = pcgts.get_Metadata()  # ensured by page_from_file()
        metadata.add_MetadataItem(
            MetadataItemType(
                type_="processingStep",
                name=OCRD_TOOL['tools']['ocrd-keraslm-rate']['steps'][0],
                value='ocrd-keraslm-rate',
                Labels=[
                    LabelsType(externalRef="parameters",
                               Label=[LabelType(type_=name,
                                                value=self.parameter[name])
                                      for name in self.parameter.keys()])
                ]))
        # context preprocessing:
        # todo: as soon as we have true MODS meta-data in METS (dmdSec/mdWrap/xmlData/mods),
        #       get global context variables from there (e.g. originInfo/dateIssued/@text for year)
        ident = self.workspace.mets.unique_identifier  # at least try to get purl
        context = [0]
        if ident:
            # NOTE(review): assumes the identifier ends in "..._<year>";
            # the decade (year // 10) is used as LM context — confirm
            name = ident.split('/')[-1]
            year = name.split('_')[-1]
            if year.isnumeric():
                year = ceil(int(year) / 10)
                context = [year]
                # todo: author etc
        # create a graph for the linear sequence of elements at the given level:
        graph, start_node, end_node = page_get_linear_graph_at(level, pcgts)
        # apply language model to (TextEquiv path in) graph,
        # remove non-path TextEquivs, modify confidences:
        if not self.parameter['alternative_decoding']:
            # simple mode: rate only the first TextEquiv choice of each
            # element, in one big string, and blend LM confidence with
            # the existing OCR confidence
            text = [(edge['element'], edge['alternatives'])
                    for edge in _get_edges(graph, 0)]  # graph's path
            textstring = u''.join(
                textequivs[0].Unicode
                for element, textequivs in text)  # same length as text
            LOG.info("Rating %d elements with a total of %d characters",
                     len(text), len(textstring))
            confidences = self.rater.rate(textstring, context)  # much faster
            i = 0
            for element, textequivs in text:
                textequiv = textequivs[0]  # 1st choice only
                if element:
                    element.set_TextEquiv([textequiv])  # delete others
                textequiv_len = len(textequiv.Unicode)
                conf = sum(confidences[i:i + textequiv_len]
                           ) / textequiv_len  # mean probability
                conf2 = textequiv.conf
                # linear interpolation between LM and OCR confidence:
                textequiv.set_conf(conf * lm_weight +
                                   conf2 * (1. - lm_weight))
                i += textequiv_len
            if i != len(confidences):
                LOG.critical(
                    "Input text length and output scores length are off by %d characters",
                    i - len(confidences))
            # report average confidence and perplexities:
            avg = sum(confidences) / len(confidences)
            ent = sum([-log(max(p, 1e-99), 2)
                       for p in confidences]) / len(confidences)
            ppl = pow(2.0, ent)  # character level
            ppll = pow(
                2.0, ent * len(confidences) /
                len(text))  # textequiv level (including spaces/newlines)
            LOG.info("avg: %.3f, char ppl: %.3f, %s ppl: %.3f", avg, ppl,
                     level, ppll)  # character need not always equal glyph!
            # ensure parent textequivs are up to date:
            page_update_higher_textequiv_levels(level, pcgts)
            # write back result
            file_id = input_file.ID.replace(self.input_file_grp,
                                            self.output_file_grp)
            if file_id == input_file.ID:
                file_id = concat_padded(self.output_file_grp, n)
            self.workspace.add_file(
                ID=file_id,
                pageId=input_file.pageId,
                file_grp=self.output_file_grp,
                local_filename=os.path.join(self.output_file_grp,
                                            file_id + '.xml'),
                mimetype=MIMETYPE_PAGE,
                content=to_xml(pcgts),
            )
        else:
            # alternative decoding: beam-search over all TextEquiv
            # alternatives, continuing from the previous page's traceback
            LOG.info("Rating %d elements including its alternatives",
                     end_node - start_node)
            path, entropy, traceback = self.rater.rate_best(
                graph, start_node, end_node,
                start_traceback=prev_traceback,
                context=context,
                lm_weight=lm_weight,
                beam_width=beam_width,
                beam_clustering_dist=BEAM_CLUSTERING_DIST
                if BEAM_CLUSTERING_ENABLE else 0)
            if prev_pcgts:
                # the newly returned path finalizes the PREVIOUS page:
                _page_update_from_path(level, path, entropy)
                # ensure parent textequivs are up to date:
                page_update_higher_textequiv_levels(level, prev_pcgts)
                # write back result
                file_id = prev_file_id.replace(self.input_file_grp,
                                               self.output_file_grp)
                if file_id == prev_file_id:
                    file_id = concat_padded(self.output_file_grp, n - 1)
                self.workspace.add_file(
                    ID=file_id,
                    pageId=prev_page_id,
                    file_grp=self.output_file_grp,
                    local_filename=os.path.join(self.output_file_grp,
                                                file_id + '.xml'),
                    mimetype=MIMETYPE_PAGE,
                    content=to_xml(prev_pcgts),
                )
            # shift the window: current page becomes "previous"
            prev_page_id = input_file.pageId
            prev_file_id = input_file.ID
            prev_pcgts = pcgts
            prev_traceback = traceback
    if prev_pcgts:
        # flush the last pending page after the loop (alternative
        # decoding only; `input_file`/`n` still refer to the last file)
        path, entropy, _ = self.rater.next_path(prev_traceback[0],
                                                ([], prev_traceback[1]))
        _page_update_from_path(level, path, entropy)
        # ensure parent textequivs are up to date:
        page_update_higher_textequiv_levels(level, prev_pcgts)
        # write back result
        file_id = input_file.ID.replace(self.input_file_grp,
                                        self.output_file_grp)
        if file_id == input_file.ID:
            file_id = concat_padded(self.output_file_grp, n)
        self.workspace.add_file(
            ID=file_id,
            pageId=input_file.pageId,
            file_grp=self.output_file_grp,
            local_filename=os.path.join(self.output_file_grp,
                                        file_id + '.xml'),
            mimetype=MIMETYPE_PAGE,
            content=to_xml(prev_pcgts),
        )
def process(self):
    """Perform OCR recognition with Tesseract on the workspace.

    Open and deserialise PAGE input files and their respective images,
    then iterate over the element hierarchy down to the requested
    ``textequiv_level`` if it exists and ``overwrite_words`` is disabled,
    or to the line level otherwise. In the latter case,
    (remove any existing segmentation below the line level, and)
    create new segmentation below the line level if necessary.

    Set up Tesseract to recognise each segment's image (either from
    AlternativeImage or cropping the bounding box rectangle and masking
    it from the polygon outline) with the appropriate mode and ``model``.

    Put text and confidence results into the TextEquiv at
    ``textequiv_level``, removing any existing TextEquiv.

    Finally, make the higher levels consistent with these results by
    concatenation, ordered as appropriate for its readingDirection,
    textLineOrder, and ReadingOrder, and joined by whitespace, as
    appropriate for the respective level and Relation/join status.

    Produce new output files by serialising the resulting hierarchy.
    """
    LOG.debug("TESSDATA: %s, installed Tesseract models: %s", *get_languages())
    assert_file_grp_cardinality(self.input_file_grp, 1)
    assert_file_grp_cardinality(self.output_file_grp, 1)
    maxlevel = self.parameter['textequiv_level']
    model = get_languages()[1][-1]  # last installed model
    if 'model' in self.parameter:
        model = self.parameter['model']
        # a model may be a "+"-combination; validate each component:
        for sub_model in model.split('+'):
            if sub_model not in get_languages()[1]:
                raise Exception("configured model " + sub_model + " is not installed")
    with PyTessBaseAPI(path=TESSDATA_PREFIX, lang=model) as tessapi:
        LOG.info("Using model '%s' in %s for recognition at the %s level",
                 model, get_languages()[0], maxlevel)
        if maxlevel == 'glyph':
            # populate GetChoiceIterator() with LSTM models, too:
            tessapi.SetVariable("lstm_choice_mode", "2")  # aggregate symbols
            tessapi.SetVariable("lstm_choice_iterations", "15")  # squeeze out more best paths
        # TODO: maybe warn/raise when illegal combinations or characters not in the model unicharset?
        if self.parameter['char_whitelist']:
            tessapi.SetVariable("tessedit_char_whitelist", self.parameter['char_whitelist'])
        if self.parameter['char_blacklist']:
            tessapi.SetVariable("tessedit_char_blacklist", self.parameter['char_blacklist'])
        if self.parameter['char_unblacklist']:
            tessapi.SetVariable("tessedit_char_unblacklist", self.parameter['char_unblacklist'])
        # todo: determine relevancy of these variables:
        # tessedit_single_match 0, tessedit_load_sublangs,
        # tessedit_preserve_min_wd_len 2, tessedit_prefer_joined_punct 0,
        # tessedit_write_rep_codes 0, tessedit_parallelize 0,
        # tessedit_zero_rejection 0, tessedit_zero_kelvin_rejection 0,
        # tessedit_reject_mode 0, tessedit_use_reject_spaces 1,
        # tessedit_fix_fuzzy_spaces 1, tessedit_char_blacklist,
        # tessedit_char_whitelist, chs_leading_punct ('`",
        # chs_trailing_punct1 ).,;:?!, chs_trailing_punct2 )'`",
        # numeric_punctuation ., unrecognised_char |,
        # ok_repeated_ch_non_alphanum_wds -?*=, conflict_set_I_l_1 Il1[],
        # preserve_interword_spaces 0, tessedit_enable_dict_correction 0,
        # tessedit_enable_bigram_correction 1, stopper_smallword_size 2,
        # wordrec_max_join_chunks 4, suspect_space_level 100,
        # suspect_short_words 2, language_model_ngram_on 0,
        # language_model_ngram_order 8, language_model_min_compound_length 3,
        # language_model_penalty_non_freq_dict_word 0.1,
        # language_model_penalty_non_dict_word 0.15,
        # language_model_penalty_punc 0.2, language_model_penalty_case 0.1,
        # language_model_penalty_script 0.5, language_model_penalty_chartype 0.3,
        # language_model_penalty_spacing 0.05, textord_max_noise_size 7,
        # enable_noise_removal 1, classify_bln_numeric_mode 0,
        # lstm_use_matrix 1, user_words_file, user_patterns_file
        for (n, input_file) in enumerate(self.input_files):
            page_id = input_file.pageId or input_file.ID
            LOG.info("INPUT FILE %i / %s", n, page_id)
            pcgts = page_from_file(self.workspace.download_file(input_file))
            page = pcgts.get_Page()
            # add metadata about this operation and its runtime parameters:
            metadata = pcgts.get_Metadata()  # ensured by from_file()
            metadata.add_MetadataItem(
                MetadataItemType(
                    type_="processingStep",
                    name=self.ocrd_tool['steps'][0],
                    value=TOOL,
                    Labels=[
                        LabelsType(externalModel="ocrd-tool",
                                   externalId="parameters",
                                   Label=[
                                       LabelType(type_=name,
                                                 value=self.parameter[name])
                                       for name in self.parameter.keys()
                                   ])
                    ]))
            page_image, page_xywh, page_image_info = self.workspace.image_from_page(
                page, page_id)
            # DPI priority: explicit parameter, then image metadata,
            # else let Tesseract estimate from segmentation:
            if self.parameter['dpi'] > 0:
                dpi = self.parameter['dpi']
                LOG.info("Page '%s' images will use %d DPI from parameter override",
                         page_id, dpi)
            elif page_image_info.resolution != 1:
                dpi = page_image_info.resolution
                if page_image_info.resolutionUnit == 'cm':
                    dpi = round(dpi * 2.54)
                LOG.info("Page '%s' images will use %d DPI from image meta-data",
                         page_id, dpi)
            else:
                dpi = 0
                LOG.info("Page '%s' images will use DPI estimated from segmentation",
                         page_id)
            if dpi:
                tessapi.SetVariable('user_defined_dpi', str(dpi))
            LOG.info("Processing page '%s'", page_id)
            # materialise into a list: chain.from_iterable() returns a
            # generator, which is always truthy, so the emptiness check
            # below could never fire on the raw generator (and a generator
            # could only be consumed once anyway):
            regions = list(itertools.chain.from_iterable(
                [page.get_TextRegion()] +
                [subregion.get_TextRegion()
                 for subregion in page.get_TableRegion()]))
            if not regions:
                LOG.warning("Page '%s' contains no text regions", page_id)
            else:
                self._process_regions(tessapi, regions, page_image, page_xywh)
            page_update_higher_textequiv_levels(maxlevel, pcgts)
            file_id = make_file_id(input_file, self.output_file_grp)
            pcgts.set_pcGtsId(file_id)
            self.workspace.add_file(ID=file_id,
                                    file_grp=self.output_file_grp,
                                    pageId=input_file.pageId,
                                    mimetype=MIMETYPE_PAGE,
                                    local_filename=os.path.join(
                                        self.output_file_grp, file_id + '.xml'),
                                    content=to_xml(pcgts))
def process(self):
    """Segment words with Tesseract, one PAGE text line at a time.

    For each input file, deserialize the PAGE document and its images,
    walk down to the text line level, and (depending on
    ``overwrite_words``) clear any Word annotation already present.
    Run Tesseract word detection on each line image and append one Word
    element per detected component, with coordinates translated back
    into the page frame. Serialise the updated hierarchy as a new PAGE
    file in the output group.
    """
    assert_file_grp_cardinality(self.input_file_grp, 1)
    assert_file_grp_cardinality(self.output_file_grp, 1)
    overwrite_words = self.parameter['overwrite_words']
    with PyTessBaseAPI(psm=PSM.SINGLE_LINE, path=TESSDATA_PREFIX) as tessapi:
        for n, input_file in enumerate(self.input_files):
            page_id = input_file.pageId or input_file.ID
            LOG.info("INPUT FILE %i / %s", n, page_id)
            pcgts = page_from_file(self.workspace.download_file(input_file))
            page = pcgts.get_Page()
            # record this processing step and its parameters in PAGE metadata:
            metadata = pcgts.get_Metadata()  # ensured by from_file()
            metadata.add_MetadataItem(
                MetadataItemType(type_="processingStep",
                                 name=self.ocrd_tool['steps'][0],
                                 value=TOOL,
                                 Labels=[LabelsType(
                                     externalModel="ocrd-tool",
                                     externalId="parameters",
                                     Label=[LabelType(type_=name,
                                                      value=self.parameter[name])
                                            for name in self.parameter.keys()])]))
            page_image, page_coords, page_image_info = self.workspace.image_from_page(
                page, page_id)
            # DPI priority: explicit parameter, then image metadata,
            # else leave estimation to Tesseract:
            if self.parameter['dpi'] > 0:
                dpi = self.parameter['dpi']
                LOG.info("Page '%s' images will use %d DPI from parameter override",
                         page_id, dpi)
            elif page_image_info.resolution != 1:
                dpi = page_image_info.resolution
                if page_image_info.resolutionUnit == 'cm':
                    dpi = round(dpi * 2.54)
                LOG.info("Page '%s' images will use %d DPI from image meta-data",
                         page_id, dpi)
            else:
                dpi = 0
                LOG.info("Page '%s' images will use DPI estimated from segmentation",
                         page_id)
            if dpi:
                tessapi.SetVariable('user_defined_dpi', str(dpi))
            for region in page.get_TextRegion():
                region_image, region_coords = self.workspace.image_from_segment(
                    region, page_image, page_coords)
                for line in region.get_TextLine():
                    if line.get_Word():
                        if overwrite_words:
                            LOG.info('removing existing Words in line "%s"', line.id)
                            line.set_Word([])
                        else:
                            LOG.warning('keeping existing Words in line "%s"', line.id)
                    LOG.debug("Detecting words in line '%s'", line.id)
                    line_image, line_coords = self.workspace.image_from_segment(
                        line, region_image, region_coords)
                    tessapi.SetImage(line_image)
                    components = tessapi.GetComponentImages(RIL.WORD, True,
                                                            raw_image=True)
                    for word_no, component in enumerate(components):
                        # translate the detected bbox back into page coordinates:
                        polygon = coordinates_for_segment(
                            polygon_from_xywh(component[1]),
                            line_image, line_coords)
                        line.add_Word(WordType(
                            id='%s_word%04d' % (line.id, word_no),
                            Coords=CoordsType(points_from_polygon(polygon))))
            file_id = make_file_id(input_file, self.output_file_grp)
            pcgts.set_pcGtsId(file_id)
            self.workspace.add_file(
                ID=file_id,
                file_grp=self.output_file_grp,
                pageId=input_file.pageId,
                mimetype=MIMETYPE_PAGE,
                local_filename=os.path.join(self.output_file_grp, file_id + '.xml'),
                content=to_xml(pcgts))
def process(self):
    """Segment text lines with Tesseract, region by region.

    For each input file, deserialize the PAGE document and its images,
    walk down to the text region level (including regions nested inside
    tables), and (depending on ``overwrite_lines``) clear any TextLine
    annotation already present. Run Tesseract line detection on each
    region image; lines leaking outside the region polygon (e.g. due to
    rotation) are clipped to their intersection with it. Append one
    TextLine per detection and serialise the updated hierarchy as a new
    PAGE file in the output group.
    """
    assert_file_grp_cardinality(self.input_file_grp, 1)
    assert_file_grp_cardinality(self.output_file_grp, 1)
    overwrite_lines = self.parameter['overwrite_lines']
    with PyTessBaseAPI(psm=PSM.SINGLE_BLOCK, path=TESSDATA_PREFIX) as tessapi:
        for n, input_file in enumerate(self.input_files):
            page_id = input_file.pageId or input_file.ID
            LOG.info("INPUT FILE %i / %s", n, page_id)
            pcgts = page_from_file(self.workspace.download_file(input_file))
            page = pcgts.get_Page()
            # record this processing step and its parameters in PAGE metadata:
            metadata = pcgts.get_Metadata()  # ensured by from_file()
            metadata.add_MetadataItem(
                MetadataItemType(type_="processingStep",
                                 name=self.ocrd_tool['steps'][0],
                                 value=TOOL,
                                 Labels=[LabelsType(
                                     externalModel="ocrd-tool",
                                     externalId="parameters",
                                     Label=[LabelType(type_=name,
                                                      value=self.parameter[name])
                                            for name in self.parameter.keys()])]))
            page_image, page_coords, page_image_info = self.workspace.image_from_page(
                page, page_id)
            # DPI priority: explicit parameter, then image metadata,
            # else leave estimation to Tesseract:
            if self.parameter['dpi'] > 0:
                dpi = self.parameter['dpi']
                LOG.info("Page '%s' images will use %d DPI from parameter override",
                         page_id, dpi)
            elif page_image_info.resolution != 1:
                dpi = page_image_info.resolution
                if page_image_info.resolutionUnit == 'cm':
                    dpi = round(dpi * 2.54)
                LOG.info("Page '%s' images will use %d DPI from image meta-data",
                         page_id, dpi)
            else:
                dpi = 0
                LOG.info("Page '%s' images will use DPI estimated from segmentation",
                         page_id)
            if dpi:
                tessapi.SetVariable('user_defined_dpi', str(dpi))
            all_regions = itertools.chain.from_iterable(
                [page.get_TextRegion()] +
                [subregion.get_TextRegion()
                 for subregion in page.get_TableRegion()])
            for region in all_regions:
                if region.get_TextLine():
                    if overwrite_lines:
                        LOG.info('removing existing TextLines in region "%s"', region.id)
                        region.set_TextLine([])
                    else:
                        LOG.warning('keeping existing TextLines in region "%s"', region.id)
                LOG.debug("Detecting lines in region '%s'", region.id)
                region_image, region_coords = self.workspace.image_from_segment(
                    region, page_image, page_coords)
                region_poly = Polygon(coordinates_of_segment(
                    region, region_image, region_coords))
                tessapi.SetImage(region_image)
                detections = tessapi.GetComponentImages(RIL.TEXTLINE, True,
                                                        raw_image=True)
                for line_no, component in enumerate(detections):
                    line_polygon = polygon_from_xywh(component[1])
                    line_poly = Polygon(line_polygon)
                    if not line_poly.within(region_poly):
                        # this could happen due to rotation: clip to region
                        interline = line_poly.intersection(region_poly)
                        if interline.is_empty:
                            continue  # ignore this line
                        if hasattr(interline, 'geoms'):
                            # is (heterogeneous) GeometryCollection: keep largest part
                            area = 0
                            for geom in interline.geoms:
                                if geom.area > area:
                                    area = geom.area
                                    interline = geom
                            if not area:
                                continue
                        line_poly = interline.convex_hull
                        line_polygon = line_poly.exterior.coords
                    line_polygon = coordinates_for_segment(
                        line_polygon, region_image, region_coords)
                    region.add_TextLine(TextLineType(
                        id='%s_line%04d' % (region.id, line_no),
                        Coords=CoordsType(points_from_polygon(line_polygon))))
            file_id = make_file_id(input_file, self.output_file_grp)
            pcgts.set_pcGtsId(file_id)
            self.workspace.add_file(
                ID=file_id,
                file_grp=self.output_file_grp,
                pageId=input_file.pageId,
                mimetype=MIMETYPE_PAGE,
                local_filename=os.path.join(self.output_file_grp, file_id + '.xml'),
                content=to_xml(pcgts))
def process(self):
    """Binarize page (or region) images and serialise updated PAGE files.

    The output file group may be given as a comma-separated pair
    ``PAGE-GRP,IMAGE-GRP``; when no image group is given, fall back to
    ``FALLBACK_IMAGE_GRP``. Depending on ``operation_level``, binarize
    the whole page image or each (text/table) region image separately
    via ``_process_segment``, then add the resulting PAGE file to the
    page output group.
    """
    try:
        page_grp, self.image_grp = self.output_file_grp.split(',')
    except ValueError:
        page_grp = self.output_file_grp
        self.image_grp = FALLBACK_IMAGE_GRP
        LOG.info(
            "No output file group for images specified, falling back to '%s'",
            FALLBACK_IMAGE_GRP)
    oplevel = self.parameter['operation_level']
    for (n, input_file) in enumerate(self.input_files):
        page_id = input_file.pageId or input_file.ID
        LOG.info("INPUT FILE %i / %s", n, page_id)
        pcgts = page_from_file(self.workspace.download_file(input_file))
        # add metadata about this operation and its runtime parameters
        # (externalModel/externalId added for consistency with the other
        # processing steps in this code base):
        metadata = pcgts.get_Metadata()
        metadata.add_MetadataItem(
            MetadataItemType(
                type_="processingStep",
                name=self.ocrd_tool['steps'][0],
                value=TOOL,
                Labels=[
                    LabelsType(externalModel="ocrd-tool",
                               externalId="parameters",
                               Label=[
                                   LabelType(type_=name,
                                             value=self.parameter[name])
                                   for name in self.parameter.keys()
                               ])
                ]))
        page = pcgts.get_Page()
        # request the non-binarized form of the page image:
        page_image, page_xywh, page_image_info = self.workspace.image_from_page(
            page, page_id, feature_filter="binarized")
        LOG.info("Binarizing on '%s' level in page '%s'", oplevel, page_id)
        if oplevel == "page":
            self._process_segment(page_image, page, page_xywh, page_id,
                                  input_file, n)
        else:
            regions = page.get_TextRegion() + page.get_TableRegion()
            if not regions:
                LOG.warning("Page '%s' contains no text regions", page_id)
            for (k, region) in enumerate(regions):
                region_image, region_xywh = self.workspace.image_from_segment(
                    region, page_image, page_xywh)
                # TODO: not tested on regions
                self._process_segment(region_image, page, region_xywh,
                                      region.id, input_file,
                                      str(n) + "_" + str(k))
        # Use input_file's basename for the new file -
        # this way the files retain the same basenames:
        file_id = input_file.ID.replace(self.input_file_grp, page_grp)
        if file_id == input_file.ID:
            # FIX: use page_grp here - self.output_file_grp may still be the
            # comma-separated "PAGE,IMG" pair, which would leak into the ID
            file_id = concat_padded(page_grp, n)
        self.workspace.add_file(ID=file_id,
                                file_grp=page_grp,
                                pageId=input_file.pageId,
                                mimetype=MIMETYPE_PAGE,
                                # FIX: write under page_grp (was
                                # self.output_file_grp, wrong directory when the
                                # output group is a comma-separated pair):
                                local_filename=os.path.join(
                                    page_grp, file_id + '.xml'),
                                content=to_xml(pcgts).encode('utf-8'),
                                force=self.parameter['force'])
def process(self):
    """Performs segmentation on the input binary image.

    Produces a PageXML file as output: existing non-text region
    annotation is dropped (and TextRegions too, if
    ``overwrite_regions`` is set), then the pixel classifier is run on
    the (alpha-stripped) page image via ``_process_page``.
    """
    LOG = getLogger('processor.PixelClassifierSegmentation')
    assert_file_grp_cardinality(self.input_file_grp, 1)
    assert_file_grp_cardinality(self.output_file_grp, 1)
    overwrite_regions = self.parameter['overwrite_regions']
    xheight = self.parameter['xheight']
    gpu_allow_growth = self.parameter['gpu_allow_growth']
    resize_height = self.parameter['resize_height']
    model = self.parameter['model']
    # resolve symbolic model names to the bundled model paths:
    if model == '__DEFAULT__':
        from ocrd_pc_segmentation import DEFAULT_SEGMENTATION_MODEL_PATH
        model = DEFAULT_SEGMENTATION_MODEL_PATH
    elif model == '__LEGACY__':
        from ocrd_pc_segmentation import LEGACY_SEGMENTATION_MODEL_PATH
        model = LEGACY_SEGMENTATION_MODEL_PATH
    page_grp = self.output_file_grp
    for n, input_file in enumerate(self.input_files):
        page_id = input_file.pageId or input_file.ID
        LOG.info("INPUT FILE %i / %s", n, page_id)
        pcgts = page_from_file(self.workspace.download_file(input_file))
        # record this processing step and its parameters in PAGE metadata:
        metadata = pcgts.get_Metadata()  # ensured by from_file()
        metadata.add_MetadataItem(
            MetadataItemType(
                type_="processingStep",
                name=self.ocrd_tool['steps'][0],
                value=TOOL,
                Labels=[
                    LabelsType(externalModel="ocrd-tool",
                               externalId="parameters",
                               Label=[LabelType(type_=name,
                                                value=self.parameter[name])
                                      for name in self.parameter.keys()])
                ]))
        page = pcgts.get_Page()
        if page.get_TextRegion():
            if overwrite_regions:
                LOG.info('removing existing TextRegions')
                page.set_TextRegion([])
            else:
                LOG.warning('keeping existing TextRegions')
        # clear all non-text region annotation, which would clash with
        # the newly detected segmentation:
        for kind in ('Advert', 'Chart', 'Chem', 'Graphic', 'Image',
                     'LineDrawing', 'Maths', 'Music', 'Noise',
                     'Separator', 'Table', 'Unknown'):
            getattr(page, 'set_%sRegion' % kind)([])
        page_image, page_coords, _ = self.workspace.image_from_page(
            page, page_id)
        # ensure the image doesn't have an alpha channel
        if page_image.mode[-1] == "A":
            page_image = page_image.convert(mode=page_image.mode[0:-1])
        page_binary = page_image.convert(mode='1')
        self._process_page(page, np.asarray(page_image),
                           np.asarray(page_binary), page_coords, xheight,
                           model, gpu_allow_growth, resize_height)
        file_id = make_file_id(input_file, self.output_file_grp)
        self.workspace.add_file(ID=file_id,
                                file_grp=page_grp,
                                pageId=input_file.pageId,
                                mimetype=MIMETYPE_PAGE,
                                local_filename=os.path.join(
                                    page_grp, file_id + '.xml'),
                                content=to_xml(pcgts))
def process(self):
    """Performs region segmentation by reading mask images in pseudo-colour.

    Open and deserialize each PAGE input file (or generate from image input
    file) from the first input file group, as well as mask image file from
    the second. Then iterate over all connected (equally colored) mask
    segments and compute convex hull contours for them. Convert them to
    polygons, and look up their color value in ``colordict`` to instantiate
    the appropriate region types (optionally with subtype). Instantiate
    and annotate regions accordingly.

    Produce a new output file by serialising the resulting hierarchy.
    """
    colordict = self.parameter['colordict']
    if not colordict:
        # fall back to the inverse of the global PAGE class->color table:
        LOG.info('Using default PAGE colordict')
        colordict = dict(('#' + col, name)
                         for name, col in CLASSES.items() if name)
    # region classes that support a @type subtype attribute:
    typedict = {"TextRegion": TextTypeSimpleType,
                "GraphicRegion": GraphicsTypeSimpleType,
                "ChartType": ChartTypeSimpleType}
    ifgs = self.input_file_grp.split(",")  # input file groups
    if len(ifgs) != 2:
        raise Exception("need 2 input file groups (base and mask)")
    # collect input file tuples
    ifts = self.zip_input_files(ifgs)  # input file tuples
    # process input file tuples
    for n, ift in enumerate(ifts):
        input_file, segmentation_file = ift
        LOG.info("processing page %s", input_file.pageId)
        pcgts = page_from_file(self.workspace.download_file(input_file))
        page = pcgts.get_Page()
        # add metadata about this operation and its runtime parameters:
        metadata = pcgts.get_Metadata()  # ensured by from_file()
        metadata.add_MetadataItem(
            MetadataItemType(type_="processingStep",
                             name=self.ocrd_tool['steps'][0],
                             value=TOOL,
                             Labels=[LabelsType(
                                 externalModel="ocrd-tool",
                                 externalId="parameters",
                                 Label=[LabelType(type_=name,
                                                  value=self.parameter[name])
                                        for name in self.parameter.keys()])]))
        # import mask image
        segmentation_filename = self.workspace.download_file(segmentation_file).local_filename
        with pushd_popd(self.workspace.directory):
            segmentation_pil = Image.open(segmentation_filename)
        has_alpha = segmentation_pil.mode == 'RGBA'
        if has_alpha:
            # 4-channel colors encoded as 8 hex digits
            colorformat = "#%08X"
        else:
            colorformat = "#%06X"
            # NOTE(review): conversion nested in the no-alpha branch -- the
            # 4-coefficient dot product below requires RGBA images to keep
            # their 4 channels; confirm against original formatting
            if segmentation_pil.mode != 'RGB':
                segmentation_pil = segmentation_pil.convert('RGB')
        # convert to array
        segmentation_array = np.array(segmentation_pil)
        # collapse 3 color channels
        segmentation_array = segmentation_array.dot(
            np.array([2**24, 2**16, 2**8, 1], np.uint32)[0 if has_alpha else 1:])
        # partition mapped colors vs background
        colors = np.unique(segmentation_array)
        bgcolors = []
        for i, color in enumerate(colors):
            colorname = colorformat % color
            if (colorname not in colordict or
                    not colordict[colorname]):
                #raise Exception("Unknown color %s (not in colordict)" % colorname)
                LOG.info("Ignoring background color %s", colorname)
                bgcolors.append(i)
        # build a mask of all background pixels (used for Border below):
        background = np.zeros_like(segmentation_array, np.uint8)
        if bgcolors:
            for i in bgcolors:
                background += np.array(segmentation_array == colors[i], np.uint8)
            colors = np.delete(colors, bgcolors, 0)
        # iterate over mask for each mapped color/class
        regionno = 0
        for color in colors:
            # get region (sub)type
            colorname = colorformat % color
            classname = colordict[colorname]
            regiontype = None
            custom = None
            if ":" in classname:
                # colordict value is "classname:regiontype"
                classname, regiontype = classname.split(":")
                if classname in typedict:
                    # NOTE(review): membername() presumably maps a subtype
                    # value back to its enum member name; when unchanged,
                    # the subtype is not predefined in PAGE -- confirm
                    typename = membername(typedict[classname], regiontype)
                    if typename == regiontype:
                        # not predefined in PAGE: use other + custom
                        custom = "subtype:%s" % regiontype
                        regiontype = "other"
                else:
                    # class has no @type attribute: record subtype as custom
                    custom = "subtype:%s" % regiontype
            if classname + "Type" not in globals():
                raise Exception("Unknown class '%s' for color %s in colordict" %
                                (classname, colorname))
            classtype = globals()[classname + "Type"]
            if classtype is BorderType:
                # mask from all non-background regions
                classmask = 1 - background
            else:
                # mask from current color/class
                classmask = np.array(segmentation_array == color, np.uint8)
            if not np.count_nonzero(classmask):
                continue
            # now get the contours and make polygons for them
            contours, _ = cv2.findContours(classmask, cv2.RETR_EXTERNAL,
                                           cv2.CHAIN_APPROX_SIMPLE)
            for contour in contours:
                # (could also just take bounding boxes to avoid islands/inclusions...)
                area = cv2.contourArea(contour)
                # filter too small regions
                area_pct = area / np.prod(segmentation_array.shape) * 100
                if area < 100 and area_pct < 0.1:
                    LOG.warning('ignoring contour of only %.1f%% area for %s',
                                area_pct, classname)
                    continue
                LOG.info('found region %s:%s:%s with area %.1f%%',
                         classname, regiontype or '', custom or '', area_pct)
                # simplify shape
                poly = cv2.approxPolyDP(contour, 2, False)[:, 0, ::]  # already ordered x,y
                if len(poly) < 4:
                    LOG.warning('ignoring contour of only %d points (area %.1f%%) for %s',
                                len(poly), area_pct, classname)
                    continue
                if classtype is BorderType:
                    # add Border (only one Border per page: stop after first contour)
                    page.set_Border(BorderType(
                        Coords=CoordsType(points=points_from_polygon(poly))))
                    break
                else:
                    # instantiate region
                    regionno += 1
                    region = classtype(id="region_%d" % regionno,
                                       type_=regiontype,
                                       custom=custom,
                                       Coords=CoordsType(points=points_from_polygon(poly)))
                    # add region
                    getattr(page, 'add_%s' % classname)(region)
        # Use input_file's basename for the new file -
        # this way the files retain the same basenames:
        file_id = input_file.ID.replace(ifgs[0], self.output_file_grp)
        if file_id == input_file.ID:
            file_id = concat_padded(self.output_file_grp, n)
        self.workspace.add_file(
            ID=file_id,
            file_grp=self.output_file_grp,
            pageId=input_file.pageId,
            mimetype=MIMETYPE_PAGE,
            local_filename=os.path.join(self.output_file_grp, file_id + '.xml'),
            content=to_xml(pcgts))
def process(self):
    """Dewarp page (or region) images with NVIDIA's pix2pixHD on the workspace.

    Requires CUDA and a locally cloned pix2pixHD repository (path given by
    the ``pix2pixHD`` parameter) with a trained generator checkpoint.
    Depending on ``operation_level``, the whole (binarized, not yet
    dewarped) page image or each region image is fed through the model
    via ``_process_segment``; the resulting PAGE file is added to the
    page output group.
    """
    try:
        page_grp, self.image_grp = self.output_file_grp.split(',')
    except ValueError:
        page_grp = self.output_file_grp
        self.image_grp = FALLBACK_IMAGE_GRP
        LOG.info(
            "No output file group for images specified, falling back to '%s'",
            FALLBACK_IMAGE_GRP)
    # hard requirement: pix2pixHD inference only runs on GPU here
    if not torch.cuda.is_available():
        LOG.error("Your system has no CUDA installed. No GPU detected.")
        sys.exit(1)
    path = self.parameter['pix2pixHD']
    if not Path(path).is_dir():
        # NOTE(review): exact line breaks of this message could not be
        # recovered from the original formatting
        LOG.error("""\
NVIDIA's pix2pixHD was not found at '%s'. Make sure the `pix2pixHD` parameter
in ocrd-tools.json points to the local path to the cloned pix2pixHD repository.
pix2pixHD can be downloaded from https://github.com/NVIDIA/pix2pixHD
""" % path)
        sys.exit(1)
    model_file_path = os.path.join(path, 'checkpoints/latest_net_G.pth')
    if not Path(model_file_path).is_file():
        LOG.error("""\
pix2pixHD model file was not found at '%s'. Make sure the
this file exists.
""" % model_file_path)
        sys.exit(1)
    opt, model = self.prepare_options(path)
    oplevel = self.parameter['operation_level']
    for (n, input_file) in enumerate(self.input_files):
        page_id = input_file.pageId or input_file.ID
        LOG.info("INPUT FILE %s", page_id)
        pcgts = page_from_file(self.workspace.download_file(input_file))
        # add metadata about this operation and its runtime parameters:
        metadata = pcgts.get_Metadata()
        metadata.add_MetadataItem(
            MetadataItemType(
                type_="processingStep",
                name=self.ocrd_tool['steps'][0],
                value=TOOL,
                Labels=[
                    LabelsType(
                        #externalRef="parameters",
                        Label=[
                            LabelType(type_=name,
                                      value=self.parameter[name])
                            for name in self.parameter.keys()
                        ])
                ]))
        page = pcgts.get_Page()
        # images should be deskewed and cropped
        page_image, page_xywh, page_image_info = self.workspace.image_from_page(
            page, page_id,
            feature_filter='dewarped',
            feature_selector='binarized')
        if oplevel == 'page':
            dataset = self.prepare_data(opt, page_image, path)
            orig_img_size = page_image.size
            self._process_segment(model, dataset, page, page_xywh, page_id,
                                  input_file, orig_img_size, n)
        else:
            regions = page.get_TextRegion() + page.get_TableRegion()  # get all regions?
            if not regions:
                LOG.warning("Page '%s' contains no text regions", page_id)
            for (k, region) in enumerate(regions):
                region_image, region_xywh = self.workspace.image_from_segment(
                    region, page_image, page_xywh)
                # TODO: not tested on regions
                # TODO: region has to exist as a physical file to be processed by pix2pixHD
                dataset = self.prepare_data(opt, region_image, path)
                orig_img_size = region_image.size
                self._process_segment(model, dataset, page, region_xywh,
                                      region.id, input_file, orig_img_size, n)
        # Use input_file's basename for the new file -
        # this way the files retain the same basenames:
        file_id = input_file.ID.replace(self.input_file_grp, page_grp)
        if file_id == input_file.ID:
            file_id = concat_padded(page_grp, n)
        self.workspace.add_file(ID=file_id,
                                file_grp=page_grp,
                                pageId=input_file.pageId,
                                mimetype=MIMETYPE_PAGE,
                                local_filename=os.path.join(
                                    page_grp, file_id + '.xml'),
                                content=to_xml(pcgts).encode('utf-8'),
                                force=self.parameter['force'])
    # pix2pixHD reads its input from a directory tree; clean it up afterwards
    os.rmdir(self.input_file_grp + "/test_A/")  #FIXME: better way of deleting a temp_dir?
def process(self):
    """Extract page images and region descriptions (type and coordinates) from the workspace.

    Open and deserialize PAGE input files and their respective images,
    then iterate over the element hierarchy down to the region level.

    Get all regions with their types (region element class), sub-types
    (@type) and coordinates relative to the page (which depending on the
    workflow could already be cropped, deskewed, dewarped, binarized etc).
    Extract the image of the (cropped, deskewed, dewarped) page, both in
    binarized form (if available) and non-binarized form. In addition,
    create a new image with masks for all regions, color-coded by type.
    Create two JSON files with region types and coordinates: one
    (page-wise) in our custom format and one (global) in MS-COCO.

    The output file group may be given as a comma-separated list to
    separate these 3 page-level images. Write files as follows:
    * in the first (or only) output file group (directory):
      - ID + '.png': raw image of the (preprocessed) page
      - ID + '.json': region coordinates/classes (custom format)
    * in the second (or first) output file group (directory):
      - ID + '.bin.png': binarized image of the (preprocessed) page, if available
    * in the third (or first) output file group (directory):
      - ID + '.dbg.png': debug image

    In addition, write a file for all pages at once:
    * in the third (or first) output file group (directory):
      - output_file_grp + '.coco.json': region coordinates/classes (MS-COCO format)
      - output_file_grp + '.colordict.json': color definitions (as in PAGE viewer)

    (This is intended for training and evaluation of region segmentation models.)
    """
    file_groups = self.output_file_grp.split(',')
    if len(file_groups) > 3:
        raise Exception(
            "at most 3 output file grps allowed (raw, [binarized, [mask]] image)")
    if len(file_groups) > 2:
        dbg_image_grp = file_groups[2]
    else:
        dbg_image_grp = file_groups[0]
        LOG.info(
            "No output file group for debug images specified, falling back to output filegrp '%s'",
            dbg_image_grp)
    if len(file_groups) > 1:
        bin_image_grp = file_groups[1]
    else:
        bin_image_grp = file_groups[0]
        LOG.info(
            "No output file group for binarized images specified, falling back to output filegrp '%s'",
            bin_image_grp)
    self.output_file_grp = file_groups[0]
    # COCO: init data structures
    images = list()
    annotations = list()
    categories = list()
    i = 0
    for cat, color in CLASSES.items():
        # COCO format does not allow alpha channel
        color = (int(color[0:2], 16),
                 int(color[2:4], 16),
                 int(color[4:6], 16))
        try:
            supercat, name = cat.split(':')
        except ValueError:
            name = cat
            supercat = ''
        categories.append({
            'id': i,
            'name': name,
            'supercategory': supercat,
            'source': 'PAGE',
            'color': color
        })
        i += 1
    i = 0
    # pylint: disable=attribute-defined-outside-init
    for n, input_file in enumerate(self.input_files):
        file_id = input_file.ID.replace(self.input_file_grp,
                                        self.output_file_grp)
        page_id = input_file.pageId or input_file.ID
        # COCO needs a numerical image id: strip everything but the digits
        # (NOTE: raises ValueError if page_id contains no digits at all)
        num_page_id = int(page_id.strip(page_id.strip("0123456789")))
        LOG.info("INPUT FILE %i / %s", n, page_id)
        pcgts = page_from_file(self.workspace.download_file(input_file))
        page = pcgts.get_Page()
        ptype = page.get_type()
        # add metadata about this operation and its runtime parameters:
        metadata = pcgts.get_Metadata()  # ensured by from_file()
        metadata.add_MetadataItem(
            MetadataItemType(
                type_="processingStep",
                name=self.ocrd_tool['steps'][0],
                value=TOOL,
                Labels=[
                    LabelsType(externalModel="ocrd-tool",
                               externalId="parameters",
                               Label=[
                                   LabelType(type_=name,
                                             value=self.parameter[name])
                                   for name in self.parameter
                               ])
                ]))
        # raw (non-binarized) page image:
        page_image, page_coords, page_image_info = self.workspace.image_from_page(
            page, page_id,
            feature_filter='binarized',
            transparency=self.parameter['transparency'])
        if page_image_info.resolution != 1:
            dpi = page_image_info.resolution
            if page_image_info.resolutionUnit == 'cm':
                dpi = round(dpi * 2.54)
        else:
            dpi = None
        file_path = self.workspace.save_image_file(
            page_image,
            file_id,
            self.output_file_grp,
            page_id=page_id,
            mimetype=self.parameter['mimetype'])
        # binarized page image (best-effort - may not exist):
        try:
            page_image_bin, _, _ = self.workspace.image_from_page(
                page, page_id,
                feature_selector='binarized',
                transparency=self.parameter['transparency'])
            self.workspace.save_image_file(page_image_bin,
                                           file_id + '.bin',
                                           bin_image_grp,
                                           page_id=page_id)
        except Exception as err:
            if err.args[0].startswith('Found no AlternativeImage'):
                LOG.warning('Page "%s" has no binarized images, skipping .bin',
                            page_id)
            else:
                raise
        # debug image: color-coded region masks over background color:
        page_image_dbg = Image.new(mode='RGBA',
                                   size=page_image.size,
                                   color='#' + CLASSES[''])
        if page.get_Border():
            polygon = coordinates_of_segment(page.get_Border(), page_image,
                                             page_coords).tolist()
            ImageDraw.Draw(page_image_dbg).polygon(
                list(map(tuple, polygon)), fill='#' + CLASSES['Border'])
        else:
            page_image_dbg.paste(
                '#' + CLASSES['Border'],
                (0, 0, page_image.width, page_image.height))
        regions = dict()
        for name in CLASSES.keys():
            if not name or name == 'Border' or ':' in name:
                # no subtypes here
                continue
            regions[name] = getattr(page, 'get_' + name)()
        description = {'angle': page.get_orientation()}
        Neighbor = namedtuple('Neighbor', ['id', 'poly', 'type'])
        neighbors = []
        for rtype, rlist in regions.items():
            for region in rlist:
                if rtype in ['TextRegion', 'ChartRegion', 'GraphicRegion']:
                    subrtype = region.get_type()
                else:
                    subrtype = None
                polygon = coordinates_of_segment(region, page_image,
                                                 page_coords)
                polygon2 = polygon.reshape(1, -1).tolist()
                polygon = polygon.tolist()
                xywh = xywh_from_polygon(polygon)
                # validate coordinates and check intersection with neighbours
                # (which would melt into another in the mask image):
                # FIX: guard the validity checks - previously, if Polygon()
                # raised ValueError, poly was unbound and poly.is_valid
                # raised NameError instead of logging and skipping
                poly = None
                try:
                    poly = Polygon(polygon)
                    reason = ''
                except ValueError as err:
                    reason = str(err)
                if poly is not None:
                    if not poly.is_valid:
                        reason = explain_validity(poly)
                    elif poly.is_empty:
                        reason = 'is empty'
                    elif poly.bounds[0] < 0 or poly.bounds[1] < 0:
                        reason = 'is negative'
                    elif poly.length < 4:
                        reason = 'has too few points'
                if reason:
                    LOG.error('Page "%s" region "%s" %s',
                              page_id, region.id, reason)
                    continue
                poly_prep = prep(poly)
                for neighbor in neighbors:
                    if (rtype == neighbor.type
                            and poly_prep.intersects(neighbor.poly)
                            and poly.intersection(neighbor.poly).area > 0):
                        LOG.warning(
                            'Page "%s" region "%s" intersects neighbour "%s" (IoU: %.3f)',
                            page_id, region.id, neighbor.id,
                            poly.intersection(neighbor.poly).area /
                            poly.union(neighbor.poly).area)
                    elif (rtype != neighbor.type
                          and poly_prep.within(neighbor.poly)):
                        LOG.warning(
                            'Page "%s" region "%s" within neighbour "%s" (IoU: %.3f)',
                            page_id, region.id, neighbor.id,
                            poly.area / neighbor.poly.area)
                neighbors.append(Neighbor(region.id, poly, rtype))
                area = poly.area
                description.setdefault('regions', []).append({
                    'type': rtype,
                    'subtype': subrtype,
                    'coords': polygon,
                    'area': area,
                    'features': page_coords['features'],
                    'DPI': dpi,
                    'region.ID': region.id,
                    'page.ID': page_id,
                    'page.type': ptype,
                    'file_grp': self.input_file_grp,
                    'METS.UID': self.workspace.mets.unique_identifier
                })
                # draw region:
                ImageDraw.Draw(page_image_dbg).polygon(
                    list(map(tuple, polygon)),
                    fill='#' + CLASSES[(rtype + ':' + subrtype)
                                       if subrtype else rtype])
                # COCO: add annotations
                i += 1
                annotations.append({
                    'id': i,
                    'image_id': num_page_id,
                    # prefer the subtype's category; fall back to the class:
                    'category_id': next(
                        (cat['id'] for cat in categories
                         if cat['name'] == subrtype),
                        next((cat['id'] for cat in categories
                              if cat['name'] == rtype))),
                    'segmentation': polygon2,
                    'area': area,
                    'bbox': [xywh['x'], xywh['y'], xywh['w'], xywh['h']],
                    'iscrowd': 0
                })
        self.workspace.save_image_file(page_image_dbg,
                                       file_id + '.dbg',
                                       dbg_image_grp,
                                       page_id=page_id,
                                       mimetype=self.parameter['mimetype'])
        self.workspace.add_file(
            ID=file_id + '.json',
            file_grp=dbg_image_grp,
            pageId=page_id,
            local_filename=file_path.replace(
                MIME_TO_EXT[self.parameter['mimetype']], '.json'),
            mimetype='application/json',
            content=json.dumps(description))
        # COCO: add image
        images.append({
            # COCO does not allow string identifiers:
            # -> use numerical part of page_id
            'id': num_page_id,
            # all exported coordinates are relative to the cropped page:
            # -> use that for reference (instead of original page.imageFilename)
            'file_name': file_path,
            # -> use its size (instead of original page.imageWidth/page.imageHeight)
            'width': page_image.width,
            'height': page_image.height
        })
    # COCO: write result
    file_id = dbg_image_grp + '.coco.json'
    LOG.info('Writing COCO result file "%s" in "%s"', file_id, dbg_image_grp)
    self.workspace.add_file(ID=file_id,
                            file_grp=dbg_image_grp,
                            local_filename=os.path.join(dbg_image_grp, file_id),
                            mimetype='application/json',
                            content=json.dumps({
                                'categories': categories,
                                'images': images,
                                'annotations': annotations
                            }))
    # write inverse colordict (for ocrd-segment-from-masks)
    file_id = dbg_image_grp + '.colordict.json'
    LOG.info('Writing colordict file "%s" in .', file_id)
    with open(file_id, 'w') as out:
        json.dump(
            dict(('#' + col, name)
                 for name, col in CLASSES.items() if name),
            out)
def process(self):
    """Extract page image and replace original with it.

    Open and deserialize PAGE input files and their respective images,
    then go to the page hierarchy level.

    Retrieve the image of the (cropped, deskewed, dewarped) page, preferring
    the last annotated form (which, depending on the workflow, could be
    binarized or raw). Add that image file to the workspace with the fileGrp
    USE given in the second position of the output fileGrp, or
    ``OCR-D-IMG-SUBST``. Reference that file in the page (not as
    AlternativeImage but) as original image. Adjust all segment coordinates
    accordingly.

    Produce a new output file by serialising the resulting hierarchy.
    """
    # output fileGrp may be "PAGEGRP,IMAGEGRP"; fall back to a default
    # image group when only one group is given:
    try:
        page_grp, image_grp = self.output_file_grp.split(',')
    except ValueError:
        page_grp = self.output_file_grp
        image_grp = FALLBACK_FILEGRP_IMG
        LOG.info(
            "No output file group for images specified, falling back to '%s'",
            image_grp)
    feature_selector = self.parameter['feature_selector']
    feature_filter = self.parameter['feature_filter']
    adapt_coords = self.parameter['transform_coordinates']
    # pylint: disable=attribute-defined-outside-init
    for n, input_file in enumerate(self.input_files):
        # derive output file ID from input ID; fall back to a numbered ID
        # when the fileGrp substitution is a no-op:
        file_id = input_file.ID.replace(self.input_file_grp, page_grp)
        if file_id == input_file.ID:
            file_id = concat_padded(page_grp, n)
        page_id = input_file.pageId or input_file.ID
        LOG.info("INPUT FILE %i / %s", n, page_id)
        pcgts = page_from_file(self.workspace.download_file(input_file))
        page = pcgts.get_Page()
        metadata = pcgts.get_Metadata()  # ensured by from_file()
        # record this processing step and its parameters in the PAGE metadata:
        metadata.add_MetadataItem(
            MetadataItemType(
                type_="processingStep",
                name=self.ocrd_tool['steps'][0],
                value=TOOL,
                Labels=[
                    LabelsType(externalModel="ocrd-tool",
                               externalId="parameters",
                               Label=[
                                   LabelType(type_=name,
                                             value=self.parameter[name])
                                   for name in self.parameter
                               ])
                ]))
        # fetch the most derived page image matching the requested features:
        page_image, page_coords, page_image_info = self.workspace.image_from_page(
            page,
            page_id,
            feature_filter=feature_filter,
            feature_selector=feature_selector)
        # resolution of exactly 1 is the PAGE default, i.e. "unknown":
        if page_image_info.resolution != 1:
            dpi = page_image_info.resolution
            if page_image_info.resolutionUnit == 'cm':
                # convert pixels-per-cm to pixels-per-inch:
                dpi = round(dpi * 2.54)
        else:
            dpi = None
        # NOTE(review): dpi is computed but not used anywhere below — TODO confirm
        # whether it was meant to be annotated on the output.
        # annotate extracted image
        file_path = self.workspace.save_image_file(
            page_image,
            file_id.replace(page_grp, image_grp),
            image_grp,
            page_id=input_file.pageId,
            mimetype='image/png')
        # replace original image
        page.set_imageFilename(file_path)
        # adjust all coordinates (of all segments on all hierarchy levels)
        # from the old original image to the newly referenced one:
        if adapt_coords:
            for region in page.get_AllRegions():
                region_polygon = coordinates_of_segment(
                    region, page_image, page_coords)
                region.get_Coords().points = points_from_polygon(
                    region_polygon)
                # only text regions have line/word/glyph children:
                if isinstance(region, TextRegionType):
                    for line in region.get_TextLine():
                        line_polygon = coordinates_of_segment(
                            line, page_image, page_coords)
                        line.get_Coords().points = points_from_polygon(
                            line_polygon)
                        for word in line.get_Word():
                            word_polygon = coordinates_of_segment(
                                word, page_image, page_coords)
                            word.get_Coords().points = points_from_polygon(
                                word_polygon)
                            for glyph in word.get_Glyph():
                                glyph_polygon = coordinates_of_segment(
                                    glyph, page_image, page_coords)
                                glyph.get_Coords(
                                ).points = points_from_polygon(
                                    glyph_polygon)
        # update METS (add the PAGE file):
        file_path = os.path.join(page_grp, file_id + '.xml')
        out = self.workspace.add_file(ID=file_id,
                                      file_grp=page_grp,
                                      pageId=input_file.pageId,
                                      local_filename=file_path,
                                      mimetype=MIMETYPE_PAGE,
                                      content=to_xml(pcgts))
        LOG.info('created file ID: %s, file_grp: %s, path: %s', file_id,
                 page_grp, out.local_filename)
def process(self):
    """Performs segmentation evaluation with Shapely on the workspace.

    Open and deserialize PAGE input files and their respective images,
    then iterate over the element hierarchy down to the region level.

    Return information on the plausibility of the segmentation into
    regions on the logging level. If ``plausibilize`` is set, also drop
    text regions that (almost) equal or are contained in other regions.
    """
    plausibilize = self.parameter['plausibilize']
    for n, input_file in enumerate(self.input_files):
        page_id = input_file.pageId or input_file.ID
        LOG.info("INPUT FILE %i / %s", n, page_id)
        pcgts = page_from_file(self.workspace.download_file(input_file))
        metadata = pcgts.get_Metadata()  # ensured by from_file()
        # record this processing step and its parameters in the PAGE metadata:
        metadata.add_MetadataItem(
            MetadataItemType(
                type_="processingStep",
                name=self.ocrd_tool['steps'][0],
                value=TOOL,
                # FIXME: externalRef is invalid by pagecontent.xsd, but ocrd does not reflect this
                # what we want here is `externalModel="ocrd-tool" externalId="parameters"`
                Labels=[
                    LabelsType(
                        # externalRef="parameters",
                        Label=[
                            LabelType(type_=name, value=self.parameter[name])
                            for name in self.parameter.keys()
                        ])
                ]))
        page = pcgts.get_Page()
        regions = page.get_TextRegion()
        # indices of regions to drop when plausibilizing:
        mark_for_deletion = set()
        # pairwise comparison of all region polygons:
        for i in range(len(regions)):
            # polygon of regions[i] is invariant in j — build it once here
            # instead of on every inner iteration:
            region_poly1 = Polygon(
                polygon_from_points(regions[i].get_Coords().points))
            for j in range(i + 1, len(regions)):
                LOG.info('Comparing regions "%s" and "%s"',
                         regions[i].id, regions[j].id)
                region_poly2 = Polygon(
                    polygon_from_points(regions[j].get_Coords().points))
                LOG.debug('Checking for equality ...')
                # use logging.warning (warn is deprecated) with lazy %-args:
                if region_poly1.almost_equals(region_poly2):
                    LOG.warning(
                        'Warning: regions %s and %s cover the same area.',
                        regions[i].id, regions[j].id)
                    mark_for_deletion.add(j)
                LOG.debug('Checking for containment ...')
                if region_poly1.contains(region_poly2):
                    LOG.warning('Warning: %s contains %s',
                                regions[i].id, regions[j].id)
                    mark_for_deletion.add(j)
                if region_poly2.contains(region_poly1):
                    LOG.warning('Warning: %s contains %s',
                                regions[j].id, regions[i].id)
                    mark_for_deletion.add(i)
        if plausibilize:
            # keep only regions not marked for deletion:
            page.set_TextRegion([
                region for i, region in enumerate(regions)
                if i not in mark_for_deletion
            ])
        # Use input_file's basename for the new file -
        # this way the files retain the same basenames:
        file_id = input_file.ID.replace(self.input_file_grp,
                                        self.output_file_grp)
        if file_id == input_file.ID:
            file_id = concat_padded(self.output_file_grp, n)
        self.workspace.add_file(ID=file_id,
                                file_grp=self.output_file_grp,
                                pageId=input_file.pageId,
                                mimetype=MIMETYPE_PAGE,
                                local_filename=os.path.join(
                                    self.output_file_grp, file_id + '.xml'),
                                content=to_xml(pcgts))
def process(self):
    """Extract page images and region descriptions (type and coordinates) from the workspace.

    Open and deserialize PAGE input files and their respective images,
    then iterate over the element hierarchy down to the region level.

    Get all regions with their types (region element class), sub-types
    (@type) and coordinates relative to the page (which depending on the
    workflow could already be cropped, deskewed, dewarped, binarized etc).
    Extract the image of the page, both in binarized and non-binarized
    form. In addition, create a new image which color-codes all regions.
    Create a JSON file with region types and coordinates.

    Write all files in the directory of the output file group, named like so:

    * ID + '.png': raw image
    * ID + '.bin.png': binarized image
    * ID + '.dbg.png': debug image
    * ID + '.json': region coordinates

    (This is intended for training and evaluation of region segmentation models.)
    """
    # pylint: disable=attribute-defined-outside-init
    for n, input_file in enumerate(self.input_files):
        file_id = input_file.ID.replace(self.input_file_grp,
                                        self.output_file_grp)
        # fall back to a numbered ID when the fileGrp substitution is a
        # no-op (consistent with the sibling processors; avoids METS
        # file-ID clashes between input and output groups):
        if file_id == input_file.ID:
            file_id = concat_padded(self.output_file_grp, n)
        page_id = input_file.pageId or input_file.ID
        LOG.info("INPUT FILE %i / %s", n, page_id)
        pcgts = page_from_file(self.workspace.download_file(input_file))
        page = pcgts.get_Page()
        ptype = page.get_type()
        metadata = pcgts.get_Metadata()  # ensured by from_file()
        # record this processing step and its parameters in the PAGE metadata:
        metadata.add_MetadataItem(
            MetadataItemType(
                type_="processingStep",
                name=self.ocrd_tool['steps'][0],
                value=TOOL,
                Labels=[
                    LabelsType(externalModel="ocrd-tool",
                               externalId="parameters",
                               Label=[
                                   LabelType(type_=name,
                                             value=self.parameter[name])
                                   for name in self.parameter.keys()
                               ])
                ]))
        # raw (non-binarized) page image:
        page_image, page_coords, page_image_info = self.workspace.image_from_page(
            page,
            page_id,
            feature_filter='binarized',
            transparency=self.parameter['transparency'])
        # resolution of exactly 1 is the PAGE default, i.e. "unknown":
        if page_image_info.resolution != 1:
            dpi = page_image_info.resolution
            if page_image_info.resolutionUnit == 'cm':
                dpi = round(dpi * 2.54)
        else:
            dpi = None
        file_path = self.workspace.save_image_file(page_image,
                                                   file_id,
                                                   self.output_file_grp,
                                                   page_id=page_id)
        # binarized page image:
        page_image_bin, _, _ = self.workspace.image_from_page(
            page,
            page_id,
            feature_selector='binarized',
            transparency=self.parameter['transparency'])
        self.workspace.save_image_file(page_image_bin,
                                       file_id + '.bin',
                                       self.output_file_grp,
                                       page_id=page_id)
        # debug image color-coding all regions:
        page_image_dbg = Image.new(mode='RGB', size=page_image.size, color=0)
        regions = {
            'text': page.get_TextRegion(),
            'table': page.get_TableRegion(),
            'chart': page.get_ChartRegion(),
            'chem': page.get_ChemRegion(),
            'graphic': page.get_GraphicRegion(),
            'image': page.get_ImageRegion(),
            'linedrawing': page.get_LineDrawingRegion(),
            'maths': page.get_MathsRegion(),
            'music': page.get_MusicRegion(),
            'noise': page.get_NoiseRegion(),
            'separator': page.get_SeparatorRegion(),
            'unknown': page.get_UnknownRegion()
        }
        description = {'angle': page.get_orientation()}
        for rtype, rlist in regions.items():
            for region in rlist:
                # coordinates relative to the (possibly derived) page image:
                polygon = coordinates_of_segment(region, page_image,
                                                 page_coords).tolist()
                description.setdefault('regions', []).append({
                    'type': rtype,
                    # only these element classes carry a @type attribute:
                    'subtype': region.get_type()
                    if rtype in ['text', 'chart', 'graphic'] else None,
                    'coords': polygon,
                    'features': page_coords['features'],
                    'DPI': dpi,
                    'region.ID': region.id,
                    'page.ID': page_id,
                    'page.type': ptype,
                    'file_grp': self.input_file_grp,
                    'METS.UID': self.workspace.mets.unique_identifier
                })
                # fill the region area and outline it in the debug image:
                ImageDraw.Draw(page_image_dbg).polygon(list(
                    map(tuple, polygon)), fill=CLASSES[rtype])
                ImageDraw.Draw(page_image_dbg).line(list(
                    map(tuple, polygon + [polygon[0]])),
                                                    fill=CLASSES['border'],
                                                    width=3)
        self.workspace.save_image_file(page_image_dbg,
                                       file_id + '.dbg',
                                       self.output_file_grp,
                                       page_id=page_id)
        file_path = file_path.replace('.png', '.json')
        # use a context manager so the file handle is closed deterministically
        # (was: json.dump(description, open(file_path, 'w')), which leaks it):
        with open(file_path, 'w') as json_file:
            json.dump(description, json_file)
def process(self):
    """Extract region images from the workspace.

    Open and deserialize PAGE input files and their respective images,
    then iterate over the element hierarchy down to the region level.

    Extract an image for each region (which depending on the workflow can
    already be deskewed, dewarped, binarized etc.), cropped to its minimal
    bounding box, and masked by the coordinate polygon outline. If
    ``transparency`` is true, then also add an alpha channel which is fully
    transparent outside of the mask.

    Create a JSON file with:

    * the IDs of the region and its parents,
    * the region's coordinates relative to the region image,
    * the region's absolute coordinates,
    * the (text) region's text content (if any),
    * the (text) region's TextStyle (if any),
    * the (text) region's @production (if any),
    * the (text) region's @readingDirection (if any),
    * the (text) region's @textLineOrder (if any),
    * the (text) region's @primaryScript (if any),
    * the (text) region's @primaryLanguage (if any),
    * the region's AlternativeImage/@comments (features),
    * the region's element class,
    * the region's @type,
    * the page's @type,
    * the page's DPI value.

    Write all files in the directory of the output file group, named like so:

    * ID + '.raw.png': region image (if the workflow provides raw images)
    * ID + '.bin.png': region image (if the workflow provides binarized images)
    * ID + '.nrm.png': region image (if the workflow provides grayscale-normalized images)
    * ID + '.json': region metadata.
    """
    # pylint: disable=attribute-defined-outside-init
    for n, input_file in enumerate(self.input_files):
        file_id = input_file.ID.replace(self.input_file_grp,
                                        self.output_file_grp)
        if file_id == input_file.ID:
            file_id = concat_padded(self.output_file_grp, n)
        page_id = input_file.pageId or input_file.ID
        LOG.info("INPUT FILE %i / %s", n, page_id)
        pcgts = page_from_file(self.workspace.download_file(input_file))
        page = pcgts.get_Page()
        metadata = pcgts.get_Metadata()  # ensured by from_file()
        # record this processing step and its parameters in the PAGE metadata:
        metadata.add_MetadataItem(
            MetadataItemType(type_="processingStep",
                             name=self.ocrd_tool['steps'][0],
                             value=TOOL,
                             Labels=[LabelsType(
                                 externalModel="ocrd-tool",
                                 externalId="parameters",
                                 Label=[LabelType(type_=name,
                                                  value=self.parameter[name])
                                        for name in self.parameter.keys()])]))
        page_image, page_coords, page_image_info = self.workspace.image_from_page(
            page, page_id, transparency=self.parameter['transparency'])
        # resolution of exactly 1 is the PAGE default, i.e. "unknown":
        if page_image_info.resolution != 1:
            dpi = page_image_info.resolution
            if page_image_info.resolutionUnit == 'cm':
                dpi = round(dpi * 2.54)
        else:
            dpi = None
        ptype = page.get_type()
        regions = {
            'advert': page.get_AdvertRegion(),
            'text': page.get_TextRegion(),
            'table': page.get_TableRegion(),
            'chart': page.get_ChartRegion(),
            'chem': page.get_ChemRegion(),
            'graphic': page.get_GraphicRegion(),
            'image': page.get_ImageRegion(),
            'linedrawing': page.get_LineDrawingRegion(),
            'maths': page.get_MathsRegion(),
            'music': page.get_MusicRegion(),
            'noise': page.get_NoiseRegion(),
            'separator': page.get_SeparatorRegion(),
            'unknown': page.get_UnknownRegion()
        }
        for rtype, rlist in regions.items():
            for region in rlist:
                description = {'region.ID': region.id, 'region.type': rtype}
                region_image, region_coords = self.workspace.image_from_segment(
                    region,
                    page_image,
                    page_coords,
                    transparency=self.parameter['transparency'])
                # only these element classes carry a @type attribute:
                description['subtype'] = region.get_type() if rtype in [
                    'text', 'chart', 'graphic'
                ] else None
                description['coords_rel'] = coordinates_of_segment(
                    region, region_image, region_coords).tolist()
                description['coords_abs'] = polygon_from_points(
                    region.get_Coords().points)
                if rtype == 'text':
                    rtext = region.get_TextEquiv()
                    if rtext:
                        description['region.text'] = rtext[0].Unicode
                    else:
                        description['region.text'] = ''
                    # fall back to page-level style/direction/script/language:
                    rstyle = region.get_TextStyle() or page.get_TextStyle()
                    if rstyle:
                        description['region.style'] = {
                            'fontFamily': rstyle.fontFamily,
                            'fontSize': rstyle.fontSize,
                            'xHeight': rstyle.xHeight,
                            'kerning': rstyle.kerning,
                            'serif': rstyle.serif,
                            'monospace': rstyle.monospace,
                            'bold': rstyle.bold,
                            'italic': rstyle.italic,
                            'smallCaps': rstyle.smallCaps,
                            'letterSpaced': rstyle.letterSpaced,
                            'strikethrough': rstyle.strikethrough,
                            'underlined': rstyle.underlined,
                            'underlineStyle': rstyle.underlineStyle,
                            'subscript': rstyle.subscript,
                            'superscript': rstyle.superscript
                        }
                    description['production'] = region.get_production()
                    description['readingDirection'] = (
                        region.get_readingDirection()
                        or page.get_readingDirection())
                    description['textLineOrder'] = (
                        region.get_textLineOrder()
                        or page.get_textLineOrder())
                    description['primaryScript'] = (
                        region.get_primaryScript()
                        or page.get_primaryScript())
                    description['primaryLanguage'] = (
                        region.get_primaryLanguage()
                        or page.get_primaryLanguage())
                description['features'] = region_coords['features']
                description['DPI'] = dpi
                description['page.ID'] = page_id
                description['page.type'] = ptype
                description['file_grp'] = self.input_file_grp
                description['METS.UID'] = self.workspace.mets.unique_identifier
                # pick the file extension from the image features:
                if 'binarized' in region_coords['features']:
                    extension = '.bin'
                elif 'grayscale_normalized' in region_coords['features']:
                    extension = '.nrm'
                else:
                    extension = '.raw'
                file_path = self.workspace.save_image_file(
                    region_image,
                    file_id + '_' + region.id + extension,
                    self.output_file_grp,
                    page_id=page_id,
                    format='PNG')
                file_path = file_path.replace(extension + '.png', '.json')
                # use a context manager so the file handle is closed
                # deterministically (was: json.dump(..., open(...))):
                with open(file_path, 'w') as json_file:
                    json.dump(description, json_file)
def process(self):
    """Performs segmentation on the input binary image

    Produces a PageXML file as output.
    """
    overwrite_regions = self.parameter['overwrite_regions']
    # output fileGrp may be "PAGEGRP,IMAGEGRP"; fall back to a default
    # image group when only one group is given:
    try:
        self.page_grp, self.image_grp = self.output_file_grp.split(',')
    except ValueError:
        self.page_grp = self.output_file_grp
        self.image_grp = FALLBACK_IMAGE_GRP
        LOG.info(
            "No output file group for images specified, falling back to '%s'",
            FALLBACK_IMAGE_GRP)
    for n, input_file in enumerate(self.input_files):
        # this first file_id targets the image group; the PAGE file ID is
        # derived separately below:
        file_id = input_file.ID.replace(self.input_file_grp, self.image_grp)
        page_id = input_file.pageId or input_file.ID
        LOG.info("INPUT FILE %i / %s", n, page_id)
        pcgts = page_from_file(self.workspace.download_file(input_file))
        metadata = pcgts.get_Metadata()  # ensured by from_file()
        # record this processing step and its parameters in the PAGE metadata:
        metadata.add_MetadataItem(
            MetadataItemType(
                type_="processingStep",
                name=self.ocrd_tool['steps'][0],
                value=TOOL,
                # FIXME: externalRef is invalid by pagecontent.xsd, but ocrd does not reflect this
                # what we want here is `externalModel="ocrd-tool" externalId="parameters"`
                Labels=[
                    LabelsType(
                        # externalRef="parameters",
                        Label=[
                            LabelType(type_=name, value=self.parameter[name])
                            for name in self.parameter.keys()
                        ])
                ]))
        page = pcgts.get_Page()
        # existing text regions are only removed when overwrite is requested:
        if page.get_TextRegion():
            if overwrite_regions:
                LOG.info('removing existing TextRegions')
                page.set_TextRegion([])
            else:
                LOG.warning('keeping existing TextRegions')
        # all other region types are cleared unconditionally:
        page.set_AdvertRegion([])
        page.set_ChartRegion([])
        page.set_ChemRegion([])
        page.set_GraphicRegion([])
        page.set_ImageRegion([])
        page.set_LineDrawingRegion([])
        page.set_MathsRegion([])
        page.set_MusicRegion([])
        page.set_NoiseRegion([])
        page.set_SeparatorRegion([])
        page.set_TableRegion([])
        page.set_UnknownRegion([])
        # NOTE(review): this uses the module-level image_from_page(workspace, ...)
        # helper, unlike the sibling processors which call
        # self.workspace.image_from_page(...) — TODO confirm this is intentional.
        page_image, page_xywh, _ = image_from_page(self.workspace, page,
                                                   page_id)
        # delegate the actual segmentation to the subclass hook:
        self._process_page(page, page_image, page_xywh, input_file.pageId,
                           file_id)
        # Use input_file's basename for the new file -
        # this way the files retain the same basenames:
        file_id = input_file.ID.replace(self.input_file_grp, self.page_grp)
        if file_id == input_file.ID:
            file_id = concat_padded(self.page_grp, n)
        self.workspace.add_file(ID=file_id,
                                file_grp=self.page_grp,
                                pageId=input_file.pageId,
                                mimetype=MIMETYPE_PAGE,
                                local_filename=os.path.join(
                                    self.page_grp, file_id + '.xml'),
                                content=to_xml(pcgts))
def process(self): """ Performs the (text) recognition. """ # print(self.parameter) log.debug("TESSDATA: %s, installed tesseract models: %s", *get_languages()) maxlevel = self.parameter['textequiv_level'] model = get_languages()[1][-1] # last installed model if 'model' in self.parameter: model = self.parameter['model'] if model not in get_languages()[1]: raise Exception("configured model " + model + " is not installed") with PyTessBaseAPI(path=TESSDATA_PREFIX, lang=model) as tessapi: log.info("Using model '%s' in %s for recognition at the %s level", model, get_languages()[0], maxlevel) # todo: populate GetChoiceIterator() with LSTM models, too: #tessapi.SetVariable("lstm_choice_mode", "2") # todo: determine relevancy of these variables: # tessapi.SetVariable("tessedit_single_match", "0") # # tessedit_load_sublangs # tessedit_preserve_min_wd_len 2 # tessedit_prefer_joined_punct 0 # tessedit_write_rep_codes 0 # tessedit_parallelize 0 # tessedit_zero_rejection 0 # tessedit_zero_kelvin_rejection 0 # tessedit_reject_mode 0 # tessedit_use_reject_spaces 1 # tessedit_fix_fuzzy_spaces 1 # tessedit_char_blacklist # tessedit_char_whitelist # chs_leading_punct ('`" # chs_trailing_punct1 ).,;:?! 
# chs_trailing_punct2 )'`" # numeric_punctuation ., # unrecognised_char | # ok_repeated_ch_non_alphanum_wds -?*= # conflict_set_I_l_1 Il1[] # preserve_interword_spaces 0 # tessedit_enable_dict_correction 0 # tessedit_enable_bigram_correction 1 # stopper_smallword_size 2 # wordrec_max_join_chunks 4 # suspect_space_level 100 # suspect_short_words 2 # language_model_ngram_on 0 # language_model_ngram_order 8 # language_model_min_compound_length 3 # language_model_penalty_non_freq_dict_word 0.1 # language_model_penalty_non_dict_word 0.15 # language_model_penalty_punc 0.2 # language_model_penalty_case 0.1 # language_model_penalty_script 0.5 # language_model_penalty_chartype 0.3 # language_model_penalty_spacing 0.05 # textord_max_noise_size 7 # enable_noise_removal 1 # classify_bln_numeric_mode 0 # lstm_use_matrix 1 # user_words_file # user_patterns_file for (n, input_file) in enumerate(self.input_files): log.info("INPUT FILE %i / %s", n, input_file) pcgts = page_from_file( self.workspace.download_file(input_file)) # TODO use binarized / gray pil_image = self.workspace.resolve_image_as_pil( pcgts.get_Page().imageFilename) tessapi.SetImage(pil_image) metadata = pcgts.get_Metadata() # ensured by from_file() metadata.add_MetadataItem( MetadataItemType( type_="processingStep", name=OCRD_TOOL['tools']['ocrd-tesserocr-recognize'] ['steps'][0], value='ocrd-tesserocr-recognize', Labels=[ LabelsType(externalRef="parameters", Label=[ LabelType( type_=name, value=self.parameter[name]) for name in self.parameter.keys() ]) ])) log.info("Recognizing text in page '%s'", pcgts.get_pcGtsId()) regions = pcgts.get_Page().get_TextRegion() if not regions: log.warning("Page contains no text regions") self._process_regions(regions, maxlevel, tessapi) ID = concat_padded(self.output_file_grp, n) self.workspace.add_file( ID=ID, file_grp=self.output_file_grp, mimetype=MIMETYPE_PAGE, local_filename='%s/%s' % (self.output_file_grp, ID), content=to_xml(pcgts), )
def process(self):
    """Extract textline images and texts from the workspace.

    Open and deserialize PAGE input files and their respective images,
    then iterate over the element hierarchy down to the line level.

    Extract an image for each textline (which depending on the workflow
    can already be deskewed, dewarped, binarized etc.), cropped to its
    minimal bounding box, and masked by the coordinate polygon outline.
    If ``transparency`` is true, then also add an alpha channel which is
    fully transparent outside of the mask.

    Create a JSON file with:

    * the IDs of the textline and its parents,
    * the textline's text content,
    * the textline's coordinates relative to the line image,
    * the textline's absolute coordinates,
    * the textline's TextStyle (if any),
    * the textline's @production (if any),
    * the textline's @readingDirection (if any),
    * the textline's @primaryScript (if any),
    * the textline's @primaryLanguage (if any),
    * the textline's AlternativeImage/@comments (features),
    * the parent textregion's @type,
    * the page's @type,
    * the page's DPI value.

    Create a plain text file for the text content, too.

    Write all files in the directory of the output file group, named like so:

    * ID + '.raw.png': line image (if the workflow provides raw images)
    * ID + '.bin.png': line image (if the workflow provides binarized images)
    * ID + '.nrm.png': line image (if the workflow provides grayscale-normalized images)
    * ID + '.json': line metadata.
    * ID + '.gt.txt': line text.

    (This is intended for training and evaluation of OCR models.)
    """
    # pylint: disable=attribute-defined-outside-init
    for n, input_file in enumerate(self.input_files):
        file_id = input_file.ID.replace(self.input_file_grp,
                                        self.output_file_grp)
        if file_id == input_file.ID:
            file_id = concat_padded(self.output_file_grp, n)
        page_id = input_file.pageId or input_file.ID
        LOG.info("INPUT FILE %i / %s", n, page_id)
        pcgts = page_from_file(self.workspace.download_file(input_file))
        page = pcgts.get_Page()
        metadata = pcgts.get_Metadata()  # ensured by from_file()
        # record this processing step and its parameters in the PAGE metadata:
        metadata.add_MetadataItem(
            MetadataItemType(
                type_="processingStep",
                name=self.ocrd_tool['steps'][0],
                value=TOOL,
                Labels=[
                    LabelsType(externalModel="ocrd-tool",
                               externalId="parameters",
                               Label=[
                                   LabelType(type_=name,
                                             value=self.parameter[name])
                                   for name in self.parameter.keys()
                               ])
                ]))
        page_image, page_coords, page_image_info = self.workspace.image_from_page(
            page, page_id, transparency=self.parameter['transparency'])
        # resolution of exactly 1 is the PAGE default, i.e. "unknown":
        if page_image_info.resolution != 1:
            dpi = page_image_info.resolution
            if page_image_info.resolutionUnit == 'cm':
                dpi = round(dpi * 2.54)
        else:
            dpi = None
        ptype = page.get_type()
        # top-level text regions plus those nested in table regions;
        # materialize the chain into a list — a bare chain object is
        # always truthy, so the emptiness check below would never fire:
        regions = list(
            itertools.chain.from_iterable([page.get_TextRegion()] + [
                subregion.get_TextRegion()
                for subregion in page.get_TableRegion()
            ]))
        if not regions:
            LOG.warning("Page '%s' contains no text regions", page_id)
        for region in regions:
            region_image, region_coords = self.workspace.image_from_segment(
                region,
                page_image,
                page_coords,
                transparency=self.parameter['transparency'])
            rtype = region.get_type()
            lines = region.get_TextLine()
            if not lines:
                LOG.warning("Region '%s' contains no text lines", region.id)
            for line in lines:
                line_image, line_coords = self.workspace.image_from_segment(
                    line,
                    region_image,
                    region_coords,
                    transparency=self.parameter['transparency'])
                lpolygon_rel = coordinates_of_segment(
                    line, line_image, line_coords).tolist()
                lpolygon_abs = polygon_from_points(line.get_Coords().points)
                ltext = line.get_TextEquiv()
                if not ltext:
                    # (fixed typo: "conent" -> "content")
                    LOG.warning("Line '%s' contains no text content",
                                line.id)
                    ltext = ''
                else:
                    ltext = ltext[0].Unicode
                # fall back to the region-level style:
                lstyle = line.get_TextStyle() or region.get_TextStyle()
                if lstyle:
                    lstyle = {
                        'fontFamily': lstyle.fontFamily,
                        'fontSize': lstyle.fontSize,
                        'xHeight': lstyle.xHeight,
                        'kerning': lstyle.kerning,
                        'serif': lstyle.serif,
                        'monospace': lstyle.monospace,
                        'bold': lstyle.bold,
                        'italic': lstyle.italic,
                        'smallCaps': lstyle.smallCaps,
                        'letterSpaced': lstyle.letterSpaced,
                        'strikethrough': lstyle.strikethrough,
                        'underlined': lstyle.underlined,
                        'underlineStyle': lstyle.underlineStyle,
                        'subscript': lstyle.subscript,
                        'superscript': lstyle.superscript
                    }
                lfeatures = line_coords['features']
                description = {
                    'line.ID': line.id,
                    'text': ltext,
                    'style': lstyle,
                    'production': (line.get_production()
                                   or region.get_production()),
                    'readingDirection': (line.get_readingDirection()
                                         or region.get_readingDirection()
                                         or page.get_readingDirection()),
                    'primaryScript': (line.get_primaryScript()
                                      or region.get_primaryScript()
                                      or page.get_primaryScript()),
                    'primaryLanguage': (line.get_primaryLanguage()
                                        or region.get_primaryLanguage()
                                        or page.get_primaryLanguage()),
                    'features': lfeatures,
                    'DPI': dpi,
                    'coords_rel': lpolygon_rel,
                    'coords_abs': lpolygon_abs,
                    'region.ID': region.id,
                    'region.type': rtype,
                    'page.ID': page_id,
                    'page.type': ptype,
                    'file_grp': self.input_file_grp,
                    'METS.UID': self.workspace.mets.unique_identifier
                }
                # pick the file extension from the image features:
                if 'binarized' in lfeatures:
                    extension = '.bin'
                elif 'grayscale_normalized' in lfeatures:
                    extension = '.nrm'
                else:
                    extension = '.raw'
                file_path = self.workspace.save_image_file(
                    line_image,
                    file_id + '_' + region.id + '_' + line.id + extension,
                    self.output_file_grp,
                    page_id=page_id,
                    mimetype=self.parameter['mimetype'])
                file_path = file_path.replace(
                    extension + MIME_TO_EXT[self.parameter['mimetype']],
                    '.json')
                # use a context manager so the file handle is closed
                # deterministically (was: json.dump(..., open(...))):
                with open(file_path, 'w') as json_file:
                    json.dump(description, json_file)
                file_path = file_path.replace('.json', '.gt.txt')
                with open(file_path, 'wb') as f:
                    f.write((ltext + '\n').encode('utf-8'))
def process(self):
    """
    Performs the recognition.

    For every text line of every text region of every input page, runs the
    Calamari predictor on the raw line image, votes over the fold results,
    builds the line text from the per-position character candidates, and
    annotates TextEquivs on the line (and, depending on
    ``textequiv_level``, on inferred words and glyphs). Serializes the
    annotated PAGE hierarchy to the output file group.
    """
    assert_file_grp_cardinality(self.input_file_grp, 1)
    assert_file_grp_cardinality(self.output_file_grp, 1)
    self._init_calamari()
    for (n, input_file) in enumerate(self.input_files):
        page_id = input_file.pageId or input_file.ID
        log.info("INPUT FILE %i / %s", n, page_id)
        pcgts = page_from_file(self.workspace.download_file(input_file))
        page = pcgts.get_Page()
        page_image, page_xywh, page_image_info = self.workspace.image_from_page(
            page, page_id)
        for region in pcgts.get_Page().get_TextRegion():
            region_image, region_xywh = self.workspace.image_from_segment(
                region, page_image, page_xywh)
            textlines = region.get_TextLine()
            log.info("About to recognize %i lines of region '%s'",
                     len(textlines), region.id)
            for (line_no, line) in enumerate(textlines):
                log.debug("Recognizing line '%s' in region '%s'", line.id,
                          region.id)
                line_image, line_coords = self.workspace.image_from_segment(
                    line, region_image, region_xywh)
                line_image_np = np.array(line_image, dtype=np.uint8)
                # predict with all folds, then vote:
                raw_results = list(
                    self.predictor.predict_raw([line_image_np],
                                               progress_bar=False))[0]
                for i, p in enumerate(raw_results):
                    p.prediction.id = "fold_{}".format(i)
                prediction = self.voter.vote_prediction_result(raw_results)
                prediction.id = "voted"

                # Build line text on our own
                #
                # Calamari does whitespace post-processing on prediction.sentence, while it does not do the same
                # on prediction.positions. Do it on our own to have consistency.
                #
                # XXX Check Calamari's built-in post-processing on prediction.sentence

                def _sort_chars(p):
                    """Filter and sort chars of prediction p"""
                    chars = p.chars
                    chars = [c for c in chars
                             if c.char]  # XXX Note that omission probabilities are not normalized?!
                    chars = [
                        c for c in chars
                        if c.probability >= self.parameter['glyph_conf_cutoff']
                    ]
                    chars = sorted(chars,
                                   key=lambda k: k.probability,
                                   reverse=True)
                    return chars

                def _drop_leading_spaces(positions):
                    # drop positions whose best char is a space, from the front
                    return list(
                        itertools.dropwhile(
                            lambda p: _sort_chars(p)[0].char == " ",
                            positions))

                def _drop_trailing_spaces(positions):
                    # same, from the back (by reversing twice)
                    return list(
                        reversed(_drop_leading_spaces(reversed(positions))))

                def _drop_double_spaces(positions):
                    # collapse runs of space positions into a single one
                    def _drop_double_spaces_generator(positions):
                        last_was_space = False
                        for p in positions:
                            if p.chars[0].char == " ":
                                if not last_was_space:
                                    yield p
                                last_was_space = True
                            else:
                                yield p
                                last_was_space = False

                    return list(_drop_double_spaces_generator(positions))

                positions = prediction.positions
                positions = _drop_leading_spaces(positions)
                positions = _drop_trailing_spaces(positions)
                positions = _drop_double_spaces(positions)
                positions = list(positions)

                line_text = ''.join(
                    _sort_chars(p)[0].char for p in positions)
                if line_text != prediction.sentence:
                    log.warning(
                        "Our own line text is not the same as Calamari's: '%s' != '%s'",
                        line_text, prediction.sentence)

                # Delete existing results
                if line.get_TextEquiv():
                    log.warning("Line '%s' already contained text results",
                                line.id)
                line.set_TextEquiv([])
                if line.get_Word():
                    log.warning(
                        "Line '%s' already contained word segmentation",
                        line.id)
                line.set_Word([])

                # Save line results
                line_conf = prediction.avg_char_probability
                line.set_TextEquiv(
                    [TextEquivType(Unicode=line_text, conf=line_conf)])

                # Save word results
                #
                # Calamari OCR does not provide word positions, so we infer word positions from a. text segmentation
                # and b. the glyph positions. This is necessary because the PAGE XML format enforces a strict
                # hierarchy of lines > words > glyphs.

                def _words(s):
                    """Split words based on spaces and include spaces as 'words'"""
                    spaces = None
                    word = ''
                    for c in s:
                        if c == ' ' and spaces is True:
                            word += c
                        elif c != ' ' and spaces is False:
                            word += c
                        else:
                            if word:
                                yield word
                            word = c
                            spaces = (c == ' ')
                    yield word

                if self.parameter['textequiv_level'] in ['word', 'glyph']:
                    word_no = 0
                    i = 0
                    for word_text in _words(line_text):
                        word_length = len(word_text)
                        # space-only "words" are skipped but still advance i:
                        if not all(c == ' ' for c in word_text):
                            word_positions = positions[i:i + word_length]
                            word_start = word_positions[0].global_start
                            word_end = word_positions[-1].global_end
                            # word box spans the full line height at the
                            # predicted horizontal extent:
                            polygon = polygon_from_x0y0x1y1([
                                word_start, 0, word_end, line_image.height
                            ])
                            points = points_from_polygon(
                                coordinates_for_segment(
                                    polygon, None, line_coords))
                            # XXX Crop to line polygon?
                            word = WordType(id='%s_word%04d' %
                                            (line.id, word_no),
                                            Coords=CoordsType(points))
                            word.add_TextEquiv(
                                TextEquivType(Unicode=word_text))
                            if self.parameter[
                                    'textequiv_level'] == 'glyph':
                                for glyph_no, p in enumerate(
                                        word_positions):
                                    glyph_start = p.global_start
                                    glyph_end = p.global_end
                                    polygon = polygon_from_x0y0x1y1([
                                        glyph_start, 0, glyph_end,
                                        line_image.height
                                    ])
                                    points = points_from_polygon(
                                        coordinates_for_segment(
                                            polygon, None, line_coords))
                                    glyph = GlyphType(
                                        id='%s_glyph%04d' %
                                        (word.id, glyph_no),
                                        Coords=CoordsType(points))
                                    # Add predictions (= TextEquivs)
                                    char_index_start = 1  # Must start with 1, see https://ocr-d.github.io/page#multiple-textequivs
                                    for char_index, char in enumerate(
                                            _sort_chars(p),
                                            start=char_index_start):
                                        glyph.add_TextEquiv(
                                            TextEquivType(
                                                Unicode=char.char,
                                                index=char_index,
                                                conf=char.probability))
                                    word.add_Glyph(glyph)
                            line.add_Word(word)
                            word_no += 1
                        i += word_length
        # propagate line texts up to region/page level:
        _page_update_higher_textequiv_levels('line', pcgts)
        # Add metadata about this operation and its runtime parameters:
        metadata = pcgts.get_Metadata()  # ensured by from_file()
        metadata.add_MetadataItem(
            MetadataItemType(
                type_="processingStep",
                name=self.ocrd_tool['steps'][0],
                value=TOOL,
                Labels=[
                    LabelsType(externalModel="ocrd-tool",
                               externalId="parameters",
                               Label=[
                                   LabelType(type_=name,
                                             value=self.parameter[name])
                                   for name in self.parameter.keys()
                               ])
                ]))
        file_id = make_file_id(input_file, self.output_file_grp)
        pcgts.set_pcGtsId(file_id)
        self.workspace.add_file(ID=file_id,
                                file_grp=self.output_file_grp,
                                pageId=input_file.pageId,
                                mimetype=MIMETYPE_PAGE,
                                local_filename=os.path.join(
                                    self.output_file_grp, file_id + '.xml'),
                                content=to_xml(pcgts))
def process(self):
    """Performs segmentation evaluation with Shapely on the workspace.

    Open and deserialize PAGE input files, then iterate over the element
    hierarchy down to the region level:
    - validate the coordinates (warn of children extending beyond their parents),
    - optionally shrink regions to the hull of their lines (``sanitize``),
    - pairwise compare text regions and, if ``plausibilize`` is set, remove
      (near-)duplicate or contained regions and merge largely-overlapping ones.

    Report findings on the logging level, then add the (possibly modified)
    PAGE-XML as a new file to the output file group of the workspace.
    """
    sanitize = self.parameter['sanitize']
    plausibilize = self.parameter['plausibilize']
    for n, input_file in enumerate(self.input_files):
        page_id = input_file.pageId or input_file.ID
        LOG.info("INPUT FILE %i / %s", n, page_id)
        pcgts = page_from_file(self.workspace.download_file(input_file))
        page = pcgts.get_Page()
        metadata = pcgts.get_Metadata()  # ensured by from_file()
        # record this processing step and its runtime parameters:
        metadata.add_MetadataItem(
            MetadataItemType(
                type_="processingStep",
                name=self.ocrd_tool['steps'][0],
                value=TOOL,
                Labels=[
                    LabelsType(externalModel="ocrd-tool",
                               externalId="parameters",
                               Label=[
                                   LabelType(type_=name,
                                             value=self.parameter[name])
                                   for name in self.parameter.keys()
                               ])
                ]))
        #
        # validate segmentation (warn of children extending beyond their parents)
        #
        self.validate_coords(page, page_id)
        #
        # sanitize region segmentation (shrink to hull of lines)
        #
        if sanitize:
            self.sanitize_page(page, page_id)
        #
        # plausibilize region segmentation (remove redundant text regions)
        #
        mark_for_deletion = []  # ids of regions that get removed
        mark_for_merging = {}  # id of region -> region it gets merged into
        # TODO: cover recursive region structure (but compare only at the same level)
        regions = page.get_TextRegion()
        # sort by area to ensure to arrive at a total ordering compatible
        # with the topological sort along containment/equivalence arcs
        # (so we can avoid substituting regions with superregions that have
        # themselves been substituted/deleted):
        RegionPolygon = namedtuple('RegionPolygon', ['region', 'polygon'])
        regionspolys = sorted(
            (RegionPolygon(
                region,
                Polygon(polygon_from_points(region.get_Coords().points)))
             for region in regions),
            key=lambda regionpoly: regionpoly.polygon.area)
        # pairwise comparison of all regions (each unordered pair once):
        for i, regionpoly1 in enumerate(regionspolys):
            for regionpoly2 in regionspolys[i + 1:]:
                self._compare_region_pair(page_id, regionpoly1, regionpoly2,
                                          plausibilize, mark_for_deletion,
                                          mark_for_merging)
        if plausibilize:
            # the reading order does not have to include all regions
            # but it may include all types of regions!
            ro = page.get_ReadingOrder()
            if ro:
                rogroup = ro.get_OrderedGroup() or ro.get_UnorderedGroup()
            else:
                rogroup = None
            # pass the regions sorted (see above)
            _plausibilize_group(regionspolys, rogroup, mark_for_deletion,
                                mark_for_merging)
        # Use input_file's basename for the new file -
        # this way the files retain the same basenames:
        file_id = input_file.ID.replace(self.input_file_grp,
                                        self.output_file_grp)
        if file_id == input_file.ID:
            file_id = concat_padded(self.output_file_grp, n)
        self.workspace.add_file(ID=file_id,
                                file_grp=self.output_file_grp,
                                pageId=input_file.pageId,
                                mimetype=MIMETYPE_PAGE,
                                local_filename=os.path.join(
                                    self.output_file_grp,
                                    file_id + '.xml'),
                                content=to_xml(pcgts))

def _compare_region_pair(self, page_id, regionpoly1, regionpoly2,
                         plausibilize, mark_for_deletion, mark_for_merging):
    """Classify one pair of text regions by geometric relation.

    Compares the two region polygons (equality / containment / overlap)
    and, on a conflict, warns and records the affected region id in
    ``mark_for_deletion`` (list of ids, mutated in place) or
    ``mark_for_merging`` (dict id -> surviving region, mutated in place).
    Called with ``regionpoly1`` having the smaller (or equal) area.
    """
    region1 = regionpoly1.region
    region2 = regionpoly2.region
    poly1 = regionpoly1.polygon
    poly2 = regionpoly2.polygon
    LOG.debug('Comparing regions "%s" and "%s"', region1.id, region2.id)
    # NOTE: Polygon.almost_equals is deprecated in Shapely 2.x
    # (use equals_exact with a tolerance when migrating):
    if poly1.almost_equals(poly2):
        LOG.warning('Page "%s" region "%s" is almost equal to "%s" %s',
                    page_id, region2.id, region1.id,
                    '(removing)' if plausibilize else '')
        mark_for_deletion.append(region2.id)
    elif poly1.contains(poly2):
        LOG.warning('Page "%s" region "%s" is within "%s" %s',
                    page_id, region2.id, region1.id,
                    '(removing)' if plausibilize else '')
        mark_for_deletion.append(region2.id)
    elif poly2.contains(poly1):
        LOG.warning('Page "%s" region "%s" is within "%s" %s',
                    page_id, region1.id, region2.id,
                    '(removing)' if plausibilize else '')
        mark_for_deletion.append(region1.id)
    elif poly1.overlaps(poly2):
        inter_poly = poly1.intersection(poly2)
        union_poly = poly1.union(poly2)
        LOG.debug('Page "%s" region "%s" overlaps "%s" by %f/%f',
                  page_id, region1.id, region2.id,
                  inter_poly.area / poly1.area,
                  inter_poly.area / poly2.area)
        if union_poly.convex_hull.area >= poly1.area + poly2.area:
            # skip this pair -- combined polygon encloses previously free segments
            pass
        elif inter_poly.area / poly2.area > self.parameter[
                'plausibilize_merge_min_overlap']:
            LOG.warning('Page "%s" region "%s" is almost within "%s" %s',
                        page_id, region2.id, region1.id,
                        '(merging)' if plausibilize else '')
            mark_for_merging[region2.id] = region1
        elif inter_poly.area / poly1.area > self.parameter[
                'plausibilize_merge_min_overlap']:
            LOG.warning('Page "%s" region "%s" is almost within "%s" %s',
                        page_id, region1.id, region2.id,
                        '(merging)' if plausibilize else '')
            mark_for_merging[region1.id] = region2
        # TODO: more merging cases...
        #LOG.info('Intersection %i', poly1.intersects(poly2))
        #LOG.info('Containment %i', poly1.contains(poly2))
        #if poly1.intersects(poly2):
        #    LOG.info('Area 1 %d', poly1.area)
        #    LOG.info('Area 2 %d', poly2.area)
        #    LOG.info('Area intersect %d', poly1.intersection(poly2).area)