def _process_region(self, it, region, rogroup, region_image, region_coords): LOG = getLogger('processor.TesserocrSegmentTable') # equivalent to GetComponentImages with raw_image=True, # (which would also give raw coordinates), # except we are also interested in the iterator's BlockType() here, index = 0 if rogroup: for elem in (rogroup.get_RegionRefIndexed() + rogroup.get_OrderedGroupIndexed() + rogroup.get_UnorderedGroupIndexed()): if elem.index >= index: index = elem.index + 1 while it and not it.Empty(RIL.BLOCK): bbox = it.BoundingBox(RIL.BLOCK) polygon = polygon_from_x0y0x1y1(bbox) polygon = coordinates_for_segment(polygon, region_image, region_coords) points = points_from_polygon(polygon) coords = CoordsType(points=points) # if xywh['w'] < 30 or xywh['h'] < 30: # LOG.info('Ignoring too small region: %s', points) # it.Next(RIL.BLOCK) # continue # # add the region reference in the reading order element # (but ignore non-text regions entirely) ID = region.id + "_%04d" % index subregion = TextRegionType(id=ID, Coords=coords, type=TextTypeSimpleType.PARAGRAPH) block_type = it.BlockType() if block_type == PT.FLOWING_TEXT: pass elif block_type == PT.HEADING_TEXT: subregion.set_type(TextTypeSimpleType.HEADING) elif block_type == PT.PULLOUT_TEXT: subregion.set_type(TextTypeSimpleType.FLOATING) elif block_type == PT.CAPTION_TEXT: subregion.set_type(TextTypeSimpleType.CAPTION) elif block_type == PT.VERTICAL_TEXT: subregion.set_orientation(90.0) else: it.Next(RIL.BLOCK) continue LOG.info("Detected cell '%s': %s (%s)", ID, points, membername(PT, block_type)) region.add_TextRegion(subregion) if rogroup: rogroup.add_RegionRefIndexed( RegionRefIndexedType(regionRef=ID, index=index)) # # iterator increment # index += 1 it.Next(RIL.BLOCK)
def _process_page(self, it, page, page_image, page_coords, page_id):
    """Add one region to ``page`` per block found by the iterator.

    Walks ``it`` at ``RIL.BLOCK`` level. For each block an outline is taken
    from ``BlockPolygon()`` (if ``self.parameter['crop_polygons']``) or from
    the padded bounding box, converted via ``coordinates_for_segment`` and
    passed through ``polygon_for_parent``; blocks for which the latter
    returns ``None`` are skipped. The block type then selects which PAGE
    region type (text, image, separator, maths, table or noise) is created
    and added to ``page``.

    All regions except separator and noise regions are also referenced in
    the page's top-level OrderedGroup, which is created if missing and
    removed again if it ends up empty. Region numbering starts after the
    largest index already present in that group.

    ``page_id`` is accepted for interface compatibility but not used here.
    """
    LOG = getLogger('processor.TesserocrSegmentRegion')
    # Works like GetComponentImages with raw_image=True, but walks the
    # iterator manually because its BlockType() and BlockPolygon() are
    # needed as well.
    index = 0
    ro = page.get_ReadingOrder()
    if not ro:
        ro = ReadingOrderType()
        page.set_ReadingOrder(ro)
    og = ro.get_OrderedGroup()
    if og:
        # start counting from largest existing index
        for elem in (og.get_RegionRefIndexed() +
                     og.get_OrderedGroupIndexed() +
                     og.get_UnorderedGroupIndexed()):
            if elem.index >= index:
                index = elem.index + 1
    else:
        # new top-level group
        og = OrderedGroupType(id="reading-order")
        ro.set_OrderedGroup(og)
    while it and not it.Empty(RIL.BLOCK):
        # (padding will be passed to both BoundingBox and GetImage)
        # (actually, Tesseract honours padding only on the left and bottom,
        #  whereas right and top are increased less!)
        bbox = it.BoundingBox(RIL.BLOCK, padding=self.parameter['padding'])
        # sometimes these polygons are not planar, which causes
        # PIL.ImageDraw.Draw.polygon (and likely others as well)
        # to misbehave; however, PAGE coordinate semantics prohibit
        # multi-path polygons!
        # (probably a bug in Tesseract itself, cf. tesseract#2826):
        if self.parameter['crop_polygons']:
            polygon = it.BlockPolygon()
        else:
            polygon = polygon_from_x0y0x1y1(bbox)
        polygon = coordinates_for_segment(polygon, page_image, page_coords)
        polygon2 = polygon_for_parent(polygon, page)
        if polygon2 is None:
            # log the unclipped outline of the block that gets dropped
            LOG.info('Ignoring extant region: %s',
                     points_from_polygon(polygon))
            it.Next(RIL.BLOCK)
            continue
        points = points_from_polygon(polygon2)
        coords = CoordsType(points=points)
        ID = "region%04d" % index
        # separator and noise regions must not appear in the reading order
        in_reading_order = True
        #
        # region type switch
        #
        block_type = it.BlockType()
        if block_type in [
                PT.FLOWING_TEXT,
                PT.HEADING_TEXT,
                PT.PULLOUT_TEXT,
                PT.CAPTION_TEXT,
                # TABLE is contained in PTIsTextType, but
                # it is a bad idea to create a TextRegion
                # for it (better set `find_tables` False):
                # PT.TABLE,
                # will also get a 90° @orientation
                # (but that can be overridden by deskew/OSD):
                PT.VERTICAL_TEXT]:
            region = TextRegionType(id=ID, Coords=coords,
                                    type=TextTypeSimpleType.PARAGRAPH)
            if block_type == PT.VERTICAL_TEXT:
                region.set_orientation(90.0)
            elif block_type == PT.HEADING_TEXT:
                region.set_type(TextTypeSimpleType.HEADING)
            elif block_type == PT.PULLOUT_TEXT:
                region.set_type(TextTypeSimpleType.FLOATING)
            elif block_type == PT.CAPTION_TEXT:
                region.set_type(TextTypeSimpleType.CAPTION)
            page.add_TextRegion(region)
            if self.parameter['sparse_text']:
                # in sparse mode the region gets a single line that
                # shares the region's coordinates
                region.set_type(TextTypeSimpleType.OTHER)
                region.add_TextLine(
                    TextLineType(id=region.id + '_line', Coords=coords))
        elif block_type in [
                PT.FLOWING_IMAGE,
                PT.HEADING_IMAGE,
                PT.PULLOUT_IMAGE]:
            region = ImageRegionType(id=ID, Coords=coords)
            page.add_ImageRegion(region)
        elif block_type in [PT.HORZ_LINE, PT.VERT_LINE]:
            region = SeparatorRegionType(id=ID, Coords=coords)
            page.add_SeparatorRegion(region)
            in_reading_order = False
        elif block_type in [PT.INLINE_EQUATION, PT.EQUATION]:
            region = MathsRegionType(id=ID, Coords=coords)
            page.add_MathsRegion(region)
        elif block_type == PT.TABLE:
            # without API access to StructuredTable we cannot
            # do much for a TableRegionType (i.e. nrows, ncols,
            # coordinates of cells for recursive regions etc),
            # but this can be achieved afterwards by segment-table
            region = TableRegionType(id=ID, Coords=coords)
            page.add_TableRegion(region)
        else:
            region = NoiseRegionType(id=ID, Coords=coords)
            # the region must be passed in, otherwise it is never
            # attached to the page
            page.add_NoiseRegion(region)
            in_reading_order = False
        if in_reading_order:
            # add the region reference in the reading order element
            og.add_RegionRefIndexed(
                RegionRefIndexedType(regionRef=ID, index=index))
        LOG.info("Detected region '%s': %s (%s)", ID, points,
                 membername(PT, block_type))
        #
        # iterator increment
        #
        index += 1
        it.Next(RIL.BLOCK)
    if (not og.get_RegionRefIndexed() and
            not og.get_OrderedGroupIndexed() and
            not og.get_UnorderedGroupIndexed()):
        # schema forbids empty OrderedGroup
        ro.set_OrderedGroup(None)