Exemplo n.º 1
0
def _add_annotation(annotations,
                    segment,
                    imgid,
                    catid,
                    coords=None,
                    mask=None):
    """Append one COCO-style annotation dict for ``segment`` to ``annotations``.

    Args:
        annotations: list that receives the new annotation (modified in place).
        segment: PAGE segment; its ``id`` and ``get_Coords()`` (points and
            optional confidence) are read.
        imgid: value stored under ``'image_id'``.
        catid: value stored under ``'category_id'``.
        coords: passed on to ``coordinates_of_segment`` when ``mask`` is given.
        mask: optional 2-d array; when given, the segmentation is the mask
            restricted to the segment polygon, encoded via ``encodeMask``.
            Otherwise the segmentation is the flattened polygon itself.

    Returns:
        None. Segments with fewer than 3 points are skipped with a warning.

    Note:
        ``'bbox'`` is always derived from the segment's own point list, whereas
        ``'area'`` is computed from the polygon returned by
        ``coordinates_of_segment`` whenever ``mask`` is given.
    """
    LOG = getLogger('processor.EvaluateSegmentation')
    segment_coords = segment.get_Coords()
    conf = segment_coords.get_conf()
    # Only a missing confidence defaults to 1.0 -- an explicit 0.0 is a valid
    # score and must not be promoted to full confidence.
    score = 1.0 if conf is None else conf
    polygon = polygon_from_points(segment_coords.points)
    if len(polygon) < 3:
        LOG.warning('ignoring segment "%s" with only %d points', segment.id,
                    len(polygon))
        return
    xywh = xywh_from_polygon(polygon)
    if mask is None:
        # polygon format: a single flat list of coordinates
        segmentation = np.array(polygon).reshape(1, -1).tolist()
    else:
        polygon = coordinates_of_segment(segment, None, coords)
        py, px = draw.polygon(polygon[:, 1], polygon[:, 0], mask.shape)
        masked = np.zeros(
            mask.shape, dtype=np.uint8,
            order='F')  # pycocotools.mask wants Fortran-contiguous arrays
        # copy the mask values, but only inside the segment polygon
        masked[py, px] = 1 * mask[py, px]
        segmentation = encodeMask(masked)
    annotations.append({
        'segment_id':
        segment.id,  # non-standard string-valued in addition to 'id'
        'image_id': imgid,
        'category_id': catid,
        'segmentation': segmentation,
        'area': Polygon(polygon).area,
        'bbox': [xywh['x'], xywh['y'], xywh['w'], xywh['h']],
        'score': score,
        'iscrowd': 0
    })
Exemplo n.º 2
0
 def sanitize_page(self, page, page_id):
     """Replace each text region's outline by the hull of its text lines.

     For every text region of ``page``, all text lines are rasterized into a
     mask of the page image size, gaps are closed vertically (maximum filter
     followed by minimum filter, window height = median line height), and the
     outer contours of the result are extracted. If exactly one contour
     survives the size and point-count filters, it becomes the new region
     polygon; otherwise the region is left unchanged.

     Args:
         page: PAGE page element whose text regions are updated in place.
         page_id: identifier used to retrieve the page image and for logging.
     """
     regions = page.get_TextRegion()
     page_image, page_coords, _ = self.workspace.image_from_page(
         page, page_id)
     for region in regions:
         LOG.info('Sanitizing region "%s"', region.id)
         lines = region.get_TextLine()
         if not lines:
             # No lines means no heights: the median would be NaN and
             # converting it to int would raise.
             LOG.warning('Page "%s" region "%s" contains no text lines',
                         page_id, region.id)
             continue
         heights = []
         # get labels:
         region_mask = np.zeros((page_image.height, page_image.width), dtype=np.uint8)
         for line in lines:
             line_polygon = coordinates_of_segment(line, page_image, page_coords)
             heights.append(xywh_from_polygon(line_polygon)['h'])
             region_mask[draw.polygon(line_polygon[:, 1],
                                      line_polygon[:, 0],
                                      region_mask.shape)] = 1
             region_mask[draw.polygon_perimeter(line_polygon[:, 1],
                                                line_polygon[:, 0],
                                                region_mask.shape)] = 1
         # estimate scale (at least 1, because a scale of 0 would make the
         # un-padding slice ``[0:-0]`` below return an empty mask):
         scale = max(1, int(np.median(np.array(heights))))
         # close labels:
         region_mask = np.pad(region_mask, scale) # protect edges
         region_mask = filters.maximum_filter(region_mask, (scale, 1), origin=0)
         region_mask = filters.minimum_filter(region_mask, (scale, 1), origin=0)
         region_mask = region_mask[scale:-scale, scale:-scale] # unprotect
         # find outer contour (parts):
         contours, _ = cv2.findContours(region_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
         # determine areas of parts:
         areas = [cv2.contourArea(contour) for contour in contours]
         total_area = sum(areas)
         if not total_area:
             # ignore if too small
             LOG.warning('Zero contour area in region "%s"', region.id)
             continue
         # pick contour and convert to absolute:
         region_polygon = None
         for i, contour in enumerate(contours):
             area = areas[i]
             if area / total_area < 0.1:
                 LOG.warning('Ignoring contour %d too small (%d/%d) in region "%s"',
                             i, area, total_area, region.id)
                 continue
             # simplify shape:
             polygon = cv2.approxPolyDP(contour, 2, False)[:, 0, ::] # already ordered x,y
             if len(polygon) < 4:
                 LOG.warning('Ignoring contour %d less than 4 points in region "%s"',
                             i, region.id)
                 continue
             if region_polygon is not None:
                 # a second acceptable contour: the lines do not form one
                 # connected shape, so give up on this region
                 LOG.error('Skipping region "%s" due to non-contiguous contours',
                           region.id)
                 region_polygon = None
                 break
             region_polygon = coordinates_for_segment(polygon, page_image, page_coords)
         if region_polygon is not None:
             LOG.info('Using new coordinates for region "%s"', region.id)
             region.get_Coords().points = points_from_polygon(region_polygon)
Exemplo n.º 3
0
    def create(self, region_ds: RegionWithCoords) -> Optional[Region]:
        """Wrap ``region_ds`` in a ``Region`` carrying a validated polygon.

        The polygon is built from the segment coordinates (relative to
        ``self.coords``). Invalid polygons are passed through
        ``self.make_valid``; unrecoverable, empty or zero-area polygons are
        logged as errors. Non-fatal findings are collected, logged as one
        warning and stored on the result.

        Returns:
            The ``Region`` with ``poly`` and ``warnings`` set, or ``None`` if
            ``region_ds`` is falsy or no usable polygon could be obtained.
        """
        if not region_ds:
            return None

        region = Region(region_ds)
        outline = coordinates_of_segment(region_ds, None, self.coords)
        notes = []

        try:
            poly = Polygon(outline)
        except ValueError as err:
            self.logger.error('Page "%s" @ %s %s', self.page_id, str(region),
                              str(err))
            return None

        if not poly.is_valid:
            defect = explain_validity(poly)
            poly, error = self.make_valid(poly)
            if not poly.is_valid:
                # repair attempt failed: report the original defect
                self.logger.error('Page "%s" @ %s %s', self.page_id,
                                  str(region), str(defect))
                return None
            notes.append(
                '{} fixed with an error of {:.3%}'.format(defect, error))

        # NOTE(review): ``length`` is what the code compares against 4 although
        # the message talks about points -- confirm the intended criterion.
        if poly.length < 4:
            notes.append('has too few points')

        if not (not poly.is_empty and poly.area > 0):
            self.logger.error('Page "%s" @ %s %s', self.page_id, str(region),
                              'is empty')
            return None

        # the first two bounds entries are the ones checked for negativity
        if any(value < 0 for value in poly.bounds[:2]):
            notes.append('is negative')

        if notes:
            self.logger.warning('Page "%s" @ %s %s', self.page_id, str(region),
                                ' | '.join(notes))

        region.poly = poly
        region.warnings = notes

        return region
Exemplo n.º 4
0
 def process_page(self, page, page_image, page_xywh, bounds, file_id, page_id):
     """Annotate the padded ``bounds`` as page border and save the cropped image.

     Does nothing but log an error if ``bounds`` (left, top, right, bottom)
     has no positive extent, or if ``polygon_for_parent`` yields no polygon.
     Otherwise sets the border on ``page``, appends ``',cropped'`` to
     ``page_xywh['features']``, stores the cropped image in the output file
     group and references it as AlternativeImage of the page.
     """
     LOG = getLogger('processor.TesserocrCrop')
     left, top, right, bottom = bounds
     if left >= right or top >= bottom:
         LOG.error("Cannot find valid extent for page '%s'", page_id)
         return
     pad = self.parameter['padding']
     # grow the box by the padding, but never beyond the image
     left, top = max(left - pad, 0), max(top - pad, 0)
     right = min(right + pad, page_image.width)
     bottom = min(bottom + pad, page_image.height)
     LOG.info("Padded page border: %i:%i,%i:%i", left, right, top, bottom)
     outline = polygon_for_parent(
         coordinates_for_segment(
             polygon_from_bbox(left, top, right, bottom),
             page_image, page_xywh),
         page)
     if outline is None:
         LOG.error("Ignoring extant border")
         return
     border = BorderType(Coords=CoordsType(points_from_polygon(outline)))
     # the intersection with the parent may have changed the box,
     # so derive it again from the final border:
     crop_box = bbox_from_polygon(
         coordinates_of_segment(border, page_image, page_xywh))
     # PAGE: annotate the border
     page.set_Border(border)
     # METS: add the cropped image file
     cropped = crop_image(page_image, box=crop_box)
     page_xywh['features'] += ',cropped'
     saved_path = self.workspace.save_image_file(
         cropped, file_id + '.IMG-CROP',
         page_id=page_id, file_grp=self.output_file_grp)
     # PAGE: reference the cropped image file
     alternative = AlternativeImageType(
         filename=saved_path, comments=page_xywh['features'])
     page.add_AlternativeImage(alternative)
Exemplo n.º 5
0
def segment_poly(page_id, segment, coords):
    """Return the polygon of ``segment`` or ``None`` if it fails validation.

    The polygon is built from ``coordinates_of_segment(segment, None, coords)``.
    It is rejected (with an error log naming the page, the segment and the
    reason) when construction raises ``ValueError`` or when the first
    failing check reports a problem.
    """
    LOG = getLogger('processor.ExtractPages')
    outline = coordinates_of_segment(segment, None, coords)

    def diagnose(shape):
        """Return the first problem found with ``shape`` ('' if none)."""
        if not shape.is_valid:
            return explain_validity(shape)
        if shape.is_empty:
            return 'is empty'
        if any(value < 0 for value in shape.bounds[:2]):
            return 'is negative'
        # NOTE(review): compares ``length`` with 4 although the message is
        # about points -- confirm the intended criterion.
        if shape.length < 4:
            return 'has too few points'
        return ''

    # validate coordinates (checks stay inside the try, so a ValueError
    # raised by any of them is reported the same way)
    try:
        poly = Polygon(outline)
        problem = diagnose(poly)
    except ValueError as err:
        problem = err
    if not problem:
        return poly
    # e.g. 'TextRegionType' -> 'TextRegion'; only the border has no id shown
    tag = segment.__class__.__name__.replace('Type', '')
    if tag != 'Border':
        tag += ' "%s"' % segment.id
    LOG.error('Page "%s" %s %s', page_id, tag, problem)
    return None
Exemplo n.º 6
0
 def process(self):
     """Extract page images and region descriptions (type and coordinates) from the workspace.

     Open and deserialize PAGE input files and their respective images,
     then iterate over the element hierarchy down to the region level.

     Get all regions with their types (region element class), sub-types (@type)
     and coordinates relative to the page (which depending on the workflow could
     already be cropped, deskewed, dewarped, binarized etc). Extract the image of
     the page, both in binarized and non-binarized form. In addition, create a new
     image which color-codes all regions. Create a JSON file with region types and
     coordinates.

     Write all files in the directory of the output file group, named like so:
     * ID + '.png': raw image
     * ID + '.bin.png': binarized image
     * ID + '.dbg.png': debug image
     * ID + '.json': region coordinates

     (This is intended for training and evaluation of region segmentation models.)
     """
     # pylint: disable=attribute-defined-outside-init
     for n, input_file in enumerate(self.input_files):
         file_id = input_file.ID.replace(self.input_file_grp,
                                         self.output_file_grp)
         page_id = input_file.pageId or input_file.ID
         LOG.info("INPUT FILE %i / %s", n, page_id)
         pcgts = page_from_file(self.workspace.download_file(input_file))
         page = pcgts.get_Page()
         ptype = page.get_type()
         # record this processing step and its parameters in the metadata
         metadata = pcgts.get_Metadata()  # ensured by from_file()
         metadata.add_MetadataItem(
             MetadataItemType(
                 type_="processingStep",
                 name=self.ocrd_tool['steps'][0],
                 value=TOOL,
                 Labels=[
                     LabelsType(externalModel="ocrd-tool",
                                externalId="parameters",
                                Label=[
                                    LabelType(type_=name,
                                              value=self.parameter[name])
                                    for name in self.parameter.keys()
                                ])
                 ]))
         # raw image: anything but binarized
         page_image, page_coords, page_image_info = self.workspace.image_from_page(
             page,
             page_id,
             feature_filter='binarized',
             transparency=self.parameter['transparency'])
         # a resolution of 1 is treated as "unknown"
         if page_image_info.resolution != 1:
             dpi = page_image_info.resolution
             if page_image_info.resolutionUnit == 'cm':
                 dpi = round(dpi * 2.54)
         else:
             dpi = None
         file_path = self.workspace.save_image_file(page_image,
                                                    file_id,
                                                    self.output_file_grp,
                                                    page_id=page_id)
         # binarized image
         page_image_bin, _, _ = self.workspace.image_from_page(
             page,
             page_id,
             feature_selector='binarized',
             transparency=self.parameter['transparency'])
         self.workspace.save_image_file(page_image_bin,
                                        file_id + '.bin',
                                        self.output_file_grp,
                                        page_id=page_id)
         # debug image: regions painted onto a black canvas
         page_image_dbg = Image.new(mode='RGB',
                                    size=page_image.size,
                                    color=0)
         # one drawing context for the whole page instead of two per region
         canvas = ImageDraw.Draw(page_image_dbg)
         regions = {
             'text': page.get_TextRegion(),
             'table': page.get_TableRegion(),
             'chart': page.get_ChartRegion(),
             'chem': page.get_ChemRegion(),
             'graphic': page.get_GraphicRegion(),
             'image': page.get_ImageRegion(),
             'linedrawing': page.get_LineDrawingRegion(),
             'maths': page.get_MathsRegion(),
             'music': page.get_MusicRegion(),
             'noise': page.get_NoiseRegion(),
             'separator': page.get_SeparatorRegion(),
             'unknown': page.get_UnknownRegion()
         }
         description = {'angle': page.get_orientation()}
         for rtype, rlist in regions.items():
             for region in rlist:
                 polygon = coordinates_of_segment(region, page_image,
                                                  page_coords).tolist()
                 description.setdefault('regions', []).append({
                     'type':
                     rtype,
                     'subtype':
                     region.get_type()
                     if rtype in ['text', 'chart', 'graphic'] else None,
                     'coords':
                     polygon,
                     'features':
                     page_coords['features'],
                     'DPI':
                     dpi,
                     'region.ID':
                     region.id,
                     'page.ID':
                     page_id,
                     'page.type':
                     ptype,
                     'file_grp':
                     self.input_file_grp,
                     'METS.UID':
                     self.workspace.mets.unique_identifier
                 })
                 # filled area in the class colour ...
                 canvas.polygon(list(map(tuple, polygon)),
                                fill=CLASSES[rtype])
                 # ... plus a closed outline in the border colour
                 canvas.line(list(map(tuple, polygon + [polygon[0]])),
                             fill=CLASSES['border'],
                             width=3)
         self.workspace.save_image_file(page_image_dbg,
                                        file_id + '.dbg',
                                        self.output_file_grp,
                                        page_id=page_id)
         # the JSON description goes next to the raw image
         file_path = file_path.replace('.png', '.json')
         # context manager, so the file is flushed and closed deterministically
         with open(file_path, 'w') as json_file:
             json.dump(description, json_file)
Exemplo n.º 7
0
    def process(self):
        """Performs (text) line segmentation with Tesseract on the workspace.

        Open and deserialize PAGE input files and their respective images,
        then iterate over the element hierarchy down to the (text) region level,
        and remove any existing TextLine elements (unless ``overwrite_lines``
        is False, in which case they are kept and the newly detected lines
        are added to them).

        Set up Tesseract to detect lines, and add each one to the region
        at the detected coordinates. Lines extending beyond their region are
        clipped to the convex hull of the largest overlapping part; lines
        without any overlap of positive area are dropped.

        Produce a new output file by serialising the resulting hierarchy.
        """
        assert_file_grp_cardinality(self.input_file_grp, 1)
        assert_file_grp_cardinality(self.output_file_grp, 1)

        overwrite_lines = self.parameter['overwrite_lines']

        with PyTessBaseAPI(
                psm=PSM.SINGLE_BLOCK,
                path=TESSDATA_PREFIX
        ) as tessapi:
            for (n, input_file) in enumerate(self.input_files):
                page_id = input_file.pageId or input_file.ID
                LOG.info("INPUT FILE %i / %s", n, page_id)
                pcgts = page_from_file(self.workspace.download_file(input_file))
                page = pcgts.get_Page()

                # add metadata about this operation and its runtime parameters:
                metadata = pcgts.get_Metadata() # ensured by from_file()
                metadata.add_MetadataItem(
                    MetadataItemType(type_="processingStep",
                                     name=self.ocrd_tool['steps'][0],
                                     value=TOOL,
                                     Labels=[LabelsType(
                                         externalModel="ocrd-tool",
                                         externalId="parameters",
                                         Label=[LabelType(type_=name,
                                                          value=self.parameter[name])
                                                for name in self.parameter.keys()])]))

                page_image, page_coords, page_image_info = self.workspace.image_from_page(
                    page, page_id)
                # DPI: parameter override first, then image meta-data
                # (a resolution of 1 is treated as unknown)
                if self.parameter['dpi'] > 0:
                    dpi = self.parameter['dpi']
                    LOG.info("Page '%s' images will use %d DPI from parameter override", page_id, dpi)
                elif page_image_info.resolution != 1:
                    dpi = page_image_info.resolution
                    if page_image_info.resolutionUnit == 'cm':
                        dpi = round(dpi * 2.54)
                    LOG.info("Page '%s' images will use %d DPI from image meta-data", page_id, dpi)
                else:
                    dpi = 0
                    LOG.info("Page '%s' images will use DPI estimated from segmentation", page_id)
                if dpi:
                    tessapi.SetVariable('user_defined_dpi', str(dpi))

                # text regions of the page, plus those nested in table regions
                for region in itertools.chain.from_iterable(
                        [page.get_TextRegion()] +
                        [subregion.get_TextRegion() for subregion in page.get_TableRegion()]):
                    if region.get_TextLine():
                        if overwrite_lines:
                            LOG.info('removing existing TextLines in region "%s"', region.id)
                            region.set_TextLine([])
                        else:
                            LOG.warning('keeping existing TextLines in region "%s"', region.id)
                    LOG.debug("Detecting lines in region '%s'", region.id)
                    region_image, region_coords = self.workspace.image_from_segment(
                        region, page_image, page_coords)
                    region_polygon = coordinates_of_segment(region, region_image, region_coords)
                    region_poly = Polygon(region_polygon)
                    tessapi.SetImage(region_image)
                    for line_no, component in enumerate(tessapi.GetComponentImages(RIL.TEXTLINE, True, raw_image=True)):
                        line_id = '%s_line%04d' % (region.id, line_no)
                        line_polygon = polygon_from_xywh(component[1])
                        line_poly = Polygon(line_polygon)
                        if not line_poly.within(region_poly):
                            # this could happen due to rotation
                            interline = line_poly.intersection(region_poly)
                            if interline.is_empty:
                                continue # ignore this line
                            if hasattr(interline, 'geoms'):
                                # is (heterogeneous) GeometryCollection:
                                # keep only the part with the largest area
                                interline = max(interline.geoms,
                                                key=lambda geom: geom.area,
                                                default=interline)
                            if not interline.area:
                                # Degenerate overlap (only points or lines):
                                # its convex hull has no exterior ring, so
                                # there is no polygon to annotate.
                                continue
                            line_poly = interline.convex_hull
                            line_polygon = line_poly.exterior.coords
                        line_polygon = coordinates_for_segment(line_polygon, region_image, region_coords)
                        line_points = points_from_polygon(line_polygon)
                        region.add_TextLine(TextLineType(
                            id=line_id, Coords=CoordsType(line_points)))

                # serialise the result and register it in the workspace
                file_id = make_file_id(input_file, self.output_file_grp)
                pcgts.set_pcGtsId(file_id)
                self.workspace.add_file(
                    ID=file_id,
                    file_grp=self.output_file_grp,
                    pageId=input_file.pageId,
                    mimetype=MIMETYPE_PAGE,
                    local_filename=os.path.join(self.output_file_grp,
                                                file_id + '.xml'),
                    content=to_xml(pcgts))
Exemplo n.º 8
0
    def process(self):
        """Clip text regions / lines of the workspace at intersections with neighbours.

        For each PAGE input file, load the binarized page image and walk the
        hierarchy down to the requested ``level-of-operation``.

        On that level, build a mask for every segment from its outline
        (cropping into the next higher image), and for every text segment that
        intersects neighbours hand the segment, its mask and the neighbours
        over to ``process_segment``, which takes care of the actual clipping.
        (Segments which already have an AlternativeImage annotated are skipped
        with a warning.)

        File IDs handed on for the segment images consist of the output file ID
        plus the IDs of the elements involved.

        Finally, serialise the resulting hierarchy as a new PAGE output file.
        """
        # This makes best sense for overlapping segmentation, like current GT
        # or Tesseract layout analysis. Most notably, it can suppress graphics
        # and separators within or across a region or line. It _should_ ideally
        # be run after binarization (on page level for region-level clipping,
        # and on the region level for line-level clipping), because the
        # connected component analysis after implicit binarization could be
        # suboptimal, and the explicit binarization after clipping could be,
        # too. However, region-level clipping _must_ be run before region-level
        # deskewing, because that would make segments incommensurable with their
        # neighbours.
        LOG = getLogger('processor.OcropyClip')
        level = self.parameter['level-of-operation']
        assert_file_grp_cardinality(self.input_file_grp, 1)
        assert_file_grp_cardinality(self.output_file_grp, 1)

        def overlapping(index, outlines, segments, segment_masks):
            """Pair each other segment intersecting number ``index`` with its mask."""
            probe = prep(outlines[index])
            return [(other, other_mask)
                    for k, (outline, other, other_mask) in enumerate(
                        zip(outlines, segments, segment_masks))
                    if k != index and probe.intersects(outline)]

        for n, input_file in enumerate(self.input_files):
            LOG.info("INPUT FILE %i / %s", n,
                     input_file.pageId or input_file.ID)
            file_id = make_file_id(input_file, self.output_file_grp)

            pcgts = page_from_file(self.workspace.download_file(input_file))
            self.add_metadata(pcgts)
            # (PageType has no id)
            page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID
            page = pcgts.get_Page()

            page_image, page_coords, page_image_info = self.workspace.image_from_page(
                page, page_id, feature_selector='binarized')
            # zoom factor relative to 300 DPI: parameter override first,
            # then image meta-data (a resolution of 1 is treated as unknown)
            dpi_override = self.parameter['dpi']
            if dpi_override > 0:
                zoom = 300.0 / dpi_override
            elif page_image_info.resolution != 1:
                dpi = page_image_info.resolution
                if page_image_info.resolutionUnit == 'cm':
                    dpi *= 2.54
                LOG.info('Page "%s" uses %f DPI', page_id, dpi)
                zoom = 300.0 / dpi
            else:
                zoom = 1

            # FIXME: what about text regions inside table regions?
            text_regions = list(page.get_TextRegion())
            num_texts = len(text_regions)
            # text regions come first, all other region types follow
            regions = text_regions + (
                page.get_AdvertRegion() + page.get_ChartRegion() +
                page.get_ChemRegion() + page.get_GraphicRegion() +
                page.get_ImageRegion() + page.get_LineDrawingRegion() +
                page.get_MathsRegion() + page.get_MusicRegion() +
                page.get_NoiseRegion() + page.get_SeparatorRegion() +
                page.get_TableRegion() + page.get_UnknownRegion())
            if not num_texts:
                LOG.warning('Page "%s" contains no text regions', page_id)

            # background colour = median of the page image
            stat = ImageStat.Stat(page_image)
            # workaround for Pillow#4925
            background = (tuple(stat.median)
                          if len(stat.bands) > 1 else stat.median[0])

            if level == 'region':
                background_image = Image.new(page_image.mode, page_image.size,
                                             background)
                page_array = pil2array(page_image)
                page_bin = np.array(page_array <= midrange(page_array),
                                    np.uint8)
                # in absolute coordinates merely for comparison/intersection
                shapes = [
                    Polygon(polygon_from_points(region.get_Coords().points))
                    for region in regions
                ]
                # in relative coordinates for mask/cropping
                polygons = [
                    coordinates_of_segment(region, page_image, page_coords)
                    for region in regions
                ]
                # for non-text regions, extend mask by 3 pixels in each
                # direction to ensure they do not leak components accidentally
                # (accounts for bad cropping of such regions in GT):
                for k in range(num_texts, len(polygons)):
                    grown = Polygon(polygons[k]).buffer(3)
                    polygons[k] = grown.exterior.coords[:-1]  # keep open
                masks = [
                    pil2array(polygon_mask(page_image,
                                           polygon)).astype(np.uint8)
                    for polygon in polygons
                ]
                # only text regions get clipped; the others stay unchanged
                for i, region in enumerate(text_regions):
                    if region.get_AlternativeImage():
                        # FIXME: This should probably be an exception (bad workflow configuration).
                        LOG.warning(
                            'Page "%s" region "%s" already contains image data: skipping',
                            page_id, region.id)
                        continue
                    neighbours = overlapping(i, shapes, regions, masks)
                    if not neighbours:
                        continue
                    self.process_segment(region, masks[i], polygons[i],
                                         neighbours, background_image,
                                         page_image, page_coords, page_bin,
                                         input_file.pageId,
                                         file_id + '_' + region.id)
            else:
                # level == 'line':
                for region in text_regions:
                    lines = region.get_TextLine()
                    if not lines:
                        LOG.warning(
                            'Page "%s" region "%s" contains no text lines',
                            page_id, region.id)
                        continue
                    region_image, region_coords = self.workspace.image_from_segment(
                        region,
                        page_image,
                        page_coords,
                        feature_selector='binarized')
                    background_image = Image.new(region_image.mode,
                                                 region_image.size,
                                                 background)
                    region_array = pil2array(region_image)
                    region_bin = np.array(
                        region_array <= midrange(region_array), np.uint8)
                    # in absolute coordinates merely for comparison/intersection
                    shapes = [
                        Polygon(polygon_from_points(line.get_Coords().points))
                        for line in lines
                    ]
                    # in relative coordinates for mask/cropping
                    polygons = [
                        coordinates_of_segment(line, region_image,
                                               region_coords)
                        for line in lines
                    ]
                    masks = [
                        pil2array(polygon_mask(region_image,
                                               polygon)).astype(np.uint8)
                        for polygon in polygons
                    ]
                    for j, line in enumerate(lines):
                        if line.get_AlternativeImage():
                            # FIXME: This should probably be an exception (bad workflow configuration).
                            LOG.warning(
                                'Page "%s" region "%s" line "%s" already contains image data: skipping',
                                page_id, region.id, line.id)
                            continue
                        neighbours = overlapping(j, shapes, lines, masks)
                        if not neighbours:
                            continue
                        self.process_segment(
                            line, masks[j], polygons[j], neighbours,
                            background_image, region_image, region_coords,
                            region_bin, input_file.pageId,
                            file_id + '_' + region.id + '_' + line.id)

            # update METS (add the PAGE file):
            file_path = os.path.join(self.output_file_grp, file_id + '.xml')
            pcgts.set_pcGtsId(file_id)
            out = self.workspace.add_file(ID=file_id,
                                          file_grp=self.output_file_grp,
                                          pageId=input_file.pageId,
                                          local_filename=file_path,
                                          mimetype=MIMETYPE_PAGE,
                                          content=to_xml(pcgts))
            LOG.info('created file ID: %s, file_grp: %s, path: %s', file_id,
                     self.output_file_grp, out.local_filename)
Exemplo n.º 9
0
    def process(self):
        """Extract page images and region descriptions (type and coordinates) from the workspace.

        Open and deserialize PAGE input files and their respective images,
        then iterate over the element hierarchy down to the region level.

        Get all regions with their types (region element class), sub-types (@type)
        and coordinates relative to the page (which depending on the workflow could
        already be cropped, deskewed, dewarped, binarized etc). Extract the image of
        the (cropped, deskewed, dewarped) page, both in binarized form (if available)
        and non-binarized form. In addition, create a new image with masks for all
        regions, color-coded by type. Create two JSON files with region types and
        coordinates: one (page-wise) in our custom format and one (global) in MS-COCO.

        The output file group may be given as a comma-separated list to separate
        these 3 page-level images. Write files as follows:
        * in the first (or only) output file group:
          - ID: raw image of the (preprocessed) page
          - ID + '.json': region coordinates/classes (custom format); its path
            is derived from the raw image path by swapping the extension, but
            it is registered under the debug file group
        * in the second (or first) output file group:
          - ID + '.bin': binarized image of the (preprocessed) page, if available
        * in the third (or first) output file group:
          - ID + '.dbg': debug image (region masks, color-coded by type)

        In addition, write files for all pages at once:
        * in the third (or first) output file group:
          - that group's name + '.coco.json': region coordinates/classes (MS-COCO format)
        * in the current working directory:
          - that group's name + '.colordict.json': color definitions (as in PAGE viewer)

        Regions whose coordinates cannot be turned into a usable polygon are
        logged as errors and left out of all outputs.

        (This is intended for training and evaluation of region segmentation models.)

        Raises:
            Exception: if more than 3 output file groups are given.
        """
        file_groups = self.output_file_grp.split(',')
        if len(file_groups) > 3:
            raise Exception(
                "at most 3 output file grps allowed (raw, [binarized, [mask]] image)"
            )
        if len(file_groups) > 2:
            dbg_image_grp = file_groups[2]
        else:
            dbg_image_grp = file_groups[0]
            LOG.info(
                "No output file group for debug images specified, falling back to output filegrp '%s'",
                dbg_image_grp)
        if len(file_groups) > 1:
            bin_image_grp = file_groups[1]
        else:
            bin_image_grp = file_groups[0]
            LOG.info(
                "No output file group for binarized images specified, falling back to output filegrp '%s'",
                bin_image_grp)
        # from here on, the "output file group" is only the first (raw image) group
        self.output_file_grp = file_groups[0]

        # COCO: init data structures
        images = []
        annotations = []
        categories = []
        for cat_id, (cat, color) in enumerate(CLASSES.items()):
            # COCO format does not allow alpha channel
            color = (int(color[0:2], 16), int(color[2:4],
                                              16), int(color[4:6], 16))
            try:
                supercat, name = cat.split(':')
            except ValueError:
                # no (or more than one) colon: treat as top-level category
                name = cat
                supercat = ''
            categories.append({
                'id': cat_id,
                'name': name,
                'supercategory': supercat,
                'source': 'PAGE',
                'color': color
            })

        # lightweight record of already accepted regions (for overlap checks)
        Neighbor = namedtuple('Neighbor', ['id', 'poly', 'type'])
        # running COCO annotation id (incremented before use, so ids start at 1)
        annotation_id = 0
        # pylint: disable=attribute-defined-outside-init
        for n, input_file in enumerate(self.input_files):
            file_id = input_file.ID.replace(self.input_file_grp,
                                            self.output_file_grp)
            page_id = input_file.pageId or input_file.ID
            # strip the non-digit prefix/suffix of page_id, keeping the numeric core
            num_page_id = int(page_id.strip(page_id.strip("0123456789")))
            LOG.info("INPUT FILE %i / %s", n, page_id)
            pcgts = page_from_file(self.workspace.download_file(input_file))
            page = pcgts.get_Page()
            ptype = page.get_type()
            metadata = pcgts.get_Metadata()  # ensured by from_file()
            metadata.add_MetadataItem(
                MetadataItemType(
                    type_="processingStep",
                    name=self.ocrd_tool['steps'][0],
                    value=TOOL,
                    Labels=[
                        LabelsType(externalModel="ocrd-tool",
                                   externalId="parameters",
                                   Label=[
                                       LabelType(type_=name,
                                                 value=self.parameter[name])
                                       for name in self.parameter
                                   ])
                    ]))
            # raw (non-binarized) page image:
            page_image, page_coords, page_image_info = self.workspace.image_from_page(
                page,
                page_id,
                feature_filter='binarized',
                transparency=self.parameter['transparency'])
            if page_image_info.resolution != 1:
                dpi = page_image_info.resolution
                if page_image_info.resolutionUnit == 'cm':
                    dpi = round(dpi * 2.54)
            else:
                dpi = None
            file_path = self.workspace.save_image_file(
                page_image,
                file_id,
                self.output_file_grp,
                page_id=page_id,
                mimetype=self.parameter['mimetype'])
            # binarized page image (optional):
            try:
                page_image_bin, _, _ = self.workspace.image_from_page(
                    page,
                    page_id,
                    feature_selector='binarized',
                    transparency=self.parameter['transparency'])
                self.workspace.save_image_file(page_image_bin,
                                               file_id + '.bin',
                                               bin_image_grp,
                                               page_id=page_id)
            except Exception as err:
                # only a missing binarized image is tolerable; guard against
                # exceptions without (string) arguments so that the original
                # error is re-raised instead of being masked by an IndexError
                if err.args and str(err.args[0]).startswith(
                        'Found no AlternativeImage'):
                    LOG.warning(
                        'Page "%s" has no binarized images, skipping .bin',
                        page_id)
                else:
                    raise
            # debug image: background color first, then Border, then regions
            page_image_dbg = Image.new(mode='RGBA',
                                       size=page_image.size,
                                       color='#' + CLASSES[''])
            if page.get_Border():
                polygon = coordinates_of_segment(page.get_Border(), page_image,
                                                 page_coords).tolist()
                ImageDraw.Draw(page_image_dbg).polygon(
                    list(map(tuple, polygon)), fill='#' + CLASSES['Border'])
            else:
                # no Border annotated: the whole image counts as Border
                page_image_dbg.paste(
                    '#' + CLASSES['Border'],
                    (0, 0, page_image.width, page_image.height))
            regions = {}
            for name in CLASSES.keys():
                if not name or name == 'Border' or ':' in name:
                    # no subtypes here
                    continue
                regions[name] = getattr(page, 'get_' + name)()
            description = {'angle': page.get_orientation()}
            neighbors = []
            for rtype, rlist in regions.items():
                for region in rlist:
                    if rtype in ['TextRegion', 'ChartRegion', 'GraphicRegion']:
                        subrtype = region.get_type()
                    else:
                        subrtype = None
                    polygon = coordinates_of_segment(region, page_image,
                                                     page_coords)
                    # flat [[x1, y1, x2, y2, ...]] list for COCO
                    polygon2 = polygon.reshape(1, -1).tolist()
                    polygon = polygon.tolist()
                    xywh = xywh_from_polygon(polygon)
                    # validate coordinates and check intersection with neighbours
                    # (which would melt into another in the mask image):
                    try:
                        poly = Polygon(polygon)
                    except ValueError as err:
                        # no polygon could be constructed at all, so there is
                        # nothing to validate any further: report and skip
                        # (instead of inspecting an unbound or stale `poly`)
                        LOG.error('Page "%s" region "%s" %s', page_id,
                                  region.id, err)
                        continue
                    reason = ''
                    if not poly.is_valid:
                        reason = explain_validity(poly)
                    elif poly.is_empty:
                        reason = 'is empty'
                    elif poly.bounds[0] < 0 or poly.bounds[1] < 0:
                        reason = 'is negative'
                    elif poly.length < 4:
                        reason = 'has too few points'
                    if reason:
                        LOG.error('Page "%s" region "%s" %s', page_id,
                                  region.id, reason)
                        continue
                    poly_prep = prep(poly)
                    for neighbor in neighbors:
                        if rtype == neighbor.type:
                            # same type: any true (non-zero area) overlap
                            # would merge both regions in the mask image
                            if not poly_prep.intersects(neighbor.poly):
                                continue
                            inter_area = poly.intersection(neighbor.poly).area
                            if inter_area > 0:
                                LOG.warning('Page "%s" region "%s" intersects neighbour "%s" (IoU: %.3f)',
                                            page_id, region.id, neighbor.id,
                                            inter_area / \
                                            poly.union(neighbor.poly).area)
                        elif poly_prep.within(neighbor.poly):
                            # different type: only full containment is reported
                            LOG.warning(
                                'Page "%s" region "%s" within neighbour "%s" (IoU: %.3f)',
                                page_id, region.id, neighbor.id,
                                poly.area / neighbor.poly.area)
                    neighbors.append(Neighbor(region.id, poly, rtype))
                    area = poly.area
                    description.setdefault('regions', []).append({
                        'type':
                        rtype,
                        'subtype':
                        subrtype,
                        'coords':
                        polygon,
                        'area':
                        area,
                        'features':
                        page_coords['features'],
                        'DPI':
                        dpi,
                        'region.ID':
                        region.id,
                        'page.ID':
                        page_id,
                        'page.type':
                        ptype,
                        'file_grp':
                        self.input_file_grp,
                        'METS.UID':
                        self.workspace.mets.unique_identifier
                    })
                    # draw region:
                    ImageDraw.Draw(page_image_dbg).polygon(
                        list(map(tuple, polygon)),
                        fill='#' + CLASSES[(rtype + ':' +
                                            subrtype) if subrtype else rtype])
                    # COCO: add annotations
                    annotation_id += 1
                    annotations.append({
                        'id':
                        annotation_id,
                        'image_id':
                        num_page_id,
                        # prefer the category of the subtype (if any),
                        # otherwise fall back to the category of the type
                        'category_id':
                        next(
                            (cat['id']
                             for cat in categories if cat['name'] == subrtype),
                            next((cat['id'] for cat in categories
                                  if cat['name'] == rtype))),
                        'segmentation':
                        polygon2,
                        'area':
                        area,
                        'bbox': [xywh['x'], xywh['y'], xywh['w'], xywh['h']],
                        'iscrowd':
                        0
                    })

            self.workspace.save_image_file(page_image_dbg,
                                           file_id + '.dbg',
                                           dbg_image_grp,
                                           page_id=page_id,
                                           mimetype=self.parameter['mimetype'])
            # page-wise JSON: same path as the raw image, but with .json extension
            self.workspace.add_file(
                ID=file_id + '.json',
                file_grp=dbg_image_grp,
                pageId=page_id,
                local_filename=file_path.replace(
                    MIME_TO_EXT[self.parameter['mimetype']], '.json'),
                mimetype='application/json',
                content=json.dumps(description))

            # COCO: add image
            images.append({
                # COCO does not allow string identifiers:
                # -> use numerical part of page_id
                'id': num_page_id,
                # all exported coordinates are relative to the cropped page:
                # -> use that for reference (instead of original page.imageFilename)
                'file_name': file_path,
                # -> use its size (instead of original page.imageWidth/page.imageHeight)
                'width': page_image.width,
                'height': page_image.height
            })

        # COCO: write result
        file_id = dbg_image_grp + '.coco.json'
        LOG.info('Writing COCO result file "%s" in "%s"', file_id,
                 dbg_image_grp)
        self.workspace.add_file(ID=file_id,
                                file_grp=dbg_image_grp,
                                local_filename=os.path.join(
                                    dbg_image_grp, file_id),
                                mimetype='application/json',
                                content=json.dumps({
                                    'categories': categories,
                                    'images': images,
                                    'annotations': annotations
                                }))

        # write inverse colordict (for ocrd-segment-from-masks)
        # NOTE: relative path, i.e. this goes to the current working directory
        file_id = dbg_image_grp + '.colordict.json'
        LOG.info('Writing colordict file "%s" in .', file_id)
        with open(file_id, 'w') as out:
            json.dump(
                {'#' + col: name
                 for name, col in CLASSES.items() if name}, out)
Exemplo n.º 10
0
    def process(self):
        """Performs page cropping with Tesseract on the workspace.

        Open and deserialize PAGE input files and their respective images.
        Set up Tesseract to detect text blocks on each page, and find
        the largest coordinate extent spanning all of them. Use this
        extent in defining a Border, and add that to the page.

        Moreover, crop the original image accordingly, and reference the
        resulting image file as AlternativeImage in the Page element.

        Add the new image file to the workspace along with the output fileGrp,
        and using a file ID with suffix ``.IMG-CROP`` along with further
        identification of the input element.

        Produce new output files by serialising the resulting hierarchy.
        If no valid extent can be found for a page, an error is logged and
        the page is written to the output fileGrp without a new Border.
        """
        LOG = getLogger('processor.TesserocrCrop')
        assert_file_grp_cardinality(self.input_file_grp, 1)
        assert_file_grp_cardinality(self.output_file_grp, 1)

        padding = self.parameter['padding']
        with tesserocr.PyTessBaseAPI(path=TESSDATA_PREFIX) as tessapi:
            # disable table detection here (tables count as text blocks),
            # because we do not want to risk confusing the spine with
            # a column separator and thus creeping into a neighbouring
            # page:
            tessapi.SetVariable("textord_tabfind_find_tables", "0")
            for (n, input_file) in enumerate(self.input_files):
                page_id = input_file.pageId or input_file.ID
                # determine the output file ID up front: it is needed for the
                # PAGE result regardless of whether cropping succeeds below
                # (otherwise a failed page would hit an unbound name, or
                # silently reuse the ID of the previous page)
                file_id = make_file_id(input_file, self.output_file_grp)
                LOG.info("INPUT FILE %i / %s", n, page_id)
                pcgts = page_from_file(
                    self.workspace.download_file(input_file))
                self.add_metadata(pcgts)
                page = pcgts.get_Page()

                # warn of existing Border:
                border = page.get_Border()
                if border:
                    left, top, right, bottom = bbox_from_points(
                        border.get_Coords().points)
                    LOG.warning('Overwriting existing Border: %i:%i,%i:%i',
                                left, top, right, bottom)

                page_image, page_xywh, page_image_info = self.workspace.image_from_page(
                    page,
                    page_id,
                    # image must not have been cropped already,
                    # abort if no such image can be produced:
                    feature_filter='cropped')
                # determine pixel density: parameter > image meta-data > unknown
                if self.parameter['dpi'] > 0:
                    dpi = self.parameter['dpi']
                    LOG.info(
                        "Page '%s' images will use %d DPI from parameter override",
                        page_id, dpi)
                elif page_image_info.resolution != 1:
                    dpi = page_image_info.resolution
                    if page_image_info.resolutionUnit == 'cm':
                        dpi = round(dpi * 2.54)
                    LOG.info(
                        "Page '%s' images will use %d DPI from image meta-data",
                        page_id, dpi)
                else:
                    dpi = 0
                    LOG.info(
                        "Page '%s' images will use DPI estimated from segmentation",
                        page_id)
                if dpi:
                    tessapi.SetVariable('user_defined_dpi', str(dpi))
                    # scale factor relative to 300 DPI (used for size thresholds)
                    zoom = 300 / dpi
                else:
                    zoom = 1

                # warn of existing segmentation:
                regions = page.get_TextRegion()
                if regions:
                    min_x = page_image.width
                    min_y = page_image.height
                    max_x = 0
                    max_y = 0
                    for region in regions:
                        left, top, right, bottom = bbox_from_points(
                            region.get_Coords().points)
                        min_x = min(min_x, left)
                        min_y = min(min_y, top)
                        max_x = max(max_x, right)
                        max_y = max(max_y, bottom)
                    LOG.warning(
                        'Ignoring extent from existing TextRegions: %i:%i,%i:%i',
                        min_x, max_x, min_y, max_y)

                LOG.debug("Cropping with Tesseract")
                tessapi.SetImage(page_image)
                # PSM.SPARSE_TEXT: get as much text as possible in no particular order
                # PSM.AUTO (default): includes tables (dangerous)
                tessapi.SetPageSegMode(tesserocr.PSM.SPARSE_TEXT)
                #
                # helper variables for saving the box coordinates
                # (initialised to an "inverted" box, so any real block shrinks
                # the minima and grows the maxima)
                #
                min_x = page_image.width
                min_y = page_image.height
                max_x = 0
                max_y = 0
                # iterate over all text blocks and compare their
                # bbox extent to the running min and max values
                for component in tessapi.GetComponentImages(
                        tesserocr.RIL.BLOCK, True):
                    image, xywh, index, _ = component
                    #
                    # the region reference in the reading order element
                    #
                    ID = "region%04d" % index
                    left, top, right, bottom = bbox_from_xywh(xywh)
                    LOG.debug("Detected text region '%s': %i:%i,%i:%i", ID,
                              left, right, top, bottom)
                    # filter region results:
                    bin_bbox = image.getbbox()
                    if not bin_bbox:
                        # this does happen!
                        LOG.info(
                            "Ignoring region '%s' because its binarization is empty",
                            ID)
                        continue
                    width = bin_bbox[2] - bin_bbox[0]
                    if width < 25 / zoom:
                        # we must be conservative here: page numbers are tiny regions, too!
                        LOG.info(
                            "Ignoring region '%s' because its width is too small (%d)",
                            ID, width)
                        continue
                    height = bin_bbox[3] - bin_bbox[1]
                    if height < 25 / zoom:
                        # we must be conservative here: page numbers are tiny regions, too!
                        LOG.debug(
                            "Ignoring region '%s' because its height is too small (%d)",
                            ID, height)
                        continue
                    min_x = min(min_x, left)
                    min_y = min(min_y, top)
                    max_x = max(max_x, right)
                    max_y = max(max_y, bottom)
                    LOG.info("Updated page border: %i:%i,%i:%i", min_x, max_x,
                             min_y, max_y)

                #
                # set the identified page border
                #
                if min_x < max_x and min_y < max_y:
                    # add padding:
                    min_x = max(min_x - padding, 0)
                    max_x = min(max_x + padding, page_image.width)
                    min_y = max(min_y - padding, 0)
                    max_y = min(max_y + padding, page_image.height)
                    LOG.info("Padded page border: %i:%i,%i:%i", min_x, max_x,
                             min_y, max_y)
                    polygon = polygon_from_bbox(min_x, min_y, max_x, max_y)
                    polygon = coordinates_for_segment(polygon, page_image,
                                                      page_xywh)
                    polygon = polygon_for_parent(polygon, page)
                    border = BorderType(
                        Coords=CoordsType(points_from_polygon(polygon)))
                    # intersection with parent could have changed bbox,
                    # so recalculate:
                    bbox = bbox_from_polygon(
                        coordinates_of_segment(border, page_image, page_xywh))
                    # update PAGE (annotate border):
                    page.set_Border(border)
                    # update METS (add the image file):
                    page_image = crop_image(page_image, box=bbox)
                    page_xywh['features'] += ',cropped'
                    file_path = self.workspace.save_image_file(
                        page_image,
                        file_id + '.IMG-CROP',
                        page_id=input_file.pageId,
                        file_grp=self.output_file_grp)
                    # update PAGE (reference the image file):
                    page.add_AlternativeImage(
                        AlternativeImageType(filename=file_path,
                                             comments=page_xywh['features']))
                else:
                    LOG.error("Cannot find valid extent for page '%s'",
                              page_id)

                # update METS (add the PAGE file), even if cropping failed:
                pcgts.set_pcGtsId(file_id)
                self.workspace.add_file(ID=file_id,
                                        file_grp=self.output_file_grp,
                                        pageId=input_file.pageId,
                                        mimetype=MIMETYPE_PAGE,
                                        local_filename=os.path.join(
                                            self.output_file_grp,
                                            file_id + '.xml'),
                                        content=to_xml(pcgts))
Exemplo n.º 11
0
 def _process_segment(self, parent, parent_image, parent_coords, page_id,
                      zoom, lines, ignore):
     LOG = getLogger('processor.OcropyResegment')
     threshold = self.parameter['min_fraction']
     margin = self.parameter['extend_margins']
     method = self.parameter['method']
     # prepare line segmentation
     parent_array = pil2array(parent_image)
     #parent_array, _ = common.binarize(parent_array, maxskew=0) # just in case still raw
     parent_bin = np.array(parent_array <= midrange(parent_array), np.bool)
     ignore_bin = np.ones_like(parent_bin, np.bool)
     if isinstance(parent, PageType):
         tag = 'page'
         fullpage = True
         report = check_page(parent_bin, zoom)
     else:
         tag = 'region'
         fullpage = False
         report = check_region(parent_bin, zoom)
     if report:
         LOG.warning('Invalid %s "%s": %s', tag,
                     page_id if fullpage else parent.id, report)
         return
     # get existing line labels:
     line_labels = np.zeros_like(parent_bin, np.bool)
     line_labels = np.tile(line_labels[np.newaxis], (len(lines), 1, 1))
     line_polygons = []
     for i, segment in enumerate(lines):
         segment_polygon = coordinates_of_segment(segment, parent_image,
                                                  parent_coords)
         segment_polygon = make_valid(
             Polygon(segment_polygon)).buffer(margin)
         line_polygons.append(prep(segment_polygon))
         segment_polygon = np.array(segment_polygon.exterior, np.int)[:-1]
         # draw.polygon: If any segment_polygon lies outside of parent
         # (causing negative/above-max indices), either fully or partially,
         # then this will silently ignore them. The caller does not need
         # to concern herself with this.
         segment_y, segment_x = draw.polygon(segment_polygon[:, 1],
                                             segment_polygon[:, 0],
                                             parent_bin.shape)
         line_labels[i, segment_y, segment_x] = True
     # only text region(s) may contain new text lines
     for i, segment in enumerate(set(line.parent_object_
                                     for line in lines)):
         LOG.debug('unmasking area of text region "%s" for "%s"',
                   segment.id, page_id if fullpage else parent.id)
         segment_polygon = coordinates_of_segment(segment, parent_image,
                                                  parent_coords)
         segment_polygon = make_valid(
             Polygon(segment_polygon)).buffer(margin)
         segment_polygon = np.array(segment_polygon.exterior, np.int)[:-1]
         ignore_bin[draw.polygon(segment_polygon[:, 1], segment_polygon[:,
                                                                        0],
                                 parent_bin.shape)] = False
     # mask/ignore overlapping neighbours
     for i, segment in enumerate(ignore):
         LOG.debug('masking area of %s "%s" for "%s"',
                   type(segment).__name__[:-4], segment.id,
                   page_id if fullpage else parent.id)
         segment_polygon = coordinates_of_segment(segment, parent_image,
                                                  parent_coords)
         ignore_bin[draw.polygon(segment_polygon[:, 1], segment_polygon[:,
                                                                        0],
                                 parent_bin.shape)] = True
     if method != 'lineest':
         LOG.debug(
             'calculating connected component and distance transforms for "%s"',
             parent.id)
         bin = parent_bin & ~ignore_bin
         components, _ = morph.label(bin)
         # estimate glyph scale (roughly)
         _, counts = np.unique(components, return_counts=True)
         if counts.shape[0] > 1:
             counts = np.sqrt(3 * counts)
             scale = int(
                 np.median(counts[(5 / zoom < counts)
                                  & (counts < 100 / zoom)]))
             components *= (counts > 15 / zoom)[components]
             LOG.debug("estimated scale: %d", scale)
         else:
             scale = 43
         if method == 'ccomps':
             labels = np.insert(line_labels, 0, ignore_bin, axis=0)
             distances = np.zeros_like(labels, np.uint8)
             for i, label in enumerate(labels):
                 distances[i] = morph.dist_labels(label.astype(np.uint8))
                 # normalize the distances of all lines so larger ones do not displace smaller ones
                 distances[i] = distances[i] / distances[i].max() * 255
             # use depth to flatten overlapping lines as seed labels
             new_labels = np.argmax(distances, axis=0)
         else:
             new_labels = np.zeros_like(parent_bin, np.uint8)
             for i, line in enumerate(lines):
                 if line.Baseline is None:
                     LOG.warning("Skipping '%s' without baseline", line.id)
                     new_labels[line_labels[i]] = i + 1
                     continue
                 line_polygon = baseline_of_segment(line, parent_coords)
                 line_ltr = line_polygon[0, 0] < line_polygon[-1, 0]
                 line_polygon = make_valid(
                     join_polygons(
                         LineString(line_polygon).buffer(
                             # left-hand side if left-to-right, and vice versa
                             scale * (-1)**line_ltr,
                             single_sided=True),
                         loc=line.id))
                 line_polygon = np.array(line_polygon.exterior, np.int)[:-1]
                 line_y, line_x = draw.polygon(line_polygon[:, 1],
                                               line_polygon[:, 0],
                                               parent_bin.shape)
                 new_labels[line_y, line_x] = i + 1
         spread_dist(lines,
                     line_labels,
                     new_labels,
                     parent_bin,
                     components,
                     parent_coords,
                     scale=scale,
                     loc=parent.id,
                     threshold=threshold)
         return
     try:
         new_line_labels, _, _, _, _, scale = compute_segmentation(
             parent_bin,
             seps=ignore_bin,
             zoom=zoom,
             fullpage=fullpage,
             maxseps=0,
             maxcolseps=len(ignore),
             maximages=0)
     except Exception as err:
         LOG.warning('Cannot line-segment %s "%s": %s', tag,
                     page_id if fullpage else parent.id, err)
         return
     LOG.info("Found %d new line labels for %d existing lines on %s '%s'",
              new_line_labels.max(), len(lines), tag, parent.id)
     # polygonalize and prepare comparison
     new_line_polygons, new_line_labels = masks2polygons(
         new_line_labels,
         parent_bin,
         '%s "%s"' % (tag, parent.id),
         min_area=640 / zoom / zoom)
     # DSAVE('line_labels', [np.mean(line_labels, axis=0), parent_bin])
     # DSAVE('new_line_labels', [new_line_labels, parent_bin], disabled=False)
     new_line_polygons = [
         make_valid(Polygon(line_poly))
         for line_label, line_poly in new_line_polygons
     ]
     # polygons for intersecting pairs
     intersections = dict()
     # ratio of overlap between intersection and new line
     fits_bg = np.zeros((len(new_line_polygons), len(line_polygons)),
                        np.float)
     fits_fg = np.zeros((len(new_line_polygons), len(line_polygons)),
                        np.float)
     # ratio of overlap between intersection and existing line
     covers_bg = np.zeros((len(new_line_polygons), len(line_polygons)),
                          np.float)
     covers_fg = np.zeros((len(new_line_polygons), len(line_polygons)),
                          np.float)
     # compare segmentations, calculating ratios of overlapping fore/background area
     for i, new_line_poly in enumerate(new_line_polygons):
         for j, line_poly in enumerate(line_polygons):
             # too strict: .contains
             if line_poly.intersects(new_line_poly):
                 inter = make_intersection(line_poly.context, new_line_poly)
                 if not inter:
                     continue
                 new_line_mask = (new_line_labels == i + 1) & parent_bin
                 line_mask = line_labels[j] & parent_bin
                 inter_mask = new_line_mask & line_mask
                 if (not np.count_nonzero(inter_mask)
                         or not np.count_nonzero(new_line_mask)
                         or not np.count_nonzero(line_mask)):
                     continue
                 intersections[(i, j)] = inter
                 fits_bg[i, j] = inter.area / new_line_poly.area
                 covers_bg[i, j] = inter.area / line_poly.context.area
                 fits_fg[i, j] = np.count_nonzero(
                     inter_mask) / np.count_nonzero(new_line_mask)
                 covers_fg[i, j] = np.count_nonzero(
                     inter_mask) / np.count_nonzero(line_mask)
                 # LOG.debug("new %d old %d (%s): %.1f%% / %.1f%% bg, %.1f%% / %.1f%% fg",
                 #           i, j, lines[j].id,
                 #           fits_bg[i,j]*100, covers_bg[i,j]*100,
                 #           fits_fg[i,j]*100, covers_fg[i,j]*100)
     # assign new lines to existing lines, if possible
     assignments = np.ones(len(new_line_polygons), np.int) * -1
     for i, new_line_poly in enumerate(new_line_polygons):
         if not fits_bg[i].any():
             LOG.debug("new line %d fits no existing line's background", i)
             continue
         if not fits_fg[i].any():
             LOG.debug("new line %d fits no existing line's foreground", i)
             continue
         fits = (fits_bg[i] > 0.6) & (fits_fg[i] > 0.9)
         if not fits.any():
             j = np.argmax(fits_bg[i] * fits_fg[i])
             LOG.debug(
                 "best fit '%s' for new line %d fits only %.1f%% bg / %.1f%% fg",
                 lines[j].id, i, fits_bg[i, j] * 100, fits_fg[i, j] * 100)
             continue
         covers = covers_bg[i] * covers_fg[i] * fits
         j = np.argmax(covers)
         line = lines[j]
         inter_polygon = intersections[(i, j)]
         new_line_polygon = new_line_polygons[i]
         new_center = inter_polygon.centroid
         center = new_line_polygon.centroid
         # FIXME: apply reasonable threshold for centroid distance
         LOG.debug("new line for '%s' has centroid distance %.2f", line.id,
                   center.distance(new_center))
         assignments[i] = j
     # validate assignments retain enough area and do not loose unassigned matches
     line_polygons = [
         poly.context.buffer(-margin) for poly in line_polygons
     ]
     for j, line in enumerate(lines):
         new_lines = np.nonzero(assignments == j)[0]
         if not np.prod(new_lines.shape):
             LOG.debug("no lines for '%s' match or fit", line.id)
             continue
         covers = np.sum(covers_bg[new_lines, j])
         if covers < threshold / 3:
             LOG.debug("new lines for '%s' only cover %.1f%% bg", line.id,
                       covers * 100)
             continue
         covers = np.sum(covers_fg[new_lines, j])
         if covers < threshold:
             LOG.debug("new lines for '%s' only cover %.1f%% fg", line.id,
                       covers * 100)
             continue
         looses = (assignments < 0) & (covers_bg[:, j] > 0.1)
         if looses.any():
             covers = np.sum(covers_bg[np.nonzero(looses)[0], j])
             LOG.debug(
                 "new lines for '%s' would loose %d non-matching segments totalling %.1f%% bg",
                 line.id, np.count_nonzero(looses), covers * 100)
             continue
         line_count = np.count_nonzero(line_labels[j] & parent_bin)
         new_count = covers * line_count
         LOG.debug(
             'Black pixels before/after resegment of line "%s": %d/%d',
             line.id, line_count, new_count)
         # combine all assigned new lines to single outline polygon
         if len(new_lines) > 1:
             LOG.debug("joining %d new line polygons for '%s'",
                       len(new_lines), line.id)
         new_polygon = join_polygons(
             [intersections[(i, j)] for i in new_lines], loc=line.id)
         line_polygons[j] = new_polygon
         # convert back to absolute (page) coordinates:
         line_polygon = coordinates_for_segment(
             new_polygon.exterior.coords[:-1], parent_image, parent_coords)
         line_polygon = polygon_for_parent(line_polygon,
                                           line.parent_object_)
         if line_polygon is None:
             LOG.warning("Ignoring extant new polygon for line '%s'",
                         line.id)
             return
         # annotate result:
         line.get_Coords().set_points(points_from_polygon(line_polygon))
         # now also ensure the assigned lines do not overlap other existing lines
         for i in new_lines:
             for otherj in np.nonzero(fits_fg[i] > 0.1)[0]:
                 if j == otherj:
                     continue
                 otherline = lines[otherj]
                 LOG.debug("subtracting new '%s' from overlapping '%s'",
                           line.id, otherline.id)
                 other_polygon = diff_polygons(line_polygons[otherj],
                                               new_polygon)
                 if other_polygon.is_empty:
                     continue
                 # convert back to absolute (page) coordinates:
                 other_polygon = coordinates_for_segment(
                     other_polygon.exterior.coords[:-1], parent_image,
                     parent_coords)
                 other_polygon = polygon_for_parent(
                     other_polygon, otherline.parent_object_)
                 if other_polygon is None:
                     LOG.warning(
                         "Ignoring extant new polygon for line '%s'",
                         otherline.id)
                     continue
                 otherline.get_Coords().set_points(
                     points_from_polygon(other_polygon))
Exemplo n.º 12
0
    def process(self):
        """Extract page image and replace original with it.

        Open and deserialize PAGE input files and their respective images,
        then go to the page hierarchy level.

        Retrieve the image of the (cropped, deskewed, dewarped) page, preferring
        the last annotated form (which, depending on the workflow, could be
        binarized or raw); the ``feature_selector`` and ``feature_filter``
        parameters are passed on to the image retrieval. Add that image file
        to the workspace with the fileGrp USE given in the output fileGrp.

        Reference that file in the page as original image (``imageFilename``,
        ``imageWidth``, ``imageHeight``), reset the page's Border and
        orientation, and additionally add the file as AlternativeImage whose
        ``comments`` carry the remaining (not coordinate-related) features.
        If the ``transform_coordinates`` parameter is true, rewrite the
        coordinates of all regions, and of the lines, words and glyphs of
        text regions, so they refer to the new image.

        Produce a new output file by serialising the resulting hierarchy.
        """
        LOG = getLogger('processor.ReplaceOriginal')
        assert_file_grp_cardinality(self.input_file_grp, 1)
        assert_file_grp_cardinality(self.output_file_grp, 1)
        feature_selector = self.parameter['feature_selector']
        feature_filter = self.parameter['feature_filter']
        adapt_coords = self.parameter['transform_coordinates']
        # features that are excluded from the AlternativeImage comments,
        # because they have already been applied over the new "original"
        applied_features = ("cropped", "deskewed", "rotated-90",
                            "rotated-180", "rotated-270")

        def rebase(segment, image, coords):
            # Replace the segment's points by its polygon relative to
            # ``image``, then pass the segment through ensure_valid().
            polygon = coordinates_of_segment(segment, image, coords)
            segment.get_Coords().set_points(points_from_polygon(polygon))
            ensure_valid(segment)

        # pylint: disable=attribute-defined-outside-init
        for n, input_file in enumerate(self.input_files):
            page_id = input_file.pageId or input_file.ID
            LOG.info("INPUT FILE %i / %s", n, page_id)
            pcgts = page_from_file(self.workspace.download_file(input_file))
            self.add_metadata(pcgts)
            page = pcgts.get_Page()
            # the image info (third result) is not needed here
            page_image, page_coords, _ = self.workspace.image_from_page(
                page,
                page_id,
                feature_filter=feature_filter,
                feature_selector=feature_selector)
            # annotate extracted image
            file_id = make_file_id(input_file, self.output_file_grp)
            file_path = self.workspace.save_image_file(
                page_image,
                file_id + '-IMG',
                self.output_file_grp,
                page_id=input_file.pageId,
                mimetype='image/png')
            # replace original image
            page.set_imageFilename(file_path)
            # remove all coordinate-sensitive page-level annotations
            page.set_imageWidth(page_image.width)
            page.set_imageHeight(page_image.height)
            # NOTE(review): the original comment claims that resetting the
            # Border "also removes all derived images" -- confirm
            page.set_Border(None)
            page.set_orientation(None)
            # also add image as derived image (in order to preserve image features)
            # (but exclude coordinate-sensitive features that have already been
            # applied over the "original")
            features = ','.join(
                feature for feature in page_coords['features'].split(",")
                if feature not in applied_features)
            page.add_AlternativeImage(
                AlternativeImageType(filename=file_path, comments=features))
            # adjust all coordinates
            if adapt_coords:
                for region in page.get_AllRegions():
                    rebase(region, page_image, page_coords)
                    if not isinstance(region, TextRegionType):
                        # only text regions are descended into
                        continue
                    for line in region.get_TextLine():
                        rebase(line, page_image, page_coords)
                        for word in line.get_Word():
                            rebase(word, page_image, page_coords)
                            for glyph in word.get_Glyph():
                                rebase(glyph, page_image, page_coords)

            # update METS (add the PAGE file):
            out = self.workspace.add_file(ID=file_id,
                                          file_grp=self.output_file_grp,
                                          pageId=input_file.pageId,
                                          local_filename=os.path.join(
                                              self.output_file_grp,
                                              file_id + '.xml'),
                                          mimetype=MIMETYPE_PAGE,
                                          content=to_xml(pcgts))
            LOG.info('created file ID: %s, file_grp: %s, path: %s', file_id,
                     self.output_file_grp, out.local_filename)
Exemplo n.º 13
0
 def sanitize_page(self, page, page_id):
     """Replace text region outlines by the hull of their text lines.

     For every region returned by ``page.get_AllRegions(classes=['Text'])``:

     1. draw all text line polygons (filled, plus perimeter) into a mask
        of the page image's size,
     2. close the mask vertically with a structuring element whose height
        is the largest line height, or the largest vertical gap between
        consecutive lines (sorted by top) if that is larger,
     3. grow the mask with a 3x3 maximum filter,
     4. find the external contours of the mask, and keep those that make
        up at least 10% of the total contour area and still have at least
        4 points after simplification.

     If exactly one contour is kept, it is converted back via
     ``coordinates_for_segment`` and set as the region's new coordinates.
     Regions without lines, with zero contour area, with no contour kept
     or with more than one contour kept are left unchanged (and logged).

     Arguments:
         page: the page object to query for regions and to derive the image from
         page_id: page identifier, used for image retrieval and logging
     """
     LOG = getLogger('processor.RepairSegmentation')
     regions = page.get_AllRegions(classes=['Text'])
     page_image, page_coords, _ = self.workspace.image_from_page(
         page, page_id)
     for region in regions:
         LOG.info('Sanitizing region "%s"', region.id)
         lines = region.get_TextLine()
         if not lines:
             LOG.warning('Page "%s" region "%s" contains no textlines',
                         page_id, region.id)
             continue
         # rasterize all lines into one label mask, remembering their boxes:
         mask = np.zeros((page_image.height, page_image.width),
                         dtype=np.uint8)
         boxes = []
         for line in lines:
             outline = coordinates_of_segment(line, page_image,
                                              page_coords)
             boxes.append(xywh_from_polygon(outline))
             rows, cols = outline[:, 1], outline[:, 0]
             mask[draw.polygon(rows, cols, mask.shape)] = 1
             mask[draw.polygon_perimeter(rows, cols, mask.shape)] = 1
         # estimate scale (height of the structuring element):
         heights = np.array([box['h'] for box in boxes])
         tops = np.array([box['y'] for box in boxes])
         scale = int(np.max(heights))
         if len(lines) > 1:
             # if interline spacing is larger than line height, use this
             by_top = np.argsort(tops)
             sorted_tops = tops[by_top]
             sorted_bottoms = sorted_tops + heights[by_top]
             gaps = sorted_tops[1:] - sorted_bottoms[:-1]
             scale = max(scale, int(np.max(gaps)))
         # close labels (padding protects the edges during closing):
         padded = np.pad(mask, scale)
         closed = morphology.binary_closing(padded, np.ones((scale, 1)))
         mask = np.array(closed, dtype=np.uint8)
         mask = mask[scale:-scale, scale:-scale]  # remove padding again
         # extend margins (to ensure simplified hull polygon is outside children):
         mask = filters.maximum_filter(mask, 3)  # 1px in each direction
         # find outer contour (parts) and their areas:
         contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL,
                                        cv2.CHAIN_APPROX_SIMPLE)
         areas = [cv2.contourArea(contour) for contour in contours]
         total_area = sum(areas)
         if not total_area:
             # ignore if too small
             LOG.warning('Zero contour area in region "%s"', region.id)
             continue
         # pick the single sizeable contour and convert to absolute:
         new_outline = None
         for index, (contour, area) in enumerate(zip(contours, areas)):
             if area / total_area < 0.1:
                 LOG.warning(
                     'Ignoring contour %d too small (%d/%d) in region "%s"',
                     index, area, total_area, region.id)
                 continue
             # simplify shape, then repair it (simplification can produce
             # invalid, self-intersecting polygons); points are already x,y
             hull = make_valid(Polygon(contour[:, 0, ::]).simplify(1))
             vertices = hull.exterior.coords[:-1]  # keep open
             if len(vertices) < 4:
                 LOG.warning(
                     'Ignoring contour %d less than 4 points in region "%s"',
                     index, region.id)
                 continue
             if new_outline is not None:
                 # a second usable contour means the hull is not contiguous
                 LOG.error(
                     'Skipping region "%s" due to non-contiguous contours',
                     region.id)
                 new_outline = None
                 break
             new_outline = coordinates_for_segment(
                 vertices, page_image, page_coords)
         if new_outline is None:
             continue
         LOG.info('Using new coordinates for region "%s"', region.id)
         region.get_Coords().set_points(
             points_from_polygon(new_outline))
Exemplo n.º 14
0
    def process(self):
        """Extract region images from the workspace.

        Open and deserialize PAGE input files and their respective images,
        then iterate over the element hierarchy down to the region level.

        Extract an image for each region (which depending on the workflow
        can already be deskewed, dewarped, binarized etc.), cropped to its
        minimal bounding box, and masked by the coordinate polygon outline.
        If ``transparency`` is true, then also add an alpha channel which is
        fully transparent outside of the mask.

        Create a JSON file with:
        * the IDs of the region and its parents,
        * the region's coordinates relative to the region image,
        * the region's absolute coordinates,
        * the (text) region's text content (if any),
        * the (text) region's TextStyle (if any),
        * the (text) region's @production (if any),
        * the (text) region's @readingDirection (if any),
        * the (text) region's @textLineOrder (if any),
        * the (text) region's @primaryScript (if any),
        * the (text) region's @primaryLanguage (if any),
        * the region's AlternativeImage/@comments (features),
        * the region's element class,
        * the region's @type,
        * the page's @type,
        * the page's DPI value.

        Write all files in the directory of the output file group, named like so:
        * ID + '.raw.png': region image (if the workflow provides raw images)
        * ID + '.bin.png': region image (if the workflow provides binarized images)
        * ID + '.nrm.png': region image (if the workflow provides grayscale-normalized images)
        * ID + '.json': region metadata.

        (The image file extension follows the ``mimetype`` parameter.)
        """
        LOG = getLogger('processor.ExtractRegions')
        assert_file_grp_cardinality(self.input_file_grp, 1)
        assert_file_grp_cardinality(self.output_file_grp, 1)
        transparency = self.parameter['transparency']
        mimetype = self.parameter['mimetype']
        # TextStyle attributes copied verbatim into the description
        style_attributes = ('fontFamily', 'fontSize', 'xHeight', 'kerning',
                            'serif', 'monospace', 'bold', 'italic',
                            'smallCaps', 'letterSpaced', 'strikethrough',
                            'underlined', 'underlineStyle', 'subscript',
                            'superscript')
        # pylint: disable=attribute-defined-outside-init
        for n, input_file in enumerate(self.input_files):
            page_id = input_file.pageId or input_file.ID
            LOG.info("INPUT FILE %i / %s", n, page_id)
            pcgts = page_from_file(self.workspace.download_file(input_file))
            self.add_metadata(pcgts)
            page = pcgts.get_Page()
            page_image, page_coords, page_image_info = self.workspace.image_from_page(
                page, page_id, transparency=transparency)
            if page_image_info.resolution != 1:
                dpi = page_image_info.resolution
                if page_image_info.resolutionUnit == 'cm':
                    # convert pixels per cm to pixels per inch
                    dpi = round(dpi * 2.54)
            else:
                # a resolution of 1 is treated as "unknown"
                dpi = None
            ptype = page.get_type()
            file_id = make_file_id(input_file, self.output_file_grp)

            # map each region class name to the page's regions of that class
            regions = {
                name: getattr(page, 'get_' + name)()
                for name in CLASSES
                # skip the empty name, the Border and subtype entries
                if name and name != 'Border' and ':' not in name
            }
            for rtype, rlist in regions.items():
                for region in rlist:
                    description = {
                        'region.ID': region.id,
                        'region.type': rtype
                    }
                    region_image, region_coords = self.workspace.image_from_segment(
                        region,
                        page_image,
                        page_coords,
                        transparency=transparency)
                    if rtype in ['TextRegion', 'ChartRegion', 'GraphicRegion']:
                        subrtype = region.get_type()
                    else:
                        subrtype = None
                    description['subtype'] = subrtype
                    description['coords_rel'] = coordinates_of_segment(
                        region, region_image, region_coords).tolist()
                    description['coords_abs'] = polygon_from_points(
                        region.get_Coords().points)
                    # FIX: region class names are CLASSES keys such as
                    # 'TextRegion'; the former comparison against 'text'
                    # never matched, so no text metadata was ever written
                    if rtype == 'TextRegion':
                        rtext = region.get_TextEquiv()
                        if rtext:
                            description['region.text'] = rtext[0].Unicode
                        else:
                            description['region.text'] = ''
                        rstyle = region.get_TextStyle() or page.get_TextStyle()
                        if rstyle:
                            description['region.style'] = {
                                name: getattr(rstyle, name)
                                for name in style_attributes
                            }
                        description['production'] = region.get_production()
                        # fall back to the page-level value where the
                        # region itself has none
                        description['readingDirection'] = (
                            region.get_readingDirection()
                            or page.get_readingDirection())
                        description['textLineOrder'] = (
                            region.get_textLineOrder()
                            or page.get_textLineOrder())
                        description['primaryScript'] = (
                            region.get_primaryScript()
                            or page.get_primaryScript())
                        description['primaryLanguage'] = (
                            region.get_primaryLanguage()
                            or page.get_primaryLanguage())
                    features = region_coords['features']
                    description['features'] = features
                    description['DPI'] = dpi
                    description['page.ID'] = page_id
                    description['page.type'] = ptype
                    description['file_grp'] = self.input_file_grp
                    description[
                        'METS.UID'] = self.workspace.mets.unique_identifier
                    if 'binarized' in features:
                        extension = '.bin'
                    elif 'grayscale_normalized' in features:
                        extension = '.nrm'
                    else:
                        extension = '.raw'

                    region_file_id = file_id + '_' + region.id
                    # FIX: save_image_file takes ``page_id`` (not ``pageId``)
                    file_path = self.workspace.save_image_file(
                        region_image,
                        region_file_id + extension,
                        self.output_file_grp,
                        page_id=input_file.pageId,
                        mimetype=mimetype)
                    # FIX: include the region ID, so the JSON files of
                    # several regions on one page get distinct METS file IDs
                    self.workspace.add_file(
                        ID=region_file_id + '.json',
                        file_grp=self.output_file_grp,
                        local_filename=file_path.replace(
                            extension + MIME_TO_EXT[mimetype], '.json'),
                        pageId=input_file.pageId,
                        mimetype='application/json',
                        content=json.dumps(description))
Exemplo n.º 15
0
    def process(self):
        """Extract textline images and texts from the workspace.

        Open and deserialize PAGE input files and their respective images,
        then iterate over the element hierarchy down to the line level
        (text regions of the page and text regions inside table regions).

        Extract an image for each textline (which depending on the workflow
        can already be deskewed, dewarped, binarized etc.), cropped to its
        minimal bounding box, and masked by the coordinate polygon outline.
        If ``transparency`` is true, then also add an alpha channel which is
        fully transparent outside of the mask.

        Create a JSON file with:
        * the IDs of the textline and its parents,
        * the textline's text content,
        * the textline's coordinates relative to the line image,
        * the textline's absolute coordinates,
        * the textline's TextStyle (if any),
        * the textline's @production (if any),
        * the textline's @readingDirection (if any),
        * the textline's @primaryScript (if any),
        * the textline's @primaryLanguage (if any),
        * the textline's AlternativeImage/@comments (features),
        * the parent textregion's @type,
        * the page's @type,
        * the page's DPI value.

        Create a plain text file for the text content, too.

        Write all files in the directory of the output file group, named like so:
        * ID + '.raw.png': line image (if the workflow provides raw images)
        * ID + '.bin.png': line image (if the workflow provides binarized images)
        * ID + '.nrm.png': line image (if the workflow provides grayscale-normalized images)
        * ID + '.json': line metadata.
        * ID + '.gt.txt': line text.

        (This is intended for training and evaluation of OCR models.)
        """
        transparency = self.parameter['transparency']
        mimetype = self.parameter['mimetype']
        # TextStyle attributes copied verbatim into the description
        style_attributes = ('fontFamily', 'fontSize', 'xHeight', 'kerning',
                            'serif', 'monospace', 'bold', 'italic',
                            'smallCaps', 'letterSpaced', 'strikethrough',
                            'underlined', 'underlineStyle', 'subscript',
                            'superscript')
        # pylint: disable=attribute-defined-outside-init
        for n, input_file in enumerate(self.input_files):
            file_id = input_file.ID.replace(self.input_file_grp,
                                            self.output_file_grp)
            if file_id == input_file.ID:
                # input ID does not contain the input fileGrp name,
                # so build a fresh ID from the output fileGrp and index
                file_id = concat_padded(self.output_file_grp, n)
            page_id = input_file.pageId or input_file.ID
            LOG.info("INPUT FILE %i / %s", n, page_id)
            pcgts = page_from_file(self.workspace.download_file(input_file))
            page = pcgts.get_Page()
            metadata = pcgts.get_Metadata()  # ensured by from_file()
            # record this processing step and its parameters
            metadata.add_MetadataItem(
                MetadataItemType(
                    type_="processingStep",
                    name=self.ocrd_tool['steps'][0],
                    value=TOOL,
                    Labels=[
                        LabelsType(externalModel="ocrd-tool",
                                   externalId="parameters",
                                   Label=[
                                       LabelType(type_=name,
                                                 value=self.parameter[name])
                                       for name in self.parameter.keys()
                                   ])
                    ]))
            page_image, page_coords, page_image_info = self.workspace.image_from_page(
                page, page_id, transparency=transparency)
            if page_image_info.resolution != 1:
                dpi = page_image_info.resolution
                if page_image_info.resolutionUnit == 'cm':
                    # convert pixels per cm to pixels per inch
                    dpi = round(dpi * 2.54)
            else:
                # a resolution of 1 is treated as "unknown"
                dpi = None
            ptype = page.get_type()

            # FIX: collect the regions in a real list -- the former
            # itertools.chain object was always truthy, so the emptiness
            # check below could never trigger
            regions = list(page.get_TextRegion())
            for table in page.get_TableRegion():
                regions.extend(table.get_TextRegion())
            if not regions:
                LOG.warning("Page '%s' contains no text regions", page_id)
            for region in regions:
                region_image, region_coords = self.workspace.image_from_segment(
                    region,
                    page_image,
                    page_coords,
                    transparency=transparency)
                rtype = region.get_type()

                lines = region.get_TextLine()
                if not lines:
                    LOG.warning("Region '%s' contains no text lines",
                                region.id)
                for line in lines:
                    line_image, line_coords = self.workspace.image_from_segment(
                        line,
                        region_image,
                        region_coords,
                        transparency=transparency)
                    lpolygon_rel = coordinates_of_segment(
                        line, line_image, line_coords).tolist()
                    lpolygon_abs = polygon_from_points(
                        line.get_Coords().points)
                    ltext = line.get_TextEquiv()
                    if not ltext:
                        LOG.warning("Line '%s' contains no text conent",
                                    line.id)
                        ltext = ''
                    else:
                        # use the first TextEquiv only
                        ltext = ltext[0].Unicode
                    lstyle = line.get_TextStyle() or region.get_TextStyle()
                    if lstyle:
                        lstyle = {
                            name: getattr(lstyle, name)
                            for name in style_attributes
                        }
                    lfeatures = line_coords['features']
                    # attributes fall back from line to region (to page)
                    description = {
                        'line.ID': line.id,
                        'text': ltext,
                        'style': lstyle,
                        'production': (line.get_production()
                                       or region.get_production()),
                        'readingDirection': (line.get_readingDirection()
                                             or region.get_readingDirection()
                                             or page.get_readingDirection()),
                        'primaryScript': (line.get_primaryScript()
                                          or region.get_primaryScript()
                                          or page.get_primaryScript()),
                        'primaryLanguage': (line.get_primaryLanguage()
                                            or region.get_primaryLanguage()
                                            or page.get_primaryLanguage()),
                        'features': lfeatures,
                        'DPI': dpi,
                        'coords_rel': lpolygon_rel,
                        'coords_abs': lpolygon_abs,
                        'region.ID': region.id,
                        'region.type': rtype,
                        'page.ID': page_id,
                        'page.type': ptype,
                        'file_grp': self.input_file_grp,
                        'METS.UID': self.workspace.mets.unique_identifier
                    }
                    if 'binarized' in lfeatures:
                        extension = '.bin'
                    elif 'grayscale_normalized' in lfeatures:
                        extension = '.nrm'
                    else:
                        extension = '.raw'

                    file_path = self.workspace.save_image_file(
                        line_image,
                        file_id + '_' + region.id + '_' + line.id + extension,
                        self.output_file_grp,
                        page_id=page_id,
                        mimetype=mimetype)
                    # derive the sidecar file names from the image path
                    file_path = file_path.replace(
                        extension + MIME_TO_EXT[mimetype], '.json')
                    # FIX: close (and thereby flush) the JSON file
                    # deterministically instead of leaking the handle
                    with open(file_path, 'w') as json_file:
                        json.dump(description, json_file)
                    file_path = file_path.replace('.json', '.gt.txt')
                    with open(file_path, 'wb') as f:
                        f.write((ltext + '\n').encode('utf-8'))
Exemplo n.º 16
0
    def process(self):
        """Extract glyph images and texts from the workspace.

        Open and deserialize PAGE input files and their respective images,
        then iterate over the element hierarchy down to the glyph level.

        Extract an image for each glyph (which depending on the workflow
        can already be deskewed, dewarped, binarized etc.), cropped to its
        minimal bounding box, and masked by the coordinate polygon outline.
        Apply ``feature_filter`` (a comma-separated list of image features,
        cf. :py:func:`ocrd.workspace.Workspace.image_from_page`) to skip
        specific features when retrieving derived images.
        If ``transparency`` is true, then also add an alpha channel which is
        fully transparent outside of the mask.

        Create a JSON file with:
        * the IDs of the glyph and its parents,
        * the glyph's text content,
        * the glyph's coordinates relative to the glyph image,
        * the glyph's absolute coordinates,
        * the glyph's TextStyle (if any, else inherited from its parents),
        * the glyph's @production (if any, else inherited from its parents),
        * the glyph's @ligature (if any),
        * the glyph's @symbol (if any),
        * the glyph's @script (if any, else the parents' @primaryScript),
        * the glyph's AlternativeImage/@comments (features),
        * the parent textregion's @type,
        * the page's @type,
        * the page's DPI value.

        Create a plain text file for the text content, too.

        Write all files in the directory of the output file group, named like so:
        * ID + '.raw.png': glyph image (if the workflow provides raw images)
        * ID + '.bin.png': glyph image (if the workflow provides binarized images)
        * ID + '.nrm.png': glyph image (if the workflow provides grayscale-normalized images)
        * ID + '.json': glyph metadata.
        * ID + '.gt.txt': glyph text.

        (This is intended for training and evaluation of script detection models.)
        """
        LOG = getLogger('processor.ExtractGlyph')
        assert_file_grp_cardinality(self.input_file_grp, 1)
        assert_file_grp_cardinality(self.output_file_grp, 1)
        # the same image retrieval options are used on every hierarchy level
        image_kwargs = {
            'feature_filter': self.parameter['feature_filter'],
            'transparency': self.parameter['transparency'],
        }
        # TextStyle attributes copied verbatim into the JSON description
        style_attributes = (
            'fontFamily', 'fontSize', 'xHeight', 'kerning', 'serif',
            'monospace', 'bold', 'italic', 'smallCaps', 'letterSpaced',
            'strikethrough', 'underlined', 'underlineStyle', 'subscript',
            'superscript')
        # pylint: disable=attribute-defined-outside-init
        for n, input_file in enumerate(self.input_files):
            page_id = input_file.pageId or input_file.ID
            LOG.info("INPUT FILE %i / %s", n, page_id)
            pcgts = page_from_file(self.workspace.download_file(input_file))
            self.add_metadata(pcgts)
            page = pcgts.get_Page()
            page_image, page_coords, page_image_info = self.workspace.image_from_page(
                page, page_id, **image_kwargs)
            if page_image_info.resolution != 1:
                dpi = page_image_info.resolution
                if page_image_info.resolutionUnit == 'cm':
                    # convert pixels per centimetre to pixels per inch
                    dpi = round(dpi * 2.54)
            else:
                dpi = None
            ptype = page.get_type()
            # only depends on the input file and output group,
            # so derive it once per page instead of once per glyph
            file_id = make_file_id(input_file, self.output_file_grp)

            # top-level text regions plus the text regions of all tables;
            # materialized as a list, because a bare chain iterator is
            # always truthy and would never trigger the warning below
            regions = list(
                itertools.chain.from_iterable([page.get_TextRegion()] + [
                    subregion.get_TextRegion()
                    for subregion in page.get_TableRegion()
                ]))
            if not regions:
                LOG.warning("Page '%s' contains no text regions", page_id)
            for region in regions:
                region_image, region_coords = self.workspace.image_from_segment(
                    region, page_image, page_coords, **image_kwargs)
                rtype = region.get_type()

                lines = region.get_TextLine()
                if not lines:
                    LOG.warning("Region '%s' contains no text lines",
                                region.id)
                for line in lines:
                    line_image, line_coords = self.workspace.image_from_segment(
                        line, region_image, region_coords, **image_kwargs)
                    words = line.get_Word()
                    if not words:
                        LOG.warning("Line '%s' contains no words", line.id)
                    for word in words:
                        word_image, word_coords = self.workspace.image_from_segment(
                            word, line_image, line_coords, **image_kwargs)
                        glyphs = word.get_Glyph()
                        if not glyphs:
                            LOG.warning("Word '%s' contains no glyphs",
                                        word.id)
                        for glyph in glyphs:
                            glyph_image, glyph_coords = self.workspace.image_from_segment(
                                glyph, word_image, word_coords, **image_kwargs)
                            lpolygon_rel = coordinates_of_segment(
                                glyph, glyph_image, glyph_coords).tolist()
                            lpolygon_abs = polygon_from_points(
                                glyph.get_Coords().points)
                            ltext = glyph.get_TextEquiv()
                            if not ltext:
                                LOG.warning(
                                    "Glyph '%s' contains no text content",
                                    glyph.id)
                                ltext = ''
                            else:
                                # use the first TextEquiv only
                                ltext = ltext[0].Unicode
                            # inherit the style from the closest annotated ancestor
                            lstyle = (glyph.get_TextStyle()
                                      or word.get_TextStyle()
                                      or line.get_TextStyle()
                                      or region.get_TextStyle())
                            if lstyle:
                                lstyle = {
                                    name: getattr(lstyle, name)
                                    for name in style_attributes
                                }
                            lfeatures = glyph_coords['features']
                            description = {
                                'glyph.ID': glyph.id,
                                'text': ltext,
                                'style': lstyle,
                                'production': (glyph.get_production()
                                               or word.get_production()
                                               or line.get_production()
                                               or region.get_production()),
                                'script': (glyph.get_script()
                                           or word.get_primaryScript()
                                           or line.get_primaryScript()
                                           or region.get_primaryScript()
                                           or page.get_primaryScript()),
                                'ligature': glyph.get_ligature(),
                                'symbol': glyph.get_symbol(),
                                'features': lfeatures,
                                'DPI': dpi,
                                'coords_rel': lpolygon_rel,
                                'coords_abs': lpolygon_abs,
                                'word.ID': word.id,
                                'line.ID': line.id,
                                'region.ID': region.id,
                                'region.type': rtype,
                                'page.ID': page_id,
                                'page.type': ptype,
                                'file_grp': self.input_file_grp,
                                'METS.UID': self.workspace.mets.unique_identifier
                            }
                            # file name infix reflecting the image's processing state
                            if 'binarized' in lfeatures:
                                extension = '.bin'
                            elif 'grayscale_normalized' in lfeatures:
                                extension = '.nrm'
                            else:
                                extension = '.raw'

                            file_path = self.workspace.save_image_file(
                                glyph_image,
                                file_id + '_' + region.id + '_' + line.id +
                                '_' + word.id + '_' + glyph.id + extension,
                                self.output_file_grp,
                                page_id=page_id,
                                mimetype=self.parameter['mimetype'])
                            # derive the sibling file names from the image path
                            json_path = file_path.replace(
                                extension +
                                MIME_TO_EXT[self.parameter['mimetype']],
                                '.json')
                            # close the handle deterministically, so the
                            # metadata is flushed before moving on
                            with open(json_path, 'w') as json_file:
                                json.dump(description, json_file)
                            text_path = json_path.replace('.json', '.gt.txt')
                            with open(text_path, 'wb') as text_file:
                                text_file.write((ltext + '\n').encode('utf-8'))
Exemplo n.º 17
0
    def _process_element(self,
                         element,
                         ignore,
                         image,
                         coords,
                         element_id,
                         file_id,
                         page_id,
                         zoom=1.0,
                         rogroup=None):
        """Add PAGE layout elements by segmenting an image.

        Given a PageType, TableRegionType or TextRegionType ``element``, and
        a corresponding binarized PIL.Image object ``image`` with coordinate
        metadata ``coords``, run line segmentation with Ocropy.

        If operating on the full page (or table), then also detect horizontal
        and vertical separators, and aggregate the lines into text regions
        afterwards.

        Add the resulting sub-segments to the parent ``element``.

        If ``ignore`` is not empty, then first suppress all foreground components
        in any of those segments' coordinates during segmentation, and if also
        in full page/table mode, then combine all separators among them with the
        newly detected separators to guide region segmentation.

        Arguments:
            element: the PAGE segment to be sub-segmented (modified in place).
            ignore: sequence of already existing segments inside ``element``
                whose area is masked during line segmentation (it is indexed
                and measured with ``len``, so it must not be a plain iterator).
            image: PIL.Image of ``element``; nothing is done if it has zero
                width or height.
            coords: coordinate metadata of ``image``; its ``features`` entry
                is extended by ``,clipped`` for the derived image.
            element_id: identifier used as prefix for all new segment IDs
                and in log messages.
            file_id: identifier prefix for the derived (clipped) image file.
            page_id: page identifier passed on when saving the derived image.
            zoom: scale factor passed to the segmentation helpers and used
                to scale the minimum contour areas.
            rogroup: optional reading order group; if given, newly added
                text regions (and ignored non-separator regions directly
                below ``element``) are appended to it.

        Returns:
            None. All results are annotated on ``element`` (and ``rogroup``).
        """
        LOG = getLogger('processor.OcropySegment')
        if not image.width or not image.height:
            LOG.warning("Skipping '%s' with zero size", element_id)
            return
        element_array = pil2array(image)
        # NOTE: the builtin bool/int are used instead of the np.bool/np.int
        # aliases, which were mere synonyms and have been removed in NumPy 1.24
        element_bin = np.array(element_array <= midrange(element_array), bool)
        sep_bin = np.zeros_like(element_bin, bool)
        ignore_labels = np.zeros_like(element_bin, int)
        for i, segment in enumerate(ignore):
            LOG.debug('masking foreground of %s "%s" for "%s"',
                      type(segment).__name__[:-4], segment.id, element_id)
            # mark these segments (e.g. separator regions, tables, images)
            # for workflows where they have been detected already;
            # these will be:
            # - ignored during text line segmentation (but not h/v-line detection)
            # - kept and reading-ordered during region segmentation (but not seps)
            segment_polygon = coordinates_of_segment(segment, image, coords)
            # If segment_polygon lies outside of element (causing
            # negative/above-max indices), either fully or partially,
            # then this will silently ignore them. The caller does
            # not need to concern herself with this.
            if isinstance(segment, SeparatorRegionType):
                sep_bin[draw.polygon(segment_polygon[:, 1],
                                     segment_polygon[:, 0],
                                     sep_bin.shape)] = True
            ignore_labels[draw.polygon(
                segment_polygon[:, 1], segment_polygon[:, 0],
                ignore_labels.shape)] = i + 1  # mapped back for RO
        if isinstance(element, PageType):
            element_name = 'page'
            fullpage = True
            report = check_page(element_bin, zoom)
        elif isinstance(element, TableRegionType) or (
                # sole/congruent text region of a table region?
                element.id.endswith('_text')
                and isinstance(element.parent_object_, TableRegionType)):
            element_name = 'table'
            fullpage = True
            report = check_region(element_bin, zoom)
        else:
            element_name = 'region'
            fullpage = False
            report = check_region(element_bin, zoom)
        LOG.info('computing line segmentation for %s "%s"', element_name,
                 element_id)
        # TODO: we should downscale if DPI is large enough to save time
        try:
            if report:
                # a non-empty plausibility report is handled like a
                # segmentation failure (see fallback below)
                raise Exception(report)
            line_labels, hlines, vlines, images, colseps, scale = compute_segmentation(
                # suppress separators and ignored regions for textline estimation
                # but keep them for h/v-line detection (in fullpage mode):
                element_bin,
                seps=(sep_bin + ignore_labels) > 0,
                zoom=zoom,
                fullpage=fullpage,
                spread_dist=round(self.parameter['spread'] / zoom * 300 /
                                  72),  # in pt
                # these are ignored when not in fullpage mode:
                maxcolseps=self.parameter['maxcolseps'],
                maxseps=self.parameter['maxseps'],
                maximages=self.parameter['maximages']
                if element_name != 'table' else 0,
                csminheight=self.parameter['csminheight'],
                hlminwidth=self.parameter['hlminwidth'])
        except Exception as err:
            if isinstance(element, TextRegionType):
                LOG.error('Cannot line-segment region "%s": %s', element_id,
                          err)
                # as a fallback, add a single text line comprising the whole region:
                element.add_TextLine(
                    TextLineType(id=element_id + "_line",
                                 Coords=element.get_Coords()))
            else:
                LOG.error('Cannot line-segment %s "%s": %s', element_name,
                          element_id, err)
            return

        LOG.info('Found %d text lines for %s "%s"',
                 len(np.unique(line_labels)) - 1, element_name, element_id)
        # post-process line labels
        if isinstance(element, (PageType, TableRegionType)):
            # aggregate text lines to text regions
            try:
                # pass ignored regions as "line labels with initial assignment",
                # i.e. identical line and region labels
                # to detect their reading order among the others
                # (these cannot be split or grouped together with other regions)
                line_labels = np.where(line_labels, line_labels + len(ignore),
                                       ignore_labels)
                # suppress separators/images in fg and try to use for partitioning slices
                sepmask = np.maximum(np.maximum(hlines, vlines),
                                     np.maximum(sep_bin, images))
                region_labels = lines2regions(
                    element_bin,
                    line_labels,
                    rlabels=ignore_labels,
                    sepmask=np.maximum(sepmask, colseps),  # add bg
                    # decide horizontal vs vertical cut when gaps of similar size
                    prefer_vertical=not isinstance(element, TableRegionType),
                    gap_height=self.parameter['gap_height'],
                    gap_width=self.parameter['gap_width'],
                    scale=scale,
                    zoom=zoom)
                LOG.info('Found %d text regions for %s "%s"',
                         len(np.unique(region_labels)) - 1, element_name,
                         element_id)
            except Exception as err:
                LOG.error('Cannot region-segment %s "%s": %s', element_name,
                          element_id, err)
                # fallback: keep ignored regions, and put all new lines
                # into one single new region
                region_labels = np.where(line_labels > len(ignore),
                                         1 + len(ignore), line_labels)

            # prepare reading order group index
            # (stays None for non-indexed groups or without any group)
            index = None
            if rogroup and isinstance(
                    rogroup, (OrderedGroupType, OrderedGroupIndexedType)):
                index = 0
                # start counting from largest existing index
                for elem in (rogroup.get_RegionRefIndexed() +
                             rogroup.get_OrderedGroupIndexed() +
                             rogroup.get_UnorderedGroupIndexed()):
                    if elem.index >= index:
                        index = elem.index + 1
            # find contours around region labels (can be non-contiguous):
            region_no = 0
            for region_label in np.unique(region_labels):
                if not region_label:
                    continue  # no bg
                region_mask = region_labels == region_label
                region_line_labels = line_labels * region_mask
                region_line_labels0 = np.setdiff1d(region_line_labels, [0])
                if not np.all(region_line_labels0 > len(ignore)):
                    # existing region from `ignore` merely to be ordered
                    # (no new region, no actual text lines)
                    region_line_labels0 = np.intersect1d(
                        region_line_labels0, ignore_labels)
                    assert len(region_line_labels0) == 1, \
                        "region label %d has both existing regions and new lines (%s)" % (
                            region_label, str(region_line_labels0))
                    region = ignore[region_line_labels0[0] - 1]
                    if rogroup and region.parent_object_ == element and not isinstance(
                            region, SeparatorRegionType):
                        index = page_add_to_reading_order(
                            rogroup, region.id, index)
                    LOG.debug('Region label %d is for ignored region "%s"',
                              region_label, region.id)
                    continue
                # normal case: new lines inside new regions
                # remove binary-empty labels, and re-order locally
                order = morph.reading_order(region_line_labels)
                order[np.setdiff1d(region_line_labels0,
                                   element_bin * region_line_labels)] = 0
                region_line_labels = order[region_line_labels]
                # avoid horizontal gaps
                region_line_labels = hmerge_line_seeds(element_bin,
                                                       region_line_labels,
                                                       scale,
                                                       seps=np.maximum(
                                                           sepmask, colseps))
                region_mask |= region_line_labels > 0
                # find contours for region (can be non-contiguous)
                regions, _ = masks2polygons(
                    region_mask * region_label,
                    element_bin,
                    '%s "%s"' % (element_name, element_id),
                    min_area=6000 / zoom / zoom,
                    simplify=ignore_labels * ~(sep_bin))
                # find contours for lines (can be non-contiguous)
                lines, _ = masks2polygons(region_line_labels,
                                          element_bin,
                                          'region "%s"' % element_id,
                                          min_area=640 / zoom / zoom)
                # create new lines in new regions (allocating by intersection)
                line_polys = [Polygon(polygon) for _, polygon in lines]
                for _, region_polygon in regions:
                    region_poly = prep(Polygon(region_polygon))
                    # convert back to absolute (page) coordinates:
                    region_polygon = coordinates_for_segment(
                        region_polygon, image, coords)
                    region_polygon = polygon_for_parent(
                        region_polygon, element)
                    if region_polygon is None:
                        LOG.warning(
                            'Ignoring extant region contour for region label %d',
                            region_label)
                        continue
                    # annotate result:
                    region_no += 1
                    region_id = element_id + "_region%04d" % region_no
                    LOG.debug('Region label %d becomes ID "%s"', region_label,
                              region_id)
                    region = TextRegionType(
                        id=region_id,
                        Coords=CoordsType(
                            points=points_from_polygon(region_polygon)))
                    # find out which line (contours) belong to which region (contours)
                    line_no = 0
                    for i, line_poly in enumerate(line_polys):
                        if not region_poly.intersects(line_poly):  # .contains
                            continue
                        line_label, line_polygon = lines[i]
                        # convert back to absolute (page) coordinates:
                        line_polygon = coordinates_for_segment(
                            line_polygon, image, coords)
                        line_polygon = polygon_for_parent(line_polygon, region)
                        if line_polygon is None:
                            LOG.warning(
                                'Ignoring extant line contour for region label %d line label %d',
                                region_label, line_label)
                            continue
                        # annotate result:
                        line_no += 1
                        line_id = region_id + "_line%04d" % line_no
                        LOG.debug('Line label %d becomes ID "%s"', line_label,
                                  line_id)
                        line = TextLineType(
                            id=line_id,
                            Coords=CoordsType(
                                points=points_from_polygon(line_polygon)))
                        region.add_TextLine(line)
                    # if the region has received text lines, keep it
                    if region.get_TextLine():
                        element.add_TextRegion(region)
                        LOG.info('Added region "%s" with %d lines for %s "%s"',
                                 region_id, line_no, element_name, element_id)
                        if rogroup:
                            index = page_add_to_reading_order(
                                rogroup, region.id, index)
            # add additional image/non-text regions from compute_segmentation
            # (e.g. drop-capitals or images) ...
            image_labels, num_images = morph.label(images)
            LOG.info('Found %d large non-text/image regions for %s "%s"',
                     num_images, element_name, element_id)
            # find contours around region labels (can be non-contiguous):
            image_polygons, _ = masks2polygons(
                image_labels, element_bin,
                '%s "%s"' % (element_name, element_id))
            for image_label, polygon in image_polygons:
                # convert back to absolute (page) coordinates:
                region_polygon = coordinates_for_segment(
                    polygon, image, coords)
                region_polygon = polygon_for_parent(region_polygon, element)
                if region_polygon is None:
                    LOG.warning(
                        'Ignoring extant region contour for image label %d',
                        image_label)
                    continue
                # the region counter is shared between text, image and
                # separator regions
                region_no += 1
                # annotate result:
                region_id = element_id + "_image%04d" % region_no
                element.add_ImageRegion(
                    ImageRegionType(
                        id=region_id,
                        Coords=CoordsType(
                            points=points_from_polygon(region_polygon))))
            # split rulers into separator regions:
            hline_labels, num_hlines = morph.label(hlines)
            vline_labels, num_vlines = morph.label(vlines)
            LOG.info('Found %d/%d h/v-lines for %s "%s"', num_hlines,
                     num_vlines, element_name, element_id)
            # find contours around region labels (can be non-contiguous):
            hline_polygons, _ = masks2polygons(
                hline_labels, element_bin,
                '%s "%s"' % (element_name, element_id))
            vline_polygons, _ = masks2polygons(
                vline_labels, element_bin,
                '%s "%s"' % (element_name, element_id))
            for _, polygon in hline_polygons + vline_polygons:
                # convert back to absolute (page) coordinates:
                region_polygon = coordinates_for_segment(
                    polygon, image, coords)
                region_polygon = polygon_for_parent(region_polygon, element)
                if region_polygon is None:
                    LOG.warning('Ignoring extant region contour for separator')
                    continue
                # annotate result:
                region_no += 1
                region_id = element_id + "_sep%04d" % region_no
                element.add_SeparatorRegion(
                    SeparatorRegionType(
                        id=region_id,
                        Coords=CoordsType(
                            points=points_from_polygon(region_polygon))))
            # annotate a text/image-separated image
            element_array[sepmask] = np.amax(element_array)  # clip to white/bg
            image_clipped = array2pil(element_array)
            file_path = self.workspace.save_image_file(
                image_clipped,
                file_id + '.IMG-CLIP',
                page_id=page_id,
                file_grp=self.output_file_grp)
            element.add_AlternativeImage(
                AlternativeImageType(filename=file_path,
                                     comments=coords['features'] + ',clipped'))
        else:
            # get mask from region polygon:
            region_polygon = coordinates_of_segment(element, image, coords)
            region_mask = np.zeros_like(element_bin, bool)
            region_mask[draw.polygon(region_polygon[:, 1],
                                     region_polygon[:, 0],
                                     region_mask.shape)] = True
            # ensure the new line labels do not extrude from the region:
            line_labels = line_labels * region_mask
            # find contours around labels (can be non-contiguous):
            line_polygons, _ = masks2polygons(line_labels,
                                              element_bin,
                                              'region "%s"' % element_id,
                                              min_area=640 / zoom / zoom)
            line_no = 0
            for line_label, polygon in line_polygons:
                # convert back to absolute (page) coordinates:
                line_polygon = coordinates_for_segment(polygon, image, coords)
                line_polygon = polygon_for_parent(line_polygon, element)
                if line_polygon is None:
                    LOG.warning(
                        'Ignoring extant line contour for line label %d',
                        line_label)
                    continue
                # annotate result:
                line_no += 1
                line_id = element_id + "_line%04d" % line_no
                element.add_TextLine(
                    TextLineType(
                        id=line_id,
                        Coords=CoordsType(
                            points=points_from_polygon(line_polygon))))
            if not sep_bin.any():
                return  # no derived image
            # annotate a text/image-separated image
            element_array[sep_bin] = np.amax(element_array)  # clip to white/bg
            image_clipped = array2pil(element_array)
            file_path = self.workspace.save_image_file(
                image_clipped,
                file_id + '.IMG-CLIP',
                page_id=page_id,
                file_grp=self.output_file_grp)
            # update PAGE (reference the image file):
            element.add_AlternativeImage(
                AlternativeImageType(filename=file_path,
                                     comments=coords['features'] + ',clipped'))
Exemplo n.º 18
0
    def image_from_segment(self,
                           segment,
                           parent_image,
                           parent_coords,
                           fill='background',
                           transparency=False,
                           feature_selector='',
                           feature_filter=''):
        """Extract an image for a PAGE-XML hierarchy segment from its parent's image.

        Given...

         * ``parent_image``, a PIL.Image of the parent, with
         * ``parent_coords``, a dict with information about ``parent_image``:
           - ``transform``: a Numpy array with an affine transform which
             converts from absolute coordinates to those relative to the image,
             i.e. after applying all operations (starting with the original image)
           - ``angle``: the rotation/reflection angle applied to the image so far,
           - ``features``: the AlternativeImage @comments for the image, i.e.
             names of all operations that lead up to this result, and
         * ``segment``, a PAGE segment object logically contained in it
           (i.e. TextRegionType / TextLineType / WordType / GlyphType),

        ...extract the segment's corresponding PIL.Image, either from
        AlternativeImage (if it exists), or producing a new image via
        cropping from ``parent_image`` (otherwise).

        If ``feature_selector`` and/or ``feature_filter`` is given, then
        select/filter among the cropped ``parent_image`` and the available
        AlternativeImages the last one which contains all of the selected,
        but none of the filtered features (i.e. @comments classes), or
        raise an error.

        (Required and produced features need not be in the same order, so
        ``feature_selector`` is merely a mask specifying Boolean AND, and
        ``feature_filter`` is merely a mask specifying Boolean OR.)

        Cropping uses a polygon mask (not just the bounding box rectangle).
        Areas outside the polygon will be filled according to ``fill``:

        - if ``background`` (the default),
          then fill with the median color of the image;
        - otherwise, use the given color, e.g. ``white`` or (255,255,255).

        Moreover, if ``transparency`` is true, and unless the image already
        has an alpha channel, then add an alpha channel which is fully opaque
        before cropping and rotating. (Thus, only the exposed areas will be
        transparent afterwards, for those that can interpret alpha channels).

        When cropping, compensate any @orientation angle annotated for the
        parent (from parent-level deskewing) by rotating the segment coordinates
        in an inverse transformation (i.e. translation to center, then passive
        rotation, and translation back).

        Regardless, if any @orientation angle is annotated for the segment
        (from segment-level deskewing), and the chosen image does not have
        the feature "deskewed" yet, and unless "deskewed" is being filtered,
        then rotate it - compensating for any previous ``angle``. (However,
        if @orientation is above the [-45°,45°] interval, then apply as much
        transposition as possible first, unless "rotated-90" / "rotated-180" /
        "rotated-270" is being filtered.)

        Return a tuple:

         * the extracted image,
         * a dictionary with information about the extracted image:
           - ``transform``: a Numpy array with an affine transform which
             converts from absolute coordinates to those relative to the image,
             i.e. after applying all parent operations, and then cropping to
             the segment's bounding box, and deskewing with the segment's
             orientation angle (if any)
           - ``angle``: the rotation/reflection angle applied to the image so far,
           - ``features``: the AlternativeImage @comments for the image, i.e.
             names of all operations that lead up to this result.

        (These can be used to create a new AlternativeImage, or passed down
         for calls on lower hierarchy levels.)

        Raise an ``Exception`` if the features of the resulting image do not
        contain every name in ``feature_selector``, or contain any name in
        ``feature_filter``.

        Example:

         * get a raw (colored) but already deskewed and cropped image:

           ``image, xywh = workspace.image_from_segment(region,
                 page_image, page_xywh,
                 feature_selector='deskewed,cropped',
                 feature_filter='binarized,grayscale_normalized')``
        """
        log = getLogger('ocrd.workspace.image_from_segment')
        # split the comma-separated constraints only once (dropping empty names):
        selected_features = [
            feature for feature in feature_selector.split(',') if feature
        ]
        filtered_features = [
            feature for feature in feature_filter.split(',') if feature
        ]
        # note: We should mask overlapping neighbouring segments here,
        # but finding the right clipping rules can be difficult if operating
        # on the raw (non-binary) image data alone: for each intersection, it
        # must be decided which one of either segment or neighbour to assign,
        # e.g. an ImageRegion which properly contains our TextRegion should be
        # completely ignored, but an ImageRegion which is properly contained
        # in our TextRegion should be completely masked, while partial overlap
        # may be more difficult to decide. On the other hand, on the binary image,
        # we can use connected component analysis to mask foreground areas which
        # originate in the neighbouring regions. But that would introduce either
        # the assumption that the input has already been binarized, or a dependency
        # on some ad-hoc binarization method. Thus, it is preferable to use
        # a dedicated processor for this (which produces clipped AlternativeImage
        # or reduced polygon coordinates).

        # get polygon outline of segment relative to parent image:
        segment_polygon = coordinates_of_segment(segment, parent_image,
                                                 parent_coords)
        # get relative bounding box:
        segment_bbox = bbox_from_polygon(segment_polygon)
        # get size of the segment in the parent image after cropping
        # (i.e. possibly different from size before rotation at the parent, but
        #  also possibly different from size after rotation below/AlternativeImage):
        segment_xywh = xywh_from_bbox(*segment_bbox)
        # create a mask from the segment polygon:
        segment_image = image_from_polygon(parent_image,
                                           segment_polygon,
                                           fill=fill,
                                           transparency=transparency)
        # recrop into segment rectangle:
        segment_image = crop_image(segment_image, box=segment_bbox)
        # subtract offset from parent in affine coordinate transform:
        # (consistent with image cropping)
        segment_coords = {
            'transform':
            shift_coordinates(parent_coords['transform'],
                              np.array([-segment_bbox[0], -segment_bbox[1]]))
        }

        if 'orientation' in segment.__dict__:
            # region angle: PAGE @orientation is defined clockwise,
            # whereas PIL/ndimage rotation is in mathematical direction:
            segment_coords['angle'] = -(segment.get_orientation() or 0)
        else:
            segment_coords['angle'] = 0
        if segment_coords['angle']:
            # @orientation is always absolute; if higher levels
            # have already rotated, then we must compensate:
            angle = segment_coords['angle'] - parent_coords['angle']
            # map angle from (-180,180] to [0,360], and partition into multiples of 90;
            # but avoid unnecessary large remainders, i.e. split symmetrically:
            orientation = (angle + 45) % 360
            orientation = orientation - (orientation % 90)
            skew = (angle % 360) - orientation
            skew = 180 - (180 - skew) % 360  # map to [-45,45]
            log.debug("segment '%s' has orientation=%d skew=%.2f", segment.id,
                      orientation, skew)
        else:
            orientation = 0
            skew = 0
        segment_coords['angle'] = parent_coords[
            'angle']  # nothing applied yet (depends on filters)

        # decide once whether transposition / deskewing is permitted at all
        # (the respective feature name in the filter vetoes the operation):
        may_transpose = bool(orientation) and (
            'rotated-%d' % orientation not in filtered_features)
        may_deskew = bool(skew) and 'deskewed' not in filtered_features

        transposition = None
        if may_transpose:
            # Transpose in affine coordinate transform:
            # (consistent with image transposition or AlternativeImage below)
            transposition = {
                90: Image.ROTATE_90,
                180: Image.ROTATE_180,
                270: Image.ROTATE_270
            }.get(orientation)  # no default
            segment_coords['transform'] = transpose_coordinates(
                segment_coords['transform'], transposition,
                np.array([0.5 * segment_xywh['w'], 0.5 * segment_xywh['h']]))
            segment_xywh['w'], segment_xywh[
                'h'] = adjust_canvas_to_transposition(
                    [segment_xywh['w'], segment_xywh['h']], transposition)
            segment_coords['angle'] += orientation
        if may_deskew:
            # Rotate around center in affine coordinate transform:
            # (consistent with image rotation or AlternativeImage below)
            segment_coords['transform'] = rotate_coordinates(
                segment_coords['transform'], skew,
                np.array([0.5 * segment_xywh['w'], 0.5 * segment_xywh['h']]))
            segment_coords['angle'] += skew

        # initialize AlternativeImage@comments classes from parent, except
        # for those operations that can apply on multiple hierarchy levels:
        segment_coords['features'] = ','.join([
            feature for feature in parent_coords['features'].split(',')
            if feature in
            ('binarized', 'grayscale_normalized', 'despeckled', 'dewarped')
        ])

        alternative_image = None
        alternative_images = segment.get_AlternativeImage()
        if alternative_images:
            # (e.g. from segment-level cropping, binarization, deskewing or despeckling)
            if selected_features or filtered_features:
                alternative_image = None
                # search from the end, because by convention we always append,
                # and among multiple satisfactory images we want the most recent:
                for alternative_image in reversed(alternative_images):
                    # a missing @comments attribute counts as "no features"
                    # (instead of failing on the membership tests below):
                    features = alternative_image.get_comments() or ''
                    if (all(feature in features
                            for feature in selected_features) and
                            not any(feature in features
                                    for feature in filtered_features)):
                        break
                    else:
                        alternative_image = None
            else:
                alternative_image = alternative_images[-1]
                features = alternative_image.get_comments() or ''
            if alternative_image:
                log.debug("Using AlternativeImage %d (%s) for segment '%s'",
                          alternative_images.index(alternative_image) + 1,
                          features, segment.id)
                segment_image = self._resolve_image_as_pil(
                    alternative_image.get_filename())
                segment_coords['features'] = features
        # transpose, if (still) necessary:
        if (may_transpose and
                'rotated-%d' % orientation not in segment_coords['features']):
            log.info("Transposing %s for segment '%s' by %d°",
                     "AlternativeImage" if alternative_image else "image",
                     segment.id, orientation)
            segment_image = transpose_image(segment_image, transposition)
            segment_coords['features'] += ',rotated-%d' % orientation
        if may_transpose:
            # FIXME we should enforce consistency here (i.e. split into transposition
            #       and minimal rotation)
            if not (segment_image.width == segment_xywh['w']
                    and segment_image.height == segment_xywh['h']):
                log.error(
                    'segment "%s" image (%s; %dx%d) has not been transposed properly (%dx%d) during rotation',
                    segment.id, segment_coords['features'],
                    segment_image.width, segment_image.height,
                    segment_xywh['w'], segment_xywh['h'])
        # deskew, if (still) necessary:
        if may_deskew and 'deskewed' not in segment_coords['features']:
            log.info("Rotating %s for segment '%s' by %.2f°",
                     "AlternativeImage" if alternative_image else "image",
                     segment.id, skew)
            segment_image = rotate_image(segment_image,
                                         skew,
                                         fill=fill,
                                         transparency=transparency)
            segment_coords['features'] += ',deskewed'
        if may_deskew:
            # FIXME we should enforce consistency here (i.e. rotation always reshapes,
            #       and rescaling never happens)
            w_new, h_new = adjust_canvas_to_rotation(
                [segment_xywh['w'], segment_xywh['h']], skew)
            if not (w_new - 2 < segment_image.width < w_new + 2
                    and h_new - 2 < segment_image.height < h_new + 2):
                log.error(
                    'segment "%s" image (%s; %dx%d) has not been reshaped properly (%dx%d) during rotation',
                    segment.id, segment_coords['features'],
                    segment_image.width, segment_image.height, w_new, h_new)
        else:
            # FIXME: currently unavoidable with line-level dewarping (which increases height)
            if not (segment_xywh['w'] - 2 < segment_image.width <
                    segment_xywh['w'] + 2 and segment_xywh['h'] - 2 <
                    segment_image.height < segment_xywh['h'] + 2):
                log.error(
                    'segment "%s" image (%s; %dx%d) has not been cropped properly (%dx%d)',
                    segment.id, segment_coords['features'],
                    segment_image.width, segment_image.height,
                    segment_xywh['w'], segment_xywh['h'])

        # verify constraints again:
        if not all(feature in segment_coords['features']
                   for feature in selected_features):
            raise Exception(
                'Found no AlternativeImage that satisfies all requirements ' +
                'selector="%s" in segment "%s"' %
                (feature_selector, segment.id))
        if any(feature in segment_coords['features']
               for feature in filtered_features):
            raise Exception(
                'Found no AlternativeImage that satisfies all requirements ' +
                'filter="%s" in segment "%s"' % (feature_filter, segment.id))
        segment_image.format = 'PNG'  # workaround for tesserocr#194
        return segment_image, segment_coords
Exemplo n.º 19
0
    def process(self):
        """Extract page image and replace original with it.

        Open and deserialize PAGE input files and their respective images,
        then go to the page hierarchy level.

        Retrieve the image of the (cropped, deskewed, dewarped) page, preferring
        the last annotated form (which, depending on the workflow, could be
        binarized or raw). Add that image file to the workspace with the fileGrp
        USE given in the second position of the output fileGrp, or ``OCR-D-IMG-SUBST``.
        Reference that file in the page (not as AlternativeImage but) as original
        image. Adjust all segment coordinates accordingly.

        Produce a new output file by serialising the resulting hierarchy.

        Parameters read from ``self.parameter``:

        * ``feature_selector`` / ``feature_filter``: passed on to
          ``image_from_page`` to choose the page image,
        * ``transform_coordinates``: if true, rewrite the coordinates of all
          regions (and the lines, words and glyphs of text regions) so they
          are relative to the extracted image.
        """
        try:
            page_grp, image_grp = self.output_file_grp.split(',')
        except ValueError:
            # not exactly two comma-separated groups: use the fallback for images
            page_grp = self.output_file_grp
            image_grp = FALLBACK_FILEGRP_IMG
            LOG.info(
                "No output file group for images specified, falling back to '%s'",
                image_grp)
        feature_selector = self.parameter['feature_selector']
        feature_filter = self.parameter['feature_filter']
        adapt_coords = self.parameter['transform_coordinates']

        # pylint: disable=attribute-defined-outside-init
        for n, input_file in enumerate(self.input_files):
            file_id = input_file.ID.replace(self.input_file_grp, page_grp)
            if file_id == input_file.ID:
                # input ID does not contain the input fileGrp name:
                # generate a fresh ID from the output fileGrp and the counter
                file_id = concat_padded(page_grp, n)
            page_id = input_file.pageId or input_file.ID
            LOG.info("INPUT FILE %i / %s", n, page_id)
            pcgts = page_from_file(self.workspace.download_file(input_file))
            page = pcgts.get_Page()
            metadata = pcgts.get_Metadata()  # ensured by from_file()
            metadata.add_MetadataItem(
                MetadataItemType(
                    type_="processingStep",
                    name=self.ocrd_tool['steps'][0],
                    value=TOOL,
                    Labels=[
                        LabelsType(externalModel="ocrd-tool",
                                   externalId="parameters",
                                   Label=[
                                       LabelType(type_=name,
                                                 value=self.parameter[name])
                                       for name in self.parameter
                                   ])
                    ]))
            # (the image info is not needed here: the resolution is not annotated)
            page_image, page_coords, _ = self.workspace.image_from_page(
                page,
                page_id,
                feature_filter=feature_filter,
                feature_selector=feature_selector)
            # annotate extracted image
            file_path = self.workspace.save_image_file(
                page_image,
                file_id.replace(page_grp, image_grp),
                image_grp,
                page_id=input_file.pageId,
                mimetype='image/png')
            # replace original image
            page.set_imageFilename(file_path)
            # adjust all coordinates
            if adapt_coords:
                for region in page.get_AllRegions():
                    # collect the region and (for text regions) everything
                    # below it, in the order region > line > word > glyph:
                    segments = [region]
                    if isinstance(region, TextRegionType):
                        for line in region.get_TextLine():
                            segments.append(line)
                            for word in line.get_Word():
                                segments.append(word)
                                segments.extend(word.get_Glyph())
                    # convert each outline from absolute coordinates
                    # to coordinates relative to the extracted image:
                    for segment in segments:
                        polygon = coordinates_of_segment(
                            segment, page_image, page_coords)
                        segment.get_Coords().points = points_from_polygon(
                            polygon)

            # update METS (add the PAGE file):
            file_path = os.path.join(page_grp, file_id + '.xml')
            out = self.workspace.add_file(ID=file_id,
                                          file_grp=page_grp,
                                          pageId=input_file.pageId,
                                          local_filename=file_path,
                                          mimetype=MIMETYPE_PAGE,
                                          content=to_xml(pcgts))
            LOG.info('created file ID: %s, file_grp: %s, path: %s', file_id,
                     page_grp, out.local_filename)
Exemplo n.º 20
0
    def process(self):
        """Extract region images from the workspace.

        Open and deserialize PAGE input files and their respective images,
        then iterate over the element hierarchy down to the region level.

        Extract an image for each region (which depending on the workflow
        can already be deskewed, dewarped, binarized etc.), cropped to its
        minimal bounding box, and masked by the coordinate polygon outline.
        If ``transparency`` is true, then also add an alpha channel which is
        fully transparent outside of the mask.

        Create a JSON file with:
        * the IDs of the region and its parents,
        * the region's coordinates relative to the region image,
        * the region's absolute coordinates,
        * the (text) region's text content (if any),
        * the (text) region's TextStyle (if any),
        * the (text) region's @production (if any),
        * the (text) region's @readingDirection (if any),
        * the (text) region's @textLineOrder (if any),
        * the (text) region's @primaryScript (if any),
        * the (text) region's @primaryLanguage (if any),
        * the region's AlternativeImage/@comments (features),
        * the region's element class,
        * the region's @type,
        * the page's @type,
        * the page's DPI value.

        Write all files in the directory of the output file group, named like so:
        * ID + '.raw.png': region image (if the workflow provides raw images)
        * ID + '.bin.png': region image (if the workflow provides binarized images)
        * ID + '.nrm.png': region image (if the workflow provides grayscale-normalized images)
        * ID + '.json': region metadata.
        """
        # TextStyle attributes copied verbatim into the JSON description
        # (the order here determines the key order in the output):
        style_attributes = (
            'fontFamily', 'fontSize', 'xHeight', 'kerning', 'serif',
            'monospace', 'bold', 'italic', 'smallCaps', 'letterSpaced',
            'strikethrough', 'underlined', 'underlineStyle', 'subscript',
            'superscript')
        # pylint: disable=attribute-defined-outside-init
        for n, input_file in enumerate(self.input_files):
            file_id = input_file.ID.replace(self.input_file_grp, self.output_file_grp)
            if file_id == input_file.ID:
                # input ID does not contain the input fileGrp name:
                # generate a fresh ID from the output fileGrp and the counter
                file_id = concat_padded(self.output_file_grp, n)
            page_id = input_file.pageId or input_file.ID
            LOG.info("INPUT FILE %i / %s", n, page_id)
            pcgts = page_from_file(self.workspace.download_file(input_file))
            page = pcgts.get_Page()
            metadata = pcgts.get_Metadata() # ensured by from_file()
            metadata.add_MetadataItem(
                MetadataItemType(type_="processingStep",
                                 name=self.ocrd_tool['steps'][0],
                                 value=TOOL,
                                 Labels=[LabelsType(
                                     externalModel="ocrd-tool",
                                     externalId="parameters",
                                     Label=[LabelType(type_=name,
                                                      value=self.parameter[name])
                                            for name in self.parameter])]))
            page_image, page_coords, page_image_info = self.workspace.image_from_page(
                page, page_id,
                transparency=self.parameter['transparency'])
            if page_image_info.resolution != 1:
                dpi = page_image_info.resolution
                if page_image_info.resolutionUnit == 'cm':
                    # pixels per cm -> pixels per inch
                    dpi = round(dpi * 2.54)
            else:
                # a resolution of 1 is treated as "not annotated"
                dpi = None
            ptype = page.get_type()

            regions = { 'advert': page.get_AdvertRegion(),
                        'text': page.get_TextRegion(),
                        'table': page.get_TableRegion(),
                        'chart': page.get_ChartRegion(),
                        'chem': page.get_ChemRegion(),
                        'graphic': page.get_GraphicRegion(),
                        'image': page.get_ImageRegion(),
                        'linedrawing': page.get_LineDrawingRegion(),
                        'maths': page.get_MathsRegion(),
                        'music': page.get_MusicRegion(),
                        'noise': page.get_NoiseRegion(),
                        'separator': page.get_SeparatorRegion(),
                        'unknown': page.get_UnknownRegion()
            }
            for rtype, rlist in regions.items():
                for region in rlist:
                    description = { 'region.ID': region.id, 'region.type': rtype }
                    region_image, region_coords = self.workspace.image_from_segment(
                        region, page_image, page_coords,
                        transparency=self.parameter['transparency'])
                    # only these region classes get their @type queried:
                    description['subtype'] = region.get_type() if rtype in ['text', 'chart', 'graphic'] else None
                    description['coords_rel'] = coordinates_of_segment(
                        region, region_image, region_coords).tolist()
                    description['coords_abs'] = polygon_from_points(region.get_Coords().points)
                    if rtype == 'text':
                        rtext = region.get_TextEquiv()
                        if rtext:
                            # only the first TextEquiv is exported
                            description['region.text'] = rtext[0].Unicode
                        else:
                            description['region.text'] = ''
                        # region-level annotation takes precedence over page-level:
                        rstyle = region.get_TextStyle() or page.get_TextStyle()
                        if rstyle:
                            description['region.style'] = {
                                name: getattr(rstyle, name)
                                for name in style_attributes
                            }
                        description['production'] = region.get_production()
                        description['readingDirection'] = (
                            region.get_readingDirection() or
                            page.get_readingDirection())
                        description['textLineOrder'] = (
                            region.get_textLineOrder() or
                            page.get_textLineOrder())
                        description['primaryScript'] = (
                            region.get_primaryScript() or
                            page.get_primaryScript())
                        description['primaryLanguage'] = (
                            region.get_primaryLanguage() or
                            page.get_primaryLanguage())
                    description['features'] = region_coords['features']
                    description['DPI']= dpi
                    description['page.ID'] = page_id
                    description['page.type'] = ptype
                    description['file_grp'] = self.input_file_grp
                    description['METS.UID'] = self.workspace.mets.unique_identifier
                    # choose the file name suffix from the image features:
                    if 'binarized' in region_coords['features']:
                        extension = '.bin'
                    elif 'grayscale_normalized' in region_coords['features']:
                        extension = '.nrm'
                    else:
                        extension = '.raw'

                    file_path = self.workspace.save_image_file(
                        region_image,
                        file_id + '_' + region.id + extension,
                        self.output_file_grp,
                        page_id=page_id,
                        format='PNG')
                    # write the metadata next to the image (same base name);
                    # close the file deterministically so the JSON is flushed
                    # and the handle is not leaked once per region:
                    file_path = file_path.replace(extension + '.png', '.json')
                    with open(file_path, 'w') as json_file:
                        json.dump(description, json_file)
Exemplo n.º 21
0
    def process(self):
        """Resegment lines of the workspace.

        Open and deserialise PAGE input files and their respective images,
        then iterate over the element hierarchy down to the line level.

        Next, get each region image according to the layout annotation (from
        the alternative image of the region, or by cropping via coordinates
        into the higher-level image), and compute a new line segmentation
        from that (as a label mask).

        Then for each line within the region, find the label with the largest
        foreground area in the binarized image within the annotated polygon
        (or rectangle) of the line. Unless its relative area is too small,
        or its center is far off, convert that label's mask into a polygon
        outline, intersect with the old polygon, and find the contour of that
        segment. Annotate the result as new coordinates of the line.

        Add the new image file to the workspace along with the output fileGrp,
        and using a file ID with suffix ``.IMG-RESEG`` along with further
        identification of the input element.

        Produce a new output file by serialising the resulting hierarchy.
        """
        LOG = getLogger('processor.OcropyResegment')
        # This makes best sense for bad/coarse line segmentation, like current GT
        # or as postprocessing for bbox-only steps.
        # Most notably, it can convert rectangles to polygons (polygonalization).
        # It depends on a decent line segmentation from ocropy though. So it
        # _should_ ideally be run after deskewing (on the page or region level),
        # _must_ be run after binarization (on page or region level). Also, the
        # method's accuracy crucially depends on a good estimate of the images'
        # pixel density (at least if source input is not 300 DPI).
        threshold = self.parameter['min_fraction']
        margin = self.parameter['extend_margins']
        assert_file_grp_cardinality(self.input_file_grp, 1)
        assert_file_grp_cardinality(self.output_file_grp, 1)

        for (n, input_file) in enumerate(self.input_files):
            LOG.info("INPUT FILE %i / %s", n, input_file.pageId
                     or input_file.ID)
            file_id = make_file_id(input_file, self.output_file_grp)

            pcgts = page_from_file(self.workspace.download_file(input_file))
            self.add_metadata(pcgts)
            page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID  # (PageType has no id)
            page = pcgts.get_Page()

            page_image, page_xywh, page_image_info = self.workspace.image_from_page(
                page, page_id, feature_selector='binarized')
            if self.parameter['dpi'] > 0:
                zoom = 300.0 / self.parameter['dpi']
            elif page_image_info.resolution != 1:
                dpi = page_image_info.resolution
                if page_image_info.resolutionUnit == 'cm':
                    dpi *= 2.54
                LOG.info('Page "%s" uses %f DPI', page_id, dpi)
                zoom = 300.0 / dpi
            else:
                zoom = 1

            regions = page.get_AllRegions(classes=['Text'])
            if not regions:
                LOG.warning('Page "%s" contains no text regions', page_id)
            for region in regions:
                lines = region.get_TextLine()
                if not lines:
                    LOG.warning('Page "%s" region "%s" contains no text lines',
                                page_id, region.id)
                    continue
                if len(lines) == 1:
                    LOG.warning('Page "%s" region "%s" contains only one line',
                                page_id, region.id)
                    continue
                region_image, region_xywh = self.workspace.image_from_segment(
                    region,
                    page_image,
                    page_xywh,
                    feature_selector='binarized')
                region_array = pil2array(region_image)
                #region_array, _ = common.binarize(region_array, maxskew=0) # just in case still raw
                region_bin = np.array(region_array <= midrange(region_array),
                                      np.bool)
                report = check_region(region_bin, zoom)
                try:
                    if report:
                        raise Exception(report)
                    region_labels, _, _, _, _, _ = compute_segmentation(
                        region_bin, zoom=zoom)
                except Exception as err:
                    LOG.warning(
                        'Cannot line-segment page "%s" region "%s": %s',
                        page_id, region.id, err)
                    # fallback option 1: borderclean
                    # label margins vs interior, but with the interior
                    # extended into the margin by its connected components
                    # to remove noise from neighbouring regions:
                    #region_labels = borderclean_bin(region_bin, margin=round(4/zoom)) + 1
                    # (too dangerous, because we risk losing dots from i or punctuation)
                    # fallback option 2: only extend_margins
                    # instead, just provide a uniform label, so at least we get
                    # to extend the polygon margins:
                    #region_labels = np.ones_like(region_bin)
                    # fallback option 3: keep unchanged
                    continue
                for line in lines:
                    if line.get_AlternativeImage():
                        # get cropped line image:
                        line_image, line_xywh = self.workspace.image_from_segment(
                            line,
                            region_image,
                            region_xywh,
                            feature_selector='binarized')
                        LOG.debug("Using AlternativeImage (%s) for line '%s'",
                                  line_xywh['features'], line.id)
                        # crop region arrays accordingly:
                        line_polygon = coordinates_of_segment(
                            line, region_image, region_xywh)
                        line_bbox = bbox_from_polygon(line_polygon)
                        line_labels = region_labels[line_bbox[1]:line_bbox[3],
                                                    line_bbox[0]:line_bbox[2]]
                        line_bin = region_bin[line_bbox[1]:line_bbox[3],
                                              line_bbox[0]:line_bbox[2]]
                        # get polygon in relative (line) coordinates:
                        line_polygon = coordinates_of_segment(
                            line, line_image, line_xywh)
                        line_polygon = resegment(line_polygon,
                                                 line_labels,
                                                 line_bin,
                                                 line.id,
                                                 extend_margins=margin,
                                                 threshold_relative=threshold)
                        if line_polygon is None:
                            continue  # not good enough – keep
                        # convert back to absolute (page) coordinates:
                        line_polygon = coordinates_for_segment(
                            line_polygon, line_image, line_xywh)
                    else:
                        # get polygon in relative (region) coordinates:
                        line_polygon = coordinates_of_segment(
                            line, region_image, region_xywh)
                        line_polygon = resegment(line_polygon,
                                                 region_labels,
                                                 region_bin,
                                                 line.id,
                                                 extend_margins=margin,
                                                 threshold_relative=threshold)
                        if line_polygon is None:
                            continue  # not good enough – keep
                        # convert back to absolute (page) coordinates:
                        line_polygon = coordinates_for_segment(
                            line_polygon, region_image, region_xywh)
                    # annotate result:
                    line.get_Coords().points = points_from_polygon(
                        line_polygon)
                    # create new image:
                    line_image, line_xywh = self.workspace.image_from_segment(
                        line,
                        region_image,
                        region_xywh,
                        feature_selector='binarized')
                    # update METS (add the image file):
                    file_path = self.workspace.save_image_file(
                        line_image,
                        file_id=file_id + '_' + region.id + '_' + line.id +
                        '.IMG-RESEG',
                        page_id=page_id,
                        file_grp=self.output_file_grp)
                    # update PAGE (reference the image file):
                    line.add_AlternativeImage(
                        AlternativeImageType(filename=file_path,
                                             comments=region_xywh['features']))

            # update METS (add the PAGE file):
            file_path = os.path.join(self.output_file_grp, file_id + '.xml')
            pcgts.set_pcGtsId(file_id)
            out = self.workspace.add_file(ID=file_id,
                                          file_grp=self.output_file_grp,
                                          pageId=input_file.pageId,
                                          local_filename=file_path,
                                          mimetype=MIMETYPE_PAGE,
                                          content=to_xml(pcgts))
            LOG.info('created file ID: %s, file_grp: %s, path: %s', file_id,
                     self.output_file_grp, out.local_filename)