Exemplo n.º 1
0
def polygon_for_parent(polygon, parent):
    """Restrict a polygon to the area covered by its parent.

    The parent outline is the page Border if `parent` is a page that
    has one, the full image rectangle if it is a page without Border,
    and the parent's own Coords otherwise.

    Return the open coordinate path (closing point dropped) of the
    child itself when it already lies within the parent, or of the
    intersection otherwise. Return None if either shape is still
    invalid after repair, or if `make_intersection` yields nothing.

    (Should be moved to ocrd_utils.coordinates_for_segment.)
    """
    child_shape = Polygon(polygon)
    if not isinstance(parent, PageType):
        outline = polygon_from_points(parent.get_Coords().points)
    else:
        border = parent.get_Border()
        if border:
            outline = polygon_from_points(border.get_Coords().points)
        else:
            # no Border annotated: fall back to the full image rectangle
            height = parent.get_imageHeight()
            width = parent.get_imageWidth()
            outline = [[0, 0], [0, height], [width, height], [width, 0]]
    parent_shape = Polygon(outline)
    # repair paths first: rounding can turn shapes that were valid
    # in floating point into self-intersecting ones
    child_shape = make_valid(child_shape)
    parent_shape = make_valid(parent_shape)
    if not (child_shape.is_valid and parent_shape.is_valid):
        return None
    # nothing to clip if the child is already inside
    if child_shape.within(parent_shape):
        result = child_shape
    else:
        result = make_intersection(child_shape, parent_shape)
        if not result:
            return None
    return result.exterior.coords[:-1]  # keep open
Exemplo n.º 2
0
 def process(self):
     """
     Performs the binarization.

     Every input file is binarized with kraken's ``nlbin`` at the
     granularity given by the ``level-of-operation`` parameter:
     ``page`` (the whole page image), ``block`` (one image per text
     region) or any other value (one image per text line). Each result
     is added to the output file group as a PNG file.
     """
     log = getLogger('processor.KrakenBinarize')
     level = self.parameter['level-of-operation']
     log.debug('Level of operation: "%s"', level)
     log.debug('Input file group %s', self.input_file_grp)
     log.debug('Input files %s', [str(f) for f in self.input_files])

     def add_binarized(image, page_id, file_id):
         # Binarize one image and register the PNG in the output group.
         bin_image = kraken.binarization.nlbin(image)
         bin_image_bytes = io.BytesIO()
         bin_image.save(bin_image_bytes, format='PNG')
         self.workspace.add_file(self.output_file_grp,
                                 pageId=page_id,
                                 ID=file_id,
                                 mimetype='image/png',
                                 local_filename="%s/%s" %
                                 (self.output_file_grp, file_id),
                                 content=bin_image_bytes.getvalue())

     for (n, input_file) in enumerate(self.input_files):
         log.info("INPUT FILE %i / %s", n, input_file)
         pcgts = page_from_file(self.workspace.download_file(input_file))
         image_url = pcgts.get_Page().imageFilename
         log.info("pcgts %s", pcgts)
         if level == 'page':
             log.info("About to binarize page '%s'", pcgts.pcGtsId)
             image = self.workspace.resolve_image_as_pil(image_url)
             add_binarized(image, input_file.pageId,
                           concat_padded(self.output_file_grp, n))
             continue
         for region in pcgts.get_Page().get_TextRegion():
             if level == 'block':
                 log.info("About to binarize region '%s'", region.id)
                 image = self.workspace.resolve_image_as_pil(
                     image_url,
                     polygon_from_points(region.get_Coords().points))
                 # the cropped region used to be dropped without being
                 # binarized or saved; store it like pages and lines
                 add_binarized(image, input_file.pageId,
                               concat_padded(self.output_file_grp, n,
                                             region.id))
                 continue
             textlines = region.get_TextLine()
             log.info("About to binarize %i lines of region '%s'",
                      len(textlines), region.id)
             for (line_no, line) in enumerate(textlines):
                 log.debug("Binarizing line '%s' in region '%s'",
                           line_no, region.id)
                 image = self.workspace.resolve_image_as_pil(
                     image_url,
                     polygon_from_points(line.get_Coords().points))
                 add_binarized(image, input_file.pageId,
                               concat_padded(self.output_file_grp, n,
                                             region.id, line_no))
Exemplo n.º 3
0
 def validate_coords(self, page, page_id):
     """Check that every segment's coordinates stay inside its parent.

     Regions of all types are compared against the page Border (only
     if the page has one), text lines against their text region,
     baselines against their line polygon, words against their line
     and glyphs against their word. Each violation is logged as a
     warning.

     Return True if no violation was found, False otherwise.
     """
     consistent = True
     text_regions = page.get_TextRegion()
     border = page.get_Border()
     if border:
         getters = (page.get_AdvertRegion, page.get_ChartRegion,
                    page.get_ChemRegion, page.get_GraphicRegion,
                    page.get_ImageRegion, page.get_LineDrawingRegion,
                    page.get_MathsRegion, page.get_MusicRegion,
                    page.get_NoiseRegion, page.get_SeparatorRegion,
                    page.get_TableRegion, page.get_UnknownRegion)
         other_regions = [region for getter in getters
                          for region in getter()]
         for region in text_regions + other_regions:
             if _child_within_parent(region, border):
                 continue
             LOG.warning(
                 'Region "%s" extends beyond Border of page "%s"',
                 region.id, page_id)
             consistent = False
     # only text regions are descended into
     for region in text_regions:
         for line in region.get_TextLine():
             if not _child_within_parent(line, region):
                 LOG.warning(
                     'Line "%s" extends beyond region "%s" on page "%s"',
                     line.id, region.id, page_id)
                 consistent = False
             baseline_element = line.get_Baseline()
             if baseline_element:
                 baseline = LineString(
                     polygon_from_points(baseline_element.points))
                 line_shape = Polygon(
                     polygon_from_points(line.get_Coords().points))
                 if not baseline.within(line_shape):
                     LOG.warning(
                         'Baseline extends beyond line "%s" in region "%s" on page "%s"',
                         line.id, region.id, page_id)
                     consistent = False
             for word in line.get_Word():
                 if not _child_within_parent(word, line):
                     LOG.warning(
                         'Word "%s" extends beyond line "%s" in region "%s" on page "%s"',
                         word.id, line.id, region.id, page_id)
                     consistent = False
                 for glyph in word.get_Glyph():
                     if _child_within_parent(glyph, word):
                         continue
                     LOG.warning(
                         'Glyph "%s" extends beyond word "%s" in line "%s" of region "%s" on page "%s"',
                         glyph.id, word.id, line.id, region.id, page_id)
                     consistent = False
     return consistent
Exemplo n.º 4
0
def ensure_consistent(child):
    """Clip segment element polygon to parent polygon range.

    The parent is taken from ``child.parent_object_``. Its outline is
    the page Border if it is a page with Border, the full image
    rectangle if it is a page without Border, and its own Coords
    otherwise. If the child is not within that outline, its Coords
    points are replaced in place by the (repaired) intersection.

    Returns nothing.

    Raises:
        Exception: if the child has no area in common with its parent.
    """
    points = child.get_Coords().points
    polygon = polygon_from_points(points)
    parent = child.parent_object_
    childp = Polygon(polygon)
    if isinstance(parent, PageType):
        if parent.get_Border():
            parentp = Polygon(
                polygon_from_points(parent.get_Border().get_Coords().points))
        else:
            parentp = Polygon(
                [[0, 0], [0, parent.get_imageHeight()],
                 [parent.get_imageWidth(),
                  parent.get_imageHeight()], [parent.get_imageWidth(), 0]])
    else:
        parentp = Polygon(polygon_from_points(parent.get_Coords().points))
    # ensure input coords have valid paths (without self-intersection)
    # (this can happen when shapes valid in floating point are rounded)
    childp = make_valid(childp)
    parentp = make_valid(parentp)
    # check if clipping is necessary
    if childp.within(parentp):
        return
    # clip to parent
    interp = childp.intersection(parentp)
    if interp.is_empty or interp.area == 0.0:
        # pick whichever identifier the parent offers for the message
        if hasattr(parent, 'pcGtsId'):
            parent_id = parent.pcGtsId
        elif hasattr(parent, 'imageFilename'):
            parent_id = parent.imageFilename
        else:
            parent_id = parent.id
        raise Exception("Segment '%s' does not intersect its parent '%s'" %
                        (child.id, parent_id))
    # NOTE: `geom_type` replaces the deprecated `type` attribute
    if interp.geom_type == 'GeometryCollection':
        # heterogeneous result: filter zero-area shapes (LineString, Point)
        interp = unary_union([geom for geom in interp.geoms if geom.area > 0])
    if interp.geom_type == 'MultiPolygon':
        # homogeneous result: construct convex hull to connect
        # FIXME: construct concave hull / alpha shape
        interp = interp.convex_hull
    if interp.minimum_clearance < 1.0:
        # follow-up calculations will necessarily be integer;
        # so anticipate rounding here and then ensure validity
        # (plain Polygon constructor: the asPolygon adapter no longer
        # exists in Shapely 2.0)
        interp = Polygon(np.round(interp.exterior.coords))
        interp = make_valid(interp)
    polygon = interp.exterior.coords[:-1]  # keep open
    points = points_from_polygon(polygon)
    child.get_Coords().set_points(points)
Exemplo n.º 5
0
def _add_annotation(annotations,
                    segment,
                    imgid,
                    catid,
                    coords=None,
                    mask=None):
    """Append one annotation dict for `segment` to `annotations`.

    Segments whose polygon has fewer than 3 points are skipped with a
    warning. Without `mask`, the segmentation is the flattened polygon;
    with `mask`, it is the encoded mask restricted to the segment's
    polygon relative to `coords`. The score is the Coords confidence,
    or 1.0 if that is unset (or zero).
    """
    LOG = getLogger('processor.EvaluateSegmentation')
    segment_coords = segment.get_Coords()
    score = segment_coords.get_conf() or 1.0
    polygon = polygon_from_points(segment_coords.points)
    n_points = len(polygon)
    if n_points < 3:
        LOG.warning('ignoring segment "%s" with only %d points', segment.id,
                    n_points)
        return
    # the bounding box is always taken from the original polygon
    xywh = xywh_from_polygon(polygon)
    if mask is not None:
        polygon = coordinates_of_segment(segment, None, coords)
        rows, cols = draw.polygon(polygon[:, 1], polygon[:, 0], mask.shape)
        # pycocotools.mask wants Fortran-contiguous arrays
        masked = np.zeros(mask.shape, dtype=np.uint8, order='F')
        masked[rows, cols] = 1 * mask[rows, cols]
        segmentation = encodeMask(masked)
    else:
        segmentation = np.array(polygon).reshape(1, -1).tolist()
    annotation = {
        # non-standard string-valued in addition to 'id'
        'segment_id': segment.id,
        'image_id': imgid,
        'category_id': catid,
        'segmentation': segmentation,
        'area': Polygon(polygon).area,
        'bbox': [xywh['x'], xywh['y'], xywh['w'], xywh['h']],
        'score': score,
        'iscrowd': 0
    }
    annotations.append(annotation)
Exemplo n.º 6
0
 def process(self):
     """
     Performs the word segmentation.

     For every text line of every text region in each input file, the
     line image is passed to Tesseract (single-line mode), and every
     word component it reports is added to the line as a Word whose
     coordinates are shifted by the line's own offset. The resulting
     PAGE document is added to the output file group.
     """
     with PyTessBaseAPI(
         psm=PSM.SINGLE_LINE,
         path=TESSDATA_PREFIX,
     ) as tessapi:
         for n, input_file in enumerate(self.input_files):
             pcgts = page_from_file(self.workspace.download_file(input_file))
             page = pcgts.get_Page()
             image_url = page.imageFilename
             for region in page.get_TextRegion():
                 for line in region.get_TextLine():
                     log.debug("Detecting words in line '%s'", line.id)
                     line_points = line.get_Coords().points
                     line_image = self.workspace.resolve_image_as_pil(
                         image_url, polygon_from_points(line_points))
                     tessapi.SetImage(line_image)
                     offset = xywh_from_points(line_points)
                     components = tessapi.GetComponentImages(RIL.WORD, True)
                     for word_no, component in enumerate(components):
                         # component boxes are relative to the line image
                         box = component[1]
                         box['x'] += offset['x']
                         box['y'] += offset['y']
                         word = WordType(
                             id='%s_word%04d' % (line.id, word_no),
                             Coords=CoordsType(points_from_xywh(box)))
                         line.add_Word(word)
             ID = concat_padded(self.output_file_grp, n)
             self.workspace.add_file(
                 ID=ID,
                 file_grp=self.output_file_grp,
                 local_filename='%s/%s' % (self.output_file_grp, ID),
                 mimetype=MIMETYPE_PAGE,
                 content=to_xml(pcgts).encode('utf-8'),
             )
def coordinates_of_segment(segment, parent_image, parent_xywh):
    """Extract the relative coordinates polygon of a PAGE segment element.

    Given a Region / TextLine / Word / Glyph `segment` and
    the PIL.Image of its parent Page / Region / TextLine / Word
    along with its bounding box, calculate the relative coordinates
    of the segment within the image. That is, shift all points from
    the offset of the parent, and (in case the parent was rotated,)
    rotate all points with the center of the image as origin.

    `parent_image` is only accessed (for its width and height) when
    `parent_xywh` contains an 'angle' entry.

    Return the rounded numpy array of the resulting polygon.
    """
    # get polygon:
    polygon = np.array(polygon_from_points(segment.get_Coords().points))
    # offset correction (shift coordinates to base of segment);
    # subtract out-of-place: an in-place `-=` on an integer array
    # raises a casting error as soon as the offset is fractional
    polygon = polygon - np.array([parent_xywh['x'], parent_xywh['y']])
    # angle correction (rotate coordinates if image has been rotated):
    if 'angle' in parent_xywh:
        polygon = rotate_coordinates(polygon,
                                     parent_xywh['angle'],
                                     orig=np.array([
                                         0.5 * parent_image.width,
                                         0.5 * parent_image.height
                                     ]))
    return np.round(polygon).astype(np.int32)
Exemplo n.º 8
0
 def process(self):
     """
     Performs the binarization.

     Every text line of every text region in each input file is
     cropped from the page image, binarized with kraken's ``nlbin``
     and added to the output file group as a PNG file.
     """
     for (n, input_file) in enumerate(self.input_files):
         log.info("INPUT FILE %i / %s", n, input_file)
         pcgts = page_from_file(self.workspace.download_file(input_file))
         image_url = pcgts.get_Page().imageFilename
         log.info("pcgts %s", pcgts)
         for region in pcgts.get_Page().get_TextRegion():
             textlines = region.get_TextLine()
             log.info("About to binarize %i lines of region '%s'", len(textlines), region.id)
             for (line_no, line) in enumerate(textlines):
                 log.debug("Binarizing line '%s' in region '%s'", line_no, region.id)
                 image = self.workspace.resolve_image_as_pil(image_url, polygon_from_points(line.get_Coords().points))
                 bin_image = kraken.binarization.nlbin(image)
                 bin_image_bytes = io.BytesIO()
                 bin_image.save(bin_image_bytes, format='PNG')
                 # region and line number are part of the ID so that
                 # each line gets its own file instead of all lines of
                 # a page sharing one ID
                 ID = concat_padded(self.output_file_grp, n, region.id, line_no)
                 self.workspace.add_file(
                     self.output_file_grp,
                     pageId=input_file.pageId,
                     ID=ID,
                     basename="%s.bin.png" % ID,
                     mimetype='image/png',
                     content=bin_image_bytes.getvalue())
Exemplo n.º 9
0
 def process(self):
     """
     Performs the line segmentation.

     For every text region in each input file, the region image is
     passed to Tesseract, and every text line component it reports is
     added to the region as a TextLine whose coordinates are shifted
     by the region's own offset. The resulting PAGE document is added
     to the output file group.
     """
     with PyTessBaseAPI(path=TESSDATA_PREFIX) as tessapi:
         for n, input_file in enumerate(self.input_files):
             pcgts = page_from_file(self.workspace.download_file(input_file))
             page = pcgts.get_Page()
             image_url = page.imageFilename
             for region in page.get_TextRegion():
                 log.debug("Detecting lines in %s with tesseract", region.id)
                 region_points = region.get_Coords().points
                 region_image = self.workspace.resolve_image_as_pil(
                     image_url, polygon_from_points(region_points))
                 tessapi.SetImage(region_image)
                 offset = xywh_from_points(region_points)
                 components = tessapi.GetComponentImages(RIL.TEXTLINE, True)
                 for line_no, component in enumerate(components):
                     # component boxes are relative to the region image
                     box = component[1]
                     box['x'] += offset['x']
                     box['y'] += offset['y']
                     text_line = TextLineType(
                         id='%s_line%04d' % (region.id, line_no),
                         Coords=CoordsType(points_from_xywh(box)))
                     region.add_TextLine(text_line)
             ID = concat_padded(self.output_file_grp, n)
             self.workspace.add_file(
                 ID=ID,
                 file_grp=self.output_file_grp,
                 mimetype=MIMETYPE_PAGE,
                 local_filename='%s/%s' % (self.output_file_grp, ID),
                 content=to_xml(pcgts).encode('utf-8'),
             )
Exemplo n.º 10
0
def polygon_for_parent(polygon, parent):
    """Clip polygon to parent polygon range.

    The parent outline is the page Border if `parent` is a page with
    Border, the full image rectangle if it is a page without Border,
    and the parent's own Coords otherwise.

    Return `polygon` unchanged if it already lies within the parent,
    the open coordinate path of the (repaired) intersection if it
    only overlaps, and None if the two share no area.

    (Should be moved to ocrd_utils.coordinates_for_segment.)
    """
    childp = Polygon(polygon)
    if isinstance(parent, PageType):
        if parent.get_Border():
            parentp = Polygon(
                polygon_from_points(parent.get_Border().get_Coords().points))
        else:
            parentp = Polygon(
                [[0, 0], [0, parent.get_imageHeight()],
                 [parent.get_imageWidth(),
                  parent.get_imageHeight()], [parent.get_imageWidth(), 0]])
    else:
        parentp = Polygon(polygon_from_points(parent.get_Coords().points))
    # check if clipping is necessary
    if childp.within(parentp):
        return polygon
    # ensure input coords have valid paths (without self-intersection)
    # (this can happen when shapes valid in floating point are rounded)
    childp = make_valid(childp)
    parentp = make_valid(parentp)
    # clip to parent
    interp = childp.intersection(parentp)
    if interp.is_empty or interp.area == 0.0:
        # this happens if Tesseract "finds" something
        # outside of the valid Border of a deskewed/cropped page
        # (empty corners created by masking); will be ignored
        return None
    # NOTE: `geom_type` replaces the deprecated `type` attribute
    if interp.geom_type == 'GeometryCollection':
        # heterogeneous result: filter zero-area shapes (LineString, Point)
        interp = unary_union([geom for geom in interp.geoms if geom.area > 0])
    if interp.geom_type == 'MultiPolygon':
        # homogeneous result: construct convex hull to connect
        # FIXME: construct concave hull / alpha shape
        interp = interp.convex_hull
    if interp.minimum_clearance < 1.0:
        # follow-up calculations will necessarily be integer;
        # so anticipate rounding here and then ensure validity
        # (plain Polygon constructor: the asPolygon adapter no longer
        # exists in Shapely 2.0)
        interp = Polygon(np.round(interp.exterior.coords))
        interp = make_valid(interp)
    return interp.exterior.coords[:-1]  # keep open
Exemplo n.º 11
0
def adapt_coords(segment, parent, transform):
    """Rewrite the Coords of `segment` after transforming and clipping.

    The segment's polygon is converted with `coordinates_for_segment`
    using `transform`, then clipped to `parent` with
    `polygon_for_parent`.

    Return the segment with its new Coords, or None (leaving the
    segment untouched) if clipping returned None.
    """
    outline = polygon_from_points(segment.get_Coords().get_points())
    # absolute coords (after transforming back from page coords,
    # e.g. deskewing)
    absolute = coordinates_for_segment(outline, None, transform)
    # intersection with parent polygon
    clipped = polygon_for_parent(absolute, parent)
    if clipped is None:
        return None
    segment.set_Coords(CoordsType(points=points_from_polygon(clipped)))
    return segment
Exemplo n.º 12
0
def polygon_for_parent(polygon, parent):
    """Clip polygon to parent polygon range.

    The parent outline is the page Border if `parent` is a page with
    Border, the full image rectangle if it is a page without Border,
    and the parent's own Coords otherwise.

    Return `polygon` unchanged if it already lies within the parent,
    otherwise the open coordinate path of the intersection (or of its
    convex hull, if the intersection is not a single polygon).

    Raises:
        Exception: if polygon and parent share no area.

    (Should be moved to ocrd_utils.coordinates_for_segment.)
    """
    childp = Polygon(polygon)
    if isinstance(parent, PageType):
        if parent.get_Border():
            parentp = Polygon(polygon_from_points(parent.get_Border().get_Coords().points))
        else:
            parentp = Polygon([[0,0], [0,parent.get_imageHeight()],
                               [parent.get_imageWidth(),parent.get_imageHeight()],
                               [parent.get_imageWidth(),0]])
    else:
        parentp = Polygon(polygon_from_points(parent.get_Coords().points))
    if childp.within(parentp):
        return polygon
    interp = childp.intersection(parentp)
    # shapes that merely touch yield a non-empty Point/LineString
    # without area (and without `exterior`); treat that like empty
    if interp.is_empty or interp.area == 0.0:
        # FIXME: we need a better strategy against this
        raise Exception("intersection of would-be segment with parent is empty")
    if interp.geom_type != 'Polygon':
        # MultiPolygon or GeometryCollection: connect the parts
        # into a single polygon via their convex hull
        interp = interp.convex_hull
    return interp.exterior.coords[:-1] # keep open
Exemplo n.º 13
0
def _fix_glyphs(word):
    """Reorder the glyphs of a word if that makes the texts agree.

    Nothing happens while the text of the glyph sequence already
    equals the word text. Otherwise the glyphs are sorted by the
    x coordinate of their polygon centroid, and that order is applied
    only if it reproduces the word text.
    """
    glyphs = word.get_Glyph()
    expected = get_text(word)
    if get_text(glyphs, '') == expected:
        return

    def center_x(glyph):
        outline = polygon_from_points(glyph.get_Coords().points)
        return Polygon(outline).centroid.x

    reordered = sorted(glyphs, key=center_x)
    if get_text(reordered, '') != expected:
        return
    LOG.info('Fixing glyph order of word "%s"', word.id)
    word.set_Glyph(reordered)
Exemplo n.º 14
0
def _fix_lines(region):
    """Reorder the lines of a region if that makes the texts agree.

    Nothing happens while the newline-joined text of the line sequence
    already equals the region text. Otherwise the lines are sorted by
    the y coordinate of their polygon centroid, and that order is
    applied only if it reproduces the region text.
    """
    lines = region.get_TextLine()
    expected = get_text(region)
    if get_text(lines, '\n') == expected:
        return

    def center_y(line):
        outline = polygon_from_points(line.get_Coords().points)
        return Polygon(outline).centroid.y

    reordered = sorted(lines, key=center_y)
    if get_text(reordered, '\n') != expected:
        return
    LOG.info('Fixing line order of region "%s"', region.id)
    region.set_TextLine(reordered)
Exemplo n.º 15
0
def _fix_words(line):
    """Reorder the words of a line if that makes the texts agree.

    Nothing happens while the space-joined text of the word sequence
    already equals the line text. Otherwise the words are sorted by
    the x coordinate of their polygon centroid, and that order is
    applied only if it reproduces the line text.
    """
    words = line.get_Word()
    expected = get_text(line)
    if get_text(words, ' ') == expected:
        return

    def center_x(word):
        outline = polygon_from_points(word.get_Coords().points)
        return Polygon(outline).centroid.x

    reordered = sorted(words, key=center_x)
    if get_text(reordered, ' ') != expected:
        return
    LOG.info('Fixing word order of line "%s"', line.id)
    line.set_Word(reordered)
Exemplo n.º 16
0
    def create_baseline(self, text_line: TextLineType) -> Optional[LineString]:
        """Build the transformed baseline of a text line.

        The baseline points are mapped with ``self.coords['transform']``,
        rounded to integers and turned into a LineString.

        Return None if the line has no baseline (or no baseline
        points), or - after logging an error - if the LineString
        cannot be constructed or is not valid.
        """
        baseline = text_line.get_Baseline()
        if baseline is None or baseline.points is None:
            return None

        raw_points = np.array(polygon_from_points(baseline.points))
        moved_points = transform_coordinates(raw_points,
                                             self.coords['transform'])
        try:
            line = LineString(np.round(moved_points).astype(np.int32))
        except ValueError as err:
            problem = str(err)
        else:
            if line.is_valid:
                return line
            problem = str(explain_validity(line))

        # both failure modes are reported in the same format
        self.logger.error('Page "%s" @ %s/Baseline %s', self.page_id,
                          str(text_line.id), problem)
        return None
Exemplo n.º 17
0
def ensure_valid(element):
    """Repair the Coords points of `element` in place where necessary.

    Three repairs are applied in turn: negative coordinates are
    clamped to 0; a path with fewer than 3 points is extended by its
    reversed points shifted by +1; and a polygon that is not valid is
    passed through `make_valid`. The points are written back only if
    at least one repair was applied.

    Returns nothing.
    """
    changed = False
    coords = element.get_Coords()
    points = coords.points
    polygon = polygon_from_points(points)
    # builtin int: the `np.int` alias was removed in NumPy 1.24
    array = np.array(polygon, int)
    if array.min() < 0:
        array = np.maximum(0, array)
        changed = True
    if array.shape[0] < 3:
        # too few points for a polygon: walk back along the same
        # points, offset by one pixel, so that the path encloses an area
        array = np.concatenate([array, array[::-1] + 1])
        changed = True
    polygon = array.tolist()
    poly = Polygon(polygon)
    if not poly.is_valid:
        poly = make_valid(poly)
        polygon = poly.exterior.coords[:-1]  # keep open
        changed = True
    if changed:
        points = points_from_polygon(polygon)
        coords.set_points(points)
Exemplo n.º 18
0
    def _process_segment(self, page_image, page, page_xywh, page_id,
                         input_file, n, mrcnn_model, class_names, mask):
        LOG = getLogger('OcrdAnybaseocrBlockSegmenter')
        # check for existing text regions and whether to overwrite them
        border = None
        if page.get_TextRegion():
            if self.parameter['overwrite']:
                LOG.info('removing existing TextRegions in page "%s"', page_id)
                page.set_TextRegion([])
            else:
                LOG.warning('keeping existing TextRegions in page "%s"',
                            page_id)
                return
        # check if border exists
        if page.get_Border():
            border_coords = page.get_Border().get_Coords()
            border_points = polygon_from_points(border_coords.get_points())
            border = Polygon(border_points)


#            page_image, page_xy = self.workspace.image_from_segment(page.get_Border(), page_image, page_xywh)

        img_array = ocrolib.pil2array(page_image)
        page_image.save('./checkthis.png')
        if len(img_array.shape) <= 2:
            img_array = np.stack((img_array, ) * 3, axis=-1)
        results = mrcnn_model.detect([img_array], verbose=1)
        r = results[0]

        th = self.parameter['th']
        # check for existing semgentation mask
        # this code executes only when use_deeplr is set to True in ocrd-tool.json file
        if mask:
            mask = ocrolib.pil2array(mask)
            mask = mask // 255
            mask = 1 - mask
            # multiply all the bounding box part with 2
            for i in range(len(r['rois'])):

                min_x = r['rois'][i][0]
                min_y = r['rois'][i][1]
                max_x = r['rois'][i][2]
                max_y = r['rois'][i][3]
                mask[min_x:max_x, min_y:max_y] *= i + 2
            cv2.imwrite('mask_check.png', mask * (255 / (len(r['rois']) + 2)))

            # check for left over pixels and add them to the bounding boxes
            pixel_added = True

            while pixel_added:

                pixel_added = False
                left_over = np.where(mask == 1)
                for x, y in zip(left_over[0], left_over[1]):
                    local_mask = mask[x - th:x + th, y - th:y + th]
                    candidates = np.where(local_mask > 1)
                    candidates = [k for k in zip(candidates[0], candidates[1])]
                    if len(candidates) > 0:
                        pixel_added = True
                        # find closest pixel with x>1
                        candidates.sort(key=lambda j: np.sqrt((j[0] - th)**2 +
                                                              (j[1] - th)**2))
                        index = local_mask[candidates[0]] - 2

                        # add pixel to mask/bbox
                        # x,y to bbox with index
                        if x < r['rois'][index][0]:
                            r['rois'][index][0] = x

                        elif x > r['rois'][index][2]:
                            r['rois'][index][2] = x

                        if y < r['rois'][index][1]:
                            r['rois'][index][1] = y

                        elif y > r['rois'][index][3]:
                            r['rois'][index][3] = y

                        # update the mask
                        mask[x, y] = index + 2

        # resolving overlapping problem
        bbox_dict = {}  # to check any overlapping bbox
        class_id_check = []

        for i in range(len(r['rois'])):
            min_x = r['rois'][i][0]
            min_y = r['rois'][i][1]
            max_x = r['rois'][i][2]
            max_y = r['rois'][i][3]

            region_bbox = [min_y, min_x, max_y, max_x]

            for key in bbox_dict:
                for bbox in bbox_dict[key]:

                    # checking for ymax case with vertical overlapping
                    # along with y, check both for xmax and xmin
                    if (region_bbox[3] <= bbox[3] and region_bbox[3] >= bbox[1]
                            and ((region_bbox[0] >= bbox[0]
                                  and region_bbox[0] <= bbox[2]) or
                                 (region_bbox[2] >= bbox[0]
                                  and region_bbox[2] <= bbox[2]) or
                                 (region_bbox[0] <= bbox[0]
                                  and region_bbox[2] >= bbox[2]))
                            and r['class_ids'][i] != 5):

                        r['rois'][i][2] = bbox[1] - 1

                    # checking for ymin now
                    # along with y, check both for xmax and xmin
                    if (region_bbox[1] <= bbox[3] and region_bbox[1] >= bbox[1]
                            and ((region_bbox[0] >= bbox[0]
                                  and region_bbox[0] <= bbox[2]) or
                                 (region_bbox[2] >= bbox[0]
                                  and region_bbox[2] <= bbox[2]) or
                                 (region_bbox[0] <= bbox[0]
                                  and region_bbox[2] >= bbox[2]))
                            and r['class_ids'][i] != 5):

                        r['rois'][i][0] = bbox[3] + 1

            if r['class_ids'][i] not in class_id_check:
                bbox_dict[r['class_ids'][i]] = []
                class_id_check.append(r['class_ids'][i])

            bbox_dict[r['class_ids'][i]].append(region_bbox)

        # resolving overlapping problem code

        # define reading order on basis of coordinates
        reading_order = []

        for i in range(len(r['rois'])):
            width, height, _ = img_array.shape
            min_x = r['rois'][i][0]
            min_y = r['rois'][i][1]
            max_x = r['rois'][i][2]
            max_y = r['rois'][i][3]

            if (min_y - 5) > width and r['class_ids'][i] == 2:
                min_y -= 5
            if (max_y + 10) < width and r['class_ids'][i] == 2:
                min_y += 10
            reading_order.append((min_y, min_x, max_y, max_x))

        reading_order = sorted(reading_order,
                               key=lambda reading_order:
                               (reading_order[1], reading_order[0]))
        for i in range(len(reading_order)):
            min_y, min_x, max_y, max_x = reading_order[i]
            min_y = 0
            i_poly = Polygon([[min_x, min_y], [max_x, min_y], [max_x, max_y],
                              [min_x, max_y]])
            for j in range(i + 1, len(reading_order)):
                min_y, min_x, max_y, max_x = reading_order[j]
                j_poly = Polygon([[min_x, min_y], [max_x, min_y],
                                  [max_x, max_y], [min_x, max_y]])
                inter = i_poly.intersection(j_poly)
                if inter:
                    reading_order.insert(j + 1, reading_order[i])
                    del reading_order[i]

        # Creating Reading Order object in PageXML
        order_group = OrderedGroupType(caption="Regions reading order",
                                       id=page_id)

        for i in range(len(r['rois'])):
            min_x = r['rois'][i][0]
            min_y = r['rois'][i][1]
            max_x = r['rois'][i][2]
            max_y = r['rois'][i][3]
            if (min_y - 5) > width and r['class_ids'][i] == 2:
                min_y -= 5
            if (max_y + 10) < width and r['class_ids'][i] == 2:
                min_y += 10

            region_polygon = [[min_x, min_y], [max_x, min_y], [max_x, max_y],
                              [min_x, max_y]]

            if border:
                cut_region_polygon = border.intersection(
                    Polygon(region_polygon))
                if cut_region_polygon.is_empty:
                    continue
            else:
                cut_region_polygon = Polygon(region_polygon)

            order_index = reading_order.index((min_y, min_x, max_y, max_x))
            region_id = '%s_region%04d' % (page_id, i)
            regionRefIndex = RegionRefIndexedType(index=order_index,
                                                  regionRef=region_id)
            order_group.add_RegionRefIndexed(regionRefIndex)

        reading_order_object = ReadingOrderType()
        reading_order_object.set_OrderedGroup(order_group)
        page.set_ReadingOrder(reading_order_object)

        for i in range(len(r['rois'])):
            width, height, _ = img_array.shape
            min_x = r['rois'][i][0]
            min_y = r['rois'][i][1]
            max_x = r['rois'][i][2]
            max_y = r['rois'][i][3]

            if (min_y - 5) > width and r['class_ids'][i] == 2:
                min_y -= 5
            if (max_y + 10) < width and r['class_ids'][i] == 2:
                min_y += 10

            # one change here to resolve flipped coordinates
            region_polygon = [[min_y, min_x], [max_y, min_x], [max_y, max_x],
                              [min_y, max_x]]

            cut_region_polygon = border.intersection(Polygon(region_polygon))

            if cut_region_polygon.is_empty:
                continue
            cut_region_polygon = [
                j for j in zip(list(cut_region_polygon.exterior.coords.xy[0]),
                               list(cut_region_polygon.exterior.coords.xy[1]))
            ][:-1]

            # checking whether coordinates are flipped

            region_polygon = coordinates_for_segment(cut_region_polygon,
                                                     page_image, page_xywh)
            region_points = points_from_polygon(region_polygon)

            read_order = reading_order.index((min_y, min_x, max_y, max_x))

            # this can be tested, provided whether we need previous comments or not?
            # resolving overlapping problem

            region_img = img_array[min_x:max_x, min_y:
                                   max_y]  # extract from points and img_array

            region_img = ocrolib.array2pil(region_img)

            file_id = make_file_id(input_file, self.output_file_grp)
            file_path = self.workspace.save_image_file(
                region_img,
                file_id + "_" + str(i),
                page_id=page_id,
                file_grp=self.output_file_grp)

            # ai = AlternativeImageType(filename=file_path, comments=page_xywh['features'])
            region_id = '%s_region%04d' % (page_id, i)
            coords = CoordsType(region_points)

            # incase of imageRegion
            if r['class_ids'][i] == 15:
                image_region = ImageRegionType(
                    custom='readingOrder {index:' + str(read_order) + ';}',
                    id=region_id,
                    Coords=coords,
                    type_=class_names[r['class_ids'][i]])
                # image_region.add_AlternativeImage(ai)
                page.add_ImageRegion(image_region)
                continue
            if r['class_ids'][i] == 16:
                table_region = TableRegionType(
                    custom='readingOrder {index:' + str(read_order) + ';}',
                    id=region_id,
                    Coords=coords,
                    type_=class_names[r['class_ids'][i]])
                # table_region.add_AlternativeImage(ai)
                page.add_TableRegion(table_region)
                continue
            if r['class_ids'][i] == 17:
                graphic_region = GraphicRegionType(
                    custom='readingOrder {index:' + str(read_order) + ';}',
                    id=region_id,
                    Coords=coords,
                    type_=class_names[r['class_ids'][i]])
                # graphic_region.add_AlternativeImage(ai)
                page.add_GraphicRegion(graphic_region)
                continue

            textregion = TextRegionType(custom='readingOrder {index:' +
                                        str(read_order) + ';}',
                                        id=region_id,
                                        Coords=coords,
                                        type_=class_names[r['class_ids'][i]])
            # textregion.add_AlternativeImage(ai)

            #border = page.get_Border()
            # if border:
            #    border.add_TextRegion(textregion)
            # else:
            page.add_TextRegion(textregion)
Exemplo n.º 19
0
    def process(self):
        """Detect border, reading order and text regions for each input page.

        For every input file, the page image (selected so that it is neither
        cropped, binarized nor grayscale-normalized) is saved as a temporary
        PNG and handed to ``textline_detector``. The PAGE file written by the
        detector is parsed, and its Border, ReadingOrder and TextRegions are
        merged into the input PAGE; Border and TextRegion coordinates are
        passed through ``coordinates_for_segment`` beforehand. The result is
        added to the output file group along with a ``processingStep``
        metadata item recording the parameters.
        """
        log = getLogger('processor.OcrdSbbTextlineDetectorRecognize')
        assert_file_grp_cardinality(self.input_file_grp, 1)
        assert_file_grp_cardinality(self.output_file_grp, 1)

        def translated_coords(segment, image, coords):
            """Return new Coords for ``segment``, converted via ``coordinates_for_segment``."""
            polygon = polygon_from_points(segment.get_Coords().get_points())
            polygon = coordinates_for_segment(polygon, image, coords)
            return CoordsType(points=points_from_polygon(polygon))

        for (n, input_file) in enumerate(self.input_files):
            page_id = input_file.pageId or input_file.ID
            log.info("INPUT FILE %i / %s", n, input_file)

            file_id = make_file_id(input_file, self.output_file_grp)

            # Process the files
            try:
                os.mkdir(self.output_file_grp)
            except FileExistsError:
                pass

            pcgts = page_from_file(self.workspace.download_file(input_file))
            page = pcgts.get_Page()
            page_image, page_coords, _ = \
                self.workspace.image_from_page(
                        page, page_id,
                        feature_filter='cropped,binarized,grayscale_normalized'
                )

            with tempfile.TemporaryDirectory() as tmp_dirname:
                # Save the image. mkstemp() hands back an already opened
                # OS-level descriptor; the image is written via its path,
                # so close the descriptor right away instead of leaking it.
                image_fd, image_file = tempfile.mkstemp(dir=tmp_dirname,
                                                        suffix='.png')
                os.close(image_fd)
                page_image.save(image_file)

                # Segment the image
                model = self.parameter['model']
                x = textline_detector(image_file, tmp_dirname, file_id, model)
                x.run()

                # Read segmentation results
                # (parsed before the temporary directory gets removed)
                tmp_filename = os.path.join(tmp_dirname, file_id) + '.xml'
                tmp_pcgts = ocrd_models.ocrd_page.parse(tmp_filename,
                                                        silence=True)
                tmp_page = tmp_pcgts.get_Page()

            # Create a new PAGE file from the input file
            pcgts.set_pcGtsId(file_id)

            # Merge results → PAGE file

            # 1. Border
            if page.get_Border():
                log.warning("Page already contained a border")
            text_border = tmp_page.get_Border()
            if text_border is None:
                # nothing to merge; keep whatever border the page had
                log.warning("Segmentation result for page '%s' has no border",
                            page_id)
            else:
                # We need to translate the coordinates:
                text_border.set_Coords(
                    translated_coords(text_border, page_image, page_coords))
                page.set_Border(text_border)

            # 2. ReadingOrder
            if page.get_ReadingOrder():
                log.warning("Page already contained a reading order")
            page.set_ReadingOrder(tmp_page.get_ReadingOrder())

            # 3. TextRegion
            if page.get_TextRegion():
                log.warning("Page already contained text regions")
            # We need to translate the coordinates:
            text_regions_new = []
            for text_region in tmp_page.get_TextRegion():
                text_region.set_Coords(
                    translated_coords(text_region, page_image, page_coords))
                text_regions_new.append(text_region)
            page.set_TextRegion(text_regions_new)

            # Save metadata about this operation
            metadata = pcgts.get_Metadata()
            metadata.add_MetadataItem(
                MetadataItemType(
                    type_="processingStep",
                    name=self.ocrd_tool['steps'][0],
                    value=TOOL,
                    Labels=[
                        LabelsType(externalModel="ocrd-tool",
                                   externalId="parameters",
                                   Label=[
                                       LabelType(type_=name,
                                                 value=self.parameter[name])
                                       for name in self.parameter.keys()
                                   ])
                    ]))

            self.workspace.add_file(
                ID=file_id,
                file_grp=self.output_file_grp,
                pageId=page_id,
                mimetype='application/vnd.prima.page+xml',
                local_filename=os.path.join(self.output_file_grp, file_id) +
                '.xml',
                content=ocrd_models.ocrd_page.to_xml(pcgts))
Exemplo n.º 20
0
    def process(self):
        """Extract region images from the workspace.

        Open and deserialize PAGE input files and their respective images,
        then iterate over the element hierarchy down to the region level.

        Extract an image for each region (which depending on the workflow
        can already be deskewed, dewarped, binarized etc.), cropped to its
        minimal bounding box, and masked by the coordinate polygon outline.
        If ``transparency`` is true, then also add an alpha channel which is
        fully transparent outside of the mask.

        Create a JSON file with:
        * the IDs of the region and its parents,
        * the region's coordinates relative to the region image,
        * the region's absolute coordinates,
        * the (text) region's text content (if any),
        * the (text) region's TextStyle (if any),
        * the (text) region's @production (if any),
        * the (text) region's @readingDirection (if any),
        * the (text) region's @textLineOrder (if any),
        * the (text) region's @primaryScript (if any),
        * the (text) region's @primaryLanguage (if any),
        * the region's AlternativeImage/@comments (features),
        * the region's element class,
        * the region's @type,
        * the page's @type,
        * the page's DPI value.

        Write all files in the directory of the output file group, named like so:
        * ID + '.raw.png': region image (if the workflow provides raw images)
        * ID + '.bin.png': region image (if the workflow provides binarized images)
        * ID + '.nrm.png': region image (if the workflow provides grayscale-normalized images)
        * ID + '.json': region metadata.
        """
        # pylint: disable=attribute-defined-outside-init
        transparency = self.parameter['transparency']
        for n, input_file in enumerate(self.input_files):
            file_id = input_file.ID.replace(self.input_file_grp, self.output_file_grp)
            if file_id == input_file.ID:
                file_id = concat_padded(self.output_file_grp, n)
            page_id = input_file.pageId or input_file.ID
            LOG.info("INPUT FILE %i / %s", n, page_id)
            pcgts = page_from_file(self.workspace.download_file(input_file))
            page = pcgts.get_Page()
            metadata = pcgts.get_Metadata() # ensured by from_file()
            metadata.add_MetadataItem(
                MetadataItemType(type_="processingStep",
                                 name=self.ocrd_tool['steps'][0],
                                 value=TOOL,
                                 Labels=[LabelsType(
                                     externalModel="ocrd-tool",
                                     externalId="parameters",
                                     Label=[LabelType(type_=name,
                                                      value=self.parameter[name])
                                            for name in self.parameter.keys()])]))
            page_image, page_coords, page_image_info = self.workspace.image_from_page(
                page, page_id,
                transparency=transparency)
            # a resolution of 1 is treated as "no DPI information available"
            if page_image_info.resolution != 1:
                dpi = page_image_info.resolution
                if page_image_info.resolutionUnit == 'cm':
                    # convert pixels per centimetre to pixels per inch
                    dpi = round(dpi * 2.54)
            else:
                dpi = None
            ptype = page.get_type()

            regions = { 'advert': page.get_AdvertRegion(),
                        'text': page.get_TextRegion(),
                        'table': page.get_TableRegion(),
                        'chart': page.get_ChartRegion(),
                        'chem': page.get_ChemRegion(),
                        'graphic': page.get_GraphicRegion(),
                        'image': page.get_ImageRegion(),
                        'linedrawing': page.get_LineDrawingRegion(),
                        'maths': page.get_MathsRegion(),
                        'music': page.get_MusicRegion(),
                        'noise': page.get_NoiseRegion(),
                        'separator': page.get_SeparatorRegion(),
                        'unknown': page.get_UnknownRegion()
            }
            for rtype, rlist in regions.items():
                for region in rlist:
                    description = { 'region.ID': region.id, 'region.type': rtype }
                    region_image, region_coords = self.workspace.image_from_segment(
                        region, page_image, page_coords,
                        transparency=transparency)
                    # get_type() is only queried for these region classes
                    description['subtype'] = region.get_type() if rtype in ['text', 'chart', 'graphic'] else None
                    description['coords_rel'] = coordinates_of_segment(
                        region, region_image, region_coords).tolist()
                    description['coords_abs'] = polygon_from_points(region.get_Coords().points)
                    if rtype == 'text':
                        rtext = region.get_TextEquiv()
                        if rtext:
                            description['region.text'] = rtext[0].Unicode
                        else:
                            description['region.text'] = ''
                        # region-level annotation takes priority over page-level
                        rstyle = region.get_TextStyle() or page.get_TextStyle()
                        if rstyle:
                            description['region.style'] = {
                                'fontFamily': rstyle.fontFamily,
                                'fontSize': rstyle.fontSize,
                                'xHeight': rstyle.xHeight,
                                'kerning': rstyle.kerning,
                                'serif': rstyle.serif,
                                'monospace': rstyle.monospace,
                                'bold': rstyle.bold,
                                'italic': rstyle.italic,
                                'smallCaps': rstyle.smallCaps,
                                'letterSpaced': rstyle.letterSpaced,
                                'strikethrough': rstyle.strikethrough,
                                'underlined': rstyle.underlined,
                                'underlineStyle': rstyle.underlineStyle,
                                'subscript': rstyle.subscript,
                                'superscript': rstyle.superscript
                            }
                        description['production'] = region.get_production()
                        description['readingDirection'] = (
                            region.get_readingDirection() or
                            page.get_readingDirection())
                        description['textLineOrder'] = (
                            region.get_textLineOrder() or
                            page.get_textLineOrder())
                        description['primaryScript'] = (
                            region.get_primaryScript() or
                            page.get_primaryScript())
                        description['primaryLanguage'] = (
                            region.get_primaryLanguage() or
                            page.get_primaryLanguage())
                    description['features'] = region_coords['features']
                    description['DPI']= dpi
                    description['page.ID'] = page_id
                    description['page.type'] = ptype
                    description['file_grp'] = self.input_file_grp
                    description['METS.UID'] = self.workspace.mets.unique_identifier
                    if 'binarized' in region_coords['features']:
                        extension = '.bin'
                    elif 'grayscale_normalized' in region_coords['features']:
                        extension = '.nrm'
                    else:
                        extension = '.raw'

                    file_path = self.workspace.save_image_file(
                        region_image,
                        file_id + '_' + region.id + extension,
                        self.output_file_grp,
                        page_id=page_id,
                        format='PNG')
                    # derive the metadata path from the image path
                    file_path = file_path.replace(extension + '.png', '.json')
                    # use a context manager so the handle is flushed and
                    # closed deterministically for every region
                    with open(file_path, 'w') as json_file:
                        json.dump(description, json_file)
Exemplo n.º 21
0
 def test_polygon_from_points(self):
     """A PAGE points string must parse into a list of [x, y] integer pairs."""
     points = '100,100 200,100 200,200 100,200'
     expected = [[100, 100], [200, 100], [200, 200], [100, 200]]
     self.assertEqual(polygon_from_points(points), expected)
Exemplo n.º 22
0
    def process(self):
        """Performs segmentation evaluation with Shapely on the workspace.

        Open and deserialize PAGE input files and their respective images,
        then iterate over the element hierarchy down to the region level.

        Compare all pairs of text regions on each page and log a warning
        for every pair covering (almost) the same area and for every region
        containing another one.

        If the ``plausibilize`` parameter is true, then remove the redundant
        regions (the later one of two equal regions, and any region
        contained in another) from the page.

        Add the resulting PAGE file to the output file group.
        """
        plausibilize = self.parameter['plausibilize']

        for (n, input_file) in enumerate(self.input_files):
            page_id = input_file.pageId or input_file.ID
            LOG.info("INPUT FILE %i / %s", n, page_id)
            pcgts = page_from_file(self.workspace.download_file(input_file))
            metadata = pcgts.get_Metadata()  # ensured by from_file()
            metadata.add_MetadataItem(
                MetadataItemType(
                    type_="processingStep",
                    name=self.ocrd_tool['steps'][0],
                    value=TOOL,
                    # FIXME: externalRef is invalid by pagecontent.xsd, but ocrd does not reflect this
                    # what we want here is `externalModel="ocrd-tool" externalId="parameters"`
                    Labels=[
                        LabelsType(  #externalRef="parameters",
                            Label=[
                                LabelType(type_=name,
                                          value=self.parameter[name])
                                for name in self.parameter.keys()
                            ])
                    ]))
            page = pcgts.get_Page()

            regions = page.get_TextRegion()
            # build every region's polygon once, instead of once per pair
            region_polys = [
                Polygon(polygon_from_points(region.get_Coords().points))
                for region in regions
            ]

            # indices (into regions) of the regions considered redundant
            mark_for_deletion = set()

            for i, region1 in enumerate(regions):
                region_poly1 = region_polys[i]
                for j in range(i + 1, len(regions)):
                    region2 = regions[j]
                    region_poly2 = region_polys[j]
                    LOG.info('Comparing regions "%s" and "%s"', region1.id,
                             region2.id)

                    LOG.debug('Checking for equality ...')
                    # same check as the deprecated almost_equals() with its
                    # default of 6 decimal places
                    equality = region_poly1.equals_exact(region_poly2,
                                                         0.5 * 10**(-6))
                    if equality:
                        LOG.warning(
                            'Warning: regions %s and %s cover the same area.',
                            region1.id, region2.id)
                        mark_for_deletion.add(j)

                    LOG.debug('Checking for containment ...')
                    containment_r = region_poly1.contains(region_poly2)
                    containment_l = region_poly2.contains(region_poly1)
                    if containment_r:
                        LOG.warning('Warning: %s contains %s', region1.id,
                                    region2.id)
                        mark_for_deletion.add(j)
                    if containment_l:
                        LOG.warning('Warning: %s contains %s', region2.id,
                                    region1.id)
                        mark_for_deletion.add(i)

            if plausibilize:
                # keep only the regions which have not been marked
                new_regions = [
                    region for i, region in enumerate(regions)
                    if i not in mark_for_deletion
                ]
                page.set_TextRegion(new_regions)

            # Use input_file's basename for the new file -
            # this way the files retain the same basenames:
            file_id = input_file.ID.replace(self.input_file_grp,
                                            self.output_file_grp)
            if file_id == input_file.ID:
                file_id = concat_padded(self.output_file_grp, n)
            self.workspace.add_file(ID=file_id,
                                    file_grp=self.output_file_grp,
                                    pageId=input_file.pageId,
                                    mimetype=MIMETYPE_PAGE,
                                    local_filename=os.path.join(
                                        self.output_file_grp,
                                        file_id + '.xml'),
                                    content=to_xml(pcgts))
Exemplo n.º 23
0
 def polygon_position(child, horizontal=sort_horizontal):
     """Return the centroid coordinate of ``child``'s outline.

     The x coordinate is returned if ``horizontal`` is true,
     the y coordinate otherwise.
     """
     outline = polygon_from_points(child.get_Coords().points)
     center = Polygon(outline).centroid
     return center.x if horizontal else center.y
Exemplo n.º 24
0
    def image_from_page(self,
                        page,
                        page_id,
                        fill='background',
                        transparency=False,
                        feature_selector='',
                        feature_filter=''):
        """Extract an image for a PAGE-XML page from the workspace.

        Given ``page``, a PAGE PageType object, extract its PIL.Image,
        either from its AlternativeImage (if it exists), or from its
        @imageFilename (otherwise). Also crop it, if a Border exists,
        and rotate it, if any @orientation angle is annotated.

        If ``feature_selector`` and/or ``feature_filter`` is given, then
        select/filter among the @imageFilename image and the available
        AlternativeImages the last one which contains all of the selected,
        but none of the filtered features (i.e. @comments classes), or
        raise an error.

        (Required and produced features need not be in the same order, so
        ``feature_selector`` is merely a mask specifying Boolean AND, and
        ``feature_filter`` is merely a mask specifying Boolean OR.)

        If the chosen image does not have the feature "cropped" yet, but
        a Border exists, and unless "cropped" is being filtered, then crop it.
        Likewise, if the chosen image does not have the feature "deskewed" yet,
        but an @orientation angle is annotated, and unless "deskewed" is being
        filtered, then rotate it. (However, if @orientation is above the
        [-45°,45°] interval, then apply as much transposition as possible first,
        unless "rotated-90" / "rotated-180" / "rotated-270" is being filtered.)

        Cropping uses a polygon mask (not just the bounding box rectangle).
        Areas outside the polygon will be filled according to ``fill``:

        - if ``background`` (the default),
          then fill with the median color of the image;
        - otherwise, use the given color, e.g. ``white`` or (255,255,255).

        Moreover, if ``transparency`` is true, and unless the image already
        has an alpha channel, then add an alpha channel which is fully opaque
        before cropping and rotating. (Thus, only the exposed areas will be
        transparent afterwards, for those that can interpret alpha channels).

        Return a tuple:

         * the extracted image,
         * a dictionary with information about the extracted image:

           - ``transform``: a Numpy array with an affine transform which
             converts from absolute coordinates to those relative to the image,
             i.e. after cropping to the page's border / bounding box (if any)
             and deskewing with the page's orientation angle (if any)
           - ``angle``: the rotation/reflection angle applied to the image so far,
           - ``features``: the AlternativeImage @comments for the image, i.e.
             names of all operations that lead up to this result,

         * an OcrdExif instance associated with the original image.

        (The first two can be used to annotate a new AlternativeImage,
         or be passed down with ``image_from_segment``.)

        Example:

         * get a raw (colored) but already deskewed and cropped image:

           ``
           page_image, page_coords, page_image_info = workspace.image_from_page(
                 page, page_id,
                 feature_selector='deskewed,cropped',
                 feature_filter='binarized,grayscale_normalized')
           ``
        """
        log = getLogger('ocrd.workspace.image_from_page')
        page_image = self._resolve_image_as_pil(page.imageFilename)
        page_image_info = OcrdExif(page_image)
        page_coords = dict()
        # use identity as initial affine coordinate transform:
        page_coords['transform'] = np.eye(3)
        # interim bbox (updated with each change to the transform):
        page_bbox = [0, 0, page_image.width, page_image.height]
        page_xywh = {
            'x': 0,
            'y': 0,
            'w': page_image.width,
            'h': page_image.height
        }

        border = page.get_Border()
        # page angle: PAGE @orientation is defined clockwise,
        # whereas PIL/ndimage rotation is in mathematical direction:
        page_coords['angle'] = -(page.get_orientation() or 0)
        # map angle from (-180,180] to [0,360], and partition into multiples of 90;
        # but avoid unnecessary large remainders, i.e. split symmetrically:
        orientation = (page_coords['angle'] + 45) % 360
        orientation = orientation - (orientation % 90)
        skew = (page_coords['angle'] % 360) - orientation
        skew = 180 - (180 - skew) % 360  # map to [-45,45]
        page_coords['angle'] = 0  # nothing applied yet (depends on filters)
        log.debug("page '%s' has %s orientation=%d skew=%.2f", page_id,
                  "border," if border else "", orientation, skew)

        # initialize AlternativeImage@comments classes as empty:
        page_coords['features'] = ''
        alternative_image = None
        alternative_images = page.get_AlternativeImage()
        if alternative_images:
            # (e.g. from page-level cropping, binarization, deskewing or despeckling)
            if feature_selector or feature_filter:
                alternative_image = None
                # search from the end, because by convention we always append,
                # and among multiple satisfactory images we want the most recent:
                for alternative_image in reversed(alternative_images):
                    features = alternative_image.get_comments()
                    if (all(feature in features
                            for feature in feature_selector.split(',')
                            if feature) and
                            not any(feature in features
                                    for feature in feature_filter.split(',')
                                    if feature)):
                        break
                    else:
                        alternative_image = None
            else:
                alternative_image = alternative_images[-1]
                features = alternative_image.get_comments()
            if alternative_image:
                log.debug("Using AlternativeImage %d (%s) for page '%s'",
                          alternative_images.index(alternative_image) + 1,
                          features, page_id)
                page_image = self._resolve_image_as_pil(
                    alternative_image.get_filename())
                page_coords['features'] = features

        # adjust the coord transformation to the steps applied on the image,
        # and apply steps on the existing image in case it is missing there,
        # but traverse all steps (crop/reflect/rotate) in a particular order:
        # - existing image features take priority (in the order annotated),
        # - next is cropping (if necessary but not already applied),
        # - next is reflection (if necessary but not already applied),
        # - next is rotation (if necessary but not already applied).
        # This helps deal with arbitrary workflows (e.g. crop then deskew,
        # or deskew then crop), regardless of where images are generated.
        alternative_image_features = page_coords['features'].split(',')
        for i, feature in enumerate(
                alternative_image_features + (['cropped'] if (
                    border and not 'cropped' in page_coords['features']
                    and not 'cropped' in feature_filter.split(',')) else []) +
            (['rotated-%d' % orientation] if
             (orientation and not 'rotated-%d' %
              orientation in page_coords['features'] and not 'rotated-%d' %
              orientation in feature_filter.split(',')) else []) +
            (['deskewed'] if
             (skew and not 'deskewed' in page_coords['features']
              and not 'deskewed' in feature_filter.split(',')) else []) +
                # not a feature to be added, but merely as a fallback position
                # to always enter loop at i == len(alternative_image_features)
            ['_check']):
            # image geometry vs feature consistency can only be checked
            # after all features on the existing AlternativeImage have
            # been adjusted for in the transform, and when there is a mismatch,
            # additional steps applied here would only repeat the respective
            # error message; so we only check once at the boundary between
            # existing and new features
            # FIXME we should check/enforce consistency when _adding_ AlternativeImage
            if (i == len(alternative_image_features)
                    and not (page_xywh['w'] - 2 < page_image.width <
                             page_xywh['w'] + 2 and page_xywh['h'] - 2 <
                             page_image.height < page_xywh['h'] + 2)):
                log.error(
                    'page "%s" image (%s; %dx%d) has not been cropped properly (%dx%d)',
                    page_id, page_coords['features'], page_image.width,
                    page_image.height, page_xywh['w'], page_xywh['h'])
            # adjust transform to feature, possibly apply feature to image
            if feature == 'cropped':
                page_points = border.get_Coords().points
                log.debug(
                    "Using explicitly set page border '%s' for page '%s'",
                    page_points, page_id)
                # get polygon outline of page border:
                page_polygon = np.array(polygon_from_points(page_points),
                                        dtype=np.int32)
                page_polygon = transform_coordinates(page_polygon,
                                                     page_coords['transform'])
                page_polygon = np.round(page_polygon).astype(np.int32)
                page_bbox = bbox_from_polygon(page_polygon)
                # get size of the page after cropping but before rotation:
                page_xywh = xywh_from_bbox(*page_bbox)
                # subtract offset in affine coordinate transform:
                # (consistent with image cropping or AlternativeImage below)
                page_coords['transform'] = shift_coordinates(
                    page_coords['transform'],
                    np.array([-page_xywh['x'], -page_xywh['y']]))
                # crop, if (still) necessary:
                if not 'cropped' in page_coords['features']:
                    log.debug(
                        "Cropping %s for page '%s' to border",
                        "AlternativeImage" if alternative_image else "image",
                        page_id)
                    # create a mask from the page polygon:
                    page_image = image_from_polygon(page_image,
                                                    page_polygon,
                                                    fill=fill,
                                                    transparency=transparency)
                    # recrop into page rectangle:
                    page_image = crop_image(page_image, box=page_bbox)
                    page_coords['features'] += ',cropped'

            elif feature == 'rotated-%d' % orientation:
                # Transpose in affine coordinate transform:
                # (consistent with image transposition or AlternativeImage below)
                transposition = {
                    90: Image.ROTATE_90,
                    180: Image.ROTATE_180,
                    270: Image.ROTATE_270
                }.get(orientation)  # no default
                page_coords['transform'] = transpose_coordinates(
                    page_coords['transform'], transposition,
                    np.array([0.5 * page_xywh['w'], 0.5 * page_xywh['h']]))
                (page_xywh['w'],
                 page_xywh['h']) = adjust_canvas_to_transposition(
                     [page_xywh['w'], page_xywh['h']], transposition)
                page_coords['angle'] = orientation
                # transpose, if (still) necessary:
                if not 'rotated-%d' % orientation in page_coords['features']:
                    log.info(
                        "Transposing %s for page '%s' by %d°",
                        "AlternativeImage" if alternative_image else "image",
                        page_id, orientation)
                    page_image = transpose_image(
                        page_image, {
                            90: Image.ROTATE_90,
                            180: Image.ROTATE_180,
                            270: Image.ROTATE_270
                        }.get(orientation))  # no default
                    page_coords['features'] += ',rotated-%d' % orientation
            elif feature == 'deskewed':
                # Rotate around center in affine coordinate transform:
                # (consistent with image rotation or AlternativeImage below)
                page_coords['transform'] = rotate_coordinates(
                    page_coords['transform'], skew,
                    np.array([0.5 * page_xywh['w'], 0.5 * page_xywh['h']]))
                page_coords['angle'] += skew
                # deskew, if (still) necessary:
                if not 'deskewed' in page_coords['features']:
                    log.info(
                        "Rotating %s for page '%s' by %.2f°",
                        "AlternativeImage" if alternative_image else "image",
                        page_id, skew)
                    page_image = rotate_image(page_image,
                                              skew,
                                              fill=fill,
                                              transparency=transparency)
                    page_coords['features'] += ',deskewed'
                (page_xywh['w'], page_xywh['h']) = adjust_canvas_to_rotation(
                    [page_xywh['w'], page_xywh['h']], skew)

        # verify constraints again:
        if not all(feature in page_coords['features']
                   for feature in feature_selector.split(',') if feature):
            raise Exception(
                'Found no AlternativeImage that satisfies all requirements ' +
                'selector="%s" in page "%s"' % (feature_selector, page_id))
        if any(feature in page_coords['features']
               for feature in feature_filter.split(',') if feature):
            raise Exception(
                'Found no AlternativeImage that satisfies all requirements ' +
                'filter="%s" in page "%s"' % (feature_filter, page_id))
        page_image.format = 'PNG'  # workaround for tesserocr#194
        return page_image, page_coords, page_image_info
Exemplo n.º 25
0
def validate_consistency(node, page_textequiv_consistency,
                         page_textequiv_strategy, check_baseline, check_coords,
                         report, file_id):
    """
    Check whether the text results on an element is consistent with its child element text results,
    and whether the coordinates of an element are fully within its parent element coordinates.

    The function recurses through the levels listed in ``_HIERARCHY`` and
    stops at glyphs.

    Arguments:
        node: the element to validate; a ``PcGtsType`` starts the recursion
            at its page, a ``GlyphType`` ends it.
        page_textequiv_consistency: ``'off'`` skips the text comparison,
            ``'fix'`` overwrites the element text with the concatenated
            child text, ``'strict'`` reports every mismatch, and any other
            value only reports mismatches that persist when whitespace is
            ignored.
        page_textequiv_strategy: passed on to ``concatenate``, ``get_text``
            and ``set_text``.
        check_baseline: whether to validate text line baselines.
        check_coords: whether to validate child-in-parent containment.
        report: collector receiving the errors via ``add_error``.
        file_id: identifier of the file, stored in each reported error.

    Returns:
        ``True`` if neither this element nor any of its descendants showed
        an inconsistency, ``False`` otherwise. Any textual mismatch yields
        ``False``, even when it was repaired or not reported.
    """
    if isinstance(node, PcGtsType):
        # top-level (start recursion)
        node_id = node.get_pcGtsId()
        node = node.get_Page()  # has no .id
    elif isinstance(node, GlyphType):
        # terminal level (end recursion)
        return True
    else:
        node_id = node.id
    tag = node.original_tagname_
    log.debug("Validating %s %s", tag, node_id)
    consistent = True
    # outline of this element; stays None when there is nothing usable
    # to compare children or baselines against
    node_poly = None
    parent_points = None
    if check_coords or check_baseline:
        if isinstance(node, PageType):
            # a page has no coordinates of its own, only an optional border
            parent = node.get_Border()
        else:
            parent = node
        if parent:
            parent_points = parent.get_Coords().points
            node_poly = Polygon(polygon_from_points(parent_points))
            reason = ''
            if not node_poly.is_valid:
                reason = explain_validity(node_poly)
            elif node_poly.is_empty:
                reason = 'is empty'
            elif node_poly.bounds[0] < 0 or node_poly.bounds[1] < 0:
                reason = 'is negative'
            elif node_poly.length < 4:
                reason = 'has too few points'
            if reason:
                report.add_error(
                    CoordinateValidityError(tag, node_id, file_id,
                                            parent_points, reason))
                log.debug("Invalid coords of %s %s", tag, node_id)
                consistent = False
                # Do not use a broken outline in further comparisons: the
                # same conditions already exempt a child from the
                # containment check below, and predicates on invalid
                # geometries are unreliable.
                node_poly = None
    for class_, getter, concatenate_with in _HIERARCHY:
        if not isinstance(node, class_):
            continue
        children = getattr(node, getter)()
        for child in children:
            consistent = (validate_consistency(
                child, page_textequiv_consistency, page_textequiv_strategy,
                check_baseline, check_coords, report, file_id) and consistent)
            if check_coords and node_poly:
                child_tag = child.original_tagname_
                child_points = child.get_Coords().points
                child_poly = Polygon(polygon_from_points(child_points))
                if (not child_poly.is_valid or child_poly.is_empty
                        or child_poly.bounds[0] < 0 or child_poly.bounds[1] < 0
                        or child_poly.length < 4):
                    pass  # already reported in recursive call above
                elif not child_poly.within(node_poly):
                    # TODO: automatic repair?
                    # the error concerns the child, so it must carry the
                    # child's tag along with the child's id
                    report.add_error(
                        CoordinateConsistencyError(child_tag, child.id,
                                                   file_id, parent_points,
                                                   child_points))
                    log.debug("Inconsistent coords of %s %s", child_tag,
                              child.id)
                    consistent = False
        if isinstance(node,
                      TextLineType) and check_baseline and node.get_Baseline():
            baseline_points = node.get_Baseline().points
            baseline_line = LineString(polygon_from_points(baseline_points))
            reason = ''
            if not baseline_line.is_valid:
                reason = explain_validity(baseline_line)
            elif baseline_line.is_empty:
                reason = 'is empty'
            elif baseline_line.bounds[0] < 0 or baseline_line.bounds[1] < 0:
                reason = 'is negative'
            elif baseline_line.length < 2:
                reason = 'has too few points'
            if reason:
                report.add_error(
                    CoordinateValidityError("Baseline", node_id, file_id,
                                            baseline_points, reason))
                log.debug("Invalid coords of baseline in %s", node_id)
                consistent = False
            elif (node_poly is not None
                  and not baseline_line.within(node_poly)):
                report.add_error(
                    CoordinateConsistencyError("Baseline", node_id, file_id,
                                               parent_points, baseline_points))
                log.debug("Inconsistent coords of baseline in %s %s", tag,
                          node_id)
                consistent = False
        if concatenate_with is not None and page_textequiv_consistency != 'off':
            # validate textual consistency of node with children
            concatenated = concatenate(children, concatenate_with,
                                       page_textequiv_strategy)
            text_results = get_text(node, page_textequiv_strategy)
            if concatenated and text_results and concatenated != text_results:
                consistent = False
                if page_textequiv_consistency == 'fix':
                    log.debug("Repaired text of %s %s", tag, node_id)
                    set_text(node, concatenated, page_textequiv_strategy)
                elif (page_textequiv_consistency == 'strict'  # or 'lax' but...
                      or not compare_without_whitespace(
                          concatenated, text_results)):
                    log.debug("Inconsistent text of %s %s", tag, node_id)
                    report.add_error(
                        ConsistencyError(tag, node_id, file_id, text_results,
                                         concatenated))
    return consistent
Exemplo n.º 26
0
    def process(self):
        """Evaluate the region segmentation of each page with Shapely.

        For every input file, deserialize the PAGE document and record this
        processing step (with all parameters) in its metadata. Then:

        - validate the coordinates via ``validate_coords``,
        - if parameter ``sanitize`` is set, call ``sanitize_page``,
        - compare all top-level text regions pairwise and log a warning
          for each pair that is (almost) equal, contained in one another,
          or overlapping by more than ``plausibilize_merge_min_overlap``,
        - if parameter ``plausibilize`` is set, pass the regions marked
          that way to ``_plausibilize_group`` for removal or merging.

        Finally, serialize the page into the output file group.
        """
        do_sanitize = self.parameter['sanitize']
        do_plausibilize = self.parameter['plausibilize']
        # suffixes telling the reader of the log whether action follows
        removing_note = '(removing)' if do_plausibilize else ''
        merging_note = '(merging)' if do_plausibilize else ''

        for n, input_file in enumerate(self.input_files):
            page_id = input_file.pageId or input_file.ID
            LOG.info("INPUT FILE %i / %s", n, page_id)
            pcgts = page_from_file(self.workspace.download_file(input_file))
            page = pcgts.get_Page()

            # document this processing step in the PAGE metadata
            metadata = pcgts.get_Metadata()  # ensured by from_file()
            step_name = self.ocrd_tool['steps'][0]
            parameter_labels = [
                LabelType(type_=name, value=self.parameter[name])
                for name in self.parameter.keys()
            ]
            metadata.add_MetadataItem(
                MetadataItemType(
                    type_="processingStep",
                    name=step_name,
                    value=TOOL,
                    Labels=[
                        LabelsType(externalModel="ocrd-tool",
                                   externalId="parameters",
                                   Label=parameter_labels)
                    ]))

            # validate segmentation
            # (warn of children extending beyond their parents)
            self.validate_coords(page, page_id)

            # sanitize region segmentation (shrink to hull of lines)
            if do_sanitize:
                self.sanitize_page(page, page_id)

            # plausibilize region segmentation
            # (remove redundant text regions)
            mark_for_deletion = []  # ids of regions to be removed
            mark_for_merging = {}  # region id -> region to merge it into

            # TODO: cover recursive region structure (but compare only at the same level)
            # Sorting by area gives a total order that is compatible with
            # the containment/equivalence relation, so a region is never
            # substituted by a superregion which itself has been
            # substituted or deleted.
            RegionPolygon = namedtuple('RegionPolygon', ['region', 'polygon'])
            regionspolys = sorted(
                [
                    RegionPolygon(
                        region,
                        Polygon(
                            polygon_from_points(region.get_Coords().points)))
                    for region in page.get_TextRegion()
                ],
                key=lambda pair: pair.polygon.area)

            for i, (region1, poly1) in enumerate(regionspolys):
                for region2, poly2 in regionspolys[i + 1:]:
                    LOG.debug('Comparing regions "%s" and "%s"', region1.id,
                              region2.id)

                    if poly1.almost_equals(poly2):
                        LOG.warning(
                            'Page "%s" region "%s" is almost equal to "%s" %s',
                            page_id, region2.id, region1.id, removing_note)
                        mark_for_deletion.append(region2.id)
                        continue
                    if poly1.contains(poly2):
                        LOG.warning('Page "%s" region "%s" is within "%s" %s',
                                    page_id, region2.id, region1.id,
                                    removing_note)
                        mark_for_deletion.append(region2.id)
                        continue
                    if poly2.contains(poly1):
                        LOG.warning('Page "%s" region "%s" is within "%s" %s',
                                    page_id, region1.id, region2.id,
                                    removing_note)
                        mark_for_deletion.append(region1.id)
                        continue
                    if not poly1.overlaps(poly2):
                        # TODO: more merging cases...
                        continue

                    inter_poly = poly1.intersection(poly2)
                    union_poly = poly1.union(poly2)
                    # share of each region covered by the intersection
                    share1 = inter_poly.area / poly1.area
                    share2 = inter_poly.area / poly2.area
                    LOG.debug(
                        'Page "%s" region "%s" overlaps "%s" by %f/%f',
                        page_id, region1.id, region2.id, share1, share2)
                    if union_poly.convex_hull.area >= poly1.area + poly2.area:
                        # skip this pair -- combined polygon encloses previously free segments
                        continue
                    min_overlap = self.parameter[
                        'plausibilize_merge_min_overlap']
                    if share2 > min_overlap:
                        LOG.warning(
                            'Page "%s" region "%s" is almost within "%s" %s',
                            page_id, region2.id, region1.id, merging_note)
                        mark_for_merging[region2.id] = region1
                    elif share1 > min_overlap:
                        LOG.warning(
                            'Page "%s" region "%s" is almost within "%s" %s',
                            page_id, region1.id, region2.id, merging_note)
                        mark_for_merging[region1.id] = region2

            if do_plausibilize:
                # the reading order does not have to include all regions
                # but it may include all types of regions!
                ro = page.get_ReadingOrder()
                rogroup = (ro.get_OrderedGroup()
                           or ro.get_UnorderedGroup()) if ro else None
                # pass the regions sorted (see above)
                _plausibilize_group(regionspolys, rogroup, mark_for_deletion,
                                    mark_for_merging)

            # Derive the new file ID from the input file's ID, so the
            # files retain the same basenames across file groups:
            file_id = input_file.ID.replace(self.input_file_grp,
                                            self.output_file_grp)
            if file_id == input_file.ID:
                file_id = concat_padded(self.output_file_grp, n)
            out_path = os.path.join(self.output_file_grp, file_id + '.xml')
            self.workspace.add_file(ID=file_id,
                                    file_grp=self.output_file_grp,
                                    pageId=input_file.pageId,
                                    mimetype=MIMETYPE_PAGE,
                                    local_filename=out_path,
                                    content=to_xml(pcgts))
Exemplo n.º 27
0
def _plausibilize_group(regionspolys, rogroup, mark_for_deletion,
                        mark_for_merging):
    """Remove or merge the marked regions, keeping the reading order in sync.

    Arguments:
        regionspolys: sequence of items with a ``region`` and a ``polygon``
            attribute, one per candidate text region.
        rogroup: the reading order group to update (ordered or unordered,
            possibly indexed), or ``None`` if the page has no reading order.
        mark_for_deletion: collection of ids of regions to be removed.
        mark_for_merging: mapping from the id of a region to be dissolved
            to the region it gets merged into.

    Every region marked for merging has its polygon united with that of
    its target region (with warnings about any attributes that get lost).
    Every marked region is then removed from its parent element, and its
    reference (if any) from ``rogroup``. Nested reading order groups are
    processed recursively. In ordered groups, the remaining children are
    re-indexed consecutively.
    """
    wait_for_deletion = []
    reading_order = {}
    # stays empty if there is no (known kind of) reading order group,
    # e.g. when the page has no ReadingOrder and rogroup is None
    regionrefs = []
    ordered = False
    if isinstance(rogroup, (OrderedGroupType, OrderedGroupIndexedType)):
        regionrefs = (rogroup.get_RegionRefIndexed() +
                      rogroup.get_OrderedGroupIndexed() +
                      rogroup.get_UnorderedGroupIndexed())
        ordered = True
    if isinstance(rogroup, (UnorderedGroupType, UnorderedGroupIndexedType)):
        regionrefs = (rogroup.get_RegionRef() + rogroup.get_OrderedGroup() +
                      rogroup.get_UnorderedGroup())
    for elem in regionrefs:
        reading_order[elem.get_regionRef()] = elem
        if not isinstance(elem, (RegionRefType, RegionRefIndexedType)):
            # recursive reading order element (un/ordered group):
            _plausibilize_group(regionspolys, elem, mark_for_deletion,
                                mark_for_merging)
    for regionpoly in regionspolys:
        delete = regionpoly.region.id in mark_for_deletion
        merge = regionpoly.region.id in mark_for_merging
        if delete or merge:
            region = regionpoly.region
            poly = regionpoly.polygon
            if merge:
                # merge region with super region:
                superreg = mark_for_merging[region.id]
                # granularity will necessarily be lost here --
                # this is not for workflows/processors that already
                # provide good/correct segmentation and reading order
                # (in which case orientation, script and style detection
                #  can be expected as well), but rather as a postprocessor
                # for suboptimal segmentation (possibly before reading order
                # detection/correction); hence, all we now do here is
                # show warnings when granularity is lost; but there might
                # be good reasons to do more here when we have better processors
                # and use-cases in the future
                superpoly = Polygon(
                    polygon_from_points(superreg.get_Coords().points))
                superpoly = superpoly.union(poly)
                superreg.get_Coords().points = points_from_polygon(
                    superpoly.exterior.coords)
                # FIXME should we merge/mix attributes and features?
                if region.get_orientation() != superreg.get_orientation():
                    LOG.warning(
                        'Merging region "%s" with orientation %f into "%s" with %f',
                        region.id, region.get_orientation(), superreg.id,
                        superreg.get_orientation())
                if region.get_type() != superreg.get_type():
                    LOG.warning(
                        'Merging region "%s" with type %s into "%s" with %s',
                        region.id, region.get_type(), superreg.id,
                        superreg.get_type())
                if region.get_primaryScript() != superreg.get_primaryScript():
                    LOG.warning(
                        'Merging region "%s" with primaryScript %s into "%s" with %s',
                        region.id, region.get_primaryScript(), superreg.id,
                        superreg.get_primaryScript())
                if region.get_primaryLanguage(
                ) != superreg.get_primaryLanguage():
                    LOG.warning(
                        'Merging region "%s" with primaryLanguage %s into "%s" with %s',
                        region.id, region.get_primaryLanguage(), superreg.id,
                        superreg.get_primaryLanguage())
                if region.get_TextStyle():
                    LOG.warning(
                        'Merging region "%s" with TextStyle %s into "%s" with %s',
                        region.id,
                        region.get_TextStyle(),  # FIXME needs repr...
                        superreg.id,
                        superreg.get_TextStyle())  # ...to be informative
                if region.get_TextEquiv():
                    LOG.warning(
                        'Merging region "%s" with TextEquiv %s into "%s" with %s',
                        region.id,
                        region.get_TextEquiv(),  # FIXME needs repr...
                        superreg.id,
                        superreg.get_TextEquiv())  # ...to be informative
            wait_for_deletion.append(region)
            if region.id in reading_order:
                regionref = reading_order[region.id]
                # TODO: re-assign regionref.continuation and regionref.type to other?
                # could be any of the 6 types above; the group keeps each
                # kind of child in a list named like the class sans 'Type'
                # (kept under a name of its own, so the re-indexing below
                # is not narrowed down to this one list):
                sibling_refs = getattr(
                    rogroup, regionref.__class__.__name__.replace('Type', ''))
                # remove in-place
                sibling_refs.remove(regionref)

    if ordered:
        # re-index the reading order! All kinds of indexed children share
        # one index space, so collect them afresh (after the removals)
        # instead of re-indexing only one of the lists.
        indexed_refs = (rogroup.get_RegionRefIndexed() +
                        rogroup.get_OrderedGroupIndexed() +
                        rogroup.get_UnorderedGroupIndexed())
        indexed_refs.sort(key=lambda ref: ref.get_index())
        for i, regionref in enumerate(indexed_refs):
            regionref.set_index(i)

    for region in wait_for_deletion:
        if region.parent_object_:
            siblings = region.parent_object_.get_TextRegion()
            # a nested reading order group (handled recursively above)
            # may already have removed this region
            if region in siblings:
                # remove in-place
                siblings.remove(region)
Exemplo n.º 28
0
def _child_within_parent(child, parent):
    """Tell whether the outline of ``child`` lies within that of ``parent``.

    Both arguments must provide their outline via ``get_Coords().points``.
    """
    child_outline, parent_outline = [
        Polygon(polygon_from_points(segment.get_Coords().points))
        for segment in (child, parent)
    ]
    return child_outline.within(parent_outline)
Exemplo n.º 29
0
    def process(self):
        """Clip text regions / lines of the workspace at intersections with neighbours.

        Open and deserialise PAGE input files and their respective images,
        then iterate over the element hierarchy down to the requested
        ``level-of-operation``.

        Next, get each segment image according to the layout annotation (by cropping
        via coordinates into the higher-level image), as well as all its neighbours',
        binarize them (without deskewing), and make a connected component analysis.
        (Segments must not already have AlternativeImage annotated, otherwise they
        will be skipped.)

        Then, for each section of overlap with a neighbour, re-assign components
        which are only contained in the neighbour by clipping them to white (background),
        and export the (final) result as image file.

        Add the new image file to the workspace along with the output fileGrp,
        and using a file ID with suffix ``.IMG-CLIP`` along with further
        identification of the input element.

        Reference each new image in the AlternativeImage of the element.

        Produce a new output file by serialising the resulting hierarchy.
        """
        # This makes best sense for overlapping segmentation, like current GT
        # or Tesseract layout analysis. Most notably, it can suppress graphics
        # and separators within or across a region or line. It _should_ ideally
        # be run after binarization (on page level for region-level clipping,
        # and on the region level for line-level clipping), because the
        # connected component analysis after implicit binarization could be
        # suboptimal, and the explicit binarization after clipping could be,
        # too. However, region-level clipping _must_ be run before region-level
        # deskewing, because that would make segments incomensurable with their
        # neighbours.
        LOG = getLogger('processor.OcropyClip')
        level = self.parameter['level-of-operation']
        assert_file_grp_cardinality(self.input_file_grp, 1)
        assert_file_grp_cardinality(self.output_file_grp, 1)

        for (n, input_file) in enumerate(self.input_files):
            LOG.info("INPUT FILE %i / %s", n, input_file.pageId
                     or input_file.ID)
            file_id = make_file_id(input_file, self.output_file_grp)

            pcgts = page_from_file(self.workspace.download_file(input_file))
            self.add_metadata(pcgts)
            page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID  # (PageType has no id)
            page = pcgts.get_Page()

            page_image, page_coords, page_image_info = self.workspace.image_from_page(
                page, page_id, feature_selector='binarized')
            if self.parameter['dpi'] > 0:
                zoom = 300.0 / self.parameter['dpi']
            elif page_image_info.resolution != 1:
                dpi = page_image_info.resolution
                if page_image_info.resolutionUnit == 'cm':
                    dpi *= 2.54
                LOG.info('Page "%s" uses %f DPI', page_id, dpi)
                zoom = 300.0 / dpi
            else:
                zoom = 1

            # FIXME: what about text regions inside table regions?
            regions = list(page.get_TextRegion())
            num_texts = len(regions)
            regions += (page.get_AdvertRegion() + page.get_ChartRegion() +
                        page.get_ChemRegion() + page.get_GraphicRegion() +
                        page.get_ImageRegion() + page.get_LineDrawingRegion() +
                        page.get_MathsRegion() + page.get_MusicRegion() +
                        page.get_NoiseRegion() + page.get_SeparatorRegion() +
                        page.get_TableRegion() + page.get_UnknownRegion())
            if not num_texts:
                LOG.warning('Page "%s" contains no text regions', page_id)
            background = ImageStat.Stat(page_image)
            # workaround for Pillow#4925
            if len(background.bands) > 1:
                background = tuple(background.median)
            else:
                background = background.median[0]
            if level == 'region':
                background_image = Image.new(page_image.mode, page_image.size,
                                             background)
                page_array = pil2array(page_image)
                page_bin = np.array(page_array <= midrange(page_array),
                                    np.uint8)
                # in absolute coordinates merely for comparison/intersection
                shapes = [
                    Polygon(polygon_from_points(region.get_Coords().points))
                    for region in regions
                ]
                # in relative coordinates for mask/cropping
                polygons = [
                    coordinates_of_segment(region, page_image, page_coords)
                    for region in regions
                ]
                for i, polygon in enumerate(polygons[num_texts:], num_texts):
                    # for non-text regions, extend mask by 3 pixels in each direction
                    # to ensure they do not leak components accidentally
                    # (accounts for bad cropping of such regions in GT):
                    polygon = Polygon(polygon).buffer(
                        3).exterior.coords[:-1]  # keep open
                    polygons[i] = polygon
                masks = [
                    pil2array(polygon_mask(page_image,
                                           polygon)).astype(np.uint8)
                    for polygon in polygons
                ]
            for i, region in enumerate(regions):
                if i >= num_texts:
                    break  # keep non-text regions unchanged
                if level == 'region':
                    if region.get_AlternativeImage():
                        # FIXME: This should probably be an exception (bad workflow configuration).
                        LOG.warning(
                            'Page "%s" region "%s" already contains image data: skipping',
                            page_id, region.id)
                        continue
                    shape = prep(shapes[i])
                    neighbours = [
                        (regionj, maskj) for shapej, regionj, maskj in zip(
                            shapes[:i] + shapes[i + 1:], regions[:i] +
                            regions[i + 1:], masks[:i] + masks[i + 1:])
                        if shape.intersects(shapej)
                    ]
                    if neighbours:
                        self.process_segment(region, masks[i], polygons[i],
                                             neighbours, background_image,
                                             page_image, page_coords, page_bin,
                                             input_file.pageId,
                                             file_id + '_' + region.id)
                    continue
                # level == 'line':
                lines = region.get_TextLine()
                if not lines:
                    LOG.warning('Page "%s" region "%s" contains no text lines',
                                page_id, region.id)
                    continue
                region_image, region_coords = self.workspace.image_from_segment(
                    region,
                    page_image,
                    page_coords,
                    feature_selector='binarized')
                background_image = Image.new(region_image.mode,
                                             region_image.size, background)
                region_array = pil2array(region_image)
                region_bin = np.array(region_array <= midrange(region_array),
                                      np.uint8)
                # in absolute coordinates merely for comparison/intersection
                shapes = [
                    Polygon(polygon_from_points(line.get_Coords().points))
                    for line in lines
                ]
                # in relative coordinates for mask/cropping
                polygons = [
                    coordinates_of_segment(line, region_image, region_coords)
                    for line in lines
                ]
                masks = [
                    pil2array(polygon_mask(region_image,
                                           polygon)).astype(np.uint8)
                    for polygon in polygons
                ]
                for j, line in enumerate(lines):
                    if line.get_AlternativeImage():
                        # FIXME: This should probably be an exception (bad workflow configuration).
                        LOG.warning(
                            'Page "%s" region "%s" line "%s" already contains image data: skipping',
                            page_id, region.id, line.id)
                        continue
                    shape = prep(shapes[j])
                    neighbours = [(linej, maskj)
                                  for shapej, linej, maskj in zip(
                                      shapes[:j] + shapes[j + 1:], lines[:j] +
                                      lines[j + 1:], masks[:j] + masks[j + 1:])
                                  if shape.intersects(shapej)]
                    if neighbours:
                        self.process_segment(
                            line, masks[j], polygons[j], neighbours,
                            background_image, region_image, region_coords,
                            region_bin, input_file.pageId,
                            file_id + '_' + region.id + '_' + line.id)

            # update METS (add the PAGE file):
            file_path = os.path.join(self.output_file_grp, file_id + '.xml')
            pcgts.set_pcGtsId(file_id)
            out = self.workspace.add_file(ID=file_id,
                                          file_grp=self.output_file_grp,
                                          pageId=input_file.pageId,
                                          local_filename=file_path,
                                          mimetype=MIMETYPE_PAGE,
                                          content=to_xml(pcgts))
            LOG.info('created file ID: %s, file_grp: %s, path: %s', file_id,
                     self.output_file_grp, out.local_filename)
Exemplo n.º 30
0
    def process(self):
        """Extract textline images and texts from the workspace.

        Open and deserialize PAGE input files and their respective images,
        then iterate over the element hierarchy down to the line level.
        Text regions are taken from the page itself and from the page's
        table regions.

        Extract an image for each textline (which depending on the workflow
        can already be deskewed, dewarped, binarized etc.), cropped to its
        minimal bounding box, and masked by the coordinate polygon outline.
        If ``transparency`` is true, then also add an alpha channel which is
        fully transparent outside of the mask.

        Create a JSON file with:
        * the IDs of the textline and its parents,
        * the textline's text content,
        * the textline's coordinates relative to the line image,
        * the textline's absolute coordinates,
        * the textline's TextStyle (if any),
        * the textline's @production (if any),
        * the textline's @readingDirection (if any),
        * the textline's @primaryScript (if any),
        * the textline's @primaryLanguage (if any),
        * the textline's AlternativeImage/@comments (features),
        * the parent textregion's @type,
        * the page's @type,
        * the page's DPI value.

        Create a plain text file for the text content, too.

        Write all files in the directory of the output file group, named like so:
        * ID + '.raw.png': line image (if the workflow provides raw images)
        * ID + '.bin.png': line image (if the workflow provides binarized images)
        * ID + '.nrm.png': line image (if the workflow provides grayscale-normalized images)
        * ID + '.json': line metadata.
        * ID + '.gt.txt': line text.

        A warning is logged (and processing continues) for pages without
        text regions, regions without text lines, and lines without text.

        (This is intended for training and evaluation of OCR models.)
        """
        # pylint: disable=attribute-defined-outside-init
        for n, input_file in enumerate(self.input_files):
            # derive the output file ID from the input file ID; fall back
            # to a generated ID if the input ID does not contain the name
            # of the input file group
            file_id = input_file.ID.replace(self.input_file_grp,
                                            self.output_file_grp)
            if file_id == input_file.ID:
                file_id = concat_padded(self.output_file_grp, n)
            page_id = input_file.pageId or input_file.ID
            LOG.info("INPUT FILE %i / %s", n, page_id)
            pcgts = page_from_file(self.workspace.download_file(input_file))
            page = pcgts.get_Page()
            # record this processing step and its parameters in the
            # in-memory PAGE metadata
            metadata = pcgts.get_Metadata()  # ensured by from_file()
            metadata.add_MetadataItem(
                MetadataItemType(
                    type_="processingStep",
                    name=self.ocrd_tool['steps'][0],
                    value=TOOL,
                    Labels=[
                        LabelsType(externalModel="ocrd-tool",
                                   externalId="parameters",
                                   Label=[
                                       LabelType(type_=name,
                                                 value=self.parameter[name])
                                       for name in self.parameter.keys()
                                   ])
                    ]))
            transparency = self.parameter['transparency']
            page_image, page_coords, page_image_info = self.workspace.image_from_page(
                page, page_id, transparency=transparency)
            # NOTE(review): a resolution of 1 is treated as "unknown" here
            # -- confirm against the image info provider
            if page_image_info.resolution != 1:
                dpi = page_image_info.resolution
                if page_image_info.resolutionUnit == 'cm':
                    # convert pixels per centimetre to pixels per inch
                    dpi = round(dpi * 2.54)
            else:
                dpi = None
            ptype = page.get_type()

            # Collect top-level text regions followed by the text regions
            # nested in table regions. This must be a real list: a lazy
            # iterator object is always truthy, which would make the
            # emptiness check below a no-op.
            regions = list(page.get_TextRegion())
            for table in page.get_TableRegion():
                regions.extend(table.get_TextRegion())
            if not regions:
                LOG.warning("Page '%s' contains no text regions", page_id)
            for region in regions:
                region_image, region_coords = self.workspace.image_from_segment(
                    region,
                    page_image,
                    page_coords,
                    transparency=transparency)
                rtype = region.get_type()

                lines = region.get_TextLine()
                if not lines:
                    LOG.warning("Region '%s' contains no text lines",
                                region.id)
                for line in lines:
                    line_image, line_coords = self.workspace.image_from_segment(
                        line,
                        region_image,
                        region_coords,
                        transparency=transparency)
                    # outline relative to the cropped line image ...
                    lpolygon_rel = coordinates_of_segment(
                        line, line_image, line_coords).tolist()
                    # ... and as annotated in the PAGE file
                    lpolygon_abs = polygon_from_points(
                        line.get_Coords().points)
                    ltext = line.get_TextEquiv()
                    if not ltext:
                        LOG.warning("Line '%s' contains no text conent",
                                    line.id)
                        ltext = ''
                    else:
                        # only the first TextEquiv is used
                        ltext = ltext[0].Unicode
                    # the line's own style takes precedence over the region's
                    lstyle = line.get_TextStyle() or region.get_TextStyle()
                    if lstyle:
                        lstyle = {
                            'fontFamily': lstyle.fontFamily,
                            'fontSize': lstyle.fontSize,
                            'xHeight': lstyle.xHeight,
                            'kerning': lstyle.kerning,
                            'serif': lstyle.serif,
                            'monospace': lstyle.monospace,
                            'bold': lstyle.bold,
                            'italic': lstyle.italic,
                            'smallCaps': lstyle.smallCaps,
                            'letterSpaced': lstyle.letterSpaced,
                            'strikethrough': lstyle.strikethrough,
                            'underlined': lstyle.underlined,
                            'underlineStyle': lstyle.underlineStyle,
                            'subscript': lstyle.subscript,
                            'superscript': lstyle.superscript
                        }
                    lfeatures = line_coords['features']
                    # attributes missing on the line are inherited from the
                    # region, then (where applicable) from the page
                    description = {
                        'line.ID': line.id,
                        'text': ltext,
                        'style': lstyle,
                        'production': (line.get_production()
                                       or region.get_production()),
                        'readingDirection': (line.get_readingDirection()
                                             or region.get_readingDirection()
                                             or page.get_readingDirection()),
                        'primaryScript': (line.get_primaryScript()
                                          or region.get_primaryScript()
                                          or page.get_primaryScript()),
                        'primaryLanguage': (line.get_primaryLanguage()
                                            or region.get_primaryLanguage()
                                            or page.get_primaryLanguage()),
                        'features': lfeatures,
                        'DPI': dpi,
                        'coords_rel': lpolygon_rel,
                        'coords_abs': lpolygon_abs,
                        'region.ID': region.id,
                        'region.type': rtype,
                        'page.ID': page_id,
                        'page.type': ptype,
                        'file_grp': self.input_file_grp,
                        'METS.UID': self.workspace.mets.unique_identifier
                    }
                    # the file name infix reflects the kind of line image
                    if 'binarized' in lfeatures:
                        extension = '.bin'
                    elif 'grayscale_normalized' in lfeatures:
                        extension = '.nrm'
                    else:
                        extension = '.raw'

                    file_path = self.workspace.save_image_file(
                        line_image,
                        file_id + '_' + region.id + '_' + line.id + extension,
                        self.output_file_grp,
                        page_id=page_id,
                        mimetype=self.parameter['mimetype'])
                    # derive the sibling file names from the image path by
                    # swapping the infix + image suffix
                    file_path = file_path.replace(
                        extension + MIME_TO_EXT[self.parameter['mimetype']],
                        '.json')
                    # close the handle deterministically instead of
                    # leaking one descriptor per text line
                    with open(file_path, 'w') as json_file:
                        json.dump(description, json_file)
                    file_path = file_path.replace('.json', '.gt.txt')
                    with open(file_path, 'wb') as f:
                        f.write((ltext + '\n').encode('utf-8'))