Python TextRegionType.get_TextLine 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: ocrd_models.ocrd_page

클래스/타입: TextRegionType

메소드/함수: get_TextLine

hotexamples.com에서의 예제들: 2

Python TextRegionType.get_TextLine 예제 2개를 찾았습니다. 아래는 오픈소스 프로젝트에서 추출한 ocrd_models.ocrd_page.TextRegionType.get_TextLine의 실제 사용 예제 가운데 평가가 가장 높은 것들입니다. 예제를 평가해 주시면 예제의 품질을 높이는 데 도움이 됩니다.

자주 사용되는 메소드들

보기 숨기기

TextRegionType(22)

add_TextLine(7)

add_AlternativeImage(4)

get_TextLine(2)

set_orientation(2)

set_type(2)

예제 #1

파일 보기

 def render_text_region(self, text_region: TextRegionType) -> None:
     """Render the contents of ``text_region`` depth-first.

     Every text line is handed to ``self.render_type`` before its words,
     and every word before its glyphs.
     """
     for text_line in text_region.get_TextLine():
         self.render_type(text_line)
         for text_word in text_line.get_Word():
             self.render_type(text_word)
             for text_glyph in text_word.get_Glyph():
                 self.render_type(text_glyph)

예제 #2

파일 보기

파일: segment.py 프로젝트: cisocrgroup/ocrd_cis

    def _process_element(self,
                         element,
                         ignore,
                         image,
                         coords,
                         element_id,
                         file_id,
                         page_id,
                         zoom=1.0,
                         rogroup=None):
        """Add PAGE layout elements by segmenting an image.

        Given a PageType, TableRegionType or TextRegionType ``element``, and
        a corresponding binarized PIL.Image object ``image`` with coordinate
        metadata ``coords``, run line segmentation with Ocropy.

        If operating on the full page (or table), then also detect horizontal
        and vertical separators, and aggregate the lines into text regions
        afterwards.

        Add the resulting sub-segments to the parent ``element``.

        If ``ignore`` is not empty, then first suppress all foreground components
        in any of those segments' coordinates during segmentation, and if also
        in full page/table mode, then combine all separators among them with the
        newly detected separators to guide region segmentation.

        Args:
            element: The segment to annotate. A text region receives new
                text lines; a page or table receives new text, image and
                separator regions. A clipped alternative image may be added.
            ignore: Sequence of existing segments to mask out. It is
                enumerated, measured with ``len`` and indexed, and each
                member's ``id`` is read.
            image: Binarized PIL image of ``element``. If its width or
                height is zero, a warning is logged and nothing is done.
            coords: Coordinate metadata for ``image``, passed on to the
                coordinate conversion helpers. Its ``'features'`` entry is
                extended with ``',clipped'`` for the derived image.
            element_id: Identifier used in log messages and as the prefix
                of the IDs of all newly created segments.
            file_id: Prefix of the file ID of the derived image (the suffix
                ``'.IMG-CLIP'`` is appended).
            page_id: Page ID under which the derived image is saved.
            zoom: Scaling factor passed to the plausibility checks and to
                the segmentation; the minimum-area thresholds are divided
                by its square.
            rogroup: Optional reading order group. If given (page/table
                mode only), regions are registered in it via
                ``page_add_to_reading_order``.

        Returns:
            None. Results are attached to ``element`` (and ``rogroup``).
            If line segmentation fails, the error is logged and, for a
            TextRegionType, a single line covering the region is added.
        """
        LOG = getLogger('processor.OcropySegment')
        if not image.width or not image.height:
            LOG.warning("Skipping '%s' with zero size", element_id)
            return
        element_array = pil2array(image)
        # NOTE: builtin ``bool``/``int`` are used as dtypes because the
        # ``np.bool``/``np.int`` aliases (which pointed to these builtins)
        # were removed in NumPy 1.24.
        element_bin = np.array(element_array <= midrange(element_array),
                               bool)
        sep_bin = np.zeros_like(element_bin, bool)
        ignore_labels = np.zeros_like(element_bin, int)
        for i, segment in enumerate(ignore):
            LOG.debug('masking foreground of %s "%s" for "%s"',
                      type(segment).__name__[:-4], segment.id, element_id)
            # mark these segments (e.g. separator regions, tables, images)
            # for workflows where they have been detected already;
            # these will be:
            # - ignored during text line segmentation (but not h/v-line detection)
            # - kept and reading-ordered during region segmentation (but not seps)
            segment_polygon = coordinates_of_segment(segment, image, coords)
            # If segment_polygon lies outside of element (causing
            # negative/above-max indices), either fully or partially,
            # then this will silently ignore them. The caller does
            # not need to concern herself with this.
            if isinstance(segment, SeparatorRegionType):
                sep_bin[draw.polygon(segment_polygon[:, 1], segment_polygon[:,
                                                                            0],
                                     sep_bin.shape)] = True
            ignore_labels[draw.polygon(
                segment_polygon[:, 1], segment_polygon[:, 0],
                ignore_labels.shape)] = i + 1  # mapped back for RO
        # decide the operating mode and run the matching plausibility check
        if isinstance(element, PageType):
            element_name = 'page'
            fullpage = True
            report = check_page(element_bin, zoom)
        elif isinstance(element, TableRegionType) or (
                # sole/congruent text region of a table region?
                element.id.endswith('_text')
                and isinstance(element.parent_object_, TableRegionType)):
            element_name = 'table'
            fullpage = True
            report = check_region(element_bin, zoom)
        else:
            element_name = 'region'
            fullpage = False
            report = check_region(element_bin, zoom)
        LOG.info('computing line segmentation for %s "%s"', element_name,
                 element_id)
        # TODO: we should downscale if DPI is large enough to save time
        try:
            # a non-empty check report is routed into the same
            # error handling as a failure of the segmentation itself
            if report:
                raise Exception(report)
            line_labels, hlines, vlines, images, colseps, scale = compute_segmentation(
                # suppress separators and ignored regions for textline estimation
                # but keep them for h/v-line detection (in fullpage mode):
                element_bin,
                seps=(sep_bin + ignore_labels) > 0,
                zoom=zoom,
                fullpage=fullpage,
                spread_dist=round(self.parameter['spread'] / zoom * 300 /
                                  72),  # in pt
                # these are ignored when not in fullpage mode:
                maxcolseps=self.parameter['maxcolseps'],
                maxseps=self.parameter['maxseps'],
                maximages=self.parameter['maximages']
                if element_name != 'table' else 0,
                csminheight=self.parameter['csminheight'],
                hlminwidth=self.parameter['hlminwidth'])
        except Exception as err:
            if isinstance(element, TextRegionType):
                LOG.error('Cannot line-segment region "%s": %s', element_id,
                          err)
                # as a fallback, add a single text line comprising the whole region:
                element.add_TextLine(
                    TextLineType(id=element_id + "_line",
                                 Coords=element.get_Coords()))
            else:
                LOG.error('Cannot line-segment %s "%s": %s', element_name,
                          element_id, err)
            return

        LOG.info('Found %d text lines for %s "%s"',
                 len(np.unique(line_labels)) - 1, element_name, element_id)
        # post-process line labels
        if isinstance(element, (PageType, TableRegionType)):
            # aggregate text lines to text regions
            try:
                # pass ignored regions as "line labels with initial assignment",
                # i.e. identical line and region labels
                # to detect their reading order among the others
                # (these cannot be split or grouped together with other regions)
                line_labels = np.where(line_labels, line_labels + len(ignore),
                                       ignore_labels)
                # suppress separators/images in fg and try to use for partitioning slices
                sepmask = np.maximum(np.maximum(hlines, vlines),
                                     np.maximum(sep_bin, images))
                region_labels = lines2regions(
                    element_bin,
                    line_labels,
                    rlabels=ignore_labels,
                    sepmask=np.maximum(sepmask, colseps),  # add bg
                    # decide horizontal vs vertical cut when gaps of similar size
                    prefer_vertical=not isinstance(element, TableRegionType),
                    gap_height=self.parameter['gap_height'],
                    gap_width=self.parameter['gap_width'],
                    scale=scale,
                    zoom=zoom)
                LOG.info('Found %d text regions for %s "%s"',
                         len(np.unique(region_labels)) - 1, element_name,
                         element_id)
            except Exception as err:
                LOG.error('Cannot region-segment %s "%s": %s', element_name,
                          element_id, err)
                # fallback: all new lines share one region label,
                # labels of ignored segments are kept as they are
                region_labels = np.where(line_labels > len(ignore),
                                         1 + len(ignore), line_labels)

            # prepare reading order group index
            if rogroup:
                if isinstance(rogroup,
                              (OrderedGroupType, OrderedGroupIndexedType)):
                    index = 0
                    # start counting from largest existing index
                    for elem in (rogroup.get_RegionRefIndexed() +
                                 rogroup.get_OrderedGroupIndexed() +
                                 rogroup.get_UnorderedGroupIndexed()):
                        if elem.index >= index:
                            index = elem.index + 1
                else:
                    index = None
            # find contours around region labels (can be non-contiguous):
            region_no = 0
            for region_label in np.unique(region_labels):
                if not region_label:
                    continue  # no bg
                region_mask = region_labels == region_label
                region_line_labels = line_labels * region_mask
                region_line_labels0 = np.setdiff1d(region_line_labels, [0])
                if not np.all(region_line_labels0 > len(ignore)):
                    # existing region from `ignore` merely to be ordered
                    # (no new region, no actual text lines)
                    region_line_labels0 = np.intersect1d(
                        region_line_labels0, ignore_labels)
                    assert len(region_line_labels0) == 1, \
                        "region label %d has both existing regions and new lines (%s)" % (
                            region_label, str(region_line_labels0))
                    # labels were assigned as position in `ignore` plus one
                    region = ignore[region_line_labels0[0] - 1]
                    if rogroup and region.parent_object_ == element and not isinstance(
                            region, SeparatorRegionType):
                        index = page_add_to_reading_order(
                            rogroup, region.id, index)
                    LOG.debug('Region label %d is for ignored region "%s"',
                              region_label, region.id)
                    continue
                # normal case: new lines inside new regions
                # remove binary-empty labels, and re-order locally
                order = morph.reading_order(region_line_labels)
                order[np.setdiff1d(region_line_labels0,
                                   element_bin * region_line_labels)] = 0
                region_line_labels = order[region_line_labels]
                # avoid horizontal gaps
                region_line_labels = hmerge_line_seeds(element_bin,
                                                       region_line_labels,
                                                       scale,
                                                       seps=np.maximum(
                                                           sepmask, colseps))
                region_mask |= region_line_labels > 0
                # find contours for region (can be non-contiguous)
                regions, _ = masks2polygons(
                    region_mask * region_label,
                    element_bin,
                    '%s "%s"' % (element_name, element_id),
                    min_area=6000 / zoom / zoom,
                    simplify=ignore_labels * ~(sep_bin))
                # find contours for lines (can be non-contiguous)
                lines, _ = masks2polygons(region_line_labels,
                                          element_bin,
                                          'region "%s"' % element_id,
                                          min_area=640 / zoom / zoom)
                # create new lines in new regions (allocating by intersection)
                line_polys = [Polygon(polygon) for _, polygon in lines]
                for _, region_polygon in regions:
                    region_poly = prep(Polygon(region_polygon))
                    # convert back to absolute (page) coordinates:
                    region_polygon = coordinates_for_segment(
                        region_polygon, image, coords)
                    region_polygon = polygon_for_parent(
                        region_polygon, element)
                    if region_polygon is None:
                        LOG.warning(
                            'Ignoring extant region contour for region label %d',
                            region_label)
                        continue
                    # annotate result:
                    region_no += 1
                    region_id = element_id + "_region%04d" % region_no
                    LOG.debug('Region label %d becomes ID "%s"', region_label,
                              region_id)
                    region = TextRegionType(
                        id=region_id,
                        Coords=CoordsType(
                            points=points_from_polygon(region_polygon)))
                    # find out which line (contours) belong to which region (contours)
                    line_no = 0
                    for i, line_poly in enumerate(line_polys):
                        if not region_poly.intersects(line_poly):  # .contains
                            continue
                        line_label, line_polygon = lines[i]
                        # convert back to absolute (page) coordinates:
                        line_polygon = coordinates_for_segment(
                            line_polygon, image, coords)
                        line_polygon = polygon_for_parent(line_polygon, region)
                        if line_polygon is None:
                            LOG.warning(
                                'Ignoring extant line contour for region label %d line label %d',
                                region_label, line_label)
                            continue
                        # annotate result:
                        line_no += 1
                        line_id = region_id + "_line%04d" % line_no
                        LOG.debug('Line label %d becomes ID "%s"', line_label,
                                  line_id)
                        line = TextLineType(
                            id=line_id,
                            Coords=CoordsType(
                                points=points_from_polygon(line_polygon)))
                        region.add_TextLine(line)
                    # if the region has received text lines, keep it
                    if region.get_TextLine():
                        element.add_TextRegion(region)
                        LOG.info('Added region "%s" with %d lines for %s "%s"',
                                 region_id, line_no, element_name, element_id)
                        if rogroup:
                            index = page_add_to_reading_order(
                                rogroup, region.id, index)
            # add additional image/non-text regions from compute_segmentation
            # (e.g. drop-capitals or images) ...
            image_labels, num_images = morph.label(images)
            LOG.info('Found %d large non-text/image regions for %s "%s"',
                     num_images, element_name, element_id)
            # find contours around region labels (can be non-contiguous):
            image_polygons, _ = masks2polygons(
                image_labels, element_bin,
                '%s "%s"' % (element_name, element_id))
            for image_label, polygon in image_polygons:
                # convert back to absolute (page) coordinates:
                region_polygon = coordinates_for_segment(
                    polygon, image, coords)
                region_polygon = polygon_for_parent(region_polygon, element)
                if region_polygon is None:
                    LOG.warning(
                        'Ignoring extant region contour for image label %d',
                        image_label)
                    continue
                # the counter is shared with the text regions above,
                # so numbering continues across region kinds
                region_no += 1
                # annotate result:
                region_id = element_id + "_image%04d" % region_no
                element.add_ImageRegion(
                    ImageRegionType(
                        id=region_id,
                        Coords=CoordsType(
                            points=points_from_polygon(region_polygon))))
            # split rulers into separator regions:
            hline_labels, num_hlines = morph.label(hlines)
            vline_labels, num_vlines = morph.label(vlines)
            LOG.info('Found %d/%d h/v-lines for %s "%s"', num_hlines,
                     num_vlines, element_name, element_id)
            # find contours around region labels (can be non-contiguous):
            hline_polygons, _ = masks2polygons(
                hline_labels, element_bin,
                '%s "%s"' % (element_name, element_id))
            vline_polygons, _ = masks2polygons(
                vline_labels, element_bin,
                '%s "%s"' % (element_name, element_id))
            for _, polygon in hline_polygons + vline_polygons:
                # convert back to absolute (page) coordinates:
                region_polygon = coordinates_for_segment(
                    polygon, image, coords)
                region_polygon = polygon_for_parent(region_polygon, element)
                if region_polygon is None:
                    LOG.warning('Ignoring extant region contour for separator')
                    continue
                # annotate result:
                region_no += 1
                region_id = element_id + "_sep%04d" % region_no
                element.add_SeparatorRegion(
                    SeparatorRegionType(
                        id=region_id,
                        Coords=CoordsType(
                            points=points_from_polygon(region_polygon))))
            # annotate a text/image-separated image
            element_array[sepmask] = np.amax(element_array)  # clip to white/bg
            image_clipped = array2pil(element_array)
            file_path = self.workspace.save_image_file(
                image_clipped,
                file_id + '.IMG-CLIP',
                page_id=page_id,
                file_grp=self.output_file_grp)
            element.add_AlternativeImage(
                AlternativeImageType(filename=file_path,
                                     comments=coords['features'] + ',clipped'))
        else:
            # get mask from region polygon:
            region_polygon = coordinates_of_segment(element, image, coords)
            region_mask = np.zeros_like(element_bin, bool)
            region_mask[draw.polygon(region_polygon[:, 1], region_polygon[:,
                                                                          0],
                                     region_mask.shape)] = True
            # ensure the new line labels do not extrude from the region:
            line_labels = line_labels * region_mask
            # find contours around labels (can be non-contiguous):
            line_polygons, _ = masks2polygons(line_labels,
                                              element_bin,
                                              'region "%s"' % element_id,
                                              min_area=640 / zoom / zoom)
            line_no = 0
            for line_label, polygon in line_polygons:
                # convert back to absolute (page) coordinates:
                line_polygon = coordinates_for_segment(polygon, image, coords)
                line_polygon = polygon_for_parent(line_polygon, element)
                if line_polygon is None:
                    LOG.warning(
                        'Ignoring extant line contour for line label %d',
                        line_label)
                    continue
                # annotate result:
                line_no += 1
                line_id = element_id + "_line%04d" % line_no
                element.add_TextLine(
                    TextLineType(
                        id=line_id,
                        Coords=CoordsType(
                            points=points_from_polygon(line_polygon))))
            if not sep_bin.any():
                return  # no derived image
            # annotate a text/image-separated image
            element_array[sep_bin] = np.amax(element_array)  # clip to white/bg
            image_clipped = array2pil(element_array)
            file_path = self.workspace.save_image_file(
                image_clipped,
                file_id + '.IMG-CLIP',
                page_id=page_id,
                file_grp=self.output_file_grp)
            # update PAGE (reference the image file):
            element.add_AlternativeImage(
                AlternativeImageType(filename=file_path,
                                     comments=coords['features'] + ',clipped'))