Пример #1
0
    def process(self):
        """Clip text regions / lines of the workspace at intersections with neighbours.

        Open and deserialise PAGE input files and their respective images,
        then iterate over the element hierarchy down to the requested
        ``level-of-operation``.

        Next, get each segment image according to the layout annotation (by cropping
        via coordinates into the higher-level image), as well as all its neighbours',
        binarize them (without deskewing), and make a connected component analysis.
        (Segments must not already have AlternativeImage annotated, otherwise they
        will be skipped.)

        Then, for each section of overlap with a neighbour, re-assign components
        which are only contained in the neighbour by clipping them to white (background),
        and export the (final) result as image file.

        Add the new image file to the workspace along with the output fileGrp,
        and using a file ID with suffix ``.IMG-CLIP`` along with further
        identification of the input element.

        Reference each new image in the AlternativeImage of the element.

        Produce a new output file by serialising the resulting hierarchy.
        """
        # This makes best sense for overlapping segmentation, like current GT
        # or Tesseract layout analysis. Most notably, it can suppress graphics
        # and separators within or across a region or line. It _should_ ideally
        # be run after binarization (on page level for region-level clipping,
        # and on the region level for line-level clipping), because the
        # connected component analysis after implicit binarization could be
        # suboptimal, and the explicit binarization after clipping could be,
        # too. However, region-level clipping _must_ be run before region-level
        # deskewing, because that would make segments incomensurable with their
        # neighbours.
        LOG = getLogger('processor.OcropyClip')
        level = self.parameter['level-of-operation']
        # exactly one input and one output fileGrp:
        assert_file_grp_cardinality(self.input_file_grp, 1)
        assert_file_grp_cardinality(self.output_file_grp, 1)

        for (n, input_file) in enumerate(self.input_files):
            LOG.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID)
            file_id = make_file_id(input_file, self.output_file_grp)

            pcgts = page_from_file(self.workspace.download_file(input_file))
            self.add_metadata(pcgts)
            page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID # (PageType has no id)
            page = pcgts.get_Page()

            # require an already binarized page image (feature_selector):
            page_image, page_coords, page_image_info = self.workspace.image_from_page(
                page, page_id, feature_selector='binarized')
            # scale factor relative to 300 DPI
            # NOTE(review): `zoom` is computed here but never used in the rest
            # of this method — confirm whether it was meant to be passed on
            # (e.g. to process_segment):
            if self.parameter['dpi'] > 0:
                zoom = 300.0/self.parameter['dpi']
            elif page_image_info.resolution != 1:
                dpi = page_image_info.resolution
                if page_image_info.resolutionUnit == 'cm':
                    # convert pixels per cm to pixels per inch
                    dpi *= 2.54
                LOG.info('Page "%s" uses %f DPI', page_id, dpi)
                zoom = 300.0/dpi
            else:
                zoom = 1

            # text regions first (the only ones ever modified), then all other
            # region types, which merely serve as clipping neighbours:
            regions = list(page.get_TextRegion())
            num_texts = len(regions)
            regions += (
                page.get_AdvertRegion() +
                page.get_ChartRegion() +
                page.get_ChemRegion() +
                page.get_GraphicRegion() +
                page.get_ImageRegion() +
                page.get_LineDrawingRegion() +
                page.get_MathsRegion() +
                page.get_MusicRegion() +
                page.get_NoiseRegion() +
                page.get_SeparatorRegion() +
                page.get_TableRegion() +
                page.get_UnknownRegion())
            if not num_texts:
                LOG.warning('Page "%s" contains no text regions', page_id)
            # estimate a background fill colour from the page image statistics
            background = ImageStat.Stat(page_image)
            # workaround for Pillow#4925
            if len(background.bands) > 1:
                background = tuple(background.median)
            else:
                background = background.median[0]
            if level == 'region':
                # precompute everything shared by all regions of this page:
                background_image = Image.new(page_image.mode, page_image.size, background)
                page_array = pil2array(page_image)
                # binary foreground map: 1 where the pixel value is at or below
                # the midrange level (presumably ink on a binarized image)
                page_bin = np.array(page_array <= midrange(page_array), np.uint8)
                # in absolute coordinates merely for comparison/intersection
                shapes = [Polygon(polygon_from_points(region.get_Coords().points))
                          for region in regions]
                # in relative coordinates for mask/cropping
                polygons = [coordinates_of_segment(region, page_image, page_coords)
                            for region in regions]
                for i, polygon in enumerate(polygons[num_texts:], num_texts):
                    # for non-text regions, extend mask by 3 pixels in each direction
                    # to ensure they do not leak components accidentally
                    # (accounts for bad cropping of such regions in GT):
                    polygon = Polygon(polygon).buffer(3).exterior.coords[:-1] # keep open
                    polygons[i] = polygon
                masks = [pil2array(polygon_mask(page_image, polygon)).astype(np.uint8)
                         for polygon in polygons]
            for i, region in enumerate(regions):
                if i >= num_texts:
                    break # keep non-text regions unchanged
                if level == 'region':
                    if region.get_AlternativeImage():
                        # FIXME: This should probably be an exception (bad workflow configuration).
                        LOG.warning('Page "%s" region "%s" already contains image data: skipping',
                                    page_id, region.id)
                        continue
                    # only regions whose polygon actually intersects this one
                    # count as neighbours (prepared geometry speeds up the test):
                    shape = prep(shapes[i])
                    neighbours = [(regionj, maskj) for shapej, regionj, maskj
                                  in zip(shapes[:i] + shapes[i+1:],
                                         regions[:i] + regions[i+1:],
                                         masks[:i] + masks[i+1:])
                                  if shape.intersects(shapej)]
                    if neighbours:
                        self.process_segment(region, masks[i], polygons[i],
                                             neighbours, background_image,
                                             page_image, page_coords, page_bin,
                                             input_file.pageId, file_id + '_' + region.id)
                    continue
                # level == 'line':
                lines = region.get_TextLine()
                if not lines:
                    LOG.warning('Page "%s" region "%s" contains no text lines', page_id, region.id)
                    continue
                # require an already binarized region image, then repeat the
                # same precomputation as above, but per region and on lines:
                region_image, region_coords = self.workspace.image_from_segment(
                    region, page_image, page_coords, feature_selector='binarized')
                background_image = Image.new(region_image.mode, region_image.size, background)
                region_array = pil2array(region_image)
                region_bin = np.array(region_array <= midrange(region_array), np.uint8)
                # in absolute coordinates merely for comparison/intersection
                shapes = [Polygon(polygon_from_points(line.get_Coords().points))
                          for line in lines]
                # in relative coordinates for mask/cropping
                polygons = [coordinates_of_segment(line, region_image, region_coords)
                            for line in lines]
                masks = [pil2array(polygon_mask(region_image, polygon)).astype(np.uint8)
                         for polygon in polygons]
                for j, line in enumerate(lines):
                    if line.get_AlternativeImage():
                        # FIXME: This should probably be an exception (bad workflow configuration).
                        LOG.warning('Page "%s" region "%s" line "%s" already contains image data: skipping',
                                    page_id, region.id, line.id)
                        continue
                    shape = prep(shapes[j])
                    neighbours = [(linej, maskj) for shapej, linej, maskj
                                  in zip(shapes[:j] + shapes[j+1:],
                                         lines[:j] + lines[j+1:],
                                         masks[:j] + masks[j+1:])
                                  if shape.intersects(shapej)]
                    if neighbours:
                        self.process_segment(line, masks[j], polygons[j],
                                             neighbours, background_image,
                                             region_image, region_coords, region_bin,
                                             input_file.pageId, file_id + '_' + region.id + '_' + line.id)

            # update METS (add the PAGE file):
            file_path = os.path.join(self.output_file_grp, file_id + '.xml')
            pcgts.set_pcGtsId(file_id)
            out = self.workspace.add_file(
                ID=file_id,
                file_grp=self.output_file_grp,
                pageId=input_file.pageId,
                local_filename=file_path,
                mimetype=MIMETYPE_PAGE,
                content=to_xml(pcgts))
            LOG.info('created file ID: %s, file_grp: %s, path: %s',
                     file_id, self.output_file_grp, out.local_filename)
Пример #2
0
 def test_make_file_id_simple(self):
     """make_file_id swaps the fileGrp prefix while keeping the numeric suffix."""
     ocrd_file = create_ocrd_file('MAX', ID="MAX_0012")
     result = make_file_id(ocrd_file, 'FOO')
     self.assertEqual(result, 'FOO_0012')
Пример #3
0
 def test_make_file_id_605(self):
     """https://github.com/OCR-D/core/pull/605"""
     mets = OcrdMets.empty_mets()
     # both files share the 'FOO' prefix but live in different fileGrps
     mets.add_file('GRP1', ID='FOO_0001', pageId='phys0001')
     second = mets.add_file('GRP2', ID='FOO_0002', pageId='phys0002')
     self.assertEqual(make_file_id(second, 'GRP2'), 'GRP2_0001')
    def process(self):
        """Detect font shapes via rule-based OCR with Tesseract on the workspace.

        Open and deserialise PAGE input files and their respective images,
        then iterate over the element hierarchy down to the line level.

        Set up Tesseract to recognise each word's image (either from
        AlternativeImage or cropping the bounding box rectangle and masking
        it from the polygon outline) in word mode and with the ``osd`` model.

        Query the result's font attributes and write them into the word element's
        ``TextStyle``.

        Produce new output files by serialising the resulting hierarchy.

        :raises ValueError: if the configured ``model`` is not installed
        """
        LOG = getLogger('processor.TesserocrFontShape')
        # hoisted: get_languages() rescans tessdata on every call, but its
        # result cannot change while we are processing (was called 3 times)
        tessdata_path, installed_models = get_languages()
        LOG.debug("TESSDATA: %s, installed Tesseract models: %s",
                  tessdata_path, installed_models)

        assert_file_grp_cardinality(self.input_file_grp, 1)
        assert_file_grp_cardinality(self.output_file_grp, 1)

        model = self.parameter['model']
        if model not in installed_models:
            # more specific than the previous generic Exception, but still
            # caught by any existing `except Exception` handler:
            raise ValueError(
                "model " + model +
                " (needed for font style detection) is not installed")

        with PyTessBaseAPI(
                path=TESSDATA_PREFIX,
                #oem=OEM.TESSERACT_LSTM_COMBINED, # legacy required for OSD or WordFontAttributes!
                oem=OEM.TESSERACT_ONLY,  # legacy required for OSD or WordFontAttributes!
                lang=model) as tessapi:
            LOG.info(
                "Using model '%s' in %s for recognition at the word level",
                model, tessdata_path)
            for (n, input_file) in enumerate(self.input_files):
                page_id = input_file.pageId or input_file.ID
                LOG.info("INPUT FILE %i / %s", n, page_id)
                pcgts = page_from_file(
                    self.workspace.download_file(input_file))
                self.add_metadata(pcgts)
                page = pcgts.get_Page()

                page_image, page_coords, page_image_info = self.workspace.image_from_page(
                    page, page_id)
                # pixel density: parameter override > image meta-data > unknown
                if self.parameter['dpi'] > 0:
                    dpi = self.parameter['dpi']
                    LOG.info(
                        "Page '%s' images will use %d DPI from parameter override",
                        page_id, dpi)
                elif page_image_info.resolution != 1:
                    dpi = page_image_info.resolution
                    if page_image_info.resolutionUnit == 'cm':
                        # convert pixels per cm to pixels per inch
                        dpi = round(dpi * 2.54)
                    LOG.info(
                        "Page '%s' images will use %d DPI from image meta-data",
                        page_id, dpi)
                else:
                    dpi = 0
                    LOG.info(
                        "Page '%s' images will use DPI estimated from segmentation",
                        page_id)
                if dpi:
                    tessapi.SetVariable('user_defined_dpi', str(dpi))

                LOG.info("Processing page '%s'", page_id)
                regions = page.get_AllRegions(classes=['Text'])
                if not regions:
                    LOG.warning("Page '%s' contains no text regions", page_id)
                else:
                    self._process_regions(tessapi, regions, page_image,
                                          page_coords)

                # update METS (add the PAGE file):
                file_id = make_file_id(input_file, self.output_file_grp)
                pcgts.set_pcGtsId(file_id)
                self.workspace.add_file(ID=file_id,
                                        file_grp=self.output_file_grp,
                                        pageId=input_file.pageId,
                                        mimetype=MIMETYPE_PAGE,
                                        local_filename=os.path.join(
                                            self.output_file_grp,
                                            file_id + '.xml'),
                                        content=to_xml(pcgts))
Пример #5
0
    def process(self):
        """Perform text/non-text segmentation on the page level of the workspace.

        Open and deserialise PAGE input files and their respective images.
        If ``use_deeplr`` is set, load the segmentation model given by the
        ``seg_weights`` parameter and retrieve raw (non-binarized, non-deskewed,
        non-cropped) page images; otherwise require images which already are
        binarized, deskewed and cropped.

        Delegate the actual segmentation to :py:meth:`_process_segment`
        (only ``operation_level=page`` is supported).

        Produce a new output file by serialising the resulting hierarchy.

        :raises FileNotFoundError: if ``use_deeplr`` is set but ``seg_weights``
            does not point to an existing file
        """
        LOG = getLogger('OcrdAnybaseocrTiseg')

        assert_file_grp_cardinality(self.input_file_grp, 1)
        assert_file_grp_cardinality(self.output_file_grp, 1)
        oplevel = self.parameter['operation_level']

        model = None
        if self.parameter['use_deeplr']:
            model_weights = self.parameter['seg_weights']
            if not Path(model_weights).is_file():
                # raise instead of sys.exit(1): a processor is library code
                # and must not terminate the interpreter
                raise FileNotFoundError(
                    "Segmentation model weights file was not found at '%s'. "
                    "Make sure the `seg_weights` parameter points to the "
                    "local model weights path." % model_weights)
            model = load_model(model_weights)
            LOG.info('Segmentation Model loaded')

        for (n, input_file) in enumerate(self.input_files):
            page_id = input_file.pageId or input_file.ID

            pcgts = page_from_file(self.workspace.download_file(input_file))
            self.add_metadata(pcgts)

            page = pcgts.get_Page()
            LOG.info("INPUT FILE %s", page_id)

            if self.parameter['use_deeplr']:
                # the deep model expects raw page images:
                page_image, page_xywh, page_image_info = self.workspace.image_from_page(
                    page, page_id, feature_filter='binarized,deskewed,cropped')
            else:
                # the classical method expects preprocessed page images:
                page_image, page_xywh, page_image_info = self.workspace.image_from_page(
                    page,
                    page_id,
                    feature_selector='binarized,deskewed,cropped')

            if oplevel == 'page':
                self._process_segment(page_image, page, page_xywh, page_id,
                                      input_file, n, model)
            else:
                LOG.warning('Operation level %s, but should be "page".',
                            oplevel)
                break

            # update METS (add the PAGE file):
            file_id = make_file_id(input_file, self.output_file_grp)
            pcgts.set_pcGtsId(file_id)
            self.workspace.add_file(
                ID=file_id,
                file_grp=self.output_file_grp,
                pageId=input_file.pageId,
                mimetype=MIMETYPE_PAGE,
                local_filename=os.path.join(self.output_file_grp,
                                            file_id + '.xml'),
                content=to_xml(pcgts).encode('utf-8'),
            )
Пример #6
0
    def process(self):
        """Replace everything below the page level with another annotation.
        
        Open and deserialize PAGE input files from both input file groups,
        then go to the page hierarchy level.
        
        Replace all regions (and their reading order) from the page of
        the first input file group with all regions from the page of
        the second input file group. Keep page-level annotations unchanged
        (i.e. Border, orientation, type, AlternativeImage etc).
        
        If ``transform_coordinates`` is true, then also retrieve the
        coordinate transform of the (cropped, deskewed, dewarped) page
        from the first input fileGrp, and use it to adjust all segment
        coordinates from the second input fileGrp, accordingly.
        (This assumes both are consistent, i.e. the second input was derived
        from the first input via ``ocrd-segment-replace-original`` or similar.)
        
        Produce a new output file by serialising the resulting hierarchy.
        """
        LOG = getLogger('processor.ReplacePage')
        assert_file_grp_cardinality(self.input_file_grp, 2, 'original, page')
        assert_file_grp_cardinality(self.output_file_grp, 1)
        adapt_coords = self.parameter['transform_coordinates']

        def adapt(segment, page_coords):
            # convert this segment's polygon from the derived image's
            # coordinate system back to absolute (page) coordinates
            coords = segment.get_Coords()
            polygon = polygon_from_points(coords.points)
            polygon = coordinates_for_segment(polygon, None, page_coords)
            coords.set_points(points_from_polygon(polygon))
            ensure_consistent(segment)

        # collect and process input file tuples
        for n, ift in enumerate(self.zip_input_files()):
            input_file, page_file = ift
            if input_file is None or page_file is None:
                # no pair for this page in both fileGrps: skip
                continue
            page_id = input_file.pageId or input_file.ID
            LOG.info("INPUT FILE %i / %s", n, page_id)
            pcgts = page_from_file(self.workspace.download_file(input_file))
            self.add_metadata(pcgts)
            page = pcgts.get_Page()
            pcgts2 = page_from_file(self.workspace.download_file(page_file))
            page2 = pcgts2.get_Page()
            # adjust all coordinates (recursively)
            if adapt_coords:
                try:
                    _, page_coords, _ = self.workspace.image_from_page(
                        page, page_id)
                    for region in page2.get_AllRegions():
                        adapt(region, page_coords)
                        if isinstance(region, TextRegionType):
                            for line in region.get_TextLine():
                                adapt(line, page_coords)
                                for word in line.get_Word():
                                    adapt(word, page_coords)
                                    for glyph in word.get_Glyph():
                                        adapt(glyph, page_coords)
                except Exception:
                    # was a bare `except:` (would also swallow KeyboardInterrupt
                    # and SystemExit, and hide the traceback)
                    LOG.exception('invalid coordinates on page %s', page_id)
                    continue
            # replace the reading order and all regions of every type
            page.set_ReadingOrder(page2.get_ReadingOrder())
            for rtype in ('Text', 'Image', 'LineDrawing', 'Graphic', 'Table',
                          'Chart', 'Map', 'Separator', 'Maths', 'Chem',
                          'Music', 'Advert', 'Noise', 'Unknown', 'Custom'):
                getattr(page, 'set_%sRegion' % rtype)(
                    getattr(page2, 'get_%sRegion' % rtype)())

            # update METS (add the PAGE file):
            file_id = make_file_id(page_file, self.output_file_grp)
            out = self.workspace.add_file(ID=file_id,
                                          file_grp=self.output_file_grp,
                                          pageId=input_file.pageId,
                                          local_filename=os.path.join(
                                              self.output_file_grp,
                                              file_id + '.xml'),
                                          mimetype=MIMETYPE_PAGE,
                                          content=to_xml(pcgts))
            LOG.info('created file ID: %s, file_grp: %s, path: %s', file_id,
                     self.output_file_grp, out.local_filename)
Пример #7
0
    def process(self):
        """Perform OCR post-correction with encoder-attention-decoder ANN on the workspace.
        
        Open and deserialise PAGE input files, then iterate over the element hierarchy
        down to the requested `textequiv_level`, making sequences of TextEquiv objects
        as lists of lines. Concatenate their string values, obeying rules of implicit
        whitespace, and map the string positions where the objects start.
        
        Next, transcode the input lines into output lines in parallel, and use
        the retrieved soft alignment scores to calculate hard alignment paths
        between input and output string via Viterbi decoding. Then use those
        to map back the start positions and overwrite each TextEquiv with its
        new content, paying special attention to whitespace:
        
        Distribute edits such that whitespace objects cannot become more than whitespace
        (or be deleted) and that non-whitespace objects must not start or end with
        whitespace (but may contain new whitespace in the middle).
        
        Subsequently, unless processing on the `line` level, make the Word segmentation
        consistent with that result again: merge around deleted whitespace tokens and
        split at whitespace inside non-whitespace tokens.
        
        Finally, make the levels above `textequiv_level` consistent with that
        textual result (via concatenation joined by whitespace).
        
        Produce new output files by serialising the resulting hierarchy.
        """
        assert_file_grp_cardinality(self.input_file_grp, 1)
        assert_file_grp_cardinality(self.output_file_grp, 1)
        # Dragging Word/TextLine references along in all lists besides TextEquiv
        # is necessary because the generateDS version of the PAGE-XML model
        # has no references upwards in the hierarchy (from TextEquiv to containing
        # elements, from Glyph/Word/TextLine to Word/TextLine/TextRegion), and
        # its classes are not hashable.
        level = self.parameter['textequiv_level']
        for n, input_file in enumerate(self.input_files):
            self.logger.info("INPUT FILE %i / %s", n, input_file.pageId
                             or input_file.ID)

            pcgts = page_from_file(self.workspace.download_file(input_file))
            page_id = input_file.pageId or input_file.ID  # (PageType has no id)
            self.logger.info("Correcting text in page '%s' at the %s level",
                             page_id, level)

            # annotate processing metadata:
            self.add_metadata(pcgts)

            # get textequiv references for all lines:
            # FIXME: conf with TextEquiv alternatives
            line_sequences = _page_get_line_sequences_at(level, pcgts)

            # concatenate to strings and get dict of start positions to refs:
            # (self.s2s is presumably the sequence-to-sequence model set up
            #  elsewhere in this processor — not visible here)
            input_lines, conf, textequiv_starts, word_starts, textline_starts = (
                _line_sequences2string_sequences(
                    self.s2s.mapping[0],
                    line_sequences,
                    charmap=self.parameter['charmap']))

            # correct string and get input-output alignment:
            # FIXME: split into self.batch_size chunks
            output_lines, output_probs, output_scores, alignments = (
                self.s2s.correct_lines(input_lines,
                                       conf,
                                       fast=self.parameter['fast_mode'],
                                       greedy=self.parameter['fast_mode']))

            # re-align (from alignment scores) and overwrite the textequiv references:
            # (all seven per-line lists are parallel, hence the wide zip)
            for (input_line, output_line, output_prob, output_score, alignment,
                 textequivs, words,
                 textlines) in zip(input_lines, output_lines, output_probs,
                                   output_scores, alignments, textequiv_starts,
                                   word_starts, textline_starts):
                self.logger.debug('"%s" -> "%s"', input_line.rstrip('\n'),
                                  output_line.rstrip('\n'))

                # convert soft scores (seen from output) to hard path (seen from input):
                #realignment = _alignment2path(alignment, len(input_line), len(output_line),
                #                              1. / self.s2s.voc_size)
                # create hard path via minimal edit distance:
                realignment, distance = _alignment_path(
                    input_line, output_line)

                # overwrite TextEquiv references:
                new_sequence = _update_sequence(input_line, output_line,
                                                output_prob, output_score,
                                                realignment, textequivs, words,
                                                textlines)

                # update Word segmentation:
                if level != 'line':
                    _resegment_sequence(new_sequence, level)

                # output_score is a log-perplexity here (exp'd for display),
                # distance a normalized edit distance (shown as CER percentage)
                self.logger.info(
                    'corrected line with %d elements, ppl: %.3f, CER: %.1f%%',
                    len(new_sequence), np.exp(output_score), distance * 100)

            # make higher levels consistent again:
            page_update_higher_textequiv_levels(level, pcgts)

            # write back result to new annotation:
            file_id = make_file_id(input_file, self.output_file_grp)
            pcgts.set_pcGtsId(file_id)
            file_path = os.path.join(self.output_file_grp, file_id + '.xml')
            self.workspace.add_file(ID=file_id,
                                    file_grp=self.output_file_grp,
                                    pageId=input_file.pageId,
                                    local_filename=file_path,
                                    mimetype=MIMETYPE_PAGE,
                                    content=to_xml(pcgts))
Пример #8
0
    def process(self):
        """Extract word images and texts from the workspace.
        
        Open and deserialize PAGE input files and their respective images,
        then iterate over the element hierarchy down to the word level.
        
        Extract an image for each word (which depending on the workflow
        can already be deskewed, dewarped, binarized etc.), cropped to its
        minimal bounding box, and masked by the coordinate polygon outline.
        Apply ``feature_filter`` (a comma-separated list of image features,
        cf. :py:func:`ocrd.workspace.Workspace.image_from_page`) to skip
        specific features when retrieving derived images.
        If ``transparency`` is true, then also add an alpha channel which is
        fully transparent outside of the mask.
        
        Create a JSON file with:
        * the IDs of the word and its parents,
        * the word's text content,
        * the word's coordinates relative to the line image,
        * the word's absolute coordinates,
        * the word's TextStyle (if any),
        * the word's @production (if any),
        * the word's @readingDirection (if any),
        * the word's @primaryScript (if any),
        * the word's @language (if any),
        * the word's AlternativeImage/@comments (features),
        * the parent textregion's @type,
        * the page's @type,
        * the page's DPI value.
        
        Create a plain text file for the text content, too.
        
        Write all files in the directory of the output file group, named like so:
        * ID + '.raw.png': word image (if the workflow provides raw images)
        * ID + '.bin.png': word image (if the workflow provides binarized images)
        * ID + '.nrm.png': word image (if the workflow provides grayscale-normalized images)
        * ID + '.json': word metadata.
        * ID + '.gt.txt': word text.
        
        (This is intended for training and evaluation of OCR models.)
        """
        LOG = getLogger('processor.ExtractWords')
        assert_file_grp_cardinality(self.input_file_grp, 1)
        assert_file_grp_cardinality(self.output_file_grp, 1)
        # pylint: disable=attribute-defined-outside-init
        for n, input_file in enumerate(self.input_files):
            page_id = input_file.pageId or input_file.ID
            LOG.info("INPUT FILE %i / %s", n, page_id)
            pcgts = page_from_file(self.workspace.download_file(input_file))
            self.add_metadata(pcgts)
            page = pcgts.get_Page()
            page_image, page_coords, page_image_info = self.workspace.image_from_page(
                page,
                page_id,
                feature_filter=self.parameter['feature_filter'],
                transparency=self.parameter['transparency'])
            if page_image_info.resolution != 1:
                dpi = page_image_info.resolution
                if page_image_info.resolutionUnit == 'cm':
                    dpi = round(dpi * 2.54)
            else:
                dpi = None
            ptype = page.get_type()

            regions = itertools.chain.from_iterable([page.get_TextRegion()] + [
                subregion.get_TextRegion()
                for subregion in page.get_TableRegion()
            ])
            if not regions:
                LOG.warning("Page '%s' contains no text regions", page_id)
            for region in regions:
                region_image, region_coords = self.workspace.image_from_segment(
                    region,
                    page_image,
                    page_coords,
                    feature_filter=self.parameter['feature_filter'],
                    transparency=self.parameter['transparency'])
                rtype = region.get_type()

                lines = region.get_TextLine()
                if not lines:
                    LOG.warning("Region '%s' contains no text lines",
                                region.id)
                for line in lines:
                    line_image, line_coords = self.workspace.image_from_segment(
                        line,
                        region_image,
                        region_coords,
                        feature_filter=self.parameter['feature_filter'],
                        transparency=self.parameter['transparency'])
                    words = line.get_Word()
                    if not words:
                        LOG.warning("Line '%s' contains no words", line.id)
                    for word in words:
                        word_image, word_coords = self.workspace.image_from_segment(
                            word,
                            line_image,
                            line_coords,
                            feature_filter=self.parameter['feature_filter'],
                            transparency=self.parameter['transparency'])
                        lpolygon_rel = coordinates_of_segment(
                            word, word_image, word_coords).tolist()
                        lpolygon_abs = polygon_from_points(
                            word.get_Coords().points)
                        ltext = word.get_TextEquiv()
                        if not ltext:
                            LOG.warning("Word '%s' contains no text content",
                                        word.id)
                            ltext = ''
                        else:
                            ltext = ltext[0].Unicode
                        lstyle = word.get_TextStyle() or line.get_TextStyle(
                        ) or region.get_TextStyle()
                        if lstyle:
                            lstyle = {
                                'fontFamily': lstyle.fontFamily,
                                'fontSize': lstyle.fontSize,
                                'xHeight': lstyle.xHeight,
                                'kerning': lstyle.kerning,
                                'serif': lstyle.serif,
                                'monospace': lstyle.monospace,
                                'bold': lstyle.bold,
                                'italic': lstyle.italic,
                                'smallCaps': lstyle.smallCaps,
                                'letterSpaced': lstyle.letterSpaced,
                                'strikethrough': lstyle.strikethrough,
                                'underlined': lstyle.underlined,
                                'underlineStyle': lstyle.underlineStyle,
                                'subscript': lstyle.subscript,
                                'superscript': lstyle.superscript
                            }
                        lfeatures = word_coords['features']
                        description = {
                            'word.ID':
                            word.id,
                            'text':
                            ltext,
                            'style':
                            lstyle,
                            'production': (word.get_production()
                                           or line.get_production()
                                           or region.get_production()),
                            'readingDirection':
                            (word.get_readingDirection()
                             or line.get_readingDirection()
                             or region.get_readingDirection()
                             or page.get_readingDirection()),
                            'primaryScript': (word.get_primaryScript()
                                              or line.get_primaryScript()
                                              or region.get_primaryScript()
                                              or page.get_primaryScript()),
                            'language':
                            (word.get_language() or line.get_primaryLanguage()
                             or region.get_primaryLanguage()
                             or page.get_primaryLanguage()),
                            'features':
                            lfeatures,
                            'DPI':
                            dpi,
                            'coords_rel':
                            lpolygon_rel,
                            'coords_abs':
                            lpolygon_abs,
                            'line.ID':
                            line.id,
                            'region.ID':
                            region.id,
                            'region.type':
                            rtype,
                            'page.ID':
                            page_id,
                            'page.type':
                            ptype,
                            'file_grp':
                            self.input_file_grp,
                            'METS.UID':
                            self.workspace.mets.unique_identifier
                        }
                        if 'binarized' in lfeatures:
                            extension = '.bin'
                        elif 'grayscale_normalized' in lfeatures:
                            extension = '.nrm'
                        else:
                            extension = '.raw'

                        file_id = make_file_id(input_file,
                                               self.output_file_grp)
                        file_path = self.workspace.save_image_file(
                            word_image,
                            file_id + '_' + region.id + '_' + line.id + '_' +
                            word.id + extension,
                            self.output_file_grp,
                            page_id=page_id,
                            mimetype=self.parameter['mimetype'])
                        file_path = file_path.replace(
                            extension +
                            MIME_TO_EXT[self.parameter['mimetype']], '.json')
                        json.dump(description, open(file_path, 'w'))
                        file_path = file_path.replace('.json', '.gt.txt')
                        with open(file_path, 'wb') as f:
                            f.write((ltext + '\n').encode('utf-8'))
Пример #9
0
    def process(self):
        """Deskew the regions of the workspace.

        Open and deserialise each PAGE input file together with its image,
        then walk the element hierarchy down to the TextRegion level.

        For every file, crop each region image according to the layout
        annotation (via coordinates into the higher-level image, or from
        the alternative image), estimate the binarization threshold and
        the deskewing angle of the region (bounded by ``maxskew``), and
        annotate that angle on the region.

        Add each new image file to the workspace in the output fileGrp,
        using a file ID with suffix ``.IMG-DESKEW`` plus further
        identification of the input element.

        Produce a new output file by serialising the resulting hierarchy.
        """
        LOG = getLogger('processor.OcropyDeskew')
        level = self.parameter['level-of-operation']
        assert_file_grp_cardinality(self.input_file_grp, 1)
        assert_file_grp_cardinality(self.output_file_grp, 1)

        for n, input_file in enumerate(self.input_files):
            LOG.info("INPUT FILE %i / %s", n, input_file.pageId
                     or input_file.ID)
            file_id = make_file_id(input_file, self.output_file_grp)

            pcgts = page_from_file(self.workspace.download_file(input_file))
            self.add_metadata(pcgts)
            # PageType has no id of its own, so fall back along this chain:
            page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID
            page = pcgts.get_Page()

            # The image must not have been rotated already (we will
            # overwrite @orientation anyway); abort if no such image
            # can be produced:
            page_filter = 'deskewed' if level == 'page' else ''
            page_image, page_coords, _ = self.workspace.image_from_page(
                page, page_id, feature_filter=page_filter)
            if level == 'page':
                self._process_segment(page, page_image, page_coords,
                                      "page '%s'" % page_id, input_file.pageId,
                                      file_id)
            else:
                if level == 'table':
                    regions = page.get_TableRegion()
                else:  # region
                    regions = page.get_AllRegions(classes=['Text'],
                                                  order='reading-order')
                if not regions:
                    LOG.warning('Page "%s" contains no text regions', page_id)
                for region in regions:
                    # likewise require a region image that has not been
                    # rotated yet; abort otherwise:
                    region_image, region_coords = self.workspace.image_from_segment(
                        region, page_image, page_coords,
                        feature_filter='deskewed')
                    self._process_segment(region, region_image, region_coords,
                                          "region '%s'" % region.id,
                                          input_file.pageId,
                                          file_id + '_' + region.id)

            # update METS (add the PAGE file):
            file_path = os.path.join(self.output_file_grp, file_id + '.xml')
            pcgts.set_pcGtsId(file_id)
            out = self.workspace.add_file(ID=file_id,
                                          file_grp=self.output_file_grp,
                                          pageId=input_file.pageId,
                                          local_filename=file_path,
                                          mimetype=MIMETYPE_PAGE,
                                          content=to_xml(pcgts))
            LOG.info('created file ID: %s, file_grp: %s, path: %s',
                     file_id, self.output_file_grp, out.local_filename)
Пример #10
0
    def process(self):
        """Resegment lines of the workspace.

        Open and deserialise PAGE input files and their respective images,
        then iterate over the element hierarchy down to the line level.

        Next, get each region image according to the layout annotation (from
        the alternative image of the region, or by cropping via coordinates
        into the higher-level image), and compute a new line segmentation
        from that (as a label mask).

        Then for each line within the region, find the label with the largest
        foreground area in the binarized image within the annotated polygon
        (or rectangle) of the line. Unless its relative area is too small,
        or its center is far off, convert that label's mask into a polygon
        outline, intersect with the old polygon, and find the contour of that
        segment. Annotate the result as new coordinates of the line.

        Add the new image file to the workspace along with the output fileGrp,
        and using a file ID with suffix ``.IMG-RESEG`` along with further
        identification of the input element.

        Produce a new output file by serialising the resulting hierarchy.
        """
        LOG = getLogger('processor.OcropyResegment')
        # This makes best sense for bad/coarse line segmentation, like current GT
        # or as postprocessing for bbox-only steps.
        # Most notably, it can convert rectangles to polygons (polygonalization).
        # It depends on a decent line segmentation from ocropy though. So it
        # _should_ ideally be run after deskewing (on the page or region level),
        # _must_ be run after binarization (on page or region level). Also, the
        # method's accuracy crucially depends on a good estimate of the images'
        # pixel density (at least if source input is not 300 DPI).
        # minimum relative foreground area to accept a candidate label:
        threshold = self.parameter['min_fraction']
        # number of pixels to extend the polygon margins by:
        margin = self.parameter['extend_margins']
        assert_file_grp_cardinality(self.input_file_grp, 1)
        assert_file_grp_cardinality(self.output_file_grp, 1)

        for (n, input_file) in enumerate(self.input_files):
            LOG.info("INPUT FILE %i / %s", n, input_file.pageId
                     or input_file.ID)
            file_id = make_file_id(input_file, self.output_file_grp)

            pcgts = page_from_file(self.workspace.download_file(input_file))
            self.add_metadata(pcgts)
            page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID  # (PageType has no id)
            page = pcgts.get_Page()

            # require a binarized page image (segmentation operates on binary):
            page_image, page_xywh, page_image_info = self.workspace.image_from_page(
                page, page_id, feature_selector='binarized')
            # estimate zoom relative to the 300 DPI the method was tuned for:
            if self.parameter['dpi'] > 0:
                zoom = 300.0 / self.parameter['dpi']
            elif page_image_info.resolution != 1:
                dpi = page_image_info.resolution
                if page_image_info.resolutionUnit == 'cm':
                    dpi *= 2.54
                LOG.info('Page "%s" uses %f DPI', page_id, dpi)
                zoom = 300.0 / dpi
            else:
                zoom = 1

            regions = page.get_AllRegions(classes=['Text'])
            if not regions:
                LOG.warning('Page "%s" contains no text regions', page_id)
            for region in regions:
                lines = region.get_TextLine()
                if not lines:
                    LOG.warning('Page "%s" region "%s" contains no text lines',
                                page_id, region.id)
                    continue
                if len(lines) == 1:
                    # a single line has no neighbours to resegment against
                    LOG.warning('Page "%s" region "%s" contains only one line',
                                page_id, region.id)
                    continue
                region_image, region_xywh = self.workspace.image_from_segment(
                    region,
                    page_image,
                    page_xywh,
                    feature_selector='binarized')
                region_array = pil2array(region_image)
                #region_array, _ = common.binarize(region_array, maxskew=0) # just in case still raw
                # foreground mask of dark pixels; use the builtin bool here
                # (the np.bool alias was deprecated in NumPy 1.20 and removed
                # in 1.24, where it raises AttributeError):
                region_bin = np.array(region_array <= midrange(region_array),
                                      dtype=bool)
                report = check_region(region_bin, zoom)
                try:
                    if report:
                        raise Exception(report)
                    region_labels, _, _, _, _, _ = compute_segmentation(
                        region_bin, zoom=zoom)
                except Exception as err:
                    LOG.warning(
                        'Cannot line-segment page "%s" region "%s": %s',
                        page_id, region.id, err)
                    # fallback option 1: borderclean
                    # label margins vs interior, but with the interior
                    # extended into the margin by its connected components
                    # to remove noise from neighbouring regions:
                    #region_labels = borderclean_bin(region_bin, margin=round(4/zoom)) + 1
                    # too dangerous, because we risk losing dots from i or punctuation;
                    # fallback option2: only extend_margins
                    # instead, just provide a uniform label, so at least we get
                    # to extend the polygon margins:
                    #region_labels = np.ones_like(region_bin)
                    # fallback option3: keep unchanged
                    continue
                for line in lines:
                    if line.get_AlternativeImage():
                        # get cropped line image:
                        line_image, line_xywh = self.workspace.image_from_segment(
                            line,
                            region_image,
                            region_xywh,
                            feature_selector='binarized')
                        LOG.debug("Using AlternativeImage (%s) for line '%s'",
                                  line_xywh['features'], line.id)
                        # crop region arrays accordingly:
                        line_polygon = coordinates_of_segment(
                            line, region_image, region_xywh)
                        line_bbox = bbox_from_polygon(line_polygon)
                        line_labels = region_labels[line_bbox[1]:line_bbox[3],
                                                    line_bbox[0]:line_bbox[2]]
                        line_bin = region_bin[line_bbox[1]:line_bbox[3],
                                              line_bbox[0]:line_bbox[2]]
                        # get polygon in relative (line) coordinates:
                        line_polygon = coordinates_of_segment(
                            line, line_image, line_xywh)
                        line_polygon = resegment(line_polygon,
                                                 line_labels,
                                                 line_bin,
                                                 line.id,
                                                 extend_margins=margin,
                                                 threshold_relative=threshold)
                        if line_polygon is None:
                            continue  # not good enough – keep
                        # convert back to absolute (page) coordinates:
                        line_polygon = coordinates_for_segment(
                            line_polygon, line_image, line_xywh)
                    else:
                        # get polygon in relative (region) coordinates:
                        line_polygon = coordinates_of_segment(
                            line, region_image, region_xywh)
                        line_polygon = resegment(line_polygon,
                                                 region_labels,
                                                 region_bin,
                                                 line.id,
                                                 extend_margins=margin,
                                                 threshold_relative=threshold)
                        if line_polygon is None:
                            continue  # not good enough – keep
                        # convert back to absolute (page) coordinates:
                        line_polygon = coordinates_for_segment(
                            line_polygon, region_image, region_xywh)
                    # annotate result:
                    line.get_Coords().points = points_from_polygon(
                        line_polygon)
                    # create new image:
                    line_image, line_xywh = self.workspace.image_from_segment(
                        line,
                        region_image,
                        region_xywh,
                        feature_selector='binarized')
                    # update METS (add the image file):
                    file_path = self.workspace.save_image_file(
                        line_image,
                        file_id=file_id + '_' + region.id + '_' + line.id +
                        '.IMG-RESEG',
                        page_id=page_id,
                        file_grp=self.output_file_grp)
                    # update PAGE (reference the image file):
                    # NOTE(review): comments are taken from region_xywh, not
                    # line_xywh — looks like the line image's own features
                    # were intended here; confirm against sibling processors
                    line.add_AlternativeImage(
                        AlternativeImageType(filename=file_path,
                                             comments=region_xywh['features']))

            # update METS (add the PAGE file):
            file_path = os.path.join(self.output_file_grp, file_id + '.xml')
            pcgts.set_pcGtsId(file_id)
            out = self.workspace.add_file(ID=file_id,
                                          file_grp=self.output_file_grp,
                                          pageId=input_file.pageId,
                                          local_filename=file_path,
                                          mimetype=MIMETYPE_PAGE,
                                          content=to_xml(pcgts))
            LOG.info('created file ID: %s, file_grp: %s, path: %s', file_id,
                     self.output_file_grp, out.local_filename)
Пример #11
0
    def process(self):
        """Performs page cropping with Tesseract on the workspace.

        Open and deserialize PAGE input files and their respective images.
        Set up Tesseract to detect text blocks on each page, then find the
        largest coordinate extent spanning all of them. Use that extent to
        define a Border, which gets added to the page.

        Moreover, crop the original image accordingly, and reference the
        resulting image file as AlternativeImage in the Page element.

        Add the new image file to the workspace in the output fileGrp,
        using a file ID with suffix ``.IMG-CROP`` plus further
        identification of the input element.

        Produce new output files by serialising the resulting hierarchy.
        """
        LOG = getLogger('processor.TesserocrCrop')
        assert_file_grp_cardinality(self.input_file_grp, 1)
        assert_file_grp_cardinality(self.output_file_grp, 1)

        with tesserocr.PyTessBaseAPI(path=TESSDATA_PREFIX) as tessapi:
            # disable table detection here (tables count as text blocks),
            # because we do not want to risk confusing the spine with
            # a column separator and thus creeping into a neighbouring
            # page:
            tessapi.SetVariable("textord_tabfind_find_tables", "0")
            for n, input_file in enumerate(self.input_files):
                file_id = make_file_id(input_file, self.output_file_grp)
                page_id = input_file.pageId or input_file.ID
                LOG.info("INPUT FILE %i / %s", n, page_id)
                pcgts = page_from_file(self.workspace.download_file(input_file))
                self.add_metadata(pcgts)
                page = pcgts.get_Page()

                # warn if a Border exists already (it will be replaced):
                existing = page.get_Border()
                if existing:
                    left, top, right, bottom = bbox_from_points(
                        existing.get_Coords().points)
                    LOG.warning('Overwriting existing Border: %i:%i,%i:%i',
                                left, top, right, bottom)

                # the image must not have been cropped already;
                # abort if no such image can be produced:
                page_image, page_xywh, page_image_info = self.workspace.image_from_page(
                    page, page_id, feature_filter='cropped')
                # determine pixel density (parameter override wins,
                # then image meta-data, else let Tesseract estimate):
                dpi = self.parameter['dpi']
                if dpi > 0:
                    LOG.info("Page '%s' images will use %d DPI from parameter override", page_id, dpi)
                elif page_image_info.resolution != 1:
                    dpi = page_image_info.resolution
                    if page_image_info.resolutionUnit == 'cm':
                        dpi = round(dpi * 2.54)
                    LOG.info("Page '%s' images will use %d DPI from image meta-data", page_id, dpi)
                else:
                    dpi = 0
                    LOG.info("Page '%s' images will use DPI estimated from segmentation", page_id)
                zoom = 1
                if dpi:
                    tessapi.SetVariable('user_defined_dpi', str(dpi))
                    zoom = 300 / dpi

                bounds = self.estimate_bounds(page, page_image, tessapi, zoom)
                self.process_page(page, page_image, page_xywh, bounds,
                                  file_id, input_file.pageId)

                pcgts.set_pcGtsId(file_id)
                self.workspace.add_file(
                    ID=file_id,
                    file_grp=self.output_file_grp,
                    pageId=input_file.pageId,
                    mimetype=MIMETYPE_PAGE,
                    local_filename=os.path.join(self.output_file_grp,
                                                file_id + '.xml'),
                    content=to_xml(pcgts))
    def process(self):
        """Performs region segmentation by reading mask images in pseudo-colour.
        
        Open and deserialize each PAGE input file (or generate from image input file)
        from the first input file group, as well as mask image file from the second.
        
        Then iterate over all connected (equally colored) mask segments and compute
        convex hull contours for them. Convert them to polygons, and look up their
        color value in ``colordict`` to instantiate the appropriate region types
        (optionally with subtype). Instantiate and annotate regions accordingly.
        
        Produce a new output file by serialising the resulting hierarchy.
        """
        LOG = getLogger('processor.ImportImageSegmentation')
        # exactly two input groups: base PAGE/image files, and mask images
        assert_file_grp_cardinality(self.input_file_grp, 2, 'base and mask')
        assert_file_grp_cardinality(self.output_file_grp, 1)

        # maps hex colour strings to region class names (optionally "Class:subtype")
        colordict = self.parameter['colordict']
        # maps region class names to their PAGE @type simple-type enumerations;
        # NOTE(review): the "ChartType" key looks like it should be
        # "ChartRegion" (the element name used as classname below) — confirm
        typedict = {
            "TextRegion": TextTypeSimpleType,
            "GraphicRegion": GraphicsTypeSimpleType,
            "ChartType": ChartTypeSimpleType
        }
        # collect input file tuples
        ifts = self.zip_input_files()  # input file tuples
        # process input file tuples
        for ift in ifts:
            input_file, segmentation_file = ift
            LOG.info("processing page %s", input_file.pageId)
            pcgts = page_from_file(self.workspace.download_file(input_file))
            self.add_metadata(pcgts)
            page = pcgts.get_Page()

            # import mask image
            segmentation_filename = self.workspace.download_file(
                segmentation_file).local_filename
            with pushd_popd(self.workspace.directory):
                segmentation_pil = Image.open(segmentation_filename)
            has_alpha = segmentation_pil.mode == 'RGBA'
            # colour keys in colordict are 8 hex digits with alpha, 6 without
            if has_alpha:
                colorformat = "%08X"
            else:
                colorformat = "%06X"
                if segmentation_pil.mode != 'RGB':
                    segmentation_pil = segmentation_pil.convert('RGB')
            # convert to array
            segmentation_array = np.array(segmentation_pil)
            # collapse 3 color channels
            # (dot with powers of 256 packs the R/G/B[/A] bytes of each pixel
            # into a single uint32 colour value)
            segmentation_array = segmentation_array.dot(
                np.array([2**24, 2**16, 2**8, 1],
                         np.uint32)[0 if has_alpha else 1:])
            # partition mapped colors vs background
            colors = np.unique(segmentation_array)
            bgcolors = []
            for i, color in enumerate(colors):
                colorname = colorformat % color
                # colours absent from (or empty in) colordict count as background
                if (colorname not in colordict or not colordict[colorname]):
                    #raise Exception("Unknown color %s (not in colordict)" % colorname)
                    LOG.info("Ignoring background color %s", colorname)
                    bgcolors.append(i)
            # binary mask of all background pixels (used for the Border case below)
            background = np.zeros_like(segmentation_array, np.uint8)
            if bgcolors:
                for i in bgcolors:
                    background += np.array(segmentation_array == colors[i],
                                           np.uint8)
                colors = np.delete(colors, bgcolors, 0)
            # iterate over mask for each mapped color/class
            regionno = 0
            for color in colors:
                # get region (sub)type
                colorname = colorformat % color
                classname = colordict[colorname]
                regiontype = None
                custom = None
                if ":" in classname:
                    # colordict value carries a subtype after the colon
                    classname, regiontype = classname.split(":")
                    if classname in typedict:
                        typename = membername(typedict[classname], regiontype)
                        if typename == regiontype:
                            # not predefined in PAGE: use other + custom
                            custom = "subtype:%s" % regiontype
                            regiontype = "other"
                    else:
                        # class has no @type enumeration: keep subtype in @custom only
                        custom = "subtype:%s" % regiontype
                if classname + "Type" not in globals():
                    raise Exception(
                        "Unknown class '%s' for color %s in colordict" %
                        (classname, colorname))
                # resolve e.g. "TextRegion" to the generated TextRegionType class
                classtype = globals()[classname + "Type"]
                if classtype is BorderType:
                    # mask from all non-background regions
                    classmask = 1 - background
                else:
                    # mask from current color/class
                    classmask = np.array(segmentation_array == color, np.uint8)
                if not np.count_nonzero(classmask):
                    continue
                # now get the contours and make polygons for them
                # NOTE(review): assumes the OpenCV 4.x two-value return of
                # findContours (OpenCV 3.x returned three values) — confirm
                contours, _ = cv2.findContours(classmask, cv2.RETR_EXTERNAL,
                                               cv2.CHAIN_APPROX_SIMPLE)
                for contour in contours:
                    # (could also just take bounding boxes to avoid islands/inclusions...)
                    area = cv2.contourArea(contour)
                    # filter too small regions
                    area_pct = area / np.prod(segmentation_array.shape) * 100
                    if area < 100 and area_pct < 0.1:
                        LOG.warning(
                            'ignoring contour of only %.1f%% area for %s',
                            area_pct, classname)
                        continue
                    LOG.info('found region %s:%s:%s with area %.1f%%',
                             classname, regiontype or '', custom or '',
                             area_pct)
                    # simplify shape
                    poly = cv2.approxPolyDP(
                        contour, 2, False)[:, 0, ::]  # already ordered x,y
                    if len(poly) < 4:
                        LOG.warning(
                            'ignoring contour of only %d points (area %.1f%%) for %s',
                            len(poly), area_pct, classname)
                        continue
                    if classtype is BorderType:
                        # add Border
                        page.set_Border(
                            BorderType(Coords=CoordsType(
                                points=points_from_polygon(poly))))
                        # only one Border is possible: stop at the first contour
                        break
                    else:
                        # instantiate region
                        regionno += 1
                        region = classtype(
                            id="region_%d" % regionno,
                            type_=regiontype,
                            custom=custom,
                            Coords=CoordsType(
                                points=points_from_polygon(poly)))
                        # add region
                        getattr(page, 'add_%s' % classname)(region)

            file_id = make_file_id(input_file, self.output_file_grp)
            self.workspace.add_file(ID=file_id,
                                    file_grp=self.output_file_grp,
                                    pageId=input_file.pageId,
                                    mimetype=MIMETYPE_PAGE,
                                    local_filename=os.path.join(
                                        self.output_file_grp,
                                        file_id + '.xml'),
                                    content=to_xml(pcgts))
Example #13
0
    def _process_segment(self, page_image, page, page_xywh, page_id,
                         input_file, n):
        """Binarize one page image in the style of ocropus-nlbin.

        Converts ``page_image`` to a normalized grayscale array, flattens it
        (unless it is already effectively bitonal) by estimating the local
        whitelevel, estimates low/high intensity thresholds from
        high-variance regions, thresholds the rescaled image, saves the
        binarized image to the output fileGrp, and references it as an
        AlternativeImage on ``page``.

        Returns early (without output) if the image is empty (constant).
        """
        LOG = getLogger('OcrdAnybaseocrBinarizer')
        raw = ocrolib.pil2array(page_image)
        # collapse color images to grayscale by averaging the channels
        if len(raw.shape) > 2:
            raw = np.mean(raw, 2)
        raw = raw.astype("float64")
        # perform image normalization
        image = raw - amin(raw)
        if amax(image) == amin(image):
            LOG.info("# image is empty: %s" % (page_id))
            return
        image /= amax(image)

        # check whether the image is already effectively binarized
        if self.parameter['gray']:
            extreme = 0
        else:
            # fraction of pixels that are already near-black or near-white
            extreme = (np.sum(image < 0.05) +
                       np.sum(image > 0.95)) * 1.0 / np.prod(image.shape)
        if extreme > 0.95:
            comment = "no-normalization"
            flat = image
        else:
            comment = ""
            # if not, we need to flatten it by estimating the local whitelevel
            LOG.info("Flattening")
            # zoom down for speed, take the given percentile in a horizontal
            # then a vertical window as the local whitelevel, then zoom back
            m = interpolation.zoom(image, self.parameter['zoom'])
            m = filters.percentile_filter(m,
                                          self.parameter['perc'],
                                          size=(self.parameter['range'], 2))
            m = filters.percentile_filter(m,
                                          self.parameter['perc'],
                                          size=(2, self.parameter['range']))
            m = interpolation.zoom(m, 1.0 / self.parameter['zoom'])
            if self.parameter['debug'] > 0:
                clf()
                imshow(m, vmin=0, vmax=1)
                ginput(1, self.parameter['debug'])
            # guard against off-by-one shape differences caused by zooming
            w, h = minimum(array(image.shape), array(m.shape))
            flat = clip(image[:w, :h] - m[:w, :h] + 1, 0, 1)
            if self.parameter['debug'] > 0:
                clf()
                imshow(flat, vmin=0, vmax=1)
                ginput(1, self.parameter['debug'])

        # estimate low and high thresholds
        LOG.info("Estimating Thresholds")
        d0, d1 = flat.shape
        # ignore a relative margin of 'bignore' at the image borders
        o0, o1 = int(self.parameter['bignore'] * d0), int(
            self.parameter['bignore'] * d1)
        est = flat[o0:d0 - o0, o1:d1 - o1]
        if self.parameter['escale'] > 0:
            # by default, we use only regions that contain
            # significant variance; this makes the percentile
            # based low and high estimates more reliable
            e = self.parameter['escale']
            v = est - filters.gaussian_filter(est, e * 20.0)
            v = filters.gaussian_filter(v**2, e * 20.0)**0.5
            v = (v > 0.3 * amax(v))
            v = morphology.binary_dilation(v, structure=ones((int(e * 50), 1)))
            v = morphology.binary_dilation(v, structure=ones((1, int(e * 50))))
            if self.parameter['debug'] > 0:
                imshow(v)
                ginput(1, self.parameter['debug'])
            est = est[v]
        lo = stats.scoreatpercentile(est.ravel(), self.parameter['lo'])
        hi = stats.scoreatpercentile(est.ravel(), self.parameter['hi'])
        # rescale the image to get the gray scale image
        LOG.info("Rescaling")
        flat -= lo
        flat /= (hi - lo)
        flat = clip(flat, 0, 1)
        if self.parameter['debug'] > 0:
            imshow(flat, vmin=0, vmax=1)
            ginput(1, self.parameter['debug'])
        binarized = 1 * (flat > self.parameter['threshold'])

        # output the normalized grayscale and the thresholded images
        # print_info("%s lo-hi (%.2f %.2f) angle %4.1f %s" % (fname, lo, hi, angle, comment))
        LOG.info("%s lo-hi (%.2f %.2f) %s" % (page_id, lo, hi, comment))
        LOG.info("writing")
        if self.parameter['debug'] > 0 or self.parameter['show']:
            clf()
            gray()
            imshow(binarized)
            ginput(1, max(0.1, self.parameter['debug']))

        page_xywh['features'] += ',binarized'

        # convert back to a PIL bilevel image (255 = background)
        bin_array = array(255 * (binarized > ocrolib.midrange(binarized)), 'B')
        bin_image = ocrolib.array2pil(bin_array)

        # add the image file to the workspace and reference it in the PAGE
        file_id = make_file_id(input_file, self.output_file_grp)
        file_path = self.workspace.save_image_file(
            bin_image,
            file_id + '-IMG',
            page_id=page_id,
            file_grp=self.output_file_grp)
        page.add_AlternativeImage(
            AlternativeImageType(filename=file_path,
                                 comments=page_xywh['features']))
Example #14
0
    def process(self):
        """Detect text lines with sbb_textline_detector and merge the results.

        For each input file: load the PAGE document and its raw page image,
        run the textline detector on a temporary copy of the image, then
        transfer Border, ReadingOrder, text regions and text lines from the
        detector's output into the input PAGE (translating all coordinates
        into the original image's frame via ``adapt_coords``), and write the
        merged PAGE file to the output fileGrp.
        """
        log = getLogger('processor.OcrdSbbTextlineDetectorRecognize')
        assert_file_grp_cardinality(self.input_file_grp, 1)
        assert_file_grp_cardinality(self.output_file_grp, 1)

        for (n, input_file) in enumerate(self.input_files):
            page_id = input_file.pageId or input_file.ID
            log.info("INPUT FILE %i / %s", n, input_file)

            file_id = make_file_id(input_file, self.output_file_grp)

            # ensure the output directory exists (idempotent)
            os.makedirs(self.output_file_grp, exist_ok=True)

            pcgts = page_from_file(self.workspace.download_file(input_file))
            page = pcgts.get_Page()
            # the detector expects the raw image
            # (no cropping/binarization/normalization)
            page_image, page_coords, page_image_info = \
                self.workspace.image_from_page(
                    page, page_id,
                    feature_filter='cropped,binarized,grayscale_normalized'
                )

            with tempfile.TemporaryDirectory() as tmp_dirname:
                # Save the image; mkstemp returns an open file descriptor,
                # which must be closed explicitly to avoid leaking it
                # (only the path is needed here)
                image_fd, image_file = tempfile.mkstemp(dir=tmp_dirname,
                                                        suffix='.png')
                os.close(image_fd)
                page_image.save(image_file)

                # Segment the image
                model = self.parameter['model']
                x = textline_detector(image_file, tmp_dirname, file_id, model)
                x.run()

                # Read segmentation results
                tmp_filename = os.path.join(tmp_dirname, file_id) + '.xml'
                tmp_pcgts = ocrd_models.ocrd_page.parse(tmp_filename,
                                                        silence=True)
                tmp_page = tmp_pcgts.get_Page()

            # Create a new PAGE file from the input file
            pcgts.set_pcGtsId(file_id)

            # Merge results → PAGE file

            # 1. Border
            if page.get_Border():
                log.warning("Removing existing page border")
            page.set_Border(None)
            # We need to translate the coordinates:
            text_border = adapt_coords(tmp_page.get_Border(), page,
                                       page_coords)
            if text_border is None:
                # intersection is empty (border outside of rotated original image)
                log.warning("new border would be empty, skipping")
            else:
                page.set_Border(text_border)

            # 2. ReadingOrder
            if page.get_ReadingOrder():
                log.warning("Removing existing regions' reading order")
            page.set_ReadingOrder(tmp_page.get_ReadingOrder())

            # 3. TextRegion
            # FIXME: what about table and image regions?
            if page.get_TextRegion():
                log.warning("Removing existing text regions")
            # We need to translate the coordinates:
            text_regions_new = []
            for text_region in tmp_page.get_TextRegion():
                text_region = adapt_coords(text_region, page, page_coords)
                if text_region is None:
                    # intersection is empty (polygon outside of above border)
                    log.warning(
                        "new text region polygon would be empty, skipping")
                    continue
                text_regions_new.append(text_region)
                text_lines_new = []
                for text_line in text_region.get_TextLine():
                    text_line = adapt_coords(text_line, text_region,
                                             page_coords)
                    if text_line is None:
                        # intersection is empty (polygon outside of region)
                        log.warning(
                            "new text line polygon would be empty, skipping")
                        continue
                    text_lines_new.append(text_line)
                text_region.set_TextLine(text_lines_new)
            page.set_TextRegion(text_regions_new)

            # Save metadata about this operation
            self.add_metadata(pcgts)

            self.workspace.add_file(
                ID=file_id,
                file_grp=self.output_file_grp,
                pageId=page_id,
                mimetype='application/vnd.prima.page+xml',
                local_filename=os.path.join(self.output_file_grp, file_id) +
                '.xml',
                content=ocrd_models.ocrd_page.to_xml(pcgts))
    def process(self):
        """Performs region segmentation by reading from COCO annotations.
        
        Open and deserialize the COCO JSON file from the second input file group.
        (It lists region categories/subtypes, file names and segmentations for all pages.)
        
        Open and deserialize each PAGE input file (or generate from image input file)
        from the first input file group. Now find this page in COCO:
        - try to match the PAGE ``imageFilename`` or METS file path matches to some
          COCO ``file_name``, otherwise
        - try to match the numeric part of the METS physical page ID to some
          COCO ``id``, otherwise
        - skip with an error.
        
        Then create and add a region for each ``segmentation``, converting its polygon
        to coordinate points and its COCO category to a region type (and subtype),
        either for a PubLayNet classification or PAGE classification (as produced by
        ocrd-segment-extract-pages), as indicated by ``source``.
        
        Produce a new output file by serialising the resulting hierarchy.
        
        Afterwards, if there are still COCO images left unaccounted for (i.e. without
        corresponding input files), then show a warning.
        """
        LOG = getLogger('processor.ImportCOCOSegmentation')
        # Load JSON
        assert_file_grp_cardinality(self.input_file_grp, 2, 'base and COCO')
        # pylint: disable=attribute-defined-outside-init
        self.input_file_grp, coco_grp = self.input_file_grp.split(',')
        # pylint: disable=attribute-defined-outside-init
        if not self.input_files:
            LOG.warning('No input files to process')
            return
        if coco_grp in self.workspace.mets.file_groups:
            try:
                # the COCO file is the one file in the group without a pageId
                cocofile = next(
                    f for f in self.workspace.mets.find_files(fileGrp=coco_grp)
                    # if f.mimetype == 'application/json' and not f.pageId
                    if not f.pageId)
            except StopIteration:
                raise Exception(
                    "no non-page-specific file in second file group (COCO file)",
                    coco_grp)
            cocofile = self.workspace.download_file(cocofile).local_filename
        elif os.path.isfile(coco_grp):
            # fallback: second "group" is a plain file path
            cocofile = coco_grp
        else:
            raise Exception("file not found in second file group (COCO file)",
                            coco_grp)

        LOG.info('Loading COCO annotations from "%s" into memory...', cocofile)
        with open(cocofile, 'r') as inp:
            coco = json.load(inp)
        LOG.info('Loaded JSON for %d images with %d regions in %d categories',
                 len(coco['images']), len(coco['annotations']),
                 len(coco['categories']))
        coco_source = 'PubLayNet'
        # Convert to usable dicts
        # classes:
        categories = dict()
        subcategories = dict()
        for cat in coco['categories']:
            if cat['source'] == 'PAGE':
                coco_source = 'PAGE'
            if 'supercategory' in cat and cat['supercategory']:
                categories[cat['id']] = cat['supercategory']
                subcategories[cat['id']] = cat['name']
            else:
                categories[cat['id']] = cat['name']
        # images and annotations:
        images_by_id = dict()
        images_by_filename = dict()
        for image in coco['images']:
            images_by_id[image['id']] = image
            images_by_filename[image['file_name']] = image
        for annotation in coco['annotations']:
            image = images_by_id[annotation['image_id']]
            regions = image.setdefault('regions', list())
            regions.append(annotation)
        del coco

        LOG.info('Converting %s annotations into PAGE-XML', coco_source)
        for n, input_file in enumerate(self.input_files):
            page_id = input_file.pageId or input_file.ID
            # numeric part of the page ID as fallback matching key;
            # None (never a valid key) if the page ID contains no digits,
            # which previously raised ValueError on int('')
            try:
                num_page_id = int(page_id.strip(page_id.strip("0123456789")))
            except ValueError:
                num_page_id = None
            LOG.info("INPUT FILE %i / %s", n, page_id)
            pcgts = page_from_file(self.workspace.download_file(input_file))
            self.add_metadata(pcgts)
            page = pcgts.get_Page()

            # find COCO image
            if page.imageFilename in images_by_filename:
                image = images_by_filename[page.imageFilename]
            elif num_page_id in images_by_id:
                image = images_by_id[num_page_id]
            else:
                LOG.error('Page "%s" / file "%s" not found in COCO', page_id,
                          page.imageFilename)
                # todo: maybe we should at least write the (unchanged) output PAGE?
                continue
            if image['width'] != page.imageWidth:
                LOG.error(
                    'Page "%s" width %d does not match annotated width %d',
                    page_id, page.imageWidth, image['width'])
            if image['height'] != page.imageHeight:
                LOG.error(
                    'Page "%s" height %d does not match annotated height %d',
                    page_id, page.imageHeight, image['height'])

            # todo: remove existing segmentation first?
            # images without any annotations never got a 'regions' entry
            # (it is only created by setdefault above), so default to an
            # empty list instead of raising KeyError
            for region in image.get('regions', []):
                assert isinstance(
                    region['segmentation'],
                    list), "importing RLE/mask segmentation not implemented"
                # flat [x1,y1,x2,y2,...] list → (N,2) point array
                polygon = np.array(region['segmentation'])
                polygon = np.reshape(polygon, (polygon.shape[1] // 2, 2))
                coords = CoordsType(points=points_from_polygon(polygon))
                category = categories[region['category_id']]
                if region['category_id'] in subcategories:
                    subcategory = subcategories[region['category_id']]
                else:
                    subcategory = None
                region_id = 'r' + str(region['id'])
                LOG.info('Adding region %s:%s [area %d]', category, subcategory
                         or '', region['area'])
                if coco_source == 'PubLayNet':
                    # fixed mapping of the 5 PubLayNet categories
                    if category == 'text':
                        region_obj = TextRegionType(
                            id=region_id,
                            Coords=coords,
                            type_=TextTypeSimpleType.PARAGRAPH)
                        page.add_TextRegion(region_obj)
                    elif category == 'title':
                        region_obj = TextRegionType(
                            id=region_id,
                            Coords=coords,
                            type_=TextTypeSimpleType.HEADING)  # CAPTION?
                        page.add_TextRegion(region_obj)
                    elif category == 'list':
                        region_obj = TextRegionType(
                            id=region_id,
                            Coords=coords,
                            type_=TextTypeSimpleType.LISTLABEL)  # OTHER?
                        page.add_TextRegion(region_obj)
                    elif category == 'table':
                        region_obj = TableRegionType(id=region_id,
                                                     Coords=coords)
                        page.add_TableRegion(region_obj)
                    elif category == 'figure':
                        region_obj = ImageRegionType(id=region_id,
                                                     Coords=coords)
                        page.add_ImageRegion(region_obj)
                    else:
                        raise Exception('unknown region category: %s' %
                                        category)
                else:  # 'PAGE'
                    # category/subcategory are PAGE element/type names here
                    args = {'id': region_id, 'Coords': coords}
                    if subcategory:
                        typedict = {
                            "TextRegion": TextTypeSimpleType,
                            "GraphicRegion": GraphicsTypeSimpleType,
                            "ChartType": ChartTypeSimpleType
                        }
                        if category in typedict:
                            subtype = membername(typedict[category],
                                                 subcategory)
                            if subtype == subcategory:
                                # not predefined in PAGE: use other + custom
                                args['custom'] = "subtype:%s" % subcategory
                                args['type_'] = "other"
                            else:
                                args['type_'] = subcategory
                        else:
                            args['custom'] = "subtype:%s" % subcategory
                    if category + 'Type' not in globals():
                        raise Exception('unknown region category: %s' %
                                        category)
                    region_type = globals()[category + 'Type']
                    if region_type is BorderType:
                        page.set_Border(BorderType(Coords=coords))
                    else:
                        region_obj = region_type(**args)
                        getattr(page, 'add_%s' % category)(region_obj)
            # remove image from dicts (to account for leftovers below)
            images_by_id.pop(num_page_id, None)
            images_by_filename.pop(page.imageFilename, None)

            file_id = make_file_id(input_file, self.output_file_grp)
            self.workspace.add_file(ID=file_id,
                                    file_grp=self.output_file_grp,
                                    pageId=input_file.pageId,
                                    mimetype=MIMETYPE_PAGE,
                                    local_filename=os.path.join(
                                        self.output_file_grp,
                                        file_id + '.xml'),
                                    content=to_xml(pcgts))

        # warn of remaining COCO images
        if images_by_filename and not self.page_id:
            LOG.warning('%d images remain unaccounted for after processing',
                        len(images_by_filename))
            if LOG.isEnabledFor(logging.DEBUG):
                for filename in images_by_filename:
                    LOG.debug('not found in workspace: "%s"', filename)
Example #16
0
    def process(self):
        """
        Binarize with sbb_binarization.

        Open each PAGE input file and its (not yet binarized) image, then
        binarize at the configured ``operation_level`` (page, region or line)
        with the SbbBinarizer model, save the resulting image(s) to the
        output fileGrp as AlternativeImage references, and write the updated
        PAGE file.
        """
        LOG = getLogger('processor.SbbBinarize')
        assert_file_grp_cardinality(self.input_file_grp, 1)
        assert_file_grp_cardinality(self.output_file_grp, 1)

        oplevel = self.parameter['operation_level']
        model_path = self.resolve_resource(self.parameter['model'])
        binarizer = SbbBinarizer(model_dir=model_path, logger=LOG)

        for n, input_file in enumerate(self.input_files):
            file_id = make_file_id(input_file, self.output_file_grp)
            page_id = input_file.pageId or input_file.ID
            LOG.info("INPUT FILE %i / %s", n, page_id)
            pcgts = page_from_file(self.workspace.download_file(input_file))
            self.add_metadata(pcgts)
            pcgts.set_pcGtsId(file_id)
            page = pcgts.get_Page()
            page_image, page_xywh, _ = self.workspace.image_from_page(
                page, page_id, feature_filter='binarized')

            if oplevel == 'page':
                LOG.info("Binarizing on 'page' level in page '%s'", page_id)
                bin_image = cv2pil(
                    binarizer.run(image=pil2cv(page_image), use_patches=True))
                # update METS (add the image file):
                bin_image_path = self.workspace.save_image_file(
                    bin_image,
                    file_id + '.IMG-BIN',
                    page_id=input_file.pageId,
                    file_grp=self.output_file_grp)
                page.add_AlternativeImage(
                    AlternativeImageType(filename=bin_image_path,
                                         comments='%s,binarized' %
                                         page_xywh['features']))

            elif oplevel == 'region':
                regions = page.get_AllRegions(['Text', 'Table'], depth=1)
                if not regions:
                    LOG.warning("Page '%s' contains no text/table regions",
                                page_id)
                for region in regions:
                    region_image, region_xywh = self.workspace.image_from_segment(
                        region,
                        page_image,
                        page_xywh,
                        feature_filter='binarized')
                    region_image_bin = cv2pil(
                        binarizer.run(image=pil2cv(region_image),
                                      use_patches=True))
                    region_image_bin_path = self.workspace.save_image_file(
                        region_image_bin,
                        "%s_%s.IMG-BIN" % (file_id, region.id),
                        page_id=input_file.pageId,
                        file_grp=self.output_file_grp)
                    region.add_AlternativeImage(
                        AlternativeImageType(filename=region_image_bin_path,
                                             comments='%s,binarized' %
                                             region_xywh['features']))

            elif oplevel == 'line':
                # get_TextLine() returns a list, so flatten into
                # (region_id, line) pairs here — pairing the region with the
                # whole list would make 'line' a list below, breaking
                # image_from_segment(line, ...) and line.id
                region_line_tuples = [
                    (region.id, line)
                    for region in page.get_AllRegions(['Text'], depth=0)
                    for line in region.get_TextLine()
                ]
                if not region_line_tuples:
                    LOG.warning("Page '%s' contains no text lines", page_id)
                for region_id, line in region_line_tuples:
                    line_image, line_xywh = self.workspace.image_from_segment(
                        line,
                        page_image,
                        page_xywh,
                        feature_filter='binarized')
                    line_image_bin = cv2pil(
                        binarizer.run(image=pil2cv(line_image),
                                      use_patches=True))
                    line_image_bin_path = self.workspace.save_image_file(
                        line_image_bin,
                        "%s_%s_%s.IMG-BIN" % (file_id, region_id, line.id),
                        page_id=input_file.pageId,
                        file_grp=self.output_file_grp)
                    line.add_AlternativeImage(
                        AlternativeImageType(filename=line_image_bin_path,
                                             comments='%s,binarized' %
                                             line_xywh['features']))

            self.workspace.add_file(ID=file_id,
                                    file_grp=self.output_file_grp,
                                    pageId=input_file.pageId,
                                    mimetype=MIMETYPE_PAGE,
                                    local_filename=join(
                                        self.output_file_grp,
                                        file_id + '.xml'),
                                    content=to_xml(pcgts))
    def process(self):
        """Extract region images from the workspace.
        
        Open and deserialize PAGE input files and their respective images,
        then iterate over the element hierarchy down to the region level.
        
        Extract an image for each region (which depending on the workflow
        can already be deskewed, dewarped, binarized etc.), cropped to its
        minimal bounding box, and masked by the coordinate polygon outline.
        If ``transparency`` is true, then also add an alpha channel which is
        fully transparent outside of the mask.
        
        Create a JSON file with:
        * the IDs of the region and its parents,
        * the region's coordinates relative to the region image,
        * the region's absolute coordinates,
        * the (text) region's text content (if any),
        * the (text) region's TextStyle (if any),
        * the (text) region's @production (if any),
        * the (text) region's @readingDirection (if any),
        * the (text) region's @textLineOrder (if any),
        * the (text) region's @primaryScript (if any),
        * the (text) region's @primaryLanguage (if any),
        * the region's AlternativeImage/@comments (features),
        * the region's element class,
        * the region's @type,
        * the page's @type,
        * the page's DPI value.
        
        Write all files in the directory of the output file group, named like so:
        * ID + '.raw.png': region image (if the workflow provides raw images)
        * ID + '.bin.png': region image (if the workflow provides binarized images)
        * ID + '.nrm.png': region image (if the workflow provides grayscale-normalized images)
        * ID + '.json': region metadata.
        """
        LOG = getLogger('processor.ExtractRegions')
        assert_file_grp_cardinality(self.input_file_grp, 1)
        assert_file_grp_cardinality(self.output_file_grp, 1)
        # pylint: disable=attribute-defined-outside-init
        for n, input_file in enumerate(self.input_files):
            page_id = input_file.pageId or input_file.ID
            LOG.info("INPUT FILE %i / %s", n, page_id)
            pcgts = page_from_file(self.workspace.download_file(input_file))
            self.add_metadata(pcgts)
            page = pcgts.get_Page()
            page_image, page_coords, page_image_info = self.workspace.image_from_page(
                page, page_id, transparency=self.parameter['transparency'])
            if page_image_info.resolution != 1:
                dpi = page_image_info.resolution
                if page_image_info.resolutionUnit == 'cm':
                    # convert pixels-per-cm to DPI
                    dpi = round(dpi * 2.54)
            else:
                dpi = None
            ptype = page.get_type()
            # constant per page, so compute once outside the region loops
            file_id = make_file_id(input_file, self.output_file_grp)

            regions = dict()
            for name in CLASSES.keys():
                if not name or name == 'Border' or ':' in name:
                    # no subtypes here
                    continue
                regions[name] = getattr(page, 'get_' + name)()
            for rtype, rlist in regions.items():
                for region in rlist:
                    description = {
                        'region.ID': region.id,
                        'region.type': rtype
                    }
                    region_image, region_coords = self.workspace.image_from_segment(
                        region,
                        page_image,
                        page_coords,
                        transparency=self.parameter['transparency'])
                    if rtype in ['TextRegion', 'ChartRegion', 'GraphicRegion']:
                        subrtype = region.get_type()
                    else:
                        subrtype = None
                    description['subtype'] = subrtype
                    description['coords_rel'] = coordinates_of_segment(
                        region, region_image, region_coords).tolist()
                    description['coords_abs'] = polygon_from_points(
                        region.get_Coords().points)
                    # rtype values are PAGE element names ('TextRegion' etc.,
                    # see getattr(page, 'get_' + name) above), so the former
                    # comparison against 'text' never matched and text/style
                    # metadata was silently never exported
                    if rtype == 'TextRegion':
                        rtext = region.get_TextEquiv()
                        if rtext:
                            description['region.text'] = rtext[0].Unicode
                        else:
                            description['region.text'] = ''
                        rstyle = region.get_TextStyle() or page.get_TextStyle()
                        if rstyle:
                            description['region.style'] = {
                                'fontFamily': rstyle.fontFamily,
                                'fontSize': rstyle.fontSize,
                                'xHeight': rstyle.xHeight,
                                'kerning': rstyle.kerning,
                                'serif': rstyle.serif,
                                'monospace': rstyle.monospace,
                                'bold': rstyle.bold,
                                'italic': rstyle.italic,
                                'smallCaps': rstyle.smallCaps,
                                'letterSpaced': rstyle.letterSpaced,
                                'strikethrough': rstyle.strikethrough,
                                'underlined': rstyle.underlined,
                                'underlineStyle': rstyle.underlineStyle,
                                'subscript': rstyle.subscript,
                                'superscript': rstyle.superscript
                            }
                        description['production'] = region.get_production()
                        description['readingDirection'] = (
                            region.get_readingDirection()
                            or page.get_readingDirection())
                        description['textLineOrder'] = (
                            region.get_textLineOrder()
                            or page.get_textLineOrder())
                        description['primaryScript'] = (
                            region.get_primaryScript()
                            or page.get_primaryScript())
                        description['primaryLanguage'] = (
                            region.get_primaryLanguage()
                            or page.get_primaryLanguage())
                    description['features'] = region_coords['features']
                    description['DPI'] = dpi
                    description['page.ID'] = page_id
                    description['page.type'] = ptype
                    description['file_grp'] = self.input_file_grp
                    description[
                        'METS.UID'] = self.workspace.mets.unique_identifier
                    if 'binarized' in region_coords['features']:
                        extension = '.bin'
                    elif 'grayscale_normalized' in region_coords['features']:
                        extension = '.nrm'
                    else:
                        extension = '.raw'

                    file_path = self.workspace.save_image_file(
                        region_image,
                        file_id + '_' + region.id + extension,
                        self.output_file_grp,
                        pageId=input_file.pageId,
                        mimetype=self.parameter['mimetype'])
                    self.workspace.add_file(
                        # include the region ID so METS file IDs stay unique
                        # when a page has more than one region (formerly every
                        # region's JSON reused the same file_id + '.json')
                        ID=file_id + '_' + region.id + '.json',
                        file_grp=self.output_file_grp,
                        local_filename=file_path.replace(
                            extension +
                            MIME_TO_EXT[self.parameter['mimetype']], '.json'),
                        pageId=input_file.pageId,
                        mimetype='application/json',
                        content=json.dumps(description))
Пример #18
0
    def process(self):
        """Perform deskewing of the page / regions / lines with Tesseract on the workspace.

        Open and deserialise PAGE input files and their respective images,
        then iterate over the element hierarchy down to the region level
        for all text and table regions (or further to the line level,
        depending on ``operation_level``).

        Set up Tesseract to recognise each segment image's orientation, skew
        and script (with both OSD and AnalyseLayout). Rotate the image
        accordingly, and annotate the angle, readingDirection and textlineOrder.

        Create a corresponding image file, and reference it as AlternativeImage
        in the element. Add the new image file to the workspace with the
        output fileGrp and an ID based on input file and input element.

        Produce a new output file by serialising the resulting hierarchy.
        """
        LOG = getLogger('processor.TesserocrDeskew')
        # exactly one input and one output fileGrp expected:
        assert_file_grp_cardinality(self.input_file_grp, 1)
        assert_file_grp_cardinality(self.output_file_grp, 1)
        # granularity of deskewing: 'page', 'region' or 'line'
        oplevel = self.parameter['operation_level']

        with PyTessBaseAPI(
                path=get_tessdata_path(),
                lang="osd",  # osd required for legacy init!
                oem=OEM.TESSERACT_LSTM_COMBINED,  # legacy required for OSD!
                psm=PSM.AUTO_OSD) as tessapi:
            if oplevel == 'line':
                # presumably relaxes OSD's minimum text size so short
                # single lines can still be analysed -- TODO confirm
                tessapi.SetVariable("min_characters_to_try", "15")
            for n, input_file in enumerate(self.input_files):
                file_id = make_file_id(input_file, self.output_file_grp)
                page_id = input_file.pageId or input_file.ID
                LOG.info("INPUT FILE %i / %s", n, page_id)
                pcgts = page_from_file(
                    self.workspace.download_file(input_file))
                pcgts.set_pcGtsId(file_id)
                self.add_metadata(pcgts)
                page = pcgts.get_Page()

                page_image, page_xywh, page_image_info = self.workspace.image_from_page(
                    page,
                    page_id,
                    # image must not have been rotated already,
                    # (we will overwrite @orientation anyway,)
                    # abort if no such image can be produced:
                    feature_filter='deskewed' if oplevel == 'page' else '')
                # determine DPI: parameter override > image meta-data > let
                # Tesseract estimate from the segmentation (dpi == 0):
                if self.parameter['dpi'] > 0:
                    dpi = self.parameter['dpi']
                    LOG.info(
                        "Page '%s' images will use %d DPI from parameter override",
                        page_id, dpi)
                elif page_image_info.resolution != 1:
                    dpi = page_image_info.resolution
                    if page_image_info.resolutionUnit == 'cm':
                        # convert pixels-per-cm to pixels-per-inch:
                        dpi = round(dpi * 2.54)
                    LOG.info(
                        "Page '%s' images will use %d DPI from image meta-data",
                        page_id, dpi)
                else:
                    dpi = 0
                    LOG.info(
                        "Page '%s' images will use DPI estimated from segmentation",
                        page_id)
                if dpi:
                    tessapi.SetVariable('user_defined_dpi', str(dpi))

                LOG.info("Deskewing on '%s' level in page '%s'", oplevel,
                         page_id)

                if oplevel == 'page':
                    # deskew the whole page image at once:
                    self._process_segment(tessapi, page, page_image, page_xywh,
                                          "page '%s'" % page_id,
                                          input_file.pageId, file_id)
                else:
                    regions = page.get_AllRegions(classes=['Text', 'Table'])
                    if not regions:
                        LOG.warning("Page '%s' contains no text regions",
                                    page_id)
                    for region in regions:
                        region_image, region_xywh = self.workspace.image_from_segment(
                            region,
                            page_image,
                            page_xywh,
                            # image must not have been rotated already,
                            # (we will overwrite @orientation anyway,)
                            # abort if no such image can be produced:
                            feature_filter='deskewed')
                        if oplevel == 'region':
                            self._process_segment(tessapi, region,
                                                  region_image, region_xywh,
                                                  "region '%s'" % region.id,
                                                  input_file.pageId,
                                                  file_id + '_' + region.id)
                        elif isinstance(region, TextRegionType):
                            # line level: only TextRegions carry TextLines
                            lines = region.get_TextLine()
                            if not lines:
                                LOG.warning(
                                    "Page '%s' region '%s' contains no lines",
                                    page_id, region.id)
                            for line in lines:
                                line_image, line_xywh = self.workspace.image_from_segment(
                                    line, region_image, region_xywh)
                                self._process_segment(
                                    tessapi, line, line_image, line_xywh,
                                    "line '%s'" % line.id, input_file.pageId,
                                    file_id + '_' + region.id + '_' + line.id)

                # serialise the (modified) PAGE hierarchy as the output file:
                self.workspace.add_file(ID=file_id,
                                        file_grp=self.output_file_grp,
                                        pageId=input_file.pageId,
                                        mimetype=MIMETYPE_PAGE,
                                        local_filename=os.path.join(
                                            self.output_file_grp,
                                            file_id + '.xml'),
                                        content=to_xml(pcgts))
Пример #19
0
    def process(self):
        """Validate, repair and plausibilize the region segmentation of the workspace.

        Open and deserialize PAGE input files and their respective images,
        then iterate over the element hierarchy down to the region level.

        First, validate the coordinates of all segments (warning of children
        extending beyond their parents), repairing inconsistent or invalid
        polygons in place where possible.

        Then, if ``plausibilize``, detect pairs of sibling text regions which
        are (almost) equal, contained in one another, or overlapping beyond
        ``plausibilize_merge_min_overlap``, and delete or merge them
        accordingly (updating the reading order).

        Finally, if ``sanitize``, shrink each text region to the hull of its
        text lines.

        Produce a new output file by serialising the resulting hierarchy.
        """
        LOG = getLogger('processor.RepairSegmentation')
        assert_file_grp_cardinality(self.input_file_grp, 1)
        assert_file_grp_cardinality(self.output_file_grp, 1)

        sanitize = self.parameter['sanitize']
        plausibilize = self.parameter['plausibilize']

        for (n, input_file) in enumerate(self.input_files):
            page_id = input_file.pageId or input_file.ID
            LOG.info("INPUT FILE %i / %s", n, page_id)
            pcgts = page_from_file(self.workspace.download_file(input_file))
            self.add_metadata(pcgts)
            page = pcgts.get_Page()

            #
            # validate segmentation (warn of children extending beyond their parents)
            #
            report = PageValidator.validate(ocrd_page=pcgts,
                                            page_textequiv_consistency='off',
                                            check_baseline=False)
            if not report.is_valid:
                errors = report.errors
                # empty the report; only unrepairable errors are added back:
                report.errors = []
                # NOTE(review): errors other than coordinate consistency/validity
                # are dropped from the report here -- confirm that is intended.
                for error in errors:
                    if isinstance(
                            error,
                        (CoordinateConsistencyError, CoordinateValidityError)):
                        # resolve the PAGE element the error refers to, by tag and ID:
                        if error.tag == 'Page':
                            element = page.get_Border()
                        elif error.tag.endswith('Region'):
                            element = next(
                                (region for region in page.get_AllRegions()
                                 if region.id == error.ID), None)
                        elif error.tag == 'TextLine':
                            element = next((line
                                            for region in page.get_AllRegions(
                                                classes=['Text'])
                                            for line in region.get_TextLine()
                                            if line.id == error.ID), None)
                        elif error.tag == 'Word':
                            element = next((word
                                            for region in page.get_AllRegions(
                                                classes=['Text'])
                                            for line in region.get_TextLine()
                                            for word in line.get_Word()
                                            if word.id == error.ID), None)
                        elif error.tag == 'Glyph':
                            element = next((glyph
                                            for region in page.get_AllRegions(
                                                classes=['Text'])
                                            for line in region.get_TextLine()
                                            for word in line.get_Word()
                                            for glyph in word.get_Glyph()
                                            if glyph.id == error.ID), None)
                        else:
                            LOG.error(
                                "Unrepairable error for unknown segment type: %s",
                                str(error))
                            report.add_error(error)
                            continue
                        if not element:
                            LOG.error(
                                "Unrepairable error for unknown segment element: %s",
                                str(error))
                            report.add_error(error)
                            continue
                        # repair in place; re-add the error if repair fails:
                        if isinstance(error, CoordinateConsistencyError):
                            try:
                                ensure_consistent(element)
                            except Exception as e:
                                LOG.error(str(e))
                                report.add_error(error)
                                continue
                        else:
                            ensure_valid(element)
                        LOG.warning("Fixed %s for %s '%s'",
                                    error.__class__.__name__, error.tag,
                                    error.ID)
            # log any remaining (unrepairable) validation errors:
            if not report.is_valid:
                LOG.warning(report.to_xml())

            #
            # plausibilize region segmentation (remove redundant text regions)
            #
            ro = page.get_ReadingOrder()
            if ro:
                rogroup = ro.get_OrderedGroup() or ro.get_UnorderedGroup()
            else:
                rogroup = None
            mark_for_deletion = list()  # what regions get removed?
            mark_for_merging = dict(
            )  # what regions get merged into which regions?
            # cover recursive region structure (but compare only at the same level)
            parents = list(
                set([
                    region.parent_object_
                    for region in page.get_AllRegions(classes=['Text'])
                ]))
            for parent in parents:
                regions = parent.get_TextRegion()
                # sort by area to ensure to arrive at a total ordering compatible
                # with the topological sort along containment/equivalence arcs
                # (so we can avoid substituting regions with superregions that have
                #  themselves been substituted/deleted):
                RegionPolygon = namedtuple('RegionPolygon',
                                           ['region', 'polygon'])
                regionspolys = sorted([
                    RegionPolygon(
                        region,
                        Polygon(polygon_from_points(
                            region.get_Coords().points))) for region in regions
                ],
                                      key=lambda x: x.polygon.area)
                # compare every pair of sibling regions exactly once:
                for i in range(0, len(regionspolys)):
                    for j in range(i + 1, len(regionspolys)):
                        region1 = regionspolys[i].region
                        region2 = regionspolys[j].region
                        poly1 = regionspolys[i].polygon
                        poly2 = regionspolys[j].polygon
                        LOG.debug('Comparing regions "%s" and "%s"',
                                  region1.id, region2.id)

                        if poly1.almost_equals(poly2):
                            LOG.warning(
                                'Page "%s" region "%s" is almost equal to "%s" %s',
                                page_id, region2.id, region1.id,
                                '(removing)' if plausibilize else '')
                            mark_for_deletion.append(region2.id)
                        elif poly1.contains(poly2):
                            LOG.warning(
                                'Page "%s" region "%s" is within "%s" %s',
                                page_id, region2.id, region1.id,
                                '(removing)' if plausibilize else '')
                            mark_for_deletion.append(region2.id)
                        elif poly2.contains(poly1):
                            LOG.warning(
                                'Page "%s" region "%s" is within "%s" %s',
                                page_id, region1.id, region2.id,
                                '(removing)' if plausibilize else '')
                            mark_for_deletion.append(region1.id)
                        elif poly1.overlaps(poly2):
                            inter_poly = poly1.intersection(poly2)
                            union_poly = poly1.union(poly2)
                            LOG.debug(
                                'Page "%s" region "%s" overlaps "%s" by %f/%f',
                                page_id, region1.id, region2.id,
                                inter_poly.area / poly1.area,
                                inter_poly.area / poly2.area)
                            if union_poly.convex_hull.area >= poly1.area + poly2.area:
                                # skip this pair -- combined polygon encloses previously free segments
                                pass
                            elif inter_poly.area / poly2.area > self.parameter[
                                    'plausibilize_merge_min_overlap']:
                                LOG.warning(
                                    'Page "%s" region "%s" is almost within "%s" %s',
                                    page_id, region2.id, region1.id,
                                    '(merging)' if plausibilize else '')
                                mark_for_merging[region2.id] = region1
                            elif inter_poly.area / poly1.area > self.parameter[
                                    'plausibilize_merge_min_overlap']:
                                LOG.warning(
                                    'Page "%s" region "%s" is almost within "%s" %s',
                                    page_id, region1.id, region2.id,
                                    '(merging)' if plausibilize else '')
                                mark_for_merging[region1.id] = region2

                        # TODO: more merging cases...
                        #LOG.info('Intersection %i', poly1.intersects(poly2))
                        #LOG.info('Containment %i', poly1.contains(poly2))
                        #if poly1.intersects(poly2):
                        #    LOG.info('Area 1 %d', poly1.area)
                        #    LOG.info('Area 2 %d', poly2.area)
                        #    LOG.info('Area intersect %d', poly1.intersection(poly2).area)

                if plausibilize:
                    # pass the regions sorted (see above)
                    _plausibilize_group(regionspolys, rogroup,
                                        mark_for_deletion, mark_for_merging)

            #
            # sanitize region segmentation (shrink to hull of lines)
            #
            if sanitize:
                self.sanitize_page(page, page_id)

            # serialise the (modified) PAGE hierarchy as the output file:
            file_id = make_file_id(input_file, self.output_file_grp)
            self.workspace.add_file(ID=file_id,
                                    file_grp=self.output_file_grp,
                                    pageId=input_file.pageId,
                                    mimetype=MIMETYPE_PAGE,
                                    local_filename=os.path.join(
                                        self.output_file_grp,
                                        file_id + '.xml'),
                                    content=to_xml(pcgts))
Пример #20
0
    def process(self):
        """Dewarp the lines of the workspace.

        Open and deserialise PAGE input files and their respective images,
        then iterate over the element hierarchy down to the TextLine level.

        Next, get each line image according to the layout annotation (from
        the alternative image of the line, or by cropping via coordinates
        into the higher-level image), and dewarp it (without resizing).
        Export the result as an image file.

        Add the new image file to the workspace along with the output fileGrp,
        and using a file ID with suffix ``.IMG-DEWARP`` along with further
        identification of the input element.

        Reference each new image in the AlternativeImage of the element.

        Produce a new output file by serialising the resulting hierarchy.
        """
        assert_file_grp_cardinality(self.input_file_grp, 1)
        assert_file_grp_cardinality(self.output_file_grp, 1)

        for (n, input_file) in enumerate(self.input_files):
            self.logger.info("INPUT FILE %i / %s", n, input_file.pageId
                             or input_file.ID)
            file_id = make_file_id(input_file, self.output_file_grp)

            pcgts = page_from_file(self.workspace.download_file(input_file))
            self.add_metadata(pcgts)
            page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID  # (PageType has no id)
            page = pcgts.get_Page()

            page_image, page_xywh, page_image_info = self.workspace.image_from_page(
                page, page_id)
            # determine zoom factor relative to 300 DPI
            # (parameter override > image meta-data > assume 1:1):
            if self.parameter['dpi'] > 0:
                zoom = 300.0 / self.parameter['dpi']
            elif page_image_info.resolution != 1:
                dpi = page_image_info.resolution
                if page_image_info.resolutionUnit == 'cm':
                    # convert pixels-per-cm to pixels-per-inch:
                    dpi *= 2.54
                self.logger.info('Page "%s" uses %f DPI', page_id, dpi)
                zoom = 300.0 / dpi
            else:
                zoom = 1

            regions = page.get_AllRegions(classes=['Text'],
                                          order='reading-order')
            if not regions:
                self.logger.warning('Page "%s" contains no text regions',
                                    page_id)
            for region in regions:
                region_image, region_xywh = self.workspace.image_from_segment(
                    region, page_image, page_xywh)

                lines = region.get_TextLine()
                if not lines:
                    self.logger.warning('Region %s contains no text lines',
                                        region.id)
                for line in lines:
                    line_image, line_xywh = self.workspace.image_from_segment(
                        line, region_image, region_xywh)

                    self.logger.info(
                        "About to dewarp page '%s' region '%s' line '%s'",
                        page_id, region.id, line.id)
                    try:
                        dew_image = dewarp(
                            line_image,
                            self.lnorm,
                            check=True,
                            max_neighbour=self.parameter['max_neighbour'],
                            zoom=zoom)
                    except InvalidLine as err:
                        # unusable line: skip it entirely
                        self.logger.error('cannot dewarp line "%s": %s',
                                          line.id, err)
                        continue
                    except InadequateLine as err:
                        self.logger.warning('cannot dewarp line "%s": %s',
                                            line.id, err)
                        # as a fallback, simply pad the image vertically
                        # (just as dewarping would do on average, so at least
                        #  this line has similar margins as the others):
                        dew_image = padvert(line_image,
                                            self.parameter['range'])
                    # update METS (add the image file):
                    file_path = self.workspace.save_image_file(
                        dew_image,
                        file_id + '_' + region.id + '_' + line.id +
                        '.IMG-DEWARP',
                        page_id=input_file.pageId,
                        file_grp=self.output_file_grp)
                    # update PAGE (reference the image file):
                    line.add_AlternativeImage(
                        AlternativeImageType(filename=file_path,
                                             comments=line_xywh['features'] +
                                             ',dewarped'))

            # update METS (add the PAGE file):
            file_path = os.path.join(self.output_file_grp, file_id + '.xml')
            pcgts.set_pcGtsId(file_id)
            out = self.workspace.add_file(ID=file_id,
                                          file_grp=self.output_file_grp,
                                          pageId=input_file.pageId,
                                          local_filename=file_path,
                                          mimetype=MIMETYPE_PAGE,
                                          content=to_xml(pcgts))
            self.logger.info('created file ID: %s, file_grp: %s, path: %s',
                             file_id, self.output_file_grp, out.local_filename)
Пример #21
0
    def process(self):
        """Perform OCR recognition with Tesseract on the workspace.

        Open and deserialise PAGE input files and their respective images,
        then iterate over the element hierarchy down to the requested
        ``textequiv_level`` if it exists and ``overwrite_words`` is disabled,
        or to the line level otherwise. In the latter case,
        (remove any existing segmentation below the line level, and)
        create new segmentation below the line level if necessary.

        Set up Tesseract to recognise each segment's image (either from
        AlternativeImage or cropping the bounding box rectangle and masking
        it from the polygon outline) with the appropriate mode and ``model``.

        Put text and confidence results into the TextEquiv at ``textequiv_level``,
        removing any existing TextEquiv.

        Finally, make the higher levels consistent with these results by concatenation,
        ordered as appropriate for its readingDirection, textLineOrder, and ReadingOrder,
        and joined by whitespace, as appropriate for the respective level and Relation/join
        status.

        Produce new output files by serialising the resulting hierarchy.
        """
        LOG = getLogger('processor.TesserocrRecognize')
        LOG.debug("TESSDATA: %s, installed Tesseract models: %s", *get_languages())

        assert_file_grp_cardinality(self.input_file_grp, 1)
        assert_file_grp_cardinality(self.output_file_grp, 1)

        maxlevel = self.parameter['textequiv_level']
        model = get_languages()[1][-1] # last installed model as fallback
        if 'model' in self.parameter:
            model = self.parameter['model']
            # every sub-model of a '+'-combination must be installed:
            for sub_model in model.split('+'):
                if sub_model not in get_languages()[1]:
                    raise Exception("configured model " + sub_model + " is not installed")
        
        with PyTessBaseAPI(path=TESSDATA_PREFIX, lang=model) as tessapi:
            LOG.info("Using model '%s' in %s for recognition at the %s level",
                     model, get_languages()[0], maxlevel)
            if maxlevel == 'glyph':
                # populate GetChoiceIterator() with LSTM models, too:
                tessapi.SetVariable("lstm_choice_mode", "2") # aggregate symbols
                tessapi.SetVariable("lstm_choice_iterations", "15") # squeeze out more best paths
            # TODO: maybe warn/raise when illegal combinations or characters not in the model unicharset?
            if self.parameter['char_whitelist']:
                tessapi.SetVariable("tessedit_char_whitelist", self.parameter['char_whitelist'])
            if self.parameter['char_blacklist']:
                tessapi.SetVariable("tessedit_char_blacklist", self.parameter['char_blacklist'])
            if self.parameter['char_unblacklist']:
                tessapi.SetVariable("tessedit_char_unblacklist", self.parameter['char_unblacklist'])
            # todo: determine relevancy of these variables:
            # tessapi.SetVariable("tessedit_single_match", "0")
            #
            # tessedit_load_sublangs
            # tessedit_preserve_min_wd_len 2
            # tessedit_prefer_joined_punct 0
            # tessedit_write_rep_codes 0
            # tessedit_parallelize 0
            # tessedit_zero_rejection 0
            # tessedit_zero_kelvin_rejection 0
            # tessedit_reject_mode 0
            # tessedit_use_reject_spaces 1
            # tessedit_fix_fuzzy_spaces 1
            # tessedit_char_blacklist
            # tessedit_char_whitelist
            # chs_leading_punct ('`"
            # chs_trailing_punct1 ).,;:?!
            # chs_trailing_punct2 )'`"
            # numeric_punctuation .,
            # unrecognised_char |
            # ok_repeated_ch_non_alphanum_wds -?*=
            # conflict_set_I_l_1 Il1[]
            # preserve_interword_spaces 0
            # tessedit_enable_dict_correction 0
            # tessedit_enable_bigram_correction 1
            # stopper_smallword_size 2
            # wordrec_max_join_chunks 4
            # suspect_space_level 100
            # suspect_short_words 2
            # language_model_ngram_on 0
            # language_model_ngram_order 8
            # language_model_min_compound_length 3
            # language_model_penalty_non_freq_dict_word 0.1
            # language_model_penalty_non_dict_word 0.15
            # language_model_penalty_punc 0.2
            # language_model_penalty_case 0.1
            # language_model_penalty_script 0.5
            # language_model_penalty_chartype 0.3
            # language_model_penalty_spacing 0.05
            # textord_max_noise_size 7
            # enable_noise_removal 1
            # classify_bln_numeric_mode 0
            # lstm_use_matrix 1
            # user_words_file
            # user_patterns_file
            for (n, input_file) in enumerate(self.input_files):
                page_id = input_file.pageId or input_file.ID
                LOG.info("INPUT FILE %i / %s", n, page_id)
                pcgts = page_from_file(self.workspace.download_file(input_file))
                self.add_metadata(pcgts)
                page = pcgts.get_Page()
                
                page_image, page_xywh, page_image_info = self.workspace.image_from_page(
                    page, page_id)
                # determine DPI: parameter override > image meta-data > let
                # Tesseract estimate from the segmentation (dpi == 0):
                if self.parameter['dpi'] > 0:
                    dpi = self.parameter['dpi']
                    LOG.info("Page '%s' images will use %d DPI from parameter override", page_id, dpi)
                elif page_image_info.resolution != 1:
                    dpi = page_image_info.resolution
                    if page_image_info.resolutionUnit == 'cm':
                        # convert pixels-per-cm to pixels-per-inch:
                        dpi = round(dpi * 2.54)
                    LOG.info("Page '%s' images will use %d DPI from image meta-data", page_id, dpi)
                else:
                    dpi = 0
                    LOG.info("Page '%s' images will use DPI estimated from segmentation", page_id)
                if dpi:
                    tessapi.SetVariable('user_defined_dpi', str(dpi))
                
                LOG.info("Processing page '%s'", page_id)
                # materialize the regions into a list: a bare itertools.chain
                # object is always truthy, so the emptiness test below would
                # otherwise never trigger
                regions = list(itertools.chain.from_iterable(
                    [page.get_TextRegion()] +
                    [subregion.get_TextRegion() for subregion in page.get_TableRegion()]))
                if not regions:
                    LOG.warning("Page '%s' contains no text regions", page_id)
                else:
                    self._process_regions(tessapi, regions, page_image, page_xywh)
                # propagate recognition results upwards in the hierarchy:
                page_update_higher_textequiv_levels(maxlevel, pcgts)
                
                # serialise the (modified) PAGE hierarchy as the output file:
                file_id = make_file_id(input_file, self.output_file_grp)
                pcgts.set_pcGtsId(file_id)
                self.workspace.add_file(
                    ID=file_id,
                    file_grp=self.output_file_grp,
                    pageId=input_file.pageId,
                    mimetype=MIMETYPE_PAGE,
                    local_filename=os.path.join(self.output_file_grp,
                                                file_id + '.xml'),
                    content=to_xml(pcgts))
Пример #22
0
    def process(self):
        """Segment pages into regions+lines, tables into cells+lines, or regions into lines.

        Open and deserialise PAGE input files and their respective images,
        then iterate over the element hierarchy down to the requested level.

        Depending on ``level-of-operation``, consider existing segments:
        - If ``overwrite_separators=True`` on ``page`` level, then
          delete any SeparatorRegions.
        - If ``overwrite_regions=True`` on ``page`` level, then
          delete any top-level TextRegions (along with ReadingOrder).
        - If ``overwrite_regions=True`` on ``table`` level, then
          delete any TextRegions in TableRegions (along with their OrderGroup).
        - If ``overwrite_lines=True`` on ``region`` level, then
          delete any TextLines in TextRegions.
        - If ``overwrite_order=True`` on ``page`` or ``table`` level, then
          delete the reading order OrderedGroup entry corresponding
          to the (page/table) segment.

        Next, get each element image according to the layout annotation (from
        the alternative image of the page/region, or by cropping via coordinates
        into the higher-level image) in binarized form, and represent it as an array
        with non-text regions and (remaining) text neighbours suppressed.

        Then compute a text line segmentation for that array (as a label mask).
        When ``level-of-operation`` is ``page`` or ``table``, this also entails
        detecting
        - up to ``maximages`` large foreground images,
        - up to ``maxseps`` foreground h/v-line separators and
        - up to ``maxcolseps`` background column separators
        before text line segmentation itself, as well as aggregating text lines
        to text regions afterwards.

        Text regions are detected via a hybrid variant recursive X-Y cut algorithm
        (RXYC): RXYC partitions the binarized image in top-down manner by detecting
        horizontal or vertical gaps. This implementation uses the bottom-up text line
        segmentation to guide the search, and also uses both pre-existing and newly
        detected separators to alternatively partition the respective boxes into
        non-rectangular parts.

        During line segmentation, suppress the foreground of all previously annotated
        regions (of any kind) and lines, except if just removed due to ``overwrite``.
        During region aggregation however, combine the existing separators with the
        new-found separators to guide the column search.

        All detected segments (both text line and text region) are sorted according
        to their reading order (assuming a top-to-bottom, left-to-right ordering).
        When ``level-of-operation`` is ``page``, prefer vertical (column-first)
        succession of regions. When it is ``table``, prefer horizontal (row-first)
        succession of cells.

        Then for each resulting segment label, convert its background mask into
        polygon outlines by finding the outer contours consistent with the element's
        polygon outline. Annotate the result by adding it as a new TextLine/TextRegion:
        - If ``level-of-operation`` is ``region``, then append the new lines to the
          parent region.
        - If it is ``table``, then append the new lines to their respective regions,
          and append the new regions to the parent table.
          (Also, create an OrderedGroup for it as the parent's RegionRef.)
        - If it is ``page``, then append the new lines to their respective regions,
          and append the new regions to the page.
          (Also, create an OrderedGroup for it in the ReadingOrder.)

        Produce a new output file by serialising the resulting hierarchy.
        """
        LOG = getLogger('processor.OcropySegment')
        # FIXME: allow passing a-priori info on reading order / textline order
        # (and then pass on as ``bt`` and ``rl``; however, there may be a mixture
        #  of different scripts; also, vertical writing needs internal rotation
        #  because our line segmentation only works for horizontal writing)
        overwrite_lines = self.parameter['overwrite_lines']
        overwrite_regions = self.parameter['overwrite_regions']
        overwrite_separators = self.parameter['overwrite_separators']
        overwrite_order = self.parameter['overwrite_order']
        oplevel = self.parameter['level-of-operation']

        # exactly one input and one output fileGrp are supported
        assert_file_grp_cardinality(self.input_file_grp, 1)
        assert_file_grp_cardinality(self.output_file_grp, 1)

        for (n, input_file) in enumerate(self.input_files):
            LOG.info("INPUT FILE %i / %s", n, input_file.pageId
                     or input_file.ID)
            file_id = make_file_id(input_file, self.output_file_grp)

            pcgts = page_from_file(self.workspace.download_file(input_file))
            self.add_metadata(pcgts)
            page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID  # (PageType has no id)
            page = pcgts.get_Page()

            # TODO: also allow grayscale_normalized (try/except?)
            page_image, page_coords, page_image_info = self.workspace.image_from_page(
                page, page_id, feature_selector='binarized')
            # derive a zoom factor so downstream segmentation parameters
            # behave as if the image were at 300 DPI
            if self.parameter['dpi'] > 0:
                zoom = 300.0 / self.parameter['dpi']
            elif page_image_info.resolution != 1:
                dpi = page_image_info.resolution
                if page_image_info.resolutionUnit == 'cm':
                    # convert pixels-per-cm to pixels-per-inch
                    dpi *= 2.54
                LOG.info('Page "%s" uses %f DPI', page_id, dpi)
                zoom = 300.0 / dpi
            else:
                zoom = 1

            # aggregate existing regions so their foreground can be ignored
            ignore = (page.get_ImageRegion() + page.get_LineDrawingRegion() +
                      page.get_GraphicRegion() + page.get_ChartRegion() +
                      page.get_MapRegion() + page.get_MathsRegion() +
                      page.get_ChemRegion() + page.get_MusicRegion() +
                      page.get_AdvertRegion() + page.get_NoiseRegion() +
                      page.get_UnknownRegion() + page.get_CustomRegion())
            if oplevel == 'page' and overwrite_separators:
                page.set_SeparatorRegion([])
            else:
                ignore.extend(page.get_SeparatorRegion())
            # prepare reading order: map region id -> reading-order element
            reading_order = dict()
            ro = page.get_ReadingOrder()
            if ro:
                rogroup = ro.get_OrderedGroup() or ro.get_UnorderedGroup()
                if rogroup:
                    page_get_reading_order(reading_order, rogroup)

            # get segments to process / overwrite
            if oplevel == 'page':
                # tables are segmented on 'table' level, not here
                ignore.extend(page.get_TableRegion())
                regions = list(page.get_TextRegion())
                if regions:
                    # page is already region-segmented
                    if overwrite_regions:
                        LOG.info('removing existing TextRegions in page "%s"',
                                 page_id)
                        # we could remove all other region types as well,
                        # but this is more flexible (for workflows with
                        # specialized separator/image/table detectors):
                        page.set_TextRegion([])
                        page.set_ReadingOrder(None)
                        ro = None
                    else:
                        LOG.warning(
                            'keeping existing TextRegions in page "%s"',
                            page_id)
                        ignore.extend(regions)
                # create reading order if necessary
                if not ro or overwrite_order:
                    ro = ReadingOrderType()
                    page.set_ReadingOrder(ro)
                rogroup = ro.get_OrderedGroup() or ro.get_UnorderedGroup()
                if not rogroup:
                    # new top-level group
                    rogroup = OrderedGroupType(id="reading-order")
                    ro.set_OrderedGroup(rogroup)
                # go get TextRegions with TextLines (and SeparatorRegions):
                self._process_element(page,
                                      ignore,
                                      page_image,
                                      page_coords,
                                      page_id,
                                      file_id,
                                      zoom,
                                      rogroup=rogroup)
            elif oplevel == 'table':
                # top-level text regions are not re-segmented on table level
                ignore.extend(page.get_TextRegion())
                regions = list(page.get_TableRegion())
                if not regions:
                    LOG.warning('Page "%s" contains no table regions', page_id)
                for region in regions:
                    subregions = region.get_TextRegion()
                    if subregions:
                        # table is already cell-segmented
                        if overwrite_regions:
                            LOG.info(
                                'removing existing TextRegions in table "%s"',
                                region.id)
                            region.set_TextRegion([])
                            roelem = reading_order.get(region.id)
                            # replace by empty group with same index and ref
                            # (which can then take the cells as subregions)
                            reading_order[
                                region.id] = page_subgroup_in_reading_order(
                                    roelem)
                        else:
                            LOG.warning(
                                'skipping table "%s" with existing TextRegions',
                                region.id)
                            continue
                    # TODO: also allow grayscale_normalized (try/except?)
                    region_image, region_coords = self.workspace.image_from_segment(
                        region,
                        page_image,
                        page_coords,
                        feature_selector='binarized')
                    # ignore everything but the current table region
                    subignore = regions + ignore
                    subignore.remove(region)
                    # create reading order group if necessary
                    roelem = reading_order.get(region.id)
                    if not roelem:
                        LOG.warning(
                            "Page '%s' table region '%s' is not referenced in reading order (%s)",
                            page_id, region.id, "no target to add cells to")
                    elif overwrite_order:
                        # replace by empty ordered group with same (index and) ref
                        # (which can then take the cells as subregions)
                        roelem = page_subgroup_in_reading_order(roelem)
                        reading_order[region.id] = roelem
                    elif isinstance(
                            roelem,
                        (OrderedGroupType, OrderedGroupIndexedType)):
                        LOG.warning(
                            "Page '%s' table region '%s' already has an ordered group (%s)",
                            page_id, region.id, "cells will be appended")
                    elif isinstance(
                            roelem,
                        (UnorderedGroupType, UnorderedGroupIndexedType)):
                        LOG.warning(
                            "Page '%s' table region '%s' already has an unordered group (%s)",
                            page_id, region.id, "cells will not be appended")
                        roelem = None
                    else:
                        # replace regionRef(Indexed) by group with same index and ref
                        # (which can then take the cells as subregions)
                        roelem = page_subgroup_in_reading_order(roelem)
                        reading_order[region.id] = roelem
                    # go get TextRegions with TextLines (and SeparatorRegions)
                    self._process_element(region,
                                          subignore,
                                          region_image,
                                          region_coords,
                                          region.id,
                                          file_id + '_' + region.id,
                                          zoom,
                                          rogroup=roelem)
            else:  # 'region'
                regions = list(page.get_TextRegion())
                # besides top-level text regions, line-segment any table cells,
                # and for tables without any cells, add a pseudo-cell
                for region in page.get_TableRegion():
                    subregions = region.get_TextRegion()
                    if subregions:
                        regions.extend(subregions)
                    else:
                        subregion = TextRegionType(
                            id=region.id + '_text',
                            Coords=region.get_Coords(),
                            # as if generated from parser:
                            parent_object_=region)
                        region.add_TextRegion(subregion)
                        regions.append(subregion)
                if not regions:
                    LOG.warning('Page "%s" contains no text regions', page_id)
                for region in regions:
                    if region.get_TextLine():
                        if overwrite_lines:
                            LOG.info(
                                'removing existing TextLines in page "%s" region "%s"',
                                page_id, region.id)
                            region.set_TextLine([])
                        else:
                            LOG.warning(
                                'keeping existing TextLines in page "%s" region "%s"',
                                page_id, region.id)
                            ignore.extend(region.get_TextLine())
                    # TODO: also allow grayscale_normalized (try/except?)
                    region_image, region_coords = self.workspace.image_from_segment(
                        region,
                        page_image,
                        page_coords,
                        feature_selector='binarized')
                    # go get TextLines
                    self._process_element(region, ignore, region_image,
                                          region_coords, region.id,
                                          file_id + '_' + region.id, zoom)

            # update METS (add the PAGE file):
            file_path = os.path.join(self.output_file_grp, file_id + '.xml')
            pcgts.set_pcGtsId(file_id)
            out = self.workspace.add_file(ID=file_id,
                                          file_grp=self.output_file_grp,
                                          pageId=input_file.pageId,
                                          local_filename=file_path,
                                          mimetype=MIMETYPE_PAGE,
                                          content=to_xml(pcgts))
            LOG.info('created file ID: %s, file_grp: %s, path: %s', file_id,
                     self.output_file_grp, out.local_filename)
Пример #23
0
    def process(self):
        """Performs binarization of the region / line with Tesseract on the workspace.

        Open and deserialize PAGE input files and their respective images,
        then iterate over the element hierarchy down to the requested level
        (``operation_level``: ``region`` or line).

        Set up Tesseract to recognize the segment image's layout, and get
        the binarized image. Create an image file, and reference it as
        AlternativeImage in the segment element.

        Add the new image file to the workspace along with the output fileGrp,
        and using a file ID with suffix ``.IMG-BIN`` along with further
        identification of the input element.

        Produce a new output file by serialising the resulting hierarchy.
        """
        # exactly one input and one output fileGrp are supported
        assert_file_grp_cardinality(self.input_file_grp, 1)
        assert_file_grp_cardinality(self.output_file_grp, 1)

        oplevel = self.parameter['operation_level']

        # one Tesseract API instance is reused across all pages
        with PyTessBaseAPI(path=TESSDATA_PREFIX) as tessapi:
            for n, input_file in enumerate(self.input_files):
                file_id = make_file_id(input_file, self.output_file_grp)
                page_id = input_file.pageId or input_file.ID
                # NOTE(review): LOG is not defined in this method (unlike the
                # sibling processors) — presumably a module-level logger;
                # confirm it exists at module scope.
                LOG.info("INPUT FILE %i / %s", n, page_id)
                pcgts = page_from_file(
                    self.workspace.download_file(input_file))
                page = pcgts.get_Page()

                # add metadata about this operation and its runtime parameters:
                metadata = pcgts.get_Metadata()  # ensured by from_file()
                metadata.add_MetadataItem(
                    MetadataItemType(
                        type_="processingStep",
                        name=self.ocrd_tool['steps'][0],
                        value=TOOL,
                        Labels=[
                            LabelsType(externalModel="ocrd-tool",
                                       externalId="parameters",
                                       Label=[
                                           LabelType(
                                               type_=name,
                                               value=self.parameter[name])
                                           for name in self.parameter.keys()
                                       ])
                        ]))

                page_image, page_xywh, _ = self.workspace.image_from_page(
                    page, page_id)
                LOG.info("Binarizing on '%s' level in page '%s'", oplevel,
                         page_id)

                regions = page.get_TextRegion() + page.get_TableRegion()
                if not regions:
                    LOG.warning("Page '%s' contains no text regions", page_id)
                for region in regions:
                    region_image, region_xywh = self.workspace.image_from_segment(
                        region, page_image, page_xywh)
                    if oplevel == 'region':
                        # treat the whole region as a single uniform text block
                        tessapi.SetPageSegMode(PSM.SINGLE_BLOCK)
                        self._process_segment(tessapi, RIL.BLOCK, region,
                                              region_image, region_xywh,
                                              "region '%s'" % region.id,
                                              input_file.pageId,
                                              file_id + '_' + region.id)
                    elif isinstance(region, TextRegionType):
                        # line level: only TextRegions have TextLines
                        lines = region.get_TextLine()
                        if not lines:
                            LOG.warning(
                                "Page '%s' region '%s' contains no text lines",
                                page_id, region.id)
                        for line in lines:
                            line_image, line_xywh = self.workspace.image_from_segment(
                                line, region_image, region_xywh)
                            tessapi.SetPageSegMode(PSM.SINGLE_LINE)
                            self._process_segment(
                                tessapi, RIL.TEXTLINE, line, line_image,
                                line_xywh, "line '%s'" % line.id,
                                input_file.pageId,
                                file_id + '_' + region.id + '_' + line.id)

                # NOTE(review): file_id was already computed at the top of the
                # loop — this recomputation is redundant (same inputs).
                file_id = make_file_id(input_file, self.output_file_grp)
                pcgts.set_pcGtsId(file_id)
                self.workspace.add_file(ID=file_id,
                                        file_grp=self.output_file_grp,
                                        pageId=input_file.pageId,
                                        mimetype=MIMETYPE_PAGE,
                                        local_filename=os.path.join(
                                            self.output_file_grp,
                                            file_id + '.xml'),
                                        content=to_xml(pcgts))
Пример #24
0
    def _process_segment(self, page_image, page, page_xywh, page_id,
                         input_file, n, model):
        """Split the page image into a text mask and a non-text (image) mask.

        If ``model`` is given, run the neural network on a downscaled copy of
        ``page_image`` and derive binary masks from its per-pixel class
        prediction (class 1 = text, class 2 = image). Otherwise fall back to
        a morphological mask/seed-fill heuristic.

        Both masks are saved as image files in the output fileGrp and
        referenced as AlternativeImage on ``page`` (the non-text mask with a
        ``non_text`` feature, the text mask with a ``clipped`` feature).
        """
        LOG = getLogger('OcrdAnybaseocrTiseg')

        if model:
            # NOTE(review): Image.ANTIALIAS was removed in Pillow 10;
            # Image.LANCZOS is the identical filter — confirm Pillow version.
            I = ocrolib.pil2array(
                page_image.resize((800, 1024), Image.ANTIALIAS))
            I = np.array(I)[np.newaxis, :, :, :]
            LOG.info('I shape %s', I.shape)
            if len(I.shape) < 3:
                print('Wrong input shape. Image should have 3 channel')

            # get per-pixel class prediction and collapse to a label map
            out = model.predict(I)
            out = out.reshape((2048, 1600, 3)).argmax(axis=2)

            # masks: 1 (white) = background, 0 (black) = predicted class
            text_part = np.ones(out.shape)
            text_part[np.where(out == 1)] = 0

            image_part = np.ones(out.shape)
            image_part[np.where(out == 2)] = 0

            image_part = array(255 * (image_part), 'B')
            image_part = ocrolib.array2pil(image_part)

            text_part = array(255 * (text_part), 'B')
            text_part = ocrolib.array2pil(text_part)

            # scale the masks back up to the original page size
            text_part = text_part.resize(page_image.size, Image.BICUBIC)
            image_part = image_part.resize(page_image.size, Image.BICUBIC)

        else:
            I = ocrolib.pil2array(page_image)

            if len(I.shape) > 2:
                I = np.mean(I, 2)
            # invert and normalize: 1 = foreground (ink), 0 = background
            I = 1 - I / I.max()
            rows, cols = I.shape

            # Generate Mask and Seed Images
            Imask, Iseed = self.pixMorphSequence_mask_seed_fill_holes(I)

            # Iseedfill: Union of Mask and Seed Images
            Iseedfill = self.pixSeedfillBinary(Imask, Iseed)

            # Dilation of Iseedfill
            mask = ones((3, 3))
            Iseedfill = ndimage.binary_dilation(Iseedfill, mask)

            # Expansion of Iseedfill to become equal in size of I
            Iseedfill = self.expansion(Iseedfill, (rows, cols))

            # Write Text and Non-Text images
            image_part = array((1 - I * Iseedfill), dtype=int)
            text_part = array((1 - I * (1 - Iseedfill)), dtype=int)

            # BUGFIX: the original thresholded against the undefined name
            # ``img_part`` (NameError) and then thresholded the already
            # PIL-converted ``text_part`` to build the image mask. Binarize
            # each mask array against its own midrange, then convert to PIL.
            text_bin = array(255 * (text_part > ocrolib.midrange(text_part)),
                             'B')
            image_bin = array(
                255 * (image_part > ocrolib.midrange(image_part)), 'B')
            text_part = ocrolib.array2pil(text_bin)
            image_part = ocrolib.array2pil(image_bin)

        file_id = make_file_id(input_file, self.output_file_grp)
        # save and reference the non-text (image) mask
        file_path = self.workspace.save_image_file(
            image_part,
            file_id + "_img",
            page_id=page_id,
            file_grp=self.output_file_grp,
        )
        page.add_AlternativeImage(
            AlternativeImageType(filename=file_path,
                                 comments=page_xywh['features'] + ',non_text'))

        # save and reference the text mask
        page_xywh['features'] += ',clipped'
        file_path = self.workspace.save_image_file(
            text_part,
            file_id + "_txt",
            page_id=page_id,
            file_grp=self.output_file_grp,
        )
        page.add_AlternativeImage(
            AlternativeImageType(filename=file_path,
                                 comments=page_xywh['features']))