Example No. 1
 def add_metadata(self, pcgts):
     """
     Add PAGE-XML :py:class:`~ocrd_models.ocrd_page.MetadataItemType` ``MetadataItem`` describing
     the processing step and runtime parameters to :py:class:`~ocrd_models.ocrd_page.PcGtsType` ``pcgts``.
     """
     pcgts.get_Metadata().add_MetadataItem(
         MetadataItemType(
             type_="processingStep",
             name=self.ocrd_tool['steps'][0],
             value=self.ocrd_tool['executable'],
             Labels=[
                 LabelsType(externalModel="ocrd-tool",
                            externalId="parameters",
                            Label=[
                                LabelType(type_=name,
                                          value=self.parameter[name])
                                for name in self.parameter.keys()
                            ]),
                 LabelsType(externalModel="ocrd-tool",
                            externalId="version",
                            Label=[
                                LabelType(
                                    type_=self.ocrd_tool['executable'],
                                    value=self.version),
                                LabelType(type_='ocrd/core',
                                          value=OCRD_VERSION)
                            ])
             ]))
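
All snippets on this page are methods of OCR-D processor classes and presuppose the same imports. For orientation, a typical import block looks roughly like the sketch below; the exact import locations can differ between ocrd core releases, and TOOL, FALLBACK_IMAGE_GRP, OCRD_TOOL and LOG are processor-specific module constants assumed here for illustration, not part of any public API.

import os

from ocrd import Processor
from ocrd_modelfactory import page_from_file
from ocrd_models.ocrd_page import (
    MetadataItemType, LabelsType, LabelType,
    to_xml,
)
from ocrd_utils import (
    getLogger, concat_padded, make_file_id, assert_file_grp_cardinality,
    MIMETYPE_PAGE, VERSION as OCRD_VERSION,
)

# processor-specific constants (assumed values, differ per project):
TOOL = 'ocrd-example-processor'
FALLBACK_IMAGE_GRP = 'OCR-D-IMG-EXAMPLE'
LOG = getLogger('processor.Example')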
Example No. 2
    def process(self):
        try:
            self.page_grp, self.image_grp = self.output_file_grp.split(',')
        except ValueError:
            self.page_grp = self.output_file_grp
            self.image_grp = FALLBACK_IMAGE_GRP
            LOG.info(
                "No output file group for images specified, falling back to '%s'",
                FALLBACK_IMAGE_GRP)
        oplevel = self.parameter['operation_level']

        for (n, input_file) in enumerate(self.input_files):
            page_id = input_file.pageId or input_file.ID

            LOG.info("INPUT FILE %i / %s", n, page_id)
            pcgts = page_from_file(self.workspace.download_file(input_file))
            metadata = pcgts.get_Metadata()
            metadata.add_MetadataItem(
                MetadataItemType(
                    type_="processingStep",
                    name=self.ocrd_tool['steps'][0],
                    value=TOOL,
                    Labels=[
                        LabelsType(  #externalRef="parameters",
                            Label=[
                                LabelType(type_=name,
                                          value=self.parameter[name])
                                for name in self.parameter.keys()
                            ])
                    ]))
            page = pcgts.get_Page()
            angle = page.get_orientation()
            if angle:
                LOG.warning('Overwriting existing deskewing angle: %i', angle)
            page_image, page_xywh, page_image_info = self.workspace.image_from_page(
                page, page_id, feature_filter='deskewed')

            if oplevel == "page":
                self._process_segment(page_image, page, page_xywh, page_id,
                                      input_file, n)
            else:
                LOG.warning('Operation level %s, but should be "page".',
                            oplevel)
                break

            # Use input_file's basename for the new file -
            # this way the files retain the same basenames:
            file_id = input_file.ID.replace(self.input_file_grp,
                                            self.output_file_grp)
            if file_id == input_file.ID:
                file_id = concat_padded(self.output_file_grp, n)

            self.workspace.add_file(ID=file_id,
                                    file_grp=self.output_file_grp,
                                    pageId=input_file.pageId,
                                    mimetype=MIMETYPE_PAGE,
                                    local_filename=os.path.join(
                                        self.output_file_grp,
                                        file_id + '.xml'),
                                    content=to_xml(pcgts).encode('utf-8'))
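
The ID.replace(...) / concat_padded(...) dance at the end only derives the output file ID from the input file ID. Newer ocrd_utils releases ship a make_file_id helper that encapsulates exactly this mapping (Example No. 7 below uses it), so, assuming a recent ocrd_utils is installed (from ocrd_utils import make_file_id), the ID derivation collapses to:

            file_id = make_file_id(input_file, self.output_file_grp)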
Example No. 3
    def process(self):
        if not tf.test.is_gpu_available():
            LOG.error("Your system has no CUDA installed. No GPU detected.")
            sys.exit(1)
        model_path = Path(self.parameter['model_path'])
        class_mapper_path = Path(self.parameter['class_mapping_path'])
        if not Path(model_path).is_file():
            LOG.error("""\
                Layout Classification model was not found at '%s'. Make sure the `model_path` parameter
                points to the local model path.
                model can be downloaded from http://url
                """ % model_path)
            sys.exit(1)
        else:

            LOG.info('Loading model from file %s', model_path)
            model = self.create_model(str(model_path))
            # load the mapping
            with open(str(class_mapper_path), "rb") as pickle_in:
                class_indices = pickle.load(pickle_in)
            label_mapping = dict((v, k) for k, v in class_indices.items())

            # print("INPUT FILE HERE",self.input_files)
        for (n, input_file) in enumerate(self.input_files):
            pcgts = page_from_file(self.workspace.download_file(input_file))
            fname = pcgts.get_Page().imageFilename
            page_id = input_file.pageId or input_file.ID
            size = 600, 500

            metadata = pcgts.get_Metadata()
            metadata.add_MetadataItem(
                MetadataItemType(
                    type_="processingStep",
                    name=self.ocrd_tool['steps'][0],
                    value=TOOL,
                    Labels=[
                        LabelsType(  #externalRef="parameter",
                            Label=[
                                LabelType(type_=name,
                                          value=self.parameter[name])
                                for name in self.parameter.keys()
                            ])
                    ]))

            page = pcgts.get_Page()
            LOG.info("INPUT FILE %s", input_file.pageId or input_file.ID)

            page_image, page_xywh, page_image_info = self.workspace.image_from_page(
                page, page_id, feature_selector='binarized')

            img_array = ocrolib.pil2array(
                page_image.resize((500, 600), Image.ANTIALIAS))
            img_array = img_array * 1. / 255.
            img_array = img_array[np.newaxis, :, :, np.newaxis]
            results = self.start_test(model, img_array, fname, label_mapping)
            LOG.info(results)
            self.workspace.mets.set_physical_page_for_file(
                "PHYS_000" + str(n), input_file)
            self.create_logmap_smlink(pcgts)
            self.write_to_mets(results, "PHYS_000" + str(n))
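
One portability note on the resize call above: Image.ANTIALIAS was deprecated in Pillow 9.1 and removed in Pillow 10. With a current Pillow, the equivalent call is:

            img_array = ocrolib.pil2array(
                page_image.resize((500, 600), Image.Resampling.LANCZOS))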
Example No. 4
    def process(self):
        try:
            self.page_grp, self.image_grp = self.output_file_grp.split(',')
        except ValueError:
            self.page_grp = self.output_file_grp
            self.image_grp = FALLBACK_IMAGE_GRP
            LOG.info(
                "No output file group for images specified, falling back to '%s'",
                FALLBACK_IMAGE_GRP)
        oplevel = self.parameter['operation_level']

        for (n, input_file) in enumerate(self.input_files):
            file_id = input_file.ID.replace(self.input_file_grp,
                                            self.image_grp)
            page_id = input_file.pageId or input_file.ID
            LOG.info("INPUT FILE %i / %s", n, page_id)
            pcgts = page_from_file(self.workspace.download_file(input_file))
            metadata = pcgts.get_Metadata()
            metadata.add_MetadataItem(
                MetadataItemType(
                    type_="processingStep",
                    name=self.ocrd_tool['steps'][0],
                    value=TOOL,
                    Labels=[
                        LabelsType(  #externalRef="parameters",
                            Label=[
                                LabelType(type_=name,
                                          value=self.parameter[name])
                                for name in self.parameter.keys()
                            ])
                    ]))
            page = pcgts.get_Page()
            page_image, page_xywh, page_image_info = self.workspace.image_from_page(
                page, page_id)

            if oplevel == "page":
                self._process_segment(page, page_image.filename, page_id,
                                      file_id + ".ds")

            file_id = input_file.ID.replace(self.input_file_grp,
                                            self.output_file_grp)
            if file_id == input_file.ID:
                file_id = concat_padded(self.output_file_grp, n)

            self.workspace.add_file(ID=file_id,
                                    file_grp=self.output_file_grp,
                                    pageId=input_file.pageId,
                                    mimetype=MIMETYPE_PAGE,
                                    local_filename=os.path.join(
                                        self.output_file_grp,
                                        file_id + '.xml'),
                                    content=to_xml(pcgts).encode('utf-8'))
Example No. 5
    def add_metadata(self, pcgts):
        """
        Adds PAGE-XML MetadataItem describing the processing step
        """
        pcgts.get_Metadata().add_MetadataItem(
            MetadataItemType(
                type_="processingStep",
                name=self.ocrd_tool['steps'][0],
                value=self.ocrd_tool['executable'],
                Labels=[
                    LabelsType(externalModel="ocrd-tool",
                               externalId="parameters",
                               Label=[
                                   LabelType(type_=name,
                                             value=self.parameter[name])
                                   for name in self.parameter.keys()
                               ])
                ]))
Example No. 6
 def _add_my_metadata_to_page(self, pcgts):
     metadata = pcgts.get_Metadata()
     metadata.add_MetadataItem(
         MetadataItemType(type_='processingStep',
                          name=OCRD_TOOL['tools']
                          ['ocrd-cor-asv-fst-process']['steps'][0],
                          value='ocrd-cor-asv-fst-process',
                          Labels=[
                              LabelsType(
                                  externalRef='parameters',
                                  Label=[
                                      LabelType(type_=name,
                                                value=self.parameter[name])
                                      for name in self.parameter.keys()
                                  ])
                          ]))
Example No. 7
    def process(self):
        LOG = getLogger('processor.RepairInconsistencies')
        assert_file_grp_cardinality(self.input_file_grp, 1)
        assert_file_grp_cardinality(self.output_file_grp, 1)

        for (n, input_file) in enumerate(self.input_files):
            page_id = input_file.pageId or input_file.ID
            LOG.info("INPUT FILE %i / %s", n, page_id)
            pcgts = page_from_file(self.workspace.download_file(input_file))
            page = pcgts.get_Page()

            # add metadata about this operation and its runtime parameters:
            metadata = pcgts.get_Metadata()  # ensured by from_file()
            metadata.add_MetadataItem(
                MetadataItemType(
                    type_="processingStep",
                    name=self.ocrd_tool['steps'][0],
                    value=TOOL,
                    Labels=[
                        LabelsType(externalModel="ocrd-tool",
                                   externalId="parameters",
                                   Label=[
                                       LabelType(type_=name,
                                                 value=self.parameter[name])
                                       for name in self.parameter.keys()
                                   ])
                    ]))

            regions = []
            regions.extend(page.get_TextRegion())
            for special_region in page.get_TableRegion(
            ) + page.get_GraphicRegion():
                regions.extend(special_region.get_TextRegion())

            for region in regions:
                textLineOrder = 'top-to-bottom'
                for segment in [region, page]:
                    if segment.textLineOrder is None:
                        continue
                    else:
                        textLineOrder = segment.textLineOrder
                        break
                if textLineOrder not in ['top-to-bottom', 'bottom-to-top']:
                    LOG.info(
                        'Not processing page "%s" region "%s" (textLineOrder=%s)',
                        page_id, region.id, textLineOrder)
                    continue

                _fix_segment(region,
                             page_id,
                             reverse=(textLineOrder == 'bottom-to-top'))

                lines = region.get_TextLine()
                for line in lines:
                    readingDirection = 'left-to-right'
                    for segment in [line, region, page]:
                        if segment.readingDirection is None:
                            continue
                        else:
                            readingDirection = segment.readingDirection
                            break
                    if readingDirection not in [
                            'left-to-right', 'right-to-left'
                    ]:
                        LOG.info(
                            'Not processing page "%s" line "%s" (readingDirection=%s)',
                            page_id, line.id, readingDirection)
                        continue

                    _fix_segment(line,
                                 page_id,
                                 reverse=(readingDirection == 'right-to-left'))

                    words = line.get_Word()
                    for word in words:
                        readingDirection = 'left-to-right'
                        for segment in [word, line, region, page]:
                            if segment.readingDirection is None:
                                continue
                            else:
                                readingDirection = segment.readingDirection
                                break
                        if readingDirection not in [
                                'left-to-right', 'right-to-left'
                        ]:
                            LOG.info(
                                'Not processing page "%s" word "%s" (readingDirection=%s)',
                                page_id, word.id, readingDirection)
                            continue

                        _fix_segment(
                            word,
                            page_id,
                            reverse=(readingDirection == 'right-to-left'))

            file_id = make_file_id(input_file, self.output_file_grp)
            self.workspace.add_file(ID=file_id,
                                    file_grp=self.output_file_grp,
                                    pageId=input_file.pageId,
                                    mimetype=MIMETYPE_PAGE,
                                    local_filename=os.path.join(
                                        self.output_file_grp,
                                        file_id + '.xml'),
                                    content=to_xml(pcgts))
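
The three nested fallback loops in this example all perform the same lookup: take the first explicitly annotated textLineOrder or readingDirection while walking up the segment hierarchy, otherwise fall back to a default. A hypothetical helper (not part of the original processor) makes that pattern explicit:

    def _inherited(attribute, segments, default):
        # return the first non-None value of `attribute` along `segments`, else `default`
        for segment in segments:
            value = getattr(segment, attribute)
            if value is not None:
                return value
        return default

    # e.g. for a text line:
    # readingDirection = _inherited('readingDirection', [line, region, page], 'left-to-right')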
Example No. 8
    def process(self):
        try:
            page_grp, self.image_grp = self.output_file_grp.split(',')
        except ValueError:
            page_grp = self.output_file_grp
            self.image_grp = FALLBACK_IMAGE_GRP
            LOG.info(
                "No output file group for images specified, falling back to '%s'",
                FALLBACK_IMAGE_GRP)
        oplevel = self.parameter['operation_level']

        model = None
        if self.parameter['use_deeplr']:

            model_weights = self.parameter['seg_weights']
            if not Path(model_weights).is_file():
                LOG.error("""\
                    Segmentation model weights file was not found at '%s'. Make sure the `seg_weights` parameter
                    points to the local model weights path.
                    """ % model_weights)
                sys.exit(1)

            model = resnet50_unet(n_classes=self.parameter['classes'],
                                  input_height=self.parameter['height'],
                                  input_width=self.parameter['width'])
            model.load_weights(model_weights)
            LOG.info('Segmentation Model loaded')

        for (n, input_file) in enumerate(self.input_files):
            page_id = input_file.pageId or input_file.ID

            pcgts = page_from_file(self.workspace.download_file(input_file))
            metadata = pcgts.get_Metadata()
            metadata.add_MetadataItem(
                MetadataItemType(
                    type_="processingStep",
                    name=self.ocrd_tool['steps'][0],
                    value=TOOL,
                    Labels=[
                        LabelsType(  #externalRef="parameters",
                            Label=[
                                LabelType(type_=name,
                                          value=self.parameter[name])
                                for name in self.parameter.keys()
                            ])
                    ]))

            page = pcgts.get_Page()
            LOG.info("INPUT FILE %s", input_file.pageId or input_file.ID)

            if self.parameter['use_deeplr']:
                page_image, page_xywh, page_image_info = self.workspace.image_from_page(
                    page, page_id, feature_filter='binarized,deskewed,cropped')
            else:
                page_image, page_xywh, page_image_info = self.workspace.image_from_page(
                    page,
                    page_id,
                    feature_selector='binarized,deskewed,cropped')

            if oplevel == 'page':
                self._process_segment(page_image, page, page_xywh, page_id,
                                      input_file, n, model)
            else:
                LOG.warning('Operation level %s, but should be "page".',
                            oplevel)
                break

            # Use input_file's basename for the new file -
            # this way the files retain the same basenames:
            file_id = input_file.ID.replace(self.input_file_grp,
                                            self.output_file_grp)
            if file_id == input_file.ID:
                file_id = concat_padded(self.output_file_grp, n)
            self.workspace.add_file(
                ID=file_id,
                file_grp=page_grp,  #self.output_file_grp,
                pageId=input_file.pageId,
                mimetype=MIMETYPE_PAGE,
                local_filename=os.path.join(self.output_file_grp,
                                            file_id + '.xml'),
                content=to_xml(pcgts).encode('utf-8'),
                force=self.parameter['force'])
Example No. 9
    def process(self):
        """Performs border detection on the workspace. """
        try:
            LOG.info("OUTPUT FILE %s", self.output_file_grp)
            page_grp, self.image_grp = self.output_file_grp.split(',')
        except ValueError:
            page_grp = self.output_file_grp
            self.image_grp = FALLBACK_IMAGE_GRP
            LOG.info(
                "No output file group for images specified, falling back to '%s'",
                FALLBACK_IMAGE_GRP)
        oplevel = self.parameter['operation_level']
        for (n, input_file) in enumerate(self.input_files):
            page_id = input_file.pageId or input_file.ID

            LOG.info("INPUT FILE %i / %s", n, page_id)

            pcgts = page_from_file(self.workspace.download_file(input_file))
            page = pcgts.get_Page()
            # Check for existing Border --> already cropped
            border = page.get_Border()
            if border:
                left, top, right, bottom = bbox_from_points(
                    border.get_Coords().points)
                LOG.warning('Overwriting existing Border: %i:%i,%i:%i', left,
                            top, right, bottom)

            metadata = pcgts.get_Metadata()
            metadata.add_MetadataItem(
                MetadataItemType(
                    type_="processingStep",
                    name=self.ocrd_tool['steps'][0],
                    value=TOOL,
                    Labels=[
                        LabelsType(  # externalRef="parameters",
                            Label=[
                                LabelType(type_=name,
                                          value=self.parameter[name])
                                for name in self.parameter.keys()
                            ])
                    ]))
            page = pcgts.get_Page()
            page_image, page_xywh, page_image_info = self.workspace.image_from_page(
                page,
                page_id,
                feature_filter='cropped',
                feature_selector='binarized')  # should also be deskewed

            #page_image, page_xywh, page_image_info = self.workspace.image_from_page(
            #    page, page_id, feature_filter='cropped')

            if oplevel == "page":
                self._process_segment(page_image, page, page_xywh, page_id,
                                      input_file, n)
            else:
                LOG.warning('Operation level %s, but should be "page".',
                            oplevel)
                break
            file_id = input_file.ID.replace(self.input_file_grp, page_grp)

            # Use input_file's basename for the new file -
            # this way the files retain the same basenames:
            if file_id == input_file.ID:
                file_id = concat_padded(page_grp, n)
            self.workspace.add_file(ID=file_id,
                                    file_grp=page_grp,
                                    pageId=input_file.pageId,
                                    mimetype=MIMETYPE_PAGE,
                                    local_filename=os.path.join(
                                        page_grp, file_id + '.xml'),
                                    content=to_xml(pcgts).encode('utf-8'),
                                    force=self.parameter['force'])
Example No. 10
    def process(self):
        try:
            self.page_grp, self.image_grp = self.output_file_grp.split(',')
        except ValueError:
            self.page_grp = self.output_file_grp
            self.image_grp = FALLBACK_IMAGE_GRP
            LOG.info(
                "No output file group for images specified, falling back to '%s'",
                FALLBACK_IMAGE_GRP)
        oplevel = self.parameter['operation_level']

        for (n, input_file) in enumerate(self.input_files):
            file_id = input_file.ID.replace(self.input_file_grp,
                                            self.image_grp)
            page_id = input_file.pageId or input_file.ID
            LOG.info("INPUT FILE %i / %s", n, page_id)
            pcgts = page_from_file(self.workspace.download_file(input_file))
            metadata = pcgts.get_Metadata()
            metadata.add_MetadataItem(
                MetadataItemType(
                    type_="processingStep",
                    name=self.ocrd_tool['steps'][0],
                    value=TOOL,
                    Labels=[
                        LabelsType(  #externalRef="parameters",
                            Label=[
                                LabelType(type_=name,
                                          value=self.parameter[name])
                                for name in self.parameter.keys()
                            ])
                    ]))

            page = pcgts.get_Page()
            page_image, page_xywh, page_image_info = self.workspace.image_from_page(
                page, page_id)
            LOG.info("Binarizing on '%s' level in page '%s'", oplevel, page_id)

            if oplevel == "page":
                self._process_segment(page, page_image.filename, page_id,
                                      file_id + ".bin")
            else:
                regions = page.get_TextRegion() + page.get_TableRegion()
                if not regions:
                    LOG.warning("Page '%s' contains no text regions", page_id)
                for region in regions:
                    region_image, region_xywh = self.workspace.image_from_segment(
                        region, page_image, page_xywh)
                    # strange TODO at the moment
                    #self._process_segment(region.filename, region.id)

            # To retain the basenames of files and their respective dir:
            file_id = input_file.ID.replace(self.input_file_grp,
                                            self.output_file_grp)
            if file_id == input_file.ID:
                file_id = concat_padded(self.output_file_grp, n)
            self.workspace.add_file(ID=file_id,
                                    file_grp=self.output_file_grp,
                                    pageId=input_file.pageId,
                                    mimetype=MIMETYPE_PAGE,
                                    local_filename=os.path.join(
                                        self.output_file_grp,
                                        file_id + '.xml'),
                                    content=to_xml(pcgts).encode('utf-8'))
Example No. 11
    def process(self):
        """Performs page cropping with Tesseract on the workspace.
        
        Open and deserialize PAGE input files and their respective images.
        Set up Tesseract to detect text blocks on each page, and find
        the largest coordinate extent spanning all of them. Use this
        extent in defining a Border, and add that to the page.
        
        Moreover, crop the original image accordingly, and reference the
        resulting image file as AlternativeImage in the Page element.
        
        Add the new image file to the workspace along with the output fileGrp,
        and using a file ID with suffix ``.IMG-CROP`` along with further
        identification of the input element.
        
        Produce new output files by serialising the resulting hierarchy.
        """
        assert_file_grp_cardinality(self.input_file_grp, 1)
        assert_file_grp_cardinality(self.output_file_grp, 1)

        padding = self.parameter['padding']
        with tesserocr.PyTessBaseAPI(path=TESSDATA_PREFIX) as tessapi:
            # disable table detection here (tables count as text blocks),
            # because we do not want to risk confusing the spine with
            # a column separator and thus creeping into a neighbouring
            # page:
            tessapi.SetVariable("textord_tabfind_find_tables", "0")
            for (n, input_file) in enumerate(self.input_files):
                page_id = input_file.pageId or input_file.ID
                LOG.info("INPUT FILE %i / %s", n, page_id)
                pcgts = page_from_file(
                    self.workspace.download_file(input_file))
                page = pcgts.get_Page()

                # add metadata about this operation and its runtime parameters:
                metadata = pcgts.get_Metadata()  # ensured by from_file()
                metadata.add_MetadataItem(
                    MetadataItemType(
                        type_="processingStep",
                        name=self.ocrd_tool['steps'][0],
                        value=TOOL,
                        Labels=[
                            LabelsType(externalModel="ocrd-tool",
                                       externalId="parameters",
                                       Label=[
                                           LabelType(
                                               type_=name,
                                               value=self.parameter[name])
                                           for name in self.parameter.keys()
                                       ])
                        ]))

                # warn of existing Border:
                border = page.get_Border()
                if border:
                    left, top, right, bottom = bbox_from_points(
                        border.get_Coords().points)
                    LOG.warning('Overwriting existing Border: %i:%i,%i:%i',
                                left, top, right, bottom)

                page_image, page_xywh, page_image_info = self.workspace.image_from_page(
                    page,
                    page_id,
                    # image must not have been cropped already,
                    # abort if no such image can be produced:
                    feature_filter='cropped')
                if self.parameter['dpi'] > 0:
                    dpi = self.parameter['dpi']
                    LOG.info(
                        "Page '%s' images will use %d DPI from parameter override",
                        page_id, dpi)
                elif page_image_info.resolution != 1:
                    dpi = page_image_info.resolution
                    if page_image_info.resolutionUnit == 'cm':
                        dpi = round(dpi * 2.54)
                    LOG.info(
                        "Page '%s' images will use %d DPI from image meta-data",
                        page_id, dpi)
                else:
                    dpi = 0
                    LOG.info(
                        "Page '%s' images will use DPI estimated from segmentation",
                        page_id)
                if dpi:
                    tessapi.SetVariable('user_defined_dpi', str(dpi))
                    zoom = 300 / dpi
                else:
                    zoom = 1

                # warn of existing segmentation:
                regions = page.get_TextRegion()
                if regions:
                    min_x = page_image.width
                    min_y = page_image.height
                    max_x = 0
                    max_y = 0
                    for region in regions:
                        left, top, right, bottom = bbox_from_points(
                            region.get_Coords().points)
                        min_x = min(min_x, left)
                        min_y = min(min_y, top)
                        max_x = max(max_x, right)
                        max_y = max(max_y, bottom)
                    LOG.warning(
                        'Ignoring extent from existing TextRegions: %i:%i,%i:%i',
                        min_x, max_x, min_y, max_y)

                LOG.debug("Cropping with Tesseract")
                tessapi.SetImage(page_image)
                # PSM.SPARSE_TEXT: get as much text as possible in no particular order
                # PSM.AUTO (default): includes tables (dangerous)
                tessapi.SetPageSegMode(tesserocr.PSM.SPARSE_TEXT)
                #
                # helper variables for saving the box coordinates
                #
                min_x = page_image.width
                min_y = page_image.height
                max_x = 0
                max_y = 0
                # iterate over all text blocks and compare their
                # bbox extent to the running min and max values
                for component in tessapi.GetComponentImages(
                        tesserocr.RIL.BLOCK, True):
                    image, xywh, index, _ = component
                    #
                    # the region reference in the reading order element
                    #
                    ID = "region%04d" % index
                    left, top, right, bottom = bbox_from_xywh(xywh)
                    LOG.debug("Detected text region '%s': %i:%i,%i:%i", ID,
                              left, right, top, bottom)
                    # filter region results:
                    bin_bbox = image.getbbox()
                    if not bin_bbox:
                        # this does happen!
                        LOG.info(
                            "Ignoring region '%s' because its binarization is empty",
                            ID)
                        continue
                    width = bin_bbox[2] - bin_bbox[0]
                    if width < 25 / zoom:
                        # we must be conservative here: page numbers are tiny regions, too!
                        LOG.info(
                            "Ignoring region '%s' because its width is too small (%d)",
                            ID, width)
                        continue
                    height = bin_bbox[3] - bin_bbox[1]
                    if height < 25 / zoom:
                        # we must be conservative here: page numbers are tiny regions, too!
                        LOG.debug(
                            "Ignoring region '%s' because its height is too small (%d)",
                            ID, height)
                        continue
                    min_x = min(min_x, left)
                    min_y = min(min_y, top)
                    max_x = max(max_x, right)
                    max_y = max(max_y, bottom)
                    LOG.info("Updated page border: %i:%i,%i:%i", min_x, max_x,
                             min_y, max_y)

                #
                # set the identified page border
                #
                if min_x < max_x and min_y < max_y:
                    # add padding:
                    min_x = max(min_x - padding, 0)
                    max_x = min(max_x + padding, page_image.width)
                    min_y = max(min_y - padding, 0)
                    max_y = min(max_y + padding, page_image.height)
                    LOG.info("Padded page border: %i:%i,%i:%i", min_x, max_x,
                             min_y, max_y)
                    polygon = polygon_from_bbox(min_x, min_y, max_x, max_y)
                    polygon = coordinates_for_segment(polygon, page_image,
                                                      page_xywh)
                    polygon = polygon_for_parent(polygon, page)
                    border = BorderType(
                        Coords=CoordsType(points_from_polygon(polygon)))
                    # intersection with parent could have changed bbox,
                    # so recalculate:
                    bbox = bbox_from_polygon(
                        coordinates_of_segment(border, page_image, page_xywh))
                    # update PAGE (annotate border):
                    page.set_Border(border)
                    # update METS (add the image file):
                    page_image = crop_image(page_image, box=bbox)
                    page_xywh['features'] += ',cropped'
                    file_id = make_file_id(input_file, self.output_file_grp)
                    file_path = self.workspace.save_image_file(
                        page_image,
                        file_id + '.IMG-CROP',
                        page_id=input_file.pageId,
                        file_grp=self.output_file_grp)
                    # update PAGE (reference the image file):
                    page.add_AlternativeImage(
                        AlternativeImageType(filename=file_path,
                                             comments=page_xywh['features']))
                else:
                    LOG.error("Cannot find valid extent for page '%s'",
                              page_id)

                pcgts.set_pcGtsId(file_id)
                self.workspace.add_file(ID=file_id,
                                        file_grp=self.output_file_grp,
                                        pageId=input_file.pageId,
                                        mimetype=MIMETYPE_PAGE,
                                        local_filename=os.path.join(
                                            self.output_file_grp,
                                            file_id + '.xml'),
                                        content=to_xml(pcgts))
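
The DPI fallback chain in this example (explicit parameter override, then image metadata with cm-to-inch conversion, otherwise let Tesseract estimate) recurs verbatim in Examples No. 12 and No. 14. Factored out, it would look roughly like this hypothetical helper (not present in the original processors):

    def resolve_dpi(parameter_dpi, page_image_info):
        # return the DPI to hand to Tesseract, or 0 to let it estimate
        if parameter_dpi > 0:
            return parameter_dpi              # explicit parameter override
        if page_image_info.resolution != 1:
            dpi = page_image_info.resolution
            if page_image_info.resolutionUnit == 'cm':
                dpi = round(dpi * 2.54)       # convert pixels per cm to pixels per inch
            return dpi
        return 0                              # unknown: estimate from segmentation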
Example No. 12
    def process(self):
        """Performs table cell segmentation with Tesseract on the workspace.
        
        Open and deserialize PAGE input files and their respective images,
        then iterate over the element hierarchy down to the block level
        for table regions. If ``overwrite_regions`` is enabled and any
        layout annotation already exists inside, then remove it.
        
        Set up Tesseract to detect text blocks (as table cells).
        (This is not Tesseract's internal table structure recognition,
        but the general page segmentation.)
        Add each to the block at the detected coordinates.
        
        Produce a new output file by serialising the resulting hierarchy.
        """
        assert_file_grp_cardinality(self.input_file_grp, 1)
        assert_file_grp_cardinality(self.output_file_grp, 1)

        overwrite_regions = self.parameter['overwrite_regions']

        with PyTessBaseAPI(path=TESSDATA_PREFIX) as tessapi:
            # disable table detection here, so we won't get
            # tables inside tables, but try to analyse them as
            # independent text/line blocks:
            tessapi.SetVariable("textord_tabfind_find_tables", "0")
            for (n, input_file) in enumerate(self.input_files):
                page_id = input_file.pageId or input_file.ID
                LOG.info("INPUT FILE %i / %s", n, page_id)
                pcgts = page_from_file(
                    self.workspace.download_file(input_file))
                page = pcgts.get_Page()

                # add metadata about this operation and its runtime parameters:
                metadata = pcgts.get_Metadata()  # ensured by from_file()
                metadata.add_MetadataItem(
                    MetadataItemType(
                        type_="processingStep",
                        name=self.ocrd_tool['steps'][0],
                        value=TOOL,
                        Labels=[
                            LabelsType(externalModel="ocrd-tool",
                                       externalId="parameters",
                                       Label=[
                                           LabelType(
                                               type_=name,
                                               value=self.parameter[name])
                                           for name in self.parameter.keys()
                                       ])
                        ]))

                page_image, page_coords, page_image_info = self.workspace.image_from_page(
                    page, page_id)
                if self.parameter['dpi'] > 0:
                    dpi = self.parameter['dpi']
                    LOG.info(
                        "Page '%s' images will use %d DPI from parameter override",
                        page_id, dpi)
                elif page_image_info.resolution != 1:
                    dpi = page_image_info.resolution
                    if page_image_info.resolutionUnit == 'cm':
                        dpi = round(dpi * 2.54)
                    LOG.info(
                        "Page '%s' images will use %d DPI from image meta-data",
                        page_id, dpi)
                else:
                    dpi = 0
                    LOG.info(
                        "Page '%s' images will use DPI estimated from segmentation",
                        page_id)
                if dpi:
                    tessapi.SetVariable('user_defined_dpi', str(dpi))

                #
                # prepare dict of reading order
                reading_order = dict()
                ro = page.get_ReadingOrder()
                if not ro:
                    LOG.warning("Page '%s' contains no ReadingOrder", page_id)
                    rogroup = None
                else:
                    rogroup = ro.get_OrderedGroup() or ro.get_UnorderedGroup()
                    page_get_reading_order(reading_order, rogroup)
                #
                # dive into regions
                regions = page.get_TableRegion()
                for region in regions:
                    # delete or warn of existing regions:
                    if region.get_TextRegion():
                        if overwrite_regions:
                            LOG.info(
                                'removing existing TextRegions in block "%s" of page "%s"',
                                region.id, page_id)
                            for subregion in region.get_TextRegion():
                                if subregion.id in reading_order:
                                    regionref = reading_order[subregion.id]
                                    # could be any of the 6 types above:
                                    regionrefs = rogroup.__getattribute__(
                                        regionref.__class__.__name__.replace(
                                            'Type', ''))
                                    # remove in-place
                                    regionrefs.remove(regionref)
                                    # TODO: adjust index to make contiguous again?
                            region.set_TextRegion([])
                        else:
                            LOG.warning(
                                'keeping existing TextRegions in block "%s" of page "%s"',
                                region.id, page_id)
                    # get region image
                    region_image, region_coords = self.workspace.image_from_segment(
                        region, page_image, page_coords)
                    tessapi.SetImage(region_image)
                    LOG.info("Detecting table cells in region '%s'", region.id)
                    #
                    # detect the region segments:
                    tessapi.SetPageSegMode(PSM.SPARSE_TEXT)  # retrieve "cells"
                    # TODO: we should XY-cut the sparse cells and regroup them into consistent cells
                    layout = tessapi.AnalyseLayout()
                    roelem = reading_order.get(region.id)
                    if not roelem:
                        LOG.warning(
                            "Page '%s' table region '%s' is not referenced in reading order (%s)",
                            page_id, region.id, "no target to add cells into")
                    elif isinstance(
                            roelem,
                        (OrderedGroupType, OrderedGroupIndexedType)):
                        LOG.warning(
                            "Page '%s' table region '%s' already has an ordered group (%s)",
                            page_id, region.id, "cells will be appended")
                    elif isinstance(
                            roelem,
                        (UnorderedGroupType, UnorderedGroupIndexedType)):
                        LOG.warning(
                            "Page '%s' table region '%s' already has an unordered group (%s)",
                            page_id, region.id, "cells will not be appended")
                        roelem = None
                    elif isinstance(roelem, RegionRefIndexedType):
                        # replace regionref by group with same index and ref
                        # (which can then take the cells as subregions)
                        roelem2 = OrderedGroupIndexedType(
                            id=region.id + '_order',
                            index=roelem.index,
                            regionRef=roelem.regionRef)
                        roelem.parent_object_.add_OrderedGroupIndexed(roelem2)
                        roelem.parent_object_.get_RegionRefIndexed().remove(
                            roelem)
                        roelem = roelem2
                    elif isinstance(roelem, RegionRefType):
                        # replace regionref by group with same ref
                        # (which can then take the cells as subregions)
                        roelem2 = OrderedGroupType(id=region.id + '_order',
                                                   regionRef=roelem.regionRef)
                        roelem.parent_object_.add_OrderedGroup(roelem2)
                        roelem.parent_object_.get_RegionRef().remove(roelem)
                        roelem = roelem2
                    self._process_region(layout, region, roelem, region_image,
                                         region_coords)

                file_id = make_file_id(input_file, self.output_file_grp)
                pcgts.set_pcGtsId(file_id)
                self.workspace.add_file(force=True,
                                        ID=file_id,
                                        file_grp=self.output_file_grp,
                                        pageId=input_file.pageId,
                                        mimetype=MIMETYPE_PAGE,
                                        local_filename=os.path.join(
                                            self.output_file_grp,
                                            file_id + '.xml'),
                                        content=to_xml(pcgts))
Example No. 13
    def process(self):
        """Rates textual annotation of PAGE input files, producing output files with LM scores (and choices).
        
        ... explain incremental page-wise processing here ...
        """
        level = self.parameter['textequiv_level']
        beam_width = self.parameter['beam_width']
        lm_weight = self.parameter['lm_weight']

        prev_traceback = None
        prev_pcgts = None
        prev_file_id = None
        prev_page_id = None
        for (n, input_file) in enumerate(self.input_files):
            page_id = input_file.pageId or input_file.ID
            LOG.info("INPUT FILE %i / %s", n, page_id)
            pcgts = page_from_file(self.workspace.download_file(input_file))
            LOG.info("Scoring text in page '%s' at the %s level",
                     pcgts.get_pcGtsId(), level)

            # annotate processing metadata:
            metadata = pcgts.get_Metadata()  # ensured by page_from_file()
            metadata.add_MetadataItem(
                MetadataItemType(
                    type_="processingStep",
                    name=OCRD_TOOL['tools']['ocrd-keraslm-rate']['steps'][0],
                    value='ocrd-keraslm-rate',
                    Labels=[
                        LabelsType(externalRef="parameters",
                                   Label=[
                                       LabelType(type_=name,
                                                 value=self.parameter[name])
                                       for name in self.parameter.keys()
                                   ])
                    ]))

            # context preprocessing:
            # todo: as soon as we have true MODS meta-data in METS (dmdSec/mdWrap/xmlData/mods),
            #       get global context variables from there (e.g. originInfo/dateIssued/@text for year)
            ident = self.workspace.mets.unique_identifier  # at least try to get purl
            context = [0]
            if ident:
                name = ident.split('/')[-1]
                year = name.split('_')[-1]
                if year.isnumeric():
                    year = ceil(int(year) / 10)
                    context = [year]
                    # todo: author etc

            # create a graph for the linear sequence of elements at the given level:
            graph, start_node, end_node = page_get_linear_graph_at(
                level, pcgts)

            # apply language model to (TextEquiv path in) graph,
            # remove non-path TextEquivs, modify confidences:
            if not self.parameter['alternative_decoding']:
                text = [(edge['element'], edge['alternatives'])
                        for edge in _get_edges(graph, 0)]  # graph's path
                textstring = u''.join(
                    textequivs[0].Unicode
                    for element, textequivs in text)  # same length as text
                LOG.info("Rating %d elements with a total of %d characters",
                         len(text), len(textstring))
                confidences = self.rater.rate(textstring,
                                              context)  # much faster
                i = 0
                for element, textequivs in text:
                    textequiv = textequivs[0]  # 1st choice only
                    if element:
                        element.set_TextEquiv([textequiv])  # delete others
                    textequiv_len = len(textequiv.Unicode)
                    conf = sum(confidences[i:i + textequiv_len]
                               ) / textequiv_len  # mean probability
                    conf2 = textequiv.conf
                    textequiv.set_conf(conf * lm_weight + conf2 *
                                       (1. - lm_weight))
                    i += textequiv_len
                if i != len(confidences):
                    LOG.critical(
                        "Input text length and output scores length are off by %d characters",
                        i - len(confidences))
                avg = sum(confidences) / len(confidences)
                ent = sum([-log(max(p, 1e-99), 2)
                           for p in confidences]) / len(confidences)
                ppl = pow(2.0, ent)  # character level
                ppll = pow(
                    2.0,
                    ent * len(confidences) /
                    len(text))  # textequiv level (including spaces/newlines)
                LOG.info("avg: %.3f, char ppl: %.3f, %s ppl: %.3f", avg, ppl,
                         level, ppll)  # character need not always equal glyph!

                # ensure parent textequivs are up to date:
                page_update_higher_textequiv_levels(level, pcgts)

                # write back result
                file_id = input_file.ID.replace(self.input_file_grp,
                                                self.output_file_grp)
                if file_id == input_file.ID:
                    file_id = concat_padded(self.output_file_grp, n)
                self.workspace.add_file(
                    ID=file_id,
                    pageId=input_file.pageId,
                    file_grp=self.output_file_grp,
                    local_filename=os.path.join(self.output_file_grp,
                                                file_id + '.xml'),
                    mimetype=MIMETYPE_PAGE,
                    content=to_xml(pcgts),
                )
            else:
                LOG.info("Rating %d elements including its alternatives",
                         end_node - start_node)
                path, entropy, traceback = self.rater.rate_best(
                    graph,
                    start_node,
                    end_node,
                    start_traceback=prev_traceback,
                    context=context,
                    lm_weight=lm_weight,
                    beam_width=beam_width,
                    beam_clustering_dist=BEAM_CLUSTERING_DIST
                    if BEAM_CLUSTERING_ENABLE else 0)

                if prev_pcgts:
                    _page_update_from_path(level, path, entropy)

                    # ensure parent textequivs are up to date:
                    page_update_higher_textequiv_levels(level, prev_pcgts)

                    # write back result
                    file_id = prev_file_id.replace(self.input_file_grp,
                                                   self.output_file_grp)
                    if file_id == prev_file_id:
                        file_id = concat_padded(self.output_file_grp, n - 1)
                    self.workspace.add_file(
                        ID=file_id,
                        pageId=prev_page_id,
                        file_grp=self.output_file_grp,
                        local_filename=os.path.join(self.output_file_grp,
                                                    file_id + '.xml'),
                        mimetype=MIMETYPE_PAGE,
                        content=to_xml(prev_pcgts),
                    )

                prev_page_id = input_file.pageId
                prev_file_id = input_file.ID
                prev_pcgts = pcgts
                prev_traceback = traceback

        if prev_pcgts:
            path, entropy, _ = self.rater.next_path(prev_traceback[0],
                                                    ([], prev_traceback[1]))
            _page_update_from_path(level, path, entropy)

            # ensure parent textequivs are up to date:
            page_update_higher_textequiv_levels(level, prev_pcgts)

            # write back result
            file_id = input_file.ID.replace(self.input_file_grp,
                                            self.output_file_grp)
            if file_id == input_file.ID:
                file_id = concat_padded(self.output_file_grp, n)
            self.workspace.add_file(
                ID=file_id,
                pageId=input_file.pageId,
                file_grp=self.output_file_grp,
                local_filename=os.path.join(self.output_file_grp,
                                            file_id + '.xml'),
                mimetype=MIMETYPE_PAGE,
                content=to_xml(prev_pcgts),
            )
Example No. 14
    def process(self):
        """Perform OCR recognition with Tesseract on the workspace.
        
        Open and deserialise PAGE input files and their respective images,
        then iterate over the element hierarchy down to the requested
        ``textequiv_level`` if it exists and ``overwrite_words`` is disabled,
        or to the line level otherwise. In the latter case,
        (remove any existing segmentation below the line level, and)
        create new segmentation below the line level if necessary.
        
        Set up Tesseract to recognise each segment's image (either from
        AlternativeImage or cropping the bounding box rectangle and masking
        it from the polygon outline) with the appropriate mode and ``model``.
        
        Put text and confidence results into the TextEquiv at ``textequiv_level``,
        removing any existing TextEquiv.
        
        Finally, make the higher levels consistent with these results by concatenation,
        ordered as appropriate for its readingDirection, textLineOrder, and ReadingOrder,
        and joined by whitespace, as appropriate for the respective level and Relation/join
        status.
        
        Produce new output files by serialising the resulting hierarchy.
        """
        LOG.debug("TESSDATA: %s, installed Tesseract models: %s",
                  *get_languages())

        assert_file_grp_cardinality(self.input_file_grp, 1)
        assert_file_grp_cardinality(self.output_file_grp, 1)

        maxlevel = self.parameter['textequiv_level']
        model = get_languages()[1][-1]  # last installed model
        if 'model' in self.parameter:
            model = self.parameter['model']
            for sub_model in model.split('+'):
                if sub_model not in get_languages()[1]:
                    raise Exception("configured model " + sub_model +
                                    " is not installed")

        with PyTessBaseAPI(path=TESSDATA_PREFIX, lang=model) as tessapi:
            LOG.info("Using model '%s' in %s for recognition at the %s level",
                     model,
                     get_languages()[0], maxlevel)
            if maxlevel == 'glyph':
                # populate GetChoiceIterator() with LSTM models, too:
                tessapi.SetVariable("lstm_choice_mode",
                                    "2")  # aggregate symbols
                tessapi.SetVariable("lstm_choice_iterations",
                                    "15")  # squeeze out more best paths
            # TODO: maybe warn/raise when illegal combinations or characters not in the model unicharset?
            if self.parameter['char_whitelist']:
                tessapi.SetVariable("tessedit_char_whitelist",
                                    self.parameter['char_whitelist'])
            if self.parameter['char_blacklist']:
                tessapi.SetVariable("tessedit_char_blacklist",
                                    self.parameter['char_blacklist'])
            if self.parameter['char_unblacklist']:
                tessapi.SetVariable("tessedit_char_unblacklist",
                                    self.parameter['char_unblacklist'])
            # todo: determine relevancy of these variables:
            # tessapi.SetVariable("tessedit_single_match", "0")
            #
            # tessedit_load_sublangs
            # tessedit_preserve_min_wd_len 2
            # tessedit_prefer_joined_punct 0
            # tessedit_write_rep_codes 0
            # tessedit_parallelize 0
            # tessedit_zero_rejection 0
            # tessedit_zero_kelvin_rejection 0
            # tessedit_reject_mode 0
            # tessedit_use_reject_spaces 1
            # tessedit_fix_fuzzy_spaces 1
            # tessedit_char_blacklist
            # tessedit_char_whitelist
            # chs_leading_punct ('`"
            # chs_trailing_punct1 ).,;:?!
            # chs_trailing_punct2 )'`"
            # numeric_punctuation .,
            # unrecognised_char |
            # ok_repeated_ch_non_alphanum_wds -?*=
            # conflict_set_I_l_1 Il1[]
            # preserve_interword_spaces 0
            # tessedit_enable_dict_correction 0
            # tessedit_enable_bigram_correction 1
            # stopper_smallword_size 2
            # wordrec_max_join_chunks 4
            # suspect_space_level 100
            # suspect_short_words 2
            # language_model_ngram_on 0
            # language_model_ngram_order 8
            # language_model_min_compound_length 3
            # language_model_penalty_non_freq_dict_word 0.1
            # language_model_penalty_non_dict_word 0.15
            # language_model_penalty_punc 0.2
            # language_model_penalty_case 0.1
            # language_model_penalty_script 0.5
            # language_model_penalty_chartype 0.3
            # language_model_penalty_spacing 0.05
            # textord_max_noise_size 7
            # enable_noise_removal 1
            # classify_bln_numeric_mode 0
            # lstm_use_matrix 1
            # user_words_file
            # user_patterns_file
            for (n, input_file) in enumerate(self.input_files):
                page_id = input_file.pageId or input_file.ID
                LOG.info("INPUT FILE %i / %s", n, page_id)
                pcgts = page_from_file(
                    self.workspace.download_file(input_file))
                page = pcgts.get_Page()

                # add metadata about this operation and its runtime parameters:
                metadata = pcgts.get_Metadata()  # ensured by from_file()
                metadata.add_MetadataItem(
                    MetadataItemType(
                        type_="processingStep",
                        name=self.ocrd_tool['steps'][0],
                        value=TOOL,
                        Labels=[
                            LabelsType(externalModel="ocrd-tool",
                                       externalId="parameters",
                                       Label=[
                                           LabelType(
                                               type_=name,
                                               value=self.parameter[name])
                                           for name in self.parameter.keys()
                                       ])
                        ]))
                page_image, page_xywh, page_image_info = self.workspace.image_from_page(
                    page, page_id)
                if self.parameter['dpi'] > 0:
                    dpi = self.parameter['dpi']
                    LOG.info(
                        "Page '%s' images will use %d DPI from parameter override",
                        page_id, dpi)
                elif page_image_info.resolution != 1:
                    dpi = page_image_info.resolution
                    if page_image_info.resolutionUnit == 'cm':
                        dpi = round(dpi * 2.54)
                    LOG.info(
                        "Page '%s' images will use %d DPI from image meta-data",
                        page_id, dpi)
                else:
                    dpi = 0
                    LOG.info(
                        "Page '%s' images will use DPI estimated from segmentation",
                        page_id)
                if dpi:
                    tessapi.SetVariable('user_defined_dpi', str(dpi))

                LOG.info("Processing page '%s'", page_id)
                # materialise the iterator so the emptiness check below works:
                regions = list(itertools.chain.from_iterable(
                    [page.get_TextRegion()] + [
                        subregion.get_TextRegion()
                        for subregion in page.get_TableRegion()
                    ]))
                if not regions:
                    LOG.warning("Page '%s' contains no text regions", page_id)
                else:
                    self._process_regions(tessapi, regions, page_image,
                                          page_xywh)
                page_update_higher_textequiv_levels(maxlevel, pcgts)

                file_id = make_file_id(input_file, self.output_file_grp)
                pcgts.set_pcGtsId(file_id)
                self.workspace.add_file(ID=file_id,
                                        file_grp=self.output_file_grp,
                                        pageId=input_file.pageId,
                                        mimetype=MIMETYPE_PAGE,
                                        local_filename=os.path.join(
                                            self.output_file_grp,
                                            file_id + '.xml'),
                                        content=to_xml(pcgts))
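The call to `page_update_higher_textequiv_levels` above is what keeps region and line text consistent with the newly recognised words. Below is a minimal sketch of the idea for the word-to-line case only; it is not the actual ocrd_tesserocr implementation and it ignores readingDirection, textLineOrder and Relation/join handling.

# Minimal sketch of propagating word results up to the line level
# (hedged: the real page_update_higher_textequiv_levels also handles
# reading order, line order, and the glyph/word/region levels).
from ocrd_models.ocrd_page import TextEquivType

def update_line_text(pcgts):
    for region in pcgts.get_Page().get_TextRegion():
        for line in region.get_TextLine():
            words = line.get_Word()
            if not words:
                continue
            texts = [word.get_TextEquiv()[0].Unicode if word.get_TextEquiv() else ''
                     for word in words]
            # join word texts by whitespace and replace the line's TextEquiv
            line.set_TextEquiv([TextEquivType(Unicode=' '.join(texts))])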
Exemplo n.º 15
    def process(self):
        """Performs word segmentation with Tesseract on the workspace.
        
        Open and deserialize PAGE input files and their respective images,
        then iterate over the element hierarchy down to the textline level,
        and remove any existing Word elements (unless ``overwrite_words``
        is False).
        
        Set up Tesseract to detect words, and add each one to the line
        at the detected coordinates.
        
        Produce a new output file by serialising the resulting hierarchy.
        """
        assert_file_grp_cardinality(self.input_file_grp, 1)
        assert_file_grp_cardinality(self.output_file_grp, 1)

        overwrite_words = self.parameter['overwrite_words']

        with PyTessBaseAPI(
            psm=PSM.SINGLE_LINE,
            path=TESSDATA_PREFIX
        ) as tessapi:
            for (n, input_file) in enumerate(self.input_files):
                page_id = input_file.pageId or input_file.ID
                LOG.info("INPUT FILE %i / %s", n, page_id)
                pcgts = page_from_file(self.workspace.download_file(input_file))
                page = pcgts.get_Page()
                
                # add metadata about this operation and its runtime parameters:
                metadata = pcgts.get_Metadata() # ensured by from_file()
                metadata.add_MetadataItem(
                    MetadataItemType(type_="processingStep",
                                     name=self.ocrd_tool['steps'][0],
                                     value=TOOL,
                                     Labels=[LabelsType(
                                         externalModel="ocrd-tool",
                                         externalId="parameters",
                                         Label=[LabelType(type_=name,
                                                          value=self.parameter[name])
                                                for name in self.parameter.keys()])]))
                page_image, page_coords, page_image_info = self.workspace.image_from_page(
                    page, page_id)
                if self.parameter['dpi'] > 0:
                    dpi = self.parameter['dpi']
                    LOG.info("Page '%s' images will use %d DPI from parameter override", page_id, dpi)
                elif page_image_info.resolution != 1:
                    dpi = page_image_info.resolution
                    if page_image_info.resolutionUnit == 'cm':
                        dpi = round(dpi * 2.54)
                    LOG.info("Page '%s' images will use %d DPI from image meta-data", page_id, dpi)
                else:
                    dpi = 0
                    LOG.info("Page '%s' images will use DPI estimated from segmentation", page_id)
                if dpi:
                    tessapi.SetVariable('user_defined_dpi', str(dpi))
                
                for region in page.get_TextRegion():
                    region_image, region_coords = self.workspace.image_from_segment(
                        region, page_image, page_coords)
                    for line in region.get_TextLine():
                        if line.get_Word():
                            if overwrite_words:
                                LOG.info('removing existing Words in line "%s"', line.id)
                                line.set_Word([])
                            else:
                                LOG.warning('keeping existing Words in line "%s"', line.id)
                        LOG.debug("Detecting words in line '%s'", line.id)
                        line_image, line_coords = self.workspace.image_from_segment(
                            line, region_image, region_coords)
                        tessapi.SetImage(line_image)
                        for word_no, component in enumerate(tessapi.GetComponentImages(RIL.WORD, True, raw_image=True)):
                            word_id = '%s_word%04d' % (line.id, word_no)
                            word_polygon = polygon_from_xywh(component[1])
                            word_polygon = coordinates_for_segment(word_polygon, line_image, line_coords)
                            word_points = points_from_polygon(word_polygon)
                            line.add_Word(WordType(
                                id=word_id, Coords=CoordsType(word_points)))
                            
                file_id = make_file_id(input_file, self.output_file_grp)
                pcgts.set_pcGtsId(file_id)
                self.workspace.add_file(
                    ID=file_id,
                    file_grp=self.output_file_grp,
                    pageId=input_file.pageId,
                    mimetype=MIMETYPE_PAGE,
                    local_filename=os.path.join(self.output_file_grp,
                                                file_id + '.xml'),
                    content=to_xml(pcgts))
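Tesseract reports each word's bounding box relative to the cropped line image, so the polygon has to be mapped back into absolute page coordinates before it is written as PAGE Coords. Here is a toy illustration of that chain with the same ocrd_utils helpers as above; the fixed offset merely stands in for the real transform that `coordinates_for_segment` would apply.

# Toy illustration of the bbox -> polygon -> PAGE points chain above.
# The fixed offset stands in for the inverse crop/deskew transform that
# coordinates_for_segment(polygon, line_image, line_coords) would apply.
from ocrd_utils import points_from_polygon, polygon_from_xywh

bbox = {'x': 10, 'y': 5, 'w': 40, 'h': 20}   # word bbox, line-relative
polygon = polygon_from_xywh(bbox)            # 4 corner points
offset_x, offset_y = 100, 200                # assumed line offset on the page
polygon = [[x + offset_x, y + offset_y] for x, y in polygon]
print(points_from_polygon(polygon))          # "110,205 150,205 150,225 110,225"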
Exemplo n.º 16
    def process(self):
        """Performs (text) line segmentation with Tesseract on the workspace.
        
        Open and deserialize PAGE input files and their respective images,
        then iterate over the element hierarchy down to the (text) region level,
        and remove any existing TextLine elements (unless ``overwrite_lines``
        is False).
        
        Set up Tesseract to detect lines, and add each one to the region
        at the detected coordinates.
        
        Produce a new output file by serialising the resulting hierarchy.
        """
        assert_file_grp_cardinality(self.input_file_grp, 1)
        assert_file_grp_cardinality(self.output_file_grp, 1)

        overwrite_lines = self.parameter['overwrite_lines']
        
        with PyTessBaseAPI(
                psm=PSM.SINGLE_BLOCK,
                path=TESSDATA_PREFIX
        ) as tessapi:
            for (n, input_file) in enumerate(self.input_files):
                page_id = input_file.pageId or input_file.ID
                LOG.info("INPUT FILE %i / %s", n, page_id)
                pcgts = page_from_file(self.workspace.download_file(input_file))
                page = pcgts.get_Page()
                
                # add metadata about this operation and its runtime parameters:
                metadata = pcgts.get_Metadata() # ensured by from_file()
                metadata.add_MetadataItem(
                    MetadataItemType(type_="processingStep",
                                     name=self.ocrd_tool['steps'][0],
                                     value=TOOL,
                                     Labels=[LabelsType(
                                         externalModel="ocrd-tool",
                                         externalId="parameters",
                                         Label=[LabelType(type_=name,
                                                          value=self.parameter[name])
                                                for name in self.parameter.keys()])]))
                
                page_image, page_coords, page_image_info = self.workspace.image_from_page(
                    page, page_id)
                if self.parameter['dpi'] > 0:
                    dpi = self.parameter['dpi']
                    LOG.info("Page '%s' images will use %d DPI from parameter override", page_id, dpi)
                elif page_image_info.resolution != 1:
                    dpi = page_image_info.resolution
                    if page_image_info.resolutionUnit == 'cm':
                        dpi = round(dpi * 2.54)
                    LOG.info("Page '%s' images will use %d DPI from image meta-data", page_id, dpi)
                else:
                    dpi = 0
                    LOG.info("Page '%s' images will use DPI estimated from segmentation", page_id)
                if dpi:
                    tessapi.SetVariable('user_defined_dpi', str(dpi))
                
                for region in itertools.chain.from_iterable(
                        [page.get_TextRegion()] +
                        [subregion.get_TextRegion() for subregion in page.get_TableRegion()]):
                    if region.get_TextLine():
                        if overwrite_lines:
                            LOG.info('removing existing TextLines in region "%s"', region.id)
                            region.set_TextLine([])
                        else:
                            LOG.warning('keeping existing TextLines in region "%s"', region.id)
                    LOG.debug("Detecting lines in region '%s'", region.id)
                    region_image, region_coords = self.workspace.image_from_segment(
                        region, page_image, page_coords)
                    region_polygon = coordinates_of_segment(region, region_image, region_coords)
                    region_poly = Polygon(region_polygon)
                    tessapi.SetImage(region_image)
                    for line_no, component in enumerate(tessapi.GetComponentImages(RIL.TEXTLINE, True, raw_image=True)):
                        line_id = '%s_line%04d' % (region.id, line_no)
                        line_polygon = polygon_from_xywh(component[1])
                        line_poly = Polygon(line_polygon)
                        if not line_poly.within(region_poly):
                            # this could happen due to rotation
                            interline = line_poly.intersection(region_poly)
                            if interline.is_empty:
                                continue # ignore this line
                            if hasattr(interline, 'geoms'):
                                # is (heterogeneous) GeometryCollection
                                area = 0
                                for geom in interline.geoms:
                                    if geom.area > area:
                                        area = geom.area
                                        interline = geom
                                if not area:
                                    continue
                            line_poly = interline.convex_hull
                            line_polygon = line_poly.exterior.coords
                        line_polygon = coordinates_for_segment(line_polygon, region_image, region_coords)
                        line_points = points_from_polygon(line_polygon)
                        region.add_TextLine(TextLineType(
                            id=line_id, Coords=CoordsType(line_points)))
                
                file_id = make_file_id(input_file, self.output_file_grp)
                pcgts.set_pcGtsId(file_id)
                self.workspace.add_file(
                    ID=file_id,
                    file_grp=self.output_file_grp,
                    pageId=input_file.pageId,
                    mimetype=MIMETYPE_PAGE,
                    local_filename=os.path.join(self.output_file_grp,
                                                file_id + '.xml'),
                    content=to_xml(pcgts))
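The Shapely clipping above only kicks in when a detected line box extends beyond its (possibly rotated) region: the largest intersection part is kept and replaced by its convex hull. The same logic in isolation, as a sketch on plain coordinates independent of any PAGE objects:

# Stand-alone sketch of the line-clipping step above (plain Shapely only).
from shapely.geometry import Polygon

region_poly = Polygon([(0, 0), (100, 0), (100, 50), (0, 50)])
line_poly = Polygon([(80, 10), (130, 10), (130, 30), (80, 30)])  # sticks out

if not line_poly.within(region_poly):
    inter = line_poly.intersection(region_poly)
    if not inter.is_empty:
        if hasattr(inter, 'geoms'):
            # multi-part result (e.g. GeometryCollection): keep the largest part
            inter = max(inter.geoms, key=lambda geom: geom.area)
        line_poly = inter.convex_hull

print(list(line_poly.exterior.coords))  # clipped to (80,10)...(100,30)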
    def process(self):
        try:
            page_grp, self.image_grp = self.output_file_grp.split(',')
        except ValueError:
            page_grp = self.output_file_grp
            self.image_grp = FALLBACK_IMAGE_GRP
            LOG.info(
                "No output file group for images specified, falling back to '%s'",
                FALLBACK_IMAGE_GRP)
        oplevel = self.parameter['operation_level']

        for (n, input_file) in enumerate(self.input_files):
            page_id = input_file.pageId or input_file.ID

            LOG.info("INPUT FILE %i / %s", n, page_id)
            pcgts = page_from_file(self.workspace.download_file(input_file))
            metadata = pcgts.get_Metadata()
            metadata.add_MetadataItem(
                MetadataItemType(
                    type_="processingStep",
                    name=self.ocrd_tool['steps'][0],
                    value=TOOL,
                    Labels=[
                        LabelsType(  #externalRef="parameters",
                            Label=[
                                LabelType(type_=name,
                                          value=self.parameter[name])
                                for name in self.parameter.keys()
                            ])
                    ]))

            page = pcgts.get_Page()
            page_image, page_xywh, page_image_info = self.workspace.image_from_page(
                page, page_id, feature_filter="binarized")
            LOG.info("Binarizing on '%s' level in page '%s'", oplevel, page_id)

            if oplevel == "page":
                self._process_segment(page_image, page, page_xywh, page_id,
                                      input_file, n)
            else:
                regions = page.get_TextRegion() + page.get_TableRegion()
                if not regions:
                    LOG.warning("Page '%s' contains no text regions", page_id)
                for (k, region) in enumerate(regions):
                    region_image, region_xywh = self.workspace.image_from_segment(
                        region, page_image, page_xywh)
                    # TODO: not tested on regions
                    self._process_segment(region_image, page, region_xywh,
                                          region.id, input_file,
                                          str(n) + "_" + str(k))

            # Use input_file's basename for the new file -
            # this way the files retain the same basenames:
            file_id = input_file.ID.replace(self.input_file_grp, page_grp)
            if file_id == input_file.ID:
                file_id = concat_padded(self.output_file_grp, n)
            #LOG.info('Adding force option to False')
            self.workspace.add_file(ID=file_id,
                                    file_grp=page_grp,
                                    pageId=input_file.pageId,
                                    mimetype=MIMETYPE_PAGE,
                                    local_filename=os.path.join(
                                        self.output_file_grp,
                                        file_id + '.xml'),
                                    content=to_xml(pcgts).encode('utf-8'),
                                    force=self.parameter['force'])
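The output file ID above is derived by substituting the output group name into the input file's ID; only when that substitution changes nothing does `concat_padded` produce a numbered fallback. In isolation, assuming `ocrd_utils.concat_padded` and example group names:

# Sketch of the derived-file-ID fallback above (example names are assumptions).
from ocrd_utils import concat_padded

input_id, n = 'OCR-D-IMG_0007', 7
input_grp, output_grp = 'OCR-D-IMG', 'OCR-D-BIN'

file_id = input_id.replace(input_grp, output_grp)    # 'OCR-D-BIN_0007'
if file_id == input_id:
    # input ID did not contain the group name: fall back to a padded counter
    file_id = concat_padded(output_grp, n)           # 'OCR-D-BIN_0007'
print(file_id)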
    def process(self):
        """Performs segmentation on the input binary image

        Produces a PageXML file as output.
        """
        LOG = getLogger('processor.PixelClassifierSegmentation')
        assert_file_grp_cardinality(self.input_file_grp, 1)
        assert_file_grp_cardinality(self.output_file_grp, 1)

        overwrite_regions = self.parameter['overwrite_regions']
        xheight = self.parameter['xheight']
        gpu_allow_growth = self.parameter['gpu_allow_growth']
        resize_height = self.parameter['resize_height']

        model = self.parameter['model']
        if model == '__DEFAULT__':
            from ocrd_pc_segmentation import DEFAULT_SEGMENTATION_MODEL_PATH
            model = DEFAULT_SEGMENTATION_MODEL_PATH
        elif model == '__LEGACY__':
            from ocrd_pc_segmentation import LEGACY_SEGMENTATION_MODEL_PATH
            model = LEGACY_SEGMENTATION_MODEL_PATH

        page_grp = self.output_file_grp

        for n, input_file in enumerate(self.input_files):
            page_id = input_file.pageId or input_file.ID
            LOG.info("INPUT FILE %i / %s", n, page_id)
            pcgts = page_from_file(self.workspace.download_file(input_file))
            metadata = pcgts.get_Metadata()  # ensured by from_file()
            metadata.add_MetadataItem(
                MetadataItemType(
                    type_="processingStep",
                    name=self.ocrd_tool['steps'][0],
                    value=TOOL,
                    Labels=[
                        LabelsType(externalModel="ocrd-tool",
                                   externalId="parameters",
                                   Label=[
                                       LabelType(type_=name,
                                                 value=self.parameter[name])
                                       for name in self.parameter.keys()
                                   ])
                    ]))
            page = pcgts.get_Page()
            if page.get_TextRegion():
                if overwrite_regions:
                    LOG.info('removing existing TextRegions')
                    page.set_TextRegion([])
                else:
                    LOG.warning('keeping existing TextRegions')

            page.set_AdvertRegion([])
            page.set_ChartRegion([])
            page.set_ChemRegion([])
            page.set_GraphicRegion([])
            page.set_ImageRegion([])
            page.set_LineDrawingRegion([])
            page.set_MathsRegion([])
            page.set_MusicRegion([])
            page.set_NoiseRegion([])
            page.set_SeparatorRegion([])
            page.set_TableRegion([])
            page.set_UnknownRegion([])

            page_image, page_coords, _ = self.workspace.image_from_page(
                page, page_id)

            # ensure the image doesn't have an alpha channel
            if page_image.mode[-1] == "A":
                page_image = page_image.convert(mode=page_image.mode[0:-1])
            page_binary = page_image.convert(mode='1')

            self._process_page(page, np.asarray(page_image),
                               np.asarray(page_binary), page_coords, xheight,
                               model, gpu_allow_growth, resize_height)

            file_id = make_file_id(input_file, self.output_file_grp)
            self.workspace.add_file(ID=file_id,
                                    file_grp=page_grp,
                                    pageId=input_file.pageId,
                                    mimetype=MIMETYPE_PAGE,
                                    local_filename=os.path.join(
                                        page_grp, file_id + '.xml'),
                                    content=to_xml(pcgts))
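The block of `set_*Region([])` calls above wipes every non-text region type before new segmentation is added. A more compact (but less grep-friendly) equivalent, sketched with `getattr` over the same generateDS setters:

# Compact sketch equivalent to the explicit set_*Region([]) calls above.
for name in ('Advert', 'Chart', 'Chem', 'Graphic', 'Image', 'LineDrawing',
             'Maths', 'Music', 'Noise', 'Separator', 'Table', 'Unknown'):
    getattr(page, 'set_%sRegion' % name)([])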
    def process(self):
        """Performs region segmentation by reading mask images in pseudo-colour.
        
        Open and deserialize each PAGE input file (or generate from image input file)
        from the first input file group, as well as mask image file from the second.
        
        Then iterate over all connected (equally colored) mask segments and compute
        convex hull contours for them. Convert them to polygons, and look up their
        color value in ``colordict`` to instantiate the appropriate region types
        (optionally with subtype). Instantiate and annotate regions accordingly.
        
        Produce a new output file by serialising the resulting hierarchy.
        """
        colordict = self.parameter['colordict']
        if not colordict:
            LOG.info('Using default PAGE colordict')
            colordict = dict(('#' + col, name)
                             for name, col in CLASSES.items()
                             if name)
        typedict = {"TextRegion": TextTypeSimpleType,
                    "GraphicRegion": GraphicsTypeSimpleType,
                    "ChartRegion": ChartTypeSimpleType}
        ifgs = self.input_file_grp.split(",") # input file groups
        if len(ifgs) != 2:
            raise Exception("need 2 input file groups (base and mask)")
        # collect input file tuples
        ifts = self.zip_input_files(ifgs) # input file tuples
        # process input file tuples
        for n, ift in enumerate(ifts):
            input_file, segmentation_file = ift
            LOG.info("processing page %s", input_file.pageId)
            pcgts = page_from_file(self.workspace.download_file(input_file))
            page = pcgts.get_Page()

            # add metadata about this operation and its runtime parameters:
            metadata = pcgts.get_Metadata() # ensured by from_file()
            metadata.add_MetadataItem(
                MetadataItemType(type_="processingStep",
                                 name=self.ocrd_tool['steps'][0],
                                 value=TOOL,
                                 Labels=[LabelsType(
                                     externalModel="ocrd-tool",
                                     externalId="parameters",
                                     Label=[LabelType(type_=name,
                                                      value=self.parameter[name])
                                            for name in self.parameter.keys()])]))

            # import mask image
            segmentation_filename = self.workspace.download_file(segmentation_file).local_filename
            with pushd_popd(self.workspace.directory):
                segmentation_pil = Image.open(segmentation_filename)
            has_alpha = segmentation_pil.mode == 'RGBA'
            if has_alpha:
                colorformat = "#%08X"
            else:
                colorformat = "#%06X"
                if segmentation_pil.mode != 'RGB':
                    segmentation_pil = segmentation_pil.convert('RGB')
            # convert to array
            segmentation_array = np.array(segmentation_pil)
            # collapse 3 color channels
            segmentation_array = segmentation_array.dot(
                np.array([2**24, 2**16, 2**8, 1], np.uint32)[0 if has_alpha else 1:])
            # partition mapped colors vs background
            colors = np.unique(segmentation_array)
            bgcolors = []
            for i, color in enumerate(colors):
                colorname = colorformat % color
                if (colorname not in colordict or
                    not colordict[colorname]):
                    #raise Exception("Unknown color %s (not in colordict)" % colorname)
                    LOG.info("Ignoring background color %s", colorname)
                    bgcolors.append(i)
            background = np.zeros_like(segmentation_array, np.uint8)
            if bgcolors:
                for i in bgcolors:
                    background += np.array(segmentation_array == colors[i], np.uint8)
                colors = np.delete(colors, bgcolors, 0)
            # iterate over mask for each mapped color/class
            regionno = 0
            for color in colors:
                # get region (sub)type
                colorname = colorformat % color
                classname = colordict[colorname]
                regiontype = None
                custom = None
                if ":" in classname:
                    classname, regiontype = classname.split(":")
                    if classname in typedict:
                        typename = membername(typedict[classname], regiontype)
                        if typename == regiontype:
                            # not predefined in PAGE: use other + custom
                            custom = "subtype:%s" % regiontype
                            regiontype = "other"
                    else:
                        custom = "subtype:%s" % regiontype
                if classname + "Type" not in globals():
                    raise Exception("Unknown class '%s' for color %s in colordict" % (classname, colorname))
                classtype = globals()[classname + "Type"]
                if classtype is BorderType:
                    # mask from all non-background regions
                    classmask = 1 - background
                else:
                    # mask from current color/class
                    classmask = np.array(segmentation_array == color, np.uint8)
                if not np.count_nonzero(classmask):
                    continue
                # now get the contours and make polygons for them
                contours, _ = cv2.findContours(classmask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
                for contour in contours:
                    # (could also just take bounding boxes to avoid islands/inclusions...)
                    area = cv2.contourArea(contour)
                    # filter too small regions
                    area_pct = area / np.prod(segmentation_array.shape) * 100
                    if area < 100 and area_pct < 0.1:
                        LOG.warning('ignoring contour of only %.1f%% area for %s',
                                    area_pct, classname)
                        continue
                    LOG.info('found region %s:%s:%s with area %.1f%%',
                             classname, regiontype or '', custom or '', area_pct)
                    # simplify shape
                    poly = cv2.approxPolyDP(contour, 2, False)[:, 0, ::] # already ordered x,y
                    if len(poly) < 4:
                        LOG.warning('ignoring contour of only %d points (area %.1f%%) for %s',
                                    len(poly), area_pct, classname)
                        continue
                    if classtype is BorderType:
                        # add Border
                        page.set_Border(BorderType(Coords=CoordsType(points=points_from_polygon(poly))))
                        break
                    else:
                        # instantiate region
                        regionno += 1
                        region = classtype(id="region_%d" % regionno, type_=regiontype, custom=custom,
                                           Coords=CoordsType(points=points_from_polygon(poly)))
                        # add region
                        getattr(page, 'add_%s' % classname)(region)
                    
            # Use input_file's basename for the new file -
            # this way the files retain the same basenames:
            file_id = input_file.ID.replace(ifgs[0], self.output_file_grp)
            if file_id == input_file.ID:
                file_id = concat_padded(self.output_file_grp, n)
            self.workspace.add_file(
                ID=file_id,
                file_grp=self.output_file_grp,
                pageId=input_file.pageId,
                mimetype=MIMETYPE_PAGE,
                local_filename=os.path.join(self.output_file_grp,
                                            file_id + '.xml'),
                content=to_xml(pcgts))
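The channel `dot` product above is what turns each mask pixel's colour into a single integer, so `np.unique` can enumerate the distinct classes. Isolated on a tiny RGB array (no alpha, matching the `#%06X` branch above; the colour values are arbitrary):

# Isolated illustration of packing RGB channels into one integer per pixel,
# matching the non-alpha branch above (colour values here are arbitrary).
import numpy as np

mask = np.array([[[0x8B, 0x45, 0x13],    # some region colour
                  [0xFF, 0xFF, 0xFF]]],  # background colour
                dtype=np.uint8)
packed = mask.dot(np.array([2**16, 2**8, 1], np.uint32))
print(['#%06X' % value for value in np.unique(packed)])
# -> ['#8B4513', '#FFFFFF']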
    def process(self):
        try:
            page_grp, self.image_grp = self.output_file_grp.split(',')
        except ValueError:
            page_grp = self.output_file_grp
            self.image_grp = FALLBACK_IMAGE_GRP
            LOG.info(
                "No output file group for images specified, falling back to '%s'",
                FALLBACK_IMAGE_GRP)
        if not torch.cuda.is_available():
            LOG.error("CUDA is not available: no CUDA-capable GPU detected or CUDA not installed.")
            sys.exit(1)

        path = self.parameter['pix2pixHD']
        if not Path(path).is_dir():
            LOG.error("""\
                NVIDIA's pix2pixHD was not found at '%s'. Make sure the `pix2pixHD` parameter
                in ocrd-tools.json points to your local clone of the pix2pixHD repository.

                pix2pixHD can be downloaded from https://github.com/NVIDIA/pix2pixHD
                """ % path)
            sys.exit(1)
        model_file_path = os.path.join(path, 'checkpoints/latest_net_G.pth')
        if not Path(model_file_path).is_file():
            LOG.error("""\
                pix2pixHD model file was not found at '%s'. Make sure this file exists.
                """ % model_file_path)
            sys.exit(1)

        opt, model = self.prepare_options(path)

        oplevel = self.parameter['operation_level']
        for (n, input_file) in enumerate(self.input_files):
            page_id = input_file.pageId or input_file.ID
            LOG.info("INPUT FILE %s", page_id)

            pcgts = page_from_file(self.workspace.download_file(input_file))
            metadata = pcgts.get_Metadata()
            metadata.add_MetadataItem(
                MetadataItemType(
                    type_="processingStep",
                    name=self.ocrd_tool['steps'][0],
                    value=TOOL,
                    Labels=[
                        LabelsType(  #externalRef="parameters",
                            Label=[
                                LabelType(type_=name,
                                          value=self.parameter[name])
                                for name in self.parameter.keys()
                            ])
                    ]))

            page = pcgts.get_Page()

            page_image, page_xywh, page_image_info = self.workspace.image_from_page(
                page,
                page_id,
                feature_filter='dewarped',
                feature_selector='binarized'
            )  # images should be deskewed and cropped
            if oplevel == 'page':
                dataset = self.prepare_data(opt, page_image, path)
                orig_img_size = page_image.size
                self._process_segment(model, dataset, page, page_xywh, page_id,
                                      input_file, orig_img_size, n)
            else:
                regions = page.get_TextRegion() + page.get_TableRegion(
                )  #get all regions?
                if not regions:
                    LOG.warning("Page '%s' contains no text regions", page_id)
                for (k, region) in enumerate(regions):
                    region_image, region_xywh = self.workspace.image_from_segment(
                        region, page_image, page_xywh)
                    # TODO: not tested on regions
                    # TODO: region has to exist as a physical file to be processed by pix2pixHD
                    dataset = self.prepare_data(opt, region_image, path)
                    orig_img_size = region_image.size
                    self._process_segment(model, dataset, page, region_xywh,
                                          region.id, input_file, orig_img_size,
                                          n)

            # Use input_file's basename for the new file -
            # this way the files retain the same basenames:
            file_id = input_file.ID.replace(self.input_file_grp, page_grp)
            if file_id == input_file.ID:
                file_id = concat_padded(page_grp, n)
            self.workspace.add_file(ID=file_id,
                                    file_grp=page_grp,
                                    pageId=input_file.pageId,
                                    mimetype=MIMETYPE_PAGE,
                                    local_filename=os.path.join(
                                        page_grp, file_id + '.xml'),
                                    content=to_xml(pcgts).encode('utf-8'),
                                    force=self.parameter['force'])
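        # Hedged suggestion (not in the original processor): os.rmdir below only
        # removes an empty directory; staging the test_A/ images in a directory
        # created with tempfile.mkdtemp() and removing it afterwards with
        # shutil.rmtree(staging_dir, ignore_errors=True) would be more robust.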
        os.rmdir(self.input_file_grp +
                 "/test_A/")  #FIXME: better way of deleting a temp_dir?
Exemplo n.º 21
    def process(self):
        """Extract page images and region descriptions (type and coordinates) from the workspace.
        
        Open and deserialize PAGE input files and their respective images,
        then iterate over the element hierarchy down to the region level.
        
        Get all regions with their types (region element class), sub-types (@type)
        and coordinates relative to the page (which depending on the workflow could
        already be cropped, deskewed, dewarped, binarized etc). Extract the image of
        the (cropped, deskewed, dewarped) page, both in binarized form (if available)
        and non-binarized form. In addition, create a new image with masks for all
        regions, color-coded by type. Create two JSON files with region types and
        coordinates: one (page-wise) in our custom format and one (global) in MS-COCO.
        
        The output file group may be given as a comma-separated list to separate
        these 3 page-level images. Write files as follows:
        * in the first (or only) output file group (directory):
          - ID + '.png': raw image of the (preprocessed) page
          - ID + '.json': region coordinates/classes (custom format)
        * in the second (or first) output file group (directory):
          - ID + '.bin.png': binarized image of the (preprocessed) page, if available
        * in the third (or first) output file group (directory):
          - ID + '.dbg.png': debug image
        
        In addition, write a file for all pages at once:
        * in the third (or first) output file group (directory):
          - output_file_grp + '.coco.json': region coordinates/classes (MS-COCO format)
          - output_file_grp + '.colordict.json': color definitions (as in PAGE viewer)
        
        (This is intended for training and evaluation of region segmentation models.)
        """
        file_groups = self.output_file_grp.split(',')
        if len(file_groups) > 3:
            raise Exception(
                "at most 3 output file grps allowed (raw, [binarized, [mask]] image)"
            )
        if len(file_groups) > 2:
            dbg_image_grp = file_groups[2]
        else:
            dbg_image_grp = file_groups[0]
            LOG.info(
                "No output file group for debug images specified, falling back to output filegrp '%s'",
                dbg_image_grp)
        if len(file_groups) > 1:
            bin_image_grp = file_groups[1]
        else:
            bin_image_grp = file_groups[0]
            LOG.info(
                "No output file group for binarized images specified, falling back to output filegrp '%s'",
                bin_image_grp)
        self.output_file_grp = file_groups[0]

        # COCO: init data structures
        images = list()
        annotations = list()
        categories = list()
        i = 0
        for cat, color in CLASSES.items():
            # COCO format does not allow alpha channel
            color = (int(color[0:2], 16),
                     int(color[2:4], 16),
                     int(color[4:6], 16))
            try:
                supercat, name = cat.split(':')
            except ValueError:
                name = cat
                supercat = ''
            categories.append({
                'id': i,
                'name': name,
                'supercategory': supercat,
                'source': 'PAGE',
                'color': color
            })
            i += 1

        i = 0
        # pylint: disable=attribute-defined-outside-init
        for n, input_file in enumerate(self.input_files):
            file_id = input_file.ID.replace(self.input_file_grp,
                                            self.output_file_grp)
            page_id = input_file.pageId or input_file.ID
            num_page_id = int(page_id.strip(page_id.strip("0123456789")))
            LOG.info("INPUT FILE %i / %s", n, page_id)
            pcgts = page_from_file(self.workspace.download_file(input_file))
            page = pcgts.get_Page()
            ptype = page.get_type()
            metadata = pcgts.get_Metadata()  # ensured by from_file()
            metadata.add_MetadataItem(
                MetadataItemType(
                    type_="processingStep",
                    name=self.ocrd_tool['steps'][0],
                    value=TOOL,
                    Labels=[
                        LabelsType(externalModel="ocrd-tool",
                                   externalId="parameters",
                                   Label=[
                                       LabelType(type_=name,
                                                 value=self.parameter[name])
                                       for name in self.parameter
                                   ])
                    ]))
            page_image, page_coords, page_image_info = self.workspace.image_from_page(
                page,
                page_id,
                feature_filter='binarized',
                transparency=self.parameter['transparency'])
            if page_image_info.resolution != 1:
                dpi = page_image_info.resolution
                if page_image_info.resolutionUnit == 'cm':
                    dpi = round(dpi * 2.54)
            else:
                dpi = None
            file_path = self.workspace.save_image_file(
                page_image,
                file_id,
                self.output_file_grp,
                page_id=page_id,
                mimetype=self.parameter['mimetype'])
            try:
                page_image_bin, _, _ = self.workspace.image_from_page(
                    page,
                    page_id,
                    feature_selector='binarized',
                    transparency=self.parameter['transparency'])
                self.workspace.save_image_file(page_image_bin,
                                               file_id + '.bin',
                                               bin_image_grp,
                                               page_id=page_id)
            except Exception as err:
                if err.args[0].startswith('Found no AlternativeImage'):
                    LOG.warning(
                        'Page "%s" has no binarized images, skipping .bin',
                        page_id)
                else:
                    raise
            page_image_dbg = Image.new(mode='RGBA',
                                       size=page_image.size,
                                       color='#' + CLASSES[''])
            if page.get_Border():
                polygon = coordinates_of_segment(page.get_Border(), page_image,
                                                 page_coords).tolist()
                ImageDraw.Draw(page_image_dbg).polygon(
                    list(map(tuple, polygon)), fill='#' + CLASSES['Border'])
            else:
                page_image_dbg.paste(
                    '#' + CLASSES['Border'],
                    (0, 0, page_image.width, page_image.height))
            regions = dict()
            for name in CLASSES.keys():
                if not name or name == 'Border' or ':' in name:
                    # no subtypes here
                    continue
                regions[name] = getattr(page, 'get_' + name)()
            description = {'angle': page.get_orientation()}
            Neighbor = namedtuple('Neighbor', ['id', 'poly', 'type'])
            neighbors = []
            for rtype, rlist in regions.items():
                for region in rlist:
                    if rtype in ['TextRegion', 'ChartRegion', 'GraphicRegion']:
                        subrtype = region.get_type()
                    else:
                        subrtype = None
                    polygon = coordinates_of_segment(region, page_image,
                                                     page_coords)
                    polygon2 = polygon.reshape(1, -1).tolist()
                    polygon = polygon.tolist()
                    xywh = xywh_from_polygon(polygon)
                    # validate coordinates and check intersection with neighbours
                    # (which would melt into another in the mask image):
                    try:
                        poly = Polygon(polygon)
                        reason = ''
                    except ValueError as err:
                        poly = None
                        reason = str(err)
                    if poly is None:
                        pass
                    elif not poly.is_valid:
                        reason = explain_validity(poly)
                    elif poly.is_empty:
                        reason = 'is empty'
                    elif poly.bounds[0] < 0 or poly.bounds[1] < 0:
                        reason = 'is negative'
                    elif poly.length < 4:
                        reason = 'has too few points'
                    if reason:
                        LOG.error('Page "%s" region "%s" %s', page_id,
                                  region.id, reason)
                        continue
                    poly_prep = prep(poly)
                    for neighbor in neighbors:
                        if (rtype == neighbor.type
                                and poly_prep.intersects(neighbor.poly)
                                and poly.intersection(neighbor.poly).area > 0):
                            LOG.warning('Page "%s" region "%s" intersects neighbour "%s" (IoU: %.3f)',
                                        page_id, region.id, neighbor.id,
                                        poly.intersection(neighbor.poly).area / \
                                        poly.union(neighbor.poly).area)
                        elif (rtype != neighbor.type
                              and poly_prep.within(neighbor.poly)):
                            LOG.warning(
                                'Page "%s" region "%s" within neighbour "%s" (IoU: %.3f)',
                                page_id, region.id, neighbor.id,
                                poly.area / neighbor.poly.area)
                    neighbors.append(Neighbor(region.id, poly, rtype))
                    area = poly.area
                    description.setdefault('regions', []).append({
                        'type': rtype,
                        'subtype': subrtype,
                        'coords': polygon,
                        'area': area,
                        'features': page_coords['features'],
                        'DPI': dpi,
                        'region.ID': region.id,
                        'page.ID': page_id,
                        'page.type': ptype,
                        'file_grp': self.input_file_grp,
                        'METS.UID': self.workspace.mets.unique_identifier
                    })
                    # draw region:
                    ImageDraw.Draw(page_image_dbg).polygon(
                        list(map(tuple, polygon)),
                        fill='#' + CLASSES[(rtype + ':' +
                                            subrtype) if subrtype else rtype])
                    # COCO: add annotations
                    i += 1
                    annotations.append({
                        'id': i,
                        'image_id': num_page_id,
                        'category_id': next(
                            (cat['id'] for cat in categories if cat['name'] == subrtype),
                            next(cat['id'] for cat in categories if cat['name'] == rtype)),
                        'segmentation': polygon2,
                        'area': area,
                        'bbox': [xywh['x'], xywh['y'], xywh['w'], xywh['h']],
                        'iscrowd': 0
                    })

            self.workspace.save_image_file(page_image_dbg,
                                           file_id + '.dbg',
                                           dbg_image_grp,
                                           page_id=page_id,
                                           mimetype=self.parameter['mimetype'])
            self.workspace.add_file(
                ID=file_id + '.json',
                file_grp=dbg_image_grp,
                pageId=page_id,
                local_filename=file_path.replace(
                    MIME_TO_EXT[self.parameter['mimetype']], '.json'),
                mimetype='application/json',
                content=json.dumps(description))

            # COCO: add image
            images.append({
                # COCO does not allow string identifiers:
                # -> use numerical part of page_id
                'id': num_page_id,
                # all exported coordinates are relative to the cropped page:
                # -> use that for reference (instead of original page.imageFilename)
                'file_name': file_path,
                # -> use its size (instead of original page.imageWidth/page.imageHeight)
                'width': page_image.width,
                'height': page_image.height
            })

        # COCO: write result
        file_id = dbg_image_grp + '.coco.json'
        LOG.info('Writing COCO result file "%s" in "%s"', file_id,
                 dbg_image_grp)
        self.workspace.add_file(ID=file_id,
                                file_grp=dbg_image_grp,
                                local_filename=os.path.join(
                                    dbg_image_grp, file_id),
                                mimetype='application/json',
                                content=json.dumps({
                                    'categories': categories,
                                    'images': images,
                                    'annotations': annotations
                                }))

        # write inverse colordict (for ocrd-segment-from-masks)
        file_id = dbg_image_grp + '.colordict.json'
        LOG.info('Writing colordict file "%s" in the current directory', file_id)
        with open(file_id, 'w') as out:
            json.dump(
                dict(('#' + col, name) for name, col in CLASSES.items()
                     if name), out)
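The COCO `category_id` lookup above prefers a category matching the region's subtype and only then falls back to the plain region type. The same two-step lookup, isolated with a toy category list:

# Isolated sketch of the subtype-then-type COCO category lookup above (toy data).
categories = [
    {'id': 0, 'name': 'TextRegion', 'supercategory': ''},
    {'id': 1, 'name': 'paragraph', 'supercategory': 'TextRegion'},
]

def category_id(rtype, subrtype):
    # prefer a category named after the subtype, fall back to the region type
    return next((cat['id'] for cat in categories if cat['name'] == subrtype),
                next(cat['id'] for cat in categories if cat['name'] == rtype))

print(category_id('TextRegion', 'paragraph'))  # 1
print(category_id('TextRegion', None))         # 0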
Exemplo n.º 22
    def process(self):
        """Extract page image and replace original with it.
        
        Open and deserialize PAGE input files and their respective images,
        then go to the page hierarchy level.
        
        Retrieve the image of the (cropped, deskewed, dewarped) page, preferring
        the last annotated form (which, depending on the workflow, could be
        binarized or raw). Add that image file to the workspace with the fileGrp
        USE given in the second position of the output fileGrp, or ``OCR-D-IMG-SUBST``.
        Reference that file in the page (not as AlternativeImage but) as original
        image. Adjust all segment coordinates accordingly.
        
        Produce a new output file by serialising the resulting hierarchy.
        """
        try:
            page_grp, image_grp = self.output_file_grp.split(',')
        except ValueError:
            page_grp = self.output_file_grp
            image_grp = FALLBACK_FILEGRP_IMG
            LOG.info(
                "No output file group for images specified, falling back to '%s'",
                image_grp)
        feature_selector = self.parameter['feature_selector']
        feature_filter = self.parameter['feature_filter']
        adapt_coords = self.parameter['transform_coordinates']

        # pylint: disable=attribute-defined-outside-init
        for n, input_file in enumerate(self.input_files):
            file_id = input_file.ID.replace(self.input_file_grp, page_grp)
            if file_id == input_file.ID:
                file_id = concat_padded(page_grp, n)
            page_id = input_file.pageId or input_file.ID
            LOG.info("INPUT FILE %i / %s", n, page_id)
            pcgts = page_from_file(self.workspace.download_file(input_file))
            page = pcgts.get_Page()
            metadata = pcgts.get_Metadata()  # ensured by from_file()
            metadata.add_MetadataItem(
                MetadataItemType(
                    type_="processingStep",
                    name=self.ocrd_tool['steps'][0],
                    value=TOOL,
                    Labels=[
                        LabelsType(externalModel="ocrd-tool",
                                   externalId="parameters",
                                   Label=[
                                       LabelType(type_=name,
                                                 value=self.parameter[name])
                                       for name in self.parameter
                                   ])
                    ]))
            page_image, page_coords, page_image_info = self.workspace.image_from_page(
                page,
                page_id,
                feature_filter=feature_filter,
                feature_selector=feature_selector)
            if page_image_info.resolution != 1:
                dpi = page_image_info.resolution
                if page_image_info.resolutionUnit == 'cm':
                    dpi = round(dpi * 2.54)
            else:
                dpi = None
            # annotate extracted image
            file_path = self.workspace.save_image_file(
                page_image,
                file_id.replace(page_grp, image_grp),
                image_grp,
                page_id=input_file.pageId,
                mimetype='image/png')
            # replace original image
            page.set_imageFilename(file_path)
            # adjust all coordinates
            if adapt_coords:
                for region in page.get_AllRegions():
                    region_polygon = coordinates_of_segment(
                        region, page_image, page_coords)
                    region.get_Coords().points = points_from_polygon(
                        region_polygon)
                    if isinstance(region, TextRegionType):
                        for line in region.get_TextLine():
                            line_polygon = coordinates_of_segment(
                                line, page_image, page_coords)
                            line.get_Coords().points = points_from_polygon(
                                line_polygon)
                            for word in line.get_Word():
                                word_polygon = coordinates_of_segment(
                                    word, page_image, page_coords)
                                word.get_Coords().points = points_from_polygon(
                                    word_polygon)
                                for glyph in word.get_Glyph():
                                    glyph_polygon = coordinates_of_segment(
                                        glyph, page_image, page_coords)
                                    glyph.get_Coords(
                                    ).points = points_from_polygon(
                                        glyph_polygon)

            # update METS (add the PAGE file):
            file_path = os.path.join(page_grp, file_id + '.xml')
            out = self.workspace.add_file(ID=file_id,
                                          file_grp=page_grp,
                                          pageId=input_file.pageId,
                                          local_filename=file_path,
                                          mimetype=MIMETYPE_PAGE,
                                          content=to_xml(pcgts))
            LOG.info('created file ID: %s, file_grp: %s, path: %s', file_id,
                     page_grp, out.local_filename)
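Since the substituted image becomes the new original, every segment polygon has to be re-expressed relative to it; that is what the nested region/line/word/glyph loops above do. A condensed sketch of the same traversal, assuming the ocrd_utils helpers and the `page`, `page_image` and `page_coords` variables from the loop body above:

# Condensed sketch of the coordinate-adjustment loops above (assumes the
# page, page_image and page_coords variables of the original loop body).
from ocrd_utils import coordinates_of_segment, points_from_polygon

def adapt(segment, image, coords):
    polygon = coordinates_of_segment(segment, image, coords)
    segment.get_Coords().points = points_from_polygon(polygon)
    # descend into whichever child levels this segment type actually has
    for getter in ('get_TextLine', 'get_Word', 'get_Glyph'):
        for child in getattr(segment, getter, lambda: [])():
            adapt(child, image, coords)

for region in page.get_AllRegions():
    adapt(region, page_image, page_coords)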
Exemplo n.º 23
    def process(self):
        """Performs segmentation evaluation with Shapely on the workspace.
        
        Open and deserialize PAGE input files and their respective images,
        then iterate over the element hierarchy down to the region level.
        
        Return information on the plausibility of the segmentation into
        regions on the logging level.
        """
        plausibilize = self.parameter['plausibilize']

        for (n, input_file) in enumerate(self.input_files):
            page_id = input_file.pageId or input_file.ID
            LOG.info("INPUT FILE %i / %s", n, page_id)
            pcgts = page_from_file(self.workspace.download_file(input_file))
            metadata = pcgts.get_Metadata()  # ensured by from_file()
            metadata.add_MetadataItem(
                MetadataItemType(
                    type_="processingStep",
                    name=self.ocrd_tool['steps'][0],
                    value=TOOL,
                    # FIXME: externalRef is invalid by pagecontent.xsd, but ocrd does not reflect this
                    # what we want here is `externalModel="ocrd-tool" externalId="parameters"`
                    Labels=[
                        LabelsType(  #externalRef="parameters",
                            Label=[
                                LabelType(type_=name,
                                          value=self.parameter[name])
                                for name in self.parameter.keys()
                            ])
                    ]))
            page = pcgts.get_Page()

            regions = page.get_TextRegion()

            mark_for_deletion = set()
            mark_for_merging = set()

            for i in range(0, len(regions)):
                for j in range(i + 1, len(regions)):
                    LOG.info('Comparing regions "%s" and "%s"', regions[i].id,
                             regions[j].id)
                    region_poly1 = Polygon(
                        polygon_from_points(regions[i].get_Coords().points))
                    region_poly2 = Polygon(
                        polygon_from_points(regions[j].get_Coords().points))

                    LOG.debug('Checking for equality ...')
                    if region_poly1.almost_equals(region_poly2):
                        LOG.warning('Regions %s and %s cover the same area',
                                    regions[i].id, regions[j].id)
                        mark_for_deletion.add(j)

                    LOG.debug('Checking for containment ...')
                    if region_poly1.contains(region_poly2):
                        LOG.warning('Region %s contains %s',
                                    regions[i].id, regions[j].id)
                        mark_for_deletion.add(j)
                    if region_poly2.contains(region_poly1):
                        LOG.warning('Region %s contains %s',
                                    regions[j].id, regions[i].id)
                        mark_for_deletion.add(i)

            if plausibilize:
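                # keep only the regions that were not marked above as duplicates
                # of (or as contained within) another region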
                new_regions = []
                for i in range(0, len(regions)):
                    if i not in mark_for_deletion:
                        new_regions.append(regions[i])
                page.set_TextRegion(new_regions)

                #LOG.info('Intersection %i', region_poly1.intersects(region_poly2))
                #LOG.info('Containment %i', region_poly1.contains(region_poly2))
                #if region_poly1.intersects(region_poly2):
                #    LOG.info('Area 1 %d', region_poly1.area)
                #    LOG.info('Area 2 %d', region_poly2.area)
                #    LOG.info('Area intersect %d', region_poly1.intersection(region_poly2).area)

            # Use input_file's basename for the new file -
            # this way the files retain the same basenames:
            file_id = input_file.ID.replace(self.input_file_grp,
                                            self.output_file_grp)
            if file_id == input_file.ID:
                file_id = concat_padded(self.output_file_grp, n)
            self.workspace.add_file(ID=file_id,
                                    file_grp=self.output_file_grp,
                                    pageId=input_file.pageId,
                                    mimetype=MIMETYPE_PAGE,
                                    local_filename=os.path.join(
                                        self.output_file_grp,
                                        file_id + '.xml'),
                                    content=to_xml(pcgts))
Exemplo n.º 24
0
 def process(self):
     """Extract page images and region descriptions (type and coordinates) from the workspace.
     
     Open and deserialize PAGE input files and their respective images,
     then iterate over the element hierarchy down to the region level.
     
     Get all regions with their types (region element class), sub-types (@type)
     and coordinates relative to the page (which, depending on the workflow, could
     already be cropped, deskewed, dewarped, binarized etc.). Extract the image of
     the page, both in binarized and non-binarized form. In addition, create a new
     image which color-codes all regions. Create a JSON file with region types and
     coordinates.
     
     Write all files in the directory of the output file group, named like so:
     * ID + '.png': raw image
     * ID + '.bin.png': binarized image
     * ID + '.dbg.png': debug image
     * ID + '.json': region coordinates
     
     (This is intended for training and evaluation of region segmentation models.)
     """
     # pylint: disable=attribute-defined-outside-init
     for n, input_file in enumerate(self.input_files):
         file_id = input_file.ID.replace(self.input_file_grp,
                                         self.output_file_grp)
         page_id = input_file.pageId or input_file.ID
         LOG.info("INPUT FILE %i / %s", n, page_id)
         pcgts = page_from_file(self.workspace.download_file(input_file))
         page = pcgts.get_Page()
         ptype = page.get_type()
         metadata = pcgts.get_Metadata()  # ensured by from_file()
         metadata.add_MetadataItem(
             MetadataItemType(
                 type_="processingStep",
                 name=self.ocrd_tool['steps'][0],
                 value=TOOL,
                 Labels=[
                     LabelsType(externalModel="ocrd-tool",
                                externalId="parameters",
                                Label=[
                                    LabelType(type_=name,
                                              value=self.parameter[name])
                                    for name in self.parameter.keys()
                                ])
                 ]))
         page_image, page_coords, page_image_info = self.workspace.image_from_page(
             page,
             page_id,
             feature_filter='binarized',
             transparency=self.parameter['transparency'])
         if page_image_info.resolution != 1:
             dpi = page_image_info.resolution
             if page_image_info.resolutionUnit == 'cm':
                 dpi = round(dpi * 2.54)
         else:
             dpi = None
         file_path = self.workspace.save_image_file(page_image,
                                                    file_id,
                                                    self.output_file_grp,
                                                    page_id=page_id)
         page_image_bin, _, _ = self.workspace.image_from_page(
             page,
             page_id,
             feature_selector='binarized',
             transparency=self.parameter['transparency'])
         self.workspace.save_image_file(page_image_bin,
                                        file_id + '.bin',
                                        self.output_file_grp,
                                        page_id=page_id)
         page_image_dbg = Image.new(mode='RGB',
                                    size=page_image.size,
                                    color=0)
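          # collect all regions on the page by element class; the keys double as
          # the color classes used for the debug image below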
         regions = {
             'text': page.get_TextRegion(),
             'table': page.get_TableRegion(),
             'chart': page.get_ChartRegion(),
             'chem': page.get_ChemRegion(),
             'graphic': page.get_GraphicRegion(),
             'image': page.get_ImageRegion(),
             'linedrawing': page.get_LineDrawingRegion(),
             'maths': page.get_MathsRegion(),
             'music': page.get_MusicRegion(),
             'noise': page.get_NoiseRegion(),
             'separator': page.get_SeparatorRegion(),
             'unknown': page.get_UnknownRegion()
         }
         description = {'angle': page.get_orientation()}
         for rtype, rlist in regions.items():
             for region in rlist:
                 polygon = coordinates_of_segment(region, page_image,
                                                  page_coords).tolist()
                  description.setdefault('regions', []).append({
                      'type': rtype,
                      'subtype': (region.get_type()
                                  if rtype in ['text', 'chart', 'graphic']
                                  else None),
                      'coords': polygon,
                      'features': page_coords['features'],
                      'DPI': dpi,
                      'region.ID': region.id,
                      'page.ID': page_id,
                      'page.type': ptype,
                      'file_grp': self.input_file_grp,
                      'METS.UID': self.workspace.mets.unique_identifier
                  })
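                  # color-code the region in the debug image: fill with its class
                  # color and outline it with the border color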
                  ImageDraw.Draw(page_image_dbg).polygon(
                      list(map(tuple, polygon)), fill=CLASSES[rtype])
                  ImageDraw.Draw(page_image_dbg).line(
                      list(map(tuple, polygon + [polygon[0]])),
                      fill=CLASSES['border'], width=3)
         self.workspace.save_image_file(page_image_dbg,
                                        file_id + '.dbg',
                                        self.output_file_grp,
                                        page_id=page_id)
          file_path = file_path.replace('.png', '.json')
          with open(file_path, 'w') as json_file:
              json.dump(description, json_file)
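
# A minimal sketch (not part of the processor above, added for illustration) of
# how the files written by it could be collected for inspection or training; it
# assumes only the naming scheme from the docstring (ID + '.json' next to
# ID + '.png' and ID + '.dbg.png' in the output file group directory):
import json
from pathlib import Path

def iter_page_descriptions(output_dir):
    """Yield (page stem, parsed region description dict) for every JSON file."""
    for json_path in sorted(Path(output_dir).glob('*.json')):
        with json_path.open(encoding='utf-8') as json_file:
            yield json_path.stem, json.load(json_file)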
Exemplo n.º 25
0
    def process(self):
        """Extract region images from the workspace.
        
        Open and deserialize PAGE input files and their respective images,
        then iterate over the element hierarchy down to the region level.
        
        Extract an image for each region (which depending on the workflow
        can already be deskewed, dewarped, binarized etc.), cropped to its
        minimal bounding box, and masked by the coordinate polygon outline.
        If ``transparency`` is true, then also add an alpha channel which is
        fully transparent outside of the mask.
        
        Create a JSON file with:
        * the IDs of the region and its parents,
        * the region's coordinates relative to the region image,
        * the region's absolute coordinates,
        * the (text) region's text content (if any),
        * the (text) region's TextStyle (if any),
        * the (text) region's @production (if any),
        * the (text) region's @readingDirection (if any),
        * the (text) region's @textLineOrder (if any),
        * the (text) region's @primaryScript (if any),
        * the (text) region's @primaryLanguage (if any),
        * the region's AlternativeImage/@comments (features),
        * the region's element class,
        * the region's @type,
        * the page's @type,
        * the page's DPI value.
        
        Write all files in the directory of the output file group, named like so:
        * ID + '.raw.png': region image (if the workflow provides raw images)
        * ID + '.bin.png': region image (if the workflow provides binarized images)
        * ID + '.nrm.png': region image (if the workflow provides grayscale-normalized images)
        * ID + '.json': region metadata.
        """
        # pylint: disable=attribute-defined-outside-init
        for n, input_file in enumerate(self.input_files):
            file_id = input_file.ID.replace(self.input_file_grp, self.output_file_grp)
            if file_id == input_file.ID:
                file_id = concat_padded(self.output_file_grp, n)
            page_id = input_file.pageId or input_file.ID
            LOG.info("INPUT FILE %i / %s", n, page_id)
            pcgts = page_from_file(self.workspace.download_file(input_file))
            page = pcgts.get_Page()
            metadata = pcgts.get_Metadata() # ensured by from_file()
            metadata.add_MetadataItem(
                MetadataItemType(type_="processingStep",
                                 name=self.ocrd_tool['steps'][0],
                                 value=TOOL,
                                 Labels=[LabelsType(
                                     externalModel="ocrd-tool",
                                     externalId="parameters",
                                     Label=[LabelType(type_=name,
                                                      value=self.parameter[name])
                                            for name in self.parameter.keys()])]))
            page_image, page_coords, page_image_info = self.workspace.image_from_page(
                page, page_id,
                transparency=self.parameter['transparency'])
            if page_image_info.resolution != 1:
                dpi = page_image_info.resolution
                if page_image_info.resolutionUnit == 'cm':
                    dpi = round(dpi * 2.54)
            else:
                dpi = None
            ptype = page.get_type()

            regions = { 'advert': page.get_AdvertRegion(),
                        'text': page.get_TextRegion(),
                        'table': page.get_TableRegion(),
                        'chart': page.get_ChartRegion(),
                        'chem': page.get_ChemRegion(),
                        'graphic': page.get_GraphicRegion(),
                        'image': page.get_ImageRegion(),
                        'linedrawing': page.get_LineDrawingRegion(),
                        'maths': page.get_MathsRegion(),
                        'music': page.get_MusicRegion(),
                        'noise': page.get_NoiseRegion(),
                        'separator': page.get_SeparatorRegion(),
                        'unknown': page.get_UnknownRegion()
            }
            for rtype, rlist in regions.items():
                for region in rlist:
                    description = { 'region.ID': region.id, 'region.type': rtype }
                    region_image, region_coords = self.workspace.image_from_segment(
                        region, page_image, page_coords,
                        transparency=self.parameter['transparency'])
                    description['subtype'] = region.get_type() if rtype in ['text', 'chart', 'graphic'] else None
                    description['coords_rel'] = coordinates_of_segment(
                        region, region_image, region_coords).tolist()
                    description['coords_abs'] = polygon_from_points(region.get_Coords().points)
                    if rtype == 'text':
                        rtext = region.get_TextEquiv()
                        if rtext:
                            description['region.text'] = rtext[0].Unicode
                        else:
                            description['region.text'] = ''
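                        # fall back to the page-level TextStyle if the region
                        # does not define its own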
                        rstyle = region.get_TextStyle() or page.get_TextStyle()
                        if rstyle:
                            description['region.style'] = {
                                'fontFamily': rstyle.fontFamily,
                                'fontSize': rstyle.fontSize,
                                'xHeight': rstyle.xHeight,
                                'kerning': rstyle.kerning,
                                'serif': rstyle.serif,
                                'monospace': rstyle.monospace,
                                'bold': rstyle.bold,
                                'italic': rstyle.italic,
                                'smallCaps': rstyle.smallCaps,
                                'letterSpaced': rstyle.letterSpaced,
                                'strikethrough': rstyle.strikethrough,
                                'underlined': rstyle.underlined,
                                'underlineStyle': rstyle.underlineStyle,
                                'subscript': rstyle.subscript,
                                'superscript': rstyle.superscript
                            }
                        description['production'] = region.get_production()
                        description['readingDirection'] = (
                            region.get_readingDirection() or
                            page.get_readingDirection())
                        description['textLineOrder'] = (
                            region.get_textLineOrder() or
                            page.get_textLineOrder())
                        description['primaryScript'] = (
                            region.get_primaryScript() or
                            page.get_primaryScript())
                        description['primaryLanguage'] = (
                            region.get_primaryLanguage() or
                            page.get_primaryLanguage())
                    description['features'] = region_coords['features']
                    description['DPI'] = dpi
                    description['page.ID'] = page_id
                    description['page.type'] = ptype
                    description['file_grp'] = self.input_file_grp
                    description['METS.UID'] = self.workspace.mets.unique_identifier
                    if 'binarized' in region_coords['features']:
                        extension = '.bin'
                    elif 'grayscale_normalized' in region_coords['features']:
                        extension = '.nrm'
                    else:
                        extension = '.raw'
                    
                    file_path = self.workspace.save_image_file(
                        region_image,
                        file_id + '_' + region.id + extension,
                        self.output_file_grp,
                        page_id=page_id,
                        format='PNG')
                    file_path = file_path.replace(extension + '.png', '.json')
                    with open(file_path, 'w') as json_file:
                        json.dump(description, json_file)
Exemplo n.º 26
0
    def process(self):
        """Performs segmentation on the input binary image

        Produces a PageXML file as output.
        """
        overwrite_regions = self.parameter['overwrite_regions']

        try:
            self.page_grp, self.image_grp = self.output_file_grp.split(',')
        except ValueError:
            self.page_grp = self.output_file_grp
            self.image_grp = FALLBACK_IMAGE_GRP
            LOG.info(
                "No output file group for images specified, falling back to '%s'",
                FALLBACK_IMAGE_GRP)
        for n, input_file in enumerate(self.input_files):
            file_id = input_file.ID.replace(self.input_file_grp,
                                            self.image_grp)
            page_id = input_file.pageId or input_file.ID
            LOG.info("INPUT FILE %i / %s", n, page_id)
            pcgts = page_from_file(self.workspace.download_file(input_file))
            metadata = pcgts.get_Metadata()  # ensured by from_file()
            metadata.add_MetadataItem(
                MetadataItemType(
                    type_="processingStep",
                    name=self.ocrd_tool['steps'][0],
                    value=TOOL,
                    # FIXME: externalRef is invalid by pagecontent.xsd, but ocrd does not reflect this
                    # what we want here is `externalModel="ocrd-tool" externalId="parameters"`
                    Labels=[
                        LabelsType(  # externalRef="parameters",
                            Label=[
                                LabelType(type_=name,
                                          value=self.parameter[name])
                                for name in self.parameter.keys()
                            ])
                    ]))
            page = pcgts.get_Page()
            if page.get_TextRegion():
                if overwrite_regions:
                    LOG.info('removing existing TextRegions')
                    page.set_TextRegion([])
                else:
                    LOG.warning('keeping existing TextRegions')

            page.set_AdvertRegion([])
            page.set_ChartRegion([])
            page.set_ChemRegion([])
            page.set_GraphicRegion([])
            page.set_ImageRegion([])
            page.set_LineDrawingRegion([])
            page.set_MathsRegion([])
            page.set_MusicRegion([])
            page.set_NoiseRegion([])
            page.set_SeparatorRegion([])
            page.set_TableRegion([])
            page.set_UnknownRegion([])

            page_image, page_xywh, _ = image_from_page(self.workspace, page,
                                                       page_id)

            self._process_page(page, page_image, page_xywh, input_file.pageId,
                               file_id)

            # Use input_file's basename for the new file -
            # this way the files retain the same basenames:
            file_id = input_file.ID.replace(self.input_file_grp, self.page_grp)
            if file_id == input_file.ID:
                file_id = concat_padded(self.page_grp, n)
            self.workspace.add_file(ID=file_id,
                                    file_grp=self.page_grp,
                                    pageId=input_file.pageId,
                                    mimetype=MIMETYPE_PAGE,
                                    local_filename=os.path.join(
                                        self.page_grp, file_id + '.xml'),
                                    content=to_xml(pcgts))
Exemplo n.º 27
0
 def process(self):
     """
     Performs the (text) recognition.
     """
     # print(self.parameter)
     log.debug("TESSDATA: %s, installed tesseract models: %s",
               *get_languages())
     maxlevel = self.parameter['textequiv_level']
     model = get_languages()[1][-1]  # last installed model
     if 'model' in self.parameter:
         model = self.parameter['model']
         if model not in get_languages()[1]:
             raise Exception("configured model " + model +
                             " is not installed")
     with PyTessBaseAPI(path=TESSDATA_PREFIX, lang=model) as tessapi:
         log.info("Using model '%s' in %s for recognition at the %s level",
                  model,
                  get_languages()[0], maxlevel)
         # todo: populate GetChoiceIterator() with LSTM models, too:
         #tessapi.SetVariable("lstm_choice_mode", "2")
         # todo: determine relevancy of these variables:
         # tessapi.SetVariable("tessedit_single_match", "0")
         #
         # tessedit_load_sublangs
         # tessedit_preserve_min_wd_len 2
         # tessedit_prefer_joined_punct 0
         # tessedit_write_rep_codes 0
         # tessedit_parallelize 0
         # tessedit_zero_rejection 0
         # tessedit_zero_kelvin_rejection 0
         # tessedit_reject_mode 0
         # tessedit_use_reject_spaces 1
         # tessedit_fix_fuzzy_spaces 1
         # tessedit_char_blacklist
         # tessedit_char_whitelist
         # chs_leading_punct ('`"
         # chs_trailing_punct1 ).,;:?!
         # chs_trailing_punct2 )'`"
         # numeric_punctuation .,
         # unrecognised_char |
         # ok_repeated_ch_non_alphanum_wds -?*=
         # conflict_set_I_l_1 Il1[]
         # preserve_interword_spaces 0
         # tessedit_enable_dict_correction 0
         # tessedit_enable_bigram_correction 1
         # stopper_smallword_size 2
         # wordrec_max_join_chunks 4
         # suspect_space_level 100
         # suspect_short_words 2
         # language_model_ngram_on 0
         # language_model_ngram_order 8
         # language_model_min_compound_length 3
         # language_model_penalty_non_freq_dict_word 0.1
         # language_model_penalty_non_dict_word 0.15
         # language_model_penalty_punc 0.2
         # language_model_penalty_case 0.1
         # language_model_penalty_script 0.5
         # language_model_penalty_chartype 0.3
         # language_model_penalty_spacing 0.05
         # textord_max_noise_size 7
         # enable_noise_removal 1
         # classify_bln_numeric_mode 0
         # lstm_use_matrix 1
         # user_words_file
         # user_patterns_file
         for (n, input_file) in enumerate(self.input_files):
             log.info("INPUT FILE %i / %s", n, input_file)
             pcgts = page_from_file(
                 self.workspace.download_file(input_file))
             # TODO use binarized / gray
             pil_image = self.workspace.resolve_image_as_pil(
                 pcgts.get_Page().imageFilename)
             tessapi.SetImage(pil_image)
             metadata = pcgts.get_Metadata()  # ensured by from_file()
             metadata.add_MetadataItem(
                 MetadataItemType(
                     type_="processingStep",
                     name=OCRD_TOOL['tools']['ocrd-tesserocr-recognize']
                     ['steps'][0],
                     value='ocrd-tesserocr-recognize',
                     Labels=[
                         LabelsType(externalRef="parameters",
                                    Label=[
                                        LabelType(
                                            type_=name,
                                            value=self.parameter[name])
                                        for name in self.parameter.keys()
                                    ])
                     ]))
             log.info("Recognizing text in page '%s'", pcgts.get_pcGtsId())
             regions = pcgts.get_Page().get_TextRegion()
             if not regions:
                 log.warning("Page contains no text regions")
             self._process_regions(regions, maxlevel, tessapi)
             ID = concat_padded(self.output_file_grp, n)
             self.workspace.add_file(
                 ID=ID,
                 file_grp=self.output_file_grp,
                 mimetype=MIMETYPE_PAGE,
                 local_filename='%s/%s' % (self.output_file_grp, ID),
                 content=to_xml(pcgts),
             )
Exemplo n.º 28
0
    def process(self):
        """Extract textline images and texts from the workspace.
        
        Open and deserialize PAGE input files and their respective images,
        then iterate over the element hierarchy down to the line level.
        
        Extract an image for each textline (which depending on the workflow
        can already be deskewed, dewarped, binarized etc.), cropped to its
        minimal bounding box, and masked by the coordinate polygon outline.
        If ``transparency`` is true, then also add an alpha channel which is
        fully transparent outside of the mask.
        
        Create a JSON file with:
        * the IDs of the textline and its parents,
        * the textline's text content,
        * the textline's coordinates relative to the line image,
        * the textline's absolute coordinates,
        * the textline's TextStyle (if any),
        * the textline's @production (if any),
        * the textline's @readingDirection (if any),
        * the textline's @primaryScript (if any),
        * the textline's @primaryLanguage (if any),
        * the textline's AlternativeImage/@comments (features),
        * the parent textregion's @type,
        * the page's @type,
        * the page's DPI value.
        
        Create a plain text file for the text content, too.
        
        Write all files in the directory of the output file group, named like so:
        * ID + '.raw.png': line image (if the workflow provides raw images)
        * ID + '.bin.png': line image (if the workflow provides binarized images)
        * ID + '.nrm.png': line image (if the workflow provides grayscale-normalized images)
        * ID + '.json': line metadata.
        * ID + '.gt.txt': line text.
        
        (This is intended for training and evaluation of OCR models.)
        """
        # pylint: disable=attribute-defined-outside-init
        for n, input_file in enumerate(self.input_files):
            file_id = input_file.ID.replace(self.input_file_grp,
                                            self.output_file_grp)
            if file_id == input_file.ID:
                file_id = concat_padded(self.output_file_grp, n)
            page_id = input_file.pageId or input_file.ID
            LOG.info("INPUT FILE %i / %s", n, page_id)
            pcgts = page_from_file(self.workspace.download_file(input_file))
            page = pcgts.get_Page()
            metadata = pcgts.get_Metadata()  # ensured by from_file()
            metadata.add_MetadataItem(
                MetadataItemType(
                    type_="processingStep",
                    name=self.ocrd_tool['steps'][0],
                    value=TOOL,
                    Labels=[
                        LabelsType(externalModel="ocrd-tool",
                                   externalId="parameters",
                                   Label=[
                                       LabelType(type_=name,
                                                 value=self.parameter[name])
                                       for name in self.parameter.keys()
                                   ])
                    ]))
            page_image, page_coords, page_image_info = self.workspace.image_from_page(
                page, page_id, transparency=self.parameter['transparency'])
            if page_image_info.resolution != 1:
                dpi = page_image_info.resolution
                if page_image_info.resolutionUnit == 'cm':
                    dpi = round(dpi * 2.54)
            else:
                dpi = None
            ptype = page.get_type()

            # materialize the iterator so the emptiness check below actually works
            regions = list(itertools.chain.from_iterable(
                [page.get_TextRegion()] +
                [subregion.get_TextRegion()
                 for subregion in page.get_TableRegion()]))
            if not regions:
                LOG.warning("Page '%s' contains no text regions", page_id)
            for region in regions:
                region_image, region_coords = self.workspace.image_from_segment(
                    region,
                    page_image,
                    page_coords,
                    transparency=self.parameter['transparency'])
                rtype = region.get_type()

                lines = region.get_TextLine()
                if not lines:
                    LOG.warning("Region '%s' contains no text lines",
                                region.id)
                for line in lines:
                    line_image, line_coords = self.workspace.image_from_segment(
                        line,
                        region_image,
                        region_coords,
                        transparency=self.parameter['transparency'])
                    lpolygon_rel = coordinates_of_segment(
                        line, line_image, line_coords).tolist()
                    lpolygon_abs = polygon_from_points(
                        line.get_Coords().points)
                    ltext = line.get_TextEquiv()
                    if not ltext:
                        LOG.warning("Line '%s' contains no text conent",
                                    line.id)
                        ltext = ''
                    else:
                        ltext = ltext[0].Unicode
                    lstyle = line.get_TextStyle() or region.get_TextStyle()
                    if lstyle:
                        lstyle = {
                            'fontFamily': lstyle.fontFamily,
                            'fontSize': lstyle.fontSize,
                            'xHeight': lstyle.xHeight,
                            'kerning': lstyle.kerning,
                            'serif': lstyle.serif,
                            'monospace': lstyle.monospace,
                            'bold': lstyle.bold,
                            'italic': lstyle.italic,
                            'smallCaps': lstyle.smallCaps,
                            'letterSpaced': lstyle.letterSpaced,
                            'strikethrough': lstyle.strikethrough,
                            'underlined': lstyle.underlined,
                            'underlineStyle': lstyle.underlineStyle,
                            'subscript': lstyle.subscript,
                            'superscript': lstyle.superscript
                        }
                    lfeatures = line_coords['features']
                    description = {
                        'line.ID': line.id,
                        'text': ltext,
                        'style': lstyle,
                        'production': (line.get_production()
                                       or region.get_production()),
                        'readingDirection': (line.get_readingDirection()
                                             or region.get_readingDirection()
                                             or page.get_readingDirection()),
                        'primaryScript': (line.get_primaryScript()
                                          or region.get_primaryScript()
                                          or page.get_primaryScript()),
                        'primaryLanguage': (line.get_primaryLanguage()
                                            or region.get_primaryLanguage()
                                            or page.get_primaryLanguage()),
                        'features': lfeatures,
                        'DPI': dpi,
                        'coords_rel': lpolygon_rel,
                        'coords_abs': lpolygon_abs,
                        'region.ID': region.id,
                        'region.type': rtype,
                        'page.ID': page_id,
                        'page.type': ptype,
                        'file_grp': self.input_file_grp,
                        'METS.UID': self.workspace.mets.unique_identifier
                    }
                    if 'binarized' in lfeatures:
                        extension = '.bin'
                    elif 'grayscale_normalized' in lfeatures:
                        extension = '.nrm'
                    else:
                        extension = '.raw'

                    file_path = self.workspace.save_image_file(
                        line_image,
                        file_id + '_' + region.id + '_' + line.id + extension,
                        self.output_file_grp,
                        page_id=page_id,
                        mimetype=self.parameter['mimetype'])
                    file_path = file_path.replace(
                        extension + MIME_TO_EXT[self.parameter['mimetype']],
                        '.json')
                    with open(file_path, 'w') as json_file:
                        json.dump(description, json_file)
                    file_path = file_path.replace('.json', '.gt.txt')
                    with open(file_path, 'wb') as f:
                        f.write((ltext + '\n').encode('utf-8'))
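
# A minimal sketch (not part of the processor above, added for illustration) of
# how the exported line pairs could be gathered for OCR training; it relies only
# on the naming scheme from the docstring, and the '.bin.png' suffix is just an
# assumption for workflows that provide binarized images:
from pathlib import Path

def iter_line_ground_truth(output_dir, image_suffix='.bin.png'):
    """Yield (line image path, transcription) for every exported '.gt.txt' file."""
    for txt_path in sorted(Path(output_dir).glob('*.gt.txt')):
        image_path = txt_path.with_name(
            txt_path.name.replace('.gt.txt', image_suffix))
        if image_path.exists():
            yield image_path, txt_path.read_text(encoding='utf-8').rstrip('\n')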
Exemplo n.º 29
0
    def process(self):
        """
        Performs the recognition.
        """

        assert_file_grp_cardinality(self.input_file_grp, 1)
        assert_file_grp_cardinality(self.output_file_grp, 1)

        self._init_calamari()

        for (n, input_file) in enumerate(self.input_files):
            page_id = input_file.pageId or input_file.ID
            log.info("INPUT FILE %i / %s", n, page_id)
            pcgts = page_from_file(self.workspace.download_file(input_file))

            page = pcgts.get_Page()
            page_image, page_xywh, page_image_info = self.workspace.image_from_page(
                page, page_id)

            for region in pcgts.get_Page().get_TextRegion():
                region_image, region_xywh = self.workspace.image_from_segment(
                    region, page_image, page_xywh)

                textlines = region.get_TextLine()
                log.info("About to recognize %i lines of region '%s'",
                         len(textlines), region.id)
                for (line_no, line) in enumerate(textlines):
                    log.debug("Recognizing line '%s' in region '%s'", line.id,
                              region.id)

                    line_image, line_coords = self.workspace.image_from_segment(
                        line, region_image, region_xywh)
                    line_image_np = np.array(line_image, dtype=np.uint8)

                    raw_results = list(
                        self.predictor.predict_raw([line_image_np],
                                                   progress_bar=False))[0]
                    for i, p in enumerate(raw_results):
                        p.prediction.id = "fold_{}".format(i)

                    prediction = self.voter.vote_prediction_result(raw_results)
                    prediction.id = "voted"

                    # Build line text on our own
                    #
                    # Calamari does whitespace post-processing on prediction.sentence, while it does not do the same
                    # on prediction.positions. Do it on our own to have consistency.
                    #
                    # XXX Check Calamari's built-in post-processing on prediction.sentence

                    def _sort_chars(p):
                        """Filter and sort chars of prediction p"""
                        chars = p.chars
                        chars = [
                            c for c in chars if c.char
                        ]  # XXX Note that omission probabilities are not normalized?!
                        chars = [
                            c for c in chars if c.probability >=
                            self.parameter['glyph_conf_cutoff']
                        ]
                        chars = sorted(chars,
                                       key=lambda k: k.probability,
                                       reverse=True)
                        return chars
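                    # _sort_chars(p)[0].char is thus the most probable visible
                    # character at position p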

                    def _drop_leading_spaces(positions):
                        return list(
                            itertools.dropwhile(
                                lambda p: _sort_chars(p)[0].char == " ",
                                positions))

                    def _drop_trailing_spaces(positions):
                        return list(
                            reversed(_drop_leading_spaces(
                                reversed(positions))))

                    def _drop_double_spaces(positions):
                        def _drop_double_spaces_generator(positions):
                            last_was_space = False
                            for p in positions:
                                if p.chars[0].char == " ":
                                    if not last_was_space:
                                        yield p
                                    last_was_space = True
                                else:
                                    yield p
                                    last_was_space = False

                        return list(_drop_double_spaces_generator(positions))

                    positions = prediction.positions
                    positions = _drop_leading_spaces(positions)
                    positions = _drop_trailing_spaces(positions)
                    positions = _drop_double_spaces(positions)
                    positions = list(positions)
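                    # positions now holds one entry per output character, with
                    # leading, trailing and duplicate spaces removed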

                    line_text = ''.join(
                        _sort_chars(p)[0].char for p in positions)
                    if line_text != prediction.sentence:
                        log.warning(
                            "Our own line text is not the same as Calamari's: '%s' != '%s'",
                            line_text, prediction.sentence)

                    # Delete existing results
                    if line.get_TextEquiv():
                        log.warning("Line '%s' already contained text results",
                                    line.id)
                    line.set_TextEquiv([])
                    if line.get_Word():
                        log.warning(
                            "Line '%s' already contained word segmentation",
                            line.id)
                    line.set_Word([])

                    # Save line results
                    line_conf = prediction.avg_char_probability
                    line.set_TextEquiv(
                        [TextEquivType(Unicode=line_text, conf=line_conf)])

                    # Save word results
                    #
                    # Calamari OCR does not provide word positions, so we infer word positions from a. text segmentation
                    # and b. the glyph positions. This is necessary because the PAGE XML format enforces a strict
                    # hierarchy of lines > words > glyphs.

                    def _words(s):
                        """Split words based on spaces and include spaces as 'words'"""
                        spaces = None
                        word = ''
                        for c in s:
                            if c == ' ' and spaces is True:
                                word += c
                            elif c != ' ' and spaces is False:
                                word += c
                            else:
                                if word:
                                    yield word
                                word = c
                                spaces = (c == ' ')
                        yield word
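                    # e.g. list(_words('ab  cd')) yields ['ab', '  ', 'cd'] --
                    # runs of spaces are kept as separate 'words'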

                    if self.parameter['textequiv_level'] in ['word', 'glyph']:
                        word_no = 0
                        i = 0

                        for word_text in _words(line_text):
                            word_length = len(word_text)
                            if not all(c == ' ' for c in word_text):
                                word_positions = positions[i:i + word_length]
                                word_start = word_positions[0].global_start
                                word_end = word_positions[-1].global_end
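                                # approximate the word outline as a box spanning the
                                # full line height between first and last glyph position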

                                polygon = polygon_from_x0y0x1y1([
                                    word_start, 0, word_end, line_image.height
                                ])
                                points = points_from_polygon(
                                    coordinates_for_segment(
                                        polygon, None, line_coords))
                                # XXX Crop to line polygon?

                                word = WordType(id='%s_word%04d' %
                                                (line.id, word_no),
                                                Coords=CoordsType(points))
                                word.add_TextEquiv(
                                    TextEquivType(Unicode=word_text))

                                if self.parameter['textequiv_level'] == 'glyph':
                                    for glyph_no, p in enumerate(
                                            word_positions):
                                        glyph_start = p.global_start
                                        glyph_end = p.global_end

                                        polygon = polygon_from_x0y0x1y1([
                                            glyph_start, 0, glyph_end,
                                            line_image.height
                                        ])
                                        points = points_from_polygon(
                                            coordinates_for_segment(
                                                polygon, None, line_coords))

                                        glyph = GlyphType(
                                            id='%s_glyph%04d' %
                                            (word.id, glyph_no),
                                            Coords=CoordsType(points))

                                        # Add predictions (= TextEquivs)
                                        char_index_start = 1  # Must start with 1, see https://ocr-d.github.io/page#multiple-textequivs
                                        for char_index, char in enumerate(
                                                _sort_chars(p),
                                                start=char_index_start):
                                            glyph.add_TextEquiv(
                                                TextEquivType(
                                                    Unicode=char.char,
                                                    index=char_index,
                                                    conf=char.probability))

                                        word.add_Glyph(glyph)

                                line.add_Word(word)
                                word_no += 1

                            i += word_length

            _page_update_higher_textequiv_levels('line', pcgts)
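            # presumably concatenates the new line texts into TextEquivs at the
            # region and page level (helper not shown here)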

            # Add metadata about this operation and its runtime parameters:
            metadata = pcgts.get_Metadata()  # ensured by from_file()
            metadata.add_MetadataItem(
                MetadataItemType(
                    type_="processingStep",
                    name=self.ocrd_tool['steps'][0],
                    value=TOOL,
                    Labels=[
                        LabelsType(externalModel="ocrd-tool",
                                   externalId="parameters",
                                   Label=[
                                       LabelType(type_=name,
                                                 value=self.parameter[name])
                                       for name in self.parameter.keys()
                                   ])
                    ]))

            file_id = make_file_id(input_file, self.output_file_grp)
            pcgts.set_pcGtsId(file_id)
            self.workspace.add_file(ID=file_id,
                                    file_grp=self.output_file_grp,
                                    pageId=input_file.pageId,
                                    mimetype=MIMETYPE_PAGE,
                                    local_filename=os.path.join(
                                        self.output_file_grp,
                                        file_id + '.xml'),
                                    content=to_xml(pcgts))
Exemplo n.º 30
0
    def process(self):
        """Performs segmentation evaluation with Shapely on the workspace.
        
        Open and deserialize PAGE input files and their respective images,
        then iterate over the element hierarchy down to the region level.
        
        Report findings on the plausibility of the region segmentation
        via log messages.
        """
        sanitize = self.parameter['sanitize']
        plausibilize = self.parameter['plausibilize']

        for (n, input_file) in enumerate(self.input_files):
            page_id = input_file.pageId or input_file.ID
            LOG.info("INPUT FILE %i / %s", n, page_id)
            pcgts = page_from_file(self.workspace.download_file(input_file))
            page = pcgts.get_Page()
            metadata = pcgts.get_Metadata()  # ensured by from_file()
            metadata.add_MetadataItem(
                MetadataItemType(
                    type_="processingStep",
                    name=self.ocrd_tool['steps'][0],
                    value=TOOL,
                    Labels=[
                        LabelsType(externalModel="ocrd-tool",
                                   externalId="parameters",
                                   Label=[
                                       LabelType(type_=name,
                                                 value=self.parameter[name])
                                       for name in self.parameter.keys()
                                   ])
                    ]))

            #
            # validate segmentation (warn of children extending beyond their parents)
            #
            self.validate_coords(page, page_id)

            #
            # sanitize region segmentation (shrink to hull of lines)
            #
            if sanitize:
                self.sanitize_page(page, page_id)

            #
            # plausibilize region segmentation (remove redundant text regions)
            #
            mark_for_deletion = list()  # what regions get removed?
            mark_for_merging = dict()  # what regions get merged into which regions?

            # TODO: cover recursive region structure (but compare only at the same level)
            regions = page.get_TextRegion()
            # sort by area to ensure to arrive at a total ordering compatible
            # with the topological sort along containment/equivalence arcs
            # (so we can avoid substituting regions with superregions that have
            #  themselves been substituted/deleted):
            RegionPolygon = namedtuple('RegionPolygon', ['region', 'polygon'])
            regionspolys = sorted(
                [RegionPolygon(region,
                               Polygon(polygon_from_points(region.get_Coords().points)))
                 for region in regions],
                key=lambda x: x.polygon.area)
            for i in range(0, len(regionspolys)):
                for j in range(i + 1, len(regionspolys)):
                    region1 = regionspolys[i].region
                    region2 = regionspolys[j].region
                    poly1 = regionspolys[i].polygon
                    poly2 = regionspolys[j].polygon
                    LOG.debug('Comparing regions "%s" and "%s"', region1.id,
                              region2.id)

                    if poly1.almost_equals(poly2):
                        LOG.warning(
                            'Page "%s" region "%s" is almost equal to "%s" %s',
                            page_id, region2.id, region1.id,
                            '(removing)' if plausibilize else '')
                        mark_for_deletion.append(region2.id)
                    elif poly1.contains(poly2):
                        LOG.warning('Page "%s" region "%s" is within "%s" %s',
                                    page_id, region2.id, region1.id,
                                    '(removing)' if plausibilize else '')
                        mark_for_deletion.append(region2.id)
                    elif poly2.contains(poly1):
                        LOG.warning('Page "%s" region "%s" is within "%s" %s',
                                    page_id, region1.id, region2.id,
                                    '(removing)' if plausibilize else '')
                        mark_for_deletion.append(region1.id)
                    elif poly1.overlaps(poly2):
                        inter_poly = poly1.intersection(poly2)
                        union_poly = poly1.union(poly2)
                        LOG.debug(
                            'Page "%s" region "%s" overlaps "%s" by %f/%f',
                            page_id, region1.id, region2.id,
                            inter_poly.area / poly1.area,
                            inter_poly.area / poly2.area)
                        if union_poly.convex_hull.area >= poly1.area + poly2.area:
                            # skip this pair -- combined polygon encloses previously free segments
                            pass
                        elif inter_poly.area / poly2.area > self.parameter[
                                'plausibilize_merge_min_overlap']:
                            LOG.warning(
                                'Page "%s" region "%s" is almost within "%s" %s',
                                page_id, region2.id, region1.id,
                                '(merging)' if plausibilize else '')
                            mark_for_merging[region2.id] = region1
                        elif inter_poly.area / poly1.area > self.parameter[
                                'plausibilize_merge_min_overlap']:
                            LOG.warning(
                                'Page "%s" region "%s" is almost within "%s" %s',
                                page_id, region1.id, region2.id,
                                '(merging)' if plausibilize else '')
                            mark_for_merging[region1.id] = region2

                    # TODO: more merging cases...
                    #LOG.info('Intersection %i', poly1.intersects(poly2))
                    #LOG.info('Containment %i', poly1.contains(poly2))
                    #if poly1.intersects(poly2):
                    #    LOG.info('Area 1 %d', poly1.area)
                    #    LOG.info('Area 2 %d', poly2.area)
                    #    LOG.info('Area intersect %d', poly1.intersection(poly2).area)

            if plausibilize:
                # the reading order does not have to include all regions
                # but it may include all types of regions!
                ro = page.get_ReadingOrder()
                if ro:
                    rogroup = ro.get_OrderedGroup() or ro.get_UnorderedGroup()
                else:
                    rogroup = None
                # pass the regions sorted (see above)
                _plausibilize_group(regionspolys, rogroup, mark_for_deletion,
                                    mark_for_merging)

            # Use input_file's basename for the new file -
            # this way the files retain the same basenames:
            file_id = input_file.ID.replace(self.input_file_grp,
                                            self.output_file_grp)
            if file_id == input_file.ID:
                file_id = concat_padded(self.output_file_grp, n)
            self.workspace.add_file(ID=file_id,
                                    file_grp=self.output_file_grp,
                                    pageId=input_file.pageId,
                                    mimetype=MIMETYPE_PAGE,
                                    local_filename=os.path.join(
                                        self.output_file_grp,
                                        file_id + '.xml'),
                                    content=to_xml(pcgts))