Exemplo n.º 1
0
 def estimate_bounds(self, page, page_image, tessapi, zoom=1.0):
     """Compute the box enclosing all detected and already annotated text regions.

     Runs Tesseract in sparse-text mode on ``page_image`` and iterates over
     the resulting blocks. A block is skipped when its component image has
     no bounding box at all, or when that bounding box is narrower or lower
     than ``25 / zoom`` pixels. Afterwards, the bounding boxes of the text
     regions returned by ``page.get_AllRegions(classes=['Text'])`` are
     merged in as well.

     Returns a tuple ``(left, top, right, bottom)``. When no region
     contributes, the initial values are returned unchanged, i.e.
     ``left``/``top`` equal the image width/height and ``right``/``bottom``
     are 0 (so ``left >= right`` signals "nothing found").
     """
     LOG = getLogger('processor.TesserocrCrop')
     # start from an inverted box, so the first accepted region defines it
     min_left, min_top = page_image.width, page_image.height
     max_right, max_bottom = 0, 0
     LOG.info("Cropping with Tesseract")
     tessapi.SetImage(page_image)
     # PSM.SPARSE_TEXT: get as much text as possible in no particular order
     # PSM.AUTO (default): includes tables (dangerous)
     # PSM.SPARSE_TEXT_OSD: sparse but all orientations
     tessapi.SetPageSegMode(tesserocr.PSM.SPARSE_TEXT)
     # walk through all text blocks, widening the running extent
     # by each block that survives the size filters
     blocks = tessapi.GetComponentImages(tesserocr.RIL.BLOCK, True)
     for block_image, block_xywh, block_index, _ in blocks:
         # identifier only used for log messages here
         ID = "region%04d" % block_index
         left, top, right, bottom = bbox_from_xywh(block_xywh)
         LOG.debug("Detected text region '%s': %i:%i,%i:%i",
                   ID, left, right, top, bottom)
         # filter by the extent of the block's own component image:
         ink_bbox = block_image.getbbox()
         if not ink_bbox:
             # this does happen!
             LOG.info("Ignoring region '%s' because its binarization is empty", ID)
             continue
         ink_width = ink_bbox[2] - ink_bbox[0]
         if ink_width < 25 / zoom:
             # stay conservative: page numbers are tiny regions, too!
             LOG.info("Ignoring region '%s' because its width is too small (%d)", ID, ink_width)
             continue
         ink_height = ink_bbox[3] - ink_bbox[1]
         if ink_height < 25 / zoom:
             # stay conservative: page numbers are tiny regions, too!
             LOG.debug("Ignoring region '%s' because its height is too small (%d)", ID, ink_height)
             continue
         min_left, min_top = min(min_left, left), min(min_top, top)
         max_right, max_bottom = max(max_right, right), max(max_bottom, bottom)
     # existing segmentation acts as an "upper bound": never crop into it
     for region in page.get_AllRegions(classes=['Text']):
         left, top, right, bottom = bbox_from_points(region.get_Coords().points)
         LOG.debug("Found existing text region '%s': %i:%i,%i:%i",
                   region.id, left, right, top, bottom)
         min_left, min_top = min(min_left, left), min(min_top, top)
         max_right, max_bottom = max(max_right, right), max(max_bottom, bottom)
     LOG.info("Combined page bounds from text regions: %i:%i,%i:%i",
              min_left, max_right, min_top, max_bottom)
     return min_left, min_top, max_right, max_bottom
Exemplo n.º 2
0
 def convert_text(self):
     """Convert all top-level PAGE regions into ALTO block elements.

     For every region returned by ``get_AllRegions(depth=0, ...)``, look up
     the ALTO element name in ``REGION_PAGE_TO_ALTO``, pick a parent
     (PrintSpace if it contains the region's bounding box, otherwise the
     first containing margin, otherwise PrintSpace again with a warning),
     create the ALTO element and copy ID, position, shape, language, styles
     and layout tags as far as the configured ALTO version allows. Text and
     table regions are then descended into.

     Raises:
         KeyError: if the region type has no entry in ``REGION_PAGE_TO_ALTO``.
         ValueError: if the entry exists but is empty/falsy.
     """
     for reg_page in self.page_page.get_AllRegions(depth=0,
                                                   order=self.region_order):
         # class names look like 'TextRegionType': strip the common suffix
         reg_page_type = reg_page.__class__.__name__[:-len('RegionType')]
         reg_alto_type = REGION_PAGE_TO_ALTO[reg_page_type]
         if not reg_alto_type:
             raise ValueError("Cannot handle PAGE-XML %sRegion" %
                              reg_page_type)
         # determine if the region belongs to PrintSpace or to any of the Margins
         reg_bbox = bbox_from_points(reg_page.get_Coords().points)
         if contains(self.alto_printspace, reg_bbox):
             parent = self.alto_printspace
         else:
             parent = None
             for margin_name in ('LeftMargin', 'RightMargin', 'TopMargin',
                                 'BottomMargin'):
                 if not hasattr(self.alto_page, margin_name):
                     continue
                 margin = getattr(self.alto_page, margin_name)
                 if contains(margin, reg_bbox):
                     parent = margin
                     break  # pick first match only
             # Compare against None explicitly: an XML element without
             # children evaluates as false, so a truthiness test would
             # throw away a margin that was matched but is still empty.
             if parent is None:
                 parent = self.alto_printspace
                 self.logger.warning(
                     "region '%s' not properly contained in PrintSpace or Margins",
                     reg_page.id)
         reg_alto = ET.SubElement(parent, reg_alto_type)
         set_alto_id_from_page_id(reg_alto, reg_page)
         set_alto_xywh_from_coords(reg_alto, reg_page)
         # parse the target version once per region instead of three times
         alto_version = version.parse(self.alto_version)
         supports_shape = alto_version >= version.parse('3.1')
         supports_lang_and_tags = alto_version >= version.parse('2.1')
         if supports_shape:
             set_alto_shape_from_coords(reg_alto, reg_page)
         if supports_lang_and_tags:
             set_alto_lang_from_page_lang(reg_alto, reg_page)
         self.textstyle_mgr.set_alto_styleref_from_textstyle(
             reg_alto, reg_page)
         self.parastyle_mgr.set_alto_styleref_from_textstyle(
             reg_alto, reg_page)
         if supports_lang_and_tags:
             self.layouttag_mgr.set_alto_tag_from_type(reg_alto, reg_page)
         if reg_page_type == 'Text':
             self._convert_textlines(reg_alto, reg_page)
         elif reg_page_type == 'Table':
             self._convert_table(reg_alto, reg_page)
    def process(self):
        """Perform border detection on every page of the workspace.

        Requires exactly one input and one output file group. For each input
        file, the PAGE document is loaded, processing metadata is added, and
        the page image is retrieved with ``feature_selector='binarized'`` and
        ``feature_filter='cropped'``. The actual detection is delegated to
        ``_process_segment``; afterwards the PAGE document is serialised
        and added to the output file group.

        An existing Border is not an error; it is only reported with a warning.

        Raises:
            Exception: if the ``operation_level`` parameter is not ``"page"``.
        """
        assert_file_grp_cardinality(self.input_file_grp, 1)
        assert_file_grp_cardinality(self.output_file_grp, 1)

        LOG = getLogger('OcrdAnybaseocrCropper')

        oplevel = self.parameter['operation_level']
        for (n, input_file) in enumerate(self.input_files):
            page_id = input_file.pageId or input_file.ID

            LOG.info("INPUT FILE %i / %s", n, page_id)

            pcgts = page_from_file(self.workspace.download_file(input_file))
            self.add_metadata(pcgts)
            page = pcgts.get_Page()
            # Check for existing Border --> already cropped
            border = page.get_Border()
            if border:
                left, top, right, bottom = bbox_from_points(
                    border.get_Coords().points)
                LOG.warning('Overwriting existing Border: %i:%i,%i:%i',
                            left, top, right, bottom)

            page_image, page_coords, page_image_info = self.workspace.image_from_page(
                page, page_id,
                feature_filter='cropped',
                feature_selector='binarized') # should also be deskewed

            if oplevel == "page":
                self._process_segment(
                    page_image, page, page_coords, page_id, input_file, n)
            else:
                # Exceptions do not interpolate logging-style arguments,
                # so the message must be formatted before raising.
                raise Exception(
                    'Operation level %s, but should be "page".' % oplevel)
            file_id = make_file_id(input_file, self.output_file_grp)
            pcgts.set_pcGtsId(file_id)
            self.workspace.add_file(
                ID=file_id,
                file_grp=self.output_file_grp,
                pageId=input_file.pageId,
                mimetype=MIMETYPE_PAGE,
                local_filename=os.path.join(self.output_file_grp, file_id + '.xml'),
                content=to_xml(pcgts).encode('utf-8')
            )
Exemplo n.º 4
0
 def test_bbox_from_points(self):
     """A square given as a points string yields its corner coordinates."""
     square_points = '100,100 200,100 200,200 100,200'
     expected_bbox = (100, 100, 200, 200)
     self.assertEqual(bbox_from_points(square_points), expected_bbox)
Exemplo n.º 5
0
    def convert_border(self):
        """Create the ALTO PrintSpace (and margins) from PAGE Border/PrintSpace.

        Copies the image width and height onto the ALTO page element. Then:

        - if the PAGE has neither Border nor PrintSpace, a PrintSpace
          covering the full image is created;
        - if only one of them exists, it is used for both roles and no
          margin elements are created;
        - if both exist, the PrintSpace is converted and the four margin
          elements are derived from the difference between the Border's
          and the PrintSpace's bounding boxes.

        A warning is logged in each of the first two cases.

        Returns the newly created ALTO ``PrintSpace`` element.
        """
        page_width = self.page_page.imageWidth
        page_height = self.page_page.imageHeight
        setxml(self.alto_page, 'WIDTH', page_width)
        setxml(self.alto_page, 'HEIGHT', page_height)
        page_border = self.page_page.get_Border()
        page_pspace = self.page_page.get_PrintSpace()

        if page_border is None and page_pspace is None:
            # nothing annotated at all: let the print space span the image
            self.logger.warning(
                "PAGE-XML has neither Border nor PrintSpace - PrintSpace will fill the image"
            )
            alto_pspace = ET.SubElement(self.alto_page, 'PrintSpace')
            for attribute, value in (('VPOS', 0),
                                     ('HPOS', 0),
                                     ('HEIGHT', page_height),
                                     ('WIDTH', page_width)):
                setxml(alto_pspace, attribute, value)
            return alto_pspace

        if page_pspace is None:
            self.logger.warning(
                "PAGE-XML has Border but no PrintSpace - Margins will be empty"
            )
            page_pspace = page_border
        elif page_border is None:
            self.logger.warning(
                "PAGE-XML has PrintSpace but no Border - Margins will be empty"
            )
            page_border = page_pspace

        alto_pspace = ET.SubElement(self.alto_page, 'PrintSpace')
        set_alto_xywh_from_coords(alto_pspace, page_pspace)
        set_alto_shape_from_coords(alto_pspace, page_pspace)

        if page_border is page_pspace:
            # one element stands in for both: no room left for margins
            return alto_pspace

        bmin_x, bmin_y, bmax_x, bmax_y = bbox_from_points(
            page_border.get_Coords().points)
        pmin_x, pmin_y, pmax_x, pmax_y = bbox_from_points(
            page_pspace.get_Coords().points)
        #
        #  ╔═══════╗   ╔═══════╗   ╔╗   ╔══╗
        #  ║┌───┐  ║   ╚═══════╝   ║║   ║  ║              ┌───┐
        #  ║│   │  ║ →           + ║║   ║  ║ (margins) +  │   │ (pspace)
        #  ║└───┘  ║   ╔═══════╗   ║║   ║  ║              └───┘
        #  ║       ║   ║       ║   ║║   ║  ║
        #  ╚═══════╝   ╚═══════╝   ╚╝   ╚══╝
        #
        # (element name, VPOS, HPOS, HEIGHT, WIDTH): top and bottom margins
        # span the full border width, left and right the full border height
        margin_specs = (
            ('TopMargin', bmin_y, bmin_x, pmin_y - bmin_y, bmax_x - bmin_x),
            ('LeftMargin', bmin_y, bmin_x, bmax_y - bmin_y, pmin_x - bmin_x),
            ('RightMargin', bmin_y, pmax_x, bmax_y - bmin_y, bmax_x - pmax_x),
            ('BottomMargin', pmax_y, bmin_x, bmax_y - pmax_y, bmax_x - bmin_x),
        )
        for tag, vpos, hpos, height, width in margin_specs:
            alto_margin = ET.SubElement(self.alto_page, tag)
            setxml(alto_margin, 'VPOS', vpos)
            setxml(alto_margin, 'HPOS', hpos)
            setxml(alto_margin, 'HEIGHT', height)
            setxml(alto_margin, 'WIDTH', width)

        return alto_pspace
Exemplo n.º 6
0
    def process(self):
        """Run heuristic page frame detection (cropping) on the workspace.

        For each input file, open and deserialize the PAGE document and
        retrieve the raw page image (neither cropped, nor binarized, nor
        grayscale-normalized; it should already be deskewed).

        The detection strategy, in order of preference:

        1. Detect line segments via edge gradients and cluster them into
           contiguous horizontal and vertical lines. If sufficiently long
           candidates close to the margin exist on all four sides, pick the
           best one per side (thickest, longest, inner-most) and use their
           intersections as border points.
        2. Otherwise, try to detect a ruler (a scale reference placed on the
           scan/photo) via thresholding and contour detection, looking for a
           single large rectangle of a certain aspect ratio, and mask it for
           all further calculations. Then detect text segments: threshold
           and morphologically close the grayscale gradient, take contours
           as approximate text boxes, merge them into columns (dropping
           those too small or lying entirely in the margins), and merge the
           columns across short gaps. If a single column remains which
           covers a significant fraction of the page, use it.
        3. Otherwise, keep the border points derived from line segments,
           substituting the full image edge on sides without candidates.

        Finally, map the coordinates back to the original (undeskewed) image,
        intersect the polygon with the full image frame and annotate the
        result as the page's Border.

        In addition, crop (and mask) the image accordingly and reference the
        resulting file as AlternativeImage of the Page element. The image
        file is added to the output fileGrp under a file ID with suffix
        ``.IMG-CROP`` along with further identification of the input element.

        Produce new output files by serialising the resulting hierarchy.
        """
        assert_file_grp_cardinality(self.input_file_grp, 1)
        assert_file_grp_cardinality(self.output_file_grp, 1)

        self.logger = getLogger('processor.AnybaseocrCropper')

        for n, input_file in enumerate(self.input_files):
            page_id = input_file.pageId or input_file.ID
            self.logger.info("INPUT FILE %i / %s", n, page_id)

            local_file = self.workspace.download_file(input_file)
            pcgts = page_from_file(local_file)
            self.add_metadata(pcgts)
            page = pcgts.get_Page()
            # an existing Border means the page was cropped before
            border = page.get_Border()
            if border:
                border_points = border.get_Coords().points
                left, top, right, bottom = bbox_from_points(border_points)
                self.logger.warning('Overwriting existing Border: %i:%i,%i:%i',
                                    left, top, right, bottom)

            page = pcgts.get_Page()
            # should be deskewed already
            page_image, page_coords, page_image_info = self.workspace.image_from_page(
                page, page_id,
                feature_filter='cropped,binarized,grayscale_normalized')

            # scale relative to 300 DPI: parameter beats image meta-data,
            # and without either no scaling is applied
            zoom = 1
            dpi_override = self.parameter['dpi']
            if dpi_override > 0:
                zoom = 300.0 / dpi_override
            elif page_image_info.resolution != 1:
                dpi = page_image_info.resolution
                if page_image_info.resolutionUnit == 'cm':
                    dpi *= 2.54
                self.logger.info('Page "%s" uses %f DPI', page_id, dpi)
                zoom = 300.0 / dpi

            self._process_page(page, page_image, page_coords, input_file, zoom)

            file_id = make_file_id(input_file, self.output_file_grp)
            pcgts.set_pcGtsId(file_id)
            page_path = os.path.join(self.output_file_grp, file_id + '.xml')
            page_xml = to_xml(pcgts).encode('utf-8')
            self.workspace.add_file(ID=file_id,
                                    file_grp=self.output_file_grp,
                                    pageId=input_file.pageId,
                                    mimetype=MIMETYPE_PAGE,
                                    local_filename=page_path,
                                    content=page_xml)
Exemplo n.º 7
0
    def process(self):
        """Performs page cropping with Tesseract on the workspace.

        Open and deserialize PAGE input files and their respective images.
        Set up Tesseract to detect text blocks on each page, and find
        the largest coordinate extent spanning all of them. Use this
        extent in defining a Border, and add that to the page.

        Moreover, crop the original image accordingly, and reference the
        resulting image file as AlternativeImage in the Page element.

        Add the new image file to the workspace along with the output fileGrp,
        and using a file ID with suffix ``.IMG-CROP`` along with further
        identification of the input element.

        Produce new output files by serialising the resulting hierarchy.
        If no valid extent can be found for a page, an error is logged and
        the page is written to the output file group without a new Border.
        """
        LOG = getLogger('processor.TesserocrCrop')
        assert_file_grp_cardinality(self.input_file_grp, 1)
        assert_file_grp_cardinality(self.output_file_grp, 1)

        padding = self.parameter['padding']
        with tesserocr.PyTessBaseAPI(path=TESSDATA_PREFIX) as tessapi:
            # disable table detection here (tables count as text blocks),
            # because we do not want to risk confusing the spine with
            # a column separator and thus creeping into a neighbouring
            # page:
            tessapi.SetVariable("textord_tabfind_find_tables", "0")
            for (n, input_file) in enumerate(self.input_files):
                page_id = input_file.pageId or input_file.ID
                LOG.info("INPUT FILE %i / %s", n, page_id)
                pcgts = page_from_file(
                    self.workspace.download_file(input_file))
                self.add_metadata(pcgts)
                page = pcgts.get_Page()
                # Determine the output ID up front: it is needed for the
                # PAGE file regardless of whether cropping succeeds, and
                # must never be carried over from a previous page.
                file_id = make_file_id(input_file, self.output_file_grp)

                # warn of existing Border:
                border = page.get_Border()
                if border:
                    left, top, right, bottom = bbox_from_points(
                        border.get_Coords().points)
                    LOG.warning('Overwriting existing Border: %i:%i,%i:%i',
                                left, top, right, bottom)

                page_image, page_xywh, page_image_info = self.workspace.image_from_page(
                    page,
                    page_id,
                    # image must not have been cropped already,
                    # abort if no such image can be produced:
                    feature_filter='cropped')
                if self.parameter['dpi'] > 0:
                    dpi = self.parameter['dpi']
                    LOG.info(
                        "Page '%s' images will use %d DPI from parameter override",
                        page_id, dpi)
                elif page_image_info.resolution != 1:
                    dpi = page_image_info.resolution
                    if page_image_info.resolutionUnit == 'cm':
                        dpi = round(dpi * 2.54)
                    LOG.info(
                        "Page '%s' images will use %d DPI from image meta-data",
                        page_id, dpi)
                else:
                    dpi = 0
                    LOG.info(
                        "Page '%s' images will use DPI estimated from segmentation",
                        page_id)
                if dpi:
                    tessapi.SetVariable('user_defined_dpi', str(dpi))
                    # size thresholds below are scaled relative to 300 DPI
                    zoom = 300 / dpi
                else:
                    zoom = 1

                # warn of existing segmentation (its extent is only
                # reported, not used for the Border):
                regions = page.get_TextRegion()
                if regions:
                    min_x = page_image.width
                    min_y = page_image.height
                    max_x = 0
                    max_y = 0
                    for region in regions:
                        left, top, right, bottom = bbox_from_points(
                            region.get_Coords().points)
                        min_x = min(min_x, left)
                        min_y = min(min_y, top)
                        max_x = max(max_x, right)
                        max_y = max(max_y, bottom)
                    LOG.warning(
                        'Ignoring extent from existing TextRegions: %i:%i,%i:%i',
                        min_x, max_x, min_y, max_y)

                LOG.debug("Cropping with Tesseract")
                tessapi.SetImage(page_image)
                # PSM.SPARSE_TEXT: get as much text as possible in no particular order
                # PSM.AUTO (default): includes tables (dangerous)
                tessapi.SetPageSegMode(tesserocr.PSM.SPARSE_TEXT)
                #
                # helper variables for saving the box coordinates
                # (starting inverted, so min < max only holds once
                # at least one block was accepted)
                #
                min_x = page_image.width
                min_y = page_image.height
                max_x = 0
                max_y = 0
                # iterate over all text blocks and compare their
                # bbox extent to the running min and max values
                for component in tessapi.GetComponentImages(
                        tesserocr.RIL.BLOCK, True):
                    image, xywh, index, _ = component
                    #
                    # the region reference in the reading order element
                    #
                    ID = "region%04d" % index
                    left, top, right, bottom = bbox_from_xywh(xywh)
                    LOG.debug("Detected text region '%s': %i:%i,%i:%i", ID,
                              left, right, top, bottom)
                    # filter region results:
                    bin_bbox = image.getbbox()
                    if not bin_bbox:
                        # this does happen!
                        LOG.info(
                            "Ignoring region '%s' because its binarization is empty",
                            ID)
                        continue
                    width = bin_bbox[2] - bin_bbox[0]
                    if width < 25 / zoom:
                        # we must be conservative here: page numbers are tiny regions, too!
                        LOG.info(
                            "Ignoring region '%s' because its width is too small (%d)",
                            ID, width)
                        continue
                    height = bin_bbox[3] - bin_bbox[1]
                    if height < 25 / zoom:
                        # we must be conservative here: page numbers are tiny regions, too!
                        LOG.debug(
                            "Ignoring region '%s' because its height is too small (%d)",
                            ID, height)
                        continue
                    min_x = min(min_x, left)
                    min_y = min(min_y, top)
                    max_x = max(max_x, right)
                    max_y = max(max_y, bottom)
                    LOG.info("Updated page border: %i:%i,%i:%i", min_x, max_x,
                             min_y, max_y)

                #
                # set the identified page border
                #
                if min_x < max_x and min_y < max_y:
                    # add padding:
                    min_x = max(min_x - padding, 0)
                    max_x = min(max_x + padding, page_image.width)
                    min_y = max(min_y - padding, 0)
                    max_y = min(max_y + padding, page_image.height)
                    LOG.info("Padded page border: %i:%i,%i:%i", min_x, max_x,
                             min_y, max_y)
                    polygon = polygon_from_bbox(min_x, min_y, max_x, max_y)
                    polygon = coordinates_for_segment(polygon, page_image,
                                                      page_xywh)
                    polygon = polygon_for_parent(polygon, page)
                    border = BorderType(
                        Coords=CoordsType(points_from_polygon(polygon)))
                    # intersection with parent could have changed bbox,
                    # so recalculate:
                    bbox = bbox_from_polygon(
                        coordinates_of_segment(border, page_image, page_xywh))
                    # update PAGE (annotate border):
                    page.set_Border(border)
                    # update METS (add the image file):
                    page_image = crop_image(page_image, box=bbox)
                    page_xywh['features'] += ',cropped'
                    file_path = self.workspace.save_image_file(
                        page_image,
                        file_id + '.IMG-CROP',
                        page_id=input_file.pageId,
                        file_grp=self.output_file_grp)
                    # update PAGE (reference the image file):
                    page.add_AlternativeImage(
                        AlternativeImageType(filename=file_path,
                                             comments=page_xywh['features']))
                else:
                    LOG.error("Cannot find valid extent for page '%s'",
                              page_id)

                pcgts.set_pcGtsId(file_id)
                self.workspace.add_file(ID=file_id,
                                        file_grp=self.output_file_grp,
                                        pageId=input_file.pageId,
                                        mimetype=MIMETYPE_PAGE,
                                        local_filename=os.path.join(
                                            self.output_file_grp,
                                            file_id + '.xml'),
                                        content=to_xml(pcgts))
    def process(self):
        """Perform border detection on every page of the workspace.

        The output file group may be given as ``PAGE_GRP,IMAGE_GRP``. If it
        does not split into exactly two comma-separated parts, the whole
        value is used as the PAGE group and ``FALLBACK_IMAGE_GRP`` as the
        image group.

        For each input file, the PAGE document is loaded, a
        ``processingStep`` metadata item listing all parameters is added,
        and the uncropped page image is retrieved. Detection is delegated to
        ``_process_segment``; afterwards the PAGE document is serialised and
        added to the PAGE output group.

        If the ``operation_level`` parameter is not ``"page"``, a warning is
        logged and processing stops without writing output for that file.
        """
        LOG.debug("OUTPUT FILE %s", self.output_file_grp)
        try:
            # anything but exactly two parts fails to unpack
            self.page_grp, self.image_grp = self.output_file_grp.split(',')
        except ValueError:
            self.page_grp = self.output_file_grp
            self.image_grp = FALLBACK_IMAGE_GRP
            LOG.info(
                "No output file group for images specified, falling back to '%s'",
                FALLBACK_IMAGE_GRP)
        oplevel = self.parameter['operation_level']
        for (n, input_file) in enumerate(self.input_files):
            page_id = input_file.pageId or input_file.ID

            LOG.info("INPUT FILE %i / %s", n, page_id)

            pcgts = page_from_file(self.workspace.download_file(input_file))
            page = pcgts.get_Page()
            # Check for existing Border --> already cropped
            border = page.get_Border()
            if border:
                left, top, right, bottom = bbox_from_points(
                    border.get_Coords().points)
                LOG.warning('Overwriting existing Border: %i:%i,%i:%i', left,
                            top, right, bottom)

            # record this processing step along with all its parameters
            metadata = pcgts.get_Metadata()
            metadata.add_MetadataItem(
                MetadataItemType(
                    type_="processingStep",
                    name=self.ocrd_tool['steps'][0],
                    value=TOOL,
                    Labels=[
                        LabelsType(  # externalRef="parameters",
                            Label=[
                                LabelType(type_=name,
                                          value=self.parameter[name])
                                for name in self.parameter.keys()
                            ])
                    ]))

            page_image, page_xywh, page_image_info = self.workspace.image_from_page(
                page, page_id, feature_filter='cropped')

            if oplevel == "page":
                self._process_segment(page_image, page, page_xywh, page_id,
                                      input_file, n)
            else:
                LOG.warning('Operation level %s, but should be "page".',
                            oplevel)
                break
            file_id = input_file.ID.replace(self.input_file_grp, self.page_grp)

            # If the input ID does not contain the input group name, the
            # replacement is a no-op; generate a fresh ID from the output
            # group and the running index instead, to avoid an ID clash:
            if file_id == input_file.ID:
                file_id = concat_padded(self.page_grp, n)
            self.workspace.add_file(ID=file_id,
                                    file_grp=self.page_grp,
                                    pageId=input_file.pageId,
                                    mimetype=MIMETYPE_PAGE,
                                    local_filename=os.path.join(
                                        self.page_grp, file_id + '.xml'),
                                    content=to_xml(pcgts).encode('utf-8'))
Exemplo n.º 9
0
def _scaled_bbox(element, scale_factor):
    """Return ``[left, top, right, bottom]`` of a PAGE element, scaled by `scale_factor` and truncated to int."""
    return [int(scale_factor * x) for x in bbox_from_points(element.get_Coords().points)]


def page2tsv(page_xml_file, tsv_out_file, purpose, image_url, ner_rest_endpoint, ned_rest_endpoint,
             noproxy, scale_factor, ned_threshold, min_confidence, max_confidence, ned_priority):
    """Append the text content of one PAGE-XML file to a TSV file.

    The text regions of the page are visited in reading order. Every text line
    contributes either its words (if it has any) or its own TextEquiv entries.
    The collected rows are sorted within each region by vertical line center
    and horizontal center and then appended to `tsv_out_file`, preceded by a
    ``# <image_url>`` comment line.

    Args:
        page_xml_file: PAGE-XML file handed to ``parse``.
        tsv_out_file: Path of the TSV file. If it does not exist yet, it is
            created with a header row; otherwise the URLs already recorded in
            it are read so that the new rows get the next ``url_id``.
        purpose: ``"NERD"`` (one row per token, with NE columns) or ``"OCR"``
            (one row per text line).
        image_url: URL written as a comment line in front of the new rows.
        ner_rest_endpoint: If not None and purpose is ``"NERD"``, the rows are
            passed through ``ner`` with this endpoint.
        ned_rest_endpoint: If not None (and NER ran), the rows are passed
            through ``ned`` with this endpoint.
        noproxy: If truthy, sets the ``no_proxy`` environment variable to ``*``.
        scale_factor: Factor applied to all OCR coordinates to map them onto
            the web presentation image.
        ned_threshold: Passed to ``ned`` as ``threshold``.
        min_confidence: Together with `max_confidence`, enables the per-line
            confidence (maximum TextEquiv ``conf``) and, for ``"OCR"``, the
            ``ocrconf`` column computed by ``get_conf_color``.
        max_confidence: See `min_confidence`.
        ned_priority: Passed to ``ned`` as ``priority``.

    Raises:
        RuntimeError: If `purpose` is neither ``"NERD"`` nor ``"OCR"``.

    Note:
        A ``requests.HTTPError`` raised during NER/NED is printed and the rows
        of this page are not appended; the ``# <image_url>`` comment line has
        already been written at that point.
    """
    use_confidence = min_confidence is not None and max_confidence is not None

    if purpose == "NERD":
        out_columns = ['No.', 'TOKEN', 'NE-TAG', 'NE-EMB', 'ID', 'url_id', 'left', 'right', 'top', 'bottom', 'conf']
    elif purpose == "OCR":
        out_columns = ['TEXT', 'url_id', 'left', 'right', 'top', 'bottom', 'conf', 'line_id']
        if use_confidence:
            out_columns += ['ocrconf']
    else:
        raise RuntimeError("Unknown purpose.")

    if noproxy:
        os.environ['no_proxy'] = '*'

    urls = []
    if os.path.exists(tsv_out_file):
        parts = extract_doc_links(tsv_out_file)
        urls = [part['url'] for part in parts]
    else:
        # quoting=3 is csv.QUOTE_NONE
        pd.DataFrame([], columns=out_columns).to_csv(tsv_out_file, sep="\t", quoting=3, index=False)

    # All rows of this page share the same url_id: the number of documents
    # that were already present in the TSV file.
    url_id = len(urls)

    pcgts = parse(page_xml_file)
    tsv = []
    line_info = []

    text_regions = pcgts.get_Page().get_AllRegions(classes=['Text'], order='reading-order')
    for region_idx, region in enumerate(text_regions):
        for text_line in region.get_TextLine():

            # transform OCR coordinates using `scale_factor` to derive
            # correct coordinates for the web presentation image
            left, top, right, bottom = _scaled_bbox(text_line, scale_factor)

            conf = np.nan
            if use_confidence:
                confidences = [textequiv.conf for textequiv in text_line.get_TextEquiv()]
                # np.max raises ValueError on an empty sequence, so a line
                # without any TextEquiv keeps the NaN placeholder.
                if confidences:
                    conf = np.max(confidences)

            line_info.append((url_id, left, right, top, bottom, conf, text_line.id))
            line_idx = len(line_info) - 1

            words = list(text_line.get_Word())

            if not words:
                # No word segmentation: fall back to the line-level text,
                # positioned with the line's own bounding box.
                for text_equiv in text_line.get_TextEquiv():
                    tsv.append((region_idx, line_idx, left + (right - left) / 2.0,
                                text_equiv.get_Unicode(), url_id, left, right, top, bottom, text_line.id))
                continue

            for word in words:
                word_equivs = word.get_TextEquiv()
                if not word_equivs:
                    # Nothing to emit; do not touch the word's coordinates.
                    continue

                w_left, w_top, w_right, w_bottom = _scaled_bbox(word, scale_factor)

                for text_equiv in word_equivs:
                    tsv.append((region_idx, line_idx, w_left + (w_right - w_left) / 2.0,
                                text_equiv.get_Unicode(), url_id, w_left, w_right, w_top, w_bottom,
                                text_line.id))

    line_info = pd.DataFrame(line_info, columns=['url_id', 'left', 'right', 'top', 'bottom', 'conf', 'line_id'])

    if use_confidence:
        line_info['ocrconf'] = line_info.conf.map(lambda x: get_conf_color(x, min_confidence, max_confidence))

    tsv = pd.DataFrame(tsv, columns=['rid', 'line', 'hcenter',
                                     'TEXT', 'url_id', 'left', 'right', 'top', 'bottom', 'line_id'])

    if tsv.empty:
        return

    with open(tsv_out_file, 'a') as f:
        f.write('# ' + image_url + '\n')

    # Vertical center of each line: mean top plus half the mean height,
    # computed from a single group-by over the line index.
    line_means = tsv[['line', 'top', 'bottom']].groupby('line', sort=False).mean()
    vlinecenter = (line_means.top + (line_means.bottom - line_means.top) / 2).to_frame('vlinecenter')

    tsv = tsv.merge(vlinecenter, left_on='line', right_index=True)

    # Within each region (kept in reading order), sort tokens top-to-bottom
    # by line center and then left-to-right.
    sorted_regions = [region.sort_values(['vlinecenter', 'hcenter'])
                      for _, region in tsv.groupby('rid', sort=False)]

    tsv = pd.concat(sorted_regions)

    if purpose == 'NERD':

        tsv['No.'] = 0
        tsv['NE-TAG'] = 'O'
        tsv['NE-EMB'] = 'O'
        tsv['ID'] = '-'
        tsv['conf'] = '-'

        tsv = tsv.rename(columns={'TEXT': 'TOKEN'})
    elif purpose == 'OCR':

        # Collapse the tokens of each line into one space-separated string
        # and attach the per-line geometry/confidence collected above.
        tsv = pd.DataFrame([(line, " ".join(part.TEXT.to_list())) for line, part in tsv.groupby('line')],
                           columns=['line', 'TEXT'])

        tsv = tsv.merge(line_info, left_on='line', right_index=True)

    tsv = tsv[out_columns].reset_index(drop=True)

    try:
        if purpose == 'NERD' and ner_rest_endpoint is not None:

            tsv, ner_result = ner(tsv, ner_rest_endpoint)

            if ned_rest_endpoint is not None:

                tsv, _ = ned(tsv, ner_result, ned_rest_endpoint, threshold=ned_threshold, priority=ned_priority)

        tsv.to_csv(tsv_out_file, sep="\t", quoting=3, index=False, mode='a', header=False)
    except requests.HTTPError as e:
        # Best effort: report the failed REST call and skip this page's rows.
        print(e)
Exemplo n.º 10
0
    def process(self):
        """Crop every page of the input fileGrp to the extent of its text blocks.

        For each input file, the PAGE document is deserialized and an image of
        the page that has not been cropped yet is requested from the workspace.
        Tesseract (with table detection switched off) is then used via
        ``estimate_bounds`` to find the outer bounds of the text blocks, and
        ``process_page`` is called with those bounds.

        The DPI handed to Tesseract comes from the ``dpi`` parameter if it is
        positive, otherwise from the image meta-data (converted from
        pixels-per-cm if necessary) unless the resolution is 1; if neither is
        available, no DPI is set and a zoom factor of 1 is used.

        Finally, the resulting PAGE document is serialised and added to the
        workspace under the output fileGrp.
        """
        log = getLogger('processor.TesserocrCrop')
        assert_file_grp_cardinality(self.input_file_grp, 1)
        assert_file_grp_cardinality(self.output_file_grp, 1)

        with tesserocr.PyTessBaseAPI(path=TESSDATA_PREFIX) as tessapi:
            # Tables would count as text blocks; with table detection on, the
            # spine could be mistaken for a column separator and the crop
            # could creep into the neighbouring page, so switch it off.
            tessapi.SetVariable("textord_tabfind_find_tables", "0")

            for n, input_file in enumerate(self.input_files):
                file_id = make_file_id(input_file, self.output_file_grp)
                page_id = input_file.pageId or input_file.ID
                log.info("INPUT FILE %i / %s", n, page_id)

                pcgts = page_from_file(self.workspace.download_file(input_file))
                self.add_metadata(pcgts)
                page = pcgts.get_Page()

                # an already present Border is only reported here
                old_border = page.get_Border()
                if old_border:
                    x0, y0, x1, y1 = bbox_from_points(old_border.get_Coords().points)
                    log.warning('Overwriting existing Border: %i:%i,%i:%i',
                                x0, y0, x1, y1)

                # the image must not have been cropped already
                # (the workspace aborts if no such image can be produced)
                page_image, page_xywh, page_image_info = self.workspace.image_from_page(
                    page, page_id, feature_filter='cropped')

                # decide which resolution Tesseract should assume
                dpi_override = self.parameter['dpi']
                if dpi_override > 0:
                    dpi = dpi_override
                    log.info("Page '%s' images will use %d DPI from parameter override", page_id, dpi)
                elif page_image_info.resolution != 1:
                    resolution = page_image_info.resolution
                    per_cm = page_image_info.resolutionUnit == 'cm'
                    dpi = round(resolution * 2.54) if per_cm else resolution
                    log.info("Page '%s' images will use %d DPI from image meta-data", page_id, dpi)
                else:
                    dpi = 0
                    log.info("Page '%s' images will use DPI estimated from segmentation", page_id)

                # size thresholds in estimate_bounds are relative to 300 DPI
                zoom = 1
                if dpi:
                    tessapi.SetVariable('user_defined_dpi', str(dpi))
                    zoom = 300 / dpi

                bounds = self.estimate_bounds(page, page_image, tessapi, zoom)
                self.process_page(page, page_image, page_xywh, bounds, file_id, input_file.pageId)

                pcgts.set_pcGtsId(file_id)
                output_path = os.path.join(self.output_file_grp, file_id + '.xml')
                self.workspace.add_file(
                    ID=file_id,
                    file_grp=self.output_file_grp,
                    pageId=input_file.pageId,
                    mimetype=MIMETYPE_PAGE,
                    local_filename=output_path,
                    content=to_xml(pcgts))