Пример #1
0
    def validate(filename=None,
                 ocrd_page=None,
                 ocrd_file=None,
                 strictness='strict',
                 strategy='index1'):
        """
        Validate a PAGE document for consistency.

        The document may be given as a parsed OcrdPage, as an OcrdFile
        wrapping one, or as a path; the three inputs are checked in that
        order and the first one set is used.

        Arguments:
            filename (string): Path to PAGE
            ocrd_page (OcrdPage): OcrdPage instance
            ocrd_file (OcrdFile): OcrdFile instance wrapping OcrdPage
            strictness (string): 'strict', 'lax', 'fix' or 'off'
            strategy (string): Currently only 'index1'

        Returns:
            report (:class:`ValidationReport`) Report on the validity
        """
        # Normalize all three input forms to a parsed page first, then
        # construct the validator exactly once.
        if ocrd_page:
            page = ocrd_page
        elif ocrd_file:
            page = page_from_file(ocrd_file)
        elif filename:
            page = parse(filename, silence=True)
        else:
            raise Exception(
                "At least one of ocrd_page, ocrd_file or filename must be set")
        validator = PageValidator(page, strictness, strategy)
        return validator._validate()  # pylint: disable=protected-access
 def test_fix(self):
     """Validation with strictness='fix' repairs textequiv inconsistencies in place."""
     def count_consistency_errors(report):
         # Only ConsistencyError instances matter for this test.
         return sum(1 for err in report.errors if isinstance(err, ConsistencyError))

     ocrd_page = parse(FAULTY_GLYPH_PAGE_FILENAME, silence=True)
     self.assertEqual(count_consistency_errors(PageValidator.validate(ocrd_page=ocrd_page)), 17, '17 textequiv consistency errors')
     # 'fix' mode mutates ocrd_page; a subsequent validation must be clean.
     PageValidator.validate(ocrd_page=ocrd_page, strictness='fix')
     self.assertEqual(count_consistency_errors(PageValidator.validate(ocrd_page=ocrd_page)), 0, 'no more textequiv consistency errors')
Пример #3
0
 def test_fix(self):
     """Validation with strictness='fix' removes all reported errors."""
     ocrd_page = parse(assets.path_to('glyph-consistency/data/OCR-D-GT-PAGE/FAULTY_GLYPHS'), silence=True)
     before = PageValidator.validate(ocrd_page=ocrd_page)
     self.assertEqual(len(before.errors), 17, 'errors')
     # 'fix' mode mutates ocrd_page in place; re-validate to confirm.
     PageValidator.validate(ocrd_page=ocrd_page, strictness='fix')
     after = PageValidator.validate(ocrd_page=ocrd_page)
     self.assertEqual(len(after.errors), 0, 'no more errors')
Пример #4
0
    def process(self):
        """
        Dewarp page images with NVIDIA's pix2pixHD.

        For every input file: parse its PAGE-XML, crop the page image to the
        Border coordinates, run pix2pixHD's ``test.py`` on the crop, and copy
        the synthesized result next to the original image as
        ``<base>.dw.jpg``. Terminates the whole process via ``sys.exit(1)``
        if CUDA is unavailable or the pix2pixHD checkout is missing.
        """
        print(Path(self.parameter['pix2pixHD']).absolute())
        if not torch.cuda.is_available():
            print("Your system has no CUDA installed. No GPU detected.")
            sys.exit(1)

        # Local checkout of https://github.com/NVIDIA/pix2pixHD
        path = Path(self.parameter['pix2pixHD']).absolute()

        if not Path(path).is_dir():
            print("""\
                NVIDIA's pix2pixHD was not found at '%s'. Make sure the `pix2pixHD` parameter
                points to the local path to the cloned pix2pixHD repository.

                pix2pixHD can be downloaded from https://github.com/NVIDIA/pix2pixHD
                """ % path)
            sys.exit(1)

        for (_, input_file) in enumerate(self.input_files):
            local_input_file = self.workspace.download_file(input_file)
            pcgts = parse(local_input_file.url, silence=True)
            # Border polygon as a list of "x,y" point strings.
            image_coords = pcgts.get_Page().get_Border().get_Coords(
            ).points.split()
            fname = pcgts.get_Page().imageFilename

            # Get page Co-ordinates
            # NOTE(review): assumes points[0] is the top-left and points[2]
            # the bottom-right corner of an axis-aligned rectangle — confirm
            # this holds for all producers of the Border element.
            min_x, min_y = image_coords[0].split(",")
            max_x, max_y = image_coords[2].split(",")
            # pix2pixHD expects its input images under <dataroot>/test_A
            img_tmp_dir = "OCR-D-IMG/test_A"
            img_dir = os.path.dirname(str(fname))
            # Path of pix2pixHD
            Path(img_tmp_dir).mkdir(parents=True, exist_ok=True)

            crop_region = int(min_x), int(min_y), int(max_x), int(max_y)
            cropped_img = self.crop_image(fname, crop_region)

            base, _ = ocrolib.allsplitext(fname)
            filename = base.split("/")[-1] + ".png"
            cropped_img.save(img_tmp_dir + "/" + filename)
            #os.system("cp %s %s" % (str(fname), os.path.join(img_tmp_dir, os.path.basename(str(fname)))))
            #os.system("mkdir -p %s" % img_tmp_dir)
            #os.system("cp %s %s" % (str(fname), os.path.join(img_tmp_dir, os.path.basename(str(fname)))))
            # NOTE(review): shell command built by string interpolation — any
            # path or parameter containing spaces/shell metacharacters breaks
            # or is interpreted by the shell; prefer subprocess.run([...]).
            # NOTE(review): --resize_or_crop appears twice ('none' and then
            # the imgresize parameter); with argparse only the last
            # occurrence wins — confirm which value is intended.
            os.system(
                "python " + str(path) +
                "/test.py --dataroot %s --checkpoints_dir ./ --name models --results_dir %s --label_nc 0 --no_instance --no_flip --resize_or_crop none --n_blocks_global 10 --n_local_enhancers 2 --gpu_ids %s --loadSize %d --fineSize %d --resize_or_crop %s"
                % (os.path.dirname(img_tmp_dir), img_dir,
                   self.parameter['gpu_id'], self.parameter['resizeHeight'],
                   self.parameter['resizeWidth'], self.parameter['imgresize']))
            # pix2pixHD names its output '<stem>_synthesized_image.jpg'
            synthesized_image = filename.split(
                ".")[0] + "_synthesized_image.jpg"
            pix2pix_img_dir = img_dir + "/models/test_latest/images/"
            dewarped_image = Path(pix2pix_img_dir + synthesized_image)
            if (dewarped_image.is_file()):
                # Keep the dewarped result next to the source image.
                shutil.copy(dewarped_image,
                            img_dir + "/" + filename.split(".")[0] + ".dw.jpg")

            # Clean up the temporary input dir and pix2pixHD's result tree.
            if (Path(img_tmp_dir).is_dir()):
                shutil.rmtree(img_tmp_dir)
            if (Path(img_dir + "/models").is_dir()):
                shutil.rmtree(img_dir + "/models")
Пример #5
0
    def test_validate_lax(self):
        """Lax validation reports only genuine word errors, not whitespace noise."""
        ocrd_page = parse(assets.path_to('kant_aufklaerung_1784/data/OCR-D-GT-PAGE/PAGE_0020_PAGE'), silence=True)

        # introduce a single word error (not just whitespace inconsistency)
        word = ocrd_page.get_Page().get_TextRegion()[0].get_TextLine()[0].get_Word()[1]
        word.get_TextEquiv()[0].set_Unicode('FOO')

        self.assertEqual(len(PageValidator.validate(ocrd_page=ocrd_page).errors), 26, '26 errors - strict')
        self.assertEqual(len(PageValidator.validate(ocrd_page=ocrd_page, strictness='lax').errors), 1, '1 error - lax')
Пример #6
0
 def test_id(self):
     """
     Regression test for https://github.com/OCR-D/core/issues/682
     """
     pcgts = parse(assets.path_to(
         'kant_aufklaerung_1784/data/OCR-D-GT-PAGE/PAGE_0017_PAGE.xml'))
     # The PcGts id comes from the document, the Page id from its imageFilename.
     assert pcgts.id == 'PAGE_0017_PAGE'
     assert pcgts.get_Page().id == 'OCR-D-IMG/INPUT_0017.tif'
    def test_validate_lax(self):
        """Lax mode reduces 26 strict consistency errors to the single real one."""
        ocrd_page = parse(assets.path_to('kant_aufklaerung_1784/data/OCR-D-GT-PAGE/PAGE_0020_PAGE.xml'), silence=True)

        # introduce a single word error (not just whitespace inconsistency)
        ocrd_page.get_Page().get_TextRegion()[0].get_TextLine()[0].get_Word()[1].get_TextEquiv()[0].set_Unicode('FOO')

        report_strict = PageValidator.validate(ocrd_page=ocrd_page)
        self.assertEqual(sum(isinstance(e, ConsistencyError) for e in report_strict.errors), 26, '26 textequiv consistency errors - strict')
        report_lax = PageValidator.validate(ocrd_page=ocrd_page, strictness='lax')
        self.assertEqual(sum(isinstance(e, ConsistencyError) for e in report_lax.errors), 1, '1 textequiv consistency errors - lax')
Пример #8
0
def page_from_file(input_file):
    """
    Create a new PAGE-XML from a METS file representing a PAGE-XML or an image.

    Arguments:
        * input_file (OcrdFile): file whose ``local_filename`` points to a
          PAGE-XML document or an image

    Returns:
        the parsed (or generated) PAGE object model

    Raises:
        ValueError: if ``input_file`` has no ``local_filename`` or an
            unsupported mimetype
        FileNotFoundError: if ``local_filename`` does not exist on disk
    """
    from pathlib import Path  # local import keeps this fix self-contained
    # Validate the input up front so callers get a precise error instead of
    # a confusing failure deep inside parse()/page_from_image().
    if not input_file.local_filename:
        raise ValueError("input_file must have 'local_filename' property")
    if not Path(input_file.local_filename).exists():
        raise FileNotFoundError("File not found: '%s' (%s)" %
                                (input_file.local_filename, input_file))
    if input_file.mimetype.startswith('image'):
        return page_from_image(input_file)
    if input_file.mimetype == MIMETYPE_PAGE:
        return parse(input_file.local_filename, silence=True)
    # ValueError is a subclass of Exception, so existing `except Exception`
    # callers keep working while the error type becomes more precise.
    raise ValueError("Unsupported mimetype '%s'" % input_file.mimetype)
Пример #9
0
def test_id():
    """
    Regression test for https://github.com/OCR-D/core/issues/682
    """
    pcgts = parse(
        assets.path_to('kant_aufklaerung_1784/data/OCR-D-GT-PAGE/PAGE_0017_PAGE.xml'))

    # the top-level PcGts carries the document id
    assert pcgts.id == 'PAGE_0017_PAGE'

    # TODO: is this *really* desired?
    # One might expect the single Page element to share the PcGts id rather
    # than carry a file name.
    assert pcgts.get_Page().id == 'OCR-D-IMG/INPUT_0017.tif'
Пример #10
0
    def validate(filename=None,
                 ocrd_page=None,
                 ocrd_file=None,
                 page_textequiv_consistency='strict',
                 page_textequiv_strategy='first',
                 check_baseline=True,
                 check_coords=True):
        """
        Validates a PAGE file for consistency by filename, OcrdFile or passing OcrdPage directly.

        Arguments:
            filename (string): Path to PAGE
            ocrd_page (OcrdPage): OcrdPage instance
            ocrd_file (OcrdFile): OcrdFile instance wrapping OcrdPage
            page_textequiv_consistency (string): 'strict', 'lax', 'fix' or 'off'
            page_textequiv_strategy (string): Currently only 'first'
            check_baseline (bool): whether Baseline must be fully within TextLine/Coords
            check_coords (bool): whether *Region/TextLine/Word/Glyph must each be fully
                                 contained within Border/*Region/TextLine/Word, resp.

        Returns:
            report (:class:`ValidationReport`) Report on the validity

        Raises:
            Exception: if none of the three inputs is given, or if an
                unsupported strategy / consistency level is requested
        """
        log = getLogger('ocrd.page_validator.validate')
        if ocrd_page:
            page = ocrd_page
            file_id = ocrd_page.get_pcGtsId()
        elif ocrd_file:
            page = page_from_file(ocrd_file)
            file_id = ocrd_file.ID
        elif filename:
            page = parse(filename, silence=True)
            file_id = filename
        else:
            raise Exception(
                "At least one of ocrd_page, ocrd_file or filename must be set")
        # BUG FIX: ('first') is just the string 'first', so the original
        # `not in ('first')` performed a *substring* test — any substring of
        # 'first' (e.g. 'fir' or '') was wrongly accepted. A one-element
        # tuple restores the intended membership test.
        if page_textequiv_strategy not in ('first',):
            raise Exception("page_textequiv_strategy %s not implemented" %
                            page_textequiv_strategy)
        if page_textequiv_consistency not in ('strict', 'lax', 'fix', 'off'):
            raise Exception(
                "page_textequiv_consistency level %s not implemented" %
                page_textequiv_consistency)
        report = ValidationReport()
        log.info("Validating input file '%s'", file_id)
        validate_consistency(page, page_textequiv_consistency,
                             page_textequiv_strategy, check_baseline,
                             check_coords, report, file_id)
        return report
Пример #11
0
    def test_validate_multi_textequiv(self):
        """set_text/get_text with the 'index1' strategy round-trip a word's
        text even when multiple TextEquiv elements are present."""
        ocrd_page = parse(assets.path_to('kant_aufklaerung_1784/data/OCR-D-GT-PAGE/PAGE_0020_PAGE'), silence=True)
        self.assertEqual(len(PageValidator.validate(ocrd_page=ocrd_page).errors), 25, '25 errors - strict')

        word = ocrd_page.get_Page().get_TextRegion()[0].get_TextLine()[0].get_Word()[1]

        # delete all textequivs
        # NOTE(review): this removes only the first element of the returned
        # list — if the word has more than one TextEquiv the comment above is
        # misleading; the sibling test uses word.set_TextEquiv([]) instead.
        del(word.get_TextEquiv()[0])

        # Add textequiv
        set_text(word, 'FOO', 'index1')
        word.add_TextEquiv(TextEquivType(Unicode='BAR', conf=.7))

        self.assertEqual(get_text(word, 'index1'), 'FOO')
        set_text(word, 'BAR', 'index1')
        self.assertEqual(get_text(word, 'index1'), 'BAR')
Пример #12
0
def page_from_file(input_file):
    """
    Create a new PAGE-XML from a METS file representing a PAGE-XML or an image.

    Arguments:
        * input_file (OcrdFile): file to convert; must be locally available

    Raises:
        ValueError: missing ``local_filename`` or unsupported mimetype
        FileNotFoundError: ``local_filename`` does not exist on disk
    """
    local = input_file.local_filename
    # Fail fast with a precise error before dispatching on mimetype.
    if not local:
        raise ValueError("input_file must have 'local_filename' property")
    if not Path(local).exists():
        raise FileNotFoundError("File not found: '%s' (%s)" %
                                (local, input_file))
    mimetype = input_file.mimetype
    if mimetype.startswith('image'):
        return page_from_image(input_file)
    if mimetype == MIMETYPE_PAGE:
        return parse(local, silence=True)
    raise ValueError("Unsupported mimetype '%s'" % mimetype)
Пример #13
0
    def remove_file(self, ID, force=False, keep_file=False, page_recursive=False, page_same_group=False):
        """
        Remove a file from the workspace.

        Arguments:
            ID (string|OcrdFile): ID of the file to delete or the file itself
            force (boolean): Continue removing even if file not found in METS
            keep_file (boolean): Whether to keep files on disk
            page_recursive (boolean): Whether to remove all images referenced in the file if the file is a PAGE-XML document.
            page_same_group (boolean): Remove only images in the same file group as the PAGE-XML. Has no effect unless ``page_recursive`` is ``True``.

        Returns:
            The OcrdFile (or list of OcrdFiles) removed from the METS, or
            None when the file was not found and ``force`` was set.
        """
        log = getLogger('ocrd.workspace.remove_file')
        log.debug('Deleting mets:file %s', ID)
        # overwrite_mode implies force
        if not force and self.overwrite_mode:
            force = True
        if isinstance(ID, OcrdFile):
            ID = ID.ID
        try:
            # mets.remove_file may return a single OcrdFile or a list of them
            ocrd_file_ = self.mets.remove_file(ID)
            ocrd_files = [ocrd_file_] if isinstance(ocrd_file_, OcrdFile) else ocrd_file_
            if page_recursive:
                # Recurse into every AlternativeImage referenced by the PAGE-XML
                with pushd_popd(self.directory):
                    for ocrd_file in ocrd_files:
                        if ocrd_file.mimetype != MIMETYPE_PAGE:
                            continue
                        # NOTE(review): download_file may fetch a non-local
                        # file just to delete its references — confirm this is
                        # intended on the removal path.
                        ocrd_page = parse(self.download_file(ocrd_file).local_filename, silence=True)
                        for img_url in ocrd_page.get_AllAlternativeImagePaths():
                            img_kwargs = {'url': img_url}
                            if page_same_group:
                                # restrict matches to the PAGE file's own fileGrp
                                img_kwargs['fileGrp'] = ocrd_file.fileGrp
                            for img_file in self.mets.find_files(**img_kwargs):
                                self.remove_file(img_file, keep_file=keep_file, force=force)
            if not keep_file:
                # Delete the payload(s) from disk as well
                with pushd_popd(self.directory):
                    for ocrd_file in ocrd_files:
                        if not ocrd_file.local_filename:
                            log.warning("File not locally available %s", ocrd_file)
                            if not force:
                                raise Exception("File not locally available %s" % ocrd_file)
                        else:
                            log.info("rm %s [cwd=%s]", ocrd_file.local_filename, self.directory)
                            unlink(ocrd_file.local_filename)
            return ocrd_file_
        except FileNotFoundError as e:
            # File absent from the METS: swallow only when force is set
            if not force:
                raise e
Пример #14
0
    def test_validate_multi_textequiv_first(self):
        """With the 'first' strategy the TextEquiv carrying index=0 wins."""
        ocrd_page = parse(assets.path_to('kant_aufklaerung_1784/data/OCR-D-GT-PAGE/PAGE_0020_PAGE.xml'), silence=True)
        report = PageValidator.validate(ocrd_page=ocrd_page)
        consistency_errors = [e for e in report.errors if isinstance(e, ConsistencyError)]
        self.assertEqual(len(consistency_errors), 25, '25 textequiv consistency errors - strict')

        word = ocrd_page.get_Page().get_TextRegion()[0].get_TextLine()[0].get_Word()[1]

        # start from a clean slate: drop every TextEquiv of the word
        word.set_TextEquiv([])

        # set_text creates the first TextEquiv; then add two more, the last
        # one explicitly carrying index=0
        set_text(word, 'FOO', 'first')
        word.add_TextEquiv(TextEquivType(Unicode='BAR', conf=.7))
        word.add_TextEquiv(TextEquivType(Unicode='BAZ', conf=.5, index=0))
        self.assertEqual(get_text(word, 'first'), 'BAZ')
        set_text(word, 'XYZ', 'first')
        self.assertEqual(get_text(word, 'first'), 'XYZ')
Пример #15
0
    def process(self):
        """Score the text of every input page and write the result back as a
        new PAGE-XML file in the output file group."""
        for (n, input_file) in enumerate(self.input_files):
            LOG.info("INPUT FILE %i / %s", n, input_file)
            pcgts = parse(self.workspace.download_file(input_file).url, silence=True)
            LOG.info("Scoring text in page '%s' at the %s level",
                     pcgts.get_pcGtsId(), self.parameter['textequiv_level'])
            self._process_page(pcgts)

            # write back result
            file_id = concat_padded(self.output_file_grp, n)
            self.workspace.add_file(
                ID=file_id,
                file_grp=self.output_file_grp,
                local_filename=os.path.join(self.output_file_grp, file_id),
                mimetype=MIMETYPE_PAGE,
                content=to_xml(pcgts),
            )
Пример #16
0
 def __init__(self,
              *,
              alto_version='4.2',
              check_words=True,
              check_border=True,
              skip_empty_lines=False,
              trailing_dash_to_hyp=False,
              textequiv_index=0,
              textequiv_fallback_strategy='last',
              region_order='document',
              page_filename=None,
              dummy_textline=True,
              dummy_word=True,
              page_etree=None,
              pcgts=None,
              logger=None):
     """
     Keyword Args:
         alto_version (string): Version of ALTO-XML schema to produce (older versions may not preserve all features)
         check_words (boolean): Whether to check if PAGE-XML contains any words before conversion and fail if not
         check_border (boolean): Whether to abort if neither Border nor PrintSpace is defined
         skip_empty_lines (boolean): Whether to omit empty lines completely (True) or create a placeholder empty String in ALTO (False)
         trailing_dash_to_hyp (boolean): Whether to add a <HYP/> element if the last word in a line ends in ``-`` etc
         textequiv_index (int): @index of the TextEquiv to choose
         textequiv_fallback_strategy ("raise"|"first"|"last"): Strategy to handle case of no matchin TextEquiv by textequiv_index
         region_order ("document"|"reading-order"|"reading-order-only"): The order in which to iterate over regions.
         page_filename (string): Path of a PAGE-XML file to convert (one of page_filename/page_etree/pcgts is required)
         page_etree: Parsed PAGE-XML element tree to convert (alternative to page_filename/pcgts)
         pcgts: Already-deserialized PAGE-XML object model to convert (alternative to page_filename/page_etree)
         dummy_textline (boolean): Whether to create a TextLine for regions that have TextEquiv/Unicode but no TextLine
         dummy_word (boolean): Whether to create a Word for TextLine that have TextEquiv/Unicode but no Word
         logger: Logger instance to use; defaults to the 'page-to-alto' logger
     """
     # Exactly one of the three input forms is required.
     if not (page_filename or page_etree or pcgts):
         raise ValueError(
             "Must pass either pcgts, page_etree or page_filename to constructor"
         )
     if alto_version not in XSD_ALTO_URLS:
         raise ValueError("Converting to ALTO-XML v%s is not supported" %
                          alto_version)
     self.alto_version = alto_version
     self.skip_empty_lines = skip_empty_lines
     self.trailing_dash_to_hyp = trailing_dash_to_hyp
     self.dummy_textline = dummy_textline
     self.region_order = region_order
     self.dummy_word = dummy_word
     self.logger = logger if logger else getLogger('page-to-alto')
     # Normalize the three possible inputs to a PcGts object model.
     if pcgts:
         self.page_pcgts = pcgts
     elif page_etree:
         # round-trip through serialization to obtain the object model
         self.page_pcgts = parseString(ET.tostring(page_etree))
     else:
         self.page_pcgts = parse(page_filename)
     self.page_page = self.page_pcgts.get_Page()
     if check_words:
         self.check_words()
     if check_border:
         # check_border operates on the raw XML tree, so serialize once more
         tree = ET.fromstring(to_xml(self.page_pcgts).encode('utf-8'))
         self.check_border(tree)
     # NOTE(review): textequiv_index/textequiv_fallback_strategy are stored
     # only after check_words()/check_border() ran — confirm those methods
     # do not depend on them.
     self.textequiv_index = textequiv_index
     self.textequiv_fallback_strategy = textequiv_fallback_strategy
     # Pre-build the ALTO skeleton and the style/tag managers.
     self.alto_alto, self.alto_description, self.alto_styles, self.alto_tags, self.alto_page = self.create_alto(
     )
     self.alto_printspace = self.convert_border()
     self.textstyle_mgr = TextStylesManager(self.alto_version)
     self.parastyle_mgr = ParagraphStyleManager(self.alto_version)
     self.layouttag_mgr = LayoutTagManager(self.alto_version)
Пример #17
0
def page2tsv(page_xml_file, tsv_out_file, purpose, image_url, ner_rest_endpoint, ned_rest_endpoint,
             noproxy, scale_factor, ned_threshold, min_confidence, max_confidence, ned_priority):
    """
    Convert a PAGE-XML file into rows of a TSV file, optionally enriched
    with NER/NED annotations.

    Rows are appended to ``tsv_out_file`` (the file is created with a header
    first if it does not exist yet); a ``# <image_url>`` comment line marks
    the start of this page's rows. All pixel coordinates are multiplied by
    ``scale_factor``.

    Arguments:
        page_xml_file: path to the PAGE-XML input
        tsv_out_file: path of the TSV file to create or append to
        purpose: 'NERD' (token rows for entity tagging) or 'OCR' (one text
            row per line); anything else raises RuntimeError
        image_url: URL of the page image, written as a comment line
        ner_rest_endpoint: REST endpoint for named-entity recognition
            (used only for 'NERD'; may be None)
        ned_rest_endpoint: REST endpoint for named-entity disambiguation
            (may be None)
        noproxy: if truthy, set no_proxy='*' in the environment
        scale_factor: multiplier applied to all coordinates
        ned_threshold: threshold forwarded to ned()
        min_confidence, max_confidence: when both are set, an 'ocrconf'
            color column is derived from per-line confidences
        ned_priority: priority forwarded to ned()
    """
    if purpose == "NERD":
        out_columns = ['No.', 'TOKEN', 'NE-TAG', 'NE-EMB', 'ID', 'url_id', 'left', 'right', 'top', 'bottom', 'conf']
    elif purpose == "OCR":
        out_columns = ['TEXT', 'url_id', 'left', 'right', 'top', 'bottom', 'conf', 'line_id']
        if min_confidence is not None and max_confidence is not None:
            out_columns += ['ocrconf']
    else:
        raise RuntimeError("Unknown purpose.")

    if noproxy:
        os.environ['no_proxy'] = '*'

    # url_id of the new rows = number of '# <url>' comment lines already in
    # the output file (each previously processed page added one).
    urls = []
    if os.path.exists(tsv_out_file):
        parts = extract_doc_links(tsv_out_file)
        urls = [part['url'] for part in parts]
    else:
        # fresh file: write the header row only
        pd.DataFrame([], columns=out_columns).to_csv(tsv_out_file, sep="\t", quoting=3, index=False)

    pcgts = parse(page_xml_file)
    tsv = []
    line_info = []

    for region_idx, region in enumerate(pcgts.get_Page().get_AllRegions(classes=['Text'], order='reading-order')):
        for text_line in region.get_TextLine():

            left, top, right, bottom = [int(scale_factor * x) for x in bbox_from_points(text_line.get_Coords().points)]

            if min_confidence is not None and max_confidence is not None:
                # best OCR confidence among the line's TextEquivs
                conf = np.max([textequiv.conf for textequiv in text_line.get_TextEquiv()])
            else:
                conf = np.nan

            line_info.append((len(urls), left, right, top, bottom, conf, text_line.id))

            words = [word for word in text_line.get_Word()]

            # Emit one row per word when words exist, else one per line-level
            # TextEquiv; 'hcenter' (horizontal center) is used for sorting.
            if len(words) <= 0:
                for text_equiv in text_line.get_TextEquiv():
                    # transform OCR coordinates using `scale_factor` to derive
                    # correct coordinates for the web presentation image
                    left, top, right, bottom = [int(scale_factor * x) for x in
                                                bbox_from_points(text_line.get_Coords().points)]

                    tsv.append((region_idx, len(line_info) - 1, left + (right - left) / 2.0,
                                text_equiv.get_Unicode(), len(urls), left, right, top, bottom, text_line.id))
            else:
                for word in words:

                    for text_equiv in word.get_TextEquiv():
                        # transform OCR coordinates using `scale_factor` to derive
                        # correct coordinates for the web presentation image
                        left, top, right, bottom = [int(scale_factor * x) for x in bbox_from_points(word.get_Coords().points)]

                        tsv.append((region_idx, len(line_info) - 1, left + (right - left) / 2.0,
                                    text_equiv.get_Unicode(), len(urls), left, right, top, bottom, text_line.id))

    line_info = pd.DataFrame(line_info, columns=['url_id', 'left', 'right', 'top', 'bottom', 'conf', 'line_id'])

    if min_confidence is not None and max_confidence is not None:
        line_info['ocrconf'] = line_info.conf.map(lambda x: get_conf_color(x, min_confidence, max_confidence))

    tsv = pd.DataFrame(tsv, columns=['rid', 'line', 'hcenter'] +
                                    ['TEXT', 'url_id', 'left', 'right', 'top', 'bottom', 'line_id'])

    if len(tsv) == 0:
        return

    # Mark the start of this page's rows with its image URL.
    with open(tsv_out_file, 'a') as f:

        f.write('# ' + image_url + '\n')

    # Vertical center of each line, used together with 'hcenter' to restore
    # reading order within every region.
    vlinecenter = pd.DataFrame(tsv[['line', 'top']].groupby('line', sort=False).mean().top +
                               (tsv[['line', 'bottom']].groupby('line', sort=False).mean().bottom -
                                tsv[['line', 'top']].groupby('line', sort=False).mean().top) / 2,
                               columns=['vlinecenter'])

    tsv = tsv.merge(vlinecenter, left_on='line', right_index=True)

    # Sort rows top-to-bottom, left-to-right within each region.
    regions = [region.sort_values(['vlinecenter', 'hcenter']) for rid, region in tsv.groupby('rid', sort=False)]

    tsv = pd.concat(regions)

    if purpose == 'NERD':

        # placeholder annotation columns, filled by ner()/ned() below
        tsv['No.'] = 0
        tsv['NE-TAG'] = 'O'
        tsv['NE-EMB'] = 'O'
        tsv['ID'] = '-'
        tsv['conf'] = '-'

        tsv = tsv.rename(columns={'TEXT': 'TOKEN'})
    elif purpose == 'OCR':

        # collapse word rows back into one text row per line
        tsv = pd.DataFrame([(line, " ".join(part.TEXT.to_list())) for line, part in tsv.groupby('line')],
                           columns=['line', 'TEXT'])

        tsv = tsv.merge(line_info, left_on='line', right_index=True)

    tsv = tsv[out_columns].reset_index(drop=True)

    try:
        if purpose == 'NERD' and ner_rest_endpoint is not None:

            tsv, ner_result = ner(tsv, ner_rest_endpoint)

            if ned_rest_endpoint is not None:

                tsv, _ = ned(tsv, ner_result, ned_rest_endpoint, threshold=ned_threshold, priority=ned_priority)

        # append without header — the header was written when the file was created
        tsv.to_csv(tsv_out_file, sep="\t", quoting=3, index=False, mode='a', header=False)
    except requests.HTTPError as e:
        # best-effort: a failing NER/NED service aborts this page's output
        print(e)