def validate(filename=None, ocrd_page=None, ocrd_file=None, strictness='strict', strategy='index1'):
    """
    Validates a PAGE file for consistency by filename, OcrdFile or passing OcrdPage directly.

    Arguments:
        filename (string): Path to PAGE
        ocrd_page (OcrdPage): OcrdPage instance
        ocrd_file (OcrdFile): OcrdFile instance wrapping OcrdPage
        strictness (string): 'strict', 'lax', 'fix' or 'off'
        strategy (string): Currently only 'index1'

    Returns:
        report (:class:`ValidationReport`) Report on the validity
    """
    if ocrd_page:
        validator = PageValidator(ocrd_page, strictness, strategy)
    elif ocrd_file:
        validator = PageValidator(page_from_file(ocrd_file), strictness, strategy)
    elif filename:
        validator = PageValidator(parse(filename, silence=True), strictness, strategy)
    else:
        raise Exception("At least one of ocrd_page, ocrd_file or filename must be set")
    return validator._validate()  # pylint: disable=protected-access
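# A minimal usage sketch for the validator above, assuming it is exposed as
# PageValidator.validate (as the tests in this section call it); the PAGE path
# is a hypothetical placeholder.
report = PageValidator.validate(filename='OCR-D-GT-PAGE/PAGE_0001.xml',
                                strictness='lax', strategy='index1')
print(len(report.errors), 'consistency errors')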
def test_fix(self):
    ocrd_page = parse(FAULTY_GLYPH_PAGE_FILENAME, silence=True)
    report = PageValidator.validate(ocrd_page=ocrd_page)
    self.assertEqual(len([e for e in report.errors if isinstance(e, ConsistencyError)]), 17, '17 textequiv consistency errors')
    PageValidator.validate(ocrd_page=ocrd_page, strictness='fix')
    report = PageValidator.validate(ocrd_page=ocrd_page)
    self.assertEqual(len([e for e in report.errors if isinstance(e, ConsistencyError)]), 0, 'no more textequiv consistency errors')
def test_fix(self):
    ocrd_page = parse(assets.path_to('glyph-consistency/data/OCR-D-GT-PAGE/FAULTY_GLYPHS'), silence=True)
    report = PageValidator.validate(ocrd_page=ocrd_page)
    self.assertEqual(len(report.errors), 17, 'errors')
    PageValidator.validate(ocrd_page=ocrd_page, strictness='fix')
    report = PageValidator.validate(ocrd_page=ocrd_page)
    self.assertEqual(len(report.errors), 0, 'no more errors')
def process(self):
    print(Path(self.parameter['pix2pixHD']).absolute())
    if not torch.cuda.is_available():
        print("Your system has no CUDA installed. No GPU detected.")
        sys.exit(1)
    path = Path(self.parameter['pix2pixHD']).absolute()
    if not Path(path).is_dir():
        print("""\
NVIDIA's pix2pixHD was not found at '%s'. Make sure the `pix2pixHD` parameter
points to the local path to the cloned pix2pixHD repository.

pix2pixHD can be downloaded from https://github.com/NVIDIA/pix2pixHD
""" % path)
        sys.exit(1)
    for (_, input_file) in enumerate(self.input_files):
        local_input_file = self.workspace.download_file(input_file)
        pcgts = parse(local_input_file.url, silence=True)
        image_coords = pcgts.get_Page().get_Border().get_Coords().points.split()
        fname = pcgts.get_Page().imageFilename
        # Get page coordinates
        min_x, min_y = image_coords[0].split(",")
        max_x, max_y = image_coords[2].split(",")
        img_tmp_dir = "OCR-D-IMG/test_A"
        img_dir = os.path.dirname(str(fname))
        # Path of pix2pixHD
        Path(img_tmp_dir).mkdir(parents=True, exist_ok=True)
        crop_region = int(min_x), int(min_y), int(max_x), int(max_y)
        cropped_img = self.crop_image(fname, crop_region)
        base, _ = ocrolib.allsplitext(fname)
        filename = base.split("/")[-1] + ".png"
        cropped_img.save(img_tmp_dir + "/" + filename)
        os.system(
            "python " + str(path) + "/test.py --dataroot %s --checkpoints_dir ./ --name models "
            "--results_dir %s --label_nc 0 --no_instance --no_flip --resize_or_crop none "
            "--n_blocks_global 10 --n_local_enhancers 2 --gpu_ids %s --loadSize %d --fineSize %d "
            "--resize_or_crop %s"
            % (os.path.dirname(img_tmp_dir), img_dir, self.parameter['gpu_id'],
               self.parameter['resizeHeight'], self.parameter['resizeWidth'],
               self.parameter['imgresize']))
        synthesized_image = filename.split(".")[0] + "_synthesized_image.jpg"
        pix2pix_img_dir = img_dir + "/models/test_latest/images/"
        dewarped_image = Path(pix2pix_img_dir + synthesized_image)
        if dewarped_image.is_file():
            shutil.copy(dewarped_image, img_dir + "/" + filename.split(".")[0] + ".dw.jpg")
        if Path(img_tmp_dir).is_dir():
            shutil.rmtree(img_tmp_dir)
        if Path(img_dir + "/models").is_dir():
            shutil.rmtree(img_dir + "/models")
def test_validate_lax(self):
    ocrd_page = parse(assets.path_to('kant_aufklaerung_1784/data/OCR-D-GT-PAGE/PAGE_0020_PAGE'), silence=True)
    # introduce a single word error (not just whitespace inconsistency)
    ocrd_page.get_Page().get_TextRegion()[0].get_TextLine()[0].get_Word()[1].get_TextEquiv()[0].set_Unicode('FOO')
    self.assertEqual(len(PageValidator.validate(ocrd_page=ocrd_page).errors), 26, '26 errors - strict')
    self.assertEqual(len(PageValidator.validate(ocrd_page=ocrd_page, strictness='lax').errors), 1, '1 error - lax')
def test_id(self):
    """
    https://github.com/OCR-D/core/issues/682
    """
    fpath_page = assets.path_to('kant_aufklaerung_1784/data/OCR-D-GT-PAGE/PAGE_0017_PAGE.xml')
    pcgts = parse(fpath_page)
    assert pcgts.id == 'PAGE_0017_PAGE'
    assert pcgts.get_Page().id == 'OCR-D-IMG/INPUT_0017.tif'
def test_validate_lax(self):
    ocrd_page = parse(assets.path_to('kant_aufklaerung_1784/data/OCR-D-GT-PAGE/PAGE_0020_PAGE.xml'), silence=True)
    # introduce a single word error (not just whitespace inconsistency)
    ocrd_page.get_Page().get_TextRegion()[0].get_TextLine()[0].get_Word()[1].get_TextEquiv()[0].set_Unicode('FOO')
    report = PageValidator.validate(ocrd_page=ocrd_page)
    self.assertEqual(len([e for e in report.errors if isinstance(e, ConsistencyError)]), 26, '26 textequiv consistency errors - strict')
    report = PageValidator.validate(ocrd_page=ocrd_page, strictness='lax')
    self.assertEqual(len([e for e in report.errors if isinstance(e, ConsistencyError)]), 1, '1 textequiv consistency error - lax')
def page_from_file(input_file):
    """
    Create a new PAGE-XML from a METS file representing a PAGE-XML or an image.

    Arguments:
        * input_file (OcrdFile):
    """
    if input_file.mimetype.startswith('image'):
        return page_from_image(input_file)
    if input_file.mimetype == MIMETYPE_PAGE:
        return parse(input_file.local_filename, silence=True)
    raise Exception("Unsupported mimetype '%s'" % input_file.mimetype)
def test_id():
    """
    https://github.com/OCR-D/core/issues/682
    """
    fpath_page = assets.path_to('kant_aufklaerung_1784/data/OCR-D-GT-PAGE/PAGE_0017_PAGE.xml')
    pcgts = parse(fpath_page)
    # assert
    assert pcgts.id == 'PAGE_0017_PAGE'
    # TODO: is this *really* desired?
    # For a single Page element, one would expect the ID to derive from the
    # top-level PcGts container, not from a fileName.
    assert pcgts.get_Page().id == 'OCR-D-IMG/INPUT_0017.tif'
def validate(filename=None, ocrd_page=None, ocrd_file=None,
             page_textequiv_consistency='strict', page_textequiv_strategy='first',
             check_baseline=True, check_coords=True):
    """
    Validates a PAGE file for consistency by filename, OcrdFile or passing OcrdPage directly.

    Arguments:
        filename (string): Path to PAGE
        ocrd_page (OcrdPage): OcrdPage instance
        ocrd_file (OcrdFile): OcrdFile instance wrapping OcrdPage
        page_textequiv_consistency (string): 'strict', 'lax', 'fix' or 'off'
        page_textequiv_strategy (string): Currently only 'first'
        check_baseline (bool): whether Baseline must be fully within TextLine/Coords
        check_coords (bool): whether *Region/TextLine/Word/Glyph must each be fully
            contained within Border/*Region/TextLine/Word, resp.

    Returns:
        report (:class:`ValidationReport`) Report on the validity
    """
    log = getLogger('ocrd.page_validator.validate')
    if ocrd_page:
        page = ocrd_page
        file_id = ocrd_page.get_pcGtsId()
    elif ocrd_file:
        page = page_from_file(ocrd_file)
        file_id = ocrd_file.ID
    elif filename:
        page = parse(filename, silence=True)
        file_id = filename
    else:
        raise Exception("At least one of ocrd_page, ocrd_file or filename must be set")
    if page_textequiv_strategy not in ('first',):
        raise Exception("page_textequiv_strategy %s not implemented" % page_textequiv_strategy)
    if page_textequiv_consistency not in ('strict', 'lax', 'fix', 'off'):
        raise Exception("page_textequiv_consistency level %s not implemented" % page_textequiv_consistency)
    report = ValidationReport()
    log.info("Validating input file '%s'", file_id)
    validate_consistency(page, page_textequiv_consistency, page_textequiv_strategy,
                         check_baseline, check_coords, report, file_id)
    return report
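# A minimal usage sketch for the module-level validate() above, run over PAGE
# files of an OCR-D workspace; the workspace object and the 'OCR-D-OCR' file
# group are assumptions for illustration only.
for ocrd_file in workspace.mets.find_files(fileGrp='OCR-D-OCR', mimetype=MIMETYPE_PAGE):
    report = validate(ocrd_file=ocrd_file, page_textequiv_consistency='lax')
    if report.errors:
        print(ocrd_file.ID, report.errors)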
def test_validate_multi_textequiv(self):
    ocrd_page = parse(assets.path_to('kant_aufklaerung_1784/data/OCR-D-GT-PAGE/PAGE_0020_PAGE'), silence=True)
    self.assertEqual(len(PageValidator.validate(ocrd_page=ocrd_page).errors), 25, '25 errors - strict')
    word = ocrd_page.get_Page().get_TextRegion()[0].get_TextLine()[0].get_Word()[1]
    # delete all textequivs
    del word.get_TextEquiv()[0]
    # Add textequiv
    set_text(word, 'FOO', 'index1')
    word.add_TextEquiv(TextEquivType(Unicode='BAR', conf=.7))
    self.assertEqual(get_text(word, 'index1'), 'FOO')
    set_text(word, 'BAR', 'index1')
    self.assertEqual(get_text(word, 'index1'), 'BAR')
def page_from_file(input_file):
    """
    Create a new PAGE-XML from a METS file representing a PAGE-XML or an image.

    Arguments:
        * input_file (OcrdFile):
    """
    if not input_file.local_filename:
        raise ValueError("input_file must have 'local_filename' property")
    if not Path(input_file.local_filename).exists():
        raise FileNotFoundError("File not found: '%s' (%s)" % (input_file.local_filename, input_file))
    if input_file.mimetype.startswith('image'):
        return page_from_image(input_file)
    if input_file.mimetype == MIMETYPE_PAGE:
        return parse(input_file.local_filename, silence=True)
    raise ValueError("Unsupported mimetype '%s'" % input_file.mimetype)
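# A minimal usage sketch: for each file of a workspace file group, obtain the
# parsed OcrdPage whether the file is PAGE-XML or an image. The workspace
# object and the 'OCR-D-IMG' file group are assumptions for illustration.
for ocrd_file in workspace.mets.find_files(fileGrp='OCR-D-IMG'):
    pcgts = page_from_file(workspace.download_file(ocrd_file))
    print(pcgts.get_Page().imageFilename)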
def remove_file(self, ID, force=False, keep_file=False, page_recursive=False, page_same_group=False):
    """
    Remove a file from the workspace.

    Arguments:
        ID (string|OcrdFile): ID of the file to delete or the file itself
        force (boolean): Continue removing even if file not found in METS
        keep_file (boolean): Whether to keep files on disk
        page_recursive (boolean): Whether to remove all images referenced in the file
            if the file is a PAGE-XML document.
        page_same_group (boolean): Remove only images in the same file group as the PAGE-XML.
            Has no effect unless ``page_recursive`` is ``True``.
    """
    log = getLogger('ocrd.workspace.remove_file')
    log.debug('Deleting mets:file %s', ID)
    if not force and self.overwrite_mode:
        force = True
    if isinstance(ID, OcrdFile):
        ID = ID.ID
    try:
        ocrd_file_ = self.mets.remove_file(ID)
        ocrd_files = [ocrd_file_] if isinstance(ocrd_file_, OcrdFile) else ocrd_file_
        if page_recursive:
            with pushd_popd(self.directory):
                for ocrd_file in ocrd_files:
                    if ocrd_file.mimetype != MIMETYPE_PAGE:
                        continue
                    ocrd_page = parse(self.download_file(ocrd_file).local_filename, silence=True)
                    for img_url in ocrd_page.get_AllAlternativeImagePaths():
                        img_kwargs = {'url': img_url}
                        if page_same_group:
                            img_kwargs['fileGrp'] = ocrd_file.fileGrp
                        for img_file in self.mets.find_files(**img_kwargs):
                            self.remove_file(img_file, keep_file=keep_file, force=force)
        if not keep_file:
            with pushd_popd(self.directory):
                for ocrd_file in ocrd_files:
                    if not ocrd_file.local_filename:
                        log.warning("File not locally available %s", ocrd_file)
                        if not force:
                            raise Exception("File not locally available %s" % ocrd_file)
                    else:
                        log.info("rm %s [cwd=%s]", ocrd_file.local_filename, self.directory)
                        unlink(ocrd_file.local_filename)
        return ocrd_file_
    except FileNotFoundError as e:
        if not force:
            raise e
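# A minimal usage sketch (hypothetical file ID and workspace): remove a PAGE-XML
# file together with all derived images it references, restricted to images in
# the same file group as the PAGE-XML.
workspace.remove_file('OCR-D-BIN_0001', page_recursive=True, page_same_group=True)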
def test_validate_multi_textequiv_first(self):
    ocrd_page = parse(assets.path_to('kant_aufklaerung_1784/data/OCR-D-GT-PAGE/PAGE_0020_PAGE.xml'), silence=True)
    report = PageValidator.validate(ocrd_page=ocrd_page)
    self.assertEqual(len([e for e in report.errors if isinstance(e, ConsistencyError)]), 25, '25 textequiv consistency errors - strict')
    word = ocrd_page.get_Page().get_TextRegion()[0].get_TextLine()[0].get_Word()[1]
    # delete all textequivs
    word.set_TextEquiv([])
    # Add textequiv
    set_text(word, 'FOO', 'first')
    word.add_TextEquiv(TextEquivType(Unicode='BAR', conf=.7))
    word.add_TextEquiv(TextEquivType(Unicode='BAZ', conf=.5, index=0))
    self.assertEqual(get_text(word, 'first'), 'BAZ')
    set_text(word, 'XYZ', 'first')
    self.assertEqual(get_text(word, 'first'), 'XYZ')
def process(self):
    for (n, input_file) in enumerate(self.input_files):
        LOG.info("INPUT FILE %i / %s", n, input_file)
        local_input_file = self.workspace.download_file(input_file)
        pcgts = parse(local_input_file.url, silence=True)
        LOG.info("Scoring text in page '%s' at the %s level",
                 pcgts.get_pcGtsId(), self.parameter['textequiv_level'])
        self._process_page(pcgts)
        # write back result
        file_id = concat_padded(self.output_file_grp, n)
        self.workspace.add_file(
            ID=file_id,
            file_grp=self.output_file_grp,
            local_filename=os.path.join(self.output_file_grp, file_id),
            mimetype=MIMETYPE_PAGE,
            content=to_xml(pcgts),
        )
def __init__(self, *,
             alto_version='4.2',
             check_words=True,
             check_border=True,
             skip_empty_lines=False,
             trailing_dash_to_hyp=False,
             textequiv_index=0,
             textequiv_fallback_strategy='last',
             region_order='document',
             page_filename=None,
             dummy_textline=True,
             dummy_word=True,
             page_etree=None,
             pcgts=None,
             logger=None):
    """
    Keyword Args:
        alto_version (string): Version of the ALTO-XML schema to produce (older versions may not preserve all features)
        check_words (boolean): Whether to check if the PAGE-XML contains any words before conversion and fail if not
        check_border (boolean): Whether to abort if neither Border nor PrintSpace is defined
        skip_empty_lines (boolean): Whether to omit empty lines completely (True) or create a placeholder empty String in ALTO (False)
        trailing_dash_to_hyp (boolean): Whether to add a <HYP/> element if the last word in a line ends in ``-`` etc.
        textequiv_index (int): @index of the TextEquiv to choose
        textequiv_fallback_strategy ("raise"|"first"|"last"): Strategy to handle the case of no matching TextEquiv by textequiv_index
        region_order ("document"|"reading-order"|"reading-order-only"): The order in which to iterate over regions
        dummy_textline (boolean): Whether to create a TextLine for regions that have TextEquiv/Unicode but no TextLine
        dummy_word (boolean): Whether to create a Word for TextLines that have TextEquiv/Unicode but no Word
    """
    if not (page_filename or page_etree or pcgts):
        raise ValueError("Must pass either pcgts, page_etree or page_filename to constructor")
    if alto_version not in XSD_ALTO_URLS:
        raise ValueError("Converting to ALTO-XML v%s is not supported" % alto_version)
    self.alto_version = alto_version
    self.skip_empty_lines = skip_empty_lines
    self.trailing_dash_to_hyp = trailing_dash_to_hyp
    self.dummy_textline = dummy_textline
    self.region_order = region_order
    self.dummy_word = dummy_word
    self.logger = logger if logger else getLogger('page-to-alto')
    if pcgts:
        self.page_pcgts = pcgts
    elif page_etree:
        self.page_pcgts = parseString(ET.tostring(page_etree))
    else:
        self.page_pcgts = parse(page_filename)
    self.page_page = self.page_pcgts.get_Page()
    if check_words:
        self.check_words()
    if check_border:
        tree = ET.fromstring(to_xml(self.page_pcgts).encode('utf-8'))
        self.check_border(tree)
    self.textequiv_index = textequiv_index
    self.textequiv_fallback_strategy = textequiv_fallback_strategy
    self.alto_alto, self.alto_description, self.alto_styles, self.alto_tags, self.alto_page = self.create_alto()
    self.alto_printspace = self.convert_border()
    self.textstyle_mgr = TextStylesManager(self.alto_version)
    self.parastyle_mgr = ParagraphStyleManager(self.alto_version)
    self.layouttag_mgr = LayoutTagManager(self.alto_version)
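# A minimal usage sketch. The class name (OcrdPageAltoConverter, as in the
# page-to-alto project) and its convert() method are assumptions not shown in
# this snippet; the PAGE path is a hypothetical placeholder.
converter = OcrdPageAltoConverter(page_filename='OCR-D-GT-PAGE/PAGE_0001.xml',
                                  alto_version='4.2', skip_empty_lines=True)
print(converter.convert())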
def page2tsv(page_xml_file, tsv_out_file, purpose, image_url, ner_rest_endpoint, ned_rest_endpoint,
             noproxy, scale_factor, ned_threshold, min_confidence, max_confidence, ned_priority):
    if purpose == "NERD":
        out_columns = ['No.', 'TOKEN', 'NE-TAG', 'NE-EMB', 'ID', 'url_id', 'left', 'right', 'top', 'bottom', 'conf']
    elif purpose == "OCR":
        out_columns = ['TEXT', 'url_id', 'left', 'right', 'top', 'bottom', 'conf', 'line_id']
        if min_confidence is not None and max_confidence is not None:
            out_columns += ['ocrconf']
    else:
        raise RuntimeError("Unknown purpose.")

    if noproxy:
        os.environ['no_proxy'] = '*'

    urls = []
    if os.path.exists(tsv_out_file):
        parts = extract_doc_links(tsv_out_file)
        urls = [part['url'] for part in parts]
    else:
        pd.DataFrame([], columns=out_columns).to_csv(tsv_out_file, sep="\t", quoting=3, index=False)

    pcgts = parse(page_xml_file)
    tsv = []
    line_info = []

    for region_idx, region in enumerate(pcgts.get_Page().get_AllRegions(classes=['Text'], order='reading-order')):
        for text_line in region.get_TextLine():
            left, top, right, bottom = [int(scale_factor * x) for x in bbox_from_points(text_line.get_Coords().points)]

            if min_confidence is not None and max_confidence is not None:
                conf = np.max([textequiv.conf for textequiv in text_line.get_TextEquiv()])
            else:
                conf = np.nan

            line_info.append((len(urls), left, right, top, bottom, conf, text_line.id))

            words = [word for word in text_line.get_Word()]
            if len(words) <= 0:
                for text_equiv in text_line.get_TextEquiv():
                    # transform OCR coordinates using `scale_factor` to derive
                    # correct coordinates for the web presentation image
                    left, top, right, bottom = [int(scale_factor * x) for x in
                                                bbox_from_points(text_line.get_Coords().points)]
                    tsv.append((region_idx, len(line_info) - 1, left + (right - left) / 2.0,
                                text_equiv.get_Unicode(), len(urls), left, right, top, bottom, text_line.id))
            else:
                for word in words:
                    for text_equiv in word.get_TextEquiv():
                        # transform OCR coordinates using `scale_factor` to derive
                        # correct coordinates for the web presentation image
                        left, top, right, bottom = [int(scale_factor * x) for x in
                                                    bbox_from_points(word.get_Coords().points)]
                        tsv.append((region_idx, len(line_info) - 1, left + (right - left) / 2.0,
                                    text_equiv.get_Unicode(), len(urls), left, right, top, bottom, text_line.id))

    line_info = pd.DataFrame(line_info, columns=['url_id', 'left', 'right', 'top', 'bottom', 'conf', 'line_id'])

    if min_confidence is not None and max_confidence is not None:
        line_info['ocrconf'] = line_info.conf.map(lambda x: get_conf_color(x, min_confidence, max_confidence))

    tsv = pd.DataFrame(tsv, columns=['rid', 'line', 'hcenter'] +
                       ['TEXT', 'url_id', 'left', 'right', 'top', 'bottom', 'line_id'])

    if len(tsv) == 0:
        return

    with open(tsv_out_file, 'a') as f:
        f.write('# ' + image_url + '\n')

    vlinecenter = pd.DataFrame(tsv[['line', 'top']].groupby('line', sort=False).mean().top +
                               (tsv[['line', 'bottom']].groupby('line', sort=False).mean().bottom -
                                tsv[['line', 'top']].groupby('line', sort=False).mean().top) / 2,
                               columns=['vlinecenter'])

    tsv = tsv.merge(vlinecenter, left_on='line', right_index=True)

    regions = [region.sort_values(['vlinecenter', 'hcenter']) for rid, region in tsv.groupby('rid', sort=False)]

    tsv = pd.concat(regions)

    if purpose == 'NERD':
        tsv['No.'] = 0
        tsv['NE-TAG'] = 'O'
        tsv['NE-EMB'] = 'O'
        tsv['ID'] = '-'
        tsv['conf'] = '-'
        tsv = tsv.rename(columns={'TEXT': 'TOKEN'})
    elif purpose == 'OCR':
        tsv = pd.DataFrame([(line, " ".join(part.TEXT.to_list())) for line, part in tsv.groupby('line')],
                           columns=['line', 'TEXT'])
        tsv = tsv.merge(line_info, left_on='line', right_index=True)

    tsv = tsv[out_columns].reset_index(drop=True)

    try:
        if purpose == 'NERD' and ner_rest_endpoint is not None:
            tsv, ner_result = ner(tsv, ner_rest_endpoint)
            if ned_rest_endpoint is not None:
                tsv, _ = ned(tsv, ner_result, ned_rest_endpoint, threshold=ned_threshold, priority=ned_priority)

        tsv.to_csv(tsv_out_file, sep="\t", quoting=3, index=False, mode='a', header=False)
    except requests.HTTPError as e:
        print(e)
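# A minimal usage sketch with hypothetical paths and values; the NER/NED REST
# endpoints are left disabled (None) so only the plain OCR TSV is written.
page2tsv('OCR-D-GT-PAGE/PAGE_0001.xml', 'PAGE_0001.tsv', purpose='OCR',
         image_url='http://example.org/images/PAGE_0001.jpg',
         ner_rest_endpoint=None, ned_rest_endpoint=None,
         noproxy=True, scale_factor=1.0, ned_threshold=None,
         min_confidence=None, max_confidence=None, ned_priority=None)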