def test_overwrite(self):
    """validate_tasks must reject a pre-existing output fileGrp unless overwrite=True."""
    resolver = Resolver()
    with TemporaryDirectory() as tempdir:
        workspace = resolver.workspace_from_url(
            assets.path_to('kant_aufklaerung_1784/data/mets.xml'),
            dst_dir=tempdir)
        # make OCR-D-SEG-WORD pre-exist in the METS so step 3's output collides
        workspace.mets.add_file('OCR-D-SEG-WORD', url='foo/bar', ID='foo', pageId='page1', mimetype='image/tif')
        # the same chain is validated twice below; parse fresh each time
        # (DRY fix: the task-string list was duplicated verbatim before)
        task_strs = [
            "sample-processor -I OCR-D-IMG -O OCR-D-SEG-BLOCK",
            "sample-processor -I OCR-D-SEG-BLOCK -O OCR-D-SEG-LINE",
            "sample-processor -I OCR-D-SEG-LINE -O OCR-D-SEG-WORD",
            "sample-processor -I OCR-D-SEG-WORD -O OCR-D-OCR-TESS",
        ]
        # should fail at step 3
        with self.assertRaisesRegex(
                Exception,
                r"Invalid task sequence input/output file groups: \[\"Output fileGrp\[@USE='OCR-D-SEG-WORD'\] already in METS!\"\]"):
            validate_tasks([ProcessorTask.parse(x) for x in task_strs], workspace)
        # should succeed b/c overwrite
        validate_tasks([ProcessorTask.parse(x) for x in task_strs], workspace, overwrite=True)
def test_validate_ocrd_file(self):
    """The known-faulty glyphs file must yield exactly 17 consistency errors."""
    ws = Resolver().workspace_from_url(assets.url_of('glyph-consistency/data/mets.xml'))
    # validate from inside the workspace so relative file paths resolve
    with pushd_popd(ws.directory):
        faulty = ws.mets.find_all_files(ID="FAULTY_GLYPHS_FILE")[0]
        report = PageValidator.validate(ocrd_file=faulty)
        consistency_errors = [err for err in report.errors if isinstance(err, ConsistencyError)]
        self.assertEqual(len(consistency_errors), 17, '17 textequiv consistency errors')
def test_validate_sequence(self):
    """Task sequences that reference unknown input fileGrps must be rejected."""
    resolver = Resolver()
    with TemporaryDirectory() as tempdir:
        workspace = resolver.workspace_from_url(
            assets.path_to('kant_aufklaerung_1784/data/mets.xml'),
            dst_dir=tempdir)
        params_path = Path(tempdir, 'params.json')
        params_path.write_text('{"param1": true}')
        # 'FOO' is neither in the METS nor produced by the first step
        with self.assertRaisesRegex(
                Exception,
                "Input file group not contained in METS or produced by previous steps: FOO'"):
            validate_tasks([ProcessorTask.parse(task) for task in (
                f'{SAMPLE_NAME_REQUIRED_PARAM} -I OCR-D-IMG -O OUT1 -p {params_path}',
                f'{SAMPLE_NAME_REQUIRED_PARAM} -I FOO -O OUT2 -p {params_path}',
            )], workspace)
        # 'IN' does not exist in the METS at all
        with self.assertRaisesRegex(Exception, "Input fileGrp.@USE='IN'. not in METS!"):
            validate_tasks([ProcessorTask.parse(
                f'{SAMPLE_NAME_REQUIRED_PARAM} -I IN -O OUT1 -p {params_path}'
            )], workspace)
def bashlib_input_files(**kwargs):
    """
    List input files for processing

    Instantiate a processor and workspace from the given processing options.
    Then loop through the input files of the input fileGrp, and for each one,
    print its `url`, `ID`, `mimetype` and `pageId`, as well as its recommended
    `outputFileId` (from ``make_file_id``).

    (The printing format is one associative array initializer per line.)
    """
    initLogging()
    mets = kwargs.pop('mets')
    working_dir = kwargs.pop('working_dir')
    if is_local_filename(mets) and not isfile(get_local_filename(mets)):
        raise Exception("File does not exist: %s" % mets)
    workspace = Resolver().workspace_from_url(mets, working_dir)
    processor = Processor(
        workspace,
        ocrd_tool=None,
        page_id=kwargs['page_id'],
        input_file_grp=kwargs['input_file_grp'],
        output_file_grp=kwargs['output_file_grp'])
    for input_file in processor.input_files:
        # emit one bash associative-array initializer per input file
        fields = ' '.join(
            "[%s]='%s'" % (name, getattr(input_file, name))
            for name in ('url', 'ID', 'mimetype', 'pageId'))
        print(fields, "[outputFileId]='%s'" % make_file_id(input_file, kwargs['output_file_grp']))
class TestResolver(TestCase):
    """Tests for workspace creation and unpacking via the Resolver."""

    def setUp(self):
        """Start every test from a fresh copy of the Kant fixture in TMP_FOLDER."""
        self.resolver = Resolver()
        self.folder = os.path.join(TMP_FOLDER, 'kant_aufklaerung_1784')
        if os.path.exists(TMP_FOLDER):
            rmtree(TMP_FOLDER)
        os.makedirs(TMP_FOLDER)
        copytree(FOLDER_KANT, self.folder)

    def test_workspace_from_url(self):
        """Downloading the first OCR-D-IMG file yields the expected file ID."""
        workspace = self.resolver.workspace_from_url(METS_HEROLD)
        first_image = workspace.mets.find_files(fileGrp='OCR-D-IMG')[0]
        downloaded = workspace.download_file(first_image)
        self.assertEqual(downloaded.ID, 'FILE_0001_IMAGE')

    def test_unpack_workspace(self):
        """Unpacking the test ZIP yields a workspace with two downloadable TIFFs."""
        workspace = self.resolver.unpack_workspace_from_filename(TEST_ZIP)
        tiffs = workspace.mets.find_files(mimetype='image/tiff')
        self.assertEqual(len(tiffs), 2, '2 TIF')
        for tiff in tiffs:
            workspace.download_file(tiff)
        print([OcrdExif.from_filename(tiff.local_filename).to_xml() for tiff in tiffs])
def ocrd_cli_wrap_processor(processorClass, ocrd_tool=None, mets=None, working_dir=None, dump_json=False, version=False, **kwargs):
    """Dispatch a processor CLI call: dump its JSON, print its version, or run it."""
    if dump_json:
        processorClass(workspace=None, dump_json=True)
        return
    if version:
        p = processorClass(workspace=None)
        print("Version %s, ocrd/core %s" % (p.version, OCRD_VERSION))
        return
    if mets is None:
        raise Exception('Error: Missing option "-m" / "--mets".')
    # normalize a plain path to an absolute file:// URL
    if '://' not in mets:
        mets = 'file://' + os.path.abspath(mets)
    # for local URLs, fail early if the METS is missing
    if mets.startswith('file://') and not os.path.exists(mets[len('file://'):]):
        raise Exception("File does not exist: %s" % mets)
    resolver = Resolver()
    workspace = resolver.workspace_from_url(mets, working_dir)
    run_processor(processorClass, ocrd_tool, mets, workspace=workspace, **kwargs)
def test_workspace_from_url_kant(mock_request, tmp_path):
    """Cloning a remote METS stores it once under the requested basename."""
    # arrange
    url_src = 'https://raw.githubusercontent.com/OCR-D/assets/master/data/kant_aufklaerung_1784/data/mets.xml'
    mock_request.side_effect = request_behavior
    dst_dir = tmp_path / 'workspace_kant'
    dst_dir.mkdir()

    # act
    Resolver().workspace_from_url(url_src, mets_basename='foo.xml', dst_dir=dst_dir)

    # assert: the METS landed under the requested name ...
    assert (dst_dir / 'foo.xml').is_file()
    # ... and only the METS itself was requested
    assert mock_request.call_count == 1
def test_validate_ocrd_file(self):
    """Validating the faulty glyphs file must yield exactly 17 errors."""
    workspace = Resolver().workspace_from_url(assets.url_of('glyph-consistency/data/mets.xml'))
    faulty = workspace.mets.find_files(ID="FAULTY_GLYPHS_FILE")[0]
    # ensure the PAGE-XML is available locally before validating
    if not faulty.local_filename:
        workspace.download_file(faulty)
    report = PageValidator.validate(ocrd_file=faulty)
    self.assertEqual(len(report.errors), 17, 'errors')
def _fixture_workspace_sample_features(tmp_path):
    """Yield a workspace over a copy of the sample-features data, cwd inside it."""
    copytree('tests/data/sample-features', str(tmp_path))
    workspace = Resolver().workspace_from_url(join(str(tmp_path), 'mets.xml'))
    saved_cwd = abspath(curdir)
    chdir(tmp_path)
    yield workspace
    # restore the working directory for subsequent tests
    chdir(saved_cwd)
def _fixture_workspace_gutachten_data(tmp_path):
    """Yield a workspace over a copy of the gutachten data, cwd inside it."""
    copytree(assets.path_to('gutachten/data'), str(tmp_path))
    workspace = Resolver().workspace_from_url(join(str(tmp_path), 'mets.xml'))
    saved_cwd = abspath(curdir)
    chdir(tmp_path)
    yield workspace
    # restore the working directory for subsequent tests
    chdir(saved_cwd)
def runTest(self):
    """Region segmentation on the small Herold workspace runs to completion."""
    workspace = Resolver().workspace_from_url(METS_HEROLD_SMALL, dst_dir=WORKSPACE_DIR)
    segmenter = TesserocrSegmentRegion(
        workspace,
        input_file_grp="OCR-D-IMG",
        output_file_grp="OCR-D-SEG-BLOCK")
    segmenter.process()
    workspace.save_mets()
def test_validate_ocrd_file(self):
    """The glyph-consistency fixture must produce exactly 17 validation errors."""
    workspace = Resolver().workspace_from_url(assets.url_of('glyph-consistency/data/mets.xml'))
    # validate from inside the workspace so relative file paths resolve
    with pushd_popd(workspace.directory):
        faulty = workspace.mets.find_files(ID="FAULTY_GLYPHS_FILE")[0]
        report = PageValidator.validate(ocrd_file=faulty)
        self.assertEqual(len(report.errors), 17, 'errors')
def _fixture_workspace_kant_aufklaerung(tmp_path):
    """Yield a workspace over a copy of the Kant data, cwd inside it."""
    copytree(assets.path_to('kant_aufklaerung_1784/data/'), str(tmp_path))
    workspace = Resolver().workspace_from_url(join(tmp_path, 'mets.xml'), src_baseurl=tmp_path)
    saved_cwd = abspath(curdir)
    chdir(tmp_path)
    yield workspace
    # restore the working directory for subsequent tests
    chdir(saved_cwd)
def test_run1(self):
    """Line-level Kraken segmentation completes on the binarized Kant workspace."""
    workspace = Resolver().workspace_from_url(
        assets.url_of('kant_aufklaerung_1784-binarized/data/mets.xml'),
        dst_dir=WORKSPACE_DIR)
    segmenter = KrakenSegment(
        workspace,
        input_file_grp="OCR-D-IMG-BIN",
        output_file_grp="OCR-D-SEG-LINE-KRAKEN",
        parameter={'level-of-operation': 'line'})
    segmenter.process()
    workspace.save_mets()
def test_param_json(self):
    """run_processor drives KrakenOcr end-to-end on the one-file workspace."""
    resolver = Resolver()
    workspace = resolver.workspace_from_url(
        assets.url_of('SBB0000F29300010000/data/mets_one_file.xml'),
        dst_dir=WORKSPACE_DIR)
    run_processor(
        KrakenOcr,
        resolver=resolver,
        workspace=workspace,
        input_file_grp="INPUT",
        output_file_grp="OCR-D-OCR-KRAKEN")
    workspace.save_mets()
def runTest(self):
    """Run region then line segmentation on the small Herold workspace."""
    resolver = Resolver(cache_enabled=True)
    workspace = resolver.workspace_from_url(METS_HEROLD_SMALL, directory=WORKSPACE_DIR)
    steps = (
        TesserocrSegmentRegion(workspace, input_file_grp="INPUT", output_file_grp="OCR-D-SEG-BLOCK"),
        TesserocrSegmentLine(workspace, input_file_grp="OCR-D-SEG-BLOCK", output_file_grp="OCR-D-SEG-LINE"),
    )
    for step in steps:
        step.process()
    # persist results only once, after both steps completed
    workspace.save_mets()
def run_tasks(mets, log_level, page_id, task_strs):
    """Parse, check and sequentially execute processor tasks against a METS file.

    Each task is validated, its input/output fileGrps are checked against the
    current METS state, then its CLI is run; the METS is reloaded afterwards
    and the promised outputs are verified. Raises on any inconsistency or
    non-zero exit code.
    """
    resolver = Resolver()
    workspace = resolver.workspace_from_url(mets)
    logger = getLogger('ocrd.task_sequence')
    tasks = [ProcessorTask.parse(task_str) for task_str in task_strs]
    for task in tasks:
        task.validate()
        # every input fileGrp must already be present in the METS
        for input_file_grp in task.input_file_grps:
            if input_file_grp not in workspace.mets.file_groups:
                raise Exception(
                    "Unmet requirement: expected input file group not contained in mets: %s" % input_file_grp)
        # no output fileGrp may pre-exist
        for output_file_grp in task.output_file_grps:
            if output_file_grp in workspace.mets.file_groups:
                raise Exception(
                    "Conflict: output file group already contained in mets: %s" % output_file_grp)
        logger.info("Start processing task '%s'", task)
        # execute the task's CLI
        returncode = run_cli(
            task.executable, mets, resolver, workspace,
            log_level=log_level,
            page_id=page_id,
            input_file_grp=','.join(task.input_file_grps),
            output_file_grp=','.join(task.output_file_grps),
            parameter=task.parameter_path)
        if returncode != 0:
            raise Exception("%s exited with non-zero return value %s" % (task.executable, returncode))
        logger.info("Finished processing task '%s'", task)
        # the CLI wrote to the METS on disk; pick up its changes
        workspace.reload_mets()
        # every promised output fileGrp must now exist
        for output_file_grp in task.output_file_grps:
            if output_file_grp not in workspace.mets.file_groups:
                raise Exception(
                    "Invalid state: expected output file group not in mets: %s" % output_file_grp)
def test_workspace_from_url_kant_with_resources(mock_request, tmp_path):
    """With download=True all referenced resources are fetched alongside the METS."""
    # arrange
    url_src = 'https://raw.githubusercontent.com/OCR-D/assets/master/data/kant_aufklaerung_1784/data/mets.xml'
    mock_request.side_effect = request_behavior
    dst_dir = tmp_path / 'workspace_kant'
    dst_dir.mkdir()

    # act
    Resolver().workspace_from_url(
        url_src,
        mets_basename='kant_aufklaerung_1784.xml',
        dst_dir=dst_dir,
        download=True)

    # assert: METS plus one image and one PAGE file landed locally
    for expected in (
            dst_dir / 'kant_aufklaerung_1784.xml',
            dst_dir / 'OCR-D-IMG' / 'INPUT_0017.tif',
            dst_dir / 'OCR-D-GT-PAGE' / 'PAGE_0017_PAGE.xml'):
        assert expected.is_file()
    # 1 METS/MODS + 2 images + 4 OCR files = 7 requests
    assert mock_request.call_count == 7
def test_run1(self):
    """Ocropy segmentation of a single page of the binarized Kant workspace."""
    with TemporaryDirectory() as tempdir:
        workspace = Resolver().workspace_from_url(
            assets.path_to('kant_aufklaerung_1784-binarized/data/mets.xml'),
            dst_dir=tempdir)
        segmenter = OcropySegment(
            workspace,
            input_file_grp="OCR-D-IMG-BIN",
            output_file_grp="OCR-D-SEG-OCROPY-TEST",
            page_id='P_0017')
        segmenter.process()
        workspace.save_mets()
def runTest(self):
    """Chain region -> line -> word segmentation on the binarized Kant data."""
    workspace = Resolver().workspace_from_url(
        assets.url_of('kant_aufklaerung_1784-binarized/mets.xml'),
        directory=WORKSPACE_DIR)
    pipeline = (
        (TesserocrSegmentRegion, "OCR-D-IMG", "OCR-D-SEG-BLOCK"),
        (TesserocrSegmentLine, "OCR-D-SEG-BLOCK", "OCR-D-SEG-LINE"),
        (TesserocrSegmentWord, "OCR-D-SEG-LINE", "OCR-D-SEG-WORD"),
    )
    for processor_class, in_grp, out_grp in pipeline:
        processor_class(workspace, input_file_grp=in_grp, output_file_grp=out_grp).process()
    workspace.save_mets()
class TestResolver(TestCase):
    """Tests for Processor construction and verification on a shared workspace."""

    def setUp(self):
        self.resolver = Resolver()
        self.workspace = self.resolver.workspace_from_url(assets.url_of('SBB0000F29300010000/mets.xml'))

    def test_verify(self):
        """DummyProcessor.verify() reports True on a valid workspace."""
        proc = DummyProcessor(self.workspace)
        # fix: assertEquals is a deprecated alias of assertEqual (removed in Python 3.12)
        self.assertEqual(proc.verify(), True)

    def test_json(self):
        """dump_json=True must not raise during construction."""
        DummyProcessor(self.workspace, dump_json=True)

    def test_params(self):
        """A Processor without explicit parameters defaults to an empty dict."""
        proc = Processor(workspace=self.workspace)
        # fix: assertEquals is a deprecated alias of assertEqual (removed in Python 3.12)
        self.assertEqual(proc.parameter, {})
def test_422(self):
    """
    # OCR-D/core#422
    """
    with TemporaryDirectory() as tempdir:
        workspace = Resolver().workspace_from_url(
            assets.path_to('kant_aufklaerung_1784/data/mets.xml'),
            dst_dir=tempdir)
        task_strs = (
            "sample-processor -I OCR-D-IMG -O OCR-D-SEG-BLOCK",
            "sample-processor -I OCR-D-SEG-BLOCK -O OCR-D-SEG-LINE",
            "sample-processor -I OCR-D-SEG-LINE -O OCR-D-SEG-WORD",
            "sample-processor -I OCR-D-SEG-WORD -O OCR-D-OCR-TESS",
        )
        # a valid chain must validate without raising
        validate_tasks([ProcessorTask.parse(task) for task in task_strs], workspace)
def run_tasks(mets, log_level, page_id, task_strs, overwrite=False):
    """Parse, validate and sequentially execute processor tasks against a METS.

    The whole chain is validated up front via ``validate_tasks``; each task's
    CLI is then run in order, the METS reloaded, and the promised output
    fileGrps verified. Raises on a non-zero exit code or a missing output
    group, including the captured STDOUT/STDERR in the message.
    """
    resolver = Resolver()
    workspace = resolver.workspace_from_url(mets)
    logger = getLogger('ocrd.task_sequence.run_tasks')
    tasks = [ProcessorTask.parse(task_str) for task_str in task_strs]
    # validate the entire chain before running anything
    validate_tasks(tasks, workspace, page_id, overwrite)
    for task in tasks:
        logger.info("Start processing task '%s'", task)
        # execute the task's CLI, capturing its output for error reporting
        returncode, out, err = run_cli(
            task.executable, mets, resolver, workspace,
            log_level=log_level,
            page_id=page_id,
            overwrite=overwrite,
            input_file_grp=','.join(task.input_file_grps),
            output_file_grp=','.join(task.output_file_grps),
            parameter=json.dumps(task.parameters))
        if returncode != 0:
            raise Exception(
                "%s exited with non-zero return value %s. STDOUT:\n%s\nSTDERR:\n%s" % (task.executable, returncode, out, err))
        logger.info("Finished processing task '%s'", task)
        # the CLI wrote to the METS on disk; pick up its changes
        workspace.reload_mets()
        # every promised output fileGrp must now exist
        for output_file_grp in task.output_file_grps:
            if output_file_grp not in workspace.mets.file_groups:
                raise Exception(
                    "Invalid state: expected output file group not in mets: %s\nSTDOUT:\n%s\nSTDERR:\n%s" % (output_file_grp, out, err))
def ocrd_cli_wrap_processor(processorClass, ocrd_tool=None, mets=None, working_dir=None, cache_enabled=True, *args, **kwargs):
    """Resolve ``mets`` into a workspace and run ``processorClass`` on it.

    ``mets`` may be a URL or a local path; a plain path is normalized to an
    absolute ``file://`` URL and checked for existence before resolving.
    Remaining arguments are passed through to ``run_processor``.
    """
    if mets.find('://') == -1:
        # fix: absolutize first — 'file://' + a relative path yields an
        # invalid URL (now consistent with the newer wrapper variant)
        mets = 'file://' + os.path.abspath(mets)
    if mets.startswith('file://') and not os.path.exists(mets[len('file://'):]):
        raise Exception("File does not exist: %s" % mets)
    resolver = Resolver(cache_enabled=cache_enabled)
    workspace = resolver.workspace_from_url(mets, working_dir)
    run_processor(processorClass, ocrd_tool, mets, workspace=workspace, *args, **kwargs)
def workspace():
    """Build a fresh Kant test workspace: download images, binarize them
    in place, and strip Word/TextEquiv elements from the GT PAGE-XML.

    Returns the prepared Workspace. Requires ImageMagick's ``convert`` on
    PATH and network access to the OCR-D assets repository.
    """
    # start from a clean WORKSPACE_DIR every time
    if os.path.exists(WORKSPACE_DIR):
        shutil.rmtree(WORKSPACE_DIR)
    os.makedirs(WORKSPACE_DIR)
    resolver = Resolver()
    workspace = resolver.workspace_from_url(METS_KANT, dst_dir=WORKSPACE_DIR)
    # XXX Work around data bug(?):
    # PAGE-XML links to OCR-D-IMG/INPUT_0017.tif, but this is nothing core can download
    os.makedirs(os.path.join(WORKSPACE_DIR, 'OCR-D-IMG'))
    for f in ['INPUT_0017.tif', 'INPUT_0020.tif']:
        urllib.request.urlretrieve(
            "https://github.com/OCR-D/assets/raw/master/data/kant_aufklaerung_1784/data/OCR-D-IMG/" + f,
            os.path.join(WORKSPACE_DIR, 'OCR-D-IMG', f))
    # The binarization options I have are:
    #
    # a. ocrd_kraken which tries to install cltsm, whose installation is borken on my machine (protobuf)
    # b. ocrd_olena which 1. I cannot fully install via pip and 2. whose dependency olena doesn't compile on my
    #    machine
    # c. just fumble with the original files
    #
    # So I'm going for option c.
    for f in ['INPUT_0017.tif', 'INPUT_0020.tif']:
        ff = os.path.join(WORKSPACE_DIR, 'OCR-D-IMG', f)
        # in-place threshold binarization via ImageMagick (option c above)
        subprocess.call(['convert', ff, '-threshold', '50%', ff])
    # Remove GT Words and TextEquivs, to not accidently check GT text instead of the OCR text
    # XXX Review data again
    # XXX Make this more robust against namespace version changes
    for of in workspace.mets.find_files(fileGrp="OCR-D-GT-SEG-LINE"):
        workspace.download_file(of)
    for to_remove in ["//pc:Word", "//pc:TextEquiv"]:
        for ff in glob(os.path.join(WORKSPACE_DIR, "OCR-D-GT-SEG-LINE", "*")):
            tree = etree.parse(ff)
            for e in tree.xpath(to_remove, namespaces=NSMAP_GT):
                e.getparent().remove(e)
            # rewrite the stripped PAGE-XML in place
            tree.write(ff, xml_declaration=True, encoding="utf-8")
    return workspace
def workspace():
    """Build a fresh Kant test workspace from a local copy: binarize the
    images in place and strip Word/TextEquiv elements from the GT PAGE-XML.

    Returns the prepared Workspace. Requires ImageMagick's ``mogrify`` on
    PATH.
    """
    # start from a clean WORKSPACE_DIR every time
    if os.path.exists(WORKSPACE_DIR):
        shutil.rmtree(WORKSPACE_DIR)
    os.makedirs(WORKSPACE_DIR)
    resolver = Resolver()
    # due to core#809 this does not always work:
    #workspace = resolver.workspace_from_url(METS_KANT, dst_dir=WORKSPACE_DIR)
    # workaround:
    shutil.rmtree(WORKSPACE_DIR)
    shutil.copytree(os.path.dirname(METS_KANT), WORKSPACE_DIR)
    workspace = resolver.workspace_from_url(
        os.path.join(WORKSPACE_DIR, 'mets.xml'))
    # The binarization options I have are:
    #
    # a. ocrd_kraken which tries to install cltsm, whose installation is borken on my machine (protobuf)
    # b. ocrd_olena which 1. I cannot fully install via pip and 2. whose dependency olena doesn't compile on my
    #    machine
    # c. just fumble with the original files
    #
    # So I'm going for option c.
    for imgf in workspace.mets.find_files(fileGrp="OCR-D-IMG"):
        imgf = workspace.download_file(imgf)
        path = os.path.join(workspace.directory, imgf.local_filename)
        # in-place threshold binarization via ImageMagick (option c above)
        subprocess.call(['mogrify', '-threshold', '50%', path])
    # Remove GT Words and TextEquivs, to not accidently check GT text instead of the OCR text
    # XXX Review data again
    for of in workspace.mets.find_files(fileGrp="OCR-D-GT-SEG-WORD-GLYPH"):
        workspace.download_file(of)
        path = os.path.join(workspace.directory, of.local_filename)
        tree = etree.parse(path)
        # resolve the PAGE namespace per file to survive version differences
        nsmap_gt = {"pc": page_namespace(tree)}
        for to_remove in ["//pc:Word", "//pc:TextEquiv"]:
            for e in tree.xpath(to_remove, namespaces=nsmap_gt):
                e.getparent().remove(e)
        # rewrite the stripped PAGE-XML in place and double-check the result
        tree.write(path, xml_declaration=True, encoding="utf-8")
        assertFileDoesNotContain(path, "TextEquiv")
    return workspace
def runTest(self):
    """Segment regions and lines, then recognize at word level, saving after each step."""
    resolver = Resolver(cache_enabled=True)
    workspace = resolver.workspace_from_url(
        assets.url_of('kant_aufklaerung_1784-page-block-line-word/mets.xml'),
        directory=WORKSPACE_DIR)
    pipeline = (
        (TesserocrSegmentRegion, "OCR-D-IMG", "OCR-D-SEG-BLOCK", None),
        (TesserocrSegmentLine, "OCR-D-SEG-BLOCK", "OCR-D-SEG-LINE", None),
        (TesserocrRecognize, "OCR-D-SEG-LINE", "OCR-D-OCR-TESS", {'textequiv_level': 'word'}),
    )
    for processor_class, in_grp, out_grp, params in pipeline:
        step_kwargs = {'input_file_grp': in_grp, 'output_file_grp': out_grp}
        if params is not None:
            step_kwargs['parameter'] = params
        processor_class(workspace, **step_kwargs).process()
        # persist after every step
        workspace.save_mets()
def test_task_run(self):
    """run_tasks processes dummy steps and adds the expected number of files."""
    resolver = Resolver()
    with copy_of_directory(assets.path_to('kant_aufklaerung_1784/data')) as wsdir, \
            pushd_popd(wsdir):
        ws = resolver.workspace_from_url('mets.xml')
        # add an unrelated file group to make sure it is left alone
        ws.add_file('GRP0', content='', local_filename='GRP0/foo',
                    ID='file0', mimetype=MIMETYPE_PAGE, pageId=None)
        ws.save_mets()
        files_before = len(ws.mets.find_files())
        run_tasks('mets.xml', 'DEBUG', None, [
            "dummy -I OCR-D-IMG -O GRP1",
            "dummy -I GRP1 -O GRP2",
        ])
        ws.reload_mets()
        # step 1: 2 images in OCR-D-IMG -> 2 images 2 PAGEXML in GRP1
        # step 2: 2 images and 2 PAGEXML in GRP1 -> process just the PAGEXML
        self.assertEqual(len(ws.mets.find_files()), files_before + 6)
def runTest(self):
    """Full Herold pipeline: region/line/word segmentation plus line- and glyph-level OCR."""
    workspace = Resolver().workspace_from_url(METS_HEROLD_SMALL, dst_dir=WORKSPACE_DIR)
    # (processor class, input fileGrp, output fileGrp, parameters)
    # add dep tesseract-ocr-script-frak to use parameter 'model': 'Fraktur'
    pipeline = (
        (TesserocrSegmentRegion, "OCR-D-IMG", "OCR-D-SEG-BLOCK", None),
        (TesserocrSegmentLine, "OCR-D-SEG-BLOCK", "OCR-D-SEG-LINE", None),
        (TesserocrRecognize, "OCR-D-SEG-LINE", "OCR-D-OCR-TESS", {'textequiv_level': 'line'}),
        (TesserocrSegmentWord, "OCR-D-SEG-LINE", "OCR-D-SEG-WORD", None),
        (TesserocrRecognize, "OCR-D-SEG-WORD", "OCR-D-OCR-TESS-W2C", {'textequiv_level': 'glyph'}),
    )
    for processor_class, in_grp, out_grp, params in pipeline:
        step_kwargs = {'input_file_grp': in_grp, 'output_file_grp': out_grp}
        if params is not None:
            step_kwargs['parameter'] = params
        processor_class(workspace, **step_kwargs).process()
        # persist after every step so intermediate results survive a failure
        workspace.save_mets()
def prepare_workspace(task: dict, resolver: Resolver, dst_dir: str) -> Workspace:
    """Prepare a workspace and return it."""
    workspace = resolver.workspace_from_url(
        task["src"], dst_dir=dst_dir, mets_basename="mets.xml", clobber_mets=True)
    if task["default_file_grp"] == "MAX" and "MAX" not in workspace.mets.file_groups:
        # derive a MAX entry from every DEFAULT file and fetch it
        for ocrd_file in workspace.mets.find_files(fileGrp="DEFAULT"):
            workspace.download_file(add_max_file_to_workspace(workspace, ocrd_file))
    else:
        # fetch any file of the requested group that is not yet local
        for ocrd_file in workspace.mets.find_files(fileGrp=task["default_file_grp"]):
            if not ocrd_file.local_filename:
                workspace.download_file(ocrd_file)
    workspace.save_mets()
    return workspace