def ocrd_cli_wrap_processor(processorClass, ocrd_tool=None, mets=None, working_dir=None, dump_json=False, version=False, **kwargs):
    """
    Dispatch a single command-line invocation of a processor.

    The options are checked in this order and only the first match is acted on:

    1. ``dump_json``: instantiate ``processorClass`` with ``dump_json=True``.
    2. ``version``: instantiate ``processorClass`` and print its ``version``
       attribute together with ``OCRD_VERSION``.
    3. otherwise: build a workspace from ``mets`` and call ``run_processor``.

    Args:
        processorClass: processor class to instantiate or run.
        ocrd_tool: forwarded positionally to ``run_processor``.
        mets: URL or filesystem path of the METS file. A value without a
            ``://`` is turned into an absolute ``file://`` URL.
        working_dir: forwarded to ``Resolver.workspace_from_url``.
        dump_json: select branch 1 above.
        version: select branch 2 above.
        **kwargs: forwarded unchanged to ``run_processor``.

    Raises:
        Exception: if ``mets`` is not given, or if it is a ``file://`` URL
            whose path does not exist.
    """
    if dump_json:
        processorClass(workspace=None, dump_json=True)
        return

    if version:
        instance = processorClass(workspace=None)
        print("Version %s, ocrd/core %s" % (instance.version, OCRD_VERSION))
        return

    if mets is None:
        raise Exception('Error: Missing option "-m" / "--mets".')

    file_scheme = 'file://'
    if '://' not in mets:
        # Bare filesystem path: promote it to an absolute file URL.
        mets = file_scheme + os.path.abspath(mets)
    if mets.startswith(file_scheme):
        local_path = mets[len(file_scheme):]
        if not os.path.exists(local_path):
            raise Exception("File does not exist: %s" % mets)

    workspace = Resolver().workspace_from_url(mets, working_dir)
    run_processor(processorClass, ocrd_tool, mets, workspace=workspace, **kwargs)
def test_run_agent(self):
    """Running a processor must grow the METS agent list by exactly one."""
    agents_before = len(self.workspace.mets.agents)

    run_processor(DummyProcessor, ocrd_tool=DUMMY_TOOL, workspace=self.workspace)

    agents_after = len(self.workspace.mets.agents)
    self.assertEqual(agents_after, agents_before + 1, 'one more agent')
def test_copies_ok(self):
    """
    Run DummyProcessor twice on a copy of the sample workspace and check
    the files it registers in the output file groups.
    """
    def by_url(ocrd_file):
        return ocrd_file.url

    with copy_of_directory(assets.url_of('SBB0000F29300010000/data')) as wsdir:
        workspace = Workspace(Resolver(), wsdir)

        # Starting point: three images in the input group, nothing in OUTPUT.
        images = workspace.mets.find_files(fileGrp='OCR-D-IMG')
        self.assertEqual(len(images), 3)
        nothing_yet = workspace.mets.find_files(fileGrp='OUTPUT')
        self.assertEqual(len(nothing_yet), 0)

        # First pass: OCR-D-IMG -> OUTPUT
        run_processor(
            DummyProcessor,
            input_file_grp='OCR-D-IMG',
            output_file_grp='OUTPUT',
            workspace=workspace,
        )
        produced = workspace.mets.find_files(fileGrp='OUTPUT')
        produced.sort(key=by_url)
        print([str(s) for s in produced])

        # After sorting by URL the first two entries are the image and
        # PAGE file of the first page.
        first_image, first_page = produced[0], produced[1]
        self.assertEqual(first_image.url, 'OUTPUT/OUTPUT_0001.tif')
        self.assertEqual(first_page.url, 'OUTPUT/OUTPUT_0001.xml')
        self.assertEqual(page_from_file(first_page).pcGtsId, first_page.ID)
        self.assertEqual(page_from_file(first_page).get_Page().imageFilename, first_image.url)
        self.assertEqual(len(produced), 6)

        # Different queries against the METS and the number of hits expected.
        expectations = (
            ({'ID': '//OUTPUT.*'}, 6),
            ({'ID': '//OUTPUT.*_PAGE'}, 3),
            ({'fileGrp': 'OUTPUT', 'mimetype': MIMETYPE_PAGE}, 3),
        )
        for query, expected_count in expectations:
            self.assertEqual(len(workspace.mets.find_files(**query)), expected_count)

        # Second pass: OUTPUT -> OUTPUT2
        run_processor(
            DummyProcessor,
            input_file_grp='OUTPUT',
            output_file_grp='OUTPUT2',
            workspace=workspace,
        )
        second_pass = workspace.mets.find_files(fileGrp='OUTPUT2')
        second_pass.sort(key=by_url)
        self.assertEqual(len(second_pass), 3)
def test_crop(self):
    """Running the cropper on the BIN group must add exactly one PAGE file."""
    with copy_of_directory(assets.path_to('dfki-testdata/data')) as wsdir:
        ws = Workspace(self.resolver, wsdir)

        def count_page_files():
            return len(ws.mets.find_files(mimetype=MIMETYPE_PAGE))

        count_before = count_page_files()
        run_processor(
            OcrdAnybaseocrCropper,
            resolver=self.resolver,
            mets_url=str(Path(wsdir, 'mets.xml')),
            input_file_grp='BIN',
            output_file_grp='CROP-TEST',
            parameter={},
        )
        # The processor was given a METS URL rather than ``ws``, so re-read
        # the METS before counting again.
        ws.reload_mets()
        count_after = count_page_files()
        self.assertEqual(count_after, count_before + 1)
def test_crop(self):
    """
    Running the dewarper on the BIN group must add exactly one PAGE file.

    Skipped when CUDA is unavailable. NOTE: despite its name, this test
    exercises ``OcrdAnybaseocrDewarper``, not the cropper.
    """
    if not torch.cuda.is_available():
        pytest.skip('CUDA is not available, cannot test dewarping')

    with copy_of_directory(assets.path_to('dfki-testdata/data')) as wsdir:
        ws = Workspace(self.resolver, wsdir)

        def count_page_files():
            return len(ws.mets.find_files(mimetype=MIMETYPE_PAGE))

        count_before = count_page_files()
        run_processor(
            OcrdAnybaseocrDewarper,
            resolver=self.resolver,
            mets_url=str(Path(wsdir, 'mets.xml')),
            input_file_grp='BIN',
            output_file_grp='DEWARP-TEST',
            parameter={'model_path': str(self.model_path)},
        )
        # The processor was given a METS URL rather than ``ws``, so re-read
        # the METS before counting again.
        ws.reload_mets()
        count_after = count_page_files()
        self.assertEqual(count_after, count_before + 1)
def test_with_mets_url_input_files(self):
    """
    A processor created from a METS URL alone should expose 20 input
    files, every one of them with the PAGE mimetype.
    """
    processor = run_processor(
        DummyProcessor,
        resolver=self.resolver,
        mets_url=assets.url_of('SBB0000F29300010000/data/mets.xml'),
    )

    self.assertEqual(len(processor.input_files), 20)

    mimetypes = [input_file.mimetype for input_file in processor.input_files]
    self.assertTrue(all(mimetype == MIMETYPE_PAGE for mimetype in mimetypes))
def test_no_input_file_grp(self):
    """
    Reading ``input_files`` on a processor that was run without an input
    file group must raise with a matching message.
    """
    mets_url = assets.url_of('SBB0000F29300010000/data/mets.xml')
    processor = run_processor(DummyProcessor, resolver=self.resolver, mets_url=mets_url)

    expected_message = 'Processor is missing input fileGrp'
    with self.assertRaisesRegex(Exception, expected_message):
        # The attribute access itself is what is expected to raise.
        _ = processor.input_files
def ocrd_cli_wrap_processor(processorClass, ocrd_tool=None, mets=None, working_dir=None, cache_enabled=True, *args, **kwargs):
    """
    Build a workspace from a METS location and run a processor on it.

    Args:
        processorClass: processor class handed to ``run_processor``.
        ocrd_tool: forwarded positionally to ``run_processor``.
        mets: URL or filesystem path of the METS file. A value without a
            ``://`` is turned into an absolute ``file://`` URL.
        working_dir: forwarded to ``Resolver.workspace_from_url``.
        cache_enabled: forwarded to the ``Resolver`` constructor.
        *args: extra positional arguments forwarded to ``run_processor``.
        **kwargs: extra keyword arguments forwarded to ``run_processor``.

    Raises:
        Exception: if ``mets`` is not given, or if it is a ``file://`` URL
            whose path does not exist.
    """
    # ``mets`` defaults to None; fail with a clear message instead of an
    # AttributeError from calling a string method on None.
    if mets is None:
        raise Exception('Error: Missing option "-m" / "--mets".')

    if '://' not in mets:
        # A relative path glued straight onto 'file://' would have its first
        # segment read as the URL host, so make the path absolute first.
        mets = 'file://' + os.path.abspath(mets)

    if mets.startswith('file://') and not os.path.exists(mets[len('file://'):]):
        raise Exception("File does not exist: %s" % mets)

    resolver = Resolver(cache_enabled=cache_enabled)
    workspace = resolver.workspace_from_url(mets, working_dir)
    run_processor(processorClass, ocrd_tool, mets, *args, workspace=workspace, **kwargs)
def test_run_cli(self):
    """
    Call ``run_cli`` with the ``echo`` executable, once with every option
    spelled out and once with only a METS URL and a resolver.
    """
    mets_asset = 'SBB0000F29300010000/data/mets.xml'

    with TemporaryDirectory() as tempdir:
        run_processor(DummyProcessor, ocrd_tool=DUMMY_TOOL, workspace=self.workspace)

        # Fully specified invocation.
        all_options = dict(
            mets_url=assets.url_of(mets_asset),
            resolver=Resolver(),
            workspace=None,
            page_id='page1',
            log_level='DEBUG',
            input_file_grp='INPUT',
            output_file_grp='OUTPUT',
            parameter='/path/to/param.json',
            working_dir=tempdir,
        )
        run_cli('echo', **all_options)

        # Minimal invocation.
        run_cli('echo', mets_url=assets.url_of(mets_asset), resolver=Resolver())
def test_parameter_url(self):
    """Pass the processor parameters as a ``file://`` URL to a JSON file."""
    with TemporaryDirectory() as tempdir:
        jsonpath = join(tempdir, 'params.json')
        with open(jsonpath, 'w') as handle:
            handle.write('{}')

        parameter_url = 'file://%s' % jsonpath
        processor = run_processor(
            DummyProcessor,
            parameter=parameter_url,
            resolver=self.resolver,
            mets_url=assets.url_of('SBB0000F29300010000/data/mets.xml'),
        )

        input_file_count = len(processor.input_files)
        self.assertEqual(input_file_count, 35)
def test_parameter(self):
    """Pass the processor parameters as a dict loaded from a JSON file."""
    with TemporaryDirectory() as tempdir:
        jsonpath = join(tempdir, 'params.json')
        with open(jsonpath, 'w') as handle:
            handle.write('{"baz": "quux"}')

        # Read the file back so the processor receives parsed JSON.
        with open(jsonpath, 'r') as handle:
            loaded_parameter = json.load(handle)

        processor = run_processor(
            DummyProcessor,
            parameter=loaded_parameter,
            resolver=self.resolver,
            mets_url=assets.url_of('SBB0000F29300010000/data/mets.xml'),
        )

        input_file_count = len(processor.input_files)
        self.assertEqual(input_file_count, 20)
def test_no_mets_url(self):
    """``run_processor`` with a resolver but no METS URL must raise."""
    expected_message = 'pass mets_url to create a workspace'
    with self.assertRaisesRegex(Exception, expected_message):
        run_processor(DummyProcessor, resolver=self.resolver)
def test_no_resolver(self):
    """``run_processor`` with neither workspace nor resolver must raise."""
    expected_message = 'pass a resolver to create a workspace'
    with self.assertRaisesRegex(Exception, expected_message):
        run_processor(DummyProcessor)
def test_with_mets_url_input_files(self):
    """A processor created from a METS URL alone should expose 35 input files."""
    mets_url = assets.url_of('SBB0000F29300010000/data/mets.xml')
    processor = run_processor(DummyProcessor, resolver=self.resolver, mets_url=mets_url)

    input_file_count = len(processor.input_files)
    self.assertEqual(input_file_count, 35)