def process_cli(mets_url, **kwargs):
    """
    Execute OCR-D processors for a METS file directly.

    The binaries declared in the ocrd-tool JSON files given as
    ``kwargs['ocrd_tool']`` are the registered tools. Every entry of
    ``kwargs['steps']`` must be one of them; otherwise an ``Exception`` is
    raised before any step is run. The steps are then executed in order and
    the workspace is printed.
    """
    resolver = Resolver()
    workspace = resolver.workspace_from_url(mets_url)

    # Collect the binary names declared by all ocrd-tool JSON files.
    known_binaries = []
    for tool_json_path in kwargs['ocrd_tool']:
        with codecs.open(tool_json_path, encoding='utf-8') as handle:
            tool_spec = json.loads(handle.read())
        known_binaries.extend(entry['binary'] for entry in tool_spec['tools'])

    # Reject the whole sequence up front if any step is unknown.
    for step in kwargs['steps']:
        if step not in known_binaries:
            raise Exception("Tool not registered: '%s'" % step)

    for step in kwargs['steps']:
        run_cli(step, mets_url, resolver, workspace)

    workspace.reload_mets()
    print(workspace)
def _sample_ws_for_overwrite(self):
    """
    Yield a workspace in a temporary directory holding four TIFF files.

    One file is added to 'IN-GRP' (page pID1) and three to 'OUT-GRP'
    (pages pID2..pID4); the METS is saved before the workspace is yielded.
    """
    resolver = Resolver()
    sample_files = (
        ('IN-GRP', 1),
        ('OUT-GRP', 2),
        ('OUT-GRP', 3),
        ('OUT-GRP', 4),
    )
    with TemporaryDirectory() as tempdir:
        ws = resolver.workspace_from_nothing(directory=tempdir)
        for file_grp, number in sample_files:
            ws.add_file(
                file_grp,
                pageId='pID%d' % number,
                ID='fID%d' % number,
                mimetype='image/tiff',
                content='CONTENT',
                local_filename=join(tempdir, 'ID%d.tif' % number))
        ws.save_mets()
        yield ws
def test_bulk_add_stdin(self):
    """
    `bulk-add` reading its file list from stdin ('-') registers all four files.
    """
    resolver = Resolver()
    with pushd_popd(tempdir=True) as wsdir:
        ws = resolver.workspace_from_nothing(directory=wsdir)
        Path(wsdir, 'BIN').mkdir()
        # Empty placeholder files that the stdin lines below refer to.
        for relative_name in (
                'BIN/FILE_0001_BIN.IMG-wolf.png',
                'BIN/FILE_0002_BIN.IMG-wolf.png',
                'BIN/FILE_0001_BIN.xml',
                'BIN/FILE_0002_BIN.xml',
        ):
            Path(wsdir, relative_name).write_text('')
        stdin_payload = (
            'PHYS_0001 BIN FILE_0001_BIN.IMG-wolf BIN/FILE_0001_BIN.IMG-wolf.png BIN/FILE_0001_BIN.IMG-wolf.png image/png\n'
            'PHYS_0002 BIN FILE_0002_BIN.IMG-wolf BIN/FILE_0002_BIN.IMG-wolf.png BIN/FILE_0002_BIN.IMG-wolf.png image/png\n'
            'PHYS_0001 BIN FILE_0001_BIN BIN/FILE_0001_BIN.xml BIN/FILE_0001_BIN.xml application/vnd.prima.page+xml\n'
            'PHYS_0002 BIN FILE_0002_BIN BIN/FILE_0002_BIN.xml BIN/FILE_0002_BIN.xml application/vnd.prima.page+xml\n'
        )
        cli_args = [
            'bulk-add',
            '-r', r'(?P<pageid>.*) (?P<filegrp>.*) (?P<fileid>.*) (?P<src>.*) (?P<dest>.*) (?P<mimetype>.*)',
            '-G', '{{ filegrp }}',
            '-g', '{{ pageid }}',
            '-i', '{{ fileid }}',
            '-m', '{{ mimetype }}',
            '-u', "{{ dest }}",
            '-',
        ]
        with mock_stdin(stdin_payload):
            assert len(ws.mets.file_groups) == 0
            _exit_code, _out, _err = self.invoke_cli(workspace_cli, cli_args)
            ws.reload_mets()
            assert len(ws.mets.file_groups) == 1
            assert len(list(ws.mets.find_files())) == 4
            first_file = next(ws.mets.find_files())
            assert first_file.mimetype == 'image/png'
            assert first_file.ID == 'FILE_0001_BIN.IMG-wolf'
            assert first_file.url == 'BIN/FILE_0001_BIN.IMG-wolf.png'
def editable(self, editable: bool) -> None:
    """
    Switch the document between editable and read-only mode.

    Editable: the workspace becomes a clone of ``self._original_url`` if one
    is set, otherwise a new empty workspace. Read-only: the workspace is
    opened from ``self.baseurl_mets``.
    """
    if not editable:
        workspace = Resolver().workspace_from_url(self.baseurl_mets)
    elif self._original_url:
        workspace = self._clone_workspace(self._original_url)
    else:
        workspace = Resolver().workspace_from_nothing(
            directory=None, mets_basename='mets.xml')
    self.workspace = workspace
    self._editable = editable
def kant_ocrdzip(ocrd_identifier):
    """
    Bag the 'kant_aufklaerung_1784' asset workspace and yield the zip path.

    The workspace is bagged (strict mode) into a uniquely named
    ``olahd-test-bag-<millis>.ocrd.zip`` in the system temp directory under
    the given ``ocrd_identifier``. The zip is removed once the consumer is
    done with it.
    """
    resolver = Resolver()
    bagger = WorkspaceBagger(resolver, strict=True)
    dest = join(gettempdir(), 'olahd-test-bag-%d.ocrd.zip' % int(round((time() * 1000))))
    ws = resolver.workspace_from_url(
        assets.path_to('kant_aufklaerung_1784/data/mets.xml'))
    bagger.bag(ws, ocrd_identifier, dest=dest)
    try:
        yield dest
    finally:
        # Remove the bag even if the generator is closed early or an
        # exception is thrown in at the yield point.
        unlink(dest)
def test_binarize_lines(self):
    # Run KrakenBinarize at line level on the Kant sample and save the METS.
    resolver = Resolver()
    mets_url = assets.url_of('kant_aufklaerung_1784/data/mets.xml')
    workspace = resolver.workspace_from_url(mets_url, dst_dir=WORKSPACE_DIR)
    binarizer = KrakenBinarize(
        workspace,
        input_file_grp="OCR-D-GT-PAGE",
        output_file_grp="OCR-D-IMG-BIN-KRAKEN",
        parameter={'level-of-operation': 'line'},
    )
    binarizer.process()
    workspace.save_mets()
def __init__(self, directory, mets_url, mets_basename, automatic_backup):
    """
    Store the resolved workspace location and the backup setting.

    A deprecation warning is logged when ``mets_basename`` is given. The
    final directory, METS URL and METS basename come from
    ``Resolver.resolve_mets_arguments``.
    """
    logger = getLogger('ocrd.cli.workspace')
    self.log = logger
    resolver = Resolver()
    self.resolver = resolver
    if mets_basename:
        deprecation = DeprecationWarning(
            '--mets-basename is deprecated. Use --mets/--directory instead.')
        logger.warning(deprecation)
    resolved = resolver.resolve_mets_arguments(directory, mets_url, mets_basename)
    self.directory, self.mets_url, self.mets_basename = resolved
    self.automatic_backup = automatic_backup
def runTest(self):
    # Chain region -> line -> word segmentation; each stage reads the file
    # group written by the previous one.
    resolver = Resolver()
    workspace = resolver.workspace_from_url(METS_HEROLD_SMALL, dst_dir=WORKSPACE_DIR)
    stages = (
        (TesserocrSegmentRegion, "OCR-D-IMG", "OCR-D-SEG-BLOCK"),
        (TesserocrSegmentLine, "OCR-D-SEG-BLOCK", "OCR-D-SEG-LINE"),
        (TesserocrSegmentWord, "OCR-D-SEG-LINE", "OCR-D-SEG-WORD"),
    )
    for processor_class, in_grp, out_grp in stages:
        processor = processor_class(
            workspace, input_file_grp=in_grp, output_file_grp=out_grp)
        processor.process()
    workspace.save_mets()
def test_copies_ok(self):
    # Running DummyProcessor on OCR-D-IMG is expected to yield six OUTPUT
    # files (three of them PAGE), and running it again on OUTPUT three
    # OUTPUT2 files.
    with copy_of_directory(assets.url_of('SBB0000F29300010000/data')) as wsdir:
        workspace = Workspace(Resolver(), wsdir)

        def count_files(**criteria):
            return len(workspace.mets.find_files(**criteria))

        self.assertEqual(count_files(fileGrp='OCR-D-IMG'), 3)
        self.assertEqual(count_files(fileGrp='OUTPUT'), 0)

        run_processor(
            DummyProcessor,
            input_file_grp='OCR-D-IMG',
            output_file_grp='OUTPUT',
            workspace=workspace
        )
        produced = workspace.mets.find_files(fileGrp='OUTPUT')
        produced.sort(key=lambda item: item.url)
        print([str(item) for item in produced])
        self.assertEqual(produced[0].url, 'OUTPUT/OUTPUT_0001.tif')
        self.assertEqual(produced[1].url, 'OUTPUT/OUTPUT_0001.xml')
        self.assertEqual(page_from_file(produced[1]).pcGtsId, produced[1].ID)
        self.assertEqual(
            page_from_file(produced[1]).get_Page().imageFilename,
            produced[0].url)
        self.assertEqual(len(produced), 6)
        self.assertEqual(count_files(ID='//OUTPUT.*'), 6)
        self.assertEqual(count_files(ID='//OUTPUT.*_PAGE'), 3)
        self.assertEqual(count_files(fileGrp='OUTPUT', mimetype=MIMETYPE_PAGE), 3)

        run_processor(
            DummyProcessor,
            input_file_grp='OUTPUT',
            output_file_grp='OUTPUT2',
            workspace=workspace
        )
        second_pass = workspace.mets.find_files(fileGrp='OUTPUT2')
        second_pass.sort(key=lambda item: item.url)
        self.assertEqual(len(second_pass), 3)
def __init__(self, directory, mets_url, mets_basename, automatic_backup):
    """
    Resolve the workspace directory and METS location from the CLI options.

    Resolution rules:

    * ``--mets`` and ``--mets-basename`` together raise ``ValueError``.
    * ``--mets-basename`` alone logs a deprecation warning; the basename
      defaults to ``mets.xml``.
    * ``--directory`` and ``--mets``: the METS path must lie inside the
      directory, otherwise ``ValueError``.
    * ``--mets`` only: the directory is the METS file's parent; an http(s)
      URL raises ``ValueError`` because no local directory can be derived.
    * ``--directory`` only: the METS is ``<directory>/<basename>``.
    * neither: the current working directory is used.
    """
    self.log = getLogger('ocrd.cli.workspace')
    if mets_basename and mets_url:
        raise ValueError("Use either --mets or --mets-basename, not both")
    if mets_basename and not mets_url:
        self.log.warning(DeprecationWarning("--mets-basename is deprecated. Use --mets/--directory instead"))
    mets_basename = mets_basename or 'mets.xml'
    if directory and mets_url:
        directory = abspath(directory)
        # join(directory, '') appends the path separator, so that '/ws' is
        # not mistaken for a parent of '/ws2/mets.xml'.
        if not abspath(mets_url).startswith(join(directory, '')):
            raise ValueError("--mets has a directory part inconsistent with --directory")
    elif mets_url:
        # Match the URL scheme, not the bare 'http' prefix: a local path
        # such as 'httpd/mets.xml' is not a remote URL.
        if mets_url.startswith(('http://', 'https://')):
            raise ValueError("--mets is an http(s) URL but no --directory was given")
        directory = dirname(abspath(mets_url)) or getcwd()
    elif directory:
        directory = abspath(directory)
        mets_url = join(directory, mets_basename)
    else:
        directory = getcwd()
        mets_url = join(directory, mets_basename)
    self.directory = directory
    self.resolver = Resolver()
    self.mets_url = mets_url
    self.automatic_backup = automatic_backup
def validate_process(tasks, workspace):
    '''
    Validate a sequence of tasks passable to 'ocrd process'

    With a workspace directory the parsed tasks are validated together
    against that workspace; without one each task is validated on its own.
    '''
    parsed_tasks = [ProcessorTask.parse(task) for task in tasks]
    if not workspace:
        for parsed in parsed_tasks:
            _inform_of_result(parsed.validate())
        return
    target = Workspace(Resolver(), directory=workspace)
    _inform_of_result(validate_tasks(parsed_tasks, target))
def process(self):
    """
    Bag the current workspace and post it with the OLA-HD client.

    The endpoint and credentials come from ``self.parameter``; the bag is
    written to a uniquely named zip in the system temp directory and posted
    with the METS unique identifier as ``prev_pid``.
    """
    params = self.parameter
    client = OlaHdClient(params['endpoint'], params['username'], params['password'])
    bagger = WorkspaceBagger(Resolver(), strict=True)
    # TODO
    timestamp_ms = int(round((time() * 1000)))
    dest = join(gettempdir(), 'bag-%d.ocrd.zip' % timestamp_ms)
    # TODO
    ocrd_identifier = self.workspace.mets.unique_identifier
    bagger.bag(self.workspace, ocrd_identifier, dest=dest)
    client.login()
    client.post(dest, prev_pid=ocrd_identifier)
def _clone_workspace(cls, mets_url: Union[Path, str]) -> Workspace:
    """
    Clone a workspace (mets.xml and all used files) into a fresh temporary
    directory for editing.

    The directory is recorded in ``cls.temporary_workspaces``.
    """
    logger = getLogger(
        'ocrd_browser.model.document.Document._clone_workspace')
    source_url = cls._strip_local(mets_url, disallow_remote=False)
    clone_dir = mkdtemp(prefix='browse-ocrd-clone-')
    cls.temporary_workspaces.append(clone_dir)
    # TODO download = False and lazy loading would be nice for responsiveness
    logger.info("Cloning '%s' to '%s'", source_url, clone_dir)
    return Resolver().workspace_from_url(
        mets_url=source_url, dst_dir=clone_dir, download=True)
def save_as(self, mets_url: Union[Path, str], backup_directory: Union[bool, Path, str] = True) -> None:
    """
    Save the document's workspace under a new local METS path.

    An already existing target directory is first moved aside (truthy
    ``backup_directory``) or deleted (falsy ``backup_directory``). The
    workspace is then re-created in the target directory from
    ``self.workspace.mets_target`` and every file is downloaded into it.
    Progress is reported through 'document_saving' events, followed by one
    'document_saved' event carrying a new ``Document`` for the saved
    workspace.

    ``mets_url`` is passed through ``self._strip_local`` with
    ``disallow_remote=True``. ``backup_directory`` may be ``True`` (derive
    the backup location via ``self._derive_backup_directory``), an explicit
    path, or a falsy value (remove the existing directory instead).
    """
    log = getLogger('ocrd_browser.model.document.Document.save_as')
    mets_path = Path(self._strip_local(mets_url, disallow_remote=True))
    workspace_directory = mets_path.parent
    if workspace_directory.exists():
        if backup_directory:
            # True means "pick a backup location for me"
            if isinstance(backup_directory, bool):
                backup_directory = self._derive_backup_directory(
                    workspace_directory)
            shutil.move(str(workspace_directory), str(backup_directory))
        else:
            shutil.rmtree(str(workspace_directory))
    mets_basename = mets_path.name
    # The directory was moved or removed above, so (re-)create it
    workspace_directory.mkdir(parents=True, exist_ok=True)
    self._emit('document_saving', 0, None)
    # download=False: the files are fetched one by one below so that
    # progress can be emitted per file
    saved_space = Resolver().workspace_from_url(
        mets_url=self.workspace.mets_target,
        mets_basename=mets_basename,
        download=False,
        clobber_mets=True,
        dst_dir=workspace_directory)
    saved_files = list(saved_space.mets.find_files())
    for n, f in enumerate(saved_files):
        f = saved_space.download_file(f)
        # Progress is the fraction of files handled before this one
        self._emit('document_saving', n / len(saved_files), f)
    self._emit('document_saving', 1, None)
    self._emit('document_saved', Document(saved_space, self.emitter))
    self._original_url = str(mets_path)
    self._modified = False
    log.info('Saved to %s', self._original_url)
def load(cls, mets_url: Union[Path, str] = None, emitter: EventCallBack = None) -> 'Document':
    """
    Load a project from a URL as a read-only view.

    Without a ``mets_url`` a new document is created via ``cls.create``.

    If you want to modify the Workspace, use Document.clone instead
    """
    if mets_url:
        local_url = cls._strip_local(mets_url)
        workspace = Resolver().workspace_from_url(local_url, download=False)
        document = cls(workspace, emitter=emitter, original_url=local_url)
        document._empty = False
        return document
    return cls.create(emitter=emitter)
def testProcessorProfiling(self):
    # Capture the output of the 'ocrd.process.profile' logger in a bounded
    # in-memory buffer while the dummy processor runs.
    initLogging()
    capture = FIFOIO(256)
    handler = logging.StreamHandler(capture)
    handler.setFormatter(logging.Formatter(LOG_FORMAT))
    profile_logger = getLogger('ocrd.process.profile')
    profile_logger.setLevel('DEBUG')
    profile_logger.addHandler(handler)

    run_processor(
        DummyProcessor,
        resolver=Resolver(),
        mets_url=assets.url_of('SBB0000F29300010000/data/mets.xml'))

    logged = capture.getvalue()
    capture.close()
    # Check whether profile information has been logged. Dummy should finish in under 0.1s
    self.assertTrue(match(r'.*Executing processor \'ocrd-test\' took 0.\d+s.*', logged))
class TestXsdValidator(TestCase):
    """Tests for XsdValidator / XsdMetsValidator against METS documents."""

    def setUp(self):
        self.resolver = Resolver()
        mets_url = assets.url_of('SBB0000F29300010000/data/mets.xml')
        self.ws = self.resolver.workspace_from_url(mets_url)

    def test_constructor(self):
        # An unknown schema is rejected, the METS schema is accepted.
        with self.assertRaisesRegex(Exception, 'schema not bundled'):
            XsdValidator('foo')
        XsdValidator(XSD_METS_URL)

    def test_mets_empty(self):
        # The empty METS template is expected to produce exactly two errors.
        expected_errors = (
            "Line 3: Element '{http://www.loc.gov/METS/}metsHdr', attribute 'CREATEDATE': '{{ NOW }}' is not a valid value of the atomic type 'xs:dateTime'.",
            "Line 18: Element '{http://www.loc.gov/METS/}fileSec': Missing child element(s). Expected is ( {http://www.loc.gov/METS/}fileGrp ).",
        )
        with TemporaryDirectory() as tempdir:
            mets_path = Path(tempdir, 'mets.xml')
            mets_path.write_bytes(METS_XML_EMPTY)
            report = XsdMetsValidator.validate(mets_path)
            self.assertEqual(len(report.errors), 2)
            for position, message in enumerate(expected_errors):
                self.assertEqual(report.errors[position], message)
            self.assertFalse(report.is_valid)

    def test_validate_simple_protected_str(self):
        validator = XsdValidator(XSD_METS_URL)
        outcome = validator._validate(self.ws.mets.to_xml())
        self.assertTrue(outcome.is_valid)

    def test_validate_simple_protected_doc(self):
        validator = XsdValidator(XSD_METS_URL)
        outcome = validator._validate(self.ws.mets._tree)
        self.assertTrue(outcome.is_valid)

    def test_validate_simple_static_doc(self):
        outcome = XsdValidator.validate(XSD_METS_URL, self.ws.mets._tree)
        self.assertTrue(outcome.is_valid)
def setUp(self):
    """Set up a resolver, logging and a CliRunner for each test."""
    self.resolver = Resolver()
    initLogging()
    self.runner = CliRunner()
    # Show complete diffs on assertion failures.
    self.maxDiff = None
def setUp(self):
    """Point at the model file below the current directory and create a resolver."""
    self.model_path = Path.cwd() / 'models/latest_net_G.pth'
    self.resolver = Resolver()
class TestCli(TestCase):
    # Tests for the `ocrd workspace` command line interface (workspace_cli).
    # Commands are invoked either through self.runner or through
    # self.invoke_cli, which returns (exit_code, out, err).

    def setUp(self):
        super().setUp()
        disableLogging()
        # Show complete diffs on assertion failures
        self.maxDiff = None
        self.resolver = Resolver()
        self.runner = CliRunner(mix_stderr=False)

    def test_add(self):
        """
        Ensure that `ocrd workspace add` does the right thing
        """
        ID = 'foo123file'
        page_id = 'foo123page'
        file_grp = 'TEST_GROUP'
        content = 'x'
        mimetype = 'image/tiff'
        local_filename = join(file_grp, 'foo.xml')

        # mets_api = None
        # mets_cli = None

        # Add the file through the API ...
        with TemporaryDirectory() as tempdir:
            ws_api = self.resolver.workspace_from_nothing(directory=tempdir)
            ws_api.add_file(file_grp, ID=ID, content=content, pageId=page_id, mimetype=mimetype, local_filename=local_filename)
            ws_api.save_mets()
            # mets_api = ws_api.mets.to_xml().decode('utf8')

        # ... and through the CLI
        with TemporaryDirectory() as tempdir:
            ws_api = self.resolver.workspace_from_nothing(directory=tempdir)
            content_file = join(tempdir, 'testfile')
            with open(content_file, 'w') as f:
                f.write(content)
            result = self.runner.invoke(workspace_cli, [
                '-d', tempdir,
                'add',
                '--file-grp', file_grp,
                '--page-id', page_id,
                '--file-id', ID,
                '--mimetype', mimetype,
                content_file
            ])
            self.assertEqual(result.exit_code, 0)
            # TODO too complex to compare :(
            # with open(join(tempdir, 'mets.xml')) as f:
            #     mets_cli = f.read()
            # print(mets_api)
            # print(mets_cli)
            # self.assertEqual(mets_api, mets_cli)
            # print(result.output)
            # with open(join(tempdir, 'mets.xml')) as f:
            #     print(f.read())
            self.assertEqual(result.exit_code, 0)

    def test_add_remove(self):
        # `remove --keep-file` must leave the file on disk
        ID = 'foo123file'
        page_id = 'foo123page'
        file_grp = 'TEST_GROUP'
        content = 'x'
        mimetype = 'image/tiff'
        with TemporaryDirectory() as tempdir:
            content_file = join(tempdir, 'testfile')
            with open(content_file, 'w') as f:
                f.write(content)

            result = self.runner.invoke(workspace_cli, ['init', tempdir])
            self.assertEqual(result.exit_code, 0)

            result = self.runner.invoke(workspace_cli, [
                '-d', tempdir,
                'add',
                '--file-grp', file_grp,
                '--page-id', page_id,
                '--file-id', ID,
                '--mimetype', mimetype,
                content_file
            ])
            self.assertEqual(result.exit_code, 0)

            result = self.runner.invoke(
                workspace_cli, ['-d', tempdir, 'remove', '--keep-file', ID])
            self.assertEqual(result.exit_code, 0)

            # File should still exist
            self.assertTrue(exists(content_file))

    def test_add_remove_force(self):
        # `remove --force` must delete the file from disk
        ID = 'foo123file'
        page_id = 'foo123page'
        file_grp = 'TEST_GROUP'
        content = 'x'
        mimetype = 'image/tiff'
        with TemporaryDirectory() as tempdir:
            content_file = join(tempdir, 'testfile')
            with open(content_file, 'w') as f:
                f.write(content)

            result = self.runner.invoke(workspace_cli, ['init', tempdir])
            self.assertEqual(result.exit_code, 0)

            result = self.runner.invoke(workspace_cli, [
                '-d', tempdir,
                'add',
                '--file-grp', file_grp,
                '--page-id', page_id,
                '--file-id', ID,
                '--mimetype', mimetype,
                content_file
            ])
            self.assertEqual(result.exit_code, 0)

            result = self.runner.invoke(
                workspace_cli, ['-d', tempdir, 'remove', '--force', ID])
            self.assertEqual(result.exit_code, 0)

            # File should have been deleted
            self.assertFalse(exists(content_file))

    def test_add_url(self):
        # Adding a remote URL must store that URL unchanged
        ID = 'foo123file'
        page_id = 'foo123page'
        file_grp = 'TEST_GROUP'
        mimetype = 'image/tiff'
        url = 'http://remote/file.tif'
        with TemporaryDirectory() as tempdir:
            ws = self.resolver.workspace_from_nothing(directory=tempdir)
            ws.save_mets()
            result = self.runner.invoke(workspace_cli, [
                '-d', tempdir,
                'add',
                '--file-grp', file_grp,
                '--page-id', page_id,
                '--file-id', ID,
                '--mimetype', mimetype,
                url
            ])
            self.assertEqual(result.exit_code, 0)
            ws.reload_mets()
            f = ws.mets.find_all_files()[0]
            self.assertEqual(f.url, url)

    def test_add_nonexisting_checked(self):
        # With -C, adding a file that does not exist must exit with code 1
        ID = 'foo123file'
        page_id = 'foo123page'
        file_grp = 'TEST_GROUP'
        mimetype = 'image/tiff'
        with pushd_popd(tempdir=True) as tempdir:
            ws = self.resolver.workspace_from_nothing(directory=tempdir)
            ws.save_mets()
            exit_code, out, err = self.invoke_cli(workspace_cli, [
                '-d', tempdir,
                'add',
                '-C',
                '--file-grp', file_grp,
                '--page-id', page_id,
                '--file-id', ID,
                '--mimetype', mimetype,
                'does-not-exist.xml'
            ])
            self.assertEqual(exit_code, 1)
            self.assertIn(
                "File 'does-not-exist.xml' does not exist, halt execution!", err)

    def test_add_519(self):
        """
        https://github.com/OCR-D/core/issues/519
        """
        with TemporaryDirectory() as tempdir:
            wsdir = Path(tempdir, "workspace")
            wsdir.mkdir()
            srcdir = Path(tempdir, "source")
            srcdir.mkdir()
            srcfile = Path(srcdir, "srcfile.jpg")
            srcfile_content = 'foo'
            srcfile.write_text(srcfile_content)
            with pushd_popd(str(wsdir)):
                exit_code, out, err = self.invoke_cli(workspace_cli, ['init'])
                exit_code, out, err = self.invoke_cli(workspace_cli, [
                    'add',
                    '-m', 'image/jpg',
                    '-G', 'MAX',
                    '-i', 'IMG_MAX_1818975',
                    '-C',
                    str(srcfile)
                ])
                # print(out, err)
                self.assertEqual(exit_code, 0)
                # The source file from outside the workspace must end up
                # below the file group directory with its content intact
                self.assertTrue(Path(wsdir, 'MAX', 'srcfile.jpg').exists())
                self.assertEqual(
                    Path(wsdir, 'MAX', 'srcfile.jpg').read_text(), srcfile_content)

    def test_add_existing_checked(self):
        # With -C, an existing file is added and its URL is stored
        # relative to the workspace
        ID = 'foo123file'
        page_id = 'foo123page'
        file_grp = 'TEST_GROUP'
        mimetype = 'image/tiff'
        with TemporaryDirectory() as tempdir:
            content_file = join(tempdir, 'test.tif')
            ws = self.resolver.workspace_from_nothing(directory=tempdir)
            ws.save_mets()
            with open(content_file, 'w') as f:
                f.write('x')
            result = self.runner.invoke(workspace_cli, [
                '-d', tempdir,
                'add',
                '-C',
                '--file-grp', file_grp,
                '--page-id', page_id,
                '--file-id', ID,
                '--mimetype', mimetype,
                content_file
            ])
            self.assertEqual(result.exit_code, 0)
            ws.reload_mets()
            f = ws.mets.find_all_files()[0]
            self.assertEqual(f.url, 'test.tif')

    def test_find_all_files(self):
        with TemporaryDirectory() as tempdir:
            wsdir = join(tempdir, 'ws')
            copytree(assets.path_to('SBB0000F29300010000/data'), wsdir)
            with pushd_popd(wsdir):
                result = self.runner.invoke(
                    workspace_cli,
                    ['find', '-G', 'OCR-D-IMG-BIN', '-k', 'fileGrp'])
                self.assertEqual(result.output,
                                 'OCR-D-IMG-BIN\nOCR-D-IMG-BIN\n')
                self.assertEqual(result.exit_code, 0)

    def test_prune_files(self):
        # `prune-files` is expected to reduce the 35 files to 7
        with TemporaryDirectory() as tempdir:
            copytree(assets.path_to('SBB0000F29300010000/data'),
                     join(tempdir, 'ws'))

            ws1 = self.resolver.workspace_from_url(
                join(tempdir, 'ws', 'mets.xml'))
            self.assertEqual(len(ws1.mets.find_all_files()), 35)

            result = self.runner.invoke(
                workspace_cli, ['-d', join(tempdir, 'ws'), 'prune-files'])
            self.assertEqual(result.exit_code, 0)

            ws2 = self.resolver.workspace_from_url(
                join(tempdir, 'ws', 'mets.xml'))
            self.assertEqual(len(ws2.mets.find_all_files()), 7)

    def test_clone_into_nonexisting_dir(self):
        """
        https://github.com/OCR-D/core/issues/330
        """
        with TemporaryDirectory() as tempdir:
            clone_to = join(tempdir, 'non-existing-dir')
            result = self.runner.invoke(workspace_cli, [
                'clone',
                '--download',
                assets.path_to('scribo-test/data/mets.xml'),
                clone_to
            ])
            self.assertEqual(result.exit_code, 0)

    def test_remove_file_group(self):
        """
        Test removal of filegrp
        """
        with TemporaryDirectory() as tempdir:
            wsdir = join(tempdir, 'ws')
            copytree(assets.path_to('SBB0000F29300010000/data'), wsdir)
            file_group = 'OCR-D-GT-PAGE'
            file_path = Path(tempdir, 'ws', file_group,
                             'FILE_0002_FULLTEXT.xml')
            self.assertTrue(file_path.exists())

            workspace = self.resolver.workspace_from_url(
                join(wsdir, 'mets.xml'))
            self.assertEqual(workspace.directory, wsdir)

            # A non-empty group must not be removed without recursive/force
            with self.assertRaisesRegex(Exception, "not empty"):
                workspace.remove_file_group(file_group)

            self.assertTrue(file_path.exists())
            self.assertEqual(len(workspace.mets.file_groups), 17)
            self.assertEqual(len(workspace.mets.find_all_files()), 35)

            workspace.remove_file_group(file_group, recursive=True, force=True)

            self.assertEqual(len(workspace.mets.file_groups), 16)
            self.assertEqual(len(workspace.mets.find_all_files()), 33)
            self.assertFalse(file_path.exists())

            # TODO ensure empty dirs are removed
            # self.assertFalse(file_path.parent.exists())

    def test_clone_relative(self):
        # Create a relative path to make sure #319 is gone
        src_path = str(
            Path(assets.path_to(
                'kant_aufklaerung_1784/data/mets.xml')).relative_to(
                    Path.cwd()))
        with TemporaryDirectory() as tempdir:
            result = self.runner.invoke(workspace_cli,
                                        ['clone', '-a', src_path, tempdir])
            self.assertEqual(result.exit_code, 0)
            self.assertTrue(
                exists(join(tempdir, 'OCR-D-GT-PAGE/PAGE_0017_PAGE.xml')))

    def test_copy_vs_clone(self):
        # Compare a shallow clone, a full clone (-a) and a plain directory
        # copy of the same workspace
        src_dir = assets.path_to('kant_aufklaerung_1784/data')
        with TemporaryDirectory() as tempdir:
            # cloned without download
            shallowcloneddir = join(tempdir, 'cloned-shallow')
            # cloned with download
            fullcloneddir = join(tempdir, 'cloned-all')
            # copied
            copieddir = join(tempdir, 'copied')

            Path(fullcloneddir).mkdir()
            Path(shallowcloneddir).mkdir()

            result = self.runner.invoke(
                workspace_cli,
                ['clone', join(src_dir, 'mets.xml'), shallowcloneddir])
            self.assertEqual(result.exit_code, 0)

            result = self.runner.invoke(
                workspace_cli,
                ['clone', '-a', join(src_dir, 'mets.xml'), fullcloneddir])
            self.assertEqual(result.exit_code, 0)

            with copy_of_directory(src_dir, copieddir):
                # The shallow clone lacks the file group directories
                shallow_vs_copied = dircmp(shallowcloneddir, copieddir)
                self.assertEqual(
                    set(shallow_vs_copied.right_only),
                    set(['OCR-D-GT-ALTO', 'OCR-D-GT-PAGE', 'OCR-D-IMG']))

                full_vs_copied = dircmp(fullcloneddir, copieddir)
                # print(full_vs_copied)
                # from ocrd_utils import pushd_popd
                # with pushd_popd(tempdir):
                #     import os
                #     os.system("diff %s/mets.xml %s/mets.xml" % (fullcloneddir, copieddir))
                # XXX mets.xml will not have the exact same content because
                # URLs that are actually files will be marked up as such with
                # @LOCTYPE/@OTHERLOCTYPE
                # self.assertEqual(full_vs_copied.diff_files, [])
                self.assertEqual(full_vs_copied.left_only, [])
                self.assertEqual(full_vs_copied.right_only, [])

    def test_find_all_files_multiple_physical_pages_for_fileids(self):
        with copy_of_directory(
                assets.path_to('SBB0000F29300010000/data')) as tempdir:
            # The same page ID given twice must yield the file only once
            result = self.runner.invoke(workspace_cli, [
                '-d', tempdir,
                'find',
                '--page-id', 'PHYS_0005,PHYS_0005',
                '-k', 'url'
            ])
            self.assertEqual(result.stdout, 'OCR-D-IMG/FILE_0005_IMAGE.tif\n')
            self.assertEqual(result.exit_code, 0)
            result = self.runner.invoke(workspace_cli, [
                '-d', tempdir,
                'find',
                '--page-id', 'PHYS_0005,PHYS_0001',
                '-k', 'url'
            ])
            self.assertEqual(len(result.stdout.split('\n')), 19)

    def test_mets_basename(self):
        # `-m foo.xml init` must create foo.xml and no mets.xml
        with TemporaryDirectory() as tempdir:
            with pushd_popd(tempdir):
                result = self.runner.invoke(workspace_cli,
                                            ['-m', 'foo.xml', 'init'])
                self.assertEqual(result.exit_code, 0)
                self.assertTrue(exists('foo.xml'))
                self.assertFalse(exists('mets.xml'))

    def test_mets_basename_and_mets(self):
        with pushd_popd(tempdir=True) as tempdir:
            with self.assertRaisesRegex(
                    ValueError,
                    "Use either --mets or --mets-basename, not both"):
                self.invoke_cli(workspace_cli,
                                ['-m', 'foo.xml', '-M', 'not-foo.xml', 'init'])

    def test_mets_basename_and_not_mets(self):
        with pushd_popd(tempdir=True) as tempdir:
            _, out, err = self.invoke_cli(
                workspace_cli, ['-d', 'foo', '-M', 'not-foo.xml', 'init'])
            self.assertEqual(out, join(tempdir, 'foo') + '\n')
            self.assertIn(
                '--mets-basename is deprecated. Use --mets/--directory instead',
                err)

    def test_mets_get_id_set_id(self):
        # An ID written with `set-id` must be printed by `get-id`
        with pushd_popd(tempdir=True):
            self.invoke_cli(workspace_cli, ['init'])
            disableLogging()
            mets_id = 'foo123'
            self.invoke_cli(workspace_cli, ['set-id', mets_id])
            disableLogging()
            _, out, _ = self.invoke_cli(workspace_cli, ['get-id'])
            self.assertEqual(out, mets_id + '\n')

    def test_mets_directory_incompatible(self):
        with pushd_popd(tempdir=True) as tempdir:
            with self.assertRaisesRegex(
                    ValueError,
                    "--mets has a directory part inconsistent with --directory"
            ):
                self.invoke_cli(workspace_cli,
                                ['-d', 'foo', '-m', '/somewhere/else', 'init'])

    def test_mets_directory_html(self):
        # An http(s) METS URL without --directory must be rejected
        with pushd_popd(tempdir=True) as tempdir:
            with self.assertRaisesRegex(
                    ValueError,
                    r"--mets is an http\(s\) URL but no --directory was given"
            ):
                self.invoke_cli(workspace_cli,
                                ['-m', 'https://foo.bar/bla', 'init'])

    def test_bulk_add(self):
        NO_FILES = 100
        with TemporaryDirectory() as srcdir:
            # Create NO_FILES empty image and PAGE files to be bulk-added
            Path(srcdir, "OCR-D-IMG").mkdir()
            Path(srcdir, "OCR-D-PAGE").mkdir()
            for i in range(NO_FILES):
                Path(srcdir, "OCR-D-IMG", "page_%04d.tif" % i).write_text('')
            for i in range(NO_FILES):
                Path(srcdir, "OCR-D-PAGE", "page_%04d.xml" % i).write_text('')
            with TemporaryDirectory() as wsdir:
                with pushd_popd(wsdir):
                    ws = self.resolver.workspace_from_nothing(directory=wsdir)
                    exit_code, out, err = self.invoke_cli(
                        workspace_cli, [
                            'bulk-add',
                            '--ignore',
                            '--regex', r'^.*/(?P<fileGrp>[^/]+)/page_(?P<pageid>.*)\.(?P<ext>[^\.]*)$',
                            '--url', '{{ fileGrp }}/FILE_{{ pageid }}.{{ ext }}',
                            '--file-id', 'FILE_{{ fileGrp }}_{{ pageid }}',
                            '--page-id', 'PHYS_{{ pageid }}',
                            '--file-grp', '{{ fileGrp }}',
                            '%s/*/*' % srcdir
                        ])
                    # print('exit_code', exit_code)
                    # print('out', out)
                    # print('err', err)
                    ws.reload_mets()
                    self.assertEqual(len(ws.mets.file_groups), 2)
                    self.assertEqual(len(ws.mets.find_all_files()),
                                     2 * NO_FILES)
                    self.assertEqual(
                        len(ws.mets.find_all_files(mimetype='image/tiff')),
                        NO_FILES)
                    self.assertEqual(
                        len(ws.mets.find_all_files(
                            ID='//FILE_OCR-D-IMG_000.*')), 10)
                    self.assertEqual(
                        len(ws.mets.find_all_files(ID='//FILE_.*_000.*')), 20)
                    self.assertEqual(
                        len(ws.mets.find_all_files(pageId='PHYS_0001')), 2)
                    self.assertEqual(
                        ws.mets.find_all_files(
                            ID='FILE_OCR-D-PAGE_0001')[0].url,
                        'OCR-D-PAGE/FILE_0001.xml')
def setUp(self):
    """Create a fresh Resolver for each test."""
    resolver = Resolver()
    self.resolver = resolver
def __init__(self, directory, mets_basename, automatic_backup):
    """Store the workspace directory, METS basename and backup flag, and create a Resolver."""
    self.resolver = Resolver()
    self.directory = directory
    self.mets_basename = mets_basename
    self.automatic_backup = automatic_backup
def resolver():
    """Return a new Resolver instance."""
    fresh_resolver = Resolver()
    return fresh_resolver
#TODO PAGE-XMl MIME_TO_EXT = { MIMETYPE_PAGE: ".xml", "application/pdf": ".pdf", "image/tiff": ".tif", "image/tif": ".tif", "image/jp2": ".jp2", "image/png": ".png", "image/jpg": ".jpg", "image/jpeg": ".jpg", "application/alto+xml": ".xml", } resolver = Resolver() DOCS_REPO = Path(__file__).resolve(True).parent UPDATE_BAGIT_SCRIPT = Path(DOCS_REPO, 'update-bagit') def update_checksums(bagdir): with pushd_popd(bagdir): os.system('zsh "%s"' % UPDATE_BAGIT_SCRIPT) resolver = Resolver() def do_the_update(bagdir, non_local_urls=False): directory = Path(bagdir, 'data')
class TestCli(TestCase):
    # Tests for the `ocrd workspace` command line interface (workspace_cli),
    # invoked through self.runner.

    def setUp(self):
        # Show complete diffs on assertion failures
        self.maxDiff = None
        self.resolver = Resolver()
        initLogging()
        self.runner = CliRunner()

    def test_add(self):
        """
        Ensure that `ocrd workspace add` does the right thing
        """
        ID = 'foo123file'
        page_id = 'foo123page'
        file_grp = 'TEST_GROUP'
        content = 'x'
        mimetype = 'image/tiff'
        local_filename = join(file_grp, 'foo.xml')

        # mets_api = None
        # mets_cli = None

        # Add the file through the API ...
        with TemporaryDirectory() as tempdir:
            ws_api = self.resolver.workspace_from_nothing(directory=tempdir)
            ws_api.add_file(file_grp, ID=ID, content=content, pageId=page_id, mimetype=mimetype, local_filename=local_filename)
            ws_api.save_mets()
            # mets_api = ws_api.mets.to_xml().decode('utf8')

        # ... and through the CLI
        with TemporaryDirectory() as tempdir:
            ws_api = self.resolver.workspace_from_nothing(directory=tempdir)
            content_file = join(tempdir, 'testfile')
            with open(content_file, 'w') as f:
                f.write(content)
            result = self.runner.invoke(workspace_cli, [
                '-d', tempdir,
                'add',
                '--file-grp', file_grp,
                '--page-id', page_id,
                '--file-id', ID,
                '--mimetype', mimetype,
                content_file
            ])
            self.assertEqual(result.exit_code, 0)
            # TODO too complex to compare :(
            # with open(join(tempdir, 'mets.xml')) as f:
            #     mets_cli = f.read()
            # print(mets_api)
            # print(mets_cli)
            # self.assertEqual(mets_api, mets_cli)
            # print(result.output)
            # with open(join(tempdir, 'mets.xml')) as f:
            #     print(f.read())
            self.assertEqual(result.exit_code, 0)

    def test_add_remove(self):
        # `remove --keep-file` must leave the file on disk
        ID = 'foo123file'
        page_id = 'foo123page'
        file_grp = 'TEST_GROUP'
        content = 'x'
        mimetype = 'image/tiff'
        with TemporaryDirectory() as tempdir:
            content_file = join(tempdir, 'testfile')
            with open(content_file, 'w') as f:
                f.write(content)

            result = self.runner.invoke(workspace_cli, ['init', tempdir])
            self.assertEqual(result.exit_code, 0)

            result = self.runner.invoke(workspace_cli, [
                '-d', tempdir,
                'add',
                '--file-grp', file_grp,
                '--page-id', page_id,
                '--file-id', ID,
                '--mimetype', mimetype,
                content_file
            ])
            self.assertEqual(result.exit_code, 0)

            result = self.runner.invoke(
                workspace_cli,
                ['-d', tempdir, 'remove', '--keep-file', ID])
            self.assertEqual(result.exit_code, 0)

            # File should still exist
            self.assertTrue(exists(content_file))

    def test_add_remove_force(self):
        # `remove --force` must delete the file from disk
        ID = 'foo123file'
        page_id = 'foo123page'
        file_grp = 'TEST_GROUP'
        content = 'x'
        mimetype = 'image/tiff'
        with TemporaryDirectory() as tempdir:
            content_file = join(tempdir, 'testfile')
            with open(content_file, 'w') as f:
                f.write(content)

            result = self.runner.invoke(workspace_cli, ['init', tempdir])
            self.assertEqual(result.exit_code, 0)

            result = self.runner.invoke(workspace_cli, [
                '-d', tempdir,
                'add',
                '--file-grp', file_grp,
                '--page-id', page_id,
                '--file-id', ID,
                '--mimetype', mimetype,
                content_file
            ])
            self.assertEqual(result.exit_code, 0)

            result = self.runner.invoke(
                workspace_cli, ['-d', tempdir, 'remove', '--force', ID])
            print(result)
            print(result.output)
            self.assertEqual(result.exit_code, 0)

            # File should have been deleted
            self.assertFalse(exists(content_file))

    def test_find_files(self):
        with TemporaryDirectory() as tempdir:
            wsdir = join(tempdir, 'ws')
            copytree(assets.path_to('SBB0000F29300010000/data'), wsdir)
            with pushd_popd(wsdir):
                result = self.runner.invoke(
                    workspace_cli,
                    ['find', '-G', 'OCR-D-IMG-BIN', '-k', 'fileGrp'])
                self.assertEqual(result.output,
                                 'OCR-D-IMG-BIN\nOCR-D-IMG-BIN\n')
                self.assertEqual(result.exit_code, 0)

    def test_prune_files(self):
        # `prune-files` is expected to reduce the 35 files to 7
        with TemporaryDirectory() as tempdir:
            copytree(assets.path_to('SBB0000F29300010000/data'),
                     join(tempdir, 'ws'))

            ws1 = self.resolver.workspace_from_url(
                join(tempdir, 'ws', 'mets.xml'))
            self.assertEqual(len(ws1.mets.find_files()), 35)

            result = self.runner.invoke(
                workspace_cli, ['-d', join(tempdir, 'ws'), 'prune-files'])
            self.assertEqual(result.exit_code, 0)

            ws2 = self.resolver.workspace_from_url(
                join(tempdir, 'ws', 'mets.xml'))
            self.assertEqual(len(ws2.mets.find_files()), 7)

    def test_clone_into_nonexisting_dir(self):
        """
        https://github.com/OCR-D/core/issues/330
        """
        with TemporaryDirectory() as tempdir:
            clone_to = join(tempdir, 'non-existing-dir')
            result = self.runner.invoke(workspace_cli, [
                'clone',
                '--download',
                assets.path_to('scribo-test/data/mets.xml'),
                clone_to
            ])
            self.assertEqual(result.exit_code, 0)

    def test_remove_file_group(self):
        """
        Test removal of filegrp
        """
        with TemporaryDirectory() as tempdir:
            wsdir = join(tempdir, 'ws')
            copytree(assets.path_to('SBB0000F29300010000/data'), wsdir)
            file_group = 'OCR-D-GT-PAGE'
            file_path = Path(tempdir, 'ws', file_group,
                             'FILE_0002_FULLTEXT.xml')
            self.assertTrue(file_path.exists())

            workspace = self.resolver.workspace_from_url(
                join(wsdir, 'mets.xml'))
            self.assertEqual(workspace.directory, wsdir)

            # A non-empty group must not be removed without recursive/force
            with self.assertRaisesRegex(Exception, "not empty"):
                workspace.remove_file_group(file_group)

            self.assertTrue(file_path.exists())
            self.assertEqual(len(workspace.mets.file_groups), 17)
            self.assertEqual(len(workspace.mets.find_files()), 35)

            workspace.remove_file_group(file_group, recursive=True, force=True)

            self.assertEqual(len(workspace.mets.file_groups), 16)
            self.assertEqual(len(workspace.mets.find_files()), 33)
            self.assertFalse(file_path.exists())

            # TODO ensure empty dirs are removed
            # self.assertFalse(file_path.parent.exists())

    def test_clone_relative(self):
        # Create a relative path to make sure #319 is gone
        src_path = str(
            Path(assets.path_to(
                'kant_aufklaerung_1784/data/mets.xml')).relative_to(
                    Path.cwd()))
        with TemporaryDirectory() as tempdir:
            result = self.runner.invoke(workspace_cli,
                                        ['clone', '-a', src_path, tempdir])
            self.assertEqual(result.exit_code, 0)
            self.assertTrue(
                exists(join(tempdir, 'OCR-D-GT-PAGE/PAGE_0017_PAGE.xml')))

    def test_copy_vs_clone(self):
        # Compare a shallow clone, a full clone (-a) and a plain directory
        # copy of the same workspace
        src_dir = assets.path_to('kant_aufklaerung_1784/data')
        with TemporaryDirectory() as tempdir:
            # cloned without download
            shallowcloneddir = join(tempdir, 'cloned-shallow')
            # cloned with download
            fullcloneddir = join(tempdir, 'cloned-all')
            # copied
            copieddir = join(tempdir, 'copied')

            Path(fullcloneddir).mkdir()
            Path(shallowcloneddir).mkdir()

            result = self.runner.invoke(
                workspace_cli,
                ['clone', join(src_dir, 'mets.xml'), shallowcloneddir])
            self.assertEqual(result.exit_code, 0)

            result = self.runner.invoke(
                workspace_cli,
                ['clone', '-a', join(src_dir, 'mets.xml'), fullcloneddir])
            self.assertEqual(result.exit_code, 0)

            with copy_of_directory(src_dir, copieddir):
                # The shallow clone lacks the file group directories
                shallow_vs_copied = dircmp(shallowcloneddir, copieddir)
                self.assertEqual(
                    set(shallow_vs_copied.right_only),
                    set(['OCR-D-GT-ALTO', 'OCR-D-GT-PAGE', 'OCR-D-IMG']))

                full_vs_copied = dircmp(fullcloneddir, copieddir)
                # print(full_vs_copied)
                # from ocrd_utils import pushd_popd
                # with pushd_popd(tempdir):
                #     import os
                #     os.system("diff %s/mets.xml %s/mets.xml" % (fullcloneddir, copieddir))
                # XXX mets.xml will not have the exact same content because
                # URLs that are actually files will be marked up as such with
                # @LOCTYPE/@OTHERLOCTYPE
                # self.assertEqual(full_vs_copied.diff_files, [])
                self.assertEqual(full_vs_copied.left_only, [])
                self.assertEqual(full_vs_copied.right_only, [])

    def test_mets_basename(self):
        # `-M foo.xml init .` must create foo.xml and no mets.xml
        with TemporaryDirectory() as tempdir:
            with pushd_popd(tempdir):
                result = self.runner.invoke(workspace_cli,
                                            ['-M', 'foo.xml', 'init', '.'])
                self.assertEqual(result.exit_code, 0)
                self.assertTrue(exists('foo.xml'))
                self.assertFalse(exists('mets.xml'))
def setUp(self):
    """Open the SBB0000F29300010000 sample workspace for each test."""
    super().setUp()
    self.resolver = Resolver()
    sample_mets_url = assets.url_of('SBB0000F29300010000/data/mets.xml')
    self.ws = self.resolver.workspace_from_url(sample_mets_url)
def __init__(self, directory):
    """Store the workspace directory and create a caching Resolver; config starts empty, verbose off."""
    self.resolver = Resolver(cache_enabled=True)
    self.directory = directory
    self.verbose = False
    self.config = {}
def __init__(self, directory, mets_basename):
    """Store the workspace directory and METS basename and create a Resolver; config starts empty, verbose off."""
    self.resolver = Resolver()
    self.directory = directory
    self.mets_basename = mets_basename
    self.verbose = False
    self.config = {}
def setUp(self):
    """Disable logging and set up a resolver and a CliRunner with separate stderr."""
    super().setUp()
    disableLogging()
    self.resolver = Resolver()
    self.runner = CliRunner(mix_stderr=False)
    # Show complete diffs on assertion failures.
    self.maxDiff = None
def setUp(self):
    """Create a Resolver and initialise logging for each test."""
    resolver = Resolver()
    self.resolver = resolver
    initLogging()