def workspace_add_file(ctx, file_grp, file_id, mimetype, page_id, force, local_filename):
    """
    Add a file LOCAL_FILENAME to METS in a workspace.
    """
    ws = Workspace(ctx.resolver, directory=ctx.directory,
                   mets_basename=ctx.mets_basename,
                   automatic_backup=ctx.automatic_backup)
    # Files outside the workspace are copied into the fileGrp subdirectory first.
    if not local_filename.startswith(ctx.directory):
        log.debug("File '%s' is not in workspace, copying", local_filename)
        local_filename = ctx.resolver.download_to_directory(
            ctx.directory, "file://" + local_filename, subdir=file_grp)
    file_url = "file://" + local_filename
    ws.mets.add_file(fileGrp=file_grp, ID=file_id, mimetype=mimetype,
                     url=file_url, pageId=page_id, force=force,
                     local_filename=local_filename)
    ws.save_mets()
def set_id(ctx, id):
    """Set the unique identifier of the workspace METS and persist it."""
    ws = Workspace(ctx.resolver, directory=ctx.directory,
                   mets_basename=ctx.mets_basename,
                   automatic_backup=ctx.automatic_backup)
    ws.mets.unique_identifier = id
    ws.save_mets()
def do_the_update(bagdir, non_local_urls=False):
    """
    Rename files in the OCRD-ZIP bag at *bagdir* so that their filename
    extension matches their METS mimetype, then update the bagit checksums.

    Args:
        bagdir: path to an unpacked OCRD-ZIP; must contain ``data/mets.xml``
        non_local_urls (bool): if True, also rewrite METS URLs of files that
            do not exist locally (only the METS entry changes then)
    """
    directory = Path(bagdir, 'data')
    if not Path(directory, 'mets.xml').exists():
        LOG.error("Something's wrong with OCRD-ZIP at %s, no data/mets.xml!", bagdir)
        return
    workspace = Workspace(resolver, directory=str(directory))
    with pushd_popd(directory):
        for f in workspace.mets.find_files():
            fp = Path(f.url)
            # Skip files that are not present on disk unless explicitly requested.
            if not fp.exists() and not non_local_urls:
                LOG.debug("Skipping non-local file: %s", fp)
                continue
            # Mapping failure: we cannot know the right extension, leave file alone.
            ext = MIME_TO_EXT.get(f.mimetype)
            if not ext:
                LOG.error(
                    "No rule to translate '%s' to an extension. Skipping %s",
                    f.mimetype, fp)
                continue
            if fp.suffix == ext:
                LOG.debug("Already has the right extension, %s", fp.name)
                continue
            # A recognized-but-wrong extension is stripped from the URL before
            # the correct one is appended below.
            if fp.suffix and fp.suffix in EXT_TO_MIME and fp.suffix != ext:
                LOG.warning("Has the WRONG extension, is '%s' should be '%s'", fp.suffix, ext)
                f.url = f.url[:-len(fp.suffix)]
            LOG.info('Renaming %s{,%s}', fp, ext)
            f.url = "%s%s" % (f.url, ext)
            # Only rename on disk when the file actually exists locally.
            if fp.exists():
                fp.rename('%s%s' % (fp, ext))
        workspace.save_mets()
    LOG.debug('Running bagit update script')
    update_checksums(bagdir)
    LOG.info("FINISHED: %s", bagdir)
def set_id(ctx, id):  # pylint: disable=redefined-builtin
    """Assign *id* as the METS unique identifier and save the METS."""
    ws = Workspace(ctx.resolver, directory=ctx.directory,
                   mets_basename=ctx.mets_basename,
                   automatic_backup=ctx.automatic_backup)
    ws.mets.unique_identifier = id
    ws.save_mets()
def prune_files(ctx, file_grp, mimetype, page_id, file_id):
    """
    Removes mets:files that point to non-existing local files

    (If any ``FILTER`` starts with ``//``, then its remainder will be
    interpreted as a regular expression.)
    """
    workspace = Workspace(ctx.resolver, directory=ctx.directory,
                          mets_basename=basename(ctx.mets_url),
                          automatic_backup=ctx.automatic_backup)
    with pushd_popd(workspace.directory):
        for f in workspace.mets.find_files(
            ID=file_id,
            fileGrp=file_grp,
            mimetype=mimetype,
            pageId=page_id,
        ):
            try:
                # Remove entries with no local file or a dangling path.
                if not f.local_filename or not exists(f.local_filename):
                    workspace.mets.remove_file(f.ID)
            except Exception as e:
                # FIX: '%f' was an invalid placeholder for a file object — use '%s'.
                ctx.log.exception("Error removing %s: %s", f, e)
                # FIX: bare 'raise' re-raises with the original traceback intact.
                raise
        workspace.save_mets()
def workspace_add_file(ctx, file_grp, file_id, mimetype, page_id, ignore, check_file_exists, force, fname):
    """
    Add a file or http(s) URL FNAME to METS in a workspace.
    If FNAME is not an http(s) URL and is not a workspace-local existing file,
    try to copy to workspace.
    """
    workspace = Workspace(ctx.resolver, directory=ctx.directory,
                          mets_basename=ctx.mets_basename,
                          automatic_backup=ctx.automatic_backup)
    log = getLogger('ocrd.cli.workspace.add')
    # Guess mimetype from the filename extension when not given explicitly.
    if not mimetype:
        try:
            mimetype = EXT_TO_MIME[Path(fname).suffix]
            log.info("Guessed mimetype to be %s" % mimetype)
        except KeyError:
            # NOTE(review): execution continues with mimetype still unset here —
            # confirm whether this should abort instead of only logging.
            log.error("Cannot guess mimetype from extension '%s' for '%s'. Set --mimetype explicitly" % (Path(fname).suffix, fname))
    kwargs = {'fileGrp': file_grp, 'ID': file_id, 'mimetype': mimetype,
              'pageId': page_id, 'force': force, 'ignore': ignore}
    log.debug("Adding '%s' (%s)", fname, kwargs)
    # http(s) URLs are stored as-is; anything else is resolved relative to or
    # copied into the workspace.
    if not (fname.startswith('http://') or fname.startswith('https://')):
        if not fname.startswith(ctx.directory):
            if not isabs(fname) and exists(join(ctx.directory, fname)):
                # Relative path that already exists inside the workspace.
                fname = join(ctx.directory, fname)
            else:
                log.debug("File '%s' is not in workspace, copying", fname)
                try:
                    fname = ctx.resolver.download_to_directory(ctx.directory, fname, subdir=file_grp)
                except FileNotFoundError:
                    if check_file_exists:
                        log.error("File '%s' does not exist, halt execution!" % fname)
                        sys.exit(1)
        if check_file_exists and not exists(fname):
            log.error("File '%s' does not exist, halt execution!" % fname)
            sys.exit(1)
        # Store workspace-relative paths in METS.
        if fname.startswith(ctx.directory):
            fname = relpath(fname, ctx.directory)
        kwargs['local_filename'] = fname
    kwargs['url'] = fname
    if not page_id:
        log.warning("You did not provide '--page-id/-g', so the file you added is not linked to a specific page.")
    workspace.mets.add_file(**kwargs)
    workspace.save_mets()
def workspace_add_file(ctx, file_grp, file_id, mimetype, group_id, local_filename):
    """Register LOCAL_FILENAME in the workspace METS and save."""
    ws = Workspace(ctx.resolver, directory=ctx.directory)
    ws.mets.add_file(file_grp=file_grp,
                     file_id=file_id,
                     mimetype=mimetype,
                     group_id=group_id,
                     local_filename=local_filename)
    ws.save_mets()
def set_id(ctx, id):  # pylint: disable=redefined-builtin
    """
    Set METS ID. If one of the supported identifier mechanisms is used, will set this identifier.

    Otherwise will create a new <mods:identifier type="purl">{{ ID }}</mods:identifier>.
    """
    ws = Workspace(ctx.resolver, directory=ctx.directory,
                   mets_basename=basename(ctx.mets_url),
                   automatic_backup=ctx.automatic_backup)
    ws.mets.unique_identifier = id
    ws.save_mets()
def prune_files(ctx):
    """
    Remove mets:files whose local file does not exist on disk.
    """
    workspace = Workspace(ctx.resolver, directory=ctx.directory,
                          mets_basename=ctx.mets_basename)
    with pushd_popd(workspace.directory):
        for f in workspace.mets.find_files():
            try:
                if not f.local_filename or not exists(f.local_filename):
                    workspace.mets.remove_file(f.ID)
            except Exception as e:
                # FIX: '%f' was an invalid placeholder for a file object — use '%s'.
                log.exception("Error removing %s: %s", f, e)
                # FIX: bare 'raise' keeps the original traceback.
                raise
        workspace.save_mets()
def test_crop(self):
    """Running the cropper must add exactly one PAGE-XML file to the METS."""
    with copy_of_directory(assets.path_to('dfki-testdata/data')) as wsdir:
        ws = Workspace(self.resolver, wsdir)
        n_pagexml_before = len(ws.mets.find_files(mimetype=MIMETYPE_PAGE))
        run_processor(
            OcrdAnybaseocrCropper,
            resolver=self.resolver,
            mets_url=str(Path(wsdir, 'mets.xml')),
            input_file_grp='BIN',
            output_file_grp='CROP-TEST',
            parameter={},
        )
        ws.reload_mets()
        n_pagexml_after = len(ws.mets.find_files(mimetype=MIMETYPE_PAGE))
        self.assertEqual(n_pagexml_after, n_pagexml_before + 1)
def test_crop(self):
    """Running the dewarper (CUDA only) must add exactly one PAGE-XML file."""
    if not torch.cuda.is_available():
        pytest.skip('CUDA is not available, cannot test dewarping')
    with copy_of_directory(assets.path_to('dfki-testdata/data')) as wsdir:
        ws = Workspace(self.resolver, wsdir)
        n_pagexml_before = len(ws.mets.find_files(mimetype=MIMETYPE_PAGE))
        run_processor(
            OcrdAnybaseocrDewarper,
            resolver=self.resolver,
            mets_url=str(Path(wsdir, 'mets.xml')),
            input_file_grp='BIN',
            output_file_grp='DEWARP-TEST',
            parameter={'model_path': str(self.model_path)},
        )
        ws.reload_mets()
        n_pagexml_after = len(ws.mets.find_files(mimetype=MIMETYPE_PAGE))
        self.assertEqual(n_pagexml_after, n_pagexml_before + 1)
def test_copies_ok(self):
    """
    End-to-end check of DummyProcessor: output file counts, naming scheme,
    and PAGE-XML linkage, then chaining a second run on the first's output.
    """
    with copy_of_directory(assets.url_of('SBB0000F29300010000/data')) as wsdir:
        workspace = Workspace(Resolver(), wsdir)
        # Fixture sanity: 3 input images, no OUTPUT group yet.
        input_files = workspace.mets.find_files(fileGrp='OCR-D-IMG')
        self.assertEqual(len(input_files), 3)
        output_files = workspace.mets.find_files(fileGrp='OUTPUT')
        self.assertEqual(len(output_files), 0)
        run_processor(
            DummyProcessor,
            input_file_grp='OCR-D-IMG',
            output_file_grp='OUTPUT',
            workspace=workspace
        )
        output_files = workspace.mets.find_files(fileGrp='OUTPUT')
        # Sort so the .tif/.xml pair for page 0001 lands at indices 0 and 1.
        output_files.sort(key=lambda x: x.url)
        print([str(s) for s in output_files])
        self.assertEqual(output_files[0].url, 'OUTPUT/OUTPUT_0001.tif')
        self.assertEqual(output_files[1].url, 'OUTPUT/OUTPUT_0001.xml')
        # PAGE-XML must reference its own file ID and its image's URL.
        self.assertEqual(page_from_file(output_files[1]).pcGtsId, output_files[1].ID)
        self.assertEqual(page_from_file(output_files[1]).get_Page().imageFilename, output_files[0].url)
        # 3 pages x (image + PAGE-XML) = 6 output files, 3 of them PAGE.
        self.assertEqual(len(output_files), 6)
        self.assertEqual(len(workspace.mets.find_files(ID='//OUTPUT.*')), 6)
        self.assertEqual(len(workspace.mets.find_files(ID='//OUTPUT.*_PAGE')), 3)
        self.assertEqual(len(workspace.mets.find_files(fileGrp='OUTPUT', mimetype=MIMETYPE_PAGE)), 3)
        # Second pass consumes OUTPUT; only the 3 PAGE files yield OUTPUT2 entries.
        run_processor(
            DummyProcessor,
            input_file_grp='OUTPUT',
            output_file_grp='OUTPUT2',
            workspace=workspace
        )
        output2_files = workspace.mets.find_files(fileGrp='OUTPUT2')
        output2_files.sort(key=lambda x: x.url)
        self.assertEqual(len(output2_files), 3)
def workspace_backup_list(ctx):
    """
    List backups
    """
    ws = Workspace(ctx.resolver, directory=ctx.directory,
                   mets_basename=basename(ctx.mets_url),
                   automatic_backup=ctx.automatic_backup)
    manager = WorkspaceBackupManager(ws)
    for backup in manager.list():
        print(backup)
def workspace_add_file(ctx, file_grp, file_id, mimetype, page_id, ignore, check_file_exists, force, fname):
    """
    Add a file or http(s) URL FNAME to METS in a workspace.
    If FNAME is not an http(s) URL and is not a workspace-local existing file,
    try to copy to workspace.
    """
    workspace = Workspace(ctx.resolver, directory=ctx.directory,
                          mets_basename=basename(ctx.mets_url),
                          automatic_backup=ctx.automatic_backup)
    kwargs = {'fileGrp': file_grp, 'ID': file_id, 'mimetype': mimetype,
              'pageId': page_id, 'force': force, 'ignore': ignore}
    log = getLogger('ocrd.cli.workspace.add')
    log.debug("Adding '%s' (%s)", fname, kwargs)
    # http(s) URLs are stored as-is; anything else is resolved relative to or
    # copied into the workspace.
    if not (fname.startswith('http://') or fname.startswith('https://')):
        if not fname.startswith(ctx.directory):
            if not isabs(fname) and exists(join(ctx.directory, fname)):
                # Relative path that already exists inside the workspace.
                fname = join(ctx.directory, fname)
            else:
                log.debug("File '%s' is not in workspace, copying", fname)
                try:
                    fname = ctx.resolver.download_to_directory(ctx.directory, fname, subdir=file_grp)
                except FileNotFoundError:
                    if check_file_exists:
                        log.error("File '%s' does not exist, halt execution!" % fname)
                        sys.exit(1)
        if check_file_exists and not exists(fname):
            log.error("File '%s' does not exist, halt execution!" % fname)
            sys.exit(1)
        # Store workspace-relative paths in METS.
        if fname.startswith(ctx.directory):
            fname = relpath(fname, ctx.directory)
        kwargs['local_filename'] = fname
    kwargs['url'] = fname
    workspace.mets.add_file(**kwargs)
    workspace.save_mets()
def list_pages(ctx):
    """
    List physical page IDs
    """
    ws = Workspace(ctx.resolver, directory=ctx.directory,
                   mets_basename=basename(ctx.mets_url))
    listing = "\n".join(ws.mets.physical_pages)
    print(listing)
def get_id(ctx):
    """
    Get METS id if any
    """
    ws = Workspace(ctx.resolver, directory=ctx.directory,
                   mets_basename=basename(ctx.mets_url))
    identifier = ws.mets.unique_identifier
    if identifier:
        print(identifier)
def list_groups(ctx):
    """
    List fileGrp USE attributes
    """
    ws = Workspace(ctx.resolver, directory=ctx.directory,
                   mets_basename=basename(ctx.mets_url))
    listing = "\n".join(ws.mets.file_groups)
    print(listing)
def merge(ctx, copy_files, filegrp_mapping, file_grp, file_id, page_id, mimetype, mets_path):  # pylint: disable=redefined-builtin
    """
    Merges this workspace with the workspace that contains ``METS_PATH``

    The ``--file-id``, ``--page-id``, ``--mimetype`` and ``--file-grp`` options have
    the same semantics as in ``ocrd workspace find``, see
    ``ocrd workspace find --help`` for an explanation.
    """
    other_mets = Path(mets_path)
    # The mapping option arrives as a JSON string; keep falsy values untouched.
    mapping = loads(filegrp_mapping) if filegrp_mapping else filegrp_mapping
    ws = Workspace(ctx.resolver, directory=ctx.directory,
                   mets_basename=ctx.mets_basename,
                   automatic_backup=ctx.automatic_backup)
    other_ws = Workspace(ctx.resolver, directory=str(other_mets.parent),
                         mets_basename=str(other_mets.name))
    ws.merge(
        other_ws,
        copy_files=copy_files,
        fileGrp_mapping=mapping,
        fileGrp=file_grp,
        ID=file_id,
        pageId=page_id,
        mimetype=mimetype,
    )
    ws.save_mets()
def setUp(self):
    """Copy the kant_aufklaerung_1784 fixture into a temp dir and build a backup manager."""
    self.resolver = Resolver()
    self.tempdir = mkdtemp()
    self.workspace_dir = join(self.tempdir, 'kant_aufklaerung_1784')
    copytree(assets.path_to('kant_aufklaerung_1784/data'), self.workspace_dir)
    self.workspace = Workspace(self.resolver, directory=self.workspace_dir)
    self.mgr = WorkspaceBackupManager(self.workspace)
def workspace_backup_undo(ctx):
    """
    Restore the last backup
    """
    ws = Workspace(ctx.resolver, directory=ctx.directory,
                   mets_basename=basename(ctx.mets_url),
                   automatic_backup=ctx.automatic_backup)
    WorkspaceBackupManager(ws).undo()
def workspace_backup_restore(ctx, choose_first, bak):
    """
    Restore backup BAK
    """
    ws = Workspace(ctx.resolver, directory=ctx.directory,
                   mets_basename=basename(ctx.mets_url),
                   automatic_backup=ctx.automatic_backup)
    WorkspaceBackupManager(ws).restore(bak, choose_first)
def workspace_backup_add(ctx):
    """
    Create a new backup
    """
    ws = Workspace(ctx.resolver, directory=ctx.directory,
                   mets_basename=basename(ctx.mets_url),
                   automatic_backup=ctx.automatic_backup)
    WorkspaceBackupManager(ws).add()
def validate_process(tasks, workspace):
    '''
    Validate a sequence of tasks passable to 'ocrd process'
    '''
    parsed = [ProcessorTask.parse(task) for task in tasks]
    if workspace:
        # With a workspace, validate the whole chain against its contents.
        ws = Workspace(Resolver(), directory=workspace)
        _inform_of_result(validate_tasks(parsed, ws))
    else:
        # Without a workspace, each task can only be validated in isolation.
        for task in parsed:
            _inform_of_result(task.validate())
def rename_group(ctx, old, new):
    """
    Rename fileGrp (USE attribute ``OLD`` to ``NEW``).
    """
    # NOTE: the previous docstring had the arguments reversed; the call below
    # renames OLD to NEW.
    workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=basename(ctx.mets_url))
    workspace.rename_file_group(old, new)
    workspace.save_mets()
def workspace_find(ctx, file_grp, mimetype, page_id, file_id, output_field, download):
    """
    Find files.

    (If any ``FILTER`` starts with ``//``, then its remainder will be
    interpreted as a regular expression.)
    """
    modified_mets = False
    ret = list()
    workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=basename(ctx.mets_url))
    for f in workspace.mets.find_files(
        ID=file_id,
        fileGrp=file_grp,
        mimetype=mimetype,
        pageId=page_id,
    ):
        if download and not f.local_filename:
            workspace.download_file(f)
            modified_mets = True
        # For the 'pageId' column, emit the file ID as a placeholder; it is
        # resolved to the physical page in one batched lookup further below.
        ret.append([f.ID if field == 'pageId' else getattr(f, field) or ''
                    for field in output_field])
    # Only persist the METS when downloads actually changed it.
    if modified_mets:
        workspace.save_mets()
    if 'pageId' in output_field:
        idx = output_field.index('pageId')
        fileIds = list(map(lambda fields: fields[idx], ret))
        # Single call translating all file IDs to their physical page IDs.
        pages = workspace.mets.get_physical_pages(for_fileIds=fileIds)
        for fields, page in zip(ret, pages):
            fields[idx] = page or ''
    for fields in ret:
        print('\t'.join(fields))
def remove_group(ctx, group, recursive, force):
    """Remove each of the given fileGrps from the workspace METS and save."""
    ws = Workspace(ctx.resolver, directory=ctx.directory,
                   mets_basename=ctx.mets_basename)
    for use in group:
        ws.remove_file_group(use, recursive, force)
    ws.save_mets()
def workspace_remove_file(ctx, id, force):  # pylint: disable=redefined-builtin
    """
    Delete file by ID from mets.xml
    """
    ws = Workspace(ctx.resolver, directory=ctx.directory,
                   mets_basename=ctx.mets_basename,
                   automatic_backup=ctx.automatic_backup)
    for file_id in id:
        ws.remove_file(file_id, force=force)
    ws.save_mets()
def remove_group(ctx, group, recursive, force, keep_files):
    """
    Delete fileGrps (given by their USE attribute ``GROUP``).

    (If any ``GROUP`` starts with ``//``, then its remainder will be
    interpreted as a regular expression.)
    """
    ws = Workspace(ctx.resolver, directory=ctx.directory,
                   mets_basename=basename(ctx.mets_url))
    for use in group:
        ws.remove_file_group(use, recursive=recursive, force=force,
                             keep_files=keep_files)
    ws.save_mets()
def workspace_remove_file(ctx, id, force, keep_file):  # pylint: disable=redefined-builtin
    """
    Delete files (given by their ID attribute ``ID``).

    (If any ``ID`` starts with ``//``, then its remainder will be
    interpreted as a regular expression.)
    """
    ws = Workspace(ctx.resolver, directory=ctx.directory,
                   mets_basename=basename(ctx.mets_url),
                   automatic_backup=ctx.automatic_backup)
    for file_id in id:
        ws.remove_file(file_id, force=force, keep_file=keep_file)
    ws.save_mets()
def workspace_find(ctx, file_grp, local_only, mimetype, group_id, file_id, output_field, download):
    """
    Find files.
    """
    workspace = Workspace(ctx.resolver, directory=ctx.directory)
    for f in workspace.mets.find_files(
        ID=file_id,
        fileGrp=file_grp,
        local_only=local_only,
        mimetype=mimetype,
        groupId=group_id,
    ):
        if download:
            workspace.download_file(f, subdir=f.fileGrp)
            # NOTE(review): saving inside the loop persists the METS after every
            # download — presumably intentional; a single save after the loop
            # would be cheaper. Confirm before changing.
            workspace.save_mets()
        # One tab-separated line of the requested fields per file; missing
        # attribute values are printed as empty strings.
        ret = '\t'.join([getattr(f, field) or '' for field in output_field])
        print(ret)