def merge(ctx, copy_files, filegrp_mapping, file_grp, file_id, page_id, mimetype, mets_path):   # pylint: disable=redefined-builtin
    """
    Merges this workspace with the workspace that contains ``METS_PATH``

    The ``--file-id``, ``--page-id``, ``--mimetype`` and ``--file-grp`` options have
    the same semantics as in ``ocrd workspace find``, see ``ocrd workspace find --help``
    for an explanation.
    """
    other_mets = Path(mets_path)
    # The mapping is only decoded with ``loads`` when one was actually passed.
    mapping = loads(filegrp_mapping) if filegrp_mapping else filegrp_mapping

    # Workspace that receives the merged content.
    target = Workspace(
        ctx.resolver,
        directory=ctx.directory,
        mets_basename=ctx.mets_basename,
        automatic_backup=ctx.automatic_backup,
    )
    # Workspace to merge from, located via the directory and file name of METS_PATH.
    source = Workspace(
        ctx.resolver,
        directory=str(other_mets.parent),
        mets_basename=str(other_mets.name),
    )

    merge_options = dict(
        copy_files=copy_files,
        fileGrp_mapping=mapping,
        fileGrp=file_grp,
        ID=file_id,
        pageId=page_id,
        mimetype=mimetype,
    )
    target.merge(source, **merge_options)
    target.save_mets()
def workspace_add_file(ctx, file_grp, file_id, mimetype, page_id, force, local_filename):
    """
    Add a file LOCAL_FILENAME to METS in a workspace.
    """
    ws = Workspace(
        ctx.resolver,
        directory=ctx.directory,
        mets_basename=ctx.mets_basename,
        automatic_backup=ctx.automatic_backup,
    )

    # Files whose path does not start with the workspace directory are first
    # fetched into the workspace, below a sub-directory named after the fileGrp.
    inside_workspace = local_filename.startswith(ctx.directory)
    if not inside_workspace:
        log.debug("File '%s' is not in workspace, copying", local_filename)
        local_filename = ctx.resolver.download_to_directory(
            ctx.directory, "file://" + local_filename, subdir=file_grp)

    file_attrs = {
        'fileGrp': file_grp,
        'ID': file_id,
        'mimetype': mimetype,
        'url': "file://" + local_filename,
        'pageId': page_id,
        'force': force,
        'local_filename': local_filename,
    }
    ws.mets.add_file(**file_attrs)
    ws.save_mets()
def prune_files(ctx, file_grp, mimetype, page_id, file_id):
    """
    Removes mets:files that point to non-existing local files

    (If any ``FILTER`` starts with ``//``, then its remainder
     will be interpreted as a regular expression.)
    """
    workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=basename(ctx.mets_url), automatic_backup=ctx.automatic_backup)
    # Existence of each local_filename is checked with the workspace directory as cwd.
    with pushd_popd(workspace.directory):
        for f in workspace.mets.find_files(
            ID=file_id,
            fileGrp=file_grp,
            mimetype=mimetype,
            pageId=page_id,
        ):
            try:
                # Drop entries without a local file name or whose file is missing.
                if not f.local_filename or not exists(f.local_filename):
                    workspace.mets.remove_file(f.ID)
            except Exception as e:
                # Use "%s": the file entry is not a float, so "%f" would make the
                # logging call itself fail to format and hide the real error.
                ctx.log.exception("Error removing %s: %s", f, e)
                # Bare raise re-raises the active exception with its traceback intact.
                raise
    workspace.save_mets()
def set_id(ctx, id):
    """
    Set the unique identifier of the workspace METS to ``id`` and save the METS.
    """
    ws = Workspace(
        ctx.resolver,
        directory=ctx.directory,
        mets_basename=ctx.mets_basename,
        automatic_backup=ctx.automatic_backup,
    )
    ws.mets.unique_identifier = id
    ws.save_mets()
def do_the_update(bagdir, non_local_urls=False):
    """
    Give every file referenced in ``<bagdir>/data/mets.xml`` the extension
    that ``MIME_TO_EXT`` maps its mimetype to, then call ``update_checksums``.

    For each file entry the METS URL is rewritten and, if the file exists on
    disk, the file is renamed accordingly. Entries are skipped when the file
    is not found locally (unless ``non_local_urls`` is true), when no extension
    is known for the mimetype, or when the extension is already correct.

    Args:
        bagdir: directory of the unpacked OCRD-ZIP; must contain ``data/mets.xml``.
        non_local_urls: if true, also rewrite URLs whose target is not found
            on disk (nothing is renamed for those).
    """
    directory = Path(bagdir, 'data')
    if not Path(directory, 'mets.xml').exists():
        LOG.error("Something's wrong with OCRD-ZIP at %s, no data/mets.xml!", bagdir)
        return
    workspace = Workspace(resolver, directory=str(directory))
    # URLs are resolved as paths relative to the data directory.
    with pushd_popd(directory):
        for f in workspace.mets.find_files():
            fp = Path(f.url)
            if not fp.exists() and not non_local_urls:
                LOG.debug("Skipping non-local file: %s", fp)
                continue
            ext = MIME_TO_EXT.get(f.mimetype)
            if not ext:
                LOG.error("No rule to translate '%s' to an extension. Skipping %s", f.mimetype, fp)
                continue
            if fp.suffix == ext:
                LOG.debug("Already has the right extension, %s", fp.name)
                continue
            new_url = f.url
            new_path = str(fp)
            if fp.suffix and fp.suffix in EXT_TO_MIME:
                # A known but wrong extension is replaced rather than appended to.
                # Strip it from BOTH the URL and the on-disk path so the METS
                # entry and the renamed file keep pointing at the same name.
                LOG.warning("Has the WRONG extension, is '%s' should be '%s'", fp.suffix, ext)
                new_url = new_url[:-len(fp.suffix)]
                new_path = new_path[:-len(fp.suffix)]
            LOG.info('Renaming %s{,%s}', fp, ext)
            f.url = "%s%s" % (new_url, ext)
            if fp.exists():
                fp.rename('%s%s' % (new_path, ext))
    workspace.save_mets()
    LOG.debug('Running bagit update script')
    update_checksums(bagdir)
    LOG.info("FINISHED: %s", bagdir)
def workspace_find(ctx, file_grp, mimetype, page_id, file_id, output_field, download):
    """
    Find files.

    (If any ``FILTER`` starts with ``//``, then its remainder
     will be interpreted as a regular expression.)
    """
    ws = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=basename(ctx.mets_url))
    rows = []
    downloaded_any = False

    matches = ws.mets.find_files(
        ID=file_id,
        fileGrp=file_grp,
        mimetype=mimetype,
        pageId=page_id,
    )
    for ocrd_file in matches:
        # Only files without a local copy are downloaded.
        if download and not ocrd_file.local_filename:
            ws.download_file(ocrd_file)
            downloaded_any = True
        row = []
        for field in output_field:
            if field == 'pageId':
                # Placeholder: store the file ID, it is swapped for the page below.
                row.append(ocrd_file.ID)
            else:
                row.append(getattr(ocrd_file, field) or '')
        rows.append(row)

    # The METS is only written back when a download took place.
    if downloaded_any:
        ws.save_mets()

    if 'pageId' in output_field:
        page_col = output_field.index('pageId')
        file_ids = [row[page_col] for row in rows]
        # One lookup for all collected file IDs.
        pages = ws.mets.get_physical_pages(for_fileIds=file_ids)
        for row, page in zip(rows, pages):
            row[page_col] = page or ''

    for row in rows:
        print('\t'.join(row))
def set_id(ctx, id):   # pylint: disable=redefined-builtin
    """
    Store ``id`` as the unique identifier of the workspace METS and save it.
    """
    resolver = ctx.resolver
    ws = Workspace(
        resolver,
        directory=ctx.directory,
        mets_basename=ctx.mets_basename,
        automatic_backup=ctx.automatic_backup,
    )
    ws.mets.unique_identifier = id
    ws.save_mets()
def rename_group(ctx, old, new):
    """
    Rename fileGrp (USE attribute ``OLD`` to ``NEW``).
    """
    mets_name = basename(ctx.mets_url)
    ws = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=mets_name)
    ws.rename_file_group(old, new)
    ws.save_mets()
def remove_group(ctx, group, recursive, force):
    """
    Remove every fileGrp listed in ``group`` from the workspace, then save the METS.
    """
    ws = Workspace(
        ctx.resolver,
        directory=ctx.directory,
        mets_basename=ctx.mets_basename,
    )
    for file_grp in group:
        ws.remove_file_group(file_grp, recursive, force)
    ws.save_mets()
def workspace_add_file(ctx, file_grp, file_id, mimetype, page_id, ignore, check_file_exists, force, fname):
    """
    Add a file or http(s) URL FNAME to METS in a workspace.
    If FNAME is not an http(s) URL and is not a workspace-local existing file, try to copy to workspace.
    """
    ws = Workspace(
        ctx.resolver,
        directory=ctx.directory,
        mets_basename=ctx.mets_basename,
        automatic_backup=ctx.automatic_backup,
    )
    log = getLogger('ocrd.cli.workspace.add')

    # Without an explicit mimetype, look one up by file extension. A failed
    # lookup is only logged; processing continues with the mimetype unset.
    if not mimetype:
        suffix = Path(fname).suffix
        try:
            mimetype = EXT_TO_MIME[suffix]
        except KeyError:
            log.error("Cannot guess mimetype from extension '%s' for '%s'. Set --mimetype explicitly" % (suffix, fname))
        else:
            log.info("Guessed mimetype to be %s" % mimetype)

    file_args = {
        'fileGrp': file_grp,
        'ID': file_id,
        'mimetype': mimetype,
        'pageId': page_id,
        'force': force,
        'ignore': ignore,
    }
    log.debug("Adding '%s' (%s)", fname, file_args)

    is_remote = fname.startswith(('http://', 'https://'))
    if not is_remote:
        if not fname.startswith(ctx.directory):
            in_workspace = join(ctx.directory, fname)
            if not isabs(fname) and exists(in_workspace):
                # Relative name of a file that already exists below the workspace.
                fname = in_workspace
            else:
                log.debug("File '%s' is not in workspace, copying", fname)
                try:
                    fname = ctx.resolver.download_to_directory(ctx.directory, fname, subdir=file_grp)
                except FileNotFoundError:
                    # A missing source is only fatal when existence checking is on.
                    if check_file_exists:
                        log.error("File '%s' does not exist, halt execution!" % fname)
                        sys.exit(1)
        if check_file_exists and not exists(fname):
            log.error("File '%s' does not exist, halt execution!" % fname)
            sys.exit(1)
        # Paths below the workspace directory are recorded relative to it.
        if fname.startswith(ctx.directory):
            fname = relpath(fname, ctx.directory)
        file_args['local_filename'] = fname

    file_args['url'] = fname
    if not page_id:
        log.warning("You did not provide '--page-id/-g', so the file you added is not linked to a specific page.")
    ws.mets.add_file(**file_args)
    ws.save_mets()
def workspace_add_file(ctx, file_grp, file_id, mimetype, group_id, local_filename):
    """
    Register ``local_filename`` in the workspace METS and save the METS.
    """
    ws = Workspace(ctx.resolver, directory=ctx.directory)
    file_attrs = {
        'file_grp': file_grp,
        'file_id': file_id,
        'mimetype': mimetype,
        'group_id': group_id,
        'local_filename': local_filename,
    }
    ws.mets.add_file(**file_attrs)
    ws.save_mets()
def workspace_remove_file(ctx, id, force):  # pylint: disable=redefined-builtin
    """
    Delete file by ID from mets.xml
    """
    ws = Workspace(
        ctx.resolver,
        directory=ctx.directory,
        mets_basename=ctx.mets_basename,
        automatic_backup=ctx.automatic_backup,
    )
    # ``id`` is iterated, so it holds one or more file IDs.
    for file_id in id:
        ws.remove_file(file_id, force=force)
    ws.save_mets()
def remove_group(ctx, group, recursive, force, keep_files):
    """
    Delete fileGrps (given by their USE attribute ``GROUP``).

    (If any ``GROUP`` starts with ``//``, then its remainder
     will be interpreted as a regular expression.)
    """
    mets_name = basename(ctx.mets_url)
    ws = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=mets_name)
    removal_options = {'recursive': recursive, 'force': force, 'keep_files': keep_files}
    for file_grp in group:
        ws.remove_file_group(file_grp, **removal_options)
    ws.save_mets()
def set_id(ctx, id):   # pylint: disable=redefined-builtin
    """
    Set METS ID.

    If one of the supported identifier mechanisms is used, will set this identifier.

    Otherwise will create a new <mods:identifier type="purl">{{ ID }}</mods:identifier>.
    """
    ws = Workspace(
        ctx.resolver,
        directory=ctx.directory,
        mets_basename=basename(ctx.mets_url),
        automatic_backup=ctx.automatic_backup,
    )
    ws.mets.unique_identifier = id
    ws.save_mets()
def workspace_remove_file(ctx, id, force, keep_file):  # pylint: disable=redefined-builtin
    """
    Delete files (given by their ID attribute ``ID``).

    (If any ``ID`` starts with ``//``, then its remainder
     will be interpreted as a regular expression.)
    """
    ws = Workspace(
        ctx.resolver,
        directory=ctx.directory,
        mets_basename=basename(ctx.mets_url),
        automatic_backup=ctx.automatic_backup,
    )
    for file_id in id:
        ws.remove_file(file_id, force=force, keep_file=keep_file)
    ws.save_mets()
def prune_files(ctx):
    """
    Removes mets:files that point to non-existing local files
    """
    workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename)
    # Existence of each local_filename is checked with the workspace directory as cwd.
    with pushd_popd(workspace.directory):
        for f in workspace.mets.find_files():
            try:
                # Drop entries without a local file name or whose file is missing.
                if not f.local_filename or not exists(f.local_filename):
                    workspace.mets.remove_file(f.ID)
            except Exception as e:
                # Use "%s": the file entry is not a float, so "%f" would make the
                # logging call itself fail to format and hide the real error.
                log.exception("Error removing %s: %s", f, e)
                # Bare raise re-raises the active exception with its traceback intact.
                raise
    workspace.save_mets()
def workspace_add_file(ctx, file_grp, file_id, mimetype, page_id, ignore, check_file_exists, force, fname):
    """
    Add a file or http(s) URL FNAME to METS in a workspace.
    If FNAME is not an http(s) URL and is not a workspace-local existing file, try to copy to workspace.
    """
    ws = Workspace(
        ctx.resolver,
        directory=ctx.directory,
        mets_basename=basename(ctx.mets_url),
        automatic_backup=ctx.automatic_backup,
    )

    file_args = {
        'fileGrp': file_grp,
        'ID': file_id,
        'mimetype': mimetype,
        'pageId': page_id,
        'force': force,
        'ignore': ignore,
    }

    log = getLogger('ocrd.cli.workspace.add')
    log.debug("Adding '%s' (%s)", fname, file_args)

    is_remote = fname.startswith(('http://', 'https://'))
    if not is_remote:
        if not fname.startswith(ctx.directory):
            in_workspace = join(ctx.directory, fname)
            if not isabs(fname) and exists(in_workspace):
                # Relative name of a file that already exists below the workspace.
                fname = in_workspace
            else:
                log.debug("File '%s' is not in workspace, copying", fname)
                try:
                    fname = ctx.resolver.download_to_directory(ctx.directory, fname, subdir=file_grp)
                except FileNotFoundError:
                    # A missing source is only fatal when existence checking is on.
                    if check_file_exists:
                        log.error("File '%s' does not exist, halt execution!" % fname)
                        sys.exit(1)
        if check_file_exists and not exists(fname):
            log.error("File '%s' does not exist, halt execution!" % fname)
            sys.exit(1)
        # Paths below the workspace directory are recorded relative to it.
        if fname.startswith(ctx.directory):
            fname = relpath(fname, ctx.directory)
        file_args['local_filename'] = fname

    file_args['url'] = fname
    ws.mets.add_file(**file_args)
    ws.save_mets()
def workspace_find(ctx, file_grp, local_only, mimetype, group_id, file_id, output_field, download):
    """
    Find files.
    """
    ws = Workspace(ctx.resolver, directory=ctx.directory)
    query = {
        'ID': file_id,
        'fileGrp': file_grp,
        'local_only': local_only,
        'mimetype': mimetype,
        'groupId': group_id,
    }
    for match in ws.mets.find_files(**query):
        if download:
            # Fetch into a sub-directory named after the fileGrp; the METS is
            # saved after every single download.
            ws.download_file(match, subdir=match.fileGrp)
            ws.save_mets()
        # One tab-separated line per file; unset fields print as empty strings.
        columns = [getattr(match, field) or '' for field in output_field]
        print('\t'.join(columns))
def workspace_find(ctx, file_grp, mimetype, page_id, file_id, output_field, download):
    """
    Find files.
    """
    ws = Workspace(
        ctx.resolver,
        directory=ctx.directory,
        mets_basename=ctx.mets_basename,
    )
    query = {
        'ID': file_id,
        'fileGrp': file_grp,
        'mimetype': mimetype,
        'pageId': page_id,
    }
    for match in ws.mets.find_files(**query):
        if download:
            # The METS is saved after every single download.
            ws.download_file(match)
            ws.save_mets()
        # One tab-separated line per file; unset fields print as empty strings.
        columns = [getattr(match, field) or '' for field in output_field]
        print('\t'.join(columns))
def workspace_cli_bulk_add(ctx, regex, mimetype, page_id, file_id, url, file_grp, dry_run, file_glob, ignore, force, skip):
    r"""
    Add files in bulk to an OCR-D workspace.

    FILE_GLOB can either be a shell glob expression or a list of files.

    --regex is applied to the absolute path of every file in FILE_GLOB and can
    define named groups that can be used in --page-id, --file-id, --mimetype, --url and
    --file-grp by referencing the named group 'grp' in the regex as '{{ grp }}'.

    \b
    Example:
        ocrd workspace bulk-add \\
                --regex '^.*/(?P<fileGrp>[^/]+)/page_(?P<pageid>.*)\.(?P<ext>[^\.]*)$' \\
                --file-id 'FILE_{{ fileGrp }}_{{ pageid }}' \\
                --page-id 'PHYS_{{ pageid }}' \\
                --file-grp "{{ fileGrp }}" \\
                --url '{{ fileGrp }}/FILE_{{ pageid }}.{{ ext }}' \\
                path/to/files/*/*.*

    """
    log = getLogger('ocrd.cli.workspace.bulk-add') # pylint: disable=redefined-outer-name
    workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=basename(ctx.mets_url), automatic_backup=ctx.automatic_backup)

    try:
        pat = re.compile(regex)
    except Exception as e:
        log.error("Invalid regex: %s" % e)
        sys.exit(1)

    # Expand every glob; matches are resolved to absolute paths.
    file_paths = []
    for fglob in file_glob:
        file_paths += [Path(x).resolve() for x in glob(fglob)]

    for i, file_path in enumerate(file_paths):
        log.info("[%4d/%d] %s" % (i, len(file_paths), file_path))

        # match regex
        m = pat.match(str(file_path))
        if not m:
            # With skip set, unmatched files are passed over instead of aborting.
            if skip:
                continue
            log.error("File not matched by regex: '%s'" % file_path)
            sys.exit(1)
        group_dict = m.groupdict()

        # set up file info
        file_dict = {'url': url, 'mimetype': mimetype, 'ID': file_id, 'pageId': page_id, 'fileGrp': file_grp}

        # guess mime type
        if not file_dict['mimetype']:
            try:
                file_dict['mimetype'] = EXT_TO_MIME[file_path.suffix]
            except KeyError:
                log.error("Cannot guess mimetype from extension '%s' for '%s'. Set --mimetype explicitly" % (file_path.suffix, file_path))

        # expand templates
        for param_name in file_dict:
            # Unset values (e.g. no url given, or the mimetype guess failed) have
            # nothing to expand; calling .replace on None would raise AttributeError.
            if not file_dict[param_name]:
                continue
            for group_name in group_dict:
                file_dict[param_name] = file_dict[param_name].replace('{{ %s }}' % group_name, group_dict[group_name])

        # copy files
        if file_dict['url']:
            urlpath = Path(workspace.directory, file_dict['url'])
            if not urlpath.exists():
                log.info("cp '%s' '%s'", file_path, urlpath)
                if not dry_run:
                    # The url may name several directory levels that do not exist yet.
                    urlpath.parent.mkdir(parents=True, exist_ok=True)
                    urlpath.write_bytes(file_path.read_bytes())

        # Add to workspace (or not)
        fileGrp = file_dict.pop('fileGrp')
        if dry_run:
            log.info('workspace.add_file(%s)' % file_dict)
        else:
            workspace.add_file(fileGrp, ignore=ignore, force=force, **file_dict)

    # save changes to disk
    workspace.save_mets()
def set_id(ctx, ID):
    """
    Set the unique identifier of the workspace METS to ``ID`` and save the METS.
    """
    ws = Workspace(ctx.resolver, directory=ctx.directory)
    mets = ws.mets
    mets.unique_identifier = ID
    ws.save_mets()
def workspace_cli_bulk_add(ctx, regex, mimetype, page_id, file_id, url, file_grp, dry_run, file_glob, src_path_option, ignore, force, skip):
    """
    Add files in bulk to an OCR-D workspace.

    FILE_GLOB can either be a shell glob expression to match file names,
    or a list of expressions or '-', in which case expressions are read from STDIN.

    After globbing, --regex is matched against each expression resulting from FILE_GLOB, and can
    define named groups reusable in the --page-id, --file-id, --mimetype, --url, --source-path and
    --file-grp options, e.g. by referencing the group name 'grp' from the regex as '{{ grp }}'.

    If the FILE_GLOB expressions do not denote the file names themselves
    (but arbitrary strings for --regex matching), then use --source-path to set
    the actual file paths to use. (This could involve fixed strings or group references.)

    \b
    Examples:
        ocrd workspace bulk-add \\
                --regex '(?P<fileGrp>[^/]+)/page_(?P<pageid>.*)\\.[^.]+' \\
                --page-id 'PHYS_{{ pageid }}' \\
                --file-grp "{{ fileGrp }}" \\
                path/to/files/*/*.*
        \b
        echo "path/to/src/file.xml SEG/page_p0001.xml" \\
        | ocrd workspace bulk-add \\
                --regex '(?P<src>.*?) (?P<fileGrp>.+?)/page_(?P<pageid>.*)\\.(?P<ext>[^\\.]*)' \\
                --file-id 'FILE_{{ fileGrp }}_{{ pageid }}' \\
                --page-id 'PHYS_{{ pageid }}' \\
                --file-grp "{{ fileGrp }}" \\
                --url '{{ fileGrp }}/FILE_{{ pageid }}.{{ ext }}' \\
                -
        \b
        { echo PHYS_0001 BIN FILE_0001_BIN.IMG-wolf BIN/FILE_0001_BIN.IMG-wolf.png; \\
          echo PHYS_0001 BIN FILE_0001_BIN BIN/FILE_0001_BIN.xml; \\
          echo PHYS_0002 BIN FILE_0002_BIN.IMG-wolf BIN/FILE_0002_BIN.IMG-wolf.png; \\
          echo PHYS_0002 BIN FILE_0002_BIN BIN/FILE_0002_BIN.xml; \\
        } | ocrd workspace bulk-add -r '(?P<pageid>.*) (?P<filegrp>.*) (?P<fileid>.*) (?P<url>.*)' \\
          -G '{{ filegrp }}' -g '{{ pageid }}' -i '{{ fileid }}' -S '{{ url }}' -

    """
    log = getLogger('ocrd.cli.workspace.bulk-add') # pylint: disable=redefined-outer-name
    workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup)

    try:
        pat = re.compile(regex)
    except Exception as e:
        log.error("Invalid regex: %s" % e)
        sys.exit(1)

    file_paths = []
    from_stdin = file_glob == ('-',)
    if from_stdin:
        # One expression per line of STDIN.
        file_paths += [Path(x.strip('\n')) for x in sys.stdin.readlines()]
    else:
        for fglob in file_glob:
            expanded = glob(fglob)
            if not expanded:
                # A glob without matches is kept as a literal expression.
                file_paths += [Path(fglob)]
            else:
                file_paths += [Path(x) for x in expanded]

    for i, file_path in enumerate(file_paths):
        log.info("[%4d/%d] %s" % (i, len(file_paths), file_path))

        # match regex
        m = pat.match(str(file_path))
        if not m:
            # With skip set, unmatched expressions are passed over instead of aborting.
            if skip:
                continue
            log.error("File '%s' not matched by regex: '%s'" % (file_path, regex))
            sys.exit(1)
        group_dict = m.groupdict()

        # derive the file ID from the file name if --file-id was not explicitly set
        file_id_ = file_id or safe_filename(str(file_path))

        # set up file info
        file_dict = {'url': url, 'mimetype': mimetype, 'ID': file_id_, 'pageId': page_id, 'fileGrp': file_grp}

        # guess mime type
        if not file_dict['mimetype']:
            try:
                file_dict['mimetype'] = EXT_TO_MIME[file_path.suffix]
            except KeyError:
                log.error("Cannot guess mimetype from extension '%s' for '%s'. Set --mimetype explicitly" % (file_path.suffix, file_path))

        # Flag to track whether 'url' should be 'src'
        url_is_src = False

        # expand templates
        for param_name in file_dict:
            if not file_dict[param_name]:
                # Only 'url' may stay unset; it then falls back to the source path.
                if param_name == 'url':
                    url_is_src = True
                    continue
                raise ValueError(f"OcrdFile attribute '{param_name}' unset ({file_dict})")
            for group_name in group_dict:
                file_dict[param_name] = file_dict[param_name].replace('{{ %s }}' % group_name, group_dict[group_name])

        # Where to copy from
        if src_path_option:
            src_path = src_path_option
            for group_name in group_dict:
                src_path = src_path.replace('{{ %s }}' % group_name, group_dict[group_name])
            srcpath = Path(src_path)
        else:
            srcpath = file_path

        # copy files if src != url
        if url_is_src:
            file_dict['url'] = str(srcpath)
        else:
            destpath = Path(workspace.directory, file_dict['url'])
            if srcpath != destpath and not destpath.exists():
                log.info("cp '%s' '%s'", srcpath, destpath)
                if not dry_run:
                    # The url may name several directory levels that do not exist yet.
                    destpath.parent.mkdir(parents=True, exist_ok=True)
                    destpath.write_bytes(srcpath.read_bytes())

        # Add to workspace (or not)
        fileGrp = file_dict.pop('fileGrp')
        if dry_run:
            log.info('workspace.add_file(%s)' % file_dict)
        else:
            workspace.add_file(fileGrp, ignore=ignore, force=force, **file_dict)

    # save changes to disk
    workspace.save_mets()