def workspace_cli_bulk_add(ctx, regex, mimetype, page_id, file_id, url, file_grp, dry_run, file_glob, ignore, force, skip):
    r"""
    Add files in bulk to an OCR-D workspace.

    FILE_GLOB can either be a shell glob expression or a list of files.

    --regex is applied to the absolute path of every file in FILE_GLOB and can
    define named groups that can be used in --page-id, --file-id, --mimetype, --url and
    --file-grp by referencing the named group 'grp' in the regex as '{{ grp }}'.

    \b
    Example:
        ocrd workspace bulk-add \\
                --regex '^.*/(?P<fileGrp>[^/]+)/page_(?P<pageid>.*)\.(?P<ext>[^\.]*)$' \\
                --file-id 'FILE_{{ fileGrp }}_{{ pageid }}' \\
                --page-id 'PHYS_{{ pageid }}' \\
                --file-grp "{{ fileGrp }}" \\
                --url '{{ fileGrp }}/FILE_{{ pageid }}.{{ ext }}' \\
                path/to/files/*/*.*

    """
    # NOTE(review): the docstring above reads as end-user CLI help, so developer
    # notes are kept in comments. It is a raw string, i.e. '\b' and '\\' are
    # literal backslash sequences here -- confirm that this is what the help
    # renderer expects.
    #
    # Parameters:
    #   ctx        -- supplies resolver, directory, mets_url and automatic_backup
    #                 for constructing the Workspace
    #   regex      -- pattern matched (re.match) against each resolved file path;
    #                 its named groups feed the '{{ name }}' placeholders
    #   mimetype, page_id, file_id, url, file_grp
    #              -- templates expanded per file; an unset mimetype is looked
    #                 up in EXT_TO_MIME by file suffix
    #   dry_run    -- log the copy / add_file actions instead of performing them
    #   file_glob  -- iterable of glob expressions
    #   ignore, force -- passed through to workspace.add_file
    #   skip       -- silently skip files the regex does not match instead of
    #                 exiting
    #
    # Exits with status 1 (after logging) on an invalid regex, on an unmatched
    # file when `skip` is false, and when the mimetype cannot be guessed.
    log = getLogger('ocrd.cli.workspace.bulk-add') # pylint: disable=redefined-outer-name
    workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=basename(ctx.mets_url), automatic_backup=ctx.automatic_backup)

    try:
        pat = re.compile(regex)
    except Exception as e:
        log.error("Invalid regex: %s" % e)
        sys.exit(1)

    file_paths = []
    for fglob in file_glob:
        file_paths += [Path(x).resolve() for x in glob(fglob)]

    for i, file_path in enumerate(file_paths):
        log.info("[%4d/%d] %s" % (i, len(file_paths), file_path))

        # match regex
        m = pat.match(str(file_path))
        if not m:
            if skip:
                continue
            log.error("File not matched by regex: '%s'" % file_path)
            sys.exit(1)
        group_dict = m.groupdict()

        # set up file info
        file_dict = {'url': url, 'mimetype': mimetype, 'ID': file_id, 'pageId': page_id, 'fileGrp': file_grp}

        # guess mime type
        if not file_dict['mimetype']:
            try:
                file_dict['mimetype'] = EXT_TO_MIME[file_path.suffix]
            except KeyError:
                log.error("Cannot guess mimetype from extension '%s' for '%s'. Set --mimetype explicitly" % (file_path.suffix, file_path))
                # Without a mimetype the template expansion below would fail
                # on None.replace(...); stop here like the other fatal errors.
                sys.exit(1)

        # expand templates
        for param_name, template in file_dict.items():
            if not template:
                # Unset values (e.g. no url) have nothing to expand; the copy
                # step below already tolerates a missing url.
                continue
            for group_name, group_value in group_dict.items():
                template = template.replace('{{ %s }}' % group_name, group_value)
            file_dict[param_name] = template

        # copy files
        if file_dict['url']:
            urlpath = Path(workspace.directory, file_dict['url'])
            if not urlpath.exists():
                log.info("cp '%s' '%s'", file_path, urlpath)
                if not dry_run:
                    # The url may contain several directory levels that do not
                    # exist yet, so create all missing parents.
                    urlpath.parent.mkdir(parents=True, exist_ok=True)
                    urlpath.write_bytes(file_path.read_bytes())

        # Add to workspace (or not)
        fileGrp = file_dict.pop('fileGrp')
        if dry_run:
            log.info('workspace.add_file(%s)' % file_dict)
        else:
            workspace.add_file(fileGrp, ignore=ignore, force=force, **file_dict)

    # save changes to disk
    workspace.save_mets()
def workspace_cli_bulk_add(ctx, regex, mimetype, page_id, file_id, url, file_grp, dry_run, file_glob, src_path_option, ignore, force, skip):
    """
    Add files in bulk to an OCR-D workspace.

    FILE_GLOB can either be a shell glob expression to match file names,
    or a list of expressions or '-', in which case expressions are read from STDIN.

    After globbing, --regex is matched against each expression resulting from FILE_GLOB, and can
    define named groups reusable in the --page-id, --file-id, --mimetype, --url, --source-path and
    --file-grp options, e.g. by referencing the group name 'grp' from the regex as '{{ grp }}'.

    If the FILE_GLOB expressions do not denote the file names themselves
    (but arbitrary strings for --regex matching), then use --source-path to set
    the actual file paths to use. (This could involve fixed strings or group references.)

    \b
    Examples:
        ocrd workspace bulk-add \\
                --regex '(?P<fileGrp>[^/]+)/page_(?P<pageid>.*)\\.[^.]+' \\
                --page-id 'PHYS_{{ pageid }}' \\
                --file-grp "{{ fileGrp }}" \\
                path/to/files/*/*.*

        \b
        echo "path/to/src/file.xml SEG/page_p0001.xml" \\
        | ocrd workspace bulk-add \\
                --regex '(?P<src>.*?) (?P<fileGrp>.+?)/page_(?P<pageid>.*)\\.(?P<ext>[^\\.]*)' \\
                --file-id 'FILE_{{ fileGrp }}_{{ pageid }}' \\
                --page-id 'PHYS_{{ pageid }}' \\
                --file-grp "{{ fileGrp }}" \\
                --url '{{ fileGrp }}/FILE_{{ pageid }}.{{ ext }}' \\
                -

        \b
        { echo PHYS_0001 BIN FILE_0001_BIN.IMG-wolf BIN/FILE_0001_BIN.IMG-wolf.png; \\
          echo PHYS_0001 BIN FILE_0001_BIN BIN/FILE_0001_BIN.xml; \\
          echo PHYS_0002 BIN FILE_0002_BIN.IMG-wolf BIN/FILE_0002_BIN.IMG-wolf.png; \\
          echo PHYS_0002 BIN FILE_0002_BIN BIN/FILE_0002_BIN.xml; \\
        } | ocrd workspace bulk-add -r '(?P<pageid>.*) (?P<filegrp>.*) (?P<fileid>.*) (?P<url>.*)' \\
          -G '{{ filegrp }}' -g '{{ pageid }}' -i '{{ fileid }}' -S '{{ url }}' -
    """
    # NOTE(review): the docstring above reads as end-user CLI help, so developer
    # notes are kept in comments. It is deliberately NOT a raw string: '\b'
    # is the backspace character (presumably the help renderer's "do not
    # rewrap" marker -- confirm) and every literal backslash is written '\\'.
    #
    # Parameters:
    #   ctx        -- supplies resolver, directory, mets_basename and
    #                 automatic_backup for constructing the Workspace
    #   regex      -- pattern matched (re.match) against each expression; its
    #                 named groups feed the '{{ name }}' placeholders
    #   mimetype, page_id, file_id, url, file_grp
    #              -- templates expanded per expression; an unset mimetype is
    #                 looked up in EXT_TO_MIME by suffix, an unset file_id is
    #                 derived via safe_filename(), an unset url becomes the
    #                 source path (and nothing is copied)
    #   dry_run    -- log the copy / add_file actions instead of performing them
    #   file_glob  -- tuple of glob expressions, or ('-',) to read one
    #                 expression per line from STDIN; a glob without matches is
    #                 kept as a literal expression
    #   src_path_option -- optional template for the actual source file path;
    #                 defaults to the expression itself
    #   ignore, force -- passed through to workspace.add_file
    #   skip       -- silently skip expressions the regex does not match
    #                 instead of exiting
    #
    # Exits with status 1 (after logging) on an invalid regex or an unmatched
    # expression when `skip` is false. Raises ValueError when any attribute
    # other than url is still unset at template expansion time.
    log = getLogger('ocrd.cli.workspace.bulk-add') # pylint: disable=redefined-outer-name
    workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup)

    try:
        pat = re.compile(regex)
    except Exception as e:
        log.error("Invalid regex: %s" % e)
        sys.exit(1)

    file_paths = []
    from_stdin = file_glob == ('-', )
    if from_stdin:
        file_paths += [Path(x.strip('\n')) for x in sys.stdin.readlines()]
    else:
        for fglob in file_glob:
            expanded = glob(fglob)
            if not expanded:
                # Not an existing path: keep the raw expression so that it can
                # still be matched by --regex (see --source-path).
                file_paths += [Path(fglob)]
            else:
                file_paths += [Path(x) for x in expanded]

    for i, file_path in enumerate(file_paths):
        log.info("[%4d/%d] %s" % (i, len(file_paths), file_path))

        # match regex
        m = pat.match(str(file_path))
        if not m:
            if skip:
                continue
            log.error("File '%s' not matched by regex: '%s'" % (file_path, regex))
            sys.exit(1)
        group_dict = m.groupdict()

        # derive --file-id from filename if --file-id is not explicitly set
        file_id_ = file_id or safe_filename(str(file_path))

        # set up file info
        file_dict = {
            'url': url,
            'mimetype': mimetype,
            'ID': file_id_,
            'pageId': page_id,
            'fileGrp': file_grp
        }

        # guess mime type
        if not file_dict['mimetype']:
            try:
                file_dict['mimetype'] = EXT_TO_MIME[file_path.suffix]
            except KeyError:
                # The still-unset mimetype triggers the ValueError below.
                log.error(
                    "Cannot guess mimetype from extension '%s' for '%s'. Set --mimetype explicitly" %
                    (file_path.suffix, file_path))

        # Flag to track whether 'url' should be 'src'
        url_is_src = False

        # expand templates
        for param_name in file_dict:
            if not file_dict[param_name]:
                if param_name == 'url':
                    url_is_src = True
                    continue
                raise ValueError(
                    f"OcrdFile attribute '{param_name}' unset ({file_dict})")
            for group_name in group_dict:
                file_dict[param_name] = file_dict[param_name].replace(
                    '{{ %s }}' % group_name, group_dict[group_name])

        # Where to copy from
        if src_path_option:
            src_path = src_path_option
            for group_name in group_dict:
                src_path = src_path.replace('{{ %s }}' % group_name, group_dict[group_name])
            srcpath = Path(src_path)
        else:
            srcpath = file_path

        # copy files if src != url
        if url_is_src:
            file_dict['url'] = str(srcpath)
        else:
            destpath = Path(workspace.directory, file_dict['url'])
            if srcpath != destpath and not destpath.exists():
                log.info("cp '%s' '%s'", srcpath, destpath)
                if not dry_run:
                    # The url may contain several directory levels that do not
                    # exist yet, so create all missing parents.
                    destpath.parent.mkdir(parents=True, exist_ok=True)
                    destpath.write_bytes(srcpath.read_bytes())

        # Add to workspace (or not)
        fileGrp = file_dict.pop('fileGrp')
        if dry_run:
            log.info('workspace.add_file(%s)' % file_dict)
        else:
            workspace.add_file(fileGrp, ignore=ignore, force=force, **file_dict)

    # save changes to disk
    workspace.save_mets()