예제 #1
0
def merge(ctx, copy_files, filegrp_mapping, file_grp, file_id, page_id,
          mimetype, mets_path):  # pylint: disable=redefined-builtin
    """
    Merge the workspace that contains ``METS_PATH`` into this workspace.

    The ``--file-id``, ``--page-id``, ``--mimetype`` and ``--file-grp`` options
    have the same semantics as in ``ocrd workspace find``
    (see ``ocrd workspace find --help`` for an explanation).
    """
    other_mets = Path(mets_path)
    # A non-empty mapping is decoded with ``loads``; an empty one is passed on as-is.
    mapping = loads(filegrp_mapping) if filegrp_mapping else filegrp_mapping
    target = Workspace(
        ctx.resolver,
        directory=ctx.directory,
        mets_basename=ctx.mets_basename,
        automatic_backup=ctx.automatic_backup,
    )
    # The other workspace is located by the directory and file name of METS_PATH.
    source = Workspace(
        ctx.resolver,
        directory=str(other_mets.parent),
        mets_basename=str(other_mets.name),
    )
    file_filter = {
        'fileGrp': file_grp,
        'ID': file_id,
        'pageId': page_id,
        'mimetype': mimetype,
    }
    target.merge(source,
                 copy_files=copy_files,
                 fileGrp_mapping=mapping,
                 **file_filter)
    target.save_mets()
예제 #2
0
def workspace_add_file(ctx, file_grp, file_id, mimetype, page_id, force,
                       local_filename):
    """
    Add a file LOCAL_FILENAME to METS in a workspace.

    When LOCAL_FILENAME does not start with the workspace directory, it is
    first fetched through the resolver into the ``file_grp`` subdirectory.
    """
    ws = Workspace(
        ctx.resolver,
        directory=ctx.directory,
        mets_basename=ctx.mets_basename,
        automatic_backup=ctx.automatic_backup,
    )

    already_in_workspace = local_filename.startswith(ctx.directory)
    if not already_in_workspace:
        log.debug("File '%s' is not in workspace, copying", local_filename)
        source_url = "file://" + local_filename
        local_filename = ctx.resolver.download_to_directory(
            ctx.directory, source_url, subdir=file_grp)

    ws.mets.add_file(
        fileGrp=file_grp,
        ID=file_id,
        mimetype=mimetype,
        url="file://" + local_filename,
        pageId=page_id,
        force=force,
        local_filename=local_filename,
    )
    ws.save_mets()
예제 #3
0
파일: workspace.py 프로젝트: cclauss/core
def prune_files(ctx, file_grp, mimetype, page_id, file_id):
    """
    Removes mets:files that point to non-existing local files

    (If any ``FILTER`` starts with ``//``, then its remainder
     will be interpreted as a regular expression.)
    """
    workspace = Workspace(ctx.resolver,
                          directory=ctx.directory,
                          mets_basename=basename(ctx.mets_url),
                          automatic_backup=ctx.automatic_backup)
    # Existence checks below use local filenames as given, so run them from
    # inside the workspace directory.
    with pushd_popd(workspace.directory):
        # Snapshot the matches before removing entries, in case find_files
        # yields lazily from the structure that remove_file modifies.
        candidates = list(workspace.mets.find_files(
            ID=file_id,
            fileGrp=file_grp,
            mimetype=mimetype,
            pageId=page_id,
        ))
        for f in candidates:
            try:
                if not f.local_filename or not exists(f.local_filename):
                    workspace.mets.remove_file(f.ID)
            except Exception as e:
                # '%s', not '%f': the latter would try to format the file
                # object as a float and break the error report itself.
                ctx.log.exception("Error removing %s: %s", f, e)
                # Bare raise keeps the original traceback intact.
                raise
        workspace.save_mets()
예제 #4
0
def set_id(ctx, id):
    """Set the unique identifier of the workspace METS to ``id`` and save it."""
    workspace_options = {
        'directory': ctx.directory,
        'mets_basename': ctx.mets_basename,
        'automatic_backup': ctx.automatic_backup,
    }
    ws = Workspace(ctx.resolver, **workspace_options)
    ws.mets.unique_identifier = id
    ws.save_mets()
예제 #5
0
def do_the_update(bagdir, non_local_urls=False):
    """
    Give the files referenced by ``<bagdir>/data/mets.xml`` the extension that
    ``MIME_TO_EXT`` assigns to their mimetype.

    For every file in the METS the URL is rewritten to carry the expected
    extension and, if the file exists locally, it is renamed accordingly.
    Afterwards the METS is saved and the bag checksums are updated.

    :param bagdir: directory of the bag; must contain ``data/mets.xml``
    :param non_local_urls: if true, also rewrite URLs of files that do not
        exist locally (only the METS entry changes then)
    """
    directory = Path(bagdir, 'data')
    if not Path(directory, 'mets.xml').exists():
        LOG.error("Something's wrong with OCRD-ZIP at %s, no data/mets.xml!",
                  bagdir)
        return
    workspace = Workspace(resolver, directory=str(directory))
    with pushd_popd(directory):
        for f in workspace.mets.find_files():
            fp = Path(f.url)
            if not fp.exists() and not non_local_urls:
                LOG.debug("Skipping non-local file: %s", fp)
                continue
            ext = MIME_TO_EXT.get(f.mimetype)
            if not ext:
                LOG.error(
                    "No rule to translate '%s' to an extension. Skipping %s",
                    f.mimetype, fp)
                continue
            if fp.suffix == ext:
                LOG.debug("Already has the right extension, %s", fp.name)
                continue
            # Path stem the new extension gets appended to; by default the
            # full current path (the extension is simply added).
            rename_base = fp
            # fp.suffix differs from ext here (equal suffixes were skipped above).
            if fp.suffix and fp.suffix in EXT_TO_MIME:
                LOG.warning("Has the WRONG extension, is '%s' should be '%s'",
                            fp.suffix, ext)
                f.url = f.url[:-len(fp.suffix)]
                # Drop the wrong suffix from the on-disk name as well, so the
                # renamed file matches the rewritten URL.
                rename_base = fp.with_suffix('')
            LOG.info('Renaming %s{,%s}', fp, ext)
            f.url = "%s%s" % (f.url, ext)
            if fp.exists():
                fp.rename('%s%s' % (rename_base, ext))
        workspace.save_mets()
        LOG.debug('Running bagit update script')
        # NOTE(review): this runs while the working directory is `directory`;
        # presumably `bagdir` is absolute — confirm against callers.
        update_checksums(bagdir)
    LOG.info("FINISHED: %s", bagdir)
예제 #6
0
def workspace_find(ctx, file_grp, mimetype, page_id, file_id, output_field, download):
    """
    Find files and print the requested fields, one tab-separated line per file.

    (If any ``FILTER`` starts with ``//``, then its remainder
     will be interpreted as a regular expression.)
    """
    workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=basename(ctx.mets_url))
    rows = []
    mets_changed = False
    matches = workspace.mets.find_files(
        ID=file_id,
        fileGrp=file_grp,
        mimetype=mimetype,
        pageId=page_id,
    )
    for ocrd_file in matches:
        if download and not ocrd_file.local_filename:
            workspace.download_file(ocrd_file)
            mets_changed = True
        row = []
        for field in output_field:
            if field == 'pageId':
                # Placeholder: the file ID, replaced by the page further down.
                row.append(ocrd_file.ID)
            else:
                row.append(getattr(ocrd_file, field) or '')
        rows.append(row)
    # Only write the METS back if a download actually happened.
    if mets_changed:
        workspace.save_mets()
    if 'pageId' in output_field:
        column = output_field.index('pageId')
        file_ids = [row[column] for row in rows]
        pages = workspace.mets.get_physical_pages(for_fileIds=file_ids)
        for row, page in zip(rows, pages):
            row[column] = page or ''
    for row in rows:
        print('\t'.join(row))
예제 #7
0
def set_id(ctx, id):  # pylint: disable=redefined-builtin
    """Assign ``id`` as the unique identifier of the workspace METS, then save."""
    ws = Workspace(
        ctx.resolver,
        directory=ctx.directory,
        mets_basename=ctx.mets_basename,
        automatic_backup=ctx.automatic_backup,
    )
    mets = ws.mets
    mets.unique_identifier = id
    ws.save_mets()
예제 #8
0
def rename_group(ctx, old, new):
    """
    Rename fileGrp (USE attribute ``OLD`` to ``NEW``).
    """
    mets_name = basename(ctx.mets_url)
    ws = Workspace(
        ctx.resolver,
        directory=ctx.directory,
        mets_basename=mets_name,
    )
    ws.rename_file_group(old, new)
    ws.save_mets()
예제 #9
0
def remove_group(ctx, group, recursive, force):
    """Remove each file group named in ``group`` from the workspace, then save the METS."""
    ws = Workspace(
        ctx.resolver,
        directory=ctx.directory,
        mets_basename=ctx.mets_basename,
    )
    for group_name in group:
        ws.remove_file_group(group_name, recursive, force)
    ws.save_mets()
예제 #10
0
def workspace_add_file(ctx, file_grp, file_id, mimetype, page_id, ignore,
                       check_file_exists, force, fname):
    """
    Add a file or http(s) URL FNAME to METS in a workspace.
    If FNAME is not an http(s) URL and is not a workspace-local existing file, try to copy to workspace.
    """
    workspace = Workspace(
        ctx.resolver,
        directory=ctx.directory,
        mets_basename=ctx.mets_basename,
        automatic_backup=ctx.automatic_backup,
    )
    log = getLogger('ocrd.cli.workspace.add')

    if not mimetype:
        suffix = Path(fname).suffix
        try:
            mimetype = EXT_TO_MIME[suffix]
        except KeyError:
            # Only reported: processing continues with the mimetype still unset.
            log.error(
                "Cannot guess mimetype from extension '%s' for '%s'. Set --mimetype explicitly"
                % (suffix, fname))
        else:
            log.info("Guessed mimetype to be %s" % mimetype)

    kwargs = dict(
        fileGrp=file_grp,
        ID=file_id,
        mimetype=mimetype,
        pageId=page_id,
        force=force,
        ignore=ignore,
    )
    log.debug("Adding '%s' (%s)", fname, kwargs)

    if not fname.startswith(('http://', 'https://')):
        if not fname.startswith(ctx.directory):
            inside_workspace = join(ctx.directory, fname)
            if not isabs(fname) and exists(inside_workspace):
                # Relative name that already exists below the workspace.
                fname = inside_workspace
            else:
                log.debug("File '%s' is not in workspace, copying", fname)
                try:
                    fname = ctx.resolver.download_to_directory(
                        ctx.directory, fname, subdir=file_grp)
                except FileNotFoundError:
                    # A missing source is tolerated unless existence checks were requested.
                    if check_file_exists:
                        log.error("File '%s' does not exist, halt execution!" %
                                  fname)
                        sys.exit(1)
        if check_file_exists and not exists(fname):
            log.error("File '%s' does not exist, halt execution!" % fname)
            sys.exit(1)
        # Paths under the workspace directory are stored relative to it.
        if fname.startswith(ctx.directory):
            fname = relpath(fname, ctx.directory)
        kwargs['local_filename'] = fname

    kwargs['url'] = fname
    if not page_id:
        log.warning(
            "You did not provide '--page-id/-g', so the file you added is not linked to a specific page."
        )
    workspace.mets.add_file(**kwargs)
    workspace.save_mets()
예제 #11
0
def workspace_add_file(ctx, file_grp, file_id, mimetype, group_id,
                       local_filename):
    """Register LOCAL_FILENAME in the workspace METS and save the METS."""
    ws = Workspace(ctx.resolver, directory=ctx.directory)
    file_attributes = {
        'file_grp': file_grp,
        'file_id': file_id,
        'mimetype': mimetype,
        'group_id': group_id,
        'local_filename': local_filename,
    }
    ws.mets.add_file(**file_attributes)
    ws.save_mets()
예제 #12
0
def workspace_remove_file(ctx, id, force):  # pylint: disable=redefined-builtin
    """
    Delete files from mets.xml, one for every entry of ``id``.
    """
    ws = Workspace(
        ctx.resolver,
        directory=ctx.directory,
        mets_basename=ctx.mets_basename,
        automatic_backup=ctx.automatic_backup,
    )
    for file_id in id:
        ws.remove_file(file_id, force=force)
    ws.save_mets()
예제 #13
0
def remove_group(ctx, group, recursive, force, keep_files):
    """
    Delete fileGrps (given by their USE attribute ``GROUP``).

    (If any ``GROUP`` starts with ``//``, then its remainder
     will be interpreted as a regular expression.)
    """
    ws = Workspace(
        ctx.resolver,
        directory=ctx.directory,
        mets_basename=basename(ctx.mets_url),
    )
    removal_options = {
        'recursive': recursive,
        'force': force,
        'keep_files': keep_files,
    }
    for group_name in group:
        ws.remove_file_group(group_name, **removal_options)
    ws.save_mets()
예제 #14
0
def set_id(ctx, id):   # pylint: disable=redefined-builtin
    """
    Set METS ID.

    If one of the supported identifier mechanisms is used, will set this identifier.

    Otherwise will create a new <mods:identifier type="purl">{{ ID }}</mods:identifier>.
    """
    ws = Workspace(
        ctx.resolver,
        directory=ctx.directory,
        mets_basename=basename(ctx.mets_url),
        automatic_backup=ctx.automatic_backup,
    )
    ws.mets.unique_identifier = id
    ws.save_mets()
예제 #15
0
def workspace_remove_file(ctx, id, force, keep_file):  # pylint: disable=redefined-builtin
    """
    Delete files (given by their ID attribute ``ID``).

    (If any ``ID`` starts with ``//``, then its remainder
     will be interpreted as a regular expression.)
    """
    ws = Workspace(
        ctx.resolver,
        directory=ctx.directory,
        mets_basename=basename(ctx.mets_url),
        automatic_backup=ctx.automatic_backup,
    )
    for file_id in id:
        ws.remove_file(file_id, force=force, keep_file=keep_file)
    ws.save_mets()
예제 #16
0
def prune_files(ctx):
    """
    Remove METS file entries whose local file is unset or does not exist,
    then save the METS.
    """
    workspace = Workspace(ctx.resolver,
                          directory=ctx.directory,
                          mets_basename=ctx.mets_basename)
    # Existence checks below use local filenames as given, so run them from
    # inside the workspace directory.
    with pushd_popd(workspace.directory):
        # Snapshot the matches before removing entries, in case find_files
        # yields lazily from the structure that remove_file modifies.
        for f in list(workspace.mets.find_files()):
            try:
                if not f.local_filename or not exists(f.local_filename):
                    workspace.mets.remove_file(f.ID)
            except Exception as e:
                # '%s', not '%f': the latter would try to format the file
                # object as a float and break the error report itself.
                log.exception("Error removing %s: %s", f, e)
                # Bare raise keeps the original traceback intact.
                raise
        workspace.save_mets()
예제 #17
0
파일: workspace.py 프로젝트: cclauss/core
def workspace_add_file(ctx, file_grp, file_id, mimetype, page_id, ignore,
                       check_file_exists, force, fname):
    """
    Add a file or http(s) URL FNAME to METS in a workspace.
    If FNAME is not an http(s) URL and is not a workspace-local existing file, try to copy to workspace.
    """
    workspace = Workspace(
        ctx.resolver,
        directory=ctx.directory,
        mets_basename=basename(ctx.mets_url),
        automatic_backup=ctx.automatic_backup,
    )

    kwargs = dict(
        fileGrp=file_grp,
        ID=file_id,
        mimetype=mimetype,
        pageId=page_id,
        force=force,
        ignore=ignore,
    )
    log = getLogger('ocrd.cli.workspace.add')
    log.debug("Adding '%s' (%s)", fname, kwargs)

    is_http_url = fname.startswith('http://') or fname.startswith('https://')
    if not is_http_url:
        workspace_dir = ctx.directory
        if not fname.startswith(workspace_dir):
            if not isabs(fname) and exists(join(workspace_dir, fname)):
                # Relative name that already exists below the workspace.
                fname = join(workspace_dir, fname)
            else:
                log.debug("File '%s' is not in workspace, copying", fname)
                try:
                    fname = ctx.resolver.download_to_directory(
                        workspace_dir, fname, subdir=file_grp)
                except FileNotFoundError:
                    # A missing source is tolerated unless existence checks were requested.
                    if check_file_exists:
                        log.error("File '%s' does not exist, halt execution!" %
                                  fname)
                        sys.exit(1)
        if check_file_exists and not exists(fname):
            log.error("File '%s' does not exist, halt execution!" % fname)
            sys.exit(1)
        # Paths under the workspace directory are stored relative to it.
        if fname.startswith(workspace_dir):
            fname = relpath(fname, workspace_dir)
        kwargs['local_filename'] = fname

    kwargs['url'] = fname
    workspace.mets.add_file(**kwargs)
    workspace.save_mets()
예제 #18
0
def workspace_find(ctx, file_grp, local_only, mimetype, group_id, file_id,
                   output_field, download):
    """
    Find files and print the requested fields, one tab-separated line per file.
    """
    ws = Workspace(ctx.resolver, directory=ctx.directory)
    search = {
        'ID': file_id,
        'fileGrp': file_grp,
        'local_only': local_only,
        'mimetype': mimetype,
        'groupId': group_id,
    }
    for found in ws.mets.find_files(**search):
        if download:
            # Each download goes into the file's fileGrp subdirectory and is
            # followed immediately by a METS save.
            ws.download_file(found, subdir=found.fileGrp)
            ws.save_mets()
        values = []
        for field in output_field:
            values.append(getattr(found, field) or '')
        print('\t'.join(values))
예제 #19
0
def workspace_find(ctx, file_grp, mimetype, page_id, file_id, output_field,
                   download):
    """
    Find files and print the requested fields, one tab-separated line per file.
    """
    ws = Workspace(
        ctx.resolver,
        directory=ctx.directory,
        mets_basename=ctx.mets_basename,
    )
    matches = ws.mets.find_files(
        ID=file_id,
        fileGrp=file_grp,
        mimetype=mimetype,
        pageId=page_id,
    )
    for found in matches:
        if download:
            # Each download is followed immediately by a METS save.
            ws.download_file(found)
            ws.save_mets()
        values = [getattr(found, name) or '' for name in output_field]
        line = '\t'.join(values)
        print(line)
예제 #20
0
def workspace_cli_bulk_add(ctx, regex, mimetype, page_id, file_id, url, file_grp, dry_run, file_glob, ignore, force, skip):
    r"""
    Add files in bulk to an OCR-D workspace.

    FILE_GLOB can either be a shell glob expression or a list of files.

    --regex is applied to the absolute path of every file in FILE_GLOB and can
    define named groups that can be used in --page-id, --file-id, --mimetype, --url and
    --file-grp by referencing the named group 'grp' in the regex as '{{ grp }}'.

    \b
    Example:
        ocrd workspace bulk-add \\
                --regex '^.*/(?P<fileGrp>[^/]+)/page_(?P<pageid>.*)\.(?P<ext>[^\.]*)$' \\
                --file-id 'FILE_{{ fileGrp }}_{{ pageid }}' \\
                --page-id 'PHYS_{{ pageid }}' \\
                --file-grp "{{ fileGrp }}" \\
                --url '{{ fileGrp }}/FILE_{{ pageid }}.{{ ext }}' \\
                path/to/files/*/*.*

    """
    log = getLogger('ocrd.cli.workspace.bulk-add') # pylint: disable=redefined-outer-name
    workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=basename(ctx.mets_url), automatic_backup=ctx.automatic_backup)

    try:
        pat = re.compile(regex)
    except Exception as e:
        log.error("Invalid regex: %s" % e)
        sys.exit(1)

    # Expand every glob and make the results absolute, since the regex is
    # matched against the resolved path.
    file_paths = []
    for fglob in file_glob:
        file_paths += [Path(x).resolve() for x in glob(fglob)]

    # Count from 1 so the progress line runs from 1/N to N/N.
    for i, file_path in enumerate(file_paths, start=1):
        log.info("[%4d/%d] %s" % (i, len(file_paths), file_path))

        # match regex
        m = pat.match(str(file_path))
        if not m:
            if skip:
                continue
            log.error("File not matched by regex: '%s'" % file_path)
            sys.exit(1)
        group_dict = m.groupdict()

        # set up file info
        file_dict = {'url': url, 'mimetype': mimetype, 'ID': file_id, 'pageId': page_id, 'fileGrp': file_grp}

        # guess mime type
        if not file_dict['mimetype']:
            try:
                file_dict['mimetype'] = EXT_TO_MIME[file_path.suffix]
            except KeyError:
                log.error("Cannot guess mimetype from extension '%s' for '%s'. Set --mimetype explicitly" % (file_path.suffix, file_path))

        # expand templates
        for param_name in list(file_dict):
            template = file_dict[param_name]
            # Unset options (e.g. a mimetype that could not be guessed) have
            # nothing to expand; calling .replace on them would crash.
            if not template:
                continue
            for group_name, group_value in group_dict.items():
                template = template.replace('{{ %s }}' % group_name, group_value)
            file_dict[param_name] = template

        # copy files
        if file_dict['url']:
            urlpath = Path(workspace.directory, file_dict['url'])
            if not urlpath.exists():
                log.info("cp '%s' '%s'", file_path, urlpath)
                if not dry_run:
                    # The url template may name nested directories, so create
                    # all missing parents.
                    urlpath.parent.mkdir(parents=True, exist_ok=True)
                    urlpath.write_bytes(file_path.read_bytes())

        # Add to workspace (or not)
        fileGrp = file_dict.pop('fileGrp')
        if dry_run:
            log.info('workspace.add_file(%s)' % file_dict)
        else:
            workspace.add_file(fileGrp, ignore=ignore, force=force, **file_dict)

    # save changes to disk
    workspace.save_mets()
예제 #21
0
def set_id(ctx, ID):
    """Set the unique identifier of the workspace METS to ``ID`` and save it."""
    ws = Workspace(
        ctx.resolver,
        directory=ctx.directory,
    )
    mets = ws.mets
    mets.unique_identifier = ID
    ws.save_mets()
예제 #22
0
def workspace_cli_bulk_add(ctx, regex, mimetype, page_id, file_id, url,
                           file_grp, dry_run, file_glob, src_path_option,
                           ignore, force, skip):
    """
    Add files in bulk to an OCR-D workspace.

    FILE_GLOB can either be a shell glob expression to match file names,
    or a list of expressions or '-', in which case expressions are read from STDIN.

    After globbing, --regex is matched against each expression resulting from FILE_GLOB, and can
    define named groups reusable in the --page-id, --file-id, --mimetype, --url, --source-path and
    --file-grp options, e.g. by referencing the group name 'grp' from the regex as '{{ grp }}'.

    If the FILE_GLOB expressions do not denote the file names themselves
    (but arbitrary strings for --regex matching), then use --source-path to set
    the actual file paths to use. (This could involve fixed strings or group references.)

    \b
    Examples:
        ocrd workspace bulk-add \\
                --regex '(?P<fileGrp>[^/]+)/page_(?P<pageid>.*)\\.[^.]+' \\
                --page-id 'PHYS_{{ pageid }}' \\
                --file-grp "{{ fileGrp }}" \\
                path/to/files/*/*.*
        \b
        echo "path/to/src/file.xml SEG/page_p0001.xml" \\
        | ocrd workspace bulk-add \\
                --regex '(?P<src>.*?) (?P<fileGrp>.+?)/page_(?P<pageid>.*)\\.(?P<ext>[^\\.]*)' \\
                --file-id 'FILE_{{ fileGrp }}_{{ pageid }}' \\
                --page-id 'PHYS_{{ pageid }}' \\
                --file-grp "{{ fileGrp }}" \\
                --url '{{ fileGrp }}/FILE_{{ pageid }}.{{ ext }}' \\
                -

        \b
        { echo PHYS_0001 BIN FILE_0001_BIN.IMG-wolf BIN/FILE_0001_BIN.IMG-wolf.png; \\
          echo PHYS_0001 BIN FILE_0001_BIN BIN/FILE_0001_BIN.xml; \\
          echo PHYS_0002 BIN FILE_0002_BIN.IMG-wolf BIN/FILE_0002_BIN.IMG-wolf.png; \\
          echo PHYS_0002 BIN FILE_0002_BIN BIN/FILE_0002_BIN.xml; \\
        } | ocrd workspace bulk-add -r '(?P<pageid>.*) (?P<filegrp>.*) (?P<fileid>.*) (?P<url>.*)' \\
          -G '{{ filegrp }}' -g '{{ pageid }}' -i '{{ fileid }}' -S '{{ url }}' -
    """
    log = getLogger('ocrd.cli.workspace.bulk-add')  # pylint: disable=redefined-outer-name
    workspace = Workspace(ctx.resolver,
                          directory=ctx.directory,
                          mets_basename=ctx.mets_basename,
                          automatic_backup=ctx.automatic_backup)

    try:
        pat = re.compile(regex)
    except Exception as e:
        log.error("Invalid regex: %s" % e)
        sys.exit(1)

    file_paths = []
    from_stdin = file_glob == ('-', )
    if from_stdin:
        file_paths += [Path(x.strip('\n')) for x in sys.stdin.readlines()]
    else:
        for fglob in file_glob:
            expanded = glob(fglob)
            if not expanded:
                # Keep expressions that match no file: they may be arbitrary
                # strings meant only for --regex matching.
                file_paths += [Path(fglob)]
            else:
                file_paths += [Path(x) for x in expanded]

    # Count from 1 so the progress line runs from 1/N to N/N.
    for i, file_path in enumerate(file_paths, start=1):
        log.info("[%4d/%d] %s" % (i, len(file_paths), file_path))

        # match regex
        m = pat.match(str(file_path))
        if not m:
            if skip:
                continue
            log.error("File '%s' not matched by regex: '%s'" %
                      (file_path, regex))
            sys.exit(1)
        group_dict = m.groupdict()

        # derive --file-id from filename if --file-id not explicitly set
        file_id_ = file_id or safe_filename(str(file_path))

        # set up file info
        file_dict = {
            'url': url,
            'mimetype': mimetype,
            'ID': file_id_,
            'pageId': page_id,
            'fileGrp': file_grp
        }

        # guess mime type
        if not file_dict['mimetype']:
            try:
                file_dict['mimetype'] = EXT_TO_MIME[file_path.suffix]
            except KeyError:
                log.error(
                    "Cannot guess mimetype from extension '%s' for '%s'. Set --mimetype explicitly"
                    % (file_path.suffix, file_path))

        # Flag to track whether 'url' should be 'src'
        url_is_src = False

        # expand templates
        for param_name in file_dict:
            if not file_dict[param_name]:
                # Only 'url' may stay unset (it then falls back to the source
                # path); any other unset attribute is an error.
                if param_name == 'url':
                    url_is_src = True
                    continue
                raise ValueError(
                    f"OcrdFile attribute '{param_name}' unset ({file_dict})")
            for group_name in group_dict:
                file_dict[param_name] = file_dict[param_name].replace(
                    '{{ %s }}' % group_name, group_dict[group_name])

        # Where to copy from
        if src_path_option:
            src_path = src_path_option
            for group_name in group_dict:
                src_path = src_path.replace('{{ %s }}' % group_name,
                                            group_dict[group_name])
            srcpath = Path(src_path)
        else:
            srcpath = file_path

        # copy files if src != url
        if url_is_src:
            file_dict['url'] = str(srcpath)
        else:
            destpath = Path(workspace.directory, file_dict['url'])
            if srcpath != destpath and not destpath.exists():
                log.info("cp '%s' '%s'", srcpath, destpath)
                if not dry_run:
                    # The url template may name nested directories, so create
                    # all missing parents.
                    destpath.parent.mkdir(parents=True, exist_ok=True)
                    destpath.write_bytes(srcpath.read_bytes())

        # Add to workspace (or not)
        fileGrp = file_dict.pop('fileGrp')
        if dry_run:
            log.info('workspace.add_file(%s)' % file_dict)
        else:
            workspace.add_file(fileGrp,
                               ignore=ignore,
                               force=force,
                               **file_dict)

    # save changes to disk
    workspace.save_mets()