Example #1
class Clone(Interface):
    """Obtain a dataset (copy) from a URL or local directory

    The purpose of this command is to obtain a new clone (copy) of a dataset
    and place it into a not-yet-existing or empty directory. As such `clone`
    provides a strict subset of the functionality offered by `install`. Only a
    single dataset can be obtained, and immediate recursive installation of
    subdatasets is not supported. However, once a (super)dataset is installed
    via `clone`, any content, including subdatasets can be obtained by a
    subsequent `get` command.

    Primary differences over a direct `git clone` call are 1) the automatic
    initialization of a dataset annex (pure Git repositories are equally
    supported); 2) automatic registration of the newly obtained dataset as a
    subdataset (submodule), if a parent dataset is specified; 3) support
    for additional resource identifiers (DataLad resource identifiers as used
    on datasets.datalad.org, and RIA store URLs as used for store.datalad.org
    - optionally in specific versions as identified by a branch or a tag; see
    examples); and 4) automatic, configurable generation of alternative access
    URLs for common cases (such as appending '.git' to the URL in case
    accessing the base URL failed).

    || PYTHON >>By default, the command returns a single Dataset instance for
    an installed dataset, regardless of whether it was newly installed ('ok'
    result), or found already installed from the specified source ('notneeded'
    result).<< PYTHON ||

    .. seealso::

      :ref:`handbook:3-001`
        More information on Remote Indexed Archive (RIA) stores
    """
    # by default ignore everything but install results
    # i.e. no "add to super dataset"
    result_filter = EnsureKeyChoice('action', ('install',))
    # very frequently this command will yield exactly one installed dataset
    # spare people the pain of going through a list by default
    return_type = 'item-or-list'
    # as discussed in #1409 and #1470, we want to return dataset instances
    # matching what is actually available after command completion (and
    # None for any failed dataset installation)
    result_xfm = 'successdatasets-or-none'

    _examples_ = [
        dict(text="Install a dataset from Github into the current directory",
             code_py="clone("
             "source='https://github.com/datalad-datasets/longnow"
             "-podcasts.git')",
             code_cmd="datalad clone "
             "https://github.com/datalad-datasets/longnow-podcasts.git"),
        dict(text="Install a dataset into a specific directory",
             code_py="""\
             clone(source='https://github.com/datalad-datasets/longnow-podcasts.git',
                   path='myfavpodcasts')""",
             code_cmd="""\
             datalad clone https://github.com/datalad-datasets/longnow-podcasts.git \\
             myfavpodcasts"""),
        dict(text="Install a dataset as a subdataset into the current dataset",
             code_py="""\
             clone(dataset='.',
                   source='https://github.com/datalad-datasets/longnow-podcasts.git')""",
             code_cmd="datalad clone -d . "
             "https://github.com/datalad-datasets/longnow-podcasts.git"),
        dict(text="Install the main superdataset from datasets.datalad.org",
             code_py="clone(source='///')",
             code_cmd="datalad clone ///"),
        dict(text="Install a dataset identified by a literal alias from store.datalad.org",
             code_py="clone(source='ria+http://store.datalad.org#~hcp-openaccess')",
             code_cmd="datalad clone ria+http://store.datalad.org#~hcp-openaccess"),
        dict(
            text="Install a dataset in a specific version as identified by a"
                 "branch or tag name from store.datalad.org",
            code_py="clone(source='ria+http://store.datalad.org#76b6ca66-36b1-11ea-a2e6-f0d5bf7b5561@myidentifier')",
            code_cmd="datalad clone ria+http://store.datalad.org#76b6ca66-36b1-11ea-a2e6-f0d5bf7b5561@myidentifier"),
        dict(
            text="Install a dataset with group-write access permissions",
            code_py=\
            "clone(source='http://example.com/dataset', reckless='shared-group')",
            code_cmd=\
            "datalad clone http://example.com/dataset --reckless shared-group"),
    ]

    _params_ = dict(
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc="""(parent) dataset to clone into. If given, the newly cloned
            dataset is registered as a subdataset of the parent. Also, if given,
            relative paths are interpreted as being relative to the parent
            dataset, and not relative to the working directory.""",
            constraints=EnsureDataset() | EnsureNone()),
        source=Parameter(
            args=("source",),
            metavar='SOURCE',
            doc="""URL, DataLad resource identifier, local path or instance of
            dataset to be cloned""",
            constraints=EnsureStr() | EnsureNone()),
        path=Parameter(
            args=("path",),
            metavar='PATH',
            nargs="?",
            doc="""path to clone into.  If no `path` is provided a
            destination path will be derived from a source URL
            similar to :command:`git clone`"""),
        description=location_description,
        reckless=reckless_opt,
    )

    @staticmethod
    @datasetmethod(name='clone')
    @eval_results
    def __call__(
            source,
            path=None,
            dataset=None,
            description=None,
            reckless=None):
        # did we explicitly get a dataset to install into?
        # if we got a dataset, path will be resolved against it.
        # Otherwise path will be resolved first.
        ds = require_dataset(
            dataset, check_installed=True, purpose='cloning') \
            if dataset is not None else dataset
        refds_path = ds.path if ds else None

        # legacy compatibility
        if reckless is True:
            # so that we can forget about how things used to be
            reckless = 'auto'

        if isinstance(source, Dataset):
            source = source.path

        if source == path:
            # even if they turn out to be identical after resolving symlinks
            # and more sophisticated witchcraft, it would still happily say
            # "it appears to be already installed", so we just catch an
            # obviously pointless input combination
            raise ValueError(
                "clone `source` and destination `path` are identical [{}]. "
                "If you are trying to add a subdataset simply use `save`".format(
                    path))

        if path is not None:
            path = resolve_path(path, dataset)

        # derive target from source:
        if path is None:
            # we got nothing but a source. do something similar to git clone
            # and derive the path from the source and continue
            # since this is a relative `path`, resolve it:
            # we are not going to reuse the decoded URL, as this is done for
            # all source candidates in clone_dataset(); we just use it to determine
            # a destination path here in order to perform a bunch of additional
            # checks that shall not pollute the helper function
            source_ = decode_source_spec(
                source, cfg=None if ds is None else ds.config)
            path = resolve_path(source_['default_destpath'], dataset)
            lgr.debug("Determined clone target path from source")
        lgr.debug("Resolved clone target path to: '%s'", path)

        # there is no other way -- my intoxicated brain tells me
        assert(path is not None)

        result_props = dict(
            action='install',
            logger=lgr,
            refds=refds_path,
            source_url=source)

        try:
            # this will implicitly cause pathlib to run a bunch of checks
            # whether the present path makes any sense on the platform
            # we are running on -- we don't care if the path actually
            # exists at this point, but we want to abort early if the path
            # spec is determined to be useless
            path.exists()
        except OSError as e:
            yield get_status_dict(
                status='error',
                path=path,
                message=('cannot handle target path: %s', exc_str(e)),
                **result_props)
            return

        destination_dataset = Dataset(path)
        result_props['ds'] = destination_dataset

        if ds is not None and ds.pathobj not in path.parents:
            yield get_status_dict(
                status='error',
                message=("clone target path '%s' not in specified target dataset '%s'",
                         path, ds),
                **result_props)
            return

        # perform the actual cloning operation
        yield from clone_dataset(
            [source],
            destination_dataset,
            reckless,
            description,
            result_props,
            cfg=None if ds is None else ds.config,
        )

        # TODO handle any 'version' property handling and verification using a dedicated
        # public helper

        if ds is not None:
            # we created a dataset in another dataset
            # -> make submodule
            for r in ds.save(
                    path,
                    return_type='generator',
                    result_filter=None,
                    result_xfm=None,
                    on_failure='ignore'):
                yield r
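A minimal usage sketch (not part of the original source) for the `clone` command defined above; the GitHub URL is taken from the built-in examples, everything else is an assumption:

# hedged sketch: clone a public dataset via the Python API registered above
from datalad.api import clone

# returns a single Dataset instance ('item-or-list' return type combined
# with the 'successdatasets-or-none' result transform)
ds = clone(source='https://github.com/datalad-datasets/longnow-podcasts.git')
print(ds.path)  # location of the newly cloned dataset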
Example #2
class NoAnnex(Interface):
    """Configure a dataset to never put some content into the dataset's annex

    This can be useful in mixed datasets that also contain textual data, such
    as source code, which can be efficiently and more conveniently managed
    directly in Git.

    Patterns generally look like this::

      code/*

    which would match all files in the ``code/`` directory. In order to match
    all files under ``code/``, including all its subdirectories, use a pattern
    like::

      code/**

    Note that the plugin works incrementally, hence any existing configuration
    (e.g. from a previous plugin run) is amended, not replaced.
    """
    from datalad.support.param import Parameter
    from datalad.distribution.dataset import datasetmethod
    from datalad.interface.utils import eval_results
    from datalad.distribution.dataset import EnsureDataset
    from datalad.support.constraints import EnsureNone

    _params_ = dict(
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc=""""specify the dataset to configure. If no dataset is given,
            an attempt is made to identify the dataset based on the current
            working directory.""",
            constraints=EnsureDataset() | EnsureNone()),
        pattern=Parameter(
            args=("--pattern", ),
            nargs='+',
            doc="""list of path patterns. Any content whose path is matching
            any pattern will not be annexed when added to a dataset, but
            instead will be tracked directly in Git. Path patterns have to be
            relative to the directory given by the `ref_dir` option, which
            defaults to the root of the dataset."""
        ),
        ref_dir=Parameter(
            args=("--ref-dir", ),
            doc="""Relative path (within the dataset) to the directory that is
            to be configured. All patterns are interpreted relative to this
            path, and configuration is written to a ``.gitattributes`` file in
            this directory."""),
        makedirs=Parameter(
            args=("--makedirs", ),
            action='store_true',
            doc="""If set, any missing directories will be created in order to
            be able to place a file into ``--ref-dir``."""),
    )

    @staticmethod
    @datasetmethod(name='no_annex')
    @eval_results
    def __call__(dataset, pattern, ref_dir='.', makedirs=False):
        # could be extended to accept actual largefile expressions
        from os.path import join as opj
        from os.path import isabs
        from os.path import exists
        from os import makedirs as makedirsfx
        from datalad.distribution.dataset import require_dataset
        from datalad.support.annexrepo import AnnexRepo
        from datalad.utils import assure_list

        pattern = assure_list(pattern)
        ds = require_dataset(dataset,
                             check_installed=True,
                             purpose='no_annex configuration')

        res_kwargs = dict(
            path=ds.path,
            type='dataset',
            action='no_annex',
        )

        # all the ways we refused to cooperate
        if not isinstance(ds.repo, AnnexRepo):
            yield dict(res_kwargs,
                       status='notneeded',
                       message='dataset has no annex')
            return
        if any(isabs(p) for p in pattern):
            yield dict(
                res_kwargs,
                status='error',
                message=
                ('path patterns for `no_annex` configuration must be relative: %s',
                 pattern))
            return
        if isabs(ref_dir):
            yield dict(
                res_kwargs,
                status='error',
                message=
                ('`ref_dir` for `no_annex` configuration must be a relative path: %s',
                 ref_dir))
            return

        gitattr_dir = opj(ds.path, ref_dir)
        if not exists(gitattr_dir):
            if makedirs:
                makedirsfx(gitattr_dir)
            else:
                yield dict(
                    res_kwargs,
                    status='error',
                    message=
                    'target directory for `no_annex` does not exist (consider makedirs=True)'
                )
                return

        gitattr_file = opj(gitattr_dir, '.gitattributes')
        ds.repo.set_gitattributes([(p, {
            'annex.largefiles': 'nothing'
        }) for p in pattern],
                                  attrfile=gitattr_file)
        yield dict(res_kwargs, status='ok')

        for r in ds.save(gitattr_file,
                         to_git=True,
                         message="[DATALAD] exclude paths from annex'ing",
                         result_filter=None,
                         result_xfm=None):
            yield r
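A hedged usage sketch for the `no_annex` dataset method registered above; the dataset location './myds' is hypothetical:

from datalad.distribution.dataset import Dataset

ds = Dataset('./myds')   # an existing dataset that uses an annex
# keep everything under code/ (including subdirectories) directly in Git
ds.no_annex(pattern=['code/**'])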
Example #3
class ImportDicoms(Interface):
    """Import a DICOM archive into a study raw dataset.

    This creates a subdataset, containing the extracted DICOM files, under ACQUISITION ID/dicoms.
    Metadata is extracted from the DICOM headers and a study specification will automatically be prefilled, based on
    the metadata in DICOM headers. The specification is written to AQUISTION ID/studyspec.json by default.
    To this end after the creation of the subdataset and the extraction of DICOM metadata, hirni-dicom2spec is called internally.
    Therefore whatever you configure regarding dicom2spec applies here as well. Please refer to hirni-dicom2spec's documentation
    on how to configure the deduction from DICOM metadata to a study specification."""

    _params_ = dict(
        dataset=Parameter(
            args=("-d", "--dataset"),
            metavar='PATH',
            doc="""specify the dataset to import the DICOM archive into.  If
            no dataset is given, an attempt is made to identify the dataset
            based on the current working directory and/or the given PATH""",
            constraints=EnsureDataset() | EnsureNone()),
        path=Parameter(
            args=("path", ),
            metavar='PATH',
            doc="""path or URL of the dicom archive to be imported.""",
            constraints=EnsureStr()),
        acqid=Parameter(
            args=("acqid", ),
            metavar="ACQUISITION ID",
            doc="""acquisition identifier for the imported DICOM files. This is
            used as the name of the directory that is supposed to contain all
            data related to that acquisition. If not specified, an attempt will
            be made to derive ACQUISITION_ID from DICOM metadata. You can
            specify how to deduce that identifier from the DICOM header fields
            by configuring `datalad.hirni.import.acquisition-format` with a
            Python format string referencing DICOM header field names as
            variables. For example, the current default value for that
            configuration is "{PatientID}".""",
            nargs="?",
            constraints=EnsureStr() | EnsureNone()),
        subject=Parameter(
            args=("--subject", ),
            metavar="SUBJECT",
            doc="""subject identifier. If not specified, an attempt will be made 
            to derive SUBJECT from DICOM headers. See hirni-dicom2spec for details.""",
            constraints=EnsureStr() | EnsureNone()),
        anon_subject=Parameter(
            args=("--anon-subject", ),
            metavar="ANON_SUBJECT",
            doc="""an anonymized subject identifier. This is needed for 
            anonymized conversion via spec2bids --anonymize and will be stored 
            in the specification snippet for the imported DICOMs. Hence it can 
            be added later and isn't mandatory for the import.""",
            constraints=EnsureStr() | EnsureNone()),
        properties=Parameter(
            args=("--properties", ),
            metavar="PATH or JSON string",
            doc="""a JSON string or a path to a JSON file, to provide 
            overrides/additions to the to be created specification snippets for this acquisition.
            """,
            constraints=EnsureStr() | EnsureNone()),
    )

    @staticmethod
    @datasetmethod(name='hirni_import_dcm')
    @eval_results
    def __call__(path,
                 acqid=None,
                 dataset=None,
                 subject=None,
                 anon_subject=None,
                 properties=None):
        ds = require_dataset(dataset,
                             check_installed=True,
                             purpose="import DICOM session")
        if acqid:
            # acquisition was specified => we know where to create subds
            acq_dir = op.join(ds.path, acqid)
            if not op.exists(acq_dir):
                makedirs(acq_dir)
            # TODO: if exists: needs to be empty?

            dicom_ds = _create_subds_from_tarball(path, acq_dir)

        else:
            # we don't know the acquisition id yet => create in tmp

            acq_dir = op.join(ds.path, '.git', 'datalad', 'hirni_import')
            assert not op.exists(acq_dir)
            # TODO: don't assert; check and adapt instead

            try:
                dicom_ds = _create_subds_from_tarball(path, acq_dir)
                dicom_ds = _guess_acquisition_and_move(dicom_ds, ds)
            except OSError as e:
                # TODO: Was FileExistsError. Find more accurate PY2/3 solution
                # than just OSError
                yield dict(status='impossible',
                           path=e.filename,
                           type='file',
                           action='import DICOM tarball',
                           logger=lgr,
                           message=exc_str(e))
                rmtree(acq_dir)
                return  # we can't do anything
            finally:
                if op.exists(acq_dir):
                    lgr.debug("Killing temp dataset at %s ...", acq_dir)
                    rmtree(acq_dir)

        acqid = op.basename(op.dirname(dicom_ds.path))
        ds.save(dicom_ds.path,
                message="[HIRNI] Add aquisition {}".format(acqid))

        # Note: use path with trailing slash to indicate we want metadata about the content of this subds,
        # not the subds itself.
        ds.meta_aggregate(with_pathsep(dicom_ds.path), into='top')

        ds.hirni_dicom2spec(path=dicom_ds.path,
                            spec=op.normpath(
                                op.join(dicom_ds.path, op.pardir,
                                        "studyspec.json")),
                            subject=subject,
                            anon_subject=anon_subject,
                            acquisition=acqid,
                            properties=properties)

        # TODO: This should probably be optional
        # We have the tarball and can drop extracted stuff:
        dicom_ds.drop([
            f for f in listdir(dicom_ds.path)
            if f != ".datalad" and f != ".git"
        ])

        # finally clean up git objects:
        dicom_ds.repo.cmd_call_wrapper.run(['git', 'gc'])

        # TODO: yield error results etc.
        yield dict(status='ok',
                   path=dicom_ds.path,
                   type='dataset',
                   action='import DICOM tarball',
                   logger=lgr)
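A hedged usage sketch for the `hirni_import_dcm` dataset method defined above (available once the extension providing it is installed); the archive name and acquisition identifier are hypothetical:

from datalad.distribution.dataset import Dataset

study_ds = Dataset('.')  # the study raw dataset
# import a DICOM tarball, filing it under acq1/dicoms and writing the
# prefilled specification to acq1/studyspec.json
study_ds.hirni_import_dcm('dicoms.tar.gz', 'acq1')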
Example #4
class WebApp(Interface):
    """
    """
    _params_ = dict(
        app=Parameter(args=('app', ),
                      nargs='?',
                      metavar='APPNAME',
                      doc="""Name of a registered webapp to start"""),
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc="""specify the dataset to serve as the anchor of the webapp.
            An attempt is made to identify the dataset based on the current
            working directory. If a dataset is given, the command will be
            executed in the root directory of this dataset.""",
            constraints=EnsureDataset() | EnsureNone()),
        read_only=Parameter(
            args=("--read-only", ),
            constraints=EnsureBool(),
            doc="""do not perform operations other then read-only access
            to dataset. It is up to the individual resources to interpret
            this flag and act accordingly."""),
        mode=Parameter(
            args=("--mode", ),
            constraints=EnsureChoice('normal', 'daemon', 'dry-run', 'debug'),
            doc="""Execution mode: regular foreground process (normal);
            background process (daemon); no server is started, but all
            configuration is performed (dry-run); like normal, but in debug
            mode (debug)"""),
        static_root=Parameter(
            args=("--static-root", ),
            doc="""path to static (HTML) files that should be served in
            the root of the webapp. Defaults to the current directory."""),
        get_apps=Parameter(args=('--get-apps', ),
                           action='store_true',
                           doc="""if set, yields all registered webapp."""),
    )

    @staticmethod
    @datasetmethod(name='webapp')
    @eval_results
    def __call__(app=None,
                 dataset=None,
                 read_only=False,
                 mode='normal',
                 static_root=None,
                 get_apps=False):
        if get_apps:
            for ep in iter_entry_points('datalad.webapp.apps'):
                yield dict(action='webapp',
                           status='ok' if resource_isdir(
                               ep.module_name, ep.load()) else 'error',
                           path=ep.name,
                           logger=lgr,
                           message=("provided by '%s'", ep.module_name))
            return

        from datalad.distribution.dataset import require_dataset
        dataset = require_dataset(dataset,
                                  check_installed=True,
                                  purpose='serving')

        if static_root is None and app:
            for ep in iter_entry_points('datalad.webapp.apps'):
                if ep.name == app:
                    app_path = resource_filename(ep.module_name, ep.load())
                    if not resource_isdir(ep.module_name, ep.load()):
                        yield dict(
                            action='webapp',
                            status='error',
                            path=dataset.path,
                            message=
                            ("app entrypoint '%s' does not point to directory",
                             app, app_path))
                        return
                    static_root = app_path
                    break
            if static_root is None:
                yield dict(action='webapp',
                           status='error',
                           path=dataset.path,
                           message=("no registered webapp with name '%s'",
                                    app))
                return
        elif static_root is None:
            static_root = op.curdir

        from flask import Flask
        app = Flask(
            __name__,
            root_path=dataset.path,
            static_url_path='',
            static_folder=op.abspath(static_root),
        )
        app.secret_key = os.urandom(64)
        # expose via arg
        app.config['api_key'] = 'dummy'

        webapp_props['config'] = app.config

        from flask_restful import Api
        api = Api(app, prefix="/api/v1")

        # TODO add default route to static index.html, if one exists
        # TODO use opt-in model for endpoints to limit exposure of
        # functionality to what is really needed
        for ep in iter_entry_points('datalad.webapp.resources'):
            lgr.warn("Available webapp resource'%s'", ep.name)
            cls = ep.load()
            urls = ['/{}'.format(ep.name)]
            if hasattr(cls, '_urlarg_spec'):
                urls.append('/{}/{}'.format(ep.name, cls._urlarg_spec))

            api.add_resource(cls,
                             *urls,
                             resource_class_kwargs=dict(dataset=dataset, ))

        if op.exists(op.join(static_root, 'index.html')):
            from flask import send_from_directory

            @app.route('/')
            def serve_index():
                return send_from_directory(static_root, 'index.html')

        if mode == 'dry-run':
            yield dict(
                action='webapp',
                status='ok',
                app=app,
                path=dataset.path,
            )
            return

        print("""
*************************************************
*************************************************

      THIS IS NOT A PRODUCTION-READY TOOL

      - only use in a trusted environment
      - do not expose service on public
        network interfaces

*************************************************
*************************************************
""")
        # TODO expose flags, or use FLASK config vars
        app.run(debug=mode == 'debug')
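A hedged sketch of how the `webapp` dataset method above could be driven from Python, assuming the extension providing it is installed; the app name 'example' is hypothetical:

from datalad.distribution.dataset import Dataset

ds = Dataset('.')
# list all registered webapps (one result record per entry point)
for res in ds.webapp(get_apps=True, return_type='generator'):
    print(res['path'])
# perform all configuration, but do not start a server
ds.webapp('example', mode='dry-run')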
Example #5
class Get(Interface):
    """Get any dataset content (files/directories/subdatasets).

    This command only operates on dataset content. To obtain a new independent
    dataset from some source use the `clone` command.

    By default this command operates recursively within a dataset, but not
    across potential subdatasets, i.e. if a directory is provided, all files in
    the directory are obtained. Recursion into subdatasets is supported too. If
    enabled, relevant subdatasets are detected and installed in order to
    fulfill a request.

    Known data locations for each requested file are evaluated and data are
    obtained from some available location (according to git-annex configuration
    and possibly assigned remote priorities), unless a specific source is
    specified.

    *Getting subdatasets*

    Just as DataLad supports getting file content from more than one location,
    the same is supported for subdatasets, including a ranking of individual
    sources for prioritization.

    The following location candidates are considered. For each candidate a
    cost is given in parentheses; higher values indicate higher cost, and thus
    lower priority:

    - URL of any configured superdataset remote that is known to have the
      desired submodule commit, with the submodule path appended to it.
      There can be more than one candidate (cost 500).

    - In case `.gitmodules` contains a relative path instead of a URL,
      the URL of any configured superdataset remote that is known to have the
      desired submodule commit, with this relative path appended to it.
      There can be more than one candidate (cost 500).

    - A URL or absolute path recorded in `.gitmodules` (cost 600).

    - In case `.gitmodules` contains a relative path as a URL, the absolute
      path of the superdataset, appended with this relative path (cost 900).

    Additional candidate URLs can be generated based on templates specified as
    configuration variables with the pattern

      `datalad.get.subdataset-source-candidate-<name>`

    where `name` is an arbitrary identifier. If `name` starts with three digits
    (e.g. '400myserver') these will be interpreted as a cost, and the
    respective candidate will be sorted into the generated candidate list
    according to this cost. If no cost is given, a default of 700 is used.

    A template string assigned to such a variable can utilize the Python format
    mini language and may reference a number of properties that are inferred
    from the parent dataset's knowledge about the target subdataset. Properties
    include any submodule property specified in the respective `.gitmodules`
    record. For convenience, an existing `datalad-id` record is made available
    under the shortened name `id`.

    Additionally, the URL of any configured remote that contains the respective
    submodule commit is available as `remote-<name>` properties, where `name`
    is the configured remote name.

    Lastly, all candidates are sorted according to their cost (lower values
    first), and duplicate URLs are stripped, keeping only their first
    occurrence in the candidate list.

    .. note::
      Power-user info: This command uses :command:`git annex get` to fulfill
      file handles.
    """
    _examples_ = [
        dict(text="Get a single file",
             code_py="get('path/to/file')",
             code_cmd="datalad get <path/to/file>"),
        dict(text="Get contents of a directory",
             code_py="get('path/to/dir/')",
             code_cmd="datalad get <path/to/dir/>"),
        dict(
            text="Get all contents of the current dataset and its subdatasets",
            code_py="get(dataset='.', recursive=True)",
            code_cmd="datalad get . -r"),
        dict(
            text="Get (clone) a registered subdataset, but don't retrieve data",
            code_py="get('path/to/subds', get_data=False)",
            code_cmd="datalad get -n <path/to/subds>"),
    ]

    _params_ = dict(
        dataset=Parameter(
            args=("-d", "--dataset"),
            metavar="PATH",
            doc="""specify the dataset to perform the add operation on, in
            which case `path` arguments are interpreted as being relative
            to this dataset.  If no dataset is given, an attempt is made to
            identify a dataset for each input `path`""",
            constraints=EnsureDataset() | EnsureNone()),
        path=Parameter(
            args=("path", ),
            metavar="PATH",
            doc="""path/name of the requested dataset component. The component
            must already be known to a dataset. To add new components to a
            dataset use the `add` command""",
            nargs="*",
            constraints=EnsureStr() | EnsureNone()),
        source=Parameter(
            args=(
                "-s",
                "--source",
            ),
            metavar="LABEL",
            doc="""label of the data source to be used to fulfill requests.
            This can be the name of a dataset :term:`sibling` or another known
            source""",
            constraints=EnsureStr() | EnsureNone()),
        recursive=recursion_flag,
        recursion_limit=Parameter(
            args=(
                "-R",
                "--recursion-limit",
            ),
            metavar="LEVELS",
            constraints=EnsureInt() | EnsureChoice('existing') | EnsureNone(),
            doc="""limit recursion into subdataset to the given number of levels.
            Alternatively, 'existing' will limit recursion to subdatasets that already
            existed on the filesystem at the start of processing, and prevent new
            subdatasets from being obtained recursively."""),
        get_data=Parameter(
            args=(
                "-n",
                "--no-data",
            ),
            dest='get_data',
            action='store_false',
            doc=
            """whether to obtain data for all file handles. If disabled, `get`
            operations are limited to dataset handles.[CMD:  This option prevents data
            for file handles from being obtained CMD]"""),
        description=location_description,
        reckless=reckless_opt,
        jobs=jobs_opt)

    @staticmethod
    @datasetmethod(name='get')
    @eval_results
    def __call__(
        path=None,
        source=None,
        dataset=None,
        recursive=False,
        recursion_limit=None,
        get_data=True,
        description=None,
        reckless=None,
        jobs='auto',
    ):
        refds_path = Interface.get_refds_path(dataset)
        if not (dataset or path):
            raise InsufficientArgumentsError(
                "Neither dataset nor target path(s) provided")
        if dataset and not path:
            # act on the whole dataset if nothing else was specified
            path = refds_path

        # we have to have a single dataset to operate on
        refds = require_dataset(dataset,
                                check_installed=True,
                                purpose='get content')

        content_by_ds = {}
        # use subdatasets() to discover any relevant content that is not
        # already present in the root dataset (refds)
        for sdsres in Subdatasets.__call__(
                contains=path,
                # maintain path argument semantics and pass in dataset arg
                # as is
                dataset=dataset,
                # always come from the top to get sensible generator behavior
                bottomup=False,
                # when paths are given, they will constrain the recursion
                # automatically, and we need to enable recursion so we can
                # locate paths in subdatasets several levels down
                recursive=True if path else recursive,
                recursion_limit=None if path else recursion_limit,
                return_type='generator',
                on_failure='ignore'):
            if sdsres.get('type', None) != 'dataset':
                # if it is not about a 'dataset' it is likely content in
                # the root dataset
                if sdsres.get('status', None) == 'impossible' and \
                        sdsres.get('message', None) == \
                        'path not contained in any matching subdataset':
                    target_path = Path(sdsres['path'])
                    if refds.pathobj != target_path and \
                            refds.pathobj not in target_path.parents:
                        yield dict(
                            action='get',
                            path=str(target_path),
                            status='error',
                            message=('path not associated with dataset %s',
                                     refds),
                        )
                        continue
                    # check if we need to obtain anything underneath this path
                    # the subdataset() call above will only look _until_ it
                    # hits the targetpath
                    for res in _install_targetpath(
                            refds,
                            Path(sdsres['path']),
                            recursive,
                            recursion_limit,
                            reckless,
                            refds_path,
                            description,
                            jobs=jobs,
                    ):
                        # fish out the datasets that 'contains' a targetpath
                        # and store them for later
                        if res.get('status', None) in ('ok', 'notneeded') and \
                                'contains' in res:
                            dsrec = content_by_ds.get(res['path'], set())
                            dsrec.update(res['contains'])
                            content_by_ds[res['path']] = dsrec
                        if res.get('status', None) != 'notneeded':
                            # all those messages on not having installed anything
                            # are a bit pointless
                            # "notneeded" for annex get comes below
                            yield res
                else:
                    # dunno what this is, send upstairs
                    yield sdsres
                # must continue for both conditional branches above
                # the rest is about stuff in real subdatasets
                continue
            # instance of the closest existing dataset for this result
            ds = Dataset(sdsres['parentds'] if sdsres.get('state', None) ==
                         'absent' else sdsres['path'])
            assert 'contains' in sdsres
            # explore the unknown
            for target_path in sdsres.get('contains', []):
                # essentially the same as done above for paths in the root
                # dataset, but here we are starting from the closest
                # discovered subdataset
                for res in _install_targetpath(
                        ds,
                        Path(target_path),
                        recursive,
                        recursion_limit,
                        reckless,
                        refds_path,
                        description,
                        jobs=jobs,
                ):
                    known_ds = res['path'] in content_by_ds
                    if res.get('status', None) in ('ok', 'notneeded') and \
                            'contains' in res:
                        dsrec = content_by_ds.get(res['path'], set())
                        dsrec.update(res['contains'])
                        content_by_ds[res['path']] = dsrec
                    # prevent double-reporting of datasets that have been
                    # installed by explorative installation to get to target
                    # paths, prior in this loop
                    if res.get('status', None) != 'notneeded' or not known_ds:
                        yield res

        if not get_data:
            # done already
            return

        # and now annex-get, this could all be done in parallel now
        for ds, content in content_by_ds.items():
            for res in _get_targetpaths(Dataset(ds), content, refds.path,
                                        source, jobs):
                if res['path'] not in content_by_ds:
                    # we had reports on datasets and subdatasets already
                    # before the annex stage
                    yield res
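A hedged sketch combining the candidate-URL configuration described in the docstring with a recursive `get`; the server URL and subdataset path are assumptions:

from datalad.distribution.dataset import Dataset

ds = Dataset('.')
# register an additional clone candidate with cost 400 (tried before the
# default 500/600 candidates); '{id}' is filled with the subdataset's datalad-id
ds.config.set(
    'datalad.get.subdataset-source-candidate-400myserver',
    'https://example.org/store/{id}',
    where='local')
# install the subdataset (and anything below it), but skip file content
ds.get('path/to/subds', recursive=True, get_data=False)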
Example #6
class Diff(Interface):
    """Report changes of dataset components.

    Reports can be generated for changes between recorded revisions, or
    between a revision and the state of a dataset's work tree.

    Unlike 'git diff', this command also reports untracked content when
    comparing a revision to the state of the work tree. Such content is
    marked with the property `state='untracked'` in the command results.

    The following types of changes are distinguished and reported via the
    `state` result property:

    - added
    - copied
    - deleted
    - modified
    - renamed
    - typechange
    - unmerged
    - untracked

    Whenever applicable, source and/or destination revisions are reported
    to indicate when exactly within the requested revision range a particular
    component changed its status.

    Optionally, the reported changes can be limited to a subset of paths
    within a dataset.
    """

    # make the custom renderer the default one, as the global default renderer
    # does not yield meaningful output for this command
    result_renderer = 'tailored'

    _params_ = dict(
        dataset=Parameter(args=("-d", "--dataset"),
                          doc="""specify the dataset to query.  If
            no dataset is given, an attempt is made to identify the dataset
            based on the input and/or the current working directory""",
                          constraints=EnsureDataset() | EnsureNone()),
        path=Parameter(args=("path", ),
                       metavar="PATH",
                       doc="""path to be evaluated""",
                       nargs="*",
                       constraints=EnsureStr() | EnsureNone()),
        revision=Parameter(
            args=('--revision', ),
            metavar='REVISION EXPRESSION',
            nargs='?',
            doc="""comparison reference specification. Three modes are
            supported: 1) <revision> changes you have in your working tree
            relative to the named revision (this can also be a branch name,
            tag, commit or any label Git can understand). 2) <revision>..<revision>
            changes between two arbitrary revisions. 3) <revision>...<revision>
            changes on the branch containing and up to the second <revision>,
            starting at a common ancestor of both revisions."""),
        staged=Parameter(
            args=("--staged", ),
            action="store_true",
            doc="""get the changes already staged for a commit relative
            to an optionally given revision (by default the most recent one)"""
        ),
        ignore_subdatasets=Parameter(
            args=('--ignore-subdatasets', ),
            constraints=EnsureChoice('none', 'untracked', 'dirty', 'all'),
            doc="""speed up execution by (partially) not evaluating the state of
            subdatasets in a parent dataset. With "none" a subdataset is
            considered modified when it either contains untracked or modified
            content or its last saved state differs from that recorded in the
            parent dataset. When "untracked" is used subdatasets are not
            considered modified when they only contain untracked content (but
            they are still scanned for modified content). Using "dirty" ignores
            all changes to the work tree of subdatasets, only changes to the
            revisions stored in the parent dataset are shown. Using "all" hides
            all changes to subdatasets. Note that even with "all", recursive
            execution will still report other changes in any existing
            subdataset; only the subdataset record in a parent dataset
            is not evaluated."""),
        report_untracked=Parameter(
            args=('--report-untracked', ),
            constraints=EnsureChoice('no', 'normal', 'all'),
            doc="""If and how untracked content is reported when comparing
            a revision to the state of the work tree. 'no': no untracked files
            are reported; 'normal': untracked files and entire untracked
            directories are reported as such; 'all': report individual files
            even in fully untracked directories."""),
        recursive=recursion_flag,
        recursion_limit=recursion_limit)

    @staticmethod
    @datasetmethod(name='diff')
    @eval_results
    def __call__(path=None,
                 dataset=None,
                 revision=None,
                 staged=False,
                 ignore_subdatasets='none',
                 report_untracked='normal',
                 recursive=False,
                 recursion_limit=None):
        if not dataset and not path:
            # act on the whole dataset if nothing else was specified
            dataset = curdir
        refds_path = Interface.get_refds_path(dataset)

        to_process = []
        # track which commit ranges we want to diff per dataset
        ds_diffies = {}
        for ap in AnnotatePaths.__call__(
                path=path,
                dataset=refds_path,
                recursive=recursive,
                recursion_limit=recursion_limit,
                action='diff',
                # unavailable is OK, because we might query for a deleted file
                unavailable_path_status='',
                nondataset_path_status='impossible',
                # must not use `modified`, infinite loop otherwise
                modified=None,
                return_type='generator',
                on_failure='ignore'):
            if ap.get('status', None):
                # we know what to report already
                yield ap
                continue
            if ap.get('type', None) == 'dataset':
                ap['process_content'] = True
            if ap.get('raw_input', False) or ap['path'] == refds_path:
                # prepopulate the revision specs for all input paths
                ds_diffies[ap['path'] if ap.get('type', None) ==
                           'dataset' else ap['parentds']] = revision
            to_process.append(ap)

        # sort into datasets
        content_by_ds, ds_props, completed, nondataset_paths = \
            annotated2content_by_ds(
                to_process,
                refds_path=refds_path)
        assert (not completed)

        for ds_path in sorted(content_by_ds.keys()):
            if ds_path not in ds_diffies:
                # we don't know how to diff
                # this was neither an input path, nor did we see it
                # when diffing its parent
                continue
            content_paths = content_by_ds[ds_path]
            revision = ds_diffies[ds_path]
            for r in _parse_git_diff(ds_path,
                                     diff_thingie=ds_diffies[ds_path],
                                     paths=content_paths,
                                     ignore_submodules=ignore_subdatasets,
                                     staged=staged):
                r.update(dict(action='diff', logger=lgr))
                if refds_path:
                    r['refds'] = refds_path
                if 'status' not in r:
                    r['status'] = 'ok'
                if r.get('type', None) == 'dataset':
                    # this is a subdataset report
                    # we need to use the reported commit range to properly adjust the
                    # query once we hit that subdataset
                    from_rev = r.get('revision_src', '')
                    to_rev = r.get('revision', '')
                    subrev = '{}..{}'.format(
                        from_rev if from_rev else PRE_INIT_COMMIT_SHA,
                        to_rev if to_rev else '',
                    )
                    if from_rev and from_rev == to_rev:
                        # this is a special case, where subdataset reported changes without
                        # a change in state/commit -- this is code for uncommitted changes
                        # in the subdataset (including staged ones). In such a case, we
                        # must not provide a diff range, but only the source commit we want
                        # to diff against
                        # XXX if this is changed, likely the same logic in annotate_paths needs
                        # changing too!
                        subrev = from_rev
                    ds_diffies[r['path']] = subrev
                yield r
            if (revision and '..' in revision) or report_untracked == 'no':
                # don't look for untracked content, we got a revision range
                continue
            for r in _get_untracked_content(ds_path,
                                            report_untracked,
                                            paths=content_paths):
                r.update(dict(action='diff', logger=lgr))
                if refds_path:
                    r['refds'] = refds_path
                if 'status' not in r:
                    r['status'] = 'ok'
                yield r

    @staticmethod
    def custom_result_renderer(res, **kwargs):
        from datalad.ui import ui
        if not res['status'] == 'ok':
            # logging reported already
            return
        path = relpath(res['path'], start=res['refds']) \
            if res.get('refds', None) else res['path']
        type_ = res.get('type', res.get('type_src', ''))
        max_len = len('untracked(directory)')
        state_msg = '{}{}'.format(res['state'],
                                  '({})'.format(type_) if type_ else '')
        ui.message('{fill}{state_msg}: {path}'.format(
            fill=' ' * max(0, max_len - len(state_msg)),
            state_msg=state_msg,
            path=path))
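A hedged usage sketch for the `diff` dataset method defined above; the revision range is a placeholder:

from datalad.distribution.dataset import Dataset

ds = Dataset('.')
# changes in the work tree relative to the last recorded state
for res in ds.diff(return_type='generator'):
    print(res['state'], res['path'])
# changes between two recorded revisions
ds.diff(revision='HEAD~2..HEAD')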
Example #7
class ExportToFigshare(Interface):
    """Export the content of a dataset as a ZIP archive to figshare

    Very quick and dirty approach.  Ideally figshare should be supported as
    a proper git-annex special remote.  Unfortunately, figshare does not
    support directories and can store only a flat list of files, which makes
    sensible publishing of complete datasets impossible.

    The only workaround is to publish the dataset as a zip-ball, where the
    entire content is wrapped into a .zip archive, for which figshare provides
    a navigator.
    """

    from datalad.support.param import Parameter
    from datalad.distribution.dataset import datasetmethod
    from datalad.interface.utils import eval_results
    from datalad.distribution.dataset import EnsureDataset
    from datalad.support.constraints import EnsureNone, EnsureInt, EnsureStr

    _params_ = dict(
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc=""""specify the dataset to export. If no dataset is given, an
            attempt is made to identify the dataset based on the current
            working directory.""",
            constraints=EnsureDataset() | EnsureNone()),
        filename=Parameter(
            args=("filename", ),
            metavar="PATH",
            nargs='?',
            doc="""File name of the generated ZIP archive. If no file name is
            given the archive will be generated in the current directory and
            will be named: datalad_<dataset_uuid>.zip.""",
            constraints=EnsureStr() | EnsureNone()),
        no_annex=Parameter(
            args=("--no-annex", ),
            action="store_true",
            doc="""By default the generated .zip file would be added to annex,
            and all files would get registered in git-annex to be available
            from such a tarball. Also upon upload we will register for that
            archive to be a possible source for it in annex. Setting this flag
            disables this behavior."""),
        missing_content=Parameter(
            args=("--missing-content", ),
            metavar="error|continue|ignore",
            doc="""By default, any discovered file with missing content will
            result in an error and the plugin is aborted. Setting this to
            'continue' will issue warnings instead of failing on error. The
            value 'ignore' will only inform about the problem at the 'debug'
            log level. The latter two can be helpful when generating an archive
            from a dataset where some file content is not available
            locally.""",
            constraints=EnsureStr()),
        # article_id=Parameter(
        #     args=("--project-id",),
        #     metavar="ID",
        #     doc="""If given, article (if article_id is not provided) will be
        #     created in that project.""",
        #     constraints=EnsureInt() | EnsureNone()),
        article_id=Parameter(args=("--article-id", ),
                             metavar="ID",
                             doc="""Which article to publish to.""",
                             constraints=EnsureInt() | EnsureNone()),
    )

    @staticmethod
    @datasetmethod(name='export_to_figshare')
    @eval_results
    def __call__(
        dataset,
        filename=None,
        missing_content='error',
        no_annex=False,
        # TODO: support working with projects and articles within them
        # project_id=None,
        article_id=None):
        import os
        import logging
        lgr = logging.getLogger('datalad.plugin.export_to_figshare')

        from datalad.ui import ui
        from datalad.api import add_archive_content
        from datalad.api import export_archive
        from datalad.distribution.dataset import require_dataset
        from datalad.support.annexrepo import AnnexRepo

        dataset = require_dataset(dataset,
                                  check_installed=True,
                                  purpose='export to figshare')

        if not isinstance(dataset.repo, AnnexRepo):
            raise ValueError(
                "%s is not an annex repo, so annexification could be done" %
                dataset)

        if dataset.repo.is_dirty():
            raise RuntimeError(
                "Paranoid authors of DataLad refuse to proceed in a dirty repository"
            )
        lgr.info(
            "Exporting current tree as an archive since figshare does not support directories"
        )
        archive_out = next(
            export_archive(dataset,
                           filename=filename,
                           archivetype='zip',
                           missing_content=missing_content,
                           return_type="generator"))
        assert archive_out['status'] == 'ok'
        fname = archive_out['path']

        lgr.info("Uploading %s to figshare", fname)
        figshare = FigshareRESTLaison()

        if not article_id:
            # TODO: ask if it should be an article within a project
            if ui.is_interactive:
                # or should we just upload to a new article?
                if ui.yesno(
                        "Would you like to create a new article to upload to?  "
                        "If not - we will list existing articles",
                        title="Article"):
                    article = figshare.create_article(
                        title=os.path.basename(dataset.path))
                    lgr.info(
                        "Created a new (private) article %(id)s at %(url_private_html)s. "
                        "Please visit it, enter additional meta-data and make public",
                        article)
                    article_id = article['id']
                else:
                    article_id = int(
                        ui.question(
                            "Which of the articles should we upload to.",
                            choices=map(str, figshare.get_article_ids())))
            if not article_id:
                raise ValueError("We need an article to upload to.")

        file_info = figshare.upload_file(
            fname, files_url='account/articles/%s/files' % article_id)

        if no_annex:
            lgr.info("Removing generated tarball")
            os.unlink(fname)
        else:
            # I will leave all the complaining etc to the dataset add if path
            # is outside etc
            lgr.info("'Registering' %s within annex", fname)
            repo = dataset.repo
            repo.add(fname, git=False)
            key = repo.get_file_key(fname)
            lgr.info("Adding URL %(download_url)s for it", file_info)
            repo._annex_custom_command([], [
                "git", "annex", "registerurl", '-c',
                'annex.alwayscommit=false', key, file_info['download_url']
            ])

            lgr.info("Registering links back for the content of the archive")
            add_archive_content(
                fname,
                annex=dataset.repo,
                delete_after=True,  # just remove extracted into a temp dir
                allow_dirty=True,  # since we have a tarball
                commit=
                False  # we do not want to commit anything we have done here
            )

            lgr.info("Removing generated and now registered in annex archive")
            repo.drop(key, key=True, options=['--force'])
            repo.remove(fname, force=True)  # remove the tarball

            # if annex in {'delete'}:
            #     dataset.repo.remove(fname)
            # else:
            #     # kinda makes little sense I guess.
            #     # Made more sense if export_archive could export an arbitrary treeish
            #     # so we could create a branch where to dump and export to figshare
            #     # (kinda closer to my idea)
            #     dataset.save(fname, message="Added the entire dataset into a zip file")

        # TODO: add to downloader knowledge about figshare token so it could download-url
        # those zipballs before they go public
        yield dict(
            status='ok',
            # TODO: add article url (which needs to be queried if only the ID is known)
            message="Published archive {}".format(file_info['download_url']),
            file_info=file_info,
            path=dataset,
            action='export_to_figshare',
            logger=lgr)
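
The key step in the example above is registering the figshare download URL against the annex key of the uploaded archive, so the local copy can be dropped and re-fetched later. Below is a minimal sketch of that step using plain git-annex commands instead of DataLad's internal AnnexRepo helpers; the repository path, file name, and URL are hypothetical placeholders.

import subprocess

def register_url_for_annexed_file(repo_path, archive_name, download_url):
    # look up the annex key backing the already-annexed file
    key = subprocess.check_output(
        ["git", "annex", "lookupkey", archive_name],
        cwd=repo_path, text=True).strip()
    # record the remote URL as an additional source for that key;
    # annex.alwayscommit=false defers the git-annex branch commit
    subprocess.run(
        ["git", "-c", "annex.alwayscommit=false",
         "annex", "registerurl", key, download_url],
        cwd=repo_path, check=True)
    return key

The code above then drops the local copy with --force, relying on the registered URL for later retrieval of the content.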
Example no. 8
class CreateSiblingRia(Interface):
    """Creates a sibling to a dataset in a RIA store

    Communication with a dataset in a RIA store is implemented via two
    siblings. A regular Git remote (repository sibling) and a git-annex
    special remote for data transfer (storage sibling) -- with the former
    having a publication dependency on the latter. By default, the name of the
    storage sibling is derived from the repository sibling's name by appending
    "-storage".

    The store's base path is expected to not exist, be an empty directory,
    or a valid RIA store.

    RIA store layout
    ~~~~~~~~~~~~~~~~

    A RIA store is a directory tree with a dedicated subdirectory for each
    dataset in the store. The subdirectory name is constructed from the
    DataLad dataset ID, e.g. '124/68afe-59ec-11ea-93d7-f0d5bf7b5561', where
    the first three characters of the ID are used for an intermediate
    subdirectory in order to mitigate file system limitations for stores
    containing a large number of datasets.

    Each dataset subdirectory contains a standard bare Git repository for
    the dataset.

    In addition, a subdirectory 'annex' holds a standard Git-annex object
    store. However, instead of using the 'dirhashlower' naming scheme for
    the object directories, like Git-annex would do, a 'dirhashmixed'
    layout is used -- the same as for non-bare Git repositories or regular
    DataLad datasets.

    Optionally, there can be a further subdirectory 'archives' with
    (compressed) 7z archives of annex objects. The storage remote is able to
    pull annex objects from these archives, if it cannot find them in the
    annex object store. This feature can be useful for storing large
    collections of rarely changing data on systems that limit the number of
    files that can be stored.

    Each dataset directory also contains a 'ria-layout-version' file that
    identifies the data organization (as, for example, described above).

    Lastly, there is a global 'ria-layout-version' file at the store's
    base path that identifies where dataset subdirectories themselves are
    located. At present, this file must contain a single line stating the
    version (currently "1"). This line MUST end with a newline character.

    It is possible to define an alias for an individual dataset in a store by
    placing a symlink to the dataset location into an 'alias/' directory
    in the root of the store. This enables dataset access via URLs of format:
    'ria+<protocol>://<storelocation>#~<aliasname>'.

    Error logging
    ~~~~~~~~~~~~~

    To enable error logging at the remote end, append a pipe symbol and an "l"
    to the version number in ria-layout-version (like so '1|l\\n').

    Error logging will create files in an "error_log" directory whenever the
    git-annex special remote (storage sibling) raises an exception, storing the
    Python traceback of it. The logfiles are named according to the scheme
    '<dataset id>.<annex uuid of the remote>.log' showing "who" ran into this
    issue with which dataset. Because logging can potentially leak personal
    data (like local file paths for example), it can be disabled client-side
    by setting the configuration variable
    "annex.ora-remote.<storage-sibling-name>.ignore-remote-config".
    """

    # TODO: description?
    _params_ = dict(
        dataset=Parameter(args=("-d", "--dataset"),
                          doc="""specify the dataset to process.  If
            no dataset is given, an attempt is made to identify the dataset
            based on the current working directory""",
                          constraints=EnsureDataset() | EnsureNone()),
        url=Parameter(
            args=("url", ),
            metavar="ria+<ssh|file>://<host>[/path]",
            doc="""URL identifying the target RIA store and access protocol.
            """,
            constraints=EnsureStr() | EnsureNone()),
        name=Parameter(args=(
            '-s',
            '--name',
        ),
                       metavar='NAME',
                       doc="""Name of the sibling.
            With `recursive`, the same name will be used to label all
            the subdatasets' siblings.""",
                       constraints=EnsureStr() | EnsureNone(),
                       required=True),
        storage_name=Parameter(
            args=("--storage-name", ),
            metavar="NAME",
            doc="""Name of the storage sibling (git-annex special remote).
            Must not be identical to the sibling name. If not specified,
            defaults to the sibling name plus '-storage' suffix.""",
            constraints=EnsureStr() | EnsureNone()),
        post_update_hook=Parameter(
            args=("--post-update-hook", ),
            doc="""Enable git's default post-update-hook for the created
            sibling.""",
            action="store_true"),
        shared=Parameter(
            args=("--shared", ),
            metavar='{false|true|umask|group|all|world|everybody|0xxx}',
            doc="""If given, configures the permissions in the
            RIA store for multi-user access.
            Possible values for this option are identical to those of
            `git init --shared` and are described in its documentation.""",
            constraints=EnsureStr() | EnsureBool() | EnsureNone()),
        group=Parameter(
            args=("--group", ),
            metavar="GROUP",
            doc="""Filesystem group for the repository. Specifying the group is
            crucial when [CMD: --shared=group CMD][PY: shared="group" PY]""",
            constraints=EnsureStr() | EnsureNone()),
        storage_sibling=Parameter(
            args=("--no-storage-sibling", ),
            dest='storage_sibling',
            doc="""Flag to disable establishing a storage sibling.""",
            action="store_false"),
        existing=Parameter(
            args=("--existing", ),
            constraints=EnsureChoice('skip', 'error', 'reconfigure')
            | EnsureNone(),
            metavar='MODE',
            doc="""Action to perform, if a (storage) sibling is already
            configured under the given name and/or a target already exists.
            In this case, a dataset can be skipped ('skip'), an existing target
            repository be forcefully re-initialized, and the sibling
            (re-)configured ('reconfigure'), or the command be instructed to
            fail ('error').""",
        ),
        recursive=recursion_flag,
        recursion_limit=recursion_limit,
        trust_level=Parameter(
            args=("--trust-level", ),
            metavar="TRUST-LEVEL",
            constraints=EnsureChoice('trust', 'semitrust', 'untrust')
            | EnsureNone(),
            doc="""specify a trust level for the storage sibling. If not
            specified, the default git-annex trust level is used.""",
        ),
    )

    @staticmethod
    @datasetmethod(name='create_sibling_ria')
    @eval_results
    def __call__(url,
                 name,
                 dataset=None,
                 storage_name=None,
                 post_update_hook=False,
                 shared=None,
                 group=None,
                 storage_sibling=True,
                 existing='error',
                 trust_level=None,
                 recursive=False,
                 recursion_limit=None):

        ds = require_dataset(dataset,
                             check_installed=True,
                             purpose='create sibling RIA')
        res_kwargs = dict(
            ds=ds,
            action="create-sibling-ria",
            logger=lgr,
        )

        if ds.repo.get_hexsha() is None or ds.id is None:
            raise RuntimeError("Repository at {} is not a DataLad dataset, "
                               "run 'datalad create [--force]' first.".format(
                                   ds.path))

        if not storage_sibling and storage_name:
            lgr.warning(
                "Storage sibling setup disabled, but a storage sibling name "
                "was provided")

        if storage_sibling and not storage_name:
            storage_name = "{}-storage".format(name)

        if storage_sibling and name == storage_name:
            # leads to unresolvable, circular dependency with publish-depends
            raise ValueError("sibling names must not be equal")

        if not isinstance(url, str):
            raise TypeError("url is not a string, but %s" % type(url))

        # Query existing siblings upfront in order to fail early on
        # existing=='error', since a misconfiguration (particularly of special
        # remotes) that only surfaces in a subdataset later on can be quite
        # painful.
        # TODO: messages - this is "create-sibling". Don't confuse existence of
        #       local remotes with existence of the actual remote sibling
        #       in wording
        if existing == 'error':
            # in recursive mode this check could take a substantial amount of
            # time: employ a progress bar (or rather a counter, because we don't
            # know the total in advance)
            pbar_id = 'check-siblings-{}'.format(id(ds))
            log_progress(
                lgr.info,
                pbar_id,
                'Start checking pre-existing sibling configuration %s',
                ds,
                label='Query siblings',
                unit=' Siblings',
            )
            # even if we have to fail, let's report all conflicting siblings
            # in subdatasets
            failed = False
            for r in ds.siblings(result_renderer=None,
                                 recursive=recursive,
                                 recursion_limit=recursion_limit):
                log_progress(lgr.info,
                             pbar_id,
                             'Discovered sibling %s in dataset at %s',
                             r['name'],
                             r['path'],
                             update=1,
                             increment=True)
                if r['type'] != 'sibling' or r['status'] != 'ok':
                    # this is an internal status query that has no consequence
                    # for the outside world. Be silent unless something useful
                    # can be said
                    #yield r
                    continue
                if r['name'] == name:
                    res = get_status_dict(
                        status='error',
                        message="a sibling '{}' is already configured in "
                        "dataset {}".format(name, r['path']),
                        **res_kwargs,
                    )
                    failed = True
                    yield res
                    continue
                if storage_name and r['name'] == storage_name:
                    res = get_status_dict(
                        status='error',
                        message="a sibling '{}' is already configured in "
                        "dataset {}".format(storage_name, r['path']),
                        **res_kwargs,
                    )
                    failed = True
                    yield res
                    continue
            log_progress(
                lgr.info,
                pbar_id,
                'Finished checking pre-existing sibling configuration %s',
                ds,
            )
            if failed:
                return

        # TODO: - URL parsing + store creation needs to be RF'ed based on
        #         command abstractions
        #       - more generally consider store creation a dedicated command or
        #         option
        # Note: URL parsing is done twice ATM (for the top-level ds). This
        # can't be reduced to a single instance, since rewriting the url based
        # on config could be different for subdatasets.

        # parse target URL
        try:
            ssh_host, base_path, rewritten_url = verify_ria_url(url, ds.config)
        except ValueError as e:
            yield get_status_dict(status='error', message=str(e), **res_kwargs)
            return
        create_store(
            SSHRemoteIO(ssh_host) if ssh_host else LocalIO(), Path(base_path),
            '1')

        yield from _create_sibling_ria(ds, url, name, storage_sibling,
                                       storage_name, existing, shared, group,
                                       post_update_hook, trust_level,
                                       res_kwargs)

        if recursive:
            # Note: subdatasets can be treated independently, so go full
            # recursion when querying for them and _no_recursion with the
            # actual call. Theoretically this can be parallelized.

            for subds in ds.subdatasets(fulfilled=True,
                                        recursive=True,
                                        recursion_limit=recursion_limit,
                                        result_xfm='datasets'):
                yield from _create_sibling_ria(subds, url, name,
                                               storage_sibling, storage_name,
                                               existing, shared, group,
                                               post_update_hook, trust_level,
                                               res_kwargs)
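
As a small illustration of the layout described in the docstring above (and not part of DataLad itself), the location of a dataset inside a RIA store can be derived from its ID by splitting off the first three characters as an intermediate directory; the store path and dataset ID below are hypothetical.

from pathlib import Path

def ria_dataset_path(store_base, dataset_id):
    # '12468afe-59ec-11ea-93d7-f0d5bf7b5561'
    # -> <store_base>/124/68afe-59ec-11ea-93d7-f0d5bf7b5561
    return Path(store_base) / dataset_id[:3] / dataset_id[3:]

print(ria_dataset_path("/data/ria-store",
                       "12468afe-59ec-11ea-93d7-f0d5bf7b5561"))

The resulting directory then holds the bare Git repository, the 'annex' object store, an optional 'archives' subdirectory, and the per-dataset 'ria-layout-version' file.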
Example no. 9
class Metadata(Interface):
    """Metadata reporting for files and entire datasets

    Two types of metadata are supported:

    1. metadata describing a dataset as a whole (dataset-global metadata), and

    2. metadata for files in a dataset (content metadata).

    Both types can be accessed with this command.

    Examples:

      Report the metadata of a single file, as aggregated into the closest
      locally available dataset containing the query path::

        % datalad metadata somedir/subdir/thisfile.dat

      Sometimes it is helpful to get metadata records formatted in a more accessible
      form, here as pretty-printed JSON::

        % datalad -f json_pp metadata somedir/subdir/thisfile.dat

      Same query as above, but specify which dataset to query (it must
      contain the query path)::

        % datalad metadata -d . somedir/subdir/thisfile.dat

      Report any metadata record of any dataset known to the queried dataset::

        % datalad metadata --recursive --reporton datasets 

      Get a JSON-formatted report of aggregated metadata in a dataset, incl.
      information on enabled metadata extractors, dataset versions, dataset IDs,
      and dataset paths::

        % datalad -f json metadata --get-aggregates
    """
    # make the custom renderer the default, path reporting isn't the top
    # priority here
    result_renderer = 'tailored'

    _params_ = dict(
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc="""dataset to query. If given, metadata will be reported
            as stored in this dataset. Otherwise, the closest available
            dataset containing a query path will be consulted.""",
            constraints=EnsureDataset() | EnsureNone()),
        path=Parameter(args=("path", ),
                       metavar="PATH",
                       doc="path(s) to query metadata for",
                       nargs="*",
                       constraints=EnsureStr() | EnsureNone()),
        get_aggregates=Parameter(
            args=('--get-aggregates', ),
            action='store_true',
            doc="""if set, yields all (sub)datasets for which aggregate
            metadata are available in the dataset. No other action is
            performed, even if other arguments are given. The reported
            results contain a dataset's ID, the commit hash at which
            metadata aggregation was performed, and the location of the
            object file(s) containing the aggregated metadata."""),
        reporton=reporton_opt,
        recursive=recursion_flag)
    # MIH: not sure if a recursion limit makes sense here
    # ("outdated from 5 levels down?")
    #recursion_limit=recursion_limit)

    @staticmethod
    @datasetmethod(name='metadata')
    @eval_results
    def __call__(path=None,
                 dataset=None,
                 get_aggregates=False,
                 reporton='all',
                 recursive=False):
        # prep results
        refds_path = Interface.get_refds_path(dataset)
        res_kwargs = dict(action='metadata', logger=lgr)
        if refds_path:
            res_kwargs['refds'] = refds_path

        if get_aggregates:
            # yield all datasets for which we have aggregated metadata as results;
            # they get actual dataset results, so we can turn them into dataset
            # instances using generic top-level code if desired
            ds = require_dataset(refds_path,
                                 check_installed=True,
                                 purpose='aggregate metadata query')
            agginfos = load_ds_aggregate_db(
                ds, version=str(aggregate_layout_version), abspath=True)
            if not agginfos:
                # if an aggregation run had ever been performed, this file
                # would exist; hence none has happened yet, and we need to
                # tell people
                yield get_status_dict(
                    ds=ds,
                    status='impossible',
                    action='metadata',
                    logger=lgr,
                    message=
                    'metadata aggregation has never been performed in this dataset'
                )
                return
            parentds = []
            for dspath in sorted(agginfos):
                info = agginfos[dspath]
                if parentds and not path_is_subpath(dspath, parentds[-1]):
                    parentds.pop()
                info.update(
                    path=dspath,
                    type='dataset',
                    status='ok',
                )
                if dspath == ds.path:
                    info['layout_version'] = aggregate_layout_version
                if parentds:
                    info['parentds'] = parentds[-1]
                yield dict(info, **res_kwargs)
                parentds.append(dspath)
            return

        if not dataset and not path:
            # makes no sense to have no dataset, go with "here"
            # error generation happens during annotation
            path = op.curdir

        content_by_ds = OrderedDict()
        for ap in AnnotatePaths.__call__(
                dataset=refds_path,
                path=path,
                # MIH: we are querying the aggregated metadata anyways, and that
                # mechanism has its own, faster way to go down the hierarchy
                #recursive=recursive,
                #recursion_limit=recursion_limit,
                action='metadata',
                # uninstalled subdatasets could be queried via aggregated metadata
                # -> no 'error'
                unavailable_path_status='',
                nondataset_path_status='error',
                # we need to know when to look into aggregated data
                force_subds_discovery=True,
                force_parentds_discovery=True,
                return_type='generator',
                on_failure='ignore'):
            if ap.get('status', None):
                # this is done
                yield ap
                continue
            if ap.get('type', None) == 'dataset' and GitRepo.is_valid_repo(
                    ap['path']):
                ap['process_content'] = True
            to_query = None
            if ap.get('state', None) == 'absent' or \
                    ap.get('type', 'dataset') != 'dataset':
                # this is a lonely absent dataset/file or content in a present dataset
                # -> query through parent
                # there must be a parent, otherwise this would be a non-dataset path
                # and would have errored during annotation
                to_query = ap['parentds']
            else:
                to_query = ap['path']
            if to_query:
                pcontent = content_by_ds.get(to_query, [])
                pcontent.append(ap)
                content_by_ds[to_query] = pcontent

        for ds_path in content_by_ds:
            ds = Dataset(ds_path)
            query_agg = [
                ap for ap in content_by_ds[ds_path]
                # this is an available subdataset, will be processed in another
                # iteration
                if ap.get('state', None) == 'absent' or not (ap.get(
                    'type', None) == 'dataset' and ap['path'] != ds_path)
            ]
            if not query_agg:
                continue
            # report from aggregated metadata
            for r in query_aggregated_metadata(
                    reporton,
                    # by default query the reference dataset, only if there is none
                    # try our luck in the dataset that contains the queried path
                    # this is consistent with e.g. `get_aggregates` reporting the
                    # situation in the reference dataset only
                    Dataset(refds_path) if refds_path else ds,
                    query_agg,
                    # recursion above could only recurse into datasets
                    # on the filesystem, but there might be any number of
                    # uninstalled datasets underneath the last installed one
                    # for which we might have metadata
                    recursive=recursive,
                    **res_kwargs):
                yield r
        return

    @staticmethod
    def custom_result_renderer(res, **kwargs):
        if res['status'] != 'ok' or not res.get('action', None) == 'metadata':
            # logging complained about this already
            return
        # list the path, available metadata keys, and tags
        path = op.relpath(res['path'], res['refds']) if res.get(
            'refds', None) else res['path']
        meta = res.get('metadata', {})
        ui.message('{path}{type}:{spacer}{meta}{tags}'.format(
            path=ac.color_word(path, ac.BOLD),
            type=' ({})'.format(ac.color_word(res['type'], ac.MAGENTA))
            if 'type' in res else '',
            spacer=' ' if len([m for m in meta if m != 'tag']) else '',
            meta=','.join(k for k in sorted(meta.keys())
                          if k not in ('tag', '@context', '@id'))
            if meta else ' -' if 'metadata' in res else ' aggregated',
            tags='' if 'tag' not in meta else ' [{}]'.format(','.join(
                ensure_list(meta['tag'])))))
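
For completeness, a hedged sketch of driving the same queries from Python rather than the command line; the dataset path and file name are hypothetical, and it assumes a DataLad version that still ships the `metadata` command.

import datalad.api as dl

ds = dl.Dataset("/path/to/dataset")

# metadata for a single file, as aggregated in this dataset
for res in ds.metadata("somedir/subdir/thisfile.dat",
                       return_type="generator",
                       on_failure="ignore"):
    print(res.get("path"), sorted(res.get("metadata", {}).keys()))

# which (sub)datasets have aggregated metadata available
for res in ds.metadata(get_aggregates=True, return_type="generator"):
    print(res["path"])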
Example no. 10
class Extract(Interface):
    """Run a metadata extractor on a dataset or file.

    This command distinguishes between dataset-level extraction and
    file-level extraction.

    If no "path" argument is given, the command
    assumes that a given extractor is a dataset-level extractor and
    executes it on the dataset that is given by the current working
    directory or by the "-d" argument.

    If a path is given, the command assumes that the path identifies a
    file and that the given extractor is a file-level extractor, which
    will then be executed on the specified file. If the file level
    extractor requests the content of a file that is not present, the
    command might "get" the file content to make it locally available.
    Path must not refer to a sub-dataset. Path must not be a directory.

    .. note::

        If you want to insert sub-dataset-metadata into the super-dataset's
        metadata, you currently have to do the following:
        first, extract dataset metadata of the sub-dataset using a dataset-
        level extractor, second add the extracted metadata with sub-dataset
        information (i.e. dataset_path, root_dataset_id, root-dataset-
        version) to the metadata of the super-dataset.

    The extractor configuration can be parameterized with key-value pairs
    given as additional arguments. Each key-value pair consists of two
    arguments, first the key, followed by the value. If no path is given,
    and you want to provide key-value pairs, you have to give the path
    "++", to prevent that the first key is interpreted as path.

    The results are written into the repository of the source dataset
    or into the repository of the dataset given by the "-i" parameter.
    If the same extractor is executed on the same element (dataset or
    file) with the same configuration, any existing results will be
    overwritten.

    The command can also take legacy datalad-metalad extractors and
    will execute them in either "content" or "dataset" mode, depending
    on the presence of the "path"-parameter.
    """

    result_renderer = "tailored"

    _examples_ = [
        dict(text='Use the metalad_core_file-extractor to extract metadata '
             'from the file "subdir/data_file_1.txt". The dataset is '
             'given by the current working directory',
             code_cmd="datalad meta-extract metalad_core_file "
             "subdir/data_file_1.txt"),
        dict(text='Use the metalad_core_file-extractor to extract metadata '
             'from the file "subdir/data_file_1.txt" in the dataset '
             '/home/datasets/ds0001',
             code_cmd="datalad meta-extract -d /home/datasets/ds0001 "
             "metalad_core_file subdir/data_file_1.txt"),
        dict(text='Use the metalad_core_dataset-extractor to extract '
             'dataset-level metadata from the dataset given by the '
             'current working directory',
             code_cmd="datalad meta-extract metalad_core_dataset"),
        dict(text='Use the metalad_core_dataset-extractor to extract '
             'dataset-level metadata from the dataset in '
             '/home/datasets/ds0001',
             code_cmd="datalad meta-extract -d /home/datasets/ds0001 "
             "metalad_core_dataset")
    ]

    _params_ = dict(
        extractorname=Parameter(
            args=("extractorname", ),
            metavar="EXTRACTOR_NAME",
            doc="Name of a metadata extractor to be executed."),
        path=Parameter(args=("path", ),
                       metavar="FILE",
                       nargs="?",
                       doc="""Path of a file or dataset to extract metadata
            from. If this argument is provided, we assume a file
            extractor is requested; if the path is not given, or
            if it identifies the root of a dataset, i.e. "", we
            assume a dataset level metadata extractor is
            specified.""",
                       constraints=EnsureStr() | EnsureNone()),
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc="""Dataset to extract metadata from. If no dataset
            is given, the dataset is determined by the current work
            directory.""",
            constraints=EnsureDataset() | EnsureNone()),
        extractorargs=Parameter(args=("extractorargs", ),
                                metavar="EXTRACTOR_ARGUMENTS",
                                doc="""Extractor arguments""",
                                nargs="*",
                                constraints=EnsureStr() | EnsureNone()))

    @staticmethod
    @datasetmethod(name="meta_extract")
    @eval_results
    def __call__(extractorname: str,
                 path: Optional[str] = None,
                 dataset: Optional[Union[Dataset, str]] = None,
                 extractorargs: Optional[List[str]] = None):

        # Get basic arguments
        extractor_name = extractorname
        extractor_args = extractorargs
        path = None if path == "++" else path

        source_dataset = require_dataset(dataset or curdir,
                                         purpose="extract metadata",
                                         check_installed=path is not None)

        if not source_dataset.repo:
            raise ValueError(f"No dataset found in {dataset or curdir}.")

        source_dataset_version = source_dataset.repo.get_hexsha()

        extractor_class = get_extractor_class(extractor_name)
        dataset_tree_path, file_tree_path = get_path_info(
            source_dataset,
            Path(path) if path else None, None)

        extraction_parameters = ExtractionParameter(
            source_dataset=source_dataset,
            source_dataset_id=UUID(source_dataset.id),
            source_dataset_version=source_dataset_version,
            extractor_class=extractor_class,
            extractor_name=extractor_name,
            extractor_arguments=args_to_dict(extractor_args),
            file_tree_path=file_tree_path,
            agent_name=source_dataset.config.get("user.name"),
            agent_email=source_dataset.config.get("user.email"))

        # If a path is given, we assume file-level metadata extraction is
        # requested, and the extractor class should be a subclass of
        # FileMetadataExtractor (or a legacy extractor).
        # If path is not given, we assume that a dataset-level extraction is
        # requested and the extractor class is a subclass of
        # DatasetMetadataExtractor (or a legacy extractor class).

        if path:
            # Check whether the path points to a sub_dataset.
            ensure_path_validity(source_dataset, file_tree_path)
            yield from do_file_extraction(extraction_parameters)
        else:
            yield from do_dataset_extraction(extraction_parameters)
        return

    @staticmethod
    def custom_result_renderer(res, **kwargs):
        if res["status"] != "ok" or res.get("action", "") != 'meta_extract':
            # logging complained about this already
            return

        metadata_record = res["metadata_record"]
        path = ({
            "path": str(metadata_record["path"])
        } if "path" in metadata_record else {})

        dataset_path = ({
            "dataset_path": str(metadata_record["dataset_path"])
        } if "dataset_path" in metadata_record else {})

        ui.message(
            json.dumps({
                **metadata_record,
                **path,
                **dataset_path,
                "dataset_id": str(metadata_record["dataset_id"]),
            }))
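
A hedged Python counterpart to the command-line examples above; it assumes the datalad-metalad extension is installed, and the dataset path, extractor names, and file name are taken from those examples.

import datalad.api as dl

ds = dl.Dataset("/home/datasets/ds0001")

# file-level extraction with a file extractor
for res in ds.meta_extract("metalad_core_file", "subdir/data_file_1.txt",
                           return_type="generator",
                           on_failure="ignore"):
    print(res["status"], res.get("metadata_record", {}).get("path"))

# dataset-level extraction (no path given)
for res in ds.meta_extract("metalad_core_dataset",
                           return_type="generator",
                           on_failure="ignore"):
    print(res["status"])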
Example no. 11
class AnnotatePaths(Interface):
    """Analyze and act upon input paths

    Given paths (or more generally location requests) are inspected and
    annotated with a number of properties. A list of recognized properties
    is provided below.

    || PYTHON >>Input `paths` for this command can either be un-annotated
    (raw) path strings, or already (partially) annotated paths. In the latter
    case, further annotation is limited to yet-unknown properties, and is
    potentially faster than initial annotation.<< PYTHON ||


    *Recognized path properties*

    {proplist}

    In the case of enabled modification detection the results may contain
    additional properties regarding the nature of the modification. See the
    documentation of the `diff` command for details.

    """
    _docs_ = dict(
        proplist='\n\n    '.join(
            '"{}"\n{}'.format(
                k,
                textwrap.fill(known_props[k],
                              initial_indent='        ',
                              subsequent_indent='        '))
            for k in sorted(known_props)))

    _params_ = dict(
        path=Parameter(args=("path", ),
                       metavar="PATH",
                       doc="""path to be annotated""",
                       nargs="*",
                       constraints=EnsureStr() | EnsureNone()),
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc="""an optional reference/base dataset for the paths""",
            constraints=EnsureDataset() | EnsureNone()),
        recursive=recursion_flag,
        recursion_limit=recursion_limit,
        action=Parameter(args=("--action", ),
                         metavar="LABEL",
                         doc="""an "action" property value to include in the
            path annotation""",
                         constraints=EnsureStr() | EnsureNone()),
        unavailable_path_status=Parameter(
            args=("--unavailable-path-status", ),
            metavar="LABEL",
            doc="""a "status" property value to include in the
            annotation for paths that are underneath a dataset, but
            do not exist on the filesystem""",
            constraints=EnsureStr() | EnsureNone()),
        unavailable_path_msg=Parameter(
            args=("--unavailable-path-msg", ),
            metavar="message",
            doc="""a "message" property value to include in the
            annotation for paths that are underneath a dataset, but
            do not exist on the filesystem""",
            constraints=EnsureStr() | EnsureNone()),
        nondataset_path_status=Parameter(
            args=("--nondataset-path-status", ),
            metavar="LABEL",
            doc="""a "status" property value to include in the
            annotation for paths that are not underneath any dataset""",
            constraints=EnsureStr() | EnsureNone()),
        force_parentds_discovery=Parameter(
            args=("--no-parentds-discovery", ),
            dest='force_parentds_discovery',
            action='store_false',
            doc="""Flag to disable reports of parent dataset information for any
            path, in particular dataset root paths. Disabling saves on command
            run time, if this information is not needed."""),
        force_subds_discovery=Parameter(
            args=("--no-subds-discovery", ),
            action='store_false',
            dest='force_subds_discovery',
            doc="""Flag to disable reporting type='dataset' for subdatasets, even
            when they are not installed, or their mount point directory doesn't
            exist. Disabling saves on command run time, if this information is
            not needed."""),
        force_untracked_discovery=Parameter(
            args=("--no-untracked-discovery", ),
            action='store_false',
            dest='force_untracked_discovery',
            doc="""Flag to disable discovery of untracked changes.
                Disabling saves on command run time, if this information is
                not needed."""),
        force_no_revision_change_discovery=Parameter(
            args=("--revision-change-discovery", ),
            action='store_false',
            dest='force_no_revision_change_discovery',
            doc="""Flag to disable discovery of changes which were not yet
            committed. Disabling saves on command run time, if this information
            is not needed."""),
        modified=Parameter(
            args=("--modified", ),
            nargs='?',
            const=True,
            constraints=EnsureStr() | EnsureBool() | EnsureNone(),
            doc="""comparison reference specification for modification detection.
            This can be (mostly) anything that `git diff` understands (commit,
            treeish, tag, etc). See the documentation of `datalad diff --revision`
            for details. Unmodified paths will not be annotated. If a requested
            path was not modified but some content underneath it was, then the
            request is replaced by the modified paths and those are annotated
            instead. This option can be used [PY: with `True` as PY][CMD: without CMD]
            an argument to test against changes that have been made, but have not
            yet been staged for a commit."""))

    @staticmethod
    @datasetmethod(name='annotate_paths')
    @eval_results
    def __call__(path=None,
                 dataset=None,
                 recursive=False,
                 recursion_limit=None,
                 action=None,
                 unavailable_path_status='',
                 unavailable_path_msg=None,
                 nondataset_path_status='error',
                 force_parentds_discovery=True,
                 force_subds_discovery=True,
                 force_no_revision_change_discovery=True,
                 force_untracked_discovery=True,
                 modified=None):
        # upfront check for the fastest possible response
        if not path and dataset is None:
            # nothing given, try "here", but do not use `require_dataset`, as
            # it will determine the root dataset of `curdir` and further down
            # lead to path annotation of upstairs directories
            dataset = curdir

        if force_subds_discovery and not force_parentds_discovery:
            raise ValueError(
                'subdataset discovery requires parent dataset discovery')

        # CONCEPT: yield with no status to indicate further processing

        # everything in one big loop to be able to yield as fast as possible
        # without any precomputing for all paths
        refds_path = Interface.get_refds_path(dataset)
        if modified is not None and (refds_path is None
                                     or not GitRepo.is_valid_repo(refds_path)):
            raise ValueError(
                "modification detection only works with a base dataset "
                "(either given or discovered)"
            )

        # prep common result props
        res_kwargs = dict(action=action if action else 'annotate_path',
                          refds=refds_path,
                          logger=lgr)

        # handle the case of recursion into a single dataset without any
        # extra fancy processing first -- full recursion can be done
        # faster than manual recursion, hence we gain quite some speed
        # from these few lines of extra code
        if not modified and not path and refds_path:
            if not GitRepo.is_valid_repo(refds_path):
                yield get_status_dict(
                    # doesn't matter if the path is in another dataset
                    # it was given as reference dataset
                    status=nondataset_path_status,
                    message='given reference dataset is not a dataset',
                    path=refds_path,
                    **res_kwargs)
                return

            refds = Dataset(refds_path)
            path = []
            # yield the dataset itself
            r = get_status_dict(ds=refds, status='', **res_kwargs)
            yield r

            if recursive:
                # if we have nothing given, but need recursion, we need to feed
                # the dataset path itself
                for r in yield_recursive(refds, refds_path, action,
                                         recursion_limit):
                    r.update(res_kwargs)
                    if 'refds' in r and not r['refds']:
                        # avoid cruft
                        del r['refds']
                    yield r
            return

        # goal: structure in a way that makes most information on any path
        # available in a single pass, at the cheapest possible cost
        reported_paths = {}
        requested_paths = ensure_list(path)

        if modified is not None:
            # modification detection would silently kill all nondataset paths
            # but we have to complain about them, hence doing it here
            if requested_paths and refds_path:
                for r in requested_paths:
                    p = r['path'] if isinstance(r, dict) else r
                    p = _resolve_path(p, ds=refds_path)
                    if path_startswith(p, refds_path):
                        # all good
                        continue
                    # not the refds
                    path_props = r if isinstance(r, dict) else {}
                    res = get_status_dict(**dict(res_kwargs, **path_props))
                    res['status'] = nondataset_path_status
                    res['message'] = 'path not associated with reference dataset'
                    reported_paths[r] = res
                    yield res

            # preserve non-existing paths to be silently killed by modification
            # detection and append them to requested_paths again after detection.
            # TODO: This might be merged with the treatment of non-dataset paths
            # above. Re-appending those paths seems better than yielding them
            # directly, to avoid code duplication, since both cases are dealt
            # with again later on.
            preserved_paths = []
            if requested_paths:
                preserved_paths = [
                    r for r in requested_paths
                    if not lexists(r['path'] if isinstance(r, dict) else r)
                ]

            # replace the requested paths by those paths that were actually
            # modified underneath or at a requested location
            requested_paths = get_modified_subpaths(
                # either the request, or the base dataset, if there was no request
                requested_paths if requested_paths else [refds_path],
                refds=Dataset(refds_path),
                revision=modified,
                report_no_revision_change=force_no_revision_change_discovery,
                report_untracked='all' if force_untracked_discovery else 'no',
                recursion_limit=recursion_limit)

            from itertools import chain
            # re-append the preserved paths:
            requested_paths = chain(requested_paths, iter(preserved_paths))

        # Possibly to be used "cache" of known subdatasets per each parent
        # to avoid re-querying subdatasets per each path.  The assumption here
        # is that the list of sub-datasets for a given parent should not change
        # through the execution of this loop, which (hypothetically) could be
        # incorrect while annotating paths for some commands.
        # TODO: verify this assumption and possibly add an argument to turn
        #  caching off if/when needed, or provide some other way to invalidate
        #  it
        subdss_cache = {}

        # do not loop over unique(), this could be a list of dicts
        # we avoid duplicates manually below via `reported_paths`
        for path in requested_paths:
            if not isinstance(path, dict):
                path = rawpath2ap(path, refds_path)
            # this is now an annotated path!
            path_props = path
            path = path['path']
            # we need to mark our territory, who knows where this has been
            path_props.update(res_kwargs)

            if path in reported_paths:
                # we already recorded this path in the output
                # this can happen whenever `path` is a subdataset that was
                # discovered via recursive processing of another path before
                continue
            # the path exists in some shape or form
            # TODO if we have path_props already we could skip this test
            if isdir(path):
                # keep any existing type info, previously a more expensive run
                # could have discovered an uninstalled 'dataset', and we don't
                # want it to be relabeled to a directory
                path_props['type'] = \
                    path_props.get(
                        'type',
                        'dataset' if not islink(path) and GitRepo.is_valid_repo(path) else 'directory')
                # this could contain all types of additional content
                containing_dir = path if not islink(path) else normpath(
                    opj(path, pardir))
            else:
                if lexists(path):
                    path_props['type'] = 'file'
                else:
                    path_props['state'] = 'absent'
                # for everything else we are interested in the container
                containing_dir = dirname(path)
                if not containing_dir:
                    containing_dir = curdir

            dspath = parent = get_dataset_root(containing_dir)
            if dspath:
                if path_props.get('type', None) == 'dataset':
                    # for a dataset the root is not the parent, for anything else
                    # it is
                    parent = path_props.get('parentds', None)
                    oneupdir = normpath(opj(containing_dir, pardir))
                    if parent is None and (force_parentds_discovery or
                                           (refds_path
                                            and _with_sep(oneupdir).startswith(
                                                _with_sep(refds_path)))):
                        # either forced, or only if we have a reference dataset, and
                        # only if we stay within this refds when searching for the
                        # parent
                        parent = get_dataset_root(
                            normpath(opj(containing_dir, pardir)))
                        # NOTE the `and refds_path` is critical, as it will determine
                        # whether a top-level dataset that was discovered gets the
                        # parent property or not, it won't get it without a common
                        # base dataset, and that is how we always rolled
                    if parent and refds_path:
                        path_props['parentds'] = parent
                        # don't check whether this is actually a true subdataset of the
                        # parent, done further down
                else:
                    # set parent, but prefer existing property
                    path_props['parentds'] = path_props.get('parentds', dspath)

            # test for `dspath` not `parent`, we only need to know whether there is
            # ANY dataset, not which one is the true parent, logic below relies on
            # the fact that we end here, if there is no dataset at all
            if not dspath:
                # not in any dataset
                res = get_status_dict(**dict(res_kwargs, **path_props))
                res['status'] = nondataset_path_status
                res['message'] = 'path not associated with any dataset'
                reported_paths[path] = res
                yield res
                continue

            # check that we only got SUBdatasets
            if refds_path and not path_startswith(dspath, refds_path):
                res = get_status_dict(**dict(res_kwargs, **path_props))
                res['status'] = nondataset_path_status
                res['message'] = \
                    ('path not part of the reference dataset at %s', refds_path)
                reported_paths[path] = res
                yield res
                continue

            if path_props.get('type', None) == 'file':
                # nothing else we can learn about this
                res = get_status_dict(**dict(res_kwargs, **path_props))
                if 'status' not in res:
                    res['status'] = ''
                reported_paths[path] = res
                yield res
                continue

            containing_ds = None
            path_type = path_props.get('type', None)
            if parent and force_subds_discovery and (
                (path_type == 'dataset'
                 and 'registered_subds' not in path_props)
                    or path_type == 'directory' or not lexists(path)):
                # if the path doesn't exist, is labeled a directory, or is even
                # a dataset (but lacking this info) -> record whether this is a
                # known subdataset of its parent
                containing_ds = Dataset(parent)
                # Possibly "cache" the list of known subdss for parents we
                # have encountered so far
                if parent in subdss_cache:
                    subdss = subdss_cache[parent]
                else:
                    subdss = containing_ds.subdatasets(fulfilled=None,
                                                       recursive=False,
                                                       result_xfm=None,
                                                       result_filter=None,
                                                       return_type='list')
                    subdss_cache[parent] = subdss
                if path in [s['path'] for s in subdss]:
                    if path_type == 'directory' or not lexists(path):
                        # first record that it isn't here, if just a dir or not here at all
                        path_props['state'] = 'absent'
                    # this must be a directory, and it is not installed
                    path_props['type'] = 'dataset'
                    path_props['registered_subds'] = True

            if not lexists(path) or \
                    (path_props.get('type', None) == 'dataset' and
                     path_props.get('state', None) == 'absent'):
                # not there (yet)
                message = unavailable_path_msg if unavailable_path_msg else None
                if message and '%s' in message:
                    message = (message, path)
                path_props['message'] = message
                res = get_status_dict(**dict(res_kwargs, **path_props))
                # assign given status, but only if the props don't indicate a status
                # already
                res['status'] = path_props.get('status',
                                               unavailable_path_status)
                reported_paths[path] = res
                yield res
                continue

            # we know everything we can, report
            res = get_status_dict(**dict(res_kwargs, **path_props))
            if 'status' not in res:
                res['status'] = ''
            reported_paths[path] = res
            yield res

            rec_paths = []
            if recursive:
                # here we need to consider the special case that `path` is
                # a dataset itself, if a recursion_limit is given (e.g.
                # `remove` will do that by default), we need to recurse
                # from the dataset itself, and not its parent to get things
                # right -- this will also avoid needless discovery of
                # unrelated subdatasets
                if path_props.get('type', None) == 'dataset':
                    containing_ds = Dataset(path)
                else:
                    # regular parent, we might have a dataset already
                    containing_ds = Dataset(
                        parent) if containing_ds is None else containing_ds
                for r in yield_recursive(containing_ds, path, action,
                                         recursion_limit):
                    # capture reported paths
                    r.update(res_kwargs)
                    if 'refds' in r and not r['refds']:
                        # avoid cruft
                        del r['refds']
                    reported_paths[r['path']] = r
                    if modified is not None:
                        # we cannot yield right away, maybe it wasn't modified
                        rec_paths.append(r)
                    else:
                        yield r
            if modified is not None and rec_paths:
                # replace the recursively discovered paths by those paths that
                # were actually modified underneath or at a requested location
                for r in get_modified_subpaths(
                        rec_paths,
                        refds=Dataset(refds_path),
                        revision=modified,
                        report_no_revision_change=
                        force_no_revision_change_discovery,
                        report_untracked='all'
                        if force_untracked_discovery else 'no',
                        recursion_limit=recursion_limit):
                    res = get_status_dict(**dict(r, **res_kwargs))
                    reported_paths[res['path']] = res
                    yield res
        return
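
A hedged usage sketch for annotate_paths from Python; the dataset and query paths are hypothetical, and it assumes a DataLad version that still provides this command. Each result is a plain dict carrying the annotated properties discussed above (type, state, parentds, ...).

import datalad.api as dl

ds = dl.Dataset("/path/to/dataset")
for ap in ds.annotate_paths(["somedir", "somedir/file.dat"],
                            return_type="generator",
                            on_failure="ignore"):
    print(ap["path"],
          ap.get("type", "unknown"),
          ap.get("state", "present"),
          ap.get("parentds", "-"))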
Example no. 12
class ContainersList(Interface):
    # first docstring line is used as a short description in the cmdline help
    # the rest is put in the verbose help and manpage
    """List containers known to a dataset
    """

    result_renderer = 'tailored'
    # parameters of the command, must be exhaustive
    _params_ = dict(
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc="""specify the dataset to query. If no dataset is given, an
            attempt is made to identify the dataset based on the current
            working directory""",
            constraints=EnsureDataset() | EnsureNone()),
        contains=Parameter(
            args=('--contains',),
            metavar='PATH',
            action='append',
            doc="""when operating recursively, restrict the reported containers
            to those from subdatasets that contain the given path (i.e. the
            subdatasets that are reported by :command:`datalad subdatasets
            --contains=PATH`). Top-level containers are always reported."""),
        recursive=recursion_flag,
    )

    @staticmethod
    @datasetmethod(name='containers_list')
    @eval_results
    def __call__(dataset=None, recursive=False, contains=None):
        ds = require_dataset(dataset, check_installed=True,
                             purpose='list containers')
        refds = ds.path

        if recursive:
            for sub in ds.subdatasets(
                    contains=contains,
                    on_failure='ignore',
                    return_type='generator',
                    result_renderer='disabled'):
                subds = Dataset(sub['path'])
                if subds.is_installed():
                    for c in subds.containers_list(recursive=recursive,
                                                   return_type='generator',
                                                   on_failure='ignore',
                                                   result_filter=None,
                                                   result_renderer=None,
                                                   result_xfm=None):
                        c['name'] = sub['gitmodule_name'] + '/' + c['name']
                        c['refds'] = refds
                        yield c

        # all info is in the dataset config!
        var_prefix = 'datalad.containers.'
        containers = {}
        for var, value in ds.config.items():
            if not var.startswith(var_prefix):
                # not an interesting variable
                continue
            var_comps = var[len(var_prefix):].split('.')
            cname = var_comps[0]
            ccfgname = '.'.join(var_comps[1:])
            if not ccfgname:
                continue

            cinfo = containers.get(cname, {})
            cinfo[ccfgname] = value

            containers[cname] = cinfo

        for k, v in containers.items():
            if 'image' not in v:
                # there is no container location configured
                continue
            res = get_status_dict(
                status='ok',
                action='containers',
                name=k,
                type='file',
                path=op.join(ds.path, v.pop('image')),
                refds=refds,
                parentds=ds.path,
                # TODO
                #state='absent' if ... else 'present'
                **v)
            yield res

    @staticmethod
    def custom_result_renderer(res, **kwargs):
        if res["action"] != "containers":
            default_result_renderer(res)
        else:
            ui.message(
                "{name} -> {path}"
                .format(name=ac.color_word(res["name"], ac.MAGENTA),
                        path=op.relpath(res["path"], res["refds"])))
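# --- Usage sketch (added for illustration, not part of the original listing) ---
# Assuming the datalad-container extension is installed so that the method
# registered above via @datasetmethod(name='containers_list') is available,
# and that the current directory is an installed dataset. Container entries
# are read from 'datalad.containers.<name>.*' configuration items.
from datalad.api import Dataset

ds = Dataset('.')
for res in ds.containers_list(recursive=True,
                              return_type='generator',
                              result_renderer='disabled'):
    # each result is a standard DataLad status dict
    print(res['name'], '->', res['path'])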
Example no. 13
0
class Clean(Interface):
    """Clean up after DataLad (possible temporary files etc.)

    Removes extracted temporary archives, etc.

    Examples:

      $ datalad clean
    """

    _params_ = dict(
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc="""specify the dataset to perform the clean operation on.  If
                no dataset is given, an attempt is made to identify the dataset
                in the current working directory""",
            constraints=EnsureDataset() | EnsureNone()),
        # TODO:  --info  -- which performs dry run just summarizing what is to be cleaned up
        # TODO: Python only???
        what=Parameter(
            args=("--what", ),
            dest='what',
            choices=('cached-archives', 'annex-tmp', 'search-index'),
            nargs="*",
            doc="""What to clean.  If none specified -- all known targets are
            cleaned"""),
        recursive=recursion_flag,
        recursion_limit=recursion_limit,
    )

    @staticmethod
    @datasetmethod(name='clean')
    @eval_results
    def __call__(dataset=None,
                 what=None,
                 recursive=False,
                 recursion_limit=None):
        ds = require_dataset(dataset, purpose='clean-up')
        res_kwargs = dict(action='clean', logger=lgr, refds=ds.path)
        for ap in AnnotatePaths.__call__(dataset=ds.path,
                                         recursive=recursive,
                                         recursion_limit=recursion_limit,
                                         action='clean',
                                         unavailable_path_status='impossible',
                                         nondataset_path_status='impossible',
                                         return_type='generator',
                                         on_failure='ignore'):
            if ap.get('status', None):
                yield ap
                continue
            if ap.get('type', None) != 'dataset':
                ap.update(status='impossible',
                          message='only datasets can be cleaned')
                yield ap
                continue
            d = ap['path']
            gitdir = get_git_dir(d)
            for dirpath, flag, msg, sing_pl in [
                (ARCHIVES_TEMP_DIR, "cached-archives", "temporary archive",
                 ("directory", "directories")),
                (ANNEX_TEMP_DIR, "annex-tmp", "temporary annex", ("file",
                                                                  "files")),
                (opj(gitdir, SEARCH_INDEX_DOTGITDIR), 'search-index',
                 "metadata search index", ("file", "files")),
            ]:
                topdir = opj(d, dirpath)
                lgr.debug("Considering to clean %s:%s", d, dirpath)
                if not ((what is None) or (flag in what)):
                    yield get_status_dict(path=topdir,
                                          status='notneeded',
                                          type='directory',
                                          **res_kwargs)
                    continue
                paths = glob(opj(topdir, '*'))
                if not paths:
                    yield get_status_dict(path=topdir,
                                          status='notneeded',
                                          type='directory',
                                          **res_kwargs)
                    continue
                pl = len(paths) > 1
                message = ("Removed %d %s %s: %s", len(paths), msg,
                           sing_pl[int(pl)], ", ".join(
                               sorted([x[len(topdir) + 1:] for x in paths])))
                rmtree(topdir)
                yield get_status_dict(path=topdir,
                                      status='ok',
                                      type='dir',
                                      message=message,
                                      **res_kwargs)
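# --- Usage sketch (added for illustration, not part of the original listing) ---
# Assuming 'clean' is bound as a dataset method as registered above. This
# restricts clean-up to the cached extracted archives; other known targets
# ('annex-tmp', 'search-index') are reported as 'notneeded'.
from datalad.api import Dataset

ds = Dataset('.')
for res in ds.clean(what=['cached-archives'],
                    return_type='generator',
                    result_renderer='disabled'):
    print(res['status'], res['path'])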
Example no. 14
0
class Search(Interface):
    """Search dataset metadata

    DataLad can search metadata extracted from a dataset and/or aggregated into
    a superdataset (see the `aggregate-metadata` command). This makes it
    possible to discover datasets, or individual files in a dataset even when
    they are not available locally.

    Ultimately DataLad metadata are a graph of linked data structures. However,
    this command does not (yet) support queries that can exploit all
    information stored in the metadata. At the moment the following search
    modes are implemented that represent different trade-offs between the
    expressiveness of a query and the computational and storage resources
    required to execute a query.

    - egrep (default)

    - egrepcs [case-sensitive egrep]

    - textblob

    - autofield

    An alternative default mode can be configured by tuning the
    configuration variable 'datalad.search.default-mode'::

      [datalad "search"]
        default-mode = egrepcs

    Each search mode has its own default configuration for what kind of
    documents to query. The respective default can be changed via configuration
    variables::

      [datalad "search"]
        index-<mode_name>-documenttype = (all|datasets|files)


    *Mode: egrep/egrepcs*

    These search modes are largely ignorant of the metadata structure, and
    simply perform matching of a search pattern against a flat
    string-representation of metadata. This is advantageous when the query is
    simple and the metadata structure is irrelevant, or precisely known.
    Moreover, it does not require a search index, hence results can be reported
    without an initial latency for building a search index when the underlying
    metadata has changed (e.g. due to a dataset update). By default, these
    search modes only consider datasets and do not investigate records for
    individual files for speed reasons. Search results are reported in the
    order in which they were discovered.

    Queries can make use of Python regular expression syntax
    (https://docs.python.org/3/library/re.html). In `egrep` mode, matching is
    case-insensitive when the query does not contain upper case characters, but
    is case-sensitive when it does. In `egrepcs` mode, matching is always
    case-sensitive. Expressions will match anywhere in a metadata string, not
    only at the start.

    When multiple queries are given, all queries have to match for a search hit
    (AND behavior).

    It is possible to search individual metadata key/value items by prefixing
    the query with a metadata key name, separated by a colon (':'). The key
    name can also be a regular expression to match multiple keys. A query match
    happens when any value of an item with a matching key name matches the query
    (OR behavior). See examples for more information.

    Examples:

      Query for (what happens to be) an author::

        % datalad search haxby

      Queries are case-INsensitive when the query contains no upper case characters,
      and can be regular expressions. Use `egrepcs` mode when it is desired
      to perform a case-sensitive lowercase match::

        % datalad search --mode egrepcs halchenko.*haxby

      This search mode performs NO analysis of the metadata content.  Therefore
      queries can easily fail to match. For example, the above query implicitly
      assumes that authors are listed in alphabetical order.  If that is the
      case (which may or may not be true), the following query would yield NO
      hits::

        % datalad search Haxby.*Halchenko

      The ``textblob`` search mode represents an alternative that is more
      robust in such cases.

      For more complex queries multiple query expressions can be provided that
      all have to match to be considered a hit (AND behavior). This query
      discovers all files (non-default behavior) that match 'bids.type=T1w'
      AND 'nifti1.qform_code=scanner'::

        % datalad -c datalad.search.index-egrep-documenttype=all search bids.type:T1w nifti1.qform_code:scanner

      Key name selectors can also be expressions, which can be used to select
      multiple keys or construct "fuzzy" queries. In such cases a query matches
      when any item with a matching key matches the query (OR behavior).
      However, multiple queries are always evaluated using an AND conjunction.
      The following query extends the example above to match any files that
      have either 'nifti1.qform_code=scanner' or 'nifti1.sform_code=scanner'::

        % datalad -c datalad.search.index-egrep-documenttype=all search bids.type:T1w nifti1.(q|s)form_code:scanner

    *Mode: textblob*

    This search mode is very similar to the ``egrep`` mode, but with a few key
    differences. A search index is built from the string-representation of
    metadata records. By default, only datasets are included in this index, hence
    the indexing is usually completed within a few seconds, even for hundreds
    of datasets. This mode uses its own query language (not regular expressions)
    that is similar to other search engines. It supports logical conjunctions
    and fuzzy search terms. More information on this is available from the Whoosh
    project (search engine implementation):

      - Description of the Whoosh query language:
        http://whoosh.readthedocs.io/en/latest/querylang.html

      - Description of a number of query language customizations that are
        enabled in DataLad, such as, fuzzy term matching:
        http://whoosh.readthedocs.io/en/latest/parsing.html#common-customizations

    Importantly, search hits are scored and reported in order of descending
    relevance, hence limiting the number of search results is more meaningful
    than in the 'egrep' mode and can also reduce the query duration.

    Examples:

      Search for (what happens to be) two authors, regardless of the order in
      which those names appear in the metadata::

        % datalad search --mode textblob halchenko haxby

      Fuzzy search when you only have an approximate idea what you are looking
      for or how it is spelled::

        % datalad search --mode textblob haxbi~

      Very fuzzy search, when you are basically only confident about the first
      two characters and how it sounds approximately (or more precisely: allow
      for three edits and require matching of the first two characters)::

        % datalad search --mode textblob haksbi~3/2

      Combine fuzzy search with logical constructs::

        % datalad search --mode textblob 'haxbi~ AND (hanke OR halchenko)'


    *Mode: autofield*

    This mode is similar to the 'textblob' mode, but builds a vastly more
    detailed search index that represents individual metadata variables as
    individual fields. By default, this search index includes records for
    datasets and individual files, hence it can grow very quickly into
    a huge structure that can easily take an hour or more to build and require
    more than a GB of storage. However, limiting it to documents on datasets
    (see above) retains the enhanced expressiveness of queries while
    dramatically reducing the resource demands.

    Examples:

      List names of search index fields (auto-discovered from the set of
      indexed datasets)::

        % datalad search --mode autofield --show-keys name

      Fuzzy search for datasets with an author that is specified in a particular
      metadata field::

        % datalad search --mode autofield bids.author:haxbi~ type:dataset

      Search for individual files that carry a particular description
      prefix in their 'nifti1' metadata::

        % datalad search --mode autofield nifti1.description:FSL* type:file


    *Reporting*

    Search hits are returned as standard DataLad results. On the command line
    the '--output-format' (or '-f') option can be used to tweak results for
    further processing.

    Examples:

      Format search hits as a JSON stream (one hit per line)::

        % datalad -f json search haxby

      Custom formatting: which terms matched the query of particular
      results. Useful for investigating fuzzy search results::

        $ datalad -f '{path}: {query_matched}' search --mode autofield bids.author:haxbi~
    """
    _params_ = dict(
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc="""specify the dataset to perform the query operation on. If
            no dataset is given, an attempt is made to identify the dataset
            based on the current working directory and/or the `path` given""",
            constraints=EnsureDataset() | EnsureNone()),
        query=Parameter(
            args=("query", ),
            metavar='QUERY',
            nargs="*",
            doc="""query string, supported syntax and features depend on the
            selected search mode (see documentation)"""),
        force_reindex=Parameter(
            args=("--reindex", ),
            dest='force_reindex',
            action='store_true',
            doc="""force rebuilding the search index, even if no change in the
            dataset's state has been detected, for example, when the index
            documenttype configuration has changed."""),
        max_nresults=Parameter(
            args=("--max-nresults", ),
            doc="""maximum number of search results to report. Setting this
            to 0 will report all search matches. Depending on the mode this
            can make the search substantially slower. If not specified, a
            mode-specific default setting will be used.""",
            constraints=EnsureInt() | EnsureNone()),
        mode=Parameter(
            args=("--mode", ),
            choices=('egrep', 'egrepcs', 'textblob', 'autofield'),
            doc="""Mode of search index structure and content. See section
            SEARCH MODES for details.
            """),
        full_record=Parameter(
            args=("--full-record", '-f'),
            action='store_true',
            doc="""If set, return the full metadata record for each search hit.
            Depending on the search mode this might require additional queries.
            By default, only data that is available to the respective search modes
            is returned. This always includes essential information, such as the
            path and the type."""),
        show_keys=Parameter(
            args=('--show-keys', ),
            choices=('name', 'short', 'full'),
            default=None,
            doc="""if given, a list of known search keys is shown. If 'name' -
            only the name is printed one per line. If 'short' or 'full',
            statistics (in how many datasets, and how many unique values) are
            printed. 'short' truncates the listing of unique values.
            No other action is performed (except for reindexing), even if other
            arguments are given. Each key is accompanied by a term definition in
            parentheses (TODO). In most cases a definition is given in the form
            of a URL. If an ontology definition for a term is known, this URL
            can resolve to a webpage that provides a comprehensive definition
            of the term. However, for speed reasons term resolution is solely done
            on information contained in a local dataset's metadata, and definition
            URLs might be outdated or point to no longer existing resources."""
        ),
        show_query=Parameter(
            args=('--show-query', ),
            action='store_true',
            doc="""if given, the formal query that was generated from the given
            query string is shown, but not actually executed. This is mostly useful
            for debugging purposes."""),
    )

    @staticmethod
    @datasetmethod(name='search')
    @eval_results
    def __call__(query=None,
                 dataset=None,
                 force_reindex=False,
                 max_nresults=None,
                 mode=None,
                 full_record=False,
                 show_keys=None,
                 show_query=False):
        try:
            ds = require_dataset(dataset,
                                 check_installed=True,
                                 purpose='dataset search')
            if ds.id is None:
                raise NoDatasetArgumentFound(
                    "This does not seem to be a dataset (no DataLad dataset ID "
                    "found). 'datalad create --force %s' can initialize "
                    "this repository as a DataLad dataset" % ds.path)
        except NoDatasetArgumentFound:
            for r in _search_from_virgin_install(dataset, query):
                yield r
            return

        if mode is None:
            # let's get inspired by what the dataset/user think is
            # default
            mode = ds.config.obtain('datalad.search.default-mode')

        if mode == 'egrep':
            searcher = _EGrepSearch
        elif mode == 'egrepcs':
            searcher = _EGrepCSSearch
        elif mode == 'textblob':
            searcher = _BlobSearch
        elif mode == 'autofield':
            searcher = _AutofieldSearch
        else:
            raise ValueError('unknown search mode "{}"'.format(mode))

        searcher = searcher(ds, force_reindex=force_reindex)

        if show_keys:
            searcher.show_keys(show_keys)
            return

        if not query:
            return

        if show_query:
            print(repr(searcher.get_query(query)))
            return

        for r in searcher(query,
                          max_nresults=max_nresults,
                          full_record=full_record):
            yield r
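# --- Usage sketch (added for illustration, not part of the original listing) ---
# Assuming 'search' is bound as a dataset method as registered above and the
# dataset has aggregated metadata. The first call is a free-text egrep query;
# the second uses the 'key:query' syntax with a regular-expression key name,
# mirroring the docstring example.
from datalad.api import Dataset

ds = Dataset('.')
hits = list(ds.search('haxby',
                      return_type='generator',
                      result_renderer='disabled',
                      on_failure='ignore'))
hits += list(ds.search('nifti1.(q|s)form_code:scanner',
                       mode='egrep',
                       return_type='generator',
                       result_renderer='disabled',
                       on_failure='ignore'))
for h in hits:
    print(h.get('path'), h.get('query_matched'))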
Example no. 15
0
class Save(Interface):
    """Save the current state of a dataset

    Saving the state of a dataset records changes that have been made to it.
    This change record is annotated with a user-provided description.
    Optionally, an additional tag, such as a version, can be assigned to the
    saved state. Such a tag enables straightforward retrieval of past versions at
    a later point in time.

    .. note::
      Before Git v2.22, any Git repository without an initial commit located
      inside a Dataset is ignored, and content underneath it will be saved to
      the respective superdataset. DataLad datasets always have an initial
      commit, hence are not affected by this behavior.
    """
    # the note above documents that our behavior is like that of `git add`, but
    # does not explicitly mention the connection to keep it simple.

    _examples_ = [
        dict(text="""Save any content underneath the current directory, without
             altering any potential subdataset""",
             code_py="save(path='.')",
             code_cmd="datalad save ."),
        dict(text="""Save specific content in the dataset""",
             code_py="save(path='myfile.txt')",
             code_cmd="datalad save myfile.txt"),
        dict(text="""Attach a commit message to save""",
             code_py="save(path='myfile.txt', message='add file')",
             code_cmd="datalad save -m 'add file' myfile.txt"),
        dict(text="""Save any content underneath the current directory, and
             recurse into any potential subdatasets""",
             code_py="save(path='.', recursive=True)",
             code_cmd="datalad save . -r"),
        dict(text="Save any modification of known dataset content in the "
                  "current directory, but leave untracked files (e.g. temporary files) "
                  "untouched",
             code_py="""save(path='.', updated=True)""",
             code_cmd="""datalad save -u ."""),
        dict(text="Tag the most recent saved state of a dataset",
             code_py="save(version_tag='bestyet')",
             code_cmd="datalad save --version-tag 'bestyet'"),
        dict(text="Save a specific change but integrate into last commit keeping "
                  "the already recorded commit message",
             code_py="save(path='myfile.txt', amend=True)",
             code_cmd="datalad save myfile.txt --amend")
    ]

    _params_ = dict(
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc="""specify the dataset to save""",
            constraints=EnsureDataset() | EnsureNone()),
        path=Parameter(
            args=("path",),
            metavar='PATH',
            doc="""path/name of the dataset component to save. If given, only
            changes made to those components are recorded in the new state.""",
            nargs='*',
            constraints=EnsureStr() | EnsureNone()),
        message=save_message_opt,
        message_file=Parameter(
            args=("-F", "--message-file"),
            doc="""take the commit message from this file. This flag is
            mutually exclusive with -m.""",
            constraints=EnsureStr() | EnsureNone()),
        version_tag=Parameter(
            args=("-t", "--version-tag",),
            metavar='ID',
            doc="""an additional marker for that state. Every dataset that
            is touched will receive the tag.""",
            constraints=EnsureStr() | EnsureNone()),
        recursive=recursion_flag,
        recursion_limit=recursion_limit,
        updated=Parameter(
            args=('-u', '--updated',),
            action='store_true',
            doc="""if given, only saves previously tracked paths."""),
        to_git=Parameter(
            args=("--to-git",),
            action='store_true',
            doc="""flag whether to add data directly to Git, instead of
            tracking data identity only.  Use with caution, there is no
            guarantee that a file put directly into Git like this will
            not be annexed in a subsequent save operation.
            If not specified, it will be up to git-annex to decide how
            a file is tracked, based on a dataset's configuration
            to track particular paths,
            file types, or file sizes with either Git or git-annex.
            (see https://git-annex.branchable.com/tips/largefiles).
            """),
        jobs=jobs_opt,
        amend=Parameter(
            args=('--amend',),
            action='store_true',
            doc="""if set, changes are not recorded in a new, separate
            commit, but are integrated with the changeset of the previous
            commit, and both together are recorded by replacing that
            previous commit. This is mutually exclusive with recursive
            operation.
            """),
    )

    @staticmethod
    @datasetmethod(name='save')
    @eval_results
    def __call__(path=None,
                 *,
                 message=None, dataset=None,
                 version_tag=None,
                 recursive=False, recursion_limit=None,
                 updated=False,
                 message_file=None,
                 to_git=None,
                 jobs=None,
                 amend=False,
                 ):
        if message and message_file:
            raise ValueError(
                "Both a message and message file were specified for save()")

        if amend and recursive:
            raise ValueError("Cannot amend a commit recursively.")

        path = ensure_list(path)

        if message_file:
            with open(message_file) as mfh:
                message = mfh.read()

        # we want 'normal' to achieve the most compact argument list
        # for git calls
        # untracked_mode = 'no' if updated else 'normal'
        # TODO however, Repo.add() would refuse to add any dotfiles
        # in a directory that is itself untracked, hence the only
        # choice is to go with potentially crazy long lists
        # until https://github.com/datalad/datalad/issues/1454
        # has a resolution
        untracked_mode = 'no' if updated else 'all'

        # there are three basic scenarios:
        # 1. save modifications to any already tracked content
        # 2. save any content (including removal of deleted content)
        #    to bring things to a clean state
        # 3. like (2), but only operate on a given subset of content
        #    identified by paths
        # - all three have to work in conjunction with --recursive
        # - the difference between (1) and (2) should be no more
        #   that a switch from --untracked=no to --untracked=all
        #   in Repo.save()

        # we do not support
        # - simultaneous operations on multiple datasets from disjoint
        #   dataset hierarchies, hence a single reference dataset must be
        #   identifiable from either
        #   - curdir or
        #   - the `dataset` argument.
        #   This avoids complex annotation loops and hierarchy tracking.
        # - any modification upwards from the root dataset

        ds = require_dataset(dataset, check_installed=True, purpose='save')

        # use status() to do all discovery and annotation of paths
        paths_by_ds = {}
        for s in Status()(
                # ATTN: it is vital to pass the `dataset` argument as it,
                # and not a dataset instance in order to maintain the path
                # semantics between here and the status() call
                dataset=dataset,
                path=path,
                untracked=untracked_mode,
                recursive=recursive,
                recursion_limit=recursion_limit,
                on_failure='ignore',
                # for save without recursion only commit matters
                eval_subdataset_state='full' if recursive else 'commit',
                return_type='generator',
                # this could be, but for now only 'error' results are handled
                # below
                #on_failure='ignore',
                result_renderer='disabled'):
            if s['status'] == 'error':
                # Downstream code can't do anything with these. Let the caller
                # decide their fate.
                yield s
                continue

            # fish out status dict for this parent dataset
            ds_status = paths_by_ds.get(s['parentds'], {})
            # reassemble path status info as repo.status() would have made it
            ds_status[ut.Path(s['path'])] = \
                {k: v for k, v in s.items()
                 if k not in (
                     'path', 'parentds', 'refds', 'status', 'action',
                     'logger')}
            paths_by_ds[s['parentds']] = ds_status

        lgr.debug('Determined %i datasets for saving from input arguments',
                  len(paths_by_ds))
        # figure out what datasets to process, start with the ones containing
        # the paths that were given as arguments
        discovered_datasets = list(paths_by_ds.keys())
        if dataset:
            # if a reference dataset was given we want to save all the way up
            # to it, so let's throw it into the mix
            discovered_datasets.append(ds.path)
        # sort the datasets into (potentially) disjoint hierarchies,
        # or a single one, if a reference dataset was given
        dataset_hierarchies = get_tree_roots(discovered_datasets)
        for rootds, children in dataset_hierarchies.items():
            edges = {}
            discover_dataset_trace_to_targets(
                rootds, children, [], edges, includeds=children)
            for superds, subdss in edges.items():
                superds_status = paths_by_ds.get(superds, {})
                for subds in subdss:
                    subds_path = ut.Path(subds)
                    sub_status = superds_status.get(subds_path, {})
                    if not (sub_status.get("state") == "clean" and
                            sub_status.get("type") == "dataset"):
                        # TODO actually start from an entry that may already
                        # exist in the status record
                        superds_status[subds_path] = dict(
                            # shot from the hip, some status config
                            # to trigger this specific super/sub
                            # relation to be saved
                            state='untracked',
                            type='dataset')
                paths_by_ds[superds] = superds_status

        def save_ds(args, version_tag=None):
            pdspath, paths = args

            pds = Dataset(pdspath)
            pds_repo = pds.repo
            # pop status for this dataset, we are not coming back to it
            pds_status = {
                # for handing over to the low-level code, we recode any
                # path relative to the real repo location; this avoids
                # cumbersome symlink handling without context in the
                # lower levels
                pds_repo.pathobj / p.relative_to(pdspath): props
                for p, props in paths.items()}
            start_commit = pds_repo.get_hexsha()
            if not all(p['state'] == 'clean' for p in pds_status.values()) or \
                    (amend and message):
                for res in pds_repo.save_(
                        message=message,
                        # make sure to have the `path` arg be None, as we want
                        # to prevent and bypass any additional repo.status()
                        # calls
                        paths=None,
                        # prevent whining of GitRepo
                        git=True if not hasattr(ds.repo, 'uuid')
                        else to_git,
                        # we are supplying the full status already, do not
                        # detect anything else
                        untracked='no',
                        _status=pds_status,
                        amend=amend):
                    # TODO remove stringification when datalad-core can handle
                    # path objects, or when PY3.6 is the lowest supported
                    # version
                    for k in ('path', 'refds'):
                        if k in res:
                            res[k] = str(
                                # recode path back to dataset path anchor
                                pds.pathobj / res[k].relative_to(
                                    pds_repo.pathobj)
                            )
                    yield res
            # report on the dataset itself
            dsres = dict(
                action='save',
                type='dataset',
                path=pds.path,
                refds=ds.path,
                status='ok'
                if start_commit != pds_repo.get_hexsha()
                else 'notneeded',
                logger=lgr,
            )
            if not version_tag:
                yield dsres
                return
            try:
                # method requires str
                version_tag = str(version_tag)
                pds_repo.tag(version_tag)
                dsres.update(
                    status='ok',
                    version_tag=version_tag)
                yield dsres
            except CommandError as e:
                if dsres['status'] == 'ok':
                    # first we yield the result for the actual save
                    # TODO: we will get duplicate dataset/save record obscuring
                    # progress reporting.  yoh thought to decouple "tag" from "save"
                    # messages but was worrying that original authors would disagree
                    yield dsres.copy()
                # and now complain that tagging didn't work
                dsres.update(
                    status='error',
                    message=('cannot tag this version: %s', e.stderr.strip()))
                yield dsres

        if not paths_by_ds:
            # Special case: empty repo. There's either an empty commit only or
            # none at all. An empty one we can amend otherwise there's nothing
            # to do.
            if amend and ds.repo.get_hexsha():
                yield from save_ds((ds.pathobj, dict()), version_tag=version_tag)

            else:
                yield dict(action='save',
                           type='dataset',
                           path=ds.path,
                           refds=ds.path,
                           status='notneeded',
                           logger=lgr)
            return

        # TODO: in principle logging could be improved to go not by a dataset
        # but by path(s) within subdatasets. That should provide a bit better ETA
        # and more "dynamic" feedback than jumpy datasets count.
        # See addurls where it is implemented that way by providing agg and another
        # log_filter
        yield from ProducerConsumerProgressLog(
            sorted(paths_by_ds.items(), key=lambda v: v[0], reverse=True),
            partial(save_ds, version_tag=version_tag),
            safe_to_consume=no_subds_in_futures,
            producer_future_key=lambda ds_items: ds_items[0],
            jobs=jobs,
            log_filter=_log_filter_save_dataset,
            unit="datasets",
            lgr=lgr,
        )
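# --- Usage sketch (added for illustration, not part of the original listing) ---
# Assuming 'save' is bound as a dataset method as registered above;
# 'myfile.txt' and the tag name are illustrative. This records the change to
# a single file with a message and tags the resulting dataset state.
from datalad.api import Dataset

ds = Dataset('.')
for res in ds.save(path='myfile.txt',
                   message='add file',
                   version_tag='v0.1',
                   return_type='generator',
                   result_renderer='disabled'):
    print(res['action'], res['status'], res['path'])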
Example no. 16
0
class Push(Interface):
    """Push a dataset to a known :term:`sibling`.

    This makes a saved state of a dataset available to a sibling or special
    remote data store of a dataset. Any target sibling must already exist and
    be known to the dataset.

    || REFLOW >>
    By default, all files tracked in the last saved state (of the current
    branch) will be copied to the target location. Optionally, it is possible
    to limit a push to changes relative to a particular point in the version
    history of a dataset (e.g. a release tag) using the
    [CMD: --since CMD][PY: since PY] option in conjunction with the
    specification of a reference dataset. In recursive mode subdatasets will also be
    evaluated, and only those subdatasets are pushed where a change was
    recorded that is reflected in the current state of the top-level reference
    dataset.
    << REFLOW ||

    .. note::
      Power-user info: This command uses :command:`git push`, and :command:`git
      annex copy` to push a dataset. Publication targets are either configured
      remote Git repositories, or git-annex special remotes (if they support
      data upload).
    """

    # TODO add examples

    _params_ = dict(
        dataset=Parameter(args=("-d", "--dataset"),
                          doc="""specify the dataset to push""",
                          constraints=EnsureDataset() | EnsureNone()),
        to=Parameter(
            args=("--to", ),
            metavar='SIBLING',
            doc="""name of the target sibling. If no name is given an attempt is
            made to identify the target based on the dataset's configuration
            (i.e. a configured tracking branch, or a single sibling that is
            configured for push)""",
            constraints=EnsureStr() | EnsureNone()),
        since=Parameter(
            args=("--since", ),
            constraints=EnsureStr() | EnsureNone(),
            doc=
            """specifies commit-ish (tag, shasum, etc.) from which to look for
            changes to decide whether pushing is necessary.
            If '^' is given, the last state of the current branch at the sibling
            is taken as a starting point."""),
        path=Parameter(args=("path", ),
                       metavar='PATH',
                       doc="""path to constrain a push to. If given, only
            data or changes for those paths are considered for a push.""",
                       nargs='*',
                       constraints=EnsureStr() | EnsureNone()),
        data=Parameter(
            args=("--data", ),
            doc="""what to do with (annex'ed) data. 'anything' would cause
            transfer of all annexed content, 'nothing' would avoid a call to
            `git annex copy` altogether. 'auto' would use 'git annex copy' with
            '--auto' thus transferring only data which would satisfy "wanted"
            or "numcopies" settings for the remote (thus "nothing" otherwise).
            'auto-if-wanted' would enable '--auto' mode only if there is a 
            "wanted" setting for the remote, and transfer 'anything' otherwise.
            """,
            constraints=EnsureChoice('anything', 'nothing', 'auto',
                                     'auto-if-wanted')),
        force=Parameter(
            # multi-mode option https://github.com/datalad/datalad/issues/3414
            args=(
                "-f",
                "--force",
            ),
            doc="""force particular operations, possibly overruling safety
            protections or optimizations: use --force with git-push ('gitpush');
            do not use --fast with git-annex copy ('checkdatapresent');
            combine all force modes ('all').""",
            constraints=EnsureChoice('all', 'gitpush', 'checkdatapresent',
                                     None)),
        recursive=recursion_flag,
        recursion_limit=recursion_limit,
        jobs=jobs_opt,
    )

    # Desired features:
    # - let Git do its thing (push multiple configured refs without the need
    #                         to specify anything on the command line)
    #   - complication: we need publication dependencies (i.e. publish what
    #     would be published by Git to a different remote first, hence we
    #     cannot simply watch Git do it, and later act on it.)
    #   - https://github.com/datalad/datalad/issues/1284
    #   - https://github.com/datalad/datalad/issues/4006
    # - make differences between remotes and various types of special remotes
    #   opaque
    #   - https://github.com/datalad/datalad/issues/3127
    # - informative and comprehensive (error) reporting
    #   - https://github.com/datalad/datalad/issues/2000
    #   - https://github.com/datalad/datalad/issues/1682
    #   - https://github.com/datalad/datalad/issues/2029
    #   - https://github.com/datalad/datalad/issues/2855
    #   - https://github.com/datalad/datalad/issues/3412
    #   - https://github.com/datalad/datalad/issues/3424
    # - ensure robust behavior in multi-lateral push scenarios (updating
    #   a dataset that was updated by a 3rd-party after the last known
    #   fetched change)
    #   - https://github.com/datalad/datalad/issues/2636
    # - should NOT mimic `publish` in that it mixes `create-sibling` and
    #   `push` into a single operation. This would fold the complexity
    #   of all possible ways a local dataset hierarchy could possibly be
    #   connected to remote ends into this command. It would be a lost battle
    #   from the start.
    #   - not tackle: https://github.com/datalad/datalad/issues/2186
    # - maintain standard setup, and not reflect procedural aspects
    #   onto the resulting outcomes
    #   - https://github.com/datalad/datalad/issues/2001
    # - do a straight push, nothing like 'sync'. If a remote has something that
    #   needs merging first, fail and let users update. Any diff we are missing
    #   locally can impact decision making via --since and friends.

    @staticmethod
    @datasetmethod(name='push')
    @eval_results
    def __call__(path=None,
                 dataset=None,
                 to=None,
                 since=None,
                 data='auto-if-wanted',
                 force=None,
                 recursive=False,
                 recursion_limit=None,
                 jobs=None):
        # push uses '^' to annotate the previously pushed committish, and None for
        # default behavior. '' was/is used in `publish` (to be deprecated); alert
        # the user about this mistake
        if since == '':
            raise ValueError("'since' should point to commitish or use '^'.")
        # we resolve here, because we need to perform inspection on what was given
        # as an input argument further down
        paths = [resolve_path(p, dataset) for p in assure_list(path)]

        ds = require_dataset(dataset, check_installed=True, purpose='pushing')
        ds_repo = ds.repo

        res_kwargs = dict(
            action='publish',
            refds=ds.path,
            logger=lgr,
        )

        get_remote_kwargs = {'exclude_special_remotes': False} \
            if isinstance(ds_repo, AnnexRepo) else {}
        if to and to not in ds_repo.get_remotes(**get_remote_kwargs):
            # get again for proper error:
            sr = ds_repo.get_remotes(**get_remote_kwargs)
            # yield an error result instead of raising a ValueError,
            # to enable the use case of pushing to a target that
            # a superdataset doesn't know, but some subdatasets do
            # (in combination with '--on-failure ignore')
            yield dict(res_kwargs,
                       status='error',
                       path=ds.path,
                       message="Unknown push target '{}'. {}".format(
                           to, 'Known targets: {}.'.format(', '.join(
                               repr(s) for s in sr))
                           if sr else 'No targets configured in dataset.'))
            return

        if since == '^':
            # figure out state of remote branch and set `since`
            since = _get_corresponding_remote_state(ds_repo, to)
            if not since:
                lgr.info("No tracked remote for active branch, "
                         "detection of last pushed state not in effect.")
        elif since:
            # will blow with ValueError if unusable
            ds_repo.get_hexsha(since)

        # obtain a generator for information on the datasets to process
        # idea is to turn the `paths` argument into per-dataset
        # content listings that can be acted upon
        ds_spec = _datasets_since_(
            # important to pass unchanged dataset arg
            dataset,
            since,
            paths,
            recursive,
            recursion_limit)

        # instead of a loop, this could all be done in parallel
        matched_anything = False
        for dspath, dsrecords in ds_spec:
            matched_anything = True
            lgr.debug('Attempt push of Dataset at %s', dspath)
            pbars = {}
            yield from _push(dspath,
                             dsrecords,
                             to,
                             data,
                             force,
                             jobs,
                             res_kwargs.copy(),
                             pbars,
                             got_path_arg=True if path else False)
            # take down progress bars for this dataset
            for i, ds in pbars.items():
                log_progress(lgr.info, i, 'Finished push of %s', ds)
        if not matched_anything:
            yield dict(
                res_kwargs,
                status='notneeded',
                message=
                'Given constraints did not match any changes to publish',
                type='dataset',
                path=ds.path,
            )

    @staticmethod
    def custom_result_summary_renderer(results):  # pragma: more cover
        # report on any hints at the end
        # get all unique hints
        hints = set([r.get('hints', None) for r in results])
        hints = [hint for hint in hints if hint is not None]
        if hints:
            from datalad.ui import ui
            from datalad.support import ansi_colors
            intro = ansi_colors.color_word(
                "Potential hints to solve encountered errors: ",
                ansi_colors.YELLOW)
            ui.message(intro)
            [
                ui.message("{}: {}".format(
                    ansi_colors.color_word(id + 1, ansi_colors.YELLOW), hint))
                for id, hint in enumerate(hints)
            ]
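# --- Usage sketch (added for illustration, not part of the original listing) ---
# Assuming a sibling named 'origin' is already configured for the dataset.
# since='^' limits the push to changes since the last state known to be
# present at the sibling, and data='auto-if-wanted' only enables 'git annex
# copy --auto' if a "wanted" setting exists for the remote.
from datalad.api import Dataset

ds = Dataset('.')
for res in ds.push(to='origin',
                   since='^',
                   data='auto-if-wanted',
                   return_type='generator',
                   result_renderer='disabled',
                   on_failure='ignore'):
    print(res['status'], res.get('path'))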
Example no. 17
0
class Run(Interface):
    """Run an arbitrary shell command and record its impact on a dataset.

    It is recommended to craft the command such that it can run in the root
    directory of the dataset that the command will be recorded in. However,
    as long as the command is executed somewhere underneath the dataset root,
    the exact location will be recorded relative to the dataset root.

    If the executed command did not alter the dataset in any way, no record of
    the command execution is made.

    If the given command errors, a `CommandError` exception with the same exit
    code will be raised, and no modifications will be saved.

    *Command format*

    || REFLOW >>
    A few placeholders are supported in the command via Python format
    specification. "{pwd}" will be replaced with the full path of the current
    working directory. "{dspath}" will be replaced with the full path of the
    dataset that run is invoked on. "{tmpdir}" will be replaced with the full
    path of a temporary directory. "{inputs}" and "{outputs}" represent the
    values specified by [CMD: --input and --output CMD][PY: `inputs` and
    `outputs` PY]. If multiple values are specified, the values will be joined
    by a space. The order of the values will match that order from the command
    line, with any globs expanded in alphabetical order (like bash). Individual
    values can be accessed with an integer index (e.g., "{inputs[0]}").
    << REFLOW ||

    || REFLOW >>
    Note that the representation of the inputs or outputs in the formatted
    command string depends on whether the command is given as a list of
    arguments or as a string[CMD:  (quotes surrounding the command) CMD]. The
    concatenated list of inputs or outputs will be surrounded by quotes when
    the command is given as a list but not when it is given as a string. This
    means that the string form is required if you need to pass each input as a
    separate argument to a preceding script (i.e., write the command as
    "./script {inputs}", quotes included). The string form should also be used
    if the input or output paths contain spaces or other characters that need
    to be escaped.
    << REFLOW ||

    To escape a brace character, double it (i.e., "{{" or "}}").

    Custom placeholders can be added as configuration variables under
    "datalad.run.substitutions".  As an example:

      Add a placeholder "name" with the value "joe"::

        % git config --file=.datalad/config datalad.run.substitutions.name joe
        % datalad add -m "Configure name placeholder" .datalad/config

      Access the new placeholder in a command::

        % datalad run "echo my name is {name} >me"
    """
    _examples_ = [
        dict(
            text="Run an executable script and record the impact on a dataset",
            code_py="run(message='run my script', cmd='code/script.sh')",
            code_cmd="datalad run -m 'run my script' 'code/script.sh'"),
        dict(text="Run a command and specify a directory as a dependency "
             "for the run. The contents of the dependency will be retrieved "
             "prior to running the script",
             code_cmd="datalad run -m 'run my script' -i 'data/*' "
             "'code/script.sh'",
             code_py="""\
             run(cmd='code/script.sh', message='run my script',
                 inputs=['data/*'])"""),
        dict(text="Run an executable script and specify output files of the "
             "script to be unlocked prior to running the script",
             code_py="""\
             run(cmd='code/script.sh', message='run my script',
                 inputs=['data/*'], outputs=['output_dir'])""",
             code_cmd="""\
             datalad run -m 'run my script' -i 'data/*' \\
             -o 'output_dir/*' 'code/script.sh'"""),
        dict(text="Specify multiple inputs and outputs",
             code_py="""\
             run(cmd='code/script.sh',
                 message='run my script',
                 inputs=['data/*', 'datafile.txt'],
                 outputs=['output_dir', 'outfile.txt'])""",
             code_cmd="""\
             datalad run -m 'run my script' -i 'data/*' \\
             -i 'datafile.txt' -o 'output_dir/*' -o \\
             'outfile.txt' 'code/script.sh'""")
    ]

    _params_ = dict(
        cmd=Parameter(
            args=("cmd", ),
            nargs=REMAINDER,
            metavar='COMMAND',
            doc="""command for execution. A leading '--' can be used to
            disambiguate this command from the preceding options to
            DataLad."""),
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc="""specify the dataset to record the command results in.
            An attempt is made to identify the dataset based on the current
            working directory. If a dataset is given, the command will be
            executed in the root directory of this dataset.""",
            constraints=EnsureDataset() | EnsureNone()),
        inputs=Parameter(
            args=("-i", "--input"),
            dest="inputs",
            metavar=("PATH"),
            action='append',
            doc="""A dependency for the run. Before running the command, the
            content of this file will be retrieved. A value of "." means "run
            :command:`datalad get .`". The value can also be a glob. [CMD: This
            option can be given more than once. CMD]"""),
        outputs=Parameter(
            args=("-o", "--output"),
            dest="outputs",
            metavar=("PATH"),
            action='append',
            doc="""Prepare this file to be an output file of the command. A
            value of "." means "run :command:`datalad unlock .`" (and will fail
            if some content isn't present). For any other value, if the content
            of this file is present, unlock the file. Otherwise, remove it. The
            value can also be a glob. [CMD: This option can be given more than
            once. CMD]"""),
        expand=Parameter(
            args=("--expand", ),
            doc="""Expand globs when storing inputs and/or outputs in the
            commit message.""",
            constraints=EnsureChoice(None, "inputs", "outputs", "both")),
        explicit=Parameter(
            args=("--explicit", ),
            action="store_true",
            doc="""Consider the specification of inputs and outputs to be
            explicit. Don't warn if the repository is dirty, and only save
            modifications to the listed outputs."""),
        message=save_message_opt,
        sidecar=Parameter(args=('--sidecar', ),
                          metavar="{yes|no}",
                          doc="""By default, the configuration variable
            'datalad.run.record-sidecar' determines whether a record with
            information on a command's execution is placed into a separate
            record file instead of the commit message (default: off). This
            option can be used to override the configured behavior on a
            case-by-case basis. Sidecar files are placed into the dataset's
            '.datalad/runinfo' directory (customizable via the
            'datalad.run.record-directory' configuration variable).""",
                          constraints=EnsureNone() | EnsureBool()),
    )

    @staticmethod
    @datasetmethod(name='run')
    @eval_results
    def __call__(cmd=None,
                 dataset=None,
                 inputs=None,
                 outputs=None,
                 expand=None,
                 explicit=False,
                 message=None,
                 sidecar=None):
        for r in run_command(cmd,
                             dataset=dataset,
                             inputs=inputs,
                             outputs=outputs,
                             expand=expand,
                             explicit=explicit,
                             message=message,
                             sidecar=sidecar):
            yield r
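# --- Usage sketch (added for illustration, not part of the original listing) ---
# Assuming 'run' is bound as a dataset method as registered above and that
# 'code/script.sh' exists in the dataset (illustrative path). Inputs are
# retrieved and outputs unlocked before execution; the command is given as a
# string so that each expanded input becomes a separate argument.
from datalad.api import Dataset

ds = Dataset('.')
for res in ds.run('code/script.sh {inputs}',
                  inputs=['data/*'],
                  outputs=['output_dir'],
                  message='run my script',
                  return_type='generator',
                  result_renderer='disabled'):
    print(res.get('action'), res.get('status'))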
Example no. 18
0
class CreateSibling(Interface):
    """Create a dataset sibling on a UNIX-like SSH-accessible machine

    Given a local dataset, and SSH login information this command creates
    a remote dataset repository and configures it as a dataset sibling to
    be used as a publication target (see `publish` command).

    Various properties of the remote sibling can be configured (e.g. name,
    location on the server, read and write access URLs, and access
    permissions).

    Optionally, a basic web-viewer for DataLad datasets can be installed
    at the remote location.

    This command supports recursive processing of dataset hierarchies, creating
    a remote sibling for each dataset in the hierarchy. By default, remote
    siblings are created in hierarchical structure that reflects the
    organization on the local file system. However, a simple templating
    mechanism is provided to produce a flat list of datasets (see
    --target-dir).
    """
    # XXX prevent common args from being added to the docstring
    _no_eval_results = True

    _params_ = dict(
        # TODO: Figure out, whether (and when) to use `sshurl` as push url
        dataset=Parameter(
            args=(
                "--dataset",
                "-d",
            ),
            doc="""specify the dataset to create the publication target for. If
                no dataset is given, an attempt is made to identify the dataset
                based on the current working directory""",
            constraints=EnsureDataset() | EnsureNone()),
        sshurl=Parameter(
            args=("sshurl", ),
            metavar='SSHURL',
            nargs='?',
            doc="""Login information for the target server. This can be given
                as a URL (ssh://host/path) or SSH-style (user@host:path).
                Unless overridden, this also serves as the future dataset's access
                URL and path on the server.""",
            constraints=EnsureStr()),
        name=Parameter(
            args=(
                '-s',
                '--name',
            ),
            metavar='NAME',
            doc="""sibling name to create for this publication target.
                If `recursive` is set, the same name will be used to label all
                the subdatasets' siblings. When creating a target dataset fails,
                no sibling is added""",
            constraints=EnsureStr() | EnsureNone(),
            nargs="?"),
        target_dir=Parameter(
            args=('--target-dir', ),
            metavar='PATH',
            doc="""path to the directory *on the server* where the dataset
                shall be created. By default the SSH access URL is used to
                identify this directory. If a relative path is provided here,
                it is interpreted as being relative to the user's home
                directory on the server.\n
                Additional features are relevant for recursive processing of
                datasets with subdatasets. By default, the local
                dataset structure is replicated on the server. However, it is
                possible to provide a template for generating different target
                directory names for all (sub)datasets. Templates can contain
                certain placeholder that are substituted for each (sub)dataset.
                For example: "/mydirectory/dataset%%RELNAME".\nSupported
                placeholders:\n
                %%RELNAME - the name of the datasets, with any slashes replaced by
                dashes\n""",
            constraints=EnsureStr() | EnsureNone()),
        target_url=Parameter(
            args=('--target-url', ),
            metavar='URL',
            doc=""""public" access URL of the to-be-created target dataset(s)
                (default: `sshurl`). Accessibility of this URL determines the
                access permissions of potential consumers of the dataset.
                As with `target_dir`, templates (same set of placeholders)
                are supported.  Also, if specified, it is provided as the annex
                description\n""",
            constraints=EnsureStr() | EnsureNone()),
        target_pushurl=Parameter(
            args=('--target-pushurl', ),
            metavar='URL',
            doc="""In case the `target_url` cannot be used to publish to the
                dataset, this option specifies an alternative URL for this
                purpose. As with `target_url`, templates (same set of
                placeholders) are supported.\n""",
            constraints=EnsureStr() | EnsureNone()),
        recursive=recursion_flag,
        recursion_limit=recursion_limit,
        existing=Parameter(
            args=("--existing", ),
            constraints=EnsureChoice('skip', 'replace', 'error',
                                     'reconfigure'),
            metavar='MODE',
            doc=
            """action to perform, if a sibling is already configured under the
            given name and/or a target directory already exists.
            In this case, a dataset can be skipped ('skip'), an existing target
            directory can be forcefully re-initialized and the sibling
            (re-)configured ('replace', implies 'reconfigure'), only the sibling
            configuration can be updated ('reconfigure'), or an error can be
            raised ('error').""",
        ),
        inherit=inherit_opt,
        shared=Parameter(
            args=("--shared", ),
            metavar='false|true|umask|group|all|world|everybody|0xxx',
            doc="""if given, configures the access permissions on the server
            for multi-users (this could include access by a webserver!).
            Possible values for this option are identical to those of
            `git init --shared` and are described in its documentation.""",
            constraints=EnsureStr() | EnsureBool() | EnsureNone()),
        ui=Parameter(args=("--ui", ),
                     metavar='false|true|html_filename',
                     doc="""publish a web interface for the dataset with an
            optional user-specified name for the HTML file at the publication
            target. Defaults to `index.html` at the dataset root""",
                     constraints=EnsureBool() | EnsureStr()),
        as_common_datasrc=as_common_datasrc,
        publish_depends=publish_depends,
        publish_by_default=publish_by_default,
        annex_wanted=annex_wanted_opt,
        annex_group=annex_group_opt,
        annex_groupwanted=annex_groupwanted_opt,
        since=Parameter(
            args=("--since", ),
            constraints=EnsureStr() | EnsureNone(),
            doc=
            """limit processing to datasets that have been changed since a given
            state (by tag, branch, commit, etc). This can be used to create siblings
            for recently added subdatasets."""),
    )

    @staticmethod
    @datasetmethod(name='create_sibling')
    @eval_results
    def __call__(sshurl,
                 name=None,
                 target_dir=None,
                 target_url=None,
                 target_pushurl=None,
                 dataset=None,
                 recursive=False,
                 recursion_limit=None,
                 existing='error',
                 shared=None,
                 ui=False,
                 as_common_datasrc=None,
                 publish_by_default=None,
                 publish_depends=None,
                 annex_wanted=None,
                 annex_group=None,
                 annex_groupwanted=None,
                 inherit=False,
                 since=None):

        # there is no point in doing anything further
        not_supported_on_windows(
            "Support for SSH connections is not yet implemented in Windows")

        #
        # nothing without a base dataset
        #
        ds = require_dataset(dataset,
                             check_installed=True,
                             purpose='creating a sibling')
        refds_path = ds.path

        #
        # all checks that are possible before we start parsing the dataset
        #

        # possibly use sshurl to get the name in case if not specified
        if not sshurl:
            if not inherit:
                raise InsufficientArgumentsError(
                    "needs at least an SSH URL, if no inherit option")
            if name is None:
                raise ValueError(
                    "Neither SSH URL, nor the name of sibling to inherit from "
                    "was specified")
            # It might well be that we already have this remote setup
            try:
                sshurl = CreateSibling._get_remote_url(ds, name)
            except Exception as exc:
                lgr.debug('%s does not know about url for %s: %s', ds, name,
                          exc_str(exc))
        elif inherit:
            raise ValueError(
                "For now, for clarity not allowing specifying a custom sshurl "
                "while inheriting settings")
            # maybe this restriction could be safely dropped -- still WiP

        if not sshurl:
            # TODO: maybe more backup before _prep?
            super_ds = ds.get_superdataset()
            if not super_ds:
                raise ValueError(
                    "Could not determine super dataset for %s to inherit URL" %
                    ds)
            super_url = CreateSibling._get_remote_url(super_ds, name)
            # for now assuming hierarchical setup
            # (TODO: to be able to distinguish between the two, probably
            # needs storing datalad.*.target_dir to have %RELNAME in there)
            sshurl = slash_join(super_url, relpath(ds.path, super_ds.path))

        # check the login URL
        sshri = RI(sshurl)
        if not is_ssh(sshri):
            raise ValueError(
                "Unsupported SSH URL: '{0}', "
                "use ssh://host/path or host:path syntax".format(sshurl))

        if not name:
            # use the hostname as default remote name
            name = sshri.hostname
            lgr.debug(
                "No sibling name given, use URL hostname '%s' as sibling name",
                name)

        if since == '':
            # default behavior - only updated since last update
            # so we figure out what was the last update
            # XXX here we assume one to one mapping of names from local branches
            # to the remote
            active_branch = ds.repo.get_active_branch()
            since = '%s/%s' % (name, active_branch)

        #
        # parse the base dataset to find all subdatasets that need processing
        #
        to_process = []
        for ap in AnnotatePaths.__call__(
                dataset=refds_path,
                # only a single path!
                path=refds_path,
                recursive=recursive,
                recursion_limit=recursion_limit,
                action='create_sibling',
                # both next should not happen anyways
                unavailable_path_status='impossible',
                nondataset_path_status='error',
                modified=since,
                return_type='generator',
                on_failure='ignore'):
            if ap.get('status', None):
                # this is done
                yield ap
                continue
            if ap.get('type', None) != 'dataset' or ap.get('state',
                                                           None) == 'absent':
                # this can happen when there is `since`, but we have no
                # use for anything but datasets here
                continue
            checkds_remotes = Dataset(ap['path']).repo.get_remotes() \
                if ap.get('state', None) != 'absent' \
                else []
            if publish_depends:
                # make sure dependencies are valid
                # TODO: inherit -- we might want to automagically create
                # those dependents as well???
                unknown_deps = set(
                    assure_list(publish_depends)).difference(checkds_remotes)
                if unknown_deps:
                    ap['status'] = 'error'
                    ap['message'] = (
                        'unknown sibling(s) specified as publication dependency: %s',
                        unknown_deps)
                    yield ap
                    continue
            if name in checkds_remotes and existing in ('error', 'skip'):
                ap['status'] = 'error' if existing == 'error' else 'notneeded'
                ap['message'] = (
                    "sibling '%s' already configured (specify alternative name, or force "
                    "reconfiguration via --existing", name)
                yield ap
                continue
            to_process.append(ap)

        if not to_process:
            # we ruled out all possibilities
            # TODO wait for gh-1218 and make better return values
            lgr.info("No datasets qualify for sibling creation. "
                     "Consider different settings for --existing "
                     "or --since if this is unexpected")
            return

        if target_dir is None:
            if sshri.path:
                target_dir = sshri.path
            else:
                target_dir = '.'

        # TODO: centralize and generalize template symbol handling
        replicate_local_structure = "%RELNAME" not in target_dir

        # request ssh connection:
        lgr.info("Connecting ...")
        assert (sshurl is not None)  # delayed sanity check
        ssh = ssh_manager.get_connection(sshurl)
        if not ssh.get_annex_version():
            raise MissingExternalDependency('git-annex',
                                            msg='on the remote system')

        #
        # all checks done and we have a connection, now do something
        #

        # loop over all datasets, ordered from top to bottom to make test
        # below valid (existing directories would cause the machinery to halt)
        # But we need to run post-update hook in depth-first fashion, so
        # would only collect first and then run (see gh #790)
        yielded = set()
        remote_repos_to_run_hook_for = []
        for currentds_ap in \
                sorted(to_process, key=lambda x: x['path'].count('/')):
            current_ds = Dataset(currentds_ap['path'])

            path = _create_dataset_sibling(
                name, current_ds, ds.path, ssh, replicate_local_structure,
                sshri, target_dir, target_url, target_pushurl, existing,
                shared, publish_depends, publish_by_default, as_common_datasrc,
                annex_wanted, annex_group, annex_groupwanted, inherit)
            if not path:
                # nothing new was created
                # TODO is 'notneeded' appropriate in this case?
                currentds_ap['status'] = 'notneeded'
                # TODO explain status in 'message'
                yield currentds_ap
                yielded.add(currentds_ap['path'])
                continue
            remote_repos_to_run_hook_for.append((path, currentds_ap))

            # publish web-interface to root dataset on publication server
            if current_ds.path == ds.path and ui:
                lgr.info("Uploading web interface to %s" % path)
                try:
                    CreateSibling.upload_web_interface(path, ssh, shared, ui)
                except CommandError as e:
                    currentds_ap['status'] = 'error'
                    currentds_ap['message'] = (
                        "failed to push web interface to the remote datalad repository (%s)",
                        exc_str(e))
                    yield currentds_ap
                    yielded.add(currentds_ap['path'])
                    continue

        # in reverse order would be depth first
        lgr.info("Running post-update hooks in all created siblings")
        # TODO: add progressbar
        for path, currentds_ap in remote_repos_to_run_hook_for[::-1]:
            # Trigger the hook
            lgr.debug("Running hook for %s", path)
            try:
                ssh("cd {} && hooks/post-update".format(
                    sh_quote(_path_(path, ".git"))))
            except CommandError as e:
                currentds_ap['status'] = 'error'
                currentds_ap['message'] = (
                    "failed to run post-update hook under remote path %s (%s)",
                    path, exc_str(e))
                yield currentds_ap
                yielded.add(currentds_ap['path'])
                continue
            if currentds_ap['path'] not in yielded:
                # if we were silent until now everything is just splendid
                currentds_ap['status'] = 'ok'
                yield currentds_ap

    @staticmethod
    def _get_ds_remote_shared_setting(ds, name, ssh):
        """Figure out setting of sharedrepository for dataset's `name` remote"""
        shared = None
        try:
            current_super_url = CreateSibling._get_remote_url(ds, name)
            current_super_ri = RI(current_super_url)
            out, err = ssh('git -C {} config --get core.sharedrepository'.format(
                # TODO -- we might need to expanduser taking .user into account
                # but then it must be done also on remote side
                sh_quote(current_super_ri.path)))
            shared = out.strip()
            if err:
                lgr.warning("Got stderr while calling ssh: %s", err)
        except CommandError as e:
            lgr.debug(
                "Could not figure out remote shared setting of %s for %s due "
                "to %s", ds, name, exc_str(e))
            # could well be ok if e.g. not shared
            # TODO: more detailed analysis may be?
        return shared

    @staticmethod
    def _get_remote_url(ds, name):
        """A little helper to get url from pushurl or from url if not defined"""
        # take pushurl if present, if not -- just a url
        url = ds.config.get('remote.%s.pushurl' % name) or \
            ds.config.get('remote.%s.url' % name)
        if not url:
            raise ValueError("%s had neither pushurl or url defined for %s" %
                             (ds, name))
        return url

    @staticmethod
    def init_remote_repo(path, ssh, shared, dataset, description=None):
        cmd = "git -C {} init{}".format(
            sh_quote(path),
            " --shared='{}'".format(sh_quote(shared)) if shared else '')
        try:
            ssh(cmd)
        except CommandError as e:
            lgr.error("Initialization of remote git repository failed at %s."
                      "\nError: %s\nSkipping ..." % (path, exc_str(e)))
            return False

        if isinstance(dataset.repo, AnnexRepo):
            # init remote git annex repo (part fix of #463)
            try:
                ssh("git -C {} annex init {}".format(
                    sh_quote(path),
                    sh_quote(description) if description else ''))
            except CommandError as e:
                lgr.error(
                    "Initialization of remote git annex repository failed at %s."
                    "\nError: %s\nSkipping ...", path, exc_str(e))
                return False
        return True

    @staticmethod
    def create_postupdate_hook(path, ssh, dataset):
        # location of post-update hook file, logs folder on remote target
        hooks_remote_dir = opj(path, '.git', 'hooks')
        # make sure hooks directory exists (see #1251)
        ssh('mkdir -p {}'.format(sh_quote(hooks_remote_dir)))
        hook_remote_target = opj(hooks_remote_dir, 'post-update')

        # create json command for current dataset
        log_filename = 'datalad-publish-hook-$(date +%s).log' % TIMESTAMP_FMT
        hook_content = r'''#!/bin/bash

git update-server-info

#
# DataLad
#
# (Re)generate meta-data for DataLad Web UI and possibly init new submodules
dsdir="$(dirname $0)/../.."
logfile="$dsdir/{WEB_META_LOG}/{log_filename}"

if [ ! -e "$dsdir/.git" ]; then
  echo Assumption of being under .git has failed >&2
  exit 1
fi

mkdir -p "$dsdir/{WEB_META_LOG}"  # assure logs directory exists

( which datalad > /dev/null \
  && ( cd "$dsdir"; GIT_DIR="$PWD/.git" datalad ls -a --json file .; ) \
  || echo "E: no datalad found - skipping generation of indexes for web frontend"; \
) &> "$logfile"

# Some submodules might have been added and thus we better init them
( cd "$dsdir"; git submodule update --init || : ; ) >> "$logfile" 2>&1
'''.format(WEB_META_LOG=WEB_META_LOG, **locals())

        with make_tempfile(content=hook_content) as tempf:
            # create post_update hook script
            # upload hook to dataset
            ssh.copy(tempf, hook_remote_target)
        # and make it executable
        ssh('chmod +x {}'.format(sh_quote(hook_remote_target)))

    @staticmethod
    def upload_web_interface(path, ssh, shared, ui):
        # path to web interface resources on local
        webui_local = opj(dirname(datalad.__file__), 'resources', 'website')
        # local html to dataset
        html_local = opj(webui_local, "index.html")

        # name and location of web-interface html on target
        html_targetname = ui if isinstance(ui, str) else "index.html"
        html_target = opj(path, html_targetname)

        # upload ui html to target
        ssh.copy(html_local, html_target)

        # upload assets to the dataset
        webresources_local = opj(webui_local, 'assets')
        webresources_remote = opj(path, WEB_HTML_DIR)
        ssh('mkdir -p {}'.format(sh_quote(webresources_remote)))
        ssh.copy(webresources_local, webresources_remote, recursive=True)

        # minimize and upload js assets
        for js_file in glob(opj(webresources_local, 'js', '*.js')):
            with open(js_file) as asset:
                try:
                    from jsmin import jsmin
                    # jsmin = lambda x: x   # no minimization
                    minified = jsmin(asset.read())  # minify asset
                except ImportError:
                    lgr.warning(
                        "Will not minify web interface javascript, no jsmin available"
                    )
                    minified = asset.read()  # no minify available
                with make_tempfile(content=minified
                                   ) as tempf:  # write minified to tempfile
                    js_name = js_file.split('/')[-1]
                    ssh.copy(tempf,
                             opj(webresources_remote, 'assets', 'js',
                                 js_name))  # and upload js

        # explicitly make web+metadata dir of dataset world-readable, if shared set to 'all'
        mode = None
        if shared in (True, 'true', 'all', 'world', 'everybody'):
            mode = 'a+rX'
        elif shared == 'group':
            mode = 'g+rX'
        elif str(shared).startswith('0'):
            mode = shared

        if mode:
            ssh('chmod {} -R {} {}'.format(
                mode, sh_quote(dirname(webresources_remote)),
                sh_quote(opj(path, 'index.html'))))
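
# Hedged usage sketch (editorial illustration, not part of the original example):
# one plausible way to drive the CreateSibling command above via the Python API.
# The dataset path, SSH URL, and sibling name are hypothetical placeholders.
from datalad.api import Dataset

ds = Dataset('/path/to/local/dataset')        # an already installed dataset
ds.create_sibling(
    'ssh://server.example.org/home/me/myds',  # sshurl (hypothetical host/path)
    name='myserver',                          # sibling name to register
    existing='skip',                          # skip datasets already configured
    ui=True,                                  # also publish the web interface
    recursive=True)                           # include subdatasets
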
Example No. 19
class Rerun(Interface):
    """Re-execute previous `datalad run` commands.

    This will unlock any dataset content that is on record to have
    been modified by the command in the specified revision.  It will
    then re-execute the command in the recorded path (if it was inside
    the dataset). Afterwards, all modifications will be saved.

    *Report mode*

    || REFLOW >>
    When called with [CMD: --report CMD][PY: report=True PY], this command
    reports information about what would be re-executed as a series of records.
    There will be a record for each revision in the specified revision range.
    Each of these will have one of the following "rerun_action" values:
    << REFLOW ||

      - run: the revision has a recorded command that would be re-executed
      - skip: the revision does not have a recorded command and would be
        skipped
      - pick: the revision does not have a recorded command and would be cherry
        picked

    The decision to skip rather than cherry pick a revision is based on whether
    the revision would be reachable from HEAD at the time of execution.

    In addition, when a starting point other than HEAD is specified, there is a
    rerun_action value "checkout", in which case the record includes
    information about the revision that would be checked out before rerunning
    any commands.

    Examples:

      Re-execute the command from the previous commit::

        % datalad rerun

      Re-execute any commands in the last five commits::

        % datalad rerun --since=HEAD~5

      Do the same as above, but re-execute the commands on top of
      HEAD~5 in a detached state::

        % datalad rerun --onto= --since=HEAD~5

      Re-execute all previous commands and compare the old and new
      results::

        % # on master branch
        % datalad rerun --branch=verify --since=
        % # now on verify branch
        % datalad diff --revision=master..
        % git log --oneline --left-right --cherry-pick master...

    .. note::
      Currently the "onto" feature only sets the working tree of the current
      dataset to a previous state. The working trees of any subdatasets remain
      unchanged.
    """
    _params_ = dict(
        revision=Parameter(
            args=("revision",),
            metavar="REVISION",
            nargs="?",
            doc="""rerun command(s) in `revision`. By default, the command from
            this commit will be executed, but [CMD: --since CMD][PY: `since`
            PY] can be used to construct a revision range.""",
            default="HEAD",
            constraints=EnsureStr()),
        since=Parameter(
            args=("--since",),
            doc="""If `since` is a commit-ish, the commands from all commits
            that are reachable from `revision` but not `since` will be
            re-executed (in other words, the commands in :command:`git log
            SINCE..REVISION`). If SINCE is an empty string, it is set to the
            parent of the first commit that contains a recorded command (i.e.,
            all commands in :command:`git log REVISION` will be
            re-executed).""",
            constraints=EnsureStr() | EnsureNone()),
        branch=Parameter(
            metavar="NAME",
            args=("-b", "--branch",),
            doc="create and checkout this branch before rerunning the commands.",
            constraints=EnsureStr() | EnsureNone()),
        onto=Parameter(
            metavar="base",
            args=("--onto",),
            doc="""start point for rerunning the commands. If not specified,
            commands are executed at HEAD. This option can be used to specify
            an alternative start point, which will be checked out with the
            branch name specified by [CMD: --branch CMD][PY: `branch` PY] or in
            a detached state otherwise. As a special case, an empty value for
            this option means to use the commit specified by [CMD: --since
            CMD][PY: `since` PY].""",
            constraints=EnsureStr() | EnsureNone()),
        message=Parameter(
            args=("-m", "--message",),
            metavar="MESSAGE",
            doc="""use MESSAGE for the reran commit rather than the
            recorded commit message.  In the case of a multi-commit
            rerun, all the reran commits will have this message.""",
            constraints=EnsureStr() | EnsureNone()),
        script=Parameter(
            args=("--script",),
            metavar="FILE",
            doc="""extract the commands into [CMD: FILE CMD][PY: this file PY]
            rather than rerunning. Use - to write to stdout instead. [CMD: This
            option implies --report. CMD]""",
            constraints=EnsureStr() | EnsureNone()),
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc="""specify the dataset from which to rerun a recorded
            command. If no dataset is given, an attempt is made to
            identify the dataset based on the current working
            directory. If a dataset is given, the command will be
            executed in the root directory of this dataset.""",
            constraints=EnsureDataset() | EnsureNone()),
        report=Parameter(
            args=("--report",),
            action="store_true",
            doc="""Don't actually re-execute anything, just display what would
            be done. [CMD: Note: If you give this option, you most likely want
            to set --output-format to 'json' or 'json_pp'. CMD]"""),
    )

    @staticmethod
    @datasetmethod(name='rerun')
    @eval_results
    def __call__(
            revision="HEAD",
            since=None,
            dataset=None,
            branch=None,
            message=None,
            onto=None,
            script=None,
            report=False):

        ds = require_dataset(
            dataset, check_installed=True,
            purpose='rerunning a command')

        lgr.debug('rerunning command output underneath %s', ds)

        if script is None and not report and ds.repo.dirty:
            yield get_status_dict(
                'run',
                ds=ds,
                status='impossible',
                message=(
                    'clean dataset required to detect changes from command; '
                    'use `datalad status` to inspect unsaved changes'))
            return

        if not ds.repo.get_hexsha():
            yield get_status_dict(
                'run', ds=ds,
                status='impossible',
                message='cannot rerun command, nothing recorded')
            return

        if branch and branch in ds.repo.get_branches():
            yield get_status_dict(
                "run", ds=ds, status="error",
                message="branch '{}' already exists".format(branch))
            return

        if not ds.repo.commit_exists(revision + "^"):
            # Only a single commit is reachable from `revision`.  In
            # this case, --since has no effect on the range construction.
            revrange = revision
        elif since is None:
            revrange = "{rev}^..{rev}".format(rev=revision)
        elif since.strip() == "":
            revrange = revision
        else:
            revrange = "{}..{}".format(since, revision)

        if ds.repo.repo.git.rev_list("--merges", revrange, "--"):
            yield get_status_dict(
                "run", ds=ds, status="error",
                message="cannot rerun history with merge commits")
            return

        results = _rerun_as_results(ds, revrange, since, branch, onto, message)
        if script:
            handler = _get_script_handler(script, since, revision)
        elif report:
            handler = _report
        else:
            handler = _rerun

        for res in handler(ds, results):
            yield res
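
# Hedged usage sketch (illustrative only): re-executing the run records of the
# last three commits, mirroring the CLI examples in the docstring above. The
# branch name 'verify' is a hypothetical placeholder, and report=True only
# describes what would be re-executed.
from datalad.api import Dataset

ds = Dataset('.')
for res in ds.rerun(since='HEAD~3', branch='verify', report=True):
    # each report record carries a 'rerun_action' value such as 'run', 'skip', or 'pick'
    print(res.get('rerun_action'), res.get('message'))
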
Example No. 20
class Init(Interface):
    """Initialize an existing dataset to track an XNAT project
    """

    _examples_ = [
        dict(text='Initialize a dataset in the current directory',
             code_cmd='datalad xnat-init http://central.xnat.org:8080',
             code_py='xnat_init("http://central.xnat.org:8080")'),
    ]

    _params_ = dict(
        dataset=Parameter(
            args=("-d", "--dataset"),
            metavar='DATASET',
            doc="""specify the dataset to perform the initialization on""",
            constraints=EnsureDataset() | EnsureNone()),
        url=Parameter(
            args=("url", ),
            doc="""XNAT instance URL""",
        ),
        project=Parameter(
            args=(
                "-p",
                "--project",
            ),
            doc="""name of an XNAT project to track""",
        ),
        path=Parameter(
            args=(
                "-O",
                "--path",
            ),
            doc="""Specify the directory structure for the downloaded files, and
            if/where a subdataset should be created.
            To include the subject, session, or scan values, use the following
            format: {subject}/{session}/{scan}/
            To insert a subdataset at a specific directory level use '//':
            {subject}/{session}//{scan}/""",
        ),
        force=Parameter(args=(
            "-f",
            "--force",
        ),
                        doc="""force (re-)initialization""",
                        action='store_true'),
    )

    @staticmethod
    @datasetmethod(name='xnat_init')
    @eval_results
    def __call__(url,
                 path="{subject}/{session}/{scan}/",
                 project=None,
                 force=False,
                 dataset=None):
        from pyxnat import Interface as XNATInterface

        ds = require_dataset(dataset,
                             check_installed=True,
                             purpose='initialization')

        config = ds.config
        path = with_pathsep(path)

        # prep for yield
        res = dict(
            action='xnat_init',
            path=ds.path,
            type='dataset',
            logger=lgr,
            refds=ds.path,
        )

        # obtain user credentials, use simplified/stripped URL as identifier
        # given we don't have more knowledge than the user, do not
        # give a `url` to provide hints on how to obtain credentials
        parsed_url = urlparse(url)
        no_proto_url = '{}{}'.format(parsed_url.netloc,
                                     parsed_url.path).replace(' ', '')
        cred = UserPassword(name=no_proto_url, url=None)()

        xn = XNATInterface(server=url, **cred)

        # now we make a simple request to obtain the server version
        # we don't care much, but if the URL or the credentials are wrong
        # we will not get to see one
        try:
            xnat_version = xn.version()
            lgr.debug("XNAT server version is %s", xnat_version)
        except Exception as e:
            yield dict(
                res,
                status='error',
                message=('Failed to access the XNAT server. Full error:\n%s',
                         e),
            )
            return

        if project is None:
            from datalad.ui import ui
            projects = xn.select.projects().get()
            ui.message('No project name specified. The following projects are '
                       'available on {} for user {}:'.format(
                           url, cred['user']))
            for p in sorted(projects):
                # list and prep for C&P
                # TODO multi-column formatting?
                ui.message("  {}".format(quote_cmdlinearg(p)))
            return

        # query the specified project to make sure it exists and is accessible
        proj = xn.select.project(project)

        try:
            nsubj = len(proj.subjects().get())
        except Exception as e:
            yield dict(
                res,
                status='error',
                message=(
                    'Failed to obtain information on project %s from XNAT. '
                    'Full error:\n%s', project, e),
            )
            return

        lgr.info('XNAT reports %i subjects currently on-record for project %s',
                 nsubj, project)

        # check if dataset already initialized
        auth_dir = ds.pathobj / '.datalad' / 'providers'
        if auth_dir.exists() and not force:
            yield dict(
                res,
                status='error',
                message='Dataset already initialized, '
                'use `force` to reinitialize',
            )
            return

        # put essential configuration into the dataset
        config.set('datalad.xnat.default.url',
                   url,
                   where='dataset',
                   reload=False)
        config.set('datalad.xnat.default.project', project, where='dataset')
        config.set('datalad.xnat.default.path', path, where='dataset')

        ds.save(
            path=ds.pathobj / '.datalad' / 'config',
            to_git=True,
            message="Configure default XNAT url and project",
        )

        # Configure XNAT access authentication
        ds.run_procedure(spec='cfg_xnat_dataset')

        yield dict(
            res,
            status='ok',
        )
        return
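
# Hedged usage sketch: initializing XNAT tracking via the xnat_init() dataset
# method defined above (assumes the datalad-xnat extension is installed and
# importable). The server URL and project name are hypothetical placeholders.
from datalad.api import Dataset

ds = Dataset('/path/to/dataset')
ds.xnat_init(
    'https://central.xnat.org',          # XNAT instance URL
    project='Sample_DICOM',              # XNAT project to track
    path='{subject}/{session}/{scan}/')  # layout for downloaded files
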
Example No. 21
class CreateSiblingOSF(Interface):
    """Create a dataset representation at OSF.

    This will create a node on OSF and initialize
    an osf special remote to point to it. There are two modes
    this can operate in: 'annex' and 'export'.
    The former uses the OSF node as a key-value store that
    can be used by git-annex to copy data to and retrieve
    data from (potentially by any clone of the original dataset).
    The latter allows using 'git annex export' to publish a
    snapshot of a particular version of the dataset. Such an OSF
    node will - in contrast to the 'annex' mode - be
    human-readable.

    For authentication with OSF, you can define environment variables: Either
    'OSF_TOKEN', or both 'OSF_USERNAME' and 'OSF_PASSWORD'. If neither of these
    is defined, the tool will fall back to the datalad credential manager and
    inquire for credentials interactively.

    """

    result_renderer = 'tailored'

    _params_ = dict(
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc=""""Dataset to create a sibling for. If no further
        constraining path is given, metadata is extracted from all files
        of the dataset.""",
            constraints=EnsureDataset() | EnsureNone()
        ),
        title=Parameter(
            args=("--title",),
            doc="""title of the to-be created OSF node that is displayed
            on the OSF website. Defaults to the basename of the root directory
            of the local dataset.""",
            constraints=EnsureStr() | EnsureNone(),
        ),
        name=Parameter(
            args=("-s", "--name",),
            doc="""Name of the to-be initialized osf-special-remote""",
            constraints=EnsureStr()
        ),
        storage_name=Parameter(
            args=("--storage-name",),
            metavar="NAME",
            doc="""Name of the storage sibling (git-annex special remote).
            Must not be identical to the sibling name. If not specified,
            defaults to the sibling name plus '-storage' suffix.""",
            constraints=EnsureStr() | EnsureNone()),
        existing=Parameter(
            args=("--existing",),
            constraints=EnsureChoice(
                'skip', 'error') | EnsureNone(),
            metavar='MODE',
            doc="""Action to perform, if a (storage) sibling is already
            configured under the given name and/or a target already exists.
            In this case, a dataset can be skipped ('skip'), or the command
            be instructed to fail ('error').""", ),
        trust_level=Parameter(
            args=("--trust-level",),
            metavar="TRUST-LEVEL",
            constraints=EnsureChoice(
                'trust', 'semitrust', 'untrust') | EnsureNone(),
            doc="""specify a trust level for the storage sibling. If not
            specified, the default git-annex trust level is used.""",),
        mode=Parameter(
            args=("--mode",),
            doc=""" """,
            constraints=EnsureChoice(
                "annex", "export", "exportonly", "gitonly")
        ),
        tags=Parameter(
            args=('--tag',),
            dest='tags',
            metavar='TAG',
            doc="""specific one or more tags for the to-be-create OSF node.
            A tag 'DataLad dataset' and the dataset ID (if there is any)
            will be automatically added as additional tags.
            [CMD: This option can be given more than once CMD].""",
            action='append',
        ),
        public=Parameter(
            args=("--public",),
            doc="""make OSF node public""",
            action='store_true',
        ),
        category=Parameter(
            args=("--category",),
            doc="""specific the OSF node category to be used for the
            node. The categorization determines what icon is displayed
            with the node on the OSF, and helps with search organization""",
            # all presently supported categories
            constraints=EnsureChoice(
                "analysis", "communication", "data", "hypothesis",
                "instrumentation", "methods and measures", "procedure",
                "project", "software", "other")
        ),
        description=Parameter(
            args=("--description",),
            metavar="TEXT",
            doc="""Description of the OSF node that will be displayed on
            the associated project page. By default a description will be
            generated based on the mode the sibling is put into.""",
            constraints=EnsureStr() | EnsureNone()),
    )

    @staticmethod
    @datasetmethod(name='create_sibling_osf')
    @eval_results
    def __call__(title=None,
                 name="osf",
                 storage_name=None,
                 dataset=None,
                 mode="annex",
                 existing='error',
                 trust_level=None,
                 tags=None,
                 public=False,
                 category='data',
                 description=None,
                 ):
        ds = require_dataset(dataset,
                             purpose="create OSF remote",
                             check_installed=True)
        res_kwargs = dict(
            ds=ds,
            action="create-sibling-osf",
            logger=lgr,
        )
        # we need an annex
        if not isinstance(ds.repo, AnnexRepo):
            yield get_status_dict(
                type="dataset",
                status="impossible",
                message="dataset has no annex",
                **res_kwargs)
            return

        # NOTES:
        # - we prob. should check osf-special-remote availability upfront to
        #   fail early
        # - add --recursive option
        #       - recursive won't work easily. Need to think that through.
        #       - would need a naming scheme for subdatasets
        #       - flat on OSF or a tree?
        #       - how do we detect something is there already, so we can skip
        #         rather than duplicate (with a new name)?
        #         osf-type-special-remote sufficient to decide it's not needed?
        # - adapt to conclusions in issue #30
        #   -> create those subcomponents
        # - results need to report URL for created projects suitable for datalad
        #   output formatting!
        #   -> result_renderer
        #   -> needs to ne returned by create_node

        if not storage_name:
            storage_name = "{}-storage".format(name)

        sibling_conflicts = sibling_exists(
            ds, [name, storage_name],
            # TODO pass through
            recursive=False, recursion_limit=None,
            # fail fast, if error is desired
            exhaustive=existing == 'error',
        )
        if existing == 'error' and sibling_conflicts:
            # we only asked for one
            conflict = sibling_conflicts[0]
            yield get_status_dict(
                status='error',
                message=(
                    "a sibling '%s' is already configured in dataset %s",
                    conflict[1], conflict[0]),
                **res_kwargs,
            )
            return

        if title is None:
            # use dataset root basename
            title = ds.pathobj.name

        tags = ensure_list(tags)
        if 'DataLad dataset' not in tags:
            tags.append('DataLad dataset')
        if ds.id and ds.id not in tags:
            tags.append(ds.id)

        if not description:
            description = \
                "This component was built from a DataLad dataset using the " \
                "datalad-osf extension " \
                "(https://github.com/datalad/datalad-osf)."
            if mode != 'exportonly':
                description += \
                    " With this extension installed, this component can be " \
                    "git or datalad cloned from a 'osf://ID' URL, where " \
                    "'ID' is the OSF node ID that shown in the OSF HTTP " \
                    "URL, e.g. https://osf.io/q8xnk/ can be cloned from " \
                    "osf://q8xnk"
        cred = get_credentials(allow_interactive=True)
        osf = OSF(**cred)
        node_id, node_url = create_node(
            osf_session=osf.session,
            title=title,
            category=category,
            tags=tags if tags else None,
            public=EnsureBool()(public),
            description=description,
        )
        if mode != 'gitonly':
            init_opts = ["encryption=none",
                         "type=external",
                         "externaltype=osf",
                         "autoenable=true",
                         "node={}".format(node_id)]

            if mode in ("export", "exportonly"):
                init_opts += ["exporttree=yes"]

            ds.repo.init_remote(storage_name, options=init_opts)
            if trust_level:
                ds.repo.call_git(['annex', trust_level, storage_name])

            yield get_status_dict(
                type="dataset",
                url=node_url,
                id=node_id,
                name=storage_name,
                status="ok",
                **res_kwargs
            )

        if mode == 'exportonly':
            return

        ds.config.set(
            'remote.{}.annex-ignore'.format(name), 'true',
            where='local')
        yield from ds.siblings(
            # use configure, not add, to not trip over the config that
            # we just made
            action='configure',
            name=name,
            url='osf://{}'.format(node_id),
            fetch=False,
            publish_depends=storage_name if mode != 'gitonly' else None,
            recursive=False,
            result_renderer=None,
        )

    @staticmethod
    def custom_result_renderer(res, **kwargs):
        from datalad.ui import ui
        if res['action'] == "create-sibling-osf":
            msg = res.get('message', None)
            ui.message("{action}({status}): {url}{msg}".format(
                action=ac.color_word(res['action'], ac.BOLD),
                status=ac.color_status(res['status']),
                url=res.get('url', ''),
                msg=' [{}]'.format(msg[0] % msg[1:]
                                   if isinstance(msg, tuple)
                                   else res['message'])
                if msg else '')
            )
        elif res['action'] == "add-sibling-osf":
            ui.message("{action}({status})".format(
                action=ac.color_word(res['action'], ac.BOLD),
                status=ac.color_status(res['status']))
            )
        else:
            from datalad.interface.utils import default_result_renderer
            default_result_renderer(res)
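
# Hedged usage sketch: creating an OSF sibling in 'export' mode, assuming the
# datalad-osf extension is installed, OSF_TOKEN is set in the environment, and
# a DataLad version providing the push command is used. The title and sibling
# name are hypothetical placeholders.
from datalad.api import Dataset

ds = Dataset('.')
ds.create_sibling_osf(
    title='My analysis results',  # OSF node title shown on the OSF website
    name='osf',                   # Git sibling name; storage sibling becomes 'osf-storage'
    mode='export',                # human-readable snapshots via 'git annex export'
    public=False)                 # keep the node private
ds.push(to='osf')                 # publish dataset content to the new sibling
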
Example No. 22
class Configuration(Interface):
    """Get and set dataset, dataset-clone-local, or global configuration

    .. note::
      This command was introduced with DataLad 0.14 and might still see
      API and behavior changes in future releases.

    This command works similarly to git-config, but some features are not
    supported (e.g., modifying system configuration), while other features
    are not available in git-config (e.g., multi-configuration queries).

    Query and modification of three distinct configuration scopes is
    supported:

    - 'dataset': the persistent configuration in .datalad/config of a dataset
    - 'local': a dataset clone's Git repository configuration in .git/config
    - 'global': non-dataset-specific configuration (usually in $HOME/.gitconfig)

    Modifications of the persistent 'dataset' configuration will not be saved
    by this command, but have to be committed with a subsequent `save`
    call.

    Rules of precedence regarding different configuration scopes are the same
    as in Git, with two exceptions: 1) environment variables can be used to
    override any datalad configuration, and have precedence over any other
    configuration scope (see below). 2) the 'dataset' scope is considered in
    addition to the standard git configuration scopes. Its content has lower
    precedence than Git configuration scopes, but it is committed to a dataset,
    hence can be used to ship (default and branch-specific) configuration with
    a dataset.

    Any DATALAD_* environment variable is also mapped to a configuration item.
    Their values take precedence over any other specification. In variable
    names '_' encodes a '.' in the configuration name, and '__' encodes a '-',
    such that 'DATALAD_SOME__VAR' is mapped to 'datalad.some-var'.

    Recursive operation is supported for querying and modifying configuration
    across a hierarchy of datasets.
    """
    _examples_ = [
        dict(
            text=
            "Dump the effective configuration, including an annotation for common items",
            code_py="configuration()",
            code_cmd="datalad configuration"),
        dict(text="Query two configuration items",
             code_py="configuration('get', ['user.name', 'user.email'])",
             code_cmd="datalad configuration get user.name user.email"),
        dict(
            text=
            "Recursively set configuration in all (sub)dataset repositories",
            code_py=
            "configuration('set', [('my.config.name', 'value')], recursive=True)",
            code_cmd="datalad configuration -r set my.config=value"),
        dict(
            text=
            "Modify the persistent dataset configuration (changes are not committed)",
            code_py=
            "configuration('set', [('my.config.name', 'value')], scope='dataset')",
            code_cmd="datalad configuration --scope dataset set my.config=value"
        ),
    ]

    result_renderer = 'tailored'

    _params_ = dict(
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc="""specify the dataset to query or to configure""",
            constraints=EnsureDataset() | EnsureNone()),
        action=Parameter(args=("action", ),
                         nargs='?',
                         doc="""which action to perform""",
                         constraints=EnsureChoice(*config_actions)),
        scope=Parameter(args=("--scope", ),
                        doc="""scope for getting or setting
            configuration. If no scope is declared for a query, all
            configuration sources (including overrides via environment
            variables) are considered according to the normal
            rules of precedence. For action 'get', only 'dataset' and 'local'
            (the latter including 'global' here) are supported. For action 'dump',
            a scope selection is ignored and all scopes are considered.""",
                        constraints=EnsureChoice('global', 'local', 'dataset',
                                                 None)),
        spec=Parameter(
            args=("spec", ),
            doc="""configuration name (for actions 'get' and 'unset'),
            or name/value pair (for action 'set')""",
            nargs='*',
            metavar='name[=value]'),
        recursive=recursion_flag,
        recursion_limit=recursion_limit,
    )

    @staticmethod
    @datasetmethod(name='configuration')
    @eval_results
    def __call__(action='dump',
                 spec=None,
                 scope=None,
                 dataset=None,
                 recursive=False,
                 recursion_limit=None):

        # check conditions
        # - global and recursion makes no sense

        if action == 'dump':
            if spec:
                raise ValueError(
                    'Configuration selection is not supported for dumping')
            if scope:
                raise ValueError(
                    'Scope selection is not supported for dumping')

        # normalize variable specifications
        specs = []
        for s in ensure_list(spec):
            if isinstance(s, tuple):
                specs.append((str(s[0]), str(s[1])))
            elif '=' not in s:
                specs.append((str(s), ))
            else:
                specs.append(tuple(s.split('=', 1)))

        if action == 'set':
            missing_values = [s[0] for s in specs if len(s) < 2]
            if missing_values:
                raise ValueError(
                    'Values must be provided for all configuration '
                    'settings. Missing: {}'.format(missing_values))
            invalid_names = [s[0] for s in specs if '.' not in s[0]]
            if invalid_names:
                raise ValueError(
                    'Name must contain a section (i.e. "section.name"). '
                    'Invalid: {}'.format(invalid_names))

        ds = None
        if scope != 'global' or recursive:
            ds = require_dataset(dataset,
                                 check_installed=True,
                                 purpose='configuration')

        res_kwargs = dict(
            action='configuration',
            logger=lgr,
        )
        if ds:
            res_kwargs['refds'] = ds.path
        yield from configuration(action, scope, specs, res_kwargs, ds)

        if not recursive:
            return

        for subds in ds.subdatasets(fulfilled=True,
                                    recursive=True,
                                    recursion_limit=recursion_limit,
                                    on_failure='ignore',
                                    result_renderer='disabled'):
            yield from configuration(action, scope, specs, res_kwargs,
                                     Dataset(subds['path']))

    @staticmethod
    def custom_result_renderer(res, **kwargs):
        if (res['status'] != 'ok' or res['action']
                not in ('get_configuration', 'dump_configuration')):
            if 'message' not in res and 'name' in res:
                suffix = '={}'.format(res['value']) if 'value' in res else ''
                res['message'] = '{}{}'.format(res['name'], suffix)
            default_result_renderer(res)
            return
        # TODO source
        from datalad.ui import ui
        name = res['name']
        if res['action'] == 'dump_configuration':
            for key in ('purpose', 'description'):
                s = res.get(key)
                if s:
                    ui.message('\n'.join(
                        wrap(
                            s,
                            initial_indent='# ',
                            subsequent_indent='# ',
                        )))

        if kwargs.get('recursive', False):
            have_subds = res['path'] != res['refds']
            # we need to mark up from which dataset results are reported
            prefix = '<ds>{}{}:'.format(
                '/' if have_subds else '',
                Path(res['path']).relative_to(res['refds']).as_posix()
                if have_subds else '',
            )
        else:
            prefix = ''

        if kwargs.get('action', None) == 'dump':
            ui.message('{}{}={}'.format(
                prefix,
                ac.color_word(name, ac.BOLD),
                res['value'] if res['value'] is not None else '',
            ))
        else:
            ui.message('{}{}'.format(
                prefix,
                res['value'] if res['value'] is not None else '',
            ))
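
# Hedged usage sketch: setting and querying configuration with the command
# above, plus the DATALAD_* environment variable mapping described in its
# docstring. The configuration name 'my.config.name' is a hypothetical example.
import os
from datalad.api import Dataset

ds = Dataset('.')
# write into the persistent .datalad/config (needs a subsequent ds.save() to commit)
ds.configuration('set', [('my.config.name', 'value')], scope='dataset')
# query the effective value back
ds.configuration('get', ['my.config.name'])
# '__' maps to '-' and '_' maps to '.', so this overrides 'datalad.some-var'
os.environ['DATALAD_SOME__VAR'] = 'override'
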
Example No. 23
File: wtf.py Project: vsoch/datalad
class WTF(Interface):
    """Generate a report about the DataLad installation and configuration

    IMPORTANT: Sharing this report with untrusted parties (e.g. on the web)
    should be done with care, as it may include identifying information, and/or
    credentials or access tokens.
    """
    result_renderer = 'tailored'

    from datalad.support.param import Parameter
    from datalad.distribution.dataset import datasetmethod
    from datalad.interface.utils import eval_results
    from datalad.distribution.dataset import EnsureDataset
    from datalad.support.constraints import EnsureNone, EnsureChoice

    _params_ = dict(
        dataset=Parameter(args=("-d", "--dataset"),
                          doc=""""specify the dataset to report on.
            no dataset is given, an attempt is made to identify the dataset
            based on the current working directory.""",
                          constraints=EnsureDataset() | EnsureNone()),
        sensitive=Parameter(
            args=(
                "-s",
                "--sensitive",
            ),
            constraints=EnsureChoice(None, 'some', 'all'),
            doc="""if set to 'some' or 'all', it will display sections such as 
            config and metadata which could potentially contain sensitive 
            information (credentials, names, etc.).  If 'some', the fields
            which are known to be sensitive will still be masked out"""),
        sections=Parameter(
            args=("-S", "--section"),
            action='append',
            dest='sections',
            metavar="SECTION",
            constraints=EnsureChoice(*sorted(SECTION_CALLABLES))
            | EnsureNone(),
            doc="""section to include.  If not set, all sections.
            [CMD: This option can be given multiple times. CMD]"""),
        decor=Parameter(
            args=("-D", "--decor"),
            constraints=EnsureChoice('html_details') | EnsureNone(),
            doc="""decoration around the rendering to facilitate embedding into
            issues etc., e.g. use 'html_details' for posting a collapsible entry
            to GitHub issues."""),
        clipboard=Parameter(
            args=(
                "-c",
                "--clipboard",
            ),
            action="store_true",
            doc="""if set, do not print but copy to clipboard (requires pyperclip
            module)"""),
    )

    @staticmethod
    @datasetmethod(name='wtf')
    @eval_results
    def __call__(dataset=None,
                 sensitive=None,
                 sections=None,
                 decor=None,
                 clipboard=None):
        from datalad.distribution.dataset import require_dataset
        from datalad.support.exceptions import NoDatasetFound
        from datalad.interface.results import get_status_dict

        ds = None
        try:
            ds = require_dataset(dataset,
                                 check_installed=False,
                                 purpose='reporting')
        except NoDatasetFound:
            # failure is already logged
            pass
        if ds and not ds.is_installed():
            # warn that the dataset is bogus
            yield dict(
                action='wtf',
                path=ds.path,
                status='impossible',
                message=('No dataset found at %s. Reporting on the dataset is '
                         'not attempted.', ds.path),
                logger=lgr)
            # we don't deal with absent datasets
            ds = None
        if sensitive:
            if ds is None:
                from datalad import cfg
            else:
                cfg = ds.config
        else:
            cfg = None

        from datalad.ui import ui
        from datalad.support.external_versions import external_versions

        infos = OrderedDict()
        res = get_status_dict(
            action='wtf',
            path=ds.path if ds else assure_unicode(op.abspath(op.curdir)),
            type='dataset' if ds else 'directory',
            status='ok',
            logger=lgr,
            decor=decor,
            infos=infos,
        )

        # Define section callables which require variables.
        # so there is no side-effect on module level original
        section_callables = SECTION_CALLABLES.copy()
        section_callables['location'] = partial(_describe_location, res)
        section_callables['configuration'] = \
            partial(_describe_configuration, cfg, sensitive)
        if ds:
            section_callables['dataset'] = \
                partial(_describe_dataset, ds, sensitive)
        else:
            section_callables.pop('dataset')
        assert all(section_callables.values())  # check if none was missed

        if sections is None:
            sections = sorted(list(section_callables))

        for s in sections:
            infos[s] = section_callables[s]()

        if clipboard:
            external_versions.check(
                'pyperclip', msg="It is needed to be able to use clipboard")
            import pyperclip
            report = _render_report(res)
            pyperclip.copy(report)
            ui.message("WTF information of length %s copied to clipboard" %
                       len(report))
        yield res
        return

    @staticmethod
    def custom_result_renderer(res, **kwargs):
        from datalad.ui import ui
        out = _render_report(res)
        ui.message(out)
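
# Hedged usage sketch: generating a report restricted to a couple of sections
# and decorated for pasting into a GitHub issue. The section names used here
# ('datalad', 'git-annex') are assumed to be members of SECTION_CALLABLES.
from datalad.api import wtf

wtf(sections=['datalad', 'git-annex'], decor='html_details')
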
Example No. 24
class AddArchiveContent(Interface):
    """Add content of an archive under git annex control.

    Given an already annex'ed archive, extract and add its files to the
    dataset, and reference the original archive as a custom special remote.

    """
    _examples_ = [
        dict(text="""Add files from the archive 'big_tarball.tar.gz', but
                     keep big_tarball.tar.gz in the index""",
             code_py="add_archive_content(path='big_tarball.tar.gz')",
             code_cmd="datalad add-archive-content big_tarball.tar.gz"),
        dict(text="""Add files from the archive 'tarball.tar.gz', and
                     remove big_tarball.tar.gz from the index""",
             code_py="add_archive_content(path='big_tarball.tar.gz', delete=True)",
             code_cmd="datalad add-archive-content big_tarball.tar.gz --delete"),
        dict(text="""Add files from the archive 's3.zip' but remove the leading
                     directory""",
             code_py="add_archive_content(path='s3.zip', strip_leading_dirs=True)",
             code_cmd="datalad add-archive-content s3.zip --strip-leading-dirs"),
        ]

    # XXX prevent common args from being added to the docstring
    _no_eval_results = True
    _params_ = dict(
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc=""""specify the dataset to save""",
            constraints=EnsureDataset() | EnsureNone()),
        delete=Parameter(
            args=("-D", "--delete"),
            action="store_true",
            doc="""delete original archive from the filesystem/Git in current
            tree. %s""" % _KEY_OPT_NOTE),
        add_archive_leading_dir=Parameter(
            args=("--add-archive-leading-dir",),
            action="store_true",
            doc="""place extracted content under a directory which would
            correspond to the archive name with all suffixes stripped. E.g. the
            content of `archive.tar.gz` will be extracted under `archive/`"""),
        strip_leading_dirs=Parameter(
            args=("--strip-leading-dirs",),
            action="store_true",
            doc="""remove one or more leading directories from the archive
            layout on extraction"""),
        leading_dirs_depth=Parameter(
            args=("--leading-dirs-depth",),
            action="store",
            type=int,
            doc="""maximum depth of leading directories to strip.
            If not specified (None), no limit"""),
        leading_dirs_consider=Parameter(
            args=("--leading-dirs-consider",),
            action="append",
            doc="""regular expression(s) for directories to consider to strip
            away""",
            constraints=EnsureStr() | EnsureNone(),
        ),
        use_current_dir=Parameter(
            args=("--use-current-dir",),
            action="store_true",
            doc="""extract the archive under the current directory, not the
             directory where the archive is located. This parameter is applied
             automatically if [PY: `key=True` PY][CMD: --key CMD] was used."""),
        # TODO: add option to extract under archive's original directory. Currently would extract in curdir
        existing=Parameter(
            args=("--existing",),
            choices=('fail', 'overwrite', 'archive-suffix', 'numeric-suffix'),
            default="fail",
            doc="""what operation to perform if a file from an archive tries to
            overwrite an existing file with the same name.  'fail' (default)
            leads to an error result, 'overwrite' silently replaces
            existing file, 'archive-suffix' instructs to add a suffix (prefixed
            with a '-') matching archive name from which file gets extracted,
            and if that one is present as well, 'numeric-suffix' is in effect in
            addition, when incremental numeric suffix (prefixed with a '.') is
            added until no name collision is longer detected"""
        ),
        exclude=Parameter(
            args=("-e", "--exclude"),
            action='append',
            doc="""regular expressions for filenames which to exclude from being
            added to annex. Applied after --rename if that one is specified.
            For exact matching, use anchoring""",
            constraints=EnsureStr() | EnsureNone()
        ),
        rename=Parameter(
            args=("-r", "--rename"),
            action='append',
            doc="""regular expressions to rename files before added them under
            to Git. The first defines how to split provided string into
            two parts: Python regular expression (with groups), and replacement
            string""",
            constraints=EnsureStr(min_len=2) | EnsureNone()
        ),
        annex_options=Parameter(
            args=("-o", "--annex-options"),
            doc="""additional options to pass to git-annex """,
            constraints=EnsureStr() | EnsureNone()
        ),
        annex=Parameter(
            doc="""DEPRECATED. Use the 'dataset' parameter instead."""
        ),
        # TODO: Python only!
        stats=Parameter(
            doc="""ActivityStats instance for global tracking""",
        ),
        key=Parameter(
            args=("--key",),
            action="store_true",
            doc="""signal if provided archive is not actually a filename on its
            own but an annex key. The archive will be extracted in the current
            directory."""),
        copy=Parameter(
            args=("--copy",),
            action="store_true",
            doc="""copy the content of the archive instead of moving"""),
        allow_dirty=allow_dirty,
        commit=Parameter(
            args=("--no-commit",),
            action="store_false",
            dest="commit",
            doc="""don't commit upon completion"""),
        drop_after=Parameter(
            args=("--drop-after",),
            action="store_true",
            doc="""drop extracted files after adding to annex""",
        ),
        delete_after=Parameter(
            args=("--delete-after",),
            action="store_true",
            doc="""extract under a temporary directory, git-annex add, and
            delete afterwards. To be used to "index" files within annex without
            actually creating corresponding files under git. Note that
            `annex dropunused` would later remove that load"""),

        # TODO: interaction with archives cache whenever we make it persistent across runs
        archive=Parameter(
            args=("archive",),
            doc="archive file or a key (if %s specified)" % _KEY_OPT,
            constraints=EnsureStr()),
    )

    @staticmethod
    @datasetmethod(name='add_archive_content')
    @eval_results
    def __call__(
            archive,
            *,
            dataset=None,
            annex=None,
            add_archive_leading_dir=False,
            strip_leading_dirs=False,
            leading_dirs_depth=None,
            leading_dirs_consider=None,
            use_current_dir=False,
            delete=False,
            key=False,
            exclude=None,
            rename=None,
            existing='fail',
            annex_options=None,
            copy=False,
            commit=True,
            allow_dirty=False,
            stats=None,
            drop_after=False,
            delete_after=False):

        if exclude:
            exclude = ensure_tuple_or_list(exclude)
        if rename:
            rename = ensure_tuple_or_list(rename)
        ds = require_dataset(dataset,
                             check_installed=True,
                             purpose='add-archive-content')

        # set up common params for result records
        res_kwargs = {
            'action': 'add-archive-content',
            'logger': lgr,
        }

        if not isinstance(ds.repo, AnnexRepo):
            yield get_status_dict(
                ds=ds,
                status='impossible',
                message="Can't operate in a pure Git repository",
                **res_kwargs
            )
            return
        if annex:
            warnings.warn(
                "datalad add_archive_content's `annex` parameter is "
                "deprecated and will be removed in a future release. "
                "Use the 'dataset' parameter instead.",
                DeprecationWarning)
        annex = ds.repo
        # get the archive path relative from the ds root
        archive_path = resolve_path(archive, ds=dataset)
        # let Status decide whether we can act on the given file
        for s in ds.status(
                path=archive_path,
                on_failure='ignore',
                result_renderer='disabled'):
            if s['status'] == 'error':
                if 'path not underneath the reference dataset %s' in s['message']:
                    yield get_status_dict(
                        ds=ds,
                        status='impossible',
                        message='Can not add archive outside of the dataset',
                        **res_kwargs)
                    return
                # status errored & we haven't anticipated the cause. Bubble up
                yield s
                return
            elif s['state'] == 'untracked':
                # we can't act on an untracked file
                message = (
                    "Can not add an untracked archive. "
                    "Run 'datalad save {}'".format(archive)
                )
                yield get_status_dict(
                           ds=ds,
                           status='impossible',
                           message=message,
                           **res_kwargs)
                return

        if not allow_dirty and annex.dirty:
            # error out here if the dataset contains untracked changes
            yield get_status_dict(
                ds=ds,
                status='impossible',
                message=(
                    'clean dataset required. '
                    'Use `datalad status` to inspect unsaved changes'),
                **res_kwargs
            )
            return

        # ensure the archive exists, status doesn't error on a non-existing file
        if not key and not lexists(archive_path):
            yield get_status_dict(
                ds=ds,
                status='impossible',
                message=(
                    'No such file: {}'.format(archive_path),
                ),
                **res_kwargs
            )
            return

        if not key:
            check_path = archive_path.relative_to(ds.pathobj)
            # TODO: support adding archives content from outside the annex/repo
            origin = 'archive'
            # can become get_file_annexinfo once #6104 is merged
            key = annex.get_file_annexinfo(check_path)['key']
            if not key:
                raise RuntimeError(
                    f"Archive must be an annexed file in {ds}")
            archive_dir = Path(archive_path).parent
        else:
            origin = 'key'
            key = archive
            # We must not have anything to do with the location under .git/annex
            archive_dir = None
            # instead, we will go from the current directory
            use_current_dir = True

        archive_basename = file_basename(archive)

        if not key:
            # if we didn't manage to get a key, the file must be in Git
            raise NotImplementedError(
                "Provided file %s does not seem to be under annex control. "
                "We don't support adding everything straight to Git" % archive
            )

        # figure out our location
        pwd = getpwd()
        # are we in a subdirectory of the repository?
        pwd_in_root = annex.path == archive_dir
        # then we should add content under that subdirectory,
        # get the path relative to the repo top
        if use_current_dir:
            # extract the archive under the current directory, not the directory
            # where the archive is located
            extract_rpath = Path(pwd).relative_to(ds.path) \
                if not pwd_in_root \
                else None
        else:
            extract_rpath = archive_dir.relative_to(ds.path)

        # relpath might return '.' as the relative path to curdir, which then normalize_paths
        # would take as instructions to really go from cwd, so we need to sanitize
        if extract_rpath is not None and str(extract_rpath) == curdir:
            extract_rpath = None

        try:
            key_rpath = annex.get_contentlocation(key)
        except Exception:
            # the only probable reason for this to fail is that there is no
            # content present
            raise RuntimeError(
                "Content of %s seems to be N/A.  Fetch it first" % key
            )

        # now we simply need to go through every file in that archive
        lgr.info(
            "Adding content of the archive %s into annex %s", archive, annex
        )

        from datalad.customremotes.archives import ArchiveAnnexCustomRemote

        # TODO: shouldn't we be able just to pass existing AnnexRepo instance?
        # TODO: we will use persistent cache so we could just (ab)use possibly extracted archive
        # OK, let's ignore that the following class is actually a special
        # remote implementation, and use it only to work with its cache
        annexarchive = ArchiveAnnexCustomRemote(annex=None,
                                                path=annex.path,
                                                persistent_cache=True)
        # We will move extracted content, so it must not exist prior to running
        annexarchive.cache.allow_existing = True
        earchive = annexarchive.cache[key_rpath]
        # make sure there is an enabled datalad-archives special remote
        ensure_datalad_remote(ds.repo, remote=ARCHIVES_SPECIAL_REMOTE,
                              autoenable=True)

        precommitted = False
        old_always_commit = annex.always_commit
        # batch mode is disabled when faking dates, so we want to always commit
        annex.always_commit = annex.fake_dates_enabled
        if annex_options:
            if isinstance(annex_options, str):
                annex_options = split_cmdline(annex_options)
        delete_after_rpath = None

        prefix_dir = basename(tempfile.mkdtemp(prefix=".datalad",
                                               dir=annex.path)) \
            if delete_after \
            else None

        # dedicated stats which would be added to passed in (if any)
        outside_stats = stats
        stats = ActivityStats()

        try:
            # keep track of extracted files for progress bar logging
            file_counter = 0
            # iterate over all files in the archive
            extracted_files = list(earchive.get_extracted_files())
            # start a progress bar for extraction
            pbar_id = f'add-archive-{archive_path}'
            log_progress(
                lgr.info, pbar_id, 'Extracting archive',
                label="Extracting archive",
                unit=' Files',
                total=len(extracted_files),
                noninteractive_level=logging.INFO)
            for extracted_file in extracted_files:
                file_counter += 1
                files_left = len(extracted_files) - file_counter
                log_progress(
                    lgr.info, pbar_id,
                    "Files to extract %i ", files_left,
                    update=1,
                    increment=True,
                    noninteractive_level=logging.DEBUG)
                stats.files += 1
                extracted_path = Path(earchive.path) / Path(extracted_file)

                if extracted_path.is_symlink():
                    link_path = str(extracted_path.resolve())
                    if not exists(link_path):
                        # TODO: config  addarchive.symlink-broken='skip'
                        lgr.warning(
                            "Path %s points to non-existing file %s",
                            extracted_path, link_path
                        )
                        stats.skipped += 1
                        continue
                        # TODO: check if points outside of archive - warn & skip

                url = annexarchive.get_file_url(
                    archive_key=key,
                    file=extracted_file,
                    size=os.stat(extracted_path).st_size)

                # preliminary target name which might get modified by renames
                target_file_orig = target_file = Path(extracted_file)

                # stream archives would not have had the original filename
                # information in them, so would be extracted under a name
                # derived from their annex key.
                # Provide ad-hoc handling for such cases
                if (len(extracted_files) == 1 and
                    Path(archive).suffix in ('.xz', '.gz', '.lzma') and
                        Path(key_rpath).name.startswith(Path(
                            extracted_file).name)):
                    # take archive's name without extension for filename & place
                    # where it was originally extracted
                    target_file = \
                        Path(extracted_file).parent / Path(archive).stem

                if strip_leading_dirs:
                    leading_dir = earchive.get_leading_directory(
                        depth=leading_dirs_depth, exclude=exclude,
                        consider=leading_dirs_consider)
                    leading_dir_len = \
                        len(leading_dir) + len(opsep) if leading_dir else 0
                    target_file = str(target_file)[leading_dir_len:]

                if add_archive_leading_dir:
                    # place extracted content under a directory corresponding to
                    # the archive name with suffix stripped.
                    target_file = Path(archive_basename) / target_file

                if rename:
                    target_file = apply_replacement_rules(rename,
                                                          str(target_file))

                # continue to the next iteration if extracted_file is excluded
                if exclude:
                    try:  # we need to continue the outer loop from inside the inner one
                        for regexp in exclude:
                            if re.search(regexp, extracted_file):
                                lgr.debug(
                                    "Skipping {extracted_file} since contains "
                                    "{regexp} pattern".format(**locals()))
                                stats.skipped += 1
                                raise StopIteration
                    except StopIteration:
                        continue

                if delete_after:
                    # place target file in a temporary directory
                    target_file = Path(prefix_dir) / Path(target_file)
                    # but also allow for it in the orig
                    target_file_orig = Path(prefix_dir) / Path(target_file_orig)

                target_file_path_orig = annex.pathobj / target_file_orig

                # If we were invoked in a subdirectory, patch together the
                # correct path
                target_file_path = extract_rpath / target_file \
                    if extract_rpath else target_file
                target_file_path = annex.pathobj / target_file_path

                # when the file already exists...
                if lexists(target_file_path):
                    handle_existing = True
                    if md5sum(str(target_file_path)) == \
                            md5sum(str(extracted_path)):
                        if not annex.is_under_annex(str(extracted_path)):
                            # if under annex -- must be having the same content,
                            # we should just add possibly a new extra URL
                            # but if under git -- we cannot/should not do
                            # anything about it ATM
                            if existing != 'overwrite':
                                continue
                        else:
                            handle_existing = False
                    if not handle_existing:
                        pass  # nothing... just to avoid additional indentation
                    elif existing == 'fail':
                        message = \
                            "{} exists, but would be overwritten by new file " \
                            "{}. Consider adjusting --existing".format(
                                target_file_path, extracted_file)
                        yield get_status_dict(
                            ds=ds,
                            status='error',
                            message=message,
                            **res_kwargs)
                        return
                    elif existing == 'overwrite':
                        stats.overwritten += 1
                        # to make sure it doesn't conflict -- might have been a
                        # tree
                        rmtree(target_file_path)
                    else:
                        # an elaborate dance to piece together new archive names
                        target_file_path_orig_ = target_file_path

                        # To keep extension intact -- operate on the base of the
                        # filename
                        p, fn = os.path.split(target_file_path)
                        ends_with_dot = fn.endswith('.')
                        fn_base, fn_ext = file_basename(fn, return_ext=True)

                        if existing == 'archive-suffix':
                            fn_base += '-%s' % archive_basename
                        elif existing == 'numeric-suffix':
                            pass  # archive-suffix will have the same logic
                        else:
                            # we shouldn't get here, argparse should catch a
                            # non-existing value for --existing right away
                            raise ValueError(existing)
                        # keep incrementing index in the suffix until file
                        # doesn't collide
                        suf, i = '', 0
                        while True:
                            connector = \
                                ('.' if (fn_ext or ends_with_dot) else '')
                            file = fn_base + suf + connector + fn_ext
                            target_file_path_new = Path(p) / Path(file)
                            if not lexists(target_file_path_new):
                                # we found a file name that is not yet taken
                                break
                            lgr.debug("Iteration %i of file name finding. "
                                      "File %s already exists", i,
                                      target_file_path_new)
                            i += 1
                            suf = '.%d' % i
                        target_file_path = target_file_path_new
                        lgr.debug("Original file %s will be saved into %s"
                                  % (target_file_path_orig_, target_file_path))
                        # TODO: should we reserve smth like
                        # stats.clobbed += 1

                if target_file_path != target_file_path_orig:
                    stats.renamed += 1

                if copy:
                    raise NotImplementedError(
                        "Not yet copying from 'persistent' cache"
                    )

                lgr.debug("Adding %s to annex pointing to %s and with options "
                          "%r", target_file_path, url, annex_options)

                out_json = annex.add_url_to_file(
                    target_file_path,
                    url, options=annex_options,
                    batch=True)

                if 'key' in out_json and out_json['key'] is not None:
                    # annex.is_under_annex(target_file, batch=True):
                    # due to http://git-annex.branchable.com/bugs/annex_drop_is_not___34__in_effect__34___for_load_which_was___34__addurl_--batch__34__ed_but_not_yet_committed/?updated
                    # we need to maintain a list of those to be dropped files
                    if drop_after:
                        # drop extracted files after adding to annex
                        annex.drop_key(out_json['key'], batch=True)
                        stats.dropped += 1
                    stats.add_annex += 1
                else:
                    lgr.debug("File {} was added to git, not adding url".format(
                        target_file_path))
                    stats.add_git += 1

                if delete_after:
                    # we count the removal here, but don't yet perform it
                    # to not interfere with batched processes - any pure Git
                    # action invokes precommit which closes batched processes.
                    stats.removed += 1

                # Done with target_file -- just to have clear end of the loop
                del target_file

            if delete and archive and origin != 'key':
                lgr.debug("Removing the original archive {}".format(archive))
                # force=True since sometimes it might still be staged and fail
                annex.remove(str(archive_path), force=True)

            lgr.info("Finished adding %s: %s", archive, stats.as_str(mode='line'))

            if outside_stats:
                outside_stats += stats
            if delete_after:
                # force since not committed. r=True for -r (passed into git call
                # to recurse)
                delete_after_rpath = opj(extract_rpath, prefix_dir) \
                    if extract_rpath else prefix_dir
                delete_after_rpath = resolve_path(delete_after_rpath,
                                                  ds=dataset)
                lgr.debug(
                    "Removing extracted and annexed files under %s",
                    delete_after_rpath
                )
                annex.remove(str(delete_after_rpath), r=True, force=True)
            if commit:
                archive_rpath = archive_path.relative_to(ds.path)
                commit_stats = outside_stats if outside_stats else stats
                # so batched ones close and files become annex symlinks etc
                annex.precommit()
                precommitted = True
                if any(r.get('state', None) != 'clean'
                       for p, r in annex.status(untracked='no').items()):
                    annex.commit(
                        "Added content extracted from %s %s\n\n%s" %
                        (origin, archive_rpath,
                         commit_stats.as_str(mode='full')),
                        _datalad_msg=True
                    )
                    commit_stats.reset()
            else:
                # don't commit upon completion
                pass
        finally:
            # take down the progress bar
            log_progress(
                lgr.info, pbar_id,
                'Finished extraction',
                noninteractive_level=logging.INFO)
            # since we batched addurl, we should close those batched processes
            # if we haven't done so yet.  Explicitly checked to avoid any
            # possible "double-action"
            if not precommitted:
                annex.precommit()

            if delete_after_rpath:
                delete_after_path = opj(annex.path, delete_after_rpath)
                delete_after_rpath = resolve_path(delete_after_rpath,
                                                  ds=dataset)
                if exists(delete_after_path):  # should not be there
                    # but for paranoid yoh
                    lgr.warning(
                        "Removing temporary directory under which extracted "
                        "files were annexed and should have been removed: %s",
                        delete_after_path)
                    rmtree(delete_after_path)

            annex.always_commit = old_always_commit
            # remove what is left and/or everything upon failure
            earchive.clean(force=True)
            # remove tempfile directories (not cleaned up automatically):
            if prefix_dir is not None and lexists(prefix_dir):
                os.rmdir(prefix_dir)
        yield get_status_dict(
            ds=ds,
            status='ok',
            **res_kwargs)
        return annex
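A hedged Python-API sketch combining several of the options documented above; the archive name and the exclude pattern are illustrative only, and the archive must already be annexed and saved in the dataset:

import datalad.api as dl

# extract an annexed tarball, drop one leading directory, skip macOS junk
# files, and resolve name collisions by appending the archive name
dl.add_archive_content(
    'big_tarball.tar.gz',
    dataset='.',
    strip_leading_dirs=True,
    exclude=[r'\.DS_Store$'],     # regular expression(s), see `exclude` above
    existing='archive-suffix',
    drop_after=True,              # drop extracted content after adding
)
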
Example No. 25
class ContainersRemove(Interface):
    # first docstring line is used as a short description in the cmdline help
    # the rest is put in the verbose help and manpage
    """Remove a known container from a dataset
    """

    # parameters of the command, must be exhaustive
    _params_ = dict(
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc="""specify the dataset to query. If no dataset is given, an
            attempt is made to identify the dataset based on the current
            working directory""",
            constraints=EnsureDataset() | EnsureNone()),
        name=Parameter(
            args=("name", ),
            doc="""name of the container to remove""",
            metavar="NAME",
            constraints=EnsureStr(),
        ),
        remove_image=Parameter(
            args=(
                "-i",
                "--remove-image",
            ),
            doc="""if set, remove container image as well""",
            action="store_true",
        ),
    )

    @staticmethod
    @datasetmethod(name='containers_remove')
    @eval_results
    def __call__(name, dataset=None, remove_image=False):
        ds = require_dataset(dataset,
                             check_installed=True,
                             purpose='remove a container')

        res = get_status_dict(ds=ds, action='containers_remove', logger=lgr)

        section = 'datalad.containers.{}'.format(name)
        imagecfg = '{}.image'.format(section)

        to_save = []
        if remove_image and imagecfg in ds.config:
            imagepath = ds.config.get(imagecfg)
            if op.lexists(op.join(ds.path, imagepath)):
                for r in ds.remove(
                        path=imagepath,
                        # XXX shortcoming: this is the only way to say:
                        # don't drop
                        check=False,
                        # config setting might be outdated and image no longer
                        # there -> no reason to fail, just report
                        on_failure='ignore',
                        save=False):
                    yield r
                to_save.append(imagepath)

        if section in ds.config.sections():
            ds.config.remove_section(section, where='dataset', reload=True)
            res['status'] = 'ok'
            to_save.append(op.join('.datalad', 'config'))
        else:
            res['status'] = 'notneeded'
        if to_save:
            for r in ds.save(
                    path=to_save,
                    message='[DATALAD] Remove container {}'.format(name)):
                yield r
        yield res
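A short usage sketch (assuming the datalad-container extension is installed; the container name is illustrative):

import datalad.api as dl

# drop the configuration for a container named 'ubuntu' and also remove the
# image file it points to; the change to .datalad/config is saved by the
# command itself
dl.containers_remove('ubuntu', dataset='.', remove_image=True)
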
Example No. 26
class Dicom2Spec(Interface):
    """Derives a specification snippet from DICOM metadata and stores it in a
    JSON file.

    The derivation is based on a rule system. You can implement your own rules as a python class.
    See the documentation page on customization for details. If you have such rules in dedicated files,
    their use and priority is configured via the datalad.hirni.dicom2spec.rules config variable. It takes
    a path to a python file containung such a rule definition. This configuration can be specified multiple
    times and at different levels (system-wide, user, dataset, local repository). If there are indeed
    several occurences of that configuration, the respective rules will be applied in order. Hence "later"
    appearances will overwrite "earlier" ones. Thereby you can have institution rules for example and still
    apply additional rules tailored to your needs or a particular study.
    """

    _params_ = dict(
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc="""specify a dataset containing the DICOM metadata to be 
                    used. If no dataset is given, an attempt is made to identify 
                    the dataset based on the current working directory""",
            constraints=EnsureDataset() | EnsureNone()),
        path=Parameter(args=("path", ),
                       metavar="PATH",
                       nargs="+",
                       doc="""path to DICOM files""",
                       constraints=EnsureStr() | EnsureNone()),
        spec=Parameter(args=(
            "-s",
            "--spec",
        ),
                       metavar="SPEC",
                       doc="""file to store the specification in""",
                       constraints=EnsureStr() | EnsureNone()),
        subject=Parameter(
            args=("--subject", ),
            metavar="SUBJECT",
            doc="""subject identifier. If not specified, an attempt will be made 
                        to derive SUBJECT from DICOM headers""",
            constraints=EnsureStr() | EnsureNone()),
        anon_subject=Parameter(args=("--anon-subject", ),
                               metavar="ANON_SUBJECT",
                               doc="""TODO""",
                               constraints=EnsureStr() | EnsureNone()),
        acquisition=Parameter(
            args=("--acquisition", ),
            metavar="ACQUISITION",
            doc="""acquisition identifier. If not specified, an attempt 
                    will be made to derive an identifier from DICOM headers""",
            constraints=EnsureStr() | EnsureNone()),
        properties=Parameter(args=("--properties", ),
                             metavar="PATH or JSON string",
                             doc="""""",
                             constraints=EnsureStr() | EnsureNone()),
    )

    @staticmethod
    @datasetmethod(name='hirni_dicom2spec')
    @eval_results
    def __call__(path=None,
                 spec=None,
                 dataset=None,
                 subject=None,
                 anon_subject=None,
                 acquisition=None,
                 properties=None):

        # TODO: acquisition can probably be removed (or made an alternative to
        # derive spec and/or dicom location from)

        # Change, so path needs to point directly to dicom ds?
        # Or just use acq and remove path?

        dataset = require_dataset(dataset,
                                  check_installed=True,
                                  purpose="spec from dicoms")

        from datalad.utils import assure_list
        if path is not None:
            path = assure_list(path)
            path = [resolve_path(p, dataset) for p in path]
            path = [str(p) for p in path]
        else:
            raise InsufficientArgumentsError(
                "insufficient arguments for dicom2spec: a path is required")

        # TODO: We should be able to deal with several paths at once
        #       ATM we aren't (see also commit + message of actual spec)
        assert len(path) == 1

        if not spec:
            raise InsufficientArgumentsError(
                "insufficient arguments for dicom2spec: a spec file is required"
            )

            # TODO: That's prob. wrong. We can derive default spec from acquisition
        else:
            spec = str(resolve_path(spec, dataset))

        spec_series_list = \
            [r for r in json_py.load_stream(spec)] if op.exists(spec) else list()

        # get dataset level metadata:
        found_some = False
        for meta in dataset.meta_dump(
                path,
                recursive=False,  # always False?
                reporton='datasets',
                return_type='generator',
                result_renderer='disabled'):
            if meta.get('status', None) not in ['ok', 'notneeded']:
                yield meta
                continue

            if 'dicom' not in meta['metadata']:

                # TODO: Really "notneeded" or simply not a result at all?
                yield dict(status='notneeded',
                           message=("found no DICOM metadata for %s",
                                    meta['path']),
                           path=meta['path'],
                           type='dataset',
                           action='dicom2spec',
                           logger=lgr)
                continue

            if 'Series' not in meta['metadata']['dicom'] or \
                    not meta['metadata']['dicom']['Series']:
                yield dict(
                    status='impossible',
                    message=("no image series detected in DICOM metadata of"
                             " %s", meta['path']),
                    path=meta['path'],
                    type='dataset',
                    action='dicom2spec',
                    logger=lgr)
                continue

            found_some = True

            overrides = dict()
            if properties:
                # load from file or json string
                props = json_py.load(properties) \
                        if op.exists(properties) else json_py.loads(properties)
                # turn into editable, pre-approved records
                props = {
                    k: dict(value=v, approved=True)
                    for k, v in props.items()
                }
                overrides.update(props)

            spec_series_list = add_to_spec(
                meta,
                spec_series_list,
                op.dirname(spec),
                subject=subject,
                anon_subject=anon_subject,
                # session=session,
                # TODO: parameter "session" was what
                # we now call acquisition. This is
                # NOT a good default for bids_session!
                # Particularly wrt to anonymization
                overrides=overrides,
                dataset=dataset)

        if not found_some:
            yield dict(
                status='impossible',
                message="found no DICOM metadata",
                path=path,
                type='file',  # TODO: arguably should be 'file' or 'dataset', depending on path
                action='dicom2spec',
                logger=lgr)
            return

        # TODO: RF needed. This rule should go elsewhere:
        # ignore duplicates (prob. reruns of aborted runs)
        # -> convert highest id only
        # Note: This sorting is a q&d hack!
        # TODO: Sorting needs to become more sophisticated + include notion of :all
        spec_series_list = sorted(spec_series_list,
                                  key=lambda x: get_specval(x, 'id')
                                  if 'id' in x.keys() else 0)
        for i in range(len(spec_series_list)):
            # Note: Removed the following line from condition below,
            # since it appears to be pointless. Value for 'converter'
            # used to be 'heudiconv' or 'ignore' for a 'dicomseries', so
            # it's not clear ATM what case this could possibly have caught:
            # heuristic.has_specval(spec_series_list[i], "converter") and \
            if spec_series_list[i]["type"] == "dicomseries" and \
                has_specval(spec_series_list[i], "bids-run") and \
                get_specval(spec_series_list[i], "bids-run") in \
                    [get_specval(s, "bids-run")
                     for s in spec_series_list[i + 1:]
                     if get_specval(
                            s,
                            "description") == get_specval(
                                spec_series_list[i], "description") and \
                     get_specval(s, "id") > get_specval(
                                             spec_series_list[i], "id")
                     ]:
                lgr.debug("Ignore SeriesNumber %s for conversion" % i)
                spec_series_list[i]["tags"].append(
                    'hirni-dicom-converter-ignore')

        lgr.debug("Storing specification (%s)", spec)
        # store as a stream (one record per file) to be able to
        # easily concat files without having to parse them, or
        # process them line by line without having to fully parse them
        from datalad_hirni.support.spec_helpers import sort_spec
        # Note: Sorting paradigm needs to change. See above.
        # spec_series_list = sorted(spec_series_list, key=lambda x: sort_spec(x))
        json_py.dump2stream(spec_series_list, spec)

        # make sure spec is tracked in git:
        spec_attrs = dataset.repo.get_gitattributes(spec)
        spec_relpath = op.relpath(spec, dataset.path)
        if spec_relpath not in spec_attrs.keys() or \
                'annex.largefiles' not in spec_attrs[spec_relpath].keys() or \
                spec_attrs[spec_relpath]['annex.largefiles'] != 'nothing':
            dataset.repo.set_gitattributes([(spec, {
                'annex.largefiles': 'nothing'
            })], '.gitattributes')

        for r in Save.__call__(dataset=dataset,
                               path=[spec, '.gitattributes'],
                               to_git=True,
                               message="[HIRNI] Added study specification "
                               "snippet for %s" %
                               op.relpath(path[0], dataset.path),
                               return_type='generator',
                               result_renderer='disabled'):
            if r.get('status', None) not in ['ok', 'notneeded']:
                yield r
            elif r['path'] in [spec, op.join(dataset.path, '.gitattributes')] \
                    and r['type'] == 'file':
                r['action'] = 'dicom2spec'
                r['logger'] = lgr
                yield r
            elif r['type'] == 'dataset':
                # 'ok' or 'notneeded' for a dataset is okay, since we commit
                # the spec. But it's not a result to yield
                continue
            else:
                # anything else shouldn't happen
                yield dict(
                    status='error',
                    message=("unexpected result from save: %s", r),
                    path=spec,  # TODO: This actually isn't clear - get it from `r`
                    type='file',
                    action='dicom2spec',
                    logger=lgr)
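A hedged usage sketch (assuming the datalad-hirni extension is installed; paths and identifiers are illustrative). Custom rules, as described in the docstring, can be registered via the datalad.hirni.dicom2spec.rules configuration, for example with plain git config:

import datalad.api as dl

# derive a specification snippet for one acquisition's DICOM data and store
# it next to the acquisition; the subject is derived from DICOM headers here
dl.hirni_dicom2spec(
    path='acq1/dicoms',
    spec='acq1/studyspec.json',
    dataset='.',
    anon_subject='001',
)

# register a custom rule file ("later" entries overwrite "earlier" ones):
#   git config --add datalad.hirni.dicom2spec.rules /path/to/myrules.py
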
Example No. 27
class Unlock(Interface):
    """Unlock file(s) of a dataset

    Unlock files of a dataset in order to be able to edit the actual content
    """

    _params_ = dict(
        path=Parameter(
            args=("path",),
            doc="""file(s) to unlock""",
            nargs="*",
            constraints=EnsureStr() | EnsureNone()),
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc=""""specify the dataset to unlock files in. If
            no dataset is given, an attempt is made to identify the dataset
            based on the current working directory. If the latter fails, an
            attempt is made to identify the dataset based on `path` """,
            constraints=EnsureDataset() | EnsureNone()),
        recursive=recursion_flag,
        recursion_limit=recursion_limit,
    )

    @staticmethod
    @datasetmethod(name='unlock')
    @eval_results
    def __call__(
            path=None,
            dataset=None,
            recursive=False,
            recursion_limit=None):

        if path is None and dataset is None:
            raise InsufficientArgumentsError(
                "insufficient arguments for unlocking: needs at least "
                "a dataset or a path to unlock.")

        refds_path = Interface.get_refds_path(dataset)
        res_kwargs = dict(action='unlock', logger=lgr, refds=refds_path)

        to_process = []
        for ap in AnnotatePaths.__call__(
                dataset=refds_path,
                path=path,
                recursive=recursive,
                recursion_limit=recursion_limit,
                action='unlock',
                unavailable_path_status='impossible',
                unavailable_path_msg="path does not exist",
                nondataset_path_status='impossible',
                modified=None,
                return_type='generator',
                on_failure='ignore'):
            if ap.get('status', None):
                # this is done
                yield ap
                continue
            if ap.get('type', 'dataset') == 'dataset':
                # this is a dataset
                ap['process_content'] = True
            to_process.append(ap)

        content_by_ds, ds_props, completed, nondataset_paths = \
            annotated2content_by_ds(
                to_process,
                refds_path=refds_path)
        assert(not completed)

        for ds_path in sorted(content_by_ds.keys()):
            ds = Dataset(ds_path)
            content = content_by_ds[ds_path]

            # no annex, no unlock:
            if not isinstance(ds.repo, AnnexRepo):
                for ap in content:
                    ap['status'] = 'notneeded'
                    ap['message'] = "not annex'ed, nothing to unlock"
                    ap.update(res_kwargs)
                    yield ap
                continue

            # direct mode, no unlock:
            elif ds.repo.is_direct_mode():
                for ap in content:
                    ap['status'] = 'notneeded'
                    ap['message'] = "direct mode, nothing to unlock"
                    ap.update(res_kwargs)
                    yield ap
                continue

            # only files in annex with their content present:
            files = [ap['path'] for ap in content]
            to_unlock = []
            for ap, under_annex, has_content in \
                zip(content,
                    ds.repo.is_under_annex(files),
                    ds.repo.file_has_content(files)):

                # TODO: what about directories? Make sure, there is no
                # situation like no file beneath with content or everything in
                # git, that leads to a CommandError
                # For now pass to annex:
                from os.path import isdir
                if isdir(ap['path']):
                    to_unlock.append(ap)
                    continue

                # Note, that `file_has_content` is (planned to report) True on
                # files in git. Therefore order matters: First check for annex!
                if under_annex:
                    if has_content:
                        to_unlock.append(ap)
                    # no content, no unlock:
                    else:
                        ap['status'] = 'impossible'
                        ap['message'] = "no content present, can't unlock"
                        ap.update(res_kwargs)
                        yield ap
                # file in git, no unlock:
                else:
                    ap['status'] = 'notneeded'
                    ap['message'] = "not controlled by annex, nothing to unlock"
                    ap.update(res_kwargs)
                    yield ap

            # don't call annex-unlock with no path; if this is the case, it is
            # because nothing survived the filtering above
            if content and not to_unlock:
                continue

            for r in ds.repo.unlock([ap['path'] for ap in to_unlock]):
                yield get_status_dict(
                    path=r, status='ok', type='file', **res_kwargs)
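A brief usage sketch (paths are illustrative; file content must be present locally to be unlockable):

import datalad.api as dl

# make a single annexed file editable
dl.unlock(path='data/table.csv', dataset='.')

# unlock everything under a directory, recursing into subdatasets
dl.unlock(path='derivatives', dataset='.', recursive=True)
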
Example No. 28
class Init(Interface):
    """Initialize an existing dataset to track a UKBiobank participant

    A batch file for the 'ukbfetch' tool will be generated and placed into the
    dataset. By selecting the relevant data records, raw and/or preprocessed
    data will be tracked.

    After initialization the dataset will contain at least three branches:

    - 'incoming': to track the pristine ZIP files downloaded from UKB
    - 'incoming-native': to track individual files (some extracted from ZIP
      files)
    - 'incoming-bids': to track individual files in a layout where file names
      conform to BIDS conventions (only if enabled)
    - main branch: based off of incoming-native or incoming-bids (if enabled)
      with potential manual modifications applied
    """

    _examples_ = [
        dict(
            text='Initialize a dataset in the current directory',
            code_cmd='datalad ukb-init 5874415 20227_2_0 20249_2_0',
            code_py=
            'ukb_init(participant="5874415", records=["20227_2_0", "20249_2_0"])'
        ),
        dict(
            text='Initialize a dataset in the current directory in BIDS layout',
            code_cmd='datalad ukb-init --bids 5874415 20227_2_0',
            code_py=
            'ukb_init(participant="5874415", records=["20227_2_0"], bids=True)'
        ),
    ]

    _params_ = dict(
        dataset=Parameter(
            args=("-d", "--dataset"),
            metavar='DATASET',
            doc="""specify the dataset to perform the initialization on""",
            constraints=EnsureDataset() | EnsureNone()),
        participant=Parameter(
            args=('participant', ),
            metavar='PARTICIPANT-ID',
            nargs=1,
            doc="""UKBiobank participant ID to use for this dataset
            (note: these encoded IDs are unique to each
            application/project)""",
            constraints=EnsureStr()),
        records=Parameter(args=('records', ),
                          metavar='DATARECORD-ID',
                          nargs='+',
                          doc='One or more data record identifiers',
                          constraints=EnsureStr()),
        force=Parameter(args=(
            "-f",
            "--force",
        ),
                        doc="""force (re-)initialization""",
                        action='store_true'),
        bids=Parameter(
            args=('--bids', ),
            action='store_true',
            doc="""additionally maintain an incoming-bids branch with a
            BIDS-like organization."""),
    )

    @staticmethod
    @datasetmethod(name='ukb_init')
    @eval_results
    def __call__(participant, records, force=False, bids=False, dataset=None):
        ds = require_dataset(dataset,
                             check_installed=True,
                             purpose='initialization')

        participant = ensure_list(participant)[0]
        records = ensure_list(records)

        repo = ds.repo
        main_branch = repo.get_active_branch()
        # test for awareness of incoming branches
        incoming_branches = [
            b for b in ('incoming', 'incoming-native', 'incoming-bids')
            if b in repo.get_branches() or any(
                u.endswith('/{}'.format(b))
                for u in repo.get_remote_branches())
        ]

        # prep for yield
        res = dict(
            action='ukb_init',
            path=ds.path,
            type='dataset',
            logger=lgr,
            refds=ds.path,
        )

        if 'incoming' in incoming_branches and not force:
            yield dict(
                res,
                status='error',
                message='Dataset found already initialized, '
                'use `force` to reinitialize',
            )
            return
        if 'incoming' not in incoming_branches:
            # establish "incoming" branch that will hold pristine UKB downloads
            repo.call_git(['checkout', '--orphan', 'incoming'])
        else:
            repo.call_git(['checkout', 'incoming'])

        # place batch file with download config for ukbfetch in it
        batchfile = repo.pathobj / '.ukbbatch'
        batchfile.write_text('{}\n'.format('\n'.join(
            '{} {}'.format(participant, rec) for rec in records)))
        # inherit the standard attributes to ensure uniform behavior
        # across branches
        (repo.pathobj / '.gitattributes').write_text(
            repo.call_git(
                ['cat-file', '-p', '{}:.gitattributes'.format(main_branch)]))
        # save to incoming branch, provide path to avoid adding untracked
        # content
        ds.save(
            path=['.ukbbatch', '.gitattributes'],
            to_git=True,
            message="Configure UKB data fetch",
            result_renderer=None,
        )
        # establish rest of the branch structure: "incoming-native"
        # for extracted archive content
        _add_incoming_branch('incoming-native', incoming_branches, repo,
                             batchfile)
        if bids:
            _add_incoming_branch('incoming-bids', incoming_branches, repo,
                                 batchfile)
        # force merge unrelated histories into main branch
        # we are using an orphan branch such that we know that
        # `git ls-tree incoming`
        # will only report download-related content, nothing extracted or
        # manually modified
        repo.call_git(['checkout', main_branch])
        repo.call_git([
            'merge',
            '-m',
            'Merge incoming',
            '--allow-unrelated-histories',
            'incoming-bids' if bids else 'incoming-native',
        ])

        yield dict(
            res,
            status='ok',
            participant=participant,
            records=records,
        )
        return
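A Python-API counterpart of the examples above (assuming the datalad-ukbiobank extension is installed); the shape of the generated batch file follows directly from the code:

import datalad.api as dl

# initialize the current dataset for one participant and two data records,
# additionally maintaining an incoming-bids branch
dl.ukb_init(
    participant='5874415',
    records=['20227_2_0', '20249_2_0'],
    bids=True,
)
# the '.ukbbatch' file on the 'incoming' branch then contains one
# "<participant> <record>" line per data record:
#   5874415 20227_2_0
#   5874415 20249_2_0
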
Example No. 29
class Dump(Interface):
    """Query a dataset's aggregated metadata for dataset and file metadata

    Two types of metadata are supported:

    1. metadata describing a dataset as a whole (dataset-global metadata), and

    2. metadata for files in a dataset (content metadata).

    Both types can be queried with this command, and a specific type is
    requested via the `--reporton` argument.

    Examples:

      Dump the metadata of a single file, the queried dataset is determined
      based on the current working directory::

        % datalad meta-dump somedir/subdir/thisfile.dat

      Sometimes it is helpful to get metadata records formatted in a more
      accessible form, here as pretty-printed JSON::

        % datalad -f json_pp meta-dump somedir/subdir/thisfile.dat

      Same query as above, but specify which dataset to query (must be
      containing the query path)::

        % datalad meta-dump -d . somedir/subdir/thisfile.dat

      Dump any metadata record of any dataset known to the queried dataset::

        % datalad meta-dump --recursive --reporton datasets

      Get a JSON-formatted report of metadata aggregates in a dataset, incl.
      information on enabled metadata extractors, dataset versions, dataset
      IDs, and dataset paths::

        % datalad -f json meta-dump --reporton aggregates
    """
    # make the custom renderer the default, path reporting isn't the top
    # priority here
    result_renderer = 'tailored'

    _params_ = dict(
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc="""dataset to query. If not given, a dataset will be determined
            based on the current working directory.""",
            constraints=EnsureDataset() | EnsureNone()),
        path=Parameter(args=("path", ),
                       metavar="PATH",
                       doc="path(s) to query metadata for",
                       nargs="*",
                       constraints=EnsureStr() | EnsureNone()),
        reporton=Parameter(
            args=('--reporton', ),
            constraints=EnsureChoice('all', 'jsonld', 'datasets', 'files',
                                     'aggregates'),
            doc="""what type of metadata to report on: dataset-global
            metadata only ('datasets'), metadata on dataset content/files only
            ('files'), both ('all', default). 'jsonld' is an alternative mode
            to report all available metadata with JSON-LD markup. A single
            metadata result with the entire metadata graph matching the query
            will be reported, all non-JSON-LD-type metadata will be ignored.
            There is an auxiliary category 'aggregates' that reports on which
            metadata aggregates are present in the queried dataset."""),
        recursive=Parameter(
            args=(
                "-r",
                "--recursive",
            ),
            action="store_true",
            doc="""if set, recursively report on any matching metadata based
            on given paths or reference dataset. Note, setting this option
            does not cause any recursion into potential subdatasets on the
            filesystem. It merely determines what metadata is being reported
            from the given/discovered reference dataset."""),
    )

    @staticmethod
    @datasetmethod(name='meta_dump')
    @eval_results
    def __call__(path=None, dataset=None, reporton='all', recursive=False):
        # prep results
        res_kwargs = dict(action='meta_dump', logger=lgr)
        ds = require_dataset(dataset=dataset,
                             check_installed=True,
                             purpose='aggregate metadata query')
        if dataset:
            res_kwargs['refds'] = ds.path

        agginfos = get_ds_aggregate_db(
            ds.pathobj,
            version=str(aggregate_layout_version),
            # we are handling errors below
            warn_absent=False,
        )
        if not agginfos:
            # if an aggregation had ever been run, this file would exist;
            # since it does not, no aggregation has happened yet and we
            # need to tell people
            yield get_status_dict(
                ds=ds,
                status='impossible',
                message='metadata aggregation has never been performed in '
                'this dataset',
                **res_kwargs)
            return

        if not path:
            # implement https://github.com/datalad/datalad/issues/3282
            path = ds.pathobj if isinstance(dataset, Dataset) else os.getcwd()

        # check for paths that are not underneath this dataset
        resolved_paths = set()
        for p in assure_list(path):
            p = resolve_path(p, dataset)
            if p != ds.pathobj and ds.pathobj not in p.parents:
                raise ValueError(
                    'given path {} is not underneath dataset {}'.format(p, ds))
            resolved_paths.add(p)

        # sort paths into their containing dataset aggregate records
        paths_by_ds = {}
        while resolved_paths:
            resolved_path = resolved_paths.pop()
            # find the first dataset that matches
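            # (the reverse sort puts the deepest paths first, so the first
            # hit is the most specific dataset containing the query path)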
            for aggdspath in sorted(agginfos, reverse=True):
                if recursive and resolved_path in aggdspath.parents:
                    ps = paths_by_ds.get(aggdspath, set())
                    ps.add(aggdspath)
                    paths_by_ds[aggdspath] = ps
                elif aggdspath == resolved_path \
                        or aggdspath in resolved_path.parents:
                    ps = paths_by_ds.get(aggdspath, set())
                    ps.add(resolved_path)
                    paths_by_ds[aggdspath] = ps
                    # stop when the containing dataset is found
                    break

        # which files do we need to have locally to perform the query
        info_keys = \
            ('dataset_info', 'content_info') \
            if reporton in ('all', 'jsonld') else \
            ('dataset_info',) if reporton == 'datasets' else \
            ('content_info',) if reporton == 'files' else \
            []
        objfiles = [
            text_type(agginfos[d][t]) for d in paths_by_ds for t in info_keys
            if t in agginfos[d]
        ]
        lgr.debug(
            'Verifying/achieving local availability of %i metadata objects',
            len(objfiles))
        if objfiles:
            for r in ds.get(path=objfiles,
                            result_renderer='disabled',
                            return_type='generator'):
                # report only if not a success, as this is an internal
                # operation that a user would not (need to) expect
                if success_status_map.get(
                        r['status'], False) != 'success':  # pragma: no cover
                    yield r

        contexts = {}
        nodes_by_context = {}
        parentds = []
        # loop over all records to get complete parentds relationships
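        # (a stack of ancestor dataset paths is maintained; the sorted
        # traversal guarantees a parent is visited before its subdatasets)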
        for aggdspath in sorted(agginfos):
            while parentds and parentds[-1] not in aggdspath.parents:
                parentds.pop()
            if aggdspath not in paths_by_ds:
                # nothing to say about this
                parentds.append(aggdspath)
                continue
            agg_record = agginfos[aggdspath]
            if reporton == 'aggregates':
                # we do not need to loop over the actual query paths, as
                # the aggregates of the containing dataset will contain
                # the desired info, if any exists

                # convert pathobj to str before emitting, until we become
                # more clever
                info = {
                    k: text_type(v) if isinstance(v, ut.PurePath) else v
                    for k, v in iteritems(agg_record)
                }
                info.update(
                    path=text_type(aggdspath),
                    type='dataset',
                )
                if aggdspath == ds.pathobj:
                    info['layout_version'] = aggregate_layout_version
                if parentds:
                    info['parentds'] = text_type(parentds[-1])
                yield dict(info, status='ok', **res_kwargs)
                parentds.append(aggdspath)
                continue

            # pull out actual metadata records
            for res in _yield_metadata_records(
                    aggdspath,
                    agg_record,
                    paths_by_ds[aggdspath],
                    reporton,
                    parentds=parentds[-1] if parentds else None):
                if reporton != 'jsonld':
                    yield dict(res, **res_kwargs)
                    continue
                collect_jsonld_metadata(aggdspath, res, nodes_by_context,
                                        contexts)

            parentds.append(aggdspath)
        if reporton == 'jsonld':
            yield dict(status='ok',
                       type='dataset',
                       path=ds.path,
                       metadata=format_jsonld_metadata(nodes_by_context),
                       refcommit=agginfos[ds.pathobj]['refcommit'],
                       **res_kwargs)

    @staticmethod
    def custom_result_renderer(res, **kwargs):
        if res['status'] != 'ok' or not res.get('action', None) == 'meta_dump':
            # logging complained about this already
            return
        if kwargs.get('reporton', None) == 'jsonld':
            # special case of a JSON-LD report request
            # all reports are consolidated into a single
            # graph, dumps just that (no pretty printing, can
            # be done outside)
            ui.message(
                jsondumps(
                    res['metadata'],
                    # support utf-8 output
                    ensure_ascii=False,
                    # this cannot happen, spare the checks
                    check_circular=False,
                    # this means the output is not necessarily JSON compliant,
                    # but it at least contains all info that went in, and is
                    # usable for JavaScript consumers
                    allow_nan=True,
                ))
            return
        # list the path, available metadata keys, and tags
        path = op.relpath(res['path'], res['refds']) if res.get(
            'refds', None) else res['path']
        meta = res.get('metadata', {})
        ui.message('{path}{type}:{spacer}{meta}{tags}'.format(
            path=ac.color_word(path, ac.BOLD),
            type=' ({})'.format(ac.color_word(res['type'], ac.MAGENTA))
            if 'type' in res else '',
            spacer=' ' if len([m for m in meta if m != 'tag']) else '',
            meta=','.join(k for k in sorted(meta.keys())
                          if k not in ('tag', '@context', '@id'))
            if meta else ' -' if 'metadata' in res else ' {}'.format(','.join(
                e for e in res['extractors']
                if e not in ('datalad_core', 'metalad_core',
                             'metalad_annex'))) if 'extractors' in res else '',
            tags='' if 'tag' not in meta else ' [{}]'.format(','.join(
                assure_list(meta['tag'])))))
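
Not part of the original example: the snippet below is a minimal sketch of the equivalent Python API usage for the command above. The dataset path 'myds' is a placeholder, and it assumes that the module providing meta-dump is installed and that metadata has already been aggregated in the dataset.

from datalad.api import Dataset

ds = Dataset('myds')  # hypothetical, already-installed dataset
# dataset-level metadata for every dataset known to the aggregate database;
# use reporton='files' (or 'all') to also receive per-file records
for res in ds.meta_dump(reporton='datasets', recursive=True,
                        return_type='generator'):
    if res['status'] != 'ok':
        continue
    print(res['path'], sorted(res.get('metadata', {}).keys()))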
Example No. 30
class Subdatasets(Interface):
    r"""Report subdatasets and their properties.

    The following properties are reported (if possible) for each matching
    subdataset record.

    "name"
        Name of the subdataset in the parent (often identical with the
        relative path in the parent dataset)

    "path"
        Absolute path to the subdataset

    "parentds"
        Absolute path to the parent dataset

    "gitshasum"
        SHA1 of the subdataset commit recorded in the parent dataset

    "state"
        Condition of the subdataset: 'clean', 'modified', 'absent', 'conflict'
        as reported by `git submodule`

    "gitmodule_url"
        URL of the subdataset recorded in the parent

    "gitmodule_name"
        Name of the subdataset recorded in the parent

    "gitmodule_<label>"
        Any additional configuration property on record.

    Performance note: Property modification, requesting `bottomup` reporting
    order, or a particular numerical `recursion_limit` implies an internal
    switch to an alternative implementation for recursive queries that is
    more flexible, but also notably slower (it performs one call to Git per
    dataset versus a single call for all datasets combined).

    The following properties for subdatasets are recognized by DataLad
    (without the 'gitmodule\_' prefix that is used in the query results):

    "datalad-recursiveinstall"
        If set to 'skip', the respective subdataset is skipped when DataLad
        is recursively installing its superdataset. However, the subdataset
        remains installable when explicitly requested, and no other features
        are impaired.
    """
    _params_ = dict(
        dataset=Parameter(args=("-d", "--dataset"),
                          doc="""specify the dataset to query.  If
            no dataset is given, an attempt is made to identify the dataset
            based on the input and/or the current working directory""",
                          constraints=EnsureDataset() | EnsureNone()),
        path=Parameter(
            args=("path", ),
            metavar='PATH',
            doc="""path/name to query for subdatasets. Defaults to the
            current directory[PY: , or the entire dataset if called as
            a dataset method PY].""",
            nargs='*',
            constraints=EnsureStr() | EnsureNone()),
        fulfilled=Parameter(
            args=("--fulfilled", ),
            doc="""if given, must be a boolean flag indicating whether
            to report either only locally present or absent datasets.
            By default subdatasets are reported regardless of their
            status""",
            constraints=EnsureBool() | EnsureNone()),
        recursive=recursion_flag,
        recursion_limit=recursion_limit,
        contains=Parameter(
            args=('--contains', ),
            metavar='PATH',
            action='append',
            doc="""limit report to the subdatasets containing the
            given path. If a root path of a subdataset is given, the last
            reported dataset will be the subdataset itself.[CMD:  This
            option can be given multiple times CMD][PY:  Can be a list with
            multiple paths PY], in which case datasets will be reported that
            contain any of the given paths.""",
            constraints=EnsureStr() | EnsureNone()),
        bottomup=Parameter(
            args=("--bottomup", ),
            action="store_true",
            doc="""whether to report subdatasets in bottom-up order along
            each branch in the dataset tree, and not top-down."""),
        set_property=Parameter(
            args=('--set-property', ),
            metavar=('NAME', 'VALUE'),
            nargs=2,
            action='append',
            doc="""Name and value of one or more subdataset properties to
            be set in the parent dataset's .gitmodules file. The property name
            is case-insensitive, must start with a letter, and consist only
            of alphanumeric characters and dashes. The value can be
            a Python format() template string wrapped in '<>' (e.g.
            '<{gitmodule_name}>').
            Supported keywords are any item reported in the result properties
            of this command, plus 'refds_relpath' (the relative path of a
            subdataset with respect to the base dataset of the command call)
            and 'refds_relname' (the same string with all directory separators
            replaced by dashes).[CMD:  This
            option can be given multiple times. CMD]""",
            constraints=EnsureStr() | EnsureNone()),
        delete_property=Parameter(
            args=('--delete-property', ),
            metavar='NAME',
            action='append',
            doc="""Name of one or more subdataset properties to be removed
            from the parent dataset's .gitmodules file.[CMD:  This
            option can be given multiple times. CMD]""",
            constraints=EnsureStr() | EnsureNone()))

    @staticmethod
    @datasetmethod(name='subdatasets')
    @eval_results
    def __call__(path=None,
                 dataset=None,
                 fulfilled=None,
                 recursive=False,
                 recursion_limit=None,
                 contains=None,
                 bottomup=False,
                 set_property=None,
                 delete_property=None):
        # no constraints given -> query subdatasets under curdir
        if not path and dataset is None:
            path = os.curdir
        paths = [resolve_path(p, dataset) for p in assure_list(path)] \
            if path else None

        ds = require_dataset(dataset,
                             check_installed=True,
                             purpose='subdataset reporting/modification')
        lgr.debug('Query subdatasets of %s', dataset)
        if paths is not None:
            lgr.debug('Query subdatasets underneath paths: %s', paths)
        refds_path = ds.path

        # return as quickly as possible
        if isinstance(recursion_limit, int) and (recursion_limit <= 0):
            return

        if set_property:
            for k, v in set_property:
                if valid_key.match(k) is None:
                    raise ValueError(
                        "key '%s' is invalid (alphanumeric plus '-' only, must "
                        "start with a letter)" % k)
        if contains:
            contains = [
                resolve_path(c, dataset) for c in assure_list(contains)
            ]
        contains_hits = set()
        for r in _get_submodules(ds, paths, fulfilled, recursive,
                                 recursion_limit, contains, bottomup,
                                 set_property, delete_property, refds_path):
            # a boat-load of ancient code consumes this and is ignorant of
            # Path objects
            r['path'] = str(r['path'])
            # without the refds_path, results cannot be rendered/converted
            # to relative paths in the eval_results decorator
            r['refds'] = refds_path
            if 'contains' in r:
                contains_hits.update(r['contains'])
                r['contains'] = [str(c) for c in r['contains']]
            yield r
        if contains:
            for c in set(contains).difference(contains_hits):
                yield get_status_dict(
                    'subdataset',
                    path=str(c),
                    status='impossible',
                    message='path not contained in any matching subdataset',
                    # we do not want to log such an event, because it is a
                    # legit query to check for matching subdatasets simply
                    # for the purpose of further decision making
                    # user communication in front-end scenarios will happen
                    # via result rendering
                    #logger=lgr
                )
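
Not part of the original example: a minimal Python API usage sketch for the command above. The dataset path 'myds' and the 'myorg-label' property name are hypothetical, and it assumes an installed superdataset with registered subdatasets.

from datalad.api import Dataset

superds = Dataset('myds')  # hypothetical, already-installed superdataset
# list all locally present subdatasets, recursively
for sub in superds.subdatasets(fulfilled=True, recursive=True,
                               return_type='generator'):
    print(sub.get('gitshasum', 'n/a')[:7], sub['path'])

# record a custom property for every subdataset in .gitmodules, using a
# '<>'-wrapped format() template as described for --set-property above
superds.subdatasets(set_property=[('myorg-label', '<{refds_relname}>')])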