Example 1
class TestUtils(Interface):
    """TestUtil's fake command"""

    _params_ = dict(
        number=Parameter(
            args=("-n", "--number"),
            doc="""It's a number""",
            constraints=EnsureStr() | EnsureNone()),
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc="""specify the dataset to update.  If
            no dataset is given, an attempt is made to identify the dataset
            based on the input and/or the current working directory""",
            constraints=EnsureDataset() | EnsureNone()),
    )

    @staticmethod
    @datasetmethod(name='fake_command')
    @eval_results
    def __call__(number, dataset=None):

        for i in range(number):
            # this dict will need to have the minimum info required by
            # eval_results
            yield {
                'path': 'some',
                'status': 'ok',
                'somekey': i,
                'action': 'off'
            }
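
The decorated __call__ above is a generator of result records; the inline comment notes that each dict needs the minimum info required by eval_results. A minimal, self-contained sketch of how such a generator is consumed (plain Python only; fake_results is a hypothetical stand-in for the decorated command, no DataLad machinery involved):

def fake_results(number):
    # mirror the minimum record layout noted in the example above
    for i in range(number):
        yield {'path': 'some', 'status': 'ok', 'somekey': i, 'action': 'off'}

for res in fake_results(3):
    # eval_results would route each record through renderers and filters;
    # here we just print the fields
    print(res['action'], res['status'], res['somekey'])
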
Example 2
class ContainersList(Interface):
    # first docstring line is used as a short description in the cmdline help
    # the rest is put in the verbose help and manpage
    """List containers known to a dataset
    """

    # parameters of the command, must be exhaustive
    _params_ = dict(
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc="""specify the dataset to query. If no dataset is given, an
            attempt is made to identify the dataset based on the current
            working directory""",
            constraints=EnsureDataset() | EnsureNone()),
    )

    @staticmethod
    @datasetmethod(name='containers_list')
    @eval_results
    def __call__(dataset=None):

        ds = require_dataset(dataset,
                             check_installed=True,
                             purpose='list containers')

        loc_cfg_var = "datalad.containers.location"

        # TODO: We should provide an entry point (or sth similar) for extensions
        # to get config definitions into the ConfigManager. In other words an
        # easy way to extend definitions in datalad's common_cfgs.py.
        container_loc = \
            ds.config.obtain(loc_cfg_var,
                             where=definitions[loc_cfg_var]['destination'],
                             store=True,
                             default=definitions[loc_cfg_var]['default'],
                             dialog_type=definitions[loc_cfg_var]['ui'][0],
                             valtype=definitions[loc_cfg_var]['type'],
                             **definitions[loc_cfg_var]['ui'][1]
                             )

        from six import PY3

        try:
            location_content = listdir(op.join(ds.path, container_loc))
        except FileNotFoundError if PY3 else (OSError, IOError) as e:
            # TODO: Right now, just return nothing, since there is nothing.
            # But it may also be an "impossible" result, since the configured
            # common mountpoint doesn't exist (in PY2 this additionally needs
            # an "e.errno == errno.ENOENT" check)
            return

        for r in [n for n in location_content if not n.startswith(".")]:
            yield {
                'status': 'ok',
                'action': 'containers_list',
                'path': op.join(ds.path, container_loc, r),
                # TODO: Might be an image file or a dataset.
                # Use AnnotatePath with container_loc?
                'type': 'file',
                'name': r,
            }
Example 3
class RunProcedure(Interface):
    """
    DO stuff
    datalad.locations.dataset-procedures
    datalad.locations.user-procedures
    datalad.locations.system-procedures
    """
    _params_ = dict(
        spec=Parameter(args=("spec", ),
                       metavar='NAME [ARGS]',
                       nargs=REMAINDER,
                       doc=""),
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc="""specify the dataset to record the command results in.
            An attempt is made to identify the dataset based on the current
            working directory. If a dataset is given, the command will be
            executed in the root directory of this dataset.""",
            constraints=EnsureDataset() | EnsureNone()),
    )

    @staticmethod
    @datasetmethod(name='run_procedure')
    @eval_results
    def __call__(spec, dataset=None):
        if not isinstance(spec, (tuple, list)):
            # maybe coming from config
            import shlex
            spec = shlex.split(spec)
        name = spec[0]
        args = spec[1:]
        procedure_file = _get_procedure_implementation(name, ds=dataset)
        if not procedure_file:
            # TODO error result
            raise ValueError("Cannot find procedure with name '%s'", name)

        ds = require_dataset(dataset,
                             check_installed=False,
                             purpose='run a procedure') if dataset else None

        cmd_tmpl = _guess_exec(procedure_file)
        cmd = cmd_tmpl.format(script=procedure_file,
                              ds=ds.path if ds else '',
                              args=u' '.join(u'"{}"'.format(a)
                                             for a in args) if args else '')
        lgr.debug('Attempt to run procedure {} as: {}'.format(name, cmd))
        for r in Run.__call__(
                cmd=cmd,
                dataset=ds,
                # See gh-2593 for discussion on run feature extension
                #explicit=True,
                #inputs=None,
                #outputs=None,
                # pass through here
                on_failure='ignore',
        ):
            yield r
Example 4
class ContainersList(Interface):
    # first docstring line is used as a short description in the cmdline help
    # the rest is put in the verbose help and manpage
    """List containers known to a dataset
    """

    # parameters of the command, must be exhaustive
    _params_ = dict(
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc="""specify the dataset to query. If no dataset is given, an
            attempt is made to identify the dataset based on the current
            working directory""",
            constraints=EnsureDataset() | EnsureNone()),
    )

    @staticmethod
    @datasetmethod(name='containers_list')
    @eval_results
    def __call__(dataset=None):
        ds = require_dataset(dataset, check_installed=True,
                             purpose='list containers')

        # all info is in the dataset config!
        var_prefix = 'datalad.containers.'
        containers = {}
        for var, value in ds.config.items():
            if not var.startswith(var_prefix):
                # not an interesting variable
                continue
            var_comps = var[len(var_prefix):].split('.')
            cname = var_comps[0]
            ccfgname = '.'.join(var_comps[1:])
            if not ccfgname:
                continue

            cinfo = containers.get(cname, {})
            cinfo[ccfgname] = value

            containers[cname] = cinfo

        for k, v in containers.items():
            if 'image' not in v:
                # there is no container location configured
                continue
            res = get_status_dict(
                status='ok',
                action='containers',
                name=k,
                type='file',
                path=op.join(ds.path, v.pop('image')),
                # TODO
                #state='absent' if ... else 'present'
                **v)
            yield res
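
The key-grouping loop above can be exercised in isolation: flat 'datalad.containers.<name>.<option>' entries are folded into one dict per container name. A runnable sketch with hypothetical config values (none of these paths or commands are taken from a real dataset):

prefix = 'datalad.containers.'
cfg = {
    'datalad.containers.funky.image': '.datalad/environments/funky.img',
    'datalad.containers.funky.cmdexec': 'singularity exec {img} {cmd}',
    'datalad.containers.broken': 'skipped below (no per-container option part)',
}
containers = {}
for var, value in cfg.items():
    if not var.startswith(prefix):
        continue
    # 'datalad.containers.<name>.<option>' -> containers[<name>][<option>]
    name, _, option = var[len(prefix):].partition('.')
    if not option:
        continue
    containers.setdefault(name, {})[option] = value
print(containers)
# -> {'funky': {'image': '.datalad/environments/funky.img', 'cmdexec': 'singularity exec {img} {cmd}'}}
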
Example 5
class Run(Interface):
    """Run an arbitrary command and record its impact on a dataset.

    It is recommended to craft the command such that it can run in the root
    directory of the dataset that the command will be recorded in. However,
    as long as the command is executed somewhere underneath the dataset root,
    the exact location will be recorded relative to the dataset root.

    If the executed command did not alter the dataset in any way, no record of
    the command execution is made.

    If the given command errors, a `CommandError` exception with the same exit
    code will be raised, and no modifications will be saved.
    """
    _params_ = dict(
        cmd=Parameter(args=("cmd", ),
                      nargs=REMAINDER,
                      metavar='SHELL COMMAND',
                      doc="command for execution"),
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc="""specify the dataset to record the command results in.
            An attempt is made to identify the dataset based on the current
            working directory. If a dataset is given, the command will be
            executed in the root directory of this dataset.""",
            constraints=EnsureDataset() | EnsureNone()),
        message=save_message_opt,
        rerun=Parameter(
            args=('--rerun', ),
            action='store_true',
            doc="""re-run the command recorded in the last saved change (if any).
            Note: This option is deprecated since version 0.9.2 and
            will be removed in a later release. Use `datalad rerun`
            instead."""),
    )

    @staticmethod
    @datasetmethod(name='run')
    @eval_results
    def __call__(cmd=None, dataset=None, message=None, rerun=False):
        if rerun:
            if cmd:
                lgr.warning("Ignoring provided command in --rerun mode")
            lgr.warning(
                "The --rerun option is deprecated since version 0.9.2. "
                "Use `datalad rerun` instead.")
            from datalad.interface.rerun import Rerun
            for r in Rerun.__call__(dataset=dataset, message=message):
                yield r
        else:
            if cmd:
                for r in run_command(cmd, dataset, message):
                    yield r
            else:
                lgr.warning("No command given")
Example 6
class TestUtils(Interface):
    """TestUtil's fake command"""

    result_renderer = 'tailored'  # overrides None default
    return_type = 'item-or-list'  # overrides 'list'

    _params_ = dict(
        number=Parameter(
            args=("-n", "--number"),
            doc="""It's a number""",
            constraints=EnsureStr() | EnsureNone()),
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc="""specify the dataset to update.  If
            no dataset is given, an attempt is made to identify the dataset
            based on the input and/or the current working directory""",
            constraints=EnsureDataset() | EnsureNone()),
        result_fn=Parameter(
            args=tuple(),  # Hide this from the cmdline parser.
            doc="""Generate the result records with this function
            rather than using the default logic. `number` will be
            passed as an argument."""),
    )

    @staticmethod
    @datasetmethod(name='fake_command')
    @eval_results
    def __call__(number, dataset=None, result_fn=None):
        if result_fn:
            yield from result_fn(number)
        else:
            for i in range(number):
                # this dict will need to have the minimum info
                # required by eval_results
                yield {
                    'path': 'some',
                    'status': 'ok',
                    'somekey': i,
                    'action': 'off'
                }
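
result_fn is a plain Python hook: any callable taking `number` and yielding record dicts can be injected. A self-contained sketch (my_results is a hypothetical replacement generator):

def my_results(number):
    # custom records still need the minimum keys expected by eval_results
    for i in range(number):
        yield {'path': 'custom', 'status': 'ok', 'somekey': i * 10, 'action': 'off'}

for rec in my_results(2):
    print(rec)

# with the class above in scope, the injected variant would be invoked as:
#   TestUtils.__call__(3, result_fn=my_results)
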
Example 7
class ContainersRemove(Interface):
    # first docstring line is used as a short description in the cmdline help
    # the rest is put in the verbose help and manpage
    """Remove a known container from a dataset
    """

    # parameters of the command, must be exhaustive
    _params_ = dict(
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc="""specify the dataset to query. If no dataset is given, an
            attempt is made to identify the dataset based on the current
            working directory""",
            constraints=EnsureDataset() | EnsureNone()),
        name=Parameter(
            args=("name", ),
            doc="""name of the container to remove""",
            metavar="NAME",
            constraints=EnsureStr(),
        ),
        remove_image=Parameter(
            args=(
                "-i",
                "--remove-image",
            ),
            doc="""if set, remove container image as well""",
            action="store_true",
        ),
    )

    @staticmethod
    @datasetmethod(name='containers_remove')
    @eval_results
    def __call__(name, dataset=None, remove_image=False):
        ds = require_dataset(dataset,
                             check_installed=True,
                             purpose='remove a container')

        res = get_status_dict(ds=ds, action='containers_remove', logger=lgr)

        section = 'datalad.containers.{}'.format(name)
        imagecfg = '{}.image'.format(section)

        to_save = []
        if remove_image and imagecfg in ds.config:
            imagepath = ds.config.get(imagecfg)
            if op.lexists(op.join(ds.path, imagepath)):
                for r in ds.remove(
                        path=imagepath,
                        # XXX shortcoming: this is the only way to say:
                        # don't drop
                        check=False,
                        # config setting might be outdated and image no longer
                        # there -> no reason to fail, just report
                        on_failure='ignore',
                        save=False):
                    yield r
                to_save.append(imagepath)

        if section in ds.config.sections():
            ds.config.remove_section(section, where='dataset', reload=True)
            res['status'] = 'ok'
            to_save.append(op.join('.datalad', 'config'))
        else:
            res['status'] = 'notneeded'
        if to_save:
            for r in ds.save(
                    path=to_save,
                    message='[DATALAD] Remove container {}'.format(name)):
                yield r
        yield res
Example 8
class Search(Interface):
    """Search dataset metadata

    DataLad can search metadata extracted from a dataset and/or aggregated into
    a superdataset (see the `aggregate-metadata` command). This makes it
    possible to discover datasets, or individual files in a dataset even when
    they are not available locally.

    Ultimately DataLad metadata are a graph of linked data structures. However,
    this command does not (yet) support queries that can exploit all
    information stored in the metadata. At the moment the following search
    modes are implemented that represent different trade-offs between the
    expressiveness of a query and the computational and storage resources
    required to execute a query.

    - egrep (default)

    - egrepcs [case-sensitive egrep]

    - textblob

    - autofield

    An alternative default mode can be configured by tuning the
    configuration variable 'datalad.search.default-mode'::

      [datalad "search"]
        default-mode = egrepcs

    Each search mode has its own default configuration for what kind of
    documents to query. The respective default can be changed via configuration
    variables::

      [datalad "search"]
        index-<mode_name>-documenttype = (all|datasets|files)


    *Mode: egrep/egrepcs*

    These search modes are largely ignorant of the metadata structure, and
    simply perform matching of a search pattern against a flat
    string-representation of metadata. This is advantageous when the query is
    simple and the metadata structure is irrelevant, or precisely known.
    Moreover, it does not require a search index, hence results can be reported
    without an initial latency for building a search index when the underlying
    metadata has changed (e.g. due to a dataset update). By default, these
    search modes only consider datasets and do not investigate records for
    individual files for speed reasons. Search results are reported in the
    order in which they were discovered.

    Queries can make use of Python regular expression syntax
    (https://docs.python.org/3/library/re.html). In `egrep` mode, matching is
    case-insensitive when the query does not contain upper case characters, but
    is case-sensitive when it does. In `egrepcs` mode, matching is always
    case-sensitive. Expressions will match anywhere in a metadata string, not
    only at the start.

    When multiple queries are given, all queries have to match for a search hit
    (AND behavior).

    It is possible to search individual metadata key/value items by prefixing
    the query with a metadata key name, separated by a colon (':'). The key
    name can also be a regular expression to match multiple keys. A query match
    happens when any value of an item with a matching key name matches the query
    (OR behavior). See examples for more information.

    Examples:

      Query for (what happens to be) an author::

        % datalad search haxby

      Queries are case-INsensitive when the query contains no upper case characters,
      and can be regular expressions. Use `egrepcs` mode when it is desired
      to perform a case-sensitive lowercase match::

        % datalad search --mode egrepcs halchenko.*haxby

      This search mode performs NO analysis of the metadata content.  Therefore
      queries can easily fail to match. For example, the above query implicitly
      assumes that authors are listed in alphabetical order.  If that is the
      case (which may or may not be true), the following query would yield NO
      hits::

        % datalad search Haxby.*Halchenko

      The ``textblob`` search mode represents an alternative that is more
      robust in such cases.

      For more complex queries multiple query expressions can be provided that
      all have to match to be considered a hit (AND behavior). This query
      discovers all files (non-default behavior) that match 'bids.type=T1w'
      AND 'nifti1.qform_code=scanner'::

        % datalad -c datalad.search.index-egrep-documenttype=all search bids.type:T1w nifti1.qform_code:scanner

      Key name selectors can also be expressions, which can be used to select
      multiple keys or construct "fuzzy" queries. In such cases a query matches
      when any item with a matching key matches the query (OR behavior).
      However, multiple queries are always evaluated using an AND conjunction.
      The following query extends the example above to match any files that
      have either 'nifti1.qform_code=scanner' or 'nifti1.sform_code=scanner'::

        % datalad -c datalad.search.index-egrep-documenttype=all search bids.type:T1w nifti1.(q|s)form_code:scanner

    *Mode: textblob*

    This search mode is very similar to the ``egrep`` mode, but with a few key
    differences. A search index is built from the string-representation of
    metadata records. By default, only datasets are included in this index, hence
    the indexing is usually completed within a few seconds, even for hundreds
    of datasets. This mode uses its own query language (not regular expressions)
    that is similar to other search engines. It supports logical conjunctions
    and fuzzy search terms. More information on this is available from the Whoosh
    project (search engine implementation):

      - Description of the Whoosh query language:
        http://whoosh.readthedocs.io/en/latest/querylang.html

      - Description of a number of query language customizations that are
        enabled in DataLad, such as, fuzzy term matching:
        http://whoosh.readthedocs.io/en/latest/parsing.html#common-customizations

    Importantly, search hits are scored and reported in order of descending
    relevance, hence limiting the number of search results is more meaningful
    than in the 'egrep' mode and can also reduce the query duration.

    Examples:

      Search for (what happens to be) two authors, regardless of the order in
      which those names appear in the metadata::

        % datalad search --mode textblob halchenko haxby

      Fuzzy search when you only have an approximate idea what you are looking
      for or how it is spelled::

        % datalad search --mode textblob haxbi~

      Very fuzzy search, when you are basically only confident about the first
      two characters and how it sounds approximately (or more precisely: allow
      for three edits and require matching of the first two characters)::

        % datalad search --mode textblob haksbi~3/2

      Combine fuzzy search with logical constructs::

        % datalad search --mode textblob 'haxbi~ AND (hanke OR halchenko)'


    *Mode: autofield*

    This mode is similar to the 'textblob' mode, but builds a vastly more
    detailed search index that represents individual metadata variables as
    individual fields. By default, this search index includes records for
    datasets and individual files, hence it can grow very quickly into
    a huge structure that can easily take an hour or more to build and require
    more than a GB of storage. However, limiting it to documents on datasets
    (see above) retains the enhanced expressiveness of queries while
    dramatically reducing the resource demands.

    Examples:

      List names of search index fields (auto-discovered from the set of
      indexed datasets)::

        % datalad search --mode autofield --show-keys name

      Fuzzy search for datasets with an author that is specified in a particular
      metadata field::

        % datalad search --mode autofield bids.author:haxbi~ type:dataset

      Search for individual files that carry a particular description
      prefix in their 'nifti1' metadata::

        % datalad search --mode autofield nifti1.description:FSL* type:file


    *Reporting*

    Search hits are returned as standard DataLad results. On the command line
    the '--output-format' (or '-f') option can be used to tweak results for
    further processing.

    Examples:

      Format search hits as a JSON stream (one hit per line)::

        % datalad -f json search haxby

      Custom formatting: which terms matched the query of particular
      results. Useful for investigating fuzzy search results::

        $ datalad -f '{path}: {query_matched}' search --mode autofield bids.author:haxbi~
    """
    _params_ = dict(
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc="""specify the dataset to perform the query operation on. If
            no dataset is given, an attempt is made to identify the dataset
            based on the current working directory and/or the `path` given""",
            constraints=EnsureDataset() | EnsureNone()),
        query=Parameter(
            args=("query", ),
            metavar='QUERY',
            nargs="*",
            doc="""query string, supported syntax and features depends on the
            selected search mode (see documentation)"""),
        force_reindex=Parameter(
            args=("--reindex", ),
            dest='force_reindex',
            action='store_true',
            doc="""force rebuilding the search index, even if no change in the
            dataset's state has been detected, for example, when the index
            documenttype configuration has changed."""),
        max_nresults=Parameter(
            args=("--max-nresults", ),
            doc="""maxmimum number of search results to report. Setting this
            to 0 will report all search matches. Depending on the mode this
            can search substantially slower. If not specified, a
            mode-specific default setting will be used.""",
            constraints=EnsureInt() | EnsureNone()),
        mode=Parameter(
            args=("--mode", ),
            choices=('egrep', 'textblob', 'autofield'),
            doc="""Mode of search index structure and content. See section
            SEARCH MODES for details.
            """),
        full_record=Parameter(
            args=("--full-record", '-f'),
            action='store_true',
            doc="""If set, return the full metadata record for each search hit.
            Depending on the search mode this might require additional queries.
            By default, only data that is available to the respective search modes
            is returned. This always includes essential information, such as the
            path and the type."""),
        show_keys=Parameter(
            args=('--show-keys', ),
            choices=('name', 'short', 'full'),
            default=None,
            doc="""if given, a list of known search keys is shown. If 'name' -
            only the name is printed one per line. If 'short' or 'full',
            statistics (in how many datasets, and how many unique values) are
            printed. 'short' truncates the listing of unique values.
            No other action is performed (except for reindexing), even if other
            arguments are given. Each key is accompanied by a term definition in
            parentheses (TODO). In most cases a definition is given in the form
            of a URL. If an ontology definition for a term is known, this URL
            can resolve to a webpage that provides a comprehensive definition
            of the term. However, for speed reasons term resolution is solely done
            on information contained in a local dataset's metadata, and definition
            URLs might be outdated or point to no longer existing resources."""
        ),
        show_query=Parameter(
            args=('--show-query', ),
            action='store_true',
            doc="""if given, the formal query that was generated from the given
            query string is shown, but not actually executed. This is mostly useful
            for debugging purposes."""),
    )

    @staticmethod
    @datasetmethod(name='search')
    @eval_results
    def __call__(query=None,
                 dataset=None,
                 force_reindex=False,
                 max_nresults=None,
                 mode=None,
                 full_record=False,
                 show_keys=None,
                 show_query=False):
        try:
            ds = require_dataset(dataset,
                                 check_installed=True,
                                 purpose='dataset search')
            if ds.id is None:
                raise NoDatasetArgumentFound(
                    "This does not seem to be a dataset (no DataLad dataset ID "
                    "found). 'datalad create --force %s' can initialize "
                    "this repository as a DataLad dataset" % ds.path)
        except NoDatasetArgumentFound:
            for r in _search_from_virgin_install(dataset, query):
                yield r
            return

        if mode is None:
            # let's get inspired by what the dataset/user think is
            # default
            mode = ds.config.obtain('datalad.search.default-mode')

        if mode == 'egrep':
            searcher = _EGrepSearch
        elif mode == 'egrepcs':
            searcher = _EGrepCSSearch
        elif mode == 'textblob':
            searcher = _BlobSearch
        elif mode == 'autofield':
            searcher = _AutofieldSearch
        else:
            raise ValueError('unknown search mode "{}"'.format(mode))

        searcher = searcher(ds, force_reindex=force_reindex)

        if show_keys:
            searcher.show_keys(show_keys)
            return

        if not query:
            return

        if show_query:
            print(repr(searcher.get_query(query)))
            return

        for r in searcher(query,
                          max_nresults=max_nresults,
                          full_record=full_record):
            yield r
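
A hedged usage sketch of the Python entry point (assuming the datasetmethod registration above exposes the command as datalad.api.search; 'query_matched' is the hit-detail key referenced in the docstring's formatting example):

# import datalad.api as dl
# for hit in dl.search('haxby', mode='egrep', max_nresults=10):
#     print(hit['path'], hit.get('query_matched'))
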
Example 9
class AddReadme(Interface):
    """Add basic information about DataLad datasets to a README file

    The README file is added to the dataset and the addition is saved
    in the dataset.
    """
    from datalad.support.param import Parameter
    from datalad.distribution.dataset import datasetmethod
    from datalad.interface.utils import eval_results
    from datalad.distribution.dataset import EnsureDataset
    from datalad.support.constraints import EnsureNone, EnsureStr

    _params_ = dict(
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc="""Dataset to add information to. If no dataset is given, an
            attempt is made to identify the dataset based on the current
            working directory.""",
            constraints=EnsureDataset() | EnsureNone()),
        filename=Parameter(
            args=("filename", ),
            metavar="PATH",
            nargs='?',
            doc="""Path of the README file within the dataset.""",
            constraints=EnsureStr()),
        existing=Parameter(
            args=("--existing", ),
            metavar="skip|append|replace",
            doc="""How to react if a file with the target name already exists:
            'skip': do nothing; 'append': append information to the existing
            file; 'replace': replace the existing file with new content.""",
            constraints=EnsureStr()),
    )

    @staticmethod
    @datasetmethod(name='add_readme')
    @eval_results
    def __call__(dataset, filename='README.md', existing='skip'):
        from os.path import lexists
        from os.path import join as opj
        from io import open
        import logging
        lgr = logging.getLogger('datalad.plugin.add_readme')

        from datalad.distribution.dataset import require_dataset
        from datalad.utils import assure_list

        dataset = require_dataset(dataset,
                                  check_installed=True,
                                  purpose='add README')

        filename = opj(dataset.path, filename)
        res_kwargs = dict(action='add_readme', path=filename)

        if lexists(filename) and existing == 'skip':
            yield dict(
                res_kwargs,
                status='notneeded',
                message='file already exists, and not appending content')
            return

        # unlock, file could be annexed
        if lexists(filename):
            dataset.unlock(filename)

        # get any metadata on the dataset itself
        dsinfo = dataset.metadata('.',
                                  reporton='datasets',
                                  return_type='item-or-list',
                                  on_failure='ignore')
        meta = {}
        if not isinstance(dsinfo, dict) or dsinfo.get('status', None) != 'ok':
            lgr.warn("Could not obtain dataset metadata, proceeding without")
            dsinfo = {}
        else:
            # flatten possibly existing multiple metadata sources
            for src in dsinfo['metadata']:
                if src.startswith('@'):
                    # not a source
                    continue
                meta.update(dsinfo['metadata'][src])

        metainfo = ''
        for label, content in (
            ('', meta.get('description', meta.get('shortdescription', ''))),
            ('Author{}'.format(
                's' if isinstance(meta.get('author', None), list) else ''),
             u'\n'.join([
                 u'- {}'.format(a) for a in assure_list(meta.get('author', []))
             ])),
            ('Homepage', meta.get('homepage', '')),
            ('Reference', meta.get('citation', '')),
            ('License', meta.get('license', '')),
            ('Keywords', u', '.join([
                u'`{}`'.format(k) for k in assure_list(meta.get('tag', []))
            ])),
            ('Funding', meta.get('fundedby', '')),
        ):
            if label and content:
                metainfo += u'\n\n### {}\n\n{}'.format(label, content)
            elif content:
                metainfo += u'\n\n{}'.format(content)

        for key in 'title', 'name', 'shortdescription':
            if 'title' in meta:
                break
            if key in meta:
                meta['title'] = meta[key]

        default_content = u"""\
# {title}{metainfo}

## General information

This is a DataLad dataset{id}.

For more information on DataLad and on how to work with its datasets,
see the DataLad documentation at: http://docs.datalad.org
""".format(
            title='Dataset "{}"'.format(meta['title'])
            if 'title' in meta else 'About this dataset',
            metainfo=metainfo,
            id=u' (id: {})'.format(dataset.id) if dataset.id else '',
        )

        with open(filename,
                  'a' if existing == 'append' else 'w',
                  encoding='utf-8') as fp:
            fp.write(default_content)
            yield dict(status='ok',
                       path=filename,
                       type='file',
                       action='add_readme')

        for r in dataset.rev_save(filename,
                                  message='[DATALAD] added README',
                                  result_filter=None,
                                  result_xfm=None):
            yield r
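
The section-assembly loop lends itself to standalone experimentation. A sketch with hypothetical metadata values, covering only a subset of the labels used above:

meta = {
    'description': 'A tiny example dataset',
    'author': ['A. One', 'B. Two'],
    'license': 'PDDL-1.0',
}
metainfo = ''
for label, content in (
        ('', meta.get('description', '')),
        ('Authors', u'\n'.join(u'- {}'.format(a) for a in meta.get('author', []))),
        ('License', meta.get('license', ''))):
    if label and content:
        metainfo += u'\n\n### {}\n\n{}'.format(label, content)
    elif content:
        metainfo += u'\n\n{}'.format(content)
print(metainfo)
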
Example 10
class Create(Interface):
    """Create a new dataset from scratch.

    This command initializes a new dataset at a given location, or the
    current directory. The new dataset can optionally be registered in an
    existing superdataset (the new dataset's path needs to be located
    within the superdataset for that, and the superdataset needs to be given
    explicitly via [PY: `dataset` PY][CMD: --dataset CMD]). It is recommended
    to provide a brief description to label the dataset's nature *and*
    location, e.g. "Michael's music on black laptop". This helps humans to
    identify data locations in distributed scenarios.  By default an identifier
    comprised of user and machine name, plus path, will be generated.

    This command only creates a new dataset, it does not add existing content
    to it, even if the target directory already contains additional files or
    directories.

    Plain Git repositories can be created via the [PY: `no_annex` PY][CMD: --no-annex CMD] flag.
    However, the result will not be a full dataset, and, consequently,
    not all features are supported (e.g. a description).

    || REFLOW >>
    To create a local version of a remote dataset use the
    :func:`~datalad.api.install` command instead.
    << REFLOW ||

    .. note::
      Power-user info: This command uses :command:`git init` and
      :command:`git annex init` to prepare the new dataset. Registering to a
      superdataset is performed via a :command:`git submodule add` operation
      in the discovered superdataset.
    """

    # in general this command will yield exactly one result
    return_type = 'item-or-list'
    # in general users expect to get an instance of the created dataset
    result_xfm = 'datasets'
    # result filter
    result_filter = \
        EnsureKeyChoice('action', ('create',)) & \
        EnsureKeyChoice('status', ('ok', 'notneeded'))

    _params_ = dict(
        path=Parameter(
            args=("path", ),
            nargs='?',
            metavar='PATH',
            doc="""path where the dataset shall be created, directories
            will be created as necessary. If no location is provided, a dataset
            will be created in the current working directory. Either way the
            command will error if the target directory is not empty.
            Use `force` to create a dataset in a non-empty directory.""",
            # put dataset 2nd to avoid useless conversion
            constraints=EnsureStr() | EnsureDataset() | EnsureNone()),
        initopts=Parameter(
            args=("initopts", ),
            metavar='INIT OPTIONS',
            nargs=REMAINDER,
            doc="""options to pass to :command:`git init`. [PY: Options can be
            given as a list of command line arguments or as a GitPython-style
            option dictionary PY][CMD: Any argument specified after the
            destination path of the repository will be passed to git-init
            as-is CMD]. Note that not all options will lead to viable results.
            For example '--bare' will not yield a repository where DataLad
            can adjust files in its worktree."""),
        dataset=Parameter(
            args=("-d", "--dataset"),
            metavar='DATASET',
            doc="""specify the dataset to perform the create operation on. If
            a dataset is given, a new subdataset will be created in it.""",
            constraints=EnsureDataset() | EnsureNone()),
        force=Parameter(
            args=(
                "-f",
                "--force",
            ),
            doc="""enforce creation of a dataset in a non-empty directory""",
            action='store_true'),
        description=location_description,
        no_annex=Parameter(
            args=("--no-annex", ),
            doc="""if set, a plain Git repository will be created without any
            annex""",
            action='store_true'),
        # TODO seems to only cause a config flag to be set, this could be done
        # in a procedure
        fake_dates=Parameter(
            args=('--fake-dates', ),
            action='store_true',
            doc="""Configure the repository to use fake dates. The date for a
            new commit will be set to one second later than the latest commit
            in the repository. This can be used to anonymize dates."""),
        cfg_proc=Parameter(
            args=("-c", "--cfg-proc"),
            metavar="PROC",
            action='append',
            doc="""Run cfg_PROC procedure(s) (can be specified multiple times)
            on the created dataset. Use
            [PY: `run_procedure(discover=True)` PY][CMD: run_procedure --discover CMD]
            to get a list of available procedures, such as cfg_text2git.
            """))

    @staticmethod
    @datasetmethod(name='create')
    @eval_results
    def __call__(path=None,
                 initopts=None,
                 force=False,
                 description=None,
                 dataset=None,
                 no_annex=False,
                 fake_dates=False,
                 cfg_proc=None):
        refds_path = dataset.path if hasattr(dataset, 'path') else dataset

        # two major cases
        # 1. we got a `dataset` -> we either want to create it (path is None),
        #    or another dataset in it (path is not None)
        # 2. we got no dataset -> we want to create a fresh dataset at the
        #    desired location, either at `path` or PWD

        # sanity check first
        if no_annex:
            if description:
                raise ValueError("Incompatible arguments: cannot specify "
                                 "description for annex repo and declaring "
                                 "no annex repo.")

        if path:
            path = rev_resolve_path(path, dataset)

        path = path if path \
            else getpwd() if dataset is None \
            else refds_path

        # we know that we need to create a dataset at `path`
        assert (path is not None)

        # prep for yield
        res = dict(action='create',
                   path=text_type(path),
                   logger=lgr,
                   type='dataset',
                   refds=refds_path)

        refds = None
        if refds_path and refds_path != path:
            refds = require_dataset(refds_path,
                                    check_installed=True,
                                    purpose='creating a subdataset')

            path_inrefds = path_under_rev_dataset(refds, path)
            if path_inrefds is None:
                yield dict(
                    res,
                    status='error',
                    message=(
                        "dataset containing given paths is not underneath "
                        "the reference dataset %s: %s", dataset,
                        text_type(path)),
                )
                return

        # try to locate an immediate parent dataset
        # we want to know this (irrespective of whether we plan on adding
        # this new dataset to a parent) in order to avoid conflicts with
        # a potentially absent/uninstalled subdataset of the parent
        # in this location
        # it will cost some filesystem traversal though...
        parentds_path = rev_get_dataset_root(
            op.normpath(op.join(text_type(path), os.pardir)))
        if parentds_path:
            prepo = GitRepo(parentds_path)
            parentds_path = ut.Path(parentds_path)
            # we cannot get away with a simple
            # GitRepo.get_content_info(), as we need to detect
            # uninstalled/added subdatasets too
            check_path = ut.Path(path)
            pstatus = prepo.status(
                untracked='no',
                # limit query to target path for a potentially massive speed-up
                paths=[check_path.relative_to(parentds_path)])
            if any(check_path == p or check_path in p.parents
                   for p in pstatus):
                # redo the check in a slower fashion, it is already broken
                # let's take our time for a proper error message
                conflict = [
                    p for p in pstatus
                    if check_path == p or check_path in p.parents
                ]
                res.update({
                    'status': 'error',
                    'message': (
                        'collision with content in parent dataset at %s: %s',
                        text_type(parentds_path),
                        [text_type(c) for c in conflict])
                })
                yield res
                return
            # another set of checks to see whether the target path is pointing
            # into a known subdataset that is not around ATM
            subds_status = {
                parentds_path / k.relative_to(prepo.path)
                for k, v in iteritems(pstatus)
                if v.get('type', None) == 'dataset'
            }
            check_paths = [check_path]
            check_paths.extend(check_path.parents)
            if any(p in subds_status for p in check_paths):
                conflict = [p for p in check_paths if p in subds_status]
                res.update({
                    'status': 'error',
                    'message': (
                        'collision with %s (dataset) in dataset %s',
                        text_type(conflict[0]), text_type(parentds_path))
                })
                yield res
                return

        # important to use the given Dataset object to avoid spurious ID
        # changes with not-yet-materialized Datasets
        tbds = dataset if isinstance(dataset, Dataset) and \
            dataset.path == path else Dataset(text_type(path))

        # don't create in non-empty directory without `force`:
        if op.isdir(tbds.path) and listdir(tbds.path) != [] and not force:
            res.update({
                'status': 'error',
                'message': 'will not create a dataset in a non-empty directory, '
                           'use `force` option to ignore'
            })
            yield res
            return

        # stuff that we create and want to have tracked with git (not annex)
        add_to_git = {}

        if initopts is not None and isinstance(initopts, list):
            initopts = {'_from_cmdline_': initopts}

        # create and configure desired repository
        if no_annex:
            lgr.info("Creating a new git repo at %s", tbds.path)
            tbrepo = GitRepo(tbds.path,
                             url=None,
                             create=True,
                             create_sanity_checks=False,
                             git_opts=initopts,
                             fake_dates=fake_dates)
            # place a .noannex file to tell git-annex to leave this repo alone
            stamp_path = ut.Path(tbrepo.path) / '.noannex'
            stamp_path.touch()
            add_to_git[stamp_path] = {'type': 'file', 'state': 'untracked'}
        else:
            # always come with annex when created from scratch
            lgr.info("Creating a new annex repo at %s", tbds.path)
            tbrepo = AnnexRepo(
                tbds.path,
                url=None,
                create=True,
                create_sanity_checks=False,
                # do not set backend here, to avoid a dedicated commit
                backend=None,
                # None causes version to be taken from config
                version=None,
                description=description,
                git_opts=initopts,
                fake_dates=fake_dates)
            # set the annex backend in .gitattributes as a staged change
            tbrepo.set_default_backend(cfg.obtain('datalad.repo.backend'),
                                       persistent=True,
                                       commit=False)
            add_to_git[tbds.repo.pathobj / '.gitattributes'] = {
                'type': 'file',
                'state': 'added'
            }
            # make sure that v6 annex repos never commit content under .datalad
            attrs_cfg = (
                ('config', 'annex.largefiles', 'nothing'),
                ('metadata/aggregate*', 'annex.largefiles', 'nothing'),
                ('metadata/objects/**', 'annex.largefiles',
                 '({})'.format(cfg.obtain(
                     'datalad.metadata.create-aggregate-annex-limit'))))
            attrs = tbds.repo.get_gitattributes(
                [op.join('.datalad', i[0]) for i in attrs_cfg])
            set_attrs = []
            for p, k, v in attrs_cfg:
                if not attrs.get(op.join('.datalad', p), {}).get(k, None) == v:
                    set_attrs.append((p, {k: v}))
            if set_attrs:
                tbds.repo.set_gitattributes(set_attrs,
                                            attrfile=op.join(
                                                '.datalad', '.gitattributes'))

            # prevent git annex from ever annexing .git* stuff (gh-1597)
            attrs = tbds.repo.get_gitattributes('.git')
            if not attrs.get('.git', {}).get('annex.largefiles',
                                             None) == 'nothing':
                tbds.repo.set_gitattributes([('**/.git*', {
                    'annex.largefiles': 'nothing'
                })])
                # must use the repo.pathobj as this will have resolved symlinks
                add_to_git[tbds.repo.pathobj / '.gitattributes'] = {
                    'type': 'file',
                    'state': 'untracked'
                }

        # record an ID for this repo for the afterlife
        # to be able to track siblings and children
        id_var = 'datalad.dataset.id'
        # Note, that Dataset property `id` will change when we unset the
        # respective config. Therefore store it before:
        tbds_id = tbds.id
        if id_var in tbds.config:
            # make sure we reset this variable completely, in case of a
            # re-create
            tbds.config.unset(id_var, where='dataset')

        if _seed is None:
            # just the standard way
            uuid_id = uuid.uuid1().urn.split(':')[-1]
        else:
            # Let's generate preseeded ones
            uuid_id = str(uuid.UUID(int=random.getrandbits(128)))
        tbds.config.add(id_var,
                        tbds_id if tbds_id is not None else uuid_id,
                        where='dataset',
                        reload=False)

        # make config overrides permanent in the repo config
        # this is similar to what `annex init` does
        # we are only doing this for config overrides and do not expose
        # a dedicated argument, because it is sufficient for the cmdline
        # and unnecessary for the Python API (there could simply be a
        # subsequent ds.config.add() call)
        for k, v in iteritems(tbds.config.overrides):
            tbds.config.add(k, v, where='local', reload=False)

        # all config manipulation is done -> full reload
        tbds.config.reload()

        # must use the repo.pathobj as this will have resolved symlinks
        add_to_git[tbds.repo.pathobj / '.datalad'] = {
            'type': 'directory',
            'state': 'untracked'
        }

        # save everything, we need to do this now and cannot merge with the
        # call below, because we may need to add this subdataset to a parent
        # but cannot until we have a first commit
        tbds.repo.save(
            message='[DATALAD] new dataset',
            git=True,
            # we have to supply our own custom status, as the repo does
            # not have a single commit yet and there is no HEAD reference
            # TODO make `GitRepo.status()` robust to this state.
            _status=add_to_git,
        )

        # the next only makes sense if we saved the created dataset,
        # otherwise we have no committed state to be registered
        # in the parent
        if isinstance(refds, Dataset) and refds.path != tbds.path:
            # we created a dataset in another dataset
            # -> make submodule
            for r in refds.save(path=tbds.path, ):
                yield r

        res.update({'status': 'ok'})
        yield res

        for cfg_proc_ in cfg_proc or []:
            for r in tbds.run_procedure('cfg_' + cfg_proc_):
                yield r

    @staticmethod
    def custom_result_renderer(res, **kwargs):  # pragma: no cover
        from datalad.ui import ui
        if res.get('action', None) == 'create' and \
                res.get('status', None) == 'ok' and \
                res.get('type', None) == 'dataset':
            ui.message("Created dataset at {}.".format(res['path']))
        else:
            ui.message("Nothing was created")
Example 11
class RunProcedure(Interface):
    """Run prepared procedures (DataLad scripts) on a dataset

    *Concept*

    A "procedure" is an algorithm with the purpose to process a dataset in a
    particular way. Procedures can be useful in a wide range of scenarios,
    like adjusting dataset configuration in a uniform fashion, populating
    a dataset with particular content, or automating other routine tasks,
    such as synchronizing dataset content with certain siblings.

    Implementations of some procedures are shipped together with DataLad,
    but additional procedures can be provided by 1) any DataLad extension,
    2) any (sub-)dataset, 3) a local user, or 4) a local system administrator.
    DataLad will look for procedures in the following locations and order:

    Directories identified by the configuration settings

    - 'datalad.locations.user-procedures' (determined by
      appdirs.user_config_dir; defaults to '$HOME/.config/datalad/procedures'
      on GNU/Linux systems)
    - 'datalad.locations.system-procedures' (determined by
      appdirs.site_config_dir; defaults to '/etc/xdg/datalad/procedures' on
      GNU/Linux systems)
    - 'datalad.locations.dataset-procedures'

    and subsequently in the 'resources/procedures/' directories of any
    installed extension, and, lastly, of the DataLad installation itself.

    Please note that a dataset that defines
    'datalad.locations.dataset-procedures' provides its procedures to
    any dataset it is a subdataset of. That way you can have a collection of
    such procedures in a dedicated dataset and install it as a subdataset into
    any dataset you want to use those procedures with. In case of a naming
    conflict with such a dataset hierarchy, the dataset you're calling
    run-procedure on will take precedence over its subdatasets, and so on.

    Each configuration setting can occur multiple times to indicate multiple
    directories to be searched. If a procedure matching a given name is found
    (filename without a possible extension), the search is aborted and this
    implementation will be executed. This makes it possible for individual
    datasets, users, or machines to override externally provided procedures
    (enabling the implementation of customizable processing "hooks").


    *Procedure implementation*

    A procedure can be any executable. Executables must have the appropriate
    permissions and, in the case of a script, must contain an appropriate
    "shebang" line. If a procedure is not executable, but its filename ends
    with '.py', it is automatically executed by the 'python' interpreter
    (whichever version is available in the present environment). Likewise,
    procedure implementations ending in '.sh' are executed via 'bash'.

    Procedures can implement any argument handling, but must be capable
    of taking at least one positional argument (the absolute path to the
    dataset they shall operate on).

    For further customization there are two configuration settings per procedure
    available:

    - 'datalad.procedures.<NAME>.call-format'
      fully customizable format string to determine how to execute procedure
      NAME (see also datalad-run).
      It currently requires the following placeholders to be included:

      - '{script}': will be replaced by the path to the procedure
      - '{ds}': will be replaced by the absolute path to the dataset the
        procedure shall operate on
      - '{args}': (not actually required) will be replaced by
        [CMD: all additional arguments passed into run-procedure after NAME CMD]
        [PY: all but the first element of `spec` if `spec` is a list or tuple PY]
        As an example the default format string for a call to a python script is:
        "python {script} {ds} {args}"
    - 'datalad.procedures.<NAME>.help'
      will be shown on `datalad run-procedure --help-proc NAME` to provide a
      description and/or usage info for procedure NAME
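
    For illustration, assuming the usual git-config spelling of such
    variables, the two settings for a (hypothetical) procedure NAME could be
    placed in a dataset's .datalad/config as::

      [datalad "procedures.NAME"]
        call-format = bash {script} {ds} {args}
        help = one-line description shown by --help-proc NAME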

    *Customize other commands with procedures*

    On execution of any commands, DataLad inspects two additional
    configuration settings:

    - 'datalad.<name>.proc-pre'

    - 'datalad.<name>.proc-post'

    where '<name>' is the name of a DataLad command. Using this mechanism
    DataLad can be instructed to run one or more procedures before or
    after the execution of a given command. For example, configuring
    a set of metadata types in any newly created dataset can be achieved
    via:

      % datalad -c 'datalad.create.proc-post=cfg_metadatatypes xmp image' create -d myds

    As procedures run on datasets, it is necessary to explicitly identify
    the target dataset via the -d (--dataset) option.
    """
    _params_ = dict(
        spec=Parameter(args=("spec", ),
                       metavar='NAME [ARGS]',
                       nargs=REMAINDER,
                       doc="""Name and possibly additional arguments of the
            to-be-executed procedure. [CMD: Note that all options to
            run-procedure need to be put before NAME, since all ARGS get
            assigned to NAME CMD]"""),
        dataset=Parameter(args=("-d", "--dataset"),
                          metavar="PATH",
                          doc="""specify the dataset to run the procedure on.
            An attempt is made to identify the dataset based on the current
            working directory.""",
                          constraints=EnsureDataset() | EnsureNone()),
        discover=Parameter(
            args=('--discover', ),
            action='store_true',
            doc="""if given, all configured paths are searched for procedures
            and one result record per discovered procedure is yielded, but
            no procedure is executed"""),
        help_proc=Parameter(
            args=('--help-proc', ),
            action='store_true',
            doc="""if given, get a help message for procedure NAME from config
            setting datalad.procedures.NAME.help"""))

    @staticmethod
    @datasetmethod(name='run_procedure')
    @eval_results
    def __call__(spec=None, dataset=None, discover=False, help_proc=False):
        if not spec and not discover:
            raise InsufficientArgumentsError(
                'requires at least a procedure name')
        if help_proc and not spec:
            raise InsufficientArgumentsError('requires a procedure name')

        try:
            ds = require_dataset(dataset,
                                 check_installed=False,
                                 purpose='run a procedure')
        except NoDatasetArgumentFound:
            ds = None

        if discover:
            reported = set()
            for m, cmd_tmpl, cmd_help in _get_procedure_implementation('*',
                                                                       ds=ds):
                if m in reported:
                    continue
                ex = _guess_exec(m)
                # configured template (call-format string) takes precedence:
                if cmd_tmpl:
                    ex['template'] = cmd_tmpl
                if ex['type'] is None and ex['template'] is None:
                    # doesn't seem like a match
                    lgr.debug(
                        "Neither type nor execution template found for "
                        "%s. Ignored.", m)
                    continue
                message = ex['type'] if ex['type'] else 'unknown type'
                message += ' (missing)' if ex['state'] == 'absent' else ''
                res = get_status_dict(action='run_procedure',
                                      path=m,
                                      type='file',
                                      logger=lgr,
                                      refds=ds.path if ds else None,
                                      status='ok',
                                      state=ex['state'],
                                      procedure_type=ex['type'],
                                      procedure_callfmt=ex['template'],
                                      message=message)
                reported.add(m)
                yield res
            return

        if not isinstance(spec, (tuple, list)):
            # maybe coming from config
            import shlex
            spec = shlex.split(spec)
        name = spec[0]
        args = spec[1:]

        try:
            # get the first match and run with it
            procedure_file, cmd_tmpl, cmd_help = \
                next(_get_procedure_implementation(name, ds=ds))
        except StopIteration:
            res = get_status_dict(
                action='run_procedure',
                # TODO: Default renderer requires a key "path" to exist.
                # Doesn't make a lot of sense in this case
                path=name,
                logger=lgr,
                refds=ds.path if ds else None,
                status='impossible',
                message="Cannot find procedure with name '%s'" % name)
            yield res
            return

        ex = _guess_exec(procedure_file)
        # configured template (call-format string) takes precedence:
        if cmd_tmpl:
            ex['template'] = cmd_tmpl

        if help_proc:
            if cmd_help:
                res = get_status_dict(action='procedure_help',
                                      path=procedure_file,
                                      type='file',
                                      logger=lgr,
                                      refds=ds.path if ds else None,
                                      status='ok',
                                      state=ex['state'],
                                      procedure_type=ex['type'],
                                      procedure_callfmt=ex['template'],
                                      message=cmd_help)
            else:
                res = get_status_dict(action='procedure_help',
                                      path=procedure_file,
                                      type='file',
                                      logger=lgr,
                                      refds=ds.path if ds else None,
                                      status='impossible',
                                      state=ex['state'],
                                      procedure_type=ex['type'],
                                      procedure_callfmt=ex['template'],
                                      message="No help available for '%s'" %
                                      name)

            yield res
            return

        if not ex['template']:
            raise ValueError("No idea how to execute procedure %s. "
                             "Missing 'execute' permissions?" % procedure_file)

        cmd = ex['template'].format(
            script=procedure_file,
            ds=ds.path if ds else '',
            args=u' '.join(u'"{}"'.format(a) for a in args) if args else '')
        lgr.debug('Attempt to run procedure {} as: {}'.format(name, cmd))
        for r in Run.__call__(
                cmd=cmd,
                dataset=ds,
                explicit=True,
                inputs=None,
                outputs=None,
                # pass through here
                on_failure='ignore',
        ):
            yield r
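
The docstring above describes the per-procedure 'call-format'/'help' settings and the 'proc-pre'/'proc-post' hooks. As a rough sketch (not part of the original example), such settings could be written to a dataset's configuration from Python along the following lines; the procedure name 'cfg_myproc' is purely illustrative:

# Hedged sketch: persist a custom call format and help text for a hypothetical
# procedure "cfg_myproc", plus a post-create hook, in the dataset configuration.
from datalad.distribution.dataset import Dataset

ds = Dataset('myds')
# how the procedure should be invoked (placeholders as documented above)
ds.config.set('datalad.procedures.cfg_myproc.call-format',
              'python {script} {ds} {args}', where='dataset')
# text shown by `datalad run-procedure --help-proc cfg_myproc`
ds.config.set('datalad.procedures.cfg_myproc.help',
              'Apply local configuration defaults to the dataset', where='dataset')
# run a shipped procedure after every `create` performed on this dataset
ds.config.set('datalad.create.proc-post', 'cfg_metadatatypes xmp image',
              where='dataset')
# commit the updated .datalad/config so the settings travel with the dataset
ds.save(path='.datalad/config', message="Configure procedure settings")
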
Exemplo n.º 12
0
class NoAnnex(Interface):
    """Configure a dataset to never put some content into the dataset's annex

    This can be useful in mixed datasets that also contain textual data, such
    as source code, which can be efficiently and more conveniently managed
    directly in Git.

    Patterns generally look like this::

      code/*

    which would match all files in the code directory. In order to match all
    files under ``code/``, including all of its subdirectories, use a
    pattern such as::

      code/**

    Note that the plugin works incrementally, hence any existing configuration
    (e.g. from a previous plugin run) is amended, not replaced.

    Parameters
    ----------
    ref_dir : str, optional
        Relative path (within the dataset) to the directory the patterns are
        interpreted against and where the ``.gitattributes`` file is written.
    makedirs : bool, optional
        If True, create any missing directories leading up to `ref_dir`.
    """
    from datalad.support.param import Parameter
    from datalad.distribution.dataset import datasetmethod
    from datalad.interface.utils import eval_results
    from datalad.distribution.dataset import EnsureDataset
    from datalad.support.constraints import EnsureNone

    _params_ = dict(
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc="""specify the dataset to configure. If no dataset is given,
            an attempt is made to identify the dataset based on the current
            working directory.""",
            constraints=EnsureDataset() | EnsureNone()),
        pattern=Parameter(
            args=("--pattern",),
            nargs='+',
            doc="""list of path patterns. Any content whose path is matching
            any pattern will not be annexed when added to a dataset, but
            instead will be tracked directly in Git. Path pattern have to be
            relative to the directory given by the `ref_dir` option. By
            default, patterns should be relative to the root of the dataset."""),
        ref_dir=Parameter(
            args=("--ref-dir",),
            doc="""Relative path (within the dataset) to the directory that is
            to be configured. All patterns are interpreted relative to this
            path, and configuration is written to a ``.gitattributes`` file in
            this directory."""),
        makedirs=Parameter(
            args=("--makedirs",),
            action='store_true',
            doc="""If set, any missing directories will be created in order to
            be able to place a file into ``--ref-dir``."""),
    )

    @staticmethod
    @datasetmethod(name='no_annex')
    @eval_results
    def __call__(dataset, pattern, ref_dir='.', makedirs=False):
        # could be extended to accept actual largefile expressions
        from os.path import join as opj
        from os.path import isabs
        from os.path import exists
        from os import makedirs as makedirsfx
        from datalad.distribution.dataset import require_dataset
        from datalad.support.annexrepo import AnnexRepo
        from datalad.utils import assure_list

        pattern = assure_list(pattern)
        ds = require_dataset(dataset, check_installed=True,
                             purpose='no_annex configuration')

        res_kwargs = dict(
            path=ds.path,
            type='dataset',
            action='no_annex',
        )

        # all the ways we refused to cooperate
        if not isinstance(ds.repo, AnnexRepo):
            yield dict(
                res_kwargs,
                status='notneeded',
                message='dataset has no annex')
            return
        if any(isabs(p) for p in pattern):
            yield dict(
                res_kwargs,
                status='error',
                message=('path pattern for `no_annex` configuration must be relative paths: %s',
                         pattern))
            return
        if isabs(ref_dir):
            yield dict(
                res_kwargs,
                status='error',
                message=('`ref_dir` for `no_annex` configuration must be a relative path: %s',
                         ref_dir))
            return

        gitattr_dir = opj(ds.path, ref_dir)
        if not exists(gitattr_dir):
            if makedirs:
                makedirsfx(gitattr_dir)
            else:
                yield dict(
                    res_kwargs,
                    status='error',
                    message='target directory for `no_annex` does not exist (consider makedirs=True)')
                return

        gitattr_file = opj(gitattr_dir, '.gitattributes')
        ds.repo.set_gitattributes(
            [(p, {'annex.largefiles': 'nothing'}) for p in pattern],
            attrfile=gitattr_file)
        yield dict(res_kwargs, status='ok')

        for r in ds.save(
                gitattr_file,
                to_git=True,
                message="[DATALAD] exclude paths from annex'ing",
                result_filter=None,
                result_xfm=None):
            yield r
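
A brief, hedged usage sketch for the plugin above, assuming an existing annex-equipped dataset at 'myds' and that the command is exposed as the `no_annex` dataset method registered above; the patterns follow the docstring's examples:

# Sketch: keep everything under code/ (including subdirectories) and all *.txt
# files in Git rather than in the annex.
from datalad.distribution.dataset import Dataset

ds = Dataset('myds')
for res in ds.no_annex(pattern=['code/**', '*.txt'], makedirs=True,
                       return_type='generator'):
    print(res['status'], res.get('message', ''))
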
Exemplo n.º 13
0
class RunProcedure(Interface):
    """Run prepared procedures (DataLad scripts) on a dataset

    *Concept*

    A "procedure" is an algorithm with the purpose to process a dataset in a
    particular way. Procedures can be useful in a wide range of scenarios,
    like adjusting dataset configuration in a uniform fashion, populating
    a dataset with particular content, or automating other routine tasks,
    such as synchronizing dataset content with certain siblings.

    Implementations of some procedures are shipped together with DataLad,
    but additional procedures can be provided by 1) any DataLad extension,
    2) any (sub-)dataset, 3) a local user, or 4) a local system administrator.
    DataLad will look for procedures in the following locations and order:

    Directories identified by the configuration settings

    - 'datalad.locations.user-procedures' (determined by
      appdirs.user_config_dir; defaults to '$HOME/.config/datalad/procedures'
      on GNU/Linux systems)
    - 'datalad.locations.system-procedures' (determined by
      appdirs.site_config_dir; defaults to '/etc/xdg/datalad/procedures' on
      GNU/Linux systems)
    - 'datalad.locations.dataset-procedures'

    and subsequently in the 'resources/procedures/' directories of any
    installed extension, and, lastly, of the DataLad installation itself.

    Please note that a dataset that defines
    'datalad.locations.dataset-procedures' provides its procedures to
    any dataset it is a subdataset of. That way you can have a collection of
    such procedures in a dedicated dataset and install it as a subdataset into
    any dataset you want to use those procedures with. In case of a naming
    conflict with such a dataset hierarchy, the dataset you're calling
    run-procedure on will take precedence over its subdatasets and so on.

    Each configuration setting can occur multiple times to indicate multiple
    directories to be searched. If a procedure matching a given name is found
    (filename without a possible extension), the search is aborted and this
    implementation will be executed. This makes it possible for individual
    datasets, users, or machines to override externally provided procedures
    (enabling the implementation of customizable processing "hooks").


    *Procedure implementation*

    A procedure can be any executable. Executables must have the appropriate
    permissions and, in the case of a script, must contain an appropriate
    "shebang" line. If a procedure is not executable, but its filename ends
    with '.py', it is automatically executed by the 'python' interpreter
    (whichever version is available in the present environment). Likewise,
    procedure implementations ending in '.sh' are executed via 'bash'.

    Procedures can implement any argument handling, but must be capable
    of taking at least one positional argument (the absolute path to the
    dataset they shall operate on).

    For further customization there are two configuration settings per procedure
    available:

    - 'datalad.procedures.<NAME>.call-format'
      fully customizable format string to determine how to execute procedure
      NAME (see also datalad-run).
      It currently needs to include the following placeholders:

      - '{script}': will be replaced by the path to the procedure
      - '{ds}': will be replaced by the absolute path to the dataset the
        procedure shall operate on
      - '{args}': (not actually required) will be replaced by
        [CMD: all additional arguments passed into run-procedure after NAME CMD]
        [PY: all but the first element of `spec` if `spec` is a list or tuple PY]
        As an example, the default format string for a call to a Python script is:
        "python {script} {ds} {args}"
    - 'datalad.procedures.<NAME>.help'
      will be shown on `datalad run-procedure --help-proc NAME` to provide a
      description and/or usage info for procedure NAME
    """
    _params_ = dict(
        spec=Parameter(
            args=("spec",),
            metavar='NAME [ARGS]',
            nargs=REMAINDER,
            doc="""Name and possibly additional arguments of the
            to-be-executed procedure. [CMD: Note that all options to
            run-procedure need to be put before NAME, since all ARGS get
            assigned to NAME CMD]"""),
        dataset=Parameter(
            args=("-d", "--dataset"),
            metavar="PATH",
            doc="""specify the dataset to run the procedure on.
            An attempt is made to identify the dataset based on the current
            working directory.""",
            constraints=EnsureDataset() | EnsureNone()),
        discover=Parameter(
            args=('--discover',),
            action='store_true',
            doc="""if given, all configured paths are searched for procedures
            and one result record per discovered procedure is yielded, but
            no procedure is executed"""),
        help_proc=Parameter(
            args=('--help-proc',),
            action='store_true',
            doc="""if given, get a help message for procedure NAME from config
            setting datalad.procedures.NAME.help"""
        )
    )

    _examples_ = [
        dict(text="Find out which procedures are available on the current system",
             code_py="run_procedure(discover=True)",
             code_cmd="datalad run-procedure --discover"),
        dict(text="Run the 'yoda' procedure in the current dataset",
             code_py="run_procedure(spec='cfg_yoda')",
             code_cmd="datalad run-procedure cfg_yoda"),
    ]

    result_renderer = 'tailored'

    @staticmethod
    @datasetmethod(name='run_procedure')
    @eval_results
    def __call__(
            spec=None,
            dataset=None,
            discover=False,
            help_proc=False):
        if not spec and not discover:
            raise InsufficientArgumentsError('requires at least a procedure name')
        if help_proc and not spec:
            raise InsufficientArgumentsError('requires a procedure name')

        try:
            ds = require_dataset(
                dataset, check_installed=False,
                purpose='run a procedure')
        except NoDatasetFound:
            ds = None

        if discover:
            # specific path of procedures that were already reported
            reported = set()
            # specific names of procedure for which an active one has been
            # found
            active = set()
            for m, cmd_name, cmd_tmpl, cmd_help in \
                    _get_procedure_implementation('*', ds=ds):
                if m in reported:
                    continue
                ex = _guess_exec(m)
                # configured template (call-format string) takes precedence:
                if cmd_tmpl:
                    ex['template'] = cmd_tmpl
                if ex['state'] is None:
                    # doesn't seem like a match
                    lgr.debug("%s does not look like a procedure, ignored.", m)
                    continue
                state = 'overridden' if cmd_name in active else ex['state']
                message = ex['type'] if ex['type'] else 'unknown type'
                message += ' ({})'.format(state) if state != 'executable' else ''
                res = get_status_dict(
                    action='discover_procedure',
                    path=m,
                    type='file',
                    logger=lgr,
                    refds=ds.path if ds else None,
                    status='ok',
                    state=state,
                    procedure_name=cmd_name,
                    procedure_type=ex['type'],
                    procedure_callfmt=ex['template'],
                    procedure_help=cmd_help,
                    message=message)
                reported.add(m)
                if state == 'executable':
                    active.add(cmd_name)
                yield res
            return

        if not isinstance(spec, (tuple, list)):
            # maybe coming from config
            spec = split_cmdline(spec)
        name = spec[0]
        args = spec[1:]

        try:
            # get the first match and run with it
            procedure_file, cmd_name, cmd_tmpl, cmd_help = \
                next(_get_procedure_implementation(name, ds=ds))
        except StopIteration:
            res = get_status_dict(
                    action='run_procedure',
                    # TODO: Default renderer requires a key "path" to exist.
                    # Doesn't make a lot of sense in this case
                    path=name,
                    logger=lgr,
                    refds=ds.path if ds else None,
                    status='impossible',
                    message="Cannot find procedure with name '%s'" % name)
            yield res
            return

        ex = _guess_exec(procedure_file)
        # configured template (call-format string) takes precedence:
        if cmd_tmpl:
            ex['template'] = cmd_tmpl

        if help_proc:
            if cmd_help:
                res = get_status_dict(
                        action='procedure_help',
                        path=procedure_file,
                        type='file',
                        logger=lgr,
                        refds=ds.path if ds else None,
                        status='ok',
                        state=ex['state'],
                        procedure_name=cmd_name,
                        procedure_type=ex['type'],
                        procedure_callfmt=ex['template'],
                        message=cmd_help)
            else:
                res = get_status_dict(
                        action='procedure_help',
                        path=procedure_file,
                        type='file',
                        logger=lgr,
                        refds=ds.path if ds else None,
                        status='impossible',
                        state=ex['state'],
                        procedure_name=cmd_name,
                        procedure_type=ex['type'],
                        procedure_callfmt=ex['template'],
                        message="No help available for '%s'" % name)

            yield res
            return

        if not ex['template']:
            raise ValueError("No idea how to execute procedure %s. "
                             "Missing 'execute' permissions?" % procedure_file)

        cmd = ex['template'].format(
            script=quote_cmdlinearg(procedure_file),
            ds=quote_cmdlinearg(ds.path) if ds else '',
            args=(u' '.join(quote_cmdlinearg(a) for a in args) if args else ''))
        lgr.info(u"Running procedure %s", name)
        lgr.debug(u'Full procedure command: %r', cmd)
        for r in Run.__call__(
                cmd=cmd,
                dataset=ds,
                explicit=True,
                inputs=None,
                outputs=None,
                # pass through here
                on_failure='ignore',
                return_type='generator'
        ):
            yield r

    @staticmethod
    def custom_result_renderer(res, **kwargs):
        from datalad.ui import ui
        from datalad.interface.utils import default_result_renderer

        if res['status'] != 'ok':
            # logging complained about this already
            return

        if 'procedure' not in res.get('action', ''):
            # it's not our business
            default_result_renderer(res)
            return

        if kwargs.get('discover', None):
            ui.message('{name} ({path}){msg}'.format(
                # bold-faced name, if active
                name=ac.color_word(res['procedure_name'], ac.BOLD)
                if res['state'] == 'executable' else res['procedure_name'],
                path=res['path'],
                msg=' [{}]'.format(
                    res['message'][0] % res['message'][1:]
                    if isinstance(res['message'], tuple) else res['message'])
                if 'message' in res else ''
            ))

        elif kwargs.get('help_proc', None):
            ui.message('{name} ({path}){help}'.format(
                name=ac.color_word(res['procedure_name'], ac.BOLD),
                path=op.relpath(
                    res['path'],
                    res['refds'])
                if res.get('refds', None) else res['path'],
                help='{nl}{msg}'.format(
                    nl=os.linesep,
                    msg=res['message'][0] % res['message'][1:]
                    if isinstance(res['message'], tuple) else res['message'])
                if 'message' in res else ''
            ))

        else:
            default_result_renderer(res)
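
Complementing the `_examples_` above, a hedged Python sketch of discovering procedures and then running one; the result keys mirror the `get_status_dict` calls in the discovery branch, and 'cfg_text2git' is an illustrative procedure name:

# Sketch: list discovered procedures, then execute one of them.
from datalad.distribution.dataset import Dataset

ds = Dataset('myds')
for proc in ds.run_procedure(discover=True, return_type='generator'):
    print(proc['procedure_name'], proc['state'], proc['path'])

# run a specific procedure by name (additional tokens would become its arguments)
ds.run_procedure(spec=['cfg_text2git'])
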
Exemplo n.º 14
0
class RunProcedure(Interface):
    """Run prepared procedures (DataLad scripts) on a dataset

    *Concept*

    A "procedure" is an algorithm with the purpose to process a dataset in a
    particular way. Procedures can be useful in a wide range of scenarios,
    like adjusting dataset configuration in a uniform fashion, populating
    a dataset with particular content, or automating other routine tasks,
    such as synchronizing dataset content with certain siblings.

    Implementations of some procedures are shipped together with DataLad,
    but additional procedures can be provided by 1) any DataLad extension,
    2) any dataset, 3) a local user, or 4) a local system administrator.
    DataLad will look for procedures in the following locations and order:

    Directories identified by the configuration settings

    - 'datalad.locations.dataset-procedures'
    - 'datalad.locations.user-procedures' (determined by
      appdirs.user_config_dir; defaults to '$HOME/.config/datalad/procedures'
      on GNU/Linux systems)
    - 'datalad.locations.system-procedures' (determined by
      appdirs.site_config_dir; defaults to '/etc/xdg/datalad/procedures' on
      GNU/Linux systems)

    and subsequently in the 'resources/procedures/' directories of any
    installed extension, and, lastly, of the DataLad installation itself.

    Each configuration setting can occur multiple times to indicate multiple
    directories to be searched. If a procedure matching a given name is found
    (filename without a possible extension), the search is aborted and this
    implementation will be executed. This makes it possible for individual
    datasets, users, or machines to override externally provided procedures
    (enabling the implementation of customizable processing "hooks").


    *Procedure implementation*

    A procedure can be any executable. Executables must have the appropriate
    permissions and, in the case of a script, must contain an appropriate
    "shebang" line. If a procedure is not executable, but its filename ends
    with '.py', it is automatically executed by the 'python' interpreter
    (whichever version is available in the present environment). Likewise,
    procedure implementations ending in '.sh' are executed via 'bash'.

    Procedures can implement any argument handling, but must be capable
    of taking at least one positional argument (the absolute path to the
    dataset they shall operate on).


    *Customize other commands with procedures*

    On execution of any commands, DataLad inspects two additional
    configuration settings:

    - 'datalad.<name>.proc-pre'

    - 'datalad.<name>.proc-post'

    where '<name>' is the name of a DataLad command. Using this mechanism
    DataLad can be instructed to run one or more procedures before or
    after the execution of a given command. For example, configuring
    a set of metadata types in any newly created dataset can be achieved
    via:

      % datalad -c 'datalad.create.proc-post=cfg_metadatatypes xmp image' create -d myds

    As procedures run on datasets, it is necessary to explicitly identify
    the target dataset via the -d (--dataset) option.
    """
    _params_ = dict(
        spec=Parameter(args=("spec", ),
                       metavar='NAME [ARGS]',
                       nargs=REMAINDER,
                       doc="""Name and possibly additional arguments of the
            to-be-executed procedure."""),
        dataset=Parameter(args=("-d", "--dataset"),
                          doc="""specify the dataset to run the procedure on.
            An attempt is made to identify the dataset based on the current
            working directory.""",
                          constraints=EnsureDataset() | EnsureNone()),
    )

    @staticmethod
    @datasetmethod(name='run_procedure')
    @eval_results
    def __call__(spec, dataset=None):
        if not isinstance(spec, (tuple, list)):
            # maybe coming from config
            import shlex
            spec = shlex.split(spec)
        name = spec[0]
        args = spec[1:]
        procedure_file = _get_procedure_implementation(name, ds=dataset)
        if not procedure_file:
            # TODO error result
            raise ValueError("Cannot find procedure with name '%s'" % name)

        ds = require_dataset(dataset,
                             check_installed=False,
                             purpose='run a procedure') if dataset else None

        cmd_tmpl = _guess_exec(procedure_file)
        cmd = cmd_tmpl.format(script=procedure_file,
                              ds=ds.path if ds else '',
                              args=u' '.join(u'"{}"'.format(a)
                                             for a in args) if args else '')
        lgr.debug('Attempt to run procedure {} as: {}'.format(name, cmd))
        for r in Run.__call__(
                cmd=cmd,
                dataset=ds,
                # See gh-2593 for discussion on run feature extension
                #explicit=True,
                #inputs=None,
                #outputs=None,
                # pass through here
                on_failure='ignore',
        ):
            yield r
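
The `__call__` above accepts `spec` either as a list/tuple or as a single string (e.g. read from a configuration value such as a 'proc-post' hook); in the latter case it is tokenized with `shlex`. A quick illustration of that splitting:

# Illustration of the shlex-based spec handling used above
import shlex

spec = "cfg_metadatatypes xmp image"   # as it might appear in a config value
tokens = shlex.split(spec)
name, args = tokens[0], tokens[1:]
print(name)   # cfg_metadatatypes
print(args)   # ['xmp', 'image']
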
Exemplo n.º 15
0
class AddReadme(Interface):
    """Add basic information about DataLad datasets to a README file

    The README file is added to the dataset and the addition is saved
    in the dataset.
    Note: Make sure that no unsaved modifications to your dataset's
    .gitattributes file exist.

    """
    from datalad.support.param import Parameter
    from datalad.distribution.dataset import datasetmethod
    from datalad.interface.utils import eval_results
    from datalad.distribution.dataset import EnsureDataset
    from datalad.support.constraints import (
        EnsureChoice,
        EnsureNone,
        EnsureStr,
    )

    _params_ = dict(
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc="""Dataset to add information to. If no dataset is given, an
            attempt is made to identify the dataset based on the current
            working directory.""",
            constraints=EnsureDataset() | EnsureNone()),
        filename=Parameter(
            args=("filename",),
            metavar="PATH",
            nargs='?',
            doc="""Path of the README file within the dataset.""",
            constraints=EnsureStr()),
        existing=Parameter(
            args=("--existing",),
            doc="""How to react if a file with the target name already exists:
            'skip': do nothing; 'append': append information to the existing
            file; 'replace': replace the existing file with new content.""",
            constraints=EnsureChoice("skip", "append", "replace")),
    )

    @staticmethod
    @datasetmethod(name='add_readme')
    @eval_results
    def __call__(dataset, filename='README.md', existing='skip'):
        from os.path import lexists
        from os.path import join as opj
        from io import open
        import logging
        lgr = logging.getLogger('datalad.local.add_readme')

        from datalad.distribution.dataset import require_dataset
        from datalad.utils import ensure_list

        dataset = require_dataset(dataset, check_installed=True,
                                  purpose='add README')

        fpath = opj(dataset.path, filename)
        res_kwargs = dict(action='add_readme', path=fpath)

        if lexists(fpath) and existing == 'skip':
            yield dict(
                res_kwargs,
                status='notneeded',
                message='file already exists, and not appending content')
            return

        # unlock, file could be annexed
        if lexists(fpath):
            dataset.unlock(fpath)
        if not lexists(fpath):
            # if we have an annex repo, shall the README go to Git or annex?

            if isinstance(dataset.repo, AnnexRepo) \
                and 'annex.largefiles' not in \
                    dataset.repo.get_gitattributes(filename).get(filename, {}):
                # configure the README to go into Git
                dataset.repo.set_gitattributes(
                    [(filename, {'annex.largefiles': 'nothing'})])
                dataset.save(
                    path='.gitattributes',
                    message="[DATALAD] Configure README to be in Git",
                    to_git=True
                )

        # get any metadata on the dataset itself
        dsinfo = dataset.metadata(
            '.', reporton='datasets', return_type='item-or-list',
            on_failure='ignore')
        meta = {}
        if not isinstance(dsinfo, dict) or dsinfo.get('status', None) != 'ok':
            lgr.warning("Could not obtain dataset metadata, proceeding without")
            dsinfo = {}
        else:
            # flatten possibly existing multiple metadata sources
            for src in dsinfo['metadata']:
                if src.startswith('@'):
                    # not a source
                    continue
                meta.update(dsinfo['metadata'][src])

        metainfo = ''
        for label, content in (
                ('', meta.get('description', meta.get('shortdescription', ''))),
                ('Author{}'.format('s' if isinstance(meta.get('author', None), list) else ''),
                    u'\n'.join([u'- {}'.format(a) for a in ensure_list(meta.get('author', []))])),
                ('Homepage', meta.get('homepage', '')),
                ('Reference', meta.get('citation', '')),
                ('License', meta.get('license', '')),
                ('Keywords', u', '.join([u'`{}`'.format(k) for k in ensure_list(meta.get('tag', []))])),
                ('Funding', meta.get('fundedby', '')),
                ):
            if label and content:
                metainfo += u'\n\n### {}\n\n{}'.format(label, content)
            elif content:
                metainfo += u'\n\n{}'.format(content)

        for key in 'title', 'name', 'shortdescription':
            if 'title' in meta:
                break
            if key in meta:
                meta['title'] = meta[key]

        default_content=u"""\
# {title}{metainfo}

## General information

This is a DataLad dataset{id}.

## DataLad datasets and how to use them

This repository is a [DataLad](https://www.datalad.org/) dataset. It provides
fine-grained data access down to the level of individual files, and allows for
tracking future updates. In order to use this repository for data retrieval,
[DataLad](https://www.datalad.org/) is required. It is a free and open source
command line tool, available for all major operating systems, and builds up on
Git and [git-annex](https://git-annex.branchable.com/) to allow sharing,
synchronizing, and version controlling collections of large files.

More information on DataLad, and [how to install](http://handbook.datalad.org/en/latest/intro/installation.html)
it, can be found in the [DataLad Handbook](https://handbook.datalad.org/en/latest/index.html).

### Get the dataset

A DataLad dataset can be `cloned` by running

```
datalad clone <url>
```

Once a dataset is cloned, it is a light-weight directory on your local machine.
At this point, it contains only small metadata and information on the identity
of the files in the dataset, but not actual *content* of the (sometimes large)
data files.

### Retrieve dataset content

After cloning a dataset, you can retrieve file contents by running

```
datalad get <path/to/directory/or/file>
```

This command will trigger a download of the files, directories, or subdatasets
you have specified.

DataLad datasets can contain other datasets, so-called *subdatasets*.  If you
clone the top-level dataset, subdatasets do not yet contain metadata and
information on the identity of files, but appear to be empty directories. In
order to retrieve file availability metadata in subdatasets, run

```
datalad get -n <path/to/subdataset>
```

Afterwards, you can browse the retrieved metadata to find out about subdataset
contents, and retrieve individual files with `datalad get`.  If you use
`datalad get <path/to/subdataset>`, all contents of the subdataset will be
downloaded at once.

### Stay up-to-date

DataLad datasets can be updated. The command `datalad update` will *fetch*
updates and store them on a different branch (by default
`remotes/origin/master`). Running

```
datalad update --merge
```

will *pull* available updates and integrate them in one go.

### Find out what has been done

DataLad datasets contain their history in the ``git log``.  By running ``git
log`` (or a tool that displays Git history) in the dataset or on specific
files, you can find out what has been done to the dataset or to individual
files by whom, and when.
""".format(
            title='Dataset "{}"'.format(meta['title']) if 'title' in meta else 'About this dataset',
            metainfo=metainfo,
            id=u' (id: {})'.format(dataset.id) if dataset.id else '',
            )

        with open(fpath, 'a' if existing == 'append' else 'w', encoding='utf-8') as fp:
            fp.write(default_content)
            yield dict(
                status='ok',
                path=fpath,
                type='file',
                action='add_readme')

        for r in dataset.save(
                fpath,
                message='[DATALAD] added README',
                result_filter=None,
                result_xfm=None):
            yield r
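
A short usage sketch for the plugin above, assuming an installed dataset at 'myds'; argument names mirror the `_params_` definition and the call is illustrative only:

# Sketch: generate (or replace) a README.md from the dataset's metadata and
# save the addition in the dataset.
from datalad.distribution.dataset import Dataset

ds = Dataset('myds')
for res in ds.add_readme(filename='README.md', existing='replace',
                         return_type='generator'):
    print(res.get('action'), res['status'], res['path'])
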
Exemplo n.º 16
0
class Addurls(Interface):
    """Create and update a dataset from a list of URLs.

    *Format specification*

    Several arguments take format strings.  These are similar to normal Python
    format strings where the names from `URL-FILE` (column names for a CSV or
    properties for JSON) are available as placeholders.  If `URL-FILE` is a CSV
    file, a positional index can also be used (i.e., "{0}" for the first
    column).  Note that a placeholder cannot contain a ':' or '!'.

    In addition, the `FILENAME-FORMAT` argument has a few special
    placeholders.

      - _repindex

        The constructed file names must be unique across all rows.  To
        avoid collisions, the special placeholder "_repindex" can be added to
        the formatter.  Its value will start at 0 and increment every time a
        file name repeats.

      - _url_hostname, _urlN, _url_basename*

        Various parts of the formatted URL are available.  Take
        "http://datalad.org/asciicast/seamless_nested_repos.sh" as an example.

        "datalad.org" is stored as "_url_hostname".  Components of the URL's
        path can be referenced as "_urlN".  "_url0" and "_url1" would map to
        "asciicast" and "seamless_nested_repos.sh", respectively.  The final
        part of the path is also available as "_url_basename".

        This name is broken down further.  "_url_basename_root" and
        "_url_basename_ext" provide access to the root name and extension.
        These values are similar to the result of os.path.splitext, but, in the
        case of multiple periods, the extension is identified using the same
        length heuristic that git-annex uses.  As a result, the extension of
        "file.tar.gz" would be ".tar.gz", not ".gz".  In addition, the fields
        "_url_basename_root_py" and "_url_basename_ext_py" provide access to
        the result of os.path.splitext.

      - _url_filename*

        These are similar to _url_basename* fields, but they are obtained with
        a server request.  This is useful if the file name is set in the
        Content-Disposition header.


    *Examples*

    Consider a file "avatars.csv" that contains::

        who,ext,link
        neurodebian,png,https://avatars3.githubusercontent.com/u/260793
        datalad,png,https://avatars1.githubusercontent.com/u/8927200

    To download each link into a file name composed of the 'who' and 'ext'
    fields, we could run::

      $ datalad addurls -d avatar_ds --fast avatars.csv '{link}' '{who}.{ext}'

    The `-d avatar_ds` is used to create a new dataset in "$PWD/avatar_ds".

    If we were already in a dataset and wanted to create a new subdataset in an
    "avatars" subdirectory, we could use "//" in the `FILENAME-FORMAT`
    argument::

      $ datalad addurls --fast avatars.csv '{link}' 'avatars//{who}.{ext}'

    .. note::

       For users familiar with 'git annex addurl': A large part of this
       plugin's functionality can be viewed as transforming data from
       `URL-FILE` into a "url filename" format that is fed to 'git annex addurl
       --batch --with-files'.
    """

    from datalad.distribution.dataset import datasetmethod
    from datalad.interface.utils import eval_results
    from datalad.distribution.dataset import EnsureDataset
    from datalad.support.constraints import EnsureChoice, EnsureNone, EnsureStr
    from datalad.support.param import Parameter

    _params_ = dict(
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc="""Add the URLs to this dataset (or possibly subdatasets of
            this dataset).  Pass an empty or non-existent directory to
            create a new dataset.  New subdatasets can be specified with
            `FILENAME-FORMAT`.""",
            constraints=EnsureDataset() | EnsureNone()),
        urlfile=Parameter(
            args=("urlfile", ),
            metavar="URL-FILE",
            doc="""A file that contains URLs or information that can be used to
            construct URLs.  Depending on the value of --input-type, this
            should be a CSV file (with a header as the first row) or a JSON
            file (structured as a list of objects with string values)."""),
        urlformat=Parameter(
            args=("urlformat", ),
            metavar="URL-FORMAT",
            doc="""A format string that specifies the URL for each entry.  See
            the 'Format Specification' section above."""),
        filenameformat=Parameter(
            args=("filenameformat", ),
            metavar="FILENAME-FORMAT",
            doc="""Like `URL-FORMAT`, but this format string specifies the file
            to which the URL's content will be downloaded.  The file name may
            contain directories.  The separator "//" can be used to indicate
            that the left-side directory should be created as a new subdataset.
            See the 'Format Specification' section above."""),
        input_type=Parameter(
            args=("-t", "--input-type"),
            metavar="TYPE",
            doc="""Whether `URL-FILE` should be considered a CSV file or a JSON
            file.  The default value, "ext", means to consider `URL-FILE` as a
            JSON file if it ends with ".json".  Otherwise, treat it as a CSV
            file.""",
            constraints=EnsureChoice("ext", "csv", "json")),
        exclude_autometa=Parameter(
            args=("-x", "--exclude_autometa"),
            metavar="REGEXP",
            doc="""By default, metadata field=value pairs are constructed with
            each column in `URL-FILE`, excluding any single column that is
            specified via `URL-FORMAT`.  This argument can be used to exclude
            columns that match a regular expression.  If set to '*' or an empty
            string, automatic metadata extraction is disabled completely.  This
            argument does not affect metadata set explicitly with --meta."""),
        meta=Parameter(
            args=(
                "-m",
                "--meta",
            ),
            metavar="FORMAT",
            action="append",
            doc="""A format string that specifies metadata.  It should be
            structured as "<field>=<value>".  As an example, "location={3}"
            would mean that the value for the "location" metadata field should
            be set the value of the fourth column.  This option can be given
            multiple times."""),
        message=Parameter(
            args=("--message", ),
            metavar="MESSAGE",
            doc="""Use this message when committing the URL additions.""",
            constraints=EnsureNone() | EnsureStr()),
        dry_run=Parameter(
            args=("-n", "--dry-run"),
            action="store_true",
            doc="""Report which URLs would be downloaded to which files and
            then exit."""),
        fast=Parameter(
            args=("--fast", ),
            action="store_true",
            doc="""If True, add the URLs, but don't download their content.
            Underneath, this passes the --fast flag to `git annex addurl`."""),
        ifexists=Parameter(
            args=("--ifexists", ),
            metavar="ACTION",
            doc="""What to do if a constructed file name already exists.  The
            default behavior is to proceed with the `git annex addurl`, which
            will fail if the file size has changed.  If set to 'overwrite',
            remove the old file before adding the new one.  If set to 'skip',
            do not add the new file.""",
            constraints=EnsureNone() | EnsureChoice("overwrite", "skip")),
        missing_value=Parameter(
            args=("--missing-value", ),
            metavar="VALUE",
            doc="""When an empty string is encountered, use this value
            instead.""",
            constraints=EnsureNone() | EnsureStr()),
        save=nosave_opt,
        version_urls=Parameter(
            args=("--version-urls", ),
            action="store_true",
            doc="""Try to add a version ID to the URL. This currently only has
            an effect on URLs for AWS S3 buckets."""),
    )

    @staticmethod
    @datasetmethod(name='addurls')
    @eval_results
    def __call__(dataset,
                 urlfile,
                 urlformat,
                 filenameformat,
                 input_type="ext",
                 exclude_autometa=None,
                 meta=None,
                 message=None,
                 dry_run=False,
                 fast=False,
                 ifexists=None,
                 missing_value=None,
                 save=True,
                 version_urls=False):
        # Temporarily work around gh-2269.
        url_file = urlfile
        url_format, filename_format = urlformat, filenameformat

        from requests.exceptions import RequestException

        from datalad.distribution.add import Add
        from datalad.distribution.create import Create
        from datalad.distribution.dataset import Dataset, require_dataset
        from datalad.interface.results import get_status_dict
        from datalad.support.annexrepo import AnnexRepo

        lgr = logging.getLogger("datalad.plugin.addurls")

        dataset = require_dataset(dataset, check_installed=False)
        if dataset.repo and not isinstance(dataset.repo, AnnexRepo):
            yield get_status_dict(action="addurls",
                                  ds=dataset,
                                  status="error",
                                  message="not an annex repo")
            return

        if input_type == "ext":
            extension = os.path.splitext(url_file)[1]
            input_type = "json" if extension == ".json" else "csv"

        with open(url_file) as fd:
            try:
                rows, subpaths = extract(fd, input_type, url_format,
                                         filename_format, exclude_autometa,
                                         meta, dry_run, missing_value)
            except (ValueError, RequestException) as exc:
                yield get_status_dict(action="addurls",
                                      ds=dataset,
                                      status="error",
                                      message=exc_str(exc))
                return

        if len(rows) != len(set(row["filename"] for row in rows)):
            yield get_status_dict(action="addurls",
                                  ds=dataset,
                                  status="error",
                                  message=("There are file name collisions; "
                                           "consider using {_repindex}"))
            return

        if dry_run:
            for subpath in subpaths:
                lgr.info("Would create a subdataset at %s", subpath)
            for row in rows:
                lgr.info("Would download %s to %s", row["url"],
                         os.path.join(dataset.path, row["filename"]))
                lgr.info(
                    "Metadata: %s",
                    sorted(u"{}={}".format(k, v)
                           for k, v in row["meta_args"].items()))
            yield get_status_dict(action="addurls",
                                  ds=dataset,
                                  status="ok",
                                  message="dry-run finished")
            return

        if not dataset.repo:
            # Populate a new dataset with the URLs.
            for r in dataset.create(result_xfm=None,
                                    return_type='generator',
                                    save=save):
                yield r

        annex_options = ["--fast"] if fast else []

        for spath in subpaths:
            if os.path.exists(os.path.join(dataset.path, spath)):
                lgr.warning("Not creating subdataset at existing path: %s",
                            spath)
            else:
                for r in dataset.create(spath,
                                        result_xfm=None,
                                        return_type='generator',
                                        save=save):
                    yield r

        for row in rows:
            # Add additional information that we'll need for various
            # operations.
            filename_abs = os.path.join(dataset.path, row["filename"])
            if row["subpath"]:
                ds_current = Dataset(os.path.join(dataset.path,
                                                  row["subpath"]))
                ds_filename = os.path.relpath(filename_abs, ds_current.path)
            else:
                ds_current = dataset
                ds_filename = row["filename"]
            row.update({
                "filename_abs": filename_abs,
                "ds": ds_current,
                "ds_filename": ds_filename
            })

        if version_urls:
            num_urls = len(rows)
            log_progress(lgr.info,
                         "addurls_versionurls",
                         "Versioning %d URLs",
                         num_urls,
                         label="Versioning URLs",
                         total=num_urls,
                         unit=" URLs")
            for row in rows:
                url = row["url"]
                try:
                    row["url"] = get_versioned_url(url)
                except (ValueError, NotImplementedError) as exc:
                    # We don't expect this to happen because get_versioned_url
                    # should return the original URL if it isn't an S3 bucket.
                    # It only raises exceptions if it doesn't know how to
                    # handle the scheme for what looks like an S3 bucket.
                    lgr.warning("error getting version of %s: %s", row["url"],
                                exc_str(exc))
                log_progress(lgr.info,
                             "addurls_versionurls",
                             "Versioned result for %s: %s",
                             url,
                             row["url"],
                             update=1,
                             increment=True)
            log_progress(lgr.info, "addurls_versionurls",
                         "Finished versioning URLs")

        files_to_add = set()
        for r in add_urls(rows, ifexists=ifexists, options=annex_options):
            if r["status"] == "ok":
                files_to_add.add(r["path"])
            yield r

        msg = message or """\
[DATALAD] add files from URLs

url_file='{}'
url_format='{}'
filename_format='{}'""".format(url_file, url_format, filename_format)

        if files_to_add:
            for r in dataset.add(files_to_add, save=False):
                yield r

            meta_rows = [r for r in rows if r["filename_abs"] in files_to_add]
            for r in add_meta(meta_rows):
                yield r

            # Save here rather than the add call above to trigger a metadata
            # commit on the git-annex branch.
            if save:
                for r in dataset.save(message=msg, recursive=True):
                    yield r
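
A Python counterpart to the command-line 'avatars.csv' example in the docstring above; this is a sketch only, with argument names taken from the `_params_` definition:

# Sketch: register each link from avatars.csv under a file name built from the
# 'who' and 'ext' columns, without downloading content (equivalent to --fast).
from datalad.api import addurls

addurls(dataset='avatar_ds',
        urlfile='avatars.csv',
        urlformat='{link}',
        filenameformat='{who}.{ext}',
        fast=True)
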
Exemplo n.º 17
0
class WTF(Interface):
    """Generate a report about the DataLad installation and configuration

    IMPORTANT: Sharing this report with untrusted parties (e.g. on the web)
    should be done with care, as it may include identifying information, and/or
    credentials or access tokens.
    """
    from datalad.support.param import Parameter
    from datalad.distribution.dataset import datasetmethod
    from datalad.interface.utils import eval_results
    from datalad.distribution.dataset import EnsureDataset
    from datalad.support.constraints import EnsureNone

    _params_ = dict(dataset=Parameter(args=("-d", "--dataset"),
                                      doc="""specify the dataset to report on. If
            no dataset is given, an attempt is made to identify the dataset
            based on the current working directory.""",
                                      constraints=EnsureDataset()
                                      | EnsureNone()), )

    @staticmethod
    @datasetmethod(name='wtf')
    @eval_results
    def __call__(dataset=None):
        from datalad.distribution.dataset import require_dataset
        from datalad.support.exceptions import NoDatasetArgumentFound
        ds = None
        try:
            ds = require_dataset(dataset,
                                 check_installed=False,
                                 purpose='reporting')
        except NoDatasetArgumentFound:
            # failure is already logged
            pass
        if ds and not ds.is_installed():
            # we don't deal with absent datasets
            ds = None
        if ds is None:
            from datalad import cfg
        else:
            cfg = ds.config
        from datalad.ui import ui
        from datalad.api import metadata
        from datalad.metadata import extractors as metaextractors
        from datalad.support.external_versions import external_versions
        import os
        import platform as pl
        import json

        # formatting helper
        def _t2s(t):
            res = []
            for e in t:
                if isinstance(e, tuple):
                    es = _t2s(e)
                    if es != '':
                        res += ['(%s)' % es]
                elif e != '':
                    res += [e]
            return '/'.join(res)

        report_template = """\
System
======
{system}

Environment
===========
{env}

Externals
=========
{externals}
Available metadata extractors
=============================
{metaextractors}

Configuration
=============
{cfg}
{dataset}
"""

        dataset_template = """\

Dataset information
===================
{basic}

Metadata
--------
{meta}
"""
        ds_meta = None
        if ds and ds.is_installed():
            ds_meta = metadata(
                dataset=ds,
                reporton='datasets',
                return_type='list',
                result_filter=lambda x: x['action'] == 'metadata',
                result_renderer='disabled')
        if ds_meta:
            ds_meta = [dm['metadata'] for dm in ds_meta]
            if len(ds_meta) == 1:
                ds_meta = ds_meta.pop()
        ui.message(
            report_template.format(
                system='\n'.join(
                    '{}: {}'.format(*i) for i in (('OS          ', ' '.join(
                        [os.name,
                         pl.system(),
                         pl.release(),
                         pl.version()]).rstrip()), ('Distribution', ' '.join([
                             _t2s(pl.dist()),
                             _t2s(pl.mac_ver()),
                             _t2s(pl.win32_ver())
                         ]).rstrip()))),
                env='\n'.join('{}: {}'.format(k, v)
                              for k, v in os.environ.items()
                              if k.startswith('PYTHON') or k.startswith('GIT')
                              or k.startswith('DATALAD')),
                dataset='' if not ds else dataset_template.format(
                    basic='\n'.join('{}: {}'.format(k, v) for k, v in (
                        ('path', ds.path),
                        ('repo',
                         ds.repo.__class__.__name__ if ds.repo else '[NONE]'),
                    )),
                    meta=json.dumps(ds_meta, indent=1)
                    if ds_meta else '[no metadata]'),
                externals=external_versions.dumps(preamble=None,
                                                  indent='',
                                                  query=True),
                metaextractors='\n'.join(p for p in dir(metaextractors)
                                         if not p.startswith('_')),
                cfg='\n'.join(
                    '{}: {}'.format(
                        k, '<HIDDEN>'
                        if 'user' in k or 'token' in k or 'passwd' in k else v)
                    for k, v in sorted(cfg.items(), key=lambda x: x[0])),
            ))
        yield
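
A minimal sketch of producing the report above from Python, assuming the command is exposed via `datalad.api` as registered by the interface machinery:

# Sketch: print the system/environment/configuration report, optionally scoped
# to a specific dataset.
from datalad.api import wtf

wtf()                 # report on the global configuration
wtf(dataset='myds')   # report including dataset information (illustrative path)
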
Exemplo n.º 18
0
class Search(Interface):
    """Search within available datasets' metadata
    """
    # XXX prevent common args from being added to the docstring
    _no_eval_results = True

    _params_ = dict(
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc="""specify the dataset to perform the query operation on. If
            no dataset is given, an attempt is made to identify the dataset
            based on the current working directory and/or the `path` given""",
            constraints=EnsureDataset() | EnsureNone()),
        match=Parameter(
            args=("match",),
            metavar='STRING',
            nargs="+",
            doc="a string (or a regular expression if "
                "[PY: `regex=True` PY][CMD: --regex CMD]) to search for "
                "in all metadata values. If multiple are provided, all must have "
                "a match among some fields of a dataset"),
        #match=Parameter(
        #    args=('-m', '--match',),
        #    metavar='REGEX',
        #    action='append',
        #    nargs=2,
        #    doc="""Pair of two regular expressions to match a property and its
        #    value.[CMD:  This option can be given multiple times CMD]"""),
        search=Parameter(
            args=('-s', '--search'),
            metavar='PROPERTY',
            action='append',
            # could also be regex
            doc="""name of the property to search for any match.[CMD:  This
            option can be given multiple times. CMD] By default, all properties
            are searched."""),
        report=Parameter(
            args=('-r', '--report'),
            metavar='PROPERTY',
            action='append',
            # could also be regex
            doc="""name of the property to report for any match.[CMD:  This
            option can be given multiple times. CMD] If '*' is given, all
            properties are reported."""),
        report_matched=Parameter(
            args=('-R', '--report-matched',),
            action="store_true",
            doc="""flag to report those fields which have matches. If `report`
             option values are provided, union of matched and those in `report`
             will be output"""),
        # Theoretically they should be CMDLINE specific I guess?
        format=Parameter(
            args=('-f', '--format'),
            constraints=EnsureChoice('custom', 'json', 'yaml'),
            doc="""format for output."""
        ),
        regex=Parameter(
            args=("--regex",),
            action="store_true",
            doc="flag for STRING to be used as a (Python) regular expression "
                "which should match the value"),
    )

    @staticmethod
    @datasetmethod(name='search')
    def __call__(match,
                 dataset=None,
                 search=None,
                 report=None,
                 report_matched=False,
                 format='custom',
                 regex=False):
        """
        Yields
        ------
        location : str
            (relative) path to the dataset
        report : dict
            fields which were requested by `report` option
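
        Examples
        --------
        A minimal sketch of Python API usage, assuming the dataset in the
        current directory carries queryable metadata (the query string and
        report field are only illustrations)::

            from datalad.api import Dataset

            ds = Dataset('.')
            for location, report in ds.search('plain text', report='type'):
                print(location, report)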
        """

        lgr.debug("Initiating search for match=%r and dataset %r",
                  match, dataset)
        try:
            ds = require_dataset(dataset, check_installed=True, purpose='dataset search')
            if ds.id is None:
                raise NoDatasetArgumentFound(
                    "This does not seem to be a dataset (no DataLad dataset ID "
                    "found). 'datalad create --force %s' can initialize "
                    "this repository as a DataLad dataset" % ds.path)
        except NoDatasetArgumentFound:
            exc_info = sys.exc_info()
            if dataset is None:
                if not ui.is_interactive:
                    raise NoDatasetArgumentFound(
                        "No DataLad dataset found. Specify a dataset to be "
                        "searched, or run interactively to get assistance "
                        "installing a queriable superdataset."
                    )
                # no dataset was provided, so ask the user whether to search
                # (or install) the central superdataset instead
                # TODO: following logic could possibly benefit other actions.
                if os.path.exists(LOCAL_CENTRAL_PATH):
                    central_ds = Dataset(LOCAL_CENTRAL_PATH)
                    if central_ds.is_installed():
                        if ui.yesno(
                            title="No DataLad dataset found at current location",
                            text="Would you like to search the DataLad "
                                 "superdataset at %r?"
                                  % LOCAL_CENTRAL_PATH):
                            pass
                        else:
                            reraise(*exc_info)
                    else:
                        raise NoDatasetArgumentFound(
                            "No DataLad dataset found at current location. "
                            "The DataLad superdataset location %r exists, "
                            "but does not contain an dataset."
                            % LOCAL_CENTRAL_PATH)
                elif ui.yesno(
                        title="No DataLad dataset found at current location",
                        text="Would you like to install the DataLad "
                             "superdataset at %r?"
                             % LOCAL_CENTRAL_PATH):
                    from datalad.api import install
                    central_ds = install(LOCAL_CENTRAL_PATH, source='///')
                    ui.message(
                        "From now on you can refer to this dataset using the "
                        "label '///'"
                    )
                else:
                    reraise(*exc_info)

                lgr.info(
                    "Performing search using DataLad superdataset %r",
                    central_ds.path
                )
                for res in central_ds.search(
                        match,
                        search=search, report=report,
                        report_matched=report_matched,
                        format=format, regex=regex):
                    yield res
                return
            else:
                raise

        cache_dir = opj(opj(ds.path, get_git_dir(ds.path)), 'datalad', 'cache')
        mcache_fname = opj(cache_dir, 'metadata.p%d' % pickle.HIGHEST_PROTOCOL)

        meta = None
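        # the cache holds a pickled (metadata graph, commit hexsha) tuple; the
        # recorded hexsha is compared below to invalidate stale caches after
        # the dataset has changed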
        if os.path.exists(mcache_fname):
            lgr.debug("use cached metadata of '{}' from {}".format(ds, mcache_fname))
            meta, checksum = pickle.load(open(mcache_fname, 'rb'))
            # TODO add more sophisticated tests to decide when the cache is no longer valid
            if checksum != ds.repo.get_hexsha():
                # errrr, try again below
                meta = None

        # not in an 'else' branch: the (yet to be written) validity tests above
        # might invalidate `meta` and require regenerating the metadata
        if meta is None:
            lgr.info("Loading and caching local meta-data... might take a few seconds")
            if not exists(cache_dir):
                os.makedirs(cache_dir)

            meta = get_metadata(ds, guess_type=False, ignore_subdatasets=False,
                                ignore_cache=False)
            # merge all info on datasets into a single dict per dataset
            meta = flatten_metadata_graph(meta)
            # extract graph, if any
            meta = meta.get('@graph', meta)
            # build simple queriable representation
            if not isinstance(meta, list):
                meta = [meta]

            # sort entries by location (if present)
            sort_keys = ('location', 'description', 'id')
            # note: using str() instead of '%' formatting led to encoding issues...
            meta = sorted(meta, key=lambda m: tuple("%s" % (m.get(x, ""),) for x in sort_keys))

            # use pickle to store the optimized graph in the cache
            pickle.dump(
                # graph plus checksum from what it was built
                (meta, ds.repo.get_hexsha()),
                open(mcache_fname, 'wb'))
            lgr.debug("cached meta data graph of '{}' in {}".format(ds, mcache_fname))

        if report in ('', ['']):
            report = []
        elif report and not isinstance(report, list):
            report = [report]

        match = assure_list(match)
        search = assure_list(search)
        # convert all to lower case for case insensitive matching
        search = {x.lower() for x in search}

        def get_in_matcher(m):
            """Function generator to provide closure for a specific value of m"""
            mlower = m.lower()

            def matcher(s):
                return mlower in s.lower()
            return matcher
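
        # Build one matcher per query string below: a compiled regular
        # expression search if --regex was given, otherwise case-insensitive
        # substring containment.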

        matchers = [
            re.compile(match_).search
            if regex
            else get_in_matcher(match_)
            for match_ in match
        ]

        # locations should be reported relative to the current directory;
        # we assume that no one changes the working directory while we are yielding
        ds_path_prefix = get_path_prefix(ds.path)

        # so that we can provide a useful message whenever not a single
        # dataset had the specified `--search` properties
        observed_properties = set()

        # for every meta data set
        for mds in meta:
            hit = False
            hits = [False] * len(matchers)
            matched_fields = set()
            if not mds.get('type', mds.get('schema:type', None)) == 'Dataset':
                # we are presently only dealing with datasets
                continue
            # TODO consider the possibility of nested and context/graph dicts
            # but so far we were trying to build simple lists of dicts, as much
            # as possible
            if not isinstance(mds, dict):
                raise NotImplementedError("nested meta data is not yet supported")

            # manual loop for now
            for k, v in iteritems(mds):
                if search:
                    k_lower = k.lower()
                    if k_lower not in search:
                        if observed_properties is not None:
                            # record for providing a hint later
                            observed_properties.add(k_lower)
                        continue
                    # so we have a hit, no need to track
                    observed_properties = None
                if isinstance(v, dict) or isinstance(v, list):
                    v = text_type(v)
                for imatcher, matcher in enumerate(matchers):
                    if matcher(v):
                        hits[imatcher] = True
                        matched_fields.add(k)
                if all(hits):
                    hit = True
                    # no need to do it longer than necessary
                    if not report_matched:
                        break

            if hit:
                location = mds.get('location', '.')
                report_ = matched_fields.union(report if report else {}) \
                    if report_matched else report
                if report_ == ['*']:
                    report_dict = mds
                elif report_:
                    report_dict = {k: mds[k] for k in report_ if k in mds}
                    if report_ and not report_dict:
                        lgr.debug(
                            'meta data match for %s, but no to-be-reported '
                            'properties (%s) found. Present properties: %s',
                            location, ", ".join(report_), ", ".join(sorted(mds))
                        )
                else:
                    report_dict = {}  # report was empty (but not None):
                    # the user asked not to report any specific field
                if isinstance(location, (list, tuple)):
                    # the same dataset could be installed in multiple
                    # locations; for now report them separately
                    for l in location:
                        yield opj(ds_path_prefix, l), report_dict
                else:
                    yield opj(ds_path_prefix, location), report_dict

        if search and observed_properties is not None:
            import difflib
            suggestions = {
                s: difflib.get_close_matches(s, observed_properties)
                for s in search
            }
            suggestions_str = "\n ".join(
                "%s for %s" % (", ".join(choices), s)
                for s, choices in iteritems(suggestions) if choices
            )
            lgr.warning(
                "Found no properties which matched one of the one you "
                "specified (%s).  May be you meant one among: %s.\n"
                "Suggestions:\n"
                " %s",
                ", ".join(search),
                ", ".join(observed_properties),
                suggestions_str if suggestions_str.strip() else "none"
            )

    @staticmethod
    def result_renderer_cmdline(res, cmdlineargs):
        from datalad.ui import ui
        if res is None:
            res = []

        format = cmdlineargs.format or 'custom'
        if format == 'custom':

            if cmdlineargs.report in ('*', ['*']) \
                    or cmdlineargs.report_matched \
                    or (cmdlineargs.report is not None
                        and len(cmdlineargs.report) > 1):
                # multiline if multiple were requested and we need to disambiguate
                ichr = jchr = '\n'
                fmt = ' {k}: {v}'
            else:
                jchr = ', '
                ichr = ' '
                fmt = '{v}'

            anything = False
            for location, r in res:
                # XXX Yarik thinks that Match should be replaced with actual path to the dataset
                ui.message('{}{}{}{}'.format(
                    ansi_colors.color_word(location, ansi_colors.DATASET),
                    ':' if r else '',
                    ichr,
                    jchr.join(
                        [
                            fmt.format(
                                k=ansi_colors.color_word(k, ansi_colors.FIELD),
                                v=pretty_bytes(r[k]))
                            for k in sorted(r)
                        ])))
                anything = True
            if not anything:
                ui.message("Nothing to report")
        elif format == 'json':
            import json
            ui.message(json.dumps(list(map(itemgetter(1), res)), indent=2))
        elif format == 'yaml':
            import yaml
            lgr.warning("yaml output support is not yet polished")
            ui.message(yaml.safe_dump(list(map(itemgetter(1), res)),
                                      allow_unicode=True))
Exemplo n.º 19
0
class AggregateMetaData(Interface):
    """Aggregate metadata of one or more datasets for later query.

    Metadata aggregation refers to a procedure that extracts metadata present
    in a dataset into a portable representation that is stored in a single
    standardized format. Moreover, metadata aggregation can also extract
    metadata in this format from one dataset and store it in another
    (super)dataset. Based on such collections of aggregated metadata it is
    possible to discover particular datasets and specific parts of their
    content, without having to obtain the target datasets first (see the
    DataLad 'search' command).

    To enable aggregation of metadata that are contained in files of a dataset,
    one has to enable one or more metadata extractors for a dataset. DataLad
    supports a number of common metadata standards, such as the Exchangeable
    Image File Format (EXIF), Adobe's Extensible Metadata Platform (XMP), and
    various audio file metadata systems like ID3. DataLad extension packages
    can provide metadata extractors for additional metadata sources. For
    example, the neuroimaging extension provides extractors for scientific
    (meta)data standards like BIDS, DICOM, and NIfTI1.  Some metadata
    extractors depend on particular 3rd-party software. The list of metadata
    extractors available to a particular DataLad installation is reported by
    the 'wtf' command ('datalad wtf').

    Enabling a metadata extractor for a dataset is done by adding its name to the
    'datalad.metadata.nativetype' configuration variable -- typically in the
    dataset's configuration file (.datalad/config), e.g.::

      [datalad "metadata"]
        nativetype = exif
        nativetype = xmp
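
    The same can be achieved from the command line, for example (a sketch;
    'exif' is only an illustration)::

      git config -f .datalad/config --add datalad.metadata.nativetype exif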

    If an enabled metadata extractor is not available in a particular DataLad
    installation, metadata extraction will not succeed in order to avoid
    inconsistent aggregation results.

    Enabling multiple extractors is supported. In this case, metadata are
    extracted by each extractor individually, and stored alongside each other.
    Metadata aggregation will also extract DataLad's own metadata (extractors
    'datalad_core', and 'annex').

    Metadata aggregation can be performed recursively, in order to aggregate all
    metadata across all subdatasets, for example, to be able to search across
    any content in any dataset of a collection. Aggregation can also be performed
    for subdatasets that are not available locally. In this case, pre-aggregated
    metadata from the closest available superdataset will be considered instead.

    Depending on the versatility of the present metadata and the number of datasets
    or files, aggregated metadata can grow prohibitively large. A number of
    configuration switches are provided to mitigate such issues.

    datalad.metadata.aggregate-content-<extractor-name>
      If set to false, content metadata aggregation will not be performed for
      the named metadata extractor (a potential underscore '_' in the extractor name must
      be replaced by a dash '-'). This can substantially reduce the runtime for
      metadata extraction, and also reduce the size of the generated metadata
      aggregate. Note, however, that some extractors may not produce any metadata
      when this is disabled, because their metadata might come from individual
      file headers only. 'datalad.metadata.store-aggregate-content' might be
      a more appropriate setting in such cases.

    datalad.metadata.aggregate-ignore-fields
      Any metadata key matching any regular expression in this configuration setting
      is removed prior to generating the dataset-level metadata summary (keys
      and their unique values across all dataset content), and from the dataset
      metadata itself. This switch can also be used to filter out sensitive
    information prior to aggregation.

    datalad.metadata.generate-unique-<extractor-name>
      If set to false, DataLad will not auto-generate a summary of unique content
      metadata values for a particular extractor as part of the dataset-global metadata
      (a potential underscore '_' in the extractor name must be replaced by a dash '-').
      This can be useful if such a summary is bloated due to minor uninformative (e.g.
      numerical) differences, or when a particular extractor already provides a
      carefully designed content metadata summary.

    datalad.metadata.maxfieldsize
      Any metadata value that exceeds the size threshold given by this configuration
      setting (in bytes/characters) is removed.

    datalad.metadata.store-aggregate-content
      If set, extracted content metadata are still used to generate a dataset-level
      summary of present metadata (all keys and their unique values across all
      files in a dataset are determined and stored as part of the dataset-level
      metadata aggregate, see datalad.metadata.generate-unique-<extractor-name>),
      but metadata on individual files are not stored.
      This switch can be used to avoid prohibitively large metadata files. Discovery
      of datasets containing content matching particular metadata properties will
      still be possible, but such datasets would have to be obtained first in order
      to discover which particular files in them match these properties.
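
    As an illustration, such tuning switches could be recorded in the dataset's
    configuration file (the extractor name and size threshold below are made up
    for this sketch)::

      [datalad "metadata"]
        aggregate-content-exif = false
        maxfieldsize = 100000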
    """
    _params_ = dict(
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc="""topmost dataset metadata will be aggregated into. All dataset
            between this dataset and any given path will receive updated
            aggregated metadata from all given paths.""",
            constraints=EnsureDataset() | EnsureNone()),
        path=Parameter(
            args=("path",),
            metavar="PATH",
            doc="""path to datasets that shall be aggregated.
            When a given path is pointing into a dataset, the metadata of the
            containing dataset will be aggregated.  If no paths are given, the
            current dataset's metadata is aggregated.""",
            nargs="*",
            constraints=EnsureStr() | EnsureNone()),
        recursive=recursion_flag,
        recursion_limit=recursion_limit,
        update_mode=Parameter(
            args=('--update-mode',),
            constraints=EnsureChoice('all', 'target'),
            doc="""which datasets to update with newly aggregated metadata:
            all datasets from any leaf dataset to the top-level target dataset
            including all intermediate datasets (all), or just the top-level
            target dataset (target)."""),
        incremental=Parameter(
            args=('--incremental',),
            action='store_true',
            doc="""If set, all information on metadata records of subdatasets
            that have not been (re-)aggregated in this run will be kept unchanged.
            This is useful when (re-)aggregating only a subset of a dataset hierarchy,
            for example, because not all subdatasets are locally available."""),
        force_extraction=Parameter(
            args=('--force-extraction',),
            action='store_true',
            doc="""If set, all enabled extractors will be engaged regardless of
            whether change detection indicates that metadata has already been
            extracted for a given dataset state."""),
        save=nosave_opt,
    )

    @staticmethod
    @datasetmethod(name='aggregate_metadata')
    @eval_results
    def __call__(
            path=None,
            *,
            dataset=None,
            recursive=False,
            recursion_limit=None,
            update_mode='target',
            incremental=False,
            force_extraction=False,
            save=True):
        refds_path = require_dataset(dataset)

        # it really doesn't work without a dataset
        ds = require_dataset(
            dataset, check_installed=True, purpose='metadata aggregation')
        path = ensure_list(path)
        if not path:
            # then current/reference dataset is "aggregated"
            # We should not add ds.path always since then --recursive would
            # also recurse current even if paths are given
            path.append(ds.path)

        agginfo_db_location, agg_base_path = get_ds_aggregate_db_locations(
            ds,
            # do not warn here, next call triggers the same warning
            warn_absent=False)
        agginfo_db = load_ds_aggregate_db(ds, abspath=True)

        to_save = []
        to_aggregate = set()
        paths_by_ds, errors = get_paths_by_ds(
            require_dataset(dataset),
            dataset,
            paths=ensure_list(path),
            subdsroot_mode='super')
        for ap in _minimal_annotate_paths(
                paths_by_ds,
                errors,
                action='aggregate_metadata',
                recursive=recursive,
                recursion_limit=recursion_limit):
            if ap.get('status', None):
                # this is done
                yield ap
                continue
            ap_type = ap.get('type', None)
            ap_state = ap.get('state', None)
            assert('parentds' in ap or ap_type == 'dataset')
            if ap_type == 'dataset' and ap_state != 'absent':
                # a present dataset, we can take directly from it
                aggsrc = ap['path']
                lgr.info('Aggregate metadata for dataset %s', aggsrc)
            else:
                # everything else needs to come from the parent
                aggsrc = ap['parentds']
                if ap_state == 'absent':
                    lgr.info(
                        'Attempt to use pre-aggregate metadata for absent %s from dataset at %s',
                        ap['path'],
                        aggsrc)
                else:
                    lgr.info(
                        'Aggregate metadata for %s from dataset at %s',
                        ap['path'],
                        aggsrc)

            to_aggregate.add(aggsrc)

            if ap_state == 'absent':
                # key thought: recursive is done by path annotation, hence
                # once we hit an absent dataset, we are 100% certain that
                # there is nothing to recurse into on the file system
                # hence we only have to look into the aggregated metadata
                # of the last available dataset in the dataset tree edge
                #
                # if there is nothing at this path, we need to look into the
                # parentds and check if we know anything about this path
                # if we do, we need to grab all the info and objects
                # if not, we need to error
                res = _get_dsinfo_from_aggmetadata(
                    aggsrc, ap['path'], recursive, agginfo_db)
                if not isinstance(res, list):
                    yield get_status_dict(
                        status='impossible',
                        message=res,
                        action='aggregate_metadata',
                        path=ap['path'],
                        logger=lgr)
                    continue
                # cue for aggregation
                to_aggregate.update(res)
            else:
                # actually aggregate metadata for this dataset, immediately place
                # generated objects into the aggregated or reference dataset,
                # and put info into the DB to get it distributed to all datasets
                # that need to be updated
                errored = _dump_extracted_metadata(
                    ds,
                    Dataset(aggsrc),
                    agginfo_db,
                    to_save,
                    force_extraction,
                    agg_base_path)
                if errored:
                    yield get_status_dict(
                        status='error',
                        message='Metadata extraction failed (see previous error message, set datalad.runtime.raiseonerror=yes to fail immediately)',
                        action='aggregate_metadata',
                        path=aggsrc,
                        logger=lgr)

        # at this point we have dumped all aggregated metadata into object files
        # somewhere, we know what needs saving (but have not saved anything yet),
        # and we know about the states of all aggregated datasets in the DB.
        # what remains to do is to update all datasets, so they have their own copy
        # of aggregated metadata and update their respective aggregate.json with
        # info on what states we just aggregated from

        # first, let's figure out what datasets need updating at all
        # get adjacency info of the dataset tree spanning the base to all leaf datasets
        # associated with the path arguments
        if update_mode == 'all':
            ds_adj = {}
            discover_dataset_trace_to_targets(
                ds.path, to_aggregate, [], ds_adj,
                # we know that to_aggregate only lists datasets, existing and
                # absent ones -- we want to aggregate all of them, either from
                # just extracted metadata, or from previously aggregated metadata
                # of the closest superdataset
                includeds=to_aggregate)
            # TODO we need to work in the info about datasets that we only learned
            # of from aggregated metadata and that have no trace on the file system!!
            subtrees = _adj2subtrees(ds.path, ds_adj, to_aggregate)
        elif update_mode == 'target':
            subtrees = {ds.path: list(agginfo_db.keys())}
        else:
            raise ValueError(
                "unknown `update_mode` %r for metadata aggregation" % update_mode)

        # go over datasets in bottom-up fashion
        for parentds_path in sorted(subtrees, reverse=True):
            lgr.info('Update aggregate metadata in dataset at: %s', parentds_path)

            _update_ds_agginfo(
                ds.path,
                parentds_path,
                subtrees[parentds_path],
                incremental,
                agginfo_db,
                to_save)
            # update complete
            res = get_status_dict(
                status='ok',
                action='aggregate_metadata',
                path=parentds_path,
                type='dataset',
                logger=lgr)
            res.update(agginfo_db.get(parentds_path, {}))
            yield res
        #
        # save potential modifications to dataset global metadata
        #
        if not to_save:
            return
        lgr.info('Attempting to save %i files/datasets', len(to_save))
        for res in Save.__call__(
                # save does not need any pre-annotated path hints
                path=[r['path'] for r in to_save],
                dataset=refds_path,
                message='[DATALAD] Dataset aggregate metadata update',
                return_type='generator',
                result_renderer='disabled',
                result_xfm=None,
                result_filter=None,
                on_failure='ignore'):
            yield res
Exemplo n.º 20
0
class Unlock(Interface):
    """Unlock file(s) of a dataset

    Unlock files of a dataset in order to be able to edit the actual content
    """

    result_xfm = 'paths'
    on_failure = 'continue'

    _params_ = dict(
        path=Parameter(args=("path", ),
                       doc="""file(s) to unlock""",
                       nargs="*",
                       constraints=EnsureStr() | EnsureNone()),
        dataset=Parameter(args=("-d", "--dataset"),
                          doc=""""specify the dataset to unlock files in. If
            no dataset is given, an attempt is made to identify the dataset
            based on the current working directory. If the latter fails, an
            attempt is made to identify the dataset based on `path` """,
                          constraints=EnsureDataset() | EnsureNone()),
        recursive=recursion_flag,
        recursion_limit=recursion_limit,
    )

    @staticmethod
    @datasetmethod(name='unlock')
    @eval_results
    def __call__(path=None,
                 dataset=None,
                 recursive=False,
                 recursion_limit=None):

        if path is None and dataset is None:
            raise InsufficientArgumentsError(
                "insufficient arguments for unlocking: needs at least "
                "a dataset or a path to unlock.")

        refds_path = Interface.get_refds_path(dataset)
        res_kwargs = dict(action='unlock', logger=lgr, refds=refds_path)

        to_process = []
        for ap in AnnotatePaths.__call__(
                dataset=refds_path,
                path=path,
                recursive=recursive,
                recursion_limit=recursion_limit,
                action='unlock',
                unavailable_path_status='impossible',
                unavailable_path_msg="path does not exist",
                nondataset_path_status='impossible',
                modified=None,
                return_type='generator',
                on_failure='ignore'):
            if ap.get('status', None):
                # this is done
                yield ap
                continue
            if ap.get('type', 'dataset') == 'dataset':
                # this is a dataset
                ap['process_content'] = True
            to_process.append(ap)

        content_by_ds, ds_props, completed, nondataset_paths = \
            annotated2content_by_ds(
                to_process,
                refds_path=refds_path,
                path_only=False)
        assert (not completed)

        for ds_path in sorted(content_by_ds.keys()):
            ds = Dataset(ds_path)
            content = content_by_ds[ds_path]

            if not isinstance(ds.repo, AnnexRepo):
                for ap in content:
                    ap['status'] = 'notneeded'
                    ap['message'] = "not annex'ed, nothing to unlock"
                    ap.update(res_kwargs)
                    yield ap
                continue

            files = [ap['path'] for ap in content]

            for r in ds.repo.unlock(files):
                yield get_status_dict(path=r,
                                      status='ok',
                                      type='file',
                                      **res_kwargs)
Exemplo n.º 21
0
class Clean(Interface):
    """Clean up after DataLad (possible temporary files etc.)

    Removes temporary files and directories left behind by DataLad and
    git-annex in a dataset.

    """

    result_renderer = 'tailored'

    _examples_ = [
        dict(text="Clean all known temporary locations of a dataset",
             code_py="clean()",
             code_cmd="datalad clean"),
        dict(text="Report on all existing temporary locations of a dataset",
             code_py="clean(dry_run=True)",
             code_cmd="datalad clean --dry-run"),
        dict(text="Clean all known temporary locations of a dataset and all "
             "its subdatasets",
             code_py="clean(recursive=True)",
             code_cmd="datalad clean -r"),
        dict(text="Clean only the archive extraction caches of a dataset and "
             "all its subdatasets",
             code_py="clean(what='cached-archives', recursive=True)",
             code_cmd="datalad clean --what cached-archives -r"),
        dict(text="Report on existing annex transfer files of a dataset and "
             "all its subdatasets",
             code_py="clean(what='annex-transfer', recursive=True, "
             "dry_run=True)",
             code_cmd="datalad clean --what annex-transfer -r --dry-run"),
    ]

    _params_ = dict(
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc="""specify the dataset to perform the clean operation on.  If
                no dataset is given, an attempt is made to identify the dataset
                in current working directory""",
            constraints=EnsureDataset() | EnsureNone()),
        dry_run=Parameter(
            args=("--dry-run", ),
            doc="""Report on cleanable locations - not actually cleaning up
            anything.""",
            action="store_true",
        ),
        # TODO: Python only???
        what=Parameter(args=("--what", ),
                       dest='what',
                       choices=('cached-archives', 'annex-tmp',
                                'annex-transfer', 'search-index'),
                       nargs="*",
                       doc="""What to clean. If none specified -- all known
            targets are considered."""),
        recursive=recursion_flag,
        recursion_limit=recursion_limit,
    )

    @staticmethod
    @datasetmethod(name='clean')
    @eval_results
    def __call__(*,
                 dataset=None,
                 what=None,
                 dry_run=False,
                 recursive=False,
                 recursion_limit=None):

        ds = require_dataset(dataset,
                             purpose="report on cleanable locations"
                             if dry_run else "clean dataset")
        res_kwargs = dict(action='clean [dry-run]' if dry_run else 'clean',
                          logger=lgr,
                          refds=ds.path)
        for wds in itertools.chain(
            [ds],
                ds.subdatasets(state='present',
                               recursive=recursive,
                               recursion_limit=recursion_limit,
                               return_type='generator',
                               result_renderer='disabled',
                               result_xfm='datasets') if recursive else []):
            d = wds.pathobj
            gitdir = wds.repo.dot_git
            DIRS_PLURAL = ("directory", "directories")
            FILES_PLURAL = ("file", "files")
            discover_or_remove = "Discovered" if dry_run else "Removed"
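
            # Each entry in the list below is a tuple of: directory to inspect,
            # the value of --what that selects it, a human-readable description,
            # and the (singular, plural) nouns used in the report message.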

            for dirpath, flag, msg, sing_pl in [
                (Path(ARCHIVES_TEMP_DIR), "cached-archives",
                 "temporary archive", DIRS_PLURAL),
                (Path(ANNEX_TEMP_DIR), "annex-tmp", "temporary annex",
                 FILES_PLURAL),
                (Path(ANNEX_TRANSFER_DIR), "annex-transfer",
                 "annex temporary transfer", DIRS_PLURAL),
                (gitdir / Path(SEARCH_INDEX_DOTGITDIR), 'search-index',
                 "metadata search index", FILES_PLURAL),
            ]:
                topdir = wds.pathobj / dirpath
                lgr.debug("Considering to clean %s:%s", d, dirpath)
                if not ((what is None) or (flag in what)):
                    yield get_status_dict(path=str(topdir),
                                          status='notneeded',
                                          type='directory',
                                          **res_kwargs)
                    continue

                paths = [p for p in topdir.glob('*')]
                if not paths:
                    if not topdir.exists():
                        yield get_status_dict(path=str(topdir),
                                              status='notneeded',
                                              type='directory',
                                              **res_kwargs)
                        continue
                    else:
                        # we empty topdir only
                        message = ("%s empty %s directory", discover_or_remove,
                                   msg)
                else:
                    pl = len(paths) > 1
                    message = ("%s %d %s %s: %s", discover_or_remove,
                               len(paths), msg, sing_pl[int(pl)], ", ".join(
                                   sorted([
                                       str(p.relative_to(topdir))
                                       for p in paths if p != topdir
                                   ])))

                if not dry_run:
                    rmtree(str(topdir))

                yield get_status_dict(path=str(topdir),
                                      status='ok',
                                      type='directory',
                                      message=message,
                                      **res_kwargs)

    @staticmethod
    def custom_result_renderer(res, **kwargs):  # pragma: more cover
        # Don't render things like 'status' for clean-info messages -
        # seems rather meaningless.

        from os import getcwd

        import datalad.support.ansi_colors as ac
        from datalad.interface.utils import generic_result_renderer
        from datalad.utils import Path

        if res['action'] == 'clean':
            # default renderer is just fine
            return generic_result_renderer(res)
        elif res['action'] != 'clean [dry-run]':
            # Result didn't come from within `clean`.
            # Should be handled elsewhere.
            return

        assert res['action'] == 'clean [dry-run]'

        if res.get('status', None) == 'ok':
            from datalad.ui import ui

            # when to render relative paths:
            #  1) if a dataset arg was given
            #  2) if CWD is the refds

            refds = res.get('refds', None)
            refds = refds if kwargs.get('dataset', None) is not None \
                             or refds == getcwd() else None
            path = res['path'] if refds is None \
                else str(Path(res['path']).relative_to(refds))

            ui.message(u"{path}: {message}".format(
                path=ac.color_word(path, ac.BOLD),
                message=(res['message'][0] % res['message'][1:] if isinstance(
                    res['message'], tuple) else res['message']) if res.get(
                        'message', None) else ''))

        else:
            # Any other status than 'ok' is reported the default way.
            return generic_result_renderer(res)

    @staticmethod
    def custom_result_summary_renderer(results):
        # Since 'notneeded' results aren't rendered by default, give
        # a nothing-to-clean-message if all results were "notneeded",
        # to not remain entirely silent.

        if all(r['status'] == 'notneeded' for r in results):
            from datalad.ui import ui
            ui.message("nothing to clean, no temporary locations present.")
Exemplo n.º 22
0
class Create(Interface):
    """Create a new dataset from scratch.

    This command initializes a new dataset at a given location, or the
    current directory. The new dataset can optionally be registered in an
    existing superdataset (the new dataset's path needs to be located
    within the superdataset for that, and the superdataset needs to be given
    explicitly via [PY: `dataset` PY][CMD: --dataset CMD]). It is recommended
    to provide a brief description to label the dataset's nature *and*
    location, e.g. "Michael's music on black laptop". This helps humans to
    identify data locations in distributed scenarios.  By default an identifier
    comprised of user and machine name, plus path will be generated.

    This command only creates a new dataset, it does not add existing content
    to it, even if the target directory already contains additional files or
    directories.

    Plain Git repositories can be created via [PY: `annex=False` PY][CMD: --no-annex CMD].
    However, the result will not be a full dataset, and, consequently,
    not all features are supported (e.g. a description).

    || REFLOW >>
    To create a local version of a remote dataset use the
    :func:`~datalad.api.install` command instead.
    << REFLOW ||

    .. note::
      Power-user info: This command uses :command:`git init` and
      :command:`git annex init` to prepare the new dataset. Registering to a
      superdataset is performed via a :command:`git submodule add` operation
      in the discovered superdataset.
    """

    # in general this command will yield exactly one result
    return_type = 'item-or-list'
    # in general users expect to get an instance of the created dataset
    result_xfm = 'datasets'
    # result filter
    result_filter = \
        EnsureKeyChoice('action', ('create',)) & \
        EnsureKeyChoice('status', ('ok', 'notneeded'))

    _examples_ = [
        dict(text="Create a dataset 'mydataset' in the current directory",
             code_py="create(path='mydataset')",
             code_cmd="datalad create mydataset"),
        dict(text="Apply the text2git procedure upon creation of a dataset",
             code_py="create(path='mydataset', cfg_proc='text2git')",
             code_cmd="datalad create -c text2git mydataset"),
        dict(text="Create a subdataset in the root of an existing dataset",
             code_py="create(dataset='.', path='mysubdataset')",
             code_cmd="datalad create -d . mysubdataset"),
        dict(text="Create a dataset in an existing, non-empty directory",
             code_py="create(force=True)",
             code_cmd="datalad create --force"),
        dict(text="Create a plain Git repository",
             code_py="create(path='mydataset', annex=False)",
             code_cmd="datalad create --no-annex mydataset"),
    ]

    _params_ = dict(
        path=Parameter(
            args=("path", ),
            nargs='?',
            metavar='PATH',
            doc="""path where the dataset shall be created, directories
            will be created as necessary. If no location is provided, a dataset
            will be created in the location specified by [PY: `dataset`
            PY][CMD: --dataset CMD] (if given) or the current working
            directory. Either way the command will error if the target
            directory is not empty. Use [PY: `force` PY][CMD: --force CMD] to
            create a dataset in a non-empty directory.""",
            # put dataset 2nd to avoid useless conversion
            constraints=EnsureStr() | EnsureDataset() | EnsureNone()),
        initopts=Parameter(
            args=("initopts", ),
            metavar='INIT OPTIONS',
            nargs=REMAINDER,
            doc="""options to pass to :command:`git init`. [PY: Options can be
            given as a list of command line arguments or as a GitPython-style
            option dictionary PY][CMD: Any argument specified after the
            destination path of the repository will be passed to git-init
            as-is CMD]. Note that not all options will lead to viable results.
            For example '--bare' will not yield a repository where DataLad
            can adjust files in its working tree."""),
        dataset=Parameter(
            args=("-d", "--dataset"),
            metavar='DATASET',
            doc="""specify the dataset to perform the create operation on. If
            a dataset is given along with `path`, a new subdataset will be created
            in it at the `path` provided to the create command. If a dataset is
            given but `path` is unspecified, a new dataset will be created at the
            location specified by this option.""",
            constraints=EnsureDataset() | EnsureNone()),
        force=Parameter(
            args=(
                "-f",
                "--force",
            ),
            doc="""enforce creation of a dataset in a non-empty directory""",
            action='store_true'),
        description=location_description,
        annex=Parameter(
            args=("--no-annex", ),
            dest='annex',
            doc="""if [CMD: set CMD][PY: disabled PY], a plain Git repository
            will be created without any annex""",
            action='store_false'),
        # TODO seems to only cause a config flag to be set, this could be done
        # in a procedure
        fake_dates=Parameter(
            args=('--fake-dates', ),
            action='store_true',
            doc="""Configure the repository to use fake dates. The date for a
            new commit will be set to one second later than the latest commit
            in the repository. This can be used to anonymize dates."""),
        cfg_proc=Parameter(
            args=("-c", "--cfg-proc"),
            metavar="PROC",
            action='append',
            doc="""Run cfg_PROC procedure(s) (can be specified multiple times)
            on the created dataset. Use
            [PY: `run_procedure(discover=True)` PY][CMD: run-procedure --discover CMD]
            to get a list of available procedures, such as cfg_text2git.
            """))

    @staticmethod
    @datasetmethod(name='create')
    @eval_results
    def __call__(path=None,
                 initopts=None,
                 *,
                 force=False,
                 description=None,
                 dataset=None,
                 annex=True,
                 fake_dates=False,
                 cfg_proc=None):
        # we only perform negative tests below
        no_annex = not annex

        if dataset:
            if isinstance(dataset, Dataset):
                ds = dataset
            else:
                ds = Dataset(dataset)
            refds_path = ds.path
        else:
            ds = refds_path = None

        # two major cases
        # 1. we got a `dataset` -> we either want to create it (path is None),
        #    or another dataset in it (path is not None)
        # 2. we got no dataset -> we want to create a fresh dataset at the
        #    desired location, either at `path` or PWD

        # sanity check first
        if no_annex:
            if description:
                raise ValueError("Incompatible arguments: cannot specify "
                                 "description for annex repo and declaring "
                                 "no annex repo.")

        if (isinstance(initopts, (list, tuple))
                and '--bare' in initopts) or (isinstance(initopts, dict)
                                              and 'bare' in initopts):
            raise ValueError(
                "Creation of bare repositories is not supported. Consider "
                "one of the create-sibling commands, or use "
                "Git to init a bare repository and push an existing dataset "
                "into it.")

        if path:
            path = resolve_path(path, dataset)

        path = path if path \
            else getpwd() if ds is None \
            else refds_path

        # we know that we need to create a dataset at `path`
        assert (path is not None)

        # assure cfg_proc is a list (relevant if used via Python API)
        cfg_proc = ensure_list(cfg_proc)

        # prep for yield
        res = dict(action='create',
                   path=str(path),
                   logger=lgr,
                   type='dataset',
                   refds=refds_path)

        refds = None
        if refds_path and refds_path != str(path):
            refds = require_dataset(refds_path,
                                    check_installed=True,
                                    purpose='create a subdataset')

            path_inrefds = path_under_rev_dataset(refds, path)
            if path_inrefds is None:
                yield dict(
                    res,
                    status='error',
                    message=(
                        "dataset containing given paths is not underneath "
                        "the reference dataset %s: %s", ds, str(path)),
                )
                return

        # try to locate an immediate parent dataset
        # we want to know this (irrespective of whether we plan on adding
        # this new dataset to a parent) in order to avoid conflicts with
        # a potentially absent/uninstalled subdataset of the parent
        # in this location
        # it will cost some filesystem traversal though...
        parentds_path = get_dataset_root(
            op.normpath(op.join(str(path), os.pardir)))
        if parentds_path:
            prepo = GitRepo(parentds_path)
            parentds_path = Path(parentds_path)
            # we cannot get away with a simple
            # GitRepo.get_content_info(), as we need to detect
            # uninstalled/added subdatasets too
            check_path = Path(path)
            pstatus = prepo.status(
                untracked='no',
                # limit query to target path for a potentially massive speed-up
                paths=[check_path.relative_to(parentds_path)])
            if (not pstatus.get(check_path, {}).get("type") == "dataset"
                    and any(check_path == p or check_path in p.parents
                            for p in pstatus)):
                # redo the check in a slower fashion, it is already broken
                # let's take our time for a proper error message
                conflict = [
                    p for p in pstatus
                    if check_path == p or check_path in p.parents
                ]
                res.update({
                    'status':
                    'error',
                    'message':
                    ('collision with content in parent dataset at %s: %s',
                     str(parentds_path), [str(c) for c in conflict])
                })
                yield res
                return
            if not force:
                # another set of checks to see whether the target path is pointing
                # into a known subdataset that is not around ATM
                subds_status = {
                    parentds_path / k.relative_to(prepo.path)
                    for k, v in pstatus.items()
                    if v.get('type', None) == 'dataset'
                }
                check_paths = [check_path]
                check_paths.extend(check_path.parents)
                if any(p in subds_status for p in check_paths):
                    conflict = [p for p in check_paths if p in subds_status]
                    res.update({
                        'status':
                        'error',
                        'message':
                        ('collision with %s (dataset) in dataset %s',
                         str(conflict[0]), str(parentds_path))
                    })
                    yield res
                    return

        # important to use the given Dataset object to avoid spurious ID
        # changes with not-yet-materialized Datasets
        tbds = ds if isinstance(ds, Dataset) and \
            ds.path == path else Dataset(str(path))

        # don't create in non-empty directory without `force`:
        if op.isdir(tbds.path) and listdir(tbds.path) != [] and not force:
            res.update({
                'status':
                'error',
                'message':
                'will not create a dataset in a non-empty directory, use '
                '`--force` option to ignore'
            })
            yield res
            return

        # Check if specified cfg_proc(s) can be discovered, storing
        # the results so they can be used when the time comes to run
        # the procedure. If a procedure cannot be found, raise an
        # error to prevent creating the dataset.
        cfg_proc_specs = []
        if cfg_proc:
            discovered_procs = tbds.run_procedure(
                discover=True,
                result_renderer='disabled',
                return_type='generator',
            )
            for cfg_proc_ in cfg_proc:
                for discovered_proc in discovered_procs:
                    if discovered_proc['procedure_name'] == 'cfg_' + cfg_proc_:
                        cfg_proc_specs.append(discovered_proc)
                        break
                else:
                    raise ValueError("Cannot find procedure with name "
                                     "'%s'" % cfg_proc_)

        if initopts is not None and isinstance(initopts, list):
            initopts = {'_from_cmdline_': initopts}

        # Note for the code below:
        # OPT: be "smart" and avoid re-resolving .repo -- expensive in DataLad
        # Re-use tbrepo instance, do not use tbds.repo

        # create and configure desired repository
        # also provides initial set of content to be tracked with git (not annex)
        if no_annex:
            tbrepo, add_to_git = _setup_git_repo(path, initopts, fake_dates)
        else:
            tbrepo, add_to_git = _setup_annex_repo(path, initopts, fake_dates,
                                                   description)

        # OPT: be "smart" and avoid re-resolving .repo -- expensive in DataLad
        # Note: this must not happen earlier (before the if/else above), since
        # then it would not be "smart"
        tbds_config = tbds.config

        # record an ID for this repo for the afterlife
        # to be able to track siblings and children
        id_var = 'datalad.dataset.id'
        # Note, that Dataset property `id` will change when we unset the
        # respective config. Therefore store it before:
        tbds_id = tbds.id
        if id_var in tbds_config:
            # make sure we reset this variable completely, in case of a
            # re-create
            tbds_config.unset(id_var, scope='branch')

        if _seed is None:
            # just the standard way
            # use a fully random identifier (i.e. UUID version 4)
            uuid_id = str(uuid.uuid4())
        else:
            # Let's generate preseeded ones
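            # (assuming `_seed` is a test hook that seeds the `random` module
            # elsewhere, this yields reproducible dataset IDs across runs)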
            uuid_id = str(uuid.UUID(int=random.getrandbits(128)))
        tbds_config.add(id_var,
                        tbds_id if tbds_id is not None else uuid_id,
                        scope='branch',
                        reload=False)

        # make config overrides permanent in the repo config
        # this is similar to what `annex init` does
        # we are only doing this for config overrides and do not expose
        # a dedicated argument, because it is sufficient for the cmdline
        # and unnecessary for the Python API (there could simply be a
        # subsequent ds.config.add() call)
        for k, v in tbds_config.overrides.items():
            tbds_config.add(k, v, scope='local', reload=False)

        # all config manipulation is done -> full reload
        tbds_config.reload()

        # must use the repo.pathobj as this will have resolved symlinks
        add_to_git[tbrepo.pathobj / '.datalad'] = {
            'type': 'directory',
            'state': 'untracked'
        }

        # save everything, we need to do this now and cannot merge with the
        # call below, because we may need to add this subdataset to a parent
        # but cannot until we have a first commit
        tbrepo.save(
            message='[DATALAD] new dataset',
            git=True,
            # we have to supply our own custom status, as the repo does
            # not have a single commit yet and there is no HEAD reference
            # TODO make `GitRepo.status()` robust to this state.
            _status=add_to_git,
        )

        for cfg_proc_spec in cfg_proc_specs:
            yield from tbds.run_procedure(
                cfg_proc_spec,
                result_renderer='disabled',
                return_type='generator',
            )

        # the next only makes sense if we saved the created dataset,
        # otherwise we have no committed state to be registered
        # in the parent
        if isinstance(refds, Dataset) and refds.path != tbds.path:
            # we created a dataset in another dataset
            # -> make submodule
            yield from refds.save(
                path=tbds.path,
                return_type='generator',
                result_renderer='disabled',
            )

        res.update({'status': 'ok'})
        yield res
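
# --- Added illustration (not part of the original example) ---
# A minimal, self-contained sketch of the dataset-ID logic above: a fully
# random UUID4 is used normally, while a seeded RNG yields a reproducible
# UUID for testing. `make_dataset_id` is a hypothetical helper name.
import random
import uuid

def make_dataset_id(seed=None):
    if seed is None:
        # the standard way: a fully random identifier (UUID version 4)
        return str(uuid.uuid4())
    # preseeded, reproducible identifier
    rng = random.Random(seed)
    return str(uuid.UUID(int=rng.getrandbits(128)))

# identical seeds produce identical dataset IDs
assert make_dataset_id(42) == make_dataset_id(42)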
Exemplo n.º 23
0
class Uninstall(Interface):
    """Uninstall a dataset component or entire datasets."""

    # TODO: It is not yet clear what the actual meaning of uninstall
    # (including its options) is, and what the methods are to uninstall
    # certain components.

    # uninstall should be the opposite of install, obviously. that means:
    #   - we uninstall FROM a dataset as opposed to install INTO a dataset
    #   - any operation possible by install should be possible to be reverted
    #     by uninstall

    # If we want to uninstall something "completely", --recursive is implied.
    # Do we require the user to nevertheless explicitly use `recursive`?

    # possible components to uninstall:
    #   - submodule (checked out or not checked out) (fulfilled, unfulfilled)
    #   - annex'ed files with no content
    #   - annex'ed files with content
    #   - files in git
    #   - untracked files ? Do we want to deal with them at all?
    #   - directories (empty or not)? May be not, since we cannot install a
    #     directory, or can we?

    _params_ = dict(
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc="""specify the dataset to perform the uninstall operation on.
            If no dataset is given, an attempt is made to identify the dataset
            based on the current working directory and/or the `path` given""",
            constraints=EnsureDataset() | EnsureNone()),
        path=Parameter(args=("path", ),
                       doc="path/name of the component to be uninstalled",
                       nargs="*",
                       constraints=EnsureStr() | EnsureNone()),
        data_only=Parameter(
            args=("--data-only", ),
            doc="If set, only data is uninstalled, but the handles are kept.",
            action="store_true"),
        recursive=Parameter(
            args=("-r", "--recursive"),
            doc="""If set, uninstall recursively, including all subdatasets.
            The value of `data` is used for recursive uninstallation, too.""",
            action="store_true"))

    @staticmethod
    @datasetmethod(name='uninstall')
    def __call__(dataset=None, path=None, data_only=True, recursive=False):

        # Note: copy logic from install to resolve dataset and path:
        # shortcut
        ds = dataset

        if ds is not None and not isinstance(ds, Dataset):
            ds = Dataset(ds)

        if not path:
            if ds is None:
                # no dataset, no target location, nothing to do
                raise ValueError(
                    "insufficient information for uninstallation (needs at "
                    "least a dataset or a path)")
        elif isinstance(path, list):
            # TODO: not sure. might be possible to deal with that list directly
            return [
                Uninstall.__call__(dataset=ds,
                                   path=p,
                                   data_only=data_only,
                                   recursive=recursive) for p in path
            ]

        # resolve the target location against the provided dataset
        if path is not None:
            path = resolve_path(path, ds)

        lgr.debug("Resolved uninstallation target: {0}".format(path))

        # if we have no dataset given, figure out which one we need to operate
        # on, based on the resolved target location (that is now guaranteed to
        # be specified)
        if ds is None:
            # try to find a dataset at or above the installation target
            dspath = GitRepo.get_toppath(abspath(path))
            if dspath is None:
                # no top-level dataset found, use path as such
                dspath = path
            ds = Dataset(dspath)
        assert (ds is not None)

        lgr.debug("Resolved target dataset for uninstallation: {0}".format(ds))

        if not ds.is_installed():
            if not path or path == ds.path:
                # we want to uninstall the dataset itself, which is not
                # installed => nothing to do
                # TODO: consider `data` option! is_installed currently only
                # checks for a repository
                lgr.info("Dataset {0} not installed. Nothing to "
                         "do.".format(ds.path))
                return
            else:
                # we want to uninstall something from a not installed dataset
                # Doesn't make sense, does it? => fail
                raise ValueError("Dataset {0} is not installed.".format(
                    ds.path))

        assert (ds.repo is not None)

        if not path or path == ds.path:
            # uninstall the dataset `ds`
            # TODO: what to consider?
            #   - whether it is a submodule of another dataset
            #   - `data_only` ?
            #   - `recursive`
            #   - what to return in what case (data_only)?
            raise NotImplementedError("TODO: Uninstall dataset %s" % ds.path)

        # needed by the logic below
        assert (isabs(path))

        # express the destination path relative to the root of this dataset
        relativepath = relpath(path, start=ds.path)
        if relativepath.startswith(pardir):
            raise ValueError("uninstallation path outside dataset")

        lgr.debug(
            "Resolved uninstallation target relative to dataset {0}: {1}".
            format(ds, relativepath))

        # figure out, what path actually is pointing to:
        if not exists(path):
            # nothing there, nothing to uninstall
            lgr.info("Nothing found to uninstall at %s" % path)
            return

        if relativepath in ds.get_dataset_handles(recursive=True):
            # it's a submodule
            # --recursive required or implied?
            raise NotImplementedError("TODO: uninstall submodule %s from "
                                      "dataset %s" % (relativepath, ds.path))

        if isdir(path):
            # don't know what to do yet
            # in git vs. untracked?
            # recursive?
            raise NotImplementedError("TODO: uninstall directory %s from "
                                      "dataset %s" % (path, ds.path))

        # we know, it's an existing file
        _file_in_git = False
        _untracked_or_within_submodule = False
        if isinstance(ds.repo, AnnexRepo):
            try:
                ds.repo.get_file_key(relativepath)
            except FileInGitError:
                # file directly in git
                _file_in_git = True
            except FileNotInAnnexError:
                # either an untracked file in this dataset, or something that
                # also actually exists in the file system but could be part of
                # a subdataset
                _untracked_or_within_submodule = True
            else:
                # it's an annexed file
                if data_only:
                    ds.repo.annex_drop([path])
                    return path
                raise NotImplementedError("TODO: fully uninstall file %s "
                                          "(annex) from dataset %s" %
                                          (path, ds.path))
        else:
            # plain git repo
            if relativepath in ds.repo.get_indexed_files():
                # file directly in git
                _file_in_git = True
            else:
                # either an untracked file in this dataset, or something that
                # also actually exists in the file system but could be part of
                # a subdataset
                _untracked_or_within_submodule = True

        if _file_in_git:
            if data_only:
                raise ValueError("%s is not a file handle. Removing its "
                                 "data only doesn't make sense." % path)
            else:
                return ds.repo.git_remove([relativepath])

        elif _untracked_or_within_submodule:
            subds = get_containing_subdataset(ds, relativepath)
            if ds.path != subds.path:
                # target path belongs to a subdataset, hand uninstallation
                # over to it
                return subds.uninstall(path=relpath(path, start=subds.path),
                                       data_only=data_only,
                                       recursive=recursive)

            # this must be an untracked/existing something
            # it wasn't installed, so we cannot uninstall it
            raise ValueError("Cannot uninstall %s" % path)
Exemplo n.º 24
0
class Metadata(Interface):
    """Metadata reporting for files and entire datasets

    Two types of metadata are supported:

    1. metadata describing a dataset as a whole (dataset-global metadata), and

    2. metadata for files in a dataset (content metadata).

    Both types can be accessed with this command.

    Examples:

      Report the metadata of a single file, as aggregated into the closest
      locally available dataset, containing the query path::

        % datalad metadata somedir/subdir/thisfile.dat

      Sometimes it is helpful to get metadata records formatted in a more accessible
      form, here as pretty-printed JSON::

        % datalad -f json_pp metadata somedir/subdir/thisfile.dat

      Same query as above, but specify which dataset to query (must be
      containing the query path)::

        % datalad metadata -d . somedir/subdir/thisfile.dat

      Report any metadata record of any dataset known to the queried dataset::

        % datalad metadata --recursive --reporton datasets 

      Get a JSON-formatted report of aggregated metadata in a dataset, incl.
      information on enabled metadata extractors, dataset versions, dataset IDs,
      and dataset paths::

        % datalad -f json metadata --get-aggregates
    """
    # make the custom renderer the default, path reporting isn't the top
    # priority here
    result_renderer = 'tailored'

    _params_ = dict(
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc="""dataset to query. If given, metadata will be reported
            as stored in this dataset. Otherwise, the closest available
            dataset containing a query path will be consulted.""",
            constraints=EnsureDataset() | EnsureNone()),
        path=Parameter(args=("path", ),
                       metavar="PATH",
                       doc="path(s) to query metadata for",
                       nargs="*",
                       constraints=EnsureStr() | EnsureNone()),
        get_aggregates=Parameter(
            args=('--get-aggregates', ),
            action='store_true',
            doc="""if set, yields all (sub)datasets for which aggregate
            metadata are available in the dataset. No other action is
            performed, even if other arguments are given. The reported
            results contain a dataset's ID, the commit hash at which
            metadata aggregation was performed, and the location of the
            object file(s) containing the aggregated metadata."""),
        reporton=reporton_opt,
        recursive=recursion_flag)
    # MIH: not sure if a recursion limit makes sense here
    # ("outdated from 5 levels down?")
    #recursion_limit=recursion_limit)

    @staticmethod
    @datasetmethod(name='metadata')
    @eval_results
    def __call__(path=None,
                 dataset=None,
                 get_aggregates=False,
                 reporton='all',
                 recursive=False):
        # prep results
        refds_path = Interface.get_refds_path(dataset)
        res_kwargs = dict(action='metadata', logger=lgr)
        if refds_path:
            res_kwargs['refds'] = refds_path

        if get_aggregates:
            # yield all datasets for which we have aggregated metadata as results
            # these are actual dataset results, so we can turn them into dataset
            # instances using generic top-level code if desired
            ds = require_dataset(refds_path,
                                 check_installed=True,
                                 purpose='aggregate metadata query')
            info_fpath = opj(ds.path, agginfo_relpath)
            if not exists(info_fpath):
                # if there had ever been an aggregation run, this file would
                # exist; hence there has not been one, and we need to tell
                # this to people
                yield get_status_dict(
                    ds=ds,
                    status='impossible',
                    action='metadata',
                    logger=lgr,
                    message=
                    'metadata aggregation has never been performed in this dataset'
                )
                return
            agginfos = _load_json_object(info_fpath)
            parentds = []
            for sd in sorted(agginfos):
                info = agginfos[sd]
                dspath = normpath(opj(ds.path, sd))
                if parentds and not path_is_subpath(dspath, parentds[-1]):
                    parentds.pop()
                info.update(
                    path=dspath,
                    type='dataset',
                    status='ok',
                )
                if sd == curdir:
                    info['layout_version'] = aggregate_layout_version
                if parentds:
                    info['parentds'] = parentds[-1]
                yield dict(info, **res_kwargs)
                parentds.append(dspath)
            return

        if not dataset and not path:
            # makes no sense to have no dataset, go with "here"
            # error generation happens during annotation
            path = curdir

        content_by_ds = OrderedDict()
        for ap in AnnotatePaths.__call__(
                dataset=refds_path,
                path=path,
                # MIH: we are querying the aggregated metadata anyways, and that
                # mechanism has its own, faster way to go down the hierarchy
                #recursive=recursive,
                #recursion_limit=recursion_limit,
                action='metadata',
                # uninstalled subdatasets could be queried via aggregated metadata
                # -> no 'error'
                unavailable_path_status='',
                nondataset_path_status='error',
                # we need to know when to look into aggregated data
                force_subds_discovery=True,
                force_parentds_discovery=True,
                return_type='generator',
                on_failure='ignore'):
            if ap.get('status', None):
                # this is done
                yield ap
                continue
            if ap.get('type', None) == 'dataset' and GitRepo.is_valid_repo(
                    ap['path']):
                ap['process_content'] = True
            to_query = None
            if ap.get('state', None) == 'absent' or \
                    ap.get('type', 'dataset') != 'dataset':
                # this is a lonely absent dataset/file or content in a present dataset
                # -> query through parent
                # there must be a parent, otherwise this would be a non-dataset path
                # and would have errored during annotation
                to_query = ap['parentds']
            else:
                to_query = ap['path']
            if to_query:
                pcontent = content_by_ds.get(to_query, [])
                pcontent.append(ap)
                content_by_ds[to_query] = pcontent

        for ds_path in content_by_ds:
            ds = Dataset(ds_path)
            query_agg = [
                ap for ap in content_by_ds[ds_path]
                # this is an available subdataset, will be processed in another
                # iteration
                if ap.get('state', None) == 'absent' or not (ap.get(
                    'type', None) == 'dataset' and ap['path'] != ds_path)
            ]
            if not query_agg:
                continue
            # report from aggregated metadata
            for r in query_aggregated_metadata(
                    reporton,
                    # by default query the reference dataset, only if there is none
                    # try our luck in the dataset that contains the queried path
                    # this is consistent with e.g. `get_aggregates` reporting the
                    # situation in the reference dataset only
                    Dataset(refds_path) if refds_path else ds,
                    query_agg,
                    # recursion above could only recurse into datasets
                    # on the filesystem, but there might be any number of
                    # uninstalled datasets underneath the last installed one
                    # for which we might have metadata
                    recursive=recursive,
                    **res_kwargs):
                yield r
        return

    @staticmethod
    def custom_result_renderer(res, **kwargs):
        if res['status'] != 'ok' or not res.get('action', None) == 'metadata':
            # logging complained about this already
            return
        # list the path, available metadata keys, and tags
        path = relpath(res['path'], res['refds']) if res.get(
            'refds', None) else res['path']
        meta = res.get('metadata', {})
        ui.message('{path}{type}:{spacer}{meta}{tags}'.format(
            path=ac.color_word(path, ac.BOLD),
            type=' ({})'.format(ac.color_word(res['type'], ac.MAGENTA))
            if 'type' in res else '',
            spacer=' ' if len([m for m in meta if m != 'tag']) else '',
            meta=','.join(k for k in sorted(meta.keys())
                          if k not in ('tag', '@context', '@id'))
            if meta else ' -' if 'metadata' in res else ' aggregated',
            tags='' if 'tag' not in meta else ' [{}]'.format(','.join(
                assure_list(meta['tag'])))))
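
# --- Added illustration (not part of the original example) ---
# Self-contained sketch of the parent-tracking stack in the --get-aggregates
# loop above: datasets are visited in sorted order and the deepest dataset on
# the stack that still contains the current path becomes its 'parentds'.
# Simplified with a while-loop; paths are hypothetical.
from os.path import join as opj, normpath

def assign_parents(base, relpaths):
    parents, result = [], {}
    for sd in sorted(relpaths):
        dspath = normpath(opj(base, sd))
        while parents and not (dspath + '/').startswith(parents[-1] + '/'):
            parents.pop()
        result[dspath] = parents[-1] if parents else None
        parents.append(dspath)
    return result

print(assign_parents('/ds', ['.', 'sub1', 'sub1/deep', 'sub2']))
# {'/ds': None, '/ds/sub1': '/ds', '/ds/sub1/deep': '/ds/sub1', '/ds/sub2': '/ds'}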
Exemplo n.º 25
0
class Save(Interface):
    """Save the current state of a dataset

    Saving the state of a dataset records changes that have been made to it.
    This change record is annotated with a user-provided description.
    Optionally, an additional tag, such as a version, can be assigned to the
    saved state. Such tag enables straightforward retrieval of past versions at
    a later point in time.

    Examples:

      Save any content underneath the current directory, without altering
      any potential subdataset (use --recursive for that)::

        % datalad rev-save .

      Save any modification of known dataset content, but leave untracked
      files (e.g. temporary files) untouched::

        % datalad rev-save -u -d <path_to_dataset>

      Tag the most recent saved state of a dataset::

        % datalad rev-save -d <path_to_dataset> --version-tag bestyet

    .. note::
      For performance reasons, any Git repository without an initial commit
      located inside a Dataset is ignored, and content underneath it will be
      saved to the respective superdataset. DataLad datasets always have an
      initial commit, hence are not affected by this behavior.
    """
    # note above documents that our behavior is like that of `git add`, but
    # does not explicitly mention the connection to keep it simple.

    _params_ = dict(
        dataset=Parameter(args=("-d", "--dataset"),
                          doc=""""specify the dataset to save""",
                          constraints=EnsureDataset() | EnsureNone()),
        path=Parameter(
            args=("path", ),
            metavar='PATH',
            doc="""path/name of the dataset component to save. If given, only
            changes made to those components are recorded in the new state.""",
            nargs='*',
            constraints=EnsureStr() | EnsureNone()),
        message=save_message_opt,
        message_file=Parameter(
            args=("-F", "--message-file"),
            doc="""take the commit message from this file. This flag is
            mutually exclusive with -m.""",
            constraints=EnsureStr() | EnsureNone()),
        version_tag=Parameter(
            args=(
                "-t",
                "--version-tag",
            ),
            metavar='ID',
            doc="""an additional marker for that state. Every dataset that
            is touched will receive the tag.""",
            constraints=EnsureStr() | EnsureNone()),
        recursive=recursion_flag,
        recursion_limit=recursion_limit,
        updated=Parameter(
            args=(
                '-u',
                '--updated',
            ),
            action='store_true',
            doc="""if given, only saves previously tracked paths."""),
        to_git=Parameter(
            args=("--to-git", ),
            action='store_true',
            doc="""flag whether to add data directly to Git, instead of
            tracking data identity only.  Usually this is not desired,
            as it inflates dataset sizes and impacts flexibility of data
            transport. If not specified - it will be up to git-annex to
            decide, possibly on .gitattributes options. Use this flag
            with a simultaneous selection of paths to save. In general,
            it is better to pre-configure a dataset to track particular paths,
            file types, or file sizes with either Git or git-annex.
            See https://git-annex.branchable.com/tips/largefiles/"""),
    )

    @staticmethod
    @datasetmethod(name='rev_save')
    @eval_results
    def __call__(
        path=None,
        message=None,
        dataset=None,
        version_tag=None,
        recursive=False,
        recursion_limit=None,
        updated=False,
        message_file=None,
        to_git=None,
    ):
        if message and message_file:
            raise ValueError(
                "Both a message and message file were specified for save()")

        path = assure_list(path)

        if message_file:
            with open(message_file) as mfh:
                message = mfh.read()

        # we want 'normal' to achieve the most compact argument list
        # for git calls
        # untracked_mode = 'no' if updated else 'normal'
        # TODO however, Repo.add() would refuse to add any dotfiles
        # in a directory that is itself untracked, hence the only
        # choice is to go with potentially crazy long lists
        # until https://github.com/datalad/datalad/issues/1454
        # has a resolution
        untracked_mode = 'no' if updated else 'all'

        # there are three basic scenarios:
        # 1. save modifications to any already tracked content
        # 2. save any content (including removal of deleted content)
        #    to bring things to a clean state
        # 3. like (2), but only operate on a given subset of content
        #    identified by paths
        # - all three have to work in conjunction with --recursive
        # - the difference between (1) and (2) should be no more
        #   than a switch from --untracked=no to --untracked=all
        #   in Repo.save()

        # we do not support
        # - simultaneous operations on multiple datasets from disjoint
        #   dataset hierarchies, hence a single reference dataset must be
        #   identifiable from either
        #   - curdir or
        #   - the `dataset` argument.
        #   This avoids complex annotation loops and hierarchy tracking.
        # - any modification upwards from the root dataset

        ds = require_dataset(dataset, check_installed=True, purpose='saving')

        # use status() to do all discovery and annotation of paths
        paths_by_ds = {}
        for s in Status()(
                # ATTN: it is vital to pass the `dataset` argument as it,
                # and not a dataset instance in order to maintain the path
                # semantics between here and the status() call
                dataset=dataset,
                path=path,
                untracked=untracked_mode,
                recursive=recursive,
                recursion_limit=recursion_limit,
                result_renderer='disabled'):
            # fish out status dict for this parent dataset
            ds_status = paths_by_ds.get(s['parentds'], {})
            # reassemble path status info as repo.status() would have made it
            ds_status[ut.Path(s['path'])] = \
                {k: v for k, v in iteritems(s)
                 if k not in (
                     'path', 'parentds', 'refds', 'status', 'action',
                     'logger')}
            paths_by_ds[s['parentds']] = ds_status

        lgr.debug('Determined %i datasets for saving from input arguments',
                  len(paths_by_ds))
        # figure out what datasets to process, start with the ones containing
        # the paths that were given as arguments
        discovered_datasets = list(paths_by_ds.keys())
        if dataset:
            # if a reference dataset was given we want to save all the way up
            # to it, so let's throw it into the mix
            discovered_datasets.append(ds.path)
        # sort the datasets into (potentially) disjoint hierarchies,
        # or a single one, if a reference dataset was given
        dataset_hierarchies = get_tree_roots(discovered_datasets)
        for rootds, children in iteritems(dataset_hierarchies):
            edges = {}
            discover_dataset_trace_to_targets(rootds,
                                              children, [],
                                              edges,
                                              includeds=children)
            for superds, subdss in iteritems(edges):
                superds_status = paths_by_ds.get(superds, {})
                for subds in subdss:
                    # TODO actually start from an entry that may already
                    # exist in the status record
                    superds_status[ut.Path(subds)] = dict(
                        # shot from the hip, some status config
                        # to trigger this specific super/sub
                        # relation to be saved
                        state='untracked',
                        type='dataset')
                paths_by_ds[superds] = superds_status

        # TODO parallelize, whenever we have multiple subdataset of a single
        # dataset they can all be processed simultaneously
        # sort list of dataset to handle, starting with the ones deep down
        for pdspath in sorted(paths_by_ds, reverse=True):
            pds = Dataset(pdspath)
            # pop status for this dataset, we are not coming back to it
            pds_status = {
                # for handing over to the low-level code, we recode any
                # path relative to the real repo location, this avoids
                # cumbersome symlink handling without context in the
                # lower levels
                pds.repo.pathobj / p.relative_to(pdspath): props
                for p, props in iteritems(paths_by_ds.pop(pdspath))
            }
            start_commit = pds.repo.get_hexsha()
            if not all(p['state'] == 'clean' for p in pds_status.values()):
                for res in pds.repo.save_(
                        message=message,
                        # make sure to have the `path` arg be None, as we want
                        # to prevent and bypass any additional repo.status()
                        # calls
                        paths=None,
                        # prevent whining of GitRepo
                        git=True
                        if not hasattr(ds.repo, 'annexstatus') else to_git,
                        # we are supplying the full status already, do not
                        # detect anything else
                        untracked='no',
                        _status=pds_status):
                    # TODO remove stringification when datalad-core can handle
                    # path objects, or when PY3.6 is the lowest supported
                    # version
                    for k in ('path', 'refds'):
                        if k in res:
                            res[k] = str(
                                # recode path back to dataset path anchor
                                pds.pathobj /
                                res[k].relative_to(pds.repo.pathobj))
                    yield res
            # report on the dataset itself
            dsres = dict(
                action='save',
                type='dataset',
                path=pds.path,
                refds=ds.path,
                status='ok'
                if start_commit != pds.repo.get_hexsha() else 'notneeded',
                logger=lgr,
            )
            if not version_tag:
                yield dsres
                continue
            try:
                pds.repo.tag(version_tag)
                dsres.update(status='ok', version_tag=version_tag)
                yield dsres
            except CommandError as e:
                if dsres['status'] == 'ok':
                    # first we yield the result for the actual save
                    yield dsres.copy()
                # and now complain that tagging didn't work
                dsres.update(status='error',
                             message=('cannot tag this version: %s',
                                      e.stderr.strip()))
                yield dsres
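
# --- Added illustration (not part of the original example) ---
# Self-contained sketch of the bottom-up ordering used above: paths grouped
# per containing dataset are saved deepest-first, so a superdataset can then
# record the updated state of its subdatasets. Paths are hypothetical.
paths_by_ds = {
    '/ds': {'notes.txt': {}},
    '/ds/sub': {'raw.csv': {}},
    '/ds/sub/deep': {'img.nii': {}},
}
# a plain reverse lexical sort puts subdatasets before their superdatasets
print(sorted(paths_by_ds, reverse=True))
# ['/ds/sub/deep', '/ds/sub', '/ds']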
Exemplo n.º 26
0
class Update(Interface):
    """Update an existing dataset of a UKbiobank participant

    This command expects an ukb-init initialized DataLad dataset. The dataset
    may or may not have any downloaded content already.

    Downloads are performed with the `ukbfetch` tool, which is expected to
    be available and executable.
    """

    _params_ = dict(
        dataset=Parameter(
            args=("-d", "--dataset"),
            metavar='DATASET',
            doc="""specify the dataset to perform the initialization on""",
            constraints=EnsureDataset() | EnsureNone()),
        keyfile=Parameter(args=(
            '-k',
            '--keyfile',
        ),
                          metavar='PATH',
                          doc="""path to a file with an authentification key
            (ukbfetch -a ...). If none is given, the configuration
            datalad.ukbiobank.keyfile is consulted.""",
                          constraints=EnsureStr() | EnsureNone()),
        merge=Parameter(args=('--merge', ),
                        action='store_true',
                        doc="""merge any updates into the active branch
            """),
        force_update=Parameter(
            args=('--force-update', ),
            action='store_true',
            doc="""update the incoming-processed branch, even if (re-)download
            did not yield changed content (can be useful when restructuring
            setup has changed)."""),
        bids=Parameter(
            args=('--bids', ),
            action='store_true',
            doc="""restructure the incoming-processed branch into a BIDS-like
            organization."""),
        non_bids_dir=Parameter(
            args=('--non-bids-dir', ),
            metavar='PATH',
            doc="""if BIDS restructuring is enabled, relative path (to the
            session directory) of a directory to place all unrecognized files
            into.""",
            constraints=EnsureStr() | EnsureNone()),
    )

    @staticmethod
    @datasetmethod(name='ukb_update')
    @eval_results
    def __call__(keyfile=None,
                 merge=False,
                 force_update=False,
                 bids=False,
                 non_bids_dir='non-bids',
                 dataset=None):
        ds = require_dataset(dataset, check_installed=True, purpose='update')

        repo = ds.repo
        if not keyfile:
            # will error out, if no config was given
            keyfile = repo.config.obtain('datalad.ukbiobank.keyfile')

        # prep for yield
        res = dict(
            action='ukb_update',
            path=ds.path,
            type='dataset',
            logger=lgr,
            refds=ds.path,
        )

        if repo.dirty:
            yield dict(
                res,
                status='error',
                message="Refuse to operate on dirty dataset",
            )
            return

        # check if we have 'ukbfetch' before we start fiddling with the dataset
        # and leave it in a mess for no reason
        try:
            subprocess.run(
                # pull version info
                ['ukbfetch', '-i'],
                capture_output=True,
            )
        except Exception as e:
            raise RuntimeError(
                "Cannot execute 'ukbfetch'. Original error: {}".format(e))

        # just to be nice, and to be able to check it out again,
        # when we are done
        initial_branch = repo.get_active_branch()
        initial_incoming = repo.get_hexsha('incoming')

        # make sure we are in incoming
        repo.call_git(['checkout', 'incoming'])

        # first wipe out all prev. downloaded zip files so we can detect
        # when some files are no longer available
        for fp in repo.pathobj.glob('[0-9]*_[0-9]*_[0-9]_[0-9].*'):
            fp.unlink()

        # a place to put the download logs
        # better be semi-persistent to ease inspection
        tmpdir = repo.pathobj / repo.get_git_dir(repo) / 'tmp' / 'ukb'
        tmpdir.mkdir(parents=True, exist_ok=True)

        # redownload, run with explicit mode, because we just deleted the
        # ZIP files and that is OK
        ds.run(
            cmd='ukbfetch -v -a{} -b.ukbbatch -o{}'.format(
                quote_cmdlinearg(keyfile),
                quote_cmdlinearg(str(tmpdir)),
            ),
            explicit=True,
            outputs=['.'],
            message="Update from UKbiobank",
        )

        # TODO what if something broke before? needs force switch
        if not force_update and repo.get_hexsha() == initial_incoming:
            yield dict(
                res,
                status='notneeded',
                message='No new content available',
            )
            repo.call_git(['checkout', initial_branch])
            # TODO drop?
            return

        # onto extraction and transformation of downloaded content
        repo.call_git(['checkout', 'incoming-processed'])

        # mark the incoming change as merged
        # (but we do not actually want any branch content)
        repo.call_git(['merge', '--strategy=ours', 'incoming'])

        for fp in repo.get_content_info(ref='incoming-processed',
                                        eval_file_type=False):
            fp.unlink()

        subid = None
        if bids:
            from datalad_ukbiobank.ukb2bids import restructure_ukb2bids
            # get participant ID from batch file
            subid = list(
                repo.call_git_items_(["cat-file", "-p", "incoming:.ukbbatch"
                                      ]))[0].split(maxsplit=1)[0]

        # discover all zip files present in the last commit in 'incoming'
        for fp, props in repo.get_content_annexinfo(
                ref='incoming', eval_availability=False).items():
            if fp.name.startswith('.'):
                # skip internals
                continue
            # we have to extract into per-instance directories, otherwise files
            # would conflict
            ids = fp.stem.split('_')
            if not len(ids) >= 3:
                raise RuntimeError(
                    'Unrecognized filename structure: {}'.format(fp))
            extract_dir = repo.pathobj / 'instance-{}'.format(ids[2])
            extract_dir.mkdir(exist_ok=True)

            if fp.suffix == '.zip':
                with chpwd(extract_dir):
                    # extract and add their content
                    AddArchiveContent.__call__(
                        props['key'],
                        key=True,
                        annex=repo,
                        # --use-current-dir due to
                        # https://github.com/datalad/datalad/issues/3995
                        use_current_dir=True,
                        allow_dirty=True,
                        commit=False,
                    )
            else:
                # move into instance dir, and strip participant ID, and instance ID
                # but keep array index
                # e.g. -> 25747_3_0.adv -> instance-3/25747_0
                repo.call_git([
                    'annex', 'fromkey', props['key'],
                    str(extract_dir /
                        ('_'.join(ids[1::2]) + ''.join(fp.suffixes)))
                ])

            if bids:
                yield from restructure_ukb2bids(
                    ds,
                    subid=subid,
                    unrecognized_dir=Path('ses-{}'.format(ids[2])) /
                    non_bids_dir,
                    base_path=extract_dir,
                    session=ids[2],
                )

        # save whatever the state is now, `save` will discover deletions
        # automatically and also commit them -- wonderful!
        ds.save(message="Track ZIP file content")
        yield dict(
            res,
            status='ok',
        )

        if not merge:
            return

        # and update active branch
        repo.call_git(['checkout', initial_branch])

        if initial_branch in ('incoming', 'incoming-processed'):
            yield dict(
                res,
                action='ukb_merge_update',
                status='impossible',
                message='Refuse to merge into incoming* branch',
            )
            return

        repo.call_git([
            'merge', '-m', "Merge update from UKbiobank", 'incoming-processed'
        ])

        yield dict(
            res,
            action='ukb_merge_update',
            status='ok',
        )
        return
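
# --- Added illustration (not part of the original example) ---
# Self-contained sketch of the non-ZIP renaming above: the participant and
# instance IDs are stripped from '<participant>_<field>_<instance>_<array>',
# keeping field ID and array index, and the file is placed in a per-instance
# directory. The file name is hypothetical.
from pathlib import PurePosixPath

def ukb_target(fname):
    fp = PurePosixPath(fname)
    ids = fp.stem.split('_')
    if len(ids) < 3:
        raise RuntimeError('Unrecognized filename structure: {}'.format(fp))
    return PurePosixPath('instance-{}'.format(ids[2])) / (
        '_'.join(ids[1::2]) + ''.join(fp.suffixes))

print(ukb_target('1234567_25747_3_0.adv'))  # instance-3/25747_0.adv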
Exemplo n.º 27
0
class Clone(Interface):
    """Obtain a dataset (copy) from a URL or local directory

    The purpose of this command is to obtain a new clone (copy) of a dataset
    and place it into a not-yet-existing or empty directory. As such `clone`
    provides a strict subset of the functionality offered by `install`. Only a
    single dataset can be obtained, and immediate recursive installation of
    subdatasets is not supported. However, once a (super)dataset is installed
    via `clone`, any content, including subdatasets can be obtained by a
    subsequent `get` command.

    Primary differences over a direct `git clone` call are 1) the automatic
    initialization of a dataset annex (pure Git repositories are equally
    supported); 2) automatic registration of the newly obtained dataset as a
    subdataset (submodule), if a parent dataset is specified; 3) support
    for additional resource identifiers (DataLad resource identifiers as used
    on datasets.datalad.org, and RIA store URLs as used for store.datalad.org
    - optionally in specific versions as identified by a branch or a tag; see
    examples); and 4) automatic configurable generation of alternative access
    URLs for common cases (such as appending '.git' to the URL in case
    accessing the base URL failed).

    || PYTHON >>By default, the command returns a single Dataset instance for
    an installed dataset, regardless of whether it was newly installed ('ok'
    result), or found already installed from the specified source ('notneeded'
    result).<< PYTHON ||

    .. seealso::

      :ref:`handbook:3-001`
        More information on Remote Indexed Archive (RIA) stores
    """
    # by default ignore everything but install results
    # i.e. no "add to super dataset"
    result_filter = EnsureKeyChoice('action', ('install',))
    # very frequently this command will yield exactly one installed dataset
    # spare people the pain of going through a list by default
    return_type = 'item-or-list'
    # as discussed in #1409 and #1470, we want to return dataset instances
    # matching what is actually available after command completion (and
    # None for any failed dataset installation)
    result_xfm = 'successdatasets-or-none'

    _examples_ = [
        dict(text="Install a dataset from Github into the current directory",
             code_py="clone("
             "source='https://github.com/datalad-datasets/longnow"
             "-podcasts.git')",
             code_cmd="datalad clone "
             "https://github.com/datalad-datasets/longnow-podcasts.git"),
        dict(text="Install a dataset into a specific directory",
             code_py="""\
             clone(source='https://github.com/datalad-datasets/longnow-podcasts.git',
                   path='myfavpodcasts')""",
             code_cmd="""\
             datalad clone https://github.com/datalad-datasets/longnow-podcasts.git \\
             myfavpodcasts"""),
        dict(text="Install a dataset as a subdataset into the current dataset",
             code_py="""\
             clone(dataset='.',
                   source='https://github.com/datalad-datasets/longnow-podcasts.git')""",
             code_cmd="datalad clone -d . "
             "https://github.com/datalad-datasets/longnow-podcasts.git"),
        dict(text="Install the main superdataset from datasets.datalad.org",
             code_py="clone(source='///')",
             code_cmd="datalad clone ///"),
        dict(text="Install a dataset identified by a literal alias from store.datalad.org",
             code_py="clone(source='ria+http://store.datalad.org#~hcp-openaccess')",
             code_cmd="datalad clone ria+http://store.datalad.org#~hcp-openaccess"),
        dict(
            text="Install a dataset in a specific version as identified by a "
                 "branch or tag name from store.datalad.org",
            code_py="clone(source='ria+http://store.datalad.org#76b6ca66-36b1-11ea-a2e6-f0d5bf7b5561@myidentifier')",
            code_cmd="datalad clone ria+http://store.datalad.org#76b6ca66-36b1-11ea-a2e6-f0d5bf7b5561@myidentifier"),
        dict(
            text="Install a dataset with group-write access permissions",
            code_py=\
            "clone(source='http://example.com/dataset', reckless='shared-group')",
            code_cmd=\
            "datalad clone http://example.com/dataset --reckless shared-group"),
    ]

    _params_ = dict(
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc="""(parent) dataset to clone into. If given, the newly cloned
            dataset is registered as a subdataset of the parent. Also, if given,
            relative paths are interpreted as being relative to the parent
            dataset, and not relative to the working directory.""",
            constraints=EnsureDataset() | EnsureNone()),
        source=Parameter(
            args=("source",),
            metavar='SOURCE',
            doc="""URL, DataLad resource identifier, local path or instance of
            dataset to be cloned""",
            constraints=EnsureStr() | EnsureNone()),
        path=Parameter(
            args=("path",),
            metavar='PATH',
            nargs="?",
            doc="""path to clone into.  If no `path` is provided a
            destination path will be derived from a source URL
            similar to :command:`git clone`"""),
        description=location_description,
        reckless=reckless_opt,
    )

    @staticmethod
    @datasetmethod(name='clone')
    @eval_results
    def __call__(
            source,
            path=None,
            dataset=None,
            description=None,
            reckless=None):
        # did we explicitly get a dataset to install into?
        # if we got a dataset, path will be resolved against it.
        # Otherwise path will be resolved first.
        ds = require_dataset(
            dataset, check_installed=True, purpose='cloning') \
            if dataset is not None else dataset
        refds_path = ds.path if ds else None

        # legacy compatibility
        if reckless is True:
            # so that we can forget about how things used to be
            reckless = 'auto'

        if isinstance(source, Dataset):
            source = source.path

        if source == path:
            # even if they turn out to be identical after resolving symlinks
            # and more sophisticated witchcraft, it would still happily say
            # "it appears to be already installed", so we just catch an
            # obviously pointless input combination
            raise ValueError(
                "clone `source` and destination `path` are identical [{}]. "
                "If you are trying to add a subdataset simply use `save`".format(
                    path))

        if path is not None:
            path = resolve_path(path, dataset)

        # derive target from source:
        if path is None:
            # we got nothing but a source. do something similar to git clone
            # and derive the path from the source and continue
            # since this is a relative `path`, resolve it:
            # we are not going to reuse the decoded URL, as this is done for
            # all source candidates in clone_dataset(), we just use to determine
            # a destination path here in order to perform a bunch of additional
            # checks that shall not pollute the helper function
            source_ = decode_source_spec(
                source, cfg=None if ds is None else ds.config)
            path = resolve_path(source_['default_destpath'], dataset)
            lgr.debug("Determined clone target path from source")
        lgr.debug("Resolved clone target path to: '%s'", path)

        # there is no other way -- my intoxicated brain tells me
        assert(path is not None)

        result_props = dict(
            action='install',
            logger=lgr,
            refds=refds_path,
            source_url=source)

        try:
            # this will implicitly cause pathlib to run a bunch of checks
            # whether the present path makes any sense on the platform
            # we are running on -- we don't care if the path actually
            # exists at this point, but we want to abort early if the path
            # spec is determined to be useless
            path.exists()
        except OSError as e:
            yield get_status_dict(
                status='error',
                path=path,
                message=('cannot handle target path: %s', exc_str(e)),
                **result_props)
            return

        destination_dataset = Dataset(path)
        result_props['ds'] = destination_dataset

        if ds is not None and ds.pathobj not in path.parents:
            yield get_status_dict(
                status='error',
                message=("clone target path '%s' not in specified target dataset '%s'",
                         path, ds),
                **result_props)
            return

        # perform the actual cloning operation
        yield from clone_dataset(
            [source],
            destination_dataset,
            reckless,
            description,
            result_props,
            cfg=None if ds is None else ds.config,
        )

        # TODO handle any 'version' property handling and verification using a dedicated
        # public helper

        if ds is not None:
            # we created a dataset in another dataset
            # -> make submodule
            for r in ds.save(
                    path,
                    return_type='generator',
                    result_filter=None,
                    result_xfm=None,
                    on_failure='ignore'):
                yield r
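
# --- Added illustration (not part of the original example) ---
# A simplified, self-contained sketch (not DataLad's actual
# decode_source_spec()) of deriving a default clone destination from a
# source URL when no `path` is given, similar to `git clone`.
from pathlib import PurePosixPath
from urllib.parse import urlparse

def default_destpath(source):
    name = PurePosixPath(urlparse(source).path).name
    # strip a trailing '.git', as git does
    return name[:-4] if name.endswith('.git') else name

print(default_destpath(
    'https://github.com/datalad-datasets/longnow-podcasts.git'))
# -> longnow-podcasts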
Exemplo n.º 28
0
class WebApp(Interface):
    """
    """
    _params_ = dict(
        app=Parameter(args=('--app', ),
                      doc="yeah!",
                      nargs='+',
                      action='append'),
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc="""specify the dataset to serve as the anchor of the webapp.
            An attempt is made to identify the dataset based on the current
            working directory. If a dataset is given, the command will be
            executed in the root directory of this dataset.""",
            constraints=EnsureDataset() | EnsureNone()),
        daemonize=Parameter(args=("--daemonize", ),
                            action='store_true',
                            doc="yeah!"),
    )

    @staticmethod
    @datasetmethod(name='webapp')
    @eval_results
    def __call__(app, dataset=None, daemonize=False):
        apps = assure_list(app)
        if not apps:
            raise ValueError('no app specification given')
        if not isinstance(apps[0], (list, tuple)):
            apps = [apps]
        apps = {
            a[0] if isinstance(a, (list, tuple)) else a:
            a[1] if isinstance(a, (list, tuple)) and len(a) > 1 else None
            for a in apps
        }

        import cherrypy

        # global config
        cherrypy.config.update({
            # prevent visible tracebacks, etc:
            # http://docs.cherrypy.org/en/latest/config.html#id14
            #'environment': 'production',
            #'log.error_file': 'site.log',
        })

        # set the priority according to your needs if you are hooking something
        # else on the 'before_finalize' hook point.
        @cherrypy.tools.register('before_finalize', priority=60)
        def secureheaders():
            headers = cherrypy.response.headers
            headers['X-Frame-Options'] = 'DENY'
            headers['X-XSS-Protection'] = '1; mode=block'
            headers['Content-Security-Policy'] = "default-src 'self'"
            # only add Strict-Transport headers if we're actually using SSL; see the ietf spec
            # "An HSTS Host MUST NOT include the STS header field in HTTP responses
            # conveyed over non-secure transport"
            # http://tools.ietf.org/html/draft-ietf-websec-strict-transport-sec-14#section-7.2
            if (cherrypy.server.ssl_certificate is not None
                    and cherrypy.server.ssl_private_key is not None):
                headers[
                    'Strict-Transport-Security'] = 'max-age=31536000'  # one year

        if daemonize:
            from cherrypy.process.plugins import Daemonizer
            Daemonizer(cherrypy.engine).subscribe()
            #PIDFile(cherrypy.engine, '/var/run/myapp.pid').subscribe()

        # when running on a privileged port
        #DropPrivileges(cherrypy.engine, uid=1000, gid=1000).subscribe()

        enabled_apps = []
        for ep in iter_entry_points('datalad.webapps'):
            if ep.name not in apps:
                continue
            mount = apps[ep.name] if apps[ep.name] else '/'
            # get the webapp class
            cls = ep.load()
            # fire up the webapp instance
            inst = cls(**dict(dataset=dataset))
            # mount under global URL tree (default or given suburl)
            app = cherrypy.tree.mount(
                root=inst,
                script_name=mount,
                # app config file, it is ok for that file to not exist
                config=cls._webapp_config)
            # forcefully impose more secure mode
            # TODO might need one (or more) switch(es) to turn things off for
            # particular scenarios
            enabled_apps.append(ep.name)
            app.merge({
                '/': {
                    # turns all security headers on
                    'tools.secureheaders.on': True,
                    'tools.sessions.secure': True,
                    'tools.sessions.httponly': True
                }
            })
            static_dir = opj(cls._webapp_dir, cls._webapp_staticdir)
            if isdir(static_dir):
                app.merge({
                    # the key has to be / even when an app is mounted
                    # somewhere below
                    '/': {
                        'tools.staticdir.on': True,
                        'tools.staticdir.root': cls._webapp_dir,
                        'tools.staticdir.dir': cls._webapp_staticdir
                    }
                })
        failed_apps = set(apps).difference(enabled_apps)
        if failed_apps:
            lgr.warning('Failed to load webapps: %s', failed_apps)
        if not enabled_apps:
            return
        cherrypy.engine.start()
        cherrypy.engine.block()
        yield {}
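
# --- Added illustration (not part of the original example) ---
# Self-contained sketch of the app-specification normalization above: each
# entry is either a bare name or a (name, mountpoint) pair and is mapped to
# {name: mountpoint_or_None}. App names are hypothetical.
def normalize_apps(apps):
    if apps and not isinstance(apps[0], (list, tuple)):
        # a single flat specification was given
        apps = [apps]
    return {
        a[0] if isinstance(a, (list, tuple)) else a:
        a[1] if isinstance(a, (list, tuple)) and len(a) > 1 else None
        for a in apps
    }

print(normalize_apps([['demoapp', '/demo'], ['otherapp']]))
# -> {'demoapp': '/demo', 'otherapp': None}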
Exemplo n.º 29
0
class Run(Interface):
    """Run an arbitrary shell command and record its impact on a dataset.

    It is recommended to craft the command such that it can run in the root
    directory of the dataset that the command will be recorded in. However,
    as long as the command is executed somewhere underneath the dataset root,
    the exact location will be recorded relative to the dataset root.

    If the executed command did not alter the dataset in any way, no record of
    the command execution is made.

    If the given command errors, a `CommandError` exception with the same exit
    code will be raised, and no modifications will be saved.

    *Command format*

    || REFLOW >>
    A few placeholders are supported in the command via Python format
    specification. "{pwd}" will be replaced with the full path of the current
    working directory. "{dspath}" will be replaced with the full path of the
    dataset that run is invoked on. "{inputs}" and "{outputs}" represent the
    values specified by [CMD: --input and --output CMD][PY: `inputs` and
    `outputs` PY]. If multiple values are specified, the values will be joined
    by a space. The order of the values will match that order from the command
    line, with any globs expanded in alphabetical order (like bash). Individual
    values can be accessed with an integer index (e.g., "{inputs[0]}").
    << REFLOW ||

    To escape a brace character, double it (i.e., "{{" or "}}").
    """
    _params_ = dict(
        cmd=Parameter(
            args=("cmd",),
            nargs=REMAINDER,
            metavar='COMMAND',
            doc="command for execution"),
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc="""specify the dataset to record the command results in.
            An attempt is made to identify the dataset based on the current
            working directory. If a dataset is given, the command will be
            executed in the root directory of this dataset.""",
            constraints=EnsureDataset() | EnsureNone()),
        inputs=Parameter(
            args=("--input",),
            dest="inputs",
            metavar=("PATH"),
            action='append',
            doc="""A dependency for the run. Before running the command, the
            content of this file will be retrieved. A value of "." means "run
            :command:`datalad get .`". The value can also be a glob. [CMD: This
            option can be given more than once. CMD]"""),
        outputs=Parameter(
            args=("--output",),
            dest="outputs",
            metavar=("PATH"),
            action='append',
            doc="""Prepare this file to be an output file of the command. A
            value of "." means "run :command:`datalad unlock .`" (and will fail
            if some content isn't present). For any other value, if the content
            of this file is present, unlock the file. Otherwise, remove it. The
            value can also be a glob. [CMD: This option can be given more than
            once. CMD]"""),
        expand=Parameter(
            args=("--expand",),
            metavar=("WHICH"),
            doc="""Expand globs when storing inputs and/or outputs in the
            commit message.""",
            constraints=EnsureNone() | EnsureChoice("inputs", "outputs", "both")),
        message=save_message_opt,
        sidecar=Parameter(
            args=('--sidecar',),
            metavar="yes|no",
            doc="""By default, the configuration variable
            'datalad.run.record-sidecar' determines whether a record with
            information on a command's execution is placed into a separate
            record file instead of the commit message (default: off). This
            option can be used to override the configured behavior on a
            case-by-case basis. Sidecar files are placed into the dataset's
            '.datalad/runinfo' directory (customizable via the
            'datalad.run.record-directory' configuration variable).""",
            constraints=EnsureNone() | EnsureBool()),
        rerun=Parameter(
            args=('--rerun',),
            action='store_true',
            doc="""re-run the command recorded in the last saved change (if any).
            Note: This option is deprecated since version 0.9.2 and
            will be removed in a later release. Use `datalad rerun`
            instead."""),
    )

    @staticmethod
    @datasetmethod(name='run')
    @eval_results
    def __call__(
            cmd=None,
            dataset=None,
            inputs=None,
            outputs=None,
            expand=None,
            message=None,
            sidecar=None,
            rerun=False):
        if rerun:
            if cmd:
                lgr.warning("Ignoring provided command in --rerun mode")
            lgr.warning("The --rerun option is deprecated since version 0.9.2. "
                        "Use `datalad rerun` instead.")
            from datalad.interface.rerun import Rerun
            for r in Rerun.__call__(dataset=dataset, message=message):
                yield r
        else:
            if cmd:
                for r in run_command(cmd, dataset=dataset,
                                     inputs=inputs, outputs=outputs,
                                     expand=expand,
                                     message=message,
                                     sidecar=sidecar):
                    yield r
            else:
                lgr.warning("No command given")
Exemplo n.º 30
0
class Run(Interface):
    """Run an arbitrary shell command and record its impact on a dataset.

    It is recommended to craft the command such that it can run in the root
    directory of the dataset that the command will be recorded in. However,
    as long as the command is executed somewhere underneath the dataset root,
    the exact location will be recorded relative to the dataset root.

    If the executed command did not alter the dataset in any way, no record of
    the command execution is made.

    If the given command errors, a `CommandError` exception with the same exit
    code will be raised, and no modifications will be saved.

    *Command format*

    || REFLOW >>
    A few placeholders are supported in the command via Python format
    specification. "{pwd}" will be replaced with the full path of the current
    working directory. "{dspath}" will be replaced with the full path of the
    dataset that run is invoked on. "{tmpdir}" will be replaced with the full
    path of a temporary directory. "{inputs}" and "{outputs}" represent the
    values specified by [CMD: --input and --output CMD][PY: `inputs` and
    `outputs` PY]. If multiple values are specified, the values will be joined
    by a space. The order of the values will match the order in which they were
    given on the command line, with any globs expanded in alphabetical order
    (like bash). Individual
    values can be accessed with an integer index (e.g., "{inputs[0]}").
    << REFLOW ||

    || REFLOW >>
    Note that the representation of the inputs or outputs in the formatted
    command string depends on whether the command is given as a list of
    arguments or as a string[CMD:  (quotes surrounding the command) CMD]. The
    concatenated list of inputs or outputs will be surrounded by quotes when
    the command is given as a list but not when it is given as a string. This
    means that the string form is required if you need to pass each input as a
    separate argument to a preceding script (i.e., write the command as
    "./script {inputs}", quotes included). The string form should also be used
    if the input or output paths contain spaces or other characters that need
    to be escaped.
    << REFLOW ||

    To escape a brace character, double it (i.e., "{{" or "}}").

    Custom placeholders can be added as configuration variables under
    "datalad.run.substitutions".  As an example:

      Add a placeholder "name" with the value "joe"::

        % git config --file=.datalad/config datalad.run.substitutions.name joe
        % datalad add -m "Configure name placeholder" .datalad/config

      Access the new placeholder in a command::

        % datalad run "echo my name is {name} >me"
    """
    _examples_ = [
        dict(
            text="Run an executable script and record the impact on a dataset",
            code_py="run(message='run my script', cmd='code/script.sh')",
            code_cmd="datalad run -m 'run my script' 'code/script.sh'"),
        dict(text="Run a command and specify a directory as a dependency "
             "for the run. The contents of the dependency will be retrieved "
             "prior to running the script",
             code_cmd="datalad run -m 'run my script' -i 'data/*' "
             "'code/script.sh'",
             code_py="""\
             run(cmd='code/script.sh', message='run my script',
                 inputs=['data/*'])"""),
        dict(text="Run an executable script and specify output files of the "
             "script to be unlocked prior to running the script",
             code_py="""\
             run(cmd='code/script.sh', message='run my script',
                 inputs=['data/*'], outputs=['output_dir'])""",
             code_cmd="""\
             datalad run -m 'run my script' -i 'data/*' \\
             -o 'output_dir/*' 'code/script.sh'"""),
        dict(text="Specify multiple inputs and outputs",
             code_py="""\
             run(cmd='code/script.sh',
                 message='run my script',
                 inputs=['data/*', 'datafile.txt'],
                 outputs=['output_dir', 'outfile.txt'])""",
             code_cmd="""\
             datalad run -m 'run my script' -i 'data/*' \\
             -i 'datafile.txt' -o 'output_dir/*' -o \\
             'outfile.txt' 'code/script.sh'""")
    ]

    _params_ = dict(
        cmd=Parameter(
            args=("cmd", ),
            nargs=REMAINDER,
            metavar='COMMAND',
            doc="""command for execution. A leading '--' can be used to
            disambiguate this command from the preceding options to
            DataLad."""),
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc="""specify the dataset to record the command results in.
            An attempt is made to identify the dataset based on the current
            working directory. If a dataset is given, the command will be
            executed in the root directory of this dataset.""",
            constraints=EnsureDataset() | EnsureNone()),
        inputs=Parameter(
            args=("-i", "--input"),
            dest="inputs",
            metavar=("PATH"),
            action='append',
            doc="""A dependency for the run. Before running the command, the
            content of this file will be retrieved. A value of "." means "run
            :command:`datalad get .`". The value can also be a glob. [CMD: This
            option can be given more than once. CMD]"""),
        outputs=Parameter(
            args=("-o", "--output"),
            dest="outputs",
            metavar=("PATH"),
            action='append',
            doc="""Prepare this file to be an output file of the command. A
            value of "." means "run :command:`datalad unlock .`" (and will fail
            if some content isn't present). For any other value, if the content
            of this file is present, unlock the file. Otherwise, remove it. The
            value can also be a glob. [CMD: This option can be given more than
            once. CMD]"""),
        expand=Parameter(
            args=("--expand", ),
            doc="""Expand globs when storing inputs and/or outputs in the
            commit message.""",
            constraints=EnsureChoice(None, "inputs", "outputs", "both")),
        explicit=Parameter(
            args=("--explicit", ),
            action="store_true",
            doc="""Consider the specification of inputs and outputs to be
            explicit. Don't warn if the repository is dirty, and only save
            modifications to the listed outputs."""),
        message=save_message_opt,
        sidecar=Parameter(args=('--sidecar', ),
                          metavar="{yes|no}",
                          doc="""By default, the configuration variable
            'datalad.run.record-sidecar' determines whether a record with
            information on a command's execution is placed into a separate
            record file instead of the commit message (default: off). This
            option can be used to override the configured behavior on a
            case-by-case basis. Sidecar files are placed into the dataset's
            '.datalad/runinfo' directory (customizable via the
            'datalad.run.record-directory' configuration variable).""",
                          constraints=EnsureNone() | EnsureBool()),
    )

    @staticmethod
    @datasetmethod(name='run')
    @eval_results
    def __call__(cmd=None,
                 dataset=None,
                 inputs=None,
                 outputs=None,
                 expand=None,
                 explicit=False,
                 message=None,
                 sidecar=None):
        for r in run_command(cmd,
                             dataset=dataset,
                             inputs=inputs,
                             outputs=outputs,
                             expand=expand,
                             explicit=explicit,
                             message=message,
                             sidecar=sidecar):
            yield r
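A hedged sketch of how the newer interface above might be driven from Python
(not part of the original source). It assumes an installed dataset at the
hypothetical path 'mydataset' and that a "{name}" placeholder has already been
configured under "datalad.run.substitutions" as shown in the docstring; the
script and paths are illustrative only.

from datalad.api import Dataset

ds = Dataset('mydataset')  # hypothetical dataset location

# With explicit=True no warning is issued for a dirty repository and only
# modifications to the listed outputs are saved; sidecar=True places the run
# record under .datalad/runinfo instead of the commit message.
for res in ds.run(
        cmd='./code/script.sh {inputs} output/{name}.csv',  # hypothetical script
        inputs=['data/*'],
        outputs=['output/*'],
        explicit=True,
        sidecar=True,
        message='run my script'):
    print(res.get('status'), res.get('path'))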