Example #1
def test_logging_to_a_file(dst):
    ok_(not exists(dst))

    lgr = LoggerHelper("dataladtest-1").get_initialized_logger(logtarget=dst)
    ok_(exists(dst))  # file is created upon initialization, even before anything is logged

    msg = "Oh my god, they killed Kenny"
    lgr.error(msg)
    with open(dst) as f:
        lines = f.readlines()
    assert_equal(len(lines), 1, "Read more than a single log line: %s" %  lines)
    line = lines[0]
    ok_(msg in line)
    ok_('\033[' not in line,
        msg="There should be no color formatting in log files. Got: %s" % line)
    # verify that time stamp and level are present in the log line
    # do not want to rely on not having race conditions around date/time changes
    # so matching just with regexp
    # (...)? is added to swallow possible traceback logs
    regex = "\[ERROR\]"
    if EnsureBool()(dl_cfg.get('datalad.log.timestamp', False)):
        regex = "\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2},\d{3} " + regex
    if EnsureBool()(dl_cfg.get('datalad.log.vmem', False)):
        regex += ' RSS/VMS: \S+/\S+( \S+)?\s*'
    regex += "(\s+\S+\s*)? " + msg
    assert_re_in(regex, line, match=True)
    # Close all handlers so windows is happy -- apparently not closed fast enough
    for handler in lgr.handlers:
        handler.close()
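The test above gates parts of its assertions on config flags coerced through EnsureBool(). A minimal sketch of the coercion behaviour assumed there (the exact set of accepted string spellings is defined in datalad.support.constraints and may differ slightly):

from datalad.support.constraints import EnsureBool

ensure_bool = EnsureBool()
assert ensure_bool(True) is True      # booleans pass through unchanged
assert ensure_bool('yes') is True     # common string spellings are coerced
assert ensure_bool('0') is False
try:
    ensure_bool('maybe')              # unrecognized spellings are rejected
except ValueError:
    pass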
Example #2
 def __init__(self, *args, **kwargs):
     super(FileResource, self).__init__(*args, **kwargs)
     # setup parser
     bool_type = EnsureBool()
     json_type = EnsureChoice('yes', 'no', 'stream')
     self.rp = reqparse.RequestParser()
     self.rp.add_argument(
         'path', type=str,
         help="""path to file. If none is given, or the path contains a
         wildcard character '*', a list of (matching) files in the
         dataset is returned.""",
         location=['args', 'json', 'form'])
     self.rp.add_argument(
         'json', type=json_type,
         default='no',
         help='%s. {error_msg}' % repr(json_type),
         location=['args', 'json', 'form'])
     self.rp.add_argument(
         'verify_availability', type=bool_type,
         default='yes',
         help='%s. {error_msg}' % repr(bool_type),
         location=['args', 'json', 'form'])
     self.rp.add_argument(
         'content',
         help='file content',
         location=['form', 'json'])
     self.rp.add_argument(
         'togit', type=bool_type,
         help="""flag whether to add files to git, instead of making a
         decision based on the dataset configuration. %s. {error_msg}"""
         % repr(bool_type),
         location=['json', 'form'])
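Flask-RESTful's reqparse accepts any callable as `type=`: it is handed the raw value and must return the coerced value or raise, which is what lets the DataLad constraints above double as request-argument parsers. A hedged, self-contained sketch of that contract with a toy stand-in for EnsureBool (names are illustrative):

from flask_restful import reqparse

def yes_no(value):
    # toy stand-in for EnsureBool: coerce the raw value or raise
    if value in ('yes', 'no'):
        return value == 'yes'
    raise ValueError("expected 'yes' or 'no', got %r" % value)

rp = reqparse.RequestParser()
rp.add_argument('verify_availability', type=yes_no, default='yes',
                location=['args', 'json', 'form'])
# inside a request handler, rp.parse_args() would return the coerced values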
Example #3
def test_logging_to_a_file(dst):
    ok_(not exists(dst))

    lgr = LoggerHelper("dataladtest-1").get_initialized_logger(logtarget=dst)
    ok_(exists(dst))  # file is created upon initialization, even before anything is logged

    msg = "Oh my god, they killed Kenny"
    lgr.error(msg)
    with open(dst) as f:
        lines = f.readlines()
    assert_equal(len(lines), 1, "Read more than a single log line: %s" % lines)
    line = lines[0]
    ok_(msg in line)
    ok_('\033[' not in line,
        msg="There should be no color formatting in log files. Got: %s" % line)
    # verify that time stamp and level are present in the log line
    # do not want to rely on not having race conditions around date/time changes
    # so matching just with regexp
    # (\s+\S+\s*)? is added to swallow possible traceback logs
    if EnsureBool()(cfg.get('datalad.log.timestamp', False)):
        ok_(
            re.match(
                "\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2},\d{3} \[ERROR\](\s+\S+\s*)? %s"
                % msg, line))
    else:
        ok_(re.match("\[ERROR\](\s+\S+\s*)? %s" % msg, line))
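For reference, the level/timestamp pattern used by both log tests can be exercised standalone; the sample line below is made up, but follows the asctime/levelname/message layout the tests expect:

import re

msg = "Oh my god, they killed Kenny"
sample = "2021-06-01 12:34:56,789 [ERROR] " + msg
regex = (r"\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2},\d{3} "
         r"\[ERROR\](\s+\S+\s*)? " + re.escape(msg))
assert re.match(regex, sample)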
Example #4
def _get_commit_info(ds, refcommit, status):
    """Get info about all commits, up to (and incl. the refcommit)"""
    #- get all the commit info with git log --pretty='%aN%x00%aI%x00%H'
    #  - use all first-level paths other than .datalad and .git for the query
    #- from this we can determine all modification timestamps, described refcommit
    #- do a subsequent git log query for the determined refcommit to determine
    #  a version by counting all commits since inception up to the refcommit
    #  - we cannot use the first query, because it will be constrained by the
    #    present paths that may not have existed previously at all

    # grab the history until the refcommit
    commits = [
        line.split('\0') for line in ds.repo.call_git_items_(
            # name, email, timestamp, shasum
            ['log', '--pretty=format:%aN%x00%aE%x00%aI%x00%H', refcommit])
    ]
    # version, always anchored on the first commit (tags could move and
    # make the integer commit count ambiguous, and substantially complicate
    # version comparisons)
    version = '0-{}-g{}'.format(
        len(commits),
        # abbreviated shasum (like git-describe)
        ds.repo.get_hexsha(commits[0][3], short=True),
    )
    meta = {
        'version': version,
    }
    if ds.config.obtain('datalad.metadata.datalad-core.report-contributors',
                        True,
                        valtype=EnsureBool()):
        meta.update(contributors=sorted(set(tuple(c[:2]) for c in commits)))
    if ds.config.obtain(
            'datalad.metadata.datalad-core.report-modification-dates',
            True,
            valtype=EnsureBool()):
        meta.update(
            dateCreated=commits[-1][2],
            dateModified=commits[0][2],
        )
    return meta
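The same query can be reproduced outside of the DataLad repo abstraction; a hedged sketch with plain subprocess (run inside any git repository, with HEAD standing in for the refcommit):

import subprocess

# author name, email, ISO date and hash, NUL-separated for unambiguous splitting
fmt = '--pretty=format:%aN%x00%aE%x00%aI%x00%H'
out = subprocess.run(['git', 'log', fmt, 'HEAD'],
                     capture_output=True, text=True, check=True).stdout
commits = [line.split('\0') for line in out.splitlines()]
# same spirit as the version string above: commit count plus an (here manually)
# abbreviated refcommit hash, like git-describe
version = '0-{}-g{}'.format(len(commits), commits[0][3][:7])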
Example #5
 def get_state(self, dataset):
     ds = dataset
     return {
         # increment when output format changes
         'version':
         1,
         'unique_exclude':
         list(self._unique_exclude),
         'remotes':
         ds.config.obtain('datalad.metadata.datalad-core.report-remotes',
                          True,
                          valtype=EnsureBool()),
         'contributors':
         ds.config.obtain(
             'datalad.metadata.datalad-core.report-contributors',
             True,
             valtype=EnsureBool()),
         'modification-dates':
         ds.config.obtain(
             'datalad.metadata.datalad-core.report-modification-dates',
             True,
             valtype=EnsureBool()),
     }
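All of these switches go through ConfigManager.obtain, which (as used here) reads a config variable, falls back to the supplied default, and passes the result through the `valtype` constraint. A hedged sketch of one such lookup in isolation:

from datalad.distribution.dataset import Dataset
from datalad.support.constraints import EnsureBool

ds = Dataset('.')  # assumes the current directory is a DataLad dataset
report_remotes = ds.config.obtain(
    'datalad.metadata.datalad-core.report-remotes',
    True,                    # default used when the variable is not set
    valtype=EnsureBool())    # coerces 'yes'/'no'/'0'/'1'/... to a bool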
Example #6
class CreateSibling(Interface):
    """Create dataset(s)'s sibling (e.g., on a web server).

    Those (empty) datasets can then serve as a target for the `publish` command.
    """

    _params_ = dict(
        # TODO: Figure out, whether (and when) to use `sshurl` as push url
        dataset=Parameter(
            args=(
                "--dataset",
                "-d",
            ),
            doc="""specify the dataset to create the publication target for. If
                no dataset is given, an attempt is made to identify the dataset
                based on the current working directory""",
            constraints=EnsureDataset() | EnsureNone()),
        sshurl=Parameter(
            args=("sshurl", ),
            metavar='SSHURL',
            doc="""Login information for the target server. This can be given
                as a URL (ssh://host/path) or SSH-style (user@host:path).
                Unless overridden, this also serves as the future dataset's access
                URL and path on the server.""",
            constraints=EnsureStr()),
        target=Parameter(
            args=('target', ),
            metavar='TARGETNAME',
            doc="""sibling name to create for this publication target.
                If `recursive` is set, the same name will be used to label all
                the subdatasets' siblings.  Note, this is just a
                convenience option, siblings can also be added at a later point
                in time.  When creation of target datasets fails, no siblings are
                added""",
            constraints=EnsureStr() | EnsureNone(),
            nargs="?"),
        target_dir=Parameter(
            args=('--target-dir', ),
            metavar='PATH',
            doc="""path to the directory *on the server* where the dataset
                shall be created. By default the SSH access URL is used to
                identify this directory. If a relative path is provided here,
                it is interpreted as being relative to the user's home
                directory on the server.\n
                Additional features are relevant for recursive processing of
                datasets with subdatasets. By default, the local
                dataset structure is replicated on the server. However, it is
                possible to provide a template for generating different target
                directory names for all (sub)datasets. Templates can contain
                certain placeholders that are substituted for each (sub)dataset.
                For example: "/mydirectory/dataset-%%NAME".\nSupported
                placeholders:\n
                %%NAME - the name of the dataset, with any slashes replaced by
                dashes\n""",
            constraints=EnsureStr() | EnsureNone()),
        target_url=Parameter(
            args=('--target-url', ),
            metavar='URL',
            doc=""""public" access URL of the to-be-created target dataset(s)
                (default: `sshurl`). Accessibility of this URL determines the
                access permissions of potential consumers of the dataset.
                As with `target_dir`, templates (same set of placeholders)
                are supported.  Also, if specified, it is provided as the annex
                description\n""",
            constraints=EnsureStr() | EnsureNone()),
        target_pushurl=Parameter(
            args=('--target-pushurl', ),
            metavar='URL',
            doc="""In case the `target_url` cannot be used to publish to the
                dataset, this option specifies an alternative URL for this
                purpose. As with `target_url`, templates (same set of
                placeholders) are supported.\n""",
            constraints=EnsureStr() | EnsureNone()),
        recursive=recursion_flag,
        existing=Parameter(
            args=("--existing", ),
            constraints=EnsureChoice('skip', 'replace', 'error',
                                     'reconfigure'),
            metavar='MODE',
            doc="""action to perform, if target directory exists already.
                Dataset is skipped if 'skip'. 'replace' forces to (re-)init
                the dataset, and to (re-)configure the dataset sibling,
                i.e. its URL(s), in case it already exists. 'reconfigure'
                updates metadata of the dataset sibling. 'error' causes
                an exception to be raised.""",
        ),
        shared=Parameter(
            args=("--shared", ),
            metavar='false|true|umask|group|all|world|everybody|0xxx',
            doc="""if given, configures the access permissions on the server
            for multiple users (this could include access by a webserver!).
            Possible values for this option are identical to those of
            `git init --shared` and are described in its documentation.""",
            constraints=EnsureStr() | EnsureBool()),
        ui=Parameter(args=("--ui", ),
                     metavar='false|true|html_filename',
                     doc="""publish a web interface for the dataset with an
            optional user-specified name for the html at publication
            target. defaults to `index.html` at dataset root""",
                     constraints=EnsureBool() | EnsureStr()),
        as_common_datasrc=as_common_datasrc,
        publish_depends=publish_depends,
        publish_by_default=publish_by_default,
    )

    @staticmethod
    @datasetmethod(name='create_sibling')
    def __call__(sshurl,
                 target=None,
                 target_dir=None,
                 target_url=None,
                 target_pushurl=None,
                 dataset=None,
                 recursive=False,
                 existing='error',
                 shared=False,
                 ui=False,
                 as_common_datasrc=None,
                 publish_by_default=None,
                 publish_depends=None):

        if sshurl is None:
            raise ValueError("""insufficient information for target creation
            (needs at least a dataset and a SSH URL).""")

        if target is None and (target_url is not None
                               or target_pushurl is not None):
            raise ValueError("""insufficient information for adding the target
            as a sibling (needs at least a name)""")

        # shortcut
        ds = require_dataset(dataset,
                             check_installed=True,
                             purpose='creating a sibling')

        assert (ds is not None and sshurl is not None and ds.repo is not None)

        # determine target parameters:
        sshri = RI(sshurl)

        if not isinstance(sshri, SSHRI) \
                and not (isinstance(sshri, URL) and sshri.scheme == 'ssh'):
            raise ValueError(
                "Unsupported SSH URL: '{0}', use ssh://host/path or host:path syntax"
                .format(sshurl))

        if target_dir is None:
            if sshri.path:
                target_dir = sshri.path
            else:
                target_dir = '.'

        # TODO: centralize and generalize template symbol handling
        replicate_local_structure = False
        if "%NAME" not in target_dir:
            replicate_local_structure = True

        # collect datasets to use:
        datasets = dict()
        datasets[basename(ds.path)] = ds
        if recursive:
            for subds in ds.get_subdatasets(recursive=True):
                sub_path = opj(ds.path, subds)
                # TODO: when enhancing Dataset/*Repo classes and therefore
                # adapt to moved code, make proper distinction between name and
                # path of a submodule, which are technically different. This
                # probably will become important on windows as well as whenever
                # we want to allow for moved worktrees.
                datasets[basename(ds.path) + '/' + subds] = \
                    Dataset(sub_path)

        # request ssh connection:
        not_supported_on_windows("TODO")
        lgr.info("Connecting ...")
        ssh = ssh_manager.get_connection(sshurl)
        ssh.open()

        # flag to check if at dataset_root
        at_root = True

        # loop over all datasets, ordered from top to bottom to make test
        # below valid (existing directories would cause the machinery to halt)
        # But we need to run post-update hook in depth-first fashion, so
        # would only collect first and then run (see gh #790)
        remote_repos_to_run_hook_for = []
        for current_dspath in \
                sorted(datasets.keys(), key=lambda x: x.count('/')):
            current_ds = datasets[current_dspath]
            if not current_ds.is_installed():
                lgr.info("Skipping %s since not installed locally",
                         current_dspath)
                continue
            if not replicate_local_structure:
                path = target_dir.replace("%NAME",
                                          current_dspath.replace("/", "-"))
            else:
                # TODO: opj depends on local platform, not the remote one.
                # check how to deal with it. Does windows ssh server accept
                # posix paths? vice versa? Should planned SSH class provide
                # tools for this issue?
                path = normpath(
                    opj(target_dir,
                        relpath(datasets[current_dspath].path, start=ds.path)))

            lgr.info("Creating target dataset {0} at {1}".format(
                current_dspath, path))
            # Must be set to True only if exists and existing='reconfigure'
            # otherwise we might skip actions if we say existing='reconfigure'
            # but it did not even exist before
            only_reconfigure = False
            if path != '.':
                # check if target exists
                # TODO: Is this condition valid for != '.' only?
                path_exists = True
                try:
                    out, err = ssh(["ls", path])
                except CommandError as e:
                    if "No such file or directory" in e.stderr and \
                            path in e.stderr:
                        path_exists = False
                    else:
                        raise  # It's an unexpected failure here

                if path_exists:
                    if existing == 'error':
                        raise RuntimeError(
                            "Target directory %s already exists." % path)
                    elif existing == 'skip':
                        continue
                    elif existing == 'replace':
                        ssh([
                            "chmod", "+r+w", "-R", path
                        ])  # enable write permissions to allow removing dir
                        ssh(["rm", "-rf", path])  # remove target at path
                        path_exists = False  # if we succeeded in removing it
                    elif existing == 'reconfigure':
                        only_reconfigure = True
                    else:
                        raise ValueError(
                            "Do not know how to handle existing=%s" %
                            repr(existing))

                if not path_exists:
                    try:
                        ssh(["mkdir", "-p", path])
                    except CommandError as e:
                        lgr.error(
                            "Remotely creating target directory failed at "
                            "%s.\nError: %s" % (path, exc_str(e)))
                        continue

            # don't (re-)initialize dataset if existing == reconfigure
            if not only_reconfigure:
                # init git and possibly annex repo
                if not CreateSibling.init_remote_repo(path,
                                                      ssh,
                                                      shared,
                                                      datasets[current_dspath],
                                                      description=target_url):
                    continue

            # check git version on remote end
            lgr.info("Adjusting remote git configuration")
            remote_git_version = CreateSibling.get_remote_git_version(ssh)
            if remote_git_version and remote_git_version >= "2.4":
                # allow for pushing to checked out branch
                try:
                    ssh(["git", "-C", path] + [
                        "config", "receive.denyCurrentBranch", "updateInstead"
                    ])
                except CommandError as e:
                    lgr.error(
                        "git config failed at remote location %s.\n"
                        "You will not be able to push to checked out "
                        "branch. Error: %s", path, exc_str(e))
            else:
                lgr.error(
                    "Git version >= 2.4 needed to configure remote."
                    " Version detected on server: %s\nSkipping configuration"
                    " of receive.denyCurrentBranch - you will not be able to"
                    " publish updates to this repository. Upgrade your git"
                    " and run with --existing=reconfigure" %
                    remote_git_version)

            # enable metadata refresh on dataset updates to publication server
            lgr.info("Enabling git post-update hook ...")
            try:
                CreateSibling.create_postupdate_hook(path, ssh,
                                                     datasets[current_dspath])
            except CommandError as e:
                lgr.error("Failed to add json creation command to post update "
                          "hook.\nError: %s" % exc_str(e))

            # publish web-interface to root dataset on publication server
            if at_root and ui:
                lgr.info("Uploading web interface to %s" % path)
                at_root = False
                try:
                    CreateSibling.upload_web_interface(path, ssh, shared, ui)
                except CommandError as e:
                    lgr.error("Failed to push web interface to the remote "
                              "datalad repository.\nError: %s" % exc_str(e))

            remote_repos_to_run_hook_for.append(path)

        # in reverse order would be depth first
        lgr.debug("Running post-update hooks in all created siblings")
        for path in remote_repos_to_run_hook_for[::-1]:
            # Trigger the hook
            try:
                ssh(
                    ["cd '" + _path_(path, ".git") + "' && hooks/post-update"],
                    wrap_args=False  # we wrapped here manually
                )
            except CommandError as e:
                lgr.error("Failed to run post-update hook under path %s. "
                          "Error: %s" % (path, exc_str(e)))

        if target:
            # add the sibling(s):
            lgr.debug("Adding the siblings")
            if target_url is None:
                target_url = sshurl
            if target_pushurl is None and sshurl != target_url:
                target_pushurl = sshurl
            AddSibling()(dataset=ds,
                         name=target,
                         url=target_url,
                         pushurl=target_pushurl,
                         recursive=recursive,
                         fetch=True,
                         force=existing in {'replace'},
                         as_common_datasrc=as_common_datasrc,
                         publish_by_default=publish_by_default,
                         publish_depends=publish_depends)

        # TODO: Return value!?
        #       => [(Dataset, fetch_url)]

    @staticmethod
    def init_remote_repo(path, ssh, shared, dataset, description=None):
        cmd = ["git", "-C", path, "init"]
        if shared:
            cmd.append("--shared=%s" % shared)
        try:
            ssh(cmd)
        except CommandError as e:
            lgr.error("Initialization of remote git repository failed at %s."
                      "\nError: %s\nSkipping ..." % (path, exc_str(e)))
            return False

        if isinstance(dataset.repo, AnnexRepo):
            # init remote git annex repo (part fix of #463)
            try:
                ssh(["git", "-C", path, "annex", "init"] +
                    ([description] if description else []))
            except CommandError as e:
                lgr.error(
                    "Initialization of remote git annex repository failed at %s."
                    "\nError: %s\nSkipping ..." % (path, exc_str(e)))
                return False
        return True

    @staticmethod
    def get_remote_git_version(ssh):
        try:
            # options to disable all auto so we don't trigger them while testing
            # for absent changes
            out, err = ssh(["git"] + ["version"])
            assert out.strip().startswith("git version")
            git_version = out.strip().split()[2]
            lgr.debug("Detected git version on server: %s" % git_version)
            return LooseVersion(git_version)

        except CommandError as e:
            lgr.warning("Failed to determine git version on remote.\n"
                        "Error: {0}\nTrying to configure anyway "
                        "...".format(exc_str(e)))
        return None

    @staticmethod
    def create_postupdate_hook(path, ssh, dataset):
        # location of post-update hook file, logs folder on remote target
        hooks_remote_dir = opj(path, '.git', 'hooks')
        hook_remote_target = opj(hooks_remote_dir, 'post-update')
        # post-update hook should create its log directory if it doesn't exist
        logs_remote_dir = opj(path, WEB_META_LOG)

        make_log_dir = 'mkdir -p "{}"'.format(logs_remote_dir)

        # create json command for current dataset
        json_command = r'''
        mkdir -p {};
        ( which datalad > /dev/null \
        && ( cd ..; GIT_DIR=$PWD/.git datalad ls -a --json file '{}'; ) \
        || echo "no datalad found - skipping generation of indexes for web frontend"; \
        ) &> "{}/{}"
        '''.format(logs_remote_dir, str(path), logs_remote_dir,
                   'datalad-publish-hook-$(date +%s).log' % TIMESTAMP_FMT)

        # collate content for post_update hook
        hook_content = '\n'.join([
            '#!/bin/bash', 'git update-server-info', make_log_dir, json_command
        ])

        with make_tempfile(content=hook_content
                           ) as tempf:  # create post_update hook script
            ssh.copy(tempf, hook_remote_target)  # upload hook to dataset
        ssh(['chmod', '+x', hook_remote_target])  # and make it executable

    @staticmethod
    def upload_web_interface(path, ssh, shared, ui):
        # path to web interface resources on local
        webui_local = opj(dirname(datalad.__file__), 'resources', 'website')
        # local html to dataset
        html_local = opj(webui_local, "index.html")

        # name and location of web-interface html on target
        html_targetname = {True: ui, False: "index.html"}[isinstance(ui, str)]
        html_target = opj(path, html_targetname)

        # upload ui html to target
        ssh.copy(html_local, html_target)

        # upload assets to the dataset
        webresources_local = opj(webui_local, 'assets')
        webresources_remote = opj(path, WEB_HTML_DIR)
        ssh(['mkdir', '-p', webresources_remote])
        ssh.copy(webresources_local, webresources_remote, recursive=True)

        # minimize and upload js assets
        for js_file in glob(opj(webresources_local, 'js', '*.js')):
            with open(js_file) as asset:
                try:
                    from jsmin import jsmin
                    minified = jsmin(asset.read())  # minify asset
                except ImportError:
                    lgr.warning(
                        "Will not minify web interface javascript, no jsmin available"
                    )
                    minified = asset.read()  # no minify available
                with make_tempfile(content=minified
                                   ) as tempf:  # write minified to tempfile
                    js_name = js_file.split('/')[-1]
                    ssh.copy(tempf,
                             opj(webresources_remote, 'assets', 'js',
                                 js_name))  # and upload js

        # explicitly make web+metadata dir of dataset world-readable, if shared set to 'all'
        mode = None
        if shared in (True, 'true', 'all', 'world', 'everybody'):
            mode = 'a+rX'
        elif shared == 'group':
            mode = 'g+rX'
        elif str(shared).startswith('0'):
            mode = shared

        if mode:
            ssh([
                'chmod', mode, '-R',
                dirname(webresources_remote),
                opj(path, 'index.html')
            ])
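A hedged sketch of invoking the interface above through the bound dataset method (host, paths and sibling name are made up):

from datalad.distribution.dataset import Dataset

ds = Dataset('.')  # assumes an installed DataLad dataset
# mirrors `datalad create-sibling`; %NAME is expanded per (sub)dataset
ds.create_sibling(
    'user@example.com:/srv/datasets',   # sshurl
    target='server',                    # sibling name registered locally
    target_dir='/srv/datasets/%NAME',
    shared='group',
    ui=True,
    existing='skip',
    recursive=True)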
Example #7
def _get_metadata(ds, types, global_meta=None, content_meta=None, paths=None):
    """Make a direct query of a dataset to extract its metadata.

    Parameters
    ----------
    ds : Dataset
    types : list
    """
    errored = False
    dsmeta = MetadataDict()
    # each item in here will be a MetadataDict, but not the whole thing
    contentmeta = {}

    if global_meta is not None and content_meta is not None and \
            not global_meta and not content_meta:
        # both are false and not just none
        return dsmeta, contentmeta, errored

    context = {
        '@vocab':
        'http://docs.datalad.org/schema_v{}.json'.format(vocabulary_version)
    }

    fullpathlist = paths
    if paths and isinstance(ds.repo, AnnexRepo):
        # Ugly? Jep: #2055
        content_info = zip(paths, ds.repo.file_has_content(paths),
                           ds.repo.is_under_annex(paths))
        paths = [p for p, c, a in content_info if not a or c]
        nocontent = len(fullpathlist) - len(paths)
        if nocontent:
            # TODO better fail, or support incremental and label this file as not present
            lgr.warn(
                '{} files have no content present, skipped metadata extraction for {}'
                .format(
                    nocontent, 'them' if nocontent > 10 else
                    [p for p, c, a in content_info if not c and a]))

    # pull out potential metadata field blacklist config settings
    blacklist = [
        re.compile(bl) for bl in assure_list(
            ds.config.obtain('datalad.metadata.aggregate-ignore-fields',
                             default=[]))
    ]
    # enforce size limits
    max_fieldsize = ds.config.obtain('datalad.metadata.maxfieldsize')
    # keep local, who knows what some extractors might pull in
    from . import extractors
    lgr.info('Engage metadata extractors: %s', types)
    for mtype in types:
        mtype_key = mtype
        try:
            pmod = import_module('.{}'.format(mtype),
                                 package=extractors.__package__)
        except ImportError as e:
            lgr.warning(
                "Failed to import metadata extractor for '%s', "
                "broken dataset configuration (%s)? "
                "This type of metadata will be ignored: %s", mtype, ds,
                exc_str(e))
            if cfg.get('datalad.runtime.raiseonerror'):
                raise
            errored = True
            continue
        extractor = pmod.MetadataExtractor(ds, paths=paths)
        try:
            dsmeta_t, contentmeta_t = extractor.get_metadata(
                dataset=global_meta
                if global_meta is not None else ds.config.obtain(
                    'datalad.metadata.aggregate-dataset-{}'.format(
                        mtype.replace('_', '-')),
                    default=True,
                    valtype=EnsureBool()),
                content=content_meta
                if content_meta is not None else ds.config.obtain(
                    'datalad.metadata.aggregate-content-{}'.format(
                        mtype.replace('_', '-')),
                    default=True,
                    valtype=EnsureBool()))
        except Exception as e:
            lgr.error('Failed to get dataset metadata ({}): {}'.format(
                mtype, exc_str(e)))
            if cfg.get('datalad.runtime.raiseonerror'):
                raise
            errored = True
            # if we don't get global metadata we do not want content metadata
            continue

        if dsmeta_t:
            if _ok_metadata(dsmeta_t, mtype, ds, None):
                dsmeta_t = _filter_metadata_fields(dsmeta_t,
                                                   maxsize=max_fieldsize,
                                                   blacklist=blacklist)
                dsmeta[mtype_key] = dsmeta_t
            else:
                errored = True

        unique_cm = {}
        for loc, meta in contentmeta_t or {}:
            if not _ok_metadata(meta, mtype, ds, loc):
                errored = True
                continue
            # we also want to store info that there was no metadata (e.g. to get a list of
            # files that have no metadata)
            # if there is an issue that an extractor needlessly produces empty records, the
            # extractor should be fixed and not a general switch. For example the datalad_core
            # issues empty records to document the presence of a file
            #elif not meta:
            #    continue

            meta = MetadataDict(meta)
            # apply filters
            meta = _filter_metadata_fields(meta,
                                           maxsize=max_fieldsize,
                                           blacklist=blacklist)

            # assign
            # only ask each metadata extractor once, hence no conflict possible
            loc_dict = contentmeta.get(loc, {})
            if meta:
                # do not store empty stuff
                loc_dict[mtype_key] = meta
            contentmeta[loc] = loc_dict

            if ds.config.obtain('datalad.metadata.generate-unique-{}'.format(
                    mtype_key.replace('_', '-')),
                                default=True,
                                valtype=EnsureBool()):
                # go through content metadata and inject report of unique keys
                # and values into `dsmeta`
                for k, v in meta.items():
                    if k in dsmeta.get(mtype_key, {}):
                        # if the dataset already has a dedicated idea
                        # about a key, we skip it from the unique list
                        # the point of the list is to make missing info about
                        # content known in the dataset, not to blindly
                        # duplicate metadata. Example: list of samples data
                        # were recorded from. If the dataset has such under
                        # a 'sample' key, we should prefer that, over an
                        # aggregated list of a hopefully-kinda-ok structure
                        continue
                    vset = unique_cm.get(k, set())
                    vset.add(_val2hashable(v))
                    unique_cm[k] = vset

        if unique_cm:
            # per source storage here too
            ucp = dsmeta.get('datalad_unique_content_properties', {})
            # important: we want to have a stable order regarding
            # the unique values (a list). we cannot guarantee the
            # same order of discovery, hence even when not using a
            # set above we would still need sorting. the challenge
            # is that any value can be an arbitrarily complex nested
            # beast
            # we also want to have each unique value set always come
            # in a top-level list, so we know if some unique value
            # was a list, as opposed to a list of unique values
            ucp[mtype_key] = {
                k: [
                    dict(i) if isinstance(i, ReadOnlyDict) else i
                    for i in sorted(v, key=_unique_value_key)
                ]
                for k, v in unique_cm.items()
            }
            dsmeta['datalad_unique_content_properties'] = ucp

    # always identify the effective vocabulary - JSON-LD style
    if context:
        dsmeta['@context'] = context

    return dsmeta, contentmeta, errored
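The per-extractor behaviour above hinges on config variables derived from the extractor name (underscores mapped to dashes). A hedged sketch of the three switches for an illustrative extractor name, and how each would be read:

mtype = 'datalad_core'               # illustrative extractor name
cfgname = mtype.replace('_', '-')
switches = [
    'datalad.metadata.aggregate-dataset-{}'.format(cfgname),
    'datalad.metadata.aggregate-content-{}'.format(cfgname),
    'datalad.metadata.generate-unique-{}'.format(cfgname),
]
# given a Dataset `ds`, each switch defaults to True and would be read as
# ds.config.obtain(switch, default=True, valtype=EnsureBool())
print(switches)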
Example #8
class Add(Interface):
    r"""Add metadata to metadata model instance.

    This command reads metadata from a source and adds this metadata
    to a metadata model instance. A source can be: arguments, standard
    input, or a local file. The metadata format is a string containing a
    JSON-serialized dictionary that describes the metadata.

    [TODO: add a schema]

    If metadata is read from a source, parameters can overwrite or
    amend information that is stored in the source.

    The METADATA and the ADDITIONAL_VALUES arguments can be pre-fixed by '@',
    in which case the pre-fixed argument is interpreted as a file-name and
    the argument value is read from the file.

    """

    _examples_ = [
        dict(text='Add metadata stored in the file "metadata-123.json" to the '
             'metadata model instance in the current directory.',
             code_cmd="datalad meta-add metadata-123.json"),
        dict(text='Add metadata stored in the file "metadata-123.json" to the '
             'metadata stored in the git-repository "/home/user/dataset_0"',
             code_cmd="datalad meta-add --metadata-store /home/user/dataset_0 "
             "metadata-123.json"),
        dict(text='Add metadata stored in the file "metadata-123.json" to the '
             'metadata model instance in the current directory and '
             'overwrite the "dataset_id" value stored in '
             '"metadata-123.json"',
             code_cmd='datalad meta-add --metadata-store /home/user/dataset_0 '
             'metadata-123.json \'{"dataset_id": '
             '"00010203-1011-2021-3031-404142434445"}\''),
        dict(
            text='Add metadata read from standard input to the metadata model '
            'instance in the current directory',
            code_cmd='datalad meta-add -'),
        dict(text='Add metadata stored in the file "metadata-123.json" to the '
             'metadata model instance in the current directory and '
             'overwrite metadata values with the values stored in '
             '"extra-info.json"',
             code_cmd='datalad meta-add --metadata-store /home/user/dataset_0 '
             'metadata-123.json @extra-info.json')
    ]

    required_keys = ("type", "extractor_name", "extractor_version",
                     "extraction_parameter", "extraction_time", "agent_name",
                     "agent_email", "dataset_id", "dataset_version",
                     "extracted_metadata")

    optional_keys = ("path", )

    required_additional_keys = ("root_dataset_id", "root_dataset_version",
                                "dataset_path")

    required_keys_lines = "\n".join(map(repr, required_keys))
    required_additional_keys_lines = "\n".join(
        map(repr, required_additional_keys))

    _params_ = dict(
        metadata=Parameter(
            args=("metadata", ),
            metavar="METADATA",
            doc=f"""Path of a file that contains the metadata that
            should be added to the metadata model instance (the
            metadata must be provided as a JSON-serialized metadata
            dictionary).
            
            If the path is "-", metadata is read from standard input.
            
            The dictionary must contain the following keys:
            
            {required_keys_lines}
            
            If the metadata is associated with a file, the following key
            indicates the file path:
            
            'path'
            
            It may in addition contain either all or none of the
            following keys (they are used to add metadata element
            as a sub-dataset element, i.e. perform aggregation):
            
            {required_additional_keys_lines}            
            """,
            constraints=EnsureStr() | EnsureNone()),
        metadata_store=Parameter(
            args=("-m", "--metadata-store"),
            metavar="METADATA_STORE",
            doc="""Directory in which the metadata model instance is
            stored. If no directory name is provided, the current working
            directory is used.""",
            constraints=EnsureStr() | EnsureNone()),
        additionalvalues=Parameter(
            args=("additionalvalues", ),
            metavar="ADDITIONAL_VALUES",
            doc="""A string that contains a JSON serialized dictionary of
            key value-pairs. These key values-pairs are used in addition to
            the key value pairs in the metadata dictionary to describe
            the metadata that should be added. If an additional key is
            already present in the metadata, an error is raised, unless
            -o, --allow-override is provided. In this case, the additional
            values will override the value in metadata and a warning is 
            issued.""",
            nargs="?",
            constraints=EnsureStr() | EnsureNone()),
        allow_override=Parameter(
            args=("-o", "--allow-override"),
            doc="""Allow the additional values to override values given in
            metadata.""",
            default=False,
            constraints=EnsureBool() | EnsureNone()),
        allow_unknown=Parameter(
            args=("-u", "--allow-unknown"),
            doc="""Allow unknown keys. By default, unknown keys generate
            an error. If this switch is True, unknown keys will only be
            reported.""",
            default=False,
            constraints=EnsureBool() | EnsureNone()))

    @staticmethod
    @datasetmethod(name="meta_add")
    @eval_results
    def __call__(metadata: Union[str, JSONObject],
                 metadata_store: Optional[str] = None,
                 additionalvalues: Optional[Union[str, JSONObject]] = None,
                 allow_override: bool = False,
                 allow_unknown: bool = False):

        additionalvalues = additionalvalues or dict()
        metadata_store = Path(metadata_store or curdir)

        metadata = process_parameters(
            metadata=read_json_object(metadata),
            additional_values=get_json_object(additionalvalues),
            allow_override=allow_override,
            allow_unknown=allow_unknown)

        lgr.debug(f"attempting to add metadata: {json.dumps(metadata)}")

        add_parameter = AddParameter(
            dataset_id=UUID(metadata["dataset_id"]),
            dataset_version=metadata["dataset_version"],
            file_path=(MetadataPath(metadata["path"])
                       if "path" in metadata else None),
            root_dataset_id=(UUID(metadata["root_dataset_id"])
                             if "root_dataset_id" in metadata else None),
            root_dataset_version=metadata.get("root_dataset_version", None),
            dataset_path=MetadataPath(metadata.get("dataset_path", "")),
            extractor_name=metadata["extractor_name"],
            extractor_version=metadata["extractor_version"],
            extraction_time=metadata["extraction_time"],
            extraction_parameter=metadata["extraction_parameter"],
            agent_name=metadata["agent_name"],
            agent_email=metadata["agent_email"],
            extracted_metadata=metadata["extracted_metadata"])

        # If the key "path" is present in the metadata
        # dictionary, we assume that the metadata-dictionary describes
        # file-level metadata. Otherwise, we assume that the
        # metadata-dictionary contains dataset-level metadata.
        if add_parameter.file_path:
            yield from add_file_metadata(metadata_store, add_parameter)
        else:
            yield from add_dataset_metadata(metadata_store, add_parameter)
        return
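A hedged sketch of a metadata record that carries the required keys listed above; every value here is made up and only meant to show the expected shape:

import json

record = {
    "type": "dataset",
    "extractor_name": "metalad_example",
    "extractor_version": "1",
    "extraction_parameter": {},
    "extraction_time": 1663064400.0,
    "agent_name": "Jane Doe",
    "agent_email": "jane@example.com",
    "dataset_id": "00010203-1011-2021-3031-404142434445",
    "dataset_version": "0123456789abcdef0123456789abcdef01234567",
    "extracted_metadata": {"@id": "illustrative-payload"},
}
# written to a file, this could then be passed as the METADATA argument:
#   datalad meta-add metadata-123.json
with open("metadata-123.json", "w") as f:
    json.dump(record, f)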
Example #9
class CreateSiblingRia(Interface):
    """Creates a sibling to a dataset in a RIA store

    This creates a representation of a dataset in a ria-remote compliant
    storage location. For access to it two siblings are configured for the
    dataset by default. A "regular" one and a RIA remote (git-annex
    special remote).  Furthermore, the former is configured to have a
    publication dependency on the latter. If not given, a default name for
    the RIA remote is derived from the sibling's name by appending "-ria".

    The store's base path currently is expected to either:

      - not yet exist or
      - be empty or
      - have a valid `ria-layout-version` file and an `error_logs` directory.

    In the first two cases, said file and directory are created by this
    command. Alternatively, you can set up the third case manually, of course.
    Please note that `ria-layout-version` needs to contain a line stating the
    version (currently '1') and optionally enable error logging (append '|l' in
    that case). Currently, this line MUST end with a newline!

    Error logging will create files in the `error_logs` directory whenever the
    RIA special remote (storage sibling) raises an exception, storing the
    python traceback of it. The logfiles are named according to the scheme
    <dataset id>.<annex uuid of the remote>.log showing 'who' ran into this
    issue with what dataset. Since this logging can potentially leak personal
    data (like local file paths for example) it can be disabled from the client
    side via `annex.ria-remote.<RIAREMOTE>.ignore-remote-config`.

    Todo
    ----
    Where to put the description of a RIA store (see below)?

    The targeted layout of such a store is a tree of datasets, starting at the
    configured base path. First level of subdirectories are named for the first
    three characters of the datasets' id, second level is the remainder of
    those ids. The thereby created dataset directories contain a bare git
    repository.  Those bare repositories are slightly different from plain
    git-annex bare repositories in that they use the standard dirhashmixed
    layout beneath annex/objects as opposed to dirhashlower, which is
    git-annex's default for bare repositories. Furthermore, there is an
    additional directory 'archives' within the dataset directories, which may
    or may not contain archives with annexed content.  Note that this helps to
    reduce the number of inodes consumed (no checkout + potential archive) and
    allows dependencies (that is, (sub)datasets) to be resolved merely by
    their id.  Finally, there is a file `ria-layout-version` put beneath the
    store's base path, determining the version of the dataset tree layout and a
    file of the same name per each dataset directory determining object tree
    layout version (we already switched from dirhashlower to dirhashmixed, for
    example) and an additional directory `error_logs` at the toplevel.  """

    # TODO: description?
    _params_ = dict(
        dataset=Parameter(args=("-d", "--dataset"),
                          doc="""specify the dataset to process.  If
            no dataset is given, an attempt is made to identify the dataset
            based on the current working directory""",
                          constraints=EnsureDataset() | EnsureNone()),
        url=Parameter(
            args=("url", ),
            metavar="ria+<ssh|file>://<host>[/path]",
            doc="""URL identifying the target RIA store and access protocol.
            """,
            constraints=EnsureStr() | EnsureNone()),
        name=Parameter(args=(
            '-s',
            '--name',
        ),
                       metavar='NAME',
                       doc="""Name of the sibling.
            With `recursive`, the same name will be used to label all
            the subdatasets' siblings.""",
                       constraints=EnsureStr() | EnsureNone(),
                       required=True),
        ria_remote_name=Parameter(
            args=("--ria-remote-name", ),
            metavar="NAME",
            doc="""Name of the RIA remote (a git-annex special remote).
            Must not be identical to the sibling name. If not specified,
            defaults to the sibling name plus a '-ria' suffix.""",
            constraints=EnsureStr() | EnsureNone()),
        post_update_hook=Parameter(
            args=("--post-update-hook", ),
            doc="""Enable git's default post-update-hook for the created
            sibling.""",
            action="store_true"),
        shared=Parameter(
            args=("--shared", ),
            metavar='{false|true|umask|group|all|world|everybody|0xxx}',
            doc="""If given, configures the permissions in the
            RIA store for multi-user access.
            Possible values for this option are identical to those of
            `git init --shared` and are described in its documentation.""",
            constraints=EnsureStr() | EnsureBool() | EnsureNone()),
        group=Parameter(
            args=("--group", ),
            metavar="GROUP",
            doc="""Filesystem group for the repository. Specifying the group is
            crucial when [CMD: --shared=group CMD][PY: shared="group" PY]""",
            constraints=EnsureStr() | EnsureNone()),
        ria_remote=Parameter(
            args=("--no-ria-remote", ),
            dest='ria_remote',
            doc="""Whether to establish remote indexed archive (RIA) capabilties
            for the created sibling. If enabled, git-annex special remote access
            will be configured to enable regular git-annex key storage, and
            also retrieval of keys from (compressed) 7z archives that might be
            provided by the dataset store. If disabled, git-annex is instructed
            to ignore the sibling.""",
            action="store_false"),
        existing=Parameter(
            args=("--existing", ),
            constraints=EnsureChoice('skip', 'replace', 'error',
                                     'reconfigure'),
            metavar='MODE',
            doc="""Action to perform, if a sibling or ria-remote is already
            configured under the given name and/or a target already exists.
            In this case, a dataset can be skipped ('skip'), an existing target
            directory be forcefully re-initialized, and the sibling
            (re-)configured ('replace', implies 'reconfigure'), the sibling
            configuration be updated only ('reconfigure'), or an error can be
            raised ('error').""",
        ),
        recursive=recursion_flag,
        recursion_limit=recursion_limit,
    )

    @staticmethod
    @datasetmethod(name='create_sibling_ria')
    @eval_results
    def __call__(url,
                 name,
                 dataset=None,
                 ria_remote_name=None,
                 post_update_hook=False,
                 shared=None,
                 group=None,
                 ria_remote=True,
                 existing='error',
                 recursive=False,
                 recursion_limit=None):

        ds = require_dataset(dataset,
                             check_installed=True,
                             purpose='create sibling RIA')
        res_kwargs = dict(
            ds=ds,
            action="create-sibling-ria",
            logger=lgr,
        )

        if ds.repo.get_hexsha() is None or ds.id is None:
            raise RuntimeError("Repository at {} is not a DataLad dataset, "
                               "run 'datalad create [--force]' first.".format(
                                   ds.path))

        if not ria_remote and ria_remote_name:
            lgr.warning(
                "RIA remote setup disabled, but a ria-remote name was provided"
            )

        if ria_remote and not ria_remote_name:
            ria_remote_name = "{}-ria".format(name)

        if ria_remote and name == ria_remote_name:
            # leads to unresolvable, circular dependency with publish-depends
            raise ValueError("sibling names must not be equal")

        if not isinstance(url, str):
            raise TypeError("url is not a string, but %s" % type(url))

        # Query existing siblings upfront in order to fail early on
        # existing=='error', since a misconfiguration (particularly of special
        # remotes) that only fails in a subdataset later on can be quite
        # painful.
        # TODO: messages - this is "create-sibling". Don't confuse existence of
        #       local remotes with existence of the actual remote sibling
        #       in wording
        if existing == 'error':
            # in recursive mode this check could take a substantial amount of
            # time: employ a progress bar (or rather a counter, because we
            # don't know the total in advance)
            pbar_id = 'check-siblings-{}'.format(id(ds))
            log_progress(
                lgr.info,
                pbar_id,
                'Start checking pre-existing sibling configuration %s',
                ds,
                label='Query siblings',
                unit=' Siblings',
            )
            # even if we have to fail, let's report all conflicting siblings
            # in subdatasets
            failed = False
            for r in ds.siblings(result_renderer=None,
                                 recursive=recursive,
                                 recursion_limit=recursion_limit):
                log_progress(lgr.info,
                             pbar_id,
                             'Discovered sibling %s in dataset at %s',
                             r['name'],
                             r['path'],
                             update=1,
                             increment=True)
                if not r['type'] == 'sibling' or r['status'] != 'ok':
                    # this is an internal status query that has no consequence
                    # for the outside world. Be silent unless something useful
                    # can be said
                    #yield r
                    continue
                if r['name'] == name:
                    res = get_status_dict(
                        status='error',
                        message="a sibling '{}' is already configured in "
                        "dataset {}".format(name, r['path']),
                        **res_kwargs,
                    )
                    failed = True
                    yield res
                    continue
                if ria_remote_name and r['name'] == ria_remote_name:
                    res = get_status_dict(
                        status='error',
                        message="a sibling '{}' is already configured in "
                        "dataset {}".format(ria_remote_name, r['path']),
                        **res_kwargs,
                    )
                    failed = True
                    yield res
                    continue
            log_progress(
                lgr.info,
                pbar_id,
                'Finished checking pre-existing sibling configuration %s',
                ds,
            )
            if failed:
                return

        yield from _create_sibling_ria(ds, url, name, ria_remote,
                                       ria_remote_name, existing, shared,
                                       group, post_update_hook, res_kwargs)

        if recursive:
            # Note: subdatasets can be treated independently, so go full
            # recursion when querying for them and _no_recursion with the
            # actual call. Theoretically this can be parallelized.

            for subds in ds.subdatasets(fulfilled=True,
                                        recursive=True,
                                        recursion_limit=recursion_limit,
                                        result_xfm='datasets'):
                yield from _create_sibling_ria(subds, url, name, ria_remote,
                                               ria_remote_name, existing,
                                               shared, group, post_update_hook,
                                               res_kwargs)
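A hedged sketch of calling the interface above as a dataset method; the store URL and sibling names are made up:

from datalad.distribution.dataset import Dataset

ds = Dataset('.')  # assumes an installed DataLad dataset
# mirrors `datalad create-sibling-ria -s myserver ria+ssh://...`
ds.create_sibling_ria(
    'ria+ssh://storage.example.com/srv/ria-store',
    name='myserver',                  # the "regular" sibling
    ria_remote_name='myserver-ria',   # the git-annex special remote (default: <name>-ria)
    existing='skip',
    recursive=True)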
Example #10
File: run.py  Project: nicholsn/datalad
class Run(Interface):
    """Run an arbitrary shell command and record its impact on a dataset.

    It is recommended to craft the command such that it can run in the root
    directory of the dataset that the command will be recorded in. However,
    as long as the command is executed somewhere underneath the dataset root,
    the exact location will be recorded relative to the dataset root.

    If the executed command did not alter the dataset in any way, no record of
    the command execution is made.

    If the given command errors, a `CommandError` exception with the same exit
    code will be raised, and no modifications will be saved.

    *Command format*

    || REFLOW >>
    A few placeholders are supported in the command via Python format
    specification. "{pwd}" will be replaced with the full path of the current
    working directory. "{dspath}" will be replaced with the full path of the
    dataset that run is invoked on. "{tmpdir}" will be replaced with the full
    path of a temporary directory. "{inputs}" and "{outputs}" represent the
    values specified by [CMD: --input and --output CMD][PY: `inputs` and
    `outputs` PY]. If multiple values are specified, the values will be joined
    by a space. The order of the values will match that order from the command
    line, with any globs expanded in alphabetical order (like bash). Individual
    values can be accessed with an integer index (e.g., "{inputs[0]}").
    << REFLOW ||

    || REFLOW >>
    Note that the representation of the inputs or outputs in the formatted
    command string depends on whether the command is given as a list of
    arguments or as a string[CMD:  (quotes surrounding the command) CMD]. The
    concatenated list of inputs or outputs will be surrounded by quotes when
    the command is given as a list but not when it is given as a string. This
    means that the string form is required if you need to pass each input as a
    separate argument to a preceding script (i.e., write the command as
    "./script {inputs}", quotes included). The string form should also be used
    if the input or output paths contain spaces or other characters that need
    to be escaped.
    << REFLOW ||

    To escape a brace character, double it (i.e., "{{" or "}}").

    Custom placeholders can be added as configuration variables under
    "datalad.run.substitutions".  As an example:

      Add a placeholder "name" with the value "joe"::

        % git config --file=.datalad/config datalad.run.substitutions.name joe
        % datalad add -m "Configure name placeholder" .datalad/config

      Access the new placeholder in a command::

        % datalad run "echo my name is {name} >me"
    """
    _examples_ = [
        dict(
            text="Run an executable script and record the impact on a dataset",
            code_py="run(message='run my script', cmd='code/script.sh')",
            code_cmd="datalad run -m 'run my script' 'code/script.sh'"),
        dict(text="Run a command and specify a directory as a dependency "
             "for the run. The contents of the dependency will be retrieved "
             "prior to running the script",
             code_cmd="datalad run -m 'run my script' -i 'data/*' "
             "'code/script.sh'",
             code_py="""\
             run(cmd='code/script.sh', message='run my script',
                 inputs=['data/*'])"""),
        dict(text="Run an executable script and specify output files of the "
             "script to be unlocked prior to running the script",
             code_py="""\
             run(cmd='code/script.sh', message='run my script',
                 inputs=['data/*'], outputs=['output_dir'])""",
             code_cmd="""\
             datalad run -m 'run my script' -i 'data/*' \\
             -o 'output_dir/*' 'code/script.sh'"""),
        dict(text="Specify multiple inputs and outputs",
             code_py="""\
             run(cmd='code/script.sh',
                 message='run my script',
                 inputs=['data/*', 'datafile.txt'],
                 outputs=['output_dir', 'outfile.txt'])""",
             code_cmd="""\
             datalad run -m 'run my script' -i 'data/*' \\
             -i 'datafile.txt' -o 'output_dir/*' -o \\
             'outfile.txt' 'code/script.sh'""")
    ]

    _params_ = dict(
        cmd=Parameter(
            args=("cmd", ),
            nargs=REMAINDER,
            metavar='COMMAND',
            doc="""command for execution. A leading '--' can be used to
            disambiguate this command from the preceding options to
            DataLad."""),
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc="""specify the dataset to record the command results in.
            An attempt is made to identify the dataset based on the current
            working directory. If a dataset is given, the command will be
            executed in the root directory of this dataset.""",
            constraints=EnsureDataset() | EnsureNone()),
        inputs=Parameter(
            args=("-i", "--input"),
            dest="inputs",
            metavar=("PATH"),
            action='append',
            doc="""A dependency for the run. Before running the command, the
            content of this file will be retrieved. A value of "." means "run
            :command:`datalad get .`". The value can also be a glob. [CMD: This
            option can be given more than once. CMD]"""),
        outputs=Parameter(
            args=("-o", "--output"),
            dest="outputs",
            metavar=("PATH"),
            action='append',
            doc="""Prepare this file to be an output file of the command. A
            value of "." means "run :command:`datalad unlock .`" (and will fail
            if some content isn't present). For any other value, if the content
            of this file is present, unlock the file. Otherwise, remove it. The
            value can also be a glob. [CMD: This option can be given more than
            once. CMD]"""),
        expand=Parameter(
            args=("--expand", ),
            doc="""Expand globs when storing inputs and/or outputs in the
            commit message.""",
            constraints=EnsureChoice(None, "inputs", "outputs", "both")),
        explicit=Parameter(
            args=("--explicit", ),
            action="store_true",
            doc="""Consider the specification of inputs and outputs to be
            explicit. Don't warn if the repository is dirty, and only save
            modifications to the listed outputs."""),
        message=save_message_opt,
        sidecar=Parameter(args=('--sidecar', ),
                          metavar="{yes|no}",
                          doc="""By default, the configuration variable
            'datalad.run.record-sidecar' determines whether a record with
            information on a command's execution is placed into a separate
            record file instead of the commit message (default: off). This
            option can be used to override the configured behavior on a
            case-by-case basis. Sidecar files are placed into the dataset's
            '.datalad/runinfo' directory (customizable via the
            'datalad.run.record-directory' configuration variable).""",
                          constraints=EnsureNone() | EnsureBool()),
    )

    @staticmethod
    @datasetmethod(name='run')
    @eval_results
    def __call__(cmd=None,
                 dataset=None,
                 inputs=None,
                 outputs=None,
                 expand=None,
                 explicit=False,
                 message=None,
                 sidecar=None):
        for r in run_command(cmd,
                             dataset=dataset,
                             inputs=inputs,
                             outputs=outputs,
                             expand=expand,
                             explicit=explicit,
                             message=message,
                             sidecar=sidecar):
            yield r
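The `sidecar` parameter above is constrained with `EnsureNone() | EnsureBool()`, so it accepts either "no preference" or a boolean-like value. A minimal sketch of how such a combined constraint coerces its input, assuming the constraint classes are importable from datalad.support.constraints as in the surrounding snippets:

from datalad.support.constraints import EnsureBool, EnsureNone

sidecar_constraint = EnsureNone() | EnsureBool()
print(sidecar_constraint(None))    # None -> fall back to the configured default
print(sidecar_constraint('yes'))   # boolean-like strings are coerced -> True
print(sidecar_constraint(False))   # real booleans pass through -> False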
Example #11
class WebApp(Interface):
    """Serve a DataLad dataset through a minimal Flask-based webapp
    """
    _params_ = dict(
        app=Parameter(args=('app', ),
                      nargs='?',
                      metavar='APPNAME',
                      doc="""Name of a registered webapp to start"""),
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc="""specify the dataset to serve as the anchor of the webapp.
            An attempt is made to identify the dataset based on the current
            working directory. If a dataset is given, the command will be
            executed in the root directory of this dataset.""",
            constraints=EnsureDataset() | EnsureNone()),
        read_only=Parameter(
            args=("--read-only", ),
            constraints=EnsureBool(),
            doc="""do not perform operations other then read-only access
            to dataset. It is up to the individual resources to interpret
            this flag and act accordingly."""),
        mode=Parameter(
            args=("--mode", ),
            constraints=EnsureChoice('normal', 'daemon', 'dry-run', 'debug'),
            doc="""Execution mode: regular foreground process (normal);
            background process (daemon); no server is started, but all
            configuration is performed (dry-run); like normal, but in debug
            mode (debug)"""),
        static_root=Parameter(
            args=("--static-root", ),
            doc="""path to static (HTML) files that should be served in
            root of the webapp. Defaults to the current directory."""),
        get_apps=Parameter(args=('--get-apps', ),
                           action='store_true',
                           doc="""if set, yields all registered webapp."""),
    )

    @staticmethod
    @datasetmethod(name='webapp')
    @eval_results
    def __call__(app=None,
                 dataset=None,
                 read_only=False,
                 mode='normal',
                 static_root=None,
                 get_apps=False):
        if get_apps:
            for ep in iter_entry_points('datalad.webapp.apps'):
                yield dict(action='webapp',
                           status='ok' if resource_isdir(
                               ep.module_name, ep.load()) else 'error',
                           path=ep.name,
                           logger=lgr,
                           message=("provided by '%s'", ep.module_name))
            return

        from datalad.distribution.dataset import require_dataset
        dataset = require_dataset(dataset,
                                  check_installed=True,
                                  purpose='serving')

        if static_root is None and app:
            for ep in iter_entry_points('datalad.webapp.apps'):
                if ep.name == app:
                    app_path = resource_filename(ep.module_name, ep.load())
                    if not resource_isdir(ep.module_name, ep.load()):
                        yield dict(
                            action='webapp',
                            status='error',
                            path=dataset.path,
                            message=(
                                "app entrypoint '%s' does not point to a "
                                "directory (%s)", app, app_path))
                        return
                    static_root = app_path
                    break
            if static_root is None:
                yield dict(action='webapp',
                           status='error',
                           path=dataset.path,
                           message=("no registered webapp with name '%s'",
                                    app))
                return
        elif static_root is None:
            static_root = op.curdir

        from flask import Flask
        app = Flask(
            __name__,
            root_path=dataset.path,
            static_url_path='',
            static_folder=op.abspath(static_root),
        )
        app.secret_key = os.urandom(64)
        # expose via arg
        app.config['api_key'] = 'dummy'

        webapp_props['config'] = app.config

        from flask_restful import Api
        api = Api(app, prefix="/api/v1")

        # TODO add default route to static index.html, if one exists
        # TODO use opt-in model for endpoints to limit exposure of
        # functionality to what is really needed
        for ep in iter_entry_points('datalad.webapp.resources'):
            lgr.warn("Available webapp resource'%s'", ep.name)
            cls = ep.load()
            urls = ['/{}'.format(ep.name)]
            if hasattr(cls, '_urlarg_spec'):
                urls.append('/{}/{}'.format(ep.name, cls._urlarg_spec))

            api.add_resource(cls,
                             *urls,
                             resource_class_kwargs=dict(dataset=dataset, ))

        if op.exists(op.join(static_root, 'index.html')):
            from flask import send_from_directory

            @app.route('/')
            def serve_index():
                return send_from_directory(static_root, 'index.html')

        if mode == 'dry-run':
            yield dict(
                action='webapp',
                status='ok',
                app=app,
                path=dataset.path,
            )
            return

        print("""
*************************************************
*************************************************

      THIS IS NOT A PRODUCTION-READY TOOL

      - only use in a trusted environment
      - do not expose service on public
        network interfaces

*************************************************
*************************************************
""")
        # TODO expose flags, or use FLASK config vars
        app.run(debug=mode == 'debug')
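The resource loop near the end relies on setuptools entry points in the 'datalad.webapp.resources' group. A hypothetical setup.py fragment for an extension package that wants its flask_restful resource class discovered by that loop could look like the following (package and class names are made up for illustration):

from setuptools import setup

setup(
    name='datalad-mywebapp',           # hypothetical extension package
    packages=['datalad_mywebapp'],
    entry_points={
        # group name matches what iter_entry_points() queries above
        'datalad.webapp.resources': [
            'files=datalad_mywebapp.resources:FileResource',
        ],
    },
)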
Example #12
 },
 'datalad.exc.str.tblimit': {
     'ui': ('question', {
         'title': 'This flag is used by datalad to cap the number of traceback steps included in exception logging and result reporting to DATALAD_EXC_STR_TBLIMIT of pre-processed entries from traceback.'
     }),
 },
 'datalad.fake-dates': {
     'ui': ('yesno', {
         'title': 'Fake (anonymize) dates',
         'text': 'Should the dates in the logs be faked?'
     }),
     'destination': 'local',
     'type': EnsureBool(),
     'default': False,
 },
 'datalad.fake-dates-start': {
     'ui': ('question', {
         'title': 'Initial fake date',
         'text': 'When faking dates and there are no commits in any local branches, generate the date by adding one second to this value (Unix epoch time). The value must be positive.'
     }),
     'type': EnsureInt(),
     'default': 1112911993,
 },
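The 'type' entries above are what turn raw configuration strings (e.g. from git config or environment variables) into proper Python values. A minimal sketch of that coercion, assuming the constraint classes are importable from datalad.support.constraints as in the other snippets:

from datalad.support.constraints import EnsureBool, EnsureInt

# boolean-like spellings are normalized to real booleans
assert EnsureBool()('yes') is True
assert EnsureBool()(False) is False
# integer-valued strings become integers
assert EnsureInt()('1112911993') == 1112911993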
Example #13
class CreateSibling(Interface):
    """Create a dataset sibling on a UNIX-like SSH-accessible machine

    Given a local dataset, and SSH login information this command creates
    a remote dataset repository and configures it as a dataset sibling to
    be used as a publication target (see `publish` command).

    Various properties of the remote sibling can be configured (e.g. name,
    location on the server, read and write access URLs, and access
    permissions).

    Optionally, a basic web-viewer for DataLad datasets can be installed
    at the remote location.

    This command supports recursive processing of dataset hierarchies, creating
    a remote sibling for each dataset in the hierarchy. By default, remote
    siblings are created in a hierarchical structure that reflects the
    organization on the local file system. However, a simple templating
    mechanism is provided to produce a flat list of datasets (see
    --target-dir).
    """
    # XXX prevent common args from being added to the docstring
    _no_eval_results = True

    _params_ = dict(
        # TODO: Figure out, whether (and when) to use `sshurl` as push url
        dataset=Parameter(
            args=("--dataset", "-d",),
            doc="""specify the dataset to create the publication target for. If
                no dataset is given, an attempt is made to identify the dataset
                based on the current working directory""",
            constraints=EnsureDataset() | EnsureNone()),
        sshurl=Parameter(
            args=("sshurl",),
            metavar='SSHURL',
            nargs='?',
            doc="""Login information for the target server. This can be given
                as a URL (ssh://host/path) or SSH-style (user@host:path).
                Unless overridden, this also serves as the future dataset's access
                URL and path on the server.""",
            constraints=EnsureStr()),
        name=Parameter(
            args=('-s', '--name',),
            metavar='NAME',
            doc="""sibling name to create for this publication target.
                If `recursive` is set, the same name will be used to label all
                the subdatasets' siblings. When creating a target dataset fails,
                no sibling is added""",
            constraints=EnsureStr() | EnsureNone(),
            nargs="?"),
        target_dir=Parameter(
            args=('--target-dir',),
            metavar='PATH',
            doc="""path to the directory *on the server* where the dataset
                shall be created. By default the SSH access URL is used to
                identify this directory. If a relative path is provided here,
                it is interpreted as being relative to the user's home
                directory on the server.\n
                Additional features are relevant for recursive processing of
                datasets with subdatasets. By default, the local
                dataset structure is replicated on the server. However, it is
                possible to provide a template for generating different target
                directory names for all (sub)datasets. Templates can contain
                certain placeholder that are substituted for each (sub)dataset.
                For example: "/mydirectory/dataset%%RELNAME".\nSupported
                placeholders:\n
                %%RELNAME - the name of the dataset, with any slashes replaced by
                dashes\n""",
            constraints=EnsureStr() | EnsureNone()),
        target_url=Parameter(
            args=('--target-url',),
            metavar='URL',
            doc=""""public" access URL of the to-be-created target dataset(s)
                (default: `sshurl`). Accessibility of this URL determines the
                access permissions of potential consumers of the dataset.
                As with `target_dir`, templates (same set of placeholders)
                are supported.  Also, if specified, it is provided as the annex
                description\n""",
            constraints=EnsureStr() | EnsureNone()),
        target_pushurl=Parameter(
            args=('--target-pushurl',),
            metavar='URL',
            doc="""In case the `target_url` cannot be used to publish to the
                dataset, this option specifies an alternative URL for this
                purpose. As with `target_url`, templates (same set of
                placeholders) are supported.\n""",
            constraints=EnsureStr() | EnsureNone()),
        recursive=recursion_flag,
        recursion_limit=recursion_limit,
        existing=Parameter(
            args=("--existing",),
            constraints=EnsureChoice('skip', 'replace', 'error', 'reconfigure'),
            metavar='MODE',
            doc="""action to perform, if a sibling is already configured under the
            given name and/or a target directory already exists.
            In this case, a dataset can be skipped ('skip'), an existing target
            directory can be forcefully re-initialized and the sibling
            (re-)configured ('replace', implies 'reconfigure'), only the sibling
            configuration can be updated ('reconfigure'), or an error can be
            raised ('error').""",),
        inherit=inherit_opt,
        shared=Parameter(
            args=("--shared",),
            metavar='{false|true|umask|group|all|world|everybody|0xxx}',
            doc="""if given, configures the access permissions on the server
            for multi-users (this could include access by a webserver!).
            Possible values for this option are identical to those of
            `git init --shared` and are described in its documentation.""",
            constraints=EnsureStr() | EnsureBool() | EnsureNone()),
        group=Parameter(
            args=("--group",),
            metavar="GROUP",
            doc="""Filesystem group for the repository. Specifying the group is
            particularly important when [CMD: --shared=group CMD][PY:
            shared="group" PY]""",
            constraints=EnsureStr() | EnsureNone()
        ),
        ui=Parameter(
            args=("--ui",),
            metavar='{false|true|html_filename}',
            doc="""publish a web interface for the dataset with an
            optional user-specified name for the html at publication
            target. Defaults to `index.html` at the dataset root""",
            constraints=EnsureBool() | EnsureStr()),
        as_common_datasrc=as_common_datasrc,
        publish_depends=publish_depends,
        publish_by_default=publish_by_default,
        annex_wanted=annex_wanted_opt,
        annex_group=annex_group_opt,
        annex_groupwanted=annex_groupwanted_opt,
        since=Parameter(
            args=("--since",),
            constraints=EnsureStr() | EnsureNone(),
            doc="""limit processing to datasets that have been changed since a given
            state (by tag, branch, commit, etc). This can be used to create siblings
            for recently added subdatasets."""),
    )

    @staticmethod
    @datasetmethod(name='create_sibling')
    @eval_results
    def __call__(sshurl, name=None, target_dir=None,
                 target_url=None, target_pushurl=None,
                 dataset=None,
                 recursive=False,
                 recursion_limit=None,
                 existing='error',
                 shared=None,
                 group=None,
                 ui=False,
                 as_common_datasrc=None,
                 publish_by_default=None,
                 publish_depends=None,
                 annex_wanted=None, annex_group=None, annex_groupwanted=None,
                 inherit=False,
                 since=None):
        #
        # nothing without a base dataset
        #
        ds = require_dataset(dataset, check_installed=True,
                             purpose='creating a sibling')
        refds_path = ds.path

        #
        # all checks that are possible before we start parsing the dataset
        #

        # possibly use sshurl to get the name in case it is not specified
        if not sshurl:
            if not inherit:
                raise InsufficientArgumentsError(
                    "needs at least an SSH URL, if no inherit option"
                )
            if name is None:
                raise ValueError(
                    "Neither SSH URL, nor the name of sibling to inherit from "
                    "was specified"
                )
            # It might well be that we already have this remote setup
            try:
                sshurl = CreateSibling._get_remote_url(ds, name)
            except Exception as exc:
                lgr.debug('%s does not know about url for %s: %s', ds, name, exc_str(exc))
        elif inherit:
            raise ValueError(
                "For now, for clarity not allowing specifying a custom sshurl "
                "while inheriting settings"
            )
            # maybe this could be safely dropped -- still WiP

        if not sshurl:
            # TODO: may be more back up before _prep?
            super_ds = ds.get_superdataset()
            if not super_ds:
                raise ValueError(
                    "Could not determine super dataset for %s to inherit URL"
                    % ds
                )
            super_url = CreateSibling._get_remote_url(super_ds, name)
            # for now assuming hierarchical setup
            # (TODO: to be able to distinguish between the two, probably
            # needs storing datalad.*.target_dir to have %RELNAME in there)
            sshurl = slash_join(super_url, relpath(ds.path, super_ds.path))

        # check the login URL
        sshri = RI(sshurl)
        if not is_ssh(sshri):
            raise ValueError(
                "Unsupported SSH URL: '{0}', "
                "use ssh://host/path or host:path syntax".format(sshurl))

        if not name:
            # use the hostname as default remote name
            name = sshri.hostname
            lgr.debug(
                "No sibling name given, use URL hostname '%s' as sibling name",
                name)

        if since == '':
            # consider creating siblings only since the point of
            # the last update
            # XXX here we assume one to one mapping of names from local branches
            # to the remote
            active_branch = ds.repo.get_active_branch()
            since = '%s/%s' % (name, active_branch)

        #
        # parse the base dataset to find all subdatasets that need processing
        #
        to_process = []
        for ap in AnnotatePaths.__call__(
                dataset=refds_path,
                # only a single path!
                path=refds_path,
                recursive=recursive,
                recursion_limit=recursion_limit,
                action='create_sibling',
                # both next should not happen anyways
                unavailable_path_status='impossible',
                nondataset_path_status='error',
                modified=since,
                return_type='generator',
                on_failure='ignore'):
            if ap.get('status', None):
                # this is done
                yield ap
                continue
            if ap.get('type', None) != 'dataset' or ap.get('state', None) == 'absent':
                # this can happen when there is `since`, but we have no
                # use for anything but datasets here
                continue
            checkds_remotes = Dataset(ap['path']).repo.get_remotes() \
                if ap.get('state', None) != 'absent' \
                else []
            if publish_depends:
                # make sure dependencies are valid
                # TODO: inherit -- we might want to automagically create
                # those dependents as well???
                unknown_deps = set(assure_list(publish_depends)).difference(checkds_remotes)
                if unknown_deps:
                    ap['status'] = 'error'
                    ap['message'] = (
                        'unknown sibling(s) specified as publication dependency: %s',
                        unknown_deps)
                    yield ap
                    continue
            if name in checkds_remotes and existing in ('error', 'skip'):
                ap['status'] = 'error' if existing == 'error' else 'notneeded'
                ap['message'] = (
                    "sibling '%s' already configured (specify alternative name, or force "
                    "reconfiguration via --existing",
                    name)
                yield ap
                continue
            to_process.append(ap)

        if not to_process:
            # we ruled out all possibilities
            # TODO wait for gh-1218 and make better return values
            lgr.info("No datasets qualify for sibling creation. "
                     "Consider different settings for --existing "
                     "or --since if this is unexpected")
            return

        if target_dir is None:
            if sshri.path:
                target_dir = sshri.path
            else:
                target_dir = '.'

        # TODO: centralize and generalize template symbol handling
        replicate_local_structure = "%RELNAME" not in target_dir

        # request ssh connection:
        lgr.info("Connecting ...")
        assert sshurl is not None  # delayed sanity verification
        ssh = ssh_manager.get_connection(sshurl)
        if not ssh.get_annex_version():
            raise MissingExternalDependency(
                'git-annex',
                msg='on the remote system')

        #
        # all checks done and we have a connection, now do something
        #

        # loop over all datasets, ordered from top to bottom to make test
        # below valid (existing directories would cause the machinery to halt)
        # But we need to run post-update hook in depth-first fashion, so
        # would only collect first and then run (see gh #790)
        yielded = set()
        remote_repos_to_run_hook_for = []
        for currentds_ap in \
                sorted(to_process, key=lambda x: x['path'].count('/')):
            current_ds = Dataset(currentds_ap['path'])

            path = _create_dataset_sibling(
                name,
                current_ds,
                ds.path,
                ssh,
                replicate_local_structure,
                sshri,
                target_dir,
                target_url,
                target_pushurl,
                existing,
                shared,
                group,
                publish_depends,
                publish_by_default,
                ui,
                as_common_datasrc,
                annex_wanted,
                annex_group,
                annex_groupwanted,
                inherit
            )
            if not path:
                # nothing new was created
                # TODO is 'notneeded' appropriate in this case?
                currentds_ap['status'] = 'notneeded'
                # TODO explain status in 'message'
                yield currentds_ap
                yielded.add(currentds_ap['path'])
                continue
            remote_repos_to_run_hook_for.append((path, currentds_ap))

            # publish web-interface to root dataset on publication server
            if current_ds.path == ds.path and ui:
                lgr.info("Uploading web interface to %s" % path)
                try:
                    CreateSibling.upload_web_interface(path, ssh, shared, ui)
                except CommandError as e:
                    currentds_ap['status'] = 'error'
                    currentds_ap['message'] = (
                        "failed to push web interface to the remote datalad repository (%s)",
                        exc_str(e))
                    yield currentds_ap
                    yielded.add(currentds_ap['path'])
                    continue

        # in reverse order would be depth first
        lgr.info("Running post-update hooks in all created siblings")
        # TODO: add progressbar
        for path, currentds_ap in remote_repos_to_run_hook_for[::-1]:
            # Trigger the hook
            lgr.debug("Running hook for %s (if exists and executable)", path)
            try:
                ssh("cd {} "
                    "&& ( [ -x hooks/post-update ] && hooks/post-update || : )"
                    "".format(sh_quote(_path_(path, ".git"))))
            except CommandError as e:
                currentds_ap['status'] = 'error'
                currentds_ap['message'] = (
                    "failed to run post-update hook under remote path %s (%s)",
                    path, exc_str(e))
                yield currentds_ap
                yielded.add(currentds_ap['path'])
                continue
            if not currentds_ap['path'] in yielded:
                # if we were silent until now everything is just splendid
                currentds_ap['status'] = 'ok'
                yield currentds_ap

    @staticmethod
    def _run_on_ds_ssh_remote(ds, name, ssh, cmd):
        """Given a dataset, and name of the remote, run command via ssh

        Parameters
        ----------
        cmd: str
          Will be .format()'ed given the `path` to the dataset on remote

        Returns
        -------
        out

        Raises
        ------
        CommandError
        """
        remote_url = CreateSibling._get_remote_url(ds, name)
        remote_ri = RI(remote_url)
        out, err = ssh(cmd.format(path=sh_quote(remote_ri.path)))
        if err:
            lgr.warning("Got stderr while calling ssh: %s", err)
        return out

    @staticmethod
    def _get_ds_remote_shared_setting(ds, name, ssh):
        """Figure out setting of sharedrepository for dataset's `name` remote"""
        shared = None
        try:
            # TODO -- we might need to expanduser taking .user into account
            # but then it must be done also on remote side
            out = CreateSibling._run_on_ds_ssh_remote(
                ds, name, ssh,
                'git -C {path} config --get core.sharedrepository'
            )
            shared = out.strip()
        except CommandError as e:
            lgr.debug(
                "Could not figure out remote shared setting of %s for %s due "
                "to %s",
                ds, name, exc_str(e)
            )
            # could well be ok if e.g. not shared
            # TODO: more detailed analysis may be?
        return shared

    @staticmethod
    def _has_active_postupdate(ds, name, ssh):
        """Figure out either has active post-update hook

        Returns
        -------
        bool or None
          None if something went wrong and we could not figure out
        """
        has_active_post_update = None
        try:
            # TODO -- we might need to expanduser taking .user into account
            # but then it must be done also on remote side
            out = CreateSibling._run_on_ds_ssh_remote(
                ds, name, ssh,
                'cd {path} && [ -x .git/hooks/post-update ] && echo yes || echo no'
            )
            out = out.strip()
            assert out in ('yes', 'no')
            has_active_post_update = out == "yes"
        except CommandError as e:
            lgr.debug(
                "Could not figure out either %s on remote %s has active "
                "post_update hook due to %s",
                ds, name, exc_str(e)
            )
        return has_active_post_update

    @staticmethod
    def _get_remote_url(ds, name):
        """A little helper to get url from pushurl or from url if not defined"""
        # take pushurl if present, if not -- just a url
        url = ds.config.get('remote.%s.pushurl' % name) or \
            ds.config.get('remote.%s.url' % name)
        if not url:
            raise ValueError(
                "%s had neither pushurl or url defined for %s" % (ds, name)
            )
        return url

    @staticmethod
    def init_remote_repo(path, ssh, shared, dataset, description=None):
        cmd = "git -C {} init{}".format(
            sh_quote(path),
            " --shared='{}'".format(sh_quote(shared)) if shared else '')
        try:
            ssh(cmd)
        except CommandError as e:
            lgr.error("Initialization of remote git repository failed at %s."
                      "\nError: %s\nSkipping ..." % (path, exc_str(e)))
            return False

        if isinstance(dataset.repo, AnnexRepo):
            # init remote git annex repo (part fix of #463)
            try:
                ssh(
                    "git -C {} annex init {}".format(
                        sh_quote(path),
                        sh_quote(description)
                        if description else '')
                )
            except CommandError as e:
                lgr.error("Initialization of remote git annex repository failed at %s."
                          "\nError: %s\nSkipping ..." % (path, exc_str(e)))
                return False
        return True

    @staticmethod
    def create_postupdate_hook(path, ssh, dataset):
        # location of post-update hook file, logs folder on remote target
        hooks_remote_dir = opj(path, '.git', 'hooks')
        # make sure hooks directory exists (see #1251)
        ssh('mkdir -p {}'.format(sh_quote(hooks_remote_dir)))
        hook_remote_target = opj(hooks_remote_dir, 'post-update')

        # create json command for current dataset
        log_filename = 'datalad-publish-hook-$(date +%s).log' % TIMESTAMP_FMT
        hook_content = r'''#!/bin/bash

git update-server-info

#
# DataLad
#
# (Re)generate meta-data for DataLad Web UI and possibly init new submodules
dsdir="$(dirname $0)/../.."
logfile="$dsdir/{WEB_META_LOG}/{log_filename}"

if [ ! -e "$dsdir/.git" ]; then
  echo Assumption of being under .git has failed >&2
  exit 1
fi

mkdir -p "$dsdir/{WEB_META_LOG}"  # assure logs directory exists

( which datalad > /dev/null \
  && ( cd "$dsdir"; GIT_DIR="$PWD/.git" datalad ls -a --json file .; ) \
  || echo "E: no datalad found - skipping generation of indexes for web frontend"; \
) &> "$logfile"
'''.format(WEB_META_LOG=WEB_META_LOG, **locals())

        with make_tempfile(content=hook_content) as tempf:
            # create post_update hook script
            # upload hook to dataset
            ssh.put(tempf, hook_remote_target)
        # and make it executable
        ssh('chmod +x {}'.format(sh_quote(hook_remote_target)))

    @staticmethod
    def upload_web_interface(path, ssh, shared, ui):
        # path to web interface resources on local
        webui_local = opj(dirname(datalad.__file__), 'resources', 'website')
        # local html to dataset
        html_local = opj(webui_local, "index.html")

        # name and location of web-interface html on target
        html_targetname = {True: ui, False: "index.html"}[isinstance(ui, str)]
        html_target = opj(path, html_targetname)

        # upload ui html to target
        ssh.put(html_local, html_target)

        # upload assets to the dataset
        webresources_local = opj(webui_local, 'assets')
        webresources_remote = opj(path, WEB_HTML_DIR)
        ssh('mkdir -p {}'.format(sh_quote(webresources_remote)))
        ssh.put(webresources_local, webresources_remote, recursive=True)

        # minimize and upload js assets
        for js_file in glob(opj(webresources_local, 'js', '*.js')):
            with open(js_file) as asset:
                try:
                    from jsmin import jsmin
                    # jsmin = lambda x: x   # no minimization
                    minified = jsmin(asset.read())                      # minify asset
                except ImportError:
                    lgr.warning(
                        "Will not minify web interface javascript, no jsmin available")
                    minified = asset.read()                             # no minify available
                with make_tempfile(content=minified) as tempf:          # write minified to tempfile
                    js_name = js_file.split('/')[-1]
                    ssh.put(tempf, opj(webresources_remote, 'assets', 'js', js_name))  # and upload js

        # explicitly make web+metadata dir of dataset world-readable, if shared set to 'all'
        mode = None
        if shared in (True, 'true', 'all', 'world', 'everybody'):
            mode = 'a+rX'
        elif shared == 'group':
            mode = 'g+rX'
        elif str(shared).startswith('0'):
            mode = shared

        if mode:
            ssh('chmod {} -R {} {}'.format(
                mode,
                sh_quote(dirname(webresources_remote)),
                sh_quote(opj(path, 'index.html'))))
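The --target-dir documentation above describes a %RELNAME placeholder for flattening a dataset hierarchy on the server. A rough sketch of that substitution (not the project's actual helper, just an illustration of the documented behavior):

import os.path as op

def expand_target_dir(template, ds_path, refds_path):
    # the dataset's path relative to the base dataset, with path
    # separators turned into dashes (empty for the base dataset itself)
    relpath = op.relpath(ds_path, refds_path)
    relname = '' if relpath == op.curdir else relpath.replace(op.sep, '-')
    return template.replace('%RELNAME', relname)

print(expand_target_dir('/srv/repos/study-%RELNAME',
                        '/data/study/sub-01/anat', '/data/study'))
# -> /srv/repos/study-sub-01-anat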
Example #14
class Subdatasets(Interface):
    """Report subdatasets and their properties.

    The following properties are reported (if possible) for each matching
    subdataset record.

    "name"
        Name of the subdataset in the parent (often identical with the
        relative path in the parent dataset)

    "path"
        Absolute path to the subdataset

    "parentds"
        Absolute path to the parent dataset

    "revision"
        SHA1 of the subdataset commit recorded in the parent dataset

    "state"
        Condition of the subdataset: 'clean', 'modified', 'absent', 'conflict'
        as reported by `git submodule`

    "revision_descr"
        Output of `git describe` for the subdataset

    "gitmodule_url"
        URL of the subdataset recorded in the parent

    "gitmodule_<label>"
        Any additional configuration property on record.

    Performance note: Requesting `bottomup` reporting order, or a particular
    numerical `recursion_limit` implies an internal switch to an alternative
    query implementation for recursive query that is more flexible, but also
    notably slower (performs one call to Git per dataset versus a single call
    for all combined).

    """
    _params_ = dict(
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc="""specify the dataset to query.  If
            no dataset is given, an attempt is made to identify the dataset
            based on the input and/or the current working directory""",
            constraints=EnsureDataset() | EnsureNone()),
        fulfilled=Parameter(
            args=("--fulfilled",),
            doc="""if given, must be a boolean flag indicating whether
            to report either only locally present or absent datasets.
            By default subdatasets are reported regardless of their
            status""",
            constraints=EnsureBool() | EnsureNone()),
        recursive=recursion_flag,
        recursion_limit=recursion_limit,
        contains=Parameter(
            args=('--contains',),
            metavar='PATH',
            doc="""limit report to the subdatasets containing the
            given path. If the root path of a subdataset is given, the last
            reported dataset will be the subdataset itself.""",
            constraints=EnsureStr() | EnsureNone()),
        bottomup=Parameter(
            args=("--bottomup",),
            action="store_true",
            doc="""whether to report subdatasets in bottom-up order along
            each branch in the dataset tree, and not top-down."""),
        set_property=Parameter(
            args=('--set-property',),
            metavar='VALUE',
            nargs=2,
            action='append',
            doc="""Name and value of one or more subdataset properties to
            be set in the parent dataset's .gitmodules file. The value can be
            a Python format() template string wrapped in '<>' (e.g.
            '<{gitmodule_name}>').
            Supported keywords are any item reported in the result properties
            of this command, plus 'refds_relpath' and 'refds_relname':
            the relative path of a subdataset with respect to the base dataset
            of the command call, and, in the latter case, the same string with
            all directory separators replaced by dashes.[CMD:  This
            option can be given multiple times. CMD]""",
            constraints=EnsureStr() | EnsureNone()),
        delete_property=Parameter(
            args=('--delete-property',),
            metavar='NAME',
            action='append',
            doc="""Name of one or more subdataset properties to be removed
            from the parent dataset's .gitmodules file.[CMD:  This
            option can be given multiple times. CMD]""",
            constraints=EnsureStr() | EnsureNone()))

    @staticmethod
    @datasetmethod(name='subdatasets')
    @eval_results
    def __call__(
            dataset=None,
            fulfilled=None,
            recursive=False,
            recursion_limit=None,
            contains=None,
            bottomup=False,
            set_property=None,
            delete_property=None):
        dataset = require_dataset(
            dataset, check_installed=False, purpose='subdataset reporting/modification')
        refds_path = dataset.path

        # XXX this seems strange, but is tested to be the case -- I'd rather set
        # `check_installed` to true above and fail
        if not GitRepo.is_valid_repo(refds_path):
            return

        # return as quickly as possible
        if isinstance(recursion_limit, int) and (recursion_limit <= 0):
            return

        try:
            if not (bottomup or contains or set_property or delete_property or \
                    (recursive and recursion_limit is not None)):
                # FAST IMPLEMENTATION FOR THE STRAIGHTFORWARD CASE
                # as fast as possible (just a single call to Git)
                # need to track current parent
                stack = [refds_path]
                modinfo_cache = {}
                for sm in _parse_git_submodules(refds_path, recursive=recursive):
                    # unwind the parent stack until we find the right one
                    # this assumes that submodules come sorted
                    while not sm['path'].startswith(_with_sep(stack[-1])):
                        stack.pop()
                    parent = stack[-1]
                    if parent not in modinfo_cache:
                        # read the parent .gitmodules, if not done yet
                        modinfo_cache[parent] = _parse_gitmodules(parent)
                    # get URL info, etc.
                    sm.update(modinfo_cache[parent].get(sm['path'], {}))
                    subdsres = get_status_dict(
                        'subdataset',
                        status='ok',
                        type='dataset',
                        refds=refds_path,
                        logger=lgr)
                    subdsres.update(sm)
                    subdsres['parentds'] = parent
                    if (fulfilled is None or
                            GitRepo.is_valid_repo(sm['path']) == fulfilled):
                        yield subdsres
                    # for the next "parent" commit this subdataset to the stack
                    stack.append(sm['path'])
                # MUST RETURN: the rest of the function is doing another implementation
                return
        except InvalidGitRepositoryError as e:
            lgr.debug("fast subdataset query failed, trying slow robust one (%s)",
                      exc_str(e))

        # MORE ROBUST, FLEXIBLE, BUT SLOWER IMPLEMENTATION
        # slow but flexible (one Git call per dataset), but deals with subdatasets in
        # direct mode
        if contains:
            contains = resolve_path(contains, dataset)
        for r in _get_submodules(
                dataset.path, fulfilled, recursive, recursion_limit,
                contains, bottomup, set_property, delete_property,
                refds_path):
            # without the refds_path cannot be rendered/converted relative
            # in the eval_results decorator
            r['refds'] = refds_path
            yield r
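Because `fulfilled` is constrained with `EnsureBool() | EnsureNone()`, the reporting has three modes. A small usage sketch, assuming a dataset object `ds` as in the earlier snippets and that the 'relpaths' result transform is available:

# None (default): report all known subdatasets
all_subs = ds.subdatasets(fulfilled=None, result_xfm='relpaths')
# True: only subdatasets that are locally installed
installed = ds.subdatasets(fulfilled=True, result_xfm='relpaths')
# False: only subdatasets that are registered but absent
absent = ds.subdatasets(fulfilled=False, result_xfm='relpaths')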
Example #15
class AnnotatePaths(Interface):
    """Analyze and act upon input paths

    Given paths (or more generally location requests) are inspected and
    annotated with a number of properties. A list of recognized properties
    is provided below.

    || PYTHON >>Input `paths` for this command can either be un-annotated
    (raw) path strings, or already (partially) annotated paths. In the latter
    case, further annotation is limited to yet-unknown properties, and is
    potentially faster than initial annotation.<< PYTHON ||


    *Recognized path properties*

    {proplist}

    In the case of enabled modification detection the results may contain
    additional properties regarding the nature of the modification. See the
    documentation of the `diff` command for details.

    """
    _docs_ = dict(
        proplist='\n\n    '.join(
            '"{}"\n{}'.format(
                k,
                textwrap.fill(known_props[k],
                              initial_indent='        ',
                              subsequent_indent='        '))
            for k in sorted(known_props)))

    _params_ = dict(
        path=Parameter(
            args=("path",),
            metavar="PATH",
            doc="""path to be annotated""",
            nargs="*",
            constraints=EnsureStr() | EnsureNone()),
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc="""an optional reference/base dataset for the paths""",
            constraints=EnsureDataset() | EnsureNone()),
        recursive=recursion_flag,
        recursion_limit=recursion_limit,
        action=Parameter(
            args=("--action",),
            metavar="LABEL",
            doc="""an "action" property value to include in the
            path annotation""",
            constraints=EnsureStr() | EnsureNone()),
        unavailable_path_status=Parameter(
            args=("--unavailable-path-status",),
            metavar="LABEL",
            doc="""a "status" property value to include in the
            annotation for paths that are underneath a dataset, but
            do not exist on the filesystem""",
            constraints=EnsureStr() | EnsureNone()),
        unavailable_path_msg=Parameter(
            args=("--unavailable-path-msg",),
            metavar="message",
            doc="""a "message" property value to include in the
            annotation for paths that are underneath a dataset, but
            do not exist on the filesystem""",
            constraints=EnsureStr() | EnsureNone()),
        nondataset_path_status=Parameter(
            args=("--nondataset-path-status",),
            metavar="LABEL",
            doc="""a "status" property value to include in the
            annotation for paths that are not underneath any dataset""",
            constraints=EnsureStr() | EnsureNone()),
        force_parentds_discovery=Parameter(
            args=("--no-parentds-discovery",),
            dest='force_parentds_discovery',
            action='store_false',
            doc="""Flag to disable reports of parent dataset information for any
            path, in particular dataset root paths. Disabling saves on command
            run time, if this information is not needed."""),
        force_subds_discovery=Parameter(
            args=("--no-subds-discovery",),
            action='store_false',
            dest='force_subds_discovery',
            doc="""Flag to disable reporting type='dataset' for subdatasets, even
            when they are not installed, or their mount point directory doesn't
            exist. Disabling saves on command run time, if this information is
            not needed."""),
        force_untracked_discovery=Parameter(
            args=("--no-untracked-discovery",),
            action='store_false',
            dest='force_untracked_discovery',
            doc="""Flag to disable discovery of untracked changes.
                Disabling saves on command run time, if this information is
                not needed."""),
        force_no_revision_change_discovery=Parameter(
            args=("--revision-change-discovery",),
            action='store_false',
            dest='force_no_revision_change_discovery',
            doc="""Flag to disable discovery of changes which were not yet committed.
            Disabling saves on command run time, if this information is
            not needed."""),
        modified=Parameter(
            args=("--modified",),
            nargs='?',
            const=True,
            constraints=EnsureStr() | EnsureBool() | EnsureNone(),
            doc="""comparison reference specification for modification detection.
            This can be (mostly) anything that `git diff` understands (commit,
            treeish, tag, etc). See the documentation of `datalad diff --revision`
            for details. Unmodified paths will not be annotated. If a requested
            path was not modified but some content underneath it was, then the
            request is replaced by the modified paths and those are annotated
            instead. This option can be used [PY: with `True` as PY][CMD: without CMD]
            an argument to test against changes that have been made, but have not
            yet been staged for a commit."""))

    @staticmethod
    @datasetmethod(name='annotate_paths')
    @eval_results
    def __call__(
            path=None,
            dataset=None,
            recursive=False,
            recursion_limit=None,
            action=None,
            unavailable_path_status='',
            unavailable_path_msg=None,
            nondataset_path_status='error',
            force_parentds_discovery=True,
            force_subds_discovery=True,
            force_no_revision_change_discovery=True,
            force_untracked_discovery=True,
            modified=None):
        # upfront check for the fastest possible response
        if not path and dataset is None:
            # nothing given, try "here", but do not use `require_dataset`, as
            # it will determine the root dataset of `curdir` and further down
            # lead to path annotation of upstairs directories
            dataset = curdir

        if force_subds_discovery and not force_parentds_discovery:
            raise ValueError(
                'subdataset discovery requires parent dataset discovery')

        # CONCEPT: yield with no status to indicate further processing

        # everything in one big loop to be able to yield as fast as possible
        # without any precomputing for all paths
        refds_path = Interface.get_refds_path(dataset)
        if modified is not None and (refds_path is None or not GitRepo.is_valid_repo(refds_path)):
            raise ValueError(
                "modification detection only works with a base dataset (non-given or found)")

        # prep common result props
        res_kwargs = dict(
            action=action if action else 'annotate_path',
            refds=refds_path,
            logger=lgr)

        # handle the case of recursion into a single dataset without any
        # extra fancy processing first -- full recursion can be done
        # faster than manual recursion, hence we gain quite some speed
        # from these few lines of extra code
        if not modified and not path and refds_path:
            if not GitRepo.is_valid_repo(refds_path):
                yield get_status_dict(
                    # doesn't matter if the path is in another dataset
                    # it was given as reference dataset
                    status=nondataset_path_status,
                    message='given reference dataset is not a dataset',
                    path=refds_path,
                    **res_kwargs)
                return

            refds = Dataset(refds_path)
            path = []
            # yield the dataset itself
            r = get_status_dict(ds=refds, status='', **res_kwargs)
            yield r

            if recursive:
                # if we have nothing given, but need recursion, we need to feed
                # the dataset path itself
                for r in yield_recursive(
                        refds,
                        refds_path,
                        action,
                        recursion_limit):
                    r.update(res_kwargs)
                    if 'refds' in r and not r['refds']:
                        # avoid cruft
                        del r['refds']
                    yield r
            return

        # goal: structure in a way that makes most information on any path
        # available in a single pass, at the cheapest possible cost
        reported_paths = {}
        requested_paths = assure_list(path)

        if modified is not None:
            # modification detection would silently kill all nondataset paths
            # but we have to complain about them, hence doing it here
            if requested_paths and refds_path:
                for r in requested_paths:
                    p = r['path'] if isinstance(r, dict) else r
                    p = resolve_path(p, ds=refds_path)
                    if _with_sep(p).startswith(_with_sep(refds_path)):
                        # all good
                        continue
                    # not the refds
                    path_props = r if isinstance(r, dict) else {}
                    res = get_status_dict(
                        **dict(res_kwargs, **path_props))
                    res['status'] = nondataset_path_status
                    res['message'] = 'path not associated with reference dataset'
                    reported_paths[r] = res
                    yield res

            # preserve non-existing paths to be silently killed by modification
            # detection and append them to requested_paths again after detection.
            # TODO: This might be melted in with treatment of non dataset paths
            # above. Re-appending those paths seems to be better than yielding
            # directly to avoid code duplication, since both cases later on are
            # dealt with again.
            preserved_paths = []
            if requested_paths:
                preserved_paths = [
                    r for r in requested_paths
                    if not lexists(r['path'] if isinstance(r, dict) else r)]

            # replace the requested paths by those paths that were actually
            # modified underneath or at a requested location
            requested_paths = get_modified_subpaths(
                # either the request, or the base dataset, if there was no request
                requested_paths if requested_paths else [refds_path],
                refds=Dataset(refds_path),
                revision=modified,
                report_no_revision_change=force_no_revision_change_discovery,
                report_untracked='all' if force_untracked_discovery else 'no',
                recursion_limit=recursion_limit)

            from itertools import chain
            # re-append the preserved paths:
            requested_paths = chain(requested_paths, iter(preserved_paths))

        # do not loop over unique(), this could be a list of dicts
        # we avoid duplicates manually below via `reported_paths`
        for path in requested_paths:
            if not isinstance(path, dict):
                path = rawpath2ap(path, refds_path)
            # this is now an annotated path!
            path_props = path
            path = path['path']
            # we need to mark our territory, who knows where this has been
            path_props.update(res_kwargs)

            if path in reported_paths:
                # we already recorded this path in the output
                # this can happen, whenever `path` is a subdataset, that was
                # discovered via recursive processing of another path before
                continue
            # the path exists in some shape or form
            # TODO if we have path_props already we could skip this test
            if isdir(path):
                # keep any existing type info, previously a more expensive run
                # could have discovered an uninstalled 'dataset', and we don't
                # want it to be relabeled to a directory
                path_props['type'] = \
                    path_props.get(
                        'type',
                        'dataset' if GitRepo.is_valid_repo(path) else 'directory')
                # this could contain all types of additional content
                containing_dir = path
            else:
                if lexists(path):
                    path_props['type'] = 'file'
                else:
                    path_props['state'] = 'absent'
                # for everything else we are interested in the container
                containing_dir = dirname(path)
                if not containing_dir:
                    containing_dir = curdir

            dspath = parent = get_dataset_root(containing_dir)
            if dspath:
                if path_props.get('type', None) == 'dataset':
                    # for a dataset the root is not the parent, for anything else
                    # it is
                    parent = path_props.get('parentds', None)
                    oneupdir = normpath(opj(containing_dir, pardir))
                    if parent is None and (force_parentds_discovery or (
                            refds_path and _with_sep(oneupdir).startswith(
                                _with_sep(refds_path)))):
                        # either forced, or only if we have a reference dataset, and
                        # only if we stay within this refds when searching for the
                        # parent
                        parent = get_dataset_root(normpath(opj(containing_dir, pardir)))
                        # NOTE the `and refds_path` is critical, as it will determine
                        # whether a top-level dataset that was discovered gets the
                        # parent property or not, it won't get it without a common
                        # base dataset, and that is how we always rolled
                    if parent and refds_path:
                        path_props['parentds'] = parent
                        # don't check whether this is actually a true subdataset of the
                        # parent, done further down
                else:
                    # set parent, but prefer existing property
                    path_props['parentds'] = path_props.get('parentds', dspath)

            # test for `dspath` not `parent`, we only need to know whether there is
            # ANY dataset, not which one is the true parent, logic below relies on
            # the fact that we end here, if there is no dataset at all
            if not dspath:
                # not in any dataset
                res = get_status_dict(
                    **dict(res_kwargs, **path_props))
                res['status'] = nondataset_path_status
                res['message'] = 'path not associated with any dataset'
                reported_paths[path] = res
                yield res
                continue

            # check that we only got SUBdatasets
            if refds_path and not _with_sep(dspath).startswith(_with_sep(refds_path)):
                res = get_status_dict(**dict(res_kwargs, **path_props))
                res['status'] = nondataset_path_status
                res['message'] = \
                    ('path not part of the reference dataset at %s', refds_path)
                reported_paths[path] = res
                yield res
                continue

            if path_props.get('type', None) == 'file':
                # nothing else we can learn about this
                res = get_status_dict(**dict(res_kwargs, **path_props))
                if 'status' not in res:
                    res['status'] = ''
                reported_paths[path] = res
                yield res
                continue

            containing_ds = None
            path_type = path_props.get('type', None)
            if parent and force_subds_discovery and (
                    (path_type == 'dataset' and 'registered_subds' not in path_props) or
                    path_type == 'directory' or
                    not lexists(path)):
                # if the path doesn't exist, is labeled a directory, or is a
                # dataset without this info -> record whether this is a known
                # subdataset of its parent
                containing_ds = Dataset(parent)
                subdss = containing_ds.subdatasets(
                    fulfilled=None, recursive=False,
                    result_xfm=None, result_filter=None, return_type='list')
                if path in [s['path'] for s in subdss]:
                    if path_type == 'directory' or not lexists(path):
                        # first record that it isn't here, if just a dir or not here at all
                        path_props['state'] = 'absent'
                    # this must be a directory, and it is not installed
                    path_props['type'] = 'dataset'
                    path_props['registered_subds'] = True

            if not lexists(path) or \
                    (path_props.get('type', None) == 'dataset' and
                     path_props.get('state', None) == 'absent'):
                # not there (yet)
                message = unavailable_path_msg if unavailable_path_msg else None
                if message and '%s' in message:
                    message = (message, path)
                path_props['message'] = message
                res = get_status_dict(**dict(res_kwargs, **path_props))
                # assign given status, but only if the props don't indicate a status
                # already
                res['status'] = path_props.get(
                    'status', unavailable_path_status)
                reported_paths[path] = res
                yield res
                continue

            # we know everything we can, report
            res = get_status_dict(**dict(res_kwargs, **path_props))
            if 'status' not in res:
                res['status'] = ''
            reported_paths[path] = res
            yield res

            rec_paths = []
            if recursive:
                # here we need to consider the special case that `path` is
                # a dataset itself, if a recursion_limit is given (e.g.
                # `remove` will do that by default), we need to recurse
                # from the dataset itself, and not its parent to get things
                # right -- this will also avoid needless discovery of
                # unrelated subdatasets
                if path_props.get('type', None) == 'dataset':
                    containing_ds = Dataset(path)
                else:
                    # regular parent, we might have a dataset already
                    containing_ds = Dataset(parent) if containing_ds is None else containing_ds
                for r in yield_recursive(containing_ds, path, action, recursion_limit):
                    # capture reported paths
                    r.update(res_kwargs)
                    if 'refds' in r and not r['refds']:
                        # avoid cruft
                        del r['refds']
                    reported_paths[r['path']] = r
                    if modified is not None:
                        # we cannot yield right away, maybe it wasn't modified
                        rec_paths.append(r)
                    else:
                        yield r
            if modified is not None and rec_paths:
                # replace the recursively discovered paths by those paths that
                # were actually modified underneath or at a requested location
                for r in get_modified_subpaths(
                        rec_paths,
                        refds=Dataset(refds_path),
                        revision=modified,
                        report_no_revision_change=force_no_revision_change_discovery,
                        report_untracked='all' if force_untracked_discovery else 'no',
                        recursion_limit=recursion_limit):
                    res = get_status_dict(**dict(r, **res_kwargs))
                    reported_paths[res['path']] = res
                    yield res
        return
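# --- usage sketch (not part of the original example) ------------------------
# The generator above yields one annotated-path status dict per input path; a
# caller would typically branch on 'status' and 'type'. The handler names
# below are illustrative assumptions only.
#
#   for res in annotated_paths:
#       if res['status'] in ('error', 'impossible'):
#           handle_failure(res)
#       elif res.get('type') == 'dataset':
#           handle_dataset(res['path'])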
示例#16
0
class Install(Interface):
    """Install a dataset component or entire datasets.

    This command can make arbitrary content available in a dataset. This
    includes the fulfillment of existing dataset handles or file handles
    in a dataset, as well as adding such handles for content available
    locally or remotely.
    """

    _params_ = dict(
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc="""specify the dataset to perform the install operation on. If
            no dataset is given, an attempt is made to identify the dataset
            based on the current working directory and/or the `path` given""",
            constraints=EnsureDataset() | EnsureNone()),
        path=Parameter(
            args=("path",),
            doc="""path/name of the installation target. If no `source` is
            provided, and no `dataset` is given or detected, this is
            interpreted as the source URL of a dataset and a destination
            path will be derived from the URL similar to 'git clone'.""",
            nargs="*",
            constraints=EnsureStr() | EnsureNone()),
        source=Parameter(
            args=("-s", "--source",),
            doc="url or local path of the installation source",
            constraints=EnsureStr() | EnsureNone()),
        # TODO this probably needs --with-data and --recursive as a plain boolean
        recursive=Parameter(
            args=("-r", "--recursive"),
            constraints=EnsureChoice('handles', 'data') | EnsureBool(),
            doc="""If set, all content is installed recursively, including
            content of any subdatasets."""),
        add_data_to_git=Parameter(
            args=("--add-data-to-git",),
            constraints=EnsureBool(),
            doc="""Flag whether to add data directly to Git, instead of
            tracking data identity only. Usually this is not desired,
            as it inflates dataset sizes and impacts flexibility of data
            transport."""))

    @staticmethod
    @datasetmethod(name='install')
    def __call__(dataset=None, path=None, source=None, recursive=False,
                 add_data_to_git=False):
        lgr.debug("Installation attempt started")
        # shortcut
        ds = dataset

        if ds is not None and not isinstance(ds, Dataset):
            ds = Dataset(ds)

        if isinstance(path, list):
            if not len(path):
                # normalize value to expected state when nothing was provided
                path = None
            elif len(path) == 1:
                # we can simply continue with the function as called with a
                # single argument
                path = path[0]
            else:
                lgr.debug("Installation of multiple targets was requested: {0}".format(path))
                return [Install.__call__(
                        dataset=ds,
                        path=p,
                        source=source,
                        recursive=recursive) for p in path]

        # resolve the target location against the provided dataset
        if path is not None:
            # make sure it is not a URL, `resolve_path` cannot handle that
            if is_url(path):
                try:
                    path = get_local_path_from_url(path)
                    path = resolve_path(path, ds)
                except ValueError:
                    # URL doesn't point to a local something
                    pass
            else:
                path = resolve_path(path, ds)

        # any `path` argument that points to something local is now resolved
        # and is no longer a URL

        # if we have no dataset given, figure out which one we need to operate
        # on, based on the resolved target location (that is now guaranteed to
        # be specified, but only if path isn't a URL anymore) -> special case,
        # handled below
        if ds is None and path is not None and not is_url(path):
            # try to find a dataset at or above the installation target
            dspath = GitRepo.get_toppath(abspath(path))
            if dspath is None:
                # no top-level dataset found, use path as such
                dspath = path
            ds = Dataset(dspath)

        if ds is None and source is None and path is not None:
            # no dataset, no source
            # this could be a shortcut install call, where the first
            # arg identifies the source
            if is_url(path) or os.path.exists(path):
                # we have an actual URL -> this should be the source
                # OR
                # it is not a URL, but it exists locally
                lgr.debug(
                    "Single argument given to install and no dataset found. "
                    "Assuming the argument identifies a source location.")
                source = path
                path = None

        lgr.debug("Resolved installation target: {0}".format(path))

        if ds is None and path is None and source is not None:
            # we got nothing but a source. do something similar to git clone
            # and derive the path from the source and continue
            lgr.debug(
                "Neither dataset not target installation path provided. "
                "Assuming installation of a remote dataset. "
                "Deriving destination path from given source {0}".format(
                    source))
            ds = Dataset(_installationpath_from_url(source))

        if not path and ds is None:
            # no dataset, no target location, nothing to do
            raise InsufficientArgumentsError(
                "insufficient information for installation (needs at "
                "least a dataset or an installation path")

        assert(ds is not None)

        lgr.debug("Resolved target dataset for installation: {0}".format(ds))

        vcs = ds.repo
        if vcs is None:
            # TODO check that a "ds.path" actually points to a TOPDIR
            # should be the case already, but maybe nevertheless check
            try:
                with swallow_logs():
                    vcs = Install._get_new_vcs(ds, source, vcs)
            except GitCommandError:
                lgr.debug("Cannot retrieve from URL: {0}".format(source))
                # maybe source URL was missing a '/.git'
                if source and not source.rstrip('/').endswith('/.git'):
                    source = '{0}/.git'.format(source.rstrip('/'))
                    lgr.debug("Attempt to retrieve from URL: {0}".format(source))
                    vcs = Install._get_new_vcs(ds, source, vcs)
                else:
                    lgr.debug("Unable to establish repository instance at: {0}".format(ds.path))
                    raise

        assert(ds.repo)  # is automagically re-evaluated in the .repo property

        runner = Runner()

        if path is None or path == ds.path:
            # if the goal was to install this dataset, we are done,
            # except for 'recursive'.

            # TODO: For now 'recursive' means just submodules.
            # See --with-data vs. --recursive and figure it out
            if recursive:
                for sm in ds.repo.get_submodules():
                    _install_subds_from_flexible_source(
                        ds, sm.path, sm.url, recursive=recursive)
            return ds

        # at this point this dataset is "installed", now we can test whether to
        # install something into the dataset

        # needed by the logic below
        assert(isabs(path))

        # express the destination path relative to the root of this dataset
        relativepath = relpath(path, start=ds.path)
        if path.startswith(pardir):
            raise ValueError("installation path outside dataset")

        lgr.debug(
            "Resolved installation target relative to dataset {0}: {1}".format(
                ds, relativepath))

        # this dataset must already know everything necessary
        ###################################################
        # FLOW GUIDE
        #
        # at this point we know nothing about the
        # installation target
        ###################################################
        try:
            # it is simplest to let annex tell us what we are dealing with
            lgr.debug("Trying to fetch file %s using annex", relativepath)
            if not isinstance(vcs, AnnexRepo):
                assert(isinstance(vcs, GitRepo))
                # FLOW GUIDE
                # this is not an annex repo, but we raise exceptions
                # to be able to treat them alike in the special case handling
                # below
                if not exists(path):
                    raise IOError("path doesn't exist yet, might need special handling")
                elif relativepath in vcs.get_indexed_files():
                    # relativepath is in git
                    raise FileInGitError("We need to handle it as known to git")
                else:
                    raise FileNotInAnnexError("We don't have yet annex repo here")
            if vcs.get_file_key(relativepath):
                # FLOW GUIDE EXIT POINT
                # this is an annex'ed file -> get it
                # TODO implement `copy --from` using `source`
                # TODO fail if `source` is something strange
                vcs.annex_get(relativepath)
                # return the absolute path to the installed file
                return path

        except FileInGitError:
            ###################################################
            # FLOW GUIDE
            #
            # `path` is either
            # - a  file already checked into Git
            # - known submodule
            ###################################################
            lgr.log(5, "FileInGitError logic")
            if source is not None:
                raise FileInGitError("File %s is already in git. Specifying source (%s) makes no sense"
                                     % (path, source))
            # file is checked into git directly -> nothing to do
            # OR this is a submodule of this dataset
            submodule = [sm for sm in ds.repo.get_submodules()
                         if sm.path == relativepath]
            if not len(submodule):
                # FLOW GUIDE EXIT POINT
                # this is a file in Git and no submodule, just return its path
                lgr.debug("Don't act, data already present in Git")
                return path
            elif len(submodule) > 1:
                raise RuntimeError(
                    "more than one submodule registered at the same path?")
            submodule = submodule[0]

            # FLOW GUIDE EXIT POINT
            # we are dealing with a known submodule (i.e. `source`
            # doesn't matter) -> check it out
            lgr.debug("Install subdataset at: {0}".format(submodule.path))
            subds = _install_subds_from_flexible_source(
                ds, submodule.path, submodule.url, recursive=recursive)
            return subds

        except FileNotInAnnexError:
            ###################################################
            # FLOW GUIDE
            #
            # `path` is either
            # - content of a subdataset
            # - an untracked file in this dataset
            # - an entire untracked/unknown existing subdataset
            ###################################################
            lgr.log(5, "FileNotInAnnexError logic")
            subds = get_containing_subdataset(ds, relativepath)
            if ds.path != subds.path:
                # FLOW GUIDE EXIT POINT
                # target path belongs to a known subdataset, hand
                # installation over to it
                return subds.install(
                    path=relpath(path, start=subds.path),
                    source=source,
                    recursive=recursive,
                    add_data_to_git=add_data_to_git)

            # FLOW GUIDE
            # this must be an untracked/existing something, so either
            # - a file
            # - a directory
            # - an entire repository
            if exists(opj(path, '.git')):
                # FLOW GUIDE EXIT POINT
                # this is an existing repo and must be in-place turned into
                # a submodule of this dataset
                return _install_subds_inplace(
                    ds, path, relativepath, source, runner)

            # FLOW GUIDE EXIT POINT
            # - untracked file or directory in this dataset
            if isdir(path) and not recursive:
                # this is a directory, but --recursive was not requested
                raise ValueError(
                    "installation of a directory requires the `recursive` flag")

            # few sanity checks
            if source and abspath(source) != path:
                raise ValueError(
                    "installation target already exists, but `source` points to "
                    "another location (target: '{0}', source: '{0}'".format(
                        source, path))

            if not add_data_to_git and not isinstance(vcs, AnnexRepo):
                raise RuntimeError(
                    "Trying to install file(s) into a dataset "
                    "with a plain Git repository. First initialize annex, or "
                    "provide override flag.")

            # switch `add` procedure between Git and Git-annex according to flag
            if add_data_to_git:
                vcs.git_add(relativepath)
                added_files = resolve_path(relativepath, ds)
            else:
                # do a blunt `annex add`
                added_files = vcs.annex_add(relativepath)
                # return just the paths of the installed components
                if isinstance(added_files, list):
                    added_files = [resolve_path(i['file'], ds) for i in added_files]
                else:
                    added_files = resolve_path(added_files['file'], ds)
            if added_files:
                return added_files
            else:
                return None

        except IOError:
            ###################################################
            # FLOW GUIDE
            #
            # more complicated special cases -- `path` is either
            # - a file/subdataset in a not yet initialized but known
            #   submodule
            # - an entire untracked/unknown existing subdataset
            # - non-existing content that should be installed from `source`
            ###################################################
            lgr.log(5, "IOError logic")
            # we can end up here in two cases ATM
            if (exists(path) or islink(path)) or source is None:
                # FLOW GUIDE
                # - target exists but this dataset's VCS rejects it,
                #   so it should be part of a subdataset
                # or
                # - target doesn't exist, but no source is given, so
                #   it could be a handle that is actually contained in
                #   a not yet installed subdataset
                subds = get_containing_subdataset(ds, relativepath)
                if ds.path != subds.path:
                    # FLOW GUIDE
                    # target path belongs to a subdataset, hand installation
                    # over to it
                    if not subds.is_installed():
                        # FLOW GUIDE
                        # we are dealing with a target in a not yet
                        # available but known subdataset -> install it first
                        ds.install(subds.path, recursive=recursive)
                    return subds.install(
                        path=relpath(path, start=subds.path),
                        source=source,
                        recursive=recursive,
                        add_data_to_git=add_data_to_git)

                # FLOW GUIDE EXIT POINT
                raise InsufficientArgumentsError(
                    "insufficient information for installation: the "
                    "installation target {0} doesn't exists, isn't a "
                    "known handle of dataset {1}, and no `source` "
                    "information was provided.".format(path, ds))

            if not source:
                # FLOW GUIDE EXIT POINT
                raise InsufficientArgumentsError(
                    "insufficient information for installation: the "
                    "installation target {0} doesn't exists, isn't a "
                    "known handle of dataset {1}, and no `source` "
                    "information was provided.".format(path, ds))

            source_path = expandpath(source)
            if exists(source_path):
                # FLOW GUIDE EXIT POINT
                # this could be
                # - local file
                # - local directory
                # - repository outside the dataset
                # we only want to support the last case of locally cloning
                # a repo -- fail otherwise
                if exists(opj(source_path, '.git')):
                    return _install_subds_from_flexible_source(
                        ds, relativepath, source_path, recursive)

                raise ValueError(
                    "installing individual local files or directories is not "
                    "supported, copy/move them into the dataset first")

            # FLOW GUIDE
            # `source` is non-local, it could be:
            #   - repository
            #   - file
            # we have no further evidence, hence we need to try
            try:
                # FLOW GUIDE EXIT POINT
                # assume it is a dataset
                return _install_subds_from_flexible_source(
                    ds, relativepath, source, recursive)
            except CommandError:
                # FLOW GUIDE EXIT POINT
                # apparently not a repo, assume it is a file url
                vcs.annex_addurl_to_file(relativepath, source)
                return path

    @staticmethod
    def _get_new_vcs(ds, source, vcs):
        if source is None:
            # always come with annex when created from scratch
            lgr.info("Creating a new annex repo at %s", ds.path)
            vcs = AnnexRepo(ds.path, url=source, create=True)
        else:
            # when obtained from remote, try with plain Git
            lgr.info("Creating a new git repo at %s", ds.path)
            vcs = GitRepo(ds.path, url=source, create=True)
            if knows_annex(ds.path):
                # init annex when traces of a remote annex can be detected
                lgr.info("Initializing annex repo at %s", ds.path)
                vcs = AnnexRepo(ds.path, init=True)
            else:
                lgr.debug("New repository clone has no traces of an annex")
        return vcs

    @staticmethod
    def result_renderer_cmdline(res):
        from datalad.ui import ui
        if res is None:
            res = []
        if not isinstance(res, list):
            res = [res]
        if not len(res):
            ui.message("Nothing was installed")
            return
        items = '\n'.join(map(str, res))
        msg = "{n} installed {obj} available at\n{items}".format(
            obj='items are' if len(res) > 1 else 'item is',
            n=len(res),
            items=items)
        ui.message(msg)
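# --- usage sketch (not part of the original example) ------------------------
# Hedged illustration of how this Interface is typically reached through the
# public API; assumes the `datalad` package is importable, and the source URL
# and target path are purely illustrative.
#
#   import datalad.api as dl
#   ds = dl.install(source='https://example.com/some-dataset.git', path='myds')
#   print(ds.path)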
示例#17
0
def dlplugin(dataset, pattern, ref_dir='.', makedirs='no'):
    # could be extended to accept actual largefile expressions
    """Configure a dataset to never put some content into the dataset's annex

    This can be useful in mixed datasets that also contain textual data, such
    as source code, which can be efficiently and more conveniently managed
    directly in Git.

    Patterns generally look like this::

      code/*

    which would match all files in the code directory. In order to match all
    files under ``code/``, including all its subdirectories, use such a
    pattern::

      code/**

    Note that the plugin works incrementally, hence any existing configuration
    (e.g. from a previous plugin run) is amended, not replaced.

    Parameters
    ----------
    dataset : Dataset
      dataset to configure
    pattern : list
      list of path patterns. Any content whose path is matching any pattern
      will not be annexed when added to a dataset, but instead will be
      tracked directly in Git. Path patterns have to be relative to the
      directory given by the `ref_dir` option. By default, patterns should
      be relative to the root of the dataset.
    ref_dir : str, optional
      Relative path (within the dataset) to the directory that is to be
      configured. All patterns are interpreted relative to this path,
      and configuration is written to a ``.gitattributes`` file in this
      directory.
    makedirs : bool, optional
      If set, any missing directories will be created in order to be able
      to place a file into ``ref_dir``. Default: False.
    """
    from os.path import join as opj
    from os.path import isabs
    from os.path import exists
    from os import makedirs as makedirsfx
    from datalad.distribution.dataset import require_dataset
    from datalad.support.annexrepo import AnnexRepo
    from datalad.support.constraints import EnsureBool
    from datalad.utils import assure_list

    makedirs = EnsureBool()(makedirs)
    pattern = assure_list(pattern)
    ds = require_dataset(dataset,
                         check_installed=True,
                         purpose='no_annex configuration')

    res_kwargs = dict(
        path=ds.path,
        type='dataset',
        action='no_annex',
    )

    # all the ways we refused to cooperate
    if not isinstance(ds.repo, AnnexRepo):
        yield dict(res_kwargs,
                   status='notneeded',
                   message='dataset has no annex')
        return
    if any(isabs(p) for p in pattern):
        yield dict(
            res_kwargs,
            status='error',
            message=
            ('path pattern for `no_annex` configuration must be relative paths: %s',
             pattern))
        return
    if isabs(ref_dir):
        yield dict(
            res_kwargs,
            status='error',
            message=
            ('`ref_dir` for `no_annex` configuration must be a relative path: %s',
             ref_dir))
        return

    gitattr_dir = opj(ds.path, ref_dir)
    if not exists(gitattr_dir):
        if makedirs:
            makedirsfx(gitattr_dir)
        else:
            yield dict(
                res_kwargs,
                status='error',
                message=
                'target directory for `no_annex` does not exist (consider makedirs=True)'
            )
            return

    gitattr_file = opj(gitattr_dir, '.gitattributes')
    with open(gitattr_file, 'a') as fp:
        for p in pattern:
            # one pattern per line; without the newline, subsequent patterns
            # would be appended to the same .gitattributes entry
            fp.write('{} annex.largefiles=nothing\n'.format(p))
        yield dict(res_kwargs, status='ok')

    for r in dataset.add(gitattr_file,
                         to_git=True,
                         message="[DATALAD] exclude paths from annex'ing",
                         result_filter=None,
                         result_xfm=None):
        yield r
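# --- usage sketch (not part of the original example) ------------------------
# The plugin is a generator of result dicts; a caller would consume it to
# apply the configuration. The dataset object `ds` and the pattern below are
# illustrative assumptions.
#
#   for res in dlplugin(dataset=ds, pattern=['code/**'], makedirs='yes'):
#       print(res['status'], res.get('message', ''))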
示例#18
0
class CreatePublicationTargetSSHWebserver(Interface):
    """Create a dataset on a web server via SSH, that may then serve as
    a target for the publish command, if added as a sibling."""

    _params_ = dict(
        # TODO: Somehow the replacement of '_' and '-' is buggy on
        # positional arguments
        # TODO: Figure out, whether (and when) to use `sshurl` as push url
        dataset=Parameter(
            args=(
                "--dataset",
                "-d",
            ),
            doc="""specify the dataset to create the publication target for. If
                no dataset is given, an attempt is made to identify the dataset
                based on the current working directory""",
            constraints=EnsureDataset() | EnsureNone()),
        sshurl=Parameter(
            args=("sshurl", ),
            doc="""SSH URL to use to log into the server and create the target
                dataset(s). This also serves as a default for the URL to be
                used to add the target as a sibling to `dataset` and as a
                default for the directory on the server, where to create the
                dataset.""",
            constraints=EnsureStr()),
        target=Parameter(
            args=('target', ),
            doc="""Sibling name to create for this publication target.
                If `recursive` is set, the same name will be used to address
                the subdatasets' siblings. Note, that this is just a
                convenience function, calling add_sibling after the actual
                creation of the target dataset(s). Whenever the creation fails,
                no siblings are added.""",
            constraints=EnsureStr() | EnsureNone(),
            nargs="?"),
        target_dir=Parameter(
            args=('--target-dir', ),
            doc="""Path to the directory on the server where to create the
                dataset. By default it's wherever `sshurl` points to. If a
                relative path is provided, it's interpreted as relative to the
                user's home directory on the server.
                Especially when using `recursive`, it's possible to provide a
                template for building the URLs of all (sub)datasets to be
                created by using placeholders. If you don't provide a template
                the local hierarchy with respect to `dataset` will be
                replicated on the server rooting in `target_dir`.\n
                List of currently available placeholders:\n
                %%NAME\tthe name of the datasets, where slashes are
                replaced by dashes.\n""",
            constraints=EnsureStr() | EnsureNone()),
        target_url=Parameter(
            args=('--target-url', ),
            doc="""The URL of the dataset sibling named by `target`. Defaults
                to `sshurl`. This URL has to be accessible to anyone, who is
                supposed to have access to the dataset later on.\n
                Especially when using `recursive`, it's possible to provide a
                template for building the URLs of all (sub)datasets to be
                created by using placeholders.\n
                List of currently available placeholders:\n
                %%NAME\tthe name of the datasets, where slashes are
                replaced by dashes.\n""",
            nargs="?",
            constraints=EnsureStr() | EnsureNone()),
        target_pushurl=Parameter(
            args=('--target-pushurl', ),
            doc="""Defaults to `sshurl`. In case the `target_url` cannot be
                used to publish to the dataset sibling, this option specifies a
                URL to be used for the actual publication operation.""",
            constraints=EnsureStr() | EnsureNone()),
        recursive=Parameter(
            args=("--recursive", "-r"),
            action="store_true",
            doc="""Recursively create the publication target for all
                subdatasets of `dataset`""",
        ),
        existing=Parameter(
            args=("--existing", ),
            constraints=EnsureChoice('skip', 'replace', 'raise'),
            doc="""Action to perform, if target directory exists already.
                Dataset is skipped if `skip`. `replace` forces a (re-)init of
                git and a (re-)configuration of sibling `target`
                (i.e. its URL(s)) in case it already exists. `raise` just
                raises an Exception""",
        ),
        shared=Parameter(
            args=("--shared", ),
            doc="""passed to git-init. TODO: Figure out how to communicate what
                this is about""",
            constraints=EnsureStr() | EnsureBool()),
    )

    @staticmethod
    @datasetmethod(name='create_publication_target_sshwebserver')
    def __call__(sshurl,
                 target=None,
                 target_dir=None,
                 target_url=None,
                 target_pushurl=None,
                 dataset=None,
                 recursive=False,
                 existing='raise',
                 shared=False):

        if sshurl is None:
            raise ValueError("""insufficient information for target creation
            (needs at least a dataset and a SSH URL).""")

        if target is None and (target_url is not None
                               or target_pushurl is not None):
            raise ValueError("""insufficient information for adding the target
            as a sibling (needs at least a name)""")

        # shortcut
        ds = dataset

        if ds is not None and not isinstance(ds, Dataset):
            ds = Dataset(ds)
        if ds is None:
            # try to find a dataset at or above CWD
            dspath = GitRepo.get_toppath(abspath(getpwd()))
            if dspath is None:
                raise ValueError("""No dataset found
                                 at or above {0}.""".format(getpwd()))
            ds = Dataset(dspath)
            lgr.debug("Resolved dataset for target creation: {0}".format(ds))
        assert (ds is not None and sshurl is not None)

        if not ds.is_installed():
            raise ValueError(
                """Dataset {0} is not installed yet.""".format(ds))
        assert (ds.repo is not None)

        # determine target parameters:
        parsed_target = urlparse(sshurl)
        host_name = parsed_target.netloc

        # TODO: Sufficient to fail on this condition?
        if not parsed_target.netloc:
            raise ValueError("Malformed URL: {0}".format(sshurl))

        if target_dir is None:
            if parsed_target.path:
                target_dir = parsed_target.path
            else:
                target_dir = '.'

        # TODO: centralize and generalize template symbol handling
        replicate_local_structure = "%NAME" not in target_dir

        # collect datasets to use:
        datasets = dict()
        datasets[basename(ds.path)] = ds
        if recursive:
            for subds in ds.get_dataset_handles(recursive=True):
                sub_path = opj(ds.path, subds)
                # TODO: when enhancing Dataset/*Repo classes and therefore
                # adapt to moved code, make proper distinction between name and
                # path of a submodule, which are technically different. This
                # probably will become important on windows as well as whenever
                # we want to allow for moved worktrees.
                datasets[basename(ds.path) + '/' + subds] = \
                    Dataset(sub_path)

        # setup SSH Connection:
        # TODO: Make the entire setup a helper to use it when pushing via
        # publish?

        # - build control master:
        from datalad.utils import assure_dir
        not_supported_on_windows("TODO")
        from os import geteuid  # POSIX-specific import
        var_run_user_datalad = "/var/run/user/%s/datalad" % geteuid()
        assure_dir(var_run_user_datalad)
        control_path = "%s/%s" % (var_run_user_datalad, host_name)
        control_path += ":%s" % parsed_target.port if parsed_target.port else ""

        # - start control master:
        cmd = "ssh -o ControlMaster=yes -o \"ControlPath=%s\" " \
              "-o ControlPersist=yes %s exit" % (control_path, host_name)
        lgr.debug("Try starting control master by calling:\n%s" % cmd)
        import subprocess
        proc = subprocess.Popen(cmd, shell=True)
        proc.communicate(input="\n")  # why is this necessary?

        runner = Runner()
        ssh_cmd = ["ssh", "-S", control_path, host_name]

        lgr.info("Creating target datasets ...")
        for current_dataset in datasets:
            if not replicate_local_structure:
                path = target_dir.replace("%NAME",
                                          current_dataset.replace("/", "-"))
            else:
                # TODO: opj depends on local platform, not the remote one.
                # check how to deal with it. Does windows ssh server accept
                # posix paths? vice versa? Should planned SSH class provide
                # tools for this issue?
                path = normpath(
                    opj(target_dir,
                        relpath(datasets[current_dataset].path,
                                start=ds.path)))

            if path != '.':
                # check if target exists
                # TODO: Is this condition valid for != '.' only?
                path_exists = True
                cmd = ssh_cmd + ["ls", path]
                try:
                    out, err = runner.run(cmd,
                                          expect_fail=True,
                                          expect_stderr=True)
                except CommandError as e:
                    if "No such file or directory" in e.stderr and \
                                    path in e.stderr:
                        path_exists = False
                    else:
                        raise  # It's an unexpected failure here

                if path_exists:
                    if existing == 'raise':
                        raise RuntimeError(
                            "Target directory %s already exists." % path)
                    elif existing == 'skip':
                        continue
                    elif existing == 'replace':
                        pass
                    else:
                        raise ValueError(
                            "Do not know how to hand existing=%s" %
                            repr(existing))

                cmd = ssh_cmd + ["mkdir", "-p", path]
                try:
                    runner.run(cmd)
                except CommandError as e:
                    lgr.error("Remotely creating target directory failed at "
                              "%s.\nError: %s" % (path, str(e)))
                    continue

            # init git repo
            cmd = ssh_cmd + ["git", "-C", path, "init"]
            if shared:
                cmd.append("--shared=%s" % shared)
            try:
                runner.run(cmd)
            except CommandError as e:
                lgr.error("Remotely initializing git repository failed at %s."
                          "\nError: %s\nSkipping ..." % (path, str(e)))
                continue

            # check git version on remote end:
            cmd = ssh_cmd + ["git", "version"]
            try:
                out, err = runner.run(cmd)
                # strip the "git version " prefix (lstrip() strips a set of
                # characters, not a prefix)
                git_version = out.replace("git version", "", 1).strip()
                lgr.debug("Detected git version on server: %s" % git_version)
                # compare version components numerically; a plain string
                # comparison would misjudge e.g. "2.10" as older than "2.4"
                if [int(c) for c in git_version.split(".")[:2]] < [2, 4]:
                    lgr.error("Git version >= 2.4 needed to configure remote."
                              " Version detected on server: %s\nSkipping ..." %
                              git_version)
                    continue

            except CommandError as e:
                lgr.warning("Failed to determine git version on remote.\n"
                            "Error: {0}\nTrying to configure anyway "
                            "...".format(e.message))

            # allow for pushing to checked out branch
            cmd = ssh_cmd + [
                "git", "-C", path, "config", "receive.denyCurrentBranch",
                "updateInstead"
            ]
            try:
                runner.run(cmd)
            except CommandError as e:
                lgr.warning("git config failed at remote location %s.\n"
                            "You will not be able to push to checked out "
                            "branch." % path)

            # enable post-update hook:
            cmd = ssh_cmd + [
                "mv",
                opj(path, ".git/hooks/post-update.sample"),
                opj(path, ".git/hooks/post-update")
            ]
            try:
                runner.run(cmd)
            except CommandError as e:
                lgr.error("Failed to enable post update hook.\n"
                          "Error: %s" % e.message)

            # initially update server info "manually":
            cmd = ssh_cmd + ["git", "-C", path, "update-server-info"]
            try:
                runner.run(cmd)
            except CommandError as e:
                lgr.error("Failed to update server info.\n"
                          "Error: %s" % e.message)

        # stop controlmaster (close ssh connection):
        cmd = ["ssh", "-O", "stop", "-S", control_path, host_name]
        out, err = runner.run(cmd, expect_stderr=True)

        if target:
            # add the sibling(s):
            if target_url is None:
                target_url = sshurl
            if target_pushurl is None:
                target_pushurl = sshurl
            result_adding = AddSibling()(dataset=ds,
                                         name=target,
                                         url=target_url,
                                         pushurl=target_pushurl,
                                         recursive=recursive,
                                         force=existing in {'replace'})
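# --- usage sketch (not part of the original example) ------------------------
# Illustrative invocation; the SSH URL, target name, and dataset path are
# assumptions, and SSH access to the host is required.
#
#   CreatePublicationTargetSSHWebserver.__call__(
#       sshurl='ssh://user@example.com/var/www/myds',
#       target='webserver',
#       dataset='/path/to/local/dataset',
#       existing='skip')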
示例#19
0
class CreateSiblingRia(Interface):
    """Creates a sibling to a dataset in a RIA store

    Communication with a dataset in a RIA store is implemented via two
    siblings. A regular Git remote (repository sibling) and a git-annex
    special remote for data transfer (storage sibling) -- with the former
    having a publication dependency on the latter. By default, the name of the
    storage sibling is derived from the repository sibling's name by appending
    "-storage".

    The store's base path is expected to not exist, be an empty directory,
    or a valid RIA store.

    RIA store layout
    ~~~~~~~~~~~~~~~~

    A RIA store is a directory tree with a dedicated subdirectory for each
    dataset in the store. The subdirectory name is constructed from the
    DataLad dataset ID, e.g. '124/68afe-59ec-11ea-93d7-f0d5bf7b5561', where
    the first three characters of the ID are used for an intermediate
    subdirectory in order to mitigate file system limitations for stores
    containing a large number of datasets.

    Each dataset subdirectory contains a standard bare Git repository for
    the dataset.

    In addition, a subdirectory 'annex' holds a standard Git-annex object
    store. However, instead of using the 'dirhashlower' naming scheme for
    the object directories, like Git-annex would do, a 'dirhashmixed'
    layout is used -- the same as for non-bare Git repositories or regular
    DataLad datasets.

    Optionally, there can be a further subdirectory 'archives' with
    (compressed) 7z archives of annex objects. The storage remote is able to
    pull annex objects from these archives, if it cannot find them in the regular
    annex object store. This feature can be useful for storing large
    collections of rarely changing data on systems that limit the number of
    files that can be stored.

    Each dataset directory also contains a 'ria-layout-version' file that
    identifies the data organization (as, for example, described above).

    Lastly, there is a global 'ria-layout-version' file at the store's
    base path that identifies where dataset subdirectories themselves are
    located. At present, this file must contain a single line stating the
    version (currently "1"). This line MUST end with a newline character.

    It is possible to define an alias for an individual dataset in a store by
    placing a symlink to the dataset location into an 'alias/' directory
    in the root of the store. This enables dataset access via URLs of format:
    'ria+<protocol>://<storelocation>#~<aliasname>'.

    Error logging
    ~~~~~~~~~~~~~

    To enable error logging at the remote end, append a pipe symbol and an "l"
    to the version number in ria-layout-version (like so '1|l\\n').

    Error logging will create files in an "error_log" directory whenever the
    git-annex special remote (storage sibling) raises an exception, storing the
    Python traceback of it. The logfiles are named according to the scheme
    '<dataset id>.<annex uuid of the remote>.log' showing "who" ran into this
    issue with which dataset. Because logging can potentially leak personal
    data (like local file paths for example), it can be disabled client-side
    by setting the configuration variable
    "annex.ora-remote.<storage-sibling-name>.ignore-remote-config".
    """

    # TODO: description?
    _params_ = dict(
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc="""specify the dataset to process.  If
            no dataset is given, an attempt is made to identify the dataset
            based on the current working directory""",
            constraints=EnsureDataset() | EnsureNone()),
        url=Parameter(
            args=("url",),
            metavar="ria+<ssh|file>://<host>[/path]",
            doc="""URL identifying the target RIA store and access protocol.
            """,
            constraints=EnsureStr() | EnsureNone()),
        name=Parameter(
            args=('-s', '--name',),
            metavar='NAME',
            doc="""Name of the sibling.
            With `recursive`, the same name will be used to label all
            the subdatasets' siblings.""",
            constraints=EnsureStr() | EnsureNone(),
            required=True),
        storage_name=Parameter(
            args=("--storage-name",),
            metavar="NAME",
            doc="""Name of the storage sibling (git-annex special remote).
            Must not be identical to the sibling name. If not specified,
            defaults to the sibling name plus '-storage' suffix. If only
            a storage sibling is created, this setting is ignored, and
            the primary sibling name is used.""",
            constraints=EnsureStr() | EnsureNone()),
        post_update_hook=Parameter(
            args=("--post-update-hook",),
            doc="""Enable git's default post-update-hook for the created
            sibling.""",
            action="store_true"),
        shared=Parameter(
            args=("--shared",),
            metavar='{false|true|umask|group|all|world|everybody|0xxx}',
            doc="""If given, configures the permissions in the
            RIA store for multi-users access.
            Possible values for this option are identical to those of
            `git init --shared` and are described in its documentation.""",
            constraints=EnsureStr() | EnsureBool() | EnsureNone()),
        group=Parameter(
            args=("--group",),
            metavar="GROUP",
            doc="""Filesystem group for the repository. Specifying the group is
            crucial when [CMD: --shared=group CMD][PY: shared="group" PY]""",
            constraints=EnsureStr() | EnsureNone()),
        storage_sibling=Parameter(
            args=("--storage-sibling",),
            dest='storage_sibling',
            metavar='MODE',
            constraints=EnsureChoice('only') | EnsureBool() | EnsureNone(),
            doc="""By default, an ORA storage sibling and a Git repository
            sibling are created ([CMD: on CMD][PY: True|'on' PY]).
            Alternatively, creation of the storage sibling can be disabled
            ([CMD: off CMD][PY: False|'off' PY]), or a storage sibling
            created only and no Git sibling
            ([CMD: only CMD][PY: 'only' PY]). In the latter mode, no Git
            installation is required on the target host."""),
        existing=Parameter(
            args=("--existing",),
            constraints=EnsureChoice(
                'skip', 'error', 'reconfigure') | EnsureNone(),
            metavar='MODE',
            doc="""Action to perform, if a (storage) sibling is already
            configured under the given name and/or a target already exists.
            In this case, a dataset can be skipped ('skip'), an existing target
            repository be forcefully re-initialized, and the sibling
            (re-)configured ('reconfigure'), or the command be instructed to
            fail ('error').""", ),
        recursive=recursion_flag,
        recursion_limit=recursion_limit,
        trust_level=Parameter(
            args=("--trust-level",),
            metavar="TRUST-LEVEL",
            constraints=EnsureChoice(
                'trust', 'semitrust', 'untrust') | EnsureNone(),
            doc="""specify a trust level for the storage sibling. If not
            specified, the default git-annex trust level is used.""",),
        disable_storage__=Parameter(
            args=("--no-storage-sibling",),
            dest='disable_storage__',
            doc="""This option is deprecated. Use '--storage-sibling off'
            instead.""",
            action="store_false"),
    )

    @staticmethod
    @datasetmethod(name='create_sibling_ria')
    @eval_results
    def __call__(url,
                 name,
                 dataset=None,
                 storage_name=None,
                 post_update_hook=False,
                 shared=None,
                 group=None,
                 storage_sibling=True,
                 existing='error',
                 trust_level=None,
                 recursive=False,
                 recursion_limit=None,
                 disable_storage__=None,
                 ):
        if disable_storage__ is not None:
            import warnings
            warnings.warn("datalad-create-sibling-ria --no-storage-sibling "
                          "is deprecated, use --storage-sibling off instead.",
                          DeprecationWarning)
            # recode to new setup
            disable_storage__ = None
            storage_sibling = False

        if storage_sibling == 'only' and storage_name:
            lgr.warning(
                "Sibling name will be used for storage sibling in "
                "storage-sibling-only mode, but a storage sibling name "
                "was provided"
            )

        ds = require_dataset(
            dataset, check_installed=True, purpose='create sibling RIA')
        res_kwargs = dict(
            ds=ds,
            action="create-sibling-ria",
            logger=lgr,
        )

        # parse target URL
        try:
            ssh_host, base_path, rewritten_url = verify_ria_url(url, ds.config)
        except ValueError as e:
            yield get_status_dict(
                status='error',
                message=str(e),
                **res_kwargs
            )
            return

        if ds.repo.get_hexsha() is None or ds.id is None:
            raise RuntimeError(
                "Repository at {} is not a DataLad dataset, "
                "run 'datalad create [--force]' first.".format(ds.path))

        if not storage_sibling and storage_name:
            lgr.warning(
                "Storage sibling setup disabled, but a storage sibling name "
                "was provided"
            )

        if storage_sibling and not storage_name:
            storage_name = "{}-storage".format(name)

        if storage_sibling and name == storage_name:
            # leads to unresolvable, circular dependency with publish-depends
            raise ValueError("sibling names must not be equal")

        if not isinstance(url, str):
            raise TypeError("url is not a string, but %s" % type(url))

        # Query existing siblings upfront in order to fail early on
        # existing=='error', since a misconfiguration (particularly of special
        # remotes) that only fails later on in a subdataset can be quite
        # painful.
        # TODO: messages - this is "create-sibling". Don't confuse existence of
        #       local remotes with existence of the actual remote sibling
        #       in wording
        if existing == 'error':
            # in recursive mode this check could take a substantial amount of
            # time: employ a progress bar (or rather a counter, because we don't
            # know the total in advance)
            pbar_id = 'check-siblings-{}'.format(id(ds))
            log_progress(
                lgr.info, pbar_id,
                'Start checking pre-existing sibling configuration %s', ds,
                label='Query siblings',
                unit=' Siblings',
            )
            # even if we have to fail, let's report all conflicting siblings
            # in subdatasets
            failed = False
            for r in ds.siblings(result_renderer=None,
                                 recursive=recursive,
                                 recursion_limit=recursion_limit):
                log_progress(
                    lgr.info, pbar_id,
                    'Discovered sibling %s in dataset at %s',
                    r['name'], r['path'],
                    update=1,
                    increment=True)
                if not r['type'] == 'sibling' or r['status'] != 'ok':
                    # this is an internal status query that has no consequence
                    # for the outside world. Be silent unless something useful
                    # can be said
                    #yield r
                    continue
                if r['name'] == name:
                    res = get_status_dict(
                        status='error',
                        message="a sibling '{}' is already configured in "
                        "dataset {}".format(name, r['path']),
                        **res_kwargs,
                    )
                    failed = True
                    yield res
                    continue
                if storage_name and r['name'] == storage_name:
                    res = get_status_dict(
                        status='error',
                        message="a sibling '{}' is already configured in "
                        "dataset {}".format(storage_name, r['path']),
                        **res_kwargs,
                    )
                    failed = True
                    yield res
                    continue
            log_progress(
                lgr.info, pbar_id,
                'Finished checking pre-existing sibling configuration %s', ds,
            )
            if failed:
                return

        # TODO: - URL parsing + store creation needs to be RF'ed based on
        #         command abstractions
        #       - more generally consider store creation a dedicated command or
        #         option
        # Note: URL parsing is done twice ATM (for top-level ds). This can't be
        # reduced to single instance, since rewriting url based on config could
        # be different for subdatasets.

        create_store(SSHRemoteIO(ssh_host) if ssh_host else LocalIO(),
                     Path(base_path),
                     '1')

        yield from _create_sibling_ria(
            ds,
            url,
            name,
            storage_sibling,
            storage_name,
            existing,
            shared,
            group,
            post_update_hook,
            trust_level,
            res_kwargs)

        if recursive:
            # Note: subdatasets can be treated independently, so go full
            # recursion when querying for them and _no_recursion with the
            # actual call. Theoretically this can be parallelized.

            for subds in ds.subdatasets(fulfilled=True,
                                        recursive=True,
                                        recursion_limit=recursion_limit,
                                        result_xfm='datasets'):
                yield from _create_sibling_ria(
                    subds,
                    url,
                    name,
                    storage_sibling,
                    storage_name,
                    existing,
                    shared,
                    group,
                    post_update_hook,
                    trust_level,
                    res_kwargs)
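A minimal usage sketch (not part of the example above), assuming DataLad with RIA support is installed; the dataset path, store location, and sibling name are illustrative:

from datalad.api import Dataset

ds = Dataset('/tmp/myds')                # an existing DataLad dataset
ds.create_sibling_ria(
    'ria+file:///data/ria-store',        # RIA store URL; ria+ssh://host/path also works
    name='ria',                          # name of the Git sibling
    storage_sibling=True,                # also configure the 'ria-storage' special remote
    existing='error')                    # fail if siblings with these names already exist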
Example #20
def _dump_extracted_metadata(agginto_ds, aggfrom_ds, db, to_save, force_extraction, agg_base_path):
    """Dump metadata from a dataset into object in the metadata store of another

    Info on the metadata objects is placed into a DB dict under the
    absolute path of the dataset whose metadata was aggregated.

    Parameters
    ----------
    agginto_ds : Dataset
    aggfrom_ds : Dataset
    db : dict
    to_save : list
      List of paths to be saved in the receiving dataset; amended in place.
    force_extraction : bool
      Re-extract metadata even if a dump for the current dataset state is
      already available.
    agg_base_path : str
      Base path of the aggregate metadata object store.
    """
    subds_relpaths = aggfrom_ds.subdatasets(result_xfm='relpaths', return_type='list')
    # figure out a "state" of the dataset wrt its metadata that we are describing
    # 1. the latest commit that changed any file for which we could have native metadata
    refcommit = _get_latest_refcommit(aggfrom_ds, subds_relpaths)
    objid = refcommit if refcommit else ''
    # 2. our own dataset-global metadata and the dataset config
    for tfile in (
            op.join(aggfrom_ds.path, DATASET_METADATA_FILE),
            op.join(aggfrom_ds.path, DATASET_CONFIG_FILE)):
        if op.exists(tfile):
            objid += md5(open(tfile, 'r').read().encode()).hexdigest()
    # 3. potential annex-based metadata
    # XXX TODO shouldn't this be the annex extractor?
    if isinstance(aggfrom_ds.repo, AnnexRepo) and \
            aggfrom_ds.config.obtain(
                'datalad.metadata.aggregate-content-datalad-core',
                default=True,
                valtype=EnsureBool()):
        # if there is no annex metadata, this will come out empty,
        # hence hash would be same as for a plain GitRepo
        # and no, we cannot use the shasum of the annex branch,
        # because this will change even when no metadata has changed
        timestamps, _ = aggfrom_ds.repo.call_annex_oneline([
            'metadata',
            '.',
            '-g', 'lastchanged'])
        objid += timestamps.strip()

    if not objid:
        lgr.debug('%s has no metadata-relevant content', aggfrom_ds)
    else:
        lgr.debug(
            'Dump metadata of %s into %s',
            aggfrom_ds, agginto_ds)

    # check if we already have in store what we are about to create
    old_agginfo = db.get(aggfrom_ds.path, {})

    agginfo = {}
    # dataset global
    if aggfrom_ds.id:
        agginfo['id'] = aggfrom_ds.id
    agginfo['refcommit'] = refcommit
    # put in DB
    db[aggfrom_ds.path] = agginfo

    if not objid:
        # this is no error, there is simply no metadata whatsoever
        return False

    # shorten to MD5sum
    objid = md5(objid.encode()).hexdigest()

    # assemble info on the metadata extraction and storage
    #               label  type      targetds    storage method
    metasources = {'ds': {'type': 'dataset', 'targetds': agginto_ds, 'dumper': json_py.dump}}
    # do not store content metadata if either the source or the target dataset
    # do not want it
    # TODO this AND was an OR before (wrong), misses a test
    if aggfrom_ds.config.obtain(
            'datalad.metadata.store-aggregate-content',
            default=True,
            valtype=EnsureBool()) and \
            agginto_ds.config.obtain(
                'datalad.metadata.store-aggregate-content',
                default=True,
                valtype=EnsureBool()):
            metasources['cn'] = {
                'type': 'content',
                'targetds': agginto_ds,
                'dumper': json_py.dump2xzstream}

    # check if we have the extracted metadata for this state already
    # either in the source or in the destination dataset
    # The situation is trickier!  Extracted metadata could change for the same
    # state (commit etc), e.g. if extractors changed.
    # The "correct" thing would be either
    # - to inspect git history whether there were changes
    #   within aggfrom_ds since agginto_ds got the metadata committed OR
    # - check by content - if file is under git - compute checksum,
    #   if under annex -- take checksum from the key without asking for the
    #   content
    metafound = {}
    uptodatemeta = []  # record which meta was not only found but also matches in content
    # TODO: current fixes might break logic for when fromds is not installed
    #       when I guess we just need to skip it?
    if not force_extraction:
        for s, sprop in metasources.items():
            objloc = op.join(agg_base_path,
                             _get_obj_location(objid, s, sprop['dumper']))
            smetafound = [
                # important to test for lexists() as we do not need to
                # or want to `get()` metadata files for this test.
                # Info on identity is NOT sufficient - later compare content if
                # multiple found
                objloc if op.lexists(op.join(d.path, objloc)) else None
                # Order of dss matters later
                for d in (aggfrom_ds, agginto_ds)
            ]
            if all(smetafound):
                # both have it
                metafound[s] = smetafound
                # but are they the same?
                try:
                    if _the_same_across_datasets(objloc, aggfrom_ds, agginto_ds):
                        uptodatemeta.append(s)
                except RuntimeError as exc:
                    # TODO: dedicated test - when meta content changes
                    lgr.debug("For now will just do re-extraction since caught %s",
                              CapturedException(exc))
            # source one has it, so we might be able to copy it
            # TODO: dedicated test - when it is sufficient to copy we do not re-extract

    if len(metafound) != len(metasources):
        # found some (either ds or cn) metadata missing entirely in both
        # from and into datasets
        lgr.debug(
            "Incomplete or absent metadata while aggregating %s <- %s: %s",
            agginto_ds, aggfrom_ds, metafound
        )
        # no metadata found -> extract
        # this places metadata dump files into the configured
        # target dataset and lists them in `to_save`, as well
        # as updates the `db` record for `aggfrom_ds`
        return _extract_metadata(
            agginto_ds,
            aggfrom_ds,
            db,
            to_save,
            objid,
            metasources,
            refcommit,
            subds_relpaths,
            agg_base_path)

    # we did not actually run an extraction, so we need to
    # assemble an aggregation record from the existing pieces
    # that we found
    # simple case: the target dataset has all the records already and they are up to date:
    if len(uptodatemeta) == len(metasources):
        lgr.debug('Sticking with up-to-date metadata for %s', aggfrom_ds)
        # no change, use old record from the target dataset
        db[aggfrom_ds.path] = old_agginfo
        # no error
        return False
    else:
        lgr.debug('Reusing previously extracted metadata for %s', aggfrom_ds)
        # we need to move the metadata dump(s) into the target dataset
        objrelpaths = {
            label: next(filter(bool, smetafound))
            for label, smetafound in metafound.items()
        }
        # make sure all the to-be-moved metadata records are present
        # locally
        aggfrom_ds.get(
            path=[op.join(aggfrom_ds.path, p)
                  for p in objrelpaths.values()],
            result_renderer='disabled')

        # actually copy dump files
        for objrelpath in objrelpaths.values():
            objpath = op.join(agginto_ds.path, objrelpath)
            objdir = op.dirname(objpath)
            if not op.exists(objdir):
                makedirs(objdir)
            if op.lexists(objpath):
                os.unlink(objpath)  # remove previous version first
                # was a wild thought as a workaround for 
                # http://git-annex.branchable.com/bugs/cannot_commit___34__annex_add__34__ed_modified_file_which_switched_its_largefile_status_to_be_committed_to_git_now/#comment-bf70dd0071de1bfdae9fd4f736fd1ec1
                # agginto_ds.repo.remove(objpath)
            # XXX TODO once we have a command that can copy/move files
            # from one dataset to another including file availability
            # info, this should be used here
            shutil.copyfile(
                op.join(aggfrom_ds.path, objrelpath),
                objpath)
            # mark for saving
            to_save.append(dict(
                path=objpath,
                parentds=agginto_ds.path,
                type='file'))

        # lastly get 'self' aggregation record from source dataset and
        # use in target dataset
        db[aggfrom_ds.path] = load_ds_aggregate_db(aggfrom_ds, abspath=True)[aggfrom_ds.path]
        return False
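A small self-contained illustration (not part of the example above) of the state-fingerprint idea used here: a reference commit and the hashes of dataset-level metadata/config files are concatenated and shortened to a single MD5 "objid" that names the metadata dump objects. Standard library only; the file paths are hypothetical.

import os.path as op
from hashlib import md5

def state_fingerprint(refcommit, tracked_files):
    # start from the reference commit (may be None if nothing relevant is tracked)
    objid = refcommit or ''
    for tfile in tracked_files:
        if op.exists(tfile):
            with open(tfile, 'rb') as f:
                objid += md5(f.read()).hexdigest()
    # shorten the concatenated state description to a single MD5 sum
    return md5(objid.encode()).hexdigest() if objid else None

# state_fingerprint('d4e5f6...', ['/tmp/ds/.datalad/metadata/dataset.json'])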
Example #21
def _extract_metadata(agginto_ds, aggfrom_ds, db, merge_native, to_save):
    """Dump metadata from a dataset into object in the metadata store of another

    Info on the metadata objects is placed into a DB dict under the
    absolute path of the dataset whose metadata was aggregated.

    Parameters
    ----------
    agginto_ds : Dataset
    aggfrom_ds : Dataset
    db : dict
    merge_native : str
      Merge mode.
    to_save : list
      List of paths to be saved later on; amended in place.
    """
    subds_relpaths = aggfrom_ds.subdatasets(result_xfm='relpaths',
                                            return_type='list')
    # figure out a "state" of the dataset wrt its metadata that we are describing
    # 1. the latest commit that changed any file for which we could have native metadata
    refcommit = _get_latest_refcommit(aggfrom_ds, subds_relpaths)
    objid = refcommit if refcommit else ''
    # 2. our own dataset-global metadata
    dsmetafile = opj(aggfrom_ds.path, '.datalad', 'metadata', 'dataset.json')
    if exists(dsmetafile):
        objid += md5(open(dsmetafile, 'r').read().encode()).hexdigest()
    # 3. potential annex-based metadata
    if isinstance(aggfrom_ds.repo, AnnexRepo) and \
            aggfrom_ds.config.obtain(
                'datalad.metadata.aggregate-content-datalad-core',
                default=True,
                valtype=EnsureBool()):
        # if there is no annex metadata, this will come out empty,
        # hence hash would be same as for a plain GitRepo
        # and no, we cannot use the shasum of the annex branch,
        # because this will change even when no metadata has changed
        timestamps, _ = aggfrom_ds.repo._run_annex_command(
            'metadata', '.', '-g', 'lastchanged')
        objid += timestamps.strip()

    if not objid:
        lgr.debug('%s has no metadata-relevant content', aggfrom_ds)
    else:
        lgr.debug('Dump metadata of %s (merge mode: %s) into %s', aggfrom_ds,
                  merge_native, agginto_ds)

    agginfo = {}
    # dataset global
    if aggfrom_ds.id:
        agginfo['id'] = aggfrom_ds.id
    agginfo['refcommit'] = refcommit
    # put in DB
    db[aggfrom_ds.path] = agginfo

    if not objid:
        dsmeta = contentmeta = None
        # this is no error, there is simply no metadata whatsoever
        return False

    # if there is any chance for metadata
    # obtain metadata for dataset and content
    relevant_paths = sorted(
        _get_metadatarelevant_paths(aggfrom_ds, subds_relpaths))
    nativetypes = get_metadata_type(aggfrom_ds)
    dsmeta, contentmeta, errored = _get_metadata(
        aggfrom_ds,
        # core must come first
        ['datalad_core'] + assure_list(nativetypes),
        merge_native,
        # None indicates to honor a dataset's per-parser configuration and to be
        # on by default
        global_meta=None,
        content_meta=None,
        paths=relevant_paths)

    # shorten to MD5sum
    objid = md5(objid.encode()).hexdigest()

    metasources = [('ds', 'dataset', dsmeta, aggfrom_ds, json_py.dump)]

    # do not store content metadata if either the source or the target dataset
    # do not want it
    if aggfrom_ds.config.obtain(
            'datalad.metadata.store-aggregate-content',
            default=True,
            valtype=EnsureBool()) and \
            agginto_ds.config.obtain(
                'datalad.metadata.store-aggregate-content',
                default=True,
                valtype=EnsureBool()):
        metasources.append((
            'cn',
            'content',
            # sort by path key to get deterministic dump content
            (dict(contentmeta[k], path=k) for k in sorted(contentmeta)),
            aggfrom_ds,
            json_py.dump2xzstream))

    # for both types of metadata
    for label, mtype, meta, dest, store in metasources:
        if not meta:
            continue
        # only write to disk if there is something
        objrelpath = _get_obj_location(objid, label)
        if store is json_py.dump2xzstream:
            objrelpath += '.xz'
        # place metadata object into the source dataset
        objpath = opj(dest.path, dirname(agginfo_relpath), objrelpath)

        # write obj files
        if exists(objpath):
            dest.unlock(objpath)
        # TODO actually dump a compressed file when annexing is possible
        # to speed up on-demand access
        store(meta, objpath)
        # stage for dataset.save()
        to_save.append(dict(path=objpath, type='file'))

        # important to use abspath here, needs to be rewritten relative to
        # all receiving datasets
        agginfo['{}_info'.format(mtype)] = objpath

    return errored
Example #22
File: run.py Project: mprati/datalad
class Run(Interface):
    """Run an arbitrary shell command and record its impact on a dataset.

    It is recommended to craft the command such that it can run in the root
    directory of the dataset that the command will be recorded in. However,
    as long as the command is executed somewhere underneath the dataset root,
    the exact location will be recorded relative to the dataset root.

    If the executed command did not alter the dataset in any way, no record of
    the command execution is made.

    If the given command errors, a `CommandError` exception with the same exit
    code will be raised, and no modifications will be saved.

    *Command format*

    || REFLOW >>
    A few placeholders are supported in the command via Python format
    specification. "{pwd}" will be replaced with the full path of the current
    working directory. "{dspath}" will be replaced with the full path of the
    dataset that run is invoked on. "{inputs}" and "{outputs}" represent the
    values specified by [CMD: --input and --output CMD][PY: `inputs` and
    `outputs` PY]. If multiple values are specified, the values will be joined
    by a space. The order of the values will match that order from the command
    line, with any globs expanded in alphabetical order (like bash). Individual
    values can be accessed with an integer index (e.g., "{inputs[0]}").
    << REFLOW ||

    To escape a brace character, double it (i.e., "{{" or "}}").
    """
    _params_ = dict(
        cmd=Parameter(
            args=("cmd",),
            nargs=REMAINDER,
            metavar='COMMAND',
            doc="command for execution"),
        dataset=Parameter(
            args=("-d", "--dataset"),
            doc="""specify the dataset to record the command results in.
            An attempt is made to identify the dataset based on the current
            working directory. If a dataset is given, the command will be
            executed in the root directory of this dataset.""",
            constraints=EnsureDataset() | EnsureNone()),
        inputs=Parameter(
            args=("--input",),
            dest="inputs",
            metavar=("PATH"),
            action='append',
            doc="""A dependency for the run. Before running the command, the
            content of this file will be retrieved. A value of "." means "run
            :command:`datalad get .`". The value can also be a glob. [CMD: This
            option can be given more than once. CMD]"""),
        outputs=Parameter(
            args=("--output",),
            dest="outputs",
            metavar=("PATH"),
            action='append',
            doc="""Prepare this file to be an output file of the command. A
            value of "." means "run :command:`datalad unlock .`" (and will fail
            if some content isn't present). For any other value, if the content
            of this file is present, unlock the file. Otherwise, remove it. The
            value can also be a glob. [CMD: This option can be given more than
            once. CMD]"""),
        expand=Parameter(
            args=("--expand",),
            metavar=("WHICH"),
            doc="""Expand globs when storing inputs and/or outputs in the
            commit message.""",
            constraints=EnsureNone() | EnsureChoice("inputs", "outputs", "both")),
        message=save_message_opt,
        sidecar=Parameter(
            args=('--sidecar',),
            metavar="yes|no",
            doc="""By default, the configuration variable
            'datalad.run.record-sidecar' determines whether a record with
            information on a command's execution is placed into a separate
            record file instead of the commit message (default: off). This
            option can be used to override the configured behavior on a
            case-by-case basis. Sidecar files are placed into the dataset's
            '.datalad/runinfo' directory (customizable via the
            'datalad.run.record-directory' configuration variable).""",
            constraints=EnsureNone() | EnsureBool()),
        rerun=Parameter(
            args=('--rerun',),
            action='store_true',
            doc="""re-run the command recorded in the last saved change (if any).
            Note: This option is deprecated since version 0.9.2 and
            will be removed in a later release. Use `datalad rerun`
            instead."""),
    )

    @staticmethod
    @datasetmethod(name='run')
    @eval_results
    def __call__(
            cmd=None,
            dataset=None,
            inputs=None,
            outputs=None,
            expand=None,
            message=None,
            sidecar=None,
            rerun=False):
        if rerun:
            if cmd:
                lgr.warning("Ignoring provided command in --rerun mode")
            lgr.warning("The --rerun option is deprecated since version 0.9.2. "
                        "Use `datalad rerun` instead.")
            from datalad.interface.rerun import Rerun
            for r in Rerun.__call__(dataset=dataset, message=message):
                yield r
        else:
            if cmd:
                for r in run_command(cmd, dataset=dataset,
                                     inputs=inputs, outputs=outputs,
                                     expand=expand,
                                     message=message,
                                     sidecar=sidecar):
                    yield r
            else:
                lgr.warning("No command given")
Example #23
def _get_metadata(ds, types, global_meta=None, content_meta=None, paths=None):
    """Make a direct query of a dataset to extract its metadata.

    Parameters
    ----------
    ds : Dataset
    types : list
    """
    errored = False
    dsmeta = MetadataDict()
    # each item in here will be a MetadataDict, but not the whole thing
    contentmeta = {}

    if global_meta is not None and content_meta is not None and \
            not global_meta and not content_meta:
        # both are false and not just none
        return dsmeta, contentmeta, errored

    context = {
        '@vocab':
        'http://docs.datalad.org/schema_v{}.json'.format(vocabulary_version)
    }

    fullpathlist = paths
    if paths and isinstance(ds.repo, AnnexRepo):
        # Ugly? Jep: #2055
        content_info = zip(paths, ds.repo.file_has_content(paths),
                           ds.repo.is_under_annex(paths))
        paths = [p for p, c, a in content_info if not a or c]
        nocontent = len(fullpathlist) - len(paths)
        if nocontent:
            # TODO better fail, or support incremental and label this file as not present
            lgr.warn('{} files have no content present, '
                     'some extractors will not operate on {}'.format(
                         nocontent, 'them' if nocontent > 10 else
                         [p for p, c, a in content_info if not c and a]))

    # pull out potential metadata field blacklist config settings
    blacklist = [
        re.compile(bl) for bl in assure_list(
            ds.config.obtain('datalad.metadata.aggregate-ignore-fields',
                             default=[]))
    ]
    # enforce size limits
    max_fieldsize = ds.config.obtain('datalad.metadata.maxfieldsize')
    # keep local, who knows what some extractors might pull in
    from pkg_resources import iter_entry_points  # delayed heavy import
    extractors = {
        ep.name: ep
        for ep in iter_entry_points('datalad.metadata.extractors')
    }

    log_progress(
        lgr.info,
        'metadataextractors',
        'Start metadata extraction from %s',
        ds,
        total=len(types),
        label='Metadata extraction',
        unit=' extractors',
    )
    for mtype in types:
        mtype_key = mtype
        log_progress(lgr.info,
                     'metadataextractors',
                     'Engage %s metadata extractor',
                     mtype_key,
                     update=1,
                     increment=True)
        if mtype_key not in extractors:
            # we said that we want to fail, rather than just moan about less metadata
            log_progress(
                lgr.error,
                'metadataextractors',
                'Failed %s metadata extraction from %s',
                mtype_key,
                ds,
            )
            raise ValueError(
                'Enabled metadata extractor %s is not available '
                'in this installation' % mtype_key)
        try:
            extractor_cls = extractors[mtype_key].load()
            extractor = extractor_cls(
                ds,
                paths=paths if extractor_cls.NEEDS_CONTENT else fullpathlist)
        except Exception as e:
            log_progress(
                lgr.error,
                'metadataextractors',
                'Failed %s metadata extraction from %s',
                mtype_key,
                ds,
            )
            raise ValueError(
                "Failed to load metadata extractor for '%s', "
                "broken dataset configuration (%s)?: %s"
                % (mtype, ds, exc_str(e)))
        try:
            dsmeta_t, contentmeta_t = extractor.get_metadata(
                dataset=global_meta
                if global_meta is not None else ds.config.obtain(
                    'datalad.metadata.aggregate-dataset-{}'.format(
                        mtype.replace('_', '-')),
                    default=True,
                    valtype=EnsureBool()),
                content=content_meta
                if content_meta is not None else ds.config.obtain(
                    'datalad.metadata.aggregate-content-{}'.format(
                        mtype.replace('_', '-')),
                    default=True,
                    valtype=EnsureBool()))
        except Exception as e:
            lgr.error('Failed to get dataset metadata ({}): {}'.format(
                mtype, exc_str(e)))
            if cfg.get('datalad.runtime.raiseonerror'):
                log_progress(
                    lgr.error,
                    'metadataextractors',
                    'Failed %s metadata extraction from %s',
                    mtype_key,
                    ds,
                )
                raise
            errored = True
            # if we don't get global metadata we do not want content metadata
            continue

        if dsmeta_t:
            if _ok_metadata(dsmeta_t, mtype, ds, None):
                dsmeta_t = _filter_metadata_fields(dsmeta_t,
                                                   maxsize=max_fieldsize,
                                                   blacklist=blacklist)
                dsmeta[mtype_key] = dsmeta_t
            else:
                errored = True

        unique_cm = {}
        extractor_unique_exclude = getattr(extractor_cls, "_unique_exclude",
                                           set())
        # TODO: ATM neuroimaging extractors all provide their own internal
        #  log_progress but if they are all generators, we could provide generic
        #  handling of the progress here.  Note also that the log message
        #  actually seems to be ignored and not used, only the label ;-)
        # log_progress(
        #     lgr.debug,
        #     'metadataextractors_loc',
        #     'Metadata extraction per location for %s', mtype,
        #     # contentmeta_t is a generator... so no count is known
        #     # total=len(contentmeta_t or []),
        #     label='Metadata extraction per location',
        #     unit=' locations',
        # )
        for loc, meta in contentmeta_t or {}:
            lgr.log(5, "Analyzing metadata for %s", loc)
            # log_progress(
            #     lgr.debug,
            #     'metadataextractors_loc',
            #     'ignoredatm',
            #     label=loc,
            #     update=1,
            #     increment=True)
            if not _ok_metadata(meta, mtype, ds, loc):
                errored = True
                # log_progress(
                #     lgr.debug,
                #     'metadataextractors_loc',
                #     'ignoredatm',
                #     label='Failed for %s' % loc,
                # )
                continue
            # we also want to store info that there was no metadata (e.g. to get a list of
            # files that have no metadata)
            # if there is an issue that an extractor needlessly produces empty records, the
            # extractor should be fixed and not a general switch. For example the datalad_core
            # issues empty records to document the presence of a file
            #elif not meta:
            #    continue

            meta = MetadataDict(meta)
            # apply filters
            meta = _filter_metadata_fields(meta,
                                           maxsize=max_fieldsize,
                                           blacklist=blacklist)

            if not meta:
                continue

            # assign
            # only ask each metadata extractor once, hence no conflict possible
            loc_dict = contentmeta.get(loc, {})
            loc_dict[mtype_key] = meta
            contentmeta[loc] = loc_dict

            if ds.config.obtain('datalad.metadata.generate-unique-{}'.format(
                    mtype_key.replace('_', '-')),
                                default=True,
                                valtype=EnsureBool()):
                # go through content metadata and inject report of unique keys
                # and values into `dsmeta`
                for k, v in iteritems(meta):
                    if k in dsmeta.get(mtype_key, {}):
                        # if the dataset already has a dedicated idea
                        # about a key, we skip it from the unique list
                        # the point of the list is to make missing info about
                        # content known in the dataset, not to blindly
                        # duplicate metadata. Example: list of samples data
                        # were recorded from. If the dataset has such under
                        # a 'sample' key, we should prefer that, over an
                        # aggregated list of a hopefully-kinda-ok structure
                        continue
                    elif k in extractor_unique_exclude:
                        # the extractor thinks this key is worthless for the purpose
                        # of discovering whole datasets
                        # we keep the key (so we know that some file is providing this key),
                        # but ignore any value it came with
                        unique_cm[k] = None
                        continue
                    vset = unique_cm.get(k, set())
                    vset.add(_val2hashable(v))
                    unique_cm[k] = vset

        # log_progress(
        #     lgr.debug,
        #     'metadataextractors_loc',
        #     'Finished metadata extraction across locations for %s', mtype)

        if unique_cm:
            # per source storage here too
            ucp = dsmeta.get('datalad_unique_content_properties', {})

            # important: we want to have a stable order regarding
            # the unique values (a list). we cannot guarantee the
            # same order of discovery, hence even when not using a
            # set above we would still need sorting. the challenge
            # is that any value can be an arbitrarily complex nested
            # beast
            # we also want to have each unique value set always come
            # in a top-level list, so we know if some unique value
            # was a list, as opposed to a list of unique values

            def _ensure_serializable(val):
                if isinstance(val, ReadOnlyDict):
                    return {
                        k: _ensure_serializable(v)
                        for k, v in iteritems(val)
                    }
                if isinstance(val, (tuple, list)):
                    return [_ensure_serializable(v) for v in val]
                else:
                    return val

            ucp[mtype_key] = {
                k: [
                    _ensure_serializable(i)
                    for i in sorted(v, key=_unique_value_key)
                ] if v is not None else None
                for k, v in iteritems(unique_cm)
                # v == None (disable unique, but there was a value at some point)
                # otherwise we only want actual values, and also no single-item-lists
                # of a non-value
                # those contribute no information, but bloat the operation
                # (inflated number of keys, inflated storage, inflated search index, ...)
                if v is None or (v and not v == {''})
            }
            dsmeta['datalad_unique_content_properties'] = ucp

    log_progress(
        lgr.info,
        'metadataextractors',
        'Finished metadata extraction from %s',
        ds,
    )

    # always identify the effective vocabulary - JSON-LD style
    if context:
        dsmeta['@context'] = context

    return dsmeta, contentmeta, errored
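The extractor lookup above is driven by the 'datalad.metadata.extractors' entry point group. A sketch (not part of the example above) of how a third-party package could register its own extractor; the package, module, and class names are hypothetical:

from setuptools import setup

setup(
    name='datalad-myextractor',
    packages=['datalad_myextractor'],
    # make the extractor discoverable by the entry-point loop shown above
    entry_points={
        'datalad.metadata.extractors': [
            'myformat = datalad_myextractor.extractor:MetadataExtractor',
        ],
    },
)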
Example #24
class Subdatasets(Interface):
    """Report subdatasets and their properties.

    The following properties are reported (if possible) for each matching
    subdataset record.

    "name"
        Name of the subdataset in the parent (often identical with the
        relative path in the parent dataset)

    "path"
        Absolute path to the subdataset

    "parentds"
        Absolute path to the parent dataset

    "revision"
        SHA1 of the subdataset commit recorded in the parent dataset

    "state"
        Condition of the subdataset: 'clean', 'modified', 'absent', 'conflict'
        as reported by `git submodule`

    "revision_descr"
        Output of `git describe` for the subdataset

    "gitmodule_url"
        URL of the subdataset recorded in the parent

    "gitmodule_<label>"
        Any additional configuration property on record.

    Performance note: Property modification, requesting `bottomup` reporting
    order, or a particular numerical `recursion_limit` implies an internal
    switch to an alternative query implementation for recursive query that is
    more flexible, but also notably slower (performs one call to Git per
    dataset versus a single call for all combined).

    The following properties for subdatasets are recognized by DataLad
    (without the 'gitmodule\_' prefix that is used in the query results):

    "datalad-recursiveinstall"
        If set to 'skip', the respective subdataset is skipped when DataLad
        is recursively installing its superdataset. However, the subdataset
        remains installable when explicitly requested, and no other features
        are impaired.
    """
    _params_ = dict(
        dataset=Parameter(args=("-d", "--dataset"),
                          doc="""specify the dataset to query.  If
            no dataset is given, an attempt is made to identify the dataset
            based on the input and/or the current working directory""",
                          constraints=EnsureDataset() | EnsureNone()),
        fulfilled=Parameter(
            args=("--fulfilled", ),
            doc="""if given, must be a boolean flag indicating whether
            to report either only locally present or absent datasets.
            By default subdatasets are reported regardless of their
            status""",
            constraints=EnsureBool() | EnsureNone()),
        recursive=recursion_flag,
        recursion_limit=recursion_limit,
        contains=Parameter(
            args=('--contains', ),
            metavar='PATH',
            doc="""limit report to the subdatasets containing the
            given path. If a root path of a subdataset is given the last
            reported dataset will be the subdataset itself.""",
            constraints=EnsureStr() | EnsureNone()),
        bottomup=Parameter(
            args=("--bottomup", ),
            action="store_true",
            doc="""whether to report subdatasets in bottom-up order along
            each branch in the dataset tree, and not top-down."""),
        set_property=Parameter(
            args=('--set-property', ),
            metavar=('NAME', 'VALUE'),
            nargs=2,
            action='append',
            doc="""Name and value of one or more subdataset properties to
            be set in the parent dataset's .gitmodules file. The property name
            is case-insensitive, must start with a letter, and consist only
            of alphanumeric characters. The value can be
            a Python format() template string wrapped in '<>' (e.g.
            '<{gitmodule_name}>').
            Supported keywords are any item reported in the result properties
            of this command, plus 'refds_relpath' and 'refds_relname':
            the relative path of a subdataset with respect to the base dataset
            of the command call, and, in the latter case, the same string with
            all directory separators replaced by dashes.[CMD:  This
            option can be given multiple times. CMD]""",
            constraints=EnsureStr() | EnsureNone()),
        delete_property=Parameter(
            args=('--delete-property', ),
            metavar='NAME',
            action='append',
            doc="""Name of one or more subdataset properties to be removed
            from the parent dataset's .gitmodules file.[CMD:  This
            option can be given multiple times. CMD]""",
            constraints=EnsureStr() | EnsureNone()))

    @staticmethod
    @datasetmethod(name='subdatasets')
    @eval_results
    def __call__(dataset=None,
                 fulfilled=None,
                 recursive=False,
                 recursion_limit=None,
                 contains=None,
                 bottomup=False,
                 set_property=None,
                 delete_property=None):
        dataset = require_dataset(dataset,
                                  check_installed=False,
                                  purpose='subdataset reporting/modification')
        refds_path = dataset.path

        # XXX this seems strange, but is tested to be the case -- I'd rather set
        # `check_installed` to true above and fail
        if not GitRepo.is_valid_repo(refds_path):
            return

        # return as quickly as possible
        if isinstance(recursion_limit, int) and (recursion_limit <= 0):
            return

        if set_property:
            for k, v in set_property:
                if valid_key.match(k) is None:
                    raise ValueError(
                        "key '%s' is invalid (alphanumeric plus '-' only, "
                        "must start with a letter)" % k)
        if contains:
            contains = resolve_path(contains, dataset)
        for r in _get_submodules(dataset.path, fulfilled, recursive,
                                 recursion_limit, contains, bottomup,
                                 set_property, delete_property, refds_path):
            # without the refds_path cannot be rendered/converted relative
            # in the eval_results decorator
            r['refds'] = refds_path
            yield r
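A brief usage sketch (not part of the example above) of the Python API binding Dataset.subdatasets; the dataset and subdataset paths are illustrative:

from datalad.api import Dataset

ds = Dataset('/tmp/super')
# relative paths of all installed subdatasets, recursively
subs = ds.subdatasets(fulfilled=True, recursive=True, result_xfm='relpaths')
# mark a particular subdataset to be skipped by recursive installs
ds.subdatasets(contains='code/lib',
               set_property=[('datalad-recursiveinstall', 'skip')])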
Example #25
File: update.py Project: kyleam/datalad
class Update(Interface):
    """Update a dataset from a sibling.

    """
    # TODO: adjust docs to say:
    # - update from just one sibling at a time

    _examples_ = [
        dict(text="Update from a particular sibling",
             code_py="update(sibling='siblingname')",
             code_cmd="datalad update -s <siblingname>"),
        dict(text="Update from a particular sibling and merge the changes "
             "from a configured or matching branch from the sibling "
             "(see [CMD: --follow CMD][PY: `follow` PY] for details)",
             code_py="update(sibling='siblingname', merge=True)",
             code_cmd="datalad update --merge -s <siblingname>"),
        dict(text="Update from the sibling 'origin', traversing into "
             "subdatasets. For subdatasets, merge the revision "
             "registered in the parent dataset into the current branch",
             code_py="update(sibling='origin', merge=True, "
             "follow='parentds', recursive=True)",
             code_cmd="datalad update -s origin --merge "
             "--follow=parentds --recursive"),
    ]

    _params_ = dict(
        path=Parameter(
            args=("path", ),
            metavar="PATH",
            doc=
            """constrain to-be-updated subdatasets to the given path for recursive
            operation.""",
            nargs="*",
            constraints=EnsureStr() | EnsureNone()),
        sibling=Parameter(
            args=(
                "-s",
                "--sibling",
            ),
            doc="""name of the sibling to update from. If no sibling
            is given, updates from all siblings are obtained.""",
            constraints=EnsureStr() | EnsureNone()),
        dataset=Parameter(args=("-d", "--dataset"),
                          doc="""specify the dataset to update.  If
            no dataset is given, an attempt is made to identify the dataset
            based on the current working directory""",
                          constraints=EnsureDataset() | EnsureNone()),
        merge=Parameter(
            args=("--merge", ),
            metavar="ALLOWED",
            # const and nargs are set to map --merge to --merge=any.
            const="any",
            nargs="?",
            constraints=EnsureBool() | EnsureChoice("any", "ff-only"),
            doc="""merge obtained changes from the sibling. If a sibling is not
            explicitly given and there is only a single known sibling, that
            sibling is used. Otherwise, an unspecified sibling defaults to the
            configured remote for the current branch. By default, changes are
            fetched from the sibling but not merged into the current branch.
            With [CMD: --merge or --merge=any CMD][PY: merge=True or
            merge="any" PY], the changes will be merged into the current
            branch. A value of 'ff-only' restricts the allowed merges to
            fast-forwards."""),
        follow=Parameter(
            args=("--follow", ),
            constraints=EnsureChoice("sibling", "parentds"),
            doc="""source of updates for subdatasets. For 'sibling', the update
            will be done by merging in a branch from the (specified or
            inferred) sibling. The branch brought in will either be the current
            branch's configured branch, if it points to a branch that belongs
            to the sibling, or a sibling branch with a name that matches the
            current branch. For 'parentds', the revision registered in the
            parent dataset of the subdataset is merged in. Note that the
            current dataset is always updated according to 'sibling'. This
            option has no effect unless a merge is requested and [CMD:
            --recursive CMD][PY: recursive=True PY] is specified.""",
        ),
        recursive=recursion_flag,
        recursion_limit=recursion_limit,
        fetch_all=Parameter(
            args=("--fetch-all", ),
            action="store_true",
            doc=
            """this option has no effect and will be removed in a future version.
            When no siblings are given, an all-sibling update will be performed.""",
        ),
        reobtain_data=Parameter(
            args=("--reobtain-data", ),
            action="store_true",
            doc="""if enabled, file content that was present before an update
            will be re-obtained in case a file was changed by the update."""),
    )

    @staticmethod
    @datasetmethod(name='update')
    @eval_results
    def __call__(path=None,
                 sibling=None,
                 merge=False,
                 follow="sibling",
                 dataset=None,
                 recursive=False,
                 recursion_limit=None,
                 fetch_all=None,
                 reobtain_data=False):
        if fetch_all is not None:
            lgr.warning(
                'update(fetch_all=...) called. Option has no effect, and will be removed'
            )
        if path and not recursive:
            lgr.warning('path constraints for subdataset updates ignored, '
                        'because `recursive` option was not given')

        refds = require_dataset(dataset,
                                check_installed=True,
                                purpose='updating')

        save_paths = []
        merge_failures = set()
        saw_subds = False
        for ds, revision in itertools.chain(
            [(refds, None)],
                refds.subdatasets(path=path,
                                  fulfilled=True,
                                  recursive=recursive,
                                  recursion_limit=recursion_limit,
                                  return_type='generator',
                                  result_renderer='disabled',
                                  result_xfm=YieldDatasetAndRevision())
                if recursive else []):
            if ds != refds:
                saw_subds = True
            repo = ds.repo
            is_annex = isinstance(repo, AnnexRepo)
            # prepare return value
            res = get_status_dict('update',
                                  ds=ds,
                                  logger=lgr,
                                  refds=refds.path)
            # get all remotes which have references (would exclude
            # special remotes)
            remotes = repo.get_remotes(**({
                'exclude_special_remotes': True
            } if is_annex else {}))
            if not remotes and not sibling:
                res['message'] = (
                    "No siblings known to dataset at %s\nSkipping", repo.path)
                res['status'] = 'notneeded'
                yield res
                continue
            curr_branch = repo.get_active_branch()
            tracking_remote = None
            if not sibling and len(remotes) == 1:
                # there is only one remote, must be this one
                sibling_ = remotes[0]
            elif not sibling:
                # nothing given, look for tracking branch
                tracking_remote = repo.get_tracking_branch(branch=curr_branch,
                                                           remote_only=True)[0]
                sibling_ = tracking_remote
            else:
                sibling_ = sibling
            if sibling_ and sibling_ not in remotes:
                res['message'] = ("'%s' not known to dataset %s\nSkipping",
                                  sibling_, repo.path)
                res['status'] = 'impossible'
                yield res
                continue
            if not sibling_ and len(remotes) > 1 and merge:
                lgr.debug("Found multiple siblings:\n%s" % remotes)
                res['status'] = 'impossible'
                res['message'] = "Multiple siblings, please specify from which to update."
                yield res
                continue
            lgr.info("Fetching updates for %s", ds)
            # fetch remote
            fetch_kwargs = dict(
                # test against user-provided value!
                remote=None if sibling is None else sibling_,
                all_=sibling is None,
                # required to not trip over submodules that
                # were removed in the origin clone
                recurse_submodules="no",
                prune=True)  # prune to not accumulate a mess over time
            repo.fetch(**fetch_kwargs)
            # NOTE reevaluate ds.repo again, as it might have been converted from
            # a GitRepo to an AnnexRepo
            repo = ds.repo

            follow_parent = revision and follow == "parentds"
            if follow_parent and not repo.commit_exists(revision):
                if sibling_:
                    try:
                        lgr.debug("Fetching revision %s directly for %s",
                                  revision, repo)
                        repo.fetch(remote=sibling_,
                                   refspec=revision,
                                   git_options=["--recurse-submodules=no"])
                    except CommandError as exc:
                        yield dict(
                            res,
                            status="impossible",
                            message=("Attempt to fetch %s from %s failed: %s",
                                     revision, sibling_, exc_str(exc)))
                        continue
                else:
                    yield dict(res,
                               status="impossible",
                               message=("Need to fetch %s directly "
                                        "but single sibling not resolved",
                                        revision))
                    continue

            saw_merge_failure = False
            if merge:
                if follow_parent:
                    merge_target = revision
                else:
                    merge_target = _choose_merge_target(
                        repo, curr_branch, sibling_, tracking_remote)

                merge_fn = _choose_merge_fn(
                    repo,
                    is_annex=is_annex,
                    adjusted=is_annex and repo.is_managed_branch(curr_branch))

                merge_opts = None
                if merge_fn is _annex_sync:
                    if follow_parent:
                        yield dict(
                            res,
                            status="impossible",
                            message=("follow='parentds' is incompatible "
                                     "with adjusted branches"))
                        continue
                elif merge_target is None:
                    yield dict(res,
                               status="impossible",
                               message="Could not determine merge target")
                    continue
                elif merge == "ff-only":
                    merge_opts = ["--ff-only"]

                if is_annex and reobtain_data:
                    merge_fn = _reobtain(ds, merge_fn)

                for mres in merge_fn(repo,
                                     sibling_,
                                     merge_target,
                                     merge_opts=merge_opts):
                    if mres["action"] == "merge" and mres["status"] != "ok":
                        saw_merge_failure = True
                    yield dict(res, **mres)

            if saw_merge_failure:
                merge_failures.add(ds)
                res['status'] = 'error'
                res['message'] = ("Merge of %s failed", merge_target)
            else:
                res['status'] = 'ok'
                save_paths.append(ds.path)
            yield res
        # we need to save updated states only if merge was requested -- otherwise
        # it was a pure fetch
        if merge and recursive:
            if path and not saw_subds:
                lgr.warning(
                    'path constraints did not match an installed subdataset: %s',
                    path)
            if refds in merge_failures:
                lgr.warning(
                    "Not saving because top-level dataset %s "
                    "had a merge failure", refds.path)
            else:
                save_paths = [p for p in save_paths if p != refds.path]
                if not save_paths:
                    return
                lgr.debug(
                    'Subdatasets where updated state may need to be '
                    'saved in the parent dataset: %s', save_paths)
                for r in refds.save(
                        path=save_paths,
                        recursive=False,
                        message='[DATALAD] Save updated subdatasets'):
                    yield r
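A short sketch (not part of the example above) of the fast-forward-only mode accepted by the `merge` parameter, via the Python API binding Dataset.update; the dataset path and sibling name are illustrative:

from datalad.api import Dataset

ds = Dataset('/tmp/myds')
# fetch from 'origin' and merge only if the update is a fast-forward;
# subdatasets are updated to the revision registered in their parent
ds.update(sibling='origin', merge='ff-only',
          recursive=True, follow='parentds')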
Example #26
    def __call__(title=None,
                 name="osf",
                 storage_name=None,
                 dataset=None,
                 mode="annex",
                 existing='error',
                 trust_level=None,
                 tags=None,
                 public=False,
                 category='data',
                 description=None,
                 ):
        ds = require_dataset(dataset,
                             purpose="create OSF remote",
                             check_installed=True)
        res_kwargs = dict(
            ds=ds,
            action="create-sibling-osf",
            logger=lgr,
        )
        # we need an annex
        if not isinstance(ds.repo, AnnexRepo):
            yield get_status_dict(
                type="dataset",
                status="impossible",
                message="dataset has no annex",
                **res_kwargs)
            return

        # NOTES:
        # - we prob. should check osf-special-remote availability upfront to
        #   fail early
        # - add --recursive option
        #       - recursive won't work easily. Need to think that through.
        #       - would need a naming scheme for subdatasets
        #       - flat on OSF or a tree?
        #       - how do we detect something is there already, so we can skip
        #         rather than duplicate (with a new name)?
        #         osf-type-special-remote sufficient to decide it's not needed?
        # - adapt to conclusions in issue #30
        #   -> create those subcomponents
        # - results need to report URL for created projects suitable for datalad
        #   output formatting!
        #   -> result_renderer
        #   -> needs to be returned by create_node

        if not storage_name:
            storage_name = "{}-storage".format(name)

        sibling_conflicts = sibling_exists(
            ds, [name, storage_name],
            # TODO pass through
            recursive=False, recursion_limit=None,
            # fail fast, if error is desired
            exhaustive=existing == 'error',
        )
        if existing == 'error' and sibling_conflicts:
            # we only asked for one
            conflict = sibling_conflicts[0]
            yield get_status_dict(
                status='error',
                message=(
                    "a sibling '%s' is already configured in dataset %s",
                    conflict[1], conflict[0]),
                **res_kwargs,
            )
            return

        if title is None:
            # use dataset root basename
            title = ds.pathobj.name

        tags = ensure_list(tags)
        if 'DataLad dataset' not in tags:
            tags.append('DataLad dataset')
        if ds.id and ds.id not in tags:
            tags.append(ds.id)

        if not description:
            description = \
                "This component was built from a DataLad dataset using the " \
                "datalad-osf extension " \
                "(https://github.com/datalad/datalad-osf)."
            if mode != 'exportonly':
                description += \
                    " With this extension installed, this component can be " \
                    "git or datalad cloned from a 'osf://ID' URL, where " \
                    "'ID' is the OSF node ID that shown in the OSF HTTP " \
                    "URL, e.g. https://osf.io/q8xnk/ can be cloned from " \
                    "osf://q8xnk"
        cred = get_credentials(allow_interactive=True)
        osf = OSF(**cred)
        node_id, node_url = create_node(
            osf_session=osf.session,
            title=title,
            category=category,
            tags=tags if tags else None,
            public=EnsureBool()(public),
            description=description,
        )
        if mode != 'gitonly':
            init_opts = ["encryption=none",
                         "type=external",
                         "externaltype=osf",
                         "autoenable=true",
                         "node={}".format(node_id)]

            if mode in ("export", "exportonly"):
                init_opts += ["exporttree=yes"]

            ds.repo.init_remote(storage_name, options=init_opts)
            if trust_level:
                ds.repo.call_git(['annex', trust_level, storage_name])

            yield get_status_dict(
                type="dataset",
                url=node_url,
                id=node_id,
                name=storage_name,
                status="ok",
                **res_kwargs
            )

        if mode == 'exportonly':
            return

        ds.config.set(
            'remote.{}.annex-ignore'.format(name), 'true',
            where='local')
        yield from ds.siblings(
            # use configure, not add, to not trip over the config that
            # we just made
            action='configure',
            name=name,
            url='osf://{}'.format(node_id),
            fetch=False,
            publish_depends=storage_name if mode != 'gitonly' else None,
            recursive=False,
            result_renderer=None,
        )
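# Illustrative sketch: the call `EnsureBool()(public)` used when creating the
# OSF node above coerces string-ish flag values into proper Python booleans
# and rejects anything it cannot interpret. A minimal, runnable demonstration:
from datalad.support.constraints import EnsureBool

ensure_bool = EnsureBool()
assert ensure_bool('yes') is True    # 'yes'/'on'/'1'/'true' -> True
assert ensure_bool('off') is False   # 'no'/'off'/'0'/'false' -> False
assert ensure_bool(True) is True     # real booleans pass through unchanged
try:
    ensure_bool('maybe')             # unrecognized values raise ValueError
except ValueError:
    pass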
Example #27
def _configure_remote(ds, name, known_remotes, url, pushurl, fetch,
                      description, as_common_datasrc, publish_depends,
                      publish_by_default, annex_wanted, annex_required,
                      annex_group, annex_groupwanted, inherit, get_annex_info,
                      **res_kwargs):
    result_props = dict(action='configure-sibling',
                        path=ds.path,
                        type='sibling',
                        name=name,
                        **res_kwargs)
    if name is None:
        result_props['status'] = 'error'
        result_props['message'] = 'need sibling `name` for configuration'
        yield result_props
        return

    if name != 'here':
        # do all configure steps that are not meaningful for the 'here' sibling
        # AKA the local repo
        if name not in known_remotes:
            # this remote is fresh: make it known
            # just minimalistic name and URL, the rest is coming from `configure`
            ds.repo.add_remote(name, url)
            known_remotes.append(name)
        elif url:
            # not new, override URL if given
            ds.repo.set_remote_url(name, url)

        # make sure we have a configured fetch expression at this point
        fetchvar = 'remote.{}.fetch'.format(name)
        if fetchvar not in ds.repo.config:
            # place default fetch refspec in config
            # same as `git remote add` would have added
            ds.repo.config.add(fetchvar,
                               '+refs/heads/*:refs/remotes/{}/*'.format(name),
                               where='local')

        if pushurl:
            ds.repo.set_remote_url(name, pushurl, push=True)

        if publish_depends:
            # Check if all `deps` remotes are known to the `repo`
            unknown_deps = set(
                assure_list(publish_depends)).difference(known_remotes)
            if unknown_deps:
                result_props['status'] = 'error'
                result_props['message'] = (
                    'unknown sibling(s) specified as publication dependency: %s',
                    unknown_deps)
                yield result_props
                return

        # define config var name for potential publication dependencies
        depvar = 'remote.{}.datalad-publish-depends'.format(name)
        # and default pushes
        dfltvar = "remote.{}.push".format(name)

        if fetch:
            # fetch the remote so we are up to date
            for r in Update.__call__(dataset=res_kwargs['refds'],
                                     path=[dict(path=ds.path, type='dataset')],
                                     sibling=name,
                                     merge=False,
                                     recursive=False,
                                     on_failure='ignore',
                                     return_type='generator',
                                     result_xfm=None):
                # fixup refds
                r.update(res_kwargs)
                yield r

        if inherit:
            # Adjust variables which we should inherit
            delayed_super = _DelayedSuper(ds.repo)
            publish_depends = _inherit_config_var(delayed_super, depvar,
                                                  publish_depends)
            publish_by_default = _inherit_config_var(delayed_super, dfltvar,
                                                     publish_by_default)
            # Copy relevant annex settings for the sibling
            # makes sense only if both the current AND the super dataset are
            # annexes; a limitation, since it forbids having a plain-Git super
            if isinstance(ds.repo, AnnexRepo) and \
                    isinstance(delayed_super.repo, AnnexRepo):
                if annex_wanted is None:
                    annex_wanted = _inherit_annex_var(delayed_super, name,
                                                      'wanted')
                if annex_required is None:
                    annex_required = _inherit_annex_var(
                        delayed_super, name, 'required')
                if annex_group is None:
                    # I think it might be worth inheriting the group regardless
                    # of what the value is
                    #if annex_wanted in {'groupwanted', 'standard'}:
                    annex_group = _inherit_annex_var(delayed_super, name,
                                                     'group')
                if annex_wanted == 'groupwanted' and annex_groupwanted is None:
                    # we better have a value for the expression for that group
                    annex_groupwanted = _inherit_annex_var(
                        delayed_super, name, 'groupwanted')

        if publish_depends:
            if depvar in ds.config:
                # config vars are incremental, so make sure we start from
                # scratch
                ds.config.unset(depvar, where='local', reload=False)
            for d in assure_list(publish_depends):
                lgr.info('Configure additional publication dependency on "%s"',
                         d)
                ds.config.add(depvar, d, where='local', reload=False)
            ds.config.reload()

        if publish_by_default:
            if dfltvar in ds.config:
                ds.config.unset(dfltvar, where='local', reload=False)
            for refspec in assure_list(publish_by_default):
                lgr.info(
                    'Configure additional default publication refspec "%s"',
                    refspec)
                ds.config.add(dfltvar, refspec, 'local')
            ds.config.reload()

        assert isinstance(ds.repo, GitRepo)  # just against silly code
        if isinstance(ds.repo, AnnexRepo):
            # we need to check if the added sibling is an annex,
            # and try to enable it
            # another part of the fix for #463 and #432
            try:
                if not ds.config.obtain('remote.{}.annex-ignore'.format(name),
                                        default=False,
                                        valtype=EnsureBool(),
                                        store=False):
                    ds.repo.enable_remote(name)
            except CommandError as exc:
                # TODO yield
                # this is unlikely to ever happen, now done for AnnexRepo instances
                # only
                lgr.info("Failed to enable annex remote %s, "
                         "could be a pure git" % name)
                lgr.debug("Exception was: %s" % exc_str(exc))
            if as_common_datasrc:
                ri = RI(url)
                if isinstance(ri, URL) and ri.scheme in ('http', 'https'):
                    # XXX what if there is already a special remote
                    # of this name? Above check for remotes ignores special
                    # remotes. we need to `git annex dead REMOTE` on reconfigure
                    # before we can init a new one
                    # XXX except it is not enough

                    # make special remote of type=git (see #335)
                    ds.repo._run_annex_command('initremote',
                                               annex_options=[
                                                   as_common_datasrc,
                                                   'type=git',
                                                   'location={}'.format(url),
                                                   'autoenable=true'
                                               ])
                else:
                    yield dict(
                        status='impossible',
                        name=name,
                        message='cannot configure as a common data source, '
                        'URL protocol is not http or https',
                        **result_props)
    #
    # place configure steps that also work for 'here' below
    #
    if isinstance(ds.repo, AnnexRepo):
        for prop, var in (('wanted', annex_wanted),
                          ('required', annex_required), ('group',
                                                         annex_group)):
            if var is not None:
                ds.repo.set_preferred_content(prop, var,
                                              '.' if name == 'here' else name)
        if annex_groupwanted:
            ds.repo.set_groupwanted(annex_group, annex_groupwanted)

    if description:
        if not isinstance(ds.repo, AnnexRepo):
            result_props['status'] = 'impossible'
            result_props[
                'message'] = 'cannot set description of a plain Git repository'
            yield result_props
            return
        ds.repo._run_annex_command('describe',
                                   annex_options=[name, description])

    # report all we know at once
    info = list(
        _query_remotes(ds, name, known_remotes,
                       get_annex_info=get_annex_info))[0]
    info.update(dict(status='ok', **result_props))
    yield info
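# Illustrative sketch of the 'annex-ignore' lookup pattern above: obtain()
# reads the raw git-config value and passes it through the EnsureBool()
# constraint, so strings like 'true'/'false'/'1'/'0' come back as booleans.
# `ds` is assumed to be an existing DataLad Dataset; names are illustrative.
from datalad.support.constraints import EnsureBool


def annex_remote_enabled(ds, name):
    """Return True unless the sibling `name` is marked annex-ignore."""
    ignore = ds.config.obtain(
        'remote.{}.annex-ignore'.format(name),
        default=False,         # used when the variable is not set
        valtype=EnsureBool(),  # coerce the config string to a boolean
        store=False)           # do not write anything back to the config
    return not ignore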
Example #28
class Subdatasets(Interface):
    r"""Report subdatasets and their properties.

    The following properties are reported (if possible) for each matching
    subdataset record.

    "name"
        Name of the subdataset in the parent (often identical with the
        relative path in the parent dataset)

    "path"
        Absolute path to the subdataset

    "parentds"
        Absolute path to the parent dataset

    "gitshasum"
        SHA1 of the subdataset commit recorded in the parent dataset

    "state"
        Condition of the subdataset: 'clean', 'modified', 'absent', 'conflict'
        as reported by `git submodule`

    "gitmodule_url"
        URL of the subdataset recorded in the parent

    "gitmodule_name"
        Name of the subdataset recorded in the parent

    "gitmodule_<label>"
        Any additional configuration property on record.

    Performance note: Property modification, requesting `bottomup` reporting
    order, or a particular numerical `recursion_limit` implies an internal
    switch to an alternative query implementation for recursive query that is
    more flexible, but also notably slower (performs one call to Git per
    dataset versus a single call for all combined).

    The following properties for subdatasets are recognized by DataLad
    (without the 'gitmodule\_' prefix that is used in the query results):

    "datalad-recursiveinstall"
        If set to 'skip', the respective subdataset is skipped when DataLad
        is recursively installing its superdataset. However, the subdataset
        remains installable when explicitly requested, and no other features
        are impaired.
    """
    _params_ = dict(
        dataset=Parameter(args=("-d", "--dataset"),
                          doc="""specify the dataset to query.  If
            no dataset is given, an attempt is made to identify the dataset
            based on the input and/or the current working directory""",
                          constraints=EnsureDataset() | EnsureNone()),
        path=Parameter(
            args=("path", ),
            metavar='PATH',
            doc="""path/name to query for subdatasets. Defaults to the
            current directory[PY: , or the entire dataset if called as
            a dataset method PY].""",
            nargs='*',
            constraints=EnsureStr() | EnsureNone()),
        fulfilled=Parameter(
            args=("--fulfilled", ),
            doc="""if given, must be a boolean flag indicating whether
            to report either only locally present or absent datasets.
            By default subdatasets are reported regardless of their
            status""",
            constraints=EnsureBool() | EnsureNone()),
        recursive=recursion_flag,
        recursion_limit=recursion_limit,
        contains=Parameter(
            args=('--contains', ),
            metavar='PATH',
            action='append',
            doc="""limit report to the subdatasets containing the
            given path. If a root path of a subdataset is given the last
            reported dataset will be the subdataset itself.[CMD:  This
            option can be given multiple times CMD][PY:  Can be a list with
            multiple paths PY], in which case datasets will be reported that
            contain any of the given paths.""",
            constraints=EnsureStr() | EnsureNone()),
        bottomup=Parameter(
            args=("--bottomup", ),
            action="store_true",
            doc="""whether to report subdatasets in bottom-up order along
            each branch in the dataset tree, and not top-down."""),
        set_property=Parameter(
            args=('--set-property', ),
            metavar=('NAME', 'VALUE'),
            nargs=2,
            action='append',
            doc="""Name and value of one or more subdataset properties to
            be set in the parent dataset's .gitmodules file. The property name
            is case-insensitive, must start with a letter, and consist only
            of alphanumeric characters. The value can be
            a Python format() template string wrapped in '<>' (e.g.
            '<{gitmodule_name}>').
            Supported keywords are any item reported in the result properties
            of this command, plus 'refds_relpath' and 'refds_relname':
            the relative path of a subdataset with respect to the base dataset
            of the command call, and, in the latter case, the same string with
            all directory separators replaced by dashes.[CMD:  This
            option can be given multiple times. CMD]""",
            constraints=EnsureStr() | EnsureNone()),
        delete_property=Parameter(
            args=('--delete-property', ),
            metavar='NAME',
            action='append',
            doc="""Name of one or more subdataset properties to be removed
            from the parent dataset's .gitmodules file.[CMD:  This
            option can be given multiple times. CMD]""",
            constraints=EnsureStr() | EnsureNone()))

    @staticmethod
    @datasetmethod(name='subdatasets')
    @eval_results
    def __call__(path=None,
                 dataset=None,
                 fulfilled=None,
                 recursive=False,
                 recursion_limit=None,
                 contains=None,
                 bottomup=False,
                 set_property=None,
                 delete_property=None):
        ds = require_dataset(dataset,
                             check_installed=True,
                             purpose='subdataset reporting/modification')

        paths = resolve_path(ensure_list(path), dataset, ds) if path else None

        # no constraints given -> query subdatasets under curdir
        if not paths and dataset is None:
            cwd = Path(getpwd())
            paths = None if cwd == ds.pathobj else [cwd]

        lgr.debug('Query subdatasets of %s', dataset)
        if paths is not None:
            lgr.debug('Query subdatasets underneath paths: %s', paths)
        refds_path = ds.path

        # return as quickly as possible
        if isinstance(recursion_limit, int) and (recursion_limit <= 0):
            return

        if set_property:
            for k, v in set_property:
                if valid_key.match(k) is None:
                    raise ValueError(
                        "key '%s' is invalid (alphanumeric plus '-' only, must "
                        "start with a letter)" % k)
        if contains:
            contains = resolve_path(ensure_list(contains), dataset, ds)
            # expand all test cases for the contains test in the loop below
            # leads to ~20% speedup per loop iteration of a non-match
            expanded_contains = [[c] + list(c.parents) for c in contains]
        else:
            expanded_contains = []
        contains_hits = set()
        for r in _get_submodules(ds, paths, fulfilled, recursive,
                                 recursion_limit, expanded_contains, bottomup,
                                 set_property, delete_property, refds_path):
            # a boat-load of ancient code consumes this and is ignorant of
            # Path objects
            r['path'] = str(r['path'])
            # without the refds_path cannot be rendered/converted relative
            # in the eval_results decorator
            r['refds'] = refds_path
            if 'contains' in r:
                contains_hits.update(r['contains'])
                r['contains'] = [str(c) for c in r['contains']]
            yield r
        if contains:
            for c in set(contains).difference(contains_hits):
                yield get_status_dict(
                    'subdataset',
                    path=str(c),
                    status='impossible',
                    message='path not contained in any matching subdataset',
                    # we do not want to log such an event, because it is a
                    # legit query to check for matching subdatasets simply
                    # for the purpose of further decision making
                    # user communication in front-end scenarios will happen
                    # via result rendering
                    #logger=lgr
                )
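# Illustrative sketch: the `fulfilled` parameter above is constrained by
# EnsureBool() | EnsureNone(), an alternative constraint that accepts either
# a (coercible) boolean or None and returns the first successful conversion.
from datalad.support.constraints import EnsureBool, EnsureNone

fulfilled_constraint = EnsureBool() | EnsureNone()
assert fulfilled_constraint('yes') is True   # coerced by EnsureBool()
assert fulfilled_constraint(False) is False  # booleans pass through
assert fulfilled_constraint(None) is None    # accepted by EnsureNone()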
Example #29
class Update(Interface):
    """Update a dataset from a sibling.

    """
    # TODO: adjust docs to say:
    # - update from just one sibling at a time

    _examples_ = [
        dict(text="Update from a particular sibling",
             code_py="update(sibling='siblingname')",
             code_cmd="datalad update -s <siblingname>"),
        dict(text="Update from a particular sibling and merge the changes "
             "from a configured or matching branch from the sibling "
             "(see [CMD: --follow CMD][PY: `follow` PY] for details)",
             code_py="update(sibling='siblingname', how='merge')",
             code_cmd="datalad update --how=merge -s <siblingname>"),
        dict(text="Update from the sibling 'origin', traversing into "
             "subdatasets. For subdatasets, merge the revision "
             "registered in the parent dataset into the current branch",
             code_py="update(sibling='origin', how='merge', "
             "follow='parentds', recursive=True)",
             code_cmd="datalad update -s origin --how=merge "
             "--follow=parentds -r"),
        dict(text="Fetch and merge the remote tracking branch "
             "into the current dataset. Then update each subdataset "
             "by resetting its current branch to the revision "
             "registered in the parent dataset, fetching only if "
             "the revision isn't already present",
             code_py="update(how='merge', how_subds='reset', "
             "follow='parentds-lazy', recursive=True)",
             code_cmd="datalad update --how=merge --how-subds=reset"
             "--follow=parentds-lazy -r"),
    ]

    _params_ = dict(
        path=Parameter(
            args=("path", ),
            metavar="PATH",
            doc=
            """constrain to-be-updated subdatasets to the given path for recursive
            operation.""",
            nargs="*",
            constraints=EnsureStr() | EnsureNone()),
        sibling=Parameter(
            args=(
                "-s",
                "--sibling",
            ),
            doc="""name of the sibling to update from. When unspecified,
            updates from all siblings are fetched. If there is more than one
            sibling and changes will be brought into the working tree (as
            requested via [CMD: --merge, --how, or --how-subds CMD][PY:
            `merge`, `how`, or `how_subds` PY]), a sibling will be chosen based
            on the configured remote for the current branch.""",
            constraints=EnsureStr() | EnsureNone()),
        dataset=Parameter(args=("-d", "--dataset"),
                          doc="""specify the dataset to update.  If
            no dataset is given, an attempt is made to identify the dataset
            based on the current working directory""",
                          constraints=EnsureDataset() | EnsureNone()),
        merge=Parameter(
            args=("--merge", ),
            metavar="ALLOWED",
            # const and nargs are set to map --merge to --merge=any.
            const="any",
            nargs="?",
            constraints=EnsureBool() | EnsureChoice("any", "ff-only"),
            # TODO: Decide whether this should be removed eventually.
            doc="""merge obtained changes from the sibling. This is a subset of
            the functionality that can be achieved via the newer [CMD: --how
            CMD][PY: `how` PY]. [CMD: --merge or --merge=any CMD][PY:
            merge=True or merge="any" PY] is equivalent to [CMD: --how=merge
            CMD][PY: how="merge" PY]. [CMD: --merge=ff-only CMD][PY:
            merge="ff-only" PY] is equivalent to [CMD: --how=ff-only CMD][PY:
            how="ff-only" PY]."""),
        how=Parameter(
            args=("--how", ),
            nargs="?",
            constraints=_how_constraints,
            doc="""how to update the dataset. The default ("fetch") simply
            fetches the changes from the sibling but doesn't incorporate them
            into the working tree. A value of "merge" or "ff-only" merges in
            changes, with the latter restricting the allowed merges to
            fast-forwards. "reset" incorporates the changes with 'git reset
            --hard <target>', staying on the current branch but discarding any
            changes that aren't shared with the target. "checkout", on the
            other hand, runs 'git checkout <target>', switching from the
            current branch to a detached state. When [CMD: --recursive CMD][PY:
            recursive=True PY] is specified, this action will also apply to
            subdatasets unless overridden by [CMD: --how-subds CMD][PY:
            `how_subds` PY]."""),
        how_subds=Parameter(
            args=("--how-subds", ),
            nargs="?",
            constraints=_how_constraints,
            doc="""Override the behavior of [CMD: --how CMD][PY: `how` PY] in
            subdatasets."""),
        follow=Parameter(
            args=("--follow", ),
            constraints=EnsureChoice("sibling", "parentds", "parentds-lazy"),
            doc="""source of updates for subdatasets. For 'sibling', the update
            will be done by merging in a branch from the (specified or
            inferred) sibling. The branch brought in will either be the current
            branch's configured branch, if it points to a branch that belongs
            to the sibling, or a sibling branch with a name that matches the
            current branch. For 'parentds', the revision registered in the
            parent dataset of the subdataset is merged in. 'parentds-lazy' is
            like 'parentds', but prevents fetching from a subdataset's sibling
            if the registered revision is present in the subdataset. Note that
            the current dataset is always updated according to 'sibling'. This
            option has no effect unless a merge is requested and [CMD:
            --recursive CMD][PY: recursive=True PY] is specified.""",
        ),
        recursive=recursion_flag,
        recursion_limit=recursion_limit,
        fetch_all=Parameter(
            args=("--fetch-all", ),
            action="store_true",
            doc=
            """this option has no effect and will be removed in a future version.
            When no siblings are given, an all-sibling update will be performed.""",
        ),
        reobtain_data=Parameter(
            args=("--reobtain-data", ),
            action="store_true",
            doc="""if enabled, file content that was present before an update
            will be re-obtained in case a file was changed by the update."""),
    )

    @staticmethod
    @datasetmethod(name='update')
    @eval_results
    def __call__(path=None,
                 *,
                 sibling=None,
                 merge=False,
                 how=None,
                 how_subds=None,
                 follow="sibling",
                 dataset=None,
                 recursive=False,
                 recursion_limit=None,
                 fetch_all=None,
                 reobtain_data=False):
        if fetch_all is not None:
            lgr.warning(
                'update(fetch_all=...) called. Option has no effect, and will be removed'
            )
        if path and not recursive:
            lgr.warning('path constraints for subdataset updates ignored, '
                        'because `recursive` option was not given')

        how, how_subds = _process_how_args(merge, how, how_subds)
        # `merge` should be considered through `how` and `how_subds` only.
        # Unbind `merge` to ensure that downstream code doesn't look at it.
        del merge

        refds = require_dataset(dataset,
                                check_installed=True,
                                purpose='update')

        save_paths = []
        update_failures = set()
        saw_subds = False
        for ds, revision in itertools.chain(
            [(refds, None)],
                refds.subdatasets(path=path,
                                  state='present',
                                  recursive=recursive,
                                  recursion_limit=recursion_limit,
                                  return_type='generator',
                                  result_renderer='disabled',
                                  result_xfm=YieldDatasetAndRevision())
                if recursive else []):
            if ds != refds:
                saw_subds = True
            repo = ds.repo
            is_annex = isinstance(repo, AnnexRepo)
            # prepare return value
            res = get_status_dict('update',
                                  ds=ds,
                                  logger=lgr,
                                  refds=refds.path)

            follow_parent = revision and follow.startswith("parentds")
            follow_parent_lazy = revision and follow == "parentds-lazy"
            if follow_parent_lazy and \
               repo.get_hexsha(repo.get_corresponding_branch()) == revision:
                res["message"] = (
                    "Dataset already at commit registered in parent: %s",
                    repo.path)
                res["status"] = "notneeded"
                yield res
                continue

            how_curr = how_subds if revision else how
            # get all remotes which have references (would exclude
            # special remotes)
            remotes = repo.get_remotes(**({
                'exclude_special_remotes': True
            } if is_annex else {}))
            if not remotes and not sibling:
                res['message'] = (
                    "No siblings known to dataset at %s\nSkipping", repo.path)
                res['status'] = 'notneeded'
                yield res
                continue
            curr_branch = repo.get_active_branch()
            tracking_remote = None
            if not sibling and len(remotes) == 1:
                # there is only one remote, must be this one
                sibling_ = remotes[0]
            elif not sibling:
                # nothing given, look for tracking branch
                tracking_remote = repo.get_tracking_branch(branch=curr_branch,
                                                           remote_only=True)[0]
                sibling_ = tracking_remote
            else:
                sibling_ = sibling
            if sibling_ and sibling_ not in remotes:
                res['message'] = ("'%s' not known to dataset %s\nSkipping",
                                  sibling_, repo.path)
                res['status'] = 'impossible'
                yield res
                continue
            if not sibling_ and len(remotes) > 1 and how_curr:
                lgr.debug("Found multiple siblings:\n%s", remotes)
                res['status'] = 'impossible'
                res['message'] = "Multiple siblings, please specify from which to update."
                yield res
                continue
            lgr.info("Fetching updates for %s", ds)
            # fetch remote
            fetch_kwargs = dict(
                # test against user-provided value!
                remote=None if sibling is None else sibling_,
                all_=sibling is None,
                git_options=[
                    # required to not trip over submodules that were removed in
                    # the origin clone
                    "--no-recurse-submodules",
                    # prune to not accumulate a mess over time
                    "--prune"
                ])
            if not (follow_parent_lazy and repo.commit_exists(revision)):
                try:
                    repo.fetch(**fetch_kwargs)
                except CommandError as exc:
                    ce = CapturedException(exc)
                    yield get_status_dict(
                        status="error",
                        message=("Fetch failed: %s", ce),
                        exception=ce,
                        **res,
                    )
                    continue

            # NOTE: re-evaluate ds.repo, as it might have been converted from
            # a GitRepo to an AnnexRepo
            repo = ds.repo

            if follow_parent and not repo.commit_exists(revision):
                if sibling_:
                    try:
                        lgr.debug("Fetching revision %s directly for %s",
                                  revision, repo)
                        repo.fetch(remote=sibling_,
                                   refspec=revision,
                                   git_options=["--recurse-submodules=no"])
                    except CommandError as exc:
                        ce = CapturedException(exc)
                        yield dict(
                            res,
                            status="impossible",
                            message=("Attempt to fetch %s from %s failed: %s",
                                     revision, sibling_, ce),
                            exception=ce)
                        continue
                else:
                    yield dict(res,
                               status="impossible",
                               message=("Need to fetch %s directly "
                                        "but single sibling not resolved",
                                        revision))
                    continue

            saw_update_failure = False
            if how_curr:
                if follow_parent:
                    target = revision
                else:
                    target = _choose_update_target(repo, curr_branch, sibling_,
                                                   tracking_remote)

                adjusted = is_annex and repo.is_managed_branch(curr_branch)
                if adjusted:
                    if follow_parent:
                        yield dict(
                            res,
                            status="impossible",
                            message=("follow='parentds' is incompatible "
                                     "with adjusted branches"))
                        continue
                    if how_curr != "merge":
                        yield dict(
                            res,
                            status="impossible",
                            message=("Updating via '%s' is incompatible "
                                     "with adjusted branches", how_curr))
                        continue

                update_fn = _choose_update_fn(repo,
                                              how_curr,
                                              is_annex=is_annex,
                                              adjusted=adjusted)

                fn_opts = ["--ff-only"] if how_curr == "ff-only" else None
                if update_fn is not _annex_sync:
                    if target is None:
                        yield dict(res,
                                   status="impossible",
                                   message="Could not determine update target")
                        continue

                if is_annex and reobtain_data:
                    update_fn = _reobtain(ds, update_fn)

                for ures in update_fn(repo, sibling_, target, opts=fn_opts):
                    # NOTE: Ideally the "merge" action would also be prefixed
                    # with "update.", but a plain "merge" is used for backward
                    # compatibility.
                    if ures["status"] != "ok" and (
                            ures["action"] == "merge"
                            or ures["action"].startswith("update.")):
                        saw_update_failure = True
                    yield dict(res, **ures)

            if saw_update_failure:
                update_failures.add(ds)
                res['status'] = 'error'
                res['message'] = ("Update of %s failed", target)
            else:
                res['status'] = 'ok'
                save_paths.append(ds.path)
            yield res
        # we need to save updated states only if merge was requested -- otherwise
        # it was a pure fetch
        if how_curr and recursive:
            yield from _save_after_update(refds, save_paths, update_failures,
                                          path, saw_subds)
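# Illustrative sketch: the `merge` parameter above is constrained by
# EnsureBool() | EnsureChoice("any", "ff-only"). Boolean-ish input is tried
# first; only if that fails is the value checked against the listed choices.
from datalad.support.constraints import EnsureBool, EnsureChoice

merge_constraint = EnsureBool() | EnsureChoice("any", "ff-only")
assert merge_constraint('true') is True          # handled by EnsureBool()
assert merge_constraint('ff-only') == 'ff-only'  # handled by EnsureChoice()
assert merge_constraint('any') == 'any'
try:
    merge_constraint('rebase')                   # neither a bool nor a choice
except ValueError:
    pass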
Example #30
contains = Parameter(args=('--contains', ),
                     metavar='PATH',
                     action='append',
                     doc="""limit to the subdatasets containing the
    given path. If a root path of a subdataset is given, the last
    considered dataset will be the subdataset itself.[CMD:  This
    option can be given multiple times CMD][PY:  Can be a list with
    multiple paths PY], in which case datasets that
    contain any of the given paths will be considered.""",
                     constraints=EnsureStr() | EnsureNone())

fulfilled = Parameter(args=("--fulfilled", ),
                      doc="""DEPRECATED: use [CMD: --state CMD][PY: `state` PY]
    instead. If given, must be a boolean flag indicating whether
    to consider either only locally present or absent datasets.
    By default all subdatasets are considered regardless of their
    status.""",
                      constraints=EnsureBool() | EnsureNone())

dataset_state = Parameter(
    args=("--state", ),
    doc="""indicate which (sub)datasets to consider: either only locally present,
    absent, or any of those two kinds.
    """,
    # yoh: intentionally left out the description of default since it might be
    # command specific
    constraints=EnsureChoice('present', 'absent', 'any'))

shared_access_opt = Parameter(
    args=('--shared-access', ),
    metavar='MODE',
    doc="""configure shared access to a dataset, see `git init --shared`
    documentation for complete details on the supported scenarios. Possible