Example No. 1
def get_refds_path(cls, dataset):
    """Return a resolved reference dataset path from a `dataset` argument"""
    # theoretically a dataset could come in as a relative path -> resolve
    refds_path = dataset.path if isinstance(dataset, Dataset) else dataset
    if refds_path:
        refds_path = resolve_path(refds_path)
    return refds_path
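For orientation, a minimal usage sketch of `resolve_path` itself (assumes datalad is installed; the `/tmp/ds` dataset location is made up):

# Minimal sketch, not part of the example above; paths are hypothetical.
from datalad.distribution.dataset import Dataset, resolve_path

# without a dataset, a relative path is resolved against the current directory
print(resolve_path('data/file.txt'))
# with a Dataset instance, it is resolved against the dataset root
# (datalad >= 0.12 returns a Path object, earlier versions a plain string)
print(resolve_path('data/file.txt', Dataset('/tmp/ds')))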
Example No. 2
def get_normalized_path_arguments(paths, dataset=None, default=None):
    """Apply standard resolution to path arguments

    This is nothing more than a helper to standardize path argument
    preprocessing.

    Parameters
    ----------
    paths : sequence or single path
      Path(s) to normalize
    dataset : path or Dataset or None
      Optional dataset identifying something against which to resolve input
      path arguments
    default : sequence of paths or single path or None
      If `paths` is empty, use this instead

    Returns
    -------
    tuple(list(paths), path)
      Normalized paths and path to a potential dataset against which paths were
      resolved.
    """
    dataset_path = dataset.path if isinstance(dataset, Dataset) else dataset
    if not paths and default:
        paths = default
    paths = assure_list(paths)
    # resolve path(s):
    resolved_paths = [resolve_path(p, dataset) for p in paths]
    if dataset:
        # guarantee absolute paths
        resolved_paths = [opj(dataset_path, p) for p in resolved_paths]
    lgr.debug('Resolved input path arguments: %s', resolved_paths)
    return resolved_paths, dataset_path
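A hypothetical invocation of the helper above, only to illustrate the returned tuple (the helper comes from this snippet and is not a public datalad API; the dataset location is made up):

# Hypothetical call of the snippet's helper; '/tmp/ds' is a made-up dataset path.
paths, ds_path = get_normalized_path_arguments(
    ['sub/file.txt', 'other.dat'],  # relative inputs to normalize
    dataset='/tmp/ds')
# paths   -> list of absolute paths (resolved and prefixed with the dataset path)
# ds_path -> '/tmp/ds'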
Example No. 3
def _get_container_by_path(ds, name, containers):
    from datalad.distribution.dataset import resolve_path
    # Note: since datalad 0.12.0rc6 resolve_path returns a Path object here,
    #       which then fails to equal c['path'] below as this is taken from
    #       config as a string
    container_path = str(resolve_path(name, ds))
    container = [c for c in containers.values() if c['path'] == container_path]
    if len(container) == 1:
        return container[0]
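The `str()` cast is what bridges the Path-vs-string mismatch described in the comment; a standalone illustration with a made-up config value (POSIX paths):

# Standalone illustration of the Path-vs-str pitfall (made-up value, POSIX paths).
from pathlib import Path

config_value = '/tmp/ds/.datalad/environments/demo/image'  # container paths are stored as str
resolved = Path(config_value)           # what resolve_path() returns since datalad 0.12.0rc6
assert resolved != config_value         # a Path never compares equal to a str
assert str(resolved) == config_value    # hence the explicit str() in the snippet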
Example No. 4
def get_refds_path(cls, dataset):
    """Return a resolved reference dataset path from a `dataset` argument"""
    # theoretically a dataset could come in as a relative path -> resolve
    if dataset is None:
        return dataset
    refds_path = dataset.path if isinstance(dataset, Dataset) \
        else Dataset(dataset).path
    if refds_path:
        refds_path = resolve_path(refds_path)
    return refds_path
Example No. 5
    def get_refds_path(cls, dataset):
        """Return a resolved reference dataset path from a `dataset` argument

        .. deprecated:: 0.16
           Use ``require_dataset()`` instead.
        """
        # theoretically a dataset could come in as a relative path -> resolve
        if dataset is None:
            return dataset
        refds_path = dataset.path if isinstance(dataset, Dataset) \
            else Dataset(dataset).path
        if refds_path:
            refds_path = str(resolve_path(refds_path))
        return refds_path
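As the deprecation note says, `require_dataset()` is the replacement; a minimal sketch of that route (assumes the current directory lies inside an installed dataset):

# Minimal sketch of the recommended replacement (CWD is assumed to be inside a dataset).
from datalad.distribution.dataset import require_dataset

ds = require_dataset(None, check_installed=True, purpose='path resolution')
refds_path = ds.path  # absolute path of the resolved reference dataset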
Example No. 6
def rawpath2ap(path, refds_path):
    orig_path_request = path
    # this is raw, resolve
    path = resolve_path(path, refds_path)
    # collect info on this path
    path_props = dict(
        path=path,
        # path was requested as input, and not somehow discovered
        raw_input=True,
        # make a record of what actually came in, sorting into
        # dataset might later need to distinguish between a path
        # that pointed to a dataset as a whole vs. a path that
        # pointed to the dataset's content -- just do not destroy
        # any information on the way down
        orig_request=orig_path_request)
    return path_props
Example No. 7
def rawpath2ap(path, refds_path):
    orig_path_request = path
    # this is raw, resolve
    path = resolve_path(path, refds_path)
    # collect info on this path
    path_props = dict(
        path=path,
        # path was requested as input, and not somehow discovered
        raw_input=True,
        # make a record of what actually came in, sorting into
        # dataset might later need to distinguish between a path
        # that pointed to a dataset as a whole vs. a path that
        # pointed to the dataset's content -- just do not destroy
        # any information on the way down
        orig_request=orig_path_request)
    return path_props
Example No. 8
    def __call__(path=None,
                 dataset=None,
                 to=None,
                 since=None,
                 data='auto-if-wanted',
                 force=None,
                 recursive=False,
                 recursion_limit=None,
                 jobs=None):
        # push uses '^' to refer to the last pushed committish, and None for the default
        # behavior. '' was/is used in `publish` (to be deprecated). Alert the user about the mistake
        if since == '':
            raise ValueError("'since' should point to commitish or use '^'.")
        # we resolve here, because we need to perform inspection on what was given
        # as an input argument further down
        paths = [resolve_path(p, dataset) for p in ensure_list(path)]

        ds = require_dataset(dataset, check_installed=True, purpose='push')
        ds_repo = ds.repo

        res_kwargs = dict(
            action='publish',
            refds=ds.path,
            logger=lgr,
        )

        get_remote_kwargs = {'exclude_special_remotes': False} \
            if isinstance(ds_repo, AnnexRepo) else {}
        if to and to not in ds_repo.get_remotes(**get_remote_kwargs):
            # get again for proper error:
            sr = ds_repo.get_remotes(**get_remote_kwargs)
            # yield an error result instead of raising a ValueError,
            # to enable the use case of pushing to a target that
            # a superdataset doesn't know, but some subdatasets do
            # (in combination with '--on-failure ignore')
            yield dict(res_kwargs,
                       status='error',
                       path=ds.path,
                       message="Unknown push target '{}'. {}".format(
                           to, 'Known targets: {}.'.format(', '.join(
                               repr(s) for s in sr))
                           if sr else 'No targets configured in dataset.'))
            return
        if since == '^':
            # figure out state of remote branch and set `since`
            since = _get_corresponding_remote_state(ds_repo, to)
            if not since:
                lgr.info("No tracked remote for active branch, "
                         "detection of last pushed state not in effect.")
        elif since:
            # will blow with ValueError if unusable
            ds_repo.get_hexsha(since)

        # obtain a generator for information on the datasets to process
        # idea is to turn the `paths` argument into per-dataset
        # content listings that can be acted upon
        ds_spec = _datasets_since_(
            # important to pass unchanged dataset arg
            dataset,
            since,
            paths,
            recursive,
            recursion_limit)

        # instead of a loop, this could all be done in parallel
        matched_anything = False
        for dspath, dsrecords in ds_spec:
            matched_anything = True
            lgr.debug('Attempt push of Dataset at %s', dspath)
            pbars = {}
            yield from _push(dspath,
                             dsrecords,
                             to,
                             data,
                             force,
                             jobs,
                             res_kwargs.copy(),
                             pbars,
                             got_path_arg=True if path else False)
            # take down progress bars for this dataset
            for i, ds in pbars.items():
                log_progress(lgr.info, i, 'Finished push of %s', ds)
        if not matched_anything:
            potential_remote = False
            if not to and len(paths) == 1:
                # if we get a remote name without --to, provide a hint
                sr = ds_repo.get_remotes(**get_remote_kwargs)
                potential_remote = [p for p in ensure_list(path) if p in sr]
            if potential_remote:
                hint = "{} matches a sibling name and not a path. " \
                      "Forgot --to?".format(potential_remote)
                yield dict(
                    res_kwargs,
                    status='notneeded',
                    message=hint,
                    hints=hint,
                    type='dataset',
                    path=ds.path,
                )
                # there's no matching path and we have generated a hint on
                # fixing the call - we can return now
                return
            yield dict(
                res_kwargs,
                status='notneeded',
                message=
                'Given constraints did not match any changes to publish',
                type='dataset',
                path=ds.path,
            )
Example No. 9
    def __call__(path=None,
                 *,
                 dataset=None,
                 state='any',
                 fulfilled=NoneDeprecated,
                 recursive=False,
                 recursion_limit=None,
                 contains=None,
                 bottomup=False,
                 set_property=None,
                 delete_property=None):
        if fulfilled is not NoneDeprecated:
            # the two mirror options do not agree and the deprecated one is
            # not at default value
            warnings.warn(
                "subdatasets's `fulfilled` option is deprecated "
                "and will be removed in a future release, "
                "use the `state` option instead.", DeprecationWarning)
            if state != 'any':
                raise ValueError(
                    "Do not specify both 'fulfilled' and 'state', use 'state'")
            # honor the old option for now
            state = {
                None: 'any',
                True: 'present',
                False: 'absent',
            }[fulfilled]
        # Path of least resistance/code-change - internally we will keep using `fulfilled`
        fulfilled = {
            'any': None,
            'present': True,
            'absent': False,
        }[state]
        ds = require_dataset(dataset,
                             check_installed=True,
                             purpose='report on subdataset(s)')

        paths = resolve_path(ensure_list(path), dataset, ds) if path else None

        # no constraints given -> query subdatasets under curdir
        if not paths and dataset is None:
            cwd = Path(getpwd())
            paths = None if cwd == ds.pathobj else [cwd]

        lgr.debug('Query subdatasets of %s', dataset)
        if paths is not None:
            lgr.debug('Query subdatasets underneath paths: %s', paths)
        refds_path = ds.path

        # return as quickly as possible
        if isinstance(recursion_limit, int) and (recursion_limit <= 0):
            return

        if set_property:
            for k, v in set_property:
                if valid_key.match(k) is None:
                    raise ValueError(
                        "key '%s' is invalid (alphanumeric plus '-' only, must "
                        "start with a letter)" % k)
        if contains:
            contains = resolve_path(ensure_list(contains), dataset, ds)
            # expand all test cases for the contains test in the loop below
            # leads to ~20% speedup per loop iteration of a non-match
            expanded_contains = [[c] + list(c.parents) for c in contains]
        else:
            expanded_contains = []
        contains_hits = set()
        for r in _get_submodules(ds, paths, fulfilled, recursive,
                                 recursion_limit, expanded_contains, bottomup,
                                 set_property, delete_property, refds_path):
            # a boat-load of ancient code consumes this and is ignorant of
            # Path objects
            r['path'] = str(r['path'])
            # without the refds_path cannot be rendered/converted relative
            # in the eval_results decorator
            r['refds'] = refds_path
            if 'contains' in r:
                contains_hits.update(r['contains'])
                r['contains'] = [str(c) for c in r['contains']]
            yield r
        if contains:
            for c in set(contains).difference(contains_hits):
                yield get_status_dict(
                    'subdataset',
                    path=str(c),
                    status='impossible',
                    message='path not contained in any matching subdataset',
                    # we do not want to log such an event, because it is a
                    # legit query to check for matching subdatasets simply
                    # for the purpose of further decision making
                    # user communication in front-end scenarios will happen
                    # via result rendering
                    #logger=lgr
                )
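The `expanded_contains` precomputation can be illustrated with plain pathlib (made-up path, no datalad required):

# Plain-pathlib illustration of the `expanded_contains` precomputation (made-up path).
from pathlib import Path

contains = [Path('/ds/sub/deep/file.dat')]
expanded_contains = [[c] + list(c.parents) for c in contains]
# expanded_contains[0] == [Path('/ds/sub/deep/file.dat'), Path('/ds/sub/deep'),
#                          Path('/ds/sub'), Path('/ds'), Path('/')]
# a subdataset path can now be matched against this precomputed chain instead of
# recomputing `.parents` for every candidate inside the loop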
Example No. 10
    def __call__(path=None,
                 dataset=None,
                 fulfilled=None,
                 recursive=False,
                 recursion_limit=None,
                 contains=None,
                 bottomup=False,
                 set_property=None,
                 delete_property=None):
        ds = require_dataset(dataset,
                             check_installed=True,
                             purpose='subdataset reporting/modification')

        paths = resolve_path(ensure_list(path), dataset, ds) if path else None

        # no constraints given -> query subdatasets under curdir
        if not paths and dataset is None:
            cwd = Path(getpwd())
            paths = None if cwd == ds.pathobj else [cwd]

        lgr.debug('Query subdatasets of %s', dataset)
        if paths is not None:
            lgr.debug('Query subdatasets underneath paths: %s', paths)
        refds_path = ds.path

        # return as quickly as possible
        if isinstance(recursion_limit, int) and (recursion_limit <= 0):
            return

        if set_property:
            for k, v in set_property:
                if valid_key.match(k) is None:
                    raise ValueError(
                        "key '%s' is invalid (alphanumeric plus '-' only, must "
                        "start with a letter)" % k)
        if contains:
            contains = resolve_path(ensure_list(contains), dataset, ds)
            # expand all test cases for the contains test in the loop below
            # leads to ~20% speedup per loop iteration of a non-match
            expanded_contains = [[c] + list(c.parents) for c in contains]
        else:
            expanded_contains = []
        contains_hits = set()
        for r in _get_submodules(ds, paths, fulfilled, recursive,
                                 recursion_limit, expanded_contains, bottomup,
                                 set_property, delete_property, refds_path):
            # a boat-load of ancient code consumes this and is ignorant of
            # Path objects
            r['path'] = str(r['path'])
            # without the refds_path cannot be rendered/converted relative
            # in the eval_results decorator
            r['refds'] = refds_path
            if 'contains' in r:
                contains_hits.update(r['contains'])
                r['contains'] = [str(c) for c in r['contains']]
            yield r
        if contains:
            for c in set(contains).difference(contains_hits):
                yield get_status_dict(
                    'subdataset',
                    path=str(c),
                    status='impossible',
                    message='path not contained in any matching subdataset',
                    # we do not want to log such an event, because it is a
                    # legit query to check for matching subdatasets simply
                    # for the purpose of further decision making
                    # user communication in front-end scenarios will happen
                    # via result rendering
                    #logger=lgr
                )
Example No. 11
    def __call__(
            path=None,
            dataset=None,
            recursive=False,
            recursion_limit=None,
            action=None,
            unavailable_path_status='',
            unavailable_path_msg=None,
            nondataset_path_status='error',
            force_parentds_discovery=True,
            force_subds_discovery=True,
            force_no_revision_change_discovery=True,
            force_untracked_discovery=True,
            modified=None):
        # upfront check for the fastest possible response
        if not path and dataset is None:
            # nothing given, try "here", but do not use `require_dataset`, as
            # it will determine the root dataset of `curdir` and further down
            # lead to path annotation of upstairs directories
            dataset = curdir

        if force_subds_discovery and not force_parentds_discovery:
            raise ValueError(
                'subdataset discovery requires parent dataset discovery')

        # CONCEPT: yield with no status to indicate further processing

        # everything in one big loop to be able to yield as fast as possible
        # without any precomputing for all paths
        refds_path = Interface.get_refds_path(dataset)
        if modified is not None and (refds_path is None or not GitRepo.is_valid_repo(refds_path)):
            raise ValueError(
                "modification detection only works with a base dataset "
                "(none was given or found)")

        # prep common result props
        res_kwargs = dict(
            action=action if action else 'annotate_path',
            refds=refds_path,
            logger=lgr)

        # handle the case of recursion into a single dataset without any
        # extra fancy processing first -- full recursion can be done
        # faster than manual recursion, hence we gain quite some speed
        # from these few lines of extra code
        if not modified and not path and refds_path:
            if not GitRepo.is_valid_repo(refds_path):
                yield get_status_dict(
                    # doesn't matter if the path is in another dataset
                    # it was given as reference dataset
                    status=nondataset_path_status,
                    message='given reference dataset is not a dataset',
                    path=refds_path,
                    **res_kwargs)
                return

            refds = Dataset(refds_path)
            path = []
            # yield the dataset itself
            r = get_status_dict(ds=refds, status='', **res_kwargs)
            yield r

            if recursive:
                # if we have nothing given, but need recursion, we need to feed
                # the dataset path itself
                for r in yield_recursive(
                        refds,
                        refds_path,
                        action,
                        recursion_limit):
                    r.update(res_kwargs)
                    if 'refds' in r and not r['refds']:
                        # avoid cruft
                        del r['refds']
                    yield r
            return

        # goal: structure in a way that makes most information on any path
        # available in a single pass, at the cheapest possible cost
        reported_paths = {}
        requested_paths = assure_list(path)

        if modified is not None:
            # modification detection would silently kill all nondataset paths
            # but we have to complain about them, hence doing it here
            if requested_paths and refds_path:
                for r in requested_paths:
                    p = r['path'] if isinstance(r, dict) else r
                    p = resolve_path(p, ds=refds_path)
                    if _with_sep(p).startswith(_with_sep(refds_path)):
                        # all good
                        continue
                    # not the refds
                    path_props = r if isinstance(r, dict) else {}
                    res = get_status_dict(
                        **dict(res_kwargs, **path_props))
                    res['status'] = nondataset_path_status
                    res['message'] = 'path not associated with reference dataset'
                    reported_paths[r] = res
                    yield res

            # preserve non-existing paths to be silently killed by modification
            # detection and append them to requested_paths again after detection.
            # TODO: This might be merged with the treatment of non-dataset paths
            # above. Re-appending those paths seems to be better than yielding
            # directly to avoid code duplication, since both cases later on are
            # dealt with again.
            preserved_paths = []
            if requested_paths:
                preserved_paths = [
                    r for r in requested_paths
                    if not lexists(r['path'] if isinstance(r, dict) else r)]

            # replace the requested paths by those paths that were actually
            # modified underneath or at a requested location
            requested_paths = get_modified_subpaths(
                # either the request, or the base dataset, if there was no request
                requested_paths if requested_paths else [refds_path],
                refds=Dataset(refds_path),
                revision=modified,
                report_no_revision_change=force_no_revision_change_discovery,
                report_untracked='all' if force_untracked_discovery else 'no',
                recursion_limit=recursion_limit)

            from itertools import chain
            # re-append the preserved paths:
            requested_paths = chain(requested_paths, iter(preserved_paths))

        # do not loop over unique(), this could be a list of dicts
        # we avoid duplicates manually below via `reported_paths`
        for path in requested_paths:
            if not isinstance(path, dict):
                path = rawpath2ap(path, refds_path)
            # this is now an annotated path!
            path_props = path
            path = path['path']
            # we need to mark our territory, who knows where this has been
            path_props.update(res_kwargs)

            if path in reported_paths:
                # we already recorded this path in the output
                # this can happen, whenever `path` is a subdataset, that was
                # discovered via recursive processing of another path before
                continue
            # the path exists in some shape or form
            # TODO if we have path_props already we could skip this test
            if isdir(path):
                # keep any existing type info, previously a more expensive run
                # could have discovered an uninstalled 'dataset', and we don't
                # want it to be relabeled to a directory
                path_props['type'] = \
                    path_props.get(
                        'type',
                        'dataset' if GitRepo.is_valid_repo(path) else 'directory')
                # this could contain all types of additional content
                containing_dir = path
            else:
                if lexists(path):
                    path_props['type'] = 'file'
                else:
                    path_props['state'] = 'absent'
                # for everything else we are interested in the container
                containing_dir = dirname(path)
                if not containing_dir:
                    containing_dir = curdir

            dspath = parent = get_dataset_root(containing_dir)
            if dspath:
                if path_props.get('type', None) == 'dataset':
                    # for a dataset the root is not the parent, for anything else
                    # it is
                    parent = path_props.get('parentds', None)
                    oneupdir = normpath(opj(containing_dir, pardir))
                    if parent is None and (force_parentds_discovery or (
                            refds_path and _with_sep(oneupdir).startswith(
                                _with_sep(refds_path)))):
                        # either forced, or only if we have a reference dataset, and
                        # only if we stay within this refds when searching for the
                        # parent
                        parent = get_dataset_root(normpath(opj(containing_dir, pardir)))
                        # NOTE the `and refds_path` is critical, as it will determine
                        # whether a top-level dataset that was discovered gets the
                        # parent property or not, it won't get it without a common
                        # base dataset, and that is how we always rolled
                    if parent and refds_path:
                        path_props['parentds'] = parent
                        # don't check whether this is actually a true subdataset of the
                        # parent, done further down
                else:
                    # set parent, but prefer existing property
                    path_props['parentds'] = path_props.get('parentds', dspath)

            # test for `dspath` not `parent`, we only need to know whether there is
            # ANY dataset, not which one is the true parent, logic below relies on
            # the fact that we end here, if there is no dataset at all
            if not dspath:
                # not in any dataset
                res = get_status_dict(
                    **dict(res_kwargs, **path_props))
                res['status'] = nondataset_path_status
                res['message'] = 'path not associated with any dataset'
                reported_paths[path] = res
                yield res
                continue

            # check that we only got SUBdatasets
            if refds_path and not _with_sep(dspath).startswith(_with_sep(refds_path)):
                res = get_status_dict(**dict(res_kwargs, **path_props))
                res['status'] = nondataset_path_status
                res['message'] = \
                    ('path not part of the reference dataset at %s', refds_path)
                reported_paths[path] = res
                yield res
                continue

            if path_props.get('type', None) == 'file':
                # nothing else we can learn about this
                res = get_status_dict(**dict(res_kwargs, **path_props))
                if 'status' not in res:
                    res['status'] = ''
                reported_paths[path] = res
                yield res
                continue

            containing_ds = None
            path_type = path_props.get('type', None)
            if parent and force_subds_discovery and (
                    (path_type == 'dataset' and 'registered_subds' not in path_props) or
                    path_type == 'directory' or
                    not lexists(path)):
                # if the path doesn't exist, is labeled a directory, or is a dataset
                # without this info yet -> record whether this is a known subdataset
                # of its parent
                containing_ds = Dataset(parent)
                subdss = containing_ds.subdatasets(
                    fulfilled=None, recursive=False,
                    result_xfm=None, result_filter=None, return_type='list')
                if path in [s['path'] for s in subdss]:
                    if path_type == 'directory' or not lexists(path):
                        # first record that it isn't here, if just a dir or not here at all
                        path_props['state'] = 'absent'
                    # this must be a directory, and it is not installed
                    path_props['type'] = 'dataset'
                    path_props['registered_subds'] = True

            if not lexists(path) or \
                    (path_props.get('type', None) == 'dataset' and
                     path_props.get('state', None) == 'absent'):
                # not there (yet)
                message = unavailable_path_msg if unavailable_path_msg else None
                if message and '%s' in message:
                    message = (message, path)
                path_props['message'] = message
                res = get_status_dict(**dict(res_kwargs, **path_props))
                # assign given status, but only if the props don't indicate a status
                # already
                res['status'] = path_props.get(
                    'status', unavailable_path_status)
                reported_paths[path] = res
                yield res
                continue

            # we know everything we can, report
            res = get_status_dict(**dict(res_kwargs, **path_props))
            if 'status' not in res:
                res['status'] = ''
            reported_paths[path] = res
            yield res

            rec_paths = []
            if recursive:
                # here we need to consider the special case that `path` is
                # a dataset itself, if a recursion_limit is given (e.g.
                # `remove` will do that by default), we need to recurse
                # from the dataset itself, and not its parent to get things
                # right -- this will also avoid needless discovery of
                # unrelated subdatasets
                if path_props.get('type', None) == 'dataset':
                    containing_ds = Dataset(path)
                else:
                    # regular parent, we might have a dataset already
                    containing_ds = Dataset(parent) if containing_ds is None else containing_ds
                for r in yield_recursive(containing_ds, path, action, recursion_limit):
                    # capture reported paths
                    r.update(res_kwargs)
                    if 'refds' in r and not r['refds']:
                        # avoid cruft
                        del r['refds']
                    reported_paths[r['path']] = r
                    if modified is not None:
                        # we cannot yield right away, maybe it wasn't modified
                        rec_paths.append(r)
                    else:
                        yield r
            if modified is not None and rec_paths:
                # replace the recursively discovered paths by those paths that
                # were actually modified underneath or at a requested location
                for r in get_modified_subpaths(
                        rec_paths,
                        refds=Dataset(refds_path),
                        revision=modified,
                        report_no_revision_change=force_no_revision_change_discovery,
                        report_untracked='all' if force_untracked_discovery else 'no',
                        recursion_limit=recursion_limit):
                    res = get_status_dict(**dict(r, **res_kwargs))
                    reported_paths[res['path']] = res
                    yield res
        return
Example No. 12
    def __call__(
            path=None,
            dataset=None,
            recursive=False,
            target_dir=None,
            specs_from=None,
            message=None):
        # Concept
        #
        # Loosely model after the POSIX cp command
        #
        # 1. Determine the target of the copy operation, and its associated
        #    dataset
        #
        # 2. for each source: determine source dataset, query for metadata, put
        #    into target dataset
        #
        # Instead of sifting and sorting through input args, process them one
        # by one sequentially. Utilize lookup caching to make things faster,
        # instead of making the procedure itself more complicated.

        if path and specs_from:
            raise ValueError(
                "Path argument(s) AND a specs-from specified, "
                "this is not supported.")

        ds = None
        if dataset:
            ds = require_dataset(dataset, check_installed=True,
                                 purpose='copy into')

        if target_dir:
            target_dir = resolve_path(target_dir, dataset)

        if path:
            # turn into list of absolute paths
            paths = [resolve_path(p, dataset) for p in ensure_list(path)]

            # we already checked that there are no specs_from
            if not target_dir:
                if len(paths) == 1:
                    if not ds:
                        raise ValueError("No target directory was given.")
                    # we can keep target_dir unset and need not manipulate
                    # paths, this is all done in a generic fashion below
                elif len(paths) == 2:
                    # single source+dest combo
                    if paths[-1].is_dir():
                        # check if we need to set target_dir, in case dest
                        # is a dir
                        target_dir = paths.pop(-1)
                    else:
                        specs_from = [paths]
                else:
                    target_dir = paths.pop(-1)

            if not specs_from:
                # in all other cases we have a plain source list
                specs_from = paths

        if not specs_from:
            raise ValueError("Neither `paths` nor `specs_from` given.")

        if target_dir:
            if ".git" in target_dir.parts:
                raise ValueError(
                    "Target directory should not contain a .git directory: {}"
                    .format(target_dir))
        elif ds:
            # no specific target set, but we have to write into a dataset,
            # and one was given. It seems to make sense to use this dataset
            # as a target. It is already the reference for any path resolution.
            # Any explicitly given destination will take precedence over
            # a general target_dir setting nevertheless.
            target_dir = ds.pathobj

        res_kwargs = dict(
            action='copy_file',
            logger=lgr,
        )

        # lookup cache for dir to repo mappings, and as a DB for cleaning
        # things up
        repo_cache = {}
        # which paths to pass on to save
        to_save = []
        try:
            for src_path, dest_path in _yield_specs(specs_from):
                src_path = Path(src_path)
                dest_path = None \
                    if dest_path is None \
                    else resolve_path(dest_path, dataset)
                lgr.debug('Processing copy specification: %s -> %s',
                          src_path, dest_path)

                # Some checks, first impossibility "wins"
                msg_impossible = None
                if not recursive and src_path.is_dir():
                    msg_impossible = 'recursion not enabled, omitting directory'
                elif (dest_path and dest_path.name == '.git') \
                        or src_path.name == '.git':
                    msg_impossible = \
                        "refuse to place '.git' into destination dataset"
                elif not (dest_path or target_dir):
                    msg_impossible = 'need destination path or target directory'

                if msg_impossible:
                    yield dict(
                        path=str(src_path),
                        status='impossible',
                        message=msg_impossible,
                        **res_kwargs
                    )
                    continue

                for src_file, dest_file in _yield_src_dest_filepaths(
                        src_path, dest_path, target_dir=target_dir):
                    if ds and ds.pathobj not in dest_file.parents:
                        # take time to compose proper error
                        dpath = str(target_dir if target_dir else dest_path)
                        yield dict(
                            path=dpath,
                            status='error',
                            message=(
                                'reference dataset does not contain '
                                'destination path: %s',
                                dpath),
                            **res_kwargs
                        )
                        # only recursion could yield further results, which would
                        # all have the same issue, so call it over right here
                        break
                    for res in _copy_file(src_file, dest_file, cache=repo_cache):
                        yield dict(
                            res,
                            **res_kwargs
                        )
                        if res.get('status', None) == 'ok':
                            to_save.append(res['destination'])
        finally:
            # cleanup time
            # TODO this could also be the place to stop lingering batch processes
            _cleanup_cache(repo_cache)

        if not (ds and to_save):
            # nothing left to do
            return

        yield from ds.save(
            path=to_save,
            # we provide an explicit file list
            recursive=False,
            message=message,
        )
Example No. 13
    def __call__(
            archive,
            *,
            dataset=None,
            annex=None,
            add_archive_leading_dir=False,
            strip_leading_dirs=False,
            leading_dirs_depth=None,
            leading_dirs_consider=None,
            use_current_dir=False,
            delete=False,
            key=False,
            exclude=None,
            rename=None,
            existing='fail',
            annex_options=None,
            copy=False,
            commit=True,
            allow_dirty=False,
            stats=None,
            drop_after=False,
            delete_after=False):

        if exclude:
            exclude = ensure_tuple_or_list(exclude)
        if rename:
            rename = ensure_tuple_or_list(rename)
        ds = require_dataset(dataset,
                             check_installed=True,
                             purpose='add-archive-content')

        # set up common params for result records
        res_kwargs = {
            'action': 'add-archive-content',
            'logger': lgr,
        }

        if not isinstance(ds.repo, AnnexRepo):
            yield get_status_dict(
                ds=ds,
                status='impossible',
                message="Can't operate in a pure Git repository",
                **res_kwargs
            )
            return
        if annex:
            warnings.warn(
                "datalad add_archive_content's `annex` parameter is "
                "deprecated and will be removed in a future release. "
                "Use the 'dataset' parameter instead.",
                DeprecationWarning)
        annex = ds.repo
        # resolve the archive path against the dataset
        archive_path = resolve_path(archive, ds=dataset)
        # let Status decide whether we can act on the given file
        for s in ds.status(
                path=archive_path,
                on_failure='ignore',
                result_renderer='disabled'):
            if s['status'] == 'error':
                if 'path not underneath the reference dataset %s' in s['message']:
                    yield get_status_dict(
                        ds=ds,
                        status='impossible',
                        message='Can not add archive outside of the dataset',
                        **res_kwargs)
                    return
                # status errored & we haven't anticipated the cause. Bubble up
                yield s
                return
            elif s['state'] == 'untracked':
                # we can't act on an untracked file
                message = (
                    "Can not add an untracked archive. "
                    "Run 'datalad save {}'".format(archive)
                )
                yield get_status_dict(
                           ds=ds,
                           status='impossible',
                           message=message,
                           **res_kwargs)
                return

        if not allow_dirty and annex.dirty:
            # error out here if the dataset contains untracked changes
            yield get_status_dict(
                ds=ds,
                status='impossible',
                message=(
                    'clean dataset required. '
                    'Use `datalad status` to inspect unsaved changes'),
                **res_kwargs
            )
            return

        # ensure the archive exists, status doesn't error on a non-existing file
        if not key and not lexists(archive_path):
            yield get_status_dict(
                ds=ds,
                status='impossible',
                message=(
                    'No such file: {}'.format(archive_path),
                ),
                **res_kwargs
            )
            return

        if not key:
            check_path = archive_path.relative_to(ds.pathobj)
            # TODO: support adding archives content from outside the annex/repo
            origin = 'archive'
            # can become get_file_annexinfo once #6104 is merged
            key = annex.get_file_annexinfo(check_path)['key']
            if not key:
                raise RuntimeError(
                    f"Archive must be an annexed file in {ds}")
            archive_dir = Path(archive_path).parent
        else:
            origin = 'key'
            key = archive
            # We must not have anything to do with the location under .git/annex
            archive_dir = None
            # instead, we will go from the current directory
            use_current_dir = True

        archive_basename = file_basename(archive)

        if not key:
            # if we didn't manage to get a key, the file must be in Git
            raise NotImplementedError(
                "Provided file %s does not seem to be under annex control. "
                "We don't support adding everything straight to Git" % archive
            )

        # figure out our location
        pwd = getpwd()
        # are we in a subdirectory of the repository?
        pwd_in_root = annex.path == archive_dir
        # then we should add content under that subdirectory,
        # get the path relative to the repo top
        if use_current_dir:
            # extract the archive under the current directory, not the directory
            # where the archive is located
            extract_rpath = Path(pwd).relative_to(ds.path) \
                if not pwd_in_root \
                else None
        else:
            extract_rpath = archive_dir.relative_to(ds.path)

        # relpath might return '.' as the relative path to curdir, which then normalize_paths
        # would take as instructions to really go from cwd, so we need to sanitize
        if extract_rpath == curdir:
            extract_rpath = None

        try:
            key_rpath = annex.get_contentlocation(key)
        except Exception:
            # the only probable reason for this to fail is that there is no
            # content present
            raise RuntimeError(
                "Content of %s seems to be N/A.  Fetch it first" % key
            )

        # now we simply need to go through every file in that archive and
        lgr.info(
            "Adding content of the archive %s into annex %s", archive, annex
        )

        from datalad.customremotes.archives import ArchiveAnnexCustomRemote

        # TODO: shouldn't we be able just to pass existing AnnexRepo instance?
        # TODO: we will use persistent cache so we could just (ab)use possibly extracted archive
        # OK, let's ignore that the following class is actually a special
        # remote implementation, and use it only to work with its cache
        annexarchive = ArchiveAnnexCustomRemote(annex=None,
                                                path=annex.path,
                                                persistent_cache=True)
        # We will move extracted content, so it must not exist prior to running
        annexarchive.cache.allow_existing = True
        earchive = annexarchive.cache[key_rpath]
        # make sure there is an enabled datalad-archives special remote
        ensure_datalad_remote(ds.repo, remote=ARCHIVES_SPECIAL_REMOTE,
                              autoenable=True)

        precommitted = False
        old_always_commit = annex.always_commit
        # batch mode is disabled when faking dates, we want to always commit
        annex.always_commit = annex.fake_dates_enabled
        if annex_options:
            if isinstance(annex_options, str):
                annex_options = split_cmdline(annex_options)
        delete_after_rpath = None

        prefix_dir = basename(tempfile.mkdtemp(prefix=".datalad",
                                               dir=annex.path)) \
            if delete_after \
            else None

        # dedicated stats which would be added to passed in (if any)
        outside_stats = stats
        stats = ActivityStats()

        try:
            # keep track of extracted files for progress bar logging
            file_counter = 0
            # iterate over all files in the archive
            extracted_files = list(earchive.get_extracted_files())
            # start a progress bar for extraction
            pbar_id = f'add-archive-{archive_path}'
            log_progress(
                lgr.info, pbar_id, 'Extracting archive',
                label="Extracting archive",
                unit=' Files',
                total=len(extracted_files),
                noninteractive_level=logging.INFO)
            for extracted_file in extracted_files:
                file_counter += 1
                files_left = len(extracted_files) - file_counter
                log_progress(
                    lgr.info, pbar_id,
                    "Files to extract %i ", files_left,
                    update=1,
                    increment=True,
                    noninteractive_level=logging.DEBUG)
                stats.files += 1
                extracted_path = Path(earchive.path) / Path(extracted_file)

                if extracted_path.is_symlink():
                    link_path = str(extracted_path.resolve())
                    if not exists(link_path):
                        # TODO: config  addarchive.symlink-broken='skip'
                        lgr.warning(
                            "Path %s points to non-existing file %s",
                            extracted_path, link_path)
                        stats.skipped += 1
                        continue
                        # TODO: check if points outside of archive - warn & skip

                url = annexarchive.get_file_url(
                    archive_key=key,
                    file=extracted_file,
                    size=os.stat(extracted_path).st_size)

                # preliminary target name which might get modified by renames
                target_file_orig = target_file = Path(extracted_file)

                # stream archives would not have had the original filename
                # information in them, so would be extracted under a name
                # derived from their annex key.
                # Provide ad-hoc handling for such cases
                if (len(extracted_files) == 1 and
                    Path(archive).suffix in ('.xz', '.gz', '.lzma') and
                        Path(key_rpath).name.startswith(Path(
                            extracted_file).name)):
                    # take archive's name without extension for filename & place
                    # where it was originally extracted
                    target_file = \
                        Path(extracted_file).parent / Path(archive).stem

                if strip_leading_dirs:
                    leading_dir = earchive.get_leading_directory(
                        depth=leading_dirs_depth, exclude=exclude,
                        consider=leading_dirs_consider)
                    leading_dir_len = \
                        len(leading_dir) + len(opsep) if leading_dir else 0
                    target_file = str(target_file)[leading_dir_len:]

                if add_archive_leading_dir:
                    # place extracted content under a directory corresponding to
                    # the archive name with suffix stripped.
                    target_file = Path(archive_basename) / target_file

                if rename:
                    target_file = apply_replacement_rules(rename,
                                                          str(target_file))

                # continue to next iteration if extracted_file in excluded
                if exclude:
                    try:  # since we need to skip outside loop from inside loop
                        for regexp in exclude:
                            if re.search(regexp, extracted_file):
                                lgr.debug(
                                    "Skipping {extracted_file} since contains "
                                    "{regexp} pattern".format(**locals()))
                                stats.skipped += 1
                                raise StopIteration
                    except StopIteration:
                        continue

                if delete_after:
                    # place target file in a temporary directory
                    target_file = Path(prefix_dir) / Path(target_file)
                    # but also allow for it in the orig
                    target_file_orig = Path(prefix_dir) / Path(target_file_orig)

                target_file_path_orig = annex.pathobj / target_file_orig

                # If we were invoked in a subdirectory, patch together the
                # correct path
                target_file_path = extract_rpath / target_file \
                    if extract_rpath else target_file
                target_file_path = annex.pathobj / target_file_path

                # when the file already exists...
                if lexists(target_file_path):
                    handle_existing = True
                    if md5sum(str(target_file_path)) == \
                            md5sum(str(extracted_path)):
                        if not annex.is_under_annex(str(extracted_path)):
                            # if under annex -- must be having the same content,
                            # we should just add possibly a new extra URL
                            # but if under git -- we cannot/should not do
                            # anything about it ATM
                            if existing != 'overwrite':
                                continue
                        else:
                            handle_existing = False
                    if not handle_existing:
                        pass  # nothing... just to avoid additional indentation
                    elif existing == 'fail':
                        message = \
                            "{} exists, but would be overwritten by new file " \
                            "{}. Consider adjusting --existing".format(
                                target_file_path, extracted_file)
                        yield get_status_dict(
                            ds=ds,
                            status='error',
                            message=message,
                            **res_kwargs)
                        return
                    elif existing == 'overwrite':
                        stats.overwritten += 1
                        # to make sure it doesn't conflict -- might have been a
                        # tree
                        rmtree(target_file_path)
                    else:
                        # an elaborate dance to piece together new archive names
                        target_file_path_orig_ = target_file_path

                        # To keep extension intact -- operate on the base of the
                        # filename
                        p, fn = os.path.split(target_file_path)
                        ends_with_dot = fn.endswith('.')
                        fn_base, fn_ext = file_basename(fn, return_ext=True)

                        if existing == 'archive-suffix':
                            fn_base += '-%s' % archive_basename
                        elif existing == 'numeric-suffix':
                            pass  # archive-suffix will have the same logic
                        else:
                            # we shouldn't get here, argparse should catch a
                            # non-existing value for --existing right away
                            raise ValueError(existing)
                        # keep incrementing index in the suffix until file
                        # doesn't collide
                        suf, i = '', 0
                        while True:
                            connector = \
                                ('.' if (fn_ext or ends_with_dot) else '')
                            file = fn_base + suf + connector + fn_ext
                            target_file_path_new =  \
                                Path(p) / Path(file)
                            if not lexists(target_file_path_new):
                                # we found a file name that is not yet taken
                                break
                            lgr.debug("Iteration %i of file name finding. "
                                      "File %s already exists", i,
                                      target_file_path_new)
                            i += 1
                            suf = '.%d' % i
                        target_file_path = target_file_path_new
                        lgr.debug("Original file %s will be saved into %s"
                                  % (target_file_path_orig_, target_file_path))
                        # TODO: should we reserve smth like
                        # stats.clobbed += 1

                if target_file_path != target_file_path_orig:
                    stats.renamed += 1

                if copy:
                    raise NotImplementedError(
                        "Not yet copying from 'persistent' cache"
                    )

                lgr.debug("Adding %s to annex pointing to %s and with options "
                          "%r", target_file_path, url, annex_options)

                out_json = annex.add_url_to_file(
                    target_file_path,
                    url, options=annex_options,
                    batch=True)

                if 'key' in out_json and out_json['key'] is not None:
                    # annex.is_under_annex(target_file, batch=True):
                    # due to http://git-annex.branchable.com/bugs/annex_drop_is_not___34__in_effect__34___for_load_which_was___34__addurl_--batch__34__ed_but_not_yet_committed/?updated
                    # we need to maintain a list of those to be dropped files
                    if drop_after:
                        # drop extracted files after adding to annex
                        annex.drop_key(out_json['key'], batch=True)
                        stats.dropped += 1
                    stats.add_annex += 1
                else:
                    lgr.debug("File {} was added to git, not adding url".format(
                        target_file_path))
                    stats.add_git += 1

                if delete_after:
                    # we count the removal here, but don't yet perform it
                    # to not interfere with batched processes - any pure Git
                    # action invokes precommit which closes batched processes.
                    stats.removed += 1

                # Done with target_file -- just to have clear end of the loop
                del target_file

            if delete and archive and origin != 'key':
                lgr.debug("Removing the original archive {}".format(archive))
                # force=True since it might sometimes still be staged and fail
                annex.remove(str(archive_path), force=True)

            lgr.info("Finished adding %s: %s", archive, stats.as_str(mode='line'))

            if outside_stats:
                outside_stats += stats
            if delete_after:
                # force since not committed. r=True for -r (passed into git call
                # to recurse)
                delete_after_rpath = opj(extract_rpath, prefix_dir) \
                    if extract_rpath else prefix_dir
                delete_after_rpath = resolve_path(delete_after_rpath,
                                                  ds=dataset)
                lgr.debug(
                    "Removing extracted and annexed files under %s",
                    delete_after_rpath
                )
                annex.remove(str(delete_after_rpath), r=True, force=True)
            if commit:
                archive_rpath = archive_path.relative_to(ds.path)
                commit_stats = outside_stats if outside_stats else stats
                # so batched ones close and files become annex symlinks etc
                annex.precommit()
                precommitted = True
                if any(r.get('state', None) != 'clean'
                       for p, r in annex.status(untracked='no').items()):
                    annex.commit(
                        "Added content extracted from %s %s\n\n%s" %
                        (origin, archive_rpath,
                         commit_stats.as_str(mode='full')),
                        _datalad_msg=True
                    )
                    commit_stats.reset()
            else:
                # don't commit upon completion
                pass
        finally:
            # take down the progress bar
            log_progress(
                lgr.info, pbar_id,
                'Finished extraction',
                noninteractive_level=logging.INFO)
            # since we batched addurl, we should close those batched processes
            # if haven't done yet.  explicitly checked to avoid any possible
            # "double-action"
            if not precommitted:
                annex.precommit()

            if delete_after_rpath:
                delete_after_path = opj(annex.path, delete_after_rpath)
                delete_after_rpath = resolve_path(delete_after_rpath,
                                                  ds=dataset)
                if exists(delete_after_path):  # should not be there
                    # but for paranoid yoh
                    lgr.warning(
                        "Removing temporary directory under which extracted "
                        "files were annexed and should have been removed: %s",
                        delete_after_path)
                    rmtree(delete_after_path)

            annex.always_commit = old_always_commit
            # remove what is left and/or everything upon failure
            earchive.clean(force=True)
            # remove tempfile directories (not cleaned up automatically):
            if prefix_dir is not None and lexists(prefix_dir):
                os.rmdir(prefix_dir)
        yield get_status_dict(
            ds=ds,
            status='ok',
            **res_kwargs)
        return annex
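
The naming loop above (try the original name, then append `.1`, `.2`, ... before the extension until an unused name turns up) is a reusable pattern. Below is a simplified, self-contained sketch of the same idea using only the standard library; the directory and file names are made up, and the special dot handling of the code above is left out.

import os.path as op


def find_free_name(directory, filename):
    """Return a name under `directory` that does not exist yet.

    'data.csv' -> 'data.1.csv', 'data.2.csv', ... until a free name is found.
    """
    base, ext = op.splitext(filename)
    candidate = filename
    i = 0
    while op.lexists(op.join(directory, candidate)):
        i += 1
        candidate = '{}.{}{}'.format(base, i, ext)
    return candidate


# with 'data.csv' and 'data.1.csv' already present in '/tmp/demo',
# find_free_name('/tmp/demo', 'data.csv') would return 'data.2.csv'
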
Example No. 14
    def __call__(dataset=None, path=None, data_only=True, recursive=False):

        # Note: copy logic from install to resolve dataset and path:
        # shortcut
        ds = dataset

        if ds is not None and not isinstance(ds, Dataset):
            ds = Dataset(ds)

        if not path:
            if ds is None:
                # no dataset, no target location, nothing to do
                raise ValueError(
                    "insufficient information for uninstallation (needs at "
                    "least a dataset or a path")
        elif isinstance(path, list):
            # TODO: not sure. might be possible to deal with that list directly
            return [Uninstall.__call__(
                    dataset=ds,
                    path=p,
                    data_only=data_only,
                    recursive=recursive) for p in path]

        # resolve the target location against the provided dataset
        if path is not None:
            path = resolve_path(path, ds)

        lgr.debug("Resolved uninstallation target: {0}".format(path))

        # if we have no dataset given, figure out which one we need to operate
        # on, based on the resolved target location (that is now guaranteed to
        # be specified)
        if ds is None:
            # try to find a dataset at or above the installation target
            dspath = GitRepo.get_toppath(abspath(path))
            if dspath is None:
                # no top-level dataset found, use path as such
                dspath = path
            ds = Dataset(dspath)
        assert(ds is not None)

        lgr.debug("Resolved target dataset for uninstallation: {0}".format(ds))

        if not ds.is_installed():
            if not path or path == ds.path:
                # we want to uninstall the dataset itself, which is not
                # installed => nothing to do
                # TODO: consider `data` option! is_installed currently only
                # checks for a repository
                lgr.info("Dataset {0} not installed. Nothing to "
                         "do.".format(ds.path))
                return
            else:
                # we want to uninstall something from a not installed dataset
                # Doesn't make sense, does it? => fail
                raise ValueError("Dataset {0} is not installed.".format(ds.path))

        assert(ds.repo is not None)

        if not path or path == ds.path:
            # uninstall the dataset `ds`
            # TODO: what to consider?
            #   - whether it is a submodule of another dataset
            #   - `data_only` ?
            #   - `recursive`
            #   - what to return in what case (data_only)?
            raise NotImplementedError("TODO: Uninstall dataset %s" % ds.path)

        # needed by the logic below
        assert(isabs(path))

        # express the destination path relative to the root of this dataset
        relativepath = relpath(path, start=ds.path)
        if relativepath.startswith(pardir):
            raise ValueError("uninstallation path outside dataset")

        lgr.debug(
            "Resolved uninstallation target relative to dataset {0}: {1}".format(
                ds, relativepath))

        # figure out, what path actually is pointing to:
        if not exists(path):
            # nothing there, nothing to uninstall
            lgr.info("Nothing found to uninstall at %s" % path)
            return

        if relativepath in ds.get_dataset_handles(recursive=True):
            # it's a submodule
            # --recursive required or implied?
            raise NotImplementedError("TODO: uninstall submodule %s from "
                                      "dataset %s" % (relativepath, ds.path))

        if isdir(path):
            # don't know what to do yet
            # in git vs. untracked?
            # recursive?
            raise NotImplementedError("TODO: uninstall directory %s from "
                                      "dataset %s" % (path, ds.path))

        # we know, it's an existing file
        # initialize the flags evaluated below so the checks further down
        # cannot hit a NameError when only one branch sets them
        _file_in_git = False
        _untracked_or_within_submodule = False
        if isinstance(ds.repo, AnnexRepo):
            try:
                ds.repo.get_file_key(relativepath)
            except FileInGitError:
                # file directly in git
                _file_in_git = True

            except FileNotInAnnexError:
                # either an untracked file in this dataset, or something that
                # also actually exists in the file system but could be part of
                # a subdataset
                _untracked_or_within_submodule = True

            # it's an annexed file
            if data_only:
                ds.repo.annex_drop([path])
                return path
            else:
                raise NotImplementedError("TODO: fully uninstall file %s "
                                          "(annex) from dataset %s" %
                                          (path, ds.path))
        else:
            # plain git repo
            if relativepath in ds.repo.get_indexed_files():
                # file directly in git
                _file_in_git = True
            else:
                # either an untracked file in this dataset, or something that
                # also actually exists in the file system but could be part of
                # a subdataset
                _untracked_or_within_submodule = True


        if _file_in_git:
            if data_only:
                raise ValueError("%s is not a file handle. Removing its "
                                 "data only doesn't make sense." % path)
            else:
                return ds.repo.git_remove([relativepath])

        elif _untracked_or_within_submodule:
            subds = get_containing_subdataset(ds, relativepath)
            if ds.path != subds.path:
                # target path belongs to a subdataset, hand uninstallation
                # over to it
                return subds.uninstall(
                    path=relpath(path, start=subds.path),
                    data_only=data_only,
                    recursive=recursive)

            # this must be an untracked/existing something
            # it wasn't installed, so we cannot uninstall it
            raise ValueError("Cannot uninstall %s" % path)
Example No. 15
    def __call__(target, opts=None, dataset=None):
        # only non-bare repos have hashdirmixed, so require one
        ds = require_dataset(dataset,
                             check_installed=True,
                             purpose='ORA archive export')
        ds_repo = ds.repo

        # TODO remove once datalad 0.12rc7 or later is released
        if not hasattr(ds_repo, 'dot_git'):
            from datalad.support.gitrepo import GitRepo
            ds_repo.dot_git = ds_repo.pathobj / GitRepo.get_git_dir(ds_repo)

        annex_objs = ds_repo.dot_git / 'annex' / 'objects'

        archive = resolve_path(target, dataset)
        if archive.is_dir():
            archive = archive / 'archive.7z'
        else:
            archive.parent.mkdir(exist_ok=True, parents=True)

        if not opts:
            # uncompressed by default
            opts = ['-mx0']

        res_kwargs = dict(
            action="export-archive-ora",
            logger=lgr,
        )

        if not annex_objs.is_dir():
            yield get_status_dict(
                ds=ds,
                status='notneeded',
                message='no annex keys present',
                **res_kwargs,
            )
            return

        exportdir = ds_repo.dot_git / 'datalad' / 'tmp' / 'ora_archive'
        if exportdir.exists():
            yield get_status_dict(
                ds=ds,
                status='error',
                message=(
                    'export directory already exists, please remove first: %s',
                    str(exportdir)),
                **res_kwargs,
            )
            return

        keypaths = [
            k for k in annex_objs.glob(op.join('**', '*')) if k.is_file()
        ]

        log_progress(
            lgr.info,
            'oraarchiveexport',
            'Start ORA archive export %s',
            ds,
            total=len(keypaths),
            label='ORA archive export',
            unit=' Keys',
        )

        link_fx = os.link
        for keypath in keypaths:
            key = keypath.name
            hashdir = op.join(keypath.parts[-4], keypath.parts[-3])
            log_progress(lgr.info,
                         'oraarchiveexport',
                         'Export key %s to %s',
                         key,
                         hashdir,
                         update=1,
                         increment=True)
            keydir = exportdir / hashdir / key
            keydir.mkdir(parents=True, exist_ok=True)
            try:
                link_fx(str(keypath), str(keydir / key))
            except OSError:
                lgr.warning(
                    'No hard links supported at %s, will copy files instead',
                    str(keydir))
                # no hard links supported
                # switch function after first error
                link_fx = shutil.copyfile
                link_fx(str(keypath), str(keydir / key))

        log_progress(lgr.info, 'oraarchiveexport',
                     'Finished RIA archive export from %s', ds)
        try:
            subprocess.run(
                ['7z', 'u', str(archive), '.'] + opts,
                cwd=str(exportdir),
            )
            yield get_status_dict(path=str(archive),
                                  type='file',
                                  status='ok',
                                  **res_kwargs)
        except Exception as e:
            yield get_status_dict(path=str(archive),
                                  type='file',
                                  status='error',
                                  message=('7z failed: %s', exc_str(e)),
                                  **res_kwargs)
            return
        finally:
            rmtree(str(exportdir))
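
The export loop above hard-links every key into the staging directory and, after the first failure, falls back to copying for the rest of the run. The same fallback pattern as a standalone sketch; paths are hypothetical and error handling is reduced to the essentials.

import os
import shutil


def export_files(src_files, destdir):
    """Hard-link each file into `destdir`, switching to copies once linking fails."""
    link_fx = os.link
    for src in src_files:
        dest = os.path.join(destdir, os.path.basename(src))
        try:
            link_fx(src, dest)
        except OSError:
            # e.g. a filesystem without hard-link support or a cross-device
            # link; copy this file and all remaining ones instead
            link_fx = shutil.copyfile
            link_fx(src, dest)
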
Example No. 16
    def __call__(path=None,
                 initopts=None,
                 *,
                 force=False,
                 description=None,
                 dataset=None,
                 annex=True,
                 fake_dates=False,
                 cfg_proc=None):
        # we only perform negative tests below
        no_annex = not annex

        if dataset:
            if isinstance(dataset, Dataset):
                ds = dataset
            else:
                ds = Dataset(dataset)
            refds_path = ds.path
        else:
            ds = refds_path = None

        # two major cases
        # 1. we got a `dataset` -> we either want to create it (path is None),
        #    or another dataset in it (path is not None)
        # 2. we got no dataset -> we want to create a fresh dataset at the
        #    desired location, either at `path` or PWD

        # sanity check first
        if no_annex:
            if description:
                raise ValueError("Incompatible arguments: cannot specify "
                                 "description for annex repo and declaring "
                                 "no annex repo.")

        if (isinstance(initopts, (list, tuple))
                and '--bare' in initopts) or (isinstance(initopts, dict)
                                              and 'bare' in initopts):
            raise ValueError(
                "Creation of bare repositories is not supported. Consider "
                "one of the create-sibling commands, or use "
                "Git to init a bare repository and push an existing dataset "
                "into it.")

        if path:
            path = resolve_path(path, dataset)

        path = path if path \
            else getpwd() if ds is None \
            else refds_path

        # we know that we need to create a dataset at `path`
        assert (path is not None)

        # assure cfg_proc is a list (relevant if used via Python API)
        cfg_proc = ensure_list(cfg_proc)

        # prep for yield
        res = dict(action='create',
                   path=str(path),
                   logger=lgr,
                   type='dataset',
                   refds=refds_path)

        refds = None
        if refds_path and refds_path != str(path):
            refds = require_dataset(refds_path,
                                    check_installed=True,
                                    purpose='create a subdataset')

            path_inrefds = path_under_rev_dataset(refds, path)
            if path_inrefds is None:
                yield dict(
                    res,
                    status='error',
                    message=(
                        "dataset containing given paths is not underneath "
                        "the reference dataset %s: %s", ds, str(path)),
                )
                return

        # try to locate an immediate parent dataset
        # we want to know this (irrespective of whether we plan on adding
        # this new dataset to a parent) in order to avoid conflicts with
        # a potentially absent/uninstalled subdataset of the parent
        # in this location
        # it will cost some filesystem traversal though...
        parentds_path = get_dataset_root(
            op.normpath(op.join(str(path), os.pardir)))
        if parentds_path:
            prepo = GitRepo(parentds_path)
            parentds_path = Path(parentds_path)
            # we cannot get away with a simple
            # GitRepo.get_content_info(), as we need to detect
            # uninstalled/added subdatasets too
            check_path = Path(path)
            pstatus = prepo.status(
                untracked='no',
                # limit query to target path for a potentially massive speed-up
                paths=[check_path.relative_to(parentds_path)])
            if (not pstatus.get(check_path, {}).get("type") == "dataset"
                    and any(check_path == p or check_path in p.parents
                            for p in pstatus)):
                # redo the check in a slower fashion, it is already broken
                # let's take our time for a proper error message
                conflict = [
                    p for p in pstatus
                    if check_path == p or check_path in p.parents
                ]
                res.update({
                    'status':
                    'error',
                    'message':
                    ('collision with content in parent dataset at %s: %s',
                     str(parentds_path), [str(c) for c in conflict])
                })
                yield res
                return
            if not force:
                # another set of check to see whether the target path is pointing
                # into a known subdataset that is not around ATM
                subds_status = {
                    parentds_path / k.relative_to(prepo.path)
                    for k, v in pstatus.items()
                    if v.get('type', None) == 'dataset'
                }
                check_paths = [check_path]
                check_paths.extend(check_path.parents)
                if any(p in subds_status for p in check_paths):
                    conflict = [p for p in check_paths if p in subds_status]
                    res.update({
                        'status':
                        'error',
                        'message':
                        ('collision with %s (dataset) in dataset %s',
                         str(conflict[0]), str(parentds_path))
                    })
                    yield res
                    return

        # important to use the given Dataset object to avoid spurious ID
        # changes with not-yet-materialized Datasets
        tbds = ds if isinstance(ds, Dataset) and \
            ds.path == path else Dataset(str(path))

        # don't create in non-empty directory without `force`:
        if op.isdir(tbds.path) and listdir(tbds.path) != [] and not force:
            res.update({
                'status':
                'error',
                'message':
                'will not create a dataset in a non-empty directory, use '
                '`--force` option to ignore'
            })
            yield res
            return

        # Check if specified cfg_proc(s) can be discovered, storing
        # the results so they can be used when the time comes to run
        # the procedure. If a procedure cannot be found, raise an
        # error to prevent creating the dataset.
        cfg_proc_specs = []
        if cfg_proc:
            discovered_procs = tbds.run_procedure(
                discover=True,
                result_renderer='disabled',
                return_type='generator',
            )
            for cfg_proc_ in cfg_proc:
                for discovered_proc in discovered_procs:
                    if discovered_proc['procedure_name'] == 'cfg_' + cfg_proc_:
                        cfg_proc_specs.append(discovered_proc)
                        break
                else:
                    raise ValueError("Cannot find procedure with name "
                                     "'%s'" % cfg_proc_)

        if initopts is not None and isinstance(initopts, list):
            initopts = {'_from_cmdline_': initopts}

        # Note for the code below:
        # OPT: be "smart" and avoid re-resolving .repo -- expensive in DataLad
        # Re-use tbrepo instance, do not use tbds.repo

        # create and configure desired repository
        # also provides initial set of content to be tracked with git (not annex)
        if no_annex:
            tbrepo, add_to_git = _setup_git_repo(path, initopts, fake_dates)
        else:
            tbrepo, add_to_git = _setup_annex_repo(path, initopts, fake_dates,
                                                   description)

        # OPT: be "smart" and avoid re-resolving .repo -- expensive in DataLad
        # Note: this must not happen earlier (before the if) since then it
        # would not be "smart"
        tbds_config = tbds.config

        # record an ID for this repo for the afterlife
        # to be able to track siblings and children
        id_var = 'datalad.dataset.id'
        # Note, that Dataset property `id` will change when we unset the
        # respective config. Therefore store it before:
        tbds_id = tbds.id
        if id_var in tbds_config:
            # make sure we reset this variable completely, in case of a
            # re-create
            tbds_config.unset(id_var, scope='branch')

        if _seed is None:
            # just the standard way
            # use a fully random identifier (i.e. UUID version 4)
            uuid_id = str(uuid.uuid4())
        else:
            # Let's generate preseeded ones
            uuid_id = str(uuid.UUID(int=random.getrandbits(128)))
        tbds_config.add(id_var,
                        tbds_id if tbds_id is not None else uuid_id,
                        scope='branch',
                        reload=False)

        # make config overrides permanent in the repo config
        # this is similar to what `annex init` does
        # we are only doing this for config overrides and do not expose
        # a dedicated argument, because it is sufficient for the cmdline
        # and unnecessary for the Python API (there could simply be a
        # subsequent ds.config.add() call)
        for k, v in tbds_config.overrides.items():
            tbds_config.add(k, v, scope='local', reload=False)

        # all config manipulation is done -> full reload
        tbds_config.reload()

        # must use the repo.pathobj as this will have resolved symlinks
        add_to_git[tbrepo.pathobj / '.datalad'] = {
            'type': 'directory',
            'state': 'untracked'
        }

        # save everything, we need to do this now and cannot merge with the
        # call below, because we may need to add this subdataset to a parent
        # but cannot until we have a first commit
        tbrepo.save(
            message='[DATALAD] new dataset',
            git=True,
            # we have to supply our own custom status, as the repo does
            # not have a single commit yet and there is no HEAD reference
            # TODO make `GitRepo.status()` robust to this state.
            _status=add_to_git,
        )

        for cfg_proc_spec in cfg_proc_specs:
            yield from tbds.run_procedure(
                cfg_proc_spec,
                result_renderer='disabled',
                return_type='generator',
            )

        # the next only makes sense if we saved the created dataset,
        # otherwise we have no committed state to be registered
        # in the parent
        if isinstance(refds, Dataset) and refds.path != tbds.path:
            # we created a dataset in another dataset
            # -> make submodule
            yield from refds.save(
                path=tbds.path,
                return_type='generator',
                result_renderer='disabled',
            )

        res.update({'status': 'ok'})
        yield res
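
The id handling above keeps a pre-existing dataset id and otherwise mints a new one: a fully random UUID4 normally, or a UUID built from seeded random bits when a test seed (_seed) is set. A small sketch of that choice; `seed` here is a stand-in for the module-level _seed and the global random state the command actually uses.

import random
import uuid


def new_dataset_id(seed=None):
    """Return a UUID string; deterministic when `seed` is given (as in tests)."""
    if seed is None:
        # standard case: fully random identifier (UUID version 4)
        return str(uuid.uuid4())
    # preseeded case: reproducible 128-bit value wrapped into a UUID
    rng = random.Random(seed)
    return str(uuid.UUID(int=rng.getrandbits(128)))


# new_dataset_id(42) == new_dataset_id(42)   (reproducible)
# new_dataset_id()   != new_dataset_id()     (random, with near certainty)
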
Example No. 17
    def __call__(
            path=None,
            dataset=None,
            recursive=False,
            recursion_limit=None,
            action=None,
            unavailable_path_status='',
            unavailable_path_msg=None,
            nondataset_path_status='error',
            force_parentds_discovery=True,
            force_subds_discovery=True,
            force_no_revision_change_discovery=True,
            force_untracked_discovery=True,
            modified=None):
        # upfront check for the fastest possible response
        if not path and dataset is None:
            # nothing given, try "here", but do not use `require_dataset`, as
            # it will determine the root dataset of `curdir` and further down
            # lead to path annotation of upstairs directories
            dataset = curdir

        if force_subds_discovery and not force_parentds_discovery:
            raise ValueError(
                'subdataset discovery requires parent dataset discovery')

        # CONCEPT: yield with no status to indicate further processing

        # everything in one big loop to be able to yield as fast as possible
        # without any precomputing for all paths
        refds_path = Interface.get_refds_path(dataset)
        if modified is not None and (
                refds_path is None or not GitRepo.is_valid_repo(refds_path)):
            raise ValueError(
                "modification detection only works with a base dataset "
                "(either given or discovered)")

        # prep common result props
        res_kwargs = dict(
            action=action if action else 'annotate_path',
            refds=refds_path,
            logger=lgr)

        # handle the case of recursion into a single dataset without any
        # extra fancy processing first -- full recursion can be done
        # faster than manual recursion, hence we gain quite some speed
        # from these few lines of extra code
        if not modified and not path and refds_path:
            if not GitRepo.is_valid_repo(refds_path):
                yield get_status_dict(
                    # doesn't matter if the path is in another dataset
                    # it was given as reference dataset
                    status=nondataset_path_status,
                    message='given reference dataset is not a dataset',
                    path=refds_path,
                    **res_kwargs)
                return

            refds = Dataset(refds_path)
            path = []
            # yield the dataset itself
            r = get_status_dict(ds=refds, status='', **res_kwargs)
            yield r

            if recursive:
                # if we have nothing given, but need recursion, we need to feed
                # the dataset path itself
                for r in yield_recursive(
                        refds,
                        refds_path,
                        action,
                        recursion_limit):
                    r.update(res_kwargs)
                    if 'refds' in r and not r['refds']:
                        # avoid cruft
                        del r['refds']
                    yield r
            return

        # goal: structure in a way that makes most information on any path
        # available in a single pass, at the cheapest possible cost
        reported_paths = {}
        requested_paths = assure_list(path)

        if modified is not None:
            # modification detection would silently kill all nondataset paths
            # but we have to complain about them, hence doing it here
            if requested_paths and refds_path:
                for r in requested_paths:
                    p = r['path'] if isinstance(r, dict) else r
                    p = resolve_path(p, ds=refds_path)
                    if path_startswith(p, refds_path):
                        # all good
                        continue
                    # not the refds
                    path_props = r if isinstance(r, dict) else {}
                    res = get_status_dict(
                        **dict(res_kwargs, **path_props))
                    res['status'] = nondataset_path_status
                    res['message'] = 'path not associated with reference dataset'
                    reported_paths[r] = res
                    yield res

            # preserve non-existing paths to be silently killed by modification
            # detection and append them to requested_paths again after detection.
            # TODO: This might be melted in with treatment of non dataset paths
            # above. Re-appending those paths seems to be better than yielding
            # directly to avoid code duplication, since both cases later on are
            # dealt with again.
            preserved_paths = [
                r for r in requested_paths
                if not lexists(r['path'] if isinstance(r, dict) else r)]

            # replace the requested paths by those paths that were actually
            # modified underneath or at a requested location
            requested_paths = get_modified_subpaths(
                # either the request, or the base dataset, if there was no request
                requested_paths if requested_paths else [refds_path],
                refds=Dataset(refds_path),
                revision=modified,
                report_no_revision_change=force_no_revision_change_discovery,
                report_untracked='all' if force_untracked_discovery else 'no',
                recursion_limit=recursion_limit)

            from itertools import chain
            # re-append the preserved paths:
            requested_paths = chain(requested_paths, iter(preserved_paths))

        # do not loop over unique(), this could be a list of dicts
        # we avoid duplicates manually below via `reported_paths`
        for path in requested_paths:
            if not isinstance(path, dict):
                path = rawpath2ap(path, refds_path)
            # this is now an annotated path!
            path_props = path
            path = path['path']
            # we need to mark our territory, who knows where this has been
            path_props.update(res_kwargs)

            if path in reported_paths:
                # we already recorded this path in the output
                # this can happen whenever `path` is a subdataset that was
                # discovered via recursive processing of another path before
                continue
            # the path exists in some shape or form
            # TODO if we have path_props already we could skip this test
            if isdir(path):
                # keep any existing type info, previously a more expensive run
                # could have discovered an uninstalled 'dataset', and we don't
                # want it to be relabeled to a directory
                path_props['type'] = \
                    path_props.get(
                        'type',
                        'dataset' if not islink(path) and GitRepo.is_valid_repo(path) else 'directory')
                # this could contain all types of additional content
                containing_dir = path if not islink(path) else normpath(opj(path, pardir))
            else:
                if lexists(path):
                    path_props['type'] = 'file'
                else:
                    path_props['state'] = 'absent'
                # for everything else we are interested in the container
                containing_dir = dirname(path)
                if not containing_dir:
                    containing_dir = curdir

            dspath = parent = get_dataset_root(containing_dir)
            if dspath:
                if path_props.get('type', None) == 'dataset':
                    # for a dataset the root is not the parent, for anything else
                    # it is
                    parent = path_props.get('parentds', None)
                    oneupdir = normpath(opj(containing_dir, pardir))
                    if parent is None and (force_parentds_discovery or (
                            refds_path and _with_sep(oneupdir).startswith(
                                _with_sep(refds_path)))):
                        # either forced, or only if we have a reference dataset, and
                        # only if we stay within this refds when searching for the
                        # parent
                        parent = get_dataset_root(normpath(opj(containing_dir, pardir)))
                        # NOTE the `and refds_path` is critical, as it will determine
                        # whether a top-level dataset that was discovered gets the
                        # parent property or not, it won't get it without a common
                        # base dataset, and that is how we always rolled
                    if parent and refds_path:
                        path_props['parentds'] = parent
                        # don't check whether this is actually a true subdataset of the
                        # parent, done further down
                else:
                    # set parent, but prefer existing property
                    path_props['parentds'] = path_props.get('parentds', dspath)

            # test for `dspath` not `parent`, we only need to know whether there is
            # ANY dataset, not which one is the true parent, logic below relies on
            # the fact that we end here, if there is no dataset at all
            if not dspath:
                # not in any dataset
                res = get_status_dict(
                    **dict(res_kwargs, **path_props))
                res['status'] = nondataset_path_status
                res['message'] = 'path not associated with any dataset'
                reported_paths[path] = res
                yield res
                continue

            # check that we only got SUBdatasets
            if refds_path and not path_startswith(dspath, refds_path):
                res = get_status_dict(**dict(res_kwargs, **path_props))
                res['status'] = nondataset_path_status
                res['message'] = \
                    ('path not part of the reference dataset at %s', refds_path)
                reported_paths[path] = res
                yield res
                continue

            if path_props.get('type', None) == 'file':
                # nothing else we can learn about this
                res = get_status_dict(**dict(res_kwargs, **path_props))
                if 'status' not in res:
                    res['status'] = ''
                reported_paths[path] = res
                yield res
                continue

            containing_ds = None
            path_type = path_props.get('type', None)
            if parent and force_subds_discovery and (
                    (path_type == 'dataset' and 'registered_subds' not in path_props) or
                    path_type == 'directory' or
                    not lexists(path)):
                # if the path doesn't exist, is labeled a directory, or is a
                # dataset lacking this info -> record whether this is a known
                # subdataset of its parent
                containing_ds = Dataset(parent)
                subdss = containing_ds.subdatasets(
                    fulfilled=None, recursive=False,
                    result_xfm=None, result_filter=None, return_type='list')
                if path in [s['path'] for s in subdss]:
                    if path_type == 'directory' or not lexists(path):
                        # first record that it isn't here, if just a dir or not here at all
                        path_props['state'] = 'absent'
                    # this must be a directory, and it is not installed
                    path_props['type'] = 'dataset'
                    path_props['registered_subds'] = True

            if not lexists(path) or \
                    (path_props.get('type', None) == 'dataset' and
                     path_props.get('state', None) == 'absent'):
                # not there (yet)
                message = unavailable_path_msg if unavailable_path_msg else None
                if message and '%s' in message:
                    message = (message, path)
                path_props['message'] = message
                res = get_status_dict(**dict(res_kwargs, **path_props))
                # assign given status, but only if the props don't indicate a status
                # already
                res['status'] = path_props.get(
                    'status', unavailable_path_status)
                reported_paths[path] = res
                yield res
                continue

            # we know everything we can, report
            res = get_status_dict(**dict(res_kwargs, **path_props))
            if 'status' not in res:
                res['status'] = ''
            reported_paths[path] = res
            yield res

            rec_paths = []
            if recursive:
                # here we need to consider the special case that `path` is
                # a dataset itself, if a recursion_limit is given (e.g.
                # `remove` will do that by default), we need to recurse
                # from the dataset itself, and not its parent to get things
                # right -- this will also avoid needless discovery of
                # unrelated subdatasets
                if path_props.get('type', None) == 'dataset':
                    containing_ds = Dataset(path)
                else:
                    # regular parent, we might have a dataset already
                    containing_ds = Dataset(parent) if containing_ds is None else containing_ds
                for r in yield_recursive(containing_ds, path, action, recursion_limit):
                    # capture reported paths
                    r.update(res_kwargs)
                    if 'refds' in r and not r['refds']:
                        # avoid cruft
                        del r['refds']
                    reported_paths[r['path']] = r
                    if modified is not None:
                        # we cannot yield right away, maybe it wasn't modified
                        rec_paths.append(r)
                    else:
                        yield r
            if modified is not None and rec_paths:
                # replace the recursively discovered paths by those paths that
                # were actually modified underneath or at a requested location
                for r in get_modified_subpaths(
                        rec_paths,
                        refds=Dataset(refds_path),
                        revision=modified,
                        report_no_revision_change=force_no_revision_change_discovery,
                        report_untracked='all' if force_untracked_discovery else 'no',
                        recursion_limit=recursion_limit):
                    res = get_status_dict(**dict(r, **res_kwargs))
                    reported_paths[res['path']] = res
                    yield res
        return
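
Several branches above (path_startswith, _with_sep) test whether one path lies under another while only matching whole path components, so that '/data/ds2' does not count as being inside '/data/ds'. A minimal sketch of such a check, independent of DataLad's own helpers:

import os.path as op


def is_under(path, prefix):
    """True if `path` equals `prefix` or lies underneath it, component-wise."""
    path = op.normpath(op.abspath(path))
    prefix = op.normpath(op.abspath(prefix))
    return path == prefix or path.startswith(prefix + op.sep)


# is_under('/data/ds/sub', '/data/ds')  -> True
# is_under('/data/ds2', '/data/ds')     -> False despite the shared prefix
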
Example No. 18
    def __call__(dataset,
                 urlfile,
                 urlformat,
                 filenameformat,
                 input_type="ext",
                 exclude_autometa=None,
                 meta=None,
                 message=None,
                 dry_run=False,
                 fast=False,
                 ifexists=None,
                 missing_value=None,
                 save=True,
                 version_urls=False,
                 cfg_proc=None):
        # Temporarily work around gh-2269.
        url_file = urlfile
        url_format, filename_format = urlformat, filenameformat

        from requests.exceptions import RequestException

        from datalad.distribution.dataset import Dataset, require_dataset
        from datalad.interface.results import get_status_dict
        from datalad.support.annexrepo import AnnexRepo

        lgr = logging.getLogger("datalad.plugin.addurls")

        ds = require_dataset(dataset, check_installed=False)
        if ds.repo and not isinstance(ds.repo, AnnexRepo):
            yield get_status_dict(action="addurls",
                                  ds=ds,
                                  status="error",
                                  message="not an annex repo")
            return

        url_file = str(resolve_path(url_file, dataset))

        if input_type == "ext":
            extension = os.path.splitext(url_file)[1]
            input_type = "json" if extension == ".json" else "csv"

        with open(url_file) as fd:
            try:
                rows, subpaths = extract(fd, input_type, url_format,
                                         filename_format, exclude_autometa,
                                         meta, dry_run, missing_value)
            except (ValueError, RequestException) as exc:
                yield get_status_dict(action="addurls",
                                      ds=ds,
                                      status="error",
                                      message=exc_str(exc))
                return

        if not rows:
            yield get_status_dict(action="addurls",
                                  ds=ds,
                                  status="notneeded",
                                  message="No rows to process")
            return

        if len(rows) != len(set(row["filename"] for row in rows)):
            yield get_status_dict(action="addurls",
                                  ds=ds,
                                  status="error",
                                  message=("There are file name collisions; "
                                           "consider using {_repindex}"))
            return

        if dry_run:
            for subpath in subpaths:
                lgr.info("Would create a subdataset at %s", subpath)
            for row in rows:
                lgr.info("Would download %s to %s", row["url"],
                         os.path.join(ds.path, row["filename"]))
                lgr.info(
                    "Metadata: %s",
                    sorted(u"{}={}".format(k, v)
                           for k, v in row["meta_args"].items()))
            yield get_status_dict(action="addurls",
                                  ds=ds,
                                  status="ok",
                                  message="dry-run finished")
            return

        if not ds.repo:
            # Populate a new dataset with the URLs.
            for r in ds.create(result_xfm=None,
                               return_type='generator',
                               cfg_proc=cfg_proc):
                yield r

        annex_options = ["--fast"] if fast else []

        for spath in subpaths:
            if os.path.exists(os.path.join(ds.path, spath)):
                lgr.warning("Not creating subdataset at existing path: %s",
                            spath)
            else:
                for r in ds.create(spath,
                                   result_xfm=None,
                                   cfg_proc=cfg_proc,
                                   return_type='generator'):
                    yield r

        for row in rows:
            # Add additional information that we'll need for various
            # operations.
            filename_abs = os.path.join(ds.path, row["filename"])
            if row["subpath"]:
                ds_current = Dataset(os.path.join(ds.path, row["subpath"]))
                ds_filename = os.path.relpath(filename_abs, ds_current.path)
            else:
                ds_current = ds
                ds_filename = row["filename"]
            row.update({
                "filename_abs": filename_abs,
                "ds": ds_current,
                "ds_filename": ds_filename
            })

        if version_urls:
            num_urls = len(rows)
            log_progress(lgr.info,
                         "addurls_versionurls",
                         "Versioning %d URLs",
                         num_urls,
                         label="Versioning URLs",
                         total=num_urls,
                         unit=" URLs")
            for row in rows:
                url = row["url"]
                try:
                    row["url"] = get_versioned_url(url)
                except (ValueError, NotImplementedError) as exc:
                    # We don't expect this to happen because get_versioned_url
                    # should return the original URL if it isn't an S3 bucket.
                    # It only raises exceptions if it doesn't know how to
                    # handle the scheme for what looks like an S3 bucket.
                    lgr.warning("error getting version of %s: %s", row["url"],
                                exc_str(exc))
                log_progress(lgr.info,
                             "addurls_versionurls",
                             "Versioned result for %s: %s",
                             url,
                             row["url"],
                             update=1,
                             increment=True)
            log_progress(lgr.info, "addurls_versionurls",
                         "Finished versioning URLs")

        files_to_add = set()
        for r in add_urls(rows, ifexists=ifexists, options=annex_options):
            if r["status"] == "ok":
                files_to_add.add(r["path"])
            yield r

        # compose the commit message once, outside the loop, so it is also
        # defined when no URLs were added
        msg = message or """\
[DATALAD] add files from URLs

url_file='{}'
url_format='{}'
filename_format='{}'""".format(url_file, url_format, filename_format)

        if files_to_add:
            meta_rows = [r for r in rows if r["filename_abs"] in files_to_add]
            for r in add_meta(meta_rows):
                yield r

            if save:
                for r in ds.save(path=files_to_add,
                                 message=msg,
                                 recursive=True):
                    yield r
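
The collision guard above compares the number of rows against the number of distinct file names and aborts on duplicates. The same check written out on toy rows (real addurls rows carry more keys):

from collections import Counter


def filename_collisions(rows):
    """Return the file names that occur more than once in `rows`."""
    counts = Counter(row["filename"] for row in rows)
    return [name for name, n in counts.items() if n > 1]


rows = [
    {"filename": "a.dat", "url": "https://example.com/1"},
    {"filename": "b.dat", "url": "https://example.com/2"},
    {"filename": "a.dat", "url": "https://example.com/3"},
]
assert filename_collisions(rows) == ["a.dat"]
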
Example No. 19
    def __call__(path=None, dataset=None, reporton='all', recursive=False):
        # prep results
        res_kwargs = dict(action='meta_dump', logger=lgr)
        ds = require_dataset(dataset=dataset,
                             check_installed=True,
                             purpose='aggregate metadata query')
        if dataset:
            res_kwargs['refds'] = ds.path

        agginfos = get_ds_aggregate_db(
            ds.pathobj,
            version=str(aggregate_layout_version),
            # we are handling errors below
            warn_absent=False,
        )
        if not agginfos:
            # if an aggregation had ever been run, this file would exist;
            # since it does not, none has been run and we need to tell
            # this to people
            yield get_status_dict(
                ds=ds,
                status='impossible',
                message='metadata aggregation has never been performed in '
                'this dataset',
                **res_kwargs)
            return

        if not path:
            # implement https://github.com/datalad/datalad/issues/3282
            path = ds.pathobj if isinstance(dataset, Dataset) else os.getcwd()

        # check for paths that are not underneath this dataset
        resolved_paths = set()
        for p in assure_list(path):
            p = resolve_path(p, dataset)
            if p != ds.pathobj and ds.pathobj not in p.parents:
                raise ValueError(
                    'given path {} is not underneath dataset {}'.format(p, ds))
            resolved_paths.add(p)

        # sort paths into their containing dataset aggregate records
        paths_by_ds = {}
        while resolved_paths:
            resolved_path = resolved_paths.pop()
            # find the first dataset that matches
            for aggdspath in sorted(agginfos, reverse=True):
                if recursive and resolved_path in aggdspath.parents:
                    ps = paths_by_ds.get(aggdspath, set())
                    ps.add(aggdspath)
                    paths_by_ds[aggdspath] = ps
                elif aggdspath == resolved_path \
                        or aggdspath in resolved_path.parents:
                    ps = paths_by_ds.get(aggdspath, set())
                    ps.add(resolved_path)
                    paths_by_ds[aggdspath] = ps
                    # stop when the containing dataset is found
                    break

        # which files do we need to have locally to perform the query
        info_keys = \
            ('dataset_info', 'content_info') \
            if reporton in ('all', 'jsonld') else \
            ('dataset_info',) if reporton == 'datasets' else \
            ('content_info',) if reporton == 'files' else \
            []
        objfiles = [
            text_type(agginfos[d][t]) for d in paths_by_ds for t in info_keys
            if t in agginfos[d]
        ]
        lgr.debug(
            'Verifying/achieving local availability of %i metadata objects',
            len(objfiles))
        if objfiles:
            for r in ds.get(path=objfiles,
                            result_renderer='disabled',
                            return_type='generator'):
                # report only if not a success, as this is an internal operation
                # that a user would not (need to) expect
                if success_status_map.get(
                        r['status'], False) != 'success':  # pragma: no cover
                    yield r

        contexts = {}
        nodes_by_context = {}
        parentds = []
        # loop over all records to get complete parentds relationships
        for aggdspath in sorted(agginfos):
            while parentds and parentds[-1] not in aggdspath.parents:
                parentds.pop()
            if aggdspath not in paths_by_ds:
                # nothing to say about this
                parentds.append(aggdspath)
                continue
            agg_record = agginfos[aggdspath]
            if reporton == 'aggregates':
                # we do not need to loop over the actual query paths, as
                # the aggregates of the containing dataset will contain
                # the desired info, if any exists

                # convert pathobj before emitting until we become more clever
                info = {
                    k: text_type(v) if isinstance(v, ut.PurePath) else v
                    for k, v in iteritems(agg_record)
                }
                info.update(
                    path=text_type(aggdspath),
                    type='dataset',
                )
                if aggdspath == ds.pathobj:
                    info['layout_version'] = aggregate_layout_version
                if parentds:
                    info['parentds'] = text_type(parentds[-1])
                yield dict(info, status='ok', **res_kwargs)
                parentds.append(aggdspath)
                continue

            # pull out actual metadata records
            for res in _yield_metadata_records(
                    aggdspath,
                    agg_record,
                    paths_by_ds[aggdspath],
                    reporton,
                    parentds=parentds[-1] if parentds else None):
                if reporton != 'jsonld':
                    yield dict(res, **res_kwargs)
                    continue
                collect_jsonld_metadata(aggdspath, res, nodes_by_context,
                                        contexts)

            parentds.append(aggdspath)
        if reporton == 'jsonld':
            yield dict(status='ok',
                       type='dataset',
                       path=ds.path,
                       metadata=format_jsonld_metadata(nodes_by_context),
                       refcommit=agginfos[ds.pathobj]['refcommit'],
                       **res_kwargs)
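
The loop building paths_by_ds above assigns each resolved query path to the deepest aggregate record that contains it, iterating over records from longest to shortest. A compact sketch of that containment logic with pathlib, using made-up record paths:

from pathlib import Path


def group_by_containing_record(paths, record_paths):
    """Map each record path to the query paths it contains; deepest record wins."""
    grouped = {}
    for p in map(Path, paths):
        # longest (deepest) record first, so the closest container matches first
        for rec in sorted(map(Path, record_paths), reverse=True):
            if rec == p or rec in p.parents:
                grouped.setdefault(rec, set()).add(p)
                break
    return grouped


records = ['/ds', '/ds/sub']
queries = ['/ds/sub/file.dat', '/ds/other.txt']
assert group_by_containing_record(queries, records) == {
    Path('/ds/sub'): {Path('/ds/sub/file.dat')},
    Path('/ds'): {Path('/ds/other.txt')},
}
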
Example No. 20
    def __call__(
            target,
            opts=None,
            *,  # opts is positional but optional in CLI
            dataset=None,
            remote=None,
            annex_wanted=None,
            froms=None,
            missing_content='error',):
        # only non-bare repos have hashdirmixed, so require one
        ds = require_dataset(
            dataset, check_installed=True, purpose='export to ORA archive')
        ds_repo = ds.repo

        annex_objs = ds_repo.dot_git / 'annex' / 'objects'

        archive = resolve_path(target, dataset)
        if archive.is_dir():
            archive = archive / 'archive.7z'
        else:
            archive.parent.mkdir(exist_ok=True, parents=True)

        froms = ensure_list(froms)

        if not opts:
            # uncompressed by default
            opts = ['-mx0']

        res_kwargs = dict(
            action="export-archive-ora",
            logger=lgr,
        )

        if not annex_objs.is_dir():
            yield get_status_dict(
                ds=ds,
                status='notneeded',
                message='no annex keys present',
                **res_kwargs,
            )
            return

        exportdir = ds_repo.dot_git / 'datalad' / 'tmp' / 'ora_archive'
        if exportdir.exists():
            yield get_status_dict(
                ds=ds,
                status='error',
                message=(
                    'export directory already exists, please remove first: %s',
                    str(exportdir)),
                **res_kwargs,
            )
            return

        def expr_to_opts(expr):
            opts = []
            expr = expr.replace('(', ' ( ').replace(')', ' ) ')
            for sub_expr in expr.split(' '):
                if len(sub_expr):
                    if sub_expr in '()':
                        opts.append(f"-{sub_expr}")
                    else:
                        opts.append(f"--{sub_expr}")
            return opts

        find_filters = []
        if remote:
            find_filters = ['-('] + expr_to_opts(ds_repo.get_preferred_content('wanted', remote)) + ['-)']
        if annex_wanted:
            find_filters.extend(expr_to_opts(annex_wanted))
        # git-annex find results need to be uniqued with set, as git-annex find
        # will return duplicates if multiple symlinks point to the same key.
        if froms:
            keypaths = set([
                annex_objs.joinpath(k) for treeish in froms for k in ds_repo.call_annex_items_([
                'find', *find_filters, f"--branch={treeish}",
                "--format=${hashdirmixed}${key}/${key}\\n"])
                ])
        else:
            keypaths = set(annex_objs.joinpath(k) for k in ds_repo.call_annex_items_([
                'find', *find_filters,
                "--format=${hashdirmixed}${key}/${key}\\n"
            ]))

        log_progress(
            lgr.info,
            'oraarchiveexport',
            'Start ORA archive export %s', ds,
            total=len(keypaths),
            label='ORA archive export',
            unit=' Keys',
        )

        if missing_content == 'continue':
            missing_file_lgr_func = lgr.warning
        elif missing_content == 'ignore':
            missing_file_lgr_func = lgr.debug

        link_fx = os.link
        for keypath in keypaths:
            key = keypath.name
            hashdir = op.join(keypath.parts[-4], keypath.parts[-3])
            log_progress(
                lgr.info,
                'oraarchiveexport',
                'Export key %s to %s', key, hashdir,
                update=1,
                increment=True)
            keydir = exportdir / hashdir / key
            keydir.mkdir(parents=True, exist_ok=True)
            try:
                link_fx(str(keypath), str(keydir / key))
            except FileNotFoundError as e:
                if missing_content == 'error':
                    raise IOError('Key %s has no content available' % keypath)
                missing_file_lgr_func(
                    'Key %s has no content available',
                    str(keypath))
            except OSError:
                lgr.warning(
                    'No hard links supported at %s, will copy files instead',
                    str(keypath))
                # no hard links supported
                # switch function after first error
                link_fx = shutil.copyfile
                link_fx(str(keypath), str(keydir / key))

        log_progress(
            lgr.info,
            'oraarchiveexport',
            'Finished RIA archive export from %s', ds
        )
        try:
            subprocess.run(
                ['7z', 'u', str(archive), '.'] + opts,
                cwd=str(exportdir),
            )
            yield get_status_dict(
                path=str(archive),
                type='file',
                status='ok',
                **res_kwargs)
        except Exception as e:
            ce = CapturedException(e)
            yield get_status_dict(
                path=str(archive),
                type='file',
                status='error',
                message=('7z failed: %s', ce),
                exception=ce,
                **res_kwargs)
            return
        finally:
            rmtree(str(exportdir))
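
The `expr_to_opts` helper above turns a git-annex preferred-content expression into `git annex find` matching options. A minimal standalone sketch of that transformation (the sample expression is made up):

def expr_to_opts(expr):
    # tokenize the expression; parentheses become '-(' / '-)', every other
    # token becomes a long option, e.g. 'include=*.dat' -> '--include=*.dat'
    opts = []
    expr = expr.replace('(', ' ( ').replace(')', ' ) ')
    for token in expr.split(' '):
        if token:
            opts.append(f"-{token}" if token in '()' else f"--{token}")
    return opts

print(expr_to_opts('include=*.dat and ( not metadata=locked=yes )'))
# ['--include=*.dat', '--and', '-(', '--not', '--metadata=locked=yes', '-)']
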
Exemplo n.º 21
0
    def __call__(urls,
                 *,
                 dataset=None,
                 path=None,
                 overwrite=False,
                 archive=False,
                 save=True,
                 message=None):
        from ..downloaders.http import HTTPDownloader
        from ..downloaders.providers import Providers

        ds = None
        if save or dataset:
            try:
                ds = require_dataset(dataset,
                                     check_installed=True,
                                     purpose='download urls')
            except NoDatasetFound:
                pass

        common_report = {"action": "download_url", "ds": ds}

        got_ds_instance = isinstance(dataset, Dataset)
        dir_is_target = not path or str(path).endswith(op.sep)
        path = str(resolve_path(path or op.curdir, ds=dataset))
        if dir_is_target:
            # resolve_path() doesn't preserve trailing separators. Add one for
            # the download() call.
            path = path + op.sep
        urls = ensure_list_from_str(urls)

        if not dir_is_target:
            if len(urls) > 1:
                yield get_status_dict(
                    status="error",
                    message=
                    ("When specifying multiple urls, --path should point to "
                     "a directory target (with a trailing separator). Got %r",
                     path),
                    type="file",
                    path=path,
                    **common_report)
                return
            if archive:
                # make sure the file suffix indicated by a URL is preserved
                # so that any further archive processing doesn't have to
                # employ mime type inspection in order to determine the archive
                # type
                from datalad.support.network import URL
                suffixes = PurePosixPath(URL(urls[0]).path).suffixes
                if not Path(path).suffixes == suffixes:
                    path += ''.join(suffixes)
            # we know that we have a single URL
            # download() would be fine getting an existing directory and
            # downloading the URL underneath it, but let's enforce a trailing
            # slash here for consistency.
            if op.isdir(path):
                yield get_status_dict(
                    status="error",
                    message=(
                        "Non-directory path given (no trailing separator) "
                        "but a directory with that name (after adding archive "
                        "suffix) exists"),
                    type="file",
                    path=path,
                    **common_report)
                return

        # TODO setup fancy ui.progressbars doing this in parallel and reporting overall progress
        # in % of urls which were already downloaded
        providers = Providers.from_config_files()
        downloaded_paths = []
        path_urls = {}
        need_datalad_remote = False
        for url in urls:
            # somewhat "ugly"
            downloader = providers.get_provider(url).get_downloader(url)
            try:
                downloaded_path = downloader.download(url,
                                                      path=path,
                                                      overwrite=overwrite)
            except Exception as e:
                ce = CapturedException(e)
                yield get_status_dict(status="error",
                                      message=str(ce),
                                      type="file",
                                      path=path,
                                      exception=ce,
                                      **common_report)
            else:
                if not need_datalad_remote \
                   and (downloader.authenticator or downloader.credential or
                        type(downloader) != HTTPDownloader):
                    need_datalad_remote = True
                downloaded_paths.append(downloaded_path)
                path_urls[downloaded_path] = url
                yield get_status_dict(status="ok",
                                      type="file",
                                      path=downloaded_path,
                                      **common_report)

        if downloaded_paths and save and ds is not None:
            msg = message or """\
[DATALAD] Download URLs

URLs:
  {}""".format("\n  ".join(urls))

            for r in Save()(
                    downloaded_paths,
                    message=msg,
                    # ATTN: Pass the original dataset argument to
                    # preserve relative path handling semantics.
                    dataset=dataset,
                    return_type="generator",
                    result_renderer='disabled',
                    result_xfm=None,
                    result_filter=None,
                    on_failure="ignore"):
                yield r

            ds_repo = ds.repo
            if isinstance(ds_repo, AnnexRepo):
                if need_datalad_remote:
                    from datalad.customremotes.base import (
                        ensure_datalad_remote, )
                    ensure_datalad_remote(ds_repo,
                                          autoenable=True,
                                          encryption=None)

                if got_ds_instance:
                    # Paths in `downloaded_paths` are already relative to the
                    # dataset.
                    rpaths = dict(zip(downloaded_paths, downloaded_paths))
                else:
                    # Paths in `downloaded_paths` are already relative to the
                    # current working directory. Take these relative to the
                    # dataset for use with the AnnexRepo method calls.
                    rpaths = {}
                    for orig_path, resolved in zip(
                            downloaded_paths,
                            resolve_path(downloaded_paths, ds=dataset)):
                        rpath = path_under_rev_dataset(ds, resolved)
                        if rpath:
                            rpaths[str(rpath)] = orig_path
                        else:
                            lgr.warning("Path %s not under dataset %s",
                                        orig_path, ds)
                annex_paths = [
                    p for p, annexed in zip(
                        rpaths, ds_repo.is_under_annex(list(rpaths.keys())))
                    if annexed
                ]
                if annex_paths:
                    for path in annex_paths:
                        url = path_urls[rpaths[path]]
                        try:
                            # The file is already present. This is just to
                            # register the URL.
                            ds_repo.add_url_to_file(
                                path,
                                url,
                                # avoid batch mode for single files
                                # https://github.com/datalad/datalad/issues/2849
                                batch=len(annex_paths) > 1,
                                # bypass URL size check, we already have the file
                                options=['--relaxed'])
                        except CommandError as exc:
                            lgr.warning("Registering %s with %s failed: %s",
                                        path, url, CapturedException(exc))

                    if archive:
                        for path in annex_paths:
                            yield from ds.add_archive_content(
                                path,
                                delete=True,
                                on_failure='ignore',
                                return_type='generator',
                                result_renderer='disabled')
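
A minimal usage sketch for the command above via DataLad's Python API; the URLs and the target directory are made up, and an installed dataset in the current directory is assumed:

from datalad.api import download_url

# download two files into 'inputs/' of the dataset at '.', save the result,
# and let git-annex remember the source URLs
download_url(
    ['https://example.com/data/sub-01.tsv',
     'https://example.com/data/sub-02.tsv'],
    dataset='.',
    path='inputs/',   # trailing separator marks a directory target
    save=True,
)
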
Exemplo n.º 22
0
    def __call__(
            path=None,
            dataset=None,
            recursive=False,
            recursion_limit=None):
        refds = require_dataset(dataset, check_installed=True,
                                purpose="unlock")

        # Before passing the results to status()
        #   * record explicitly specified non-directory paths so that we can
        #     decide whether to yield a result for reported paths
        #   * filter out and yield results for paths that don't exist
        res_paths_nondir = set()
        paths_lexist = None
        res_paths = list()
        if path:
            # Note that we need unresolved versions of the path input to be
            # passed on to status. See gh-5456 for example.
            path = ensure_list(path)
            res_paths = resolve_path(path, ds=dataset)
            paths_lexist = []
            res_paths_lexist = []
            for p, p_r in zip(path, res_paths):
                if p_r.exists() or p_r.is_symlink():
                    paths_lexist.append(p)
                    res_paths_lexist.append(p_r)
                if not p_r.is_dir():
                    res_paths_nondir.add(p_r)

        res_kwargs = dict(action='unlock', logger=lgr, refds=refds.path)
        if res_paths:
            for p in set(res_paths).difference(set(res_paths_lexist)):
                yield get_status_dict(
                    status="impossible",
                    path=str(p),
                    type="file",
                    message="path does not exist",
                    **res_kwargs)
        if not (paths_lexist or paths_lexist is None):
            return

        # Collect information on the paths to unlock.
        to_unlock = defaultdict(list)  # ds => paths (relative to ds)
        for res in Status()(
                # ATTN: it is vital to pass the `dataset` argument as is,
                # and not a dataset instance, in order to maintain the path
                # semantics between here and the status() call
                dataset=dataset,
                path=paths_lexist,
                untracked="normal" if res_paths_nondir else "no",
                report_filetype=False,
                annex="availability",
                recursive=recursive,
                recursion_limit=recursion_limit,
                result_renderer='disabled',
                on_failure="ignore"):
            if res["action"] != "status" or res["status"] != "ok":
                yield res
                continue
            has_content = res.get("has_content")
            if has_content:
                parentds = res["parentds"]
                to_unlock[parentds].append(op.relpath(res["path"], parentds))
            elif res_paths_nondir and Path(res["path"]) in res_paths_nondir:
                if has_content is False:
                    msg = "no content present"
                    status = "impossible"
                elif res["state"] == "untracked":
                    msg = "untracked"
                    status = "impossible"
                else:
                    # This is either a regular git file or an unlocked annex
                    # file.
                    msg = "non-annex file"
                    status = "notneeded"
                yield get_status_dict(
                    status=status,
                    path=res["path"],
                    type="file",
                    message="{}; cannot unlock".format(msg),
                    **res_kwargs)

        # Do the actual unlocking.
        for ds_path, files in to_unlock.items():
            ds = Dataset(ds_path)
            for r in ds.repo._call_annex_records(
                    ["unlock"],
                    files=files):
                yield get_status_dict(
                    path=op.join(ds.path, r['file']),
                    status='ok' if r['success'] else 'error',
                    type='file',
                    **res_kwargs)
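
For completeness, a hedged sketch of how the unlock command above is typically driven from the Python API (the paths are illustrative):

from datalad.api import unlock

# unlock a single annexed file so it can be modified in place,
# or a whole subtree with recursive=True
unlock(path='derivatives/stats.tsv', dataset='.')
unlock(path='derivatives', dataset='.', recursive=True)
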
Exemplo n.º 23
0
    def __call__(dataset=None, path=None, data_only=True, recursive=False):

        # Note: copy logic from install to resolve dataset and path:
        # shortcut
        ds = dataset

        if ds is not None and not isinstance(ds, Dataset):
            ds = Dataset(ds)

        if not path:
            if ds is None:
                # no dataset, no target location, nothing to do
                raise ValueError(
                    "insufficient information for uninstallation (needs at "
                    "least a dataset or a path)")
        elif isinstance(path, list):
            # TODO: not sure. might be possible to deal with that list directly
            return [
                Uninstall.__call__(dataset=ds,
                                   path=p,
                                   data_only=data_only,
                                   recursive=recursive) for p in path
            ]

        # resolve the target location against the provided dataset
        if path is not None:
            path = resolve_path(path, ds)

        lgr.debug("Resolved uninstallation target: {0}".format(path))

        # if we have no dataset given, figure out which one we need to operate
        # on, based on the resolved target location (that is now guaranteed to
        # be specified)
        if ds is None:
            # try to find a dataset at or above the installation target
            dspath = GitRepo.get_toppath(abspath(path))
            if dspath is None:
                # no top-level dataset found, use path as such
                dspath = path
            ds = Dataset(dspath)
        assert (ds is not None)

        lgr.debug("Resolved target dataset for uninstallation: {0}".format(ds))

        if not ds.is_installed():
            if not path or path == ds.path:
                # we want to uninstall the dataset itself, which is not
                # installed => nothing to do
                # TODO: consider `data` option! is_installed currently only
                # checks for a repository
                lgr.info("Dataset {0} not installed. Nothing to "
                         "do.".format(ds.path))
                return
            else:
                # we want to uninstall something from a not installed dataset
                # Doesn't make sense, does it? => fail
                raise ValueError("Dataset {0} is not installed.".format(
                    ds.path))

        assert (ds.repo is not None)

        if not path or path == ds.path:
            # uninstall the dataset `ds`
            # TODO: what to consider?
            #   - whether it is a submodule of another dataset
            #   - `data_only` ?
            #   - `recursive`
            #   - what to return in what case (data_only)?
            raise NotImplementedError("TODO: Uninstall dataset %s" % ds.path)

        # needed by the logic below
        assert (isabs(path))

        # express the destination path relative to the root of this dataset
        relativepath = relpath(path, start=ds.path)
        if path.startswith(pardir):
            raise ValueError("uninstallation path outside dataset")

        lgr.debug(
            "Resolved uninstallation target relative to dataset {0}: {1}".
            format(ds, relativepath))

        # figure out, what path actually is pointing to:
        if not exists(path):
            # nothing there, nothing to uninstall
            lgr.info("Nothing found to uninstall at %s" % path)
            return

        if relativepath in ds.get_dataset_handles(recursive=True):
            # it's a submodule
            # --recursive required or implied?
            raise NotImplementedError("TODO: uninstall submodule %s from "
                                      "dataset %s" % (relativepath, ds.path))

        if isdir(path):
            # don't know what to do yet
            # in git vs. untracked?
            # recursive?
            raise NotImplementedError("TODO: uninstall directory %s from "
                                      "dataset %s" % (path, ds.path))

        # we know it's an existing file
        _file_in_git = False
        _untracked_or_within_submodule = False
        if isinstance(ds.repo, AnnexRepo):
            try:
                ds.repo.get_file_key(relativepath)
            except FileInGitError:
                # file directly in git
                _file_in_git = True

            except FileNotInAnnexError:
                # either an untracked file in this dataset, or something that
                # also actually exists in the file system but could be part of
                # a subdataset
                _untracked_or_within_submodule = True

            else:
                # it's an annexed file
                if data_only:
                    ds.repo.annex_drop([path])
                    return path
                else:
                    raise NotImplementedError(
                        "TODO: fully uninstall file %s (annex) from dataset %s"
                        % (path, ds.path))
        else:
            # plain git repo
            if relativepath in ds.repo.get_indexed_files():
                # file directly in git
                _file_in_git = True
            else:
                # either an untracked file in this dataset, or something that
                # also actually exists in the file system but could be part of
                # a subdataset
                _untracked_or_within_submodule = True

        if _file_in_git:
            if data_only:
                raise ValueError("%s is not a file handle. Removing its "
                                 "data only doesn't make sense." % path)
            else:
                return ds.repo.git_remove([relativepath])

        elif _untracked_or_within_submodule:
            subds = get_containing_subdataset(ds, relativepath)
            if ds.path != subds.path:
                # target path belongs to a subdataset, hand uninstallation
                # over to it
                return subds.uninstall(path=relpath(path, start=subds.path),
                                       data_only=data_only,
                                       recursive=recursive)

            # this must be an untracked/existing something
            # it wasn't installed, so we cannot uninstall it
            raise ValueError("Cannot uninstall %s" % path)
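
The legacy code above leans on `resolve_path` to interpret a possibly relative `path` against the dataset. A rough, hedged sketch of that behaviour; the dataset location is made up, and the exact return type (`str` vs `pathlib.Path`) differs across DataLad versions:

from datalad.distribution.dataset import Dataset, resolve_path

ds = Dataset('/tmp/some_dataset')      # hypothetical dataset
# a relative input is resolved against the dataset instance it was given with
print(resolve_path('sub/file.dat', ds))
# without a dataset, the current working directory serves as the reference
print(resolve_path('sub/file.dat'))
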
Exemplo n.º 24
0
    def __call__(dataset=None, path=None, source=None, recursive=False,
                 add_data_to_git=False):
        lgr.debug("Installation attempt started")
        # shortcut
        ds = dataset

        if ds is not None and not isinstance(ds, Dataset):
            ds = Dataset(ds)

        if isinstance(path, list):
            if not len(path):
                # normalize value to expected state when nothing was provided
                path = None
            elif len(path) == 1:
                # we can simply continue with the function as called with a
                # single argument
                path = path[0]
            else:
                lgr.debug("Installation of multiple targets was requested: {0}".format(path))
                return [Install.__call__(
                        dataset=ds,
                        path=p,
                        source=source,
                        recursive=recursive) for p in path]

        # resolve the target location against the provided dataset
        if path is not None:
            # make sure it is not a URL, `resolve_path` cannot handle that
            if is_url(path):
                try:
                    path = get_local_path_from_url(path)
                    path = resolve_path(path, ds)
                except ValueError:
                    # URL doesn't point to a local something
                    pass
            else:
                path = resolve_path(path, ds)

        # any `path` argument that points to something local is now resolved
        # and no longer a URL

        # if we have no dataset given, figure out which one we need to operate
        # on, based on the resolved target location (that is now guaranteed to
        # be specified, but only if path isn't a URL anymore) -> special case,
        # handled below
        if ds is None and path is not None and not is_url(path):
            # try to find a dataset at or above the installation target
            dspath = GitRepo.get_toppath(abspath(path))
            if dspath is None:
                # no top-level dataset found, use path as such
                dspath = path
            ds = Dataset(dspath)

        if ds is None and source is None and path is not None:
            # no dataset, no source
            # this could be a shortcut install call, where the first
            # arg identifies the source
            if is_url(path) or os.path.exists(path):
                # we have an actual URL -> this should be the source
                # OR
                # it is not a URL, but it exists locally
                lgr.debug(
                    "Single argument given to install and no dataset found. "
                    "Assuming the argument identifies a source location.")
                source = path
                path = None

        lgr.debug("Resolved installation target: {0}".format(path))

        if ds is None and path is None and source is not None:
            # we got nothing but a source. do something similar to git clone
            # and derive the path from the source and continue
            lgr.debug(
                "Neither dataset not target installation path provided. "
                "Assuming installation of a remote dataset. "
                "Deriving destination path from given source {0}".format(
                    source))
            ds = Dataset(_installationpath_from_url(source))

        if not path and ds is None:
            # no dataset, no target location, nothing to do
            raise InsufficientArgumentsError(
                "insufficient information for installation (needs at "
                "least a dataset or an installation path)")

        assert(ds is not None)

        lgr.debug("Resolved target dataset for installation: {0}".format(ds))

        vcs = ds.repo
        if vcs is None:
            # TODO check that a "ds.path" actually points to a TOPDIR
            # should be the case already, but maybe nevertheless check
            try:
                with swallow_logs():
                    vcs = Install._get_new_vcs(ds, source, vcs)
            except GitCommandError:
                lgr.debug("Cannot retrieve from URL: {0}".format(source))
                # maybe source URL was missing a '/.git'
                if source and not source.rstrip('/').endswith('/.git'):
                    source = '{0}/.git'.format(source.rstrip('/'))
                    lgr.debug("Attempt to retrieve from URL: {0}".format(source))
                    vcs = Install._get_new_vcs(ds, source, vcs)
                else:
                    lgr.debug("Unable to establish repository instance at: {0}".format(ds.path))
                    raise

        assert(ds.repo)  # is automagically re-evaluated in the .repo property

        runner = Runner()

        if path is None or path == ds.path:
            # if the goal was to install this dataset, we are done,
            # except for 'recursive'.

            # TODO: For now 'recursive' means just submodules.
            # See --with-data vs. --recursive and figure it out
            if recursive:
                for sm in ds.repo.get_submodules():
                    _install_subds_from_flexible_source(
                        ds, sm.path, sm.url, recursive=recursive)
            return ds

        # at this point this dataset is "installed", now we can test whether to
        # install something into the dataset

        # needed by the logic below
        assert(isabs(path))

        # express the destination path relative to the root of this dataset
        relativepath = relpath(path, start=ds.path)
        if path.startswith(pardir):
            raise ValueError("installation path outside dataset")

        lgr.debug(
            "Resolved installation target relative to dataset {0}: {1}".format(
                ds, relativepath))

        # this dataset must already know everything necessary
        ###################################################
        # FLOW GUIDE
        #
        # at this point we know nothing about the
        # installation target
        ###################################################
        try:
            # it is simplest to let annex tell us what we are dealing with
            lgr.debug("Trying to fetch file %s using annex", relativepath)
            if not isinstance(vcs, AnnexRepo):
                assert(isinstance(vcs, GitRepo))
                # FLOW GUIDE
                # this is not an annex repo, but we raise exceptions
                # to be able to treat them alike in the special case handling
                # below
                if not exists(path):
                    raise IOError("path doesn't exist yet, might need special handling")
                elif relativepath in vcs.get_indexed_files():
                    # relativepath is in git
                    raise FileInGitError("We need to handle it as known to git")
                else:
                    raise FileNotInAnnexError("We don't have yet annex repo here")
            if vcs.get_file_key(relativepath):
                # FLOW GUIDE EXIT POINT
                # this is an annex'ed file -> get it
                # TODO implement `copy --from` using `source`
                # TODO fail if `source` is something strange
                vcs.annex_get(relativepath)
                # return the absolute path to the installed file
                return path

        except FileInGitError:
            ###################################################
            # FLOW GUIDE
            #
            # `path` is either
            # - a  file already checked into Git
            # - known submodule
            ###################################################
            lgr.log(5, "FileInGitError logic")
            if source is not None:
                raise FileInGitError("File %s is already in git. Specifying source (%s) makes no sense"
                                     % (path, source))
            # file is checked into git directly -> nothing to do
            # OR this is a submodule of this dataset
            submodule = [sm for sm in ds.repo.get_submodules()
                         if sm.path == relativepath]
            if not len(submodule):
                # FLOW GUIDE EXIT POINT
                # this is a file in Git and no submodule, just return its path
                lgr.debug("Don't act, data already present in Git")
                return path
            elif len(submodule) > 1:
                raise RuntimeError(
                    "more than one submodule registered at the same path?")
            submodule = submodule[0]

            # FLOW GUIDE EXIT POINT
            # we are dealing with a known submodule (i.e. `source`
            # doesn't matter) -> check it out
            lgr.debug("Install subdataset at: {0}".format(submodule.path))
            subds = _install_subds_from_flexible_source(
                ds, submodule.path, submodule.url, recursive=recursive)
            return subds

        except FileNotInAnnexError:
            ###################################################
            # FLOW GUIDE
            #
            # `path` is either
            # - content of a subdataset
            # - an untracked file in this dataset
            # - an entire untracked/unknown existing subdataset
            ###################################################
            lgr.log(5, "FileNotInAnnexError logic")
            subds = get_containing_subdataset(ds, relativepath)
            if ds.path != subds.path:
                # FLOW GUIDE EXIT POINT
                # target path belongs to a known subdataset, hand
                # installation over to it
                return subds.install(
                    path=relpath(path, start=subds.path),
                    source=source,
                    recursive=recursive,
                    add_data_to_git=add_data_to_git)

            # FLOW GUIDE
            # this must be an untracked/existing something, so either
            # - a file
            # - a directory
            # - an entire repository
            if exists(opj(path, '.git')):
                # FLOW GUIDE EXIT POINT
                # this is an existing repo and must be in-place turned into
                # a submodule of this dataset
                return _install_subds_inplace(
                    ds, path, relativepath, source, runner)

            # FLOW GUIDE EXIT POINT
            # - untracked file or directory in this dataset
            if isdir(path) and not recursive:
                # this is a directory; installing it requires the `recursive` flag
                raise ValueError(
                    "installation of a directory requires the `recursive` flag")

            # few sanity checks
            if source and abspath(source) != path:
                raise ValueError(
                    "installation target already exists, but `source` points to "
                    "another location (target: '{0}', source: '{1}')".format(
                        path, source))

            if not add_data_to_git and not (isinstance(vcs, AnnexRepo)):
                raise RuntimeError(
                    "Trying to install file(s) into a dataset "
                    "with a plain Git repository. First initialize annex, or "
                    "provide override flag.")

            # switch `add` procedure between Git and Git-annex according to flag
            if add_data_to_git:
                vcs.git_add(relativepath)
                added_files = resolve_path(relativepath, ds)
            else:
                # do a blunt `annex add`
                added_files = vcs.annex_add(relativepath)
                # return just the paths of the installed components
                if isinstance(added_files, list):
                    added_files = [resolve_path(i['file'], ds) for i in added_files]
                else:
                    added_files = resolve_path(added_files['file'], ds)
            if added_files:
                return added_files
            else:
                return None

        except IOError:
            ###################################################
            # FLOW GUIDE
            #
            # more complicated special cases -- `path` is either
            # - a file/subdataset in a not yet initialized but known
            #   submodule
            # - an entire untracked/unknown existing subdataset
            # - non-existing content that should be installed from `source`
            ###################################################
            lgr.log(5, "IOError logic")
            # we can end up here in two cases ATM
            if (exists(path) or islink(path)) or source is None:
                # FLOW GUIDE
                # - target exists but this dataset's VCS rejects it,
                #   so it should be part of a subdataset
                # or
                # - target doesn't exist, but no source is given, so
                #   it could be a handle that is actually contained in
                #   a not yet installed subdataset
                subds = get_containing_subdataset(ds, relativepath)
                if ds.path != subds.path:
                    # FLOW GUIDE
                    # target path belongs to a subdataset, hand installation
                    # over to it
                    if not subds.is_installed():
                        # FLOW GUIDE
                        # we are dealing with a target in a not yet
                        # available but known subdataset -> install it first
                        ds.install(subds.path, recursive=recursive)
                    return subds.install(
                        path=relpath(path, start=subds.path),
                        source=source,
                        recursive=recursive,
                        add_data_to_git=add_data_to_git)

                # FLOW GUIDE EXIT POINT
                raise InsufficientArgumentsError(
                    "insufficient information for installation: the "
                    "installation target {0} doesn't exists, isn't a "
                    "known handle of dataset {1}, and no `source` "
                    "information was provided.".format(path, ds))

            if not source:
                # FLOW GUIDE EXIT POINT
                raise InsufficientArgumentsError(
                    "insufficient information for installation: the "
                    "installation target {0} doesn't exists, isn't a "
                    "known handle of dataset {1}, and no `source` "
                    "information was provided.".format(path, ds))

            source_path = expandpath(source)
            if exists(source_path):
                # FLOW GUIDE EXIT POINT
                # this could be
                # - local file
                # - local directory
                # - repository outside the dataset
                # we only want to support the last case of locally cloning
                # a repo -- fail otherwise
                if exists(opj(source_path, '.git')):
                    return _install_subds_from_flexible_source(
                        ds, relativepath, source_path, recursive)

                raise ValueError(
                    "installing individual local files or directories is not "
                    "supported, copy/move them into the dataset first")

            # FLOW GUIDE
            # `source` is non-local, it could be:
            #   - repository
            #   - file
            # we have no further evidence, hence we need to try
            try:
                # FLOW GUIDE EXIT POINT
                # assume it is a dataset
                return _install_subds_from_flexible_source(
                    ds, relativepath, source, recursive)
            except CommandError:
                # FLOW GUIDE EXIT POINT
                # apparently not a repo, assume it is a file URL
                vcs.annex_addurl_to_file(relativepath, source)
                return path
Exemplo n.º 25
0
    def __call__(
            dataset=None,
            dest=None,
            path=None,
            # Note: add remote currently disabled in publish
            # dest_url=None, dest_pushurl=None,
            with_data=None,
            recursive=False):

        # Note: add remote currently disabled in publish
        # if dest is None and (dest_url is not None
        #                        or dest_pushurl is not None):
        #     raise ValueError("""insufficient information for adding the
        #     destination as a sibling (needs at least a name)""")

        # shortcut
        ds = dataset

        if ds is not None and not isinstance(ds, Dataset):
            ds = Dataset(ds)
        if not path:
            path = curdir

        elif isinstance(path, list):
            return [
                Publish.__call__(
                    dataset=ds,
                    dest=dest,
                    path=p,
                    # Note: add remote currently disabled in publish
                    # dest_url=dest_url,
                    # dest_pushurl=dest_pushurl,
                    with_data=with_data,
                    recursive=recursive) for p in path
            ]

        # resolve the location against the provided dataset
        if path is not None:
            path = resolve_path(path, ds)

        lgr.info("Publishing {0}".format(path))

        # if we have no dataset given, figure out which one we need to operate
        # on, based on the resolved location (that is now guaranteed to
        # be specified)
        if ds is None:
            # try to find a dataset at or above the location
            dspath = GitRepo.get_toppath(abspath(path))
            if dspath is None:
                # no top-level dataset found, use path as such
                dspath = path
            ds = Dataset(dspath)
        lgr.debug("Resolved dataset for publication: {0}".format(ds))
        assert (ds is not None)

        # it might still be about a subdataset of ds:
        if path is not None:
            relativepath = relpath(path, start=ds.path)
            subds = get_containing_subdataset(ds, relativepath)
            if subds.path != ds.path:
                # path belongs to a subdataset; hand it over
                lgr.debug("Hand over to submodule %s" % subds.path)
                return subds.publish(
                    dest=dest,
                    path=relpath(path, start=subds.path),
                    # Note: add remote currently disabled in publish
                    # dest_url=dest_url,
                    # dest_pushurl=dest_pushurl,
                    with_data=with_data,
                    recursive=recursive)

        # now, we know, we have to operate on ds. So, ds needs to be installed,
        # since we cannot publish anything from a not installed dataset,
        # can we?
        # (But may be just the existence of ds.repo is important here.)
        if not ds.is_installed():
            raise ValueError("No installed dataset found at "
                             "{0}.".format(ds.path))
        assert (ds.repo is not None)

        # TODO: For now we can deal with a sibling(remote) name given by `dest`
        # only. Figure out, when to allow for passing a local path or URL
        # directly and what to do in that case.

        # Note: we need an upstream remote, if there's none given. We could
        # wait for git push to complain, but we need to explicitly figure it
        # out for pushing annex branch anyway and we might as well fail right
        # here.

        # keep original dest in case it's None for passing to recursive calls:
        dest_resolved = dest
        if dest is None:
            # check for tracking branch's remote:
            try:
                std_out, std_err = ds.repo._git_custom_command(
                    '',
                    ["git", "config", "--get",
                     "branch.{active_branch}.remote".format(
                         active_branch=ds.repo.git_get_active_branch())],
                    expect_fail=True)
            except CommandError as e:
                if e.code == 1 and e.stdout == "":
                    std_out = None
                else:
                    raise
            if std_out:
                dest_resolved = std_out.strip()
            else:
                # we have no remote given and no upstream => fail
                raise RuntimeError("No known default target for "
                                   "publication and none given.")

        # upstream branch needed for update (merge) and subsequent push,
        # in case there is none.
        set_upstream = False
        try:
            # Note: tracking branch actually defined by entry "merge"
            # PLUS entry "remote"
            std_out, std_err = \
                ds.repo._git_custom_command('',
                                            ["git", "config", "--get",
                                             "branch.{active_branch}.merge".format(active_branch=ds.repo.git_get_active_branch())],
                                            expect_fail=True)
        except CommandError as e:
            if e.code == 1 and e.stdout == "":
                # no tracking branch yet:
                set_upstream = True
            else:
                raise

        # is `dest` an already known remote?
        if dest_resolved not in ds.repo.git_get_remotes():
            # unknown remote
            raise ValueError("No sibling '%s' found." % dest_resolved)

            # Note: add remote currently disabled in publish
            # if dest_url is None:
            #     raise ValueError("No sibling '%s' found. Provide `dest-url`"
            #                      " to register it." % dest_resolved)
            # lgr.info("Sibling %s unknown. Registering ...")
            #
            # # Fill in URL-Template:
            # remote_url = dest_url.replace("%NAME", basename(ds.path))
            # # TODO: handle_name.replace("/", "-")) instead of basename()
            # #       - figure it out ;)
            # #       - either a dataset needs to discover superdatasets in
            # #         order to get its relative path to provide a name
            # #       - or: We need a different approach on the templates
            #
            # # Add the remote
            # ds.repo.git_remote_add(dest_resolved, remote_url)
            # if dest_pushurl:
            #     # Fill in template:
            #     remote_url_push = \
            #         dest_pushurl.replace("%NAME", basename(ds.path))
            #     # TODO: Different way of replacing %NAME; See above
            #
            #     # Modify push url:
            #     ds.repo._git_custom_command('',
            #                                 ["git", "remote",
            #                                  "set-url",
            #                                  "--push", dest_resolved,
            #                                  remote_url_push])
            # lgr.info("Added sibling '%s'." % dest)
            # lgr.debug("Added remote '%s':\n %s (fetch)\n%s (push)." %
            #           (dest_resolved, remote_url,
            #            remote_url_push if dest_pushurl else remote_url))
        # Note: add remote currently disabled in publish
        # else:
        #     # known remote: parameters dest-url-* currently invalid.
        #     # This may change to adapt the existing remote.
        #     if dest_url:
        #         lgr.warning("Sibling '%s' already exists for dataset '%s'. "
        #                     "Ignoring dest-url %s." %
        #                     (dest_resolved, ds.path, dest_url))
        #     if dest_pushurl:
        #         lgr.warning("Sibling '%s' already exists for dataset '%s'. "
        #                     "Ignoring dest-pushurl %s." %
        #                     (dest_resolved, ds.path, dest_pushurl))

        # Figure out, what to publish
        if path is None or path == ds.path:
            # => publish the dataset itself
            # push local state:
            # TODO: Rework git_push in GitRepo
            cmd = ['git', 'push']
            if set_upstream:
                # no upstream branch yet
                cmd.append("--set-upstream")
            cmd += [dest_resolved, ds.repo.git_get_active_branch()]
            ds.repo._git_custom_command('', cmd)
            # push annex branch:
            if isinstance(ds.repo, AnnexRepo):
                ds.repo.git_push("%s +git-annex:git-annex" % dest_resolved)

            # TODO: if with_data is a shell pattern, we get a list, when called
            # from shell, right?
            # => adapt the following and check constraints to allow for that
            if with_data:
                ds.repo._git_custom_command('', ["git", "annex", "copy"] +
                                            with_data +
                                            ["--to", dest_resolved])

            if recursive and ds.get_dataset_handles() != []:
                results = [ds]
                # Note: add remote currently disabled in publish
                # modify URL templates:
                # if dest_url:
                #     dest_url = dest_url.replace('%NAME', basename(ds.path) + '-%NAME')
                # if dest_pushurl:
                #     dest_pushurl = dest_pushurl.replace('%NAME', basename(ds.path) + '-%NAME')
                for subds in ds.get_dataset_handles():
                    results.append(
                        Dataset(opj(ds.path, subds)).publish(
                            dest=dest,
                            # Note: use `dest` instead of `dest_resolved` in case
                            # dest was None, so subdatasets would use their default
                            # as well
                            # Note: add remote currently disabled in publish
                            # dest_url=dest_url,
                            # dest_pushurl=dest_pushurl,
                            with_data=with_data,
                            recursive=recursive))
                return results

            return ds

        elif exists(path):
            # At this point `path` is not referencing a (sub)dataset.
            # An annexed file is the only thing left, that `path` might be
            # validly pointing to. Anything else we can't handle currently.
            if isinstance(ds.repo, AnnexRepo):
                try:
                    if ds.repo.get_file_key(relativepath):
                        # file is in annex, publish it
                        ds.repo._run_annex_command(
                            'copy',
                            annex_options=[path,
                                           '--to=%s' % dest_resolved])
                        return path
                except (FileInGitError, FileNotInAnnexError):
                    pass
            # `path` can't be published
            lgr.warning("Don't know how to publish %s." % path)
            return None

        else:
            # nothing to publish found
            lgr.warning("Nothing to publish found at %s." % path)
            return None
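
The default-target lookup in the publish code above amounts to asking git for the remote configured for the active branch's tracking branch. A standalone sketch with plain subprocess; the repository path is made up:

import subprocess

def tracking_remote(repo_path, branch):
    # returns the remote recorded in branch.<branch>.remote, or None if unset
    res = subprocess.run(
        ['git', '-C', repo_path, 'config', '--get', f'branch.{branch}.remote'],
        capture_output=True, text=True)
    return res.stdout.strip() or None

print(tracking_remote('/tmp/some_repo', 'master'))   # e.g. 'origin' or None
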
Exemplo n.º 26
0
    def __call__(sshurl,
                 name=None,
                 target_dir=None,
                 target_url=None,
                 target_pushurl=None,
                 dataset=None,
                 recursive=False,
                 recursion_limit=None,
                 existing='error',
                 shared=None,
                 group=None,
                 ui=False,
                 as_common_datasrc=None,
                 publish_by_default=None,
                 publish_depends=None,
                 annex_wanted=None,
                 annex_group=None,
                 annex_groupwanted=None,
                 inherit=False,
                 since=None):
        #
        # nothing without a base dataset
        #
        ds = require_dataset(dataset,
                             check_installed=True,
                             purpose='creating a sibling')
        refds_path = ds.path

        #
        # all checks that are possible before we start parsing the dataset
        #

        # possibly use sshurl to get the name in case it is not specified
        if not sshurl:
            if not inherit:
                raise InsufficientArgumentsError(
                    "needs at least an SSH URL, if no inherit option")
            if name is None:
                raise ValueError(
                    "Neither SSH URL, nor the name of sibling to inherit from "
                    "was specified")
            # It might well be that we already have this remote setup
            try:
                sshurl = CreateSibling._get_remote_url(ds, name)
            except Exception as exc:
                lgr.debug('%s does not know about url for %s: %s', ds, name,
                          exc_str(exc))
        elif inherit:
            raise ValueError(
                "For now, for clarity not allowing specifying a custom sshurl "
                "while inheriting settings")
            # maybe this could be safely dropped -- still WIP

        if not sshurl:
            # TODO: may be more back up before _prep?
            super_ds = ds.get_superdataset()
            if not super_ds:
                raise ValueError(
                    "Could not determine super dataset for %s to inherit URL" %
                    ds)
            super_url = CreateSibling._get_remote_url(super_ds, name)
            # for now assuming hierarchical setup
        # (TODO: to be able to distinguish between the two, probably
            # needs storing datalad.*.target_dir to have %RELNAME in there)
            sshurl = slash_join(super_url, relpath(refds_path, super_ds.path))

        # check the login URL
        sibling_ri = RI(sshurl)
        ssh_sibling = is_ssh(sibling_ri)
        if not (ssh_sibling or isinstance(sibling_ri, PathRI)):
            raise ValueError(
                "Unsupported SSH URL or path: '{0}', "
                "use ssh://host/path, host:path or path syntax".format(sshurl))

        if not name:
            name = sibling_ri.hostname if ssh_sibling else "local"
            lgr.debug("No sibling name given. Using %s'%s' as sibling name",
                      "URL hostname " if ssh_sibling else "", name)
        if since == '':
            # consider creating siblings only since the point of
            # the last update
            # XXX here we assume one to one mapping of names from local branches
            # to the remote
            active_branch = ds.repo.get_active_branch()
            since = '%s/%s' % (name, active_branch)

        #
        # parse the base dataset to find all subdatasets that need processing
        #
        to_process = []
        cand_ds = [
            Dataset(r['path']) for r in diff_dataset(
                ds,
                fr=since,
                to=None,
                # make explicit, but doesn't matter, no recursion in diff()
                constant_refs=True,
                # constrain to the paths of all locally existing subdatasets
                path=[
                    sds['path']
                    for sds in ds.subdatasets(recursive=recursive,
                                              recursion_limit=recursion_limit,
                                              fulfilled=True,
                                              result_renderer=None)
                ],
                # save cycles, we are only looking for datasets
                annex=None,
                untracked='no',
                # recursion was done faster by subdatasets()
                recursive=False,
                # save cycles, we are only looking for datasets
                eval_file_type=False,
            ) if r.get('type') == 'dataset' and r.get('state', None) != 'clean'
        ]
        # check remotes setup
        for d in cand_ds if since else ([ds] + cand_ds):
            d_repo = d.repo
            if d_repo is None:
                continue
            checkds_remotes = d.repo.get_remotes()
            res = dict(
                action='create_sibling',
                path=d.path,
                type='dataset',
            )

            if publish_depends:
                # make sure dependencies are valid
                # TODO: inherit -- we might want to automagically create
                # those dependents as well???
                unknown_deps = set(
                    ensure_list(publish_depends)).difference(checkds_remotes)
                if unknown_deps:
                    yield dict(
                        res,
                        status='error',
                        message=('unknown sibling(s) specified as publication '
                                 'dependency: %s', unknown_deps),
                    )
                    continue
            if name in checkds_remotes and existing in ('error', 'skip'):
                yield dict(
                    res,
                    status='error' if existing == 'error' else 'notneeded',
                    message=(
                        "sibling '%s' already configured (specify alternative "
                        "name, or force reconfiguration via --existing", name),
                )
                continue
            to_process.append(res)

        if not to_process:
            # we ruled out all possibilities
            # TODO wait for gh-1218 and make better return values
            lgr.info("No datasets qualify for sibling creation. "
                     "Consider different settings for --existing "
                     "or --since if this is unexpected")
            return

        if ssh_sibling:
            # request ssh connection:
            lgr.info("Connecting ...")
            shell = ssh_manager.get_connection(sshurl)
        else:
            shell = _RunnerAdapter()
            sibling_ri.path = str(resolve_path(sibling_ri.path, dataset))
            if target_dir:
                target_dir = opj(sibling_ri.path, target_dir)

        if target_dir is None:
            if sibling_ri.path:
                target_dir = sibling_ri.path
            else:
                target_dir = '.'

        # TODO: centralize and generalize template symbol handling
        replicate_local_structure = "%RELNAME" not in target_dir

        if not shell.get_annex_version():
            raise MissingExternalDependency(
                'git-annex',
                msg="It's required on the {} machine to create a sibling".
                format('remote' if ssh_sibling else 'local'))

        #
        # all checks done and we have a connection, now do something
        #

        # loop over all datasets, ordered from top to bottom, to keep the
        # existence test below valid (existing directories would cause the
        # machinery to halt)
        # But we need to run the post-update hook in depth-first fashion, so
        # we only collect the repos here and run the hooks afterwards (see gh #790)
        yielded = set()
        remote_repos_to_run_hook_for = []
        for currentds_ap in \
                sorted(to_process, key=lambda x: x['path'].count('/')):
            current_ds = Dataset(currentds_ap['path'])

            path = _create_dataset_sibling(
                name, current_ds, refds_path, shell, replicate_local_structure,
                sibling_ri, target_dir, target_url, target_pushurl, existing,
                shared, group, publish_depends, publish_by_default, ui,
                as_common_datasrc, annex_wanted, annex_group,
                annex_groupwanted, inherit)
            if not path:
                # nothing new was created
                # TODO is 'notneeded' appropriate in this case?
                currentds_ap['status'] = 'notneeded'
                # TODO explain status in 'message'
                yield currentds_ap
                yielded.add(currentds_ap['path'])
                continue
            remote_repos_to_run_hook_for.append((path, currentds_ap))

            # publish web-interface to root dataset on publication server
            if current_ds.path == refds_path and ui:
                lgr.info("Uploading web interface to %s" % path)
                try:
                    CreateSibling.upload_web_interface(path, shell, shared, ui)
                except CommandError as e:
                    currentds_ap['status'] = 'error'
                    currentds_ap['message'] = (
                        "failed to push web interface to the remote datalad repository (%s)",
                        exc_str(e))
                    yield currentds_ap
                    yielded.add(currentds_ap['path'])
                    continue

        # in reverse order would be depth first
        lgr.info("Running post-update hooks in all created siblings")
        # TODO: add progressbar
        for path, currentds_ap in remote_repos_to_run_hook_for[::-1]:
            # Trigger the hook
            lgr.debug("Running hook for %s (if exists and executable)", path)
            try:
                shell(
                    "cd {} "
                    "&& ( [ -x hooks/post-update ] && hooks/post-update || : )"
                    "".format(sh_quote(_path_(path, ".git"))))
            except CommandError as e:
                currentds_ap['status'] = 'error'
                currentds_ap['message'] = (
                    "failed to run post-update hook under remote path %s (%s)",
                    path, exc_str(e))
                yield currentds_ap
                yielded.add(currentds_ap['path'])
                continue
            if currentds_ap['path'] not in yielded:
                # if we were silent until now everything is just splendid
                currentds_ap['status'] = 'ok'
                yield currentds_ap
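
The sibling-creation loop above is ordered by path depth: datasets are processed top-down, while the post-update hooks run over the collected repositories in reverse, i.e. deepest first. A tiny sketch of that ordering trick with made-up paths:

to_process = [{'path': p} for p in ['/ds/sub', '/ds', '/ds/sub/subsub']]
ordered = sorted(to_process, key=lambda ap: ap['path'].count('/'))
print([ap['path'] for ap in ordered])        # top-down: /ds, /ds/sub, /ds/sub/subsub
print([ap['path'] for ap in ordered[::-1]])  # reversed for the hooks: deepest first
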
Exemplo n.º 27
0
    def __call__(
            path=None,
            initopts=None,
            force=False,
            description=None,
            dataset=None,
            no_annex=_NoAnnexDefault,
            annex=True,
            fake_dates=False,
            cfg_proc=None
    ):
        # TODO: introduced with 0.13, remove with 0.14
        if no_annex is not _NoAnnexDefault:
            # the two mirror options do not agree and the deprecated one is
            # not at default value
            warnings.warn("datalad-create's `no_annex` option is deprecated "
                          "and will be removed in a future release, "
                          "use the reversed-sign `annex` option instead.",
                          DeprecationWarning)
            # honor the old option for now
            annex = not no_annex

        # we only perform negative tests below
        no_annex = not annex

        if dataset:
            if isinstance(dataset, Dataset):
                ds = dataset
            else:
                ds = Dataset(dataset)
            refds_path = ds.path
        else:
            ds = refds_path = None

        # two major cases
        # 1. we got a `dataset` -> we either want to create it (path is None),
        #    or another dataset in it (path is not None)
        # 2. we got no dataset -> we want to create a fresh dataset at the
        #    desired location, either at `path` or PWD

        # sanity check first
        if no_annex:
            if description:
                raise ValueError("Incompatible arguments: cannot specify "
                                 "description for annex repo and declaring "
                                 "no annex repo.")

        if (isinstance(initopts, (list, tuple)) and '--bare' in initopts) or (
                isinstance(initopts, dict) and 'bare' in initopts):
            raise ValueError(
                "Creation of bare repositories is not supported. Consider "
                "one of the create-sibling commands, or use "
                "Git to init a bare repository and push an existing dataset "
                "into it.")

        if path:
            path = resolve_path(path, dataset)

        path = path if path \
            else getpwd() if ds is None \
            else refds_path

        # we know that we need to create a dataset at `path`
        assert(path is not None)

        # assure cfg_proc is a list (relevant if used via Python API)
        cfg_proc = assure_list(cfg_proc)

        # prep for yield
        res = dict(action='create', path=str(path),
                   logger=lgr, type='dataset',
                   refds=refds_path)

        refds = None
        if refds_path and refds_path != str(path):
            refds = require_dataset(
                refds_path, check_installed=True,
                purpose='creating a subdataset')

            path_inrefds = path_under_rev_dataset(refds, path)
            if path_inrefds is None:
                yield dict(
                    res,
                    status='error',
                    message=(
                        "dataset containing given paths is not underneath "
                        "the reference dataset %s: %s",
                        ds, str(path)),
                )
                return

        # try to locate an immediate parent dataset
        # we want to know this (irrespective of whether we plan on adding
        # this new dataset to a parent) in order to avoid conflicts with
        # a potentially absent/uninstalled subdataset of the parent
        # in this location
        # it will cost some filesystem traversal though...
        parentds_path = get_dataset_root(
            op.normpath(op.join(str(path), os.pardir)))
        if parentds_path:
            prepo = GitRepo(parentds_path)
            parentds_path = ut.Path(parentds_path)
            # we cannot get away with a simple
            # GitRepo.get_content_info(), as we need to detect
            # uninstalled/added subdatasets too
            check_path = ut.Path(path)
            pstatus = prepo.status(
                untracked='no',
                # limit query to target path for a potentially massive speed-up
                paths=[check_path.relative_to(parentds_path)])
            if (not pstatus.get(check_path, {}).get("type") == "dataset" and
                any(check_path == p or check_path in p.parents
                    for p in pstatus)):
                # redo the check in a slower fashion, it is already broken
                # let's take our time for a proper error message
                conflict = [
                    p for p in pstatus
                    if check_path == p or check_path in p.parents]
                res.update({
                    'status': 'error',
                    'message': (
                        'collision with content in parent dataset at %s: %s',
                        str(parentds_path),
                        [str(c) for c in conflict])})
                yield res
                return
            if not force:
                # another set of checks to see whether the target path is pointing
                # into a known subdataset that is not around ATM
                subds_status = {
                    parentds_path / k.relative_to(prepo.path)
                    for k, v in pstatus.items()
                    if v.get('type', None) == 'dataset'}
                check_paths = [check_path]
                check_paths.extend(check_path.parents)
                if any(p in subds_status for p in check_paths):
                    conflict = [p for p in check_paths if p in subds_status]
                    res.update({
                        'status': 'error',
                        'message': (
                            'collision with %s (dataset) in dataset %s',
                            str(conflict[0]),
                            str(parentds_path))})
                    yield res
                    return

        # important to use the given Dataset object to avoid spurious ID
        # changes with not-yet-materialized Datasets
        tbds = ds if isinstance(ds, Dataset) and \
            ds.path == path else Dataset(str(path))

        # don't create in non-empty directory without `force`:
        if op.isdir(tbds.path) and listdir(tbds.path) != [] and not force:
            res.update({
                'status': 'error',
                'message':
                    'will not create a dataset in a non-empty directory, use '
                    '`force` option to ignore'})
            yield res
            return

        # stuff that we create and want to have tracked with git (not annex)
        add_to_git = {}

        if initopts is not None and isinstance(initopts, list):
            initopts = {'_from_cmdline_': initopts}

        # Note for the code below:
        # OPT: be "smart" and avoid re-resolving .repo -- expensive in DataLad
        # Re-use tbrepo instance, do not use tbds.repo

        # create and configure desired repository
        if no_annex:
            lgr.info("Creating a new git repo at %s", tbds.path)
            tbrepo = GitRepo(
                tbds.path,
                url=None,
                create=True,
                create_sanity_checks=False,
                git_opts=initopts,
                fake_dates=fake_dates)
            # place a .noannex file to indicate annex to leave this repo alone
            stamp_path = ut.Path(tbrepo.path) / '.noannex'
            stamp_path.touch()
            add_to_git[stamp_path] = {
                'type': 'file',
                'state': 'untracked'}
        else:
            # always come with annex when created from scratch
            lgr.info("Creating a new annex repo at %s", tbds.path)
            tbrepo = AnnexRepo(
                tbds.path,
                url=None,
                create=True,
                create_sanity_checks=False,
                # do not set backend here, to avoid a dedicated commit
                backend=None,
                # None causes version to be taken from config
                version=None,
                description=description,
                git_opts=initopts,
                fake_dates=fake_dates
            )
            # set the annex backend in .gitattributes as a staged change
            tbrepo.set_default_backend(
                cfg.obtain('datalad.repo.backend'),
                persistent=True, commit=False)
            add_to_git[tbrepo.pathobj / '.gitattributes'] = {
                'type': 'file',
                'state': 'added'}
            # make sure that v6 annex repos never commit content under .datalad
            attrs_cfg = (
                ('config', 'annex.largefiles', 'nothing'),
                ('metadata/aggregate*', 'annex.largefiles', 'nothing'),
                ('metadata/objects/**', 'annex.largefiles',
                 '({})'.format(cfg.obtain(
                     'datalad.metadata.create-aggregate-annex-limit'))))
            attrs = tbrepo.get_gitattributes(
                [op.join('.datalad', i[0]) for i in attrs_cfg])
            set_attrs = []
            for p, k, v in attrs_cfg:
                if not attrs.get(
                        op.join('.datalad', p), {}).get(k, None) == v:
                    set_attrs.append((p, {k: v}))
            if set_attrs:
                tbrepo.set_gitattributes(
                    set_attrs,
                    attrfile=op.join('.datalad', '.gitattributes'))

            # prevent git annex from ever annexing .git* stuff (gh-1597)
            attrs = tbrepo.get_gitattributes('.git')
            if not attrs.get('.git', {}).get(
                    'annex.largefiles', None) == 'nothing':
                tbrepo.set_gitattributes([
                    ('**/.git*', {'annex.largefiles': 'nothing'})])
                # must use the repo.pathobj as this will have resolved symlinks
                add_to_git[tbrepo.pathobj / '.gitattributes'] = {
                    'type': 'file',
                    'state': 'untracked'}

        # OPT: be "smart" and avoid re-resolving .repo -- expensive in DataLad
        # Note: must not happen earlier (before the if), since it would not be "smart" then
        tbds_config = tbds.config

        # record an ID for this repo for the afterlife
        # to be able to track siblings and children
        id_var = 'datalad.dataset.id'
        # Note, that Dataset property `id` will change when we unset the
        # respective config. Therefore store it before:
        tbds_id = tbds.id
        if id_var in tbds_config:
            # make sure we reset this variable completely, in case of a
            # re-create
            tbds_config.unset(id_var, where='dataset')

        if _seed is None:
            # just the standard way
            uuid_id = uuid.uuid1().urn.split(':')[-1]
        else:
            # Let's generate preseeded ones
            uuid_id = str(uuid.UUID(int=random.getrandbits(128)))
        tbds_config.add(
            id_var,
            tbds_id if tbds_id is not None else uuid_id,
            where='dataset',
            reload=False)

        # make config overrides permanent in the repo config
        # this is similar to what `annex init` does
        # we are only doing this for config overrides and do not expose
        # a dedicated argument, because it is sufficient for the cmdline
        # and unnecessary for the Python API (there could simply be a
        # subsequent ds.config.add() call)
        for k, v in tbds_config.overrides.items():
            tbds_config.add(k, v, where='local', reload=False)

        # all config manipulation is done -> full reload
        tbds_config.reload()

        # must use the repo.pathobj as this will have resolved symlinks
        add_to_git[tbrepo.pathobj / '.datalad'] = {
            'type': 'directory',
            'state': 'untracked'}

        # save everything, we need to do this now and cannot merge with the
        # call below, because we may need to add this subdataset to a parent
        # but cannot until we have a first commit
        tbrepo.save(
            message='[DATALAD] new dataset',
            git=True,
            # we have to supply our own custom status, as the repo does
            # not have a single commit yet and there is no HEAD reference
            # TODO make `GitRepo.status()` robust to this state.
            _status=add_to_git,
        )

        for cfg_proc_ in cfg_proc:
            for r in tbds.run_procedure('cfg_' + cfg_proc_):
                yield r

        # the next only makes sense if we saved the created dataset,
        # otherwise we have no committed state to be registered
        # in the parent
        if isinstance(refds, Dataset) and refds.path != tbds.path:
            # we created a dataset in another dataset
            # -> make submodule
            for r in refds.save(
                    path=tbds.path,
            ):
                yield r

        res.update({'status': 'ok'})
        yield res
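
A hedged usage sketch of the `create` command via DataLad's Python API; the paths and the 'text2git' configuration procedure are assumptions for illustration.

import datalad.api as dl

# create a fresh annex dataset at a hypothetical location
ds = dl.create(path='/tmp/new-ds', cfg_proc=['text2git'])
# create a plain-git (no annex) subdataset inside it
sub = dl.create(path='/tmp/new-ds/code', dataset=ds, annex=False)
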
Exemplo n.º 28
0
    def __call__(path=None,
                 source=None,
                 dataset=None,
                 get_data=False,
                 description=None,
                 recursive=False,
                 recursion_limit=None,
                 reckless=None,
                 jobs="auto"):

        # normalize path argument to be equal when called from cmdline and
        # python and nothing was passed into `path`
        path = ensure_list(path)

        if not source and not path:
            raise InsufficientArgumentsError(
                "Please provide at least a source or a path")

        #  Common kwargs to pass to underlying git/install calls.
        #  They might need adjustments (e.g. for recursion_limit), but
        #  otherwise would be applicable throughout
        #
        # There could be more common options here, since `get` underneath
        # could perform similar installs
        common_kwargs = dict(
            get_data=get_data,
            recursive=recursive,
            recursion_limit=recursion_limit,
            # git_opts=git_opts,
            # annex_opts=annex_opts,
            reckless=reckless,
            jobs=jobs,
        )

        # did we explicitly get a dataset to install into?
        # if we got a dataset, path will be resolved against it.
        # Otherwise path will be resolved first.
        ds = None
        if dataset is not None:
            ds = require_dataset(dataset,
                                 check_installed=True,
                                 purpose='installation')
            common_kwargs['dataset'] = dataset
        # pre-compute for results below
        refds_path = Interface.get_refds_path(ds)

        # switch into the two scenarios without --source:
        # 1. list of URLs
        # 2. list of (sub)dataset content
        if source is None:
            # we need to collect URLs and paths
            to_install = []
            to_get = []
            # TODO: this approach is problematic, it disrupts the order of input args.
            # consequently results will be returned in an unexpected order when a
            # mixture of source URL and paths is given. Reordering is only possible when
            # everything in here is fully processed before any results can be yielded.
            # moreover, I think the semantics of the status quo implementation are a
            # bit complicated: in a mixture list a source URL will lead to a new dataset
            # at a generated default location, but a path will lead to a subdataset
            # at that exact location
            for urlpath in path:
                ri = RI(urlpath)
                (to_get
                 if isinstance(ri, PathRI) else to_install).append(urlpath)

            # 1. multiple source URLs
            for s in to_install:
                lgr.debug("Install passes into install source=%s", s)
                for r in Install.__call__(
                        source=s,
                        description=description,
                        # we need to disable error handling in order to have it done at
                        # the very top, otherwise we are not able to order a global
                        # "ignore-and-keep-going"
                        on_failure='ignore',
                        return_type='generator',
                        result_xfm=None,
                        result_filter=None,
                        **common_kwargs):
                    # no post-processing of the installed content on disk
                    # should be necessary here, all done by code further
                    # down that deals with an install from an actual `source`
                    # any necessary fixes should go there too!
                    r['refds'] = refds_path
                    yield r

            # 2. one or more dataset content paths
            if to_get:
                lgr.debug("Install passes into get %d items", len(to_get))
                # all commented out hint on inability to pass those options
                # into underlying install-related calls.
                # Also need to pass from get:
                #  annex_get_opts

                for r in Get.__call__(
                        to_get,
                        # TODO should pass-through description, not sure why disabled
                        # description=description,
                        # we need to disable error handling in order to have it done at
                        # the very top, otherwise we are not able to order a global
                        # "ignore-and-keep-going"
                        on_failure='ignore',
                        return_type='generator',
                        result_xfm=None,
                        result_filter=None,
                        **common_kwargs):
                    # no post-processing of get'ed content on disk should be
                    # necessary here, this is the responsibility of `get`
                    # (incl. adjusting the parent's gitmodules when submodules
                    # end up in an "updated" state; done in get helpers)
                    # any required fixes should go there!
                    r['refds'] = refds_path
                    yield r

            # we are done here
            # the rest is about install from a `source`
            return

        # an actual `source` was given
        if source and path and len(path) > 1:
            # exception is ok here, if this fails it is either direct user error
            # or we messed up one of our internal calls
            raise ValueError(
                "install needs a single PATH when source is provided.  "
                "Was given mutliple PATHs: %s" % str(path))

        # parameter constraints:
        if not source:
            # exception is ok here, if this fails it is either direct user error
            # or we messed up one of our internal calls
            raise InsufficientArgumentsError(
                "a `source` is required for installation")

        # code below deals with a single path only
        path = path[0] if path else None

        if source == path:
            # even if they turn out to be identical after resolving symlinks
            # and more sophisticated witchcraft, it would still happily say
            # "it appears to be already installed", so we just catch an
            # obviously pointless input combination
            yield get_status_dict(
                'install',
                path=path,
                status='impossible',
                logger=lgr,
                source_url=source,
                refds=refds_path,
                message=
                "installation `source` and destination `path` are identical. "
                "If you are trying to add a subdataset simply use the `save` command"
            )
            return

        # resolve the target location (if local) against the provided dataset
        # or CWD:
        if path is not None:
            # MIH everything in here is highly similar to what common
            # interface helpers do (or should/could do), but at the same
            # is very much tailored to just apply to `install` -- I guess
            # it has to stay special

            # Should work out just fine for regular paths, so no additional
            # conditioning is necessary
            try:
                path_ri = RI(path)
            except Exception as e:
                raise ValueError("invalid path argument {}: ({})".format(
                    path, exc_str(e)))
            try:
                # Wouldn't work for SSHRI ATM, see TODO within SSHRI
                # yoh: path should be a local path, and mapping note within
                #      SSHRI about mapping localhost:path to path is kinda
                #      a peculiar use-case IMHO
                # TODO Stringification can be removed once PY35 is no longer
                # supported
                path = str(resolve_path(path_ri.localpath, dataset))
                # any `path` argument that points to something local is now
                # resolved and no longer a URL
            except ValueError:
                # `path` is neither a valid source nor a local path.
                # TODO: The only thing left is a known subdataset with a
                # name, that is not a path; Once we correctly distinguish
                # between path and name of a submodule, we need to consider
                # this.
                # For now: Just raise
                raise ValueError("Invalid path argument {0}".format(path))
        # `path` resolved, if there was any.

        # clone dataset, will also take care of adding to superdataset, if one
        # is given
        res = Clone.__call__(
            source,
            path,
            dataset=ds,
            description=description,
            reckless=reckless,
            # we need to disable error handling in order to have it done at
            # the very top, otherwise we are not able to order a global
            # "ignore-and-keep-going"
            result_xfm=None,
            return_type='generator',
            result_filter=None,
            on_failure='ignore')
        # helper
        as_ds = YieldDatasets()
        destination_dataset = None
        for r in res:
            if r['action'] == 'install' and r['type'] == 'dataset':
                # make sure logic below is valid, only one dataset result is
                # coming back
                assert (destination_dataset is None)
                destination_dataset = as_ds(r)
            r['refds'] = refds_path
            yield r
        assert (destination_dataset)

        # Now, recursive calls:
        if recursive or get_data:
            # dataset argument must not be passed inside since we use bound .get
            # It is ok to do "inplace" as long as we still return right
            # after the loop ends
            common_kwargs.pop('dataset', '')
            for r in destination_dataset.get(
                    curdir,
                    description=description,
                    # we need to disable error handling in order to have it done at
                    # the very top, otherwise we are not able to order a global
                    # "ignore-and-keep-going"
                    on_failure='ignore',
                    return_type='generator',
                    result_xfm=None,
                    **common_kwargs):
                r['refds'] = refds_path
                yield r
        # at this point no further post-processing should be necessary,
        # `clone` and `get` must have done that (incl. parent handling)
        # if not, bugs should be fixed in those commands
        return
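
A hedged usage sketch of `install` through the Python API; the source URL and target path are placeholders.

import datalad.api as dl

# install a dataset from a (placeholder) URL into a local directory,
# recursing into subdatasets but not fetching file content
ds = dl.install(source='https://example.com/some-dataset.git',
                path='some-dataset', recursive=True, get_data=False)
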
Exemplo n.º 29
0
    def __call__(path,
                 dataset=None,
                 spec_file=None,
                 properties=None,
                 replace=False):
        # TODO: message

        dataset = require_dataset(dataset,
                                  check_installed=True,
                                  purpose="hirni spec4anything")
        path = assure_list(path)
        path = [resolve_path(p, dataset) for p in path]

        res_kwargs = dict(action='hirni spec4anything', logger=lgr)
        res_kwargs['refds'] = Interface.get_refds_path(dataset)

        # ### This might become superfluous. See datalad-gh-2653
        ds_path = PathRI(dataset.path)
        # ###

        updated_files = []
        paths = []
        for ap in AnnotatePaths.__call__(
                dataset=dataset,
                path=path,
                action='hirni spec4anything',
                unavailable_path_status='impossible',
                nondataset_path_status='error',
                return_type='generator',
                # TODO: Check this one out:
                on_failure='ignore',
                # Note/TODO: Not sure yet whether and when we need those.
                # Generally we want to be able to create a spec for subdatasets,
                # too:
                # recursive=recursive,
                # recursion_limit=recursion_limit,
                # force_subds_discovery=True,
                # force_parentds_discovery=True,
        ):

            if ap.get('status', None) in ['error', 'impossible']:
                yield ap
                continue

            # ### This might become superfluous. See datalad-gh-2653
            ap_path = PathRI(ap['path'])
            # ###

            # find acquisition and respective specification file:
            rel_path = posixpath.relpath(ap_path.posixpath, ds_path.posixpath)

            path_parts = rel_path.split('/')

            # TODO: Note: Commented out this warning for now. We used to not have
            # a spec file at the toplevel of the study dataset, but now we do.
            # The logic afterwards works, but should be revisited. At least,
            # `acq` should be called differently now.
            # if len(path_parts) < 2:
            #     lgr.warning("Not within an acquisition")
            acq = path_parts[0]

            # TODO: spec file specifiable or fixed path?
            #       if we want the former, what we actually need is an
            #       association of acquisition and its spec path
            #       => prob. not an option but a config

            spec_path = spec_file if spec_file \
                else posixpath.join(ds_path.posixpath, acq,
                                    dataset.config.get("datalad.hirni.studyspec.filename",
                                                       "studyspec.json"))

            spec = [r for r in json_py.load_stream(spec_path)] \
                if posixpath.exists(spec_path) else list()

            lgr.debug("Add specification snippet for %s", ap['path'])
            # XXX 'add' does not seem to be the thing we want to do
            # rather 'set', so we have to check whether a spec for a location
            # is already known and fail or replace it (maybe with --force)

            # go through all existing specs and extract unique value
            # and also assign them to the new record (subjects, ...), but only
            # editable fields!!
            uniques = dict()
            for s in spec:
                for k in s:
                    if isinstance(s[k], dict) and 'value' in s[k]:
                        if k not in uniques:
                            uniques[k] = set()
                        uniques[k].add(s[k]['value'])
            overrides = dict()
            for k in uniques:
                if len(uniques[k]) == 1:
                    overrides[k] = _get_edit_dict(value=uniques[k].pop(),
                                                  approved=False)

            if properties:

                # TODO: This entire reading of properties needs to be RF'd
                # into proper generalized functions.
                # spec got more complex. update() prob. can't simply override
                # (think: 'procedures' and 'tags' prob. need to be appended
                # instead)

                # load from file or json string
                if isinstance(properties, dict):
                    props = properties
                elif op.exists(properties):
                    props = json_py.load(properties)
                else:
                    props = json_py.loads(properties)
                # turn into editable, pre-approved records
                spec_props = {
                    k: dict(value=v, approved=True)
                    for k, v in props.items()
                    if k not in non_editables + ['tags', 'procedures']
                }
                spec_props.update({
                    k: v
                    for k, v in props.items() if k in non_editables + ['tags']
                })

                # TODO: still wrong. It's a list. Append or override? How to decide?
                spec_props.update({
                    o_k: [{
                        i_k: dict(value=i_v, approved=True)
                        for i_k, i_v in o_v.items()
                    }]
                    for o_k, o_v in props.items() if o_k in ['procedures']
                })

                overrides.update(spec_props)

            # TODO: It's probably wrong to use uniques for overwriting! At least
            # they cannot be used to overwrite values explicitly set in
            # _add_to_spec like "location", "type", etc.
            #
            # But then: This should concern non-editable fields only, right?

            spec = _add_to_spec(spec,
                                posixpath.split(spec_path)[0],
                                ap,
                                dataset,
                                overrides=overrides,
                                replace=replace)

            # Note: Not sure whether we really want one commit per snippet.
            #       If not - consider:
            #       - What if we fail amidst? => Don't write to file yet.
            #       - What about input paths from different acquisitions?
            #         => store specs per acquisition in memory
            # MIH: One commit per line seems silly. why not update all files
            # collect paths of updated files, and give them to a single `add`
            # at the very end?
            # MIH: if we fail, we fail and nothing is committed
            from datalad_hirni.support.spec_helpers import sort_spec
            json_py.dump2stream(sorted(spec, key=lambda x: sort_spec(x)),
                                spec_path)
            updated_files.append(spec_path)

            yield get_status_dict(status='ok',
                                  type=ap['type'],
                                  path=ap['path'],
                                  **res_kwargs)
            paths.append(ap)

        from datalad.dochelpers import single_or_plural
        from os import linesep
        message = "[HIRNI] Add specification {n_snippets} for: {paths}".format(
            n_snippets=single_or_plural("snippet", "snippets", len(paths)),
            paths=linesep.join(" - " + op.relpath(p['path'], dataset.path)
                               for p in paths)
            if len(paths) > 1 else op.relpath(paths[0]['path'], dataset.path))
        for r in dataset.save(updated_files,
                              to_git=True,
                              message=message,
                              return_type='generator',
                              result_renderer='disabled'):
            yield r
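
A simplified standalone sketch of the "uniques to overrides" step above: values that are identical across all existing spec snippets are carried into the new record as unapproved edits. Plain dicts stand in for hirni's `_get_edit_dict` records.

def collect_overrides(spec):
    uniques = {}
    for snippet in spec:
        for key, val in snippet.items():
            if isinstance(val, dict) and 'value' in val:
                uniques.setdefault(key, set()).add(val['value'])
    # keep only keys whose value is the same across all snippets that have them
    return {k: {'value': v.pop(), 'approved': False}
            for k, v in uniques.items() if len(v) == 1}

demo = [{'subject': {'value': '02', 'approved': True},
         'comment': {'value': 'first run', 'approved': False}},
        {'subject': {'value': '02', 'approved': True},
         'comment': {'value': 'second run', 'approved': False}}]
print(collect_overrides(demo))  # {'subject': {'value': '02', 'approved': False}}
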
Exemplo n.º 30
0
    def __call__(message=None,
                 files=None,
                 dataset=None,
                 auto_add_changes=False,
                 version_tag=None,
                 recursive=False,
                 recursion_limit=None,
                 super_datasets=False):
        # shortcut
        ds = require_dataset(dataset, check_installed=True, purpose='saving')

        if not ds.repo.repo.is_dirty(index=True,
                                     working_tree=True,
                                     untracked_files=True,
                                     submodules=True):
            # if we cannot see anything dirty at all, the only things we could
            # do is tag
            if version_tag:
                ds.repo.tag(version_tag)
            # take the easy one out
            return

        # always yields list; empty if None
        files = assure_list(files)

        # track what to be committed, so it becomes
        # possible to decide when/what to save further down
        # and one level up
        orig_hexsha = ds.repo.get_hexsha()
        to_commit = []

        # before anything, let's deal with missing submodules that may have
        # been rm'ed by the user
        # this will not alter/amend the history of the dataset
        deinit_deleted_submodules(ds)

        # XXX path resolution needs to happen on the input argument, not the
        # resolved dataset!
        # otherwise we will not be able to figure out, whether there was an
        # explicit dataset provided, or just a matching one resolved
        # automatically.
        # if files are provided but no dataset, we interpret them as
        # CWD-related

        if auto_add_changes:
            # use the dataset's base path to indicate that everything
            # should be saved
            if files:
                lgr.warning(
                    "List of paths was provided to save but auto_add_changes "
                    "was specified, so list of paths was ignored")
            files = [ds.path]
        else:
            # make sure we apply the usual path interpretation logic
            files = [resolve_path(p, dataset) for p in files]

        new_submodules = untracked_subdatasets_to_submodules(ds, files)
        if new_submodules:
            # make sure that .gitmodules is added to the list of files
            # to be committed.  Adding to the index might not be enough if
            # custom files were provided
            to_commit.append('.gitmodules')
        to_commit.extend(new_submodules)

        # now we should have a complete list of submodules to potentially
        # recurse into
        if recursive and (recursion_limit is None or recursion_limit > 0):
            # what subdataset to touch?
            subdss = []
            if auto_add_changes:
                # all installed 1st-level ones
                # we only want immediate subdatasets, higher depths will come
                # via recursion
                subdss = [
                    Dataset(opj(ds.path, subds_path))
                    for subds_path in ds.get_subdatasets(recursive=False)
                ]
            elif files is not None:
                # only subdatasets that contain any of the to-be-considered
                # paths
                # TODO:  the same deductions will be redone later again
                #  very inefficient.  Should be just sorted into subds
                #  once!
                subdss = [
                    ds.get_containing_subdataset(p, recursion_limit=1)
                    for p in files
                ]

            # skip anything that isn't installed, or this dataset
            subdss = [d for d in subdss if d.is_installed() and d != ds]

            prop_recursion_limit = \
                None if recursion_limit is None else max(recursion_limit - 1, 0)

            for subds in subdss:
                # TODO: just make use of get._sort_paths_into_datasets
                # currently it is very inefficient since for the same ds
                # it asks about subdatasets for every file!
                subds_files = []  # files belonging to the subds
                todo_files = []  # leftover files
                for f in files:
                    if ds.get_containing_subdataset(
                            f, recursion_limit=1) == subds:
                        subds_files.append(f)
                    else:
                        todo_files.append(f)
                files = todo_files

                subds_modified = Save.__call__(
                    message=message,
                    files=subds_files,
                    dataset=subds,
                    auto_add_changes=auto_add_changes,
                    version_tag=version_tag,
                    recursive=recursive and
                    (prop_recursion_limit is None or prop_recursion_limit > 0),
                    recursion_limit=prop_recursion_limit,
                )
                if subds_modified:
                    # stage changes in this submodule
                    subdspath = relpath(subds.path, ds.path)
                    ds.repo.add(subdspath, git=True)
                    to_commit.append(subdspath)

        if files:  # could still be none without auto add changes
            ds_subdatasets = ds.get_subdatasets(recursive=False)
            subdatasets_paths = {opj(ds.path, f) for f in ds_subdatasets}
            # TODO: also use some centralized sorting into sub-datasets
            # e.g. one used in get
            ds_files = [
                f for f in files if f in subdatasets_paths
                or ds.get_containing_subdataset(f, recursion_limit=1) == ds
            ]
            if len(ds_files):
                # XXX Is there a better way to handle files in mixed repos?
                ds.repo.add(ds_files)
                ds.repo.add(ds_files, git=True)
                to_commit.extend(ds_files)
            # it might be that the file itself is the submodule, so we might
            # need to commit .gitmodules
            for f in files:
                for subds in subdatasets_paths:
                    if subds.rstrip('/') == f.rstrip('/'):
                        to_commit.append('.gitmodules')
                        break

        _datalad_msg = False
        if not message:
            message = 'Recorded existing changes'
            _datalad_msg = True

        # extend with files yet to be committed in this dataset
        to_commit.extend(files)

        # anything should be staged by now
        # however, staged submodule changes are not considered as
        # `index`, hence `submodules` needs to be True too
        # we can have an explicit list of stuff to save or (if no `files`
        # provided) have staged stuff
        if ds.repo.repo.is_dirty(index=True,
                                 working_tree=False,
                                 untracked_files=False,
                                 submodules=True):

            # Analyze list of known to be committed files/submodules,
            # see if nothing points outside, and then convert to relative paths
            to_commit_rel = []
            if to_commit:
                repopath = ds.repo.path
                for f in to_commit:
                    if isabs(f):
                        frel = relpath(f, repopath)
                        if frel.startswith(pardir):
                            # XXX may be just a warning and skip?
                            raise RuntimeError(
                                "Path %s outside of the dataset %s. Can't commit"
                                % (f, ds))
                        f = frel
                    to_commit_rel.append(f)
                to_commit_rel = sorted(set(to_commit_rel))
                if '.' in to_commit_rel:
                    # we need to commit everything
                    to_commit_rel = []

            ds.repo.commit(message,
                           options=to_commit_rel,
                           _datalad_msg=_datalad_msg)
        elif to_commit:
            lgr.warning(
                "Was instructed to commit %s files but repository is not dirty",
                to_commit)
        elif not auto_add_changes:
            lgr.info('Nothing to save, consider auto-detection of changes, '
                     'if this is unexpected.')

        # MIH: let's tag even if there was nothing commit. I'd forget this
        # option too often...
        if version_tag:
            ds.repo.tag(version_tag)

        _was_modified = ds.repo.get_hexsha() != orig_hexsha

        # and now we could consider saving our changes within super-datasets
        # Let's float up until we get to a non-dataset
        if super_datasets:
            if _was_modified:
                if version_tag:
                    lgr.info(
                        "Version tag %s will not be applied to super datasets",
                        version_tag)
                superds = ds
                while True:
                    supersubds = superds
                    superds = superds.get_superdataset(datalad_only=True)
                    if not superds:
                        break
                    Save.__call__(
                        message=message +
                        " [origin: %s]" % relpath(ds.path, superds.path),
                        files=[relpath(supersubds.path, superds.path)],
                        dataset=superds,
                        auto_add_changes=False,
                        version_tag=None,
                        recursive=False,
                    )
            else:
                lgr.info(
                    "Not trying to save super-datasets since no modifications")

        # TODO: figure out what we should return for recursive/super_datasets
        # shouldn't we return all commits???
        return ds.repo.repo.head.commit if _was_modified else None
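
A minimal sketch of the to-commit path normalization used above: absolute paths are converted to repo-relative ones and anything escaping the repository is rejected.

from os.path import isabs, relpath, pardir

def to_commit_relative(paths, repopath):
    rel = []
    for f in paths:
        if isabs(f):
            frel = relpath(f, repopath)
            if frel.startswith(pardir):
                raise RuntimeError(
                    "Path %s outside of the dataset %s. Can't commit" % (f, repopath))
            f = frel
        rel.append(f)
    return sorted(set(rel))

print(to_commit_relative(['/data/ds/file.txt', 'notes.md'], '/data/ds'))
# ['file.txt', 'notes.md']
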
Exemplo n.º 31
0
    def __call__(path=None,
                 spec=None,
                 dataset=None,
                 subject=None,
                 anon_subject=None,
                 acquisition=None,
                 properties=None):

        # TODO: acquisition can probably be removed (or made an alternative to
        # derive spec and/or dicom location from)

        # Change, so path needs to point directly to dicom ds?
        # Or just use acq and remove path?

        dataset = require_dataset(dataset,
                                  check_installed=True,
                                  purpose="spec from dicoms")

        from datalad.utils import assure_list
        if path is not None:
            path = assure_list(path)
            path = [resolve_path(p, dataset) for p in path]
        else:
            raise InsufficientArgumentsError(
                "insufficient arguments for dicom2spec: a path is required")

        # TODO: We should be able to deal with several paths at once
        #       ATM we aren't (see also commit + message of actual spec)
        assert len(path) == 1

        if not spec:
            raise InsufficientArgumentsError(
                "insufficient arguments for dicom2spec: a spec file is required"
            )

            # TODO: That's prob. wrong. We can derive default spec from acquisition
        else:
            spec = resolve_path(spec, dataset)

        spec_series_list = \
            [r for r in json_py.load_stream(spec)] if op.exists(spec) else list()

        # get dataset level metadata:
        found_some = False
        for meta in dataset.meta_dump(
                path,
                recursive=False,  # always False?
                reporton='datasets',
                return_type='generator',
                result_renderer='disabled'):
            if meta.get('status', None) not in ['ok', 'notneeded']:
                yield meta
                continue

            if 'dicom' not in meta['metadata']:

                # TODO: Really "notneeded" or simply not a result at all?
                yield dict(status='notneeded',
                           message=("found no DICOM metadata for %s",
                                    meta['path']),
                           path=meta['path'],
                           type='dataset',
                           action='dicom2spec',
                           logger=lgr)
                continue

            if 'Series' not in meta['metadata']['dicom'] or \
                    not meta['metadata']['dicom']['Series']:
                yield dict(
                    status='impossible',
                    message=("no image series detected in DICOM metadata of"
                             " %s", meta['path']),
                    path=meta['path'],
                    type='dataset',
                    action='dicom2spec',
                    logger=lgr)
                continue

            found_some = True

            overrides = dict()
            if properties:
                # load from file or json string
                props = json_py.load(properties) \
                        if op.exists(properties) else json_py.loads(properties)
                # turn into editable, pre-approved records
                props = {
                    k: dict(value=v, approved=True)
                    for k, v in props.items()
                }
                overrides.update(props)

            spec_series_list = add_to_spec(
                meta,
                spec_series_list,
                op.dirname(spec),
                subject=subject,
                anon_subject=anon_subject,
                # session=session,
                # TODO: parameter "session" was what
                # we now call acquisition. This is
                # NOT a good default for bids_session!
                # Particularly wrt to anonymization
                overrides=overrides,
                dataset=dataset)

        if not found_some:
            yield dict(
                status='impossible',
                message="found no DICOM metadata",
                path=path,
                # TODO: arguably should be 'file' or 'dataset', depending on path
                type='file',
                action='dicom2spec',
                logger=lgr)
            return

        # TODO: RF needed. This rule should go elsewhere:
        # ignore duplicates (prob. reruns of aborted runs)
        # -> convert highest id only
        # Note: This sorting is a q&d hack!
        # TODO: Sorting needs to become more sophisticated + include notion of :all
        spec_series_list = sorted(spec_series_list,
                                  key=lambda x: get_specval(x, 'id')
                                  if 'id' in x.keys() else 0)
        for i in range(len(spec_series_list)):
            # Note: Removed the following line from condition below,
            # since it appears to be pointless. Value for 'converter'
            # used to be 'heudiconv' or 'ignore' for a 'dicomseries', so
            # it's not clear ATM what case this could possibly have caught:
            # heuristic.has_specval(spec_series_list[i], "converter") and \
            if spec_series_list[i]["type"] == "dicomseries" and \
                has_specval(spec_series_list[i], "bids-run") and \
                get_specval(spec_series_list[i], "bids-run") in \
                    [get_specval(s, "bids-run")
                     for s in spec_series_list[i + 1:]
                     if get_specval(
                            s,
                            "description") == get_specval(
                                spec_series_list[i], "description") and \
                     get_specval(s, "id") > get_specval(
                                             spec_series_list[i], "id")
                     ]:
                lgr.debug("Ignore SeriesNumber %s for conversion" % i)
                spec_series_list[i]["tags"].append(
                    'hirni-dicom-converter-ignore')

        lgr.debug("Storing specification (%s)", spec)
        # store as a stream (one record per file) to be able to
        # easily concat files without having to parse them, or
        # process them line by line without having to fully parse them
        from datalad_hirni.support.spec_helpers import sort_spec
        # Note: Sorting paradigm needs to change. See above.
        # spec_series_list = sorted(spec_series_list, key=lambda x: sort_spec(x))
        json_py.dump2stream(spec_series_list, spec)

        # make sure spec is in git:
        dataset.repo.set_gitattributes([(spec, {
            'annex.largefiles': 'nothing'
        })], '.gitattributes')

        for r in Save.__call__(dataset=dataset,
                               path=[spec, '.gitattributes'],
                               to_git=True,
                               message="[HIRNI] Added study specification "
                               "snippet for %s" %
                               op.relpath(path[0], dataset.path),
                               return_type='generator',
                               result_renderer='disabled'):
            if r.get('status', None) not in ['ok', 'notneeded']:
                yield r
            elif r['path'] in [spec, op.join(dataset.path, '.gitattributes')] \
                    and r['type'] == 'file':
                r['action'] = 'dicom2spec'
                r['logger'] = lgr
                yield r
            elif r['type'] == 'dataset':
                # 'ok' or 'notneeded' for a dataset is okay, since we commit
                # the spec. But it's not a result to yield
                continue
            else:
                # anything else shouldn't happen
                yield dict(
                    status='error',
                    message=("unexpected result from save: %s", r),
                    # TODO: this actually isn't clear - get it from `r`
                    path=spec,
                    type='file',
                    action='dicom2spec',
                    logger=lgr)
Exemplo n.º 32
0
def diff_dataset(dataset,
                 fr,
                 to,
                 constant_refs,
                 path=None,
                 annex=None,
                 untracked='normal',
                 recursive=False,
                 recursion_limit=None,
                 eval_file_type=True,
                 reporting_order='depth-first'):
    """Internal helper to diff a dataset

    Parameters
    ----------
    dataset : Dataset
      Dataset to perform the diff on. `fr` and `to` parameters are interpreted
      in the context of this dataset.
    fr : str
      Commit-ish to compare from.
    to : str
      Commit-ish to compare to.
    constant_refs : bool
      If True, `fr` and `to` will be passed on unmodified to diff operations
      on subdatasets. This can be useful with symbolic references like tags
      to report subdataset changes independent of superdataset changes.
      If False, `fr` and `to` will be translated to the subdataset commit-ish
      that match the given commit-ish in the superdataset.
    path : Path-like, optional
      Paths to constrain the diff to (see main diff() command).
    annex : str, optional
      Reporting mode for annex properties (see main diff() command).
    untracked : str, optional
      Reporting mode for untracked content (see main diff() command).
    recursive : bool, optional
      Flag to enable recursive operation (see main diff() command).
    recursion_limit : int, optional
      Recursion limit (see main diff() command).
    eval_file_type : bool, optional
      Whether to perform file type discrimination between real symlinks
      and symlinks representing annex'ed files. This can be expensive
      in datasets with many files.
    reporting_order : {'depth-first', 'breadth-first'}, optional
      By default, subdataset content records are reported after the record
      on the subdataset's submodule in a superdataset (depth-first).
      Alternatively, report all superdataset records first, before reporting
      any subdataset content records (breadth-first).

    Yields
    ------
    dict
      DataLad result records.
    """
    if reporting_order not in ('depth-first', 'breadth-first'):
        raise ValueError('Unknown reporting order: {}'.format(reporting_order))

    ds = require_dataset(dataset,
                         check_installed=True,
                         purpose='difference reporting')

    # we cannot really perform any sorting of paths into subdatasets
    # or rejecting paths based on the state of the filesystem, as
    # we need to be able to compare with states that are not represented
    # in the worktree (anymore)
    if path:
        ps = []
        # sort any path argument into the respective subdatasets
        for p in sorted(assure_list(path)):
            # it is important to capture the exact form of the
            # given path argument, before any normalization happens
            # distinguish rsync-link syntax to identify
            # a dataset as whole (e.g. 'ds') vs its
            # content (e.g. 'ds/')
            # special case is the root dataset, always report its content
            # changes
            orig_path = str(p)
            resolved_path = resolve_path(p, dataset)
            p = \
                resolved_path, \
                orig_path.endswith(op.sep) or resolved_path == ds.pathobj
            str_path = str(p[0])
            root = get_dataset_root(str_path)
            if root is None:
                # no root, not possibly underneath the refds
                yield dict(action='status',
                           path=str_path,
                           refds=ds.path,
                           status='error',
                           message='path not underneath this dataset',
                           logger=lgr)
                continue
            if path_under_rev_dataset(ds, str_path) is None:
                # nothing we support handling any further
                # there is only a single refds
                yield dict(
                    path=str_path,
                    refds=ds.path,
                    action='diff',
                    status='error',
                    message=(
                        "dataset containing given paths is not underneath "
                        "the reference dataset %s: %s", ds, str_path),
                    logger=lgr,
                )
                continue

            ps.append(p)
        path = ps

    # TODO we might want to move away from the single-pass+immediate-yield
    # paradigm for this command. If we gather all information first, we
    # could do post-processing and detect when a file (same gitsha, or same
    # key) was copied/moved from another dataset. Another command (e.g.
    # save) could act on this information and also move/copy
    # availability information or at least enhance the respective commit
    # message with cross-dataset provenance info

    # cache to help avoid duplicate status queries
    content_info_cache = {}
    for res in _diff_ds(
            ds,
            fr,
            to,
            constant_refs,
            recursion_limit if recursion_limit is not None and recursive else
            -1 if recursive else 0,
            # TODO recode paths to repo path reference
            origpaths=None if not path else OrderedDict(path),
            untracked=untracked,
            annexinfo=annex,
            eval_file_type=eval_file_type,
            cache=content_info_cache,
            order=reporting_order):
        res.update(
            refds=ds.path,
            logger=lgr,
            action='diff',
        )
        yield res
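
A hedged usage sketch of the helper above; the dataset path and revision range are placeholders, and the exact keys of each result record are an assumption.

from datalad.distribution.dataset import Dataset

ds = Dataset('/tmp/some-ds')  # placeholder path to an installed dataset
for res in diff_dataset(ds, fr='HEAD~1', to='HEAD',
                        constant_refs=False, recursive=True):
    # records are plain dicts; 'state' is assumed to hold added/modified/...
    print(res.get('status'), res.get('state'), res.get('path'))
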
Exemplo n.º 33
0
def sort_paths_by_datasets(refds, orig_dataset_arg, paths):
    """Sort paths into actually present datasets

    Parameters
    ----------
    refds : Dataset or None
      Dataset instance of a reference dataset, if any exists. This is
      not just a `dataset` argument in any form (for path resolution),
      see `orig_dataset_arg` for that, but has to be a Dataset instance
      that serves as the root of all operations.
    orig_dataset_arg : None or str
      The original dataset argument of the calling command. This is
      used to determine the path specification semantics, i.e.
      relative to CWD vs. relative to a given dataset
    paths : list
      Paths as given to the calling command

    Returns
    -------
    OrderedDict, list
      The dictionary contains all to-be-sorted paths as values to
      their respective containing datasets paths (as keys). The second
      list contains status dicts for any errors that may have occurred
      during processing. They can be yielded in the context of
      the calling command.
    """
    errors = []
    paths_by_ds = OrderedDict()
    # sort any path argument into the respective subdatasets
    for p in sorted(paths):
        # it is important to capture the exact form of the
        # given path argument, before any normalization happens
        # for further decision logic below
        orig_path = text_type(p)
        p = resolve_path(p, orig_dataset_arg)
        root = rev_get_dataset_root(text_type(p))
        if root is None:
            # no root, not possibly underneath the refds
            errors.append(
                dict(action='status',
                     path=p,
                     status='error',
                     message='path not underneath this dataset',
                     logger=lgr))
            continue
        else:
            if refds and root == text_type(p) and \
                    not orig_path.endswith(op.sep):
                # the given path is pointing to a dataset
                # distinguish rsync-link syntax to identify
                # the dataset as whole (e.g. 'ds') vs its
                # content (e.g. 'ds/')
                super_root = rev_get_dataset_root(op.dirname(root))
                if super_root:
                    # the dataset identified by the path argument
                    # is contained in a superdataset, and no
                    # trailing path separator was found in the
                    # argument -> user wants to address the dataset
                    # as a whole (in the superdataset)
                    root = super_root

        root = Path(root)
        ps = paths_by_ds.get(root, [])
        ps.append(p)
        paths_by_ds[root] = ps

    return paths_by_ds, errors
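
A hedged usage sketch for the helper above, showing how the two return values are typically consumed; `process_dataset` is a hypothetical per-dataset handler, not part of DataLad.

def handle_command_paths(refds, dataset_arg, requested_paths):
    # sort the requested paths into their containing (sub)datasets
    paths_by_ds, errors = sort_paths_by_datasets(refds, dataset_arg, requested_paths)
    for err in errors:
        # ready-made status dicts, yielded as command results
        yield err
    for ds_root, ds_paths in paths_by_ds.items():
        # ds_root: pathlib.Path of the containing dataset
        # ds_paths: resolved paths that live underneath it
        yield from process_dataset(ds_root, ds_paths)  # hypothetical handler
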
Example no. 34
0
    def __call__(
            source,
            path=None,
            dataset=None,
            description=None,
            reckless=None):
        # did we explicitly get a dataset to install into?
        # if we got a dataset, path will be resolved against it.
        # Otherwise path will be resolved first.
        ds = require_dataset(
            dataset, check_installed=True, purpose='cloning') \
            if dataset is not None else dataset
        refds_path = ds.path if ds else None

        # legacy compatibility
        if reckless is True:
            # so that we can forget about how things used to be
            reckless = 'auto'

        if isinstance(source, Dataset):
            source = source.path

        if source == path:
            # even if they turn out to be identical after resolving symlinks
            # and more sophisticated witchcraft, it would still happily say
            # "it appears to be already installed", so we just catch an
            # obviously pointless input combination
            raise ValueError(
                "clone `source` and destination `path` are identical [{}]. "
                "If you are trying to add a subdataset simply use `save`".format(
                    path))

        if path is not None:
            path = resolve_path(path, dataset)

        # derive target from source:
        if path is None:
            # we got nothing but a source. do something similar to git clone
            # and derive the path from the source and continue
            # since this is a relative `path`, resolve it:
            # we are not going to reuse the decoded URL, as that is done for
            # all source candidates in clone_dataset(); we just use it to
            # determine a destination path here in order to perform a bunch of
            # additional checks that shall not pollute the helper function
            source_ = decode_source_spec(
                source, cfg=None if ds is None else ds.config)
            path = resolve_path(source_['default_destpath'], dataset)
            lgr.debug("Determined clone target path from source")
        lgr.debug("Resolved clone target path to: '%s'", path)

        # there is no other way -- my intoxicated brain tells me
        assert(path is not None)

        result_props = dict(
            action='install',
            logger=lgr,
            refds=refds_path,
            source_url=source)

        try:
            # this will implicitly cause pathlib to run a bunch of checks
            # whether the present path makes any sense on the platform
            # we are running on -- we don't care if the path actually
            # exists at this point, but we want to abort early if the path
            # spec is determined to be useless
            path.exists()
        except OSError as e:
            yield get_status_dict(
                status='error',
                path=path,
                message=('cannot handle target path: %s', exc_str(e)),
                **result_props)
            return

        destination_dataset = Dataset(path)
        result_props['ds'] = destination_dataset

        if ds is not None and ds.pathobj not in path.parents:
            yield get_status_dict(
                status='error',
                message=("clone target path '%s' not in specified target dataset '%s'",
                         path, ds),
                **result_props)
            return

        # perform the actual cloning operation
        yield from clone_dataset(
            [source],
            destination_dataset,
            reckless,
            description,
            result_props,
            cfg=None if ds is None else ds.config,
        )

        # TODO handle any 'version' property handling and verification using a dedicated
        # public helper

        if ds is not None:
            # we created a dataset in another dataset
            # -> make submodule
            for r in ds.save(
                    path,
                    return_type='generator',
                    result_filter=None,
                    result_xfm=None,
                    on_failure='ignore'):
                yield r
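
The target-path derivation in the example above can be summarized in isolation. This is a simplified sketch, not DataLad's actual helper; it reuses the `resolve_path` and `decode_source_spec` names exactly as they appear in the snippet and assumes they are imported the same way.

def derive_clone_target(source, path=None, dataset=None, cfg=None):
    """Return the clone destination, deriving it from `source` if no path is given."""
    if path is not None:
        # explicit destination: resolve against the (optional) dataset
        return resolve_path(path, dataset)
    # no destination given: fall back to the default suggested by the source spec
    spec = decode_source_spec(source, cfg=cfg)
    return resolve_path(spec['default_destpath'], dataset)
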
Example no. 35
0
    def __call__(dataset=None, dest=None, path=None,
                 # Note: add remote currently disabled in publish
                 # dest_url=None, dest_pushurl=None,
                 with_data=None, recursive=False):

        # Note: add remote currently disabled in publish
        # if dest is None and (dest_url is not None
        #                        or dest_pushurl is not None):
        #     raise ValueError("""insufficient information for adding the
        #     destination as a sibling (needs at least a name)""")

        # shortcut
        ds = dataset

        if ds is not None and not isinstance(ds, Dataset):
            ds = Dataset(ds)
        if not path:
            path = curdir
        elif isinstance(path, list):
            return [Publish.__call__(
                    dataset=ds,
                    dest=dest,
                    path=p,
                    # Note: add remote currently disabled in publish
                    # dest_url=dest_url,
                    # dest_pushurl=dest_pushurl,
                    with_data=with_data,
                    recursive=recursive) for p in path]

        # resolve the location against the provided dataset
        if path is not None:
            path = resolve_path(path, ds)

        lgr.info("Publishing {0}".format(path))

        # if we have no dataset given, figure out which one we need to operate
        # on, based on the resolved location (that is now guaranteed to
        # be specified)
        if ds is None:
            # try to find a dataset at or above the location
            dspath = GitRepo.get_toppath(abspath(path))
            if dspath is None:
                # no top-level dataset found, use path as such
                dspath = path
            ds = Dataset(dspath)
        lgr.debug("Resolved dataset for publication: {0}".format(ds))
        assert(ds is not None)

        # it might still be about a subdataset of ds:
        if path is not None:
            relativepath = relpath(path, start=ds.path)
            subds = get_containing_subdataset(ds, relativepath)
            if subds.path != ds.path:
                # path belongs to a subdataset; hand it over
                lgr.debug("Hand over to submodule %s" % subds.path)
                return subds.publish(dest=dest,
                                     path=relpath(path, start=subds.path),
                                     # Note: add remote currently disabled in publish
                                     # dest_url=dest_url,
                                     # dest_pushurl=dest_pushurl,
                                     with_data=with_data,
                                     recursive=recursive)

        # now we know we have to operate on ds. So ds needs to be installed,
        # since we cannot publish anything from a dataset that is not
        # installed.
        # (But maybe just the existence of ds.repo is important here.)
        if not ds.is_installed():
            raise ValueError("No installed dataset found at "
                             "{0}.".format(ds.path))
        assert(ds.repo is not None)

        # TODO: For now we can only deal with a sibling (remote) name given by
        # `dest`. Figure out when to allow for passing a local path or URL
        # directly and what to do in that case.

        # Note: we need an upstream remote, if there's none given. We could
        # wait for git push to complain, but we need to explicitly figure it
        # out for pushing annex branch anyway and we might as well fail right
        # here.

        # keep original dest in case it's None for passing to recursive calls:
        dest_resolved = dest
        if dest is None:
            # check for tracking branch's remote:
            try:
                std_out, std_err = \
                    ds.repo._git_custom_command(
                        '',
                        ["git", "config", "--get",
                         "branch.{active_branch}.remote".format(
                             active_branch=ds.repo.git_get_active_branch())],
                        expect_fail=True)
            except CommandError as e:
                if e.code == 1 and e.stdout == "":
                    std_out = None
                else:
                    raise
            if std_out:
                dest_resolved = std_out.strip()
            else:
                # we have no remote given and no upstream => fail
                raise RuntimeError("No known default target for "
                                   "publication and none given.")

        # an upstream branch is needed for update (merge) and the subsequent
        # push, in case there is none yet.
        set_upstream = False
        try:
            # Note: tracking branch actually defined by entry "merge"
            # PLUS entry "remote"
            std_out, std_err = \
                ds.repo._git_custom_command(
                    '',
                    ["git", "config", "--get",
                     "branch.{active_branch}.merge".format(
                         active_branch=ds.repo.git_get_active_branch())],
                    expect_fail=True)
        except CommandError as e:
            if e.code == 1 and e.stdout == "":
                # no tracking branch yet:
                set_upstream = True
            else:
                raise

        # is `dest` an already known remote?
        if dest_resolved not in ds.repo.git_get_remotes():
            # unknown remote
            raise ValueError("No sibling '%s' found." % dest_resolved)

            # Note: add remote currently disabled in publish
            # if dest_url is None:
            #     raise ValueError("No sibling '%s' found. Provide `dest-url`"
            #                      " to register it." % dest_resolved)
            # lgr.info("Sibling %s unknown. Registering ...")
            #
            # # Fill in URL-Template:
            # remote_url = dest_url.replace("%NAME", basename(ds.path))
            # # TODO: handle_name.replace("/", "-")) instead of basename()
            # #       - figure it out ;)
            # #       - either a dataset needs to discover superdatasets in
            # #         order to get its relative path to provide a name
            # #       - or: We need a different approach on the templates
            #
            # # Add the remote
            # ds.repo.git_remote_add(dest_resolved, remote_url)
            # if dest_pushurl:
            #     # Fill in template:
            #     remote_url_push = \
            #         dest_pushurl.replace("%NAME", basename(ds.path))
            #     # TODO: Different way of replacing %NAME; See above
            #
            #     # Modify push url:
            #     ds.repo._git_custom_command('',
            #                                 ["git", "remote",
            #                                  "set-url",
            #                                  "--push", dest_resolved,
            #                                  remote_url_push])
            # lgr.info("Added sibling '%s'." % dest)
            # lgr.debug("Added remote '%s':\n %s (fetch)\n%s (push)." %
            #           (dest_resolved, remote_url,
            #            remote_url_push if dest_pushurl else remote_url))
        # Note: add remote currently disabled in publish
        # else:
        #     # known remote: parameters dest-url-* currently invalid.
        #     # This may change to adapt the existing remote.
        #     if dest_url:
        #         lgr.warning("Sibling '%s' already exists for dataset '%s'. "
        #                     "Ignoring dest-url %s." %
        #                     (dest_resolved, ds.path, dest_url))
        #     if dest_pushurl:
        #         lgr.warning("Sibling '%s' already exists for dataset '%s'. "
        #                     "Ignoring dest-pushurl %s." %
        #                     (dest_resolved, ds.path, dest_pushurl))

        # Figure out, what to publish
        if path is None or path == ds.path:
            # => publish the dataset itself
            # push local state:
            # TODO: Rework git_push in GitRepo
            cmd = ['git', 'push']
            if set_upstream:
                # no upstream branch yet
                cmd.append("--set-upstream")
            cmd += [dest_resolved, ds.repo.git_get_active_branch()]
            ds.repo._git_custom_command('', cmd)
            # push annex branch:
            if isinstance(ds.repo, AnnexRepo):
                ds.repo.git_push("%s +git-annex:git-annex" % dest_resolved)

            # TODO: if with_data is a shell pattern, we get a list, when called
            # from shell, right?
            # => adapt the following and check constraints to allow for that
            if with_data:
                ds.repo._git_custom_command('', ["git", "annex", "copy"] +
                                            with_data + ["--to", dest_resolved])

            if recursive and ds.get_dataset_handles() != []:
                results = [ds]
                # Note: add remote currently disabled in publish
                # modify URL templates:
                # if dest_url:
                #     dest_url = dest_url.replace('%NAME', basename(ds.path) + '-%NAME')
                # if dest_pushurl:
                #     dest_pushurl = dest_pushurl.replace('%NAME', basename(ds.path) + '-%NAME')
                for subds in ds.get_dataset_handles():
                    results.append(Dataset(opj(ds.path,
                                              subds)).publish(
                        dest=dest,
                        # Note: use `dest` instead of `dest_resolved` in case
                        # dest was None, so subdatasets would use their default
                        # as well
                        # Note: add remote currently disabled in publish
                        # dest_url=dest_url,
                        # dest_pushurl=dest_pushurl,
                        with_data=with_data,
                        recursive=recursive))
                return results

            return ds

        elif exists(path):
            # At this point `path` is not referencing a (sub)dataset.
            # An annexed file is the only thing left that `path` might
            # validly be pointing to. Anything else we can't handle currently.
            if isinstance(ds.repo, AnnexRepo):
                try:
                    if ds.repo.get_file_key(relativepath):
                        # file is in annex, publish it
                        ds.repo._run_annex_command('copy',
                                                   annex_options=[path,
                                                                  '--to=%s' % dest_resolved])
                        return path
                except (FileInGitError, FileNotInAnnexError):
                    pass
            # `path` can't be published
            lgr.warning("Don't know how to publish %s." % path)
            return None

        else:
            # nothing to publish found
            lgr.warning("Nothing to publish found at %s." % path)
            return None
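
The default-target lookup near the top of the example boils down to reading the active branch's configured remote. Below is a self-contained sketch using plain subprocess calls instead of DataLad's internal `_git_custom_command` wrapper; `repo_path` is assumed to point at an existing git work tree.

import subprocess

def default_publish_target(repo_path):
    """Return the tracking remote of the active branch, or None if unset."""
    branch = subprocess.run(
        ['git', '-C', repo_path, 'rev-parse', '--abbrev-ref', 'HEAD'],
        capture_output=True, text=True, check=True).stdout.strip()
    res = subprocess.run(
        ['git', '-C', repo_path, 'config', '--get', f'branch.{branch}.remote'],
        capture_output=True, text=True)
    # `git config --get` exits non-zero with empty output when the key is unset
    return res.stdout.strip() or None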