Example #1
File: save.py Project: silky/datalad
def untracked_subdatasets_to_submodules(ds, consider_paths):
    # treat special case of still untracked subdatasets.
    # those need to become submodules now, as they are otherwise added
    # without an entry in .gitmodules, and subsequently break Git's
    # submodule functionality completely
    new_modules = []
    if not consider_paths:
        # nothing to test
        return new_modules

    for utf in ds.repo.repo.untracked_files:
        utf_abspath = opj(ds.path, utf)
        if not isdir(utf_abspath):
            # this cannot be a repository
            continue

        # test whether the potential submodule is scheduled for saving
        utf_realpath = realpath(utf_abspath)
        if any([
                utf_realpath.startswith(_with_sep(realpath(f)))
                for f in consider_paths
        ]):
            # matches at least one path -> turn into submodule
            # the return value can be ignored, we don't need it
            _install_subds_inplace(
                ds=ds,
                path=utf_abspath,
                relativepath=utf.rstrip(os.sep),
                name=None)
            new_modules.append(utf.rstrip(os.sep))

    return new_modules
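
Every example on this page hinges on `_with_sep`, whose definition is not reproduced here. Judging from how it is used, it presumably returns the given path with a trailing separator appended, so that `startswith` checks match only paths truly inside a directory and never a sibling that merely shares the prefix. A minimal sketch under that assumption:

import os

def _with_sep(path):
    # hypothetical sketch of the helper used throughout these examples:
    # guarantee a trailing path separator
    return path if path.endswith(os.sep) else path + os.sep

# why the trailing separator matters for prefix checks (POSIX paths shown)
root = '/data/sub'
inside = '/data/sub/file.dat'
sibling = '/data/subx'

print(inside.startswith(root), sibling.startswith(root))
# True True   -- a bare prefix check also matches the sibling
print(inside.startswith(_with_sep(root)), sibling.startswith(_with_sep(root)))
# True False  -- only true sub-paths match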
Example #2
File: utils.py Project: debanjum/datalad
def _recursive_install_subds_underneath(ds, recursion_limit, reckless, start=None):
    content_by_ds = {}
    if isinstance(recursion_limit, int) and recursion_limit <= 0:
        return content_by_ds
    # loop over submodules, not subdatasets, to get the URL right away
    # install using a helper that gives some flexibility regarding where to
    # get the module from
    for sub in ds.repo.get_submodules():
        subds = Dataset(opj(ds.path, sub.path))
        if start is not None and not subds.path.startswith(_with_sep(start)):
            # this one we can ignore, not underneath the start path
            continue
        if not subds.is_installed():
            try:
                lgr.info("Installing subdataset %s", subds.path)
                subds = _install_subds_from_flexible_source(
                    ds, sub.path, sub.url, reckless)
                # we want the entire thing, but mark this subdataset
                # as automatically installed
                content_by_ds[subds.path] = [curdir]
            except Exception as e:
                # skip, if we didn't manage to install subdataset
                lgr.warning(
                    "Installation of subdatasets %s failed, skipped", subds)
                lgr.debug("Installation attempt failed with exception: %s",
                          exc_str(e))
                continue
            # otherwise recurse
            # we can skip the start expression, we know we are within
            content_by_ds.update(_recursive_install_subds_underneath(
                subds,
                recursion_limit=recursion_limit - 1 if isinstance(recursion_limit, int) else recursion_limit,
                reckless=reckless
            ))
    return content_by_ds
Example #3
def get_tree_roots(paths):
    """Return common root paths for a set of paths

    This function determines the smallest set of common root
    paths and sorts all given paths under the respective
    root.

    Returns
    -------
    dict
      paths by root
    """
    paths_ws = [_with_sep(p) for p in paths]
    # sort all paths under their potential roots
    roots = {}
    # start from the top to get all paths down the line
    # and collate them into as few roots as possible
    for s in sorted(paths_ws):
        if any([s.startswith(r) for r in roots]):
            # this path is already covered by a known root
            continue
        # find all sub paths
        subs = [p for p in paths if p.startswith(s)]
        roots[s.rstrip(sep)] = subs
    return roots
Example #4
File: utils.py Project: datalad/datalad
def get_tree_roots(paths):
    """Return common root paths for a set of paths

    This function determines the smallest set of common root
    paths and sorts all given paths under the respective
    root.

    Returns
    -------
    dict
      paths by root
    """
    paths_ws = [_with_sep(p) for p in paths]
    # sort all paths under their potential roots
    roots = {}
    # start from the top to get all paths down the line
    # and collate them into as few roots as possible
    for s in sorted(paths_ws):
        if any([s.startswith(r) for r in roots]):
            # this path is already covered by a known root
            continue
        # find all sub paths
        subs = [p for p in paths if p.startswith(s)]
        roots[s.rstrip(sep)] = subs
    return roots
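
To see what `get_tree_roots` (Examples #3 and #4) produces, the sketch below re-runs the same logic with a stand-in for `_with_sep`/`sep` (assumed behaviour as above) on a few hypothetical POSIX paths; note that each root maps to the paths strictly underneath it, not to itself.

from os import sep  # '/' on POSIX

def _with_sep(p):
    # stand-in: ensure a trailing separator (assumed behaviour)
    return p if p.endswith(sep) else p + sep

def get_tree_roots(paths):
    # same logic as in the examples above
    paths_ws = [_with_sep(p) for p in paths]
    roots = {}
    for s in sorted(paths_ws):
        if any([s.startswith(r) for r in roots]):
            continue
        subs = [p for p in paths if p.startswith(s)]
        roots[s.rstrip(sep)] = subs
    return roots

print(get_tree_roots(['/ds', '/ds/sub1', '/ds/sub1/deep', '/elsewhere']))
# {'/ds': ['/ds/sub1', '/ds/sub1/deep'], '/elsewhere': []}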
Example #5
File: utils.py Project: adhvaithrp/datalad
def discover_dataset_trace_to_targets(basepath, targetpaths, current_trace,
                                      spec):
    """Discover the edges and nodes in a dataset tree to given target paths

    Parameters
    ----------
    basepath : path
      Path to a start or top-level dataset. Really has to be a path to a
      dataset!
    targetpaths : list(path)
      Any non-zero number of paths that are termination points for the
      search algorithm. Can be paths to datasets, directories, or files
      (and any combination thereof).
    current_trace : list
      For a top-level call this should probably always be `[]`
    spec : dict
      `content_by_ds`-style dictionary that will receive information about the
      discovered datasets. Specifically, for each discovered dataset there
      will be an item with its path under the key (path) of the respective
      superdataset.

    Returns
    -------
    None
      Function calls itself recursively and populates `spec` in-place.
    """
    # this beast walks the directory tree from a given `basepath` until
    # it discovers any of the given `targetpaths`
    # if it finds one, it commits any accumulated trace of visited
    # datasets on this edge to the spec
    valid_repo = GitRepo.is_valid_repo(basepath)
    if valid_repo:
        # we are passing into a new dataset, extend the dataset trace
        current_trace = current_trace + [basepath]
    if basepath in targetpaths:
        # found a targetpath, commit the trace
        for i, p in enumerate(current_trace[:-1]):
            # TODO RF prepare proper annotated path dicts
            spec[p] = list(set(spec.get(p, []) + [current_trace[i + 1]]))
    if not isdir(basepath):
        # nothing underneath this one -> done
        return
    # this edge is not done, we need to try to reach any downstream
    # dataset
    for p in listdir(basepath):
        if valid_repo and p == '.git':
            # ignore gitdir to speed things up
            continue
        p = opj(basepath, p)
        if all(t != p and not t.startswith(_with_sep(p)) for t in targetpaths):
            # OPT listdir might be large and we could have only few items
            # in `targetpaths` -- so traverse only those in spec which have
            # leading dir basepath
            continue
        # we need to call this even for non-directories, to be able to match
        # file target paths
        discover_dataset_trace_to_targets(p, targetpaths, current_trace, spec)
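
The `spec`-population step of `discover_dataset_trace_to_targets` is easiest to see in isolation. The sketch below uses a hand-written trace of hypothetical dataset paths (no Git involved) and shows how the loop over `current_trace[:-1]` commits the trace as parent-to-child edges.

# hypothetical trace of visited datasets, top-level first, target-containing last
current_trace = ['/ds', '/ds/sub', '/ds/sub/subsub']

spec = {}
for i, p in enumerate(current_trace[:-1]):
    # each dataset records the next dataset on the way to the target
    spec[p] = list(set(spec.get(p, []) + [current_trace[i + 1]]))

print(spec)
# {'/ds': ['/ds/sub'], '/ds/sub': ['/ds/sub/subsub']}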
Example #6
File: utils.py Project: debanjum/datalad
def amend_pathspec_with_superdatasets(spec, topmost=True, limit_single=False):
    """Amend a path spec dictionary with entries for superdatasets

    The result will be a superdataset entry (if a superdataset exists)
    for each input dataset. This entry will (at least) contain the path
    to the subdataset.

    Parameters
    ----------
    spec : dict
      Path spec
    topmost : Dataset or bool
      Flag whether to grab the immediate, or the top-most superdataset
      for each entry, alternatively this can be a dataset instance
      that is used as the topmost dataset.
    limit_single : bool
      If a `topmost` dataset is provided, and this flag is True, only
      the given topmost dataset will be considered as superdataset. Any
      datasets in the spec that are not underneath this dataset will
      not have associated superdataset entries added to the spec.

    Returns
    -------
    dict
      Amended path spec dictionary
    """
    superdss = {}
    for dpath in spec.keys():
        superds = None
        if isinstance(topmost, Dataset):
            if limit_single and dpath == topmost.path:
                # this is already the topmost, no further superdataset to
                # consider
                continue
            if dpath.startswith(_with_sep(topmost.path)):
                # the given topmost dataset is "above" the current
                # datasets path
                superds = topmost
            elif limit_single:
                continue
        if not superds:
            # grab the (topmost) superdataset
            superds = Dataset(dpath).get_superdataset(
                datalad_only=True, topmost=topmost)
        if not superds:
            continue
        # register the subdatasets path in the spec of the superds
        spaths = superdss.get(superds.path, [])
        if not spaths:
            spaths = spec.get(superds.path, [])
        spaths.append(dpath)
        superdss[superds.path] = spaths
    spec.update(superdss)
    return spec
Example #7
def _filterpaths(basepath, paths, exclude):
    final_paths = []
    for rp in [opj(basepath, p) if basepath else p for p in paths]:
        if rp in exclude:
            continue
        elif any(ep.startswith(_with_sep(rp)) for ep in exclude):
            final_paths.extend(
                _filterpaths(rp, listdir(opj(ds.path, rp)), exclude))
        else:
            final_paths.append(rp)
    return final_paths
Example #8
File: diff.py Project: overlake333/datalad
def _get_untracked_content(dspath, report_untracked, paths=None):
    cmd = [
        'git',
        '--work-tree=.',
        'status',
        '--porcelain',
        # file names NULL terminated
        '-z',
        # we never want to touch submodules, they cannot be untracked
        '--ignore-submodules=all',
        # fully untracked dirs as such, the rest as files
        '--untracked={}'.format(report_untracked)
    ]
    try:
        stdout, stderr = GitRunner(cwd=dspath).run(cmd,
                                                   log_stderr=True,
                                                   log_stdout=True,
                                                   log_online=False,
                                                   expect_stderr=False,
                                                   shell=False,
                                                   expect_fail=True)
    except CommandError as e:
        # TODO should we catch any and handle them in here?
        raise e

    if paths:
        paths = [r['path'] for r in paths]
        if len(paths) == 1 and paths[0] == dspath:
            # nothing to filter
            paths = None

    for line in stdout.split('\0'):
        if not line:
            continue
        if not line.startswith('?? '):
            # nothing untracked, ignore, task of `diff`
            continue
        apath = opj(
            dspath,
            # strip state marker
            line[3:])
        norm_apath = normpath(apath)
        if paths and not any(
            [norm_apath == p or apath.startswith(_with_sep(p))
             for p in paths]):
            # we got a whitelist for paths, don't report any other
            continue
        ap = dict(path=norm_apath,
                  parentds=dspath,
                  state='untracked',
                  type='directory' if isdir(apath) else 'file')
        yield ap
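
The parsing half of `_get_untracked_content` does not depend on the actual Git call. The sketch below runs the same filtering over a canned `git status --porcelain -z` output string with hypothetical file names (the `isdir` type check is omitted since the paths do not exist).

from os.path import join as opj, normpath

dspath = '/ds'  # hypothetical dataset path
# canned porcelain output: NUL-separated records, '?? ' marks untracked entries
stdout = '?? new_file.dat\0 M modified.txt\0?? untracked_dir/\0'

for line in stdout.split('\0'):
    if not line:
        continue
    if not line.startswith('?? '):
        # not untracked -> task of `diff`, as in the example above
        continue
    apath = opj(dspath, line[3:])  # strip the '?? ' state marker
    print(dict(path=normpath(apath), parentds=dspath, state='untracked'))
# reports /ds/new_file.dat and /ds/untracked_dir, skips the modified file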
Example #9
def _recursive_install_subds_underneath(ds, recursion_limit, reckless, start=None,
                                        refds_path=None, description=None):
    if isinstance(recursion_limit, int) and recursion_limit <= 0:
        return
    # install using a helper that gives some flexibility regarding where to
    # get the module from
    for sub in ds.subdatasets(
            return_type='generator', result_renderer='disabled'):
        subds = Dataset(sub['path'])
        if sub.get('gitmodule_datalad-recursiveinstall', '') == 'skip':
            lgr.debug(
                "subdataset %s is configured to be skipped on recursive installation",
                sub['path'])
            continue
        if start is not None and not subds.path.startswith(_with_sep(start)):
            # this one we can ignore, not underneath the start path
            continue
        if sub['state'] != 'absent':
            # dataset was already found to exist
            yield get_status_dict(
                'install', ds=subds, status='notneeded', logger=lgr,
                refds=refds_path)
            # do not continue, even if an intermediate dataset exists it
            # does not imply that everything below it does too
        else:
            # try to get this dataset
            try:
                subds = _install_subds_from_flexible_source(
                    ds,
                    relpath(sub['path'], start=ds.path),
                    sub['gitmodule_url'],
                    reckless,
                    description=description)
                yield get_status_dict(
                    'install', ds=subds, status='ok', logger=lgr, refds=refds_path,
                    message=("Installed subdataset %s", subds), parentds=ds.path)
            except Exception as e:
                # skip all of downstairs, if we didn't manage to install subdataset
                yield get_status_dict(
                    'install', ds=subds, status='error', logger=lgr, refds=refds_path,
                    message=("Installation of subdatasets %s failed with exception: %s",
                             subds, exc_str(e)))
                continue
        # otherwise recurse
        # we can skip the start expression, we know we are within
        for res in _recursive_install_subds_underneath(
                subds,
                recursion_limit=recursion_limit - 1 if isinstance(recursion_limit, int) else recursion_limit,
                reckless=reckless,
                refds_path=refds_path):
            yield res
Example #10
def _dump_submeta(ds, submetas, matchpath, save, modified_ds):
    known_subds = list(submetas.keys())
    for p in known_subds:
        smeta = submetas[p]
        if matchpath and not p.startswith(_with_sep(matchpath)):
            continue
        subds_relpath = relpath(p, matchpath)
        # inject proper inter-dataset relationships
        for m in smeta:
            # skip non-implicit
            if not is_implicit_metadata(m):
                continue
            if 'dcterms:isPartOf' not in m and m.get('type',
                                                     None) == 'Dataset':
                m['dcterms:isPartOf'] = ds.id
        sp = opj(ds.path, metadata_basepath, subds_relpath)
        _store_json(ds, sp, smeta)
        # TODO this is all wrong! It should not talk to repo methods and emulate
        # high-level code, but use the (now) existing high-level commands
        # stage potential changes in the subdataset
        try:
            ds.repo.add(subds_relpath, git=True)
        except CommandError:
            # TODO as a bonus this exception handling is untested! wipe out during
            # upcoming RF
            # it can blow if we skipped a non-dataset submodule
            # in this case we need to find the chain of submodules leading to it and
            # save them bottom-up
            testpath = dirname(subds_relpath)
            while testpath:
                repo = ds.subdatasets(contains=testpath,
                                      result_xfm='datasets',
                                      return_type='item-or-list')
                repo.repo.add(relpath(subds_relpath, testpath), git=True)
                modified_ds = _save_helper(repo, save, modified_ds)
                # see if there is anything left...
                # IMPORTANT to go with relpath to actually get to an empty
                # string eventually
                testpath = dirname(relpath(repo.path, ds.path))

        # remove stored item from lookup
        del submetas[p]
    return modified_ds
Example #11
File: utils.py Project: debanjum/datalad
def sort_paths_into_subdatasets(superds_path, target_subs, spec):
    # XXX forge a chain: whenever some path needs to be pushed down
    # put the receiving dataset as a component to process into the
    # respective superdataset -- this will enable further processing
    # of all datasets in a completely independent fashion
    # (except for order of processing)

    # get all existing subdataset as candidate nodes of the graph
    # that needs to be built and checked
    subds_graph = Dataset(superds_path).get_subdatasets(
        absolute=True, recursive=True, edges=True, fulfilled=True)
    if not subds_graph:
        # no subdatasets, nothing to sort
        return
    for t in target_subs:
        trace = get_trace(
            subds_graph,
            superds_path,
            t)
        if not trace:
            # not connected, or identical
            continue
        tosort = [superds_path] + trace + [t]
        # loop over all but the last one, simplifies logic below
        for i, d in enumerate(tosort[:-1]):
            paths = spec.get(d, [])
            keep_paths = []
            next_ds = tosort[i + 1]
            next_dspaths = spec.get(next_ds, [])
            comp = _with_sep(next_ds)
            for p in assure_list(paths):
                if p.startswith(comp):
                    next_dspaths.append(p)
                    # remember that we pushed the path into this dataset
                    keep_paths.append(next_ds)
                else:
                    keep_paths.append(p)
            spec[next_ds] = next_dspaths
            spec[d] = keep_paths
    # tidy up -- deduplicate
    for c in spec:
        spec[c] = list(set(spec[c]))
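
The chain-forging that `sort_paths_into_subdatasets` performs can be illustrated without any datalad objects. In this sketch the spec, the trace, and the `_with_sep` stand-in are all hypothetical, and `assure_list` is dropped because the input is already a list: a file path owned by a sub-sub-dataset is pushed down one level at a time, while each superdataset keeps a breadcrumb pointing at the next dataset to process.

def _with_sep(p, sep='/'):
    # stand-in: ensure a trailing separator (assumed behaviour)
    return p if p.endswith(sep) else p + sep

# hypothetical starting point: the superdataset holds a path that belongs
# to a sub-sub-dataset two levels further down
spec = {'/ds': ['/ds/a/b/file.dat']}
tosort = ['/ds', '/ds/a', '/ds/a/b']  # superdataset + trace + target sub

for i, d in enumerate(tosort[:-1]):
    paths = spec.get(d, [])
    keep_paths = []
    next_ds = tosort[i + 1]
    next_dspaths = spec.get(next_ds, [])
    comp = _with_sep(next_ds)
    for p in paths:
        if p.startswith(comp):
            next_dspaths.append(p)
            keep_paths.append(next_ds)  # breadcrumb: process next_ds from here
        else:
            keep_paths.append(p)
    spec[next_ds] = next_dspaths
    spec[d] = keep_paths

print(spec)
# {'/ds': ['/ds/a'], '/ds/a': ['/ds/a/b'], '/ds/a/b': ['/ds/a/b/file.dat']}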
Example #12
def process_vanished_paths(unavailable_paths, content_by_ds):
    # presently unavailable paths could be, e.g., deleted files, or
    # uninstalled subdatasets, or simply nothing -> figure it out and act
    # accordingly
    dsinfo = {}
    nonexistent_paths = []
    for p in unavailable_paths:
        # we need to check whether any of these correspond
        # to a known subdataset, and add those to the list of
        # things to be removed
        toppath = get_dataset_root(p)
        if not toppath:
            nonexistent_paths.append(p)
            continue
        ds = Dataset(toppath)
        dinfo = dsinfo.get(
            toppath, {
                'deleted': ds.repo.get_deleted_files(),
                'subds': ds.get_subdatasets(recursive=False, absolute=True)
            })
        # cache for a potentially following request
        dsinfo[toppath] = dinfo
        if p in dinfo['subds']:
            # test for subds needs to come first, as it would also show
            # up in "deleted_files"
            # this is a known subdataset that has vanished
            lgr.debug('deinit vanished subdataset {} in {}'.format(p, ds))
            # simply deinit to complete a "forced uninstallation", without
            # an explicit "remove" there is nothing to be saved in this
            # case
            ds.repo.deinit_submodule(p[len(_with_sep(ds.path)):])
        elif p in dinfo['deleted']:
            # vanished file -> 'git rm' it to stage the change
            ds.repo.remove(p)
            # record that we are "saving" this path
            dpaths = content_by_ds.get(ds.path, [])
            dpaths.append(p)
            content_by_ds[ds.path] = dpaths
        else:
            # this is nothing we can anyhow handle
            nonexistent_paths.append(p)
    return content_by_ds, nonexistent_paths
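
The slicing idiom `p[len(_with_sep(ds.path)):]` passed to `deinit_submodule` above derives a dataset-relative path by chopping off the dataset root plus the separator; a tiny sketch with hypothetical paths (and the `_with_sep` stand-in from earlier) makes that explicit.

import os

def _with_sep(path):
    # stand-in: guarantee a trailing separator (assumed behaviour)
    return path if path.endswith(os.sep) else path + os.sep

ds_path = os.path.join(os.sep, 'ds')            # hypothetical dataset root
p = os.path.join(ds_path, 'code', 'analysis')   # hypothetical vanished subdataset

# drop '<ds_path><sep>' from the front to get the submodule-relative path
print(p[len(_with_sep(ds_path)):])              # -> 'code/analysis' on POSIX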
Example #13
File: save.py Project: debanjum/datalad
def process_vanished_paths(unavailable_paths, content_by_ds):
    # presently unavailable paths could be, e.g., deleted files, or
    # uninstalled subdatasets, or simply nothing -> figure it out and act
    # accordingly
    dsinfo = {}
    nonexistent_paths = []
    for p in unavailable_paths:
        # we need to check whether any of these correspond
        # to a known subdataset, and add those to the list of
        # things to be removed
        toppath = get_dataset_root(p)
        if not toppath:
            nonexistent_paths.append(p)
            continue
        ds = Dataset(toppath)
        dinfo = dsinfo.get(toppath,
                           {'deleted': ds.repo.get_deleted_files(),
                            'subds': ds.get_subdatasets(
                                recursive=False, absolute=True)})
        # cache for a potentially following request
        dsinfo[toppath] = dinfo
        if p in dinfo['subds']:
            # test for subds needs to come first, as it would also show
            # up in "deleted_files"
            # this is a known subdataset that has vanished
            lgr.debug('deinit vanished subdataset {} in {}'.format(p, ds))
            # simply deinit to complete a "forced uninstallation", without
            # an explicit "remove" there is nothing to be saved in this
            # case
            ds.repo.deinit_submodule(p[len(_with_sep(ds.path)):])
        elif p in dinfo['deleted']:
            # vanished file -> 'git rm' it to stage the change
            ds.repo.remove(p)
            # record that we are "saving" this path
            dpaths = content_by_ds.get(ds.path, [])
            dpaths.append(p)
            content_by_ds[ds.path] = dpaths
        else:
            # this is nothing we can anyhow handle
            nonexistent_paths.append(p)
    return content_by_ds, nonexistent_paths
Example #14
def yield_recursive(ds, path, action, recursion_limit):
    # make sure we get everything relevant in all _checked out_
    # subdatasets; obtaining previously unavailable subdatasets is
    # handled elsewhere
    for subd_res in ds.subdatasets(recursive=True,
                                   recursion_limit=recursion_limit,
                                   return_type='generator'):
        # this check is not the same as subdatasets --contains=path
        # because we want all subdataset below a path, not just the
        # containing one
        if subd_res['path'].startswith(_with_sep(path)):
            # this subdataset is underneath the search path
            # be careful to not overwrite anything, in case
            # this subdataset has been processed before
            subd_res['action'] = action
            # mark as "notprocessed"
            subd_res['status'] = ''
            # we know that this is a known subdataset, that is how
            # we got here, make a record
            subd_res['registered_subds'] = True
            yield subd_res
Example #15
def _recursive_install_subds_underneath(ds,
                                        recursion_limit,
                                        reckless,
                                        start=None):
    content_by_ds = {}
    if isinstance(recursion_limit, int) and recursion_limit <= 0:
        return content_by_ds
    # loop over submodules, not subdatasets, to get the URL right away
    # install using a helper that gives some flexibility regarding where to
    # get the module from
    for sub in ds.repo.get_submodules():
        subds = Dataset(opj(ds.path, sub.path))
        if start is not None and not subds.path.startswith(_with_sep(start)):
            # this one we can ignore, not underneath the start path
            continue
        if not subds.is_installed():
            try:
                lgr.info("Installing subdataset %s", subds.path)
                subds = _install_subds_from_flexible_source(
                    ds, sub.path, sub.url, reckless)
                # we want the entire thing, but mark this subdataset
                # as automatically installed
                content_by_ds[subds.path] = [curdir]
            except Exception as e:
                # skip, if we didn't manage to install subdataset
                lgr.warning("Installation of subdatasets %s failed, skipped",
                            subds)
                lgr.debug("Installation attempt failed with exception: %s",
                          exc_str(e))
                continue
            # otherwise recurse
            # we can skip the start expression, we know we are within
            content_by_ds.update(_recursive_install_subds_underneath(
                subds,
                recursion_limit=recursion_limit - 1 if isinstance(recursion_limit, int) else recursion_limit,
                reckless=reckless))
    return content_by_ds
Example #16
def _dump_submeta(ds, submetas, matchpath, save, modified_ds):
    known_subds = list(submetas.keys())
    for p in known_subds:
        smeta = submetas[p]
        if matchpath and not p.startswith(_with_sep(matchpath)):
            continue
        subds_relpath = relpath(p, matchpath)
        # inject proper inter-dataset relationships
        for m in smeta:
            # skip non-implicit
            if not is_implicit_metadata(m):
                continue
            if 'dcterms:isPartOf' not in m and m.get('type', None) == 'Dataset':
                m['dcterms:isPartOf'] = ds.id
        sp = opj(ds.path, metadata_basepath, subds_relpath)
        _store_json(ds, sp, smeta)
        # stage potential changes in the subdataset
        try:
            ds.repo.add(subds_relpath, git=True)
        except CommandError:
            # it can blow if we skipped a non-dataset submodule
            # in this case we need to find the chain of submodules leading to it and
            # save them bottom-up
            testpath = dirname(subds_relpath)
            while testpath:
                # TODO this is a slow call that implies pretty bad repeated traversal
                # of dataset trees -- RF to use `subdatasets --contains`
                repo = ds.get_containing_subdataset(testpath)
                repo.repo.add(relpath(subds_relpath, testpath), git=True)
                modified_ds = _save_helper(repo, save, modified_ds)
                # see if there is anything left...
                # IMPORTANT to go with relpath to actually get to an empty
                # string eventually
                testpath = dirname(relpath(repo.path, ds.path))

        # remove stored item from lookup
        del submetas[p]
    return modified_ds
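
The "IMPORTANT to go with relpath" note above can be demonstrated on its own: `dirname` applied repeatedly to a relative path eventually yields the empty string, which is what terminates the `while testpath` loop, whereas an absolute path gets stuck at the root.

from os.path import dirname

p = 'a/b/c'          # relative, as produced by relpath()
while p:
    p = dirname(p)
    print(repr(p))   # 'a/b' -> 'a' -> ''  (the loop terminates)

print(repr(dirname('/')))
# '/' -- an absolute path never reaches '', so the loop would never end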
Example #17
def _dump_submeta(ds, submetas, matchpath, save, modified_ds):
    known_subds = list(submetas.keys())
    for p in known_subds:
        smeta = submetas[p]
        if matchpath and not p.startswith(_with_sep(matchpath)):
            continue
        subds_relpath = relpath(p, matchpath)
        # inject proper inter-dataset relationships
        for m in smeta:
            # skip non-implicit
            if not is_implicit_metadata(m):
                continue
            if 'dcterms:isPartOf' not in m and m.get('type', None) == 'Dataset':
                m['dcterms:isPartOf'] = ds.id
        sp = opj(ds.path, metadata_basepath, subds_relpath)
        _store_json(ds, sp, smeta)
        # stage potential changes in the subdataset
        try:
            ds.repo.add(subds_relpath, git=True)
        except CommandError:
            # it can blow if we skipped a non-dataset submodule
            # in this case we need to find the chain of submodules leading to it and
                # save them bottom-up
            testpath = dirname(subds_relpath)
            while testpath:
                repo = ds.get_containing_subdataset(testpath)
                repo.repo.add(relpath(subds_relpath, testpath), git=True)
                modified_ds = _save_helper(repo, save, modified_ds)
                # see if there is anything left...
                # IMPORTANT to go with relpath to actually get to an empty
                # string eventually
                testpath = dirname(relpath(repo.path, ds.path))

        # remove stored item from lookup
        del submetas[p]
    return modified_ds
Example #18
    def __call__(
            dataset=None,
            fulfilled=None,
            recursive=False,
            recursion_limit=None,
            contains=None,
            bottomup=False,
            set_property=None,
            delete_property=None):
        dataset = require_dataset(
            dataset, check_installed=False, purpose='subdataset reporting/modification')
        refds_path = dataset.path

        # XXX this seems strange, but is tested to be the case -- I'd rather set
        # `check_installed` to true above and fail
        if not GitRepo.is_valid_repo(refds_path):
            return

        # return as quickly as possible
        if isinstance(recursion_limit, int) and (recursion_limit <= 0):
            return

        try:
            if not (bottomup or contains or set_property or delete_property or \
                    (recursive and recursion_limit is not None)):
                # FAST IMPLEMENTATION FOR THE STRAIGHTFORWARD CASE
                # as fast as possible (just a single call to Git)
                # need to track current parent
                stack = [refds_path]
                modinfo_cache = {}
                for sm in _parse_git_submodules(refds_path, recursive=recursive):
                    # unwind the parent stack until we find the right one
                    # this assumes that submodules come sorted
                    while not sm['path'].startswith(_with_sep(stack[-1])):
                        stack.pop()
                    parent = stack[-1]
                    if parent not in modinfo_cache:
                        # read the parent .gitmodules, if not done yet
                        modinfo_cache[parent] = _parse_gitmodules(parent)
                    # get URL info, etc.
                    sm.update(modinfo_cache[parent].get(sm['path'], {}))
                    subdsres = get_status_dict(
                        'subdataset',
                        status='ok',
                        type='dataset',
                        refds=refds_path,
                        logger=lgr)
                    subdsres.update(sm)
                    subdsres['parentds'] = parent
                    if (fulfilled is None or
                            GitRepo.is_valid_repo(sm['path']) == fulfilled):
                        yield subdsres
                    # for the next "parent" commit this subdataset to the stack
                    stack.append(sm['path'])
                # MUST RETURN: the rest of the function is doing another implementation
                return
        except InvalidGitRepositoryError as e:
            lgr.debug("fast subdataset query failed, trying slow robust one (%s)",
                      exc_str(e))

        # MORE ROBUST, FLEXIBLE, BUT SLOWER IMPLEMENTATION
        # slow but flexible (one Git call per dataset); deals with subdatasets
        # in direct mode
        if contains:
            contains = resolve_path(contains, dataset)
        for r in _get_submodules(
                dataset.path, fulfilled, recursive, recursion_limit,
                contains, bottomup, set_property, delete_property,
                refds_path):
            # without the refds_path results cannot be rendered/converted
            # relative in the eval_results decorator
            r['refds'] = refds_path
            yield r
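
The fast path in the `__call__` above relies only on `_parse_git_submodules` reporting paths in sorted, parents-first order. The stack unwinding that assigns each submodule its `parentds` can be replayed on hypothetical paths with a `_with_sep` stand-in:

def _with_sep(p, sep='/'):
    # stand-in: ensure a trailing separator (assumed behaviour)
    return p if p.endswith(sep) else p + sep

refds_path = '/ds'
# hypothetical submodule paths, in the sorted order Git reports them
submodule_paths = ['/ds/a', '/ds/a/inner', '/ds/b']

stack = [refds_path]
for path in submodule_paths:
    # unwind until the top of the stack is an ancestor of `path`
    while not path.startswith(_with_sep(stack[-1])):
        stack.pop()
    print(path, '-> parentds', stack[-1])
    stack.append(path)
# /ds/a -> parentds /ds
# /ds/a/inner -> parentds /ds/a
# /ds/b -> parentds /ds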
Example #19
    def __call__(
            path=None,
            dataset=None,
            recursive=False,
            recursion_limit=None,
            action=None,
            unavailable_path_status='',
            unavailable_path_msg=None,
            nondataset_path_status='error',
            force_parentds_discovery=True,
            force_subds_discovery=True,
            force_no_revision_change_discovery=True,
            force_untracked_discovery=True,
            modified=None):
        # upfront check for the fastest possible response
        if not path and dataset is None:
            # nothing given, try "here", but do not use `require_dataset`, as
            # it will determine the root dataset of `curdir` and further down
            # lead to path annotation of upstairs directories
            dataset = curdir

        if force_subds_discovery and not force_parentds_discovery:
            raise ValueError(
                'subdataset discovery requires parent dataset discovery')

        # CONCEPT: yield with no status to indicate further processing

        # everything in one big loop to be able to yield as fast as possible
        # without any precomputing for all paths
        refds_path = Interface.get_refds_path(dataset)
        if modified is not None and (refds_path is None or not GitRepo.is_valid_repo(refds_path)):
            raise ValueError(
                "modification detection only works with a base dataset (non-given or found)")

        # prep common result props
        res_kwargs = dict(
            action=action if action else 'annotate_path',
            refds=refds_path,
            logger=lgr)

        # handle the case of recursion into a single dataset without any
        # extra fancy processing first -- full recursion can be done
        # faster than manual recursion, hence we gain quite some speed
        # from these few lines of extra code
        if not modified and not path and refds_path:
            if not GitRepo.is_valid_repo(refds_path):
                yield get_status_dict(
                    # doesn't matter if the path is in another dataset
                    # it was given as reference dataset
                    status=nondataset_path_status,
                    message='given reference dataset is not a dataset',
                    path=refds_path,
                    **res_kwargs)
                return

            refds = Dataset(refds_path)
            path = []
            # yield the dataset itself
            r = get_status_dict(ds=refds, status='', **res_kwargs)
            yield r

            if recursive:
                # if we have nothing given, but need recursion, we need to feed
                # the dataset path itself
                for r in yield_recursive(
                        refds,
                        refds_path,
                        action,
                        recursion_limit):
                    r.update(res_kwargs)
                    if 'refds' in r and not r['refds']:
                        # avoid cruft
                        del r['refds']
                    yield r
            return

        # goal: structure in a way that makes most information on any path
        # available in a single pass, at the cheapest possible cost
        reported_paths = {}
        requested_paths = assure_list(path)

        if modified is not None:
            # modification detection would silently kill all nondataset paths
            # but we have to complain about them, hence doing it here
            if requested_paths and refds_path:
                for r in requested_paths:
                    p = r['path'] if isinstance(r, dict) else r
                    p = resolve_path(p, ds=refds_path)
                    if path_startswith(p, refds_path):
                        # all good
                        continue
                    # not the refds
                    path_props = r if isinstance(r, dict) else {}
                    res = get_status_dict(
                        **dict(res_kwargs, **path_props))
                    res['status'] = nondataset_path_status
                    res['message'] = 'path not associated with reference dataset'
                    reported_paths[r] = res
                    yield res

            # preserve non-existing paths to be silently killed by modification
            # detection and append them to requested_paths again after detection.
            # TODO: This might be melted in with treatment of non dataset paths
            # above. Re-appending those paths seems to be better than yielding
            # directly to avoid code duplication, since both cases later on are
            # dealt with again.
            preserved_paths = [
                r for r in requested_paths
                if not lexists(r['path'] if isinstance(r, dict) else r)]

            # replace the requested paths by those paths that were actually
            # modified underneath or at a requested location
            requested_paths = get_modified_subpaths(
                # either the request, or the base dataset, if there was no request
                requested_paths if requested_paths else [refds_path],
                refds=Dataset(refds_path),
                revision=modified,
                report_no_revision_change=force_no_revision_change_discovery,
                report_untracked='all' if force_untracked_discovery else 'no',
                recursion_limit=recursion_limit)

            from itertools import chain
            # re-append the preserved paths:
            requested_paths = chain(requested_paths, iter(preserved_paths))

        # do not loop over unique(), this could be a list of dicts
        # we avoid duplicates manually below via `reported_paths`
        for path in requested_paths:
            if not isinstance(path, dict):
                path = rawpath2ap(path, refds_path)
            # this is now an annotated path!
            path_props = path
            path = path['path']
            # we need to mark our territory, who knows where this has been
            path_props.update(res_kwargs)

            if path in reported_paths:
                # we already recorded this path in the output
                # this can happen, whenever `path` is a subdataset, that was
                # discovered via recursive processing of another path before
                continue
            # the path exists in some shape or form
            # TODO if we have path_props already we could skip this test
            if isdir(path):
                # keep any existing type info, previously a more expensive run
                # could have discovered an uninstalled 'dataset', and we don't
                # want it to be relabeled to a directory
                path_props['type'] = \
                    path_props.get(
                        'type',
                        'dataset' if not islink(path) and GitRepo.is_valid_repo(path) else 'directory')
                # this could contain all types of additional content
                containing_dir = path if not islink(path) else normpath(opj(path, pardir))
            else:
                if lexists(path):
                    path_props['type'] = 'file'
                else:
                    path_props['state'] = 'absent'
                # for everything else we are interested in the container
                containing_dir = dirname(path)
                if not containing_dir:
                    containing_dir = curdir

            dspath = parent = get_dataset_root(containing_dir)
            if dspath:
                if path_props.get('type', None) == 'dataset':
                    # for a dataset the root is not the parent, for anything else
                    # it is
                    parent = path_props.get('parentds', None)
                    oneupdir = normpath(opj(containing_dir, pardir))
                    if parent is None and (force_parentds_discovery or (
                            refds_path and _with_sep(oneupdir).startswith(
                                _with_sep(refds_path)))):
                        # either forced, or only if we have a reference dataset, and
                        # only if we stay within this refds when searching for the
                        # parent
                        parent = get_dataset_root(normpath(opj(containing_dir, pardir)))
                        # NOTE the `and refds_path` is critical, as it will determine
                        # whether a top-level dataset that was discovered gets the
                        # parent property or not, it won't get it without a common
                        # base dataset, and that is how we always rolled
                    if parent and refds_path:
                        path_props['parentds'] = parent
                        # don't check whether this is actually a true subdataset of the
                        # parent, done further down
                else:
                    # set parent, but prefer existing property
                    path_props['parentds'] = path_props.get('parentds', dspath)

            # test for `dspath` not `parent`, we only need to know whether there is
            # ANY dataset, not which one is the true parent, logic below relies on
            # the fact that we end here, if there is no dataset at all
            if not dspath:
                # not in any dataset
                res = get_status_dict(
                    **dict(res_kwargs, **path_props))
                res['status'] = nondataset_path_status
                res['message'] = 'path not associated with any dataset'
                reported_paths[path] = res
                yield res
                continue

            # check that we only got SUBdatasets
            if refds_path and not path_startswith(dspath, refds_path):
                res = get_status_dict(**dict(res_kwargs, **path_props))
                res['status'] = nondataset_path_status
                res['message'] = \
                    ('path not part of the reference dataset at %s', refds_path)
                reported_paths[path] = res
                yield res
                continue

            if path_props.get('type', None) == 'file':
                # nothing else we can learn about this
                res = get_status_dict(**dict(res_kwargs, **path_props))
                if 'status' not in res:
                    res['status'] = ''
                reported_paths[path] = res
                yield res
                continue

            containing_ds = None
            path_type = path_props.get('type', None)
            if parent and force_subds_discovery and (
                    (path_type == 'dataset' and 'registered_subds' not in path_props) or
                    path_type == 'directory' or
                    not lexists(path)):
                # if the path doesn't exist, or is labeled a directory, or even
                # a dataset (without this info) -> record whether this is a known
                # subdataset to its parent
                containing_ds = Dataset(parent)
                subdss = containing_ds.subdatasets(
                    fulfilled=None, recursive=False,
                    result_xfm=None, result_filter=None, return_type='list')
                if path in [s['path'] for s in subdss]:
                    if path_type == 'directory' or not lexists(path):
                        # first record that it isn't here, if just a dir or not here at all
                        path_props['state'] = 'absent'
                    # this must be a directory, and it is not installed
                    path_props['type'] = 'dataset'
                    path_props['registered_subds'] = True

            if not lexists(path) or \
                    (path_props.get('type', None) == 'dataset' and
                     path_props.get('state', None) == 'absent'):
                # not there (yet)
                message = unavailable_path_msg if unavailable_path_msg else None
                if message and '%s' in message:
                    message = (message, path)
                path_props['message'] = message
                res = get_status_dict(**dict(res_kwargs, **path_props))
                # assign given status, but only if the props don't indicate a status
                # already
                res['status'] = path_props.get(
                    'status', unavailable_path_status)
                reported_paths[path] = res
                yield res
                continue

            # we know everything we can, report
            res = get_status_dict(**dict(res_kwargs, **path_props))
            if 'status' not in res:
                res['status'] = ''
            reported_paths[path] = res
            yield res

            rec_paths = []
            if recursive:
                # here we need to consider the special case that `path` is
                # a dataset itself, if a recursion_limit is given (e.g.
                # `remove` will do that by default), we need to recurse
                # from the dataset itself, and not its parent to get things
                # right -- this will also avoid needless discovery of
                # unrelated subdatasets
                if path_props.get('type', None) == 'dataset':
                    containing_ds = Dataset(path)
                else:
                    # regular parent, we might have a dataset already
                    containing_ds = Dataset(parent) if containing_ds is None else containing_ds
                for r in yield_recursive(containing_ds, path, action, recursion_limit):
                    # capture reported paths
                    r.update(res_kwargs)
                    if 'refds' in r and not r['refds']:
                        # avoid cruft
                        del r['refds']
                    reported_paths[r['path']] = r
                    if modified is not None:
                        # we cannot yield right away, maybe it wasn't modified
                        rec_paths.append(r)
                    else:
                        yield r
            if modified is not None and rec_paths:
                # replace the recursively discovered paths by those paths that
                # were actually modified underneath or at a requested location
                for r in get_modified_subpaths(
                        rec_paths,
                        refds=Dataset(refds_path),
                        revision=modified,
                        report_no_revision_change=force_no_revision_change_discovery,
                        report_untracked='all' if force_untracked_discovery else 'no',
                        recursion_limit=recursion_limit):
                    res = get_status_dict(**dict(r, **res_kwargs))
                    reported_paths[res['path']] = res
                    yield res
        return
Example #20
def get_modified_subpaths(aps, refds, revision, recursion_limit=None,
                          report_no_revision_change=True,
                          report_untracked='all'):
    """
    Parameters
    ----------
    aps : list
    refds : Dataset
    revision : str
      Commit-ish
    """
    # TODO needs recursion limit
    # NOTE this is implemented as a generator despite the fact that we need
    # to sort through _all_ the inputs initially, diff'ing each involved
    # dataset takes time that we can use to already act on intermediate
    # result paths, without having to wait for 100% completion
    if revision is None:
        # we want all, subds not matching the ref are assumed to have been
        # sorted out before (e.g. one level up)
        for r in aps:
            yield r
        # nothing else to do without a revision; falling through would also
        # report a working-tree diff and duplicate the paths just yielded
        return

    # life is simple: we diff the base dataset
    modified = []
    # Diff.__call__ is used to get access to the now obsolete interface.diff
    # that exists merely for annotate_paths. (refds.diff corresponds to
    # core.local.diff.)
    from datalad.interface.diff import Diff
    for r in Diff.__call__(
            dataset=refds,
            # we cannot really limit the diff paths easily because we might get
            # or miss content (e.g. subdatasets) if we don't figure out which
            # ones are known -- and we don't want that
            path=None,
            # `revision` can be anything that Git support for `diff`
            # `True` is code for diff without revision
            revision=revision if revision is not True else None,
            # it is important that staged is False, otherwise we would miss unstaged
            # changes when e.g. diffing against HEAD (save does that)
            staged=False,
            # we might want to consider putting 'untracked' here
            # maybe that is a little faster, not tested yet
            ignore_subdatasets='none',
            # by default, we want to see any individual untracked file, this simplifies further
            # processing dramatically, but may require subsequent filtering
            # in order to avoid flooding user output with useless info
            report_untracked=report_untracked,
            # no recursion, we need to update `revision` for every subdataset
            # before we can `diff`
            recursive=False,
            return_type='generator',
            result_renderer=None,
            # need to be able to yield the errors
            on_failure='ignore'):
        if r['status'] in ('impossible', 'error'):
            # something unexpected, tell daddy
            yield r
            continue
        # if asked, and no change in revision -- skip
        if not report_no_revision_change \
                and (r.get('revision_src') or r.get('revision')) \
                and (r.get('revision_src') == r.get('revision')):
            continue
        r['status'] = ''
        modified.append(r)

    if not len(modified):
        # nothing modified nothing to report
        return

    # now we can grab the APs that are in this dataset and yield them
    for ap in aps:
        # need to preserve pristine info first
        ap = ap if isinstance(ap, dict) else rawpath2ap(ap, refds.path)
        for m in modified:
            if ap['path'] == m['path']:
                # is directly modified, yield input AP
                # but update with what we learned about the modification
                ap.update(m)
                yield ap
                break
            if path_is_subpath(m['path'], ap['path']):
                # a modified path is underneath this AP
                # yield the modified one instead
                yield m
                continue

    mod_subs = [m for m in modified if m.get('type', None) == 'dataset']
    if not mod_subs or (recursion_limit is not None and recursion_limit < 1):
        return

    aps = [ap if isinstance(ap, dict) else rawpath2ap(ap, refds.path) for ap in aps]
    # now for all submodules that were found modified
    for sub in [m for m in modified if m.get('type', None) == 'dataset']:
        sub_path_ = _with_sep(sub['path'])
        # these AP match something inside this submodule, or the whole submodule
        sub_aps = [ap for ap in aps if _with_sep(ap['path']).startswith(sub_path_)]
        if not sub_aps:
            continue
        # we are interested in the modifications within this subdataset
        # from the state we previously had on record, till the state
        # we have in record now
        diff_range = '{}..{}'.format(
            sub['revision_src'] if sub['revision_src'] else PRE_INIT_COMMIT_SHA,
            sub['revision'] if sub['revision'] else '')
        if sub['revision_src'] and sub['revision_src'] == sub['revision']:
            # this is a special case, where subdataset reported changes without
            # a change in state/commit -- this is code for uncommitted changes
            # in the subdataset (including staged ones). In such a case, we
            # must not provide a diff range, but only the source commit we want
            # to diff against
            # XXX if this is changed, likely the same logic in diff needs
            # changing too!
            diff_range = sub['revision_src']

        for r in get_modified_subpaths(
                sub_aps,
                Dataset(sub['path']),
                diff_range,
                recursion_limit=(recursion_limit - 1) if recursion_limit is not None else None):
            yield r
Example #21
def annotated2content_by_ds(annotated, refds_path, path_only=False):
    """Helper to convert annotated paths into an old-style content_by_ds dict

    Only items with a `status` property value not equal to 'ok', 'notneeded',
    'impossible', or 'error' are sorted. All others are considered as
    already processed and are returned in a separate list.

    Parameters
    ----------
    annotated : list or generator
      Dicts with annotated path information.
    refds_path : str
      Path to the reference dataset the original path annotation was based on.
    path_only: bool
      Whether returned dict values are sequences of just paths for each
      dataset, or whether the full info dicts are reported as items.

    Returns
    -------
    dict, dict, list, list
      Dict keys are dataset paths, values are determined by the `path_only`
      switch. The keys in the second dict are paths to dataset, values are
      dicts with all known properties about those datasets.
      The first list contains all already "processed" results, which
      typically need to be re-yielded. The second list contains items (same
      type as dict values) for all annotated paths that have no associated
      parent dataset (i.e. nondataset paths) -- this list will be empty by
      default, unless `nondataset_path_status` was set to ''."""
    content_by_ds = OrderedDict()
    ds_props = {}
    nondataset_paths = []
    completed = []
    for r in annotated:
        if r.get('type', None) == 'dataset':
            # collect all properties of all known datasets from the annotated
            # paths
            dp = ds_props.get(r['path'], {})
            dp.update(r)
            ds_props[r['path']] = dp
        if r.get('status', None) in ('ok', 'notneeded', 'impossible', 'error'):
            completed.append(r)
            continue
        parentds = r.get('parentds', None)
        if r.get('type', None) == 'dataset':
            # do dataset handling first, it is the more complex beast
            orig_request = r.get('orig_request', None)
            if parentds is None or refds_path is None or \
                    r.get('process_content', False) or (orig_request and (
                    orig_request == curdir or
                    orig_request.endswith(dirsep) or
                    orig_request.endswith('{}{}'.format(dirsep, curdir)))):
                # a dataset that floats by on its own OR
                # behave similar to rsync, a trailing '/' indicates the
                # content rather than the dataset itself
                # in both cases we want to process this part as part
                # of the same dataset, and not any potential parent
                toappendto = content_by_ds.get(r['path'], [])
                toappendto.append(r['path'] if path_only else r)
                content_by_ds[r['path']] = toappendto
            if parentds and refds_path and \
                    _with_sep(parentds).startswith(_with_sep(refds_path)):
                # put also in parentds record if there is any, and the parent
                # is underneath or identical to the reference dataset
                toappendto = content_by_ds.get(parentds, [])
                toappendto.append(r['path'] if path_only else r)
                content_by_ds[parentds] = toappendto
        else:
            # files and dirs
            # common case, something with a parentds
            toappendto = content_by_ds.get(parentds, [])
            toappendto.append(r['path'] if path_only else r)
            content_by_ds[parentds] = toappendto

    return content_by_ds, ds_props, completed, nondataset_paths
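
For the common file/directory case, the binning done by `annotated2content_by_ds` reduces to grouping records by their `parentds`. This sketch feeds a few hypothetical annotated-path dicts through that branch with `path_only=True` behaviour.

from collections import OrderedDict

annotated = [  # hypothetical annotated paths
    {'path': '/ds/file1.dat', 'parentds': '/ds'},
    {'path': '/ds/sub/file2.dat', 'parentds': '/ds/sub'},
    {'path': '/ds/file3.dat', 'parentds': '/ds'},
]

content_by_ds = OrderedDict()
for r in annotated:
    parentds = r.get('parentds', None)
    toappendto = content_by_ds.get(parentds, [])
    toappendto.append(r['path'])  # path_only=True: keep just the path
    content_by_ds[parentds] = toappendto

print(content_by_ds)
# OrderedDict([('/ds', ['/ds/file1.dat', '/ds/file3.dat']),
#              ('/ds/sub', ['/ds/sub/file2.dat'])])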
Example #22
    def __call__(
            path=None,
            dataset=None,
            recursive=False,
            recursion_limit=None,
            action=None,
            unavailable_path_status='',
            unavailable_path_msg=None,
            nondataset_path_status='error',
            force_parentds_discovery=True,
            force_subds_discovery=True,
            force_no_revision_change_discovery=True,
            force_untracked_discovery=True,
            modified=None):
        # upfront check for the fastest possible response
        if not path and dataset is None:
            # nothing given, try "here", but do not use `require_dataset`, as
            # it will determine the root dataset of `curdir` and further down
            # lead to path annotation of upstairs directories
            dataset = curdir

        if force_subds_discovery and not force_parentds_discovery:
            raise ValueError(
                'subdataset discovery requires parent dataset discovery')

        # CONCEPT: yield with no status to indicate further processing

        # everything in one big loop to be able to yield as fast as possible
        # without any precomputing for all paths
        refds_path = Interface.get_refds_path(dataset)
        if modified is not None and (refds_path is None or not GitRepo.is_valid_repo(refds_path)):
            raise ValueError(
                "modification detection only works with a base dataset (non-given or found)")

        # prep common result props
        res_kwargs = dict(
            action=action if action else 'annotate_path',
            refds=refds_path,
            logger=lgr)

        # handle the case of recursion into a single dataset without any
        # extra fancy processing first -- full recursion can be done
        # faster than manual recursion, hence we gain quite some speed
        # from these few lines of extra code
        if not modified and not path and refds_path:
            if not GitRepo.is_valid_repo(refds_path):
                yield get_status_dict(
                    # doesn't matter if the path is in another dataset
                    # it was given as reference dataset
                    status=nondataset_path_status,
                    message='given reference dataset is not a dataset',
                    path=refds_path,
                    **res_kwargs)
                return

            refds = Dataset(refds_path)
            path = []
            # yield the dataset itself
            r = get_status_dict(ds=refds, status='', **res_kwargs)
            yield r

            if recursive:
                # if we have nothing given, but need recursion, we need to feed
                # the dataset path itself
                for r in yield_recursive(
                        refds,
                        refds_path,
                        action,
                        recursion_limit):
                    r.update(res_kwargs)
                    if 'refds' in r and not r['refds']:
                        # avoid cruft
                        del r['refds']
                    yield r
            return

        # goal: structure in a way that makes most information on any path
        # available in a single pass, at the cheapest possible cost
        reported_paths = {}
        requested_paths = assure_list(path)

        if modified is not None:
            # modification detection would silently kill all nondataset paths
            # but we have to complain about them, hence doing it here
            if requested_paths and refds_path:
                for r in requested_paths:
                    p = r['path'] if isinstance(r, dict) else r
                    p = resolve_path(p, ds=refds_path)
                    if _with_sep(p).startswith(_with_sep(refds_path)):
                        # all good
                        continue
                    # not the refds
                    path_props = r if isinstance(r, dict) else {}
                    res = get_status_dict(
                        **dict(res_kwargs, **path_props))
                    res['status'] = nondataset_path_status
                    res['message'] = 'path not associated with reference dataset'
                    reported_paths[r] = res
                    yield res

            # preserve non-existing paths that would otherwise be silently
            # dropped by modification detection, and re-append them to
            # requested_paths after detection.
            # TODO: This might be merged with the treatment of non-dataset paths
            # above. Re-appending those paths seems to be better than yielding
            # directly to avoid code duplication, since both cases later on are
            # dealt with again.
            preserved_paths = [
                r for r in requested_paths
                if not lexists(r['path'] if isinstance(r, dict) else r)]

            # replace the requested paths by those paths that were actually
            # modified underneath or at a requested location
            requested_paths = get_modified_subpaths(
                # either the request, or the base dataset, if there was no request
                requested_paths if requested_paths else [refds_path],
                refds=Dataset(refds_path),
                revision=modified,
                report_no_revision_change=force_no_revision_change_discovery,
                report_untracked='all' if force_untracked_discovery else 'no',
                recursion_limit=recursion_limit)

            from itertools import chain
            # re-append the preserved paths:
            requested_paths = chain(requested_paths, iter(preserved_paths))

        # do not loop over unique(), this could be a list of dicts
        # we avoid duplicates manually below via `reported_paths`
        for path in requested_paths:
            if not isinstance(path, dict):
                path = rawpath2ap(path, refds_path)
            # this is now an annotated path!
            path_props = path
            path = path['path']
            # we need to mark our territory, who knows where this has been
            path_props.update(res_kwargs)

            if path in reported_paths:
                # we already recorded this path in the output
                # this can happen, whenever `path` is a subdataset, that was
                # discovered via recursive processing of another path before
                continue
            # the path exists in some shape or form
            # TODO if we have path_props already we could skip this test
            if isdir(path):
                # keep any existing type info, previously a more expensive run
                # could have discovered an uninstalled 'dataset', and we don't
                # want it to be relabeled to a directory
                path_props['type'] = \
                    path_props.get(
                        'type',
                        'dataset' if GitRepo.is_valid_repo(path) else 'directory')
                # this could contain all types of additional content
                containing_dir = path
            else:
                if lexists(path):
                    path_props['type'] = 'file'
                else:
                    path_props['state'] = 'absent'
                # for everything else we are interested in the container
                containing_dir = dirname(path)
                if not containing_dir:
                    containing_dir = curdir

            dspath = parent = get_dataset_root(containing_dir)
            if dspath:
                if path_props.get('type', None) == 'dataset':
                    # for a dataset the root is not the parent, for anything else
                    # it is
                    parent = path_props.get('parentds', None)
                    oneupdir = normpath(opj(containing_dir, pardir))
                    if parent is None and (force_parentds_discovery or (
                            refds_path and _with_sep(oneupdir).startswith(
                                _with_sep(refds_path)))):
                        # either forced, or only if we have a reference dataset, and
                        # only if we stay within this refds when searching for the
                        # parent
                        parent = get_dataset_root(oneupdir)
                        # NOTE the `and refds_path` is critical, as it will determine
                        # whether a top-level dataset that was discovered gets the
                        # parent property or not, it won't get it without a common
                        # base dataset, and that is how we always rolled
                    if parent and refds_path:
                        path_props['parentds'] = parent
                        # don't check whether this is actually a true subdataset of the
                        # parent, done further down
                else:
                    # set parent, but prefer existing property
                    path_props['parentds'] = path_props.get('parentds', dspath)

            # test for `dspath`, not `parent`: we only need to know whether there
            # is ANY dataset, not which one is the true parent; the logic below
            # relies on the fact that we end up here if there is no dataset at all
            if not dspath:
                # not in any dataset
                res = get_status_dict(
                    **dict(res_kwargs, **path_props))
                res['status'] = nondataset_path_status
                res['message'] = 'path not associated with any dataset'
                reported_paths[path] = res
                yield res
                continue

            # check that we only got SUBdatasets
            if refds_path and not _with_sep(dspath).startswith(_with_sep(refds_path)):
                res = get_status_dict(**dict(res_kwargs, **path_props))
                res['status'] = nondataset_path_status
                res['message'] = \
                    ('path not part of the reference dataset at %s', refds_path)
                reported_paths[path] = res
                yield res
                continue

            if path_props.get('type', None) == 'file':
                # nothing else we can learn about this
                res = get_status_dict(**dict(res_kwargs, **path_props))
                if 'status' not in res:
                    res['status'] = ''
                reported_paths[path] = res
                yield res
                continue

            containing_ds = None
            path_type = path_props.get('type', None)
            if parent and force_subds_discovery and (
                    (path_type == 'dataset' and 'registered_subds' not in path_props) or
                    path_type == 'directory' or
                    not lexists(path)):
                # if the path doesn't exist, is labeled a directory, or is a
                # dataset without this info -> record whether this is a known
                # subdataset of its parent
                containing_ds = Dataset(parent)
                subdss = containing_ds.subdatasets(
                    fulfilled=None, recursive=False,
                    result_xfm=None, result_filter=None, return_type='list')
                if path in [s['path'] for s in subdss]:
                    if path_type == 'directory' or not lexists(path):
                        # first record that it isn't here, if just a dir or not here at all
                        path_props['state'] = 'absent'
                    # this must be a directory, and it is not installed
                    path_props['type'] = 'dataset'
                    path_props['registered_subds'] = True

            if not lexists(path) or \
                    (path_props.get('type', None) == 'dataset' and
                     path_props.get('state', None) == 'absent'):
                # not there (yet)
                message = unavailable_path_msg if unavailable_path_msg else None
                if message and '%s' in message:
                    message = (message, path)
                path_props['message'] = message
                res = get_status_dict(**dict(res_kwargs, **path_props))
                # assign given status, but only if the props don't indicate a status
                # already
                res['status'] = path_props.get(
                    'status', unavailable_path_status)
                reported_paths[path] = res
                yield res
                continue

            # we know everything we can, report
            res = get_status_dict(**dict(res_kwargs, **path_props))
            if 'status' not in res:
                res['status'] = ''
            reported_paths[path] = res
            yield res

            rec_paths = []
            if recursive:
                # here we need to consider the special case that `path` is
                # a dataset itself, if a recursion_limit is given (e.g.
                # `remove` will do that by default), we need to recurse
                # from the dataset itself, and not its parent to get things
                # right -- this will also avoid needless discovery of
                # unrelated subdatasets
                if path_props.get('type', None) == 'dataset':
                    containing_ds = Dataset(path)
                else:
                    # regular parent, we might have a dataset already
                    containing_ds = Dataset(parent) if containing_ds is None else containing_ds
                for r in yield_recursive(containing_ds, path, action, recursion_limit):
                    # capture reported paths
                    r.update(res_kwargs)
                    if 'refds' in r and not r['refds']:
                        # avoid cruft
                        del r['refds']
                    reported_paths[r['path']] = r
                    if modified is not None:
                        # we cannot yield right away, maybe it wasn't modified
                        rec_paths.append(r)
                    else:
                        yield r
            if modified is not None and rec_paths:
                # replace the recursively discovered paths by those paths that
                # were actually modified underneath or at a requested location
                for r in get_modified_subpaths(
                        rec_paths,
                        refds=Dataset(refds_path),
                        revision=modified,
                        report_no_revision_change=force_no_revision_change_discovery,
                        report_untracked='all' if force_untracked_discovery else 'no',
                        recursion_limit=recursion_limit):
                    res = get_status_dict(**dict(r, **res_kwargs))
                    reported_paths[res['path']] = res
                    yield res
        return
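
A pattern that recurs throughout this example (and the ones below) is `_with_sep(child).startswith(_with_sep(parent))` to test whether one path lies underneath another. The following is a minimal, self-contained sketch of why the trailing separator matters; the local `with_sep` helper is an assumed stand-in for datalad's `_with_sep`:

import os

def with_sep(path):
    # assumed equivalent of `_with_sep`: ensure a trailing path separator
    return path if path.endswith(os.sep) else path + os.sep

parent = os.path.join(os.sep, 'data', 'ds')
inside = os.path.join(os.sep, 'data', 'ds', 'file.dat')
sibling = os.path.join(os.sep, 'data', 'ds2', 'file.dat')

# a naive prefix test wrongly matches the sibling dataset '/data/ds2'
assert sibling.startswith(parent)
# adding the separator on both sides avoids the false positive
assert with_sep(inside).startswith(with_sep(parent))
assert not with_sep(sibling).startswith(with_sep(parent))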
Example #23
def get_modified_subpaths(aps, refds, revision, recursion_limit=None,
                          report_no_revision_change=True,
                          report_untracked='all'):
    """
    Parameters
    ----------
    aps : list
    refds : Dataset
    revision : str
      Commit-ish
    """
    # TODO needs recursion limit
    # NOTE this is implemented as a generator despite the fact that we need
    # to sort through _all_ the inputs initially, diff'ing each involved
    # dataset takes time that we can use to already act on intermediate
    # result paths, without having to wait for 100% completion
    if revision is None:
        # we want all, subds not matching the ref are assumed to have been
        # sorted out before (e.g. one level up)
        for r in aps:
            yield r
        return

    # life is simple: we diff the base dataset
    modified = []
    for r in refds.diff(
            # we cannot really limit the diff paths easily because we might get
            # or miss content (e.g. subdatasets) if we don't figure out which
            # ones are known -- and we don't want that
            path=None,
            # `revision` can be anything that Git support for `diff`
            # `True` is code for diff without revision
            revision=revision if revision is not True else None,
            # it is important that staged is False, otherwise we would miss unstaged
            # changes when e.g. diffing against HEAD (save does that)
            staged=False,
            # we might want to consider putting 'untracked' here
            # maybe that is a little faster, not tested yet
            ignore_subdatasets='none',
            # by default, we want to see any individual untracked file, this simplifies further
            # processing dramatically, but may require subsequent filtering
            # in order to avoid flooding user output with useless info
            report_untracked=report_untracked,
            # no recursion, we need to update `revision` for every subdataset
            # before we can `diff`
            recursive=False,
            return_type='generator',
            result_renderer=None,
            # need to be able to yield the errors
            on_failure='ignore'):
        if r['status'] in ('impossible', 'error'):
            # something unexpected, tell daddy
            yield r
            continue
        # if asked, and no change in revision -- skip
        if not report_no_revision_change \
                and (r.get('revision_src') or r.get('revision')) \
                and (r.get('revision_src') == r.get('revision')):
            continue
        r['status'] = ''
        modified.append(r)

    if not modified:
        # nothing modified, nothing to report
        return

    # now we can grab the APs that are in this dataset and yield them
    for ap in aps:
        # need to preserve pristine info first
        ap = ap if isinstance(ap, dict) else rawpath2ap(ap, refds.path)
        for m in modified:
            if ap['path'] == m['path']:
                # is directly modified, yield input AP
                # but update with what we learned about the modification
                ap.update(m)
                yield ap
                break
            if m['path'].startswith(_with_sep(ap['path'])):
                # a modified path is underneath this AP
                # yield the modified one instead
                yield m
                continue

    mod_subs = [m for m in modified if m.get('type', None) == 'dataset']
    if not mod_subs or (recursion_limit is not None and recursion_limit < 1):
        return

    aps = [ap if isinstance(ap, dict) else rawpath2ap(ap, refds.path) for ap in aps]
    # now for all submodules that were found modified
    for sub in mod_subs:
        sub_path_ = _with_sep(sub['path'])
        # these AP match something inside this submodule, or the whole submodule
        sub_aps = [ap for ap in aps if _with_sep(ap['path']).startswith(sub_path_)]
        if not sub_aps:
            continue
        # we are interested in the modifications within this subdataset
        # from the state we previously had on record, till the state
        # we have in record now
        diff_range = '{}..{}'.format(
            sub['revision_src'] if sub['revision_src'] else PRE_INIT_COMMIT_SHA,
            sub['revision'] if sub['revision'] else '')
        if sub['revision_src'] and sub['revision_src'] == sub['revision']:
            # this is a special case, where subdataset reported changes without
            # a change in state/commit -- this is code for uncommitted changes
            # in the subdataset (including staged ones). In such a case, we
            # must not provide a diff range, but only the source commit we want
            # to diff against
            # XXX if this is changed, likely the same logic in diff needs
            # changing too!
            diff_range = sub['revision_src']

        for r in get_modified_subpaths(
                sub_aps,
                Dataset(sub['path']),
                diff_range,
                recursion_limit=(recursion_limit - 1) if recursion_limit is not None else None):
            yield r
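
A hedged usage sketch for the generator above; the `Dataset` instance, its history, and the import location of `get_modified_subpaths` are assumptions for illustration only:

# assume `ds` is an installed datalad Dataset with at least two commits
for ap in get_modified_subpaths(
        [ds.path],                  # consider everything under the dataset root
        refds=ds,
        revision='HEAD~1',          # any commit-ish Git accepts for `diff`
        report_untracked='all'):
    # each yielded item is an annotated-path dict
    print(ap['path'], ap.get('type'), ap.get('state'))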
Example #24
File: base.py Project: yarikoptic/datalad
    def _prep(
            path=None,
            dataset=None,
            recursive=False,
            recursion_limit=None,
            dir_lookup=None,
            sub_paths=True):
        """Common input argument validation and pre-processing

        This method pre-processes the two most common input argument types:
        a base dataset, and one or more given paths. One or the other needs
        to be different from `None` or an `InsufficientArgumentsError` will
        be raised.

        Paths are normalized based on current practice (if relative, they
        are interpreted relative to a base dataset, if one is provided, or
        relative to the current working directory if not).

        Paths are then sorted by the datasets that contain them. If paths are
        detected that are not associated with any dataset `ValueError` is
        raised. If a `dataset` is given, any paths associated with a dataset
        that is not this dataset or a subdataset of it will also trigger a
        `ValueError`.

        Parameters
        ----------
        path : path or list(path) or None
          Path input argument
        dataset : path or Dataset or None
          Dataset input argument. If given, the output dict is guaranteed
          to carry a key for this dataset, but not necessarily any paths
          as values.
        recursive : bool
          Whether to discover subdatasets under any of the given paths
          recursively
        recursion_limit : None or int
          Optional recursion limit specification (max levels of recursion)
        dir_lookup : dict, optional
          Passed to `get_paths_by_dataset`
        sub_paths : bool, optional
          Passed to `get_paths_by_dataset`

        Returns
        -------
        (dict, list)
          The dictionary contains keys of absolute dataset paths and lists with
          the normalized (generally absolute) paths of presently existing
          locations associated with the respective dataset as values. The list
          return in addition contains all paths that are part of a dataset, but
          presently do not exist on the filesystem.
        """
        from .utils import get_normalized_path_arguments
        from .utils import get_paths_by_dataset
        # upfront check prior any resolution attempt to avoid disaster
        if path is None and dataset is None:
            raise InsufficientArgumentsError(
                "at least a dataset or a path must be given")

        path, dataset_path = get_normalized_path_arguments(
            path, dataset)
        if not path and dataset_path and recursive:
            # if we have nothing given, but need recursion, we need to feed
            # the dataset path to the sorting to make it work
            # but we also need to fish it out again afterwards
            tosort = [dataset_path]
            fishout_dataset_path = True
        else:
            tosort = path
            fishout_dataset_path = False
        content_by_ds, unavailable_paths, nondataset_paths = \
            get_paths_by_dataset(tosort,
                                 recursive=recursive,
                                 recursion_limit=recursion_limit,
                                 dir_lookup=dir_lookup,
                                 sub_paths=sub_paths)
        if fishout_dataset_path:  # explicit better than implicit, duplication is evil
            # fish out the dataset path that we inserted above
            content_by_ds[dataset_path] = [p for p in content_by_ds[dataset_path]
                                           if p != dataset_path]
        if not path and dataset_path:
            # no files given, but a dataset -> operate on whole dataset
            # but do not specify any paths to process -- needs to be tailored
            # by caller
            content_by_ds[dataset_path] = content_by_ds.get(dataset_path, [])
        if dataset_path and not content_by_ds and not unavailable_paths:
            # we got a dataset, but there is nothing actually installed
            nondataset_paths.append(dataset_path)
        if dataset_path:
            # check that we only got SUBdatasets
            dataset_path = _with_sep(dataset_path)
            for ds in content_by_ds:
                if not _with_sep(ds).startswith(dataset_path):
                    nondataset_paths.extend(content_by_ds[ds])
        # complain about nondataset and non-existing paths
        if nondataset_paths:
            if dataset_path:
                raise ValueError(
                    "will not touch paths outside of base datasets(%s): %s"
                    % (dataset_path, nondataset_paths))
            else:
                raise ValueError(
                    "will not touch paths outside of installed datasets: %s"
                    % nondataset_paths)
        if unavailable_paths:
            lgr.debug('Encountered unavailable paths: %s', unavailable_paths)
        return content_by_ds, unavailable_paths
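
As a hedged illustration of how a command implementation might consume `_prep()`; the dataset location and file name are hypothetical, and it is assumed that `_prep` is reachable as a method on the command's `Interface` base class:

content_by_ds, unavailable_paths = Interface._prep(
    path=['/home/me/ds/subds/file.dat'],
    dataset='/home/me/ds',
    recursive=False)
for ds_path, paths in content_by_ds.items():
    # `paths` are the presently existing locations to process in that dataset
    print(ds_path, paths)
# `unavailable_paths` lists inputs that belong to a dataset but do not yet
# exist on the filesystem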
Example #25
File: utils.py Project: yarikoptic/datalad
def get_paths_by_dataset(paths,
                         recursive=False,
                         recursion_limit=None,
                         out=None,
                         dir_lookup=None,
                         sub_paths=True):
    """Sort a list of paths per dataset they are contained in.

    Any paths that are not part of a dataset, or presently unavailable are
    reported.

    Parameters
    ----------
    paths : sequence
      A sequence of path specifications to sort.
    recursive : bool
      Flag whether to report subdatasets under any of the given paths
    recursion_limit :
      Depth constraint for recursion. See `subdatasets()` for more
      information.
    out : dict or None
      By default a new output dictionary is created, however an existing one
      can be provided via this argument to enable incremental processing.
    dir_lookup : dict or None, optional
      Optional lookup cache that maps paths to previously determined datasets.
      This can speed up repeated processing.
    sub_paths : bool, optional
      If True, provide a list containing the sub-dataset path as the entry for
      that sub-dataset. If False, an empty list is assigned.

    Returns
    -------
    Tuple(dict, list, list)
      Dict of `existing dataset path`: `path` mappings, the list of currently
      non-existing paths (possibly matching currently uninstalled datasets),
      and any paths that are not part of any dataset.
    """
    # sort paths into the respective datasets
    if dir_lookup is None:
        dir_lookup = {}
    if out is None:
        out = {}
    # paths that don't exist (yet)
    unavailable_paths = []
    nondataset_paths = []
    for path in unique(paths):
        if not lexists(path):
            # not there yet, impossible to say which ds it will actually
            # be in, if any
            unavailable_paths.append(path)
            continue
        # the path exists in some shape or form
        if isdir(path):
            # this could contain all types of additional content
            d = path
        else:
            # for everything else we are interested in the container
            d = dirname(path)
            if not d:
                d = curdir

        dspath = dir_lookup.get(d, None)
        if dspath:
            _ds_looked_up = True
        else:
            _ds_looked_up = False
            # this could be `None` if there is no git repo
            dspath = get_dataset_root(d)
            dir_lookup[d] = dspath

        if not dspath:
            nondataset_paths.append(path)
            continue

        if path in out.get(dspath, []):
            # we already recorded this path in the output
            # this can happen, whenever `path` is a subdataset, that was
            # discovered via recursive processing of another path before
            continue

        if isdir(path):
            ds = Dataset(dspath)
            # we need to doublecheck that this is not a subdataset mount
            # point, in which case get_dataset_root() would point to the parent.

            if not _ds_looked_up:
                # we didn't deal with it before

                # TODO this is a slow call, no need for dedicated RF, will vanish
                # together with the entire function
                smpath = ds.get_containing_subdataset(path,
                                                      recursion_limit=1).path
                if smpath != dspath:
                    # fix entry
                    dir_lookup[d] = smpath
                    # submodule still needs to be obtained
                    unavailable_paths.append(path)
                    continue
            else:
                # we figured out the dataset previously, so we can spare some
                # effort by not calling ds.subdatasets or
                # ds.get_containing_subdataset. Instead we just need
                # get_dataset_root, which is cheaper
                if dspath != get_dataset_root(dspath):
                    # if the looked up path isn't the default value,
                    # it's a 'fixed' entry for an unavailable dataset (see above)
                    unavailable_paths.append(path)
                    continue

            if recursive:
                # make sure we get everything relevant in all _checked out_
                # subdatasets, obtaining of previously unavailable subdataset
                # else done elsewhere
                for subdspath in ds.subdatasets(
                        fulfilled=True,
                        recursive=recursive,
                        recursion_limit=recursion_limit,
                        result_xfm='paths'):
                    if subdspath.startswith(_with_sep(path)):
                        # this subdataset is underneath the search path
                        # be careful to not overwrite anything, in case
                        # this subdataset has been processed before
                        out[subdspath] = out.get(
                            subdspath, [subdspath] if sub_paths else [])

        out[dspath] = out.get(dspath, []) + [path]
    return out, unavailable_paths, nondataset_paths
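
A minimal sketch of the returned structures; the paths are invented for illustration and assume `/tmp/ds` is an installed dataset:

by_ds, unavailable, nondataset = get_paths_by_dataset(
    ['/tmp/ds/file.dat',      # existing file inside /tmp/ds
     '/tmp/ds/not_yet_here',  # does not exist yet
     '/tmp/elsewhere.txt'],   # exists, but outside any dataset
    recursive=False)
# by_ds       -> {'/tmp/ds': ['/tmp/ds/file.dat']}
# unavailable -> ['/tmp/ds/not_yet_here']
# nondataset  -> ['/tmp/elsewhere.txt']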
Example #26
File: add.py Project: silky/datalad
    def __call__(path=None,
                 source=None,
                 dataset=None,
                 to_git=False,
                 save=True,
                 recursive=False,
                 recursion_limit=None,
                 if_dirty='ignore',
                 git_opts=None,
                 annex_opts=None,
                 annex_add_opts=None,
                 jobs=None):

        # parameter constraints:
        if not path and not source:
            raise InsufficientArgumentsError(
                "insufficient information for "
                "adding: requires at least a path "
                "or a source.")

        # When called from cmdline `path` and `source` will be a list even if
        # there is only one item.
        # Make sure we deal with the same when called via python API:
        # always yields list; empty if None
        path = assure_list(path)
        source = assure_list(source)

        # TODO: Q: are the list operations in the following 3 blocks (resolving
        #          paths, sources and datasets) guaranteed to be stable
        #          regarding order?

        # resolve path(s):
        # TODO: RF: resolve_path => datalad.utils => more general (repos => normalize paths)
        resolved_paths = [resolve_path(p, dataset) for p in path]

        # must come after resolve_path()!!
        # resolve dataset:
        dataset = require_dataset(dataset,
                                  check_installed=True,
                                  purpose='adding')
        handle_dirty_dataset(dataset, if_dirty)

        # resolve source(s):
        resolved_sources = []
        for s in source:
            if not is_datalad_compat_ri(s):
                raise ValueError("invalid source parameter: %s" % s)
            resolved_sources.append(_get_git_url_from_source(s))

        # find (sub-)datasets to add things to (and fail on invalid paths):
        if recursive:

            # 1. Find the (sub-)datasets containing the given path(s):
            # Note, that `get_containing_subdataset` raises if `p` is
            # outside `dataset`, but it returns `dataset`, if `p` is inside
            # a subdataset not included by `recursion_limit`. In the latter
            # case, the git calls will fail instead.
            # We could check for this right here and fail early, but this
            # would lead to the need to discover the entire hierarchy no
            # matter if actually required.
            resolved_datasets = [
                dataset.get_containing_subdataset(
                    p, recursion_limit=recursion_limit) for p in resolved_paths
            ]

            # 2. Find implicit subdatasets to call add on:
            # If there are directories in resolved_paths (Note,
            # that this includes '.' and '..'), check for subdatasets
            # beneath them. These should be called recursively with '.'.
            # Therefore add the subdatasets to resolved_datasets and
            # corresponding '.' to resolved_paths, in order to generate the
            # correct call.
            # iterate over a copy, since entries are appended to the list below
            for p in list(resolved_paths):
                if isdir(p):
                    for subds_path in \
                        dataset.get_subdatasets(absolute=True, recursive=True,
                                                recursion_limit=recursion_limit):
                        if subds_path.startswith(_with_sep(p)):
                            resolved_datasets.append(Dataset(subds_path))
                            resolved_paths.append(curdir)

        else:
            # if not recursive, try to add everything to dataset itself:
            resolved_datasets = [dataset] * len(resolved_paths)

        # we need a resolved dataset per path:
        assert len(resolved_paths) == len(resolved_datasets)

        # sort parameters for actual git/git-annex calls:
        # (dataset, path, source)
        from six.moves import zip_longest

        param_tuples = list(
            zip_longest(resolved_datasets, resolved_paths, resolved_sources))
        # possible None-datasets in `param_tuples` were filled in by zip_longest
        # and need to be replaced by `dataset`:
        param_tuples = [(d if d is not None else dataset, p, s)
                        for d, p, s in param_tuples]

        calls = {
            d.path: {  # list of paths to 'git-add':
                'g_add': [],
                # list of paths to 'git-annex-add':
                'a_add': [],
                # list of sources to 'git-annex-addurl':
                'addurl_s': [],
                # list of (path, source) to
                # 'git-annex-addurl --file':
                'addurl_f': []
            }
            for d in [i for i, p, s in param_tuples]
        }

        for ds, p, s in param_tuples:
            # it should not happen, that `path` as well as `source` are None:
            assert p or s
            if not s:
                # we have a path only
                # Do not try to add to annex whenever there is no annex
                if to_git or not isinstance(ds.repo, AnnexRepo):
                    calls[ds.path]['g_add'].append(p)
                else:
                    calls[ds.path]['a_add'].append(p)
            elif not p:
                # we have a source only
                if to_git:
                    raise NotImplementedError("Can't add a remote source "
                                              "directly to git.")
                calls[ds.path]['addurl_s'].append(s)
            else:
                # we have a path and a source
                if to_git:
                    raise NotImplementedError("Can't add a remote source "
                                              "directly to git.")
                calls[ds.path]['addurl_f'].append((p, s))

        # now do the actual add operations:
        # TODO: implement git/git-annex/git-annex-add options

        datasets_return_values = defaultdict(list)
        for dspath in calls:
            ds = Dataset(dspath)
            return_values = datasets_return_values[dspath]
            lgr.info("Processing dataset %s ..." % ds)

            # check every (sub-)dataset for annex once, since we can't add or
            # addurl anything, if there is no annex:
            # TODO: Q: Alternatively, just call git-annex-init if there's no
            # annex yet and we have an annex-add/annex-addurl request?
            _is_annex = isinstance(ds.repo, AnnexRepo)

            if calls[ds.path]['g_add']:
                lgr.debug("Adding %s to git", calls[dspath]['g_add'])
                added = ds.repo.add(calls[dspath]['g_add'],
                                    git=True,
                                    git_options=git_opts)
                return_values.extend(added)
            if calls[ds.path]['a_add']:
                if _is_annex:
                    lgr.debug("Adding %s to annex", calls[dspath]['a_add'])
                    return_values.extend(
                        ds.repo.add(calls[dspath]['a_add'],
                                    git=False,
                                    jobs=jobs,
                                    git_options=git_opts,
                                    annex_options=annex_opts,
                                    options=annex_add_opts))
                else:
                    lgr.debug("{0} is no annex. Skip 'annex-add' for "
                              "files {1}".format(ds, calls[dspath]['a_add']))
                    return_values.extend([{
                        'file': f,
                        'success': False,
                        'note': "no annex at %s" % ds.path
                    } for f in calls[dspath]['a_add']])

            # TODO: AnnexRepo.add_urls' return value doesn't contain the created
            #       file name but the url
            if calls[ds.path]['addurl_s']:
                if _is_annex:
                    lgr.debug("Adding urls %s to annex",
                              calls[dspath]['addurl_s'])
                    return_values.extend(
                        ds.repo.add_urls(
                            calls[ds.path]['addurl_s'],
                            options=annex_add_opts,
                            # TODO: extra parameter for addurl?
                            git_options=git_opts,
                            annex_options=annex_opts,
                            jobs=jobs,
                        ))
                else:
                    lgr.debug("{0} is no annex. Skip 'annex-addurl' for "
                              "files {1}".format(ds,
                                                 calls[dspath]['addurl_s']))
                    return_values.extend([{
                        'file': f,
                        'success': False,
                        'note': "no annex at %s" % ds.path
                    } for f in calls[dspath]['addurl_s']])

            if calls[ds.path]['addurl_f']:
                if _is_annex:
                    for f, u in calls[ds.path]['addurl_f']:
                        lgr.debug("Adding urls %s to files in annex",
                                  calls[dspath]['addurl_f'])
                        return_values.append(
                            ds.repo.add_url_to_file(
                                f,
                                u,
                                options=annex_add_opts,  # TODO: see above
                                git_options=git_opts,
                                annex_options=annex_opts,
                                batch=True))
                else:
                    lgr.debug("{0} is no annex. Skip 'annex-addurl' for "
                              "files {1}".format(ds,
                                                 calls[dspath]['addurl_f']))
                    return_values.extend([{
                        'file': f,
                        'success': False,
                        'note': "no annex at %s" % ds.path
                    } for f in calls[dspath]['addurl_f']])
            return_values = None  # to avoid mis-use

        # XXX or we could return entire datasets_return_values, could be useful
        # that way.  But then should be unified with the rest of commands, e.g.
        # get etc
        return_values_flat = []
        for dspath, return_values in datasets_return_values.items():
            if save and len(return_values):
                # we got something added -> save
                # everything we care about at this point should be staged already
                Save.__call__(message='[DATALAD] added content',
                              dataset=dspath,
                              auto_add_changes=False,
                              recursive=False)
            # TODO: you feels that this is some common logic we already have somewhere
            dsrelpath = relpath(dspath, dataset.path)
            if dsrelpath != curdir:
                # we need to adjust the 'file' entry in each record
                for return_value in return_values:
                    if 'file' in return_value:
                        return_value['file'] = opj(dsrelpath,
                                                   return_value['file'])
                    return_values_flat.append(return_value)
            else:
                return_values_flat.extend(return_values)

        return return_values_flat
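
To make the routing table built above concrete, here is a small hedged sketch of what `calls` might look like for a single annex-equipped dataset; all paths and URLs are invented for illustration:

calls = {
    '/home/me/ds': {
        'g_add': ['/home/me/ds/code/script.py'],       # plain `git add`
        'a_add': ['/home/me/ds/data/big.nii.gz'],      # `git annex add`
        'addurl_s': ['http://example.com/blob.dat'],   # `git annex addurl`
        'addurl_f': [('/home/me/ds/blob.dat',          # `git annex addurl --file`
                      'http://example.com/blob.dat')],
    },
}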
Example #27
File: utils.py Project: yarikoptic/datalad
def filter_unmodified(content_by_ds, refds, since):
    """Filter per-dataset path specifications based on modification history.

    This function takes a path specification dictionary, as produced by
    `Interface._prep()` and filters it such that only the subset of paths
    remains in the dictionary that corresponds to the set of changes in
    the given reference dataset since a given state.

    The change set is traced across all related subdatasets, i.e. if a submodule
    in the reference dataset is reported as modified, then all given paths within
    that submodule are tested for changes too (based on the state at which the
    parent dataset reports a change in the subdataset), and so on.

    In doing so, not only unmodified given paths are removed, but also modified
    given paths are replaced by the set of actually modified paths within them.

    Only committed changes are considered!

    Parameters
    ----------
    content_by_ds : dict
      Per-dataset path specifications, as produced, for example, by
      `Interface._prep()`
    refds : Dataset or *Repo or path
      Reference dataset for which to determine the initial change set
    since : state
      Any commit-ish/tree-ish supported by Git (tag, commit, branch, ...).
      Changes between this given state and the most recent commit are
      evaluated.

    Returns
    -------
    dict
      Filtered path spec dictionary. If `since` is not None, the output is
      guaranteed to only contain paths to modified, and presently existing
      components of subdatasets of the given reference dataset (and itself).
    """
    if since is None:
        # we want all, subds not matching the ref are assumed to have been
        # sorted out before (e.g. one level up)
        return content_by_ds
    # turn refds argument into a usable repo instance
    if not hasattr(refds, 'path'):
        # not a Repo or Dataset
        refds_path = refds
        refds = GitRepo(refds, create=False)
    else:
        refds_path = refds.path
    repo = refds.repo
    if hasattr(repo, 'repo'):
        # TODO use GitRepo.diff() when available (gh-1217)
        repo = repo.repo

    dict_class = content_by_ds.__class__  # could be ordered dict

    # life is simple: we diff the base dataset, and kill anything that
    # does not start with something that is in the diff
    # we cannot really limit the diff paths easily because we might get
    # or miss content (e.g. subdatasets) if we don't figure out which ones
    # are known -- and we don't want that
    try:
        diff = repo.commit().diff(since)
    except GitCommandError as exc:
        # could fail because `since` points to a non-existing location.
        # Unfortunately there might be no meaningful message
        # e.g. "fatal: ambiguous argument 'HEAD^': unknown revision or path not in the working tree"
        # logged within this GitCommandError for some reason! So let's check
        # that value of since post-error for being correct:
        try:
            refds.repo._git_custom_command(
                [], ['git', 'show', '--stat', since, '--'],
                expect_stderr=True,
                expect_fail=True)
            raise  # re-raise since our idea was incorrect
        except CommandError as ce_exc:
            if ce_exc.stderr.startswith('fatal: bad revision'):
                raise ValueError(
                    "Value since=%r is not valid. Git reports: %s" %
                    (since, exc_str(ce_exc)))
            else:
                raise  # re-raise

    # get all modified paths (with original? commit) that are still
    # present
    modified = dict(
        (opj(refds_path, d.b_path), d.b_blob.hexsha if d.b_blob else None)
        for d in diff)
    if not modified:
        # nothing modified, nothing to report
        return dict_class()
    # determine the subset that is a directory and hence is relevant for possible
    # subdatasets
    modified_dirs = {_with_sep(d) for d in modified if isdir(d)}
    # find the subdatasets matching modified paths, this will also kick out
    # any paths that are not in the dataset sub-hierarchy
    mod_subs = dict_class(
        (candds, paths) for candds, paths in content_by_ds.items()
        if candds != refds_path and any(
            _with_sep(candds).startswith(md) for md in modified_dirs))
    # now query the next level down
    keep_subs = \
        [filter_unmodified(mod_subs, subds_path, modified[subds_path])
         for subds_path in mod_subs
         if subds_path in modified]
    # merge result list into a single dict
    keep = dict_class((k, v) for d in keep_subs for k, v in d.items())

    paths_refds = content_by_ds[refds_path]
    keep[refds_path] = [
        m for m in modified if lexists(m)  # still around
        and (m in paths_refds  # listed file, or subds
             # or a modified path under a given directory
             or any(m.startswith(_with_sep(p)) for p in paths_refds))
    ]
    return keep
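
A hedged usage sketch; the dataset layout, the path specification, and the `since` state are assumptions for illustration:

# per-dataset path spec as produced by `Interface._prep()`, written out by hand
content_by_ds = {
    '/home/me/ds': ['/home/me/ds/file1.dat'],
    '/home/me/ds/subds': ['/home/me/ds/subds/file2.dat'],
}
kept = filter_unmodified(content_by_ds, '/home/me/ds', 'HEAD~3')
# `kept` only retains paths that changed between HEAD~3 and the most recent
# commit, with modified directories and subdatasets narrowed down to the
# actually modified paths underneath them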
Example #28
def _get_submodules(dspath, fulfilled, recursive, recursion_limit,
                    contains, bottomup, set_property, delete_property,
                    refds_path):
    if not GitRepo.is_valid_repo(dspath):
        return
    modinfo = _parse_gitmodules(dspath)
    # write access parser
    parser = None
    if set_property or delete_property:
        parser = _get_gitmodule_parser(dspath)
    # put in giant for-loop to be able to yield results before completion
    for sm in _parse_git_submodules(dspath, recursive=False):
        if contains and \
                not (contains == sm['path'] or
                     contains.startswith(_with_sep(sm['path']))):
            # we are not looking for this subds, because it doesn't
            # match the target path
            continue
        sm.update(modinfo.get(sm['path'], {}))
        if set_property or delete_property:
            # do modifications now before we read the info out for reporting
            # use 'submodule "NAME"' section ID style as this seems to be the default
            submodule_section = 'submodule "{}"'.format(sm['gitmodule_name'])
            # first deletions
            for dprop in assure_list(delete_property):
                parser.remove_option(submodule_section, dprop)
                # also kick from the info we just read above
                sm.pop('gitmodule_{}'.format(dprop), None)
            # and now setting values
            for sprop in assure_list(set_property):
                prop, val = sprop
                if val.startswith('<') and val.endswith('>') and '{' in val:
                    # expand template string
                    val = val[1:-1].format(
                        **dict(
                            sm,
                            refds_relpath=relpath(sm['path'], refds_path),
                            refds_relname=relpath(sm['path'], refds_path).replace(os.sep, '-')))
                parser.set_value(
                    submodule_section,
                    prop,
                    val)
                # also add to the info we just read above
                sm['gitmodule_{}'.format(prop)] = val

        #common = commonprefix((with_pathsep(subds), with_pathsep(path)))
        #if common.endswith(sep) and common == with_pathsep(subds):
        #    candidates.append(common)
        subdsres = get_status_dict(
            'subdataset',
            status='ok',
            type='dataset',
            logger=lgr)
        subdsres.update(sm)
        subdsres['parentds'] = dspath
        if not bottomup and \
                (fulfilled is None or
                 GitRepo.is_valid_repo(sm['path']) == fulfilled):
            yield subdsres

        # expand list with child submodules. keep all paths relative to parent
        # and convert jointly at the end
        if recursive and \
                (recursion_limit in (None, 'existing') or
                 (isinstance(recursion_limit, int) and
                  recursion_limit > 1)):
            for r in _get_submodules(
                    sm['path'],
                    fulfilled, recursive,
                    (recursion_limit - 1)
                    if isinstance(recursion_limit, int)
                    else recursion_limit,
                    contains,
                    bottomup,
                    set_property,
                    delete_property,
                    refds_path):
                yield r
        if bottomup and \
                (fulfilled is None or
                 GitRepo.is_valid_repo(sm['path']) == fulfilled):
            yield subdsres
    if parser is not None:
        # release parser lock manually, auto-cleanup is not reliable in PY3
        parser.release()
Example #29
def results_from_annex_noinfo(ds,
                              requested_paths,
                              respath_by_status,
                              dir_fail_msg,
                              noinfo_dir_msg,
                              noinfo_file_msg,
                              noinfo_status='notneeded',
                              **kwargs):
    """Helper to yield results based on what information git annex did no give us.

    The helper assumes that the annex command returned without an error code,
    and interprets which of the requested paths we have heard nothing about,
    and assumes that git annex was happy with their current state.

    Parameters
    ----------
    ds : Dataset
      All results have to be concerning this single dataset (used to resolve
      relpaths).
    requested_paths : list
      List of path arguments sent to `git annex`
    respath_by_status : dict
      Mapping of 'success' or 'failure' labels to lists of result paths
      reported by `git annex`. Everything that is not in here, we assume
      that `git annex` was happy about.
    dir_fail_msg : str
      Message template to inject into the result for a requested directory where
      a failure was reported for some of its content. The template contains two
      string placeholders that will be expanded with 1) the path of the
      directory, and 2) the content failure paths for that directory
    noinfo_dir_msg : str
      Message template to inject into the result for a requested directory that
      `git annex` was silent about (incl. any content). There must be one string
      placeholder that is expanded with the path of that directory.
    noinfo_file_msg : str
      Message to inject into the result for a requested file that `git
      annex` was silent about.
    noinfo_status : str
      Status to report when annex provides no information
    **kwargs
      Any further kwargs are included in the yielded result dictionary.
    """
    for p in requested_paths:
        # any relpath is relative to the currently processed dataset
        # not the global reference dataset
        p = p if isabs(p) else normpath(opj(ds.path, p))
        if any(p in ps for ps in respath_by_status.values()):
            # we have a report for this path already
            continue
        common_report = dict(path=p, **kwargs)
        if isdir(p):
            # `annex` itself will not report on directories, but if a
            # directory was requested, we want to say something about
            # it in the results.  we are inside a single, existing
            # repo, hence all directories are already present, if not
            # we had an error
            # do we have any failures in a subdir of the requested dir?
            failure_results = [
                fp for fp in respath_by_status.get('failure', [])
                if fp.startswith(_with_sep(p))
            ]
            if failure_results:
                # we were not able to process all requested_paths, let's label
                # this 'impossible' to get a warning-type report
                # after all we have the directory itself, but not
                # (some) of its requested_paths
                yield get_status_dict(status='impossible',
                                      type='directory',
                                      message=(dir_fail_msg, p,
                                               failure_results),
                                      **common_report)
            else:
                # otherwise cool, but how cool?
                success_results = [
                    fp for fp in respath_by_status.get('success', [])
                    if fp.startswith(_with_sep(p))
                ]
                yield get_status_dict(
                    status='ok' if success_results else noinfo_status,
                    message=None if success_results else (noinfo_dir_msg, p),
                    type='directory',
                    **common_report)
            continue
        else:
            # not a directory, and we have had no word from `git annex`,
            # yet no exception, hence the file was most probably
            # already in the desired state
            yield get_status_dict(status=noinfo_status,
                                  type='file',
                                  message=noinfo_file_msg,
                                  **common_report)
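
A hedged sketch of how this helper might be fed after parsing `git annex` output; the dataset, paths, and message templates are invented for illustration:

respath_by_status = {
    'success': ['/home/me/ds/data/a.dat'],
    'failure': ['/home/me/ds/data/b.dat'],
}
for res in results_from_annex_noinfo(
        ds,                                    # an installed Dataset (assumed)
        requested_paths=['data', 'notes.txt'],
        respath_by_status=respath_by_status,
        dir_fail_msg='could not process some content in %s: %s',
        noinfo_dir_msg='nothing to do for %s',
        noinfo_file_msg='already in the desired state',
        action='get'):                         # extra kwargs end up in each result
    print(res['status'], res['path'])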
Example #30
def _get_dsinfo_from_aggmetadata(ds_path, path, recursive, db):
    """Grab info on aggregated metadata for a path from a given dataset.

    The actual info is stored in a `db` dict under the absolute path
    of the dataset that contains the query path, plus any subdataset
    in case of recursion (with their own DB entries).

    Parameters
    ----------
    ds_path : str
      path of the source dataset
    path : str
      absolute path for which to obtain metadata
    recursive : bool
    db : dict
      destination mapping, filled with aggregated metadata records keyed by
      the absolute paths of the datasets they describe

    Returns
    -------
    str or list
      A string is an error message, a list contains all absolute paths for
      all datasets on which info was put into the DB.
    """
    info_fpath = opj(ds_path, agginfo_relpath)
    info_basepath = dirname(info_fpath)
    # TODO cache these
    agginfos = _load_json_object(info_fpath)

    def _ensure_abs_obj_location(rec):
        # object location in the DB must be absolute so we can copy easily
        # to all relevant datasets
        for key in location_keys:
            if key in rec and not isabs(rec[key]):
                rec[key] = opj(info_basepath, rec[key])
        return rec

    rpath = relpath(path, start=ds_path)
    seed_ds = _get_containingds_from_agginfo(agginfos, rpath)
    if seed_ds is None:
        # nothing found
        # this will be the message in the result for the query path
        # and could be a tuple
        return ("No matching aggregated metadata in Dataset at %s", ds_path)

    # easy peasy
    seed_abs = opj(ds_path, seed_ds)
    db[seed_abs] = _ensure_abs_obj_location(agginfos[seed_ds])
    hits = [seed_abs]

    if not recursive:
        return hits

    # a little more complicated: we need to loop over all subdataset
    # records and pick the ones that are underneath the seed
    for agginfo_path in agginfos:
        if agginfo_path.startswith(_with_sep(seed_ds)):
            absp = opj(ds_path, agginfo_path)
            db[absp] = _ensure_abs_obj_location(agginfos[agginfo_path])
            hits.append(absp)
    # TODO we must keep the info on these recursively discovered datasets
    # somewhere, because we cannot rediscover them on the filesystem
    # when updating the datasets later on
    return hits
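
A hedged sketch of how the `db` argument gets filled in; the dataset layout and query path are illustrative only:

db = {}
hits = _get_dsinfo_from_aggmetadata(
    '/home/me/ds',            # dataset with aggregated metadata (assumed)
    '/home/me/ds/sub/anat',   # query path inside it
    recursive=True,
    db=db)
if not isinstance(hits, list):
    # an error message (template plus arguments) for the query path
    print(hits)
else:
    # `db` now maps absolute dataset paths to their aggregated metadata
    # records, with object locations expanded to absolute paths
    for ds_path, record in db.items():
        print(ds_path, sorted(record))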
Example #31
def get_paths_by_dataset(paths,
                         recursive=False,
                         recursion_limit=None,
                         out=None,
                         dir_lookup=None):
    """Sort a list of paths per dataset they are contained in.

    Any paths that are not part of a dataset, or presently unavailable are
    reported.

    Parameters
    ----------
    paths : sequence
      A sequence of path specifications to sort.
    recursive : bool
      Flag whether to report subdatasets under any of the given paths
    recursion_limit :
      Depth constraint for recursion. See `Dataset.get_subdatasets()` for more
      information.
    out : dict or None
      By default a new output dictionary is created, however an existing one
      can be provided via this argument to enable incremental processing.
    dir_lookup : dict or None
      Optional lookup cache that maps paths to previously determined datasets.
      This can speed up repeated processing.

    Returns
    -------
    Tuple(dict, list, list)
      Dict of `existing dataset path`: `path` mappings, the list of currently
      non-existing paths (possibly matching currently uninstalled datasets),
      and any paths that are not part of any dataset

    """
    # sort paths into the respective datasets
    if dir_lookup is None:
        dir_lookup = {}
    if out is None:
        out = {}
    # paths that don't exist (yet)
    unavailable_paths = []
    nondataset_paths = []
    for path in paths:
        if not lexists(path):
            # not there yet, impossible to say which ds it will actually
            # be in, if any
            unavailable_paths.append(path)
            continue
        # the path exists in some shape or form
        if isdir(path):
            # this could contain all types of additional content
            d = path
        else:
            # for everything else we are interested in the container
            d = dirname(path)
            if not d:
                d = curdir
        # this could be `None` if there is no git repo
        dspath = dir_lookup.get(d, GitRepo.get_toppath(d))
        dir_lookup[d] = dspath
        if not dspath:
            nondataset_paths.append(path)
            continue
        if isdir(path):
            ds = Dataset(dspath)
            # we need to doublecheck that this is not a subdataset mount
            # point, in which case get_toppath() would point to the parent
            smpath = ds.get_containing_subdataset(path, recursion_limit=1).path
            if smpath != dspath:
                # fix entry
                dir_lookup[d] = smpath
                # submodule still needs to be obtained
                unavailable_paths.append(path)
                continue
            if recursive:
                # make sure we get everything relevant in all _checked out_
                # subdatasets, obtaining of previously unavailable subdataset
                # else done elsewhere
                subs = ds.get_subdatasets(fulfilled=True,
                                          recursive=recursive,
                                          recursion_limit=recursion_limit)
                for sub in subs:
                    subdspath = opj(dspath, sub)
                    if subdspath.startswith(_with_sep(path)):
                        # this subdataset is underneath the search path
                        # we want it all
                        # be careful to not overwrite anything, in case
                        # this subdataset has been processed before
                        out[subdspath] = out.get(subdspath, [subdspath])
        out[dspath] = out.get(dspath, []) + [path]
    return out, unavailable_paths, nondataset_paths