def untracked_subdatasets_to_submodules(ds, consider_paths):
    # treat special case of still untracked subdatasets.
    # those need to become submodules now, as they are otherwise added
    # without an entry in .gitmodules, and subsequently break Git's
    # submodule functionality completely
    new_modules = []
    if not consider_paths:
        # nothing to test
        return new_modules

    for utf in ds.repo.repo.untracked_files:
        utf_abspath = opj(ds.path, utf)
        if not isdir(utf_abspath):
            # this cannot be a repository
            continue

        # test whether the potential submodule is scheduled for saving
        utf_realpath = realpath(utf_abspath)
        if any([utf_realpath.startswith(_with_sep(realpath(f)))
                for f in consider_paths]):
            # matches at least one path -> turn into submodule
            _install_subds_inplace(
                ds=ds,
                path=utf_abspath,  # can be ignored, we don't need the return value
                relativepath=utf.rstrip(os.sep),
                name=None)
            new_modules.append(utf.rstrip(os.sep))

    return new_modules

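# The `_with_sep`-wrapped `startswith` test above is the central idiom of
# these helpers: appending the path separator before the prefix comparison
# prevents a sibling that merely shares a name prefix from matching.  A
# standalone sketch of that idiom (the helper below is a simplified stand-in
# for the module's `_with_sep`, with a POSIX separator assumed for brevity):
def _with_sep_sketch(path, sep='/'):
    """Return `path` with exactly one trailing separator."""
    return path.rstrip(sep) + sep


# '/data/sub' contains '/data/sub/file' ...
assert '/data/sub/file'.startswith(_with_sep_sketch('/data/sub'))
# ... but not the sibling '/data/subdir/file'
assert not '/data/subdir/file'.startswith(_with_sep_sketch('/data/sub'))
# a naive prefix test without the separator would wrongly accept the sibling
assert '/data/subdir/file'.startswith('/data/sub')
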
def _recursive_install_subds_underneath(ds, recursion_limit, reckless, start=None):
    content_by_ds = {}
    if isinstance(recursion_limit, int) and recursion_limit <= 0:
        return content_by_ds
    # loop over submodules not subdatasets to get the url right away
    # install using a helper that gives some flexibility regarding where to
    # get the module from
    for sub in ds.repo.get_submodules():
        subds = Dataset(opj(ds.path, sub.path))
        if start is not None and not subds.path.startswith(_with_sep(start)):
            # this one we can ignore, not underneath the start path
            continue
        if not subds.is_installed():
            try:
                lgr.info("Installing subdataset %s", subds.path)
                subds = _install_subds_from_flexible_source(
                    ds, sub.path, sub.url, reckless)
                # we want the entire thing, but mark this subdataset
                # as automatically installed
                content_by_ds[subds.path] = [curdir]
            except Exception as e:
                # skip, if we didn't manage to install the subdataset
                lgr.warning(
                    "Installation of subdataset %s failed, skipped", subds)
                lgr.debug("Installation attempt failed with exception: %s",
                          exc_str(e))
                continue
        # otherwise recurse
        # we can skip the start expression, we know we are within
        content_by_ds.update(_recursive_install_subds_underneath(
            subds,
            recursion_limit=recursion_limit - 1
            if isinstance(recursion_limit, int) else recursion_limit,
            reckless=reckless))
    return content_by_ds

def get_tree_roots(paths):
    """Return common root paths for a set of paths

    This function determines the smallest set of common root
    paths and sorts all given paths under the respective root.

    Returns
    -------
    dict
      paths by root
    """
    paths_ws = [_with_sep(p) for p in paths]
    # sort all paths under their potential roots
    roots = {}
    # start from the top to get all paths down the line
    # and collate them into as few roots as possible
    for s in sorted(paths_ws):
        if any([s.startswith(r) for r in roots]):
            # this path is already covered by a known root
            continue
        # find all sub paths
        subs = [p for p in paths if p.startswith(s)]
        roots[s.rstrip(sep)] = subs
    return roots

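# A usage illustration for `get_tree_roots` with hypothetical POSIX paths
# (not taken from the original sources): every topmost input path becomes a
# key, and the value lists the input paths strictly underneath it.
#
#   >>> get_tree_roots(['/data/ds', '/data/ds/sub', '/other'])
#   {'/data/ds': ['/data/ds/sub'], '/other': []}
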
def discover_dataset_trace_to_targets(basepath, targetpaths, current_trace, spec):
    """Discover the edges and nodes in a dataset tree to given target paths

    Parameters
    ----------
    basepath : path
      Path to a start or top-level dataset. Really has to be a path to a
      dataset!
    targetpaths : list(path)
      Any non-zero number of paths that are termination points for the
      search algorithm. Can be paths to datasets, directories, or files
      (and any combination thereof).
    current_trace : list
      For a top-level call this should probably always be `[]`
    spec : dict
      `content_by_ds`-style dictionary that will receive information about
      the discovered datasets. Specifically, for each discovered dataset
      there will be an item with its path under the key (path) of the
      respective superdataset.

    Returns
    -------
    None
      Function calls itself recursively and populates `spec` in-place.
    """
    # this beast walks the directory tree from a given `basepath` until
    # it discovers any of the given `targetpaths`
    # if it finds one, it commits any accumulated trace of visited
    # datasets on this edge to the spec
    valid_repo = GitRepo.is_valid_repo(basepath)
    if valid_repo:
        # we are passing into a new dataset, extend the dataset trace
        current_trace = current_trace + [basepath]
    if basepath in targetpaths:
        # found a targetpath, commit the trace
        for i, p in enumerate(current_trace[:-1]):
            # TODO RF prepare proper annotated path dicts
            spec[p] = list(set(spec.get(p, []) + [current_trace[i + 1]]))
    if not isdir(basepath):
        # nothing underneath this one -> done
        return
    # this edge is not done, we need to try to reach any downstream
    # dataset
    for p in listdir(basepath):
        if valid_repo and p == '.git':
            # ignore gitdir to speed things up
            continue
        p = opj(basepath, p)
        if all(t != p and not t.startswith(_with_sep(p)) for t in targetpaths):
            # OPT listdir might be large and we could have only few items
            # in `targetpaths` -- so traverse only those in spec which have
            # leading dir basepath
            continue
        # we need to call this even for non-directories, to be able to match
        # file target paths
        discover_dataset_trace_to_targets(p, targetpaths, current_trace, spec)

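# Shape of the `spec` populated by `discover_dataset_trace_to_targets`,
# illustrated with hypothetical paths (assuming '/top', '/top/mid', and
# '/top/mid/deep' are all valid repositories):
#
#   spec = {}
#   discover_dataset_trace_to_targets('/top', ['/top/mid/deep'], [], spec)
#   # spec == {'/top': ['/top/mid'], '/top/mid': ['/top/mid/deep']}
#
# i.e. each superdataset key maps to the next dataset(s) on the edge leading
# towards a requested target path.
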
def amend_pathspec_with_superdatasets(spec, topmost=True, limit_single=False):
    """Amend a path spec dictionary with entries for superdatasets

    The result will be a superdataset entry (if a superdataset exists)
    for each input dataset. This entry will (at least) contain the path
    to the subdataset.

    Parameters
    ----------
    spec : dict
      Path spec
    topmost : Dataset or bool
      Flag whether to grab the immediate, or the top-most superdataset
      for each entry, alternatively this can be a dataset instance
      that is used as the topmost dataset.
    limit_single : bool
      If a `topmost` dataset is provided, and this flag is True, only the
      given topmost dataset will be considered as superdataset. Any datasets
      in the spec that are not underneath this dataset will not have
      associated superdataset entries added to the spec.

    Returns
    -------
    dict
      Amended path spec dictionary
    """
    superdss = {}
    for dpath in spec.keys():
        superds = None
        if isinstance(topmost, Dataset):
            if limit_single and dpath == topmost.path:
                # this is already the topmost, no further superdataset to
                # consider
                continue
            if dpath.startswith(_with_sep(topmost.path)):
                # the given topmost dataset is "above" the current
                # dataset's path
                superds = topmost
            elif limit_single:
                continue
        if not superds:
            # grab the (topmost) superdataset
            superds = Dataset(dpath).get_superdataset(
                datalad_only=True, topmost=topmost)
        if not superds:
            continue
        # register the subdataset's path in the spec of the superds
        spaths = superdss.get(superds.path, [])
        if not spaths:
            spaths = spec.get(superds.path, [])
        spaths.append(dpath)
        superdss[superds.path] = spaths
    spec.update(superdss)
    return spec

def _filterpaths(basepath, paths, exclude):
    final_paths = []
    for rp in [opj(basepath, p) if basepath else p for p in paths]:
        if rp in exclude:
            continue
        elif any(ep.startswith(_with_sep(rp)) for ep in exclude):
            # an excluded path lies underneath this one: recurse into its
            # content and filter the individual entries
            # NOTE `ds` is not defined in this helper; it is expected to be
            # available from an enclosing scope
            final_paths.extend(
                _filterpaths(rp, listdir(opj(ds.path, rp)), exclude))
        else:
            final_paths.append(rp)
    return final_paths

def _get_untracked_content(dspath, report_untracked, paths=None):
    cmd = [
        'git', '--work-tree=.', 'status', '--porcelain',
        # file names NULL terminated
        '-z',
        # we never want to touch submodules, they cannot be untracked
        '--ignore-submodules=all',
        # fully untracked dirs as such, the rest as files
        '--untracked={}'.format(report_untracked)]
    try:
        stdout, stderr = GitRunner(cwd=dspath).run(
            cmd,
            log_stderr=True,
            log_stdout=True,
            log_online=False,
            expect_stderr=False,
            shell=False,
            expect_fail=True)
    except CommandError as e:
        # TODO should we catch any and handle them in here?
        raise e

    if paths:
        paths = [r['path'] for r in paths]
        if len(paths) == 1 and paths[0] == dspath:
            # nothing to filter
            paths = None

    for line in stdout.split('\0'):
        if not line:
            continue
        if not line.startswith('?? '):
            # nothing untracked, ignore, task of `diff`
            continue
        apath = opj(
            dspath,
            # strip state marker
            line[3:])
        norm_apath = normpath(apath)
        if paths and not any(
                [norm_apath == p or apath.startswith(_with_sep(p))
                 for p in paths]):
            # we got a whitelist for paths, don't report any other
            continue
        ap = dict(
            path=norm_apath,
            parentds=dspath,
            state='untracked',
            type='directory' if isdir(apath) else 'file')
        yield ap

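# `git status --porcelain -z` emits NUL-separated records, each starting with
# a two-character state code ('??' for untracked) followed by a space and the
# path.  A standalone sketch of the parsing performed above (the sample
# output string is made up for illustration):
sample = '?? untracked.txt\0?? newdir/\0 M modified.txt\0'
untracked = [
    rec[3:]                   # strip the '?? ' state marker
    for rec in sample.split('\0')
    if rec.startswith('?? ')  # anything else is the business of `diff`
]
assert untracked == ['untracked.txt', 'newdir/']
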
def _recursive_install_subds_underneath(ds, recursion_limit, reckless, start=None,
                                        refds_path=None, description=None):
    if isinstance(recursion_limit, int) and recursion_limit <= 0:
        return
    # install using a helper that gives some flexibility regarding where to
    # get the module from
    for sub in ds.subdatasets(
            return_type='generator', result_renderer='disabled'):
        subds = Dataset(sub['path'])
        if sub.get('gitmodule_datalad-recursiveinstall', '') == 'skip':
            lgr.debug(
                "subdataset %s is configured to be skipped on recursive installation",
                sub['path'])
            continue
        if start is not None and not subds.path.startswith(_with_sep(start)):
            # this one we can ignore, not underneath the start path
            continue
        if sub['state'] != 'absent':
            # dataset was already found to exist
            yield get_status_dict(
                'install', ds=subds, status='notneeded', logger=lgr,
                refds=refds_path)
            # do not continue, even if an intermediate dataset exists it
            # does not imply that everything below it does too
        else:
            # try to get this dataset
            try:
                subds = _install_subds_from_flexible_source(
                    ds,
                    relpath(sub['path'], start=ds.path),
                    sub['gitmodule_url'],
                    reckless,
                    description=description)
                yield get_status_dict(
                    'install', ds=subds, status='ok', logger=lgr,
                    refds=refds_path,
                    message=("Installed subdataset %s", subds),
                    parentds=ds.path)
            except Exception as e:
                # skip all of downstairs, if we didn't manage to install the
                # subdataset
                yield get_status_dict(
                    'install', ds=subds, status='error', logger=lgr,
                    refds=refds_path,
                    message=("Installation of subdataset %s failed with exception: %s",
                             subds, exc_str(e)))
                continue
        # otherwise recurse
        # we can skip the start expression, we know we are within
        for res in _recursive_install_subds_underneath(
                subds,
                recursion_limit=recursion_limit - 1
                if isinstance(recursion_limit, int) else recursion_limit,
                reckless=reckless,
                refds_path=refds_path):
            yield res

def _dump_submeta(ds, submetas, matchpath, save, modified_ds):
    known_subds = list(submetas.keys())
    for p in known_subds:
        smeta = submetas[p]
        if matchpath and not p.startswith(_with_sep(matchpath)):
            continue
        subds_relpath = relpath(p, matchpath)
        # inject proper inter-dataset relationships
        for m in smeta:
            # skip non-implicit
            if not is_implicit_metadata(m):
                continue
            if 'dcterms:isPartOf' not in m and m.get('type', None) == 'Dataset':
                m['dcterms:isPartOf'] = ds.id
        sp = opj(ds.path, metadata_basepath, subds_relpath)
        _store_json(ds, sp, smeta)

        # TODO this is all wrong! It should not talk to repo methods and
        # emulate high-level code, but use the (now) existing high-level
        # commands

        # stage potential changes in the subdataset
        try:
            ds.repo.add(subds_relpath, git=True)
        except CommandError:
            # TODO as a bonus this exception handling is untested! wipe out
            # during upcoming RF
            # it can blow if we skipped a non-dataset submodule
            # in this case we need to find the chain of submodules leading to
            # it and save them bottom-up
            testpath = dirname(subds_relpath)
            while testpath:
                repo = ds.subdatasets(contains=testpath,
                                      result_xfm='datasets',
                                      return_type='item-or-list')
                repo.repo.add(relpath(subds_relpath, testpath), git=True)
                modified_ds = _save_helper(repo, save, modified_ds)
                # see if there is anything left...
                # IMPORTANT to go with relpath to actually get to an empty
                # string eventually
                testpath = dirname(relpath(repo.path, ds.path))
        # remove stored item from lookup
        del submetas[p]
    return modified_ds

def sort_paths_into_subdatasets(superds_path, target_subs, spec):
    # XXX forge a chain: whenever some path needs to be pushed down
    # put the receiving dataset as a component to process into the
    # respective superdataset -- this will enable further processing
    # of all datasets in a completely independent fashion
    # (except for order of processing)

    # get all existing subdatasets as candidate nodes of the graph
    # that needs to be built and checked
    subds_graph = Dataset(superds_path).get_subdatasets(
        absolute=True, recursive=True, edges=True, fulfilled=True)
    if not subds_graph:
        # no subdatasets, nothing to sort
        return
    for t in target_subs:
        trace = get_trace(
            subds_graph,
            superds_path,
            t)
        if not trace:
            # not connected, or identical
            continue
        tosort = [superds_path] + trace + [t]
        # loop over all but the last one, simplifies logic below
        for i, d in enumerate(tosort[:-1]):
            paths = spec.get(d, [])
            keep_paths = []
            next_ds = tosort[i + 1]
            next_dspaths = spec.get(next_ds, [])
            comp = _with_sep(next_ds)
            for p in assure_list(paths):
                if p.startswith(comp):
                    next_dspaths.append(p)
                    # remember that we pushed the path into this dataset
                    keep_paths.append(next_ds)
                else:
                    keep_paths.append(p)
            spec[next_ds] = next_dspaths
            spec[d] = keep_paths
    # tidy up -- deduplicate
    for c in spec:
        spec[c] = list(set(spec[c]))

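# Illustration of the "push down" effect of `sort_paths_into_subdatasets`
# with hypothetical paths (assuming '/super/mid' and '/super/mid/sub' are
# installed subdatasets and `get_trace` reports '/super/mid' as the
# intermediate node):
#
#   spec = {'/super': ['/super/mid/sub/file.dat']}
#   sort_paths_into_subdatasets('/super', ['/super/mid/sub'], spec)
#   # spec == {'/super': ['/super/mid'],
#   #          '/super/mid': ['/super/mid/sub'],
#   #          '/super/mid/sub': ['/super/mid/sub/file.dat']}
#
# the file path moved into the record of the dataset that actually contains
# it, and each superdataset record now lists the next dataset in the chain
# as a component to process.
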
def process_vanished_paths(unavailable_paths, content_by_ds):
    # presently unavailable paths could be, e.g., deleted files, or
    # uninstalled subdatasets, or simply nothing -> figure it out and act
    # accordingly
    dsinfo = {}
    nonexistent_paths = []
    for p in unavailable_paths:
        # we need to check whether any of these correspond
        # to a known subdataset, and add those to the list of
        # things to be removed
        toppath = get_dataset_root(p)
        if not toppath:
            nonexistent_paths.append(p)
            continue
        ds = Dataset(toppath)
        dinfo = dsinfo.get(
            toppath,
            {'deleted': ds.repo.get_deleted_files(),
             'subds': ds.get_subdatasets(recursive=False, absolute=True)})
        # cache for a potentially following request
        dsinfo[toppath] = dinfo
        if p in dinfo['subds']:
            # test for subds needs to come first, as it would also show
            # up in "deleted_files"
            # this is a known subdataset that has vanished
            lgr.debug('deinit vanished subdataset {} in {}'.format(p, ds))
            # simply deinit to complete a "forced uninstallation", without
            # an explicit "remove" there is nothing to be saved in this
            # case
            ds.repo.deinit_submodule(p[len(_with_sep(ds.path)):])
        elif p in dinfo['deleted']:
            # vanished file -> 'git rm' it to stage the change
            ds.repo.remove(p)
            # record that we are "saving" this path
            dpaths = content_by_ds.get(ds.path, [])
            dpaths.append(p)
            content_by_ds[ds.path] = dpaths
        else:
            # this is nothing we can handle anyhow
            nonexistent_paths.append(p)
    return content_by_ds, nonexistent_paths

def yield_recursive(ds, path, action, recursion_limit):
    # make sure we get everything relevant in all _checked out_
    # subdatasets, obtaining of previously unavailable subdatasets
    # is done elsewhere
    for subd_res in ds.subdatasets(recursive=True,
                                   recursion_limit=recursion_limit,
                                   return_type='generator'):
        # this check is not the same as subdatasets --contains=path
        # because we want all subdatasets below a path, not just the
        # containing one
        if subd_res['path'].startswith(_with_sep(path)):
            # this subdataset is underneath the search path
            # be careful to not overwrite anything, in case
            # this subdataset has been processed before
            subd_res['action'] = action
            # mark as "notprocessed"
            subd_res['status'] = ''
            # we know that this is a known subdataset, that is how
            # we got here, make a record
            subd_res['registered_subds'] = True
            yield subd_res

def _dump_submeta(ds, submetas, matchpath, save, modified_ds):
    known_subds = list(submetas.keys())
    for p in known_subds:
        smeta = submetas[p]
        if matchpath and not p.startswith(_with_sep(matchpath)):
            continue
        subds_relpath = relpath(p, matchpath)
        # inject proper inter-dataset relationships
        for m in smeta:
            # skip non-implicit
            if not is_implicit_metadata(m):
                continue
            if 'dcterms:isPartOf' not in m and m.get('type', None) == 'Dataset':
                m['dcterms:isPartOf'] = ds.id
        sp = opj(ds.path, metadata_basepath, subds_relpath)
        _store_json(ds, sp, smeta)
        # stage potential changes in the subdataset
        try:
            ds.repo.add(subds_relpath, git=True)
        except CommandError:
            # it can blow if we skipped a non-dataset submodule
            # in this case we need to find the chain of submodules leading to
            # it and save them bottom-up
            testpath = dirname(subds_relpath)
            while testpath:
                # TODO this is a slow call that implies pretty bad repeated
                # traversal of dataset trees -- RF to use `subdatasets --contains`
                repo = ds.get_containing_subdataset(testpath)
                repo.repo.add(relpath(subds_relpath, testpath), git=True)
                modified_ds = _save_helper(repo, save, modified_ds)
                # see if there is anything left...
                # IMPORTANT to go with relpath to actually get to an empty
                # string eventually
                testpath = dirname(relpath(repo.path, ds.path))
        # remove stored item from lookup
        del submetas[p]
    return modified_ds

def _dump_submeta(ds, submetas, matchpath, save, modified_ds):
    known_subds = list(submetas.keys())
    for p in known_subds:
        smeta = submetas[p]
        if matchpath and not p.startswith(_with_sep(matchpath)):
            continue
        subds_relpath = relpath(p, matchpath)
        # inject proper inter-dataset relationships
        for m in smeta:
            # skip non-implicit
            if not is_implicit_metadata(m):
                continue
            if 'dcterms:isPartOf' not in m and m.get('type', None) == 'Dataset':
                m['dcterms:isPartOf'] = ds.id
        sp = opj(ds.path, metadata_basepath, subds_relpath)
        _store_json(ds, sp, smeta)
        # stage potential changes in the subdataset
        try:
            ds.repo.add(subds_relpath, git=True)
        except CommandError:
            # it can blow if we skipped a non-dataset submodule
            # in this case we need to find the chain of submodules leading to
            # it and save them bottom-up
            testpath = dirname(subds_relpath)
            while testpath:
                repo = ds.get_containing_subdataset(testpath)
                repo.repo.add(relpath(subds_relpath, testpath), git=True)
                modified_ds = _save_helper(repo, save, modified_ds)
                # see if there is anything left...
                # IMPORTANT to go with relpath to actually get to an empty
                # string eventually
                testpath = dirname(relpath(repo.path, ds.path))
        # remove stored item from lookup
        del submetas[p]
    return modified_ds

def __call__(
        dataset=None,
        fulfilled=None,
        recursive=False,
        recursion_limit=None,
        contains=None,
        bottomup=False,
        set_property=None,
        delete_property=None):
    dataset = require_dataset(
        dataset, check_installed=False,
        purpose='subdataset reporting/modification')
    refds_path = dataset.path

    # XXX this seems strange, but is tested to be the case -- I'd rather set
    # `check_installed` to true above and fail
    if not GitRepo.is_valid_repo(refds_path):
        return

    # return as quickly as possible
    if isinstance(recursion_limit, int) and (recursion_limit <= 0):
        return

    try:
        if not (bottomup or contains or set_property or delete_property or
                (recursive and recursion_limit is not None)):
            # FAST IMPLEMENTATION FOR THE STRAIGHTFORWARD CASE
            # as fast as possible (just a single call to Git)
            # need to track current parent
            stack = [refds_path]
            modinfo_cache = {}
            for sm in _parse_git_submodules(refds_path, recursive=recursive):
                # unwind the parent stack until we find the right one
                # this assumes that submodules come sorted
                while not sm['path'].startswith(_with_sep(stack[-1])):
                    stack.pop()
                parent = stack[-1]
                if parent not in modinfo_cache:
                    # read the parent .gitmodules, if not done yet
                    modinfo_cache[parent] = _parse_gitmodules(parent)
                # get URL info, etc.
                sm.update(modinfo_cache[parent].get(sm['path'], {}))
                subdsres = get_status_dict(
                    'subdataset',
                    status='ok',
                    type='dataset',
                    refds=refds_path,
                    logger=lgr)
                subdsres.update(sm)
                subdsres['parentds'] = parent
                if (fulfilled is None or
                        GitRepo.is_valid_repo(sm['path']) == fulfilled):
                    yield subdsres
                # for the next "parent" commit this subdataset to the stack
                stack.append(sm['path'])
            # MUST RETURN: the rest of the function is doing another
            # implementation
            return
    except InvalidGitRepositoryError as e:
        lgr.debug("fast subdataset query failed, trying slow robust one (%s)",
                  exc_str(e))

    # MORE ROBUST, FLEXIBLE, BUT SLOWER IMPLEMENTATION
    # slow but flexible (one Git call per dataset), but deals with
    # subdatasets in direct mode
    if contains:
        contains = resolve_path(contains, dataset)
    for r in _get_submodules(
            dataset.path, fulfilled, recursive, recursion_limit,
            contains, bottomup, set_property, delete_property,
            refds_path):
        # without the refds_path cannot be rendered/converted relative
        # in the eval_results decorator
        r['refds'] = refds_path
        yield r

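# The fast path above relies on submodule records arriving in sorted order,
# so the current parent can be tracked with a simple stack.  A standalone
# sketch of that unwinding logic with made-up paths (POSIX separators
# assumed, no Git involved):
submodule_paths = [
    '/top/sub1',
    '/top/sub1/subsub',
    '/top/sub2',
]
stack = ['/top']
parents = {}
for smpath in submodule_paths:
    # unwind until the top of the stack is an ancestor of this submodule
    while not smpath.startswith(stack[-1] + '/'):
        stack.pop()
    parents[smpath] = stack[-1]
    # this submodule becomes the candidate parent for the next record
    stack.append(smpath)

assert parents == {
    '/top/sub1': '/top',
    '/top/sub1/subsub': '/top/sub1',
    '/top/sub2': '/top',
}
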
def __call__(
        path=None,
        dataset=None,
        recursive=False,
        recursion_limit=None,
        action=None,
        unavailable_path_status='',
        unavailable_path_msg=None,
        nondataset_path_status='error',
        force_parentds_discovery=True,
        force_subds_discovery=True,
        force_no_revision_change_discovery=True,
        force_untracked_discovery=True,
        modified=None):
    # upfront check for the fastest possible response
    if not path and dataset is None:
        # nothing given, try "here", but do not use `require_dataset`, as
        # it will determine the root dataset of `curdir` and further down
        # lead to path annotation of upstairs directories
        dataset = curdir

    if force_subds_discovery and not force_parentds_discovery:
        raise ValueError(
            'subdataset discovery requires parent dataset discovery')

    # CONCEPT: yield with no status to indicate further processing

    # everything in one big loop to be able to yield as fast as possible
    # without any precomputing for all paths
    refds_path = Interface.get_refds_path(dataset)
    if modified is not None and (refds_path is None or
                                 not GitRepo.is_valid_repo(refds_path)):
        raise ValueError(
            "modification detection only works with a base dataset "
            "(non-given or found)")

    # prep common result props
    res_kwargs = dict(
        action=action if action else 'annotate_path',
        refds=refds_path,
        logger=lgr)

    # handle the case of recursion into a single dataset without any
    # extra fancy processing first -- full recursion can be done
    # faster than manual recursion, hence we gain quite some speed
    # from these few lines of extra code
    if not modified and not path and refds_path:
        if not GitRepo.is_valid_repo(refds_path):
            yield get_status_dict(
                # doesn't matter if the path is in another dataset
                # it was given as reference dataset
                status=nondataset_path_status,
                message='given reference dataset is not a dataset',
                path=refds_path,
                **res_kwargs)
            return

        refds = Dataset(refds_path)
        path = []
        # yield the dataset itself
        r = get_status_dict(ds=refds, status='', **res_kwargs)
        yield r

        if recursive:
            # if we have nothing given, but need recursion, we need to feed
            # the dataset path itself
            for r in yield_recursive(
                    refds,
                    refds_path,
                    action,
                    recursion_limit):
                r.update(res_kwargs)
                if 'refds' in r and not r['refds']:
                    # avoid cruft
                    del r['refds']
                yield r
        return

    # goal: structure in a way that makes most information on any path
    # available in a single pass, at the cheapest possible cost
    reported_paths = {}
    requested_paths = assure_list(path)

    if modified is not None:
        # modification detection would silently kill all nondataset paths
        # but we have to complain about them, hence doing it here
        if requested_paths and refds_path:
            for r in requested_paths:
                p = r['path'] if isinstance(r, dict) else r
                p = resolve_path(p, ds=refds_path)
                if path_startswith(p, refds_path):
                    # all good
                    continue
                # not the refds
                path_props = r if isinstance(r, dict) else {}
                res = get_status_dict(
                    **dict(res_kwargs, **path_props))
                res['status'] = nondataset_path_status
                res['message'] = 'path not associated with reference dataset'
                reported_paths[r] = res
                yield res

        # preserve non-existing paths to be silently killed by modification
        # detection and append them to requested_paths again after detection.
        # TODO: This might be melted in with treatment of non dataset paths
        # above. Re-appending those paths seems to be better than yielding
        # directly to avoid code duplication, since both cases later on are
        # dealt with again.
        preserved_paths = []
        if requested_paths:
            [preserved_paths.append(r)
             for r in requested_paths
             if not lexists(r['path'] if isinstance(r, dict) else r)]

        # replace the requested paths by those paths that were actually
        # modified underneath or at a requested location
        requested_paths = get_modified_subpaths(
            # either the request, or the base dataset, if there was no request
            requested_paths if requested_paths else [refds_path],
            refds=Dataset(refds_path),
            revision=modified,
            report_no_revision_change=force_no_revision_change_discovery,
            report_untracked='all' if force_untracked_discovery else 'no',
            recursion_limit=recursion_limit)

        from itertools import chain
        # re-append the preserved paths:
        requested_paths = chain(requested_paths, iter(preserved_paths))

    # do not loop over unique(), this could be a list of dicts
    # we avoid duplicates manually below via `reported_paths`
    for path in requested_paths:
        if not isinstance(path, dict):
            path = rawpath2ap(path, refds_path)
        # this is now an annotated path!
        path_props = path
        path = path['path']
        # we need to mark our territory, who knows where this has been
        path_props.update(res_kwargs)

        if path in reported_paths:
            # we already recorded this path in the output
            # this can happen, whenever `path` is a subdataset, that was
            # discovered via recursive processing of another path before
            continue

        # the path exists in some shape or form
        # TODO if we have path_props already we could skip this test
        if isdir(path):
            # keep any existing type info, previously a more expensive run
            # could have discovered an uninstalled 'dataset', and we don't
            # want it to be relabeled to a directory
            path_props['type'] = \
                path_props.get(
                    'type',
                    'dataset' if not islink(path) and GitRepo.is_valid_repo(path)
                    else 'directory')
            # this could contain all types of additional content
            containing_dir = path if not islink(path) \
                else normpath(opj(path, pardir))
        else:
            if lexists(path):
                path_props['type'] = 'file'
            else:
                path_props['state'] = 'absent'
            # for everything else we are interested in the container
            containing_dir = dirname(path)
            if not containing_dir:
                containing_dir = curdir

        dspath = parent = get_dataset_root(containing_dir)
        if dspath:
            if path_props.get('type', None) == 'dataset':
                # for a dataset the root is not the parent, for anything else
                # it is
                parent = path_props.get('parentds', None)
                oneupdir = normpath(opj(containing_dir, pardir))
                if parent is None and (force_parentds_discovery or (
                        refds_path and _with_sep(oneupdir).startswith(
                            _with_sep(refds_path)))):
                    # either forced, or only if we have a reference dataset,
                    # and only if we stay within this refds when searching
                    # for the parent
                    parent = get_dataset_root(normpath(opj(containing_dir, pardir)))
                    # NOTE the `and refds_path` is critical, as it will
                    # determine whether a top-level dataset that was
                    # discovered gets the parent property or not, it won't
                    # get it without a common base dataset, and that is how
                    # we always rolled
                    if parent and refds_path:
                        path_props['parentds'] = parent
                    # don't check whether this is actually a true subdataset
                    # of the parent, done further down
            else:
                # set parent, but prefer existing property
                path_props['parentds'] = path_props.get('parentds', dspath)

        # test for `dspath` not `parent`, we only need to know whether there
        # is ANY dataset, not which one is the true parent, logic below
        # relies on the fact that we end here, if there is no dataset at all
        if not dspath:
            # not in any dataset
            res = get_status_dict(
                **dict(res_kwargs, **path_props))
            res['status'] = nondataset_path_status
            res['message'] = 'path not associated with any dataset'
            reported_paths[path] = res
            yield res
            continue

        # check that we only got SUBdatasets
        if refds_path and not path_startswith(dspath, refds_path):
            res = get_status_dict(**dict(res_kwargs, **path_props))
            res['status'] = nondataset_path_status
            res['message'] = \
                ('path not part of the reference dataset at %s', refds_path)
            reported_paths[path] = res
            yield res
            continue

        if path_props.get('type', None) == 'file':
            # nothing else we can learn about this
            res = get_status_dict(**dict(res_kwargs, **path_props))
            if 'status' not in res:
                res['status'] = ''
            reported_paths[path] = res
            yield res
            continue

        containing_ds = None
        path_type = path_props.get('type', None)
        if parent and force_subds_discovery and (
                (path_type == 'dataset' and 'registered_subds' not in path_props) or
                path_type == 'directory' or
                not lexists(path)):
            # if the path doesn't exist, or is labeled a directory, or a
            # dataset even (without this info) -> record whether this is a
            # known subdataset to its parent
            containing_ds = Dataset(parent)
            subdss = containing_ds.subdatasets(
                fulfilled=None, recursive=False,
                result_xfm=None, result_filter=None, return_type='list')
            if path in [s['path'] for s in subdss]:
                if path_type == 'directory' or not lexists(path):
                    # first record that it isn't here, if just a dir or not
                    # here at all
                    path_props['state'] = 'absent'
                # this must be a directory, and it is not installed
                path_props['type'] = 'dataset'
                path_props['registered_subds'] = True

        if not lexists(path) or \
                (path_props.get('type', None) == 'dataset' and
                 path_props.get('state', None) == 'absent'):
            # not there (yet)
            message = unavailable_path_msg if unavailable_path_msg else None
            if message and '%s' in message:
                message = (message, path)
            path_props['message'] = message
            res = get_status_dict(**dict(res_kwargs, **path_props))
            # assign given status, but only if the props don't indicate a
            # status already
            res['status'] = path_props.get(
                'status', unavailable_path_status)
            reported_paths[path] = res
            yield res
            continue

        # we know everything we can, report
        res = get_status_dict(**dict(res_kwargs, **path_props))
        if 'status' not in res:
            res['status'] = ''
        reported_paths[path] = res
        yield res

        rec_paths = []
        if recursive:
            # here we need to consider the special case that `path` is
            # a dataset itself, if a recursion_limit is given (e.g.
            # `remove` will do that by default), we need to recurse
            # from the dataset itself, and not its parent to get things
            # right -- this will also avoid needless discovery of
            # unrelated subdatasets
            if path_props.get('type', None) == 'dataset':
                containing_ds = Dataset(path)
            else:
                # regular parent, we might have a dataset already
                containing_ds = Dataset(parent) if containing_ds is None \
                    else containing_ds
            for r in yield_recursive(containing_ds, path, action,
                                     recursion_limit):
                # capture reported paths
                r.update(res_kwargs)
                if 'refds' in r and not r['refds']:
                    # avoid cruft
                    del r['refds']
                reported_paths[r['path']] = r
                if modified is not None:
                    # we cannot yield right away, maybe it wasn't modified
                    rec_paths.append(r)
                else:
                    yield r
        if modified is not None and rec_paths:
            # replace the recursively discovered paths by those paths that
            # were actually modified underneath or at a requested location
            for r in get_modified_subpaths(
                    rec_paths,
                    refds=Dataset(refds_path),
                    revision=modified,
                    report_no_revision_change=force_no_revision_change_discovery,
                    report_untracked='all' if force_untracked_discovery else 'no',
                    recursion_limit=recursion_limit):
                res = get_status_dict(**dict(r, **res_kwargs))
                reported_paths[res['path']] = res
                yield res
    return

def get_modified_subpaths(aps, refds, revision,
                          recursion_limit=None,
                          report_no_revision_change=True,
                          report_untracked='all'):
    """
    Parameters
    ----------
    aps : list
    refds : Dataset
    revision : str
      Commit-ish
    """
    # TODO needs recursion limit
    # NOTE this is implemented as a generator despite the fact that we need
    # to sort through _all_ the inputs initially, diff'ing each involved
    # dataset takes time that we can use to already act on intermediate
    # result paths, without having to wait for 100% completion
    if revision is None:
        # we want all, subds not matching the ref are assumed to have been
        # sorted out before (e.g. one level up)
        for r in aps:
            yield r

    # life is simple: we diff the base dataset
    modified = []
    # Diff.__call__ is used to get access to the now obsolete interface.diff
    # that exists merely for annotate_paths.  (refds.diff corresponds to
    # core.local.diff.)
    from datalad.interface.diff import Diff
    for r in Diff.__call__(
            dataset=refds,
            # we cannot really limit the diff paths easily because we might
            # get or miss content (e.g. subdatasets) if we don't figure out
            # which ones are known -- and we don't want that
            path=None,
            # `revision` can be anything that Git supports for `diff`
            # `True` is code for diff without revision
            revision=revision if revision is not True else None,
            # it is important that staged is False, otherwise we would miss
            # unstaged changes when e.g. diffing against HEAD (save does that)
            staged=False,
            # we might want to consider putting 'untracked' here
            # maybe that is a little faster, not tested yet
            ignore_subdatasets='none',
            # by default, we want to see any individual untracked file, this
            # simplifies further processing dramatically, but may require
            # subsequent filtering in order to avoid flooding user output
            # with useless info
            report_untracked=report_untracked,
            # no recursion, we need to update `revision` for every subdataset
            # before we can `diff`
            recursive=False,
            return_type='generator',
            result_renderer=None,
            # need to be able to yield the errors
            on_failure='ignore'):
        if r['status'] in ('impossible', 'error'):
            # something unexpected, tell daddy
            yield r
            continue
        # if asked, and no change in revision -- skip
        if not report_no_revision_change \
                and (r.get('revision_src') or r.get('revision')) \
                and (r.get('revision_src') == r.get('revision')):
            continue
        r['status'] = ''
        modified.append(r)

    if not len(modified):
        # nothing modified, nothing to report
        return

    # now we can grab the APs that are in this dataset and yield them
    for ap in aps:
        # need to preserve pristine info first
        ap = ap if isinstance(ap, dict) else rawpath2ap(ap, refds.path)
        for m in modified:
            if ap['path'] == m['path']:
                # is directly modified, yield input AP
                # but update with what we learned about the modification
                ap.update(m)
                yield ap
                break
            if path_is_subpath(m['path'], ap['path']):
                # a modified path is underneath this AP
                # yield the modified one instead
                yield m
                continue

    mod_subs = [m for m in modified if m.get('type', None) == 'dataset']
    if not mod_subs or (recursion_limit is not None and recursion_limit < 1):
        return

    aps = [ap if isinstance(ap, dict) else rawpath2ap(ap, refds.path)
           for ap in aps]
    # now for all submodules that were found modified
    for sub in [m for m in modified if m.get('type', None) == 'dataset']:
        sub_path_ = _with_sep(sub['path'])
        # these AP match something inside this submodule, or the whole
        # submodule
        sub_aps = [ap for ap in aps
                   if _with_sep(ap['path']).startswith(sub_path_)]
        if not sub_aps:
            continue
        # we are interested in the modifications within this subdataset
        # from the state we previously had on record, till the state
        # we have on record now
        diff_range = '{}..{}'.format(
            sub['revision_src'] if sub['revision_src'] else PRE_INIT_COMMIT_SHA,
            sub['revision'] if sub['revision'] else '')
        if sub['revision_src'] and sub['revision_src'] == sub['revision']:
            # this is a special case, where the subdataset reported changes
            # without a change in state/commit -- this is code for
            # uncommitted changes in the subdataset (including staged ones).
            # In such a case, we must not provide a diff range, but only the
            # source commit we want to diff against
            # XXX if this is changed, likely the same logic in diff needs
            # changing too!
            diff_range = sub['revision_src']
        for r in get_modified_subpaths(
                sub_aps,
                Dataset(sub['path']),
                diff_range,
                recursion_limit=(recursion_limit - 1)
                if recursion_limit is not None else None):
            yield r

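# The `diff_range` constructed above is either a two-dot range between the
# recorded and the current subdataset state, or a single commit when the
# subdataset is at the same recorded commit but carries uncommitted changes,
# e.g. (hypothetical SHAs):
#
#   'a1b2c3..d4e5f6'   # subdataset moved from a1b2c3 to d4e5f6
#   'a1b2c3'           # same recorded state, but uncommitted modifications
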
def annotated2content_by_ds(annotated, refds_path, path_only=False):
    """Helper to convert annotated paths into an old-style content_by_ds dict

    Only items with a `status` property value not equal to 'ok', 'notneeded',
    'impossible', or 'error' are sorted. All others are considered as already
    processed and are returned in a separate list.

    Parameters
    ----------
    annotated : list or generator
      Dicts with annotated path information.
    refds_path : str
      Path to the reference dataset the original path annotation was based
      on.
    path_only: bool
      Whether returned dict values are sequences of just paths for each
      dataset, or whether the full info dicts are reported as items.

    Returns
    -------
    dict, dict, list, list
      Dict keys are dataset paths, values are determined by the `path_only`
      switch. The keys in the second dict are paths to datasets, values are
      dicts with all known properties about those datasets.
      The first list contains all already "processed" results, which
      typically need to be re-yielded. The second list contains items (same
      type as dict values) for all annotated paths that have no associated
      parent dataset (i.e. nondataset paths) -- this list will be empty by
      default, unless `nondataset_path_status` was set to ''."""
    content_by_ds = OrderedDict()
    ds_props = {}
    nondataset_paths = []
    completed = []
    for r in annotated:
        if r.get('type', None) == 'dataset':
            # collect all properties of all known datasets from the annotated
            # paths
            dp = ds_props.get(r['path'], {})
            dp.update(r)
            ds_props[r['path']] = dp
        if r.get('status', None) in ('ok', 'notneeded', 'impossible', 'error'):
            completed.append(r)
            continue
        parentds = r.get('parentds', None)
        if r.get('type', None) == 'dataset':
            # do dataset handling first, it is the more complex beast
            orig_request = r.get('orig_request', None)
            if parentds is None or refds_path is None or \
                    r.get('process_content', False) or (orig_request and (
                        orig_request == curdir or
                        orig_request.endswith(dirsep) or
                        orig_request.endswith('{}{}'.format(dirsep, curdir)))):
                # a dataset that floats by on its own OR
                # behave similar to rsync, a trailing '/' indicates the
                # content rather than the dataset itself
                # in both cases we want to process this part as part
                # of the same dataset, and not any potential parent
                toappendto = content_by_ds.get(r['path'], [])
                toappendto.append(r['path'] if path_only else r)
                content_by_ds[r['path']] = toappendto
            if parentds and refds_path and \
                    _with_sep(parentds).startswith(_with_sep(refds_path)):
                # put also in parentds record if there is any, and the parent
                # is underneath or identical to the reference dataset
                toappendto = content_by_ds.get(parentds, [])
                toappendto.append(r['path'] if path_only else r)
                content_by_ds[parentds] = toappendto
        else:
            # files and dirs
            # common case, something with a parentds
            toappendto = content_by_ds.get(parentds, [])
            toappendto.append(r['path'] if path_only else r)
            content_by_ds[parentds] = toappendto

    return content_by_ds, ds_props, completed, nondataset_paths

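# Shape of the grouping produced by `annotated2content_by_ds`, illustrated
# with two hypothetical annotated paths (a dataset and a file it contains):
#
#   annotated = [
#       {'path': '/ds', 'type': 'dataset', 'status': ''},
#       {'path': '/ds/file.dat', 'type': 'file', 'parentds': '/ds', 'status': ''},
#   ]
#   content_by_ds, ds_props, completed, nondataset = \
#       annotated2content_by_ds(annotated, refds_path='/ds', path_only=True)
#   # content_by_ds == {'/ds': ['/ds', '/ds/file.dat']}
#   # ds_props == {'/ds': <all annotated properties of that dataset>}
#   # completed == []  and  nondataset == []
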
def __call__( path=None, dataset=None, recursive=False, recursion_limit=None, action=None, unavailable_path_status='', unavailable_path_msg=None, nondataset_path_status='error', force_parentds_discovery=True, force_subds_discovery=True, force_no_revision_change_discovery=True, force_untracked_discovery=True, modified=None): # upfront check for the fastest possible response if not path and dataset is None: # nothing given, try "here", but do not use `require_dataset`, as # it will determine the root dataset of `curdir` and further down # lead to path annotation of upstairs directories dataset = curdir if force_subds_discovery and not force_parentds_discovery: raise ValueError( 'subdataset discovery requires parent dataset discovery') # CONCEPT: yield with no status to indicate further processing # everything in one big loop to be able too yield as fast a possible # without any precomputing for all paths refds_path = Interface.get_refds_path(dataset) if modified is not None and (refds_path is None or not GitRepo.is_valid_repo(refds_path)): raise ValueError( "modification detection only works with a base dataset (non-given or found)") # prep common result props res_kwargs = dict( action=action if action else 'annotate_path', refds=refds_path, logger=lgr) # handle the case of recursion into a single dataset without any # extra fancy processing first -- full recursion can be done # faster than manual recursion, hence we gain quite some speed # from these few lines of extra code if not modified and not path and refds_path: if not GitRepo.is_valid_repo(refds_path): yield get_status_dict( # doesn't matter if the path is in another dataset # it was given as reference dataset status=nondataset_path_status, message='given reference dataset is not a dataset', path=refds_path, **res_kwargs) return refds = Dataset(refds_path) path = [] # yield the dataset itself r = get_status_dict(ds=refds, status='', **res_kwargs) yield r if recursive: # if we have nothing given, but need recursion, we need to feed # the dataset path itself for r in yield_recursive( refds, refds_path, action, recursion_limit): r.update(res_kwargs) if 'refds' in r and not r['refds']: # avoid cruft del r['refds'] yield r return # goal: structure in a way that makes most information on any path # available in a single pass, at the cheapest possible cost reported_paths = {} requested_paths = assure_list(path) if modified is not None: # modification detection would silently kill all nondataset paths # but we have to complain about them, hence doing it here if requested_paths and refds_path: for r in requested_paths: p = r['path'] if isinstance(r, dict) else r p = resolve_path(p, ds=refds_path) if _with_sep(p).startswith(_with_sep(refds_path)): # all good continue # not the refds path_props = r if isinstance(r, dict) else {} res = get_status_dict( **dict(res_kwargs, **path_props)) res['status'] = nondataset_path_status res['message'] = 'path not associated with reference dataset' reported_paths[r] = res yield res # preserve non-existing paths to be silently killed by modification # detection and append them to requested_paths again after detection. # TODO: This might be melted in with treatment of non dataset paths # above. Re-appending those paths seems to be better than yielding # directly to avoid code duplication, since both cases later on are # dealt with again. 
preserved_paths = [] if requested_paths: [preserved_paths.append(r) for r in requested_paths if not lexists(r['path'] if isinstance(r, dict) else r)] # replace the requested paths by those paths that were actually # modified underneath or at a requested location requested_paths = get_modified_subpaths( # either the request, or the base dataset, if there was no request requested_paths if requested_paths else [refds_path], refds=Dataset(refds_path), revision=modified, report_no_revision_change=force_no_revision_change_discovery, report_untracked='all' if force_untracked_discovery else 'no', recursion_limit=recursion_limit) from itertools import chain # re-append the preserved paths: requested_paths = chain(requested_paths, iter(preserved_paths)) # do not loop over unique(), this could be a list of dicts # we avoid duplicates manually below via `reported_paths` for path in requested_paths: if not isinstance(path, dict): path = rawpath2ap(path, refds_path) # this is now an annotated path! path_props = path path = path['path'] # we need to mark our territory, who knows where this has been path_props.update(res_kwargs) if path in reported_paths: # we already recorded this path in the output # this can happen, whenever `path` is a subdataset, that was # discovered via recursive processing of another path before continue # the path exists in some shape or form # TODO if we have path_props already we could skip this test if isdir(path): # keep any existing type info, previously a more expensive run # could have discovered an uninstalled 'dataset', and we don't # want it to be relabeled to a directory path_props['type'] = \ path_props.get( 'type', 'dataset' if GitRepo.is_valid_repo(path) else 'directory') # this could contain all types of additional content containing_dir = path else: if lexists(path): path_props['type'] = 'file' else: path_props['state'] = 'absent' # for everything else we are interested in the container containing_dir = dirname(path) if not containing_dir: containing_dir = curdir dspath = parent = get_dataset_root(containing_dir) if dspath: if path_props.get('type', None) == 'dataset': # for a dataset the root is not the parent, for anything else # it is parent = path_props.get('parentds', None) oneupdir = normpath(opj(containing_dir, pardir)) if parent is None and (force_parentds_discovery or ( refds_path and _with_sep(oneupdir).startswith( _with_sep(refds_path)))): # either forced, or only if we have a reference dataset, and # only if we stay within this refds when searching for the # parent parent = get_dataset_root(normpath(opj(containing_dir, pardir))) # NOTE the `and refds_path` is critical, as it will determine # whether a top-level dataset that was discovered gets the # parent property or not, it won't get it without a common # base dataset, and that is how we always rolled if parent and refds_path: path_props['parentds'] = parent # don't check whether this is actually a true subdataset of the # parent, done further down else: # set parent, but prefer existing property path_props['parentds'] = path_props.get('parentds', dspath) # test for `dspath` not `parent`, we only need to know whether there is # ANY dataset, not which one is the true parent, logic below relies on # the fact that we end here, if there is no dataset at all if not dspath: # not in any dataset res = get_status_dict( **dict(res_kwargs, **path_props)) res['status'] = nondataset_path_status res['message'] = 'path not associated with any dataset' reported_paths[path] = res yield res continue # check that we 
only got SUBdatasets if refds_path and not _with_sep(dspath).startswith(_with_sep(refds_path)): res = get_status_dict(**dict(res_kwargs, **path_props)) res['status'] = nondataset_path_status res['message'] = \ ('path not part of the reference dataset at %s', refds_path) reported_paths[path] = res yield res continue if path_props.get('type', None) == 'file': # nothing else we can learn about this res = get_status_dict(**dict(res_kwargs, **path_props)) if 'status' not in res: res['status'] = '' reported_paths[path] = res yield res continue containing_ds = None path_type = path_props.get('type', None) if parent and force_subds_discovery and ( (path_type == 'dataset' and 'registered_subds' not in path_props) or path_type == 'directory' or not lexists(path)): # if the path doesn't exist, or is labeled a directory, or a dataset even # a dataset (without this info) -> record whether this is a known subdataset # to its parent containing_ds = Dataset(parent) subdss = containing_ds.subdatasets( fulfilled=None, recursive=False, result_xfm=None, result_filter=None, return_type='list') if path in [s['path'] for s in subdss]: if path_type == 'directory' or not lexists(path): # first record that it isn't here, if just a dir or not here at all path_props['state'] = 'absent' # this must be a directory, and it is not installed path_props['type'] = 'dataset' path_props['registered_subds'] = True if not lexists(path) or \ (path_props.get('type', None) == 'dataset' and path_props.get('state', None) == 'absent'): # not there (yet) message = unavailable_path_msg if unavailable_path_msg else None if message and '%s' in message: message = (message, path) path_props['message'] = message res = get_status_dict(**dict(res_kwargs, **path_props)) # assign given status, but only if the props don't indicate a status # already res['status'] = path_props.get( 'status', unavailable_path_status) reported_paths[path] = res yield res continue # we know everything we can, report res = get_status_dict(**dict(res_kwargs, **path_props)) if 'status' not in res: res['status'] = '' reported_paths[path] = res yield res rec_paths = [] if recursive: # here we need to consider the special case that `path` is # a dataset itself, if a recursion_limit is given (e.g. # `remove` will do that by default), we need to recurse # from the dataset itself, and not its parent to get things # right -- this will also avoid needless discovery of # unrelated subdatasets if path_props.get('type', None) == 'dataset': containing_ds = Dataset(path) else: # regular parent, we might have a dataset already containing_ds = Dataset(parent) if containing_ds is None else containing_ds for r in yield_recursive(containing_ds, path, action, recursion_limit): # capture reported paths r.update(res_kwargs) if 'refds' in r and not r['refds']: # avoid cruft del r['refds'] reported_paths[r['path']] = r if modified is not None: # we cannot yield right away, maybe it wasn't modified rec_paths.append(r) else: yield r if modified is not None and rec_paths: # replace the recursively discovered paths by those paths that # were actually modified underneath or at a requested location for r in get_modified_subpaths( rec_paths, refds=Dataset(refds_path), revision=modified, report_no_revision_change=force_no_revision_change_discovery, report_untracked='all' if force_untracked_discovery else 'no', recursion_limit=recursion_limit): res = get_status_dict(**dict(r, **res_kwargs)) reported_paths[res['path']] = res yield res return
def get_modified_subpaths(aps, refds, revision, recursion_limit=None, report_no_revision_change=True, report_untracked='all'): """ Parameters ---------- aps : list refds : Dataset revision : str Commit-ish """ # TODO needs recursion limit # NOTE this is implemented as a generator despite that fact that we need # to sort through _all_ the inputs initially, diff'ing each involved # dataset takes time that we can use to already act on intermediate # result paths, without having to wait for 100% completion if revision is None: # we want all, subds not matching the ref are assumed to have been # sorted out before (e.g. one level up) for r in aps: yield r # life is simple: we diff the base dataset modified = [] for r in refds.diff( # we cannot really limit the diff paths easily because we might get # or miss content (e.g. subdatasets) if we don't figure out which # ones are known -- and we don't want that path=None, # `revision` can be anything that Git support for `diff` # `True` is code for diff without revision revision=revision if revision is not True else None, # it is important that staged is False, otherwise we would miss unstaged # changes when e.g. diffing against HEAD (save does that) staged=False, # we might want to consider putting 'untracked' here # maybe that is a little faster, not tested yet ignore_subdatasets='none', # by default, we want to see any individual untracked file, this simplifies further # processing dramatically, but may require subsequent filtering # in order to avoid flooding user output with useless info report_untracked=report_untracked, # no recursion, we needs to update `revision` for every subdataset # before we can `diff` recursive=False, return_type='generator', result_renderer=None, # need to be able to yield the errors on_failure='ignore'): if r['status'] in ('impossible', 'error'): # something unexpected, tell daddy yield r continue # if asked, and no change in revision -- skip if not report_no_revision_change \ and (r.get('revision_src') or r.get('revision')) \ and (r.get('revision_src') == r.get('revision')): continue r['status'] = '' modified.append(r) if not len(modified): # nothing modified nothing to report return # now we can grab the APs that are in this dataset and yield them for ap in aps: # need to preserve pristine info first ap = ap if isinstance(ap, dict) else rawpath2ap(ap, refds.path) for m in modified: if ap['path'] == m['path']: # is directly modified, yield input AP # but update with what we learned about the modification ap.update(m) yield ap break if m['path'].startswith(_with_sep(ap['path'])): # a modified path is underneath this AP # yield the modified one instead yield m continue mod_subs = [m for m in modified if m.get('type', None) == 'dataset'] if not mod_subs or (recursion_limit is not None and recursion_limit < 1): return aps = [ap if isinstance(ap, dict) else rawpath2ap(ap, refds.path) for ap in aps] # now for all submodules that were found modified for sub in [m for m in modified if m.get('type', None) == 'dataset']: sub_path_ = _with_sep(sub['path']) # these AP match something inside this submodule, or the whole submodule sub_aps = [ap for ap in aps if _with_sep(ap['path']).startswith(sub_path_)] if not sub_aps: continue # we are interested in the modifications within this subdataset # from the state we previously had on record, till the state # we have in record now diff_range = '{}..{}'.format( sub['revision_src'] if sub['revision_src'] else PRE_INIT_COMMIT_SHA, sub['revision'] if sub['revision'] else '') if 
sub['revision_src'] and sub['revision_src'] == sub['revision']: # this is a special case, where subdataset reported changes without # a change in state/commit -- this is code for uncommited changes # in the subdataset (including staged ones). In such a case, we # must not provide a diff range, but only the source commit we want # to diff against # XXX if this is changed, likely the same logic in diff needs # changing too! diff_range = sub['revision_src'] for r in get_modified_subpaths( sub_aps, Dataset(sub['path']), diff_range, recursion_limit=(recursion_limit - 1) if recursion_limit is not None else None ): yield r
def _prep( path=None, dataset=None, recursive=False, recursion_limit=None, dir_lookup=None, sub_paths=True): """Common input argument validation and pre-processing This method pre-processes the two most common input argument types: a base dataset, and one or more given paths. One or the other needs to be different from `None` or an `InsufficientArgumentsError` will be raised. Paths are normalized based on current practice (if relative, they are interpreted relative to a base dataset, if one is provided, or relative to the current working directory if not). Paths are then sorted by the datasets that contain them. If paths are detected that are not associated with any dataset `ValueError` is raised. If a `dataset` is given, any paths associated with a dataset that is not this dataset or a subdataset of it will also trigger a `ValueError`. Parameters ---------- path : path or list(path) or None Path input argument dataset : path or Dataset or None Dataset input argument. If given, the output dict is guaranteed to carry a key for this dataset, but not necessarily any paths as values. recursive : bool Whether to discover subdatasets under any of the given paths recursively recursion_limit : None or int Optional recursion limit specification (max levels of recursion) dir_lookup : dict, optional Passed to `get_paths_by_dataset` sub_paths : bool, optional Passed to `get_paths_by_dataset` :-P Returns ------- (dict, list) The dictionary contains keys of absolute dataset paths and lists with the normalized (generally absolute) paths of presently existing locations associated with the respective dataset as values. The list return in addition contains all paths that are part of a dataset, but presently do not exist on the filesystem. """ from .utils import get_normalized_path_arguments from .utils import get_paths_by_dataset # upfront check prior any resolution attempt to avoid disaster if path is None and dataset is None: raise InsufficientArgumentsError( "at least a dataset or a path must be given") path, dataset_path = get_normalized_path_arguments( path, dataset) if not path and dataset_path and recursive: # if we have nothing given, but need recursion, we need to feed # the dataset path to the sorting to make it work # but we also need to fish it out again afterwards tosort = [dataset_path] fishout_dataset_path = True else: tosort = path fishout_dataset_path = False content_by_ds, unavailable_paths, nondataset_paths = \ get_paths_by_dataset(tosort, recursive=recursive, recursion_limit=recursion_limit, dir_lookup=dir_lookup, sub_paths=sub_paths) if fishout_dataset_path: # explicit better than implicit, duplication is evil # fish out the dataset path that we inserted above content_by_ds[dataset_path] = [p for p in content_by_ds[dataset_path] if p != dataset_path] if not path and dataset_path: # no files given, but a dataset -> operate on whole dataset # but do not specify any paths to process -- needs to be tailored # by caller content_by_ds[dataset_path] = content_by_ds.get(dataset_path, []) if dataset_path and not content_by_ds and not unavailable_paths: # we got a dataset, but there is nothing actually installed nondataset_paths.append(dataset_path) if dataset_path: # check that we only got SUBdatasets dataset_path = _with_sep(dataset_path) for ds in content_by_ds: if not _with_sep(ds).startswith(dataset_path): nondataset_paths.extend(content_by_ds[ds]) # complain about nondataset and non-existing paths if nondataset_paths: if dataset_path: raise ValueError( "will not touch paths outside of base 
datasets(%s): %s" % (dataset_path, nondataset_paths)) else: raise ValueError( "will not touch paths outside of installed datasets: %s" % nondataset_paths) if unavailable_paths: lgr.debug('Encountered unavailable paths: %s', unavailable_paths) return content_by_ds, unavailable_paths
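# Illustrative only: a minimal sketch of the grouping contract `_prep`
# provides to callers -- absolute paths end up keyed by the deepest dataset
# root that contains them.  The roots and paths below are invented; the real
# lookup is performed by `get_paths_by_dataset` against actual repositories.
from os.path import sep


def _demo_group_by_root(paths, dataset_roots):
    """Group absolute paths under the deepest known dataset root."""
    content_by_ds = {}
    for p in paths:
        candidates = [r for r in dataset_roots
                      if p == r or p.startswith(r + sep)]
        if not candidates:
            continue  # would be reported as a non-dataset path
        content_by_ds.setdefault(max(candidates, key=len), []).append(p)
    return content_by_ds


# _demo_group_by_root(
#     ['/data/ds/file.txt', '/data/ds/sub/other.txt'],
#     ['/data/ds', '/data/ds/sub'])
# -> {'/data/ds': ['/data/ds/file.txt'],
#     '/data/ds/sub': ['/data/ds/sub/other.txt']}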
def get_paths_by_dataset(paths, recursive=False, recursion_limit=None, out=None, dir_lookup=None, sub_paths=True): """Sort a list of paths per dataset they are contained in. Any paths that are not part of a dataset, or presently unavailable are reported. Parameter --------- paths : sequence A sequence of path specifications to sort. recursive : bool Flag whether to report subdatasets under any of the given paths recursion_limit : Depth constraint for recursion. See `subdatasets()` for more information. out : dict or None By default a new output dictionary is created, however an existing one can be provided via this argument to enable incremental processing. dir_lookup : dict or None, optional Optional lookup cache that maps paths to previously determined datasets. This can speed up repeated processing. sub_paths : bool, optional Provide a list containing the sub-dataset path, as the entry for that sub-dataset. If False, empty list is assigned Returns ------- Tuple(dict, list, list) Dict of `existing dataset path`: `path` mappings, the list of currently non-existing paths (possibly matching currently uninstalled datasets), and any paths that are not part of any dataset. """ # sort paths into the respective datasets if dir_lookup is None: dir_lookup = {} if out is None: out = {} # paths that don't exist (yet) unavailable_paths = [] nondataset_paths = [] for path in unique(paths): if not lexists(path): # not there yet, impossible to say which ds it will actually # be in, if any unavailable_paths.append(path) continue # the path exists in some shape or form if isdir(path): # this could contain all types of additional content d = path else: # for everything else we are interested in the container d = dirname(path) if not d: d = curdir dspath = dir_lookup.get(d, None) if dspath: _ds_looked_up = True else: _ds_looked_up = False # this could be `None` if there is no git repo dspath = get_dataset_root(d) dir_lookup[d] = dspath if not dspath: nondataset_paths.append(path) continue if path in out.get(dspath, []): # we already recorded this path in the output # this can happen, whenever `path` is a subdataset, that was # discovered via recursive processing of another path before continue if isdir(path): ds = Dataset(dspath) # we need to doublecheck that this is not a subdataset mount # point, in which case get_dataset_root() would point to the parent. if not _ds_looked_up: # we didn't deal with it before # TODO this is a slow call, no need for dedicated RF, will vanish # together with the entire function smpath = ds.get_containing_subdataset(path, recursion_limit=1).path if smpath != dspath: # fix entry dir_lookup[d] = smpath # submodule still needs to be obtained unavailable_paths.append(path) continue else: # we figured out the dataset previously, so we can spare some # effort by not calling ds.subdatasets or # ds.get_containing_subdataset. 
Instead we just need # get_dataset_root, which is cheaper if dspath != get_dataset_root(dspath): # if the looked up path isn't the default value, # it's a 'fixed' entry for an unavailable dataset (see above) unavailable_paths.append(path) continue if recursive: # make sure we get everything relevant in all _checked out_ # subdatasets; obtaining previously unavailable subdatasets is # done elsewhere for subdspath in ds.subdatasets( fulfilled=True, recursive=recursive, recursion_limit=recursion_limit, result_xfm='paths'): if subdspath.startswith(_with_sep(path)): # this subdataset is underneath the search path # be careful to not overwrite anything, in case # this subdataset has been processed before out[subdspath] = out.get( subdspath, [subdspath] if sub_paths else []) out[dspath] = out.get(dspath, []) + [path] return out, unavailable_paths, nondataset_paths
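# Illustrative only: a rough stand-in for the `get_dataset_root` lookup that
# the sorting above leans on -- walk up from a directory until an entry named
# `.git` is found.  The real datalad helper handles more cases (files,
# symlinked worktrees, ...), so treat this purely as a sketch of the idea.
from os.path import abspath, dirname, exists, join as opj


def _demo_dataset_root(path):
    """Return the closest parent directory containing `.git`, or None."""
    current = abspath(path)
    while True:
        if exists(opj(current, '.git')):
            return current
        parent = dirname(current)
        if parent == current:  # reached the filesystem root
            return None
        current = parent


# _demo_dataset_root('/data/ds/sub/file.txt') -> '/data/ds/sub' if that
# directory contains `.git`, otherwise the next parent that does, else None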
def __call__(path=None, source=None, dataset=None, to_git=False, save=True, recursive=False, recursion_limit=None, if_dirty='ignore', git_opts=None, annex_opts=None, annex_add_opts=None, jobs=None): # parameter constraints: if not path and not source: raise InsufficientArgumentsError( "insufficient information for " "adding: requires at least a path " "or a source.") # When called from cmdline `path` and `source` will be a list even if # there is only one item. # Make sure we deal with the same when called via python API: # always yields list; empty if None path = assure_list(path) source = assure_list(source) # TODO: Q: are the list operations in the following 3 blocks (resolving # paths, sources and datasets) guaranteed to be stable # regarding order? # resolve path(s): # TODO: RF: resolve_path => datalad.utils => more general (repos => normalize paths) resolved_paths = [resolve_path(p, dataset) for p in path] # must come after resolve_path()!! # resolve dataset: dataset = require_dataset(dataset, check_installed=True, purpose='adding') handle_dirty_dataset(dataset, if_dirty) # resolve source(s): resolved_sources = [] for s in source: if not is_datalad_compat_ri(s): raise ValueError("invalid source parameter: %s" % s) resolved_sources.append(_get_git_url_from_source(s)) # find (sub-)datasets to add things to (and fail on invalid paths): if recursive: # 1. Find the (sub-)datasets containing the given path(s): # Note, that `get_containing_subdataset` raises if `p` is # outside `dataset`, but it returns `dataset`, if `p` is inside # a subdataset not included by `recursion_limit`. In the latter # case, the git calls will fail instead. # We could check for this right here and fail early, but this # would lead to the need to discover the entire hierarchy no # matter if actually required. resolved_datasets = [ dataset.get_containing_subdataset( p, recursion_limit=recursion_limit) for p in resolved_paths ] # 2. Find implicit subdatasets to call add on: # If there are directories in resolved_paths (Note, # that this includes '.' and '..'), check for subdatasets # beneath them. These should be called recursively with '.'. # Therefore add the subdatasets to resolved_datasets and # corresponding '.' to resolved_paths, in order to generate the # correct call. 
for p in resolved_paths: if isdir(p): for subds_path in \ dataset.get_subdatasets(absolute=True, recursive=True, recursion_limit=recursion_limit): if subds_path.startswith(_with_sep(p)): resolved_datasets.append(Dataset(subds_path)) resolved_paths.append(curdir) else: # if not recursive, try to add everything to dataset itself: resolved_datasets = [dataset for i in range(len(resolved_paths))] # we need a resolved dataset per path: assert len(resolved_paths) == len(resolved_datasets) # sort parameters for actual git/git-annex calls: # (dataset, path, source) from six.moves import zip_longest param_tuples = list( zip_longest(resolved_datasets, resolved_paths, resolved_sources)) # possible None-datasets in `param_tuples` were filled in by zip_longest # and need to be replaced by `dataset`: param_tuples = [(d if d is not None else dataset, p, s) for d, p, s in param_tuples] calls = { d.path: { # list of paths to 'git-add': 'g_add': [], # list of paths to 'git-annex-add': 'a_add': [], # list of sources to 'git-annex-addurl': 'addurl_s': [], # list of (path, source) to # 'git-annex-addurl --file': 'addurl_f': [] } for d in [i for i, p, s in param_tuples] } for ds, p, s in param_tuples: # it should not happen, that `path` as well as `source` are None: assert p or s if not s: # we have a path only # Do not try to add to annex whenever there is no annex if to_git or not isinstance(ds.repo, AnnexRepo): calls[ds.path]['g_add'].append(p) else: calls[ds.path]['a_add'].append(p) elif not p: # we have a source only if to_git: raise NotImplementedError("Can't add a remote source " "directly to git.") calls[ds.path]['addurl_s'].append(s) else: # we have a path and a source if to_git: raise NotImplementedError("Can't add a remote source " "directly to git.") calls[ds.path]['addurl_f'].append((p, s)) # now do the actual add operations: # TODO: implement git/git-annex/git-annex-add options datasets_return_values = defaultdict(list) for dspath in calls: ds = Dataset(dspath) return_values = datasets_return_values[dspath] lgr.info("Processing dataset %s ..." % ds) # check every (sub-)dataset for annex once, since we can't add or # addurl anything, if there is no annex: # TODO: Q: Alternatively, just call git-annex-init if there's no # annex yet and we have an annex-add/annex-addurl request? _is_annex = isinstance(ds.repo, AnnexRepo) if calls[ds.path]['g_add']: lgr.debug("Adding %s to git", calls[dspath]['g_add']) added = ds.repo.add(calls[dspath]['g_add'], git=True, git_options=git_opts) return_values.extend(added) if calls[ds.path]['a_add']: if _is_annex: lgr.debug("Adding %s to annex", calls[dspath]['a_add']) return_values.extend( ds.repo.add(calls[dspath]['a_add'], git=False, jobs=jobs, git_options=git_opts, annex_options=annex_opts, options=annex_add_opts)) else: lgr.debug("{0} is no annex. Skip 'annex-add' for " "files {1}".format(ds, calls[dspath]['a_add'])) return_values.extend([{ 'file': f, 'success': False, 'note': "no annex at %s" % ds.path } for f in calls[dspath]['a_add']]) # TODO: AnnexRepo.add_urls' return value doesn't contain the created # file name but the url if calls[ds.path]['addurl_s']: if _is_annex: lgr.debug("Adding urls %s to annex", calls[dspath]['addurl_s']) return_values.extend( ds.repo.add_urls( calls[ds.path]['addurl_s'], options=annex_add_opts, # TODO: extra parameter for addurl? git_options=git_opts, annex_options=annex_opts, jobs=jobs, )) else: lgr.debug("{0} is no annex. 
Skip 'annex-addurl' for " "files {1}".format(ds, calls[dspath]['addurl_s'])) return_values.extend([{ 'file': f, 'success': False, 'note': "no annex at %s" % ds.path } for f in calls[dspath]['addurl_s']]) if calls[ds.path]['addurl_f']: if _is_annex: for f, u in calls[ds.path]['addurl_f']: lgr.debug("Adding urls %s to files in annex", calls[dspath]['addurl_f']) return_values.append( ds.repo.add_url_to_file( f, u, options=annex_add_opts, # TODO: see above git_options=git_opts, annex_options=annex_opts, batch=True)) else: lgr.debug("{0} is no annex. Skip 'annex-addurl' for " "files {1}".format(ds, calls[dspath]['addurl_f'])) return_values.extend([{ 'file': f, 'success': False, 'note': "no annex at %s" % ds.path } for f in calls[dspath]['addurl_f']]) return_values = None # to avoid mis-use # XXX or we could return entire datasets_return_values, could be useful # that way. But then should be unified with the rest of commands, e.g. # get etc return_values_flat = [] for dspath, return_values in datasets_return_values.items(): if save and len(return_values): # we got something added -> save # everything we care about at this point should be staged already Save.__call__(message='[DATALAD] added content', dataset=Dataset(dspath), auto_add_changes=False, recursive=False) # TODO: it feels like this is some common logic we already have somewhere dsrelpath = relpath(dspath, dataset.path) if dsrelpath != curdir: # we need to adjust the 'file' entry in each record for return_value in return_values: if 'file' in return_value: return_value['file'] = opj(dsrelpath, return_value['file']) return_values_flat.append(return_value) else: return_values_flat.extend(return_values) return return_values_flat
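# Illustrative only: the bucket-routing step above, reduced to plain data.
# Paths and sources are zipped with zip_longest (padding the shorter list
# with None) and every tuple lands in exactly one of four call buckets.
# All inputs are invented; the real code keys the buckets per dataset.
from itertools import zip_longest


def _demo_route(paths, sources, have_annex=True, to_git=False):
    calls = {'g_add': [], 'a_add': [], 'addurl_s': [], 'addurl_f': []}
    for p, s in zip_longest(paths, sources):
        if not s:                      # path only
            if to_git or not have_annex:
                calls['g_add'].append(p)
            else:
                calls['a_add'].append(p)
        elif not p:                    # source (URL) only
            calls['addurl_s'].append(s)
        else:                          # path and source: addurl --file
            calls['addurl_f'].append((p, s))
    return calls


# _demo_route(['a.dat'],
#             ['http://example.com/a.dat', 'http://example.com/b.dat'])
# -> {'g_add': [], 'a_add': [],
#     'addurl_s': ['http://example.com/b.dat'],
#     'addurl_f': [('a.dat', 'http://example.com/a.dat')]}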
def filter_unmodified(content_by_ds, refds, since): """Filter per-dataset path specifications based on modification history. This function takes a path specification dictionary, as produced by `Interface._prep()` and filters it such that only that subset of paths remains in the dictionary that corresponding to the set of changes in the given reference dataset since a given state. The change set is traced across all related subdatasets, i.e. if a submodule in the reference dataset is reported as modified then all paths for any given subdataset in the modified one are tested for changes too (based on the state at which the parent dataset reports a change in the subdataset), and so on. In doing so, not only unmodified given paths are removed, but also modified given paths are replaced by the set of actually modified paths within them. Only committed changes are considered! Parameters ---------- content_by_ds : dict Per-dataset path specifications, as produced ,for example, by `Interface._prep()` refds : Dataset or *Repo or path Reference dataset for which to determine the initial change set since : state Any commit-ish/tree-ish supported by Git (tag, commit, branch, ...). Changes between this given state and the most recent commit are evaluated. Returns ------- dict Filtered path spec dictionary. If `since` is not None, the output is guaranteed to only contain paths to modified, and presently existing components of subdatasets of the given reference dataset (and itself). """ if since is None: # we want all, subds not matching the ref are assumed to have been # sorted out before (e.g. one level up) return content_by_ds # turn refds argument into a usable repo instance if not hasattr(refds, 'path'): # not a Repo or Dataset refds_path = refds refds = GitRepo(refds, create=False) else: refds_path = refds.path repo = refds.repo if hasattr(repo, 'repo'): # TODO use GitRepo.diff() when available (gh-1217) repo = repo.repo dict_class = content_by_ds.__class__ # could be ordered dict # life is simple: we diff the base dataset, and kill anything that # does not start with something that is in the diff # we cannot really limit the diff paths easily because we might get # or miss content (e.g. subdatasets) if we don't figure out which ones # are known -- and we don't want that try: diff = repo.commit().diff(since) except GitCommandError as exc: # could fail because `since` points to non existing location. # Unfortunately there might be no meaningful message # e.g. "fatal: ambiguous argument 'HEAD^': unknown revision or path not in the working tree" # logged within this GitCommandError for some reason! So let's check # that value of since post-error for being correct: try: refds.repo._git_custom_command( [], ['git', 'show', '--stat', since, '--'], expect_stderr=True, expect_fail=True) raise # re-raise since our idea was incorrect except CommandError as ce_exc: if ce_exc.stderr.startswith('fatal: bad revision'): raise ValueError( "Value since=%r is not valid. Git reports: %s" % (since, exc_str(ce_exc))) else: raise # re-raise # get all modified paths (with original? 
commit) that are still # present modified = dict( (opj(refds_path, d.b_path), d.b_blob.hexsha if d.b_blob else None) for d in diff) if not modified: # nothing modified, nothing to report return dict_class() # determine the subset that is a directory and hence is relevant for possible # subdatasets modified_dirs = {_with_sep(d) for d in modified if isdir(d)} # find the subdatasets matching modified paths; this will also kick out # any paths that are not in the dataset sub-hierarchy mod_subs = dict_class( (candds, paths) for candds, paths in content_by_ds.items() if candds != refds_path and any( _with_sep(candds).startswith(md) for md in modified_dirs)) # now query the next level down keep_subs = \ [filter_unmodified(mod_subs, subds_path, modified[subds_path]) for subds_path in mod_subs if subds_path in modified] # merge result list into a single dict keep = dict_class((k, v) for d in keep_subs for k, v in d.items()) paths_refds = content_by_ds[refds_path] keep[refds_path] = [ m for m in modified if lexists(m) # still around and (m in paths_refds # listed file, or subds # or a modified path under a given directory or any(m.startswith(_with_sep(p)) for p in paths_refds)) ] return keep
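# Illustrative only: the final keep-rule from above with the diff and
# existence checks stripped away.  `modified` stands for the absolute paths
# the diff reported as changed, `requested` for the per-dataset path spec;
# a path survives if it was modified and is either requested directly or
# sits under a requested directory.  The lexists() check is omitted here.
from os.path import sep


def _demo_keep(modified, requested):
    return [m for m in modified
            if m in requested
            or any(m.startswith(r.rstrip(sep) + sep) for r in requested)]


# _demo_keep({'/ds/code/a.py': 'abc123', '/ds/README': 'def456'},
#            ['/ds/code'])
# -> ['/ds/code/a.py']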
def _get_submodules(dspath, fulfilled, recursive, recursion_limit, contains, bottomup, set_property, delete_property, refds_path): if not GitRepo.is_valid_repo(dspath): return modinfo = _parse_gitmodules(dspath) # write access parser parser = None if set_property or delete_property: parser = _get_gitmodule_parser(dspath) # put in giant for-loop to be able to yield results before completion for sm in _parse_git_submodules(dspath, recursive=False): if contains and \ not (contains == sm['path'] or contains.startswith(_with_sep(sm['path']))): # we are not looking for this subds, because it doesn't # match the target path continue sm.update(modinfo.get(sm['path'], {})) if set_property or delete_property: # do modifications now before we read the info out for reporting # use 'submodule "NAME"' section ID style as this seems to be the default submodule_section = 'submodule "{}"'.format(sm['gitmodule_name']) # first deletions for dprop in assure_list(delete_property): parser.remove_option(submodule_section, dprop) # also kick from the info we just read above sm.pop('gitmodule_{}'.format(dprop), None) # and now setting values for sprop in assure_list(set_property): prop, val = sprop if val.startswith('<') and val.endswith('>') and '{' in val: # expand template string val = val[1:-1].format( **dict( sm, refds_relpath=relpath(sm['path'], refds_path), refds_relname=relpath(sm['path'], refds_path).replace(os.sep, '-'))) parser.set_value( submodule_section, prop, val) # also add to the info we just read above sm['gitmodule_{}'.format(prop)] = val #common = commonprefix((with_pathsep(subds), with_pathsep(path))) #if common.endswith(sep) and common == with_pathsep(subds): # candidates.append(common) subdsres = get_status_dict( 'subdataset', status='ok', type='dataset', logger=lgr) subdsres.update(sm) subdsres['parentds'] = dspath if not bottomup and \ (fulfilled is None or GitRepo.is_valid_repo(sm['path']) == fulfilled): yield subdsres # expand list with child submodules. keep all paths relative to parent # and convert jointly at the end if recursive and \ (recursion_limit in (None, 'existing') or (isinstance(recursion_limit, int) and recursion_limit > 1)): for r in _get_submodules( sm['path'], fulfilled, recursive, (recursion_limit - 1) if isinstance(recursion_limit, int) else recursion_limit, contains, bottomup, set_property, delete_property, refds_path): yield r if bottomup and \ (fulfilled is None or GitRepo.is_valid_repo(sm['path']) == fulfilled): yield subdsres if parser is not None: # release parser lock manually, auto-cleanup is not reliable in PY3 parser.release()
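# Illustrative only: the `<...>` template expansion used for set_property
# above.  A value like '<{refds_relname}>' is stripped of its angle brackets
# and run through str.format() with the submodule record plus two derived
# fields.  The record below is invented for the example.
import os
from os.path import relpath


def _demo_expand(template, sm, refds_path):
    if template.startswith('<') and template.endswith('>') and '{' in template:
        rel = relpath(sm['path'], refds_path)
        return template[1:-1].format(
            **dict(sm,
                   refds_relpath=rel,
                   refds_relname=rel.replace(os.sep, '-')))
    return template


# _demo_expand('<{refds_relname}>',
#              {'path': '/ds/code/lib', 'gitmodule_name': 'lib'}, '/ds')
# -> 'code-lib'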
def results_from_annex_noinfo(ds, requested_paths, respath_by_status, dir_fail_msg, noinfo_dir_msg, noinfo_file_msg, noinfo_status='notneeded', **kwargs): """Helper to yield results based on what information git annex did no give us. The helper assumes that the annex command returned without an error code, and interprets which of the requested paths we have heard nothing about, and assumes that git annex was happy with their current state. Parameters ========== ds : Dataset All results have to be concerning this single dataset (used to resolve relpaths). requested_paths : list List of path arguments sent to `git annex` respath_by_status : dict Mapping of 'success' or 'failure' labels to lists of result paths reported by `git annex`. Everything that is not in here, we assume that `git annex` was happy about. dir_fail_msg : str Message template to inject into the result for a requested directory where a failure was reported for some of its content. The template contains two string placeholders that will be expanded with 1) the path of the directory, and 2) the content failure paths for that directory noinfo_dir_msg : str Message template to inject into the result for a requested directory that `git annex` was silent about (incl. any content). There must be one string placeholder that is expanded with the path of that directory. noinfo_file_msg : str Message to inject into the result for a requested file that `git annex` was silent about. noinfo_status : str Status to report when annex provides no information **kwargs Any further kwargs are included in the yielded result dictionary. """ for p in requested_paths: # any relpath is relative to the currently processed dataset # not the global reference dataset p = p if isabs(p) else normpath(opj(ds.path, p)) if any(p in ps for ps in respath_by_status.values()): # we have a report for this path already continue common_report = dict(path=p, **kwargs) if isdir(p): # `annex` itself will not report on directories, but if a # directory was requested, we want to say something about # it in the results. we are inside a single, existing # repo, hence all directories are already present, if not # we had an error # do we have any failures in a subdir of the requested dir? failure_results = [ fp for fp in respath_by_status.get('failure', []) if fp.startswith(_with_sep(p)) ] if failure_results: # we were not able to process all requested_paths, let's label # this 'impossible' to get a warning-type report # after all we have the directory itself, but not # (some) of its requested_paths yield get_status_dict(status='impossible', type='directory', message=(dir_fail_msg, p, failure_results), **common_report) else: # otherwise cool, but how cool? success_results = [ fp for fp in respath_by_status.get('success', []) if fp.startswith(_with_sep(p)) ] yield get_status_dict( status='ok' if success_results else noinfo_status, message=None if success_results else (noinfo_dir_msg, p), type='directory', **common_report) continue else: # not a directory, and we have had no word from `git annex`, # yet no exception, hence the file was most probably # already in the desired state yield get_status_dict(status=noinfo_status, type='file', message=noinfo_file_msg, **common_report)
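# Illustrative only: the decision logic above without the result-record
# plumbing.  Any requested path annex did not mention is assumed fine
# ('notneeded' by default), unless it is a directory that contains reported
# failures.  Inputs are invented; the real helper also reports 'ok' when a
# directory contains successes and yields full status dicts.
from os.path import sep


def _demo_noinfo(requested, respath_by_status):
    reported = {p for ps in respath_by_status.values() for p in ps}
    decisions = {}
    for p in requested:
        if p in reported:
            continue  # annex already reported on this path
        failures_below = [f for f in respath_by_status.get('failure', [])
                          if f.startswith(p.rstrip(sep) + sep)]
        decisions[p] = 'impossible' if failures_below else 'notneeded'
    return decisions


# _demo_noinfo(['/ds/docs', '/ds/ok.txt'],
#              {'failure': ['/ds/docs/broken.txt'], 'success': []})
# -> {'/ds/docs': 'impossible', '/ds/ok.txt': 'notneeded'}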
def _get_dsinfo_from_aggmetadata(ds_path, path, recursive, db): """Grab info on aggregated metadata for a path from a given dataset. The actual info is stored in a `db` dict under the absolute path of the dataset that contains the query path, plus any subdataset in case of recursion (with their own DB entries). Parameters ---------- ds_path : str path of the source dataset path : str absolute path for which to obtain metadata recursive : bool Returns ------- str or list A string is an error message, a list contains all absolute paths for all datasets on which info was put into the DB. """ info_fpath = opj(ds_path, agginfo_relpath) info_basepath = dirname(info_fpath) # TODO cache these agginfos = _load_json_object(info_fpath) def _ensure_abs_obj_location(rec): # object location in the DB must be absolute so we can copy easily # to all relevant datasets for key in location_keys: if key in rec and not isabs(rec[key]): rec[key] = opj(info_basepath, rec[key]) return rec rpath = relpath(path, start=ds_path) seed_ds = _get_containingds_from_agginfo(agginfos, rpath) if seed_ds is None: # nothing found # this will be the message in the result for the query path # and could be a tuple return ("No matching aggregated metadata in Dataset at %s", ds_path) # easy peasy seed_abs = opj(ds_path, seed_ds) db[seed_abs] = _ensure_abs_obj_location(agginfos[seed_ds]) hits = [seed_abs] if not recursive: return hits # a little more complicated: we need to loop over all subdataset # records and pick the ones that are underneath the seed for agginfo_path in agginfos: if agginfo_path.startswith(_with_sep(seed_ds)): absp = opj(ds_path, agginfo_path) db[absp] = _ensure_abs_obj_location(agginfos[agginfo_path]) hits.append(absp) # TODO we must keep the info on these recursively discovered datasets # somewhere, because we cannot rediscover them on the filesystem # when updating the datasets later on return hits
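# Illustrative only: the relative-to-absolute fix-up that
# `_ensure_abs_obj_location` performs above.  The key names here are
# stand-ins; the real `location_keys` come from datalad's metadata
# aggregation module.
from os.path import isabs, join as opj

DEMO_LOCATION_KEYS = ('content_info', 'dataset_info')


def _demo_abs_locations(rec, info_basepath):
    for key in DEMO_LOCATION_KEYS:
        if key in rec and not isabs(rec[key]):
            rec[key] = opj(info_basepath, rec[key])
    return rec


# _demo_abs_locations({'content_info': 'objs/1a2b.json'},
#                     '/ds/.datalad/metadata')
# -> {'content_info': '/ds/.datalad/metadata/objs/1a2b.json'}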
def get_paths_by_dataset(paths, recursive=False, recursion_limit=None, out=None, dir_lookup=None): """Sort a list of paths per dataset they are contained in. Any paths that are not part of a dataset, or presently unavailable are reported. Parameter --------- paths : sequence A sequence of path specifications to sort. recursive : bool Flag whether to report subdatasets under any of the given paths recursion_limit : Depth constraint for recursion. See `Dataset.get_subdatasets()` for more information. out : dict or None By default a new output dictionary is created, howeverm and existing one can be provided via this argument to enable incremental processing. dir_lookup : dict or None Optional lookup cache that maps paths to previously determined datasets. This can speed up repeated processing. Returns ------- Tuple(dict, list, list) Dict of `existing dataset path`: `path` mappings, the list of currently non-existing paths (possibly matching currently uninstalled datasets), and any paths that are not part of any dataset """ # sort paths into the respective datasets if dir_lookup is None: dir_lookup = {} if out is None: out = {} # paths that don't exist (yet) unavailable_paths = [] nondataset_paths = [] for path in paths: if not lexists(path): # not there yet, impossible to say which ds it will actually # be in, if any unavailable_paths.append(path) continue # the path exists in some shape or form if isdir(path): # this could contain all types of additional content d = path else: # for everything else we are interested in the container d = dirname(path) if not d: d = curdir # this could be `None` if there is no git repo dspath = dir_lookup.get(d, GitRepo.get_toppath(d)) dir_lookup[d] = dspath if not dspath: nondataset_paths.append(path) continue if isdir(path): ds = Dataset(dspath) # we need to doublecheck that this is not a subdataset mount # point, in which case get_toppath() would point to the parent smpath = ds.get_containing_subdataset(path, recursion_limit=1).path if smpath != dspath: # fix entry dir_lookup[d] = smpath # submodule still needs to be obtained unavailable_paths.append(path) continue if recursive: # make sure we get everything relevant in all _checked out_ # subdatasets, obtaining of previously unavailable subdataset # else done elsewhere subs = ds.get_subdatasets(fulfilled=True, recursive=recursive, recursion_limit=recursion_limit) for sub in subs: subdspath = opj(dspath, sub) if subdspath.startswith(_with_sep(path)): # this subdatasets is underneath the search path # we want it all # be careful to not overwrite anything, in case # this subdataset has been processed before out[subdspath] = out.get(subdspath, [subdspath]) out[dspath] = out.get(dspath, []) + [path] return out, unavailable_paths, nondataset_paths