def get_paths_by_dataset(paths, recursive=False, recursion_limit=None,
                         out=None, dir_lookup=None):
    """Sort a list of paths per dataset they are contained in.

    Any paths that are not part of a dataset, or presently unavailable are
    reported.

    Parameters
    ----------
    paths : sequence
      A sequence of path specifications to sort.
    recursive : bool
      Flag whether to report subdatasets under any of the given paths
    recursion_limit :
      Depth constraint for recursion. See `Dataset.get_subdatasets()` for more
      information.
    out : dict or None
      By default a new output dictionary is created, however an existing one
      can be provided via this argument to enable incremental processing.
    dir_lookup : dict or None
      Optional lookup cache that maps paths to previously determined datasets.
      This can speed up repeated processing.

    Returns
    -------
    Tuple(dict, list, list)
      Dict of `existing dataset path`: `path` mappings, the list of currently
      non-existing paths (possibly matching currently uninstalled datasets),
      and any paths that are not part of any dataset
    """
    # sort paths into the respective datasets
    if dir_lookup is None:
        dir_lookup = {}
    if out is None:
        out = {}
    # paths that don't exist (yet)
    unavailable_paths = []
    nondataset_paths = []
    for path in paths:
        if not lexists(path):
            # not there yet, impossible to say which ds it will actually
            # be in, if any
            unavailable_paths.append(path)
            continue
        # the path exists in some shape or form
        if isdir(path):
            # this could contain all types of additional content
            d = path
        else:
            # for everything else we are interested in the container
            d = dirname(path)
            if not d:
                d = curdir
        # consult the cache first; only call out to git on a miss.
        # NOTE: the previous `dir_lookup.get(d, GitRepo.get_toppath(d))`
        # evaluated the (expensive) default eagerly on every iteration,
        # which defeated the purpose of the cache
        if d in dir_lookup:
            dspath = dir_lookup[d]
        else:
            # this could be `None` if there is no git repo
            dspath = GitRepo.get_toppath(d)
            dir_lookup[d] = dspath
        if not dspath:
            nondataset_paths.append(path)
            continue
        if isdir(path):
            ds = Dataset(dspath)
            # we need to doublecheck that this is not a subdataset mount
            # point, in which case get_toppath() would point to the parent
            smpath = ds.get_containing_subdataset(
                path, recursion_limit=1).path
            if smpath != dspath:
                # fix entry
                dir_lookup[d] = smpath
                # submodule still needs to be obtained
                unavailable_paths.append(path)
                continue
            if recursive:
                # make sure we get everything relevant in all _checked out_
                # subdatasets, obtaining of previously unavailable subdataset
                # else done elsewhere
                subs = ds.get_subdatasets(fulfilled=True,
                                          recursive=recursive,
                                          recursion_limit=recursion_limit)
                for sub in subs:
                    subdspath = opj(dspath, sub)
                    if subdspath.startswith(_with_sep(path)):
                        # this subdataset is underneath the search path
                        # we want it all
                        # be careful to not overwrite anything, in case
                        # this subdataset has been processed before
                        out[subdspath] = out.get(subdspath, [subdspath])
        out[dspath] = out.get(dspath, []) + [path]
    return out, unavailable_paths, nondataset_paths
def get_paths_by_dataset(paths, recursive=False, recursion_limit=None,
                         out=None, dir_lookup=None, sub_paths=True):
    """Sort a list of paths per dataset they are contained in.

    Any paths that are not part of a dataset, or presently unavailable are
    reported.

    Parameters
    ----------
    paths : sequence
      A sequence of path specifications to sort.
    recursive : bool
      Flag whether to report subdatasets under any of the given paths
    recursion_limit :
      Depth constraint for recursion. See `subdatasets()` for more
      information.
    out : dict or None
      By default a new output dictionary is created, however an existing one
      can be provided via this argument to enable incremental processing.
    dir_lookup : dict or None, optional
      Optional lookup cache that maps paths to previously determined
      datasets. This can speed up repeated processing.
    sub_paths : bool, optional
      Provide a list containing the sub-dataset path, as the entry for that
      sub-dataset. If False, empty list is assigned.

    Returns
    -------
    Tuple(dict, list, list)
      Dict of `existing dataset path`: `path` mappings, the list of currently
      non-existing paths (possibly matching currently uninstalled datasets),
      and any paths that are not part of any dataset.
    """
    # sort paths into the respective datasets
    if dir_lookup is None:
        dir_lookup = {}
    if out is None:
        out = {}
    # paths that don't exist (yet)
    unavailable_paths = []
    nondataset_paths = []
    # deduplicate input so a path given twice is only processed once
    for path in unique(paths):
        if not lexists(path):
            # not there yet, impossible to say which ds it will actually
            # be in, if any
            unavailable_paths.append(path)
            continue
        # the path exists in some shape or form
        if isdir(path):
            # this could contain all types of additional content
            d = path
        else:
            # for everything else we are interested in the container
            d = dirname(path)
            if not d:
                d = curdir
        # a cache hit may carry a previously 'fixed' entry (a subdataset
        # mount point), in which case we must not clobber it below
        dspath = dir_lookup.get(d, None)
        if dspath:
            _ds_looked_up = True
        else:
            _ds_looked_up = False
            # this could be `None` if there is no git repo
            dspath = get_dataset_root(d)
            dir_lookup[d] = dspath
        if not dspath:
            nondataset_paths.append(path)
            continue
        if path in out.get(dspath, []):
            # we already recorded this path in the output
            # this can happen, whenever `path` is a subdataset, that was
            # discovered via recursive processing of another path before
            continue
        if isdir(path):
            ds = Dataset(dspath)
            # we need to doublecheck that this is not a subdataset mount
            # point, in which case get_dataset_root() would point to the
            # parent.
            if not _ds_looked_up:
                # we didn't deal with it before
                # TODO this is a slow call, no need for dedicated RF, will
                # vanish together with the entire function
                smpath = ds.get_containing_subdataset(
                    path, recursion_limit=1).path
                if smpath != dspath:
                    # fix entry
                    dir_lookup[d] = smpath
                    # submodule still needs to be obtained
                    unavailable_paths.append(path)
                    continue
            else:
                # we figured out the dataset previously, so we can spare some
                # effort by not calling ds.subdatasets or
                # ds.get_containing_subdataset. Instead we just need
                # get_dataset_root, which is cheaper
                if dspath != get_dataset_root(dspath):
                    # if the looked up path isn't the default value,
                    # it's a 'fixed' entry for an unavailable dataset
                    # (see above)
                    unavailable_paths.append(path)
                    continue
            if recursive:
                # make sure we get everything relevant in all _checked out_
                # subdatasets, obtaining of previously unavailable subdataset
                # else done elsewhere
                for subdspath in ds.subdatasets(
                        fulfilled=True,
                        recursive=recursive,
                        recursion_limit=recursion_limit,
                        result_xfm='paths'):
                    if subdspath.startswith(_with_sep(path)):
                        # this subdataset is underneath the search path
                        # be careful to not overwrite anything, in case
                        # this subdataset has been processed before
                        out[subdspath] = out.get(
                            subdspath,
                            [subdspath] if sub_paths else [])
        # record the path under its containing dataset (dedup handled above)
        out[dspath] = out.get(dspath, []) + [path]
    return out, unavailable_paths, nondataset_paths