def test_path_is_subpath():
    ok_(path_is_subpath('/a/b', '/a'))
    ok_(path_is_subpath('/a/b/c', '/a'))
    nok_(path_is_subpath('/a/b', '/a/b'))
    nok_(path_is_subpath('/a/b', '/a/b/'))
    nok_(path_is_subpath('/a/b/', '/a/b'))
    ok_(path_is_subpath('/a/b', '/'))
    ok_(path_is_subpath('/aaa/b/c', '/aaa'))
    nok_(path_is_subpath('/aaa/b/c', '/aa'))
    nok_(path_is_subpath('/a/b', '/a/c'))
    nok_(path_is_subpath('/a/b/c', '/a/c'))
    # must not mix relative and abs
    assert_raises(ValueError, path_is_subpath, 'a/b', '/a')
    assert_raises(ValueError, path_is_subpath, '/a/b', 'a')
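# Note: the tests above pin down the contract of `path_is_subpath`: strict
# containment only (a path is not a subpath of itself), trailing separators
# are ignored, and mixing absolute with relative paths raises ValueError.
# The sketch below is only an illustration of that contract, not the actual
# DataLad implementation; the name `path_is_subpath_sketch` is made up here.
import os.path as op


def path_is_subpath_sketch(path, prefix):
    """Return True if `path` is strictly below `prefix` (illustrative only)."""
    if op.isabs(path) != op.isabs(prefix):
        # mixing relative and absolute paths is ambiguous
        raise ValueError("cannot mix absolute and relative paths")
    # normalize trailing separators, but keep a lone root intact
    path = path.rstrip(op.sep) or op.sep
    prefix = prefix.rstrip(op.sep) or op.sep
    if path == prefix:
        # a path is not a subpath of itself
        return False
    if prefix == op.sep:
        # every other absolute path is below the filesystem root
        return True
    # require the prefix to end at a path-component boundary
    return path.startswith(prefix + op.sep)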
def _get_containingds_from_agginfo(info, rpath):
    """Return the relative path of a dataset that contains a relative query path

    Parameters
    ----------
    info : dict
      Content of aggregate.json (dict with relative subdataset paths as keys)
    rpath : str
      Relative query path

    Returns
    -------
    str or None
      None is returned if there is no match, the relative path of the
      closest containing subdataset otherwise.
    """
    if rpath in info:
        dspath = rpath
    else:
        # not a direct hit, hence we find the closest
        # containing subdataset (if there is any)
        containing_ds = sorted(
            [subds for subds in sorted(info)
             if path_is_subpath(rpath, subds)],
            # TODO os.sep might not be OK on windows,
            # depending on where it was aggregated, ensure uniform UNIX
            # storage
            key=lambda x: x.count(os.sep), reverse=True)
        dspath = containing_ds[0] if len(containing_ds) else None
    return dspath
def _get_containingds_from_agginfo(info, rpath):
    """Return the path of a dataset that contains a query path

    If a query path matches a dataset path directly, the matching dataset path
    is returned -- not the parent dataset!

    Parameters
    ----------
    info : dict
      Content of aggregate.json (dict with (relative) subdataset paths as keys)
    rpath : str
      Query path can be absolute or relative, but must match the convention
      used in the info dict.

    Returns
    -------
    str or None
      None is returned if there is no match, the path of the closest
      containing subdataset otherwise (in the convention used in the info
      dict).
    """
    if rpath in info:
        dspath = rpath
    else:
        # not a direct hit, hence we find the closest
        # containing subdataset (if there is any)
        containing_ds = sorted(
            [subds for subds in sorted(info)
             if path_is_subpath(rpath, subds)],
            # TODO os.sep might not be OK on windows,
            # depending on where it was aggregated, ensure uniform UNIX
            # storage
            key=lambda x: x.count(os.sep), reverse=True)
        dspath = containing_ds[0] if len(containing_ds) else None
    return dspath
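# To make the lookup above concrete: given a hypothetical aggregate info dict
# with relative subdataset paths as keys (the layout described in the
# docstring), the helper returns a direct hit as-is and otherwise the closest
# containing dataset.  The dict content and query paths below are invented for
# illustration and assume the function and its helpers are importable.
example_info = {
    'sub1': {},
    'sub1/subsub': {},
    'sub2': {},
}
# direct hit: the matching dataset itself, not its parent
assert _get_containingds_from_agginfo(example_info, 'sub1') == 'sub1'
# a deeper query path resolves to the closest containing dataset
assert _get_containingds_from_agginfo(
    example_info, 'sub1/subsub/file.dat') == 'sub1/subsub'
# no registered dataset contains this path
assert _get_containingds_from_agginfo(example_info, 'other/file.dat') is None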
def _filterpaths(basepath, paths, exclude):
    final_paths = []
    for rp in [op.join(basepath, p) if basepath else p for p in paths]:
        if rp in exclude:
            continue
        elif any(path_is_subpath(ep, rp) for ep in exclude):
            final_paths.extend(
                _filterpaths(rp, listdir(op.join(ds.path, rp)), exclude))
            pass
        else:
            final_paths.append(rp)
    return final_paths
def _recursive_install_subds_underneath(ds, recursion_limit, reckless,
                                        start=None, refds_path=None,
                                        description=None):
    if isinstance(recursion_limit, int) and recursion_limit <= 0:
        return
    # install using helper that gives some flexibility regarding where to
    # get the module from
    for sub in ds.subdatasets(
            return_type='generator',
            result_renderer='disabled'):
        subds = Dataset(sub['path'])
        if sub.get('gitmodule_datalad-recursiveinstall', '') == 'skip':
            lgr.debug(
                "subdataset %s is configured to be skipped on recursive installation",
                sub['path'])
            continue
        if start is not None and not path_is_subpath(subds.path, start):
            # this one we can ignore, not underneath the start path
            continue
        if sub.get('state', None) != 'absent':
            # dataset was already found to exist
            yield get_status_dict(
                'install', ds=subds, status='notneeded', logger=lgr,
                refds=refds_path)
            # do not continue, even if an intermediate dataset exists it
            # does not imply that everything below it does too
        else:
            # try to get this dataset
            try:
                subds = _install_subds_from_flexible_source(
                    ds,
                    relpath(sub['path'], start=ds.path),
                    sub['gitmodule_url'],
                    reckless,
                    description=description)
                yield get_status_dict(
                    'install', ds=subds, status='ok', logger=lgr,
                    refds=refds_path,
                    message=("Installed subdataset %s", subds),
                    parentds=ds.path)
            except Exception as e:
                # skip all of downstairs, if we didn't manage to install subdataset
                yield get_status_dict(
                    'install', ds=subds, status='error', logger=lgr,
                    refds=refds_path,
                    message=("Installation of subdatasets %s failed with exception: %s",
                             subds, exc_str(e)))
                continue
        # otherwise recurse
        # we can skip the start expression, we know we are within
        for res in _recursive_install_subds_underneath(
                subds,
                recursion_limit=recursion_limit - 1 if isinstance(recursion_limit, int) else recursion_limit,
                reckless=reckless,
                refds_path=refds_path):
            yield res
def _get_dsinfo_from_aggmetadata(ds_path, path, recursive, db):
    """Grab info on aggregated metadata for a path from a given dataset.

    The actual info is stored in a `db` dict under the absolute path of the
    dataset that contains the query path, plus any subdataset in case of
    recursion (with their own DB entries).

    Parameters
    ----------
    ds : Dataset
      source dataset
    path : str
      absolute path for which to obtain metadata
    recursive : bool

    Returns
    -------
    str or list
      A string/tuple is an error message, a list contains all absolute paths
      for all datasets on which info was put into the DB.
    """
    # TODO cache these
    agginfos = load_ds_aggregate_db(Dataset(ds_path), abspath=True)

    seed_ds = _get_containingds_from_agginfo(agginfos, path)
    if seed_ds is None:
        # nothing found
        # this will be the message in the result for the query path
        # and could be a tuple
        return (
            "No matching aggregated metadata for path '%s' in Dataset at %s",
            op.relpath(path, start=ds_path), ds_path)

    # easy peasy
    db[seed_ds] = agginfos[seed_ds]
    hits = [seed_ds]

    if not recursive:
        return hits

    # a little more complicated: we need to loop over all subdataset
    # records and pick the ones that are underneath the seed
    for agginfo_path in agginfos:
        if path_is_subpath(agginfo_path, seed_ds):
            db[agginfo_path] = agginfos[agginfo_path]
            hits.append(agginfo_path)
    # TODO we must keep the info on these recursively discovered datasets
    # somewhere, because we cannot rediscover them on the filesystem
    # when updating the datasets later on
    return hits
def yield_recursive(ds, path, action, recursion_limit):
    # make sure we get everything relevant in all _checked out_
    # subdatasets, obtaining of previously unavailable subdataset
    # is elsewhere
    for subd_res in ds.subdatasets(recursive=True,
                                   recursion_limit=recursion_limit,
                                   return_type='generator'):
        # this check is not the same as subdatasets --contains=path
        # because we want all subdatasets below a path, not just the
        # containing one
        if path_is_subpath(subd_res['path'], path):
            # this subdataset is underneath the search path
            # be careful to not overwrite anything, in case
            # this subdataset has been processed before
            subd_res['action'] = action
            # mark as "notprocessed"
            subd_res['status'] = ''
            # we know that this is a known subdataset, that is how
            # we got here, make a record
            subd_res['registered_subds'] = True
            yield subd_res
def query_aggregated_metadata(reporton, ds, aps, recursive=False,
                              **kwargs):
    """Query the aggregated metadata in a dataset

    Query paths (`aps`) have to be composed in an intelligent fashion
    by the caller of this function, i.e. it should have been decided
    outside which dataset to query for any given path.

    Also this function doesn't cache anything, hence the caller must
    make sure to only call this once per dataset to avoid waste.

    Parameters
    ----------
    reporton : {None, 'none', 'dataset', 'files', 'all'}
      If `None`, reporting will be based on the `type` property of the
      incoming annotated paths.
    ds : Dataset
      Dataset to query
    aps : list
      Sequence of annotated paths to query metadata for.
    recursive : bool
      Whether or not to report metadata underneath all query paths
      recursively.
    **kwargs
      Any other argument will be passed on to the query result dictionary.

    Returns
    -------
    generator
      Of result dictionaries.
    """
    from datalad.coreapi import get
    # look for and load the aggregation info for the base dataset
    info_fpath = opj(ds.path, agginfo_relpath)
    agg_base_path = dirname(info_fpath)
    agginfos = _load_json_object(info_fpath)
    if not agginfos and not exists(info_fpath):
        # This dataset does not have aggregated metadata.  Does it have any
        # other version?
        info_glob = agginfo_relpath_template.format('*')
        info_files = glob.glob(info_glob)
        msg = "Found no aggregated metadata info file %s." \
              % info_fpath
        old_metadata_file = opj(ds.path, METADATA_DIR, METADATA_FILENAME)
        if exists(old_metadata_file):
            msg += " Found metadata generated with pre-0.10 version of " \
                   "DataLad, but it will not be used."
        upgrade_msg = ""
        if info_files:
            msg += " Found following info files, which might have been " \
                   "generated with newer version(s) of datalad: %s." \
                   % (', '.join(info_files))
            upgrade_msg = ", upgrade datalad"
        msg += " You will likely need to either update the dataset from its " \
               "original location,%s or reaggregate metadata locally." \
               % upgrade_msg
        lgr.warning(msg)

    # cache once loaded metadata objects for additional lookups
    # TODO possibly supply this cache from outside, if objects could
    # be needed again -- their filename does not change in a superdataset
    # if done, cache under relpath, not abspath key
    cache = {
        'objcache': {},
        'subds_relpaths': None,
    }
    reported = set()

    # for all query paths
    for ap in aps:
        # all metadata is registered via its relative path to the
        # dataset that is being queried
        rpath = relpath(ap['path'], start=ds.path)
        if rpath in reported:
            # we already had this, probably via recursion of some kind
            continue
        rap = dict(ap, rpath=rpath, type=ap.get('type', None))

        # we really have to look this up from the aggregated metadata
        # and cannot use any 'parentds' property in the incoming annotated
        # path.  The latter will reflect the situation on disk, we need
        # the record of the containing subdataset in the aggregated metadata
        # instead
        containing_ds = _get_containingds_from_agginfo(agginfos, rpath)
        if containing_ds is None:
            # could happen if there was no aggregated metadata at all
            # or the path is in this dataset, but luckily the queried dataset
            # is known to be present
            containing_ds = curdir
        rap['metaprovider'] = containing_ds

        # build list of datasets and paths to be queried for this annotated path
        # in the simple case this is just the containing dataset and the actual
        # query path
        to_query = [rap]
        if recursive:
            # in case of recursion this is also anything in any dataset underneath
            # the query path
            matching_subds = [
                {'metaprovider': sub, 'rpath': sub, 'type': 'dataset'}
                for sub in sorted(agginfos)
                # we already have the base dataset
                if (rpath == curdir and sub != curdir) or
                path_is_subpath(sub, rpath)]
            to_query.extend(matching_subds)

        to_query_available = []
        for qap in to_query:
            if qap['metaprovider'] not in agginfos:
                res = get_status_dict(
                    status='impossible',
                    path=qap['path'],
                    message=(
                        'Dataset at %s contains no aggregated metadata on this path',
                        qap['metaprovider']),
                )
                res.update(res, **kwargs)
                if 'type' in qap:
                    res['type'] = qap['type']
                yield res
            else:
                to_query_available.append(qap)

        # one heck of a beast to get the set of filenames for all metadata objects
        # that are required to be present to fulfill this query
        objfiles = set(
            agginfos.get(qap['metaprovider'], {}).get(t, None)
            for qap in to_query_available
            for t in ('dataset_info',) + \
                (('content_info',)
                 if ((reporton is None and qap.get('type', None) == 'file')
                     or reporton in ('files', 'all'))
                 else tuple())
        )
        # in case there was no metadata provider, we do not want to start
        # downloading everything: see https://github.com/datalad/datalad/issues/2458
        objfiles.difference_update([None])
        lgr.debug(
            'Verifying/achieving local availability of %i metadata objects',
            len(objfiles))
        if objfiles:
            get(path=[dict(path=opj(agg_base_path, of),
                           parentds=ds.path, type='file')
                      for of in objfiles if of],
                dataset=ds,
                result_renderer='disabled')
        for qap in to_query_available:
            # info about the dataset that contains the query path
            dsinfo = agginfos.get(qap['metaprovider'], dict(id=ds.id))
            res_tmpl = get_status_dict()
            for s, d in (('id', 'dsid'), ('refcommit', 'refcommit')):
                if s in dsinfo:
                    res_tmpl[d] = dsinfo[s]

            # pull up dataset metadata, always needed if only for the context
            dsmeta = {}
            dsobjloc = dsinfo.get('dataset_info', None)
            if dsobjloc is not None:
                dsmeta = _load_json_object(opj(agg_base_path, dsobjloc),
                                           cache=cache['objcache'])

            for r in _query_aggregated_metadata_singlepath(
                    ds, agginfos, agg_base_path, qap, reporton,
                    cache, dsmeta,
                    dsinfo.get('content_info', None)):
                r.update(res_tmpl, **kwargs)
                # if we are coming from `search` we want to record why this is being
                # reported
                if 'query_matched' in ap:
                    r['query_matched'] = ap['query_matched']
                if r.get('type', None) == 'file':
                    r['parentds'] = normpath(opj(ds.path, qap['metaprovider']))
                yield r
            reported.add(qap['rpath'])
def query_aggregated_metadata(reporton, ds, aps, recursive=False,
                              **kwargs):
    """Query the aggregated metadata in a dataset

    Query paths (`aps`) have to be composed in an intelligent fashion
    by the caller of this function, i.e. it should have been decided
    outside which dataset to query for any given path.

    Also this function doesn't cache anything, hence the caller must
    make sure to only call this once per dataset to avoid waste.

    Parameters
    ----------
    reporton : {None, 'none', 'dataset', 'files', 'all'}
      If `None`, reporting will be based on the `type` property of the
      incoming annotated paths.
    ds : Dataset
      Dataset to query
    aps : list
      Sequence of annotated paths to query metadata for.
    recursive : bool
      Whether or not to report metadata underneath all query paths
      recursively.
    **kwargs
      Any other argument will be passed on to the query result dictionary.

    Returns
    -------
    generator
      Of result dictionaries.
    """
    from datalad.coreapi import get
    # look for and load the aggregation info for the base dataset
    agginfos, agg_base_path = load_ds_aggregate_db(ds)

    # cache once loaded metadata objects for additional lookups
    # TODO possibly supply this cache from outside, if objects could
    # be needed again -- their filename does not change in a superdataset
    # if done, cache under relpath, not abspath key
    cache = {
        'objcache': {},
        'subds_relpaths': None,
    }
    reported = set()

    # for all query paths
    for ap in aps:
        # all metadata is registered via its relative path to the
        # dataset that is being queried
        rpath = op.relpath(ap['path'], start=ds.path)
        if rpath in reported:
            # we already had this, probably via recursion of some kind
            continue
        rap = dict(ap, rpath=rpath, type=ap.get('type', None))

        # we really have to look this up from the aggregated metadata
        # and cannot use any 'parentds' property in the incoming annotated
        # path.  The latter will reflect the situation on disk, we need
        # the record of the containing subdataset in the aggregated metadata
        # instead
        containing_ds = _get_containingds_from_agginfo(agginfos, rpath)
        if containing_ds is None:
            # could happen if there was no aggregated metadata at all
            # or the path is in this dataset, but luckily the queried dataset
            # is known to be present
            containing_ds = op.curdir
        rap['metaprovider'] = containing_ds

        # build list of datasets and paths to be queried for this annotated path
        # in the simple case this is just the containing dataset and the actual
        # query path
        to_query = [rap]
        if recursive:
            # in case of recursion this is also anything in any dataset underneath
            # the query path
            matching_subds = [{'metaprovider': sub, 'rpath': sub, 'type': 'dataset'}
                              for sub in sorted(agginfos)
                              # we already have the base dataset
                              if (rpath == op.curdir and sub != op.curdir) or
                              path_is_subpath(sub, rpath)]
            to_query.extend(matching_subds)

        to_query_available = []
        for qap in to_query:
            if qap['metaprovider'] not in agginfos:
                res = get_status_dict(
                    status='impossible',
                    path=qap['path'],
                    message=(
                        'Dataset at %s contains no aggregated metadata on this path',
                        qap['metaprovider']),
                )
                res.update(res, **kwargs)
                if 'type' in qap:
                    res['type'] = qap['type']
                yield res
            else:
                to_query_available.append(qap)

        # one heck of a beast to get the set of filenames for all metadata objects
        # that are required to be present to fulfill this query
        objfiles = set(
            agginfos.get(qap['metaprovider'], {}).get(t, None)
            for qap in to_query_available
            for t in ('dataset_info',) + \
                (('content_info',)
                 if ((reporton is None and qap.get('type', None) == 'file')
                     or reporton in ('files', 'all'))
                 else tuple())
        )
        # in case there was no metadata provider, we do not want to start
        # downloading everything: see https://github.com/datalad/datalad/issues/2458
        objfiles.difference_update([None])
        lgr.debug(
            'Verifying/achieving local availability of %i metadata objects',
            len(objfiles))
        if objfiles:
            get(path=[dict(path=op.join(agg_base_path, of),
                           parentds=ds.path, type='file')
                      for of in objfiles if of],
                dataset=ds,
                result_renderer='disabled')
        for qap in to_query_available:
            # info about the dataset that contains the query path
            dsinfo = agginfos.get(qap['metaprovider'], dict(id=ds.id))
            res_tmpl = get_status_dict()
            for s, d in (('id', 'dsid'), ('refcommit', 'refcommit')):
                if s in dsinfo:
                    res_tmpl[d] = dsinfo[s]

            # pull up dataset metadata, always needed if only for the context
            dsmeta = {}
            dsobjloc = dsinfo.get('dataset_info', None)
            if dsobjloc is not None:
                dsmeta = _load_json_object(
                    op.join(agg_base_path, dsobjloc),
                    cache=cache['objcache'])

            for r in _query_aggregated_metadata_singlepath(
                    ds, agginfos, agg_base_path, qap, reporton,
                    cache, dsmeta,
                    dsinfo.get('content_info', None)):
                r.update(res_tmpl, **kwargs)
                # if we are coming from `search` we want to record why this is being
                # reported
                if 'query_matched' in ap:
                    r['query_matched'] = ap['query_matched']
                if r.get('type', None) == 'file':
                    r['parentds'] = op.normpath(op.join(ds.path, qap['metaprovider']))
                yield r
            reported.add(qap['rpath'])
def results_from_annex_noinfo(ds, requested_paths, respath_by_status,
                              dir_fail_msg, noinfo_dir_msg, noinfo_file_msg,
                              noinfo_status='notneeded', **kwargs):
    """Helper to yield results based on what information git annex did not give us.

    The helper assumes that the annex command returned without an error code,
    and interprets which of the requested paths we have heard nothing about,
    and assumes that git annex was happy with their current state.

    Parameters
    ==========
    ds : Dataset
      All results have to be concerning this single dataset (used to resolve
      relpaths).
    requested_paths : list
      List of path arguments sent to `git annex`
    respath_by_status : dict
      Mapping of 'success' or 'failure' labels to lists of result paths
      reported by `git annex`. Everything that is not in here, we assume
      that `git annex` was happy about.
    dir_fail_msg : str
      Message template to inject into the result for a requested directory where
      a failure was reported for some of its content. The template contains two
      string placeholders that will be expanded with 1) the path of the
      directory, and 2) the content failure paths for that directory
    noinfo_dir_msg : str
      Message template to inject into the result for a requested directory that
      `git annex` was silent about (incl. any content). There must be one string
      placeholder that is expanded with the path of that directory.
    noinfo_file_msg : str
      Message to inject into the result for a requested file that `git annex` was
      silent about.
    noinfo_status : str
      Status to report when annex provides no information
    **kwargs
      Any further kwargs are included in the yielded result dictionary.
    """
    for p in requested_paths:
        # any relpath is relative to the currently processed dataset
        # not the global reference dataset
        p = p if isabs(p) else normpath(opj(ds.path, p))
        if any(p in ps for ps in respath_by_status.values()):
            # we have a report for this path already
            continue
        common_report = dict(path=p, **kwargs)
        if isdir(p):
            # `annex` itself will not report on directories, but if a
            # directory was requested, we want to say something about
            # it in the results.  we are inside a single, existing
            # repo, hence all directories are already present, if not
            # we had an error
            # do we have any failures in a subdir of the requested dir?
            failure_results = [
                fp for fp in respath_by_status.get('failure', [])
                if path_is_subpath(fp, p)]
            if failure_results:
                # we were not able to process all requested_paths, let's label
                # this 'impossible' to get a warning-type report
                # after all we have the directory itself, but not
                # (some of) its requested_paths
                yield get_status_dict(
                    status='impossible', type='directory',
                    message=(dir_fail_msg, p, failure_results),
                    **common_report)
            else:
                # otherwise cool, but how cool?
                success_results = [
                    fp for fp in respath_by_status.get('success', [])
                    if path_is_subpath(fp, p)]
                yield get_status_dict(
                    status='ok' if success_results else noinfo_status,
                    message=None if success_results else (noinfo_dir_msg, p),
                    type='directory',
                    **common_report)
            continue
        else:
            # not a directory, and we have had no word from `git annex`,
            # yet no exception, hence the file was most probably
            # already in the desired state
            yield get_status_dict(
                status=noinfo_status, type='file',
                message=noinfo_file_msg,
                **common_report)
def __call__(
        path=None,
        dataset=None,
        get_aggregates=False,
        reporton='all',
        recursive=False):
    # prep results
    refds_path = Interface.get_refds_path(dataset)
    res_kwargs = dict(action='metadata', logger=lgr)
    if refds_path:
        res_kwargs['refds'] = refds_path

    if get_aggregates:
        # yield all datasets for which we have aggregated metadata as results
        # the get actual dataset results, so we can turn them into dataset
        # instances using generic top-level code if desired
        ds = require_dataset(
            refds_path,
            check_installed=True,
            purpose='aggregate metadata query')
        agginfos = load_ds_aggregate_db(
            ds,
            version=str(aggregate_layout_version),
            abspath=True
        )
        if not agginfos:
            # if there has ever been an aggregation run, this file would
            # exist, hence there has not been and we need to tell this
            # to people
            yield get_status_dict(
                ds=ds,
                status='impossible',
                action='metadata',
                logger=lgr,
                message='metadata aggregation has never been performed in this dataset')
            return
        parentds = []
        for dspath in sorted(agginfos):
            info = agginfos[dspath]
            if parentds and not path_is_subpath(dspath, parentds[-1]):
                parentds.pop()
            info.update(
                path=dspath,
                type='dataset',
                status='ok',
            )
            if dspath == ds.path:
                info['layout_version'] = aggregate_layout_version
            if parentds:
                info['parentds'] = parentds[-1]
            yield dict(
                info,
                **res_kwargs
            )
            parentds.append(dspath)
        return

    if not dataset and not path:
        # makes no sense to have no dataset, go with "here"
        # error generation happens during annotation
        path = op.curdir

    content_by_ds = OrderedDict()
    for ap in AnnotatePaths.__call__(
            dataset=refds_path,
            path=path,
            # MIH: we are querying the aggregated metadata anyways, and that
            # mechanism has its own, faster way to go down the hierarchy
            #recursive=recursive,
            #recursion_limit=recursion_limit,
            action='metadata',
            # uninstalled subdatasets could be queried via aggregated metadata
            # -> no 'error'
            unavailable_path_status='',
            nondataset_path_status='error',
            # we need to know when to look into aggregated data
            force_subds_discovery=True,
            force_parentds_discovery=True,
            return_type='generator',
            on_failure='ignore'):
        if ap.get('status', None):
            # this is done
            yield ap
            continue
        if ap.get('type', None) == 'dataset' and GitRepo.is_valid_repo(ap['path']):
            ap['process_content'] = True
        to_query = None
        if ap.get('state', None) == 'absent' or \
                ap.get('type', 'dataset') != 'dataset':
            # this is a lonely absent dataset/file or content in a present dataset
            # -> query through parent
            # there must be a parent, otherwise this would be a non-dataset path
            # and would have errored during annotation
            to_query = ap['parentds']
        else:
            to_query = ap['path']
        if to_query:
            pcontent = content_by_ds.get(to_query, [])
            pcontent.append(ap)
            content_by_ds[to_query] = pcontent

    for ds_path in content_by_ds:
        ds = Dataset(ds_path)
        query_agg = [ap for ap in content_by_ds[ds_path]
                     # this is an available subdataset, will be processed in another
                     # iteration
                     if ap.get('state', None) == 'absent' or
                     not(ap.get('type', None) == 'dataset' and ap['path'] != ds_path)]
        if not query_agg:
            continue
        # report from aggregated metadata
        for r in query_aggregated_metadata(
                reporton,
                # by default query the reference dataset, only if there is none
                # try our luck in the dataset that contains the queried path
                # this is consistent with e.g. `get_aggregates` reporting the
                # situation in the reference dataset only
                Dataset(refds_path) if refds_path else ds,
                query_agg,
                # recursion above could only recurse into datasets
                # on the filesystem, but there might be any number of
                # uninstalled datasets underneath the last installed one
                # for which we might have metadata
                recursive=recursive,
                **res_kwargs):
            yield r
    return
def discover_dataset_trace_to_targets(basepath, targetpaths, current_trace,
                                      spec, includeds=None):
    """Discover the edges and nodes in a dataset tree to given target paths

    Parameters
    ----------
    basepath : path
      Path to a start or top-level dataset. Really has to be a path to a
      dataset!
    targetpaths : list(path)
      Any non-zero number of paths that are termination points for the
      search algorithm. Can be paths to datasets, directories, or files
      (and any combination thereof).
    current_trace : list
      For a top-level call this should probably always be `[]`
    spec : dict
      `content_by_ds`-style dictionary that will receive information about the
      discovered datasets. Specifically, for each discovered dataset there
      will be an item with its path under the key (path) of the respective
      superdataset.
    includeds : sequence, optional
      Any paths given are treated as existing subdatasets, regardless of
      whether they can be found in the filesystem. Such subdatasets will appear
      under the key of the closest existing dataset in the `spec`.

    Returns
    -------
    None
      Function calls itself recursively and populates `spec` dict in-place.
      Keys are dataset paths, values are sets of subdataset paths
    """
    # convert to set for faster lookup
    includeds = includeds if isinstance(includeds, set) else \
        set() if includeds is None else set(includeds)
    # this beast walks the directory tree from a given `basepath` until
    # it discovers any of the given `targetpaths`
    # if it finds one, it commits any accumulated trace of visited
    # datasets on this edge to the spec
    valid_repo = GitRepo.is_valid_repo(basepath)
    if valid_repo:
        # we are passing into a new dataset, extend the dataset trace
        current_trace = current_trace + [basepath]
    # this edge is not done, we need to try to reach any downstream
    # dataset
    undiscovered_ds = set(t for t in targetpaths)  # if t != basepath)
    # whether anything in this directory matched a targetpath
    filematch = False
    if isdir(basepath):
        for p in listdir(basepath):
            p = ensure_unicode(opj(basepath, p))
            if not isdir(p):
                if p in targetpaths:
                    filematch = True
                # we cannot have anything below this one
                continue
            # OPT listdir might be large and we could have only few items
            # in `targetpaths` -- so traverse only those in spec which have
            # leading dir basepath
            # filter targets matching this downward path
            downward_targets = set(
                t for t in targetpaths if path_startswith(t, p))
            if not downward_targets:
                continue
            # remove the matching ones from the "todo" list
            undiscovered_ds.difference_update(downward_targets)
            # go one deeper
            discover_dataset_trace_to_targets(
                p, downward_targets, current_trace, spec,
                includeds=includeds if not includeds
                else includeds.intersection(downward_targets))

    undiscovered_ds = [
        t for t in undiscovered_ds
        if includeds and
        path_is_subpath(t, current_trace[-1]) and
        t in includeds
    ]
    if filematch or basepath in targetpaths or undiscovered_ds:
        for i, p in enumerate(current_trace[:-1]):
            # TODO RF prepare proper annotated path dicts
            subds = spec.get(p, set())
            subds.add(current_trace[i + 1])
            spec[p] = subds
        if undiscovered_ds:
            spec[current_trace[-1]] = spec.get(
                current_trace[-1], set()).union(undiscovered_ds)
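# For orientation, a hypothetical `spec` produced by the function above for a
# superdataset /ds containing a subdataset /ds/sub1, which in turn contains
# /ds/sub1/subsub holding a target file (paths invented for illustration, and
# assuming all three locations are valid git repos).  Keys are dataset paths,
# values are sets of subdataset paths:
#
#   spec = {}
#   discover_dataset_trace_to_targets('/ds', ['/ds/sub1/subsub/file.dat'], [], spec)
#   # spec is now roughly:
#   # {'/ds': {'/ds/sub1'},
#   #  '/ds/sub1': {'/ds/sub1/subsub'}}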
def query_aggregated_metadata(reporton, ds, aps, recursive=False,
                              **kwargs):
    """Query the aggregated metadata in a dataset

    Query paths (`aps`) have to be composed in an intelligent fashion
    by the caller of this function, i.e. it should have been decided
    outside which dataset to query for any given path.

    Also this function doesn't cache anything, hence the caller must
    make sure to only call this once per dataset to avoid waste.

    Parameters
    ----------
    reporton : {None, 'none', 'dataset', 'files', 'all'}
      If `None`, reporting will be based on the `type` property of the
      incoming annotated paths.
    ds : Dataset
      Dataset to query
    aps : list
      Sequence of annotated paths to query metadata for.
    recursive : bool
      Whether or not to report metadata underneath all query paths
      recursively.
    **kwargs
      Any other argument will be passed on to the query result dictionary.

    Returns
    -------
    generator
      Of result dictionaries.
    """
    from datalad.coreapi import get
    # look for and load the aggregation info for the base dataset
    info_fpath = opj(ds.path, agginfo_relpath)
    agg_base_path = dirname(info_fpath)
    agginfos = _load_json_object(info_fpath)

    # cache once loaded metadata objects for additional lookups
    # TODO possibly supply this cache from outside, if objects could
    # be needed again -- their filename does not change in a superdataset
    # if done, cache under relpath, not abspath key
    cache = {
        'objcache': {},
        'subds_relpaths': None,
    }
    reported = set()

    # for all query paths
    for ap in aps:
        # all metadata is registered via its relative path to the
        # dataset that is being queried
        rpath = relpath(ap['path'], start=ds.path)
        if rpath in reported:
            # we already had this, probably via recursion of some kind
            continue
        rap = dict(ap, rpath=rpath, type=ap.get('type', None))

        # we really have to look this up from the aggregated metadata
        # and cannot use any 'parentds' property in the incoming annotated
        # path.  The latter will reflect the situation on disk, we need
        # the record of the containing subdataset in the aggregated metadata
        # instead
        containing_ds = _get_containingds_from_agginfo(agginfos, rpath)
        if containing_ds is None:
            # could happen if there was no aggregated metadata at all
            # or the path is in this dataset, but luckily the queried dataset
            # is known to be present
            containing_ds = curdir
        rap['metaprovider'] = containing_ds

        # build list of datasets and paths to be queried for this annotated path
        # in the simple case this is just the containing dataset and the actual
        # query path
        to_query = [rap]
        if recursive:
            # in case of recursion this is also anything in any dataset underneath
            # the query path
            matching_subds = [
                {'metaprovider': sub, 'rpath': sub, 'type': 'dataset'}
                for sub in sorted(agginfos)
                # we already have the base dataset
                if (rpath == curdir and sub != curdir) or
                path_is_subpath(sub, rpath)]
            to_query.extend(matching_subds)

        # one heck of a beast to get the set of filenames for all metadata objects
        # that are required to be present to fulfill this query
        objfiles = set(
            agginfos.get(qap['metaprovider'], {}).get(t, None)
            for qap in to_query
            for t in ('dataset_info',) + \
                (('content_info',)
                 if ((reporton is None and qap.get('type', None) == 'file')
                     or reporton in ('files', 'all'))
                 else tuple())
        )
        lgr.debug(
            'Verifying/achieving local availability of %i metadata objects',
            len(objfiles))
        get(path=[dict(path=opj(agg_base_path, of),
                       parentds=ds.path, type='file')
                  for of in objfiles if of],
            dataset=ds,
            result_renderer='disabled')
        for qap in to_query:
            # info about the dataset that contains the query path
            dsinfo = agginfos.get(qap['metaprovider'], dict(id=ds.id))
            res_tmpl = get_status_dict()
            for s, d in (('id', 'dsid'), ('refcommit', 'refcommit')):
                if s in dsinfo:
                    res_tmpl[d] = dsinfo[s]

            # pull up dataset metadata, always needed if only for the context
            dsmeta = {}
            dsobjloc = dsinfo.get('dataset_info', None)
            if dsobjloc is not None:
                dsmeta = _load_json_object(opj(agg_base_path, dsobjloc),
                                           cache=cache['objcache'])

            for r in _query_aggregated_metadata_singlepath(
                    ds, agginfos, agg_base_path, qap, reporton,
                    cache, dsmeta,
                    dsinfo.get('content_info', None)):
                r.update(res_tmpl, **kwargs)
                # if we are coming from `search` we want to record why this is being
                # reported
                if 'query_matched' in ap:
                    r['query_matched'] = ap['query_matched']
                if r.get('type', None) == 'file':
                    r['parentds'] = normpath(opj(ds.path, qap['metaprovider']))
                yield r
            reported.add(qap['rpath'])
def _get_dsinfo_from_aggmetadata(ds_path, path, recursive, db):
    """Grab info on aggregated metadata for a path from a given dataset.

    The actual info is stored in a `db` dict under the absolute path of the
    dataset that contains the query path, plus any subdataset in case of
    recursion (with their own DB entries).

    Parameters
    ----------
    ds : Dataset
      source dataset
    path : str
      absolute path for which to obtain metadata
    recursive : bool

    Returns
    -------
    str or list
      A string is an error message, a list contains all absolute paths for
      all datasets on which info was put into the DB.
    """
    info_fpath = opj(ds_path, agginfo_relpath)
    info_basepath = dirname(info_fpath)
    # TODO cache these
    agginfos = _load_json_object(info_fpath)

    def _ensure_abs_obj_location(rec):
        # object location in the DB must be absolute so we can copy easily
        # to all relevant datasets
        for key in location_keys:
            if key in rec and not isabs(rec[key]):
                rec[key] = opj(info_basepath, rec[key])
        return rec

    rpath = relpath(path, start=ds_path)
    seed_ds = _get_containingds_from_agginfo(agginfos, rpath)
    if seed_ds is None:
        # nothing found
        # this will be the message in the result for the query path
        # and could be a tuple
        return ("No matching aggregated metadata for path '%s' in Dataset at %s",
                rpath, ds_path)

    # easy peasy
    seed_abs = opj(ds_path, seed_ds)
    db[seed_abs] = _ensure_abs_obj_location(agginfos[seed_ds])
    hits = [seed_abs]

    if not recursive:
        return hits

    # a little more complicated: we need to loop over all subdataset
    # records and pick the ones that are underneath the seed
    for agginfo_path in agginfos:
        if path_is_subpath(agginfo_path, seed_ds):
            absp = opj(ds_path, agginfo_path)
            db[absp] = _ensure_abs_obj_location(agginfos[agginfo_path])
            hits.append(absp)
    # TODO we must keep the info on these recursively discovered datasets
    # somewhere, because we cannot rediscover them on the filesystem
    # when updating the datasets later on
    return hits
def discover_dataset_trace_to_targets(basepath, targetpaths, current_trace,
                                      spec, includeds=None):
    """Discover the edges and nodes in a dataset tree to given target paths

    Parameters
    ----------
    basepath : path
      Path to a start or top-level dataset. Really has to be a path to a
      dataset!
    targetpaths : list(path)
      Any non-zero number of paths that are termination points for the
      search algorithm. Can be paths to datasets, directories, or files
      (and any combination thereof).
    current_trace : list
      For a top-level call this should probably always be `[]`
    spec : dict
      `content_by_ds`-style dictionary that will receive information about the
      discovered datasets. Specifically, for each discovered dataset there
      will be an item with its path under the key (path) of the respective
      superdataset.
    includeds : sequence, optional
      Any paths given are treated as existing subdatasets, regardless of
      whether they can be found in the filesystem. Such subdatasets will appear
      under the key of the closest existing dataset in the `spec`.

    Returns
    -------
    None
      Function calls itself recursively and populates `spec` dict in-place.
      Keys are dataset paths, values are sets of subdataset paths
    """
    # convert to set for faster lookup
    includeds = includeds if isinstance(includeds, set) else \
        set() if includeds is None else set(includeds)
    # this beast walks the directory tree from a given `basepath` until
    # it discovers any of the given `targetpaths`
    # if it finds one, it commits any accumulated trace of visited
    # datasets on this edge to the spec
    valid_repo = GitRepo.is_valid_repo(basepath)
    if valid_repo:
        # we are passing into a new dataset, extend the dataset trace
        current_trace = current_trace + [basepath]
    # this edge is not done, we need to try to reach any downstream
    # dataset
    undiscovered_ds = set(t for t in targetpaths)  # if t != basepath)
    # whether anything in this directory matched a targetpath
    filematch = False
    if isdir(basepath):
        for p in listdir(basepath):
            p = assure_unicode(opj(basepath, p))
            if not isdir(p):
                if p in targetpaths:
                    filematch = True
                # we cannot have anything below this one
                continue
            # OPT listdir might be large and we could have only few items
            # in `targetpaths` -- so traverse only those in spec which have
            # leading dir basepath
            # filter targets matching this downward path
            downward_targets = set(
                t for t in targetpaths if path_startswith(t, p))
            if not downward_targets:
                continue
            # remove the matching ones from the "todo" list
            undiscovered_ds.difference_update(downward_targets)
            # go one deeper
            discover_dataset_trace_to_targets(
                p, downward_targets, current_trace, spec,
                includeds=includeds if not includeds
                else includeds.intersection(downward_targets))

    undiscovered_ds = [t for t in undiscovered_ds
                       if includeds and
                       path_is_subpath(t, current_trace[-1]) and
                       t in includeds]
    if filematch or basepath in targetpaths or undiscovered_ds:
        for i, p in enumerate(current_trace[:-1]):
            # TODO RF prepare proper annotated path dicts
            subds = spec.get(p, set())
            subds.add(current_trace[i + 1])
            spec[p] = subds
        if undiscovered_ds:
            spec[current_trace[-1]] = spec.get(
                current_trace[-1], set()).union(undiscovered_ds)
def get_modified_subpaths(aps, refds, revision, recursion_limit=None,
                          report_no_revision_change=True,
                          report_untracked='all'):
    """
    Parameters
    ----------
    aps : list
    refds : Dataset
    revision : str
      Commit-ish
    """
    # TODO needs recursion limit
    # NOTE this is implemented as a generator despite the fact that we need
    # to sort through _all_ the inputs initially, diff'ing each involved
    # dataset takes time that we can use to already act on intermediate
    # result paths, without having to wait for 100% completion
    if revision is None:
        # we want all, subds not matching the ref are assumed to have been
        # sorted out before (e.g. one level up)
        for r in aps:
            yield r

    # life is simple: we diff the base dataset
    modified = []
    # Diff.__call__ is used to get access to the now obsolete interface.diff
    # that exists merely for annotate_paths.  (refds.diff corresponds to
    # core.local.diff.)
    from datalad.interface.diff import Diff
    for r in Diff.__call__(
            dataset=refds,
            # we cannot really limit the diff paths easily because we might get
            # or miss content (e.g. subdatasets) if we don't figure out which
            # ones are known -- and we don't want that
            path=None,
            # `revision` can be anything that Git support for `diff`
            # `True` is code for diff without revision
            revision=revision if revision is not True else None,
            # it is important that staged is False, otherwise we would miss unstaged
            # changes when e.g. diffing against HEAD (save does that)
            staged=False,
            # we might want to consider putting 'untracked' here
            # maybe that is a little faster, not tested yet
            ignore_subdatasets='none',
            # by default, we want to see any individual untracked file, this simplifies further
            # processing dramatically, but may require subsequent filtering
            # in order to avoid flooding user output with useless info
            report_untracked=report_untracked,
            # no recursion, we need to update `revision` for every subdataset
            # before we can `diff`
            recursive=False,
            return_type='generator',
            result_renderer=None,
            # need to be able to yield the errors
            on_failure='ignore'):
        if r['status'] in ('impossible', 'error'):
            # something unexpected, tell daddy
            yield r
            continue
        # if asked, and no change in revision -- skip
        if not report_no_revision_change \
                and (r.get('revision_src') or r.get('revision')) \
                and (r.get('revision_src') == r.get('revision')):
            continue
        r['status'] = ''
        modified.append(r)

    if not len(modified):
        # nothing modified nothing to report
        return

    # now we can grab the APs that are in this dataset and yield them
    for ap in aps:
        # need to preserve pristine info first
        ap = ap if isinstance(ap, dict) else rawpath2ap(ap, refds.path)
        for m in modified:
            if ap['path'] == m['path']:
                # is directly modified, yield input AP
                # but update with what we learned about the modification
                ap.update(m)
                yield ap
                break
            if path_is_subpath(m['path'], ap['path']):
                # a modified path is underneath this AP
                # yield the modified one instead
                yield m
                continue

    mod_subs = [m for m in modified if m.get('type', None) == 'dataset']
    if not mod_subs or (recursion_limit is not None and recursion_limit < 1):
        return

    aps = [ap if isinstance(ap, dict) else rawpath2ap(ap, refds.path)
           for ap in aps]
    # now for all submodules that were found modified
    for sub in [m for m in modified if m.get('type', None) == 'dataset']:
        sub_path_ = _with_sep(sub['path'])
        # these AP match something inside this submodule, or the whole submodule
        sub_aps = [ap for ap in aps
                   if _with_sep(ap['path']).startswith(sub_path_)]
        if not sub_aps:
            continue
        # we are interested in the modifications within this subdataset
        # from the state we previously had on record, till the state
        # we have in record now
        diff_range = '{}..{}'.format(
            sub['revision_src'] if sub['revision_src'] else PRE_INIT_COMMIT_SHA,
            sub['revision'] if sub['revision'] else '')
        if sub['revision_src'] and sub['revision_src'] == sub['revision']:
            # this is a special case, where subdataset reported changes without
            # a change in state/commit -- this is code for uncommitted changes
            # in the subdataset (including staged ones).  In such a case, we
            # must not provide a diff range, but only the source commit we want
            # to diff against
            # XXX if this is changed, likely the same logic in diff needs
            # changing too!
            diff_range = sub['revision_src']
        for r in get_modified_subpaths(
                sub_aps,
                Dataset(sub['path']),
                diff_range,
                recursion_limit=(recursion_limit - 1) if recursion_limit is not None else None):
            yield r
def __call__(path=None, dataset=None, get_aggregates=False,
             reporton='all', recursive=False):
    # prep results
    refds_path = Interface.get_refds_path(dataset)
    res_kwargs = dict(action='metadata', logger=lgr)
    if refds_path:
        res_kwargs['refds'] = refds_path

    if get_aggregates:
        # yield all datasets for which we have aggregated metadata as results
        # the get actual dataset results, so we can turn them into dataset
        # instances using generic top-level code if desired
        ds = require_dataset(refds_path, check_installed=True,
                             purpose='aggregate metadata query')
        info_fpath = opj(ds.path, agginfo_relpath)
        if not exists(info_fpath):
            # if there has ever been an aggregation run, this file would
            # exist, hence there has not been and we need to tell this
            # to people
            yield get_status_dict(
                ds=ds,
                status='impossible',
                action='metadata',
                logger=lgr,
                message='metadata aggregation has never been performed in this dataset')
            return
        agginfos = _load_json_object(info_fpath)
        parentds = []
        for sd in sorted(agginfos):
            info = agginfos[sd]
            dspath = normpath(opj(ds.path, sd))
            if parentds and not path_is_subpath(dspath, parentds[-1]):
                parentds.pop()
            info.update(
                path=dspath,
                type='dataset',
                status='ok',
            )
            if sd == curdir:
                info['layout_version'] = aggregate_layout_version
            if parentds:
                info['parentds'] = parentds[-1]
            yield dict(info, **res_kwargs)
            parentds.append(dspath)
        return

    if not dataset and not path:
        # makes no sense to have no dataset, go with "here"
        # error generation happens during annotation
        path = curdir

    content_by_ds = OrderedDict()
    for ap in AnnotatePaths.__call__(
            dataset=refds_path,
            path=path,
            # MIH: we are querying the aggregated metadata anyways, and that
            # mechanism has its own, faster way to go down the hierarchy
            #recursive=recursive,
            #recursion_limit=recursion_limit,
            action='metadata',
            # uninstalled subdatasets could be queried via aggregated metadata
            # -> no 'error'
            unavailable_path_status='',
            nondataset_path_status='error',
            # we need to know when to look into aggregated data
            force_subds_discovery=True,
            force_parentds_discovery=True,
            return_type='generator',
            on_failure='ignore'):
        if ap.get('status', None):
            # this is done
            yield ap
            continue
        if ap.get('type', None) == 'dataset' and GitRepo.is_valid_repo(ap['path']):
            ap['process_content'] = True
        to_query = None
        if ap.get('state', None) == 'absent' or \
                ap.get('type', 'dataset') != 'dataset':
            # this is a lonely absent dataset/file or content in a present dataset
            # -> query through parent
            # there must be a parent, otherwise this would be a non-dataset path
            # and would have errored during annotation
            to_query = ap['parentds']
        else:
            to_query = ap['path']
        if to_query:
            pcontent = content_by_ds.get(to_query, [])
            pcontent.append(ap)
            content_by_ds[to_query] = pcontent

    for ds_path in content_by_ds:
        ds = Dataset(ds_path)
        query_agg = [
            ap for ap in content_by_ds[ds_path]
            # this is an available subdataset, will be processed in another
            # iteration
            if ap.get('state', None) == 'absent' or
            not (ap.get('type', None) == 'dataset' and ap['path'] != ds_path)]
        if not query_agg:
            continue
        # report from aggregated metadata
        for r in query_aggregated_metadata(
                reporton,
                # by default query the reference dataset, only if there is none
                # try our luck in the dataset that contains the queried path
                # this is consistent with e.g. `get_aggregates` reporting the
                # situation in the reference dataset only
                Dataset(refds_path) if refds_path else ds,
                query_agg,
                # recursion above could only recurse into datasets
                # on the filesystem, but there might be any number of
                # uninstalled datasets underneath the last installed one
                # for which we might have metadata
                recursive=recursive,
                **res_kwargs):
            yield r
    return
def __call__(path=None,
             *,
             dataset=None,
             get_aggregates=False,
             reporton='all',
             recursive=False):
    # prep results
    refds_path = dataset if dataset is None \
        else require_dataset(dataset).path
    res_kwargs = dict(action='metadata', logger=lgr)
    if refds_path:
        res_kwargs['refds'] = refds_path

    if get_aggregates:
        # yield all datasets for which we have aggregated metadata as results
        # the get actual dataset results, so we can turn them into dataset
        # instances using generic top-level code if desired
        ds = require_dataset(refds_path, check_installed=True,
                             purpose='aggregate metadata query')
        agginfos = load_ds_aggregate_db(
            ds,
            version=str(aggregate_layout_version),
            abspath=True)
        if not agginfos:
            # if there has ever been an aggregation run, this file would
            # exist, hence there has not been and we need to tell this
            # to people
            yield get_status_dict(
                ds=ds,
                status='impossible',
                action='metadata',
                logger=lgr,
                message='metadata aggregation has never been performed in this dataset')
            return
        parentds = []
        for dspath in sorted(agginfos):
            info = agginfos[dspath]
            if parentds and not path_is_subpath(dspath, parentds[-1]):
                parentds.pop()
            info.update(
                path=dspath,
                type='dataset',
                status='ok',
            )
            if dspath == ds.path:
                info['layout_version'] = aggregate_layout_version
            if parentds:
                info['parentds'] = parentds[-1]
            yield dict(info, **res_kwargs)
            parentds.append(dspath)
        return

    if not dataset and not path:
        # makes no sense to have no dataset, go with "here"
        # error generation happens during annotation
        path = op.curdir

    paths_by_ds, errors = get_paths_by_ds(
        require_dataset(dataset),
        dataset,
        paths=ensure_list(path),
        subdsroot_mode='super')
    content_by_ds = OrderedDict()
    for ap in _minimal_annotate_paths(
            paths_by_ds,
            errors,
            action='metadata',
            refds=refds_path):
        if ap.get('status', None):
            # this is done
            yield ap
            continue
        if ap.get('type', None) == 'dataset' and GitRepo.is_valid_repo(ap['path']):
            ap['process_content'] = True
        to_query = None
        if ap.get('state', None) == 'absent' or \
                ap.get('type', 'dataset') != 'dataset':
            # this is a lonely absent dataset/file or content in a present dataset
            # -> query through parent
            # there must be a parent, otherwise this would be a non-dataset path
            # and would have errored during annotation
            to_query = ap['parentds']
        else:
            to_query = ap['path']
        if to_query:
            pcontent = content_by_ds.get(to_query, [])
            pcontent.append(ap)
            content_by_ds[to_query] = pcontent

    for ds_path in content_by_ds:
        ds = Dataset(ds_path)
        query_agg = [
            ap for ap in content_by_ds[ds_path]
            # this is an available subdataset, will be processed in another
            # iteration
            if ap.get('state', None) == 'absent' or
            not (ap.get('type', None) == 'dataset' and ap['path'] != ds_path)]
        if not query_agg:
            continue
        # report from aggregated metadata
        for r in query_aggregated_metadata(
                reporton,
                # by default query the reference dataset, only if there is none
                # try our luck in the dataset that contains the queried path
                # this is consistent with e.g. `get_aggregates` reporting the
                # situation in the reference dataset only
                Dataset(refds_path) if refds_path else ds,
                query_agg,
                # recursion above could only recurse into datasets
                # on the filesystem, but there might be any number of
                # uninstalled datasets underneath the last installed one
                # for which we might have metadata
                recursive=recursive,
                **res_kwargs):
            yield r
    return