def _load_agginfo_db(ds_path):
    return {
        # paths in DB on disk are always relative
        # make absolute to ease processing during aggregation
        op.normpath(op.join(ds_path, p)): {
            k: op.normpath(op.join(ds_path, op.dirname(agginfo_relpath), v))
            if k in location_keys else v
            for k, v in props.items()
        }
        for p, props in _load_json_object(opj(ds_path, agginfo_relpath)).items()
    }
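# Illustrative sketch, not part of DataLad's API: the nested comprehension in
# _load_agginfo_db() above can be hard to read at a glance.  The hypothetical
# helper below performs the same relative-to-absolute rewriting on a plain
# dict, assuming a DB keyed by dataset relpaths whose location-key values are
# paths relative to the directory that holds the aggregate info file.  All
# names (_example_make_agginfo_absolute, agg_base_relpath, sample values) are
# invented for this example only.
def _example_make_agginfo_absolute(ds_path, agg_base_relpath, db, location_keys):
    """Hypothetical illustration of the path normalization done above."""
    import os.path as _op
    base = _op.join(ds_path, agg_base_relpath)
    out = {}
    for ds_relpath, props in db.items():
        out[_op.normpath(_op.join(ds_path, ds_relpath))] = {
            # only object-location values are paths that need to be re-anchored
            k: _op.normpath(_op.join(base, v)) if k in location_keys else v
            for k, v in props.items()}
    return out

# Example (POSIX paths assumed):
# _example_make_agginfo_absolute(
#     '/tmp/ds', '.datalad/metadata',
#     {'.': {'content_info': 'objects/ab/cd', 'id': 'x'}},
#     {'content_info'})
# -> {'/tmp/ds': {'content_info': '/tmp/ds/.datalad/metadata/objects/ab/cd',
#                 'id': 'x'}}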
def _get_dsinfo_from_aggmetadata(ds_path, path, recursive, db):
    """Grab info on aggregated metadata for a path from a given dataset.

    The actual info is stored in a `db` dict under the absolute path
    of the dataset that contains the query path, plus any subdataset
    in case of recursion (with their own DB entries).

    Parameters
    ----------
    ds_path : str
      absolute path of the source dataset to query
    path : str
      absolute path for which to obtain metadata
    recursive : bool
    db : dict
      dictionary to receive the aggregated metadata records (see above)

    Returns
    -------
    str or list
      A string is an error message, a list contains all absolute paths for
      all datasets on which info was put into the DB.
    """
    info_fpath = opj(ds_path, agginfo_relpath)
    info_basepath = dirname(info_fpath)
    # TODO cache these
    agginfos = _load_json_object(info_fpath)

    def _ensure_abs_obj_location(rec):
        # object location in the DB must be absolute so we can copy easily
        # to all relevant datasets
        for key in location_keys:
            if key in rec and not isabs(rec[key]):
                rec[key] = opj(info_basepath, rec[key])
        return rec

    rpath = relpath(path, start=ds_path)
    seed_ds = _get_containingds_from_agginfo(agginfos, rpath)
    if seed_ds is None:
        # nothing found
        # this will be the message in the result for the query path
        # and could be a tuple
        return ("No matching aggregated metadata in Dataset at %s", ds_path)

    # easy peasy
    seed_abs = opj(ds_path, seed_ds)
    db[seed_abs] = _ensure_abs_obj_location(agginfos[seed_ds])
    hits = [seed_abs]

    if not recursive:
        return hits

    # a little more complicated: we need to loop over all subdataset
    # records and pick the ones that are underneath the seed
    for agginfo_path in agginfos:
        if agginfo_path.startswith(_with_sep(seed_ds)):
            absp = opj(ds_path, agginfo_path)
            db[absp] = _ensure_abs_obj_location(agginfos[agginfo_path])
            hits.append(absp)
    # TODO we must keep the info on these recursively discovered datasets
    # somewhere, because we cannot rediscover them on the filesystem
    # when updating the datasets later on
    return hits
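# Illustrative sketch, not part of DataLad's API: the recursive branch above
# selects every subdataset record strictly underneath the seed record by a
# "seed + path separator" prefix test (which is what _with_sep() is assumed
# to provide).  The hypothetical helper below shows that matching in
# isolation, using POSIX-style relative keys as they appear in the DB.
def _example_records_below_seed(agginfo_keys, seed):
    """Hypothetical illustration of the prefix matching used above."""
    import os
    prefix = seed if seed.endswith(os.sep) else seed + os.sep
    return [k for k in agginfo_keys if k.startswith(prefix)]

# Example (on a POSIX platform where os.sep == '/'):
# _example_records_below_seed(['.', 'sub1', 'sub1/subsub', 'sub2'], 'sub1')
# -> ['sub1/subsub']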
def _update_ds_agginfo(refds_path, ds_path, subds_paths, agginfo_db, to_save):
    """Perform metadata aggregation for ds and a given list of subdataset paths

    Parameters
    ----------
    refds_path : str
      Absolute path to the reference dataset that aggregate_metadata() was
      called on.
    ds_path : str
      Absolute path to the dataset to have its aggregate info updated
    subds_paths : list(str)
      Sequence of absolute paths of subdatasets of the to-be-updated dataset,
      whose agginfo shall be updated within the to-be-updated dataset.
      Any subdataset that is not listed here is assumed to be gone (i.e. no
      longer a subdataset at all, not just not locally installed)
    agginfo_db : dict
      Dictionary with all information on aggregate metadata on all datasets.
      Keys are absolute paths of datasets.
    to_save : list
      List of paths to save eventually. This function will add new paths as
      necessary.
    """
    ds = Dataset(ds_path)
    # location info of aggregate metadata
    # aggregate.json
    agginfo_fpath = opj(ds.path, agginfo_relpath)
    # base path in which aggregate.json and objects are located
    agg_base_path = dirname(agginfo_fpath)
    # load existing aggregate info dict
    # TODO take from cache, once used in _get_dsinfo_from_aggmetadata()
    ds_agginfos = _load_json_object(agginfo_fpath)
    # object locations referenced initially
    objlocs_was = set(ai[k]
                      for ai in ds_agginfos.values()
                      for k in location_keys
                      if k in ai)
    # track which objects need to be copied (each item is a from/to tuple)
    objs2copy = []
    # for each subdataset (any depth level)
    procds_paths = [ds_path] + subds_paths
    for dpath in procds_paths:
        ds_dbinfo = agginfo_db.get(dpath, {}).copy()
        # relative path of the current dataset within the dataset we are updating
        drelpath = relpath(dpath, start=ds.path)
        for loclabel in location_keys:
            # TODO filepath_info is obsolete
            if loclabel == 'filepath_info' and drelpath == curdir:
                # do not write a file list into the dataset it is from
                if 'filepath_info' in ds_dbinfo:
                    del ds_dbinfo['filepath_info']
                continue
            # abspath to object
            objloc = ds_dbinfo.get(loclabel, None)
            if objloc is None:
                continue
            # XXX needs to change when layout of object store is changed
            # current is ./datalad/metadata/objects/{hash}/{hash}
            target_objrelpath = opj(*objloc.split(os.sep)[-3:])
            # make sure we copy the file from its current location to where it
            # is needed in this dataset
            target_objpath = opj(agg_base_path, target_objrelpath)
            objs2copy.append((objloc, target_objpath))
            # now build needed local relpath
            ds_dbinfo[loclabel] = target_objrelpath
        # (re)assign in case record is new
        ds_agginfos[drelpath] = ds_dbinfo
    # remove all entries for which we do not (no longer) have a corresponding
    # subdataset to take care of
    ds_agginfos = {k: v
                   for k, v in ds_agginfos.items()
                   if normpath(opj(ds_path, k)) in procds_paths}
    # set of metadata objects now referenced
    objlocs_is = set(
        ai[k]
        for sdsrpath, ai in ds_agginfos.items()
        for k in location_keys
        if k in ai)
    objs2add = objlocs_is

    # yoh: we apparently do need to filter the ones to remove - I did
    # "git reset --hard HEAD^" and aggregate-metadata failed upon the next run
    # trying to remove a file unknown to git.  I am yet to figure out why that
    # mattered (hopefully not that the reflog is used somehow)
    objs2remove = []
    for obj in objlocs_was.difference(objlocs_is):
        obj_path = opj(agg_base_path, obj)
        if lexists(obj_path):
            objs2remove.append(obj_path)
        else:
            # not really a warning, we don't need it anymore, it is already gone
            lgr.debug(
                "To-be-deleted metadata object not found, skip deletion (%s)",
                obj_path)

    # secretly remove obsolete object files, not really a result from a
    # user's perspective
    if objs2remove:
        ds.remove(
            objs2remove,
            # Don't use the misleading default commit message of `remove`:
            message='[DATALAD] Remove obsolete metadata object files',
            # we do not want to drop these files by default, because we would
            # lose them for other branches, and earlier tags
            # TODO evaluate whether this should be exposed as a switch
            # to run an explicit force-drop prior to calling remove()
            check=False,
            result_renderer=None,
            return_type=list)
        if not objs2add and not refds_path == ds_path:
            # this is not the base dataset, make sure to save removal in the
            # parentds -- not needed when objects get added, as removal itself
            # is already committed
            to_save.append(dict(path=ds_path, type='dataset', staged=True))

    # must copy object files to local target destination
    # make sure those objects are present
    ds.get([f for f, t in objs2copy], result_renderer='disabled')
    for copy_from, copy_to in objs2copy:
        if copy_to == copy_from:
            continue
        target_dir = dirname(copy_to)
        if not exists(target_dir):
            makedirs(target_dir)
        # TODO we could be more clever (later) and maybe `addurl` (or similar)
        # the file from another dataset
        if lexists(copy_to):
            # no need to unlock, just wipe out and replace
            os.remove(copy_to)
        shutil.copy(copy_from, copy_to)
    to_save.append(
        dict(path=agginfo_fpath, type='file', staged=True))

    if objs2add:
        # they are added standard way, depending on the repo type
        ds.add(
            [opj(agg_base_path, p) for p in objs2add],
            save=False, result_renderer=None, return_type=list)
        # queue for save, and mark as staged
        to_save.extend(
            [dict(path=opj(agg_base_path, p), type='file', staged=True)
             for p in objs2add])

    # write aggregate info file
    if not ds_agginfos:
        return

    json_py.dump(ds_agginfos, agginfo_fpath)
    ds.add(agginfo_fpath, save=False, to_git=True,
           result_renderer=None, return_type=list)
    # queue for save, and mark as staged
    to_save.append(
        dict(path=agginfo_fpath, type='file', staged=True))
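# Illustrative sketch, not part of DataLad's API: the copy bookkeeping in
# _update_ds_agginfo() keeps only the last three components of an object's
# absolute location (currently ``objects/{hash}/{hash}``) and re-anchors them
# under the aggregate base path of the dataset being updated.  The helper and
# the sample paths/hashes below are invented for demonstration only.
def _example_target_object_path(objloc, agg_base_path):
    """Hypothetical illustration of the object relocation logic above."""
    import os
    import os.path as _op
    target_objrelpath = _op.join(*objloc.split(os.sep)[-3:])
    return target_objrelpath, _op.join(agg_base_path, target_objrelpath)

# Example (POSIX paths assumed):
# _example_target_object_path(
#     '/src/ds/.datalad/metadata/objects/8b/cb53f',
#     '/dst/ds/.datalad/metadata')
# -> ('objects/8b/cb53f', '/dst/ds/.datalad/metadata/objects/8b/cb53f')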