def _drop_files(ds, paths, check, noannex_iserror=False, **kwargs):
    """Helper to drop content in datasets.

    Parameters
    ----------
    ds : Dataset
    paths : path or list(path)
      which content to drop
    check : bool
      whether to instruct annex to perform minimum copy availability checks
    noannex_iserror : bool
      whether calling this function on a pure Git repo results in an
      'impossible' or 'notneeded' result.
    **kwargs
      additional payload for the result dicts
    """
    # expensive, access only once
    ds_repo = ds.repo
    if 'action' not in kwargs:
        kwargs['action'] = 'drop'
    # always need to make sure that we pass a list
    # `normalize_paths` decorator will otherwise screw all logic below
    paths = ensure_list(paths)
    if not hasattr(ds_repo, 'drop'):
        for p in paths:
            r = get_status_dict(
                status='impossible' if noannex_iserror else 'notneeded',
                path=p if isabs(p) else normpath(opj(ds.path, p)),
                message="no annex'ed content",
                **kwargs)
            r['action'] = 'drop'
            yield r
        return

    cmd = ['drop']
    if not check:
        cmd.append('--force')

    respath_by_status = {}
    try:
        yield from (
            _postproc_result(res, respath_by_status, ds)
            for res in ds_repo._call_annex_records(cmd, files=paths))
    except CommandError as e:
        # pick up the results captured so far and yield them
        # the error will be amongst them
        yield from (
            _postproc_result(res, respath_by_status, ds)
            for res in e.kwargs.get('stdout_json', []))
    # report on things requested that annex was silent about
    for r in results_from_annex_noinfo(
            ds, paths, respath_by_status,
            dir_fail_msg='could not drop some content in %s %s',
            noinfo_dir_msg='nothing to drop from %s',
            noinfo_file_msg="no annex'ed content",
            **kwargs):
        r['action'] = 'drop'
        yield r
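# Usage sketch (not part of the original code): a hypothetical caller could
# consume the generator above like any other result-yielding helper and group
# the records by their 'status' field. The function name and return value
# below are illustrative assumptions only.
def _drop_and_summarize(ds, paths, check=True):
    by_status = {}
    for res in _drop_files(ds, paths, check):
        by_status.setdefault(res['status'], []).append(res.get('path'))
    return by_status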
def _drop_files(ds, paths, check, noannex_iserror=False, **kwargs):
    """Helper to drop content in datasets.

    Parameters
    ----------
    ds : Dataset
    paths : path or list(path)
      which content to drop
    check : bool
      whether to instruct annex to perform minimum copy availability checks
    noannex_iserror : bool
      whether calling this function on a pure Git repo results in an
      'impossible' or 'notneeded' result.
    **kwargs
      additional payload for the result dicts
    """
    if 'action' not in kwargs:
        kwargs['action'] = 'drop'
    # always need to make sure that we pass a list
    # `normalize_paths` decorator will otherwise screw all logic below
    paths = assure_list(paths)
    if not hasattr(ds.repo, 'drop'):
        for p in paths:
            r = get_status_dict(
                status='impossible' if noannex_iserror else 'notneeded',
                path=p if isabs(p) else normpath(opj(ds.path, p)),
                message="no annex'ed content",
                **kwargs)
            r['action'] = 'drop'
            yield r
        return

    opts = ['--force'] if not check else []
    respath_by_status = {}
    for res in ds.repo.drop(paths, options=opts):
        res = annexjson2result(
            # annex reports are always about files
            res, ds, type='file', **kwargs)
        success = success_status_map[res['status']]
        respath_by_status[success] = \
            respath_by_status.get(success, []) + [res['path']]
        yield res
    # report on things requested that annex was silent about
    for r in results_from_annex_noinfo(
            ds, paths, respath_by_status,
            dir_fail_msg='could not drop some content in %s %s',
            noinfo_dir_msg='nothing to drop from %s',
            noinfo_file_msg="no annex'ed content",
            **kwargs):
        r['action'] = 'drop'
        yield r
def _get_targetpaths(ds, content, refds_path, source, jobs):
    # not ready for Path instances...
    content = [str(c) for c in content]
    # hand over to git-annex, get files content,
    # report files in git as 'notneeded' to get
    ds_repo = ds.repo
    # needs to be an annex to get content
    if not isinstance(ds_repo, AnnexRepo):
        for r in results_from_paths(
                content, status='notneeded',
                message="no dataset annex, content already present",
                action='get', type='file',
                logger=lgr, refds=refds_path):
            yield r
        return
    respath_by_status = {}
    try:
        results = ds_repo.get(
            content,
            options=['--from=%s' % source] if source else [],
            jobs=jobs)
    except CommandError as exc:
        results = exc.kwargs.get("stdout_json")
        if not results:
            raise
    for res in results:
        res = annexjson2result(res, ds, type='file', logger=lgr,
                               refds=refds_path)
        success = success_status_map[res['status']]
        # TODO: in case of some failed commands (e.g. get) there might
        # be no path in the record. yoh has only a vague idea of the logic
        # here, so this just checks for 'path'; but according to
        # results_from_annex_noinfo it would then be assumed that
        # `content` was acquired successfully, which is not the case
        if 'path' in res:
            respath_by_status[success] = \
                respath_by_status.get(success, []) + [res['path']]
        yield res

    for r in results_from_annex_noinfo(
            ds, content, respath_by_status,
            dir_fail_msg='could not get some content in %s %s',
            noinfo_dir_msg='nothing to get from %s',
            noinfo_file_msg='already present',
            action='get', logger=lgr,
            refds=refds_path):
        yield r
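# Caller sketch (an assumption, not the actual Get implementation): once the
# requested paths have been grouped by their containing dataset (a
# `content_by_ds` mapping, as produced elsewhere in this code base), each group
# would simply be forwarded to _get_targetpaths and its results yielded onward.
def _get_by_dataset_sketch(content_by_ds, refds_path, source=None, jobs='auto'):
    for ds_path in sorted(content_by_ds):
        yield from _get_targetpaths(
            Dataset(ds_path), content_by_ds[ds_path], refds_path, source, jobs)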
def _drop_files(ds, repo, paths, force=False, jobs=None):
    """Helper to drop content in datasets.

    Parameters
    ----------
    ds : Dataset
    repo : AnnexRepo
    paths : list
      for which files to drop content
    force : bool
      whether to instruct annex to skip the minimum copy availability checks
      (git-annex's --force)
    jobs : int, optional
      how many parallel jobs to instruct git-annex to use

    Yields
    ------
    dict
    """
    cmd = ['drop']
    if force:
        cmd.append('--force')
    if jobs:
        cmd.extend(['--jobs', str(jobs)])

    respath_by_status = {}
    try:
        yield from (
            _postproc_annexdrop_result(res, respath_by_status, ds)
            for res in repo._call_annex_records_items_(cmd, files=paths))
    except CommandError as e:
        # pick up the results captured so far and yield them
        # the error will be amongst them
        yield from (
            _postproc_annexdrop_result(res, respath_by_status, ds)
            for res in e.kwargs.get('stdout_json', []))
    # report on things requested that annex was silent about
    for r in results_from_annex_noinfo(
            ds, paths, respath_by_status,
            dir_fail_msg='could not drop some content in %s %s',
            noinfo_dir_msg='nothing to drop from %s',
            noinfo_file_msg="no annex'ed content"):
        r['action'] = 'drop'
        yield r
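# The post-processing helper referenced above is not shown in this excerpt.
# A minimal sketch of what it might do, inferred from the inline logic of the
# older _drop_files variants above (the body is an assumption, not the actual
# _postproc_annexdrop_result implementation):
def _postproc_annexdrop_result_sketch(res, respath_by_status, ds):
    # convert the raw annex JSON record into a result dict ...
    res = annexjson2result(
        # annex reports are always about files
        res, ds, type='file', action='drop')
    # ... and remember its path grouped by outcome for the noinfo report
    success = success_status_map[res['status']]
    respath_by_status[success] = \
        respath_by_status.get(success, []) + [res['path']]
    return res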
def __call__(
        path=None,
        dataset=None,
        # support passing this through in a path by path basis
        to_git=None,
        save=True,
        message=None,
        message_file=None,
        recursive=False,
        recursion_limit=None,
        ds2super=False,
        git_opts=None,
        annex_opts=None,
        annex_add_opts=None,
        jobs=None):
    # parameter constraints:
    if not path:
        raise InsufficientArgumentsError(
            "insufficient information for adding: requires at least a path")
    refds_path = Interface.get_refds_path(dataset)
    common_report = dict(action='add', logger=lgr, refds=refds_path)

    if message and message_file:
        raise ValueError("Both a message and message file were specified")

    if message_file:
        with open(message_file, "rb") as mfh:
            message = assure_unicode(mfh.read())

    to_add = []
    subds_to_add = {}
    ds_to_annotate_from_recursion = {}
    got_nothing = True
    for ap in AnnotatePaths.__call__(
            path=path,
            dataset=dataset,
            # never recursion, need to handle manually below to be able to
            # discover untracked content
            recursive=False,
            action='add',
            # speed things up by using Git's modification detection, if there
            # is a repo with at least one commit
            modified='HEAD' \
            if dataset and \
            GitRepo.is_valid_repo(refds_path) and \
            GitRepo(refds_path).get_hexsha() \
            else None,
            unavailable_path_status='impossible',
            unavailable_path_msg="path does not exist: %s",
            nondataset_path_status='impossible',
            return_type='generator',
            on_failure='ignore'):
        got_nothing = False
        if ap.get('status', None):
            # this is done
            yield ap
            continue
        if ap.get('parentds', None) is None and ap.get('type', None) != 'dataset':
            yield get_status_dict(
                status='impossible',
                message='there is no dataset to add this path to',
                **dict(common_report, **ap))
            continue
        if ap.get('type', None) == 'directory' and \
                ap.get('state', None) == 'untracked' and \
                GitRepo.is_valid_repo(ap['path']):
            # this is an untracked wannabe subdataset in disguise
            ap['type'] = 'dataset'
        if recursive and \
                (ap.get('raw_input', False) or
                 ap.get('state', None) in ('added', 'modified', 'untracked')) and \
                (ap.get('parentds', None) or ap.get('type', None) == 'dataset'):
            # this was an actually requested input path, or a path that was found
            # modified by path annotation, based on an input argument
            # we need to recurse into all subdirs to find potentially
            # unregistered subdatasets
            # but only if this path has a parent, or is itself a dataset
            # otherwise there is nothing to add to
            _discover_subdatasets_recursively(
                ds_to_annotate_from_recursion,
                ap['path'],
                [ap['parentds'] if 'parentds' in ap else ap['path']],
                recursion_limit)
            # get the file content of the root dataset of this search added too
            # but be careful with extreme recursion_limit settings
            if recursion_limit is None or recursion_limit > 0:
                ap['process_content'] = True
        # record for further processing
        if not ap['path'] in ds_to_annotate_from_recursion:
            # if it was somehow already discovered
            to_add.append(ap)
    if got_nothing:
        # path annotation yielded nothing, most likely cause is that nothing
        # was found modified, we need to say something about the reference
        # dataset
        yield get_status_dict(
            'add',
            status='notneeded',
            path=refds_path,
            type='dataset',
            logger=lgr)
        return

    for subds in ds_to_annotate_from_recursion:
        if subds not in subds_to_add:
            # always prefer the already annotated path
            subds_to_add[subds] = ds_to_annotate_from_recursion[subds]

    if dataset:
        # we have a base dataset, discover any intermediate datasets between
        # the base and any already discovered dataset
        discovered = {}
        discover_dataset_trace_to_targets(
            # from here
            dataset.path,
            # to any dataset we are aware of
            subds_to_add.keys(),
            [],
            discovered)
        for parentds in discovered:
            for subds in discovered[parentds]:
                subds_to_add[subds] = subds_to_add.get(
                    subds,
                    dict(path=subds, parentds=parentds, type='dataset'))

    # merge custom paths and discovered dataset records, paths need to go
    # first, because we know most about them, and in the subsequent
    # annotation call we skip the later duplicate ones
    to_add.extend(subds_to_add.values())
    # and compact, this should be OK as all the info is in each ap dict
    to_add = unique(to_add, lambda x: x['path'])

    if not to_add:
        # nothing left to do, potentially all errored before
        return

    # now re-annotate all paths, this will be fast for already annotated ones
    # and will amend the annotation for others, it will also deduplicate
    annotated_paths = AnnotatePaths.__call__(
        path=to_add,
        dataset=dataset,
        # never recursion, done already
        recursive=False,
        action='add',
        unavailable_path_status='impossible',
        unavailable_path_msg="path does not exist: %s",
        nondataset_path_status='impossible',
        return_type='generator',
        # if there is an error now, we made this mistake in here
        on_failure='stop')

    content_by_ds, ds_props, completed, nondataset_paths = \
        annotated2content_by_ds(
            annotated_paths,
            refds_path=refds_path)
    assert (not completed)

    if not content_by_ds:
        # we should have complained about any inappropriate path argument
        # above, so if nothing is left, we can simply exit
        return

    # simple loop over datasets -- save happens later
    # start deep down
    to_save = []
    for ds_path in sorted(content_by_ds, reverse=True):
        ds = Dataset(ds_path)
        torepoadd = {}
        respath_by_status = {}
        for ap in content_by_ds[ds_path]:
            # we have a new story
            ap.pop('status', None)
            torepoadd[ap['path']] = ap

            # skip anything that doesn't look like a wannabe subdataset
            if not ap.get('type', None) == 'dataset' or \
                    ap['path'] == ds_path:
                continue

            if ap.get('registered_subds', False):
                # subdataset that might be in this list because of the
                # need to save all the way up to a super dataset
                respath_by_status['success'] = \
                    respath_by_status.get('success', []) + [ap['path']]
                yield get_status_dict(
                    status='notneeded',
                    message="already known subdataset",
                    **dict(common_report, **ap))
                continue
            subds = Dataset(ap['path'])
            subds_relpath = relpath(ap['path'], ds_path)
            # Register the repository in the repo tree as a submodule
            try:
                ds.repo.add_submodule(subds_relpath, url=None, name=None)
            except (CommandError, InvalidGitRepositoryError) as e:
                yield get_status_dict(
                    ds=subds, status='error', message=e.stderr,
                    **dict(common_report, **ap))
                continue
            # queue for saving using the updated annotated path
            ap['registered_subds'] = True
            # I hope this is true in direct mode too
            # TODO this is disabled, because in some circumstances
            # staging just doesn't happen, and it is unclear when
            # exactly -- the case that prompted disabling was a submodule
            # that had no content except for other submodules was not staged,
            # whereas another submodule on the same level in the same
            # superdataset which also has one file in it was staged
            # disable to work correctly, while paying a little bit of
            # slow down
            #ap['staged'] = True
            to_save.append(ap)
            # report added subdatasets -- `annex add` below won't do it
            yield get_status_dict(
                ds=subds,
                status='ok',
                message='added new subdataset',
                **dict(common_report, **ap))
            # make sure that .gitmodules is added to the list of files
            gitmodules_path = opj(ds.path, '.gitmodules')
            # for git
            torepoadd[gitmodules_path] = dict(path=gitmodules_path)
            # and for save
            to_save.append(dict(
                path=gitmodules_path,
                parentds=ds_path,
                type='file'))
        # make sure any last minute additions make it to the saving stage
        # XXX? should content_by_ds become OrderedDict so that possible
        # super here gets processed last?
        lgr.debug('Adding content to repo %s: %s', ds.repo, torepoadd)
        is_annex = isinstance(ds.repo, AnnexRepo)
        add_kw = {'jobs': jobs} if is_annex and jobs else {}
        added = ds.repo.add_(
            list(torepoadd.keys()),
            git=to_git if is_annex else True,
            **add_kw)
        for a in added:
            res = annexjson2result(a, ds, type='file', **common_report)
            success = success_status_map[res['status']]
            respath_by_status[success] = \
                respath_by_status.get(success, []) + [res['path']]
            # produce best possible path/result annotation
            if res['path'] in torepoadd:
                # pull out correct ap for any path that comes out here
                # (that we know things about), and use the original annotation
                # instead of just the annex report
                res = dict(torepoadd[res['path']], **res)
            # override this in all cases to be safe
            res['parentds'] = ds.path
            if success:
                # this was successfully added, queue for saving this very path
                # in the dataset
                ap = {k: v for k, v in res.items() if k != 'status'}
                ap['staged'] = True
                # strip any status and state info (e.g. save will refuse to
                # save stuff that is marked state='untracked')
                to_save.append({k: v for k, v in res.items()
                                if k not in ('status', 'state')})
            if a['file'] == '.gitmodules':
                # filter out .gitmodules, because this is only included for
                # technical reasons and has nothing to do with the actual content
                continue
            if GitRepo.is_valid_repo(res['path']):
                # more accurate report in case of an added submodule
                # mountpoint.
                # XXX Actually not sure if this can really happen
                # (depends on what our low-level code would do)
                # but worst case is that we lose a little bit of
                # coverage...
                res['type'] = 'dataset'
                res['message'] = 'added new state as submodule'
            yield res

        for r in results_from_annex_noinfo(
                ds, torepoadd, respath_by_status,
                dir_fail_msg='could not add some content in %s %s',
                noinfo_dir_msg='nothing to add from %s',
                noinfo_file_msg='already included in the dataset',
                action='add',
                logger=lgr,
                refds=refds_path):
            if r['path'] in torepoadd:
                # pull out correct ap for any path that comes out here
                # (that we know things about), and use the original annotation
                # instead of just the annex report
                r = dict(r, **torepoadd[r['path']])
            if r['status'] == 'notneeded':
                # this could be a file that was staged already, it doesn't need
                # to be added, but it should be saved/committed if so desired
                to_save.append({k: v for k, v in r.items()
                                if k not in ('status', 'state')})
            # XXX something is fishy with the next one, rethink when sober....
            if r['path'] == ds_path and r['status'] == 'ok':
                # this is for the entire dataset itself which was explicitly
                # requested, make sure to save all
                r['type'] = 'dataset'
                r['process_content'] = True
                to_save.append({k: v for k, v in r.items() if k != 'status'})
            yield r

        if refds_path and ds_path != refds_path and \
                len(respath_by_status.get('success', [])):
            # TODO XXX we have an issue here with `add('.')` when annex ignores
            # any dotfiles. In this case we end up not saving a dataset
            # completely, because we rely on accurate reporting.
            # there is an issue about this already
            # TODO look up the issue ID
            # if there is a base dataset, but we are below it, and we have
            # anything done to this dataset -> queue dataset itself for
            # saving its state in the parent
            ds_ap = dict(
                path=ds.path,
                # we have to look for the parent here, as we must save the
                # subdataset in the parent and not the whole subdataset itself
                type='dataset')
            parentds = get_dataset_root(normpath(opj(ds.path, pardir)))
            if parentds:
                ds_ap['parentds'] = parentds
            if dataset:
                ds_ap['refds'] = refds_path
            to_save.append(ds_ap)

    if not save:
        lgr.debug('Not calling `save` as instructed')
        return

    # TODO tell save what was staged already! Set 'staged=True' for
    # respective annotated paths that are fed into `save`

    # do not reuse any of the sorting done in here for saving, but instead
    # pass on all the annotated paths to have `save` figure out what to do with
    # them -- this costs something, but should be safer, and frankly is
    # more comprehensible
    for res in Save.__call__(
            # hand-selected annotated paths
            path=to_save,
            dataset=refds_path,
            message=message if message else '[DATALAD] added content',
            return_type='generator',
            result_xfm=None,
            result_filter=None,
            on_failure='ignore'):
        yield res
def __call__(
        path=None,
        dataset=None,
        # support passing this through in a path by path basis
        to_git=None,
        save=True,
        message=None,
        recursive=False,
        recursion_limit=None,
        ds2super=False,
        git_opts=None,
        annex_opts=None,
        annex_add_opts=None,
        jobs=None):
    # parameter constraints:
    if not path:
        raise InsufficientArgumentsError(
            "insufficient information for adding: requires at least a path")
    refds_path = Interface.get_refds_path(dataset)
    common_report = dict(action='add', logger=lgr, refds=refds_path)

    to_add = []
    subds_to_add = {}
    ds_to_annotate_from_recursion = {}
    got_nothing = True
    for ap in AnnotatePaths.__call__(
            path=path,
            dataset=dataset,
            # never recursion, need to handle manually below to be able to
            # discover untracked content
            recursive=False,
            action='add',
            # speed things up by using Git's modification detection, if there
            # is a repo with at least one commit
            modified='HEAD' \
            if dataset and \
            GitRepo.is_valid_repo(refds_path) and \
            GitRepo(refds_path).get_hexsha() \
            else None,
            unavailable_path_status='impossible',
            unavailable_path_msg="path does not exist: %s",
            nondataset_path_status='impossible',
            return_type='generator',
            on_failure='ignore'):
        got_nothing = False
        if ap.get('status', None):
            # this is done
            yield ap
            continue
        if ap.get('parentds', None) is None and ap.get('type', None) != 'dataset':
            yield get_status_dict(
                status='impossible',
                message='there is no dataset to add this path to',
                **dict(common_report, **ap))
            continue
        if ap.get('type', None) == 'directory' and \
                ap.get('state', None) == 'untracked' and \
                GitRepo.is_valid_repo(ap['path']):
            # this is an untracked wannabe subdataset in disguise
            ap['type'] = 'dataset'
        if recursive and \
                (ap.get('raw_input', False) or
                 ap.get('state', None) in ('added', 'modified', 'untracked')) and \
                (ap.get('parentds', None) or ap.get('type', None) == 'dataset'):
            # this was an actually requested input path, or a path that was found
            # modified by path annotation, based on an input argument
            # we need to recurse into all subdirs to find potentially
            # unregistered subdatasets
            # but only if this path has a parent, or is itself a dataset
            # otherwise there is nothing to add to
            _discover_subdatasets_recursively(
                ds_to_annotate_from_recursion,
                ap['path'],
                [ap['parentds'] if 'parentds' in ap else ap['path']],
                recursion_limit)
            # get the file content of the root dataset of this search added too
            # but be careful with extreme recursion_limit settings
            if recursion_limit is None or recursion_limit > 0:
                ap['process_content'] = True
        # record for further processing
        if not ap['path'] in ds_to_annotate_from_recursion:
            # if it was somehow already discovered
            to_add.append(ap)
        # TODO check if next isn't covered by discover_dataset_trace_to_targets already??
        if dataset and ap.get('type', None) == 'dataset':
            # duplicates not possible, annotated_paths returns unique paths
            subds_to_add[ap['path']] = ap
    if got_nothing:
        # path annotation yielded nothing, most likely cause is that nothing
        # was found modified, we need to say something about the reference
        # dataset
        yield get_status_dict(
            'add',
            status='notneeded',
            path=refds_path,
            type='dataset',
            logger=lgr)
        return

    for subds in ds_to_annotate_from_recursion:
        if subds not in subds_to_add:
            # always prefer the already annotated path
            subds_to_add[subds] = ds_to_annotate_from_recursion[subds]

    if dataset:
        # we have a base dataset, discover any intermediate datasets between
        # the base and any already discovered dataset
        discovered = {}
        discover_dataset_trace_to_targets(
            # from here
            dataset.path,
            # to any dataset we are aware of
            subds_to_add.keys(),
            [],
            discovered)
        for parentds in discovered:
            for subds in discovered[parentds]:
                subds_to_add[subds] = subds_to_add.get(
                    subds,
                    dict(path=subds, parentds=parentds, type='dataset'))

    # merge custom paths and discovered dataset records, paths need to go
    # first, because we know most about them, and in the subsequent
    # annotation call we skip the later duplicate ones
    to_add.extend(subds_to_add.values())
    # and compact, this should be OK as all the info is in each ap dict
    to_add = unique(to_add, lambda x: x['path'])

    if not to_add:
        # nothing left to do, potentially all errored before
        return

    # now re-annotate all paths, this will be fast for already annotated ones
    # and will amend the annotation for others, it will also deduplicate
    annotated_paths = AnnotatePaths.__call__(
        path=to_add,
        dataset=dataset,
        # never recursion, done already
        recursive=False,
        action='add',
        unavailable_path_status='impossible',
        unavailable_path_msg="path does not exist: %s",
        nondataset_path_status='impossible',
        return_type='generator',
        # if there is an error now, we made this mistake in here
        on_failure='stop')

    content_by_ds, ds_props, completed, nondataset_paths = \
        annotated2content_by_ds(
            annotated_paths,
            refds_path=refds_path)
    assert (not completed)

    if not content_by_ds:
        # we should have complained about any inappropriate path argument
        # above, so if nothing is left, we can simply exit
        return

    # simple loop over datasets -- save happens later
    # start deep down
    to_save = []
    for ds_path in sorted(content_by_ds, reverse=True):
        ds = Dataset(ds_path)
        torepoadd = {}
        respath_by_status = {}
        for ap in content_by_ds[ds_path]:
            # we have a new story
            ap.pop('status', None)
            torepoadd[ap['path']] = ap

            # skip anything that doesn't look like a wannabe subdataset
            if not ap.get('type', None) == 'dataset' or \
                    ap['path'] == ds_path:
                continue

            if ap.get('registered_subds', False):
                # subdataset that might be in this list because of the
                # need to save all the way up to a super dataset
                respath_by_status['success'] = \
                    respath_by_status.get('success', []) + [ap['path']]
                yield get_status_dict(
                    status='notneeded',
                    message="already known subdataset",
                    **dict(common_report, **ap))
                continue
            subds = Dataset(ap['path'])
            # check that the subds has a commit, and refuse
            # to operate on it otherwise, or we would get a bastard
            # submodule that cripples git operations
            if not subds.repo.get_hexsha():
                yield get_status_dict(
                    ds=subds, status='impossible',
                    message='cannot add subdataset with no commits',
                    **dict(common_report, **ap))
                continue
            subds_relpath = relpath(ap['path'], ds_path)
            # make an attempt to configure a submodule source URL based on the
            # discovered remote configuration
            remote, branch = subds.repo.get_tracking_branch()
            subds_url = subds.repo.get_remote_url(remote) if remote else None
            # Register the repository in the repo tree as a submodule
            try:
                ds.repo.add_submodule(subds_relpath, url=subds_url, name=None)
            except CommandError as e:
                yield get_status_dict(
                    ds=subds, status='error', message=e.stderr,
                    **dict(common_report, **ap))
                continue
            # queue for saving using the updated annotated path
            ap['registered_subds'] = True
            # I hope this is true in direct mode too
            # TODO this is disabled, because in some circumstances
            # staging just doesn't happen, and it is unclear when
            # exactly -- the case that prompted disabling was a submodule
            # that had no content except for other submodules was not staged,
            # whereas another submodule on the same level in the same
            # superdataset which also has one file in it was staged
            # disable to work correctly, while paying a little bit of
            # slow down
            #ap['staged'] = True
            to_save.append(ap)
            _fixup_submodule_dotgit_setup(ds, subds_relpath)
            # report added subdatasets -- `annex add` below won't do it
            yield get_status_dict(
                ds=subds,
                status='ok',
                message='added new subdataset',
                **dict(common_report, **ap))
            # make sure that .gitmodules is added to the list of files
            gitmodules_path = opj(ds.path, '.gitmodules')
            # for git
            torepoadd[gitmodules_path] = dict(path=gitmodules_path)
            # and for save
            to_save.append(dict(
                path=gitmodules_path,
                parentds=ds_path,
                type='file'))
        # make sure any last minute additions make it to the saving stage
        # XXX? should content_by_ds become OrderedDict so that possible
        # super here gets processed last?
        lgr.debug('Adding content to repo %s: %s', ds.repo, torepoadd)
        is_annex = isinstance(ds.repo, AnnexRepo)
        add_kw = {'jobs': jobs} if is_annex and jobs else {}
        added = ds.repo.add(
            list(torepoadd.keys()),
            git=to_git if is_annex else True,
            **add_kw)
        for a in added:
            res = annexjson2result(a, ds, type='file', **common_report)
            success = success_status_map[res['status']]
            respath_by_status[success] = \
                respath_by_status.get(success, []) + [res['path']]
            # produce best possible path/result annotation
            if res['path'] in torepoadd:
                # pull out correct ap for any path that comes out here
                # (that we know things about), and use the original annotation
                # instead of just the annex report
                res = dict(torepoadd[res['path']], **res)
            # override this in all cases to be safe
            res['parentds'] = ds.path
            if success:
                # this was successfully added, queue for saving this very path
                # in the dataset
                ap = {k: v for k, v in res.items() if k != 'status'}
                ap['staged'] = True
                # strip any status and state info (e.g. save will refuse to
                # save stuff that is marked state='untracked')
                to_save.append({k: v for k, v in res.items()
                                if k not in ('status', 'state')})
            if a['file'] == '.gitmodules':
                # filter out .gitmodules, because this is only included for
                # technical reasons and has nothing to do with the actual content
                continue
            if GitRepo.is_valid_repo(res['path']):
                # more accurate report in case of an added submodule
                # mountpoint.
                # XXX Actually not sure if this can really happen
                # (depends on what our low-level code would do)
                # but worst case is that we lose a little bit of
                # coverage...
                res['type'] = 'dataset'
                res['message'] = 'added new state as submodule'
            yield res

        for r in results_from_annex_noinfo(
                ds, torepoadd, respath_by_status,
                dir_fail_msg='could not add some content in %s %s',
                noinfo_dir_msg='nothing to add from %s',
                noinfo_file_msg='already included in the dataset',
                action='add',
                logger=lgr,
                refds=refds_path):
            if r['path'] in torepoadd:
                # pull out correct ap for any path that comes out here
                # (that we know things about), and use the original annotation
                # instead of just the annex report
                r = dict(r, **torepoadd[r['path']])
            if r['status'] == 'notneeded':
                # this could be a file that was staged already, it doesn't need
                # to be added, but it should be saved/committed if so desired
                to_save.append({k: v for k, v in r.items()
                                if k not in ('status', 'state')})
            # XXX something is fishy with the next one, rethink when sober....
            if r['path'] == ds_path and r['status'] == 'ok':
                # this is for the entire dataset itself which was explicitly
                # requested, make sure to save all
                r['type'] = 'dataset'
                r['process_content'] = True
                to_save.append({k: v for k, v in r.items() if k != 'status'})
            yield r

        if refds_path and ds_path != refds_path and \
                len(respath_by_status.get('success', [])):
            # TODO XXX we have an issue here with `add('.')` when annex ignores
            # any dotfiles. In this case we end up not saving a dataset
            # completely, because we rely on accurate reporting.
            # there is an issue about this already
            # TODO look up the issue ID
            # if there is a base dataset, but we are below it, and we have
            # anything done to this dataset -> queue dataset itself for
            # saving its state in the parent
            ds_ap = dict(
                path=ds.path,
                # we have to look for the parent here, as we must save the
                # subdataset in the parent and not the whole subdataset itself
                type='dataset')
            parentds = get_dataset_root(normpath(opj(ds.path, pardir)))
            if parentds:
                ds_ap['parentds'] = parentds
            if dataset:
                ds_ap['refds'] = refds_path
            to_save.append(ds_ap)

    if not save:
        lgr.debug('Not calling `save` as instructed')
        return

    # TODO tell save what was staged already! Set 'staged=True' for
    # respective annotated paths that are fed into `save`

    # do not reuse any of the sorting done in here for saving, but instead
    # pass on all the annotated paths to have `save` figure out what to do with
    # them -- this costs something, but should be safer, and frankly is
    # more comprehensible
    for res in Save.__call__(
            # hand-selected annotated paths
            path=to_save,
            dataset=refds_path,
            message=message if message else '[DATALAD] added content',
            return_type='generator',
            result_xfm=None,
            result_filter=None,
            on_failure='ignore'):
        yield res
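# Aside (illustrative, standalone): the `respath_by_status` bookkeeping used in
# the add/get/drop implementations here is a plain dict of path lists keyed by
# a success/failure label, which results_from_annex_noinfo later consults to
# decide what annex stayed silent about. The statuses, label mapping, and paths
# below are made up for demonstration.
respath_by_status_demo = {}
for status, path in [('ok', '/ds/a.dat'), ('error', '/ds/b.dat'), ('ok', '/ds/c.dat')]:
    label = 'success' if status == 'ok' else 'failure'
    respath_by_status_demo[label] = \
        respath_by_status_demo.get(label, []) + [path]
assert respath_by_status_demo == {'success': ['/ds/a.dat', '/ds/c.dat'],
                                  'failure': ['/ds/b.dat']}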
def __call__(
        path=None,
        source=None,
        dataset=None,
        recursive=False,
        recursion_limit=None,
        get_data=True,
        description=None,
        reckless=False,
        #git_opts=None,
        #annex_opts=None,
        #annex_get_opts=None,
        jobs='auto',
        verbose=False,
):
    # IMPLEMENTATION CONCEPT:
    #
    # 1. Sort the world into existing handles and the rest
    # 2. Try to locate missing handles (obtain subdatasets along the way)
    # 3. Expand into subdatasets with recursion enabled (potentially
    #    obtain even more subdatasets)
    # 4. Shoot info of which handles to get in each subdataset to
    #    git-annex, once at the very end

    refds_path = Interface.get_refds_path(dataset)
    if not (dataset or path):
        raise InsufficientArgumentsError(
            "Neither dataset nor target path(s) provided")
    if dataset and not path:
        # act on the whole dataset if nothing else was specified
        path = refds_path

    # remember which results we already reported, to avoid duplicates
    yielded_ds = []
    to_get = []
    unavailable_paths = []
    for ap in AnnotatePaths.__call__(
            path=path,
            dataset=refds_path,
            recursive=recursive,
            recursion_limit=recursion_limit,
            action='get',
            # NOTE: Do not act upon unavailable paths yet! Done below after
            # testing which ones could be obtained
            unavailable_path_status='',
            nondataset_path_status='impossible',
            return_type='generator',
            on_failure='ignore'):
        if ap.get('status', None):
            # we know what to report already
            yield ap
            continue
        if ap.get('state', None) == 'absent' and ap.get('raw_input', False):
            # if this wasn't found, but directly requested, queue for further
            # exploration
            unavailable_paths.append(ap)
            continue
        if ap.get('type', None) == 'dataset' and \
                GitRepo.is_valid_repo(ap['path']) and \
                not ap['path'] == refds_path:
            # do not report what hasn't arrived yet
            # also do not report the base dataset that is already
            # present -- no surprise
            yield dict(ap, status='notneeded', logger=lgr,
                       message='already installed')
            yielded_ds.append(ap['path'])
            ap['process_content'] = get_data
        to_get.append(ap)

    # explore the unknown
    for ap in sorted(unavailable_paths, key=lambda x: x['path']):
        lgr.debug("Investigate yet unavailable path %s", ap)
        # how close can we get?
        dspath = ap.get('parentds', get_dataset_root(ap['path']))
        if dspath is None:
            # nothing we can do for this path
            continue
        lgr.debug("Found containing dataset %s for path %s",
                  dspath, ap['path'])
        ds = Dataset(dspath)
        # now actually obtain whatever is necessary to get to this path
        containing_ds = [dspath]
        for res in _install_necessary_subdatasets(
                ds, ap['path'], reckless, refds_path,
                description=description):
            # yield immediately so errors could be acted upon outside, before
            # we continue
            if not (res['type'] == 'dataset' and
                    res['path'] in yielded_ds):
                # unless we reported on this dataset before
                if res['type'] == 'dataset':
                    # make a record, recursive below might now want to report
                    # a 'notneeded'
                    yielded_ds.append(res['path'])
                yield res
            # update to the current innermost dataset
            containing_ds.append(res['path'])

        if len(containing_ds) < 2:
            # no subdataset was installed, hence if the path was unavailable
            # before it still is, no need to bother git annex
            ap.update(status='impossible', message='path does not exist')
            yield ap
            continue
        # important to only do the next for the innermost subdataset
        # as the `recursive` logic below relies on that!
        # set the correct parent, for a dataset this would be the second-last
        # reported subdataset
        ap.update(parentds=containing_ds[-1])
        if containing_ds[-1] == ap['path']:
            # the path actually refers to the last installed dataset
            ap.update(parentds=containing_ds[-2],
                      process_content=get_data,
                      type='dataset')
        to_get.append(ap)

    # results of recursive installation of yet undiscovered datasets
    rec_get = []
    if recursive and not recursion_limit == 'existing':
        # obtain any subdatasets underneath the paths given inside the
        # subdatasets that we know already exist
        # unless we do not want recursion into not-yet-installed datasets
        for ap in sorted(to_get, key=lambda x: x['path']):
            if ap['type'] not in ('dataset', 'directory') or \
                    not ap.get('raw_input', False):
                # a non-directory cannot have content underneath
                # also we do NOT want to recurse into anything that was specifically
                # requested, to avoid duplication
                continue
            subds = Dataset(ap['path']
                            if ap['type'] == 'dataset'
                            else ap['parentds'])
            lgr.info("Installing %s%s recursively",
                     subds,
                     (" underneath %s" % ap['path']
                      if subds.path != ap['path']
                      else ""))
            for res in _recursive_install_subds_underneath(
                    subds,
                    # `ap['path']` was explicitly given as input
                    # we count recursions from the input, hence we
                    # can start with the full number
                    recursion_limit,
                    reckless,
                    start=ap['path'],
                    refds_path=refds_path,
                    description=description):
                # yield immediately so errors could be acted upon
                # outside, before we continue
                if not (res['type'] == 'dataset' and
                        res['path'] in yielded_ds):
                    # unless we reported on this dataset before
                    if res['type'] == 'dataset':
                        # make a record
                        yielded_ds.append(res['path'])
                    yield res
                if not (res['status'] == 'ok' and
                        res['type'] == 'dataset'):
                    # not a dataset that was just installed, we just reported it
                    # upstairs, and can ignore it from now on
                    continue
                # paranoia, so popular these days...
                assert GitRepo.is_valid_repo(res['path'])
                # keep a copy of the install record for `get` later on
                get_ap = {k: v for k, v in res.items() if not k == 'status'}
                get_ap['process_content'] = get_data
                rec_get.append(get_ap)

    if not get_data:
        # done already
        return

    # merge the two AP lists
    to_get.extend(rec_get)

    # sort into datasets
    content_by_ds, ds_props, completed, nondataset_paths = \
        annotated2content_by_ds(
            to_get,
            refds_path=refds_path)
    assert (not completed)

    # hand over to git-annex, get files content,
    # report files in git as 'notneeded' to get
    for ds_path in sorted(content_by_ds.keys()):
        ds = Dataset(ds_path)
        # grab content, ignore subdataset entries
        content = [ap['path'] for ap in content_by_ds[ds_path]
                   if ap.get('type', None) != 'dataset' or
                   ap['path'] == ds.path]
        if not content:
            # cut this short should there be nothing
            continue
        # needs to be an annex to get content
        if not isinstance(ds.repo, AnnexRepo):
            for r in results_from_paths(
                    content, status='notneeded',
                    message="no dataset annex, content already present",
                    action='get', logger=lgr,
                    refds=refds_path):
                yield r
            continue
        respath_by_status = {}
        for res in ds.repo.get(
                content,
                options=['--from=%s' % source] if source else [],
                jobs=jobs):
            res = annexjson2result(res, ds, type='file', logger=lgr,
                                   refds=refds_path)
            success = success_status_map[res['status']]
            # TODO: in case of some failed commands (e.g. get) there might
            # be no path in the record. yoh has only a vague idea of the logic
            # here, so this just checks for 'path'; but according to
            # results_from_annex_noinfo it would then be assumed that
            # `content` was acquired successfully, which is not the case
            if 'path' in res:
                respath_by_status[success] = \
                    respath_by_status.get(success, []) + [res['path']]
            yield res

        for r in results_from_annex_noinfo(
                ds, content, respath_by_status,
                dir_fail_msg='could not get some content in %s %s',
                noinfo_dir_msg='nothing to get from %s',
                noinfo_file_msg='already present',
                action='get', logger=lgr,
                refds=refds_path):
            yield r