def __call__(
        path=None,
        dataset=None,
        recursive=False,
        recursion_limit=None,
        check=True,
        if_dirty='save-before'):

    if dataset and not path:
        # act on the whole dataset if nothing else was specified
        path = dataset.path if isinstance(dataset, Dataset) else dataset
    content_by_ds, unavailable_paths = Interface._prep(
        path=path,
        dataset=dataset,
        recursive=recursive,
        recursion_limit=recursion_limit)
    handle_dirty_datasets(
        content_by_ds.keys(), mode=if_dirty, base=dataset)

    results = []

    # iterate over all datasets, order doesn't matter
    for ds_path in content_by_ds:
        ds = Dataset(ds_path)
        paths = content_by_ds[ds_path]
        res = _drop_files(ds, paths, check=check)
        results.extend(res)
    # there is nothing to save at the end
    return results
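
# Illustrative sketch only (hypothetical paths, not output of the real
# `Interface._prep`): all commands below operate on a `content_by_ds`
# mapping from a dataset's root path to the list of paths to act on
# inside that dataset. A value equal to the dataset root itself means
# the whole dataset is targeted.
example_content_by_ds = {
    '/home/me/ds': ['/home/me/ds/file1.dat'],
    '/home/me/ds/subds': ['/home/me/ds/subds'],  # entire subdataset
}
for ds_root, paths in example_content_by_ds.items():
    print(ds_root, '->', paths)
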
def __call__(
        path=None,
        dataset=None,
        recursive=False,
        check=True,
        if_dirty='save-before'):

    if dataset and not path:
        # act on the whole dataset if nothing else was specified
        path = dataset.path if isinstance(dataset, Dataset) else dataset
    content_by_ds, unavailable_paths = Interface._prep(
        path=path,
        dataset=dataset,
        recursive=recursive)
    if unavailable_paths:
        lgr.warning('ignored non-installed paths: %s', unavailable_paths)
    # upfront sanity and compliance checks
    if path_is_under(content_by_ds.keys()):
        # behave like `rm` and refuse to remove where we are
        raise ValueError(
            "refusing to uninstall current or parent directory")
    # check that we were given no top-level datasets and no files to process
    args_ok = True
    for ds_path in content_by_ds:
        ds = Dataset(ds_path)
        paths = content_by_ds[ds_path]
        if ds_path not in paths:
            lgr.error(
                "will not act on files at %s (consider the `drop` command)",
                paths)
            args_ok = False
        if not ds.get_superdataset(
                datalad_only=False,
                topmost=False):
            lgr.error(
                "will not uninstall top-level dataset at %s (consider the `remove` command)",
                ds.path)
            args_ok = False
    if not args_ok:
        raise ValueError(
            'inappropriate arguments, see previous error message(s)')

    handle_dirty_datasets(
        content_by_ds, mode=if_dirty, base=dataset)

    results = []

    # iterate over all datasets, starting at the bottom
    # to deinit contained submodules first
    for ds_path in sorted(content_by_ds, reverse=True):
        ds = Dataset(ds_path)
        paths = content_by_ds[ds_path]
        results.extend(
            # we confirmed the super dataset presence above
            _uninstall_dataset(ds, check=check, has_super=True))
    # there is nothing to save at the end
    return results
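
# A minimal sketch (toy paths, assumptions only) of why the bottom-up loops
# here rely on `sorted(content_by_ds, reverse=True)`: a subdataset's root
# path sorts lexicographically after its superdataset's root, so reversing
# the order visits the deepest datasets first and contained submodules are
# deinitialized/uninstalled before their parents.
toy_dataset_roots = ['/ds', '/ds/sub1', '/ds/sub1/subsub', '/ds/sub2']
print(sorted(toy_dataset_roots, reverse=True))
# -> ['/ds/sub2', '/ds/sub1/subsub', '/ds/sub1', '/ds']
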
def __call__(
        path=None,
        name=None,
        merge=False,
        dataset=None,
        recursive=False,
        recursion_limit=None,
        fetch_all=False,
        reobtain_data=False):
    """
    """
    if reobtain_data:
        # TODO: properly define what to do
        raise NotImplementedError("TODO: Option '--reobtain-data' not "
                                  "implemented yet.")

    if dataset and not path:
        # act on the whole dataset if nothing else was specified
        path = dataset.path if isinstance(dataset, Dataset) else dataset
    content_by_ds, unavailable_paths = Interface._prep(
        path=path,
        dataset=dataset,
        recursive=recursive,
        recursion_limit=recursion_limit)

    # TODO: check parsed inputs if any paths within a dataset were given
    # and issue a message that we will update the associated dataset as a whole
    # or fail -- see #1185 for a potential discussion
    results = []

    for ds_path in content_by_ds:
        ds = Dataset(ds_path)
        repo = ds.repo
        # get all remotes which have references (would exclude
        # special remotes)
        remotes = repo.get_remotes(with_refs_only=True)
        if not remotes:
            lgr.debug("No siblings known to dataset at %s\nSkipping",
                      repo.path)
            continue
        if name and name not in remotes:
            lgr.warning("'%s' not known to dataset %s\nSkipping",
                        name, repo.path)
            continue

        # Currently '--merge' works for a single remote only:
        # TODO: - condition still incomplete
        #       - We can merge if a remote was given or there is a
        #         tracking branch
        #       - we also can fetch all remotes independently of whether or
        #         not we merge a certain remote
        if not name and len(remotes) > 1 and merge:
            lgr.debug("Found multiple remotes:\n%s" % remotes)
            raise NotImplementedError("No merge strategy for multiple "
                                      "remotes implemented yet.")
        lgr.info("Updating dataset '%s' ..." % repo.path)
        _update_repo(repo, name, merge, fetch_all)
def __call__(path=None, dataset=None, to=None, since=None, missing='fail',
             force=False, recursive=False, recursion_limit=None,
             git_opts=None, annex_opts=None, annex_copy_opts=None,
             jobs=None):

    # if ever we get a mode, for "with-data" we would need this
    #if dataset and not path:
    #    # act on the whole dataset if nothing else was specified
    #    path = dataset.path if isinstance(dataset, Dataset) else dataset

    if not dataset and not path:
        # try to find a dataset in PWD
        dataset = require_dataset(
            None, check_installed=True, purpose='publishing')

    if since and not dataset:
        raise InsufficientArgumentsError(
            'Modification detection (--since) without a base dataset '
            'is not supported')

    content_by_ds, unavailable_paths = Interface._prep(
        path=path,
        dataset=dataset,
        recursive=recursive,
        recursion_limit=recursion_limit,
        # for this command we do not want to state that we want to publish
        # content by default by assigning paths for each sub-dataset
        # automagically. But if paths were provided -- sorting would
        # happen to point only to the submodules under those paths, and
        # then to stay consistent we want to copy those paths' data
        sub_paths=bool(path))
    if unavailable_paths:
        raise ValueError(
            'cannot publish content that is not available locally: %s'
            % ', '.join(unavailable_paths))

    # here is the plan
    # 1. figure out remote to publish to
    # 2. figure out which content needs to be published to this remote
    # 3. look for any pre-publication dependencies of that remote
    #    (i.e. remotes that need to be published to before)
    # 4. publish the content needed to go to the primary remote to
    #    the dependencies first, and to the primary afterwards
    ds_remote_info = {}

    lgr.debug("Evaluating %i dataset publication candidate(s)",
              len(content_by_ds))
    # TODO: fancier sorting, so we still follow somewhat the hierarchy
    #       in sorted order, e.g.
    #  d1/sub1/sub1
    #  d1/sub1
    #  d1
    #  d2/sub1
    #  d2
    content_by_ds = OrderedDict(
        (d, content_by_ds[d]) for d in sorted(content_by_ds, reverse=True))

    for ds_path in content_by_ds:
        ds = Dataset(ds_path)
        if to is None:
            # we need an upstream remote, if there's none given. We could
            # wait for git push to complain, but we need to explicitly
            # figure it out for pushing the annex branch anyway and we might
            # as well fail right here.
            track_remote, track_refspec = ds.repo.get_tracking_branch()
            if not track_remote:
                # no tracking remote configured, but let's try one more thing:
                # if we only have one remote, and it has a push target
                # configured, that is "good enough" for us
                cand_remotes = [r for r in ds.repo.get_remotes()
                                if 'remote.{}.push'.format(r) in ds.config]
                if len(cand_remotes) > 1:
                    lgr.warning('Target sibling ambiguous, please specify via --to')
                elif len(cand_remotes) == 1:
                    track_remote = cand_remotes[0]
                else:
                    lgr.warning(
                        'No target sibling configured for default publication, '
                        'please specify via --to')
            if track_remote:
                ds_remote_info[ds_path] = dict(zip(
                    ('remote', 'refspec'),
                    (track_remote, track_refspec)))
            elif missing == 'skip':
                lgr.warning(
                    'Cannot determine target sibling, skipping %s', ds)
                ds_remote_info[ds_path] = None
            else:
                # we have no remote given and no upstream => fail
                raise InsufficientArgumentsError(
                    'Cannot determine target sibling for %s' % (ds,))
        elif to not in ds.repo.get_remotes():
            # unknown given remote
            if missing == 'skip':
                lgr.warning(
                    "Unknown target sibling '%s', skipping %s", to, ds)
                ds_remote_info[ds_path] = None
            elif missing == 'inherit':
                superds = ds.get_superdataset()
                if not superds:
                    raise RuntimeError(
                        "%s has no super-dataset to inherit settings for the remote %s"
                        % (ds, to))
                # XXX due to the difference between create-sibling and
                # create-sibling-github it would not be as transparent to
                # inherit for -github
                lgr.info(
                    "Will try to create a sibling inheriting settings from %s",
                    superds)
                # XXX explicit None as sshurl for now
                ds.create_sibling(None, name=to, inherit=True)
                ds_remote_info[ds_path] = {'remote': to}
            else:
                raise ValueError(
                    "Unknown target sibling '%s' for %s" % (to, ds))
        else:
            # all good: remote given and is known
            ds_remote_info[ds_path] = {'remote': to}

    if dataset and since:
        # remove all unmodified components from the spec
        lgr.debug("Testing %i dataset(s) for modifications since '%s'",
                  len(content_by_ds), since)
        content_by_ds = filter_unmodified(content_by_ds, dataset, since)

    lgr.debug("Attempt to publish %i datasets", len(content_by_ds))
    published, skipped = [], []
    for ds_path in content_by_ds:
        remote_info = ds_remote_info[ds_path]
        if not remote_info:
            # in case we are skipping
            lgr.debug("Skipping dataset at '%s'", ds_path)
            continue
        # and publish
        ds = Dataset(ds_path)
        pblsh, skp = _publish_dataset(
            ds,
            remote=remote_info['remote'],
            refspec=remote_info.get('refspec', None),
            paths=content_by_ds[ds_path],
            annex_copy_options=annex_copy_opts,
            force=force,
            jobs=jobs)
        published.extend(pblsh)
        skipped.extend(skp)
    return published, skipped
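
# Hypothetical usage sketch (not part of this module): invoking publish
# through the datalad Python API with the parameters defined above. The
# dataset path, sibling name, and `since` value are made up for
# illustration; `to` must name a known sibling unless missing='skip' or
# missing='inherit' is used. Per the code above, a tuple of published and
# skipped items is returned.
import datalad.api as dl

published, skipped = dl.publish(
    dataset='/home/me/ds',  # hypothetical base dataset
    to='myserver',          # hypothetical sibling name
    since='HEAD~1',         # only publish datasets modified since this state
    recursive=True)
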
def __call__(
        path=None,
        dataset=None,
        to_git=False,
        save=True,
        recursive=False,
        recursion_limit=None,
        ds2super=False,
        git_opts=None,
        annex_opts=None,
        annex_add_opts=None,
        jobs=None):

    # parameter constraints:
    if not path:
        raise InsufficientArgumentsError(
            "insufficient information for adding: requires at least a path")
    # never recurse here, need to handle recursion manually below to be able
    # to discover untracked content
    content_by_ds, unavailable_paths = Interface._prep(
        path=path,
        dataset=dataset,
        recursive=False)
    if unavailable_paths:
        lgr.warning("ignoring non-existent path(s): %s",
                    unavailable_paths)
    if recursive:
        # with --recursive for each input path traverse the directory
        # tree, when we find a dataset, add it to the spec, AND add it as
        # a path to the spec of the parent
        # MIH: wrap in list() to avoid exception, because dict size might
        # change, but we want to loop over all that are in at the start
        # only
        for d in list(content_by_ds.keys()):
            for p in content_by_ds[d]:
                _discover_subdatasets_recursively(
                    p, [d], content_by_ds, recursion_limit)

    if not content_by_ds:
        raise InsufficientArgumentsError(
            "no existing content given to add")

    if dataset:
        # remember the datasets associated with actual inputs
        input_ds = list(content_by_ds.keys())
        # forge chain from base dataset to any leaf dataset
        _discover_trace_to_known(dataset.path, [], content_by_ds)
        if ds2super:
            # now check all dataset entries corresponding to the original
            # input to see if they contain their own paths and remove them
            for inpds in input_ds:
                content_by_ds[inpds] = [p for p in content_by_ds[inpds]
                                        if not p == inpds]
            # and lastly remove all entries that contain no path to avoid
            # saving any staged content in the final step
            content_by_ds = {d: v for d, v in content_by_ds.items() if v}

    results = []
    # simple loop over datasets -- save happens later
    # start deep down
    for ds_path in sorted(content_by_ds, reverse=True):
        ds = Dataset(ds_path)
        toadd = list(set(content_by_ds[ds_path]))
        # handle anything that looks like a wannabe subdataset
        for subds_path in [d for d in toadd
                           if GitRepo.is_valid_repo(d) and
                           d != ds_path and
                           d not in ds.get_subdatasets(
                               recursive=False,
                               absolute=True,
                               fulfilled=True)]:
            # TODO add check that the subds has a commit, and refuse
            # to operate on it otherwise, or we would get a bastard
            # submodule that cripples git operations
            _install_subds_inplace(
                ds=ds,
                path=subds_path,
                relativepath=relpath(subds_path, ds_path))
            # make sure that .gitmodules is added to the list of files
            toadd.append(opj(ds.path, '.gitmodules'))
            # report added subdatasets -- add below won't do it
            results.append({
                'success': True,
                'file': Dataset(subds_path)})
        # make sure any last minute additions make it to the saving stage
        content_by_ds[ds_path] = toadd
        added = ds.repo.add(
            toadd,
            git=to_git if isinstance(ds.repo, AnnexRepo) else True,
            commit=False)
        for a in added:
            a['file'] = opj(ds_path, a['file'])
        results.extend(added)

    if results and save:
        save_dataset_hierarchy(
            content_by_ds,
            base=dataset.path if dataset and dataset.is_installed() else None,
            message='[DATALAD] added content')
    return results
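
# Hypothetical usage sketch for the add command above; the paths are made
# up. `to_git=True` commits matching files directly to git instead of the
# annex, and `save=True` triggers the trailing save of the dataset
# hierarchy shown at the end of the function.
import datalad.api as dl

added = dl.add(
    path='/home/me/ds/code/script.py',  # hypothetical file
    dataset='/home/me/ds',              # hypothetical base dataset
    to_git=True,
    save=True)
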
def __call__(
        path=None,
        dataset=None,
        recursive=False,
        check=True,
        if_dirty='save-before'):

    if dataset:
        dataset = require_dataset(
            dataset, check_installed=False, purpose='removal')
        if not dataset.is_installed() and not path:
            # all done already
            return []
        if not path:
            # act on the whole dataset if nothing else was specified
            path = dataset.path if isinstance(dataset, Dataset) else dataset
    content_by_ds, unavailable_paths = Interface._prep(
        path=path,
        dataset=dataset,
        recursive=recursive)
    nonexistent_paths = []
    for p in unavailable_paths:
        # we need to check whether any of these correspond
        # to a known subdataset, and add those to the list of
        # things to be removed
        toppath = get_dataset_root(p)
        if not toppath:
            nonexistent_paths.append(p)
            continue
        if p in Dataset(toppath).get_subdatasets(
                recursive=False, absolute=True):
            # this is a known subdataset that needs to be removed
            pl = content_by_ds.get(p, [])
            pl.append(p)
            content_by_ds[p] = pl
    if nonexistent_paths:
        lgr.warning("ignoring non-existent path(s): %s",
                    nonexistent_paths)

    if path_is_under(content_by_ds):
        # behave like `rm` and refuse to remove where we are
        raise ValueError(
            "refusing to uninstall current or parent directory")

    handle_dirty_datasets(
        content_by_ds, mode=if_dirty, base=dataset)

    ds2save = set()
    results = []

    # iterate over all datasets, starting at the bottom
    # to make the removal of dataset content known upstairs
    for ds_path in sorted(content_by_ds, reverse=True):
        ds = Dataset(ds_path)
        paths = content_by_ds[ds_path]
        if ds_path in paths:
            # entire dataset needs to go
            superds = ds.get_superdataset(
                datalad_only=False,
                topmost=False)
            res = _uninstall_dataset(ds, check=check, has_super=False)
            results.extend(res)
            if ds.path in ds2save:
                # we just uninstalled it, no need to save anything
                ds2save.discard(ds.path)
            if not superds:
                continue
            subds_relpath = relpath(ds_path, start=superds.path)
            # remove submodule reference
            submodule = [sm for sm in superds.repo.repo.submodules
                         if sm.path == subds_relpath]
            # there can only be one!
            assert(len(submodule) == 1)
            submodule = submodule[0]
            submodule.remove()
            if exists(ds_path):
                # could be an empty dir in case an already uninstalled
                # subdataset got removed
                os.rmdir(ds_path)
            # need to save changes to .gitmodules later
            content_by_ds[superds.path] = \
                content_by_ds.get(superds.path, []) \
                + [opj(superds.path, '.gitmodules'),
                   ds_path]
            ds2save.add(superds.path)
        else:
            if check and hasattr(ds.repo, 'drop'):
                _drop_files(ds, paths, check=True)
            results.extend(ds.repo.remove(paths, r=True))
            ds2save.add(ds.path)

    if dataset and dataset.is_installed():
        # forge chain from base dataset to any leaf dataset
        # in order to save state changes all the way up
        _discover_trace_to_known(dataset.path, [], content_by_ds)
    save_dataset_hierarchy(
        content_by_ds,
        base=dataset.path if dataset and dataset.is_installed() else None,
        message='[DATALAD] removed content')
    return results
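
# Hypothetical usage sketch for remove: deleting an entire subdataset by
# pointing at its root path (made-up paths). With `check=False` the
# content-safety drop performed before removal is skipped.
import datalad.api as dl

removed = dl.remove(
    path='/home/me/ds/subds',  # hypothetical subdataset root
    dataset='/home/me/ds',
    check=False,
    if_dirty='save-before')
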
def __call__(
        path=None,
        source=None,
        dataset=None,
        recursive=False,
        recursion_limit=None,
        get_data=True,
        reckless=False,
        git_opts=None,
        annex_opts=None,
        annex_get_opts=None,
        jobs=None,
        verbose=False,
        # internal -- instead of returning 'get'ed items, return final
        # content_by_ds, unavailable_paths. To be used by the call from
        # Install.__call__ and done so to avoid creating another reusable
        # function which would need to duplicate all this heavy list of
        # kwargs
        _return_datasets=False):
    # IMPLEMENTATION CONCEPT:
    #
    # 1. Sort the world into existing handles and the rest
    # 2. Try to locate missing handles (obtain subdatasets along the way)
    # 3. Expand into subdatasets with recursion enabled (potentially
    #    obtaining even more subdatasets)
    # 4. Shoot the info of which handles to get in each subdataset to
    #    git-annex, once at the very end

    dataset_path = dataset.path if isinstance(dataset, Dataset) else dataset

    if not (dataset or path):
        raise InsufficientArgumentsError(
            "Neither dataset nor target path(s) provided")
    if dataset and not path:
        # act on the whole dataset if nothing else was specified
        path = dataset_path

    # use lookup cache -- we need that info further down
    dir_lookup = {}

    content_by_ds, unavailable_paths = Interface._prep(
        path=path,
        dataset=dataset,
        recursive=recursive,
        recursion_limit=recursion_limit,
        dir_lookup=dir_lookup)

    # explore the unknown
    for path in sorted(unavailable_paths):
        # how close can we get?
        dspath = get_dataset_root(path)
        if dspath is None:
            # nothing we can do for this path
            continue
        ds = Dataset(dspath)
        # must always yield a dataset -- we sorted out the ones outside
        # any dataset at the very top
        assert ds.is_installed()
        # now actually obtain whatever is necessary to get to this path
        containing_ds = install_necessary_subdatasets(ds, path, reckless)
        if containing_ds.path != ds.path:
            lgr.debug("Installed %s to fulfill request for content for "
                      "path %s", containing_ds, path)
            # mark resulting dataset as auto-installed
            if containing_ds.path == path:
                # we had to get the entire dataset, not something within
                # mark that it just appeared
                content_by_ds[path] = [curdir]
            else:
                # we need to get content within
                content_by_ds[path] = [path]

    if recursive and not recursion_limit == 'existing':
        # obtain any subdatasets underneath the paths given inside the
        # subdatasets that we know already exist,
        # unless we do not want recursion into not-yet-installed datasets
        for subdspath in sorted(content_by_ds.keys()):
            for content_path in content_by_ds[subdspath]:
                if not isdir(content_path):
                    # a non-directory cannot have content underneath
                    continue
                subds = Dataset(subdspath)
                lgr.info(
                    "Obtaining %s %s recursively",
                    subds,
                    ("underneath %s" % content_path
                     if subds.path != content_path
                     else ""))
                cbysubds = _recursive_install_subds_underneath(
                    subds,
                    # `content_path` was explicitly given as input
                    # we count recursions from the input, hence we
                    # can start with the full number
                    recursion_limit,
                    reckless,
                    # protect against magic marker misinterpretation
                    # only relevant for _get, hence replace here
                    start=content_path if content_path != curdir else None)
                # gets file content for all freshly installed subdatasets
                content_by_ds.update(cbysubds)

    ## we have now done everything we could to obtain whatever subdataset
    ## to get something on the file system for previously unavailable paths
    ## check and sort one last time
    content_by_ds, unavailable_paths, nondataset_paths = \
        get_paths_by_dataset(
            unavailable_paths,
            recursive=recursive,
            recursion_limit=recursion_limit,
            out=content_by_ds,
            dir_lookup=dir_lookup)
    if nondataset_paths:
        # XXX likely can never get here
        lgr.warning(
            "ignored paths that do not belong to any dataset: %s",
            nondataset_paths)
    if unavailable_paths:
        lgr.warning('ignored non-existing paths: %s', unavailable_paths)

    # hand over to git-annex
    results = list(chain.from_iterable(
        _get(content_by_ds, refpath=dataset_path, source=source, jobs=jobs,
             get_data=get_data)))
    # ??? should we in the _return_datasets case just return both
    # content_by_ds and unavailable_paths, so we provide output that is
    # consistent across runs, and then issue a similar
    # IncompleteResultsError outside?
    if unavailable_paths:  # and likely other error flags
        if _return_datasets:
            results = sorted(set(content_by_ds).difference(unavailable_paths))
        raise IncompleteResultsError(results, failed=unavailable_paths)
    else:
        return sorted(content_by_ds) if _return_datasets else results
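
# Hypothetical usage sketch for get: fetching file content (installing any
# intermediate subdatasets along the way) for a path inside an installed
# dataset. Paths are made up; per the code above, an IncompleteResultsError
# is raised if some requested paths remain unavailable.
import datalad.api as dl

results = dl.get(
    path='/home/me/ds/data/big.file',  # hypothetical content path
    dataset='/home/me/ds',             # hypothetical base dataset
    recursive=False,
    get_data=True,
    jobs=2)
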