def get_refds_path(cls, dataset):
    """Return a resolved reference dataset path from a `dataset` argument"""
    # theoretically a dataset could come in as a relative path -> resolve
    refds_path = dataset.path if isinstance(dataset, Dataset) else dataset
    if refds_path:
        refds_path = resolve_path(refds_path)
    return refds_path
def get_normalized_path_arguments(paths, dataset=None, default=None):
    """Apply standard resolution to path arguments

    This is nothing more than a helper to standardize path argument
    preprocessing.

    Parameters
    ----------
    paths : sequence or single path
      Path(s) to normalize
    dataset : path or Dataset or None
      Optional dataset identifying something against which to resolve input
      path arguments
    default : sequence of paths or single path or None
      If `paths` is empty, use this instead

    Returns
    -------
    tuple(list(paths), path)
      Normalized paths and the path to a potential dataset against which
      paths were resolved.
    """
    dataset_path = dataset.path if isinstance(dataset, Dataset) else dataset
    if not paths and default:
        paths = default
    paths = assure_list(paths)
    # resolve path(s):
    resolved_paths = [resolve_path(p, dataset) for p in paths]
    if dataset:
        # guarantee absolute paths
        resolved_paths = [opj(dataset_path, p) for p in resolved_paths]
    lgr.debug('Resolved input path arguments: %s', resolved_paths)
    return resolved_paths, dataset_path
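# Illustrative usage of the helper above -- a sketch, not from the original
# source; the dataset location and file name are hypothetical, and the exact
# output shape depends on what `resolve_path` returns in the installed
# version:
#
# >>> from datalad.distribution.dataset import Dataset
# >>> get_normalized_path_arguments('sub/file.dat', dataset=Dataset('/tmp/ds'))
# (['/tmp/ds/sub/file.dat'], '/tmp/ds')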
def _get_container_by_path(ds, name, containers):
    from datalad.distribution.dataset import resolve_path
    # Note: since datalad 0.12.0rc6 resolve_path returns a Path object here,
    # which would then fail to compare equal to c['path'] below, as that
    # value is taken from config as a string
    container_path = str(resolve_path(name, ds))
    container = [c for c in containers.values()
                 if c['path'] == container_path]
    if len(container) == 1:
        return container[0]
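# Hypothetical lookup sketch for the helper above; the `containers` mapping
# mimics the config-derived records the function expects, and the names and
# paths are assumptions:
#
# >>> containers = {
# ...     'ubuntu': {'path': '/tmp/ds/.datalad/environments/ubuntu/image'}}
# >>> _get_container_by_path(
# ...     Dataset('/tmp/ds'), '.datalad/environments/ubuntu/image',
# ...     containers)
# {'path': '/tmp/ds/.datalad/environments/ubuntu/image'}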
def get_refds_path(cls, dataset):
    """Return a resolved reference dataset path from a `dataset` argument"""
    # theoretically a dataset could come in as a relative path -> resolve
    if dataset is None:
        return dataset
    refds_path = dataset.path if isinstance(dataset, Dataset) \
        else Dataset(dataset).path
    if refds_path:
        refds_path = resolve_path(refds_path)
    return refds_path
def get_refds_path(cls, dataset):
    """Return a resolved reference dataset path from a `dataset` argument

    .. deprecated:: 0.16
       Use ``require_dataset()`` instead.
    """
    # theoretically a dataset could come in as a relative path -> resolve
    if dataset is None:
        return dataset
    refds_path = dataset.path if isinstance(dataset, Dataset) \
        else Dataset(dataset).path
    if refds_path:
        refds_path = str(resolve_path(refds_path))
    return refds_path
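# The deprecation note above points at `require_dataset()`; a minimal sketch
# of the recommended replacement call (the dataset location is a
# hypothetical assumption):
#
# >>> from datalad.distribution.dataset import require_dataset
# >>> ds = require_dataset('/tmp/ds', check_installed=True,
# ...                      purpose='reporting')
# >>> ds.path
# '/tmp/ds'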
def rawpath2ap(path, refds_path):
    orig_path_request = path
    # this is raw, resolve
    path = resolve_path(path, refds_path)
    # collect info on this path
    path_props = dict(
        path=path,
        # path was requested as input, and not somehow discovered
        raw_input=True,
        # make a record of what actually came in, sorting into
        # dataset might later need to distinguish between a path
        # that pointed to a dataset as a whole vs. a path that
        # pointed to the dataset's content -- just do not destroy
        # any information on the way down
        orig_request=orig_path_request)
    return path_props
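# Sketch of the annotated-path record produced above (illustrative; the
# exact 'path' value depends on what `resolve_path` returns in the installed
# version -- a string in older releases, a Path object later):
#
# >>> rawpath2ap('sub/file.dat', '/tmp/ds')
# {'path': '/tmp/ds/sub/file.dat', 'raw_input': True,
#  'orig_request': 'sub/file.dat'}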
def __call__(path=None, dataset=None, to=None, since=None,
             data='auto-if-wanted', force=None,
             recursive=False, recursion_limit=None, jobs=None):
    # push uses '^' to annotate the previously pushed committish, and None
    # for default behavior. '' was/is (to be deprecated) used in `publish`.
    # Alert the user about the mistake
    if since == '':
        raise ValueError("'since' should point to a commit-ish or use '^'.")
    # we resolve here, because we need to perform inspection on what was
    # given as an input argument further down
    paths = [resolve_path(p, dataset) for p in ensure_list(path)]

    ds = require_dataset(dataset, check_installed=True, purpose='push')
    ds_repo = ds.repo

    res_kwargs = dict(
        action='publish',
        refds=ds.path,
        logger=lgr,
    )

    get_remote_kwargs = {'exclude_special_remotes': False} \
        if isinstance(ds_repo, AnnexRepo) else {}
    if to and to not in ds_repo.get_remotes(**get_remote_kwargs):
        # get again for a proper error:
        sr = ds_repo.get_remotes(**get_remote_kwargs)
        # yield an error result instead of raising a ValueError,
        # to enable the use case of pushing to a target that
        # a superdataset doesn't know, but some subdatasets do
        # (in combination with '--on-failure ignore')
        yield dict(
            res_kwargs,
            status='error',
            path=ds.path,
            message="Unknown push target '{}'. {}".format(
                to,
                'Known targets: {}.'.format(
                    ', '.join(repr(s) for s in sr))
                if sr else 'No targets configured in dataset.'))
        return

    if since == '^':
        # figure out state of remote branch and set `since`
        since = _get_corresponding_remote_state(ds_repo, to)
        if not since:
            lgr.info(
                "No tracked remote for active branch, "
                "detection of last pushed state not in effect.")
    elif since:
        # will blow with ValueError if unusable
        ds_repo.get_hexsha(since)

    # obtain a generator for information on the datasets to process
    # idea is to turn the `paths` argument into per-dataset
    # content listings that can be acted upon
    ds_spec = _datasets_since_(
        # important to pass unchanged dataset arg
        dataset, since, paths, recursive, recursion_limit)

    # instead of a loop, this could all be done in parallel
    matched_anything = False
    for dspath, dsrecords in ds_spec:
        matched_anything = True
        lgr.debug('Attempt push of Dataset at %s', dspath)
        pbars = {}
        yield from _push(
            dspath, dsrecords, to, data, force, jobs,
            res_kwargs.copy(), pbars,
            got_path_arg=True if path else False)
        # take down progress bars for this dataset
        for i, ds in pbars.items():
            log_progress(lgr.info, i, 'Finished push of %s', ds)
    if not matched_anything:
        potential_remote = False
        if not to and len(paths) == 1:
            # if we get a remote name without --to, provide a hint
            sr = ds_repo.get_remotes(**get_remote_kwargs)
            potential_remote = [p for p in ensure_list(path) if p in sr]
        if potential_remote:
            hint = "{} matches a sibling name and not a path. " \
                   "Forgot --to?".format(potential_remote)
            yield dict(
                res_kwargs,
                status='notneeded',
                message=hint,
                hints=hint,
                type='dataset',
                path=ds.path,
            )
            # there's no matching path and we have generated a hint on
            # fixing the call -- we can return now
            return
        yield dict(
            res_kwargs,
            status='notneeded',
            message='Given constraints did not match any changes to publish',
            type='dataset',
            path=ds.path,
        )
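# Invocation sketch via the bound Dataset method (not from the original
# source; the dataset path and sibling name are assumptions). `since='^'`
# triggers the last-pushed-state detection implemented above:
#
# >>> import datalad.api as dl
# >>> ds = dl.Dataset('/tmp/ds')
# >>> results = ds.push(to='origin', since='^', data='auto-if-wanted')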
def __call__(path=None, *, dataset=None, state='any',
             fulfilled=NoneDeprecated, recursive=False,
             recursion_limit=None, contains=None, bottomup=False,
             set_property=None, delete_property=None):
    if fulfilled is not NoneDeprecated:
        # the two mirror options do not agree and the deprecated one is
        # not at its default value
        warnings.warn(
            "subdatasets's `fulfilled` option is deprecated "
            "and will be removed in a future release, "
            "use the `state` option instead.",
            DeprecationWarning)
        if state != 'any':
            raise ValueError(
                "Do not specify both 'fulfilled' and 'state', use 'state'")
        # honor the old option for now
        state = {
            None: 'any',
            True: 'present',
            False: 'absent',
        }[fulfilled]
    # path of least resistance/minimal code change -- internally we will
    # keep using `fulfilled`
    fulfilled = {
        'any': None,
        'present': True,
        'absent': False,
    }[state]

    ds = require_dataset(
        dataset, check_installed=True, purpose='report on subdataset(s)')

    paths = resolve_path(ensure_list(path), dataset, ds) if path else None

    # no constraints given -> query subdatasets under curdir
    if not paths and dataset is None:
        cwd = Path(getpwd())
        paths = None if cwd == ds.pathobj else [cwd]

    lgr.debug('Query subdatasets of %s', dataset)
    if paths is not None:
        lgr.debug('Query subdatasets underneath paths: %s', paths)
    refds_path = ds.path

    # return as quickly as possible
    if isinstance(recursion_limit, int) and (recursion_limit <= 0):
        return

    if set_property:
        for k, v in set_property:
            if valid_key.match(k) is None:
                raise ValueError(
                    "key '%s' is invalid (alphanumeric plus '-' only, must "
                    "start with a letter)" % k)
    if contains:
        contains = resolve_path(ensure_list(contains), dataset, ds)
        # expand all test cases for the `contains` test in the loop below;
        # leads to ~20% speedup per loop iteration on a non-match
        expanded_contains = [[c] + list(c.parents) for c in contains]
    else:
        expanded_contains = []
    contains_hits = set()
    for r in _get_submodules(
            ds, paths, fulfilled, recursive, recursion_limit,
            expanded_contains, bottomup, set_property, delete_property,
            refds_path):
        # a boat-load of ancient code consumes this and is ignorant of
        # Path objects
        r['path'] = str(r['path'])
        # without the refds_path the result cannot be rendered/converted
        # relative in the eval_results decorator
        r['refds'] = refds_path
        if 'contains' in r:
            contains_hits.update(r['contains'])
            r['contains'] = [str(c) for c in r['contains']]
        yield r
    if contains:
        for c in set(contains).difference(contains_hits):
            yield get_status_dict(
                'subdataset',
                path=str(c),
                status='impossible',
                message='path not contained in any matching subdataset',
                # we do not want to log such an event, because it is a
                # legit query to check for matching subdatasets, simply
                # for the purpose of further decision making;
                # user communication in front-end scenarios will happen
                # via result rendering
                #logger=lgr
            )
def __call__(path=None, dataset=None, fulfilled=None, recursive=False,
             recursion_limit=None, contains=None, bottomup=False,
             set_property=None, delete_property=None):
    ds = require_dataset(
        dataset, check_installed=True,
        purpose='subdataset reporting/modification')

    paths = resolve_path(ensure_list(path), dataset, ds) if path else None

    # no constraints given -> query subdatasets under curdir
    if not paths and dataset is None:
        cwd = Path(getpwd())
        paths = None if cwd == ds.pathobj else [cwd]

    lgr.debug('Query subdatasets of %s', dataset)
    if paths is not None:
        lgr.debug('Query subdatasets underneath paths: %s', paths)
    refds_path = ds.path

    # return as quickly as possible
    if isinstance(recursion_limit, int) and (recursion_limit <= 0):
        return

    if set_property:
        for k, v in set_property:
            if valid_key.match(k) is None:
                raise ValueError(
                    "key '%s' is invalid (alphanumeric plus '-' only, must "
                    "start with a letter)" % k)
    if contains:
        contains = resolve_path(ensure_list(contains), dataset, ds)
        # expand all test cases for the `contains` test in the loop below;
        # leads to ~20% speedup per loop iteration on a non-match
        expanded_contains = [[c] + list(c.parents) for c in contains]
    else:
        expanded_contains = []
    contains_hits = set()
    for r in _get_submodules(
            ds, paths, fulfilled, recursive, recursion_limit,
            expanded_contains, bottomup, set_property, delete_property,
            refds_path):
        # a boat-load of ancient code consumes this and is ignorant of
        # Path objects
        r['path'] = str(r['path'])
        # without the refds_path the result cannot be rendered/converted
        # relative in the eval_results decorator
        r['refds'] = refds_path
        if 'contains' in r:
            contains_hits.update(r['contains'])
            r['contains'] = [str(c) for c in r['contains']]
        yield r
    if contains:
        for c in set(contains).difference(contains_hits):
            yield get_status_dict(
                'subdataset',
                path=str(c),
                status='impossible',
                message='path not contained in any matching subdataset',
                # we do not want to log such an event, because it is a
                # legit query to check for matching subdatasets, simply
                # for the purpose of further decision making;
                # user communication in front-end scenarios will happen
                # via result rendering
                #logger=lgr
            )
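# Query sketch for the command above (paths are hypothetical). This older
# signature takes `fulfilled`; newer releases use `state` instead, as shown
# in the variant further up:
#
# >>> import datalad.api as dl
# >>> ds = dl.Dataset('/tmp/superds')
# >>> [r['path'] for r in ds.subdatasets(fulfilled=True, recursive=True,
# ...                                    return_type='list')]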
def __call__(
        path=None,
        dataset=None,
        recursive=False,
        recursion_limit=None,
        action=None,
        unavailable_path_status='',
        unavailable_path_msg=None,
        nondataset_path_status='error',
        force_parentds_discovery=True,
        force_subds_discovery=True,
        force_no_revision_change_discovery=True,
        force_untracked_discovery=True,
        modified=None):
    # upfront check for the fastest possible response
    if not path and dataset is None:
        # nothing given, try "here", but do not use `require_dataset`, as
        # it will determine the root dataset of `curdir` and further down
        # lead to path annotation of upstairs directories
        dataset = curdir

    if force_subds_discovery and not force_parentds_discovery:
        raise ValueError(
            'subdataset discovery requires parent dataset discovery')

    # CONCEPT: yield with no status to indicate further processing

    # everything in one big loop to be able to yield as fast as possible
    # without any precomputing for all paths
    refds_path = Interface.get_refds_path(dataset)
    if modified is not None and (refds_path is None or
                                 not GitRepo.is_valid_repo(refds_path)):
        raise ValueError(
            "modification detection only works with a base dataset "
            "(none given or found)")

    # prep common result props
    res_kwargs = dict(
        action=action if action else 'annotate_path',
        refds=refds_path,
        logger=lgr)

    # handle the case of recursion into a single dataset without any
    # extra fancy processing first -- full recursion can be done
    # faster than manual recursion, hence we gain quite some speed
    # from these few lines of extra code
    if not modified and not path and refds_path:
        if not GitRepo.is_valid_repo(refds_path):
            yield get_status_dict(
                # doesn't matter if the path is in another dataset,
                # it was given as the reference dataset
                status=nondataset_path_status,
                message='given reference dataset is not a dataset',
                path=refds_path,
                **res_kwargs)
            return

        refds = Dataset(refds_path)
        path = []
        # yield the dataset itself
        r = get_status_dict(ds=refds, status='', **res_kwargs)
        yield r

        if recursive:
            # if we have nothing given, but need recursion, we need to feed
            # the dataset path itself
            for r in yield_recursive(
                    refds, refds_path, action, recursion_limit):
                r.update(res_kwargs)
                if 'refds' in r and not r['refds']:
                    # avoid cruft
                    del r['refds']
                yield r
        return

    # goal: structure in a way that makes most information on any path
    # available in a single pass, at the cheapest possible cost
    reported_paths = {}
    requested_paths = assure_list(path)

    if modified is not None:
        # modification detection would silently kill all nondataset paths,
        # but we have to complain about them, hence doing it here
        if requested_paths and refds_path:
            for r in requested_paths:
                p = r['path'] if isinstance(r, dict) else r
                p = resolve_path(p, ds=refds_path)
                if _with_sep(p).startswith(_with_sep(refds_path)):
                    # all good
                    continue
                # not the refds
                path_props = r if isinstance(r, dict) else {}
                res = get_status_dict(
                    **dict(res_kwargs, **path_props))
                res['status'] = nondataset_path_status
                res['message'] = 'path not associated with reference dataset'
                reported_paths[r] = res
                yield res

        # preserve non-existing paths to be silently killed by modification
        # detection and append them to requested_paths again after
        # detection.
        # TODO: This might be melted in with the treatment of non-dataset
        # paths above. Re-appending those paths seems to be better than
        # yielding directly, to avoid code duplication, since both cases
        # are dealt with again later on.
        preserved_paths = [
            r for r in requested_paths
            if not lexists(r['path'] if isinstance(r, dict) else r)]

        # replace the requested paths by those paths that were actually
        # modified underneath or at a requested location
        requested_paths = get_modified_subpaths(
            # either the request, or the base dataset, if there was no
            # request
            requested_paths if requested_paths else [refds_path],
            refds=Dataset(refds_path),
            revision=modified,
            report_no_revision_change=force_no_revision_change_discovery,
            report_untracked='all' if force_untracked_discovery else 'no',
            recursion_limit=recursion_limit)

        from itertools import chain
        # re-append the preserved paths:
        requested_paths = chain(requested_paths, iter(preserved_paths))

    # do not loop over unique(), this could be a list of dicts;
    # we avoid duplicates manually below via `reported_paths`
    for path in requested_paths:
        if not isinstance(path, dict):
            path = rawpath2ap(path, refds_path)
        # this is now an annotated path!
        path_props = path
        path = path['path']
        # we need to mark our territory, who knows where this has been
        path_props.update(res_kwargs)

        if path in reported_paths:
            # we already recorded this path in the output;
            # this can happen whenever `path` is a subdataset that was
            # discovered via recursive processing of another path before
            continue
        # the path exists in some shape or form
        # TODO if we have path_props already we could skip this test
        if isdir(path):
            # keep any existing type info, previously a more expensive run
            # could have discovered an uninstalled 'dataset', and we don't
            # want it to be relabeled as a directory
            path_props['type'] = \
                path_props.get(
                    'type',
                    'dataset' if GitRepo.is_valid_repo(path)
                    else 'directory')
            # this could contain all types of additional content
            containing_dir = path
        else:
            if lexists(path):
                path_props['type'] = 'file'
            else:
                path_props['state'] = 'absent'
            # for everything else we are interested in the container
            containing_dir = dirname(path)
            if not containing_dir:
                containing_dir = curdir

        dspath = parent = get_dataset_root(containing_dir)
        if dspath:
            if path_props.get('type', None) == 'dataset':
                # for a dataset the root is not the parent, for anything
                # else it is
                parent = path_props.get('parentds', None)
                oneupdir = normpath(opj(containing_dir, pardir))
                if parent is None and (force_parentds_discovery or (
                        refds_path and _with_sep(oneupdir).startswith(
                            _with_sep(refds_path)))):
                    # either forced, or only if we have a reference dataset,
                    # and only if we stay within this refds when searching
                    # for the parent
                    parent = get_dataset_root(
                        normpath(opj(containing_dir, pardir)))
                    # NOTE the `and refds_path` is critical, as it will
                    # determine whether a top-level dataset that was
                    # discovered gets the parent property or not; it won't
                    # get it without a common base dataset, and that is how
                    # we always rolled
                    if parent and refds_path:
                        path_props['parentds'] = parent
                        # don't check whether this is actually a true
                        # subdataset of the parent, done further down
            else:
                # set parent, but prefer existing property
                path_props['parentds'] = path_props.get('parentds', dspath)

        # test for `dspath` not `parent`, we only need to know whether
        # there is ANY dataset, not which one is the true parent; the logic
        # below relies on the fact that we end here if there is no dataset
        # at all
        if not dspath:
            # not in any dataset
            res = get_status_dict(
                **dict(res_kwargs, **path_props))
            res['status'] = nondataset_path_status
            res['message'] = 'path not associated with any dataset'
            reported_paths[path] = res
            yield res
            continue

        # check that we only got SUBdatasets
        if refds_path and not _with_sep(dspath).startswith(
                _with_sep(refds_path)):
            res = get_status_dict(**dict(res_kwargs, **path_props))
            res['status'] = nondataset_path_status
            res['message'] = \
                ('path not part of the reference dataset at %s', refds_path)
            reported_paths[path] = res
            yield res
            continue

        if path_props.get('type', None) == 'file':
            # nothing else we can learn about this
            res = get_status_dict(**dict(res_kwargs, **path_props))
            if 'status' not in res:
                res['status'] = ''
            reported_paths[path] = res
            yield res
            continue

        containing_ds = None
        path_type = path_props.get('type', None)
        if parent and force_subds_discovery and (
                (path_type == 'dataset' and
                 'registered_subds' not in path_props) or
                path_type == 'directory' or
                not lexists(path)):
            # if the path doesn't exist, or is labeled a directory, or even
            # a dataset (without this info) -> record whether this is a
            # known subdataset to its parent
            containing_ds = Dataset(parent)
            subdss = containing_ds.subdatasets(
                fulfilled=None, recursive=False,
                result_xfm=None, result_filter=None, return_type='list')
            if path in [s['path'] for s in subdss]:
                if path_type == 'directory' or not lexists(path):
                    # first record that it isn't here, if just a dir or not
                    # here at all
                    path_props['state'] = 'absent'
                # this must be a directory, and it is not installed
                path_props['type'] = 'dataset'
                path_props['registered_subds'] = True

        if not lexists(path) or \
                (path_props.get('type', None) == 'dataset' and
                 path_props.get('state', None) == 'absent'):
            # not there (yet)
            message = unavailable_path_msg if unavailable_path_msg else None
            if message and '%s' in message:
                message = (message, path)
            path_props['message'] = message
            res = get_status_dict(**dict(res_kwargs, **path_props))
            # assign given status, but only if the props don't indicate a
            # status already
            res['status'] = path_props.get(
                'status', unavailable_path_status)
            reported_paths[path] = res
            yield res
            continue

        # we know everything we can, report
        res = get_status_dict(**dict(res_kwargs, **path_props))
        if 'status' not in res:
            res['status'] = ''
        reported_paths[path] = res
        yield res

        rec_paths = []
        if recursive:
            # here we need to consider the special case that `path` is
            # a dataset itself; if a recursion_limit is given (e.g.
            # `remove` will do that by default), we need to recurse
            # from the dataset itself, and not its parent, to get things
            # right -- this will also avoid needless discovery of
            # unrelated subdatasets
            if path_props.get('type', None) == 'dataset':
                containing_ds = Dataset(path)
            else:
                # regular parent, we might have a dataset already
                containing_ds = Dataset(parent) \
                    if containing_ds is None else containing_ds
            for r in yield_recursive(
                    containing_ds, path, action, recursion_limit):
                # capture reported paths
                r.update(res_kwargs)
                if 'refds' in r and not r['refds']:
                    # avoid cruft
                    del r['refds']
                reported_paths[r['path']] = r
                if modified is not None:
                    # we cannot yield right away, maybe it wasn't modified
                    rec_paths.append(r)
                else:
                    yield r
        if modified is not None and rec_paths:
            # replace the recursively discovered paths by those paths that
            # were actually modified underneath or at a requested location
            for r in get_modified_subpaths(
                    rec_paths,
                    refds=Dataset(refds_path),
                    revision=modified,
                    report_no_revision_change=force_no_revision_change_discovery,
                    report_untracked='all' if force_untracked_discovery else 'no',
                    recursion_limit=recursion_limit):
                res = get_status_dict(**dict(r, **res_kwargs))
                reported_paths[res['path']] = res
                yield res
    return
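# Consumption sketch (illustrative, not from the original source): the
# generator above yields plain result dicts. Assuming the command is exposed
# on the Python API as `annotate_paths` -- that binding name is an
# assumption here -- with hypothetical paths:
#
# >>> import datalad.api as dl
# >>> for res in dl.annotate_paths(path='sub', dataset='/tmp/ds',
# ...                              recursive=True):
# ...     print(res['path'], res.get('type'), res.get('parentds'))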
def __call__(
        path=None,
        dataset=None,
        recursive=False,
        target_dir=None,
        specs_from=None,
        message=None):
    # Concept
    #
    # Loosely modeled after the POSIX cp command
    #
    # 1. Determine the target of the copy operation, and its associated
    #    dataset
    #
    # 2. for each source: determine source dataset, query for metadata, put
    #    into target dataset
    #
    # Instead of sifting and sorting through input args, process them one
    # by one sequentially. Utilize lookup caching to make things faster,
    # instead of making the procedure itself more complicated.

    if path and specs_from:
        raise ValueError(
            "Path argument(s) AND a specs-from specified, "
            "this is not supported.")

    ds = None
    if dataset:
        ds = require_dataset(dataset, check_installed=True,
                             purpose='copy into')

    if target_dir:
        target_dir = resolve_path(target_dir, dataset)

    if path:
        # turn into list of absolute paths
        paths = [resolve_path(p, dataset) for p in ensure_list(path)]

        # we already checked that there are no specs_from
        if not target_dir:
            if len(paths) == 1:
                if not ds:
                    raise ValueError("No target directory was given.")
                # we can keep target_dir unset and need not manipulate
                # paths, this is all done in a generic fashion below
            elif len(paths) == 2:
                # single source+dest combo
                if paths[-1].is_dir():
                    # check if we need to set target_dir, in case dest
                    # is a dir
                    target_dir = paths.pop(-1)
                else:
                    specs_from = [paths]
            else:
                target_dir = paths.pop(-1)

        if not specs_from:
            # in all other cases we have a plain source list
            specs_from = paths

    if not specs_from:
        raise ValueError("Neither `paths` nor `specs_from` given.")

    if target_dir:
        if ".git" in target_dir.parts:
            raise ValueError(
                "Target directory should not contain a .git directory: {}"
                .format(target_dir))
    elif ds:
        # no specific target set, but we have to write into a dataset,
        # and one was given. It seems to make sense to use this dataset
        # as a target. It is already the reference for any path resolution.
        # Any explicitly given destination will take precedence over
        # a general target_dir setting nevertheless.
        target_dir = ds.pathobj

    res_kwargs = dict(
        action='copy_file',
        logger=lgr,
    )

    # lookup cache for dir-to-repo mappings, and as a DB for cleaning
    # things up
    repo_cache = {}
    # which paths to pass on to save
    to_save = []
    try:
        for src_path, dest_path in _yield_specs(specs_from):
            src_path = Path(src_path)
            dest_path = None \
                if dest_path is None \
                else resolve_path(dest_path, dataset)
            lgr.debug('Processing copy specification: %s -> %s',
                      src_path, dest_path)

            # Some checks, first impossibility "wins"
            msg_impossible = None
            if not recursive and src_path.is_dir():
                msg_impossible = 'recursion not enabled, omitting directory'
            elif (dest_path and dest_path.name == '.git') \
                    or src_path.name == '.git':
                msg_impossible = \
                    "refuse to place '.git' into destination dataset"
            elif not (dest_path or target_dir):
                msg_impossible = 'need destination path or target directory'

            if msg_impossible:
                yield dict(
                    path=str(src_path),
                    status='impossible',
                    message=msg_impossible,
                    **res_kwargs
                )
                continue

            for src_file, dest_file in _yield_src_dest_filepaths(
                    src_path, dest_path, target_dir=target_dir):
                if ds and ds.pathobj not in dest_file.parents:
                    # take time to compose a proper error
                    dpath = str(target_dir if target_dir else dest_path)
                    yield dict(
                        path=dpath,
                        status='error',
                        message=(
                            'reference dataset does not contain '
                            'destination path: %s',
                            dpath),
                        **res_kwargs
                    )
                    # only recursion could yield further results, which
                    # would all have the same issue, so call it over right
                    # here
                    break
                for res in _copy_file(src_file, dest_file,
                                      cache=repo_cache):
                    yield dict(
                        res,
                        **res_kwargs
                    )
                    if res.get('status', None) == 'ok':
                        to_save.append(res['destination'])
    finally:
        # cleanup time
        # TODO this could also be the place to stop lingering batch
        # processes
        _cleanup_cache(repo_cache)

    if not (ds and to_save):
        # nothing left to do
        return

    yield from ds.save(
        path=to_save,
        # we provide an explicit file list
        recursive=False,
        message=message,
    )
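# Call sketch mirroring the signature above (the source and destination
# locations are hypothetical assumptions):
#
# >>> import datalad.api as dl
# >>> dl.copy_file(path=['/tmp/src_ds/data/file.csv'],
# ...              dataset='/tmp/dest_ds',
# ...              target_dir='/tmp/dest_ds/incoming',
# ...              message='Import file from sibling dataset')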
def __call__(
        archive,
        *,
        dataset=None,
        annex=None,
        add_archive_leading_dir=False,
        strip_leading_dirs=False,
        leading_dirs_depth=None,
        leading_dirs_consider=None,
        use_current_dir=False,
        delete=False,
        key=False,
        exclude=None,
        rename=None,
        existing='fail',
        annex_options=None,
        copy=False,
        commit=True,
        allow_dirty=False,
        stats=None,
        drop_after=False,
        delete_after=False):
    if exclude:
        exclude = ensure_tuple_or_list(exclude)
    if rename:
        rename = ensure_tuple_or_list(rename)
    ds = require_dataset(dataset,
                         check_installed=True,
                         purpose='add-archive-content')

    # set up common params for result records
    res_kwargs = {
        'action': 'add-archive-content',
        'logger': lgr,
    }

    if not isinstance(ds.repo, AnnexRepo):
        yield get_status_dict(
            ds=ds,
            status='impossible',
            message="Can't operate in a pure Git repository",
            **res_kwargs
        )
        return
    if annex:
        warnings.warn(
            "datalad add_archive_content's `annex` parameter is "
            "deprecated and will be removed in a future release. "
            "Use the 'dataset' parameter instead.",
            DeprecationWarning)
    annex = ds.repo

    # get the archive path relative from the ds root
    archive_path = resolve_path(archive, ds=dataset)
    # let Status decide whether we can act on the given file
    for s in ds.status(
            path=archive_path,
            on_failure='ignore',
            result_renderer='disabled'):
        if s['status'] == 'error':
            if 'path not underneath the reference dataset %s' \
                    in s['message']:
                yield get_status_dict(
                    ds=ds,
                    status='impossible',
                    message='Can not add archive outside of the dataset',
                    **res_kwargs)
                return
            # status errored & we haven't anticipated the cause. Bubble up
            yield s
            return
        elif s['state'] == 'untracked':
            # we can't act on an untracked file
            message = (
                "Can not add an untracked archive. "
                "Run 'datalad save {}'".format(archive)
            )
            yield get_status_dict(
                ds=ds,
                status='impossible',
                message=message,
                **res_kwargs)
            return

    if not allow_dirty and annex.dirty:
        # error out here if the dataset contains untracked changes
        yield get_status_dict(
            ds=ds,
            status='impossible',
            message=(
                'clean dataset required. '
                'Use `datalad status` to inspect unsaved changes'),
            **res_kwargs
        )
        return

    # ensure the archive exists, status doesn't error on a non-existing
    # file
    if not key and not lexists(archive_path):
        yield get_status_dict(
            ds=ds,
            status='impossible',
            message=(
                'No such file: {}'.format(archive_path),
            ),
            **res_kwargs
        )
        return

    if not key:
        check_path = archive_path.relative_to(ds.pathobj)
        # TODO: support adding archives content from outside the annex/repo
        origin = 'archive'
        # can become get_file_annexinfo once #6104 is merged
        key = annex.get_file_annexinfo(check_path)['key']
        if not key:
            raise RuntimeError(
                f"Archive must be an annexed file in {ds}")
        archive_dir = Path(archive_path).parent
    else:
        origin = 'key'
        key = archive
        # We must not have anything to do with the location under
        # .git/annex
        archive_dir = None
        # instead, we will go from the current directory
        use_current_dir = True

    archive_basename = file_basename(archive)

    if not key:
        # if we didn't manage to get a key, the file must be in Git
        raise NotImplementedError(
            "Provided file %s does not seem to be under annex control. "
            "We don't support adding everything straight to Git" % archive
        )

    # figure out our location
    pwd = getpwd()
    # are we in a subdirectory of the repository?
    pwd_in_root = annex.path == archive_dir
    # then we should add content under that subdirectory,
    # get the path relative to the repo top
    if use_current_dir:
        # extract the archive under the current directory, not the
        # directory where the archive is located
        extract_rpath = Path(pwd).relative_to(ds.path) \
            if not pwd_in_root \
            else None
    else:
        extract_rpath = archive_dir.relative_to(ds.path)

    # relpath might return '.' as the relative path to curdir, which then
    # normalize_paths would take as instructions to really go from cwd, so
    # we need to sanitize
    if extract_rpath == curdir:
        extract_rpath = None

    try:
        key_rpath = annex.get_contentlocation(key)
    except:
        # the only probable reason for this to fail is that there is no
        # content present
        raise RuntimeError(
            "Content of %s seems to be N/A. Fetch it first" % key
        )

    # now we simply need to go through every file in that archive and
    lgr.info(
        "Adding content of the archive %s into annex %s", archive, annex
    )

    from datalad.customremotes.archives import ArchiveAnnexCustomRemote

    # TODO: shouldn't we be able just to pass existing AnnexRepo instance?
    # TODO: we will use persistent cache so we could just (ab)use a
    # possibly extracted archive
    # OK, let's ignore that the following class is actually a special
    # remote implementation, and use it only to work with its cache
    annexarchive = ArchiveAnnexCustomRemote(annex=None,
                                            path=annex.path,
                                            persistent_cache=True)
    # We will move extracted content so it must not exist prior running
    annexarchive.cache.allow_existing = True
    earchive = annexarchive.cache[key_rpath]
    # make sure there is an enabled datalad-archives special remote
    ensure_datalad_remote(ds.repo, remote=ARCHIVES_SPECIAL_REMOTE,
                          autoenable=True)

    precommitted = False
    old_always_commit = annex.always_commit
    # batch mode is disabled when faking dates, we want to always commit
    annex.always_commit = annex.fake_dates_enabled
    if annex_options:
        if isinstance(annex_options, str):
            annex_options = split_cmdline(annex_options)

    delete_after_rpath = None

    prefix_dir = basename(tempfile.mkdtemp(prefix=".datalad",
                                           dir=annex.path)) \
        if delete_after \
        else None

    # dedicated stats which would be added to passed in (if any)
    outside_stats = stats
    stats = ActivityStats()

    try:
        # keep track of extracted files for progress bar logging
        file_counter = 0
        # iterate over all files in the archive
        extracted_files = list(earchive.get_extracted_files())
        # start a progress bar for extraction
        pbar_id = f'add-archive-{archive_path}'
        log_progress(
            lgr.info, pbar_id, 'Extracting archive',
            label="Extracting archive",
            unit=' Files',
            total=len(extracted_files),
            noninteractive_level=logging.INFO)
        for extracted_file in extracted_files:
            file_counter += 1
            files_left = len(extracted_files) - file_counter
            log_progress(
                lgr.info, pbar_id,
                "Files to extract %i ", files_left,
                update=1,
                increment=True,
                noninteractive_level=logging.DEBUG)
            stats.files += 1
            extracted_path = Path(earchive.path) / Path(extracted_file)

            if extracted_path.is_symlink():
                link_path = str(extracted_path.resolve())
                if not exists(link_path):
                    # TODO: config addarchive.symlink-broken='skip'
                    lgr.warning("Path %s points to non-existing file %s" %
                                (extracted_path, link_path))
                    stats.skipped += 1
                    continue
                # TODO: check if it points outside of the archive -
                # warn & skip

            url = annexarchive.get_file_url(
                archive_key=key,
                file=extracted_file,
                size=os.stat(extracted_path).st_size)

            # preliminary target name which might get modified by renames
            target_file_orig = target_file = Path(extracted_file)

            # stream archives would not have had the original filename
            # information in them, so would be extracted under a name
            # derived from their annex key.
            # Provide ad-hoc handling for such cases
            if (len(extracted_files) == 1 and
                    Path(archive).suffix in ('.xz', '.gz', '.lzma') and
                    Path(key_rpath).name.startswith(Path(
                        extracted_file).name)):
                # take the archive's name without extension for the
                # filename & place where it was originally extracted
                target_file = \
                    Path(extracted_file).parent / Path(archive).stem

            if strip_leading_dirs:
                leading_dir = earchive.get_leading_directory(
                    depth=leading_dirs_depth, exclude=exclude,
                    consider=leading_dirs_consider)
                leading_dir_len = \
                    len(leading_dir) + len(opsep) if leading_dir else 0
                target_file = str(target_file)[leading_dir_len:]

            if add_archive_leading_dir:
                # place extracted content under a directory corresponding
                # to the archive name with suffix stripped.
                target_file = Path(archive_basename) / target_file

            if rename:
                target_file = apply_replacement_rules(rename,
                                                      str(target_file))

            # continue to next iteration if extracted_file is excluded
            if exclude:
                try:
                    # since we need to skip the outside loop from the
                    # inside loop
                    for regexp in exclude:
                        if re.search(regexp, extracted_file):
                            lgr.debug(
                                "Skipping {extracted_file} since contains "
                                "{regexp} pattern".format(**locals()))
                            stats.skipped += 1
                            raise StopIteration
                except StopIteration:
                    continue

            if delete_after:
                # place target file in a temporary directory
                target_file = Path(prefix_dir) / Path(target_file)
                # but also allow for it in the orig
                target_file_orig = \
                    Path(prefix_dir) / Path(target_file_orig)

            target_file_path_orig = annex.pathobj / target_file_orig

            # If we were invoked in a subdirectory, patch together the
            # correct path
            target_file_path = extract_rpath / target_file \
                if extract_rpath else target_file
            target_file_path = annex.pathobj / target_file_path

            # when the file already exists...
            if lexists(target_file_path):
                handle_existing = True
                if md5sum(str(target_file_path)) == \
                        md5sum(str(extracted_path)):
                    if not annex.is_under_annex(str(extracted_path)):
                        # if under annex -- must be having the same
                        # content, we should just add possibly a new extra
                        # URL; but if under git -- we cannot/should not do
                        # anything about it ATM
                        if existing != 'overwrite':
                            continue
                    else:
                        handle_existing = False
                if not handle_existing:
                    pass  # nothing... just to avoid additional indentation
                elif existing == 'fail':
                    message = \
                        "{} exists, but would be overwritten by new file " \
                        "{}. Consider adjusting --existing".format(
                            target_file_path, extracted_file)
                    yield get_status_dict(
                        ds=ds,
                        status='error',
                        message=message,
                        **res_kwargs)
                    return
                elif existing == 'overwrite':
                    stats.overwritten += 1
                    # to make sure it doesn't conflict -- might have been a
                    # tree
                    rmtree(target_file_path)
                else:
                    # an elaborate dance to piece together new archive
                    # names
                    target_file_path_orig_ = target_file_path

                    # To keep the extension intact -- operate on the base
                    # of the filename
                    p, fn = os.path.split(target_file_path)
                    ends_with_dot = fn.endswith('.')
                    fn_base, fn_ext = file_basename(fn, return_ext=True)

                    if existing == 'archive-suffix':
                        fn_base += '-%s' % archive_basename
                    elif existing == 'numeric-suffix':
                        pass  # archive-suffix will have the same logic
                    else:
                        # we shouldn't get here, argparse should catch a
                        # non-existing value for --existing right away
                        raise ValueError(existing)
                    # keep incrementing the index in the suffix until the
                    # file name doesn't collide
                    suf, i = '', 0
                    while True:
                        connector = \
                            ('.' if (fn_ext or ends_with_dot) else '')
                        file = fn_base + suf + connector + fn_ext
                        target_file_path_new = \
                            Path(p) / Path(file)
                        if not lexists(target_file_path_new):
                            # we found a file name that is not yet taken
                            break
                        lgr.debug("Iteration %i of file name finding. "
                                  "File %s already exists", i,
                                  target_file_path_new)
                        i += 1
                        suf = '.%d' % i
                    target_file_path = target_file_path_new
                    lgr.debug("Original file %s will be saved into %s"
                              % (target_file_path_orig_, target_file_path))
                    # TODO: should we reserve smth like
                    # stats.clobbed += 1

            if target_file_path != target_file_path_orig:
                stats.renamed += 1

            if copy:
                raise NotImplementedError(
                    "Not yet copying from 'persistent' cache"
                )

            lgr.debug("Adding %s to annex pointing to %s and with options "
                      "%r", target_file_path, url, annex_options)

            out_json = annex.add_url_to_file(
                target_file_path,
                url, options=annex_options,
                batch=True)

            if 'key' in out_json and out_json['key'] is not None:
                # annex.is_under_annex(target_file, batch=True):
                # due to http://git-annex.branchable.com/bugs/annex_drop_is_not___34__in_effect__34___for_load_which_was___34__addurl_--batch__34__ed_but_not_yet_committed/?updated
                # we need to maintain a list of those to-be-dropped files
                if drop_after:
                    # drop extracted files after adding to annex
                    annex.drop_key(out_json['key'], batch=True)
                    stats.dropped += 1
                stats.add_annex += 1
            else:
                lgr.debug("File {} was added to git, not adding url".format(
                    target_file_path))
                stats.add_git += 1

            if delete_after:
                # we count the removal here, but don't yet perform it,
                # to not interfere with batched processes -- any pure Git
                # action invokes precommit which closes batched processes.
                stats.removed += 1

            # Done with target_file -- just to have a clear end of the loop
            del target_file

        if delete and archive and origin != 'key':
            lgr.debug("Removing the original archive {}".format(archive))
            # force=True since it sometimes might still be staged and fail
            annex.remove(str(archive_path), force=True)

        lgr.info("Finished adding %s: %s",
                 archive, stats.as_str(mode='line'))

        if outside_stats:
            outside_stats += stats
        if delete_after:
            # force since not committed. r=True for -r (passed into git
            # call to recurse)
            delete_after_rpath = opj(extract_rpath, prefix_dir) \
                if extract_rpath else prefix_dir
            delete_after_rpath = resolve_path(delete_after_rpath,
                                              ds=dataset)
            lgr.debug(
                "Removing extracted and annexed files under %s",
                delete_after_rpath
            )
            annex.remove(str(delete_after_rpath), r=True, force=True)
        if commit:
            archive_rpath = archive_path.relative_to(ds.path)
            commit_stats = outside_stats if outside_stats else stats
            # so batched ones close and files become annex symlinks etc
            annex.precommit()
            precommitted = True
            if any(r.get('state', None) != 'clean'
                   for p, r in annex.status(untracked='no').items()):
                annex.commit(
                    "Added content extracted from %s %s\n\n%s" %
                    (origin, archive_rpath,
                     commit_stats.as_str(mode='full')),
                    _datalad_msg=True
                )
                commit_stats.reset()
        else:
            # don't commit upon completion
            pass
    finally:
        # take down the progress bar
        log_progress(
            lgr.info, pbar_id,
            'Finished extraction',
            noninteractive_level=logging.INFO)
        # since we batched addurl, we should close those batched processes
        # if we haven't done so yet; explicitly checked to avoid any
        # possible "double-action"
        if not precommitted:
            annex.precommit()

        if delete_after_rpath:
            delete_after_path = opj(annex.path, delete_after_rpath)
            delete_after_rpath = resolve_path(delete_after_rpath,
                                              ds=dataset)
            if exists(delete_after_path):  # should not be there
                # but for paranoid yoh
                lgr.warning(
                    "Removing temporary directory under which extracted "
                    "files were annexed and should have been removed: %s",
                    delete_after_path)
                rmtree(delete_after_path)

        annex.always_commit = old_always_commit
        # remove what is left and/or everything upon failure
        earchive.clean(force=True)
        # remove tempfile directories (not cleaned up automatically):
        if prefix_dir is not None and lexists(prefix_dir):
            os.rmdir(prefix_dir)
    yield get_status_dict(
        ds=ds,
        status='ok',
        **res_kwargs)
    return annex
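# Invocation sketch for the command above (the archive name is a
# hypothetical assumption; the archive must already be saved/annexed in the
# dataset, as enforced by the status checks at the top):
#
# >>> import datalad.api as dl
# >>> ds = dl.Dataset('/tmp/ds')
# >>> results = ds.add_archive_content(
# ...     'data.tar.gz', strip_leading_dirs=True, delete=True)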
def __call__(dataset=None, path=None, data_only=True, recursive=False):
    # Note: copy logic from install to resolve dataset and path:
    # shortcut
    ds = dataset

    if ds is not None and not isinstance(ds, Dataset):
        ds = Dataset(ds)

    if not path:
        if ds is None:
            # no dataset, no target location, nothing to do
            raise ValueError(
                "insufficient information for uninstallation (needs at "
                "least a dataset or a path)")
    elif isinstance(path, list):
        # TODO: not sure. might be possible to deal with that list directly
        return [Uninstall.__call__(
                dataset=ds,
                path=p,
                data_only=data_only,
                recursive=recursive) for p in path]

    # resolve the target location against the provided dataset
    if path is not None:
        path = resolve_path(path, ds)

    lgr.debug("Resolved uninstallation target: {0}".format(path))

    # if we have no dataset given, figure out which one we need to operate
    # on, based on the resolved target location (that is now guaranteed to
    # be specified)
    if ds is None:
        # try to find a dataset at or above the installation target
        dspath = GitRepo.get_toppath(abspath(path))
        if dspath is None:
            # no top-level dataset found, use path as such
            dspath = path
        ds = Dataset(dspath)
    assert(ds is not None)

    lgr.debug("Resolved target dataset for uninstallation: {0}".format(ds))

    if not ds.is_installed():
        if not path or path == ds.path:
            # we want to uninstall the dataset itself, which is not
            # installed => nothing to do
            # TODO: consider `data` option! is_installed currently only
            # checks for a repository
            lgr.info("Dataset {0} not installed. Nothing to "
                     "do.".format(ds.path))
            return
        else:
            # we want to uninstall something from a not-installed dataset.
            # Doesn't make sense, does it? => fail
            raise ValueError(
                "Dataset {0} is not installed.".format(ds.path))

    assert(ds.repo is not None)

    if not path or path == ds.path:
        # uninstall the dataset `ds`
        # TODO: what to consider?
        #   - whether it is a submodule of another dataset
        #   - `data_only` ?
        #   - `recursive`
        #   - what to return in what case (data_only)?
        raise NotImplementedError("TODO: Uninstall dataset %s" % ds.path)

    # needed by the logic below
    assert(isabs(path))

    # express the destination path relative to the root of this dataset
    relativepath = relpath(path, start=ds.path)
    if path.startswith(pardir):
        raise ValueError("uninstallation path outside dataset")

    lgr.debug(
        "Resolved uninstallation target relative to dataset {0}: "
        "{1}".format(ds, relativepath))

    # figure out what the path actually is pointing to:
    if not exists(path):
        # nothing there, nothing to uninstall
        lgr.info("Nothing found to uninstall at %s" % path)
        return

    if relativepath in ds.get_dataset_handles(recursive=True):
        # it's a submodule
        # --recursive required or implied?
        raise NotImplementedError("TODO: uninstall submodule %s from "
                                  "dataset %s" % (relativepath, ds.path))

    if isdir(path):
        # don't know what to do yet
        # in git vs. untracked?
        # recursive?
        raise NotImplementedError("TODO: uninstall directory %s from "
                                  "dataset %s" % (path, ds.path))

    # we know it's an existing file
    # flag defaults, set depending on repo type below
    _file_in_git = False
    _untracked_or_within_submodule = False
    if isinstance(ds.repo, AnnexRepo):
        try:
            ds.repo.get_file_key(relativepath)
        except FileInGitError:
            # file directly in git
            _file_in_git = True
        except FileNotInAnnexError:
            # either an untracked file in this dataset, or something that
            # also actually exists in the file system but could be part of
            # a subdataset
            _untracked_or_within_submodule = True
        else:
            # it's an annexed file
            if data_only:
                ds.repo.annex_drop([path])
                return path
            else:
                raise NotImplementedError(
                    "TODO: fully uninstall file %s "
                    "(annex) from dataset %s" % (path, ds.path))
    else:
        # plain git repo
        if relativepath in ds.repo.get_indexed_files():
            # file directly in git
            _file_in_git = True
        else:
            # either an untracked file in this dataset, or something that
            # also actually exists in the file system but could be part of
            # a subdataset
            _untracked_or_within_submodule = True

    if _file_in_git:
        if data_only:
            raise ValueError("%s is not a file handle. Removing its "
                             "data only doesn't make sense." % path)
        else:
            return ds.repo.git_remove([relativepath])
    elif _untracked_or_within_submodule:
        subds = get_containing_subdataset(ds, relativepath)
        if ds.path != subds.path:
            # target path belongs to a subdataset, hand uninstallation
            # over to it
            return subds.uninstall(
                path=relpath(path, start=subds.path),
                data_only=data_only,
                recursive=recursive)

        # this must be an untracked/existing something;
        # it wasn't installed, so we cannot uninstall it
        raise ValueError("Cannot uninstall %s" % path)
def __call__(target, opts=None, dataset=None):
    # only non-bare repos have hashdirmixed, so require one
    ds = require_dataset(
        dataset, check_installed=True, purpose='ORA archive export')
    ds_repo = ds.repo

    # TODO remove once datalad 0.12rc7 or later is released
    if not hasattr(ds_repo, 'dot_git'):
        from datalad.support.gitrepo import GitRepo
        ds_repo.dot_git = ds_repo.pathobj / GitRepo.get_git_dir(ds_repo)

    annex_objs = ds_repo.dot_git / 'annex' / 'objects'

    archive = resolve_path(target, dataset)
    if archive.is_dir():
        archive = archive / 'archive.7z'
    else:
        archive.parent.mkdir(exist_ok=True, parents=True)

    if not opts:
        # uncompressed by default
        opts = ['-mx0']

    res_kwargs = dict(
        action="export-archive-ora",
        logger=lgr,
    )

    if not annex_objs.is_dir():
        yield get_status_dict(
            ds=ds,
            status='notneeded',
            message='no annex keys present',
            **res_kwargs,
        )
        return

    exportdir = ds_repo.dot_git / 'datalad' / 'tmp' / 'ora_archive'
    if exportdir.exists():
        yield get_status_dict(
            ds=ds,
            status='error',
            message=(
                'export directory already exists, please remove first: %s',
                str(exportdir)),
            **res_kwargs,
        )
        return

    keypaths = [
        k for k in annex_objs.glob(op.join('**', '*'))
        if k.is_file()
    ]

    log_progress(
        lgr.info,
        'oraarchiveexport',
        'Start ORA archive export %s', ds,
        total=len(keypaths),
        label='ORA archive export',
        unit=' Keys',
    )

    link_fx = os.link
    for keypath in keypaths:
        key = keypath.name
        hashdir = op.join(keypath.parts[-4], keypath.parts[-3])
        log_progress(
            lgr.info,
            'oraarchiveexport',
            'Export key %s to %s', key, hashdir,
            update=1,
            increment=True)
        keydir = exportdir / hashdir / key
        keydir.mkdir(parents=True, exist_ok=True)
        try:
            link_fx(str(keypath), str(keydir / key))
        except OSError:
            lgr.warning(
                'No hard links supported at %s, will copy files instead',
                str(keydir))
            # no hard links supported
            # switch function after the first error
            link_fx = shutil.copyfile
            link_fx(str(keypath), str(keydir / key))

    log_progress(
        lgr.info,
        'oraarchiveexport',
        'Finished ORA archive export from %s', ds)
    try:
        subprocess.run(
            ['7z', 'u', str(archive), '.'] + opts,
            cwd=str(exportdir),
        )
        yield get_status_dict(
            path=str(archive),
            type='file',
            status='ok',
            **res_kwargs)
    except Exception as e:
        yield get_status_dict(
            path=str(archive),
            type='file',
            status='error',
            message=('7z failed: %s', exc_str(e)),
            **res_kwargs)
        return
    finally:
        rmtree(str(exportdir))
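# Export sketch mirroring the signature above (the target location and 7z
# options are assumptions; '-mx5' selects moderate compression instead of
# the uncompressed default set above):
#
# >>> import datalad.api as dl
# >>> ds = dl.Dataset('/tmp/ds')
# >>> ds.export_archive_ora('/tmp/ds_keystore.7z', opts=['-mx5'])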
def __call__(path=None, initopts=None, *, force=False, description=None,
             dataset=None, annex=True, fake_dates=False, cfg_proc=None):
    # we only perform negative tests below
    no_annex = not annex

    if dataset:
        if isinstance(dataset, Dataset):
            ds = dataset
        else:
            ds = Dataset(dataset)
        refds_path = ds.path
    else:
        ds = refds_path = None

    # two major cases
    # 1. we got a `dataset` -> we either want to create it (path is None),
    #    or another dataset in it (path is not None)
    # 2. we got no dataset -> we want to create a fresh dataset at the
    #    desired location, either at `path` or PWD

    # sanity check first
    if no_annex:
        if description:
            raise ValueError("Incompatible arguments: cannot specify "
                             "description for annex repo and declaring "
                             "no annex repo.")

    if (isinstance(initopts, (list, tuple)) and '--bare' in initopts) or (
            isinstance(initopts, dict) and 'bare' in initopts):
        raise ValueError(
            "Creation of bare repositories is not supported. Consider "
            "one of the create-sibling commands, or use "
            "Git to init a bare repository and push an existing dataset "
            "into it.")

    if path:
        path = resolve_path(path, dataset)

    path = path if path \
        else getpwd() if ds is None \
        else refds_path

    # we know that we need to create a dataset at `path`
    assert(path is not None)

    # assure cfg_proc is a list (relevant if used via Python API)
    cfg_proc = ensure_list(cfg_proc)

    # prep for yield
    res = dict(action='create', path=str(path),
               logger=lgr, type='dataset',
               refds=refds_path)

    refds = None
    if refds_path and refds_path != str(path):
        refds = require_dataset(
            refds_path, check_installed=True,
            purpose='create a subdataset')

        path_inrefds = path_under_rev_dataset(refds, path)
        if path_inrefds is None:
            yield dict(
                res,
                status='error',
                message=(
                    "dataset containing given paths is not underneath "
                    "the reference dataset %s: %s",
                    ds, str(path)),
            )
            return

    # try to locate an immediate parent dataset
    # we want to know this (irrespective of whether we plan on adding
    # this new dataset to a parent) in order to avoid conflicts with
    # a potentially absent/uninstalled subdataset of the parent
    # in this location
    # it will cost some filesystem traversal though...
    parentds_path = get_dataset_root(
        op.normpath(op.join(str(path), os.pardir)))
    if parentds_path:
        prepo = GitRepo(parentds_path)
        parentds_path = Path(parentds_path)
        # we cannot get away with a simple
        # GitRepo.get_content_info(), as we need to detect
        # uninstalled/added subdatasets too
        check_path = Path(path)
        pstatus = prepo.status(
            untracked='no',
            # limit the query to the target path, for a potentially
            # massive speed-up
            paths=[check_path.relative_to(parentds_path)])
        if (not pstatus.get(check_path, {}).get("type") == "dataset"
                and any(check_path == p or check_path in p.parents
                        for p in pstatus)):
            # redo the check in a slower fashion, it is already broken;
            # let's take our time for a proper error message
            conflict = [
                p for p in pstatus
                if check_path == p or check_path in p.parents]
            res.update({
                'status': 'error',
                'message': (
                    'collision with content in parent dataset at %s: %s',
                    str(parentds_path),
                    [str(c) for c in conflict])})
            yield res
            return
        if not force:
            # another set of checks to see whether the target path is
            # pointing into a known subdataset that is not around ATM
            subds_status = {
                parentds_path / k.relative_to(prepo.path)
                for k, v in pstatus.items()
                if v.get('type', None) == 'dataset'}
            check_paths = [check_path]
            check_paths.extend(check_path.parents)
            if any(p in subds_status for p in check_paths):
                conflict = [p for p in check_paths if p in subds_status]
                res.update({
                    'status': 'error',
                    'message': (
                        'collision with %s (dataset) in dataset %s',
                        str(conflict[0]),
                        str(parentds_path))})
                yield res
                return

    # important to use the given Dataset object to avoid spurious ID
    # changes with not-yet-materialized Datasets
    tbds = ds if isinstance(ds, Dataset) and \
        ds.path == path else Dataset(str(path))

    # don't create in a non-empty directory without `force`:
    if op.isdir(tbds.path) and listdir(tbds.path) != [] and not force:
        res.update({
            'status': 'error',
            'message':
                'will not create a dataset in a non-empty directory, use '
                '`--force` option to ignore'})
        yield res
        return

    # Check if the specified cfg_proc(s) can be discovered, storing
    # the results so they can be used when the time comes to run
    # the procedure. If a procedure cannot be found, raise an
    # error to prevent creating the dataset.
    cfg_proc_specs = []
    if cfg_proc:
        discovered_procs = tbds.run_procedure(
            discover=True,
            result_renderer='disabled',
            return_type='generator',
        )
        for cfg_proc_ in cfg_proc:
            for discovered_proc in discovered_procs:
                if discovered_proc['procedure_name'] == 'cfg_' + cfg_proc_:
                    cfg_proc_specs.append(discovered_proc)
                    break
            else:
                raise ValueError("Cannot find procedure with name "
                                 "'%s'" % cfg_proc_)

    if initopts is not None and isinstance(initopts, list):
        initopts = {'_from_cmdline_': initopts}

    # Note for the code below:
    # OPT: be "smart" and avoid re-resolving .repo -- expensive in DataLad;
    # re-use the tbrepo instance, do not use tbds.repo

    # create and configure desired repository
    # also provides initial set of content to be tracked with git (not
    # annex)
    if no_annex:
        tbrepo, add_to_git = _setup_git_repo(path, initopts, fake_dates)
    else:
        tbrepo, add_to_git = _setup_annex_repo(
            path, initopts, fake_dates, description)

    # OPT: be "smart" and avoid re-resolving .repo -- expensive in DataLad.
    # Note, must not happen earlier (before the if), since "smart" it would
    # not be
    tbds_config = tbds.config

    # record an ID for this repo for the afterlife
    # to be able to track siblings and children
    id_var = 'datalad.dataset.id'
    # Note that the Dataset property `id` will change when we unset the
    # respective config. Therefore store it before:
    tbds_id = tbds.id
    if id_var in tbds_config:
        # make sure we reset this variable completely, in case of a
        # re-create
        tbds_config.unset(id_var, scope='branch')

    if _seed is None:
        # just the standard way
        # use a fully random identifier (i.e. UUID version 4)
        uuid_id = str(uuid.uuid4())
    else:
        # Let's generate preseeded ones
        uuid_id = str(uuid.UUID(int=random.getrandbits(128)))
    tbds_config.add(
        id_var,
        tbds_id if tbds_id is not None else uuid_id,
        scope='branch',
        reload=False)

    # make config overrides permanent in the repo config
    # this is similar to what `annex init` does
    # we are only doing this for config overrides and do not expose
    # a dedicated argument, because it is sufficient for the cmdline
    # and unnecessary for the Python API (there could simply be a
    # subsequent ds.config.add() call)
    for k, v in tbds_config.overrides.items():
        tbds_config.add(k, v, scope='local', reload=False)

    # all config manipulation is done -> full reload
    tbds_config.reload()

    # must use the repo.pathobj as this will have resolved symlinks
    add_to_git[tbrepo.pathobj / '.datalad'] = {
        'type': 'directory',
        'state': 'untracked'}

    # save everything, we need to do this now and cannot merge with the
    # call below, because we may need to add this subdataset to a parent
    # but cannot until we have a first commit
    tbrepo.save(
        message='[DATALAD] new dataset',
        git=True,
        # we have to supply our own custom status, as the repo does
        # not have a single commit yet and there is no HEAD reference
        # TODO make `GitRepo.status()` robust to this state.
        _status=add_to_git,
    )

    for cfg_proc_spec in cfg_proc_specs:
        yield from tbds.run_procedure(
            cfg_proc_spec,
            result_renderer='disabled',
            return_type='generator',
        )

    # the next only makes sense if we saved the created dataset,
    # otherwise we have no committed state to be registered
    # in the parent
    if isinstance(refds, Dataset) and refds.path != tbds.path:
        # we created a dataset in another dataset
        # -> make submodule
        yield from refds.save(
            path=tbds.path,
            return_type='generator',
            result_renderer='disabled',
        )

    res.update({'status': 'ok'})
    yield res
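# Creation sketch for the command above (locations are hypothetical;
# 'text2git' is a procedure shipped with DataLad that the cfg_proc
# discovery above would resolve to 'cfg_text2git'):
#
# >>> import datalad.api as dl
# >>> ds = dl.create('/tmp/new_ds', cfg_proc=['text2git'])
# >>> subds = dl.create('/tmp/new_ds/sub', dataset='/tmp/new_ds')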
def __call__( path=None, dataset=None, recursive=False, recursion_limit=None, action=None, unavailable_path_status='', unavailable_path_msg=None, nondataset_path_status='error', force_parentds_discovery=True, force_subds_discovery=True, force_no_revision_change_discovery=True, force_untracked_discovery=True, modified=None): # upfront check for the fastest possible response if not path and dataset is None: # nothing given, try "here", but do not use `require_dataset`, as # it will determine the root dataset of `curdir` and further down # lead to path annotation of upstairs directories dataset = curdir if force_subds_discovery and not force_parentds_discovery: raise ValueError( 'subdataset discovery requires parent dataset discovery') # CONCEPT: yield with no status to indicate further processing # everything in one big loop to be able too yield as fast a possible # without any precomputing for all paths refds_path = Interface.get_refds_path(dataset) if modified is not None and (refds_path is None or not GitRepo.is_valid_repo(refds_path)): raise ValueError( "modification detection only works with a base dataset (non-given or found)") # prep common result props res_kwargs = dict( action=action if action else 'annotate_path', refds=refds_path, logger=lgr) # handle the case of recursion into a single dataset without any # extra fancy processing first -- full recursion can be done # faster than manual recursion, hence we gain quite some speed # from these few lines of extra code if not modified and not path and refds_path: if not GitRepo.is_valid_repo(refds_path): yield get_status_dict( # doesn't matter if the path is in another dataset # it was given as reference dataset status=nondataset_path_status, message='given reference dataset is not a dataset', path=refds_path, **res_kwargs) return refds = Dataset(refds_path) path = [] # yield the dataset itself r = get_status_dict(ds=refds, status='', **res_kwargs) yield r if recursive: # if we have nothing given, but need recursion, we need to feed # the dataset path itself for r in yield_recursive( refds, refds_path, action, recursion_limit): r.update(res_kwargs) if 'refds' in r and not r['refds']: # avoid cruft del r['refds'] yield r return # goal: structure in a way that makes most information on any path # available in a single pass, at the cheapest possible cost reported_paths = {} requested_paths = assure_list(path) if modified is not None: # modification detection would silently kill all nondataset paths # but we have to complain about them, hence doing it here if requested_paths and refds_path: for r in requested_paths: p = r['path'] if isinstance(r, dict) else r p = resolve_path(p, ds=refds_path) if path_startswith(p, refds_path): # all good continue # not the refds path_props = r if isinstance(r, dict) else {} res = get_status_dict( **dict(res_kwargs, **path_props)) res['status'] = nondataset_path_status res['message'] = 'path not associated with reference dataset' reported_paths[r] = res yield res # preserve non-existing paths to be silently killed by modification # detection and append them to requested_paths again after detection. # TODO: This might be melted in with treatment of non dataset paths # above. Re-appending those paths seems to be better than yielding # directly to avoid code duplication, since both cases later on are # dealt with again. 
preserved_paths = [] if requested_paths: [preserved_paths.append(r) for r in requested_paths if not lexists(r['path'] if isinstance(r, dict) else r)] # replace the requested paths by those paths that were actually # modified underneath or at a requested location requested_paths = get_modified_subpaths( # either the request, or the base dataset, if there was no request requested_paths if requested_paths else [refds_path], refds=Dataset(refds_path), revision=modified, report_no_revision_change=force_no_revision_change_discovery, report_untracked='all' if force_untracked_discovery else 'no', recursion_limit=recursion_limit) from itertools import chain # re-append the preserved paths: requested_paths = chain(requested_paths, iter(preserved_paths)) # do not loop over unique(), this could be a list of dicts # we avoid duplicates manually below via `reported_paths` for path in requested_paths: if not isinstance(path, dict): path = rawpath2ap(path, refds_path) # this is now an annotated path! path_props = path path = path['path'] # we need to mark our territory, who knows where this has been path_props.update(res_kwargs) if path in reported_paths: # we already recorded this path in the output # this can happen, whenever `path` is a subdataset, that was # discovered via recursive processing of another path before continue # the path exists in some shape or form # TODO if we have path_props already we could skip this test if isdir(path): # keep any existing type info, previously a more expensive run # could have discovered an uninstalled 'dataset', and we don't # want it to be relabeled to a directory path_props['type'] = \ path_props.get( 'type', 'dataset' if not islink(path) and GitRepo.is_valid_repo(path) else 'directory') # this could contain all types of additional content containing_dir = path if not islink(path) else normpath(opj(path, pardir)) else: if lexists(path): path_props['type'] = 'file' else: path_props['state'] = 'absent' # for everything else we are interested in the container containing_dir = dirname(path) if not containing_dir: containing_dir = curdir dspath = parent = get_dataset_root(containing_dir) if dspath: if path_props.get('type', None) == 'dataset': # for a dataset the root is not the parent, for anything else # it is parent = path_props.get('parentds', None) oneupdir = normpath(opj(containing_dir, pardir)) if parent is None and (force_parentds_discovery or ( refds_path and _with_sep(oneupdir).startswith( _with_sep(refds_path)))): # either forced, or only if we have a reference dataset, and # only if we stay within this refds when searching for the # parent parent = get_dataset_root(normpath(opj(containing_dir, pardir))) # NOTE the `and refds_path` is critical, as it will determine # whether a top-level dataset that was discovered gets the # parent property or not, it won't get it without a common # base dataset, and that is how we always rolled if parent and refds_path: path_props['parentds'] = parent # don't check whether this is actually a true subdataset of the # parent, done further down else: # set parent, but prefer existing property path_props['parentds'] = path_props.get('parentds', dspath) # test for `dspath` not `parent`, we only need to know whether there is # ANY dataset, not which one is the true parent, logic below relies on # the fact that we end here, if there is no dataset at all if not dspath: # not in any dataset res = get_status_dict( **dict(res_kwargs, **path_props)) res['status'] = nondataset_path_status res['message'] = 'path not associated with 
any dataset' reported_paths[path] = res yield res continue # check that we only got SUBdatasets if refds_path and not path_startswith(dspath, refds_path): res = get_status_dict(**dict(res_kwargs, **path_props)) res['status'] = nondataset_path_status res['message'] = \ ('path not part of the reference dataset at %s', refds_path) reported_paths[path] = res yield res continue if path_props.get('type', None) == 'file': # nothing else we can learn about this res = get_status_dict(**dict(res_kwargs, **path_props)) if 'status' not in res: res['status'] = '' reported_paths[path] = res yield res continue containing_ds = None path_type = path_props.get('type', None) if parent and force_subds_discovery and ( (path_type == 'dataset' and 'registered_subds' not in path_props) or path_type == 'directory' or not lexists(path)): # if the path doesn't exist, or is labeled a directory, or a dataset even # a dataset (without this info) -> record whether this is a known subdataset # to its parent containing_ds = Dataset(parent) subdss = containing_ds.subdatasets( fulfilled=None, recursive=False, result_xfm=None, result_filter=None, return_type='list') if path in [s['path'] for s in subdss]: if path_type == 'directory' or not lexists(path): # first record that it isn't here, if just a dir or not here at all path_props['state'] = 'absent' # this must be a directory, and it is not installed path_props['type'] = 'dataset' path_props['registered_subds'] = True if not lexists(path) or \ (path_props.get('type', None) == 'dataset' and path_props.get('state', None) == 'absent'): # not there (yet) message = unavailable_path_msg if unavailable_path_msg else None if message and '%s' in message: message = (message, path) path_props['message'] = message res = get_status_dict(**dict(res_kwargs, **path_props)) # assign given status, but only if the props don't indicate a status # already res['status'] = path_props.get( 'status', unavailable_path_status) reported_paths[path] = res yield res continue # we know everything we can, report res = get_status_dict(**dict(res_kwargs, **path_props)) if 'status' not in res: res['status'] = '' reported_paths[path] = res yield res rec_paths = [] if recursive: # here we need to consider the special case that `path` is # a dataset itself, if a recursion_limit is given (e.g. # `remove` will do that by default), we need to recurse # from the dataset itself, and not its parent to get things # right -- this will also avoid needless discovery of # unrelated subdatasets if path_props.get('type', None) == 'dataset': containing_ds = Dataset(path) else: # regular parent, we might have a dataset already containing_ds = Dataset(parent) if containing_ds is None else containing_ds for r in yield_recursive(containing_ds, path, action, recursion_limit): # capture reported paths r.update(res_kwargs) if 'refds' in r and not r['refds']: # avoid cruft del r['refds'] reported_paths[r['path']] = r if modified is not None: # we cannot yield right away, maybe it wasn't modified rec_paths.append(r) else: yield r if modified is not None and rec_paths: # replace the recursively discovered paths by those paths that # were actually modified underneath or at a requested location for r in get_modified_subpaths( rec_paths, refds=Dataset(refds_path), revision=modified, report_no_revision_change=force_no_revision_change_discovery, report_untracked='all' if force_untracked_discovery else 'no', recursion_limit=recursion_limit): res = get_status_dict(**dict(r, **res_kwargs)) reported_paths[res['path']] = res yield res return
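# annotate_paths reduces every input to a uniform "annotated path" record
# with keys like `path`, `type`, `state`, and `parentds`. A minimal sketch of
# the filesystem-only part of that classification (`classify_path` is a
# hypothetical helper; the real code above additionally consults Git):

import os.path as op


def classify_path(path):
    """Return the 'type'/'state' portion of an annotated-path record."""
    props = {'path': op.abspath(path)}
    if op.isdir(path):
        # a directory carrying a .git entry is treated as a dataset
        props['type'] = 'dataset' \
            if op.lexists(op.join(path, '.git')) else 'directory'
    elif op.lexists(path):
        props['type'] = 'file'
    else:
        props['state'] = 'absent'
    return props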
def __call__(dataset, urlfile, urlformat, filenameformat, input_type="ext", exclude_autometa=None, meta=None, message=None, dry_run=False, fast=False, ifexists=None, missing_value=None, save=True, version_urls=False, cfg_proc=None): # Temporarily work around gh-2269. url_file = urlfile url_format, filename_format = urlformat, filenameformat from requests.exceptions import RequestException from datalad.distribution.dataset import Dataset, require_dataset from datalad.interface.results import get_status_dict from datalad.support.annexrepo import AnnexRepo lgr = logging.getLogger("datalad.plugin.addurls") ds = require_dataset(dataset, check_installed=False) if ds.repo and not isinstance(ds.repo, AnnexRepo): yield get_status_dict(action="addurls", ds=ds, status="error", message="not an annex repo") return url_file = str(resolve_path(url_file, dataset)) if input_type == "ext": extension = os.path.splitext(url_file)[1] input_type = "json" if extension == ".json" else "csv" with open(url_file) as fd: try: rows, subpaths = extract(fd, input_type, url_format, filename_format, exclude_autometa, meta, dry_run, missing_value) except (ValueError, RequestException) as exc: yield get_status_dict(action="addurls", ds=ds, status="error", message=exc_str(exc)) return if not rows: yield get_status_dict(action="addurls", ds=ds, status="notneeded", message="No rows to process") return if len(rows) != len(set(row["filename"] for row in rows)): yield get_status_dict(action="addurls", ds=ds, status="error", message=("There are file name collisions; " "consider using {_repindex}")) return if dry_run: for subpath in subpaths: lgr.info("Would create a subdataset at %s", subpath) for row in rows: lgr.info("Would download %s to %s", row["url"], os.path.join(ds.path, row["filename"])) lgr.info( "Metadata: %s", sorted(u"{}={}".format(k, v) for k, v in row["meta_args"].items())) yield get_status_dict(action="addurls", ds=ds, status="ok", message="dry-run finished") return if not ds.repo: # Populate a new dataset with the URLs. for r in ds.create(result_xfm=None, return_type='generator', cfg_proc=cfg_proc): yield r annex_options = ["--fast"] if fast else [] for spath in subpaths: if os.path.exists(os.path.join(ds.path, spath)): lgr.warning("Not creating subdataset at existing path: %s", spath) else: for r in ds.create(spath, result_xfm=None, cfg_proc=cfg_proc, return_type='generator'): yield r for row in rows: # Add additional information that we'll need for various # operations. filename_abs = os.path.join(ds.path, row["filename"]) if row["subpath"]: ds_current = Dataset(os.path.join(ds.path, row["subpath"])) ds_filename = os.path.relpath(filename_abs, ds_current.path) else: ds_current = ds ds_filename = row["filename"] row.update({ "filename_abs": filename_abs, "ds": ds_current, "ds_filename": ds_filename }) if version_urls: num_urls = len(rows) log_progress(lgr.info, "addurls_versionurls", "Versioning %d URLs", num_urls, label="Versioning URLs", total=num_urls, unit=" URLs") for row in rows: url = row["url"] try: row["url"] = get_versioned_url(url) except (ValueError, NotImplementedError) as exc: # We don't expect this to happen because get_versioned_url # should return the original URL if it isn't an S3 bucket. # It only raises exceptions if it doesn't know how to # handle the scheme for what looks like an S3 bucket. 
lgr.warning("error getting version of %s: %s", row["url"], exc_str(exc)) log_progress(lgr.info, "addurls_versionurls", "Versioned result for %s: %s", url, row["url"], update=1, increment=True) log_progress(lgr.info, "addurls_versionurls", "Finished versioning URLs") files_to_add = set() for r in add_urls(rows, ifexists=ifexists, options=annex_options): if r["status"] == "ok": files_to_add.add(r["path"]) yield r msg = message or """\ [DATALAD] add files from URLs url_file='{}' url_format='{}' filename_format='{}'""".format(url_file, url_format, filename_format) if files_to_add: meta_rows = [r for r in rows if r["filename_abs"] in files_to_add] for r in add_meta(meta_rows): yield r if save: for r in ds.save(path=files_to_add, message=msg, recursive=True): yield r
def __call__(path=None, dataset=None, reporton='all', recursive=False): # prep results res_kwargs = dict(action='meta_dump', logger=lgr) ds = require_dataset(dataset=dataset, check_installed=True, purpose='aggregate metadata query') if dataset: res_kwargs['refds'] = ds.path agginfos = get_ds_aggregate_db( ds.pathobj, version=str(aggregate_layout_version), # we are handling errors below warn_absent=False, ) if not agginfos: # if there has ever been an aggregation run, this file would # exist, hence there has not been and we need to tell this # to people yield get_status_dict( ds=ds, status='impossible', message='metadata aggregation has never been performed in ' 'this dataset', **res_kwargs) return if not path: # implement https://github.com/datalad/datalad/issues/3282 path = ds.pathobj if isinstance(dataset, Dataset) else os.getcwd() # check for paths that are not underneath this dataset resolved_paths = set() for p in assure_list(path): p = resolve_path(p, dataset) if p != ds.pathobj and ds.pathobj not in p.parents: raise ValueError( 'given path {} is not underneath dataset {}'.format(p, ds)) resolved_paths.add(p) # sort paths into their containing dataset aggregate records paths_by_ds = {} while resolved_paths: resolved_path = resolved_paths.pop() # find the first dataset that matches for aggdspath in sorted(agginfos, reverse=True): if recursive and resolved_path in aggdspath.parents: ps = paths_by_ds.get(aggdspath, set()) ps.add(aggdspath) paths_by_ds[aggdspath] = ps elif aggdspath == resolved_path \ or aggdspath in resolved_path.parents: ps = paths_by_ds.get(aggdspath, set()) ps.add(resolved_path) paths_by_ds[aggdspath] = ps # stop when the containing dataset is found break # which files do we need to have locally to perform the query info_keys = \ ('dataset_info', 'content_info') \ if reporton in ('all', 'jsonld') else \ ('dataset_info',) if reporton == 'datasets' else \ ('content_info',) if reporton == 'files' else \ [] objfiles = [ text_type(agginfos[d][t]) for d in paths_by_ds for t in info_keys if t in agginfos[d] ] lgr.debug( 'Verifying/achieving local availability of %i metadata objects', len(objfiles)) if objfiles: for r in ds.get(path=objfiles, result_renderer='disabled', return_type='generator'): # report only of not a success as this is an internal operation # that a user would not (need to) expect if success_status_map.get( r['status'], False) != 'success': # pragma: no cover yield r contexts = {} nodes_by_context = {} parentds = [] # loop over all records to get complete parentds relationships for aggdspath in sorted(agginfos): while parentds and parentds[-1] not in aggdspath.parents: parentds.pop() if aggdspath not in paths_by_ds: # nothing to say about this parentds.append(aggdspath) continue agg_record = agginfos[aggdspath] if reporton == 'aggregates': # we do not need to loop over the actual query paths, as # the aggregates of the containing dataset will contain # the desired info, if any exists # convert pathobj before emitting until we became more clever info = { k: text_type(v) if isinstance(v, ut.PurePath) else v for k, v in iteritems(agg_record) } info.update( path=text_type(aggdspath), type='dataset', ) if aggdspath == ds.pathobj: info['layout_version'] = aggregate_layout_version if parentds: info['parentds'] = text_type(parentds[-1]) yield dict(info, status='ok', **res_kwargs) parentds.append(aggdspath) continue # pull out actual metadata records for res in _yield_metadata_records( aggdspath, agg_record, paths_by_ds[aggdspath], reporton, parentds=parentds[-1] if 
parentds else None):
            if reporton != 'jsonld':
                yield dict(res, **res_kwargs)
                continue
            collect_jsonld_metadata(
                aggdspath, res, nodes_by_context, contexts)
        parentds.append(aggdspath)
    if reporton == 'jsonld':
        yield dict(
            status='ok',
            type='dataset',
            path=ds.path,
            metadata=format_jsonld_metadata(nodes_by_context),
            refcommit=agginfos[ds.pathobj]['refcommit'],
            **res_kwargs)
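# The record lookup above relies on reverse-sorted aggregate paths: deeper
# dataset paths sort first, so the first match is the closest containing
# record. The same idea in isolation, with plain pathlib (the helper name is
# made up):

from pathlib import PurePosixPath


def containing_record(path, record_paths):
    """Return the deepest record path that equals or contains `path`."""
    query = PurePosixPath(path)
    for candidate in sorted(record_paths, reverse=True):
        candidate = PurePosixPath(candidate)
        if candidate == query or candidate in query.parents:
            return str(candidate)
    return None


assert containing_record('/ds/sub/file', ['/ds', '/ds/sub']) == '/ds/sub'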
def __call__( target, opts=None, *, # opts is positional but optional in CLI dataset=None, remote=None, annex_wanted=None, froms=None, missing_content='error',): # only non-bare repos have hashdirmixed, so require one ds = require_dataset( dataset, check_installed=True, purpose='export to ORA archive') ds_repo = ds.repo annex_objs = ds_repo.dot_git / 'annex' / 'objects' archive = resolve_path(target, dataset) if archive.is_dir(): archive = archive / 'archive.7z' else: archive.parent.mkdir(exist_ok=True, parents=True) froms = ensure_list(froms) if not opts: # uncompressed by default opts = ['-mx0'] res_kwargs = dict( action="export-archive-ora", logger=lgr, ) if not annex_objs.is_dir(): yield get_status_dict( ds=ds, status='notneeded', message='no annex keys present', **res_kwargs, ) return exportdir = ds_repo.dot_git / 'datalad' / 'tmp' / 'ora_archive' if exportdir.exists(): yield get_status_dict( ds=ds, status='error', message=( 'export directory already exists, please remove first: %s', str(exportdir)), **res_kwargs, ) return def expr_to_opts(expr): opts = [] expr = expr.replace('(', ' ( ').replace(')', ' ) ') for sub_expr in expr.split(' '): if len(sub_expr): if sub_expr in '()': opts.append(f"-{sub_expr}") else: opts.append(f"--{sub_expr}") return opts find_filters = [] if remote: find_filters = ['-('] + expr_to_opts(ds_repo.get_preferred_content('wanted', remote)) + ['-)'] if annex_wanted: find_filters.extend(expr_to_opts(annex_wanted)) # git-annex find results need to be uniqued with set, as git-annex find # will return duplicates if multiple symlinks point to the same key. if froms: keypaths = set([ annex_objs.joinpath(k) for treeish in froms for k in ds_repo.call_annex_items_([ 'find', *find_filters, f"--branch={treeish}", "--format=${hashdirmixed}${key}/${key}\\n"]) ]) else: keypaths = set(annex_objs.joinpath(k) for k in ds_repo.call_annex_items_([ 'find', *find_filters, "--format=${hashdirmixed}${key}/${key}\\n" ])) log_progress( lgr.info, 'oraarchiveexport', 'Start ORA archive export %s', ds, total=len(keypaths), label='ORA archive export', unit=' Keys', ) if missing_content == 'continue': missing_file_lgr_func = lgr.warning elif missing_content == 'ignore': missing_file_lgr_func = lgr.debug link_fx = os.link for keypath in keypaths: key = keypath.name hashdir = op.join(keypath.parts[-4], keypath.parts[-3]) log_progress( lgr.info, 'oraarchiveexport', 'Export key %s to %s', key, hashdir, update=1, increment=True) keydir = exportdir / hashdir / key keydir.mkdir(parents=True, exist_ok=True) try: link_fx(str(keypath), str(keydir / key)) except FileNotFoundError as e: if missing_content == 'error': raise IOError('Key %s has no content available' % keypath) missing_file_lgr_func( 'Key %s has no content available', str(keypath)) except OSError: lgr.warning( 'No hard links supported at %s, will copy files instead', str(keypath)) # no hard links supported # switch function after first error link_fx = shutil.copyfile link_fx(str(keypath), str(keydir / key)) log_progress( lgr.info, 'oraarchiveexport', 'Finished RIA archive export from %s', ds ) try: subprocess.run( ['7z', 'u', str(archive), '.'] + opts, cwd=str(exportdir), ) yield get_status_dict( path=str(archive), type='file', status='ok', **res_kwargs) except Exception as e: ce = CapturedException(e) yield get_status_dict( path=str(archive), type='file', status='error', message=('7z failed: %s', ce), exception=ce, **res_kwargs) return finally: rmtree(str(exportdir))
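# `expr_to_opts` above rewrites a git-annex preferred-content expression into
# `git annex find` matching options, e.g. 'include=*.dat and exclude=big/*'
# becomes ['--include=*.dat', '--and', '--exclude=big/*']. Re-stated here so
# the transformation can be exercised standalone:

def expr_to_opts(expr):
    opts = []
    # give parentheses their own tokens before splitting
    expr = expr.replace('(', ' ( ').replace(')', ' ) ')
    for sub_expr in expr.split(' '):
        if len(sub_expr):
            # parentheses become -( and -), everything else a long option
            opts.append(f"-{sub_expr}" if sub_expr in '()'
                        else f"--{sub_expr}")
    return opts


assert expr_to_opts('include=*.dat and exclude=big/*') == \
    ['--include=*.dat', '--and', '--exclude=big/*']
assert expr_to_opts('(not copies=2)') == ['-(', '--not', '--copies=2', '-)']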
def __call__(urls, *, dataset=None, path=None, overwrite=False, archive=False, save=True, message=None): from ..downloaders.http import HTTPDownloader from ..downloaders.providers import Providers ds = None if save or dataset: try: ds = require_dataset(dataset, check_installed=True, purpose='download urls') except NoDatasetFound: pass common_report = {"action": "download_url", "ds": ds} got_ds_instance = isinstance(dataset, Dataset) dir_is_target = not path or str(path).endswith(op.sep) path = str(resolve_path(path or op.curdir, ds=dataset)) if dir_is_target: # resolve_path() doesn't preserve trailing separators. Add one for # the download() call. path = path + op.sep urls = ensure_list_from_str(urls) if not dir_is_target: if len(urls) > 1: yield get_status_dict( status="error", message= ("When specifying multiple urls, --path should point to " "a directory target (with a trailing separator). Got %r", path), type="file", path=path, **common_report) return if archive: # make sure the file suffix indicated by a URL is preserved # so that any further archive processing doesn't have to # employ mime type inspection in order to determine the archive # type from datalad.support.network import URL suffixes = PurePosixPath(URL(urls[0]).path).suffixes if not Path(path).suffixes == suffixes: path += ''.join(suffixes) # we know that we have a single URL # download() would be fine getting an existing directory and # downloading the URL underneath it, but let's enforce a trailing # slash here for consistency. if op.isdir(path): yield get_status_dict( status="error", message=( "Non-directory path given (no trailing separator) " "but a directory with that name (after adding archive " "suffix) exists"), type="file", path=path, **common_report) return # TODO setup fancy ui.progressbars doing this in parallel and reporting overall progress # in % of urls which were already downloaded providers = Providers.from_config_files() downloaded_paths = [] path_urls = {} need_datalad_remote = False for url in urls: # somewhat "ugly" downloader = providers.get_provider(url).get_downloader(url) try: downloaded_path = downloader.download(url, path=path, overwrite=overwrite) except Exception as e: ce = CapturedException(e) yield get_status_dict(status="error", message=str(ce), type="file", path=path, exception=ce, **common_report) else: if not need_datalad_remote \ and (downloader.authenticator or downloader.credential or type(downloader) != HTTPDownloader): need_datalad_remote = True downloaded_paths.append(downloaded_path) path_urls[downloaded_path] = url yield get_status_dict(status="ok", type="file", path=downloaded_path, **common_report) if downloaded_paths and save and ds is not None: msg = message or """\ [DATALAD] Download URLs URLs: {}""".format("\n ".join(urls)) for r in Save()( downloaded_paths, message=msg, # ATTN: Pass the original dataset argument to # preserve relative path handling semantics. dataset=dataset, return_type="generator", result_renderer='disabled', result_xfm=None, result_filter=None, on_failure="ignore"): yield r ds_repo = ds.repo if isinstance(ds_repo, AnnexRepo): if need_datalad_remote: from datalad.customremotes.base import ( ensure_datalad_remote, ) ensure_datalad_remote(ds_repo, autoenable=True, encryption=None) if got_ds_instance: # Paths in `downloaded_paths` are already relative to the # dataset. rpaths = dict(zip(downloaded_paths, downloaded_paths)) else: # Paths in `downloaded_paths` are already relative to the # current working directory. 
                # Take these relative to the dataset for use with the
                # AnnexRepo method calls.
                rpaths = {}
                for orig_path, resolved in zip(
                        downloaded_paths,
                        resolve_path(downloaded_paths, ds=dataset)):
                    rpath = path_under_rev_dataset(ds, resolved)
                    if rpath:
                        rpaths[str(rpath)] = orig_path
                    else:
                        lgr.warning("Path %s not under dataset %s",
                                    orig_path, ds)

            annex_paths = [
                p for p, annexed in zip(
                    rpaths,
                    ds_repo.is_under_annex(list(rpaths.keys())))
                if annexed]
            if annex_paths:
                for path in annex_paths:
                    url = path_urls[rpaths[path]]
                    try:
                        # The file is already present. This is just to
                        # register the URL.
                        ds_repo.add_url_to_file(
                            path,
                            url,
                            # avoid batch mode for single files
                            # https://github.com/datalad/datalad/issues/2849
                            batch=len(annex_paths) > 1,
                            # bypass URL size check, we already have the file
                            options=['--relaxed'])
                    except CommandError as exc:
                        lgr.warning("Registering %s with %s failed: %s",
                                    path, url, CapturedException(exc))
                if archive:
                    for path in annex_paths:
                        yield from ds.add_archive_content(
                            path,
                            delete=True,
                            on_failure='ignore',
                            return_type='generator',
                            result_renderer='disabled')
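# download_url encodes "directory target" solely as a trailing path
# separator, which resolve_path() strips and the code above restores. The
# convention in isolation (`normalize_target` is a hypothetical helper):

import os.path as op


def normalize_target(path):
    """Resolve a target path while preserving the directory-target marker."""
    dir_is_target = not path or str(path).endswith(op.sep)
    resolved = op.abspath(path or op.curdir)
    # restore the separator lost in resolution so the intent survives
    return resolved + op.sep if dir_is_target else resolved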
def __call__( path=None, dataset=None, recursive=False, recursion_limit=None): refds = require_dataset(dataset, check_installed=True, purpose="unlock") # Before passing the results to status() # * record explicitly specified non-directory paths so that we can # decide whether to yield a result for reported paths # * filter out and yield results for paths that don't exist res_paths_nondir = set() paths_lexist = None res_paths = list() if path: # Note, that we need unresolved versions of the path input to be # passed on to status. See gh-5456 for example. path = ensure_list(path) res_paths = resolve_path(path, ds=dataset) paths_lexist = [] res_paths_lexist = [] for p, p_r in zip(path, res_paths): if p_r.exists() or p_r.is_symlink(): paths_lexist.append(p) res_paths_lexist.append(p_r) if not p_r.is_dir(): res_paths_nondir.add(p_r) res_kwargs = dict(action='unlock', logger=lgr, refds=refds.path) if res_paths: for p in set(res_paths).difference(set(res_paths_lexist)): yield get_status_dict( status="impossible", path=str(p), type="file", message="path does not exist", **res_kwargs) if not (paths_lexist or paths_lexist is None): return # Collect information on the paths to unlock. to_unlock = defaultdict(list) # ds => paths (relative to ds) for res in Status()( # ATTN: it is vital to pass the `dataset` argument as it, # and not a dataset instance in order to maintain the path # semantics between here and the status() call dataset=dataset, path=paths_lexist, untracked="normal" if res_paths_nondir else "no", report_filetype=False, annex="availability", recursive=recursive, recursion_limit=recursion_limit, result_renderer='disabled', on_failure="ignore"): if res["action"] != "status" or res["status"] != "ok": yield res continue has_content = res.get("has_content") if has_content: parentds = res["parentds"] to_unlock[parentds].append(op.relpath(res["path"], parentds)) elif res_paths_nondir and Path(res["path"]) in res_paths_nondir: if has_content is False: msg = "no content present" status = "impossible" elif res["state"] == "untracked": msg = "untracked" status = "impossible" else: # This is either a regular git file or an unlocked annex # file. msg = "non-annex file" status = "notneeded" yield get_status_dict( status=status, path=res["path"], type="file", message="{}; cannot unlock".format(msg), **res_kwargs) # Do the actual unlocking. for ds_path, files in to_unlock.items(): ds = Dataset(ds_path) for r in ds.repo._call_annex_records( ["unlock"], files=files): yield get_status_dict( path=op.join(ds.path, r['file']), status='ok' if r['success'] else 'error', type='file', **res_kwargs)
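# unlock batches files by their containing dataset (`to_unlock` above), so a
# single `git annex unlock` invocation per repository covers all of its
# files. The grouping pattern alone, with plain dicts standing in for
# status() results:

import os.path as op
from collections import defaultdict


def group_by_parentds(records):
    """Map each parent dataset path to its files, relative to that dataset."""
    to_unlock = defaultdict(list)
    for res in records:
        if res.get("has_content"):
            parentds = res["parentds"]
            to_unlock[parentds].append(op.relpath(res["path"], parentds))
    return to_unlock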
def __call__(dataset=None, path=None, data_only=True, recursive=False): # Note: copy logic from install to resolve dataset and path: # shortcut ds = dataset if ds is not None and not isinstance(ds, Dataset): ds = Dataset(ds) if not path: if ds is None: # no dataset, no target location, nothing to do raise ValueError( "insufficient information for uninstallation (needs at " "least a dataset or a path") elif isinstance(path, list): # TODO: not sure. might be possible to deal with that list directly return [ Uninstall.__call__(dataset=ds, path=p, data_only=data_only, recursive=recursive) for p in path ] # resolve the target location against the provided dataset if path is not None: path = resolve_path(path, ds) lgr.debug("Resolved uninstallation target: {0}".format(path)) # if we have no dataset given, figure out which one we need to operate # on, based on the resolved target location (that is now guaranteed to # be specified if ds is None: # try to find a dataset at or above the installation target dspath = GitRepo.get_toppath(abspath(path)) if dspath is None: # no top-level dataset found, use path as such dspath = path ds = Dataset(dspath) assert (ds is not None) lgr.debug("Resolved target dataset for uninstallation: {0}".format(ds)) if not ds.is_installed(): if not path or path == ds.path: # we want to uninstall the dataset itself, which is not # installed => nothing to do # TODO: consider `data` option! is_installed currently only # checks for a repository lgr.info("Dataset {0} not installed. Nothing to " "do.".format(ds.path)) return else: # we want to uninstall something from a not installed dataset # Doesn't make sense, does it? => fail raise ValueError("Dataset {0} is not installed.".format( ds.path)) assert (ds.repo is not None) if not path or path == ds.path: # uninstall the dataset `ds` # TODO: what to consider? # - whether it is a submodule of another dataset # - `data_only` ? # - `recursive` # - what to return in what case (data_only)? raise NotImplementedError("TODO: Uninstall dataset %s" % ds.path) # needed by the logic below assert (isabs(path)) # express the destination path relative to the root of this dataset relativepath = relpath(path, start=ds.path) if path.startswith(pardir): raise ValueError("uninstallation path outside dataset") lgr.debug( "Resolved uninstallation target relative to dataset {0}: {1}". format(ds, relativepath)) # figure out, what path actually is pointing to: if not exists(path): # nothing there, nothing to uninstall lgr.info("Nothing found to uninstall at %s" % path) return if relativepath in ds.get_dataset_handles(recursive=True): # it's a submodule # --recursive required or implied? raise NotImplementedError("TODO: uninstall submodule %s from " "dataset %s" % (relativepath, ds.path)) if isdir(path): # don't know what to do yet # in git vs. untracked? # recursive? 
raise NotImplementedError("TODO: uninstall directory %s from " "dataset %s" % (path, ds.path)) # we know, it's an existing file if isinstance(ds.repo, AnnexRepo): try: ds.repo.get_file_key(relativepath) except FileInGitError: # file directly in git _file_in_git = True except FileNotInAnnexError: # either an untracked file in this dataset, or something that # also actually exists in the file system but could be part of # a subdataset _untracked_or_within_submodule = True # it's an annexed file if data_only: ds.repo.annex_drop([path]) return path else: raise NotImplementedError("TODO: fully uninstall file %s " "(annex) from dataset %s" % (path, ds.path)) else: # plain git repo if relativepath in ds.repo.get_indexed_files(): # file directly in git _file_in_git = True else: # either an untracked file in this dataset, or something that # also actually exists in the file system but could be part of # a subdataset _untracked_or_within_submodule = True if _file_in_git: if data_only: raise ValueError("%s is not a file handle. Removing its " "data only doesn't make sense." % path) else: return ds.repo.git_remove([relativepath]) elif _untracked_or_within_submodule: subds = get_containing_subdataset(ds, relativepath) if ds.path != subds.path: # target path belongs to a subdataset, hand uninstallation # over to it return subds.uninstall(path=relpath(path, start=subds.path), data_only=data_only, recursive=recursive) # this must be an untracked/existing something # it wasn't installed, so we cannot uninstall it raise ValueError("Cannot uninstall %s" % path)
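# A recurring move in these commands: resolve a path, locate the repository
# that contains it, and dispatch the operation there. A minimal sketch of the
# upward search (hypothetical helper; a .git entry is its only dataset
# marker, whereas DataLad's get_dataset_root is more careful):

import os.path as op


def find_containing_repo(path):
    """Walk upwards from `path` to the first directory holding .git."""
    current = op.abspath(path)
    while True:
        if op.lexists(op.join(current, '.git')):
            return current
        parent = op.dirname(current)
        if parent == current:
            # hit the filesystem root without finding a repository
            return None
        current = parent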
def __call__(dataset=None, path=None, source=None, recursive=False, add_data_to_git=False): lgr.debug("Installation attempt started") # shortcut ds = dataset if ds is not None and not isinstance(ds, Dataset): ds = Dataset(ds) if isinstance(path, list): if not len(path): # normalize value to expected state when nothing was provided path = None elif len(path) == 1: # we can simply continue with the function as called with a # single argument path = path[0] else: lgr.debug("Installation of multiple targets was requested: {0}".format(path)) return [Install.__call__( dataset=ds, path=p, source=source, recursive=recursive) for p in path] # resolve the target location against the provided dataset if path is not None: # make sure it is not a URL, `resolve_path` cannot handle that if is_url(path): try: path = get_local_path_from_url(path) path = resolve_path(path, ds) except ValueError: # URL doesn't point to a local something pass else: path = resolve_path(path, ds) # any `path` argument that point to something local now resolved and # is no longer a URL # if we have no dataset given, figure out which one we need to operate # on, based on the resolved target location (that is now guaranteed to # be specified, but only if path isn't a URL (anymore) -> special case, # handles below if ds is None and path is not None and not is_url(path): # try to find a dataset at or above the installation target dspath = GitRepo.get_toppath(abspath(path)) if dspath is None: # no top-level dataset found, use path as such dspath = path ds = Dataset(dspath) if ds is None and source is None and path is not None: # no dataset, no source # this could be a shortcut install call, where the first # arg identifies the source if is_url(path) or os.path.exists(path): # we have an actual URL -> this should be the source # OR # it is not a URL, but it exists locally lgr.debug( "Single argument given to install and no dataset found. " "Assuming the argument identifies a source location.") source = path path = None lgr.debug("Resolved installation target: {0}".format(path)) if ds is None and path is None and source is not None: # we got nothing but a source. do something similar to git clone # and derive the path from the source and continue lgr.debug( "Neither dataset not target installation path provided. " "Assuming installation of a remote dataset. 
" "Deriving destination path from given source {0}".format( source)) ds = Dataset(_installationpath_from_url(source)) if not path and ds is None: # no dataset, no target location, nothing to do raise InsufficientArgumentsError( "insufficient information for installation (needs at " "least a dataset or an installation path") assert(ds is not None) lgr.debug("Resolved target dataset for installation: {0}".format(ds)) vcs = ds.repo if vcs is None: # TODO check that a "ds.path" actually points to a TOPDIR # should be the case already, but maybe nevertheless check try: with swallow_logs(): vcs = Install._get_new_vcs(ds, source, vcs) except GitCommandError: lgr.debug("Cannot retrieve from URL: {0}".format(source)) # maybe source URL was missing a '/.git' if source and not source.rstrip('/').endswith('/.git'): source = '{0}/.git'.format(source.rstrip('/')) lgr.debug("Attempt to retrieve from URL: {0}".format(source)) vcs = Install._get_new_vcs(ds, source, vcs) else: lgr.debug("Unable to establish repository instance at: {0}".format(ds.path)) raise assert(ds.repo) # is automagically re-evaluated in the .repo property runner = Runner() if path is None or path == ds.path: # if the goal was to install this dataset, we are done, # except for 'recursive'. # TODO: For now 'recursive' means just submodules. # See --with-data vs. -- recursive and figure it out if recursive: for sm in ds.repo.get_submodules(): _install_subds_from_flexible_source( ds, sm.path, sm.url, recursive=recursive) return ds # at this point this dataset is "installed", now we can test whether to # install something into the dataset # needed by the logic below assert(isabs(path)) # express the destination path relative to the root of this dataset relativepath = relpath(path, start=ds.path) if path.startswith(pardir): raise ValueError("installation path outside dataset") lgr.debug( "Resolved installation target relative to dataset {0}: {1}".format( ds, relativepath)) # this dataset must already know everything necessary ################################################### # FLOW GUIDE # # at this point we know nothing about the # installation targether ################################################### try: # it is simplest to let annex tell us what we are dealing with lgr.debug("Trying to fetch file %s using annex", relativepath) if not isinstance(vcs, AnnexRepo): assert(isinstance(vcs, GitRepo)) # FLOW GUIDE # this is not an annex repo, but we raise exceptions # to be able to treat them alike in the special case handling # below if not exists(path): raise IOError("path doesn't exist yet, might need special handling") elif relativepath in vcs.get_indexed_files(): # relativepath is in git raise FileInGitError("We need to handle it as known to git") else: raise FileNotInAnnexError("We don't have yet annex repo here") if vcs.get_file_key(relativepath): # FLOW GUIDE EXIT POINT # this is an annex'ed file -> get it # TODO implement `copy --from` using `source` # TODO fail if `source` is something strange vcs.annex_get(relativepath) # return the absolute path to the installed file return path except FileInGitError: ################################################### # FLOW GUIDE # # `path` is either # - a file already checked into Git # - known submodule ################################################### lgr.log(5, "FileInGitError logic") if source is not None: raise FileInGitError("File %s is already in git. 
Specifying source (%s) makes no sense" % (path, source)) # file is checked into git directly -> nothing to do # OR this is a submodule of this dataset submodule = [sm for sm in ds.repo.get_submodules() if sm.path == relativepath] if not len(submodule): # FLOW GUIDE EXIT POINT # this is a file in Git and no submodule, just return its path lgr.debug("Don't act, data already present in Git") return path elif len(submodule) > 1: raise RuntimeError( "more than one submodule registered at the same path?") submodule = submodule[0] # FLOW GUIDE EXIT POINT # we are dealing with a known submodule (i.e. `source` # doesn't matter) -> check it out lgr.debug("Install subdataset at: {0}".format(submodule.path)) subds = _install_subds_from_flexible_source( ds, submodule.path, submodule.url, recursive=recursive) return subds except FileNotInAnnexError: ################################################### # FLOW GUIDE # # `path` is either # - content of a subdataset # - an untracked file in this dataset # - an entire untracked/unknown existing subdataset ################################################### lgr.log(5, "FileNotInAnnexError logic") subds = get_containing_subdataset(ds, relativepath) if ds.path != subds.path: # FLOW GUIDE EXIT POINT # target path belongs to a known subdataset, hand # installation over to it return subds.install( path=relpath(path, start=subds.path), source=source, recursive=recursive, add_data_to_git=add_data_to_git) # FLOW GUIDE # this must be an untracked/existing something, so either # - a file # - a directory # - an entire repository if exists(opj(path, '.git')): # FLOW GUIDE EXIT POINT # this is an existing repo and must be in-place turned into # a submodule of this dataset return _install_subds_inplace( ds, path, relativepath, source, runner) # FLOW GUIDE EXIT POINT # - untracked file or directory in this dataset if isdir(path) and not recursive: # this is a directory and we want --recursive for it raise ValueError( "installation of a directory requires the `recursive` flag") # few sanity checks if source and abspath(source) != path: raise ValueError( "installation target already exists, but `source` points to " "another location (target: '{0}', source: '{0}'".format( source, path)) if not add_data_to_git and not (isinstance(vcs, AnnexRepo)): raise RuntimeError( "Trying to install file(s) into a dataset " "with a plain Git repository. 
First initialize annex, or " "provide override flag.") # switch `add` procedure between Git and Git-annex according to flag if add_data_to_git: vcs.git_add(relativepath) added_files = resolve_path(relativepath, ds) else: # do a blunt `annex add` added_files = vcs.annex_add(relativepath) # return just the paths of the installed components if isinstance(added_files, list): added_files = [resolve_path(i['file'], ds) for i in added_files] else: added_files = resolve_path(added_files['file'], ds) if added_files: return added_files else: return None except IOError: ################################################### # FLOW GUIDE # # more complicated special cases -- `path` is either # - a file/subdataset in a not yet initialized but known # submodule # - an entire untracked/unknown existing subdataset # - non-existing content that should be installed from `source` ################################################### lgr.log(5, "IOError logic") # we can end up here in two cases ATM if (exists(path) or islink(path)) or source is None: # FLOW GUIDE # - target exists but this dataset's VCS rejects it, # so it should be part of a subdataset # or # - target doesn't exist, but no source is given, so # it could be a handle that is actually contained in # a not yet installed subdataset subds = get_containing_subdataset(ds, relativepath) if ds.path != subds.path: # FLOW GUIDE # target path belongs to a subdataset, hand installation # over to it if not subds.is_installed(): # FLOW GUIDE # we are dealing with a target in a not yet # available but known subdataset -> install it first ds.install(subds.path, recursive=recursive) return subds.install( path=relpath(path, start=subds.path), source=source, recursive=recursive, add_data_to_git=add_data_to_git) # FLOW GUIDE EXIT POINT raise InsufficientArgumentsError( "insufficient information for installation: the " "installation target {0} doesn't exists, isn't a " "known handle of dataset {1}, and no `source` " "information was provided.".format(path, ds)) if not source: # FLOW GUIDE EXIT POINT raise InsufficientArgumentsError( "insufficient information for installation: the " "installation target {0} doesn't exists, isn't a " "known handle of dataset {1}, and no `source` " "information was provided.".format(path, ds)) source_path = expandpath(source) if exists(source_path): # FLOW GUIDE EXIT POINT # this could be # - local file # - local directory # - repository outside the dataset # we only want to support the last case of locally cloning # a repo -- fail otherwise if exists(opj(source_path, '.git')): return _install_subds_from_flexible_source( ds, relativepath, source_path, recursive) raise ValueError( "installing individual local files or directories is not " "supported, copy/move them into the dataset first") # FLOW GUIDE # `source` is non-local, it could be: # - repository # - file # we have no further evidence, hence we need to try try: # FLOW GUIDE EXIT POINT # assume it is a dataset return _install_subds_from_flexible_source( ds, relativepath, source, recursive) except CommandError: # FLOW GUIDE EXIT POINT # apaarently not a repo, assume it is a file url vcs.annex_addurl_to_file(relativepath, source) return path
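# install's shortcut call (one argument that is really a source) rests on a
# URL test plus local existence. The disambiguation sketched standalone, with
# a deliberately naive URL check (the real is_url is stricter):

import os.path as op
from urllib.parse import urlparse


def split_source_and_path(arg):
    """Guess whether a lone install argument names a source or a path."""
    parsed = urlparse(arg)
    looks_like_url = bool(parsed.scheme and parsed.netloc)
    if looks_like_url or op.exists(arg):
        # a URL, or something that exists locally -> treat it as the source
        return arg, None
    return None, arg


# split_source_and_path('https://example.com/ds') -> source given, no path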
def __call__( dataset=None, dest=None, path=None, # Note: add remote currently disabled in publish # dest_url=None, dest_pushurl=None, with_data=None, recursive=False): # Note: add remote currently disabled in publish # if dest is None and (dest_url is not None # or dest_pushurl is not None): # raise ValueError("""insufficient information for adding the # destination as a sibling (needs at least a name)""") # shortcut ds = dataset if ds is not None and not isinstance(ds, Dataset): ds = Dataset(ds) if not path: path = curdir elif isinstance(path, list): return [ Publish.__call__( dataset=ds, dest=dest, path=p, # Note: add remote currently disabled in publish # dest_url=dest_url, # dest_pushurl=dest_pushurl, with_data=with_data, recursive=recursive) for p in path ] # resolve the location against the provided dataset if path is not None: path = resolve_path(path, ds) lgr.info("Publishing {0}".format(path)) # if we have no dataset given, figure out which one we need to operate # on, based on the resolved location (that is now guaranteed to # be specified if ds is None: # try to find a dataset at or above the location dspath = GitRepo.get_toppath(abspath(path)) if dspath is None: # no top-level dataset found, use path as such dspath = path ds = Dataset(dspath) lgr.debug("Resolved dataset for publication: {0}".format(ds)) assert (ds is not None) # it might still be about a subdataset of ds: if path is not None: relativepath = relpath(path, start=ds.path) subds = get_containing_subdataset(ds, relativepath) if subds.path != ds.path: # path belongs to a subdataset; hand it over lgr.debug("Hand over to submodule %s" % subds.path) return subds.publish( dest=dest, path=relpath(path, start=subds.path), # Note: add remote currently disabled in publish # dest_url=dest_url, # dest_pushurl=dest_pushurl, with_data=with_data, recursive=recursive) # now, we know, we have to operate on ds. So, ds needs to be installed, # since we cannot publish anything from a not installed dataset, # can we? # (But may be just the existence of ds.repo is important here.) if not ds.is_installed(): raise ValueError("No installed dataset found at " "{0}.".format(ds.path)) assert (ds.repo is not None) # TODO: For now we can deal with a sibling(remote) name given by `dest` # only. Figure out, when to allow for passing a local path or URL # directly and what to do in that case. # Note: we need an upstream remote, if there's none given. We could # wait for git push to complain, but we need to explicitly figure it # out for pushing annex branch anyway and we might as well fail right # here. # keep original dest in case it's None for passing to recursive calls: dest_resolved = dest if dest is None: # check for tracking branch's remote: try: std_out, std_err = \ ds.repo._git_custom_command('', ["git", "config", "--get", "branch.{active_branch}.remote".format(active_branch=ds.repo.git_get_active_branch())], expect_fail=True) except CommandError as e: if e.code == 1 and e.stdout == "": std_out = None else: raise if std_out: dest_resolved = std_out.strip() else: # we have no remote given and no upstream => fail raise RuntimeError("No known default target for " "publication and none given.") # upstream branch needed for update (merge) and subsequent push, # in case there is no. 
set_upstream = False try: # Note: tracking branch actually defined bei entry "merge" # PLUS entry "remote" std_out, std_err = \ ds.repo._git_custom_command('', ["git", "config", "--get", "branch.{active_branch}.merge".format(active_branch=ds.repo.git_get_active_branch())], expect_fail=True) except CommandError as e: if e.code == 1 and e.stdout == "": # no tracking branch yet: set_upstream = True else: raise # is `dest` an already known remote? if dest_resolved not in ds.repo.git_get_remotes(): # unknown remote raise ValueError("No sibling '%s' found." % dest_resolved) # Note: add remote currently disabled in publish # if dest_url is None: # raise ValueError("No sibling '%s' found. Provide `dest-url`" # " to register it." % dest_resolved) # lgr.info("Sibling %s unknown. Registering ...") # # # Fill in URL-Template: # remote_url = dest_url.replace("%NAME", basename(ds.path)) # # TODO: handle_name.replace("/", "-")) instead of basename() # # - figure it out ;) # # - either a datasets needs to discover superdatasets in # # order to get it's relative path to provide a name # # - or: We need a different approach on the templates # # # Add the remote # ds.repo.git_remote_add(dest_resolved, remote_url) # if dest_pushurl: # # Fill in template: # remote_url_push = \ # dest_pushurl.replace("%NAME", basename(ds.path)) # # TODO: Different way of replacing %NAME; See above # # # Modify push url: # ds.repo._git_custom_command('', # ["git", "remote", # "set-url", # "--push", dest_resolved, # remote_url_push]) # lgr.info("Added sibling '%s'." % dest) # lgr.debug("Added remote '%s':\n %s (fetch)\n%s (push)." % # (dest_resolved, remote_url, # remote_url_push if dest_pushurl else remote_url)) # Note: add remote currently disabled in publish # else: # # known remote: parameters dest-url-* currently invalid. # # This may change to adapt the existing remote. # if dest_url: # lgr.warning("Sibling '%s' already exists for dataset '%s'. " # "Ignoring dest-url %s." % # (dest_resolved, ds.path, dest_url)) # if dest_pushurl: # lgr.warning("Sibling '%s' already exists for dataset '%s'. " # "Ignoring dest-pushurl %s." % # (dest_resolved, ds.path, dest_pushurl)) # Figure out, what to publish if path is None or path == ds.path: # => publish the dataset itself # push local state: # TODO: Rework git_push in GitRepo cmd = ['git', 'push'] if set_upstream: # no upstream branch yet cmd.append("--set-upstream") cmd += [dest_resolved, ds.repo.git_get_active_branch()] ds.repo._git_custom_command('', cmd) # push annex branch: if isinstance(ds.repo, AnnexRepo): ds.repo.git_push("%s +git-annex:git-annex" % dest_resolved) # TODO: if with_data is a shell pattern, we get a list, when called # from shell, right? 
# => adapt the following and check constraints to allow for that if with_data: ds.repo._git_custom_command('', ["git", "annex", "copy"] + with_data + ["--to", dest_resolved]) if recursive and ds.get_dataset_handles() != []: results = [ds] # Note: add remote currently disabled in publish # modify URL templates: # if dest_url: # dest_url = dest_url.replace('%NAME', basename(ds.path) + '-%NAME') # if dest_pushurl: # dest_pushurl = dest_pushurl.replace('%NAME', basename(ds.path) + '-%NAME') for subds in ds.get_dataset_handles(): results.append( Dataset(opj(ds.path, subds)).publish( dest=dest, # Note: use `dest` instead of `dest_resolved` in case # dest was None, so subdatasets would use their default # as well # Note: add remote currently disabled in publish # dest_url=dest_url, # dest_pushurl=dest_pushurl, with_data=with_data, recursive=recursive)) return results return ds elif exists(path): # At this point `path` is not referencing a (sub)dataset. # An annexed file is the only thing left, that `path` might be # validly pointing to. Anything else we can't handle currently. if isinstance(ds.repo, AnnexRepo): try: if ds.repo.get_file_key(relativepath): # file is in annex, publish it ds.repo._run_annex_command( 'copy', annex_options=[path, '--to=%s' % dest_resolved]) return path except (FileInGitError, FileNotInAnnexError): pass # `path` can't be published lgr.warning("Don't know how to publish %s." % path) return None else: # nothing to publish found lgr.warning("Nothing to publish found at %s." % path) return None
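# publish derives its default target from the active branch's tracking
# remote by querying `branch.<name>.remote`. The same lookup with plain
# subprocess instead of GitRepo._git_custom_command (sketch only):

import subprocess


def tracking_remote(repo_path, branch):
    """Return the configured remote of `branch`, or None if unset."""
    result = subprocess.run(
        ['git', 'config', '--get', 'branch.{}.remote'.format(branch)],
        cwd=repo_path, capture_output=True, text=True)
    if result.returncode != 0:
        # `git config --get` exits non-zero when the key is absent
        return None
    return result.stdout.strip() or None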
def __call__(sshurl, name=None, target_dir=None, target_url=None, target_pushurl=None, dataset=None, recursive=False, recursion_limit=None, existing='error', shared=None, group=None, ui=False, as_common_datasrc=None, publish_by_default=None, publish_depends=None, annex_wanted=None, annex_group=None, annex_groupwanted=None, inherit=False, since=None): # # nothing without a base dataset # ds = require_dataset(dataset, check_installed=True, purpose='creating a sibling') refds_path = ds.path # # all checks that are possible before we start parsing the dataset # # possibly use sshurl to get the name in case if not specified if not sshurl: if not inherit: raise InsufficientArgumentsError( "needs at least an SSH URL, if no inherit option") if name is None: raise ValueError( "Neither SSH URL, nor the name of sibling to inherit from " "was specified") # It might well be that we already have this remote setup try: sshurl = CreateSibling._get_remote_url(ds, name) except Exception as exc: lgr.debug('%s does not know about url for %s: %s', ds, name, exc_str(exc)) elif inherit: raise ValueError( "For now, for clarity not allowing specifying a custom sshurl " "while inheriting settings") # may be could be safely dropped -- still WiP if not sshurl: # TODO: may be more back up before _prep? super_ds = ds.get_superdataset() if not super_ds: raise ValueError( "Could not determine super dataset for %s to inherit URL" % ds) super_url = CreateSibling._get_remote_url(super_ds, name) # for now assuming hierarchical setup # (TODO: to be able to destinguish between the two, probably # needs storing datalad.*.target_dir to have %RELNAME in there) sshurl = slash_join(super_url, relpath(refds_path, super_ds.path)) # check the login URL sibling_ri = RI(sshurl) ssh_sibling = is_ssh(sibling_ri) if not (ssh_sibling or isinstance(sibling_ri, PathRI)): raise ValueError( "Unsupported SSH URL or path: '{0}', " "use ssh://host/path, host:path or path syntax".format(sshurl)) if not name: name = sibling_ri.hostname if ssh_sibling else "local" lgr.debug("No sibling name given. Using %s'%s' as sibling name", "URL hostname " if ssh_sibling else "", name) if since == '': # consider creating siblings only since the point of # the last update # XXX here we assume one to one mapping of names from local branches # to the remote active_branch = ds.repo.get_active_branch() since = '%s/%s' % (name, active_branch) # # parse the base dataset to find all subdatasets that need processing # to_process = [] cand_ds = [ Dataset(r['path']) for r in diff_dataset( ds, fr=since, to=None, # make explicit, but doesn't matter, no recursion in diff() constant_refs=True, # contrain to the paths of all locally existing subdatasets path=[ sds['path'] for sds in ds.subdatasets(recursive=recursive, recursion_limit=recursion_limit, fulfilled=True, result_renderer=None) ], # save cycles, we are only looking for datasets annex=None, untracked='no', # recursion was done faster by subdatasets() recursive=False, # save cycles, we are only looking for datasets eval_file_type=False, ) if r.get('type') == 'dataset' and r.get('state', None) != 'clean' ] # check remotes setup for d in cand_ds if since else ([ds] + cand_ds): d_repo = d.repo if d_repo is None: continue checkds_remotes = d.repo.get_remotes() res = dict( action='create_sibling', path=d.path, type='dataset', ) if publish_depends: # make sure dependencies are valid # TODO: inherit -- we might want to automagically create # those dependents as well??? 
unknown_deps = set( ensure_list(publish_depends)).difference(checkds_remotes) if unknown_deps: yield dict( res, status='error', message=('unknown sibling(s) specified as publication ' 'dependency: %s', unknown_deps), ) continue if name in checkds_remotes and existing in ('error', 'skip'): yield dict( res, status='error' if existing == 'error' else 'notneeded', message=( "sibling '%s' already configured (specify alternative " "name, or force reconfiguration via --existing", name), ) continue to_process.append(res) if not to_process: # we ruled out all possibilities # TODO wait for gh-1218 and make better return values lgr.info("No datasets qualify for sibling creation. " "Consider different settings for --existing " "or --since if this is unexpected") return if ssh_sibling: # request ssh connection: lgr.info("Connecting ...") shell = ssh_manager.get_connection(sshurl) else: shell = _RunnerAdapter() sibling_ri.path = str(resolve_path(sibling_ri.path, dataset)) if target_dir: target_dir = opj(sibling_ri.path, target_dir) if target_dir is None: if sibling_ri.path: target_dir = sibling_ri.path else: target_dir = '.' # TODO: centralize and generalize template symbol handling replicate_local_structure = "%RELNAME" not in target_dir if not shell.get_annex_version(): raise MissingExternalDependency( 'git-annex', msg="It's required on the {} machine to create a sibling". format('remote' if ssh_sibling else 'local')) # # all checks done and we have a connection, now do something # # loop over all datasets, ordered from top to bottom to make test # below valid (existing directories would cause the machinery to halt) # But we need to run post-update hook in depth-first fashion, so # would only collect first and then run (see gh #790) yielded = set() remote_repos_to_run_hook_for = [] for currentds_ap in \ sorted(to_process, key=lambda x: x['path'].count('/')): current_ds = Dataset(currentds_ap['path']) path = _create_dataset_sibling( name, current_ds, refds_path, shell, replicate_local_structure, sibling_ri, target_dir, target_url, target_pushurl, existing, shared, group, publish_depends, publish_by_default, ui, as_common_datasrc, annex_wanted, annex_group, annex_groupwanted, inherit) if not path: # nothing new was created # TODO is 'notneeded' appropriate in this case? 
            currentds_ap['status'] = 'notneeded'
            # TODO explain status in 'message'
            yield currentds_ap
            yielded.add(currentds_ap['path'])
            continue
        remote_repos_to_run_hook_for.append((path, currentds_ap))

        # publish web-interface to root dataset on publication server
        if current_ds.path == refds_path and ui:
            lgr.info("Uploading web interface to %s" % path)
            try:
                CreateSibling.upload_web_interface(path, shell, shared, ui)
            except CommandError as e:
                currentds_ap['status'] = 'error'
                currentds_ap['message'] = (
                    "failed to push web interface to the remote datalad "
                    "repository (%s)", exc_str(e))
                yield currentds_ap
                yielded.add(currentds_ap['path'])
                continue

    # in reverse order would be depth first
    lgr.info("Running post-update hooks in all created siblings")
    # TODO: add progressbar
    for path, currentds_ap in remote_repos_to_run_hook_for[::-1]:
        # Trigger the hook
        lgr.debug("Running hook for %s (if exists and executable)", path)
        try:
            shell(
                "cd {} "
                "&& ( [ -x hooks/post-update ] && hooks/post-update || : )"
                "".format(sh_quote(_path_(path, ".git"))))
        except CommandError as e:
            currentds_ap['status'] = 'error'
            currentds_ap['message'] = (
                "failed to run post-update hook under remote path %s (%s)",
                path, exc_str(e))
            yield currentds_ap
            yielded.add(currentds_ap['path'])
            continue
        if not currentds_ap['path'] in yielded:
            # if we were silent until now everything is just splendid
            currentds_ap['status'] = 'ok'
            yield currentds_ap
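# create_sibling orders datasets top-down by path depth for creation, then
# reverses that list so post-update hooks run depth-first (children before
# parents, see gh #790). The ordering trick in isolation:

paths = ['/a', '/a/b/c', '/a/b']
# shallower paths contain fewer separators, so they sort first
top_down = sorted(paths, key=lambda p: p.count('/'))
bottom_up = top_down[::-1]
assert top_down == ['/a', '/a/b', '/a/b/c']
assert bottom_up == ['/a/b/c', '/a/b', '/a']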
def __call__(
        path=None,
        initopts=None,
        force=False,
        description=None,
        dataset=None,
        no_annex=_NoAnnexDefault,
        annex=True,
        fake_dates=False,
        cfg_proc=None):
    # TODO: introduced with 0.13, remove with 0.14
    if no_annex is not _NoAnnexDefault:
        # the two mirror options do not agree and the deprecated one is
        # not at its default value
        warnings.warn(
            "datalad-create's `no_annex` option is deprecated "
            "and will be removed in a future release, "
            "use the reversed-sign `annex` option instead.",
            DeprecationWarning)
        # honor the old option for now
        annex = not no_annex

    # we only perform negative tests below
    no_annex = not annex

    if dataset:
        if isinstance(dataset, Dataset):
            ds = dataset
        else:
            ds = Dataset(dataset)
        refds_path = ds.path
    else:
        ds = refds_path = None

    # two major cases
    # 1. we got a `dataset` -> we either want to create it (path is None),
    #    or another dataset in it (path is not None)
    # 2. we got no dataset -> we want to create a fresh dataset at the
    #    desired location, either at `path` or PWD

    # sanity check first
    if no_annex:
        if description:
            raise ValueError(
                "Incompatible arguments: cannot specify a description "
                "for an annex repo while declaring no annex repo.")

    if (isinstance(initopts, (list, tuple)) and '--bare' in initopts) or (
            isinstance(initopts, dict) and 'bare' in initopts):
        raise ValueError(
            "Creation of bare repositories is not supported. Consider "
            "one of the create-sibling commands, or use "
            "Git to init a bare repository and push an existing dataset "
            "into it.")

    if path:
        path = resolve_path(path, dataset)

    path = path if path \
        else getpwd() if ds is None \
        else refds_path

    # we know that we need to create a dataset at `path`
    assert(path is not None)

    # assure cfg_proc is a list (relevant if used via Python API)
    cfg_proc = assure_list(cfg_proc)

    # prep for yield
    res = dict(action='create', path=str(path),
               logger=lgr, type='dataset',
               refds=refds_path)

    refds = None
    if refds_path and refds_path != str(path):
        refds = require_dataset(
            refds_path, check_installed=True,
            purpose='creating a subdataset')

        path_inrefds = path_under_rev_dataset(refds, path)
        if path_inrefds is None:
            yield dict(
                res,
                status='error',
                message=(
                    "dataset containing given paths is not underneath "
                    "the reference dataset %s: %s",
                    ds, str(path)),
            )
            return

    # try to locate an immediate parent dataset
    # we want to know this (irrespective of whether we plan on adding
    # this new dataset to a parent) in order to avoid conflicts with
    # a potentially absent/uninstalled subdataset of the parent
    # in this location
    # it will cost some filesystem traversal though...
    parentds_path = get_dataset_root(
        op.normpath(op.join(str(path), os.pardir)))
    if parentds_path:
        prepo = GitRepo(parentds_path)
        parentds_path = ut.Path(parentds_path)
        # we cannot get away with a simple
        # GitRepo.get_content_info(), as we need to detect
        # uninstalled/added subdatasets too
        check_path = ut.Path(path)
        pstatus = prepo.status(
            untracked='no',
            # limit query to target path for a potentially massive speed-up
            paths=[check_path.relative_to(parentds_path)])
        if (not pstatus.get(check_path, {}).get("type") == "dataset"
                and any(check_path == p or check_path in p.parents
                        for p in pstatus)):
            # redo the check in a slower fashion, it is already broken
            # let's take our time for a proper error message
            conflict = [
                p for p in pstatus
                if check_path == p or check_path in p.parents]
            res.update({
                'status': 'error',
                'message': (
                    'collision with content in parent dataset at %s: %s',
                    str(parentds_path),
                    [str(c) for c in conflict])})
            yield res
            return
        if not force:
            # another set of checks to see whether the target path is
            # pointing into a known subdataset that is not around ATM
            subds_status = {
                parentds_path / k.relative_to(prepo.path)
                for k, v in pstatus.items()
                if v.get('type', None) == 'dataset'}
            check_paths = [check_path]
            check_paths.extend(check_path.parents)
            if any(p in subds_status for p in check_paths):
                conflict = [p for p in check_paths if p in subds_status]
                res.update({
                    'status': 'error',
                    'message': (
                        'collision with %s (dataset) in dataset %s',
                        str(conflict[0]),
                        str(parentds_path))})
                yield res
                return

    # important to use the given Dataset object to avoid spurious ID
    # changes with not-yet-materialized Datasets
    tbds = ds if isinstance(ds, Dataset) and \
        ds.path == path else Dataset(str(path))

    # don't create in non-empty directory without `force`:
    if op.isdir(tbds.path) and listdir(tbds.path) != [] and not force:
        res.update({
            'status': 'error',
            'message':
                'will not create a dataset in a non-empty directory, use '
                '`force` option to ignore'})
        yield res
        return

    # stuff that we create and want to have tracked with git (not annex)
    add_to_git = {}

    if initopts is not None and isinstance(initopts, list):
        initopts = {'_from_cmdline_': initopts}

    # Note for the code below:
    # OPT: be "smart" and avoid re-resolving .repo -- expensive in DataLad
    # Re-use tbrepo instance, do not use tbds.repo

    # create and configure desired repository
    if no_annex:
        lgr.info("Creating a new git repo at %s", tbds.path)
        tbrepo = GitRepo(
            tbds.path,
            url=None,
            create=True,
            create_sanity_checks=False,
            git_opts=initopts,
            fake_dates=fake_dates)
        # place a .noannex file to indicate annex to leave this repo alone
        stamp_path = ut.Path(tbrepo.path) / '.noannex'
        stamp_path.touch()
        add_to_git[stamp_path] = {
            'type': 'file',
            'state': 'untracked'}
    else:
        # always come with annex when created from scratch
        lgr.info("Creating a new annex repo at %s", tbds.path)
        tbrepo = AnnexRepo(
            tbds.path,
            url=None,
            create=True,
            create_sanity_checks=False,
            # do not set backend here, to avoid a dedicated commit
            backend=None,
            # None causes version to be taken from config
            version=None,
            description=description,
            git_opts=initopts,
            fake_dates=fake_dates)
        # set the annex backend in .gitattributes as a staged change
        tbrepo.set_default_backend(
            cfg.obtain('datalad.repo.backend'),
            persistent=True, commit=False)
        add_to_git[tbrepo.pathobj / '.gitattributes'] = {
            'type': 'file',
            'state': 'added'}
        # make sure that v6 annex repos never commit content under .datalad
        attrs_cfg = (
            ('config', 'annex.largefiles', 'nothing'),
            ('metadata/aggregate*', 'annex.largefiles', 'nothing'),
            ('metadata/objects/**', 'annex.largefiles',
             '({})'.format(cfg.obtain(
                 'datalad.metadata.create-aggregate-annex-limit'))))
        attrs = tbrepo.get_gitattributes(
            [op.join('.datalad', i[0]) for i in attrs_cfg])
        set_attrs = []
        for p, k, v in attrs_cfg:
            if not attrs.get(
                    op.join('.datalad', p), {}).get(k, None) == v:
                set_attrs.append((p, {k: v}))
        if set_attrs:
            tbrepo.set_gitattributes(
                set_attrs,
                attrfile=op.join('.datalad', '.gitattributes'))

        # prevent git annex from ever annexing .git* stuff (gh-1597)
        attrs = tbrepo.get_gitattributes('.git')
        if not attrs.get('.git', {}).get(
                'annex.largefiles', None) == 'nothing':
            tbrepo.set_gitattributes([
                ('**/.git*', {'annex.largefiles': 'nothing'})])
        # must use the repo.pathobj as this will have resolved symlinks
        add_to_git[tbrepo.pathobj / '.gitattributes'] = {
            'type': 'file',
            'state': 'untracked'}

    # OPT: be "smart" and avoid re-resolving .repo -- expensive in DataLad
    # Note, must not happen earlier (before the if) since then it would
    # not be "smart"
    tbds_config = tbds.config

    # record an ID for this repo for the afterlife
    # to be able to track siblings and children
    id_var = 'datalad.dataset.id'
    # Note, that Dataset property `id` will change when we unset the
    # respective config. Therefore store it before:
    tbds_id = tbds.id
    if id_var in tbds_config:
        # make sure we reset this variable completely, in case of a
        # re-create
        tbds_config.unset(id_var, where='dataset')

    if _seed is None:
        # just the standard way
        uuid_id = uuid.uuid1().urn.split(':')[-1]
    else:
        # Let's generate preseeded ones
        uuid_id = str(uuid.UUID(int=random.getrandbits(128)))
    tbds_config.add(
        id_var,
        tbds_id if tbds_id is not None else uuid_id,
        where='dataset',
        reload=False)

    # make config overrides permanent in the repo config
    # this is similar to what `annex init` does
    # we are only doing this for config overrides and do not expose
    # a dedicated argument, because it is sufficient for the cmdline
    # and unnecessary for the Python API (there could simply be a
    # subsequent ds.config.add() call)
    for k, v in tbds_config.overrides.items():
        tbds_config.add(k, v, where='local', reload=False)

    # all config manipulation is done -> full reload
    tbds_config.reload()

    # must use the repo.pathobj as this will have resolved symlinks
    add_to_git[tbrepo.pathobj / '.datalad'] = {
        'type': 'directory',
        'state': 'untracked'}

    # save everything, we need to do this now and cannot merge with the
    # call below, because we may need to add this subdataset to a parent
    # but cannot until we have a first commit
    tbrepo.save(
        message='[DATALAD] new dataset',
        git=True,
        # we have to supply our own custom status, as the repo does
        # not have a single commit yet and there is no HEAD reference
        # TODO make `GitRepo.status()` robust to this state.
        _status=add_to_git,
    )

    for cfg_proc_ in cfg_proc:
        for r in tbds.run_procedure('cfg_' + cfg_proc_):
            yield r

    # the next only makes sense if we saved the created dataset,
    # otherwise we have no committed state to be registered
    # in the parent
    if isinstance(refds, Dataset) and refds.path != tbds.path:
        # we created a dataset in another dataset
        # -> make submodule
        for r in refds.save(path=tbds.path):
            yield r

    res.update({'status': 'ok'})
    yield res
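
# --- illustration (not part of the original code) ---
# The two dataset-ID strategies used above, in isolation: the standard
# uuid1-based ID versus a seeded, reproducible one (as with `_seed`,
# e.g. in tests). `make_dataset_id` is a hypothetical helper name.
import random
import uuid

def make_dataset_id(seed=None):
    if seed is None:
        # standard way: urn:uuid:<id> with the prefix stripped
        return uuid.uuid1().urn.split(':')[-1]
    # deterministic: the same seed yields the same ID on every run
    rng = random.Random(seed)
    return str(uuid.UUID(int=rng.getrandbits(128)))
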
def __call__(
        path=None,
        source=None,
        dataset=None,
        get_data=False,
        description=None,
        recursive=False,
        recursion_limit=None,
        reckless=None,
        jobs="auto"):
    # normalize path argument to be equal when called from cmdline and
    # python and nothing was passed into `path`
    path = ensure_list(path)

    if not source and not path:
        raise InsufficientArgumentsError(
            "Please provide at least a source or a path")

    # Common kwargs to pass to underlying git/install calls.
    # They might need adjustments (e.g. for recursion_limit), but
    # otherwise would be applicable throughout
    #
    # There should have been more common options, since underneath
    # `get` could do similar installs
    common_kwargs = dict(
        get_data=get_data,
        recursive=recursive,
        recursion_limit=recursion_limit,
        # git_opts=git_opts,
        # annex_opts=annex_opts,
        reckless=reckless,
        jobs=jobs,
    )

    # did we explicitly get a dataset to install into?
    # if we got a dataset, path will be resolved against it.
    # Otherwise path will be resolved first.
    ds = None
    if dataset is not None:
        ds = require_dataset(dataset, check_installed=True,
                             purpose='installation')
        common_kwargs['dataset'] = dataset
    # pre-compute for results below
    refds_path = Interface.get_refds_path(ds)

    # switch into the two scenarios without --source:
    # 1. list of URLs
    # 2. list of (sub)dataset content
    if source is None:
        # we need to collect URLs and paths
        to_install = []
        to_get = []
        # TODO: this approach is problematic, it disrupts the order of
        # input args. Consequently results will be returned in an
        # unexpected order when a mixture of source URLs and paths is
        # given. Reordering is only possible when everything in here is
        # fully processed before any results can be yielded.
        # Moreover, the semantics of the status quo implementation are a
        # bit complicated: in a mixed list a source URL will lead to a
        # new dataset at a generated default location, but a path will
        # lead to a subdataset at that exact location
        for urlpath in path:
            ri = RI(urlpath)
            (to_get if isinstance(ri, PathRI) else to_install).append(
                urlpath)

        # 1. multiple source URLs
        for s in to_install:
            lgr.debug("Install passes into install source=%s", s)
            for r in Install.__call__(
                    source=s,
                    description=description,
                    # we need to disable error handling in order to have
                    # it done at the very top, otherwise we are not able
                    # to order a global "ignore-and-keep-going"
                    on_failure='ignore',
                    return_type='generator',
                    result_xfm=None,
                    result_filter=None,
                    **common_kwargs):
                # no post-processing of the installed content on disk
                # should be necessary here, all done by code further
                # down that deals with an install from an actual `source`
                # any necessary fixes should go there too!
                r['refds'] = refds_path
                yield r

        # 2. one or more dataset content paths
        if to_get:
            lgr.debug("Install passes into get %d items", len(to_get))
            # all commented out hint on inability to pass those options
            # into underlying install-related calls.
            # Also need to pass from get:
            #  annex_get_opts
            for r in Get.__call__(
                    to_get,
                    # TODO should pass-through description, not sure why
                    # disabled
                    # description=description,
                    # we need to disable error handling in order to have
                    # it done at the very top, otherwise we are not able
                    # to order a global "ignore-and-keep-going"
                    on_failure='ignore',
                    return_type='generator',
                    result_xfm=None,
                    result_filter=None,
                    **common_kwargs):
                # no post-processing of get'ed content on disk should be
                # necessary here, this is the responsibility of `get`
                # (incl. adjusting parent's gitmodules when submodules
                # end up in an "updated" state (done in get helpers))
                # any required fixes should go there!
                r['refds'] = refds_path
                yield r

        # we are done here
        # the rest is about install from a `source`
        return

    # an actual `source` was given
    if source and path and len(path) > 1:
        # exception is ok here, if this fails it is either direct user
        # error or we f****d up one of our internal calls
        raise ValueError(
            "install needs a single PATH when source is provided. "
            "Was given multiple PATHs: %s" % str(path))

    # parameter constraints:
    if not source:
        # exception is ok here, if this fails it is either direct user
        # error or we f****d up one of our internal calls
        raise InsufficientArgumentsError(
            "a `source` is required for installation")

    # code below deals with a single path only
    path = path[0] if path else None

    if source == path:
        # even if they turn out to be identical after resolving symlinks
        # and more sophisticated witchcraft, it would still happily say
        # "it appears to be already installed", so we just catch an
        # obviously pointless input combination
        yield get_status_dict(
            'install', path=path, status='impossible', logger=lgr,
            source_url=source, refds=refds_path,
            message="installation `source` and destination `path` are "
                    "identical. If you are trying to add a subdataset "
                    "simply use the `save` command")
        return

    # resolve the target location (if local) against the provided dataset
    # or CWD:
    if path is not None:
        # MIH everything in here is highly similar to what common
        # interface helpers do (or should/could do), but at the same time
        # is very much tailored to just apply to `install` -- I guess
        # it has to stay special

        # Should work out just fine for regular paths, so no additional
        # conditioning is necessary
        try:
            path_ri = RI(path)
        except Exception as e:
            raise ValueError("invalid path argument {}: ({})".format(
                path, exc_str(e)))
        try:
            # Wouldn't work for SSHRI ATM, see TODO within SSHRI
            # yoh: path should be a local path, and mapping note within
            #      SSHRI about mapping localhost:path to path is kinda
            #      a peculiar use-case IMHO
            # TODO Stringification can be removed once PY35 is no longer
            # supported
            path = str(resolve_path(path_ri.localpath, dataset))
            # any `path` argument that pointed to something local is now
            # resolved and no longer a URL
        except ValueError:
            # `path` is neither a valid source nor a local path.
            # TODO: The only thing left is a known subdataset with a
            # name, that is not a path; Once we correctly distinguish
            # between path and name of a submodule, we need to consider
            # this.
            # For now: Just raise
            raise ValueError("Invalid path argument {0}".format(path))
    # `path` resolved, if there was any.

    # clone dataset, will also take care of adding to superdataset, if one
    # is given
    res = Clone.__call__(
        source, path, dataset=ds, description=description,
        reckless=reckless,
        # we need to disable error handling in order to have it done at
        # the very top, otherwise we are not able to order a global
        # "ignore-and-keep-going"
        result_xfm=None,
        return_type='generator',
        result_filter=None,
        on_failure='ignore')

    # helper
    as_ds = YieldDatasets()
    destination_dataset = None
    for r in res:
        if r['action'] == 'install' and r['type'] == 'dataset':
            # make sure logic below is valid, only one dataset result is
            # coming back
            assert(destination_dataset is None)
            destination_dataset = as_ds(r)
        r['refds'] = refds_path
        yield r
    assert(destination_dataset)

    # Now, recursive calls:
    if recursive or get_data:
        # dataset argument must not be passed inside since we use bound
        # .get. It is ok to do "inplace" as long as we still return right
        # after the loop ends
        common_kwargs.pop('dataset', '')
        for r in destination_dataset.get(
                curdir,
                description=description,
                # we need to disable error handling in order to have it
                # done at the very top, otherwise we are not able to
                # order a global "ignore-and-keep-going"
                on_failure='ignore',
                return_type='generator',
                result_xfm=None,
                **common_kwargs):
            r['refds'] = refds_path
            yield r

    # at this point no further post-processing should be necessary,
    # `clone` and `get` must have done that (incl. parent handling)
    # if not, bugs should be fixed in those commands
    return
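
# --- illustration (not part of the original code) ---
# A rough approximation of the URL-vs-path triage install performs on its
# `path` arguments when no `source` is given, using urllib.parse instead
# of DataLad's RI/PathRI classes; it ignores corner cases such as Windows
# drive letters. The function name is hypothetical.
from urllib.parse import urlparse

def split_install_args(items):
    to_install, to_get = [], []
    for item in items:
        # anything with a URL scheme becomes a clone source, everything
        # else is treated as local (sub)dataset content to fetch
        (to_install if urlparse(item).scheme else to_get).append(item)
    return to_install, to_get

# split_install_args(['https://example.com/ds', 'sub/ds'])
# -> (['https://example.com/ds'], ['sub/ds'])
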
def __call__(path, dataset=None, spec_file=None, properties=None,
             replace=False):
    # TODO: message

    dataset = require_dataset(dataset, check_installed=True,
                              purpose="hirni spec4anything")
    path = assure_list(path)
    path = [resolve_path(p, dataset) for p in path]

    res_kwargs = dict(action='hirni spec4anything', logger=lgr)
    res_kwargs['refds'] = Interface.get_refds_path(dataset)

    # ### This might become superfluous. See datalad-gh-2653
    ds_path = PathRI(dataset.path)
    # ###

    updated_files = []
    paths = []
    for ap in AnnotatePaths.__call__(
            dataset=dataset,
            path=path,
            action='hirni spec4anything',
            unavailable_path_status='impossible',
            nondataset_path_status='error',
            return_type='generator',
            # TODO: Check this one out:
            on_failure='ignore',
            # Note/TODO: Not sure yet whether and when we need those.
            # Generally we want to be able to create a spec for
            # subdatasets, too:
            # recursive=recursive,
            # recursion_limit=recursion_limit,
            # force_subds_discovery=True,
            # force_parentds_discovery=True,
    ):
        if ap.get('status', None) in ['error', 'impossible']:
            yield ap
            continue

        # ### This might become superfluous. See datalad-gh-2653
        ap_path = PathRI(ap['path'])
        # ###

        # find acquisition and respective specification file:
        rel_path = posixpath.relpath(ap_path.posixpath, ds_path.posixpath)
        path_parts = rel_path.split('/')

        # TODO: Note: Commented out this warning for now. We used to not
        # have a spec file at the toplevel of the study dataset, but now
        # we do. The logic afterwards works, but should be revisited. At
        # least, `acq` should be called differently now.
        # if len(path_parts) < 2:
        #     lgr.warning("Not within an acquisition")
        acq = path_parts[0]

        # TODO: spec file specifiable or fixed path?
        #       if we want the former, what we actually need is an
        #       association of acquisition and its spec path
        #       => prob. not an option but a config
        spec_path = spec_file if spec_file \
            else posixpath.join(ds_path.posixpath, acq,
                                dataset.config.get(
                                    "datalad.hirni.studyspec.filename",
                                    "studyspec.json"))

        spec = [r for r in json_py.load_stream(spec_path)] \
            if posixpath.exists(spec_path) else list()

        lgr.debug("Add specification snippet for %s", ap['path'])
        # XXX 'add' does not seem to be the thing we want to do
        # rather 'set', so we have to check whether a spec for a location
        # is already known and fail or replace it (maybe with --force)

        # go through all existing specs and extract unique values
        # and also assign them to the new record (subjects, ...), but only
        # editable fields!!
        uniques = dict()
        for s in spec:
            for k in s:
                if isinstance(s[k], dict) and 'value' in s[k]:
                    if k not in uniques:
                        uniques[k] = set()
                    uniques[k].add(s[k]['value'])
        overrides = dict()
        for k in uniques:
            if len(uniques[k]) == 1:
                overrides[k] = _get_edit_dict(value=uniques[k].pop(),
                                              approved=False)

        if properties:
            # TODO: This entire reading of properties needs to be RF'd
            # into proper generalized functions.
            # spec got more complex. update() prob. can't simply override
            # (think: 'procedures' and 'tags' prob. need to be appended
            # instead)

            # load from file or json string
            if isinstance(properties, dict):
                props = properties
            elif op.exists(properties):
                props = json_py.load(properties)
            else:
                props = json_py.loads(properties)
            # turn into editable, pre-approved records
            spec_props = {
                k: dict(value=v, approved=True)
                for k, v in props.items()
                if k not in non_editables + ['tags', 'procedures']}
            spec_props.update({
                k: v
                for k, v in props.items()
                if k in non_editables + ['tags']})
            # TODO: still wrong. It's a list. Append or override? How to
            # decide?
            spec_props.update({
                o_k: [{
                    i_k: dict(value=i_v, approved=True)
                    for i_k, i_v in o_v.items()}]
                for o_k, o_v in props.items()
                if o_k in ['procedures']})

            overrides.update(spec_props)

        # TODO: It's probably wrong to use uniques for overwriting! At
        # least they cannot be used to overwrite values explicitly set in
        # _add_to_spec like "location", "type", etc.
        #
        # But then: This should concern non-editable fields only, right?
        spec = _add_to_spec(spec, posixpath.split(spec_path)[0], ap,
                            dataset, overrides=overrides, replace=replace)

        # Note: Not sure whether we really want one commit per snippet.
        #       If not - consider:
        #       - What if we fail amidst? => Don't write to file yet.
        #       - What about input paths from different acquisitions?
        #         => store specs per acquisition in memory
        # MIH: One commit per line seems silly. Why not update all files,
        # collect paths of updated files, and give them to a single `add`
        # at the very end?
        # MIH: if we fail, we fail and nothing is committed
        from datalad_hirni.support.spec_helpers import sort_spec
        json_py.dump2stream(sorted(spec, key=lambda x: sort_spec(x)),
                            spec_path)
        updated_files.append(spec_path)

        yield get_status_dict(
            status='ok',
            type=ap['type'],
            path=ap['path'],
            **res_kwargs)
        paths.append(ap)

    from datalad.dochelpers import single_or_plural
    from os import linesep
    message = "[HIRNI] Add specification {n_snippets} for: {paths}".format(
        n_snippets=single_or_plural("snippet", "snippets", len(paths)),
        paths=linesep.join(" - " + op.relpath(p['path'], dataset.path)
                           for p in paths)
        if len(paths) > 1
        else op.relpath(paths[0]['path'], dataset.path))
    for r in dataset.save(updated_files,
                          to_git=True,
                          message=message,
                          return_type='generator',
                          result_renderer='disabled'):
        yield r
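
# --- illustration (not part of the original code) ---
# A condensed sketch of the "unique value propagation" above: pre-fill a
# new snippet only with editable values that are unambiguous across all
# existing snippets. `collect_unique_overrides` is a hypothetical name.
def collect_unique_overrides(spec):
    uniques = {}
    for snippet in spec:
        for key, val in snippet.items():
            if isinstance(val, dict) and 'value' in val:
                uniques.setdefault(key, set()).add(val['value'])
    # keep only fields with exactly one observed value
    return {k: dict(value=v.pop(), approved=False)
            for k, v in uniques.items() if len(v) == 1}

# collect_unique_overrides([{'subject': {'value': '01'}},
#                           {'subject': {'value': '01'}}])
# -> {'subject': {'value': '01', 'approved': False}}
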
def __call__(message=None, files=None, dataset=None,
             auto_add_changes=False, version_tag=None,
             recursive=False, recursion_limit=None, super_datasets=False):
    # shortcut
    ds = require_dataset(dataset, check_installed=True, purpose='saving')

    if not ds.repo.repo.is_dirty(
            index=True,
            working_tree=True,
            untracked_files=True,
            submodules=True):
        # if we cannot see anything dirty at all, the only thing we could
        # do is tag
        if version_tag:
            ds.repo.tag(version_tag)
        # take the easy one out
        return

    # always yields list; empty if None
    files = assure_list(files)

    # track what is to be committed, so it becomes
    # possible to decide when/what to save further down
    # and one level up
    orig_hexsha = ds.repo.get_hexsha()
    to_commit = []

    # before anything, let's deal with missing submodules that may have
    # been rm'ed by the user
    # this will not alter/amend the history of the dataset
    deinit_deleted_submodules(ds)

    # XXX path resolution needs to happen on the input argument, not the
    # resolved dataset!
    # otherwise we will not be able to figure out whether there was an
    # explicit dataset provided, or just a matching one resolved
    # automatically.
    # if files are provided but no dataset, we interpret them as
    # CWD-related
    if auto_add_changes:
        # use the dataset's base path to indicate that everything
        # should be saved
        if files:
            lgr.warning(
                "List of paths was provided to save but auto_add_changes "
                "was specified, so list of paths was ignored")
        files = [ds.path]
    else:
        # make sure we apply the usual path interpretation logic
        files = [resolve_path(p, dataset) for p in files]

    new_submodules = untracked_subdatasets_to_submodules(ds, files)
    if new_submodules:
        # make sure that .gitmodules is added to the list of files
        # to be committed. Adding to the index might not be enough if
        # a custom list of files was provided
        to_commit.append('.gitmodules')
        to_commit.extend(new_submodules)

    # now we should have a complete list of submodules to potentially
    # recurse into
    if recursive and (recursion_limit is None or recursion_limit > 0):
        # what subdatasets to touch?
        subdss = []
        if auto_add_changes:
            # all installed 1st-level ones
            # we only want immediate subdatasets, higher depths will come
            # via recursion
            subdss = [Dataset(opj(ds.path, subds_path))
                      for subds_path in ds.get_subdatasets(recursive=False)]
        elif files is not None:
            # only subdatasets that contain any of the to-be-considered
            # paths
            # TODO: the same deductions will be redone later again
            # very inefficient. Should be just sorted into subds
            # once!
            subdss = [ds.get_containing_subdataset(p, recursion_limit=1)
                      for p in files]

        # skip anything that isn't installed, or this dataset
        subdss = [d for d in subdss if d.is_installed() and d != ds]

        prop_recursion_limit = \
            None if recursion_limit is None else max(recursion_limit - 1, 0)

        for subds in subdss:
            # TODO: just make use of get._sort_paths_into_datasets
            # currently it is very inefficient since for the same ds
            # it asks about subdatasets for every file!
            subds_files = []  # files belonging to the subds
            todo_files = []  # leftover files
            for f in files:
                if ds.get_containing_subdataset(
                        f, recursion_limit=1) == subds:
                    subds_files.append(f)
                else:
                    todo_files.append(f)
            files = todo_files

            subds_modified = Save.__call__(
                message=message,
                files=subds_files,
                dataset=subds,
                auto_add_changes=auto_add_changes,
                version_tag=version_tag,
                recursive=recursive and (prop_recursion_limit is None
                                         or prop_recursion_limit > 0),
                recursion_limit=prop_recursion_limit,
            )
            if subds_modified:
                # stage changes in this submodule
                subdspath = relpath(subds.path, ds.path)
                ds.repo.add(subdspath, git=True)
                to_commit.append(subdspath)

    if files:  # could still be None without auto add changes
        ds_subdatasets = ds.get_subdatasets(recursive=False)
        subdatasets_paths = {opj(ds.path, f) for f in ds_subdatasets}
        # TODO: also use some centralized sorting into sub-datasets,
        # e.g. the one used in get
        ds_files = [
            f for f in files
            if f in subdatasets_paths
            or ds.get_containing_subdataset(f, recursion_limit=1) == ds]
        if len(ds_files):
            # XXX Is there a better way to handle files in mixed repos?
            ds.repo.add(ds_files)
            ds.repo.add(ds_files, git=True)
            to_commit.extend(ds_files)
        # it might be that the file itself is the submodule, so we might
        # need to commit .gitmodules
        for f in files:
            for subds in subdatasets_paths:
                if subds.rstrip('/') == f.rstrip('/'):
                    to_commit.append('.gitmodules')
                    break

    _datalad_msg = False
    if not message:
        message = 'Recorded existing changes'
        _datalad_msg = True

    # extend with files yet to be committed in this dataset
    to_commit.extend(files)
    # anything should be staged by now
    # however, staged submodule changes are not considered as
    # `index`, hence `submodules` needs to be True too
    # we can have an explicit list of stuff to save or (if no `files`
    # provided) have staged stuff
    if ds.repo.repo.is_dirty(
            index=True,
            working_tree=False,
            untracked_files=False,
            submodules=True):
        # Analyze the list of known-to-be-committed files/submodules,
        # see that nothing points outside, and then convert to relative
        # paths
        to_commit_rel = []
        if to_commit:
            repopath = ds.repo.path
            for f in to_commit:
                if isabs(f):
                    frel = relpath(f, repopath)
                    if frel.startswith(pardir):
                        # XXX may be just a warning and skip?
                        raise RuntimeError(
                            "Path %s outside of the dataset %s. "
                            "Can't commit" % (f, ds))
                    f = frel
                to_commit_rel.append(f)
            to_commit_rel = sorted(set(to_commit_rel))
            if '.' in to_commit_rel:
                # we need to commit everything
                to_commit_rel = []

        ds.repo.commit(message, options=to_commit_rel,
                       _datalad_msg=_datalad_msg)
    elif to_commit:
        lgr.warning(
            "Was instructed to commit %s files but repository is not dirty",
            to_commit)
    elif not auto_add_changes:
        lgr.info(
            'Nothing to save, consider auto-detection of changes, '
            'if this is unexpected.')

    # MIH: let's tag even if there was nothing to commit. I'd forget this
    # option too often...
    if version_tag:
        ds.repo.tag(version_tag)

    _was_modified = ds.repo.get_hexsha() != orig_hexsha

    # and now we could consider saving our changes within super-datasets
    # Let's float up until we get to a non-dataset
    if super_datasets:
        if _was_modified:
            if version_tag:
                lgr.info(
                    "Version tag %s will not be applied to super datasets",
                    version_tag)
            superds = ds
            while True:
                supersubds = superds
                superds = superds.get_superdataset(datalad_only=True)
                if not superds:
                    break
                Save.__call__(
                    message=message
                    + " [origin: %s]" % relpath(ds.path, superds.path),
                    files=[relpath(supersubds.path, superds.path)],
                    dataset=superds,
                    auto_add_changes=False,
                    version_tag=None,
                    recursive=False,
                )
        else:
            lgr.info(
                "Not trying to save super-datasets since there were "
                "no modifications")

    # TODO: figure out what we should return for recursive/super_datasets
    # shouldn't we return all commits???
    return ds.repo.repo.head.commit if _was_modified else None
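
# --- illustration (not part of the original code) ---
# The "float up through superdatasets" loop above, isolated: walk from a
# dataset to the topmost superdataset, yielding (super, immediate sub)
# pairs. `get_superdataset` is passed in to keep the sketch
# self-contained; in the code above it is the bound Dataset method.
def walk_superdatasets(ds, get_superdataset):
    sub = ds
    super_ = get_superdataset(sub)
    while super_ is not None:
        yield super_, sub
        sub, super_ = super_, get_superdataset(super_)
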
def __call__(path=None, spec=None, dataset=None, subject=None,
             anon_subject=None, acquisition=None, properties=None):

    # TODO: acquisition can probably be removed (or made an alternative
    # to derive spec and/or dicom location from)
    # Change, so path needs to point directly to dicom ds?
    # Or just use acq and remove path?

    dataset = require_dataset(dataset, check_installed=True,
                              purpose="spec from dicoms")

    from datalad.utils import assure_list
    if path is not None:
        path = assure_list(path)
        path = [resolve_path(p, dataset) for p in path]
    else:
        raise InsufficientArgumentsError(
            "insufficient arguments for dicom2spec: a path is required")

    # TODO: We should be able to deal with several paths at once
    #       ATM we aren't (see also commit + message of actual spec)
    assert len(path) == 1

    if not spec:
        raise InsufficientArgumentsError(
            "insufficient arguments for dicom2spec: a spec file is "
            "required")
        # TODO: That's prob. wrong. We can derive default spec from
        # acquisition
    else:
        spec = resolve_path(spec, dataset)

    spec_series_list = \
        [r for r in json_py.load_stream(spec)] \
        if op.exists(spec) else list()

    # get dataset level metadata:
    found_some = False
    for meta in dataset.meta_dump(
            path,
            recursive=False,  # always False?
            reporton='datasets',
            return_type='generator',
            result_renderer='disabled'):
        if meta.get('status', None) not in ['ok', 'notneeded']:
            yield meta
            continue

        if 'dicom' not in meta['metadata']:
            # TODO: Really "notneeded" or simply not a result at all?
            yield dict(
                status='notneeded',
                message=("found no DICOM metadata for %s", meta['path']),
                path=meta['path'],
                type='dataset',
                action='dicom2spec',
                logger=lgr)
            continue

        if 'Series' not in meta['metadata']['dicom'] or \
                not meta['metadata']['dicom']['Series']:
            yield dict(
                status='impossible',
                message=("no image series detected in DICOM metadata of"
                         " %s", meta['path']),
                path=meta['path'],
                type='dataset',
                action='dicom2spec',
                logger=lgr)
            continue

        found_some = True

        overrides = dict()
        if properties:
            # load from file or json string
            props = json_py.load(properties) \
                if op.exists(properties) else json_py.loads(properties)
            # turn into editable, pre-approved records
            props = {k: dict(value=v, approved=True)
                     for k, v in props.items()}
            overrides.update(props)

        spec_series_list = add_to_spec(
            meta,
            spec_series_list,
            op.dirname(spec),
            subject=subject,
            anon_subject=anon_subject,
            # session=session,
            # TODO: parameter "session" was what we now call acquisition.
            # This is NOT a good default for bids_session!
            # Particularly wrt anonymization
            overrides=overrides,
            dataset=dataset)

    if not found_some:
        yield dict(
            status='impossible',
            message="found no DICOM metadata",
            path=path,
            # TODO: arguably should be 'file' or 'dataset', depending on
            # path
            type='file',
            action='dicom2spec',
            logger=lgr)
        return

    # TODO: RF needed. This rule should go elsewhere:
    # ignore duplicates (prob. reruns of aborted runs)
    # -> convert highest id only
    # Note: This sorting is a q&d hack!
    # TODO: Sorting needs to become more sophisticated + include notion
    # of :all
    spec_series_list = sorted(
        spec_series_list,
        key=lambda x: get_specval(x, 'id') if 'id' in x.keys() else 0)
    for i in range(len(spec_series_list)):
        # Note: Removed the following line from the condition below,
        # since it appears to be pointless. Value for 'converter' used to
        # be 'heudiconv' or 'ignore' for a 'dicomseries', so it's not
        # clear ATM what case this could possibly have caught:
        # heuristic.has_specval(spec_series_list[i], "converter") and \
        if spec_series_list[i]["type"] == "dicomseries" and \
                has_specval(spec_series_list[i], "bids-run") and \
                get_specval(spec_series_list[i], "bids-run") in \
                [get_specval(s, "bids-run")
                 for s in spec_series_list[i + 1:]
                 if get_specval(s, "description") ==
                 get_specval(spec_series_list[i], "description") and
                 get_specval(s, "id") >
                 get_specval(spec_series_list[i], "id")]:
            lgr.debug("Ignore SeriesNumber %s for conversion", i)
            spec_series_list[i]["tags"].append(
                'hirni-dicom-converter-ignore')

    lgr.debug("Storing specification (%s)", spec)
    # store as a stream (one record per file) to be able to
    # easily concat files without having to parse them, or
    # process them line by line without having to fully parse them
    from datalad_hirni.support.spec_helpers import sort_spec
    # Note: Sorting paradigm needs to change. See above.
    # spec_series_list = sorted(spec_series_list,
    #                           key=lambda x: sort_spec(x))
    json_py.dump2stream(spec_series_list, spec)

    # make sure spec is in git:
    dataset.repo.set_gitattributes(
        [(spec, {'annex.largefiles': 'nothing'})], '.gitattributes')

    for r in Save.__call__(
            dataset=dataset,
            path=[spec, '.gitattributes'],
            to_git=True,
            message="[HIRNI] Added study specification snippet for %s"
                    % op.relpath(path[0], dataset.path),
            return_type='generator',
            result_renderer='disabled'):
        if r.get('status', None) not in ['ok', 'notneeded']:
            yield r
        elif r['path'] in [spec, op.join(dataset.path, '.gitattributes')] \
                and r['type'] == 'file':
            r['action'] = 'dicom2spec'
            r['logger'] = lgr
            yield r
        elif r['type'] == 'dataset':
            # 'ok' or 'notneeded' for a dataset is okay, since we commit
            # the spec. But it's not a result to yield
            continue
        else:
            # anything else shouldn't happen
            yield dict(
                status='error',
                message=("unexpected result from save: %s", r),
                # TODO: This actually isn't clear - get it from `r`
                path=spec,
                type='file',
                action='dicom2spec',
                logger=lgr)
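
# --- illustration (not part of the original code) ---
# A more readable restatement of the long duplicate-detection condition
# above: a 'dicomseries' snippet is tagged for ignoring when another
# snippet shares its description and bids-run but has a higher id, i.e.
# only the highest-id rerun gets converted. `get` stands in for hirni's
# get_specval(); this sketch assumes every snippet carries these fields,
# whereas the original guards with has_specval().
def find_superseded(snippets, get):
    ignore = []
    for i, s in enumerate(snippets):
        if s.get('type') != 'dicomseries':
            continue
        if any(o is not s
               and get(o, 'description') == get(s, 'description')
               and get(o, 'bids-run') == get(s, 'bids-run')
               and get(o, 'id') > get(s, 'id')
               for o in snippets):
            ignore.append(i)
    return ignore
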
def diff_dataset(
        dataset,
        fr,
        to,
        constant_refs,
        path=None,
        annex=None,
        untracked='normal',
        recursive=False,
        recursion_limit=None,
        eval_file_type=True,
        reporting_order='depth-first'):
    """Internal helper to diff a dataset

    Parameters
    ----------
    dataset : Dataset
      Dataset to perform the diff on. `fr` and `to` parameters are
      interpreted in the context of this dataset.
    fr : str
      Commit-ish to compare from.
    to : str
      Commit-ish to compare to.
    constant_refs : bool
      If True, `fr` and `to` will be passed on unmodified to diff
      operations on subdatasets. This can be useful with symbolic
      references like tags to report subdataset changes independent of
      superdataset changes. If False, `fr` and `to` will be translated
      to the subdataset commit-ish that match the given commit-ish in
      the superdataset.
    path : Path-like, optional
      Paths to constrain the diff to (see main diff() command).
    annex : str, optional
      Reporting mode for annex properties (see main diff() command).
    untracked : str, optional
      Reporting mode for untracked content (see main diff() command).
    recursive : bool, optional
      Flag to enable recursive operation (see main diff() command).
    recursion_limit : int, optional
      Recursion limit (see main diff() command).
    eval_file_type : bool, optional
      Whether to perform file type discrimination between real symlinks
      and symlinks representing annex'ed files. This can be expensive
      in datasets with many files.
    reporting_order : {'depth-first', 'breadth-first'}, optional
      By default, subdataset content records are reported after the
      record on the subdataset's submodule in a superdataset
      (depth-first). Alternatively, report all superdataset records
      first, before reporting any subdataset content records
      (breadth-first).

    Yields
    ------
    dict
      DataLad result records.
    """
    if reporting_order not in ('depth-first', 'breadth-first'):
        raise ValueError('Unknown reporting order: {}'.format(
            reporting_order))

    ds = require_dataset(
        dataset, check_installed=True, purpose='difference reporting')

    # we cannot really perform any sorting of paths into subdatasets
    # or rejecting paths based on the state of the filesystem, as
    # we need to be able to compare with states that are not represented
    # in the worktree (anymore)
    if path:
        ps = []
        # sort any path argument into the respective subdatasets
        for p in sorted(assure_list(path)):
            # it is important to capture the exact form of the
            # given path argument, before any normalization happens
            # distinguish rsync-link syntax to identify
            # a dataset as whole (e.g. 'ds') vs its
            # content (e.g. 'ds/')
            # special case is the root dataset, always report its content
            # changes
            orig_path = str(p)
            resolved_path = resolve_path(p, dataset)
            p = \
                resolved_path, \
                orig_path.endswith(op.sep) or resolved_path == ds.pathobj
            str_path = str(p[0])
            root = get_dataset_root(str_path)
            if root is None:
                # no root, not possibly underneath the refds
                yield dict(
                    action='status',
                    path=str_path,
                    refds=ds.path,
                    status='error',
                    message='path not underneath this dataset',
                    logger=lgr)
                continue
            if path_under_rev_dataset(ds, str_path) is None:
                # nothing we support handling any further
                # there is only a single refds
                yield dict(
                    path=str_path,
                    refds=ds.path,
                    action='diff',
                    status='error',
                    message=(
                        "dataset containing given paths is not underneath "
                        "the reference dataset %s: %s", ds, str_path),
                    logger=lgr,
                )
                continue

            ps.append(p)
        path = ps

    # TODO we might want to move away from the single-pass+immediate-yield
    # paradigm for this command. If we gather all information first, we
    # could do post-processing and detect when a file (same gitsha, or
    # same key) was copied/moved from another dataset. Another command
    # (e.g. save) could act on this information and also move/copy
    # availability information or at least enhance the respective commit
    # message with cross-dataset provenance info

    # cache to help avoid duplicate status queries
    content_info_cache = {}
    for res in _diff_ds(
            ds,
            fr,
            to,
            constant_refs,
            recursion_limit
            if recursion_limit is not None and recursive
            else -1 if recursive else 0,
            # TODO recode paths to repo path reference
            origpaths=None if not path else OrderedDict(path),
            untracked=untracked,
            annexinfo=annex,
            eval_file_type=eval_file_type,
            cache=content_info_cache,
            order=reporting_order):
        res.update(
            refds=ds.path,
            logger=lgr,
            action='diff',
        )
        yield res
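
# --- illustration (not part of the original code) ---
# The rsync-like path semantics used above, in isolation: 'ds' addresses
# a (sub)dataset as a whole, 'ds/' addresses its content, and the root
# dataset is always reported by content. Approximated with pathlib
# instead of DataLad's resolve_path(); the function name is hypothetical.
import os
from pathlib import Path

def annotate_diff_path(raw, ds_root):
    resolved = Path(ds_root, raw)  # an absolute `raw` wins, per pathlib rules
    report_content = raw.endswith(os.sep) or resolved == Path(ds_root)
    return resolved, report_content

# annotate_diff_path('sub/', '/tmp/ds') -> (Path('/tmp/ds/sub'), True)
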
def sort_paths_by_datasets(refds, orig_dataset_arg, paths):
    """Sort paths into actually present datasets

    Parameters
    ----------
    refds : Dataset or None
      Dataset instance of a reference dataset, if any exists. This is
      not just a `dataset` argument in any form (for path resolution),
      see `orig_dataset_arg` for that, but has to be a Dataset instance
      that serves as the root of all operations.
    orig_dataset_arg : None or str
      The original dataset argument of the calling command. This is
      used to determine the path specification semantics, i.e.
      relative to CWD vs. relative to a given dataset
    paths : list
      Paths as given to the calling command

    Returns
    -------
    OrderedDict, list
      The dictionary contains all to-be-sorted paths as values to
      their respective containing dataset paths (as keys). The second
      list contains status dicts for any errors that may have occurred
      during processing. They can be yielded in the context of the
      calling command.
    """
    errors = []
    paths_by_ds = OrderedDict()
    # sort any path argument into the respective subdatasets
    for p in sorted(paths):
        # it is important to capture the exact form of the
        # given path argument, before any normalization happens
        # for further decision logic below
        orig_path = text_type(p)
        p = resolve_path(p, orig_dataset_arg)
        root = rev_get_dataset_root(text_type(p))
        if root is None:
            # no root, not possibly underneath the refds
            errors.append(dict(
                action='status',
                path=p,
                status='error',
                message='path not underneath this dataset',
                logger=lgr))
            continue
        else:
            if refds and root == text_type(p) and \
                    not orig_path.endswith(op.sep):
                # the given path is pointing to a dataset
                # distinguish rsync-link syntax to identify
                # the dataset as whole (e.g. 'ds') vs its
                # content (e.g. 'ds/')
                super_root = rev_get_dataset_root(op.dirname(root))
                if super_root:
                    # the dataset identified by the path argument
                    # is contained in a superdataset, and no
                    # trailing path separator was found in the
                    # argument -> user wants to address the dataset
                    # as a whole (in the superdataset)
                    root = super_root
        root = Path(root)
        ps = paths_by_ds.get(root, [])
        ps.append(p)
        paths_by_ds[root] = ps
    return paths_by_ds, errors
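
# --- illustration (not part of the original code) ---
# A typical caller pattern for sort_paths_by_datasets(): surface the
# error records first, then act per containing dataset. `process` is a
# hypothetical per-dataset callback.
def process_sorted_paths(refds, dataset_arg, paths, process):
    paths_by_ds, errors = sort_paths_by_datasets(refds, dataset_arg, paths)
    for err in errors:
        yield err
    for ds_root, ds_paths in paths_by_ds.items():
        yield from process(ds_root, ds_paths)
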
def __call__(
        source,
        path=None,
        dataset=None,
        description=None,
        reckless=None):
    # did we explicitly get a dataset to install into?
    # if we got a dataset, path will be resolved against it.
    # Otherwise path will be resolved first.
    ds = require_dataset(
        dataset, check_installed=True, purpose='cloning') \
        if dataset is not None else dataset
    refds_path = ds.path if ds else None

    # legacy compatibility
    if reckless is True:
        # so that we can forget about how things used to be
        reckless = 'auto'

    if isinstance(source, Dataset):
        source = source.path

    if source == path:
        # even if they turn out to be identical after resolving symlinks
        # and more sophisticated witchcraft, it would still happily say
        # "it appears to be already installed", so we just catch an
        # obviously pointless input combination
        raise ValueError(
            "clone `source` and destination `path` are identical [{}]. "
            "If you are trying to add a subdataset simply use "
            "`save`".format(path))

    if path is not None:
        path = resolve_path(path, dataset)

    # derive target from source:
    if path is None:
        # we got nothing but a source. do something similar to git clone
        # and derive the path from the source and continue
        # since this is a relative `path`, resolve it:
        # we are not going to reuse the decoded URL, as this is done for
        # all source candidates in clone_dataset(); we just use it to
        # determine a destination path here in order to perform a bunch
        # of additional checks that shall not pollute the helper function
        source_ = decode_source_spec(
            source, cfg=None if ds is None else ds.config)
        path = resolve_path(source_['default_destpath'], dataset)
        lgr.debug("Determined clone target path from source")
    lgr.debug("Resolved clone target path to: '%s'", path)

    # there is no other way -- my intoxicated brain tells me
    assert(path is not None)

    result_props = dict(
        action='install',
        logger=lgr,
        refds=refds_path,
        source_url=source)

    try:
        # this will implicitly cause pathlib to run a bunch of checks
        # whether the present path makes any sense on the platform
        # we are running on -- we don't care if the path actually
        # exists at this point, but we want to abort early if the path
        # spec is determined to be useless
        path.exists()
    except OSError as e:
        yield get_status_dict(
            status='error',
            path=path,
            message=('cannot handle target path: %s', exc_str(e)),
            **result_props)
        return

    destination_dataset = Dataset(path)
    result_props['ds'] = destination_dataset

    if ds is not None and ds.pathobj not in path.parents:
        yield get_status_dict(
            status='error',
            message=(
                "clone target path '%s' not in specified target dataset"
                " '%s'", path, ds),
            **result_props)
        return

    # perform the actual cloning operation
    yield from clone_dataset(
        [source],
        destination_dataset,
        reckless,
        description,
        result_props,
        cfg=None if ds is None else ds.config,
    )

    # TODO handle any 'version' property handling and verification using
    # a dedicated public helper

    if ds is not None:
        # we created a dataset in another dataset
        # -> make submodule
        for r in ds.save(
                path,
                return_type='generator',
                result_filter=None,
                result_xfm=None,
                on_failure='ignore'):
            yield r
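
# --- illustration (not part of the original code) ---
# A rough stand-in for the "derive target from source" step above,
# mimicking `git clone`: use the last URL component, minus a .git suffix.
# DataLad's decode_source_spec() handles many more forms (e.g. ria: URLs);
# this covers only the common case and the function name is hypothetical.
from urllib.parse import urlparse

def default_destpath(source):
    name = urlparse(source).path.rstrip('/').split('/')[-1]
    return name[:-4] if name.endswith('.git') else (name or None)

# default_destpath('https://example.com/repos/mydata.git') -> 'mydata'
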
def __call__(dataset=None, dest=None, path=None,
             # Note: add remote currently disabled in publish
             # dest_url=None, dest_pushurl=None,
             with_data=None, recursive=False):

    # Note: add remote currently disabled in publish
    # if dest is None and (dest_url is not None
    #                      or dest_pushurl is not None):
    #     raise ValueError("""insufficient information for adding the
    #                      destination as a sibling (needs at least
    #                      a name)""")

    # shortcut
    ds = dataset

    if ds is not None and not isinstance(ds, Dataset):
        ds = Dataset(ds)
    if not path:
        path = curdir
    elif isinstance(path, list):
        return [Publish.__call__(
                dataset=ds, dest=dest, path=p,
                # Note: add remote currently disabled in publish
                # dest_url=dest_url,
                # dest_pushurl=dest_pushurl,
                with_data=with_data, recursive=recursive)
                for p in path]

    # resolve the location against the provided dataset
    if path is not None:
        path = resolve_path(path, ds)

    lgr.info("Publishing {0}".format(path))

    # if we have no dataset given, figure out which one we need to
    # operate on, based on the resolved location (that is now guaranteed
    # to be specified)
    if ds is None:
        # try to find a dataset at or above the location
        dspath = GitRepo.get_toppath(abspath(path))
        if dspath is None:
            # no top-level dataset found, use path as such
            dspath = path
        ds = Dataset(dspath)
    lgr.debug("Resolved dataset for publication: {0}".format(ds))
    assert(ds is not None)

    # it might still be about a subdataset of ds:
    if path is not None:
        relativepath = relpath(path, start=ds.path)
        subds = get_containing_subdataset(ds, relativepath)
        if subds.path != ds.path:
            # path belongs to a subdataset; hand it over
            lgr.debug("Hand over to submodule %s", subds.path)
            return subds.publish(
                dest=dest,
                path=relpath(path, start=subds.path),
                # Note: add remote currently disabled in publish
                # dest_url=dest_url,
                # dest_pushurl=dest_pushurl,
                with_data=with_data,
                recursive=recursive)

    # now we know we have to operate on ds. So, ds needs to be installed,
    # since we cannot publish anything from a not installed dataset,
    # can we?
    # (But maybe just the existence of ds.repo is important here.)
    if not ds.is_installed():
        raise ValueError("No installed dataset found at "
                         "{0}.".format(ds.path))
    assert(ds.repo is not None)

    # TODO: For now we can deal with a sibling(remote) name given by
    # `dest` only. Figure out when to allow for passing a local path or
    # URL directly and what to do in that case.

    # Note: we need an upstream remote, if there's none given. We could
    # wait for git push to complain, but we need to explicitly figure it
    # out for pushing the annex branch anyway, and we might as well fail
    # right here.

    # keep original dest in case it's None, for passing to recursive
    # calls:
    dest_resolved = dest
    if dest is None:
        # check for tracking branch's remote:
        try:
            std_out, std_err = \
                ds.repo._git_custom_command(
                    '',
                    ["git", "config", "--get",
                     "branch.{active_branch}.remote".format(
                         active_branch=ds.repo.git_get_active_branch())],
                    expect_fail=True)
        except CommandError as e:
            if e.code == 1 and e.stdout == "":
                std_out = None
            else:
                raise
        if std_out:
            dest_resolved = std_out.strip()
        else:
            # we have no remote given and no upstream => fail
            raise RuntimeError("No known default target for "
                               "publication and none given.")

    # upstream branch needed for update (merge) and subsequent push,
    # in case there is none:
    set_upstream = False
    try:
        # Note: a tracking branch is actually defined by the entry
        # "merge" PLUS the entry "remote"
        std_out, std_err = \
            ds.repo._git_custom_command(
                '',
                ["git", "config", "--get",
                 "branch.{active_branch}.merge".format(
                     active_branch=ds.repo.git_get_active_branch())],
                expect_fail=True)
    except CommandError as e:
        if e.code == 1 and e.stdout == "":
            # no tracking branch yet:
            set_upstream = True
        else:
            raise

    # is `dest` an already known remote?
    if dest_resolved not in ds.repo.git_get_remotes():
        # unknown remote
        raise ValueError("No sibling '%s' found." % dest_resolved)

        # Note: add remote currently disabled in publish
        # if dest_url is None:
        #     raise ValueError("No sibling '%s' found. Provide `dest-url`"
        #                      " to register it." % dest_resolved)
        # lgr.info("Sibling %s unknown. Registering ...")
        #
        # # Fill in URL-Template:
        # remote_url = dest_url.replace("%NAME", basename(ds.path))
        # # TODO: handle_name.replace("/", "-")) instead of basename()
        # #       - figure it out ;)
        # #       - either a dataset needs to discover superdatasets in
        # #         order to get its relative path to provide a name
        # #       - or: We need a different approach on the templates
        #
        # # Add the remote
        # ds.repo.git_remote_add(dest_resolved, remote_url)
        # if dest_pushurl:
        #     # Fill in template:
        #     remote_url_push = \
        #         dest_pushurl.replace("%NAME", basename(ds.path))
        #     # TODO: Different way of replacing %NAME; See above
        #
        #     # Modify push url:
        #     ds.repo._git_custom_command('',
        #                                 ["git", "remote",
        #                                  "set-url",
        #                                  "--push", dest_resolved,
        #                                  remote_url_push])
        # lgr.info("Added sibling '%s'." % dest)
        # lgr.debug("Added remote '%s':\n %s (fetch)\n%s (push)." %
        #           (dest_resolved, remote_url,
        #            remote_url_push if dest_pushurl else remote_url))

    # Note: add remote currently disabled in publish
    # else:
    #     # known remote: parameters dest-url-* currently invalid.
    #     # This may change to adapt the existing remote.
    #     if dest_url:
    #         lgr.warning("Sibling '%s' already exists for dataset '%s'. "
    #                     "Ignoring dest-url %s." %
    #                     (dest_resolved, ds.path, dest_url))
    #     if dest_pushurl:
    #         lgr.warning("Sibling '%s' already exists for dataset '%s'. "
    #                     "Ignoring dest-pushurl %s." %
    #                     (dest_resolved, ds.path, dest_pushurl))

    # Figure out what to publish
    if path is None or path == ds.path:
        # => publish the dataset itself
        # push local state:
        # TODO: Rework git_push in GitRepo
        cmd = ['git', 'push']
        if set_upstream:
            # no upstream branch yet
            cmd.append("--set-upstream")
        cmd += [dest_resolved, ds.repo.git_get_active_branch()]
        ds.repo._git_custom_command('', cmd)
        # push annex branch:
        if isinstance(ds.repo, AnnexRepo):
            ds.repo.git_push("%s +git-annex:git-annex" % dest_resolved)

        # TODO: if with_data is a shell pattern, we get a list when
        # called from shell, right?
        # => adapt the following and check constraints to allow for that
        if with_data:
            ds.repo._git_custom_command(
                '', ["git", "annex", "copy"] + with_data
                    + ["--to", dest_resolved])

        if recursive and ds.get_dataset_handles() != []:
            results = [ds]
            # Note: add remote currently disabled in publish
            # modify URL templates:
            # if dest_url:
            #     dest_url = dest_url.replace(
            #         '%NAME', basename(ds.path) + '-%NAME')
            # if dest_pushurl:
            #     dest_pushurl = dest_pushurl.replace(
            #         '%NAME', basename(ds.path) + '-%NAME')
            for subds in ds.get_dataset_handles():
                results.append(Dataset(opj(ds.path, subds)).publish(
                    dest=dest,
                    # Note: use `dest` instead of `dest_resolved` in case
                    # dest was None, so subdatasets would use their
                    # default as well
                    # Note: add remote currently disabled in publish
                    # dest_url=dest_url,
                    # dest_pushurl=dest_pushurl,
                    with_data=with_data,
                    recursive=recursive))
            return results

        return ds

    elif exists(path):
        # At this point `path` is not referencing a (sub)dataset.
        # An annexed file is the only thing left that `path` might be
        # validly pointing to. Anything else we can't handle currently.
        if isinstance(ds.repo, AnnexRepo):
            try:
                if ds.repo.get_file_key(relativepath):
                    # file is in annex, publish it
                    ds.repo._run_annex_command(
                        'copy',
                        annex_options=[path, '--to=%s' % dest_resolved])
                    return path
            except (FileInGitError, FileNotInAnnexError):
                pass
        # `path` can't be published
        lgr.warning("Don't know how to publish %s.", path)
        return None

    else:
        # nothing to publish found
        lgr.warning("Nothing to publish found at %s.", path)
        return None
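
# --- illustration (not part of the original code) ---
# The default-target lookup above, expressed with plain git: read
# branch.<name>.remote for the active branch. A missing key makes
# `git config --get` exit 1 with empty output, which maps to "no
# tracking remote configured". The function name is hypothetical.
import subprocess

def tracking_remote(repo_path, branch):
    res = subprocess.run(
        ['git', '-C', repo_path, 'config', '--get',
         'branch.{}.remote'.format(branch)],
        capture_output=True, text=True)
    if res.returncode == 1 and not res.stdout:
        return None
    res.check_returncode()
    return res.stdout.strip()
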