def test_resolve_path(path):
    if str(Path(path).resolve()) != path:
        raise SkipTest("Test assumptions require non-symlinked parent paths")
    # initially ran into on OSX https://github.com/datalad/datalad/issues/2406
    opath = op.join(path, "origin")
    os.makedirs(opath)
    if not on_windows:
        lpath = op.join(path, "linked")
        os.symlink('origin', lpath)

    ds_global = Dataset(path)
    # path resolution of absolute paths is not influenced by symlinks
    # ignore the linked path on windows, it is not a symlink in the POSIX sense
    for d in (opath,) if on_windows else (opath, lpath):
        ds_local = Dataset(d)
        # no symlink resolution
        eq_(str(resolve_path(d)), d)
        # list comes out as a list
        eq_(resolve_path([d]), [Path(d)])
        # multiple OK
        eq_(resolve_path([d, d]), [Path(d), Path(d)])
        with chpwd(d):
            # be aware: knows about cwd, but this CWD has symlinks resolved
            eq_(str(resolve_path(d).cwd()), opath)
            # using pathlib's `resolve()` will resolve any symlinks
            # also resolve `opath`, as on old windows systems the path might
            # come in crippled (e.g. C:\Users\MIKE~1/...)
            # and the comparison would fail without justification
            eq_(resolve_path('.').resolve(), ut.Path(opath).resolve())
            # no norming, but absolute paths, without resolving links
            eq_(resolve_path('.'), ut.Path(d))
            eq_(str(resolve_path('.')), d)
            # there is no concept of an "explicit" relative path anymore;
            # relative is relative, regardless of the specific syntax
            eq_(resolve_path(op.join(os.curdir, 'bu'), ds=ds_global),
                ds_global.pathobj / 'bu')
            # there is no full normpath-ing or other funky resolution of
            # parent directory back-reference
            eq_(str(resolve_path(op.join(os.pardir, 'bu'), ds=ds_global)),
                op.join(ds_global.path, os.pardir, 'bu'))
        # resolve against a dataset given as a path/str
        # (cmdline input scenario)
        eq_(resolve_path('bu', ds=ds_local.path), Path.cwd() / 'bu')
        eq_(resolve_path('bu', ds=ds_global.path), Path.cwd() / 'bu')
        # resolve against a dataset given as a dataset instance
        # (object method scenario)
        eq_(resolve_path('bu', ds=ds_local), ds_local.pathobj / 'bu')
        eq_(resolve_path('bu', ds=ds_global), ds_global.pathobj / 'bu')

    # not being inside a dataset doesn't change the resolution result
    eq_(resolve_path(op.join(os.curdir, 'bu'), ds=ds_global),
        ds_global.pathobj / 'bu')
    eq_(str(resolve_path(op.join(os.pardir, 'bu'), ds=ds_global)),
        op.join(ds_global.path, os.pardir, 'bu'))
def custom_result_renderer(res, **kwargs):  # pragma: no cover
    if not (res['status'] == 'ok' \
            and res['action'] in ('status', 'diff') \
            and res.get('state', None) != 'clean'):
        # logging reported already
        return
    from datalad.ui import ui
    # when to render relative paths:
    #  1) if a dataset arg was given
    #  2) if CWD is the refds
    refds = res.get('refds', None)
    refds = refds if kwargs.get('dataset', None) is not None \
        or refds == os.getcwd() else None
    path = res['path'] if refds is None \
        else str(ut.Path(res['path']).relative_to(refds))
    type_ = res.get('type', res.get('type_src', ''))
    max_len = len('untracked')
    state = res['state']
    ui.message('{fill}{state}: {path}{type_}'.format(
        fill=' ' * max(0, max_len - len(state)),
        state=ac.color_word(
            state,
            STATE_COLOR_MAP.get(res['state'], ac.WHITE)),
        path=path,
        type_=' ({})'.format(
            ac.color_word(type_, ac.MAGENTA) if type_ else '')))
def pathobj(self):
    """pathobj for the dataset"""
    # XXX this relies on the assumption that self._path as managed
    # by the base class is always a native path
    if not self._pathobj:
        self._pathobj = ut.Path(self._path)
    return self._pathobj
def custom_result_renderer(res, **kwargs):  # pragma: no cover
    if not (res['status'] == 'ok' \
            and res['action'] in ('status', 'diff') \
            and res.get('state', None) != 'clean'):
        # logging reported already
        return
    from datalad.ui import ui
    # when to render relative paths:
    #  1) if a dataset arg was given
    #  2) if CWD is the refds
    refds = res.get('refds', None)
    refds = refds if kwargs.get('dataset', None) is not None \
        or refds == os.getcwd() else None
    # Note: We have to force unicode for res['path'] because
    # interface.utils encodes it on py2 before passing it to
    # custom_result_renderer().
    path = assure_unicode(res['path']) if refds is None \
        else text_type(ut.Path(res['path']).relative_to(refds))
    type_ = res.get('type', res.get('type_src', ''))
    max_len = len('untracked')
    state = res.get('state', 'unknown')
    ui.message(u'{fill}{state}: {path}{type_}'.format(
        fill=' ' * max(0, max_len - len(state)),
        state=ac.color_word(
            state,
            STATE_COLOR_MAP.get(res.get('state', 'unknown'))),
        path=path,
        type_=' ({})'.format(
            ac.color_word(type_, ac.MAGENTA) if type_ else '')))
def test_status_basics(path, linkpath, otherdir):
    if has_symlink_capability():
        # make it more complicated by default
        ut.Path(linkpath).symlink_to(path, target_is_directory=True)
        path = linkpath

    with chpwd(path):
        assert_raises(NoDatasetFound, status)
    ds = Dataset(path).create()
    # outcome identical between ds= and auto-discovery
    with chpwd(path):
        assert_raises(IncompleteResultsError, status, path=otherdir)
        stat = status(result_renderer=None)
    eq_(stat, ds.status(result_renderer=None))
    assert_status('ok', stat)
    # we have a bunch of reports (be vague to be robust to future changes)
    assert len(stat) > 2
    # check the composition
    for s in stat:
        eq_(s['status'], 'ok')
        eq_(s['action'], 'status')
        eq_(s['state'], 'clean')
        eq_(s['type'], 'file')
        assert_in('gitshasum', s)
        assert_in('bytesize', s)
        eq_(s['refds'], ds.path)
def custom_result_renderer(res, **kwargs):  # pragma: no cover
    if (res['status'] == 'ok'
            and res['action'] in ('status', 'diff')
            and res.get('state') == 'clean'):
        # this renderer will be silent for clean status|diff results
        return
    if res['status'] != 'ok' or res['action'] not in ('status', 'diff'):
        # whatever this renderer cannot account for, send to generic
        generic_result_renderer(res)
        return
    from datalad.ui import ui
    # when to render relative paths:
    #  1) if a dataset arg was given
    #  2) if CWD is the refds
    refds = res.get('refds', None)
    refds = refds if kwargs.get('dataset', None) is not None \
        or refds == os.getcwd() else None
    path = res['path'] if refds is None \
        else str(ut.Path(res['path']).relative_to(refds))
    type_ = res.get('type', res.get('type_src', ''))
    max_len = len('untracked')
    state = res.get('state', 'unknown')
    ui.message(u'{fill}{state}: {path}{type_}'.format(
        fill=' ' * max(0, max_len - len(state)),
        state=ac.color_word(
            state,
            STATE_COLOR_MAP.get(res.get('state', 'unknown'))),
        path=path,
        type_=' ({})'.format(
            ac.color_word(type_, ac.MAGENTA) if type_ else '')))
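# --- illustrative sketch (added for clarity; not part of the original module) ---
# The renderer above right-aligns the state label in a column as wide as the
# longest state word ('untracked'), so the path column lines up for every
# reported state. A minimal, dependency-free sketch of just that formatting
# rule (plain strings, no color handling); the helper name is hypothetical:
def _sketch_format_status_line(state, path, type_=''):
    max_len = len('untracked')
    fill = ' ' * max(0, max_len - len(state))
    return '{fill}{state}: {path}{type_}'.format(
        fill=fill, state=state, path=path,
        type_=' ({})'.format(type_) if type_ else '')

# _sketch_format_status_line('modified', 'dir/file.txt', 'file')
# -> ' modified: dir/file.txt (file)'
# _sketch_format_status_line('untracked', 'dir/new.txt', 'file')
# -> 'untracked: dir/new.txt (file)'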
def test_add_files(path):
    ds = Dataset(path).create(force=True)

    test_list_1 = ['test_annex.txt']
    test_list_2 = ['test.txt']
    test_list_3 = ['test1.dat', 'test2.dat']
    test_list_4 = [
        op.join('dir', 'testindir'),
        op.join('dir', OBSCURE_FILENAME)
    ]

    for arg in [(test_list_1[0], False), (test_list_2[0], True),
                (test_list_3, False), (test_list_4, False)]:
        # special case 4: give the dir:
        if arg[0] == test_list_4:
            result = ds.save('dir', to_git=arg[1])
            status = ds.repo.annexstatus(['dir'])
        else:
            result = ds.save(arg[0], to_git=arg[1])
            for a in assure_list(arg[0]):
                assert_result_count(result, 1, path=text_type(ds.pathobj / a))
            status = ds.repo.get_content_annexinfo(
                ut.Path(p) for p in assure_list(arg[0]))
        for f, p in iteritems(status):
            if arg[1]:
                assert p.get('key', None) is None, f
            else:
                assert p.get('key', None) is not None, f
def test_rev_resolve_path(path):
    if op.realpath(path) != path:
        raise SkipTest("Test assumptions require non-symlinked parent paths")
    # initially ran into on OSX https://github.com/datalad/datalad/issues/2406
    opath = op.join(path, "origin")
    os.makedirs(opath)
    if not on_windows:
        lpath = op.join(path, "linked")
        os.symlink('origin', lpath)

    ds_global = Dataset(path)
    # path resolution of absolute paths is not influenced by symlinks
    # ignore the linked path on windows, it is not a symlink in the POSIX sense
    for d in (opath,) if on_windows else (opath, lpath):
        ds_local = Dataset(d)
        # no symlink resolution
        eq_(str(rev_resolve_path(d)), d)
        with chpwd(d):
            # be aware: knows about cwd, but this CWD has symlinks resolved
            eq_(str(rev_resolve_path(d).cwd()), opath)
            # using pathlib's `resolve()` will resolve any symlinks
            # also resolve `opath`, as on old windows systems the path might
            # come in crippled (e.g. C:\Users\MIKE~1/...)
            # and the comparison would fail without justification
            eq_(rev_resolve_path('.').resolve(), ut.Path(opath).resolve())
            # no norming, but absolute paths, without resolving links
            eq_(rev_resolve_path('.'), ut.Path(d))
            eq_(str(rev_resolve_path('.')), d)
            eq_(str(rev_resolve_path(op.join(os.curdir, 'bu'), ds=ds_global)),
                op.join(d, 'bu'))
            eq_(str(rev_resolve_path(op.join(os.pardir, 'bu'), ds=ds_global)),
                op.join(ds_global.path, 'bu'))
            # resolve against a dataset
            eq_(str(rev_resolve_path('bu', ds=ds_local)), op.join(d, 'bu'))
            eq_(str(rev_resolve_path('bu', ds=ds_global)), op.join(path, 'bu'))
        # but paths outside the dataset are left untouched
        eq_(str(rev_resolve_path(op.join(os.curdir, 'bu'), ds=ds_global)),
            op.join(getpwd(), 'bu'))
        eq_(str(rev_resolve_path(op.join(os.pardir, 'bu'), ds=ds_global)),
            op.normpath(op.join(getpwd(), os.pardir, 'bu')))
def _flyweight_postproc_path(cls, path):
    """perform any desired path post-processing (e.g., dereferencing etc)

    By default - realpath to guarantee reuse. Derived classes (e.g., Dataset)
    could override to allow for symlinked datasets to have individual
    instances for multiple symlinks
    """
    # resolve symlinks to make sure we have exactly one instance per
    # physical repository at a time
    # do absolute() in addition to always get an absolute path
    # even with non-existing paths on windows
    resolved = str(ut.Path(path).resolve().absolute())
    if ut.on_windows and resolved.startswith('\\\\'):
        # resolve() ended up converting a mounted network drive into a UNC
        # path. Such paths are not supported (e.g. as cmd.exe CWD), hence redo
        # and take the absolute path at face value. This has the consequence
        # that we cannot detect repo duplicates mounted on different drives,
        # but this is no worse than on UNIX
        return str(ut.Path(path).absolute())
    return resolved
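# --- illustrative sketch (added for clarity; not part of the original module) ---
# The flyweight key produced above is the fully resolved path, so two
# spellings of the same physical repository (e.g. via a symlink) collapse onto
# a single cached instance. A minimal, datalad-independent illustration of
# that keying idea; the cache dict, helper name, and demo paths are
# hypothetical:
from pathlib import Path

_instance_cache = {}

def _sketch_get_instance(path):
    key = str(Path(path).resolve().absolute())
    return _instance_cache.setdefault(key, object())

# Given a symlink '/tmp/linked' -> '/tmp/origin', both spellings yield the
# same object:
# _sketch_get_instance('/tmp/origin') is _sketch_get_instance('/tmp/linked')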
def test_hashable(path):
    path = ut.Path(path)
    tryme = set()
    # is it considered hashable at all
    tryme.add(Dataset(path / 'one'))
    eq_(len(tryme), 1)
    # do another one, same class different path
    tryme.add(Dataset(path / 'two'))
    eq_(len(tryme), 2)
    # test whether two different types of repo instances pointing
    # to the same repo on disk are considered different
    Dataset(path).create()
    tryme.add(GitRepo(path))
    eq_(len(tryme), 3)
    tryme.add(AnnexRepo(path))
    eq_(len(tryme), 4)
def test_gh1597_simpler(path):
    ds = Dataset(path).create()
    # same goes for .gitattributes
    with open(op.join(ds.path, '.gitignore'), 'a') as f:
        f.write('*.swp\n')
    ds.save('.gitignore')
    assert_repo_status(ds.path)
    # put .gitattributes in some subdir and add all, should also go into Git
    attrfile = op.join('subdir', '.gitattributes')
    ds.repo.set_gitattributes([('*', dict(mycustomthing='this'))], attrfile)
    assert_repo_status(ds.path, untracked=[attrfile], untracked_mode='all')
    ds.save()
    assert_repo_status(ds.path)
    # no annex key, not in annex
    assert_not_in(
        'key',
        ds.repo.get_content_annexinfo([ut.Path(attrfile)]).popitem()[1])
def test_resolve_path_symlink_edition(path):
    deepest = ut.Path(path) / 'one' / 'two' / 'three'
    deepest_str = str(deepest)
    os.makedirs(deepest_str)
    with chpwd(deepest_str):
        # direct absolute
        eq_(deepest, resolve_path(deepest))
        eq_(deepest, resolve_path(deepest_str))
        # explicit direct relative
        eq_(deepest, resolve_path('.'))
        eq_(deepest, resolve_path(op.join('.', '.')))
        eq_(deepest, resolve_path(op.join('..', 'three')))
        eq_(deepest, resolve_path(op.join('..', '..', 'two', 'three')))
        eq_(deepest, resolve_path(op.join('..', '..', '..',
                                          'one', 'two', 'three')))
        # weird ones
        eq_(deepest, resolve_path(op.join('..', '.', 'three')))
        eq_(deepest, resolve_path(op.join('..', 'three', '.')))
        eq_(deepest, resolve_path(op.join('..', 'three', '.')))
        eq_(deepest, resolve_path(op.join('.', '..', 'three')))
def path_under_rev_dataset(ds, path):
    ds_path = ds.pathobj
    try:
        rpath = text_type(ut.Path(path).relative_to(ds_path))
        if not rpath.startswith(op.pardir):
            # path is already underneath the dataset
            return path
    except Exception:
        # whatever went wrong, we gotta play safe
        pass

    root = rev_get_dataset_root(text_type(path))
    while root is not None and not ds_path.samefile(root):
        # path and therefore root could be relative paths,
        # hence in the next round we cannot use dirname()
        # to jump into the next directory up, but we have
        # to use ./.. and get_dataset_root() will handle
        # the rest just fine
        root = rev_get_dataset_root(op.join(root, op.pardir))
    if root is None:
        return None
    return ds_path / op.relpath(text_type(path), root)
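# --- illustrative sketch (added for clarity; not part of the original module) ---
# The helper above recodes a path so it is expressed underneath the dataset's
# own (possibly symlinked) location, even if the path was located via the
# resolved/real repository root. The core recoding step is plain os.path
# arithmetic, shown here stand-alone with hypothetical paths:
import os.path as op
from pathlib import Path

ds_path = Path('/home/me/linked-ds')      # how the dataset is being addressed
real_root = '/data/real-ds'               # where the dataset root was located
given = '/data/real-ds/sub/dir/file.dat'  # a path expressed via the real root

recoded = ds_path / op.relpath(given, real_root)
# -> Path('/home/me/linked-ds/sub/dir/file.dat')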
def __call__( path=None, message=None, dataset=None, version_tag=None, recursive=False, recursion_limit=None, updated=False, message_file=None, to_git=None, ): if message and message_file: raise ValueError( "Both a message and message file were specified for save()") path = assure_list(path) if message_file: with open(message_file) as mfh: message = mfh.read() # we want 'normal' to achieve the most compact argument list # for git calls # untracked_mode = 'no' if updated else 'normal' # TODO however, Repo.add() would refuse to add any dotfiles # in a directory that is itself untracked, hence the only # choice is to go with potentially crazy long lists # until https://github.com/datalad/datalad/issues/1454 # has a resolution untracked_mode = 'no' if updated else 'all' # there are three basic scenarios: # 1. save modifications to any already tracked content # 2. save any content (including removal of deleted content) # to bring things to a clean state # 3. like (2), but only operate on a given subset of content # identified by paths # - all three have to work in conjunction with --recursive # - the difference between (1) and (2) should be no more # that a switch from --untracked=no to --untracked=all # in Repo.save() # we do not support # - simultaneous operations on multiple datasets from disjoint # dataset hierarchies, hence a single reference dataset must be # identifiable from the either # - curdir or # - the `dataset` argument. # This avoids complex annotation loops and hierarchy tracking. # - any modification upwards from the root dataset ds = require_dataset(dataset, check_installed=True, purpose='saving') # use status() to do all discovery and annotation of paths paths_by_ds = {} for s in Status()( # ATTN: it is vital to pass the `dataset` argument as it, # and not a dataset instance in order to maintain the path # semantics between here and the status() call dataset=dataset, path=path, untracked=untracked_mode, recursive=recursive, recursion_limit=recursion_limit, result_renderer='disabled'): # fish out status dict for this parent dataset ds_status = paths_by_ds.get(s['parentds'], {}) # reassemble path status info as repo.status() would have made it ds_status[ut.Path(s['path'])] = \ {k: v for k, v in iteritems(s) if k not in ( 'path', 'parentds', 'refds', 'status', 'action', 'logger')} paths_by_ds[s['parentds']] = ds_status lgr.debug('Determined %i datasets for saving from input arguments', len(paths_by_ds)) # figure out what datasets to process, start with the ones containing # the paths that were given as arguments discovered_datasets = list(paths_by_ds.keys()) if dataset: # if a reference dataset was given we want to save all the way up # to it, so let's throw it into the mix discovered_datasets.append(ds.path) # sort the datasets into (potentially) disjoint hierarchies, # or a single one, if a reference dataset was given dataset_hierarchies = get_tree_roots(discovered_datasets) for rootds, children in iteritems(dataset_hierarchies): edges = {} discover_dataset_trace_to_targets(rootds, children, [], edges, includeds=children) for superds, subdss in iteritems(edges): superds_status = paths_by_ds.get(superds, {}) for subds in subdss: # TODO actually start from an entry that may already # exist in the status record superds_status[ut.Path(subds)] = dict( # shot from the hip, some status config # to trigger this specific super/sub # relation to be saved state='untracked', type='dataset') paths_by_ds[superds] = superds_status # TODO parallelize, whenever we have multiple subdataset of a 
single # dataset they can all be processed simultaneously # sort list of dataset to handle, starting with the ones deep down for pdspath in sorted(paths_by_ds, reverse=True): pds = Dataset(pdspath) # pop status for this dataset, we are not coming back to it pds_status = { # for handing over to the low-level code, we recode any # path relative to the real repo location, this avoid # cumbersome symlink handling without context in the # lower levels pds.repo.pathobj / p.relative_to(pdspath): props for p, props in iteritems(paths_by_ds.pop(pdspath)) } start_commit = pds.repo.get_hexsha() if not all(p['state'] == 'clean' for p in pds_status.values()): for res in pds.repo.save_( message=message, # make sure to have the `path` arg be None, as we want # to prevent and bypass any additional repo.status() # calls paths=None, # prevent whining of GitRepo git=True if not hasattr(ds.repo, 'annexstatus') else to_git, # we are supplying the full status already, do not # detect anything else untracked='no', _status=pds_status): # TODO remove stringification when datalad-core can handle # path objects, or when PY3.6 is the lowest supported # version for k in ('path', 'refds'): if k in res: res[k] = str( # recode path back to dataset path anchor pds.pathobj / res[k].relative_to(pds.repo.pathobj)) yield res # report on the dataset itself dsres = dict( action='save', type='dataset', path=pds.path, refds=ds.path, status='ok' if start_commit != pds.repo.get_hexsha() else 'notneeded', logger=lgr, ) if not version_tag: yield dsres continue try: pds.repo.tag(version_tag) dsres.update(status='ok', version_tag=version_tag) yield dsres except CommandError as e: if dsres['status'] == 'ok': # first we yield the result for the actual save yield dsres.copy() # and now complain that tagging didn't work dsres.update(status='error', message=('cannot tag this version: %s', e.stderr.strip())) yield dsres
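# --- illustrative sketch (added for clarity; not part of the original module) ---
# Saving proceeds bottom-up: sorting the dataset paths (plain strings) in
# reverse order puts every subdataset before its superdataset, because a
# subdataset path always has its superdataset path as a prefix. A tiny
# illustration with hypothetical paths:
_paths = ['/ds', '/ds/sub', '/ds/sub/subsub', '/ds/other']
assert sorted(_paths, reverse=True) == [
    '/ds/sub/subsub', '/ds/sub', '/ds/other', '/ds']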
def get_paths_by_ds(refds, dataset_arg, paths, subdsroot_mode='rsync'): """Resolve and sort any paths into their containing datasets Any path will be associated (sorted into) its nearest containing dataset. It is irrelevant whether or not a path presently exists on the file system. However, only datasets that exist on the file system are used for sorting/association -- known, but non-existent subdatasets are not considered. Parameters ---------- refds: Dataset dataset_arg: Dataset or str or Path or None Any supported value given to a command's `dataset` argument. Given to `resolve_path()`. paths: list Any number of absolute or relative paths, in str-form or as Path instances, to be sorted into their respective datasets. See also the `subdsroot_mode` parameter. subdsroot_mode: {'rsync', 'super', 'sub'} Switch behavior for paths that are the root of a subdataset. By default ('rsync'), such a path is associated with its parent/superdataset, unless the path ends with a trailing directory separator, in which case it is sorted into the subdataset record (this resembles the path semantics of rsync, hence the label). In 'super' mode, the path is always placed with the superdataset record. Likewise, in 'sub' mode the path is always placed into the subdataset record. Returns ------- dict, list The first return value is the main result, a dictionary with root directories of all discovered datasets as keys and a list of the associated paths inside these datasets as values. Keys and values are normalized to be Path instances of absolute paths. The second return value is a list of all paths (again Path instances) that are not located underneath the reference dataset. """ ds_path = refds.path paths_by_ds = OrderedDict() errors = [] if not paths: # that was quick paths_by_ds[refds.pathobj] = None return paths_by_ds, errors # in order to guarantee proper path sorting, we first need to resolve all # of them (some may be str, some Path, some relative, some absolute) # step 1: normalize to unicode paths = map(ensure_unicode, paths) # step 2: resolve # for later comparison, we need to preserve the original value too paths = [(resolve_path(p, dataset_arg), str(p)) for p in paths] # OPT: store cache for dataset roots for each directory directly # listed in paths, or containing the path (if file) roots_cache = {} # sort any path argument into the respective subdatasets # sort by comparing the resolved Path instances, this puts top-level # paths first, leading to their datasets to be injected into the result # dict first for p, orig_path in sorted(paths, key=lambda x: x[0]): # TODO (left from implementing caching OPT): # Logic here sounds duplicated with discover_dataset_trace_to_targets # and even get_tree_roots of save. 
str_p = str(p) # query get_dataset_root caching for repeated queries within the same # directory if p.is_dir(): p_dir = str(p) else: # symlink, file, whatnot - seems to match logic in get_dataset_root p_dir = str(p.parent) try: root = roots_cache[p_dir] except KeyError: root = roots_cache[p_dir] = get_dataset_root(p_dir) # to become the root of the dataset that contains the path in question # in the context of (same basepath) as the reference dataset qds_inrefds = None if root is not None: qds_inrefds = path_under_rev_dataset(refds, root) if root is None or qds_inrefds is None: # no root, not possibly underneath the refds # or root that is not underneath/equal the reference dataset root errors.append(p) continue if root != qds_inrefds: # try to recode the dataset path wrt to the reference # dataset # the path that it might have been located by could # have been a resolved path or another funky thing # the path this dataset was located by is not how it would # be referenced underneath the refds (possibly resolved # realpath) -> recode all paths to be underneath the refds p = qds_inrefds / p.relative_to(root) root = qds_inrefds # Note: Compare to Dataset(root).path rather # than root to get same path normalization. if root == str_p and not Dataset(root).path == ds_path and ( subdsroot_mode == 'super' or (subdsroot_mode == 'rsync' and dataset_arg and not orig_path.endswith(op.sep))): # the given path is pointing to a subdataset # and we are either in 'super' mode, or in 'rsync' and found # rsync-link syntax to identify the dataset as whole # (e.g. 'ds') vs its content (e.g. 'ds/') root_dir = op.dirname(root) try: super_root = roots_cache[root_dir] except KeyError: super_root = roots_cache[root_dir] = get_dataset_root(root_dir) if super_root: # the dataset identified by the path argument # is contained in a superdataset, and no # trailing path separator was found in the # argument -> user wants to address the dataset # as a whole (in the superdataset) root = super_root root = ut.Path(root) ps = paths_by_ds.get(root, []) ps.append(p) paths_by_ds[root] = ps return paths_by_ds, errors
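# --- illustrative sketch (added for clarity; not part of the original module) ---
# The 'rsync' root mode above keys its decision on a trailing path separator
# in the *original* (pre-resolution) argument: 'subds' addresses the
# subdataset as a whole (recorded with the superdataset), while 'subds/'
# addresses its content (recorded with the subdataset itself). A minimal
# sketch of that decision rule alone; the helper name is hypothetical and
# POSIX separators are assumed in the examples:
import os.path as op

def _sketch_subdsroot_assignment(orig_path, mode='rsync', has_dataset_arg=True):
    """Return 'super' or 'sub': which record a subdataset-root path joins."""
    if mode == 'super':
        return 'super'
    if mode == 'sub':
        return 'sub'
    # 'rsync' mode: no trailing separator -> address the dataset as a whole
    if has_dataset_arg and not orig_path.endswith(op.sep):
        return 'super'
    return 'sub'

# _sketch_subdsroot_assignment('subds')   -> 'super'
# _sketch_subdsroot_assignment('subds/')  -> 'sub'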
def __call__(path=None, initopts=None, force=False, description=None, dataset=None, no_annex=False, fake_dates=False, cfg_proc=None): refds_path = dataset.path if hasattr(dataset, 'path') else dataset # two major cases # 1. we got a `dataset` -> we either want to create it (path is None), # or another dataset in it (path is not None) # 2. we got no dataset -> we want to create a fresh dataset at the # desired location, either at `path` or PWD # sanity check first if no_annex: if description: raise ValueError("Incompatible arguments: cannot specify " "description for annex repo and declaring " "no annex repo.") if path: path = rev_resolve_path(path, dataset) path = path if path \ else getpwd() if dataset is None \ else refds_path # we know that we need to create a dataset at `path` assert (path is not None) # prep for yield res = dict(action='create', path=text_type(path), logger=lgr, type='dataset', refds=refds_path) refds = None if refds_path and refds_path != path: refds = require_dataset(refds_path, check_installed=True, purpose='creating a subdataset') path_inrefds = path_under_rev_dataset(refds, path) if path_inrefds is None: yield dict( res, status='error', message=( "dataset containing given paths is not underneath " "the reference dataset %s: %s", dataset, text_type(path)), ) return # try to locate an immediate parent dataset # we want to know this (irrespective of whether we plan on adding # this new dataset to a parent) in order to avoid conflicts with # a potentially absent/uninstalled subdataset of the parent # in this location # it will cost some filesystem traversal though... parentds_path = rev_get_dataset_root( op.normpath(op.join(text_type(path), os.pardir))) if parentds_path: prepo = GitRepo(parentds_path) parentds_path = ut.Path(parentds_path) # we cannot get away with a simple # GitRepo.get_content_info(), as we need to detect # uninstalled/added subdatasets too check_path = ut.Path(path) pstatus = prepo.status( untracked='no', # limit query to target path for a potentially massive speed-up paths=[check_path.relative_to(parentds_path)]) if any(check_path == p or check_path in p.parents for p in pstatus): # redo the check in a slower fashion, it is already broken # let's take our time for a proper error message conflict = [ p for p in pstatus if check_path == p or check_path in p.parents ] res.update({ 'status': 'error', 'message': ('collision with content in parent dataset at %s: %s', text_type(parentds_path), [text_type(c) for c in conflict]) }) yield res return # another set of check to see whether the target path is pointing # into a known subdataset that is not around ATM subds_status = { parentds_path / k.relative_to(prepo.path) for k, v in iteritems(pstatus) if v.get('type', None) == 'dataset' } check_paths = [check_path] check_paths.extend(check_path.parents) if any(p in subds_status for p in check_paths): conflict = [p for p in check_paths if p in subds_status] res.update({ 'status': 'error', 'message': ('collision with %s (dataset) in dataset %s', text_type(conflict[0]), text_type(parentds_path)) }) yield res return # important to use the given Dataset object to avoid spurious ID # changes with not-yet-materialized Datasets tbds = dataset if isinstance(dataset, Dataset) and \ dataset.path == path else Dataset(text_type(path)) # don't create in non-empty directory without `force`: if op.isdir(tbds.path) and listdir(tbds.path) != [] and not force: res.update({ 'status': 'error', 'message': 'will not create a dataset in a non-empty directory, use ' '`force` option 
to ignore' }) yield res return # stuff that we create and want to have tracked with git (not annex) add_to_git = {} if initopts is not None and isinstance(initopts, list): initopts = {'_from_cmdline_': initopts} # create and configure desired repository if no_annex: lgr.info("Creating a new git repo at %s", tbds.path) tbrepo = GitRepo(tbds.path, url=None, create=True, create_sanity_checks=False, git_opts=initopts, fake_dates=fake_dates) # place a .noannex file to indicate annex to leave this repo alone stamp_path = ut.Path(tbrepo.path) / '.noannex' stamp_path.touch() add_to_git[stamp_path] = {'type': 'file', 'state': 'untracked'} else: # always come with annex when created from scratch lgr.info("Creating a new annex repo at %s", tbds.path) tbrepo = AnnexRepo( tbds.path, url=None, create=True, create_sanity_checks=False, # do not set backend here, to avoid a dedicated commit backend=None, # None causes version to be taken from config version=None, description=description, git_opts=initopts, fake_dates=fake_dates) # set the annex backend in .gitattributes as a staged change tbrepo.set_default_backend(cfg.obtain('datalad.repo.backend'), persistent=True, commit=False) add_to_git[tbds.repo.pathobj / '.gitattributes'] = { 'type': 'file', 'state': 'added' } # make sure that v6 annex repos never commit content under .datalad attrs_cfg = (('config', 'annex.largefiles', 'nothing'), ( 'metadata/aggregate*', 'annex.largefiles', 'nothing' ), ('metadata/objects/**', 'annex.largefiles', '({})'.format( cfg.obtain('datalad.metadata.create-aggregate-annex-limit')))) attrs = tbds.repo.get_gitattributes( [op.join('.datalad', i[0]) for i in attrs_cfg]) set_attrs = [] for p, k, v in attrs_cfg: if not attrs.get(op.join('.datalad', p), {}).get(k, None) == v: set_attrs.append((p, {k: v})) if set_attrs: tbds.repo.set_gitattributes(set_attrs, attrfile=op.join( '.datalad', '.gitattributes')) # prevent git annex from ever annexing .git* stuff (gh-1597) attrs = tbds.repo.get_gitattributes('.git') if not attrs.get('.git', {}).get('annex.largefiles', None) == 'nothing': tbds.repo.set_gitattributes([('**/.git*', { 'annex.largefiles': 'nothing' })]) # must use the repo.pathobj as this will have resolved symlinks add_to_git[tbds.repo.pathobj / '.gitattributes'] = { 'type': 'file', 'state': 'untracked' } # record an ID for this repo for the afterlife # to be able to track siblings and children id_var = 'datalad.dataset.id' # Note, that Dataset property `id` will change when we unset the # respective config. 
Therefore store it before: tbds_id = tbds.id if id_var in tbds.config: # make sure we reset this variable completely, in case of a # re-create tbds.config.unset(id_var, where='dataset') if _seed is None: # just the standard way uuid_id = uuid.uuid1().urn.split(':')[-1] else: # Let's generate preseeded ones uuid_id = str(uuid.UUID(int=random.getrandbits(128))) tbds.config.add(id_var, tbds_id if tbds_id is not None else uuid_id, where='dataset', reload=False) # make config overrides permanent in the repo config # this is similar to what `annex init` does # we are only doing this for config overrides and do not expose # a dedicated argument, because it is sufficient for the cmdline # and unnecessary for the Python API (there could simply be a # subsequence ds.config.add() call) for k, v in iteritems(tbds.config.overrides): tbds.config.add(k, v, where='local', reload=False) # all config manipulation is done -> fll reload tbds.config.reload() # must use the repo.pathobj as this will have resolved symlinks add_to_git[tbds.repo.pathobj / '.datalad'] = { 'type': 'directory', 'state': 'untracked' } # save everything, we need to do this now and cannot merge with the # call below, because we may need to add this subdataset to a parent # but cannot until we have a first commit tbds.repo.save( message='[DATALAD] new dataset', git=True, # we have to supply our own custom status, as the repo does # not have a single commit yet and the is no HEAD reference # TODO make `GitRepo.status()` robust to this state. _status=add_to_git, ) # the next only makes sense if we saved the created dataset, # otherwise we have no committed state to be registered # in the parent if isinstance(refds, Dataset) and refds.path != tbds.path: # we created a dataset in another dataset # -> make submodule for r in refds.save(path=tbds.path, ): yield r res.update({'status': 'ok'}) yield res for cfg_proc_ in cfg_proc or []: for r in tbds.run_procedure('cfg_' + cfg_proc_): yield r
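# --- illustrative sketch (added for clarity; not part of the original module) ---
# The dataset ID assigned above is either a fresh uuid1 (its '.urn' form is
# 'urn:uuid:<hex>', so the split keeps only the UUID text) or, when a test
# seed is in effect, a UUID drawn from seeded random bits so that repeated
# test runs produce a reproducible sequence of IDs. A stand-alone sketch of
# both branches; the helper name and the local Random instance are
# illustrative, not the original module-level `_seed` mechanics:
import random
import uuid

def _sketch_new_dataset_id(seed=None):
    if seed is None:
        # e.g. 'urn:uuid:2a3b...'.split(':')[-1] -> '2a3b...'
        return uuid.uuid1().urn.split(':')[-1]
    rng = random.Random(seed)
    return str(uuid.UUID(int=rng.getrandbits(128)))

# _sketch_new_dataset_id(seed=42) returns the same ID on every call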
def rev_resolve_path(path, ds=None):
    """Resolve a path specification (against a Dataset location)

    Any explicit path (absolute or relative) is returned as an absolute path.
    In case of an explicit relative path (e.g. "./some", or ".\\some" on
    windows), the current working directory is used as reference. Any
    non-explicit relative path is resolved against the dataset location, i.e.
    considered relative to the location of the dataset. If no dataset is
    provided, the current working directory is used.

    Note however, that this function is not able to resolve arbitrarily
    obfuscated path specifications. All operations are purely lexical, and no
    actual path resolution against the filesystem content is performed.
    Consequently, common relative path arguments like '../something'
    (relative to PWD) can be handled properly, but things like
    'down/../under' cannot, as resolving this path properly depends on the
    actual target of any (potential) symlink leading up to '..'.

    Parameters
    ----------
    path : str or PathLike
      Platform-specific path specification.
    ds : Dataset or None
      Dataset instance to resolve non-explicit relative paths against.

    Returns
    -------
    `pathlib.Path` object
    """
    if ds is not None and not isinstance(ds, Dataset):
        ds = require_dataset(
            ds, check_installed=False, purpose='path resolution')
    if ds is None:
        # CWD is the reference
        path = ut.Path(path)
    # we have a dataset
    # stringify in case a pathobj came in
    elif not op.isabs(str(path)) and \
            not (str(path).startswith(os.curdir + os.sep) or
                 str(path).startswith(os.pardir + os.sep)):
        # we have a dataset and no abspath nor an explicit relative path ->
        # resolve it against the dataset
        path = ds.pathobj / path
    else:
        # CWD is the reference
        path = ut.Path(path)

    # make sure we return an absolute path, but without actually
    # resolving anything
    if not path.is_absolute():
        # in general it is almost impossible to use resolve() when
        # we can have symlinks in the root path of a dataset
        # (that we don't want to resolve here), symlinks to annex'ed
        # files (that we never want to resolve), and other within-repo
        # symlinks that we (sometimes) want to resolve (i.e. symlinked
        # paths for addressing content vs adding content)
        # CONCEPT: do the minimal thing to catch most real-world inputs
        # ASSUMPTION: the only sane relative path input that needs
        # handling and can be handled are upward references like
        # '../../some/that', whereas stuff like 'down/../someotherdown'
        # are intellectual exercises
        # ALGORITHM: match any number of leading '..' path components
        # and shorten the PWD by that number
        # NOT using ut.Path.cwd(), because it has symlinks resolved!!
        pwd_parts = ut.Path(getpwd()).parts
        path_parts = path.parts
        leading_parents = 0
        for p in path.parts:
            if p == op.pardir:
                leading_parents += 1
                path_parts = path_parts[1:]
            elif p == op.curdir:
                # we want to discard that, but without stripping
                # a corresponding parent
                path_parts = path_parts[1:]
            else:
                break
        path = ut.Path(
            op.join(
                *(pwd_parts[:-leading_parents if leading_parents else None]
                  + path_parts)))
    # note that we will not "normpath()" the result, check the
    # pathlib docs for why this is the only sane choice in the
    # face of the possibility of symlinks in the path
    return path
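# --- illustrative sketch (added for clarity; not part of the original module) ---
# The relative-path handling above is purely lexical: it counts leading '..'
# components, strips that many trailing components from the symlink-preserving
# PWD, and appends the remainder, never calling resolve(). A stand-alone
# sketch of just that step, using a plain string in place of getpwd() and a
# hypothetical helper name (POSIX paths assumed):
from pathlib import PurePosixPath

def _sketch_lexical_resolve(relpath, pwd='/home/me/ds/subdir'):
    pwd_parts = PurePosixPath(pwd).parts
    path_parts = PurePosixPath(relpath).parts
    leading_parents = 0
    for p in path_parts:
        if p == '..':
            leading_parents += 1
        elif p != '.':
            break
    # drop the consumed leading '.'/'..' components
    while path_parts and path_parts[0] in ('.', '..'):
        path_parts = path_parts[1:]
    return PurePosixPath(
        *(pwd_parts[:-leading_parents if leading_parents else None]
          + path_parts))

# _sketch_lexical_resolve('../data/file.txt')
# -> PurePosixPath('/home/me/ds/data/file.txt')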
def test_get_content_info(path): repo = GitRepo(path) assert_equal(repo.get_content_info(), {}) # an invalid reference causes an exception assert_raises(ValueError, repo.get_content_info, ref='HEAD') ds = get_convoluted_situation(path) repopath = ds.repo.pathobj assert_equal(ds.repo.pathobj, repopath) assert_equal(ds.pathobj, ut.Path(path)) # verify general rules on fused info records that are incrementally # assembled: for git content info, amended with annex info on 'HEAD' # (to get the last committed stage and with it possibly vanished # content), and lastly annex info wrt to the present worktree, to # also get info on added/staged content # this fuses the info reported from # - git ls-files # - git annex findref HEAD # - git annex find --include '*' for f, r in ds.repo.annexstatus().items(): if f.match('*_untracked'): assert (r.get('gitshasum', None) is None) if f.match('*_deleted'): assert (not f.exists() and not f.is_symlink() is None) if f.match('subds_*'): assert (r['type'] == 'dataset' if r.get('gitshasum', None) else 'directory') if f.match('file_*'): # which one exactly depends on many things assert_in(r['type'], ('file', 'symlink')) if f.match('file_ingit*'): assert (r['type'] == 'file') elif '.datalad' not in f.parts and not f.match('.git*') and \ r.get('gitshasum', None) and not f.match('subds*'): # this should be known to annex, one way or another # regardless of whether things add deleted or staged # or anything in between assert_in('key', r, f) assert_in('keyname', r, f) assert_in('backend', r, f) assert_in('bytesize', r, f) # no duplication with path assert_not_in('file', r, f) # query full untracked report res = ds.repo.get_content_info() assert_in(repopath.joinpath('dir_untracked', 'file_untracked'), res) assert_not_in(repopath.joinpath('dir_untracked'), res) # query for compact untracked report res = ds.repo.get_content_info(untracked='normal') assert_not_in(repopath.joinpath('dir_untracked', 'file_untracked'), res) assert_in(repopath.joinpath('dir_untracked'), res) # query no untracked report res = ds.repo.get_content_info(untracked='no') assert_not_in(repopath.joinpath('dir_untracked', 'file_untracked'), res) assert_not_in(repopath.joinpath('dir_untracked'), res) # git status integrity status = ds.repo.status() for t in ('subds', 'file'): for s in ('untracked', 'added', 'deleted', 'clean', 'ingit_clean', 'dropped_clean', 'modified', 'ingit_modified'): for l in ('', ut.PurePosixPath('subdir', '')): if t == 'subds' and 'ingit' in s or 'dropped' in s: # invalid combination continue if t == 'subds' and s == 'deleted': # same as subds_unavailable -> clean continue p = repopath.joinpath(l, '{}_{}'.format(t, s)) assert p.match('*_{}'.format(status[p]['state'])), p if t == 'subds': assert_in(status[p]['type'], ('dataset', 'directory'), p) else: assert_in(status[p]['type'], ('file', 'symlink'), p) # git annex status integrity annexstatus = ds.repo.annexstatus() for t in ('file', ): for s in ('untracked', 'added', 'deleted', 'clean', 'ingit_clean', 'dropped_clean', 'modified', 'ingit_modified'): for l in ('', ut.PurePosixPath('subdir', '')): p = repopath.joinpath(l, '{}_{}'.format(t, s)) if s in ('untracked', 'ingit_clean', 'ingit_modified'): # annex knows nothing about these things assert_not_in('key', annexstatus[p]) continue assert_in('key', annexstatus[p]) # dear future, # if the next one fails, git-annex might have changed the # nature of the path that are being reported by # `annex find --json` # when this was written `hashir*` was a native path, but # `file` was a POSIX 
path assert_equal(annexstatus[p]['has_content'], 'dropped' not in s) # check the different subds evaluation modes someds = Dataset(ds.pathobj / 'subds_modified' / 'someds') dirtyds_path = someds.pathobj / 'dirtyds' assert_not_in('state', someds.repo.status(eval_submodule_state='no')[dirtyds_path]) assert_equal( 'clean', someds.repo.status( eval_submodule_state='commit')[dirtyds_path]['state']) assert_equal( 'modified', someds.repo.status(eval_submodule_state='full')[dirtyds_path]['state'])
def test_repo_diff(path, norepo): ds = Dataset(path).create() assert_repo_status(ds.path) assert_raises(ValueError, ds.repo.diff, fr='WTF', to='MIKE') if ds.repo.is_managed_branch(): fr_base = DEFAULT_BRANCH to = DEFAULT_BRANCH else: fr_base = "HEAD" to = None # no diff eq_(ds.repo.diff(fr_base, to), {}) # bogus path makes no difference eq_(ds.repo.diff(fr_base, to, paths=['THIS']), {}) # let's introduce a known change create_tree(ds.path, {'new': 'empty'}) ds.save(to_git=True) assert_repo_status(ds.path) eq_( ds.repo.diff(fr=fr_base + '~1', to=fr_base), { ut.Path(ds.repo.pathobj / 'new'): { 'state': 'added', 'type': 'file', 'bytesize': 5, 'gitshasum': '7b4d68d70fcae134d5348f5e118f5e9c9d3f05f6' } }) # modify known file create_tree(ds.path, {'new': 'notempty'}) eq_( ds.repo.diff(fr='HEAD', to=None), { ut.Path(ds.repo.pathobj / 'new'): { 'state': 'modified', 'type': 'file', # the beast is modified, but no change in shasum -> not staged 'gitshasum': '7b4d68d70fcae134d5348f5e118f5e9c9d3f05f6', 'prev_gitshasum': '7b4d68d70fcae134d5348f5e118f5e9c9d3f05f6' } }) # per path query gives the same result eq_(ds.repo.diff(fr=fr_base, to=to), ds.repo.diff(fr=fr_base, to=to, paths=['new'])) # also given a directory as a constraint does the same eq_(ds.repo.diff(fr=fr_base, to=to), ds.repo.diff(fr=fr_base, to=to, paths=['.'])) # but if we give another path, it doesn't show up eq_(ds.repo.diff(fr=fr_base, to=to, paths=['other']), {}) # make clean ds.save() assert_repo_status(ds.path) # untracked stuff create_tree(ds.path, {'deep': {'down': 'untracked', 'down2': 'tobeadded'}}) # default is to report all files eq_( ds.repo.diff(fr='HEAD', to=None), { ut.Path(ds.repo.pathobj / 'deep' / 'down'): { 'state': 'untracked', 'type': 'file' }, ut.Path(ds.repo.pathobj / 'deep' / 'down2'): { 'state': 'untracked', 'type': 'file' } }) # but can be made more compact eq_( ds.repo.diff(fr='HEAD', to=None, untracked='normal'), { ut.Path(ds.repo.pathobj / 'deep'): { 'state': 'untracked', 'type': 'directory' } }) # again a unmatching path constrainted will give an empty report eq_(ds.repo.diff(fr='HEAD', to=None, paths=['other']), {}) # perfect match and anything underneath will do eq_( ds.repo.diff(fr='HEAD', to=None, paths=['deep']), { ut.Path(ds.repo.pathobj / 'deep' / 'down'): { 'state': 'untracked', 'type': 'file' }, ut.Path(ds.repo.pathobj / 'deep' / 'down2'): { 'state': 'untracked', 'type': 'file' } })
def test_path_diff(_path, linkpath): # do the setup on the real path, not the symlink, to have its # bugs not affect this test of status() ds = get_deeply_nested_structure(str(_path)) if has_symlink_capability(): # make it more complicated by default ut.Path(linkpath).symlink_to(_path, target_is_directory=True) path = linkpath else: path = _path ds = Dataset(path) if has_symlink_capability(): assert ds.pathobj != ds.repo.pathobj plain_recursive = ds.diff(recursive=True, annex='all', result_renderer=None) # check integrity of individual reports with a focus on how symlinks # are reported for res in plain_recursive: # anything that is an "intended" symlink should be reported # as such. In contrast, anything that is a symlink for mere # technical reasons (annex using it for something in some mode) # should be reported as the thing it is representing (i.e. # a file) if 'link2' in str(res['path']): assert res['type'] == 'symlink', res else: assert res['type'] != 'symlink', res # every item must report its parent dataset assert_in('parentds', res) # bunch of smoke tests # query of '.' is same as no path eq_(plain_recursive, ds.diff(path='.', recursive=True, annex='all', result_renderer=None)) # duplicate paths do not change things eq_( plain_recursive, ds.diff(path=['.', '.'], recursive=True, annex='all', result_renderer=None)) # neither do nested paths if not "2.24.0" <= ds.repo.git_version < "2.25.0": # Release 2.24.0 contained a regression that was fixed with 072a231016 # (2019-12-10). eq_( plain_recursive, ds.diff(path=['.', 'subds_modified'], recursive=True, annex='all', result_renderer=None)) # when invoked in a subdir of a dataset it still reports on the full thing # just like `git status`, as long as there are no paths specified with chpwd(op.join(path, 'directory_untracked')): plain_recursive = diff(recursive=True, annex='all', result_renderer=None) # should be able to take absolute paths and yield the same # output eq_( plain_recursive, ds.diff(path=ds.path, recursive=True, annex='all', result_renderer=None)) # query for a deeply nested path from the top, should just work with a # variety of approaches rpath = op.join('subds_modified', 'subds_lvl1_modified', u'{}_directory_untracked'.format(OBSCURE_FILENAME)) apathobj = ds.pathobj / rpath apath = str(apathobj) for p in (rpath, apath, None): if p is None: # change into the realpath of the dataset and # query with an explicit path with chpwd(ds.path): res = ds.diff(path=op.join('.', rpath), recursive=True, annex='all', result_renderer=None) else: res = ds.diff(path=p, recursive=True, annex='all', result_renderer=None) assert_result_count( res, 1, state='untracked', type='directory', refds=ds.path, # path always comes out a full path inside the queried dataset path=apath, ) assert_result_count(ds.diff(recursive=True, result_renderer=None), 1, path=apath) # limiting recursion will exclude this particular path assert_result_count(ds.diff(recursive=True, recursion_limit=1, result_renderer=None), 0, path=apath) # negative limit is unlimited limit eq_(ds.diff(recursive=True, recursion_limit=-1, result_renderer=None), ds.diff(recursive=True, result_renderer=None))
def test_status(_path, linkpath): # do the setup on the real path, not the symlink, to have its # bugs not affect this test of status() ds = get_deeply_nested_structure(str(_path)) if has_symlink_capability(): # make it more complicated by default ut.Path(linkpath).symlink_to(_path, target_is_directory=True) path = linkpath else: path = _path ds = Dataset(path) if has_symlink_capability(): assert ds.pathobj != ds.repo.pathobj # spotcheck that annex status reporting and availability evaluation # works assert_result_count( ds.status(annex='all', result_renderer=None), 1, path=str(ds.pathobj / 'subdir' / 'annexed_file.txt'), key='MD5E-s5--275876e34cf609db118f3d84b799a790.txt', has_content=True, objloc=str(ds.repo.pathobj / '.git' / 'annex' / 'objects' / # hashdir is different on windows ('f33' if ds.repo.is_managed_branch() else '7p') / ('94b' if ds.repo.is_managed_branch() else 'gp') / 'MD5E-s5--275876e34cf609db118f3d84b799a790.txt' / 'MD5E-s5--275876e34cf609db118f3d84b799a790.txt')) plain_recursive = ds.status(recursive=True, result_renderer=None) # check integrity of individual reports with a focus on how symlinks # are reported for res in plain_recursive: # anything that is an "intended" symlink should be reported # as such. In contrast, anything that is a symlink for mere # technical reasons (annex using it for something in some mode) # should be reported as the thing it is representing (i.e. # a file) if 'link2' in str(res['path']): assert res['type'] == 'symlink', res else: assert res['type'] != 'symlink', res # every item must report its parent dataset assert_in('parentds', res) # bunch of smoke tests # query of '.' is same as no path eq_(plain_recursive, ds.status(path='.', recursive=True, result_renderer=None)) # duplicate paths do not change things eq_(plain_recursive, ds.status(path=['.', '.'], recursive=True, result_renderer=None)) # neither do nested paths eq_( plain_recursive, ds.status(path=['.', 'subds_modified'], recursive=True, result_renderer=None)) # when invoked in a subdir of a dataset it still reports on the full thing # just like `git status`, as long as there are no paths specified with chpwd(op.join(path, 'directory_untracked')): plain_recursive = status(recursive=True, result_renderer=None) # should be able to take absolute paths and yield the same # output eq_(plain_recursive, ds.status(path=ds.path, recursive=True, result_renderer=None)) # query for a deeply nested path from the top, should just work with a # variety of approaches rpath = op.join('subds_modified', 'subds_lvl1_modified', OBSCURE_FILENAME + u'_directory_untracked') apathobj = ds.pathobj / rpath apath = str(apathobj) # ds.repo.pathobj will have the symlink resolved arealpath = ds.repo.pathobj / rpath # TODO include explicit relative path in test for p in (rpath, apath, arealpath, None): if p is None: # change into the realpath of the dataset and # query with an explicit path with chpwd(ds.repo.path): res = ds.status(path=op.join('.', rpath), result_renderer=None) else: res = ds.status(path=p, result_renderer=None) assert_result_count( res, 1, state='untracked', type='directory', refds=ds.path, # path always comes out a full path inside the queried dataset path=apath, ) assert_result_count(ds.status(recursive=True, result_renderer=None), 1, path=apath) # limiting recursion will exclude this particular path assert_result_count(ds.status(recursive=True, recursion_limit=1, result_renderer=None), 0, path=apath) # negative limit is unlimited limit eq_(ds.status(recursive=True, recursion_limit=-1, 
result_renderer=None), ds.status(recursive=True, result_renderer=None))
def __call__(path=None,
             dataset=None,
             annex=None,
             untracked='normal',
             recursive=False,
             recursion_limit=None,
             eval_subdataset_state='full'):
    # To the next white knight that comes in to re-implement `status` as a
    # special case of `diff`. There is one fundamental difference between
    # the two commands: `status` can always use the worktree as evident on
    # disk as a constraint (e.g. to figure out which subdataset a path is in)
    # `diff` cannot do that (everything needs to be handled based on a
    # "virtual" representation of a dataset hierarchy).
    # MIH concludes that while `status` can be implemented as a special case
    # of `diff` doing so would complicate and slow down both `diff` and
    # `status`. So while the apparent almost code-duplication between the
    # two commands feels wrong, the benefit is speed. Any future RF should
    # come with evidence that speed does not suffer, and complexity stays
    # on a manageable level
    ds = require_dataset(
        dataset, check_installed=True, purpose='status reporting')

    paths_by_ds = OrderedDict()
    if path:
        # sort any path argument into the respective subdatasets
        for p in sorted(assure_list(path)):
            # it is important to capture the exact form of the
            # given path argument, before any normalization happens
            # for further decision logic below
            orig_path = text_type(p)
            p = rev_resolve_path(p, dataset)
            root = rev_get_dataset_root(text_type(p))
            if root is None:
                # no root, not possibly underneath the refds
                yield dict(
                    action='status',
                    path=p,
                    refds=ds.path,
                    status='error',
                    message='path not underneath this dataset',
                    logger=lgr)
                continue
            else:
                if dataset and root == text_type(p) and \
                        not (orig_path.endswith(op.sep) or orig_path == "."):
                    # the given path is pointing to a dataset
                    # distinguish rsync-link syntax to identify
                    # the dataset as whole (e.g. 'ds') vs its
                    # content (e.g. 'ds/')
                    super_root = rev_get_dataset_root(op.dirname(root))
                    if super_root:
                        # the dataset identified by the path argument
                        # is contained in a superdataset, and no
                        # trailing path separator was found in the
                        # argument -> user wants to address the dataset
                        # as a whole (in the superdataset)
                        root = super_root
            root = ut.Path(root)
            ps = paths_by_ds.get(root, [])
            ps.append(p)
            paths_by_ds[root] = ps
    else:
        paths_by_ds[ds.pathobj] = None

    queried = set()
    content_info_cache = {}
    while paths_by_ds:
        qdspath, qpaths = paths_by_ds.popitem(last=False)
        if qpaths and qdspath in qpaths:
            # this is supposed to be a full query, save some
            # cycles sifting through the actual path arguments
            qpaths = []
        # try to recode the dataset path wrt to the reference
        # dataset
        # the path that it might have been located by could
        # have been a resolved path or another funky thing
        qds_inrefds = path_under_rev_dataset(ds, qdspath)
        if qds_inrefds is None:
            # nothing we support handling any further
            # there is only a single refds
            yield dict(
                path=text_type(qdspath),
                refds=ds.path,
                action='status',
                status='error',
                message=(
                    "dataset containing given paths is not underneath "
                    "the reference dataset %s: %s", ds, qpaths),
                logger=lgr,
            )
            continue
        elif qds_inrefds != qdspath:
            # the path this dataset was located by is not how it would
            # be referenced underneath the refds (possibly resolved
            # realpath) -> recode all paths to be underneath the refds
            qpaths = [qds_inrefds / p.relative_to(qdspath) for p in qpaths]
            qdspath = qds_inrefds
        if qdspath in queried:
            # do not report on a single dataset twice
            continue
        qds = Dataset(text_type(qdspath))
        for r in _yield_status(
                qds,
                qpaths,
                annex,
                untracked,
                recursion_limit
                if recursion_limit is not None else -1 if recursive else 0,
                queried,
                eval_subdataset_state,
                content_info_cache):
            yield dict(
                r,
                refds=ds.path,
                action='status',
                status='ok',
            )
def resolve_path(path, ds=None, ds_resolved=None):
    """Resolve a path specification (against a Dataset location)

    Any path is returned as an absolute path. If, and only if, a dataset
    object instance is given as `ds`, relative paths are interpreted as
    relative to the given dataset. In all other cases, relative paths are
    treated as relative to the current working directory.

    Note however, that this function is not able to resolve arbitrarily
    obfuscated path specifications. All operations are purely lexical, and no
    actual path resolution against the filesystem content is performed.
    Consequently, common relative path arguments like '../something'
    (relative to PWD) can be handled properly, but things like
    'down/../under' cannot, as resolving this path properly depends on the
    actual target of any (potential) symlink leading up to '..'.

    Parameters
    ----------
    path : str or PathLike or list
      Platform-specific path specification. Multiple path specifications can
      be given as a list.
    ds : Dataset or PathLike or None
      Dataset instance to resolve relative paths against.
    ds_resolved : Dataset or None
      A dataset instance that was created from `ds` by the caller can be
      provided to avoid repeated instantiation on repeated calls.

    Returns
    -------
    `pathlib.Path` object or list(Path)
      When a list was given as input a list is returned, a Path instance
      otherwise.
    """
    got_ds_instance = isinstance(ds, Dataset)
    if ds is not None and not got_ds_instance:
        ds = ds_resolved or require_dataset(
            ds, check_installed=False, purpose='path resolution')
    out = []
    pwd_parts = None  # get it upon first use, but only once
    for p in ensure_list(path):
        if ds is None or not got_ds_instance:
            # no dataset at all or no instance provided -> CWD is always the
            # reference; nothing needs to be done here. Path-conversion and
            # absolutification are done next
            pass
        # we have a given dataset instance
        elif not Path(p).is_absolute():
            # we have a dataset instance and a relative path ->
            # resolve it against the dataset
            p = ds.pathobj / p

        p = ut.Path(p)

        # make sure we return an absolute path, but without actually
        # resolving anything
        if not p.is_absolute():
            # in general it is almost impossible to use resolve() when
            # we can have symlinks in the root path of a dataset
            # (that we don't want to resolve here), symlinks to annex'ed
            # files (that we never want to resolve), and other within-repo
            # symlinks that we (sometimes) want to resolve (i.e. symlinked
            # paths for addressing content vs adding content)
            # CONCEPT: do the minimal thing to catch most real-world inputs
            # ASSUMPTION: the only sane relative path input that needs
            # handling and can be handled are upward references like
            # '../../some/that', whereas stuff like 'down/../someotherdown'
            # are intellectual exercises
            # ALGORITHM: match any number of leading '..' path components
            # and shorten the PWD by that number
            # NOT using ut.Path.cwd(), because it has symlinks resolved!!
            if not pwd_parts:
                pwd_parts = ut.Path(getpwd()).parts
            path_parts = p.parts
            leading_parents = 0
            for pp in p.parts:
                if pp == op.pardir:
                    leading_parents += 1
                    path_parts = path_parts[1:]
                elif pp == op.curdir:
                    # we want to discard that, but without stripping
                    # a corresponding parent
                    path_parts = path_parts[1:]
                else:
                    break
            p = ut.Path(
                op.join(*(
                    pwd_parts[:-leading_parents if leading_parents else None]
                    + path_parts)))
        # note that we will not "normpath()" the result, check the
        # pathlib docs for why this is the only sane choice in the
        # face of the possibility of symlinks in the path
        out.append(p)
    return out[0] if isinstance(path, (str, PurePath)) else out
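# --- illustrative sketch (added for clarity; not part of the original module) ---
# Expected behavior of resolve_path() as implemented above, exercised with
# hypothetical POSIX paths. It assumes the surrounding module's names
# (Dataset, resolve_path, ut, getpwd) are available; the dataset location
# does not need to exist for these lexical checks:
def _sketch_resolve_path_usage():
    ds = Dataset('/home/me/ds')  # hypothetical location, need not exist
    # a Dataset *instance* -> relative paths anchor at the dataset
    assert resolve_path('sub/file.txt', ds=ds) == \
        ut.Path('/home/me/ds/sub/file.txt')
    # a plain str/Path dataset argument -> relative paths anchor at PWD
    assert resolve_path('sub/file.txt', ds='/home/me/ds') == \
        ut.Path(getpwd()) / 'sub/file.txt'
    # a list of inputs yields a list of Path objects
    assert resolve_path(['a', 'b'], ds=ds) == \
        [ds.pathobj / 'a', ds.pathobj / 'b']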
def __call__( path=None, message=None, dataset=None, version_tag=None, recursive=False, recursion_limit=None, updated=False, message_file=None, to_git=None, jobs=None, amend=False, ): if message and message_file: raise ValueError( "Both a message and message file were specified for save()") if amend and recursive: raise ValueError("Cannot amend a commit recursively.") path = ensure_list(path) if message_file: with open(message_file) as mfh: message = mfh.read() # we want 'normal' to achieve the most compact argument list # for git calls # untracked_mode = 'no' if updated else 'normal' # TODO however, Repo.add() would refuse to add any dotfiles # in a directory that is itself untracked, hence the only # choice is to go with potentially crazy long lists # until https://github.com/datalad/datalad/issues/1454 # has a resolution untracked_mode = 'no' if updated else 'all' # there are three basic scenarios: # 1. save modifications to any already tracked content # 2. save any content (including removal of deleted content) # to bring things to a clean state # 3. like (2), but only operate on a given subset of content # identified by paths # - all three have to work in conjunction with --recursive # - the difference between (1) and (2) should be no more # that a switch from --untracked=no to --untracked=all # in Repo.save() # we do not support # - simultaneous operations on multiple datasets from disjoint # dataset hierarchies, hence a single reference dataset must be # identifiable from the either # - curdir or # - the `dataset` argument. # This avoids complex annotation loops and hierarchy tracking. # - any modification upwards from the root dataset ds = require_dataset(dataset, check_installed=True, purpose='saving') # use status() to do all discovery and annotation of paths paths_by_ds = {} for s in Status()( # ATTN: it is vital to pass the `dataset` argument as it, # and not a dataset instance in order to maintain the path # semantics between here and the status() call dataset=dataset, path=path, untracked=untracked_mode, report_filetype=False, recursive=recursive, recursion_limit=recursion_limit, on_failure='ignore', # for save without recursion only commit matters eval_subdataset_state='full' if recursive else 'commit', result_renderer='disabled'): if s['status'] == 'error': # Downstream code can't do anything with these. Let the caller # decide their fate. 
yield s continue # fish out status dict for this parent dataset ds_status = paths_by_ds.get(s['parentds'], {}) # reassemble path status info as repo.status() would have made it ds_status[ut.Path(s['path'])] = \ {k: v for k, v in s.items() if k not in ( 'path', 'parentds', 'refds', 'status', 'action', 'logger')} paths_by_ds[s['parentds']] = ds_status lgr.debug('Determined %i datasets for saving from input arguments', len(paths_by_ds)) # figure out what datasets to process, start with the ones containing # the paths that were given as arguments discovered_datasets = list(paths_by_ds.keys()) if dataset: # if a reference dataset was given we want to save all the way up # to it, so let's throw it into the mix discovered_datasets.append(ds.path) # sort the datasets into (potentially) disjoint hierarchies, # or a single one, if a reference dataset was given dataset_hierarchies = get_tree_roots(discovered_datasets) for rootds, children in dataset_hierarchies.items(): edges = {} discover_dataset_trace_to_targets(rootds, children, [], edges, includeds=children) for superds, subdss in edges.items(): superds_status = paths_by_ds.get(superds, {}) for subds in subdss: subds_path = ut.Path(subds) sub_status = superds_status.get(subds_path, {}) if not (sub_status.get("state") == "clean" and sub_status.get("type") == "dataset"): # TODO actually start from an entry that may already # exist in the status record superds_status[subds_path] = dict( # shot from the hip, some status config # to trigger this specific super/sub # relation to be saved state='untracked', type='dataset') paths_by_ds[superds] = superds_status def save_ds(args, version_tag=None): pdspath, paths = args pds = Dataset(pdspath) pds_repo = pds.repo # pop status for this dataset, we are not coming back to it pds_status = { # for handing over to the low-level code, we recode any # path relative to the real repo location, this avoid # cumbersome symlink handling without context in the # lower levels pds_repo.pathobj / p.relative_to(pdspath): props for p, props in paths.items() } start_commit = pds_repo.get_hexsha() if not all(p['state'] == 'clean' for p in pds_status.values()) or \ (amend and message): for res in pds_repo.save_( message=message, # make sure to have the `path` arg be None, as we want # to prevent and bypass any additional repo.status() # calls paths=None, # prevent whining of GitRepo git=True if not hasattr(ds.repo, 'annexstatus') else to_git, # we are supplying the full status already, do not # detect anything else untracked='no', _status=pds_status, amend=amend): # TODO remove stringification when datalad-core can handle # path objects, or when PY3.6 is the lowest supported # version for k in ('path', 'refds'): if k in res: res[k] = str( # recode path back to dataset path anchor pds.pathobj / res[k].relative_to(pds_repo.pathobj)) yield res # report on the dataset itself dsres = dict( action='save', type='dataset', path=pds.path, refds=ds.path, status='ok' if start_commit != pds_repo.get_hexsha() else 'notneeded', logger=lgr, ) if not version_tag: yield dsres return try: # method requires str version_tag = str(version_tag) pds_repo.tag(version_tag) dsres.update(status='ok', version_tag=version_tag) yield dsres except CommandError as e: if dsres['status'] == 'ok': # first we yield the result for the actual save # TODO: we will get duplicate dataset/save record obscuring # progress reporting. 
yoh thought to decouple "tag" from "save" # messages but was worrying that original authors would disagree yield dsres.copy() # and now complain that tagging didn't work dsres.update(status='error', message=('cannot tag this version: %s', e.stderr.strip())) yield dsres if not paths_by_ds: # Special case: empty repo. There's either an empty commit only or # none at all. An empty one we can amend otherwise there's nothing # to do. if amend and ds.repo.get_hexsha(): yield from save_ds((ds.pathobj, dict()), version_tag=version_tag) else: yield dict(action='save', type='dataset', path=ds.path, refds=ds.path, status='notneeded', logger=lgr) return # TODO: in principle logging could be improved to go not by a dataset # but by path(s) within subdatasets. That should provide a bit better ETA # and more "dynamic" feedback than jumpy datasets count. # See addurls where it is implemented that way by providing agg and another # log_filter yield from ProducerConsumerProgressLog( sorted(paths_by_ds.items(), key=lambda v: v[0], reverse=True), partial(save_ds, version_tag=version_tag), safe_to_consume=no_subds_in_futures, producer_future_key=lambda ds_items: ds_items[0], jobs=jobs, log_filter=_log_filter_save_dataset, unit="datasets", lgr=lgr, )