def test_known_failure_v6():

    @known_failure_v6
    def failing():
        raise AssertionError("Failed")

    from datalad import cfg
    v6 = cfg.obtain("datalad.repo.version") == 6
    skip = cfg.obtain("datalad.tests.knownfailures.skip")
    probe = cfg.obtain("datalad.tests.knownfailures.probe")

    if v6:
        if skip:
            # skipping takes precedence over probing
            failing()
        elif probe:
            # if we probe a known failure it's okay to fail:
            failing()
        else:
            # not skipping and not probing results in the original failure:
            assert_raises(AssertionError, failing)
    else:
        # behaves as if it wasn't decorated at all, no matter what
        assert_raises(AssertionError, failing)

def test_probe_known_failure():
    # Note: we can't test the switch "datalad.tests.knownfailures.probe"
    # directly, since it was evaluated in the decorator already. So we need
    # to have different assertions in this test based on config and have it
    # tested across builds, which use different settings for that switch.

    @probe_known_failure
    def not_failing():
        pass

    @probe_known_failure
    def failing():
        raise AssertionError("Failed")

    from datalad import cfg
    switch = cfg.obtain("datalad.tests.knownfailures.probe")

    if switch:
        # if probing is enabled, the failure is considered to be expected and
        # therefore the decorated function doesn't actually fail:
        failing()
        # conversely, a function that doesn't fail raises an AssertionError:
        assert_raises(AssertionError, not_failing)
    else:
        # if probing is disabled it should just fail/pass as is:
        assert_raises(AssertionError, failing)
        not_failing()

def get_oracle_db(dbserver=None,
                  port=1521,
                  sid='ORCL',
                  credential=None):
    dbserver = dbserver or cfg.obtain('datalad.externals.nda.dbserver',
                                      default=DEFAULT_SERVER)
    # This specific username has access to the 'Image' selection of NDA as of about today
    #username = username \
    #    or cfg.get('externals:nda', 'username',
    #               default='halchenkoy_103924')
    if not credential:
        providers = Providers.from_config_files()
        credential = providers.get_provider(DEFAULT_SERVER).credential
    if not isinstance(credential, dict):
        credential = credential()

    import cx_Oracle  # you must have the beast if you want to access the dark side
    dsnStr = cx_Oracle.makedsn(dbserver, port, sid)
    db = cx_Oracle.connect(user=credential['user'],
                           password=credential['password'],
                           dsn=dsnStr)
    return db

def test_subdatasets(path):
    # from scratch
    ds = Dataset(path)
    assert_false(ds.is_installed())
    eq_(ds.subdatasets(), [])
    ds = ds.create()
    assert_true(ds.is_installed())
    eq_(ds.subdatasets(), [])
    # create some file and commit it
    open(os.path.join(ds.path, 'test'), 'w').write('some')
    ds.add(path='test')
    assert_true(ds.is_installed())
    ds.save("Hello!", version_tag=1)
    # Assuming that tmp location was not under a super-dataset
    eq_(ds.get_superdataset(), None)
    eq_(ds.get_superdataset(topmost=True), ds)

    # add itself as a subdataset (crazy, isn't it?)
    subds = ds.install('subds', source=path,
                       result_xfm='datasets', return_type='item-or-list')
    assert_true(subds.is_installed())
    eq_(subds.get_superdataset(), ds)
    eq_(subds.get_superdataset(topmost=True), ds)

    subdss = ds.subdatasets()
    eq_(len(subdss), 1)
    eq_(subds.path, ds.subdatasets(result_xfm='paths')[0])
    eq_(subdss, ds.subdatasets(recursive=True))
    eq_(subdss, ds.subdatasets(fulfilled=True))
    ds.save("with subds", version_tag=2)
    ds.recall_state(1)
    assert_true(ds.is_installed())
    eq_(ds.subdatasets(), [])

    # very nested subdataset to test topmost
    subsubds = subds.install(
        _path_('d1/subds'), source=path,
        result_xfm='datasets', return_type='item-or-list')
    assert_true(subsubds.is_installed())
    eq_(subsubds.get_superdataset(), subds)
    # by default, it will only report a superdataset that actually
    # has the queried dataset as a registered true subdataset
    eq_(subsubds.get_superdataset(topmost=True), subds)
    # but we can also ask for a dataset that is merely above
    eq_(subsubds.get_superdataset(topmost=True, registered_only=False), ds)

    # verify that '^' alias would work
    with chpwd(subsubds.path):
        dstop = Dataset('^')
        eq_(dstop, subds)
        # and while in the dataset we still can resolve into central one
        dscentral = Dataset('///')
        eq_(dscentral.path,
            cfg.obtain('datalad.locations.default-dataset'))

    with chpwd(ds.path):
        dstop = Dataset('^')
        eq_(dstop, ds)

def test_known_failure():

    @known_failure
    def failing():
        raise AssertionError("Failed")

    from datalad import cfg
    skip = cfg.obtain("datalad.tests.knownfailures.skip")
    probe = cfg.obtain("datalad.tests.knownfailures.probe")

    if skip:
        # skipping takes precedence over probing
        failing()
    elif probe:
        # if we probe a known failure it's okay to fail:
        failing()
    else:
        # not skipping and not probing results in the original failure:
        assert_raises(AssertionError, failing)

def get_url_cache_filename(url, name=None):
    """Return a filename where to cache online doc from a url"""
    if not name:
        name = "misc"
    cache_dir = opj(cfg.obtain('datalad.locations.cache'), name)
    doc_fname = opj(
        cache_dir,
        '{}-{}.p{}'.format(
            urlsplit(url).netloc,
            md5(url.encode('utf-8')).hexdigest(),
            pickle.HIGHEST_PROTOCOL)
    )
    return doc_fname

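# A standalone sketch (stdlib only; the URL is made up) of the naming scheme
# used above: "<netloc>-<md5 of url>.p<pickle protocol>":
from hashlib import md5
from urllib.parse import urlsplit
import pickle

url = 'https://example.com/doc.html'
print('{}-{}.p{}'.format(
    urlsplit(url).netloc,
    md5(url.encode('utf-8')).hexdigest(),
    pickle.HIGHEST_PROTOCOL))
# -> e.g. "example.com-<32 hex chars>.p5"
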
def _flyweight_id_from_args(cls, *args, **kwargs):

    if args:
        # to a certain degree we need to simulate an actual call to __init__
        # and make sure, passed arguments are fitting:
        # TODO: Figure out, whether there is a cleaner way to do this in a
        # generic fashion
        assert('path' not in kwargs)
        path = args[0]
        args = args[1:]
    elif 'path' in kwargs:
        path = kwargs.pop('path')
    else:
        raise TypeError("__init__() requires argument `path`")

    if path is None:
        raise AttributeError

    # mirror what is happening in __init__
    if isinstance(path, ut.PurePath):
        path = text_type(path)

    # Custom handling for few special abbreviations
    path_ = path
    if path == '^':
        # get the topmost dataset from current location. Note that 'zsh'
        # might have its ideas on what to do with ^, so better use as -d^
        path_ = Dataset(curdir).get_superdataset(topmost=True).path
    elif path == '///':
        # TODO: logic/UI on installing a default dataset could move here
        # from search?
        path_ = cfg.obtain('datalad.locations.default-dataset')
    if path != path_:
        lgr.debug("Resolved dataset alias %r to path %r", path, path_)

    # Sanity check for argument `path`:
    # raise if we cannot deal with `path` at all or
    # if it is not a local thing:
    path_ = RI(path_).localpath

    # we want an absolute path, but no resolved symlinks
    if not isabs(path_):
        path_ = opj(getpwd(), path_)

    # use canonical paths only:
    path_ = normpath(path_)
    kwargs['path'] = path_
    return path_, args, kwargs

def _flyweight_preproc_path(cls, path):
    """Custom handling for few special abbreviations for datasets"""
    path_ = path
    if path == '^':
        # get the topmost dataset from current location. Note that 'zsh'
        # might have its ideas on what to do with ^, so better use as -d^
        path_ = Dataset(get_dataset_root(curdir)).get_superdataset(
            topmost=True).path
    elif path == '^.':
        # get the dataset containing current directory
        path_ = get_dataset_root(curdir)
    elif path == '///':
        # TODO: logic/UI on installing a default dataset could move here
        # from search?
        path_ = cfg.obtain('datalad.locations.default-dataset')
    if path != path_:
        lgr.debug("Resolved dataset alias %r to path %r", path, path_)
    return path_

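# Illustration (made-up paths) of the alias resolution above, assuming the
# current directory is /data/super/sub within a nested dataset layout:
#   '^'   -> '/data/super'        (topmost superdataset)
#   '^.'  -> '/data/super/sub'    (dataset root containing the CWD)
#   '///' -> whatever cfg.obtain('datalad.locations.default-dataset') returns
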
def __call__(self):
    """Obtain credentials from a keyring and if any is not known -- ask"""
    fields = {}
    # check if we shall ask for credentials, even if some are on record
    # already (but maybe they were found to need updating)
    force_reentry = dlcfg.obtain(
        'datalad.credentials.force-ask', valtype=anything2bool)
    for f in self._FIELDS:
        # don't query for value if we need to get a new one
        v = None if force_reentry else self._get_field_value(f)
        if not self._is_field_optional(f):
            while v is None:  # was not known
                v = self._ask_and_set(f)
            fields[f] = v
        elif v is not None:
            fields[f] = v
    return fields

def _flyweight_id_from_args(cls, *args, **kwargs):

    if args:
        # to a certain degree we need to simulate an actual call to __init__
        # and make sure, passed arguments are fitting:
        # TODO: Figure out, whether there is a cleaner way to do this in a
        # generic fashion
        assert ('path' not in kwargs)
        path = args[0]
        args = args[1:]
    elif 'path' in kwargs:
        path = kwargs.pop('path')
    else:
        raise TypeError("__init__() requires argument `path`")

    if path is None:
        raise AttributeError

    # Custom handling for few special abbreviations
    path_ = path
    if path == '^':
        # get the topmost dataset from current location. Note that 'zsh'
        # might have its ideas on what to do with ^, so better use as -d^
        path_ = Dataset(curdir).get_superdataset(topmost=True).path
    elif path == '///':
        # TODO: logic/UI on installing a default dataset could move here
        # from search?
        path_ = cfg.obtain('datalad.locations.default-dataset')
    if path != path_:
        lgr.debug("Resolved dataset alias %r to path %r", path, path_)

    # Sanity check for argument `path`:
    # raise if we cannot deal with `path` at all or
    # if it is not a local thing:
    path_ = RI(path_).localpath

    # we want an absolute path, but no resolved symlinks
    if not isabs(path_):
        path_ = opj(getpwd(), path_)

    # use canonical paths only:
    path_ = normpath(path_)
    kwargs['path'] = path_
    return path_, args, kwargs

def format_oneline_tb(self, limit=None, include_str=True):
    """Format an exception traceback as a one-line summary

    Returns a string of the form [filename:contextname:linenumber, ...].
    If include_str is True (default), this is prepended with the string
    representation of the exception.
    """
    # Note: No import at module level, since ConfigManager imports
    # dochelpers -> circular import when creating datalad.cfg instance at
    # startup.
    from datalad import cfg

    if include_str:
        # try exc message
        leading = str(self.tb)
        if not leading:
            # go with type
            leading = self.tb.exc_type.__qualname__
        out = "{} ".format(leading)
    else:
        out = ""

    if limit is None:
        # TODO: config logging.exceptions.traceback_levels = 1
        # ^ This is taken from exc_str(). What exactly does it mean?
        # Controlling the tblimit differently for logging, result
        # reporting, whatever else?
        limit = int(cfg.obtain('datalad.exc.str.tblimit', default=1))

    entries = []
    entries.extend(self.tb.stack)
    if self.tb.__cause__:
        entries.extend(self.tb.__cause__.stack)
    elif self.tb.__context__ and not self.tb.__suppress_context__:
        entries.extend(self.tb.__context__.stack)

    if entries:
        tb_str = "[%s]" % (','.join(
            "{}:{}:{}".format(
                Path(frame_summary.filename).name,
                frame_summary.name,
                frame_summary.lineno)
            for frame_summary in entries[-limit:]))
        out += "{}".format(tb_str)

    return out

def _get_result_filter(cls, args):
    from datalad import cfg
    result_filter = None
    if args.common_report_status or 'datalad.runtime.report-status' in cfg:
        report_status = args.common_report_status or \
                        cfg.obtain('datalad.runtime.report-status')
        if report_status == "all":
            pass  # no filter
        elif report_status == 'success':
            result_filter = EnsureKeyChoice('status', ('ok', 'notneeded'))
        elif report_status == 'failure':
            result_filter = EnsureKeyChoice('status',
                                            ('impossible', 'error'))
        else:
            result_filter = EnsureKeyChoice('status', (report_status,))
    if args.common_report_type:
        tfilt = EnsureKeyChoice('type', tuple(args.common_report_type))
        result_filter = result_filter & tfilt if result_filter else tfilt
    return result_filter

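# Hedged sketch of how the combined filter above behaves, assuming
# datalad.support.constraints.EnsureKeyChoice and the '&' composition used
# in the function:
from datalad.support.constraints import EnsureKeyChoice

status_filter = EnsureKeyChoice('status', ('ok', 'notneeded'))
type_filter = EnsureKeyChoice('type', ('dataset',))
combined = status_filter & type_filter
combined({'status': 'ok', 'type': 'dataset'})      # passes, returns the dict
# combined({'status': 'error', 'type': 'dataset'})  # would raise ValueError
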
def test_search_outside1(tdir, newhome):
    with chpwd(tdir):
        # should fail since directory exists, but not a dataset
        # should not even waste our response ;)
        always_render = cfg.obtain('datalad.api.alwaysrender')
        with patch.object(search_mod, 'LOCAL_CENTRAL_PATH', newhome):
            if always_render:
                # we do try to render results which actually causes exception
                # to come right away
                assert_raises(NoDatasetArgumentFound, search, "bu")
            else:
                gen = search("bu")
                assert_is_generator(gen)
                assert_raises(NoDatasetArgumentFound, next, gen)

        # and if we point to some non-existing dataset -- the same in both cases
        # but might come before even next if always_render
        with assert_raises(ValueError):
            next(search("bu", dataset=newhome))

def _get_result_filter(cls, args):
    from datalad import cfg
    result_filter = None
    if args.common_report_status or 'datalad.runtime.report-status' in cfg:
        report_status = args.common_report_status or \
                        cfg.obtain('datalad.runtime.report-status')
        if report_status == "all":
            pass  # no filter
        elif report_status == 'success':
            result_filter = EnsureKeyChoice('status', ('ok', 'notneeded'))
        elif report_status == 'failure':
            result_filter = EnsureKeyChoice('status',
                                            ('impossible', 'error'))
        else:
            result_filter = EnsureKeyChoice('status', (report_status,))
    if args.common_report_type:
        tfilt = EnsureKeyChoice('type', tuple(args.common_report_type))
        result_filter = result_filter & tfilt if result_filter else tfilt
    return result_filter

def known_failure_direct_mode(func):
    """Test decorator marking a test as known to fail in a direct mode test run

    If datalad.repo.direct is set to True behaves like `known_failure`.
    Otherwise the original (undecorated) function is returned.
    """
    from datalad import cfg
    direct = cfg.obtain("datalad.repo.direct") or on_windows
    if direct:

        @known_failure
        @wraps(func)
        def dm_func(*args, **kwargs):
            return func(*args, **kwargs)

        return dm_func
    return func

def known_failure_v6(func):
    """Test decorator marking a test as known to fail in a v6 test run

    If datalad.repo.version is set to 6 behaves like `known_failure`.
    Otherwise the original (undecorated) function is returned.
    """
    from datalad import cfg
    version = cfg.obtain("datalad.repo.version")
    if version and version == 6:

        @known_failure
        @wraps(func)
        def v6_func(*args, **kwargs):
            return func(*args, **kwargs)

        return v6_func
    return func

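# Hedged usage sketch of the decorator above; the import path is an
# assumption based on where these test helpers typically live in datalad:
from datalad.tests.utils import known_failure_v6

@known_failure_v6
def test_fails_only_on_v6_repos():
    # tolerated (probed/skipped per config) only when datalad.repo.version == 6
    raise AssertionError("broken on v6 repos")
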
def test_skip_known_failure():
    # Note: we can't test the switch "datalad.tests.knownfailures.skip"
    # directly, since it was evaluated in the decorator already. So we need
    # to have different assertions in this test based on config and have it
    # tested across builds, which use different settings for that switch.

    @skip_known_failure
    def failing():
        raise AssertionError("Failed")

    switch = dl_cfg.obtain("datalad.tests.knownfailures.skip")

    if switch:
        # if skipping is enabled, we shouldn't see the exception:
        failing()
    else:
        # if it's disabled, failing() is executed and therefore exception
        # is raised:
        assert_raises(AssertionError, failing)

def check_datasets_datalad_org(suffix, tdir):
    # Test that git annex / datalad install, get work correctly on our
    # datasets.datalad.org
    # Apparently things can break, especially with introduction of the
    # smart HTTP backend for apache2 etc
    ds = install(
        tdir,
        source='///dicoms/dartmouth-phantoms/bids_test6-PD+T2w' + suffix)
    eq_(ds.config.get('remote.origin.annex-ignore', None), None)
    # assert_result_count and not just assert_status since for some reason on
    # Windows we get two records due to a duplicate attempt (as res[1]) to get it
    # again, which is reported as "notneeded". For the purpose of this test
    # it doesn't make a difference.
    # git-annex version is not "real" - but that is about when the fix was
    # introduced
    from datalad import cfg
    if on_windows \
            and cfg.obtain("datalad.repo.version") < 6 \
            and external_versions['cmd:annex'] <= '7.20181203':
        raise SkipTest("Known to fail, needs fixed git-annex")
    assert_result_count(
        ds.get(op.join('001-anat-scout_ses-{date}', '000001.dcm')),
        1,
        status='ok')
    assert_status('ok', ds.remove())

def test_skip_known_failure():
    # Note: we can't test the switch "datalad.tests.knownfailures.skip"
    # directly, since it was evaluated in the decorator already. So we need
    # to have different assertions in this test based on config and have it
    # tested across builds, which use different settings for that switch.

    @skip_known_failure
    def failing():
        raise AssertionError("Failed")

    from datalad import cfg
    switch = cfg.obtain("datalad.tests.knownfailures.skip")

    if switch:
        # if skipping is enabled, we shouldn't see the exception:
        failing()
    else:
        # if it's disabled, failing() is executed and therefore exception
        # is raised:
        assert_raises(AssertionError, failing)

def _get_ssh_version(exe=None):
    """Return version of ssh

    Annex prior to 20170302 was using a bundled version; then across all
    systems we used the system one if installed, and then switched to the one
    defined in configuration, with the system-wide one (not the default in
    PATH, e.g. from conda) "forced" on Windows.

    If no specific executable is provided in `exe`, we will use the one from
    configuration.
    """
    if exe is None:
        from datalad import cfg
        exe = cfg.obtain("datalad.ssh.executable")
    out = _runner.run([exe, '-V'], protocol=StdOutErrCapture)
    # apparently spits out to err but I wouldn't trust it blindly
    stdout = out['stdout']
    if out['stderr'].startswith('OpenSSH'):
        stdout = out['stderr']
    assert stdout.startswith('OpenSSH')  # that is the only one we care about atm
    # The version is the last '_'-separated piece of the first word in the
    # first ','-separated chunk (that word may be followed by further text
    # after a space)
    return stdout.split(',', 1)[0].split(' ')[0].rstrip('.').split('_')[-1]

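# A minimal, self-contained sketch of the banner parsing done above
# (the sample banner is illustrative, not a captured output):
def _parse_ssh_banner(banner):
    # first ','-separated chunk -> first space-separated word
    # -> last '_'-separated piece is the version
    return banner.split(',', 1)[0].split(' ')[0].rstrip('.').split('_')[-1]

assert _parse_ssh_banner(
    "OpenSSH_8.9p1 Ubuntu-3ubuntu0.1, OpenSSL 3.0.2 15 Mar 2022") == "8.9p1"
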
def __call__(self, instructions=None):
    """Obtain credentials from a keyring and if any is not known -- ask

    Parameters
    ----------
    instructions : str, optional
      If given, the auto-generated instructions based on a login-URL are
      replaced by the given string
    """
    fields = {}
    # check if we shall ask for credentials, even if some are on record
    # already (but maybe they were found to need updating)
    force_reentry = dlcfg.obtain('datalad.credentials.force-ask',
                                 valtype=anything2bool)
    for f in self._FIELDS:
        # don't query for value if we need to get a new one
        v = None if force_reentry else self._get_field_value(f)
        if not self._is_field_optional(f):
            while v is None:  # was not known
                v = self._ask_and_set(f, instructions=instructions)
            fields[f] = v
        elif v is not None:
            fields[f] = v
    return fields

def _process_results(
        results,
        cmd_class,
        on_failure,
        action_summary,
        incomplete_results,
        result_renderer,
        result_log_level,
        allkwargs):
    # private helper of @eval_results
    # loop over results generated from some source and handle each
    # of them according to the requested behavior (logging, rendering, ...)

    # used to track repeated messages in the default renderer
    last_result = None
    last_result_ts = None
    # which result dict keys to inspect for changes to discover repetitions
    # of similar messages
    repetition_keys = set(('action', 'status', 'type', 'refds'))
    # counter for detected repetitions
    result_repetitions = 0
    # how many repetitions to show, before suppression kicks in
    render_n_repetitions = \
        dlcfg.obtain('datalad.ui.suppress-similar-results-threshold') \
        if sys.stdout.isatty() \
        and dlcfg.obtain('datalad.ui.suppress-similar-results') \
        else float("inf")

    for res in results:
        if not res or 'action' not in res:
            # XXX Yarik has no clue on how to track the origin of the
            # record to figure out WTF, so he just skips it
            # but MIH thinks leaving a trace of that would be good
            lgr.debug('Drop result record without "action": %s', res)
            continue

        actsum = action_summary.get(res['action'], {})
        if res['status']:
            actsum[res['status']] = actsum.get(res['status'], 0) + 1
            action_summary[res['action']] = actsum

        ## log message, if there is one and a logger was given
        msg = res.get('message', None)
        # remove logger instance from results, as it is no longer useful
        # after logging was done, it isn't serializable, and generally
        # pollutes the output
        res_lgr = res.pop('logger', None)
        if msg and res_lgr:
            if isinstance(res_lgr, logging.Logger):
                # didn't get a particular log function, go with default
                res_lgr = getattr(
                    res_lgr,
                    default_logchannels[res['status']]
                    if result_log_level is None
                    else result_log_level)
            msg = res['message']
            msgargs = None
            if isinstance(msg, tuple):
                msgargs = msg[1:]
                msg = msg[0]
            if 'path' in res:
                # result path could be a path instance
                path = str(res['path'])
                if msgargs:
                    # we will pass the msg for %-polation, so % should be doubled
                    path = path.replace('%', '%%')
                msg = '{} [{}({})]'.format(msg, res['action'], path)
            if msgargs:
                # support string expansion of logging to avoid runtime cost
                try:
                    res_lgr(msg, *msgargs)
                except TypeError as exc:
                    raise TypeError(
                        "Failed to render %r with %r from %r: %s"
                        % (msg, msgargs, res, exc_str(exc)))
            else:
                res_lgr(msg)

        ## output rendering
        # TODO RF this in a simple callable that gets passed into this function
        if result_renderer is None or result_renderer == 'disabled':
            pass
        elif result_renderer == 'default':
            trimmed_result = {
                k: v for k, v in res.items() if k in repetition_keys
            }
            if res.get('status', None) != 'notneeded' \
                    and trimmed_result == last_result:
                # this is a similar report, suppress if too many, but count it
                result_repetitions += 1
                if result_repetitions < render_n_repetitions:
                    default_result_renderer(res)
                else:
                    last_result_ts = _display_suppressed_message(
                        result_repetitions, render_n_repetitions,
                        last_result_ts)
            else:
                # this one is new, first report on any prev. suppressed results
                # by number, and then render this fresh one
                last_result_ts = _display_suppressed_message(
                    result_repetitions, render_n_repetitions,
                    last_result_ts, final=True)
                default_result_renderer(res)
                result_repetitions = 0
            last_result = trimmed_result
        elif result_renderer in ('json', 'json_pp'):
            ui.message(json.dumps(
                {k: v for k, v in res.items() if k not in ('logger',)},
                sort_keys=True,
                indent=2 if result_renderer.endswith('_pp') else None,
                default=str))
        elif result_renderer in ('tailored', 'default'):
            if hasattr(cmd_class, 'custom_result_renderer'):
                cmd_class.custom_result_renderer(res, **allkwargs)
        elif hasattr(result_renderer, '__call__'):
            try:
                result_renderer(res, **allkwargs)
            except Exception as e:
                lgr.warning('Result rendering failed for: %s [%s]',
                            res, exc_str(e))
        else:
            raise ValueError(
                'unknown result renderer "{}"'.format(result_renderer))

        ## error handling
        # looks for error status, and report at the end via
        # an exception
        if on_failure in ('continue', 'stop') \
                and res['status'] in ('impossible', 'error'):
            incomplete_results.append(res)
            if on_failure == 'stop':
                # first fail -> that's it
                # raise will happen after the loop
                break
        yield res
    # make sure to report on any issues that we had suppressed
    _display_suppressed_message(
        result_repetitions, render_n_repetitions, last_result_ts, final=True)

def _generate_func_api():
    """Auto detect all available interfaces and generate a function-based
       API from them
    """
    from importlib import import_module
    from inspect import isgenerator
    from collections import namedtuple
    from collections import OrderedDict
    from functools import wraps

    from datalad import cfg

    from .interface.base import update_docstring_with_parameters
    from .interface.base import get_interface_groups
    from .interface.base import get_api_name
    from .interface.base import alter_interface_docs_for_api

    def _kwargs_to_namespace(call, args, kwargs):
        """
        Given a __call__, args and kwargs passed, prepare a cmdlineargs-like
        thing
        """
        from inspect import getargspec
        argspec = getargspec(call)
        defaults = argspec.defaults
        nargs = len(argspec.args)
        assert (nargs >= len(defaults))
        # map any args to their name
        argmap = list(zip(argspec.args[:len(args)], args))
        # map defaults of kwargs to their names (update below)
        argmap += list(zip(argspec.args[-len(defaults):], defaults))
        kwargs_ = OrderedDict(argmap)
        # update with provided kwarg args
        kwargs_.update(kwargs)
        assert (nargs == len(kwargs_))
        # Get all arguments removing those possible ones used internally and
        # which shouldn't be exposed outside anyways
        # (iterate over a copy of the keys, since we mutate the dict)
        [kwargs_.pop(k) for k in list(kwargs_) if k.startswith('_')]
        namespace = namedtuple("smth", kwargs_.keys())(**kwargs_)
        return namespace

    def call_gen(call, renderer):
        """Helper to generate a call_ for call, to use provided renderer"""

        @wraps(call)
        def call_(*args, **kwargs):
            ret1 = ret = call(*args, **kwargs)
            if isgenerator(ret):
                # At first I thought we might just rerun it for output
                # at the end, but that wouldn't work if the command actually
                # has a side-effect, i.e. actually doing something,
                # so we need to memoize all generated output and output
                # it instead
                from datalad.utils import saved_generator
                ret, ret1 = saved_generator(ret)

            renderer(ret, _kwargs_to_namespace(call, args, kwargs))
            return ret1

        # TODO: see if we could proxy the "signature" of function
        # call from the original one
        call_.__doc__ += \
            "\nNote\n----\n\n" \
            "This version of a function uses cmdline results renderer before " \
            "returning the result"
        return call_

    always_render = cfg.obtain('datalad.api.alwaysrender')
    for grp_name, grp_descr, interfaces in get_interface_groups():
        for intfspec in interfaces:
            # turn the interface spec into an instance
            mod = import_module(intfspec[0], package='datalad')
            intf = getattr(mod, intfspec[1])
            spec = getattr(intf, '_params_', dict())

            # FIXME no longer using an interface class instance
            # convert the parameter SPEC into a docstring for the function
            update_docstring_with_parameters(
                intf.__call__, spec,
                prefix=alter_interface_docs_for_api(intf.__doc__),
                suffix=alter_interface_docs_for_api(intf.__call__.__doc__)
            )
            globals()[get_api_name(intfspec)] = intf.__call__
            # And the one with '_' suffix which would use cmdline results
            # renderer
            if hasattr(intf, 'result_renderer_cmdline'):
                intf__ = call_gen(intf.__call__, intf.result_renderer_cmdline)
                globals()[get_api_name(intfspec) + '_'] = intf__
                if always_render:
                    globals()[get_api_name(intfspec)] = intf__

from itertools import chain
from functools import lru_cache
from errno import EACCES
from os.path import realpath
from threading import Lock
from functools import wraps

import fsspec

from fuse import FUSE, FuseOSError, Operations, LoggingMixIn

from datalad.support.annexrepo import AnnexRepo
from datalad import cfg

CACHE_DIR = op.join(cfg.obtain('datalad.locations.cache'), 'fuse')

if op.lexists(CACHE_DIR):
    raise RuntimeError(
        f"Please first remove {CACHE_DIR}. We are yet to figure out how to"
        f" ensure correctly working persistent cache:"
        f" https://github.com/intake/filesystem_spec/issues/553")

# explicit blockcache instance for better control etc
import fsspec.implementations.cached
fs_block = fsspec.implementations.cached.CachingFileSystem(
    fs=fsspec.filesystem('http'),  # , target_protocol='blockcache'),
    #target_protocol='blockcache',
    cache_storage=CACHE_DIR,
    #cache_check=600,
    #block_size=1024,

from datalad.core.distributed.clone import Clone
from datalad.distribution.dataset import Dataset
from datalad.support.annexrepo import AnnexRepo
from datalad.tests.utils_pytest import (
    DEFAULT_REMOTE,
    with_tempfile,
)
from datalad.utils import (
    Path,
    better_wraps,
    ensure_list,
    optional_args,
    rmtree,
)
from datalad import cfg

DATALAD_TESTS_CACHE = cfg.obtain("datalad.tests.cache")


def url2filename(url):
    """generate file/directory name from a URL"""
    # TODO: Not really important for now, but there should be a more
    # sophisticated approach to replace. May be just everything that
    # isn't alphanumeric? Or simply hash the URL?
    # URL: Will include version eventually. Would need parsing to hash
    # w/o any parameters. Having separate clones per requested version
    # would defy point of cache, particularly wrt downloading content.
    # Depends on usecase, of course, but immediate one is about container
    # images -> not cheap.

    # make it a Path, too, so pathlib can raise if we are creating an invalid
    # path on some system we run the tests on.

def __call__(path=None,
             initopts=None,
             force=False,
             description=None,
             dataset=None,
             no_annex=False,
             fake_dates=False,
             cfg_proc=None):
    refds_path = dataset.path if hasattr(dataset, 'path') else dataset

    # two major cases
    # 1. we got a `dataset` -> we either want to create it (path is None),
    #    or another dataset in it (path is not None)
    # 2. we got no dataset -> we want to create a fresh dataset at the
    #    desired location, either at `path` or PWD

    # sanity check first
    if no_annex:
        if description:
            raise ValueError("Incompatible arguments: cannot specify "
                             "description for annex repo and declaring "
                             "no annex repo.")

    if path:
        path = rev_resolve_path(path, dataset)

    path = path if path \
        else getpwd() if dataset is None \
        else refds_path

    # we know that we need to create a dataset at `path`
    assert (path is not None)

    # prep for yield
    res = dict(action='create', path=text_type(path),
               logger=lgr, type='dataset',
               refds=refds_path)

    refds = None
    if refds_path and refds_path != path:
        refds = require_dataset(
            refds_path, check_installed=True,
            purpose='creating a subdataset')

        path_inrefds = path_under_rev_dataset(refds, path)
        if path_inrefds is None:
            yield dict(
                res,
                status='error',
                message=(
                    "dataset containing given paths is not underneath "
                    "the reference dataset %s: %s",
                    dataset, text_type(path)),
            )
            return

    # try to locate an immediate parent dataset
    # we want to know this (irrespective of whether we plan on adding
    # this new dataset to a parent) in order to avoid conflicts with
    # a potentially absent/uninstalled subdataset of the parent
    # in this location
    # it will cost some filesystem traversal though...
    parentds_path = rev_get_dataset_root(
        op.normpath(op.join(text_type(path), os.pardir)))
    if parentds_path:
        prepo = GitRepo(parentds_path)
        parentds_path = ut.Path(parentds_path)
        # we cannot get away with a simple
        # GitRepo.get_content_info(), as we need to detect
        # uninstalled/added subdatasets too
        check_path = ut.Path(path)
        pstatus = prepo.status(
            untracked='no',
            # limit query to target path for a potentially massive speed-up
            paths=[check_path.relative_to(parentds_path)])
        if any(check_path == p or check_path in p.parents
               for p in pstatus):
            # redo the check in a slower fashion, it is already broken
            # let's take our time for a proper error message
            conflict = [
                p for p in pstatus
                if check_path == p or check_path in p.parents]
            res.update({
                'status': 'error',
                'message': (
                    'collision with content in parent dataset at %s: %s',
                    text_type(parentds_path),
                    [text_type(c) for c in conflict])})
            yield res
            return
        # another set of checks to see whether the target path is pointing
        # into a known subdataset that is not around ATM
        subds_status = {
            parentds_path / k.relative_to(prepo.path)
            for k, v in iteritems(pstatus)
            if v.get('type', None) == 'dataset'}
        check_paths = [check_path]
        check_paths.extend(check_path.parents)
        if any(p in subds_status for p in check_paths):
            conflict = [p for p in check_paths if p in subds_status]
            res.update({
                'status': 'error',
                'message': (
                    'collision with %s (dataset) in dataset %s',
                    text_type(conflict[0]),
                    text_type(parentds_path))})
            yield res
            return

    # important to use the given Dataset object to avoid spurious ID
    # changes with not-yet-materialized Datasets
    tbds = dataset if isinstance(dataset, Dataset) and \
        dataset.path == path else Dataset(text_type(path))

    # don't create in non-empty directory without `force`:
    if op.isdir(tbds.path) and listdir(tbds.path) != [] and not force:
        res.update({
            'status': 'error',
            'message':
                'will not create a dataset in a non-empty directory, use '
                '`force` option to ignore'})
        yield res
        return

    # stuff that we create and want to have tracked with git (not annex)
    add_to_git = {}

    if initopts is not None and isinstance(initopts, list):
        initopts = {'_from_cmdline_': initopts}

    # create and configure desired repository
    if no_annex:
        lgr.info("Creating a new git repo at %s", tbds.path)
        tbrepo = GitRepo(
            tbds.path,
            url=None,
            create=True,
            create_sanity_checks=False,
            git_opts=initopts,
            fake_dates=fake_dates)
        # place a .noannex file to indicate annex to leave this repo alone
        stamp_path = ut.Path(tbrepo.path) / '.noannex'
        stamp_path.touch()
        add_to_git[stamp_path] = {
            'type': 'file',
            'state': 'untracked'}
    else:
        # always come with annex when created from scratch
        lgr.info("Creating a new annex repo at %s", tbds.path)
        tbrepo = AnnexRepo(
            tbds.path,
            url=None,
            create=True,
            create_sanity_checks=False,
            # do not set backend here, to avoid a dedicated commit
            backend=None,
            # None causes version to be taken from config
            version=None,
            description=description,
            git_opts=initopts,
            fake_dates=fake_dates)
        # set the annex backend in .gitattributes as a staged change
        tbrepo.set_default_backend(
            cfg.obtain('datalad.repo.backend'),
            persistent=True, commit=False)
        add_to_git[tbds.repo.pathobj / '.gitattributes'] = {
            'type': 'file',
            'state': 'added'}
        # make sure that v6 annex repos never commit content under .datalad
        attrs_cfg = (
            ('config', 'annex.largefiles', 'nothing'),
            ('metadata/aggregate*', 'annex.largefiles', 'nothing'),
            ('metadata/objects/**', 'annex.largefiles',
             '({})'.format(cfg.obtain(
                 'datalad.metadata.create-aggregate-annex-limit'))))
        attrs = tbds.repo.get_gitattributes(
            [op.join('.datalad', i[0]) for i in attrs_cfg])
        set_attrs = []
        for p, k, v in attrs_cfg:
            if not attrs.get(op.join('.datalad', p), {}).get(k, None) == v:
                set_attrs.append((p, {k: v}))
        if set_attrs:
            tbds.repo.set_gitattributes(
                set_attrs,
                attrfile=op.join('.datalad', '.gitattributes'))

        # prevent git annex from ever annexing .git* stuff (gh-1597)
        attrs = tbds.repo.get_gitattributes('.git')
        if not attrs.get('.git', {}).get('annex.largefiles', None) == 'nothing':
            tbds.repo.set_gitattributes([
                ('**/.git*', {'annex.largefiles': 'nothing'})])
            # must use the repo.pathobj as this will have resolved symlinks
            add_to_git[tbds.repo.pathobj / '.gitattributes'] = {
                'type': 'file',
                'state': 'untracked'}

    # record an ID for this repo for the afterlife
    # to be able to track siblings and children
    id_var = 'datalad.dataset.id'
    # Note, that Dataset property `id` will change when we unset the
    # respective config. Therefore store it before:
    tbds_id = tbds.id
    if id_var in tbds.config:
        # make sure we reset this variable completely, in case of a
        # re-create
        tbds.config.unset(id_var, where='dataset')

    if _seed is None:
        # just the standard way
        uuid_id = uuid.uuid1().urn.split(':')[-1]
    else:
        # Let's generate preseeded ones
        uuid_id = str(uuid.UUID(int=random.getrandbits(128)))
    tbds.config.add(
        id_var,
        tbds_id if tbds_id is not None else uuid_id,
        where='dataset',
        reload=False)

    # make config overrides permanent in the repo config
    # this is similar to what `annex init` does
    # we are only doing this for config overrides and do not expose
    # a dedicated argument, because it is sufficient for the cmdline
    # and unnecessary for the Python API (there could simply be a
    # subsequent ds.config.add() call)
    for k, v in iteritems(tbds.config.overrides):
        tbds.config.add(k, v, where='local', reload=False)

    # all config manipulation is done -> full reload
    tbds.config.reload()

    # must use the repo.pathobj as this will have resolved symlinks
    add_to_git[tbds.repo.pathobj / '.datalad'] = {
        'type': 'directory',
        'state': 'untracked'}

    # save everything, we need to do this now and cannot merge with the
    # call below, because we may need to add this subdataset to a parent
    # but cannot until we have a first commit
    tbds.repo.save(
        message='[DATALAD] new dataset',
        git=True,
        # we have to supply our own custom status, as the repo does
        # not have a single commit yet and there is no HEAD reference
        # TODO make `GitRepo.status()` robust to this state.
        _status=add_to_git,
    )

    # the next only makes sense if we saved the created dataset,
    # otherwise we have no committed state to be registered
    # in the parent
    if isinstance(refds, Dataset) and refds.path != tbds.path:
        # we created a dataset in another dataset
        # -> make submodule
        for r in refds.save(path=tbds.path):
            yield r

    res.update({'status': 'ok'})
    yield res

    for cfg_proc_ in cfg_proc or []:
        for r in tbds.run_procedure('cfg_' + cfg_proc_):
            yield r

    unlink,
    rmdir,
    rmtemp,
    rmtree,
    get_tempfile_kwargs,
    on_windows,
    Path,
)

from datalad import cfg
from datalad.config import anything2bool

# fall back on patool, if a functional implementation is available
# (i.e. not on windows), it is requested, or 7z is not found
if not on_windows and (
        cfg.obtain(
            'datalad.runtime.use-patool',
            default=False,
            valtype=anything2bool)
        or not external_versions['cmd:7z']):
    from datalad.support.archive_utils_patool import (
        decompress_file as _decompress_file,  # other code expects this to be here
        compress_files,
    )
else:
    from datalad.support.archive_utils_7z import (
        decompress_file as _decompress_file,  # other code expects this to be here
        compress_files,
    )

lgr = logging.getLogger('datalad.support.archives')

def get_runner(*args, **kwargs):
    if cfg.obtain('datalad.crawl.dryrun', default=False):
        kwargs = kwargs.copy()
        kwargs['protocol'] = DryRunProtocol()
    return Runner(*args, **kwargs)

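# Hedged usage sketch (behavior inferred from the function above): with the
# datalad.crawl.dryrun config set, the returned Runner gets a DryRunProtocol,
# so commands are recorded rather than executed:
#   runner = get_runner()
#   runner(['git', 'status'])  # a no-op under dry-run, executed otherwise
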
def test_invalid_call(path):
    with chpwd(path):
        # ^ Change directory so that we don't fail with an
        # InvalidGitRepositoryError if the test is executed from a git
        # worktree.

        # needs spec or discover
        assert_raises(InsufficientArgumentsError, run_procedure)
        res = run_procedure('unknown', on_failure='ignore')
        assert_true(len(res) == 1)
        assert_in_results(res, status="impossible")


# FIXME: For some reason fails to commit correctly if on windows and in direct
# mode. However, direct mode on linux works
@skip_if(cond=on_windows and cfg.obtain("datalad.repo.version") < 6)
@known_failure_direct_mode  #FIXME
@with_tree(
    tree={
        'code': {
            'datalad_test_proc.py': """\
import sys
import os.path as op
from datalad.api import add, Dataset

with open(op.join(sys.argv[1], 'fromproc.txt'), 'w') as f:
    f.write('hello\\n')
add(dataset=Dataset(sys.argv[1]), path='fromproc.txt')
"""
        }

def _get_search_index(index_dir, ds, force_reindex):
    from whoosh import index as widx
    from .metadata import agginfo_relpath

    # what is the latest state of aggregated metadata
    metadata_state = ds.repo.get_last_commit_hash(agginfo_relpath)
    stamp_fname = opj(index_dir, 'datalad_metadata_state')
    definitions_fname = opj(index_dir, 'datalad_term_definitions.json.gz')

    if not force_reindex and \
            exists(stamp_fname) and \
            open(stamp_fname).read() == metadata_state:
        try:
            # TODO check that the index schema is the same
            # as the one we would have used for reindexing
            # TODO support incremental re-indexing, whoosh can do it
            idx = widx.open_dir(index_dir)
            lgr.debug(
                'Search index contains %i documents',
                idx.doc_count())
            return idx
        except widx.LockError as e:
            raise e
        except widx.IndexError as e:
            # Generic index error.
            # we try to regenerate
            # TODO log this
            pass
        except widx.IndexVersionError as e:  # (msg, version, release=None)
            # Raised when you try to open an index using a format that the
            # current version of Whoosh cannot read. That is, when the index
            # you're trying to open is either not backward or forward
            # compatible with this version of Whoosh.
            # we try to regenerate
            # TODO log this
            pass
        except widx.OutOfDateError as e:
            # Raised when you try to commit changes to an index which is not
            # the latest generation.
            # this should not happen here, but if it does ... KABOOM
            raise e
        except widx.EmptyIndexError as e:
            # Raised when you try to work with an index that has no indexed
            # terms.
            # we can just continue with generating an index
            pass

    lgr.info('{} search index'.format(
        'Rebuilding' if exists(index_dir) else 'Building'))

    if not exists(index_dir):
        os.makedirs(index_dir)

    schema, definitions, per_ds_defs = _get_search_schema(ds)

    idx_obj = widx.create_in(index_dir, schema)
    idx = idx_obj.writer(
        # cache size per process
        limitmb=cfg.obtain('datalad.search.indexercachesize'),
        # disable parallel indexing for now till #1927 is resolved
        ## number of processes for indexing
        #procs=multiprocessing.cpu_count(),
        ## write separate index segments in each process for speed
        ## asks for writer.commit(optimize=True)
        #multisegment=True,
    )

    # load metadata of the base dataset and what it knows about all its
    # subdatasets (recursively)
    old_idx_size = 0
    old_ds_rpath = ''
    idx_size = 0
    for res in _query_aggregated_metadata(
            reporton=ds.config.obtain(
                'datalad.metadata.searchindex-documenttype'),
            ds=ds,
            aps=[dict(path=ds.path, type='dataset')],
            # TODO expose? but this would likely only affect metadata in the
            # base dataset
            merge_mode='init',
            # MIH: I cannot see a case when we would not want recursion (within
            # the metadata)
            recursive=True):
        rpath = relpath(res['path'], start=ds.path)
        # this assumes that files are reported after each dataset report,
        # and after a subsequent dataset report no files for the previous
        # dataset will be reported again
        rtype = res['type']
        meta = res.get('metadata', {})
        meta = MetadataDict(meta)
        if rtype == 'dataset':
            if old_ds_rpath:
                lgr.info(
                    'Added %s on dataset %s',
                    single_or_plural(
                        'document',
                        'documents',
                        idx_size - old_idx_size,
                        include_count=True),
                    old_ds_rpath)
            old_idx_size = idx_size
            old_ds_rpath = rpath
            # get any custom dataset mappings
            ds_defs = per_ds_defs.get(res['path'], {})
            # now we merge all reported unique content properties (flattened
            # representation of content metadata) with the main metadata set,
            # using the 'add' strategy: this way any existing metadata value
            # of a dataset itself will be amended by those coming from the
            # content. E.g. a single dataset 'license' might be turned into a
            # sequence of unique license identifiers across all dataset
            # components
            meta.merge_add(meta.get('unique_content_properties', {}))
            meta.pop('unique_content_properties', None)
        doc_props = dict(
            path=rpath,
            type=rtype,
            **_meta2index_dict(meta, definitions, ds_defs))
        if 'parentds' in res:
            doc_props['parentds'] = relpath(res['parentds'], start=ds.path)
        _add_document(idx, **doc_props)
        idx_size += 1

    if old_ds_rpath:
        lgr.info(
            'Added %s on dataset %s',
            single_or_plural(
                'document',
                'documents',
                idx_size - old_idx_size,
                include_count=True),
            old_ds_rpath)

    idx.commit(optimize=True)

    # "timestamp" the search index to allow for automatic invalidation
    with open(stamp_fname, 'w') as f:
        f.write(metadata_state)

    # dump the term/field definitions records for later introspection
    # use compressed storage, there is no point in inflating the
    # diskspace requirements
    with gzopen(definitions_fname, 'wb') as f:
        # TODO actually go through all, incl. compound, definitions ('@id'
        # plus 'unit' or similar) and resolve terms to URLs, if anyhow
        # possible
        jsondump2file(definitions, f)

    lgr.info('Search index contains %i documents', idx_size)
    return idx_obj

    ensure_bytes,
    ensure_unicode,
    unlink,
    rmdir,
    rmtemp,
    rmtree,
    get_tempfile_kwargs,
    on_windows,
    Path,
)

from datalad import cfg
from datalad.config import anything2bool

# fall back on patool, if a functional implementation is available
# (i.e. not on windows), it is requested, or 7z is not found
if not on_windows and (cfg.obtain(
        'datalad.runtime.use-patool',
        default=False,
        valtype=anything2bool) or not external_versions['cmd:7z']):
    from datalad.support.archive_utils_patool import (
        decompress_file as _decompress_file,  # other code expects this to be here
        compress_files)
else:
    from datalad.support.archive_utils_7z import (
        decompress_file as _decompress_file,  # other code expects this to be here
        compress_files)

lgr = logging.getLogger('datalad.support.archives')


def decompress_file(archive, dir_, leading_directories='strip'):

def test_subdatasets(path):
    # from scratch
    ds = Dataset(path)
    assert_false(ds.is_installed())
    assert_raises(ValueError, ds.subdatasets)
    ds = ds.create()
    assert_true(ds.is_installed())
    eq_(ds.subdatasets(), [])
    # create some file and commit it
    open(os.path.join(ds.path, 'test'), 'w').write('some')
    ds.save(path='test', message="Hello!", version_tag=1)
    assert_true(ds.is_installed())
    # Assuming that tmp location was not under a super-dataset
    eq_(ds.get_superdataset(), None)
    eq_(ds.get_superdataset(topmost=True), ds)

    # add itself as a subdataset (crazy, isn't it?)
    subds = ds.install('subds', source=path,
                       result_xfm='datasets', return_type='item-or-list')
    assert_true(subds.is_installed())
    eq_(subds.get_superdataset(), ds)
    eq_(subds.get_superdataset(topmost=True), ds)

    subdss = ds.subdatasets()
    eq_(len(subdss), 1)
    eq_(subds.path, ds.subdatasets(result_xfm='paths')[0])
    eq_(subdss, ds.subdatasets(recursive=True))
    eq_(subdss, ds.subdatasets(fulfilled=True))
    ds.save(message="with subds", version_tag=2)
    ds.recall_state(1)
    assert_true(ds.is_installed())
    eq_(ds.subdatasets(), [])

    # very nested subdataset to test topmost
    subsubds = subds.install(
        _path_('d1/subds'), source=path,
        result_xfm='datasets', return_type='item-or-list')
    assert_true(subsubds.is_installed())
    eq_(subsubds.get_superdataset(), subds)
    # by default, it will only report a superdataset that actually
    # has the queried dataset as a registered true subdataset
    eq_(subsubds.get_superdataset(topmost=True), subds)
    # but we can also ask for a dataset that is merely above
    eq_(subsubds.get_superdataset(topmost=True, registered_only=False), ds)

    # verify that '^' alias would work
    with chpwd(subsubds.path):
        dstop = Dataset('^')
        eq_(dstop, subds)
        # and while in the dataset we still can resolve into central one
        dscentral = Dataset('///')
        eq_(dscentral.path,
            dl_cfg.obtain('datalad.locations.default-dataset'))

    with chpwd(ds.path):
        dstop = Dataset('^')
        eq_(dstop, ds)

    # TODO actual submodule checkout is still there

    # Test ^. (the dataset for curdir) shortcut
    # At the top should point to the top
    with chpwd(ds.path):
        dstop = Dataset('^.')
        eq_(dstop, ds)

    # and still does within subdir
    os.mkdir(opj(ds.path, 'subdir'))
    with chpwd(opj(ds.path, 'subdir')):
        dstop = Dataset('^.')
        eq_(dstop, ds)

    # within submodule will point to submodule
    with chpwd(subsubds.path):
        dstop = Dataset('^.')
        eq_(dstop, subsubds)

def __call__(repo_name, repo_accession, repo_url,
             path=None, output=None, dataset=None):
    # we need this resource file, no point in starting without it
    itmpl_path = cfg.obtain(
        'datalad.plugin.bids2scidata.investigator.template',
        default=opj(
            dirname(datalad_neuroimaging.__file__),
            'resources', 'isatab',
            'scidata_bids_investigator.txt'))
    if path and dataset is None:
        dataset = path
    dataset = require_dataset(
        dataset, purpose='metadata query', check_installed=True)

    errored = False
    dsmeta = None
    filemeta = []
    for m in metadata(
            path,
            dataset=dataset,
            # BIDS hierarchy might go across multiple datasets
            recursive=True,
            reporton='all',
            return_type='generator',
            result_renderer='disabled'):
        type = m.get('type', None)
        if type not in ('dataset', 'file'):
            continue
        if m.get('status', None) != 'ok':
            errored = errored or m.get('status', None) in ('error', 'impossible')
            yield m
            continue
        if type == 'dataset':
            if dsmeta is not None:
                lgr.warn(
                    'Found metadata for more than one dataset, '
                    'ignoring their dataset-level metadata')
                continue
            dsmeta = m
        elif type == 'file':
            filemeta.append(m)

    if errored:
        return

    if not dsmeta or 'refcommit' not in dsmeta:
        yield dict(
            status='error',
            message=(
                "could not find aggregated metadata on path '%s'", path),
            path=dataset.path,
            type='dataset',
            action='bids2scidata',
            logger=lgr)
        return

    lgr.info(
        "Metadata for %i files associated with '%s' on record in %s",
        len(filemeta), path, dataset)

    if not output:
        output = 'scidata_isatab_{}'.format(dsmeta['refcommit'])

    info = convert(
        dsmeta,
        filemeta,
        output_directory=output,
        repository_info={
            'Comment[Data Repository]': repo_name,
            'Comment[Data Record Accession]': repo_accession,
            'Comment[Data Record URI]': repo_url},
    )
    if info is None:
        yield dict(
            status='error',
            message='dataset does not seem to contain relevant metadata',
            path=dataset.path,
            type='dataset',
            action='bids2scidata',
            logger=lgr)
        return

    itmpl = open(itmpl_path, encoding='utf-8').read()
    with open(opj(output, 'i_Investigation.txt'), 'w',
              encoding='utf-8') as ifile:
        ifile.write(
            itmpl.format(
                datalad_version=datalad.__version__,
                date=datetime.now().strftime('%Y/%m/%d'),
                repo_name=repo_name,
                repo_accession=repo_accession,
                repo_url=repo_url,
                **info))
    yield dict(
        status='ok',
        path=abspath(output),
        # TODO add switch to make tarball/ZIP
        #type='file',
        type='directory',
        action='bids2scidata',
        logger=lgr)

def _get_procedure_implementation(name='*', ds=None):
    """get potential procedure path and configuration

    Order of consideration is user-level, system-level, dataset,
    datalad extensions, datalad. The first one found according to this order
    is the one to be returned. Therefore local definitions/configurations
    take precedence over ones that come from outside (via a datalad-extension
    or a dataset with its .datalad/config). If a dataset had precedence (as
    it was before), the addition (or just an update) of a (sub-)dataset would
    otherwise surprisingly cause you to execute code different from what you
    defined within ~/.gitconfig or your local repository's .git/config.
    So, local definitions take precedence over remote ones, and more specific
    ones over more general ones.

    Returns
    -------
    tuple
      path, name, format string, help message
    """
    ds = ds if isinstance(ds, Dataset) else Dataset(ds) if ds else None

    # 1. check system and user account for procedure
    for loc in (cfg.obtain('datalad.locations.user-procedures'),
                cfg.obtain('datalad.locations.system-procedures')):
        for dir in assure_list(loc):
            for m, n in _get_file_match(dir, name):
                yield (m, n,) + _get_proc_config(n)
    # 2. check dataset for procedure
    if ds is not None and ds.is_installed():
        # could be more than one
        dirs = assure_list(
            ds.config.obtain('datalad.locations.dataset-procedures'))
        for dir in dirs:
            # TODO `get` dirs if necessary
            for m, n in _get_file_match(op.join(ds.path, dir), name):
                yield (m, n,) + _get_proc_config(n, ds=ds)
        # 2.1. check subdatasets recursively
        for subds in ds.subdatasets(return_type='generator',
                                    result_xfm='datasets'):
            for m, n, f, h in _get_procedure_implementation(name=name,
                                                            ds=subds):
                yield m, n, f, h
    # 3. check extensions for procedure
    # delay heavy import until here
    from pkg_resources import iter_entry_points
    from pkg_resources import resource_isdir
    from pkg_resources import resource_filename
    for entry_point in iter_entry_points('datalad.extensions'):
        # use of '/' here is OK wrt to platform compatibility
        if resource_isdir(entry_point.module_name, 'resources/procedures'):
            for m, n in _get_file_match(
                    resource_filename(entry_point.module_name,
                                      'resources/procedures'),
                    name):
                yield (m, n,) + _get_proc_config(n)
    # 4. at last check datalad itself for procedure
    for m, n in _get_file_match(
            resource_filename('datalad', 'resources/procedures'),
            name):
        yield (m, n,) + _get_proc_config(n)

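# Hedged usage sketch (hypothetical procedure name): matches are yielded in
# the precedence order described in the docstring -- user, system, dataset,
# extensions, then datalad core -- so the first item is the most local one:
#   path, name, fmt, help_msg = next(
#       _get_procedure_implementation('cfg_yoda', ds=ds))
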
def __call__(path=None,
             force=False,
             description=None,
             dataset=None,
             no_annex=False,
             save=True,
             annex_version=None,
             annex_backend='MD5E',
             native_metadata_type=None,
             shared_access=None,
             git_opts=None,
             annex_opts=None,
             annex_init_opts=None,
             text_no_annex=None):

    # two major cases
    # 1. we got a `dataset` -> we either want to create it (path is None),
    #    or another dataset in it (path is not None)
    # 2. we got no dataset -> we want to create a fresh dataset at the
    #    desired location, either at `path` or PWD

    if path and dataset:
        # Given a path and a dataset (path) not pointing to installed
        # dataset
        if not dataset.is_installed():
            msg = "No installed dataset at %s found." % dataset.path
            dsroot = get_dataset_root(dataset.path)
            if dsroot:
                msg += " If you meant to add to the %s dataset, use that path " \
                       "instead but remember that if dataset is provided, " \
                       "relative paths are relative to the top of the " \
                       "dataset." % dsroot
            raise ValueError(msg)

    # sanity check first
    if git_opts:
        lgr.warning(
            "`git_opts` argument is presently ignored, please complain!")
    if no_annex:
        if description:
            raise ValueError("Incompatible arguments: cannot specify "
                             "description for annex repo and declaring "
                             "no annex repo.")
        if annex_opts:
            raise ValueError("Incompatible arguments: cannot specify "
                             "options for annex and declaring no "
                             "annex repo.")
        if annex_init_opts:
            raise ValueError("Incompatible arguments: cannot specify "
                             "options for annex init and declaring no "
                             "annex repo.")

    if not isinstance(force, bool):
        raise ValueError(
            "force should be bool, got %r. Did you mean to provide a 'path'?"
            % force)

    annotated_paths = AnnotatePaths.__call__(
        # nothing given explicitly, assume create fresh right here
        path=path if path else getpwd() if dataset is None else None,
        dataset=dataset,
        recursive=False,
        action='create',
        # we need to know whether we have to check for potential
        # subdataset collision
        force_parentds_discovery=True,
        # it is absolutely OK to have something that does not exist
        unavailable_path_status='',
        unavailable_path_msg=None,
        # if we have a dataset given that actually exists, we want to
        # fail if the requested path is not in it
        nondataset_path_status='error'
        if isinstance(dataset, Dataset) and dataset.is_installed() else '',
        on_failure='ignore')

    path = None
    for r in annotated_paths:
        if r['status']:
            # this is dealt with already
            yield r
            continue
        if path is not None:
            raise ValueError(
                "`create` can only handle single target path or dataset")
        path = r

    if len(annotated_paths) and path is None:
        # we got something, we complained already, done
        return

    # we know that we need to create a dataset at `path`
    assert (path is not None)

    # prep for yield
    path.update({'logger': lgr, 'type': 'dataset'})
    # just discard, we have a new story to tell
    path.pop('message', None)

    if 'parentds' in path:
        subs = Subdatasets.__call__(
            dataset=path['parentds'],
            # any known
            fulfilled=None,
            recursive=False,
            contains=path['path'],
            result_xfm='relpaths')
        if len(subs):
            path.update({
                'status': 'error',
                'message': (
                    'collision with known subdataset %s/ in dataset %s',
                    subs[0], path['parentds'])})
            yield path
            return

    # TODO here we need a further test that if force=True, we need to look if
    # there is a superdataset (regardless of whether we want to create a
    # subdataset or not), and if that superdataset tracks anything within
    # this directory -- if so, we need to stop right here and whine, because
    # the result of creating a repo here will produce an undesired mess

    if git_opts is None:
        git_opts = {}
    if shared_access:
        # configure `git --shared` value
        git_opts['shared'] = shared_access

    # important to use the given Dataset object to avoid spurious ID
    # changes with not-yet-materialized Datasets
    tbds = dataset if isinstance(dataset, Dataset) and dataset.path == path['path'] \
        else Dataset(path['path'])

    # don't create in non-empty directory without `force`:
    if isdir(tbds.path) and listdir(tbds.path) != [] and not force:
        path.update({
            'status': 'error',
            'message':
                'will not create a dataset in a non-empty directory, use '
                '`force` option to ignore'})
        yield path
        return

    if no_annex:
        lgr.info("Creating a new git repo at %s", tbds.path)
        GitRepo(tbds.path, url=None, create=True,
                git_opts=git_opts)
    else:
        # always come with annex when created from scratch
        lgr.info("Creating a new annex repo at %s", tbds.path)
        tbrepo = AnnexRepo(
            tbds.path,
            url=None,
            create=True,
            backend=annex_backend,
            version=annex_version,
            description=description,
            git_opts=git_opts,
            annex_opts=annex_opts,
            annex_init_opts=annex_init_opts)

        if text_no_annex:
            git_attributes_file = opj(tbds.path, '.gitattributes')
            with open(git_attributes_file, 'a') as f:
                f.write('* annex.largefiles=(not(mimetype=text/*))\n')
            tbrepo.add([git_attributes_file], git=True)
            tbrepo.commit("Instructed annex to add text files to git",
                          _datalad_msg=True,
                          files=[git_attributes_file])

    if native_metadata_type is not None:
        if not isinstance(native_metadata_type, list):
            native_metadata_type = [native_metadata_type]
        for nt in native_metadata_type:
            tbds.config.add('datalad.metadata.nativetype', nt)

    # record an ID for this repo for the afterlife
    # to be able to track siblings and children
    id_var = 'datalad.dataset.id'
    if id_var in tbds.config:
        # make sure we reset this variable completely, in case of a re-create
        tbds.config.unset(id_var, where='dataset')

    if _seed is None:
        # just the standard way
        uuid_id = uuid.uuid1().urn.split(':')[-1]
    else:
        # Let's generate preseeded ones
        uuid_id = str(uuid.UUID(int=random.getrandbits(128)))
    tbds.config.add(
        id_var,
        tbds.id if tbds.id is not None else uuid_id,
        where='dataset')

    # make sure that v6 annex repos never commit content under .datalad
    with open(opj(tbds.path, '.datalad', '.gitattributes'), 'a') as gitattr:
        # TODO this will need adjusting, when annex'ed aggregate metadata
        # comes around
        gitattr.write(
            '# Text files (according to file --mime-type) are added directly to git.\n')
        gitattr.write(
            '# See http://git-annex.branchable.com/tips/largefiles/ for more info.\n')
        gitattr.write('** annex.largefiles=nothing\n')
        gitattr.write('metadata/objects/** annex.largefiles=({})\n'.format(
            cfg.obtain('datalad.metadata.create-aggregate-annex-limit')))

    # save everything, we need to do this now and cannot merge with the
    # call below, because we may need to add this subdataset to a parent
    # but cannot until we have a first commit
    tbds.add('.datalad', to_git=True, save=save,
             message='[DATALAD] new dataset')

    # the next only makes sense if we saved the created dataset,
    # otherwise we have no committed state to be registered
    # in the parent
    if isinstance(dataset, Dataset) and dataset.path != tbds.path \
            and tbds.repo.get_hexsha():
        # we created a dataset in another dataset
        # -> make submodule
        for r in dataset.add(
                tbds.path,
                save=save,
                return_type='generator',
                result_filter=None,
                result_xfm=None,
                on_failure='ignore'):
            yield r

    path.update({'status': 'ok'})
    yield path

def _setup_annex_repo(path, initopts=None, fake_dates=False,
                      description=None):
    """Create and configure a repository at `path`

    This includes a default setup of annex.largefiles.

    Parameters
    ----------
    path: str or Path
      Path of the repository
    initopts: dict, optional
      Git options to be passed to the AnnexRepo constructor
    fake_dates: bool, optional
      Passed to the AnnexRepo constructor
    description: str, optional
      Passed to the AnnexRepo constructor

    Returns
    -------
    AnnexRepo, dict
      Created repository and records for any repo component that needs to be
      passed to git-add as a result of the setup procedure.
    """
    # always come with annex when created from scratch
    tbrepo = AnnexRepo(
        path,
        create=True,
        create_sanity_checks=False,
        # do not set backend here, to avoid a dedicated commit
        backend=None,
        # None causes version to be taken from config
        version=None,
        description=description,
        git_opts=initopts,
        fake_dates=fake_dates)
    # set the annex backend in .gitattributes as a staged change
    tbrepo.set_default_backend(
        cfg.obtain('datalad.repo.backend'),
        persistent=True, commit=False)
    add_to_git = {
        tbrepo.pathobj / '.gitattributes': {
            'type': 'file',
            'state': 'added',
        }
    }
    # make sure that v6 annex repos never commit content under .datalad
    attrs_cfg = (
        ('config', 'annex.largefiles', 'nothing'),
        ('metadata/aggregate*', 'annex.largefiles', 'nothing'),
        ('metadata/objects/**', 'annex.largefiles',
         '({})'.format(cfg.obtain(
             'datalad.metadata.create-aggregate-annex-limit'))))
    attrs = tbrepo.get_gitattributes(
        [op.join('.datalad', i[0]) for i in attrs_cfg])
    set_attrs = []
    for p, k, v in attrs_cfg:
        if not attrs.get(op.join('.datalad', p), {}).get(k, None) == v:
            set_attrs.append((p, {k: v}))
    if set_attrs:
        tbrepo.set_gitattributes(
            set_attrs,
            attrfile=op.join('.datalad', '.gitattributes'))

    # prevent git annex from ever annexing .git* stuff (gh-1597)
    attrs = tbrepo.get_gitattributes('.git')
    if not attrs.get('.git', {}).get('annex.largefiles', None) == 'nothing':
        tbrepo.set_gitattributes([
            ('**/.git*', {'annex.largefiles': 'nothing'})])
        # must use the repo.pathobj as this will have resolved symlinks
        add_to_git[tbrepo.pathobj / '.gitattributes'] = {
            'type': 'file',
            'state': 'untracked'}
    return tbrepo, add_to_git

from datalad.api import run_procedure
from datalad.api import clean
from datalad import cfg


def test_invalid_call():
    # needs spec or discover
    assert_raises(InsufficientArgumentsError, run_procedure)
    res = run_procedure('unknown', on_failure='ignore')
    assert_true(len(res) == 1)
    assert_in_results(res, status="impossible")


# FIXME: For some reason fails to commit correctly if on windows and in
# direct mode. However, direct mode on linux works
@skip_if(cond=on_windows and cfg.obtain("datalad.repo.version") < 6)
@known_failure_direct_mode  # FIXME
@with_tree(tree={
    'code': {'datalad_test_proc.py': """\
import sys
import os.path as op
from datalad.api import add, Dataset

with open(op.join(sys.argv[1], 'fromproc.txt'), 'w') as f:
    f.write('hello\\n')
add(dataset=Dataset(sys.argv[1]), path='fromproc.txt')
"""}})
@with_tempfile
def test_basics(path, super_path):
    ds = Dataset(path).create(force=True)
    ds.run_procedure('setup_yoda_dataset')
def _generate_func_api():
    """Auto detect all available interfaces and generate a function-based
    API from them
    """
    from importlib import import_module
    from inspect import isgenerator
    from collections import namedtuple
    from collections import OrderedDict
    from functools import wraps

    from datalad import cfg

    from .interface.base import update_docstring_with_parameters
    from .interface.base import get_interface_groups
    from .interface.base import get_api_name
    from .interface.base import alter_interface_docs_for_api

    def _kwargs_to_namespace(call, args, kwargs):
        """Given a __call__, args and kwargs passed, prepare a
        cmdlineargs-like thing
        """
        from inspect import getargspec
        argspec = getargspec(call)
        defaults = argspec.defaults
        nargs = len(argspec.args)
        assert nargs >= len(defaults)
        # map any args to their name
        argmap = list(zip(argspec.args[:len(args)], args))
        # map defaults of kwargs to their names (updated below)
        argmap += list(zip(argspec.args[-len(defaults):], defaults))
        kwargs_ = OrderedDict(argmap)
        # update with provided kwargs
        kwargs_.update(kwargs)
        assert nargs == len(kwargs_)
        # remove any arguments used internally, which shouldn't be exposed
        # outside anyway (iterate over a copy of the keys, since popping
        # while iterating the dict itself would raise a RuntimeError)
        for k in list(kwargs_):
            if k.startswith('_'):
                kwargs_.pop(k)
        namespace = namedtuple("smth", kwargs_.keys())(**kwargs_)
        return namespace

    def call_gen(call, renderer):
        """Helper to generate a call_ for call, to use provided renderer"""

        @wraps(call)
        def call_(*args, **kwargs):
            ret1 = ret = call(*args, **kwargs)
            if isgenerator(ret):
                # At first I thought we might just rerun it for output
                # at the end, but that wouldn't work if the command actually
                # has a side-effect, i.e. actually does something,
                # so we need to memoize all generated output and output
                # it instead
                from datalad.utils import saved_generator
                ret, ret1 = saved_generator(ret)

            renderer(ret, _kwargs_to_namespace(call, args, kwargs))
            return ret1

        # TODO: see if we could proxy the "signature" of the function
        # call from the original one
        call_.__doc__ += \
            "\nNote\n----\n\n" \
            "This version of a function uses cmdline results renderer before " \
            "returning the result"
        return call_

    always_render = cfg.obtain('datalad.api.alwaysrender')
    for grp_name, grp_descr, interfaces in get_interface_groups():
        for intfspec in interfaces:
            # turn the interface spec into an instance
            mod = import_module(intfspec[0], package='datalad')
            intf = getattr(mod, intfspec[1])
            spec = getattr(intf, '_params_', dict())

            # FIXME no longer using an interface class instance
            # convert the parameter SPEC into a docstring for the function
            update_docstring_with_parameters(
                intf.__call__, spec,
                prefix=alter_interface_docs_for_api(intf.__doc__),
                suffix=alter_interface_docs_for_api(intf.__call__.__doc__))
            globals()[get_api_name(intfspec)] = intf.__call__
            # And the one with '_' suffix which would use cmdline results
            # renderer
            if hasattr(intf, 'result_renderer_cmdline'):
                intf__ = call_gen(intf.__call__, intf.result_renderer_cmdline)
                globals()[get_api_name(intfspec) + '_'] = intf__
                if always_render:
                    globals()[get_api_name(intfspec)] = intf__
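# Standalone illustration of the argspec-to-namespace mapping performed by
# _kwargs_to_namespace above: positional args and defaults are merged into a
# single namedtuple, mimicking a parsed-cmdline namespace. The sample
# function and values are hypothetical.
from collections import OrderedDict, namedtuple
from inspect import getfullargspec  # getargspec was removed in Python 3.11

def sample(path, force=False, message=None):
    pass

argspec = getfullargspec(sample)
argmap = list(zip(argspec.args[:1], ('/tmp/ds',)))           # positional arg
argmap += list(zip(argspec.args[-len(argspec.defaults):],    # defaults
                   argspec.defaults))
kwargs_ = OrderedDict(argmap)
kwargs_.update({'force': True})                              # explicit kwarg
ns = namedtuple("smth", kwargs_.keys())(**kwargs_)
print(ns.path, ns.force, ns.message)  # -> /tmp/ds True None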
def _get_procedure_implementation(name='*', ds=None):
    """get potential procedure path and configuration

    Order of consideration is: user-level, system-level, dataset, datalad
    extensions, datalad itself. The first one found according to this order
    is the one to be returned. Therefore local definitions/configurations
    take precedence over ones that come from outside (via a
    datalad-extension or a dataset with its .datalad/config). If a dataset
    had precedence (as it was before), the addition (or just an update) of
    a (sub-)dataset could surprisingly cause you to execute code different
    from what you defined within ~/.gitconfig or your local repository's
    .git/config. So, local definitions take precedence over remote ones,
    and more specific ones over more general ones.

    Returns
    -------
    tuple
      path, format string, help message
    """
    ds = ds if isinstance(ds, Dataset) else Dataset(ds) if ds else None

    # 1. check system and user account for procedure
    for loc in (cfg.obtain('datalad.locations.user-procedures'),
                cfg.obtain('datalad.locations.system-procedures')):
        for dir in assure_list(loc):
            for m, n in _get_file_match(dir, name):
                yield (m, n,) + _get_proc_config(n)
    # 2. check dataset for procedure
    if ds is not None and ds.is_installed():
        # could be more than one
        dirs = assure_list(
            ds.config.obtain('datalad.locations.dataset-procedures'))
        for dir in dirs:
            # TODO `get` dirs if necessary
            for m, n in _get_file_match(op.join(ds.path, dir), name):
                yield (m, n,) + _get_proc_config(n, ds=ds)
        # 2.1. check subdatasets recursively
        for subds in ds.subdatasets(return_type='generator',
                                    result_xfm='datasets'):
            for m, n, f, h in _get_procedure_implementation(name=name,
                                                            ds=subds):
                yield m, n, f, h

    # 3. check extensions for procedure
    # delay heavy import until here
    from pkg_resources import iter_entry_points
    from pkg_resources import resource_isdir
    from pkg_resources import resource_filename
    for entry_point in iter_entry_points('datalad.extensions'):
        # use of '/' here is OK wrt platform compatibility
        if resource_isdir(entry_point.module_name, 'resources/procedures'):
            for m, n in _get_file_match(
                    resource_filename(entry_point.module_name,
                                      'resources/procedures'),
                    name):
                yield (m, n,) + _get_proc_config(n)
    # 4. at last check datalad itself for procedure
    for m, n in _get_file_match(
            resource_filename('datalad', 'resources/procedures'),
            name):
        yield (m, n,) + _get_proc_config(n)
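# Because _get_procedure_implementation yields matches in precedence order
# (user, system, dataset, extensions, datalad itself), a caller that wants
# "the" implementation can simply take the first yielded item. A hedged
# sketch; the procedure name is illustrative and the 4-tuple unpacking
# follows the (match, name) + (format, help) yields above:
try:
    proc_path, proc_name, proc_fmt, proc_help = next(
        _get_procedure_implementation(name='cfg_yoda'))
except StopIteration:
    proc_path = None  # no such procedure found anywhere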
def _mk_search_index(self, force_reindex):
    """Generic entrypoint to index generation

    The actual work that determines the structure and content of the index
    is done by functions that are passed in as arguments

    `meta2doc` - must return dict for index document from result input
    """
    from whoosh import index as widx
    from .metadata import agginfo_relpath
    # what is the latest state of aggregated metadata
    metadata_state = self.ds.repo.get_last_commit_hash(agginfo_relpath)
    # use location common to all index types, they would all invalidate
    # simultaneously
    stamp_fname = opj(self.index_dir, 'datalad_metadata_state')
    index_dir = opj(self.index_dir, self._mode_label)

    if (not force_reindex) and \
            exists(index_dir) and \
            exists(stamp_fname) and \
            open(stamp_fname).read() == metadata_state:
        try:
            # TODO check that the index schema is the same
            # as the one we would have used for reindexing
            # TODO support incremental re-indexing, whoosh can do it
            idx = widx.open_dir(index_dir)
            lgr.debug(
                'Search index contains %i documents',
                idx.doc_count())
            self.idx_obj = idx
            return
        except widx.LockError as e:
            raise e
        except widx.IndexError as e:
            # Generic index error.
            # we try to regenerate
            lgr.warning(
                "Cannot open existing index %s (%s), will regenerate",
                index_dir, exc_str(e))
        except widx.IndexVersionError as e:  # (msg, version, release=None)
            # Raised when you try to open an index using a format that the
            # current version of Whoosh cannot read. That is, when the index
            # you're trying to open is either not backward or forward
            # compatible with this version of Whoosh.
            # we try to regenerate
            lgr.warning(exc_str(e))
        except widx.OutOfDateError as e:
            # Raised when you try to commit changes to an index which is not
            # the latest generation.
            # this should not happen here, but if it does ... KABOOM
            raise
        except widx.EmptyIndexError as e:
            # Raised when you try to work with an index that has no indexed
            # terms.
            # we can just continue with generating an index
            pass
        except ValueError as e:
            if 'unsupported pickle protocol' in str(e):
                lgr.warning(
                    "Cannot open existing index %s (%s), will regenerate",
                    index_dir, exc_str(e))
            else:
                raise

    lgr.info('{} search index'.format(
        'Rebuilding' if exists(index_dir) else 'Building'))

    if not exists(index_dir):
        os.makedirs(index_dir)

    # this is a pretty cheap call that just pulls this info from a file
    dsinfo = self.ds.metadata(
        get_aggregates=True,
        return_type='list',
        result_renderer='disabled')

    self._mk_schema(dsinfo)

    idx_obj = widx.create_in(index_dir, self.schema)
    idx = idx_obj.writer(
        # cache size per process
        limitmb=cfg.obtain('datalad.search.indexercachesize'),
        # disable parallel indexing for now till #1927 is resolved
        ## number of processes for indexing
        #procs=multiprocessing.cpu_count(),
        ## write separate index segments in each process for speed
        ## asks for writer.commit(optimize=True)
        #multisegment=True,
    )

    # load metadata of the base dataset and what it knows about all its
    # subdatasets (recursively)
    old_idx_size = 0
    old_ds_rpath = ''
    idx_size = 0
    log_progress(
        lgr.info,
        'autofieldidxbuild',
        'Start building search index',
        total=len(dsinfo),
        label='Building search index',
        unit=' Datasets',
    )
    for res in query_aggregated_metadata(
            reporton=self.documenttype,
            ds=self.ds,
            aps=[dict(path=self.ds.path, type='dataset')],
            # MIH: I cannot see a case when we would not want recursion
            # (within the metadata)
            recursive=True):
        # this assumes that files are reported after each dataset report,
        # and after a subsequent dataset report no files for the previous
        # dataset will be reported again
        meta = res.get('metadata', {})
        doc = self._meta2doc(meta)
        admin = {
            'type': res['type'],
            'path': relpath(res['path'], start=self.ds.path),
        }
        if 'parentds' in res:
            admin['parentds'] = relpath(res['parentds'], start=self.ds.path)
        if admin['type'] == 'dataset':
            if old_ds_rpath:
                lgr.debug(
                    'Added %s on dataset %s',
                    single_or_plural(
                        'document',
                        'documents',
                        idx_size - old_idx_size,
                        include_count=True),
                    old_ds_rpath)
            log_progress(lgr.info,
                         'autofieldidxbuild',
                         'Indexed dataset at %s', old_ds_rpath,
                         update=1,
                         increment=True)
            old_idx_size = idx_size
            old_ds_rpath = admin['path']
            admin['id'] = res.get('dsid', None)

        doc.update({k: assure_unicode(v) for k, v in admin.items()})
        lgr.debug("Adding document to search index: {}".format(doc))
        # inject into index
        idx.add_document(**doc)
        idx_size += 1

    if old_ds_rpath:
        lgr.debug(
            'Added %s on dataset %s',
            single_or_plural(
                'document',
                'documents',
                idx_size - old_idx_size,
                include_count=True),
            old_ds_rpath)

    lgr.debug("Committing index")
    idx.commit(optimize=True)
    log_progress(
        lgr.info, 'autofieldidxbuild', 'Done building search index')

    # "timestamp" the search index to allow for automatic invalidation
    with open(stamp_fname, 'w') as f:
        f.write(metadata_state)

    lgr.info('Search index contains %i documents', idx_size)
    self.idx_obj = idx_obj
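# The stamp-file invalidation used above is a generic pattern: persist the
# input state alongside the derived artifact, and rebuild only when the two
# diverge. A minimal standalone sketch (function names are illustrative):
import os

def needs_rebuild(stamp_fname, current_state):
    """Return True if the cached artifact is stale for `current_state`"""
    if not os.path.exists(stamp_fname):
        return True
    with open(stamp_fname) as f:
        return f.read() != current_state

def write_stamp(stamp_fname, current_state):
    """Record the input state after a successful rebuild"""
    with open(stamp_fname, 'w') as f:
        f.write(current_state)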
def format_oneline_tb(exc, tb=None, limit=None, include_str=True):
    """Format an exception traceback as a one-line summary

    Parameters
    ----------
    exc: Exception
    tb: TracebackException, optional
      If not given, it is generated from the given exception.
    limit: int, optional
      Traceback depth limit. If not given, the config setting
      'datalad.exc.str.tblimit' will be used, or all entries are reported.
    include_str: bool
      If True (default), the return value is prepended with a string
      representation of the exception.

    Returns
    -------
    str
      Of format [filename:contextname:linenumber, ...].
    """
    # Note: No import at module level, since ConfigManager imports
    # dochelpers -> circular import when creating datalad.cfg instance at
    # startup.
    from datalad import cfg

    if include_str:
        # try exc message else exception type
        leading = exc.message or exc.name
        out = "{} ".format(leading)
    else:
        out = ""

    if tb is None:
        tb = traceback.TracebackException.from_exception(
            exc,
            limit=limit,
            lookup_lines=True,
            capture_locals=False,
        )

    entries = []
    entries.extend(tb.stack)
    if tb.__cause__:
        entries.extend(tb.__cause__.stack)
    elif tb.__context__ and not tb.__suppress_context__:
        entries.extend(tb.__context__.stack)

    if limit is None:
        limit = int(cfg.obtain('datalad.exc.str.tblimit',
                               default=len(entries)))
    if entries:
        tb_str = "[%s]" % (','.join(
            "{}:{}:{}".format(
                Path(frame_summary.filename).name,
                frame_summary.name,
                frame_summary.lineno)
            for frame_summary in entries[-limit:])
        )
        out += "{}".format(tb_str)

    return out
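# Quick demonstration of the stack-entry formatting this helper produces,
# using only the standard library (no datalad config involved):
import traceback
from pathlib import Path

try:
    raise ValueError("boom")
except ValueError as e:
    tb = traceback.TracebackException.from_exception(e)
    summary = "[%s]" % ','.join(
        "{}:{}:{}".format(Path(fs.filename).name, fs.name, fs.lineno)
        for fs in tb.stack)
    print(summary)  # e.g. [example.py:<module>:6]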
from datalad.utils import (
    any_re_search,
    ensure_bytes,
    ensure_unicode,
    unlink,
    rmtemp,
    rmtree,
    get_tempfile_kwargs,
    on_windows,
    Path,
)
from datalad import cfg
from datalad.config import anything2bool

# fall back on patool if requested, or if 7z is not found
if (cfg.obtain('datalad.runtime.use-patool', default=False,
               valtype=anything2bool)
        or not external_versions['cmd:7z']):
    from datalad.support.archive_utils_patool import (
        decompress_file as _decompress_file,
        # other code expects this to be here
        compress_files,
    )
else:
    from datalad.support.archive_utils_7z import (
        decompress_file as _decompress_file,
        # other code expects this to be here
        compress_files,
    )

lgr = logging.getLogger('datalad.support.archives')


def decompress_file(archive, dir_, leading_directories='strip'):
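# The valtype=anything2bool coercion above ensures that string config values
# like "yes" or "0" behave as booleans. A standalone analogue of such a
# coercer (an illustrative sketch, not datalad's actual implementation):
def _to_bool(value):
    """Coerce common truthy/falsy config spellings to bool"""
    if isinstance(value, bool):
        return value
    if str(value).lower() in ('', '0', 'no', 'off', 'false'):
        return False
    return True

assert _to_bool('yes') and _to_bool(1) and not _to_bool('0')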
def __call__(
        path=None,
        initopts=None,
        force=False,
        description=None,
        dataset=None,
        no_annex=False,
        fake_dates=False,
        cfg_proc=None
):
    refds_path = dataset.path if hasattr(dataset, 'path') else dataset

    # two major cases
    # 1. we got a `dataset` -> we either want to create it (path is None),
    #    or another dataset in it (path is not None)
    # 2. we got no dataset -> we want to create a fresh dataset at the
    #    desired location, either at `path` or PWD

    # sanity check first
    if no_annex:
        if description:
            raise ValueError("Incompatible arguments: cannot specify "
                             "description for annex repo and declaring "
                             "no annex repo.")

    if path:
        path = rev_resolve_path(path, dataset)

    path = path if path \
        else getpwd() if dataset is None \
        else refds_path

    # we know that we need to create a dataset at `path`
    assert(path is not None)

    # prep for yield
    res = dict(action='create', path=text_type(path),
               logger=lgr, type='dataset',
               refds=refds_path)

    refds = None
    if refds_path and refds_path != path:
        refds = require_dataset(
            refds_path, check_installed=True,
            purpose='creating a subdataset')

        path_inrefds = path_under_rev_dataset(refds, path)
        if path_inrefds is None:
            yield dict(
                res,
                status='error',
                message=(
                    "dataset containing given paths is not underneath "
                    "the reference dataset %s: %s",
                    dataset, text_type(path)),
            )
            return

    # try to locate an immediate parent dataset
    # we want to know this (irrespective of whether we plan on adding
    # this new dataset to a parent) in order to avoid conflicts with
    # a potentially absent/uninstalled subdataset of the parent
    # in this location
    # it will cost some filesystem traversal though...
    parentds_path = rev_get_dataset_root(
        op.normpath(op.join(text_type(path), os.pardir)))
    if parentds_path:
        prepo = GitRepo(parentds_path)
        parentds_path = ut.Path(parentds_path)
        # we cannot get away with a simple
        # GitRepo.get_content_info(), as we need to detect
        # uninstalled/added subdatasets too
        check_path = ut.Path(path)
        pstatus = prepo.status(
            untracked='no',
            # limit query to target path for a potentially massive speed-up
            paths=[check_path.relative_to(parentds_path)])
        if any(check_path == p or check_path in p.parents
               for p in pstatus):
            # redo the check in a slower fashion, it is already broken
            # let's take our time for a proper error message
            conflict = [
                p for p in pstatus
                if check_path == p or check_path in p.parents]
            res.update({
                'status': 'error',
                'message': (
                    'collision with content in parent dataset at %s: %s',
                    text_type(parentds_path),
                    [text_type(c) for c in conflict])})
            yield res
            return
        # another set of checks to see whether the target path is pointing
        # into a known subdataset that is not around ATM
        subds_status = {
            parentds_path / k.relative_to(prepo.path)
            for k, v in iteritems(pstatus)
            if v.get('type', None) == 'dataset'}
        check_paths = [check_path]
        check_paths.extend(check_path.parents)
        if any(p in subds_status for p in check_paths):
            conflict = [p for p in check_paths if p in subds_status]
            res.update({
                'status': 'error',
                'message': (
                    'collision with %s (dataset) in dataset %s',
                    text_type(conflict[0]),
                    text_type(parentds_path))})
            yield res
            return

    # important to use the given Dataset object to avoid spurious ID
    # changes with not-yet-materialized Datasets
    tbds = dataset if isinstance(dataset, Dataset) and \
        dataset.path == path else Dataset(text_type(path))

    # don't create in non-empty directory without `force`:
    if op.isdir(tbds.path) and listdir(tbds.path) != [] and not force:
        res.update({
            'status': 'error',
            'message':
                'will not create a dataset in a non-empty directory, use '
                '`force` option to ignore'})
        yield res
        return

    # stuff that we create and want to have tracked with git (not annex)
    add_to_git = {}

    if initopts is not None and isinstance(initopts, list):
        initopts = {'_from_cmdline_': initopts}

    # create and configure desired repository
    if no_annex:
        lgr.info("Creating a new git repo at %s", tbds.path)
        tbrepo = GitRepo(
            tbds.path,
            url=None,
            create=True,
            create_sanity_checks=False,
            git_opts=initopts,
            fake_dates=fake_dates)
        # place a .noannex file to indicate annex to leave this repo alone
        stamp_path = ut.Path(tbrepo.path) / '.noannex'
        stamp_path.touch()
        add_to_git[stamp_path] = {
            'type': 'file',
            'state': 'untracked'}
    else:
        # always come with annex when created from scratch
        lgr.info("Creating a new annex repo at %s", tbds.path)
        tbrepo = AnnexRepo(
            tbds.path,
            url=None,
            create=True,
            create_sanity_checks=False,
            # do not set backend here, to avoid a dedicated commit
            backend=None,
            # None causes version to be taken from config
            version=None,
            description=description,
            git_opts=initopts,
            fake_dates=fake_dates
        )
        # set the annex backend in .gitattributes as a staged change
        tbrepo.set_default_backend(
            cfg.obtain('datalad.repo.backend'),
            persistent=True, commit=False)
        add_to_git[tbds.repo.pathobj / '.gitattributes'] = {
            'type': 'file',
            'state': 'added'}
        # make sure that v6 annex repos never commit content under .datalad
        attrs_cfg = (
            ('config', 'annex.largefiles', 'nothing'),
            ('metadata/aggregate*', 'annex.largefiles', 'nothing'),
            ('metadata/objects/**', 'annex.largefiles',
             '({})'.format(cfg.obtain(
                 'datalad.metadata.create-aggregate-annex-limit'))))
        attrs = tbds.repo.get_gitattributes(
            [op.join('.datalad', i[0]) for i in attrs_cfg])
        set_attrs = []
        for p, k, v in attrs_cfg:
            if not attrs.get(
                    op.join('.datalad', p), {}).get(k, None) == v:
                set_attrs.append((p, {k: v}))
        if set_attrs:
            tbds.repo.set_gitattributes(
                set_attrs,
                attrfile=op.join('.datalad', '.gitattributes'))

        # prevent git annex from ever annexing .git* stuff (gh-1597)
        attrs = tbds.repo.get_gitattributes('.git')
        if not attrs.get('.git', {}).get(
                'annex.largefiles', None) == 'nothing':
            tbds.repo.set_gitattributes([
                ('**/.git*', {'annex.largefiles': 'nothing'})])
            # must use the repo.pathobj as this will have resolved symlinks
            add_to_git[tbds.repo.pathobj / '.gitattributes'] = {
                'type': 'file',
                'state': 'untracked'}

    # record an ID for this repo for the afterlife
    # to be able to track siblings and children
    id_var = 'datalad.dataset.id'
    # Note that the Dataset property `id` will change when we unset the
    # respective config. Therefore store it beforehand:
    tbds_id = tbds.id
    if id_var in tbds.config:
        # make sure we reset this variable completely, in case of a
        # re-create
        tbds.config.unset(id_var, where='dataset')

    if _seed is None:
        # just the standard way
        uuid_id = uuid.uuid1().urn.split(':')[-1]
    else:
        # Let's generate preseeded ones
        uuid_id = str(uuid.UUID(int=random.getrandbits(128)))
    tbds.config.add(
        id_var,
        tbds_id if tbds_id is not None else uuid_id,
        where='dataset',
        reload=False)

    # make config overrides permanent in the repo config
    # this is similar to what `annex init` does
    # we are only doing this for config overrides and do not expose
    # a dedicated argument, because it is sufficient for the cmdline
    # and unnecessary for the Python API (there could simply be a
    # subsequent ds.config.add() call)
    for k, v in iteritems(tbds.config.overrides):
        tbds.config.add(k, v, where='local', reload=False)

    # all config manipulation is done -> full reload
    tbds.config.reload()

    # must use the repo.pathobj as this will have resolved symlinks
    add_to_git[tbds.repo.pathobj / '.datalad'] = {
        'type': 'directory',
        'state': 'untracked'}

    # save everything, we need to do this now and cannot merge with the
    # call below, because we may need to add this subdataset to a parent
    # but cannot until we have a first commit
    tbds.repo.save(
        message='[DATALAD] new dataset',
        git=True,
        # we have to supply our own custom status, as the repo does
        # not have a single commit yet and there is no HEAD reference
        # TODO make `GitRepo.status()` robust to this state.
        _status=add_to_git,
    )

    # the next only makes sense if we saved the created dataset,
    # otherwise we have no committed state to be registered
    # in the parent
    if isinstance(dataset, Dataset) and dataset.path != tbds.path:
        # we created a dataset in another dataset
        # -> make submodule
        for r in dataset.save(path=tbds.path):
            yield r

    res.update({'status': 'ok'})
    yield res

    for cfg_proc_ in cfg_proc or []:
        for r in tbds.run_procedure('cfg_' + cfg_proc_):
            yield r
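# The _seed branch above makes dataset IDs reproducible: with a seeded RNG,
# uuid.UUID(int=random.getrandbits(128)) yields the same sequence of IDs on
# every run. A standalone demonstration (the seed value is arbitrary):
import random
import uuid

random.seed(42)
first = str(uuid.UUID(int=random.getrandbits(128)))
random.seed(42)
again = str(uuid.UUID(int=random.getrandbits(128)))
assert first == again  # deterministic given the same seed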