def test_plugin_config(path):
    # baseline behavior, empty datasets on create
    ds = create(dataset=opj(path, 'ds1'))
    eq_(sorted(os.listdir(ds.path)), ['.datalad', '.git', '.gitattributes'])
    # now we configure a plugin to run twice after `create`
    cfg.add('datalad.create.run-after',
            'add_readme filename=after1.txt',
            where='global')
    cfg.add('datalad.create.run-after',
            'add_readme filename=after2.txt',
            where='global')
    # force reload to pick up newly populated .gitconfig
    cfg.reload(force=True)
    assert_in('datalad.create.run-after', cfg)
    # and now we create a dataset and expect the two readme files
    # to be part of it
    ds = create(dataset=opj(path, 'ds'))
    ok_clean_git(ds.path)
    assert exists(opj(ds.path, 'after1.txt'))
    assert exists(opj(ds.path, 'after2.txt'))
    # cleanup
    cfg.unset('datalad.create.run-after', where='global')
    assert_not_in('datalad.create.run-after', cfg)
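
# A minimal sketch of the same `run-after` setup outside of the test harness,
# assuming the module-level `datalad.cfg` configuration manager and the
# `add_readme` plugin call used above; the function name and target path are
# purely illustrative, not part of the original test module.
def _demo_create_with_readmes(target='/tmp/demo_ds'):
    from datalad import cfg
    from datalad.api import create
    # each add() appends another value to the multi-valued key, so both
    # procedures run in order after every subsequent create()
    cfg.add('datalad.create.run-after',
            'add_readme filename=after1.txt', where='global')
    cfg.add('datalad.create.run-after',
            'add_readme filename=after2.txt', where='global')
    try:
        return create(target)  # illustrative path
    finally:
        # drop the global setting again so unrelated create() calls stay clean
        cfg.unset('datalad.create.run-after', where='global')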
def _wrap_with_store_insteadof(*args, **kwargs):
    # `func` is the wrapped test and `dl_cfg` the global configuration
    # manager, both expected from the enclosing decorator's scope
    # (see the sketch below)
    host = args[0]
    base_path = args[1]
    try:
        dl_cfg.set('url.ria+{prot}://{host}{path}.insteadOf'
                   ''.format(prot='ssh' if host else 'file',
                             host=host if host else '',
                             path=base_path),
                   'ria+ssh://test-store:',
                   where='global',
                   reload=True)
        return func(*args, **kwargs)
    finally:
        dl_cfg.unset('url.ria+{prot}://{host}{path}.insteadOf'
                     ''.format(prot='ssh' if host else 'file',
                               host=host if host else '',
                               path=base_path),
                     where='global',
                     reload=True)
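
# The wrapper above is only the inner function of a decorator. A minimal
# sketch of what the enclosing decorator could look like; the name
# `with_store_insteadof` and the `dl_cfg` import alias are assumptions,
# not taken from the excerpt.
from functools import wraps

from datalad import cfg as dl_cfg  # assumed alias for the global config manager


def with_store_insteadof(func):
    """Temporarily rewrite the RIA store URL to 'ria+ssh://test-store:' around a test."""
    @wraps(func)
    def _wrap_with_store_insteadof(*args, **kwargs):
        # body as shown above: set the url.<...>.insteadOf rewrite, run the
        # test, and unset the rewrite again in the finally clause
        ...
    return _wrap_with_store_insteadof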
def _test_create_store(host, ds_path, base_path, clone_path):
    # TODO: This is an issue. We are writing to ~/.gitconfig here. Override
    #       doesn't work, since RIARemote itself (actually git-annex!) doesn't
    #       have access to it, so initremote will still fail.
    #       => at least move cfg.set/unset into a decorator, so it doesn't
    #          remain when a test is failing.
    # TODO: this should be wrapped in a decorator that performs the set/unset
    #       in a try-finally configuration
    cfg.set('url.ria+{prot}://{host}{path}.insteadOf'
            ''.format(prot='ssh' if host else 'file',
                      host=host if host else '',
                      path=base_path),
            'ria+ssh://test-store:',
            where='global')

    ds = Dataset(ds_path).create(force=True)
    subds = ds.create('sub', force=True)
    ds.save(recursive=True)
    assert_repo_status(ds.path)

    # don't specify special remote. By default should be git-remote + "-ria"
    res = ds.create_sibling_ria("ria+ssh://test-store:", "datastore")
    assert_result_count(res, 1, status='ok', action='create-sibling-ria')
    eq_(len(res), 1)

    # remotes exist, but only in super
    siblings = ds.siblings(result_renderer=None)
    eq_({'datastore', 'datastore-ria', 'here'}, {s['name'] for s in siblings})
    sub_siblings = subds.siblings(result_renderer=None)
    eq_({'here'}, {s['name'] for s in sub_siblings})

    # TODO: post-update hook was enabled

    # implicit test of success by ria-installing from store:
    ds.publish(to="datastore", transfer_data='all')
    with chpwd(clone_path):
        if host:
            # note, we are not using the "test-store"-label here
            clone('ria+ssh://{}{}#{}'.format(host, base_path, ds.id),
                  path='test_install')
        else:
            # TODO: Whenever ria+file supports special remote config (label),
            #       change here:
            clone('ria+file://{}#{}'.format(base_path, ds.id),
                  path='test_install')
        installed_ds = Dataset(op.join(clone_path, 'test_install'))
        assert installed_ds.is_installed()
        assert_repo_status(installed_ds.repo)
        eq_(installed_ds.id, ds.id)
        assert_in(op.join('ds', 'file1.txt'),
                  installed_ds.repo.get_annexed_files())
        assert_result_count(installed_ds.get(op.join('ds', 'file1.txt')),
                            1,
                            status='ok',
                            action='get',
                            path=op.join(installed_ds.path, 'ds', 'file1.txt'))

    # now, again but recursive.
    res = ds.create_sibling_ria("ria+ssh://test-store:", "datastore",
                                recursive=True, existing='replace')
    eq_(len(res), 2)
    assert_result_count(res, 2, status='ok', action="create-sibling-ria")

    # remotes now exist in super and sub
    siblings = ds.siblings(result_renderer=None)
    eq_({'datastore', 'datastore-ria', 'here'},
        {s['name'] for s in siblings})
    sub_siblings = subds.siblings(result_renderer=None)
    eq_({'datastore', 'datastore-ria', 'here'},
        {s['name'] for s in sub_siblings})

    cfg.unset('url.ria+{prot}://{host}{path}.insteadOf'
              ''.format(prot='ssh' if host else 'file',
                        host=host if host else '',
                        path=base_path),
              where='global', reload=True)
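
# As the TODOs above suggest, wrapping the test with such a decorator would
# move the global set/unset out of the test body and guarantee cleanup even
# when an assertion fails. A sketch only, reusing the decorator outlined
# earlier; the function name is illustrative and the body is elided.
@with_store_insteadof
def _test_create_store_decorated(host, ds_path, base_path, clone_path):
    # no manual cfg.set()/cfg.unset() needed here: the decorator installs
    # the insteadOf rewrite before the body runs and removes it afterwards
    ds = Dataset(ds_path).create(force=True)
    ...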
def __call__(path=None, is_pipeline=False, is_template=False,
             recursive=False, chdir=None):  # dry_run=False,
    dry_run = False
    from datalad_crawler.pipeline import (
        load_pipeline_from_config, load_pipeline_from_module,
        get_repo_pipeline_config_path, get_repo_pipeline_script_path
    )
    from datalad_crawler.pipeline import run_pipeline
    from datalad.utils import chpwd  # import late so we could mock during tests

    with chpwd(chdir):

        assert not (is_pipeline and is_template), \
            "it is either a pipeline or a template name, can't be both"
        if is_template:
            # generate a config and overload path with its filename
            path = initiate_pipeline_config(template=path,  # kwargs=TODO,
                                            commit=True)

        # TODO: centralize via _params_ handling
        if dry_run:
            dryrun_optlabel = 'datalad.crawl.dryrun'
            if dryrun_optlabel in cfg:
                cfg.unset(dryrun_optlabel, where='local', reload=False)
            cfg.add(dryrun_optlabel, "True", where='local')

        if path is None:

            # get config from the current repository/dataset
            if is_pipeline:
                raise ValueError("You must specify the file if --pipeline")

            # Let's see if there is a config or pipeline in this repo
            path = get_repo_pipeline_config_path()
            if not path or not exists(path):
                # Check if there may be the pipeline provided
                path = get_repo_pipeline_script_path()
                if path and exists(path):
                    is_pipeline = True

        stats = ActivityStats()

        if not path:
            raise RuntimeError("Cannot locate crawler config or pipeline file")

        if is_pipeline:
            lgr.info("Loading pipeline definition from %s" % path)
            pipeline = load_pipeline_from_module(path)
        else:
            lgr.info("Loading pipeline specification from %s" % path)
            pipeline = load_pipeline_from_config(path)

        lgr.info("Running pipeline %s" % str(pipeline))
        # TODO: capture the state of all branches so in case of a crash
        # we could gracefully reset back
        try:
            output = run_pipeline(pipeline, stats=stats)
        except Exception as exc:
            # TODO: config.crawl.failure = full-reset | last-good-master
            # probably ask via ui which action should be performed unless
            # explicitly specified
            raise
        stats.datasets_crawled += 1

        # TODO: Move gc/clean over here!

        stats_total = stats.get_total()

        if recursive:
            # get all subdatasets, and crawl them too!
            ## ? assert path_orig is None, "Otherwise not sure what to do with path=%r in subdatasets" % path
            import os
            from datalad.distribution.dataset import Dataset
            from datalad.api import crawl
            from datalad.utils import swallow_logs
            from datalad.dochelpers import exc_str
            # Note: we could collect all datasets to be crawled here or pass
            # recursive=True into the subdatasets' crawl. We collect all of
            # them here so we might later also introduce automatic commits
            # when the super-dataset got successfully updated
            subdatasets = Dataset(os.curdir).subdatasets(recursive=recursive,
                                                         result_xfm='relpaths')

            lgr.info("Crawling %d subdatasets", len(subdatasets))
            output = [output]
            # TODO: parallelize
            # TODO: this assumes that all sub-datasets are 'crawlable', and if
            # not, just adds them to the crawl_failed count. But maybe we should
            # make it more explicit that some sub-datasets might not need to be
            # crawled, so they get skipped explicitly?
            for ds_ in subdatasets:
                ds_logfile = utils.get_logfilename(ds_, 'crawl')
                try:
                    # TODO: might be nice to report a 'heartbeat' from the
                    # swallowed logs into a progress bar or similar
                    with swallow_logs(file_=ds_logfile) as cml:
                        output_, stats_ = crawl(chdir=ds_)
                        stats_total += stats_
                        output.append(output_)
                    lgr.info("Crawled %s: %s (log: %s)",
                             ds_, stats_.as_str(mode='line'), ds_logfile)
                except Exception as exc:
                    stats_total.datasets_crawl_failed += 1
                    stats_total.datasets_crawled += 1
                    output += [None]
                    lgr.warning("Crawling of %s has failed (more in %s): %s.",  # Log output: %s",
                                ds_, ds_logfile, exc_str(exc))  # , cml.out)

        lgr.info("Total stats: %s", stats_total.as_str(mode='line'))

        return output, stats_total
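
# For reference, a hedged sketch of invoking the command implemented above
# through the Python API once the datalad-crawler extension is installed;
# the function name and dataset path are illustrative.
def _demo_crawl_recursively(superds='/path/to/superdataset'):
    from datalad.api import crawl  # registered by the datalad-crawler extension
    # crawl the pipeline configured in the dataset at `superds` and descend
    # into its subdatasets; returns the pipeline output(s) and accumulated stats
    output, stats = crawl(chdir=superds, recursive=True)
    print(stats.as_str(mode='line'))
    return output, stats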