def test_merge_follow_parentds_subdataset_other_branch(path):
    path = Path(path)
    ds_src = Dataset(path / "source").create()
    on_adjusted = ds_src.repo.is_managed_branch()
    ds_src_subds = ds_src.create("subds")
    ds_clone = install(source=ds_src.path, path=path / "clone",
                       recursive=True, result_xfm="datasets")
    ds_clone_subds = Dataset(ds_clone.pathobj / "subds")

    ds_src_subds.repo.call_git(["checkout", "-b", "other"])
    (ds_src_subds.pathobj / "foo").write_text("foo content")
    ds_src.save(recursive=True)
    assert_repo_status(ds_src.path)

    res = ds_clone.update(merge=True, follow="parentds", recursive=True,
                          on_failure="ignore")
    if on_adjusted:
        # Our git-annex-sync-based approach on adjusted branches is
        # incompatible with follow='parentds'.
        assert_in_results(res, action="update", status="impossible")
        return
    else:
        assert_in_results(res, action="update", status="ok")
    eq_(ds_clone.repo.get_hexsha(), ds_src.repo.get_hexsha())
    ok_(ds_clone_subds.repo.is_under_annex("foo"))

    (ds_src_subds.pathobj / "bar").write_text("bar content")
    ds_src.save(recursive=True)
    ds_clone_subds.repo.checkout(DEFAULT_BRANCH, options=["-bnew"])
    ds_clone.update(merge=True, follow="parentds", recursive=True)
    if not on_adjusted:
        eq_(ds_clone.repo.get_hexsha(), ds_src.repo.get_hexsha())
def _describe_credentials():
    import keyring
    from keyring.util import platform_

    def describe_keyring_backend(be):
        be_repr = repr(be)
        return be.name if 'object at 0' in be_repr else be_repr.strip('<>')

    # might later add information on non-keyring credentials gh-4981
    props = {}
    active_keyring = keyring.get_keyring()
    krp = {
        'config_file': Path(platform_.config_root(), 'keyringrc.cfg'),
        'data_root': platform_.data_root(),
        'active_backends': [
            describe_keyring_backend(be)
            for be in getattr(active_keyring, 'backends', [active_keyring])
        ],
    }
    props.update(
        keyring=krp,
    )
    return props
def test_local_path_target_dir(path):
    path = Path(path)
    ds_main = Dataset(path / "main").create()

    ds_main.create_sibling(name="abspath-targetdir",
                           sshurl=str(path / "a"), target_dir="tdir")
    ok_((path / "a" / "tdir").exists())

    ds_main.create_sibling(name="relpath-bound-targetdir",
                           sshurl=os.path.relpath(str(path / "b"),
                                                  ds_main.path),
                           target_dir="tdir")
    ok_((path / "b" / "tdir").exists())

    with chpwd(path):
        create_sibling(dataset=ds_main.path, name="relpath-unbound-targetdir",
                       sshurl="c", target_dir="tdir")
    ok_((path / "c" / "tdir").exists())

    ds_main.create("subds")

    ds_main.create_sibling(name="rec-plain-targetdir", recursive=True,
                           sshurl=str(path / "d"), target_dir="tdir")
    ok_((path / "d" / "tdir" / "subds").exists())

    ds_main.create_sibling(name="rec-template-targetdir", recursive=True,
                           sshurl=str(path / "e"), target_dir="d%RELNAME")
    ok_((path / "e" / "d").exists())
    ok_((path / "e" / "d-subds").exists())
def check_save_dotfiles(to_git, save_path, path):
    # Note: Take relpath to work with Travis "TMPDIR=/var/tmp/sym\ link" run.
    paths = [Path(op.relpath(op.join(root, fname), path))
             for root, _, fnames in os.walk(op.join(path, save_path or ""))
             for fname in fnames]
    ok_(paths)
    ds = Dataset(path).create(force=True)
    if not to_git and ds.repo.is_managed_branch():
        ver = ds.repo.git_annex_version
        if "8" < ver < "8.20200309":
            # git-annex's 1978a2420 (2020-03-09) fixed a bug where
            # annexed dotfiles could switch when annex.dotfiles=true
            # was not set in .git/config or git-annex:config.log.
            ds.repo.config.set("annex.dotfiles", "true",
                               where="local", reload=True)
        elif ver < "8" and save_path is None:
            raise SkipTest("Fails with annex version below v8.*")
    ds.save(save_path, to_git=to_git)
    if save_path is None:
        assert_repo_status(ds.path)
    repo = ds.repo
    annexinfo = repo.get_content_annexinfo()

    def _check(fn, p):
        fn("key", annexinfo[repo.pathobj / p], p)

    if to_git:
        def check(p):
            _check(assert_not_in, p)
    else:
        def check(p):
            _check(assert_in, p)

    for path in paths:
        check(path)
def test_gitannex(osf_id, dspath):
    from datalad.cmd import (
        GitRunner,
        WitlessRunner
    )
    dspath = Path(dspath)

    ds = Dataset(dspath).create()

    # add remote parameters here
    init_remote_opts = ["project={}".format(osf_id)]

    # add special remote
    init_opts = common_init_opts + init_remote_opts
    ds.repo.init_remote('osfproject', options=init_opts)

    # run git-annex-testremote
    # note, that we don't want to capture output. If something goes wrong we
    # want to see it in test build's output log.
    WitlessRunner(
        cwd=dspath,
        env=GitRunner.get_git_environ_adjusted()).run(
            ['git', 'annex', 'testremote', 'osfproject', "--fast"]
        )
def check_save_dotfiles(to_git, save_path, path):
    # Note: Take relpath to work with Travis "TMPDIR=/var/tmp/sym\ link" run.
    paths = [
        Path(op.relpath(op.join(root, fname), path))
        for root, _, fnames in os.walk(op.join(path, save_path or ""))
        for fname in fnames
    ]
    ok_(paths)
    ds = Dataset(path).create(force=True)
    if not to_git and ds.repo.is_managed_branch():
        if not ds.repo._check_version_kludges("has-include-dotfiles"):
            # FIXME(annex.dotfiles)
            ds.repo.config.set("annex.dotfiles", "true",
                               where="local", reload=True)
    ds.save(save_path, to_git=to_git)
    if save_path is None:
        assert_repo_status(ds.path)
    repo = ds.repo
    annexinfo = repo.get_content_annexinfo()

    def _check(fn, p):
        fn("key", annexinfo[repo.pathobj / p], p)

    if to_git:
        def check(p):
            _check(assert_not_in, p)
    else:
        def check(p):
            _check(assert_in, p)

    for path in paths:
        check(path)
def __init__(self, path):
    # A lock to prevent multiple threads performing write operations in
    # parallel
    self._write_lock = threading.Lock()

    # Note, that the following three path objects are used often and
    # therefore are stored for performance. Path object creation comes with
    # a cost. Most notably, this is used for validity checking of the
    # repository.
    self.pathobj = Path(path)
    self.dot_git = _get_dot_git(self.pathobj, ok_missing=True)
    self._valid_git_test_path = self.dot_git / 'HEAD'

    self._cfg = None
    self._git_runner = GitWitlessRunner(cwd=self.pathobj)

    self.__fake_dates_enabled = None

    # Finally, register a finalizer (instead of having a __del__ method).
    # This will be called by garbage collection as well as "atexit". By
    # keeping the reference here, we can also call it explicitly.
    # Note, that we can pass required attributes to the finalizer, but not
    # `self` itself. That would create an additional reference to the object
    # and thereby prevent it from being collected at all.
    self._finalizer = finalize(self, GitRepo._cleanup, self.pathobj)
def test_diff_rsync_syntax(path):
    # three nested datasets
    ds = Dataset(path).create()
    subds = ds.create('sub')
    subsubds = subds.create(Path('subdir', 'deep'))
    justtop = ds.diff(fr=PRE_INIT_COMMIT_SHA, path='sub',
                      result_renderer=None)
    # we only get a single result, the subdataset in question
    assert_result_count(justtop, 1)
    assert_result_count(justtop, 1, type='dataset', path=subds.path)
    # now with "peek inside the dataset" syntax
    inside = ds.diff(fr=PRE_INIT_COMMIT_SHA, path='sub' + os.sep,
                     result_renderer=None)
    # we get both subdatasets, but nothing else inside the nested one
    assert_result_count(inside, 2, type='dataset')
    assert_result_count(inside, 1, type='dataset', path=subds.path)
    assert_result_count(inside, 1, type='dataset', path=subsubds.path)
    assert_result_count(inside, 0, type='file', parentds=subsubds.path)
    # if we point to the subdir in 'sub' the reporting wrt the subsubds
    # doesn't change. It is merely a path constraint within the queried
    # subds, but because the subsubds is still underneath it, nothing changes
    inside_subdir = ds.diff(fr=PRE_INIT_COMMIT_SHA,
                            path=op.join('sub', 'subdir'),
                            result_renderer=None)
    assert_result_count(inside_subdir, 2, type='dataset')
    assert_result_count(inside_subdir, 1, type='dataset', path=subds.path)
    assert_result_count(inside_subdir, 1, type='dataset', path=subsubds.path)
    assert_result_count(inside_subdir, 0, type='file', parentds=subsubds.path)
    # but the rest is different (e.g. all the stuff in .datalad is gone)
    neq_(inside, inside_subdir)
    # just for completeness, we get more when going full recursive
    rec = ds.diff(fr=PRE_INIT_COMMIT_SHA, recursive=True, path='sub' + os.sep,
                  result_renderer=None)
    assert (len(inside) < len(rec))
def __call__(self, dataset, refcommit, process_type, status):
    # shortcut
    ds = dataset
    repo = ds.repo  # OPT: .repo could be relatively expensive
    if not isinstance(repo, AnnexRepo):
        # nothing to be done
        return

    if process_type not in ('all', 'content'):
        return

    # no progress bar, we are only making a one-shot call to
    # annex, the rest is pretty much instantaneous

    # limit query to paths that are annexed
    query_paths = [
        # go relative to minimize cmdline footprint of annex call
        text_type(Path(s['path']).relative_to(ds.pathobj))
        for s in status
        # anything that looks like an annexed file
        if s.get('type', None) == 'file'
        and s.get('key', None) is not None
    ]

    log_progress(
        lgr.info,
        'extractorannex',
        'Start annex metadata extraction from %s', ds,
        total=len(query_paths),
        label='Annex metadata extraction',
        unit=' Files',
    )
    for fpath, meta in repo.get_metadata(
            query_paths,
            # no timestamps, we are describing the status quo
            timestamps=False,
            # because we have filtered the query to only contain
            # annexed files, we can use batch mode and deal with
            # many files
            batch=True):
        log_progress(
            lgr.info,
            'extractorannex',
            'Extracted annex metadata from %s', fpath,
            update=1,
            increment=True)
        meta = {
            k: v[0] if isinstance(v, list) and len(v) == 1 else v
            for k, v in meta.items()
        }
        if not meta:
            # only talk about files that actually carry metadata
            continue
        yield dict(
            # git-annex reports the path in POSIX conventions
            path=PurePosixPath(fpath),
            metadata=meta,
            type='file',
            status='ok',
        )
    log_progress(
        lgr.info,
        'extractorannex',
        'Finished annex metadata extraction from %s', ds,
    )
def check_merge_follow_parentds_subdataset_detached(on_adjusted, path):
    # Note: For the adjusted case, this is not much more than a smoke test
    # that on an adjusted branch we fail sensibly. The resulting state is not
    # easy to reason about nor desirable.
    path = Path(path)
    # $path/source/s0/s1
    # The additional dataset level is to gain some confidence that this works
    # for nested datasets.
    ds_src = Dataset(path / "source").create()
    if ds_src.repo.is_managed_branch():
        if not on_adjusted:
            raise SkipTest("System only supports adjusted branches. "
                           "Skipping non-adjusted test")
    ds_src_s0 = ds_src.create("s0")
    ds_src_s1 = ds_src_s0.create("s1")
    ds_src.save(recursive=True)
    if on_adjusted:
        # Note: We adjust after creating all the datasets above to avoid a bug
        # fixed in git-annex 7.20191024, specifically bbdeb1a1a (sync: Fix
        # crash when there are submodules and an adjusted branch is checked
        # out, 2019-10-23).
        for ds in [ds_src, ds_src_s0, ds_src_s1]:
            _adjust(ds.repo)
        ds_src.save(recursive=True)
    assert_repo_status(ds_src.path)

    ds_clone = install(source=ds_src.path, path=path / "clone",
                       recursive=True, result_xfm="datasets")
    ds_clone_s1 = Dataset(ds_clone.pathobj / "s0" / "s1")

    ds_src_s1.repo.checkout(DEFAULT_BRANCH + "^0")
    (ds_src_s1.pathobj / "foo").write_text("foo content")
    ds_src.save(recursive=True)
    assert_repo_status(ds_src.path)

    res = ds_clone.update(merge=True, recursive=True, follow="parentds",
                          on_failure="ignore")
    if on_adjusted:
        # The top-level update is okay because there is no parent revision to
        # update to.
        assert_in_results(res, status="ok", path=ds_clone.path,
                          action="update")
        # The subdataset, on the other hand, is impossible.
        assert_in_results(res, status="impossible", path=ds_clone_s1.path,
                          action="update")
        return
    assert_repo_status(ds_clone.path)
    # We brought in the revision and got to the same state as the remote.
    # Blindly saving here without bringing in the current subdataset revision
    # would have resulted in a new commit in ds_clone that reverted the
    # last subdataset ID recorded in ds_src.
    eq_(ds_clone.repo.get_hexsha(), ds_src.repo.get_hexsha())

    # Record a revision in the parent and then move HEAD away from it so that
    # the explicit revision fetch fails.
    (ds_src_s1.pathobj / "bar").write_text("bar content")
    ds_src.save(recursive=True)
    ds_src_s1.repo.checkout(DEFAULT_BRANCH)
    # This is the default, but just in case:
    ds_src_s1.repo.config.set("uploadpack.allowAnySHA1InWant", "false",
                              where="local")
    # Configure the fetcher to use v0 because Git defaults to v2 as of
    # v2.26.0, which allows fetching unadvertised objects regardless
    # of the value of uploadpack.allowAnySHA1InWant.
    ds_clone_s1.repo.config.set("protocol.version", "0", where="local")
    res = ds_clone.update(merge=True, recursive=True, follow="parentds",
                          on_failure="ignore")
    # The fetch with the explicit ref fails because it isn't advertised.
    assert_in_results(res, status="impossible", path=ds_clone_s1.path,
                      action="update")

    # Back to the detached head.
    ds_src_s1.repo.checkout("HEAD@{1}")
    # Set up a case where update() will not resolve the sibling.
    ds_clone_s1.repo.call_git(["branch", "--unset-upstream"])
    ds_clone_s1.config.reload(force=True)
    ds_clone_s1.repo.call_git(["remote", "add", "other", ds_src_s1.path])
    res = ds_clone.update(recursive=True, follow="parentds",
                          on_failure="ignore")
    # In this case, update() won't abort if we call with merge=False, but
    # it does if the revision wasn't brought down in the `fetch(all_=True)`
    # call.
    assert_in_results(res, status="impossible", path=ds_clone_s1.path,
                      action="update")
def _test_bare_git_version_2(host, dspath, store):
    # Similarly to test_bare_git_version_1, this should ensure a bare git repo
    # at the store location for a dataset doesn't conflict with the ORA
    # remote.
    # Note: Usability of git remote by annex depends on dataset layout version
    #       (dirhashlower vs. -mixed).
    #       For version 2 (mixed) upload via ORA and consumption via git
    #       should work. But not the other way around, since git-annex uses
    #       dirhashlower with bare repos.

    ds_path = Path(dspath)
    store = Path(store)
    ds = Dataset(ds_path).create()
    populate_dataset(ds)
    ds.save()

    bare_repo_path, _, _ = get_layout_locations(1, store, ds.id)
    # Use git to make sure the remote end is what git thinks a bare clone of
    # it should look like
    subprocess.run(['git', 'clone', '--bare',
                    quote_cmdlinearg(str(dspath)),
                    quote_cmdlinearg(str(bare_repo_path))
                    ])

    if host:
        url = "ria+ssh://{host}{path}".format(host=host, path=store)
    else:
        url = "ria+{}".format(store.as_uri())
    init_opts = common_init_opts + ['url={}'.format(url)]
    # set up store:
    io = SSHRemoteIO(host) if host else LocalIO()
    create_store(io, store, '1')
    # set up the dataset location, too.
    # Note: Dataset layout version 2 (dirhash mixed):
    create_ds_in_store(io, store, ds.id, '2', '1')

    # Now, let's have the bare repo as a git remote
    git_url = "ssh://{host}{path}".format(host=host, path=bare_repo_path) \
        if host else bare_repo_path.as_uri()
    ds.repo.add_remote('bare-git', git_url)
    ds.repo.enable_remote('bare-git')
    # and the ORA remote in addition:
    ds.repo.init_remote('ora-remote', options=init_opts)
    # upload keys via ORA:
    ds.repo.copy_to('.', 'ora-remote')
    # bare-git doesn't know yet:
    eq_(len(ds.repo.whereis('one.txt')), 2)
    # fsck to make availability known
    assert_status('ok', [annexjson2result(r, ds)
                         for r in ds.repo.fsck(remote='bare-git', fast=True)])
    eq_(len(ds.repo.whereis('one.txt')), 3)
    ds.drop('.')
    eq_(len(ds.repo.whereis('one.txt')), 2)
    # actually consumable via git remote:
    ds.repo.call_annex(['move', 'one.txt', '--from', 'bare-git'])
    eq_(len(ds.repo.whereis('one.txt')), 2)
    # now, move back via git - shouldn't be consumable via ORA
    ds.repo.call_annex(['move', 'one.txt', '--to', 'bare-git'])
    # fsck to make availability known, but there's nothing from POV of ORA:
    fsck_res = [annexjson2result(r, ds)
                for r in ds.repo.fsck(remote='ora-remote', fast=True)]
    assert_result_count(fsck_res, 1,
                        status='error',
                        message='** Based on the location log, one.txt\n'
                                '** was expected to be present, '
                                'but its content is missing.')
    assert_result_count(fsck_res, 1, status='ok')
    eq_(len(fsck_res), 2)
    eq_(len(ds.repo.whereis('one.txt')), 1)
def postclonecfg_ria(ds, props):
    """Configure a dataset freshly cloned from a RIA store"""
    repo = ds.repo
    RIA_REMOTE_NAME = 'origin'  # don't hardcode everywhere

    # RIA uses hashdir mixed, copying data to it via git-annex (if cloned via
    # ssh) would make it see a bare repo and establish a hashdir lower annex
    # object tree.
    # Moreover, we want the ORA remote to receive all data for the store, so
    # its objects could be moved into archives (the main point of a RIA
    # store).
    ds.config.set(
        'remote.{}.annex-ignore'.format(RIA_REMOTE_NAME), 'true',
        where='local')

    # chances are that if this dataset came from a RIA store, its subdatasets
    # may live there too. Place a subdataset source candidate config that
    # makes get probe this RIA store when obtaining subdatasets
    ds.config.set(
        # we use the label 'origin' for this candidate in order to not have to
        # generate a complicated name from the actual source specification.
        # we pick a cost of 200 to sort it before datalad's default candidates
        # for non-RIA URLs, because they prioritize hierarchical layouts that
        # cannot be found in a RIA store
        'datalad.get.subdataset-source-candidate-200origin',
        # use the entire original URL, up to the fragment, plus the dataset ID
        # placeholder, this should make things work with any store setup we
        # support (paths, ports, ...)
        props['source'].split('#', maxsplit=1)[0] + '#{id}',
        where='local')

    # setup publication dependency, if a corresponding special remote exists
    # and was enabled (there could be RIA stores that actually only have
    # repos)
    # make this function be a generator
    ora_remotes = [s for s in ds.siblings('query',
                                          result_renderer='disabled')
                   if s.get('annex-externaltype') == 'ora']
    if not ora_remotes and any(
            r.get('externaltype') == 'ora'
            for r in (repo.get_special_remotes().values()
                      if hasattr(repo, 'get_special_remotes')
                      else [])):
        # No ORA remote was autoenabled, but configuration knows about at
        # least one. Let's check origin's config for datalad.ora-remote.uuid
        # as stored by create-sibling-ria and try enabling that one.
        lgr.debug("Found no autoenabled ORA special remote. Trying to look "
                  "it up in source config ...")

        # First figure whether we cloned via SSH, HTTP or local path and then
        # get that config file the same way:
        config_content = None
        scheme = props['giturl'].split(':', 1)[0]
        if scheme in ['http', 'https']:
            try:
                config_content = download_url(
                    "{}{}config".format(
                        props['giturl'],
                        '/' if not props['giturl'].endswith('/') else ''))
            except DownloadError as e:
                lgr.debug("Failed to get config file from source:\n%s",
                          exc_str(e))
        elif scheme == 'ssh':
            # TODO: switch the following to proper command abstraction:
            # SSHRemoteIO ignores the path part ATM. No remote CWD! (To be
            # changed with command abstractions). So we need to get that part
            # to have a valid path to origin's config file:
            cfg_path = PurePosixPath(URL(props['giturl']).path) / 'config'
            op = SSHRemoteIO(props['giturl'])
            try:
                config_content = op.read_file(cfg_path)
            except RIARemoteError as e:
                lgr.debug("Failed to get config file from source: %s",
                          exc_str(e))
        elif scheme == 'file':
            # TODO: switch the following to proper command abstraction:
            op = LocalIO()
            cfg_path = Path(URL(props['giturl']).localpath) / 'config'
            try:
                config_content = op.read_file(cfg_path)
            except (RIARemoteError, OSError) as e:
                lgr.debug("Failed to get config file from source: %s",
                          exc_str(e))
        else:
            lgr.debug("Unknown URL-Scheme %s in %s. Can handle SSH, HTTP or "
                      "FILE scheme URLs.", scheme, props['source'])

        # And read it
        org_uuid = None
        if config_content:
            # TODO: We might be able to spare the saving to a file.
            #       "git config -f -" is not explicitly documented but happens
            #       to work and would read from stdin. Make sure we know this
            #       works for required git versions and on all platforms.
            with make_tempfile(content=config_content) as cfg_file:
                runner = GitWitlessRunner()
                try:
                    result = runner.run(
                        ['git', 'config', '-f', cfg_file,
                         'datalad.ora-remote.uuid'],
                        protocol=StdOutCapture
                    )
                    org_uuid = result['stdout'].strip()
                except CommandError as e:
                    # doesn't contain what we are looking for
                    lgr.debug("Found no UUID for ORA special remote at "
                              "'%s' (%s)", RIA_REMOTE_NAME, exc_str(e))

        # Now, enable it. If annex-init didn't fail to enable it as stored, we
        # wouldn't end up here, so enable with the store URL as suggested by
        # the URL we cloned from.
        if org_uuid:
            srs = repo.get_special_remotes()
            if org_uuid in srs.keys():
                # TODO: - Double-check autoenable value and only do this when
                #         true?
                #       - What if it still fails? -> Annex shouldn't change
                #         config in that case

                # we only need the store:
                new_url = props['source'].split('#')[0]
                try:
                    repo.enable_remote(srs[org_uuid]['name'],
                                       options=['url={}'.format(new_url)])
                    lgr.info("Reconfigured %s for %s",
                             srs[org_uuid]['name'], new_url)
                    # update ora_remotes for considering publication
                    # dependency below
                    ora_remotes = [
                        s for s in ds.siblings('query',
                                               result_renderer='disabled')
                        if s.get('annex-externaltype', None) == 'ora']
                except CommandError as e:
                    lgr.debug("Failed to reconfigure ORA special remote: %s",
                              exc_str(e))
            else:
                lgr.debug("Unknown ORA special remote uuid at '%s': %s",
                          RIA_REMOTE_NAME, org_uuid)

    if ora_remotes:
        if len(ora_remotes) == 1:
            yield from ds.siblings('configure',
                                   name=RIA_REMOTE_NAME,
                                   publish_depends=ora_remotes[0]['name'],
                                   result_filter=None,
                                   result_renderer='disabled')
        else:
            lgr.warning("Found multiple ORA remotes. Couldn't decide which "
                        "publishing to 'origin' should depend on: %s. "
                        "Consider running 'datalad siblings configure -s "
                        "origin --publish-depends ORAREMOTENAME' to set "
                        "publication dependency manually.",
                        [r['name'] for r in ora_remotes])
def _test_initremote_basic(host, ds_path, store, link):
    ds_path = Path(ds_path)
    store = Path(store)
    link = Path(link)
    ds = Dataset(ds_path).create()
    populate_dataset(ds)
    ds.save()

    if host:
        url = "ria+ssh://{host}{path}".format(host=host, path=store)
    else:
        url = "ria+{}".format(store.as_uri())
    init_opts = common_init_opts + ['url={}'.format(url)]

    # fails on non-existing storage location
    assert_raises(CommandError, ds.repo.init_remote, 'ria-remote',
                  options=init_opts)
    # Doesn't actually create a remote if it fails
    assert_not_in('ria-remote',
                  [cfg['name']
                   for uuid, cfg in ds.repo.get_special_remotes().items()])
    # fails on non-RIA URL
    assert_raises(CommandError, ds.repo.init_remote, 'ria-remote',
                  options=common_init_opts + [
                      'url={}'.format(store.as_uri())])
    # Doesn't actually create a remote if it fails
    assert_not_in('ria-remote',
                  [cfg['name']
                   for uuid, cfg in ds.repo.get_special_remotes().items()])

    # set up store:
    io = SSHRemoteIO(host) if host else LocalIO()
    create_store(io, store, '1')
    # still fails, since ds isn't setup in the store
    assert_raises(CommandError, ds.repo.init_remote, 'ria-remote',
                  options=init_opts)
    # Doesn't actually create a remote if it fails
    assert_not_in('ria-remote',
                  [cfg['name']
                   for uuid, cfg in ds.repo.get_special_remotes().items()])
    # set up the dataset as well
    create_ds_in_store(io, store, ds.id, '2', '1')
    # now should work
    ds.repo.init_remote('ria-remote', options=init_opts)
    assert_in('ria-remote',
              [cfg['name']
               for uuid, cfg in ds.repo.get_special_remotes().items()])
    assert_repo_status(ds.path)
    # git-annex:remote.log should have:
    # - url
    # - common_init_opts
    # - archive_id (which equals ds id)
    remote_log = ds.repo.call_git(['cat-file', 'blob',
                                   'git-annex:remote.log'],
                                  read_only=True)
    assert_in("url={}".format(url), remote_log)
    [assert_in(c, remote_log) for c in common_init_opts]
    assert_in("archive-id={}".format(ds.id), remote_log)

    # re-configure with invalid URL should fail:
    assert_raises(
        CommandError,
        ds.repo.call_annex,
        ['enableremote', 'ria-remote'] + common_init_opts + [
            'url=ria+file:///non-existing'])
    # but re-configure with valid URL should work
    if has_symlink_capability():
        link.symlink_to(store)
        new_url = 'ria+{}'.format(link.as_uri())
        ds.repo.call_annex(
            ['enableremote', 'ria-remote'] + common_init_opts + [
                'url={}'.format(new_url)])
        # git-annex:remote.log should have:
        # - url
        # - common_init_opts
        # - archive_id (which equals ds id)
        remote_log = ds.repo.call_git(['cat-file', 'blob',
                                       'git-annex:remote.log'],
                                      read_only=True)
        assert_in("url={}".format(new_url), remote_log)
        [assert_in(c, remote_log) for c in common_init_opts]
        assert_in("archive-id={}".format(ds.id), remote_log)

    # we can deal with --sameas, which leads to a special remote not having a
    # 'name' property, but only a 'sameas-name'. See gh-4259
    try:
        ds.repo.init_remote('ora2',
                            options=init_opts + ['--sameas', 'ria-remote'])
    except CommandError as e:
        if 'Invalid option `--sameas' in e.stderr:
            # annex too old - doesn't know --sameas
            pass
        else:
            raise
def _test_remote_layout(host, dspath, store, archiv_store):
    dspath = Path(dspath)
    store = Path(store)
    archiv_store = Path(archiv_store)
    ds = Dataset(dspath).create()
    populate_dataset(ds)
    ds.save()
    assert_repo_status(ds.path)

    # set up store:
    io = SSHRemoteIO(host) if host else LocalIO()
    if host:
        store_url = "ria+ssh://{host}{path}".format(host=host, path=store)
        arch_url = "ria+ssh://{host}{path}".format(host=host,
                                                   path=archiv_store)
    else:
        store_url = "ria+{}".format(store.as_uri())
        arch_url = "ria+{}".format(archiv_store.as_uri())

    create_store(io, store, '1')

    # TODO: Re-establish test for version 1
    # version 2: dirhash
    create_ds_in_store(io, store, ds.id, '2', '1')

    # add special remote
    init_opts = common_init_opts + ['url={}'.format(store_url)]
    ds.repo.init_remote('store', options=init_opts)

    # copy files into the RIA store
    ds.repo.copy_to('.', 'store')

    # we should see the exact same annex object tree
    dsgit_dir, archive_dir, dsobj_dir = \
        get_layout_locations(1, store, ds.id)
    store_objects = get_all_files(dsobj_dir)
    local_objects = get_all_files(ds.pathobj / '.git' / 'annex' / 'objects')
    assert_equal(len(store_objects), 2)

    if not ds.repo.is_managed_branch():
        # with managed branches the local repo uses hashdirlower instead
        # TODO: However, with dataset layout version 1 this should therefore
        #       work on adjusted branch the same way
        # TODO: Wonder whether export-archive-ora should account for that and
        #       rehash according to target layout.
        assert_equal(sorted([p for p in store_objects]),
                     sorted([p for p in local_objects]))

    if not io.get_7z():
        raise SkipTest("No 7z available in RIA store")

    # we can simply pack up the content of the remote into a
    # 7z archive and place it in the right location to get a functional
    # archive remote
    create_store(io, archiv_store, '1')
    create_ds_in_store(io, archiv_store, ds.id, '2', '1')

    whereis = ds.repo.whereis('one.txt')
    dsgit_dir, archive_dir, dsobj_dir = \
        get_layout_locations(1, archiv_store, ds.id)
    ds.export_archive_ora(archive_dir / 'archive.7z')
    init_opts = common_init_opts + ['url={}'.format(arch_url)]
    ds.repo.init_remote('archive', options=init_opts)
    # now fsck the new remote to get the new special remote indexed
    ds.repo.fsck(remote='archive', fast=True)
    assert_equal(len(ds.repo.whereis('one.txt')), len(whereis) + 1)
def _create_sibling_ria(
        ds,
        url,
        name,
        ria_remote,
        ria_remote_name,
        existing,
        shared,
        group,
        post_update_hook,
        res_kwargs):
    # be safe across datasets
    res_kwargs = res_kwargs.copy()

    # parse target URL
    try:
        ssh_host, base_path = verify_ria_url(url, ds.config)
    except ValueError as e:
        yield get_status_dict(
            status='error',
            message=str(e),
            **res_kwargs)
        return

    base_path = Path(base_path)

    git_url = decode_source_spec(
        # append dataset id to url and use magic from clone-helper:
        url + '#{}'.format(ds.id),
        cfg=ds.config)['giturl']

    # go for a v1 layout
    repo_path, _, _ = get_layout_locations(1, base_path, ds.id)

    ds_siblings = [r['name'] for r in ds.siblings(result_renderer=None)]
    # Figure whether we are supposed to skip this very dataset
    if existing == 'skip' and (
            name in ds_siblings or (
                ria_remote_name and ria_remote_name in ds_siblings)):
        yield get_status_dict(
            status='notneeded',
            message="Skipped on existing sibling",
            **res_kwargs)
        # if we skip here, nothing else can change that decision further
        # down
        return

    # we might learn that some processing (remote repo creation) is
    # not desired
    skip = False

    lgr.info("create sibling{} '{}'{} ...".format(
        's' if ria_remote_name else '',
        name,
        " and '{}'".format(ria_remote_name) if ria_remote_name else '',
    ))
    if ssh_host:
        from datalad import ssh_manager
        ssh = ssh_manager.get_connection(
            ssh_host,
            use_remote_annex_bundle=False)
        ssh.open()

    # determine layout locations
    if ria_remote:
        lgr.debug('init special remote {}'.format(ria_remote_name))
        ria_remote_options = ['type=external',
                              'externaltype=ria',
                              'encryption=none',
                              'autoenable=true',
                              'url={}'.format(url)]
        try:
            ds.repo.init_remote(
                ria_remote_name,
                options=ria_remote_options)
        except CommandError as e:
            if existing in ['replace', 'reconfigure'] \
                    and 'git-annex: There is already a special remote' \
                    in e.stderr:
                # run enableremote instead
                lgr.debug(
                    "special remote '%s' already exists. "
                    "Run enableremote instead.",
                    ria_remote_name)
                # TODO: Use AnnexRepo.enable_remote (which needs to get
                #       `options` first)
                cmd = [
                    'git',
                    'annex',
                    'enableremote',
                    ria_remote_name] + ria_remote_options
                subprocess.run(cmd, cwd=quote_cmdlinearg(ds.repo.path))
            else:
                yield get_status_dict(
                    status='error',
                    message="initremote failed.\nstdout: %s\nstderr: %s"
                    % (e.stdout, e.stderr),
                    **res_kwargs)
                return

        # 1. create remote object store:
        # Note: All it actually takes is to trigger the special
        # remote's `prepare` method once.
        # ATM trying to achieve that by invoking a minimal fsck.
        # TODO: - It's probably faster to actually talk to the special
        #         remote (i.e. pretending to be annex and use
        #         the protocol to send PREPARE)
        #       - Alternatively we can create the remote directory and
        #         ria version file directly, but this means
        #         code duplication that then needs to be kept in sync
        #         with ria-remote implementation.
        #       - this leads to the third option: Have that creation
        #         routine importable and callable from
        #         ria-remote package without the need to actually
        #         instantiate a RIARemote object
        lgr.debug("initializing object store")
        ds.repo.fsck(
            remote=ria_remote_name,
            fast=True,
            annex_options=['--exclude=*/*'])
    else:
        # with no special remote we currently need to create the
        # required directories
        # TODO: This should be cleaner once we have access to the
        #       special remote's RemoteIO classes without
        #       talking via annex
        if ssh_host:
            try:
                stdout, stderr = ssh('test -e {repo}'.format(
                    repo=quote_cmdlinearg(str(repo_path))))
                exists = True
            except CommandError as e:
                exists = False
            if exists:
                if existing == 'skip':
                    # 1. not rendered by default
                    # 2. message doesn't show up in ultimate result
                    #    record as shown by -f json_pp
                    yield get_status_dict(
                        status='notneeded',
                        message="Skipped on existing remote "
                                "directory {}".format(repo_path),
                        **res_kwargs)
                    skip = True
                elif existing in ['error', 'reconfigure']:
                    yield get_status_dict(
                        status='error',
                        message="remote directory {} already "
                                "exists.".format(repo_path),
                        **res_kwargs)
                    return
                elif existing == 'replace':
                    ssh('chmod u+w -R {}'.format(
                        quote_cmdlinearg(str(repo_path))))
                    ssh('rm -rf {}'.format(quote_cmdlinearg(str(repo_path))))
            if not skip:
                ssh('mkdir -p {}'.format(quote_cmdlinearg(str(repo_path))))
        else:
            if repo_path.exists():
                if existing == 'skip':
                    skip = True
                elif existing in ['error', 'reconfigure']:
                    yield get_status_dict(
                        status='error',
                        message="remote directory {} already "
                                "exists.".format(repo_path),
                        **res_kwargs)
                    return
                elif existing == 'replace':
                    rmtree(repo_path)
            if not skip:
                repo_path.mkdir(parents=True)

    # Note, that this could have changed since last tested due to existing
    # remote dir
    if skip:
        return

    # 2. create a bare repository in-store:

    lgr.debug("init bare repository")
    # TODO: we should prob. check whether it's there already. How?
    # Note: like the special remote itself, we assume local FS if no
    # SSH host is specified
    disabled_hook = repo_path / 'hooks' / 'post-update.sample'
    enabled_hook = repo_path / 'hooks' / 'post-update'
    if group:
        chgrp_cmd = "chgrp -R {} {}".format(quote_cmdlinearg(str(group)),
                                            quote_cmdlinearg(str(repo_path)))
    if ssh_host:
        ssh('cd {rootdir} && git init --bare{shared}'.format(
            rootdir=quote_cmdlinearg(str(repo_path)),
            shared=" --shared='{}'".format(quote_cmdlinearg(shared))
            if shared else ''))
        if post_update_hook:
            ssh('mv {} {}'.format(quote_cmdlinearg(str(disabled_hook)),
                                  quote_cmdlinearg(str(enabled_hook))))

        if group:
            # Either repository existed before or a new directory was
            # created for it, set its group to a desired one if it was
            # provided with the same chgrp
            ssh(chgrp_cmd)
    else:
        GitRepo(repo_path, create=True, bare=True,
                shared=" --shared='{}'".format(quote_cmdlinearg(shared))
                if shared else None)
        if post_update_hook:
            disabled_hook.rename(enabled_hook)
        if group:
            # TODO; do we need a cwd here?
            subprocess.run(chgrp_cmd, cwd=quote_cmdlinearg(ds.path))

    # add a git remote to the bare repository
    # Note: needs annex-ignore! Otherwise we might push into default
    # annex/object tree instead of directory type tree with dirhash
    # lower. This in turn would be an issue, if we want to pack the
    # entire thing into an archive. Special remote will then not be
    # able to access content in the "wrong" place within the archive
    lgr.debug("set up git remote")
    # TODO:
    # - This siblings call results in "[WARNING] Failed to determine
    #   if datastore carries annex."
    #   (see https://github.com/datalad/datalad/issues/4028)
    #   => for now have annex-ignore configured before. Evtl. Allow
    #      configure/add to include that option
    # - additionally there's
    #   https://github.com/datalad/datalad/issues/3989,
    #   where datalad-siblings might hang forever
    if name in ds_siblings:
        # otherwise we should have skipped or failed before
        assert existing in ['replace', 'reconfigure']
    ds.config.set("remote.{}.annex-ignore".format(name), value="true",
                  where="local")
    ds.siblings(
        'configure',
        name=name,
        url=git_url if ssh_host else str(repo_path),
        recursive=False,
        # Note, that this should be None if ria_remote was not set
        publish_depends=ria_remote_name,
        result_renderer=None,
        # Note, that otherwise a subsequent publish will report
        # "notneeded".
        fetch=True)

    yield get_status_dict(
        status='ok',
        **res_kwargs,
    )
def ctrl_path(self):
    with open(bogus_socket, "w") as f:
        f.write("whatever")
    return Path(bogus_socket)
def __call__(*, dataset=None, what=None, dry_run=False,
             recursive=False, recursion_limit=None):
    ds = require_dataset(dataset,
                         purpose="report on cleanable locations"
                         if dry_run else "clean dataset")
    res_kwargs = dict(action='clean [dry-run]' if dry_run else 'clean',
                      logger=lgr, refds=ds.path)
    for wds in itertools.chain(
            [ds],
            ds.subdatasets(
                state='present',
                recursive=recursive,
                recursion_limit=recursion_limit,
                return_type='generator',
                result_renderer='disabled',
                result_xfm='datasets') if recursive else []):
        d = wds.pathobj
        gitdir = wds.repo.dot_git
        DIRS_PLURAL = ("directory", "directories")
        FILES_PLURAL = ("file", "files")
        discover_or_remove = "Discovered" if dry_run else "Removed"

        for dirpath, flag, msg, sing_pl in [
                (Path(ARCHIVES_TEMP_DIR), "cached-archives",
                 "temporary archive", DIRS_PLURAL),
                (Path(ANNEX_TEMP_DIR), "annex-tmp",
                 "temporary annex", FILES_PLURAL),
                (Path(ANNEX_TRANSFER_DIR), "annex-transfer",
                 "annex temporary transfer", DIRS_PLURAL),
                (gitdir / Path(SEARCH_INDEX_DOTGITDIR), 'search-index',
                 "metadata search index", FILES_PLURAL),
        ]:
            topdir = wds.pathobj / dirpath
            lgr.debug("Considering to clean %s:%s", d, dirpath)
            if not ((what is None) or (flag in what)):
                yield get_status_dict(
                    path=str(topdir), status='notneeded', type='directory',
                    **res_kwargs)
                continue
            paths = [p for p in topdir.glob('*')]
            if not paths:
                if not topdir.exists():
                    yield get_status_dict(
                        path=str(topdir), status='notneeded',
                        type='directory', **res_kwargs)
                    continue
                else:
                    # we empty topdir only
                    message = ("%s empty %s directory",
                               discover_or_remove, msg)
            else:
                pl = len(paths) > 1
                message = ("%s %d %s %s: %s",
                           discover_or_remove, len(paths), msg,
                           sing_pl[int(pl)],
                           ", ".join(sorted(
                               [str(p.relative_to(topdir))
                                for p in paths if p != topdir])))
            if not dry_run:
                rmtree(str(topdir))
            yield get_status_dict(
                path=str(topdir), status='ok', type='directory',
                message=message, **res_kwargs)
def test_pathlib_unicode():
    eq_(str(Path("a")), u"a")
    eq_(str(Path(u"β")), u"β")
def _test_version_check(host, dspath, store):
    dspath = Path(dspath)
    store = Path(store)

    ds = Dataset(dspath).create()
    populate_dataset(ds)
    ds.save()
    assert_repo_status(ds.path)

    # set up store:
    io = SSHRemoteIO(host) if host else LocalIO()
    if host:
        store_url = "ria+ssh://{host}{path}".format(host=host, path=store)
    else:
        store_url = "ria+{}".format(store.as_uri())

    create_store(io, store, '1')

    # TODO: Re-establish test for version 1
    # version 2: dirhash
    create_ds_in_store(io, store, ds.id, '2', '1')
    # add special remote
    init_opts = common_init_opts + ['url={}'.format(store_url)]
    ds.repo.init_remote('store', options=init_opts)
    ds.repo.copy_to('.', 'store')

    # check version files
    remote_ds_tree_version_file = store / 'ria-layout-version'
    dsgit_dir, archive_dir, dsobj_dir = \
        get_layout_locations(1, store, ds.id)
    remote_obj_tree_version_file = dsgit_dir / 'ria-layout-version'

    assert_true(remote_ds_tree_version_file.exists())
    assert_true(remote_obj_tree_version_file.exists())

    with open(str(remote_ds_tree_version_file), 'r') as f:
        assert_equal(f.read().strip(), '1')
    with open(str(remote_obj_tree_version_file), 'r') as f:
        assert_equal(f.read().strip(), '2')

    # Accessing the remote should not yield any output regarding versioning,
    # since it's the "correct" version. Note that "fsck" is an arbitrary
    # choice. We need just something to talk to the special remote.
    with swallow_logs(new_level=logging.INFO) as cml:
        ds.repo.fsck(remote='store', fast=True)
        # TODO: For some reason didn't get cml.assert_logged to assert
        #       "nothing was logged"
        assert not cml.out

    # Now fake-change the version
    with open(str(remote_obj_tree_version_file), 'w') as f:
        f.write('X\n')

    # Now we should see a message about it
    with swallow_logs(new_level=logging.INFO) as cml:
        ds.repo.fsck(remote='store', fast=True)
        cml.assert_logged(level="INFO",
                          msg="Remote object tree reports version X",
                          regex=False)

    # reading still works:
    ds.drop('.')
    assert_status('ok', ds.get('.'))

    # but writing doesn't:
    with open(str(Path(ds.path) / 'new_file'), 'w') as f:
        f.write("arbitrary addition")
    ds.save(message="Add a new_file")

    # TODO: use self.annex.error in special remote and see whether we get an
    #       actual error result
    assert_raises(CommandError, ds.repo.copy_to, 'new_file', 'store')

    # However, we can force it by configuration
    ds.config.add("annex.ora-remote.store.force-write", "true",
                  where='local')
    ds.repo.copy_to('new_file', 'store')
def __call__(url,
             name,
             dataset=None,
             storage_name=None,
             post_update_hook=False,
             shared=None,
             group=None,
             storage_sibling=True,
             existing='error',
             trust_level=None,
             recursive=False,
             recursion_limit=None,
             disable_storage__=None,
             ):
    if disable_storage__ is not None:
        import warnings
        warnings.warn("datalad-create-sibling-ria --no-storage-sibling "
                      "is deprecated, use --storage-sibling off instead.",
                      DeprecationWarning)
        # recode to new setup
        disable_storage__ = None
        storage_sibling = False

    if storage_sibling == 'only' and storage_name:
        lgr.warning(
            "Sibling name will be used for storage sibling in "
            "storage-sibling-only mode, but a storage sibling name "
            "was provided"
        )

    ds = require_dataset(
        dataset, check_installed=True, purpose='create sibling RIA')
    res_kwargs = dict(
        ds=ds,
        action="create-sibling-ria",
        logger=lgr,
    )

    # parse target URL
    try:
        ssh_host, base_path, rewritten_url = verify_ria_url(url, ds.config)
    except ValueError as e:
        yield get_status_dict(
            status='error',
            message=str(e),
            **res_kwargs
        )
        return

    if ds.repo.get_hexsha() is None or ds.id is None:
        raise RuntimeError(
            "Repository at {} is not a DataLad dataset, "
            "run 'datalad create [--force]' first.".format(ds.path))

    if not storage_sibling and storage_name:
        lgr.warning(
            "Storage sibling setup disabled, but a storage sibling name "
            "was provided"
        )

    if storage_sibling and not storage_name:
        storage_name = "{}-storage".format(name)

    if storage_sibling and name == storage_name:
        # leads to unresolvable, circular dependency with publish-depends
        raise ValueError("sibling names must not be equal")

    if not isinstance(url, str):
        raise TypeError("url is not a string, but %s" % type(url))

    # Query existing siblings upfront in order to fail early on
    # existing=='error', since misconfiguration (particularly of special
    # remotes) that only fails in a subdataset later on with that config
    # can be quite painful.
    # TODO: messages - this is "create-sibling". Don't confuse existence of
    #       local remotes with existence of the actual remote sibling
    #       in wording
    if existing == 'error':
        # in recursive mode this check could take a substantial amount of
        # time: employ a progress bar (or rather a counter, because we don't
        # know the total in advance)
        pbar_id = 'check-siblings-{}'.format(id(ds))
        log_progress(
            lgr.info, pbar_id,
            'Start checking pre-existing sibling configuration %s', ds,
            label='Query siblings',
            unit=' Siblings',
        )
        # even if we have to fail, let's report all conflicting siblings
        # in subdatasets
        failed = False
        for r in ds.siblings(result_renderer=None,
                             recursive=recursive,
                             recursion_limit=recursion_limit):
            log_progress(
                lgr.info, pbar_id,
                'Discovered sibling %s in dataset at %s',
                r['name'], r['path'],
                update=1,
                increment=True)
            if not r['type'] == 'sibling' or r['status'] != 'ok':
                # this is an internal status query that has no consequence
                # for the outside world. Be silent unless something useful
                # can be said
                #yield r
                continue
            if r['name'] == name:
                res = get_status_dict(
                    status='error',
                    message="a sibling '{}' is already configured in "
                    "dataset {}".format(name, r['path']),
                    **res_kwargs,
                )
                failed = True
                yield res
                continue
            if storage_name and r['name'] == storage_name:
                res = get_status_dict(
                    status='error',
                    message="a sibling '{}' is already configured in "
                    "dataset {}".format(storage_name, r['path']),
                    **res_kwargs,
                )
                failed = True
                yield res
                continue
        log_progress(
            lgr.info, pbar_id,
            'Finished checking pre-existing sibling configuration %s', ds,
        )
        if failed:
            return

    # TODO: - URL parsing + store creation needs to be RF'ed based on
    #         command abstractions
    #       - more generally consider store creation a dedicated command or
    #         option

    # Note: URL parsing is done twice ATM (for top-level ds). This can't be
    # reduced to a single instance, since rewriting the url based on config
    # could be different for subdatasets.
    create_store(SSHRemoteIO(ssh_host) if ssh_host else LocalIO(),
                 Path(base_path),
                 '1')

    yield from _create_sibling_ria(
        ds,
        url,
        name,
        storage_sibling,
        storage_name,
        existing,
        shared,
        group,
        post_update_hook,
        trust_level,
        res_kwargs)

    if recursive:
        # Note: subdatasets can be treated independently, so go full
        # recursion when querying for them and _no_recursion with the
        # actual call. Theoretically this can be parallelized.
        for subds in ds.subdatasets(fulfilled=True,
                                    recursive=True,
                                    recursion_limit=recursion_limit,
                                    result_xfm='datasets'):
            yield from _create_sibling_ria(
                subds,
                url,
                name,
                storage_sibling,
                storage_name,
                existing,
                shared,
                group,
                post_update_hook,
                trust_level,
                res_kwargs)
def clone_dataset(
        srcs,
        destds,
        reckless=None,
        description=None,
        result_props=None,
        cfg=None):
    """Internal helper to perform cloning without sanity checks (assumed done)

    This helper does not handle any saving of subdataset modification or
    adding in a superdataset.

    Parameters
    ----------
    srcs : list
      Any suitable clone source specifications (paths, URLs)
    destds : Dataset
      Dataset instance for the clone destination
    reckless : {None, 'auto', 'ephemeral', 'shared-...'}, optional
      Mode switch to put cloned dataset into unsafe/throw-away configurations,
      i.e. sacrifice data safety for performance or resource footprint. When
      None and `cfg` is specified, use the value of `datalad.clone.reckless`.
    description : str, optional
      Location description for the annex of the dataset clone (if there is
      any).
    result_props : dict, optional
      Default properties for any yielded result, passed on to
      get_status_dict().
    cfg : ConfigManager, optional
      Configuration for parent dataset. This will be queried instead of the
      global DataLad configuration.

    Yields
    ------
    dict
      DataLad result records
    """
    if not result_props:
        # in case the caller had no specific idea on how results should look
        # like, provide sensible defaults
        result_props = dict(
            action='install',
            logger=lgr,
            ds=destds,
        )

    if reckless is None and cfg:
        # if reckless is not explicitly given, but we operate on a
        # superdataset, query whether it has been instructed to operate
        # in a reckless mode, and inherit it for the coming clone
        reckless = cfg.get('datalad.clone.reckless', None)

    dest_path = destds.pathobj

    # decode all source candidate specifications
    candidate_sources = [decode_source_spec(s, cfg=cfg) for s in srcs]

    # now expand the candidate sources with additional variants of the decoded
    # giturl, while duplicating the other properties in the additional records
    # for simplicity. The hope is to overcome a few corner cases and be more
    # robust than git clone
    candidate_sources = [
        dict(props, giturl=s) for props in candidate_sources
        for s in _get_flexible_source_candidates(props['giturl'])
    ]

    # important test! based on this `rmtree` will happen below after a failed
    # clone
    dest_path_existed = dest_path.exists()
    if dest_path_existed and any(dest_path.iterdir()):
        if destds.is_installed():
            # check if dest was cloned from the given source before
            # this is where we would have installed this from
            # this is where it was actually installed from
            track_name, track_url = _get_tracking_source(destds)
            try:
                # this will get us track_url in system native path conventions,
                # whenever it is a path (and not a URL)
                # this is needed to match it to any potentially incoming local
                # source path in the 'notneeded' test below
                track_path = str(Path(track_url))
            except Exception:
                # this should never happen, because Path() will let any
                # non-path stringification pass through unmodified, but we do
                # not want any potential crash due to pathlib behavior changes
                lgr.debug("Unexpected behavior of pathlib!")
                track_path = None
            for cand in candidate_sources:
                src = cand['giturl']
                if track_url == src \
                        or (not is_url(track_url)
                            and get_local_file_url(
                                track_url, compatibility='git') == src) \
                        or track_path == expanduser(src):
                    yield get_status_dict(
                        status='notneeded',
                        message=("dataset %s was already cloned from '%s'",
                                 destds,
                                 src),
                        **result_props)
                    return
        # anything else is an error
        yield get_status_dict(
            status='error',
            message='target path already exists and not empty, refuse to '
                    'clone into target path',
            **result_props)
        return

    log_progress(
        lgr.info,
        'cloneds',
        'Cloning dataset to %s', destds,
        total=len(candidate_sources),
        label='Clone attempt',
        unit=' Candidate locations',
    )
    # accumulate all error messages formatted per each url
    error_msgs = OrderedDict()
    for cand in candidate_sources:
        log_progress(
            lgr.info,
            'cloneds',
            'Attempting to clone from %s to %s', cand['giturl'], dest_path,
            update=1,
            increment=True)

        clone_opts = {}

        if cand.get('version', None):
            clone_opts['branch'] = cand['version']
        try:
            # TODO for now GitRepo.clone() cannot handle Path instances, and
            # PY35 doesn't make it happen seamlessly
            GitRepo.clone(
                path=str(dest_path),
                url=cand['giturl'],
                clone_options=clone_opts,
                create=True)
        except CommandError as e:
            e_stderr = e.stderr

            error_msgs[cand['giturl']] = e
            lgr.debug("Failed to clone from URL: %s (%s)",
                      cand['giturl'], exc_str(e))
            if dest_path.exists():
                lgr.debug("Wiping out unsuccessful clone attempt at: %s",
                          dest_path)
                # We must not just rmtree since it might be curdir etc
                # we should remove all files/directories under it
                # TODO stringification can be removed once pathlib compatible
                # or if PY35 is no longer supported
                rmtree(str(dest_path), children_only=dest_path_existed)

            if e_stderr and 'could not create work tree' in e_stderr.lower():
                # this cannot be fixed by trying another URL
                re_match = re.match(r".*fatal: (.*)$", e_stderr,
                                    flags=re.MULTILINE | re.DOTALL)
                # cancel progress bar
                log_progress(
                    lgr.info,
                    'cloneds',
                    'Completed clone attempts for %s', destds
                )
                yield get_status_dict(
                    status='error',
                    message=re_match.group(1).strip()
                    if re_match else "stderr: " + e_stderr,
                    **result_props)
                return
            # next candidate
            continue

        result_props['source'] = cand
        # do not bother with other sources if succeeded
        break

    log_progress(
        lgr.info,
        'cloneds',
        'Completed clone attempts for %s', destds
    )

    if not destds.is_installed():
        if len(error_msgs):
            if all(not e.stdout and not e.stderr
                   for e in error_msgs.values()):
                # there is nothing we can learn from the actual exception,
                # the exit code is uninformative, the command is predictable
                error_msg = "Failed to clone from all attempted sources: %s"
                error_args = list(error_msgs.keys())
            else:
                error_msg = "Failed to clone from any candidate source URL. " \
                            "Encountered errors per each url were:\n- %s"
                error_args = '\n- '.join(
                    '{}\n  {}'.format(url, exc_str(exc))
                    for url, exc in error_msgs.items()
                )
        else:
            # yoh: Not sure if we ever get here but I felt that there could
            #      be a case when this might happen and original error would
            #      not be sufficient to troubleshoot what is going on.
            error_msg = "Awkward error -- we failed to clone properly. " \
                        "Although no errors were encountered, target " \
                        "dataset at %s seems to be not fully installed. " \
                        "The 'successful' source was: %s"
            error_args = (destds.path, cand['giturl'])
        yield get_status_dict(
            status='error',
            message=(error_msg, error_args),
            **result_props)
        return

    if not cand.get("version"):
        postclone_check_head(destds)

    # act on --reckless=shared-...
    # must happen prior git-annex-init, where we can cheaply alter the repo
    # setup through safe re-init'ing
    if reckless and reckless.startswith('shared-'):
        lgr.debug('Reinit %s to enable shared access permissions', destds)
        destds.repo.call_git(['init', '--shared={}'.format(reckless[7:])])

    yield from postclonecfg_annexdataset(
        destds,
        reckless,
        description)

    # perform any post-processing that needs to know details of the clone
    # source
    if result_props['source']['type'] == 'ria':
        yield from postclonecfg_ria(destds, result_props['source'])

    if reckless:
        # store the reckless setting in the dataset to make it
        # known to later clones of subdatasets via get()
        destds.config.set(
            'datalad.clone.reckless', reckless,
            where='local',
            reload=True)

    # yield successful clone of the base dataset now, as any possible
    # subdataset clone down below will not alter the Git-state of the
    # parent
    yield get_status_dict(status='ok', **result_props)
def _create_sibling_ria(
        ds,
        url,
        name,
        storage_sibling,
        storage_name,
        existing,
        shared,
        group,
        post_update_hook,
        trust_level,
        res_kwargs):
    # be safe across datasets
    res_kwargs = res_kwargs.copy()
    # update dataset
    res_kwargs['ds'] = ds

    if not isinstance(ds.repo, AnnexRepo):
        # No point in dealing with a special remote when there's no annex.
        # Note, that in recursive invocations this might only apply to some of
        # the datasets. Therefore dealing with it here rather than one level
        # up.
        lgr.debug("No annex at %s. Ignoring special remote options.", ds.path)
        storage_sibling = False
        storage_name = None

    # parse target URL
    try:
        ssh_host, base_path, rewritten_url = verify_ria_url(url, ds.config)
    except ValueError as e:
        yield get_status_dict(
            status='error',
            message=str(e),
            **res_kwargs
        )
        return

    base_path = Path(base_path)

    git_url = decode_source_spec(
        # append dataset id to url and use magic from clone-helper:
        url + '#{}'.format(ds.id),
        cfg=ds.config
    )['giturl']
    # determine layout locations; go for a v1 layout
    repo_path, _, _ = get_layout_locations(1, base_path, ds.id)

    ds_siblings = [r['name'] for r in ds.siblings(result_renderer=None)]
    # Figure whether we are supposed to skip this very dataset
    if existing == 'skip' and (
            name in ds_siblings or (
                storage_name and storage_name in ds_siblings)):
        yield get_status_dict(
            status='notneeded',
            message="Skipped on existing sibling",
            **res_kwargs
        )
        # if we skip here, nothing else can change that decision further
        # down
        return

    # figure whether we need to skip or error due to an existing target repo
    # before we try to init a special remote.
    if ssh_host:
        from datalad import ssh_manager
        ssh = ssh_manager.get_connection(
            ssh_host,
            use_remote_annex_bundle=False)
        ssh.open()

    if existing in ['skip', 'error']:
        config_path = repo_path / 'config'
        # No .git -- if it's an existing repo in a RIA store it should be a
        # bare repo.
        # Theoretically we could have additional checks for whether we have
        # an empty repo dir or a non-bare repo or whatever else.
        if ssh_host:
            try:
                ssh('[ -e {p} ]'.format(p=quote_cmdlinearg(str(config_path))))
                exists = True
            except CommandError:
                exists = False
        else:
            exists = config_path.exists()

        if exists:
            if existing == 'skip':
                # 1. not rendered by default
                # 2. message doesn't show up in ultimate result
                #    record as shown by -f json_pp
                yield get_status_dict(
                    status='notneeded',
                    message="Skipped on existing remote "
                            "directory {}".format(repo_path),
                    **res_kwargs
                )
                return
            else:  # existing == 'error'
                yield get_status_dict(
                    status='error',
                    message="remote directory {} already "
                            "exists.".format(repo_path),
                    **res_kwargs
                )
                return

    if storage_sibling == 'only':
        lgr.info("create storage sibling '{}' ...".format(name))
    else:
        lgr.info("create sibling{} '{}'{} ...".format(
            's' if storage_name else '',
            name,
            " and '{}'".format(storage_name) if storage_name else '',
        ))
    create_ds_in_store(SSHRemoteIO(ssh_host) if ssh_host else LocalIO(),
                       base_path, ds.id, '2', '1')
    if storage_sibling:
        # we are using the main `name`, if the only thing we are creating
        # is the storage sibling
        srname = name if storage_sibling == 'only' else storage_name

        lgr.debug('init special remote {}'.format(srname))
        special_remote_options = [
            'type=external',
            'externaltype=ora',
            'encryption=none',
            'autoenable=true',
            'url={}'.format(url)]
        try:
            ds.repo.init_remote(
                srname,
                options=special_remote_options)
        except CommandError as e:
            if existing == 'reconfigure' \
                    and 'git-annex: There is already a special remote' \
                    in e.stderr:
                # run enableremote instead
                lgr.debug(
                    "special remote '%s' already exists. "
                    "Run enableremote instead.",
                    srname)
                # TODO: Use AnnexRepo.enable_remote (which needs to get
                #       `options` first)
                ds.repo.call_annex([
                    'enableremote',
                    srname] + special_remote_options)
            else:
                yield get_status_dict(
                    status='error',
                    message="initremote failed.\nstdout: %s\nstderr: %s"
                    % (e.stdout, e.stderr),
                    **res_kwargs
                )
                return

        if trust_level:
            ds.repo.call_annex([trust_level, srname])
        # get uuid for use in bare repo's config
        uuid = ds.config.get("remote.{}.annex-uuid".format(srname))

    if storage_sibling == 'only':
        # we can stop here, the rest of the function is about setting up
        # the git remote part of the sibling
        yield get_status_dict(
            status='ok',
            **res_kwargs,
        )
        return

    # 2. create a bare repository in-store:

    lgr.debug("init bare repository")
    # TODO: we should prob. check whether it's there already. How?
    # Note: like the special remote itself, we assume local FS if no
    # SSH host is specified
    disabled_hook = repo_path / 'hooks' / 'post-update.sample'
    enabled_hook = repo_path / 'hooks' / 'post-update'
    if group:
        chgrp_cmd = "chgrp -R {} {}".format(
            quote_cmdlinearg(str(group)),
            quote_cmdlinearg(str(repo_path)))
    if ssh_host:
        ssh('cd {rootdir} && git init --bare{shared}'.format(
            rootdir=quote_cmdlinearg(str(repo_path)),
            shared=" --shared='{}'".format(
                quote_cmdlinearg(shared)) if shared else ''
        ))

        if storage_sibling:
            # write special remote's uuid into git-config, so clone can
            # tell which one it is supposed to be and enable it even with
            # a fallback URL
            ssh("cd {rootdir} && git config datalad.ora-remote.uuid {uuid}"
                "".format(rootdir=quote_cmdlinearg(str(repo_path)),
                          uuid=uuid))

        if post_update_hook:
            ssh('mv {} {}'.format(quote_cmdlinearg(str(disabled_hook)),
                                  quote_cmdlinearg(str(enabled_hook))))

        if group:
            # Either repository existed before or a new directory was
            # created for it, set its group to a desired one if it was
            # provided with the same chgrp
            ssh(chgrp_cmd)
    else:
        gr = GitRepo(repo_path, create=True, bare=True,
                     shared=shared if shared else None)
        if storage_sibling:
            # write special remote's uuid into git-config, so clone can
            # tell which one it is supposed to be and enable it even with
            # a fallback URL
            gr.config.add("datalad.ora-remote.uuid", uuid, where='local')
        if post_update_hook:
            disabled_hook.rename(enabled_hook)
        if group:
            # TODO; do we need a cwd here?
            subprocess.run(chgrp_cmd, cwd=quote_cmdlinearg(ds.path))

    # add a git remote to the bare repository
    # Note: needs annex-ignore! Otherwise we might push into dirhash
    # lower annex/object tree instead of mixed, since it's a bare
    # repo. This in turn would be an issue, if we want to pack the
    # entire thing into an archive. Special remote will then not be
    # able to access content in the "wrong" place within the archive
    lgr.debug("set up git remote")
    if name in ds_siblings:
        # otherwise we should have skipped or failed before
        assert existing == 'reconfigure'
    ds.config.set(
        "remote.{}.annex-ignore".format(name),
        value="true",
        where="local")
    ds.siblings(
        'configure',
        name=name,
        url=git_url if ssh_host else str(repo_path),
        recursive=False,
        # Note, that this should be None if storage_sibling was not set
        publish_depends=storage_name,
        result_renderer=None,
        # Note, that otherwise a subsequent publish will report
        # "notneeded".
        fetch=True
    )

    yield get_status_dict(
        status='ok',
        **res_kwargs,
    )
def postclonecfg_annexdataset(ds, reckless, description=None): """If ds "knows annex" -- annex init it, set into reckless etc Provides additional tune up to a possibly an annex repo, e.g. "enables" reckless mode, sets up description """ # in any case check whether we need to annex-init the installed thing: if not knows_annex(ds.path): # not for us return # init annex when traces of a remote annex can be detected if reckless == 'auto': lgr.debug( "Instruct annex to hardlink content in %s from local " "sources, if possible (reckless)", ds.path) ds.config.set( 'annex.hardlink', 'true', where='local', reload=True) lgr.debug("Initializing annex repo at %s", ds.path) # Note, that we cannot enforce annex-init via AnnexRepo(). # If such an instance already exists, its __init__ will not be executed. # Therefore do quick test once we have an object and decide whether to call # its _init(). # # Additionally, call init if we need to add a description (see #1403), # since AnnexRepo.__init__ can only do it with create=True repo = AnnexRepo(ds.path, init=True) if not repo.is_initialized() or description: repo._init(description=description) if reckless == 'auto' or (reckless and reckless.startswith('shared-')): repo.call_annex(['untrust', 'here']) elif reckless == 'ephemeral': # with ephemeral we declare 'here' as 'dead' right away, whenever # we symlink origin's annex, since availability from 'here' should # not be propagated for an ephemeral clone when we publish back to # origin. # This will cause stuff like this for a locally present annexed file: # % git annex whereis d1 # whereis d1 (0 copies) failed # BUT this works: # % git annex find . --not --in here # % git annex find . --in here # d1 # we don't want annex copy-to origin ds.config.set( 'remote.origin.annex-ignore', 'true', where='local') ds.repo.set_remote_dead('here') if check_symlink_capability(ds.repo.dot_git / 'dl_link_test', ds.repo.dot_git / 'dl_target_test'): # symlink the annex to avoid needless copies in an ephemeral clone annex_dir = ds.repo.dot_git / 'annex' origin_annex_url = ds.config.get("remote.origin.url", None) origin_git_path = None if origin_annex_url: try: # Deal with file:// scheme URLs as well as plain paths. # If origin isn't local, we have nothing to do. origin_git_path = Path(RI(origin_annex_url).localpath) # we are local; check for a bare repo first to not mess w/ # the path if GitRepo(origin_git_path, create=False).bare: # origin is a bare repo -> use path as is pass elif origin_git_path.name != '.git': origin_git_path /= '.git' except ValueError: # Note, that accessing localpath on a non-local RI throws # ValueError rather than resulting in an AttributeError. # TODO: Warning level okay or is info level sufficient? # Note, that setting annex-dead is independent of # symlinking .git/annex. It might still make sense to # have an ephemeral clone that doesn't propagate its avail. # info. Therefore don't fail altogether. lgr.warning("reckless=ephemeral mode: origin doesn't seem " "local: %s\nno symlinks being used", origin_annex_url) if origin_git_path: # TODO make sure that we do not delete any unique data rmtree(str(annex_dir)) \ if not annex_dir.is_symlink() else annex_dir.unlink() annex_dir.symlink_to(origin_git_path / 'annex', target_is_directory=True) else: # TODO: What level? 
                # + note, that annex-dead is independent
                lgr.warning("reckless=ephemeral mode: Unable to create symlinks on "
                            "this file system.")

    srs = {True: [], False: []}  # special remotes by "autoenable" key
    remote_uuids = None  # might be necessary to discover known UUIDs

    repo_config = repo.config
    # Note: The purpose of this function is to inform the user. So if something
    # looks misconfigured, we'll warn and move on to the next item.
    for uuid, config in repo.get_special_remotes().items():
        sr_name = config.get('name', None)
        if sr_name is None:
            lgr.warning(
                'Ignoring special remote %s because it does not have a name. '
                'Known information: %s',
                uuid, config)
            continue
        sr_autoenable = config.get('autoenable', False)
        try:
            sr_autoenable = ensure_bool(sr_autoenable)
        except ValueError:
            lgr.warning(
                'Failed to process "autoenable" value %r for sibling %s in '
                'dataset %s as bool. '
                'You might need to enable it later manually and/or fix it up to'
                ' avoid this message in the future.',
                sr_autoenable, sr_name, ds.path)
            continue

        # If it looks like a type=git special remote, make sure we have up to
        # date information. See gh-2897.
        if sr_autoenable and repo_config.get("remote.{}.fetch".format(sr_name)):
            try:
                repo.fetch(remote=sr_name)
            except CommandError as exc:
                lgr.warning("Failed to fetch type=git special remote %s: %s",
                            sr_name, exc_str(exc))

        # determine whether there is a registered remote with matching UUID
        if uuid:
            if remote_uuids is None:
                remote_uuids = {
                    # Check annex-config-uuid first. For sameas annex remotes,
                    # this will point to the UUID for the configuration (i.e.
                    # the key returned by get_special_remotes) rather than the
                    # shared UUID.
                    (repo_config.get('remote.%s.annex-config-uuid' % r) or
                     repo_config.get('remote.%s.annex-uuid' % r))
                    for r in repo.get_remotes()
                }
            if uuid not in remote_uuids:
                srs[sr_autoenable].append(sr_name)

    if srs[True]:
        lgr.debug(
            "configuration for %s %s added because of autoenable,"
            " but no UUIDs for them yet known for dataset %s",
            # since we are only at debug level, we could call things their
            # proper names
            single_or_plural("special remote",
                             "special remotes", len(srs[True]), True),
            ", ".join(srs[True]),
            ds.path
        )

    if srs[False]:
        # special remotes that are not auto-enabled
        lgr.info(
            'access to %s %s not auto-enabled, enable with:\n'
            '\t\tdatalad siblings -d "%s" enable -s %s',
            # but since humans might read it, we better confuse them with our
            # own terms!
            single_or_plural("dataset sibling",
                             "dataset siblings", len(srs[False]), True),
            ", ".join(srs[False]),
            ds.path,
            srs[False][0] if len(srs[False]) == 1 else "SIBLING",
        )

    # we have just cloned the repo, so it has 'origin', configure any
    # reachable origin of origins
    yield from configure_origins(ds, ds)
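# Illustrative sketch (not part of the module): the kind of normalization the
# ensure_bool() call above performs on git-annex's "autoenable" values, which
# arrive from remote.log as strings such as "true"/"yes"/"1" or "false"/"no"/"0".
# This is a simplified, hypothetical stand-in, not DataLad's actual helper.
def _example_coerce_autoenable(value):
    if isinstance(value, bool):
        return value
    lowered = str(value).lower()
    if lowered in ('true', 'yes', 'on', '1'):
        return True
    if lowered in ('false', 'no', 'off', '0'):
        return False
    # mirror the behaviour relied upon above: unparseable values raise
    raise ValueError("cannot interpret %r as a bool" % (value,))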
def test_unlock_gh_5456(path=None): path = Path(path) unrelated_super = Dataset(path).create(annex=False, force=True) ds = Dataset(path / 'subdir' / 'sub').create() ds.unlock('.')
def _test_bare_git_version_1(host, dspath, store): # This test should take a dataset and create a bare repository at the remote # end from it. # Given, that it is placed correctly within a tree of dataset, that remote # thing should then be usable as an ora-remote as well as as a git-type # remote. # Note: Usability of git remote by annex depends on dataset layout version # (dirhashlower vs. -mixed). # For version 1 (lower) upload and consumption should be # interchangeable. It doesn't matter which remote is used for what # direction. ds_path = Path(dspath) store = Path(store) ds = Dataset(ds_path).create() populate_dataset(ds) ds.save() bare_repo_path, _, _ = get_layout_locations(1, store, ds.id) # Use git to make sure the remote end is what git thinks a bare clone of it # should look like subprocess.run([ 'git', 'clone', '--bare', quote_cmdlinearg(str(dspath)), quote_cmdlinearg(str(bare_repo_path)) ]) if host: url = "ria+ssh://{host}{path}".format(host=host, path=store) else: url = "ria+{}".format(store.as_uri()) init_opts = common_init_opts + ['url={}'.format(url)] # set up store: io = SSHRemoteIO(host) if host else LocalIO() create_store(io, store, '1') # set up the dataset location, too. # Note: Dataset layout version 1 (dirhash lower): create_ds_in_store(io, store, ds.id, '1', '1') # Now, let's have the bare repo as a git remote and use it with annex git_url = "ssh://{host}{path}".format(host=host, path=bare_repo_path) \ if host else bare_repo_path.as_uri() ds.repo.add_remote('bare-git', git_url) ds.repo.enable_remote('bare-git') # copy files to the remote ds.repo.copy_to('.', 'bare-git') eq_(len(ds.repo.whereis('one.txt')), 2) # now we can drop all content locally, reobtain it, and survive an # fsck ds.drop('.') ds.get('.') assert_status('ok', [annexjson2result(r, ds) for r in ds.repo.fsck()]) # Now, add the ora remote: ds.repo.init_remote('ora-remote', options=init_opts) # fsck to make availability known assert_status('ok', [ annexjson2result(r, ds) for r in ds.repo.fsck(remote='ora-remote', fast=True) ]) eq_(len(ds.repo.whereis('one.txt')), 3) # Now move content from git-remote to local and see it not being available # via bare-git anymore. ds.repo.call_annex(['move', '--all', '--from=bare-git']) # ora-remote doesn't know yet: eq_(len(ds.repo.whereis('one.txt')), 2) # But after fsck it does: fsck_res = [ annexjson2result(r, ds) for r in ds.repo.fsck(remote='ora-remote', fast=True) ] assert_result_count(fsck_res, 1, status='error', message='** Based on the location log, one.txt\n' '** was expected to be present, ' 'but its content is missing.') assert_result_count(fsck_res, 1, status='error', message='** Based on the location log, subdir/two\n' '** was expected to be present, ' 'but its content is missing.') eq_(len(ds.repo.whereis('one.txt')), 1) # and the other way around: upload via ora-remote and have it available via # git-remote: ds.repo.copy_to('.', 'ora-remote') # fsck to make availability known assert_status('ok', [ annexjson2result(r, ds) for r in ds.repo.fsck(remote='bare-git', fast=True) ]) eq_(len(ds.repo.whereis('one.txt')), 3)
def test_update_follow_parentds_lazy(path): path = Path(path) ds_src = Dataset(path / "source").create() ds_src_s0 = ds_src.create("s0") ds_src_s0_s0 = ds_src_s0.create("s0") ds_src_s0.create("s1") ds_src_s1 = ds_src.create("s1") ds_src.create("s2") ds_src.save(recursive=True) assert_repo_status(ds_src.path) ds_clone = install(source=ds_src.path, path=path / "clone", recursive=True, result_xfm="datasets") ds_clone_s0 = Dataset(ds_clone.pathobj / "s0") ds_clone_s0_s0 = Dataset(ds_clone.pathobj / "s0" / "s0") ds_clone_s0_s1 = Dataset(ds_clone.pathobj / "s0" / "s1") ds_clone_s1 = Dataset(ds_clone.pathobj / "s1") ds_clone_s2 = Dataset(ds_clone.pathobj / "s2") (ds_src_s0_s0.pathobj / "foo").write_text("in s0 s0") ds_src_s0_s0.save() (ds_src_s1.pathobj / "foo").write_text("in s1") ds_src.save(recursive=True) # State: # . # |-- s0 # | |-- s0 # | `-- s1 * matches registered commit # |-- s1 # `-- s2 * matches registered commit res = ds_clone.update(follow="parentds-lazy", merge=True, recursive=True, on_failure="ignore") on_adjusted = ds_clone.repo.is_managed_branch() # For adjusted branches, follow=parentds* bails with an impossible result, # so the s0 update doesn't get brought in and s0_s0 also matches the # registered commit. n_notneeded_expected = 3 if on_adjusted else 2 assert_result_count(res, n_notneeded_expected, action="update", status="notneeded") assert_in_results(res, action="update", status="notneeded", path=ds_clone_s0_s1.repo.path) assert_in_results(res, action="update", status="notneeded", path=ds_clone_s2.repo.path) if on_adjusted: assert_in_results(res, action="update", status="notneeded", path=ds_clone_s0_s0.repo.path) assert_repo_status( ds_clone.path, modified=[ds_clone_s0.repo.path, ds_clone_s1.repo.path]) else: assert_repo_status(ds_clone.path)
def _put_in_zip(zip, path, records): for k, v in records.items(): if isinstance(v, dict): _put_in_zip(zip, path + [k], v) else: zip.writestr(str(Path(*path) / k), v)
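# Illustrative sketch (not part of the module): how a nested `records` dict
# passed to _put_in_zip() ends up as archive entries. Assuming the caller
# provides an open zipfile.ZipFile, each leaf value becomes one entry whose
# path is the chain of dict keys leading to it. The function name and sample
# data below are hypothetical.
import io
import zipfile

def _example_put_in_zip_usage():
    records = {
        'dataset': {
            'README.md': 'some text',
            'subdir': {'file.txt': 'content'},
        },
    }
    buf = io.BytesIO()
    with zipfile.ZipFile(buf, 'w') as zf:
        _put_in_zip(zf, [], records)
        # expected entries: 'dataset/README.md' and 'dataset/subdir/file.txt'
        # (the separator follows the platform's pathlib flavour)
        return zf.namelist()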
def __call__(path=None, initopts=None, *, force=False, description=None, dataset=None, annex=True, fake_dates=False, cfg_proc=None): # we only perform negative tests below no_annex = not annex if dataset: if isinstance(dataset, Dataset): ds = dataset else: ds = Dataset(dataset) refds_path = ds.path else: ds = refds_path = None # two major cases # 1. we got a `dataset` -> we either want to create it (path is None), # or another dataset in it (path is not None) # 2. we got no dataset -> we want to create a fresh dataset at the # desired location, either at `path` or PWD # sanity check first if no_annex: if description: raise ValueError("Incompatible arguments: cannot specify " "description for annex repo and declaring " "no annex repo.") if (isinstance(initopts, (list, tuple)) and '--bare' in initopts) or (isinstance(initopts, dict) and 'bare' in initopts): raise ValueError( "Creation of bare repositories is not supported. Consider " "one of the create-sibling commands, or use " "Git to init a bare repository and push an existing dataset " "into it.") if path: path = resolve_path(path, dataset) path = path if path \ else getpwd() if ds is None \ else refds_path # we know that we need to create a dataset at `path` assert (path is not None) # assure cfg_proc is a list (relevant if used via Python API) cfg_proc = ensure_list(cfg_proc) # prep for yield res = dict(action='create', path=str(path), logger=lgr, type='dataset', refds=refds_path) refds = None if refds_path and refds_path != str(path): refds = require_dataset(refds_path, check_installed=True, purpose='create a subdataset') path_inrefds = path_under_rev_dataset(refds, path) if path_inrefds is None: yield dict( res, status='error', message=( "dataset containing given paths is not underneath " "the reference dataset %s: %s", ds, str(path)), ) return # try to locate an immediate parent dataset # we want to know this (irrespective of whether we plan on adding # this new dataset to a parent) in order to avoid conflicts with # a potentially absent/uninstalled subdataset of the parent # in this location # it will cost some filesystem traversal though... 
parentds_path = get_dataset_root( op.normpath(op.join(str(path), os.pardir))) if parentds_path: prepo = GitRepo(parentds_path) parentds_path = Path(parentds_path) # we cannot get away with a simple # GitRepo.get_content_info(), as we need to detect # uninstalled/added subdatasets too check_path = Path(path) pstatus = prepo.status( untracked='no', # limit query to target path for a potentially massive speed-up paths=[check_path.relative_to(parentds_path)]) if (not pstatus.get(check_path, {}).get("type") == "dataset" and any(check_path == p or check_path in p.parents for p in pstatus)): # redo the check in a slower fashion, it is already broken # let's take our time for a proper error message conflict = [ p for p in pstatus if check_path == p or check_path in p.parents ] res.update({ 'status': 'error', 'message': ('collision with content in parent dataset at %s: %s', str(parentds_path), [str(c) for c in conflict]) }) yield res return if not force: # another set of check to see whether the target path is pointing # into a known subdataset that is not around ATM subds_status = { parentds_path / k.relative_to(prepo.path) for k, v in pstatus.items() if v.get('type', None) == 'dataset' } check_paths = [check_path] check_paths.extend(check_path.parents) if any(p in subds_status for p in check_paths): conflict = [p for p in check_paths if p in subds_status] res.update({ 'status': 'error', 'message': ('collision with %s (dataset) in dataset %s', str(conflict[0]), str(parentds_path)) }) yield res return # important to use the given Dataset object to avoid spurious ID # changes with not-yet-materialized Datasets tbds = ds if isinstance(ds, Dataset) and \ ds.path == path else Dataset(str(path)) # don't create in non-empty directory without `force`: if op.isdir(tbds.path) and listdir(tbds.path) != [] and not force: res.update({ 'status': 'error', 'message': 'will not create a dataset in a non-empty directory, use ' '`--force` option to ignore' }) yield res return # Check if specified cfg_proc(s) can be discovered, storing # the results so they can be used when the time comes to run # the procedure. If a procedure cannot be found, raise an # error to prevent creating the dataset. cfg_proc_specs = [] if cfg_proc: discovered_procs = tbds.run_procedure( discover=True, result_renderer='disabled', return_type='generator', ) for cfg_proc_ in cfg_proc: for discovered_proc in discovered_procs: if discovered_proc['procedure_name'] == 'cfg_' + cfg_proc_: cfg_proc_specs.append(discovered_proc) break else: raise ValueError("Cannot find procedure with name " "'%s'" % cfg_proc_) if initopts is not None and isinstance(initopts, list): initopts = {'_from_cmdline_': initopts} # Note for the code below: # OPT: be "smart" and avoid re-resolving .repo -- expensive in DataLad # Re-use tbrepo instance, do not use tbds.repo # create and configure desired repository # also provides initial set of content to be tracked with git (not annex) if no_annex: tbrepo, add_to_git = _setup_git_repo(path, initopts, fake_dates) else: tbrepo, add_to_git = _setup_annex_repo(path, initopts, fake_dates, description) # OPT: be "smart" and avoid re-resolving .repo -- expensive in DataLad # Note, must not happen earlier (before if) since "smart" it would not be tbds_config = tbds.config # record an ID for this repo for the afterlife # to be able to track siblings and children id_var = 'datalad.dataset.id' # Note, that Dataset property `id` will change when we unset the # respective config. 
    # Therefore, store it before:
    tbds_id = tbds.id
    if id_var in tbds_config:
        # make sure we reset this variable completely, in case of a
        # re-create
        tbds_config.unset(id_var, scope='branch')

    if _seed is None:
        # just the standard way
        # use a fully random identifier (i.e. UUID version 4)
        uuid_id = str(uuid.uuid4())
    else:
        # Let's generate preseeded ones
        uuid_id = str(uuid.UUID(int=random.getrandbits(128)))
    tbds_config.add(
        id_var,
        tbds_id if tbds_id is not None else uuid_id,
        scope='branch',
        reload=False)

    # make config overrides permanent in the repo config
    # this is similar to what `annex init` does
    # we are only doing this for config overrides and do not expose
    # a dedicated argument, because it is sufficient for the cmdline
    # and unnecessary for the Python API (there could simply be a
    # subsequent ds.config.add() call)
    for k, v in tbds_config.overrides.items():
        tbds_config.add(k, v, scope='local', reload=False)

    # all config manipulation is done -> full reload
    tbds_config.reload()

    # must use the repo.pathobj as this will have resolved symlinks
    add_to_git[tbrepo.pathobj / '.datalad'] = {
        'type': 'directory',
        'state': 'untracked'}

    # save everything, we need to do this now and cannot merge with the
    # call below, because we may need to add this subdataset to a parent
    # but cannot until we have a first commit
    tbrepo.save(
        message='[DATALAD] new dataset',
        git=True,
        # we have to supply our own custom status, as the repo does
        # not have a single commit yet and there is no HEAD reference
        # TODO make `GitRepo.status()` robust to this state.
        _status=add_to_git,
    )

    for cfg_proc_spec in cfg_proc_specs:
        yield from tbds.run_procedure(
            cfg_proc_spec,
            result_renderer='disabled',
            return_type='generator',
        )

    # the next only makes sense if we saved the created dataset,
    # otherwise we have no committed state to be registered
    # in the parent
    if isinstance(refds, Dataset) and refds.path != tbds.path:
        # we created a dataset in another dataset
        # -> make submodule
        yield from refds.save(
            path=tbds.path,
            return_type='generator',
            result_renderer='disabled',
        )

    res.update({'status': 'ok'})
    yield res
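# Illustrative sketch (not part of the module): why the `_seed` branch above
# builds IDs via uuid.UUID(int=random.getrandbits(128)) instead of
# uuid.uuid4(). Once the random generator has been seeded, the resulting
# dataset IDs are reproducible across runs, which uuid4() would not be.
# Function name and seed value below are hypothetical.
import random
import uuid

def _example_preseeded_dataset_ids(seed=0, n=3):
    random.seed(seed)
    first = [str(uuid.UUID(int=random.getrandbits(128))) for _ in range(n)]
    random.seed(seed)
    second = [str(uuid.UUID(int=random.getrandbits(128))) for _ in range(n)]
    assert first == second  # same seed -> identical sequence of IDs
    return first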
def test_datalad_credential_helper(path=None):

    ds = Dataset(path).create()

    # tell git to use git-credential-datalad
    ds.config.add('credential.helper', 'datalad', scope='local')
    ds.config.add('datalad.credentials.githelper.noninteractive', 'true',
                  scope='global')

    from datalad.downloaders.providers import Providers

    url1 = "https://datalad-test.org/some"
    url2 = "https://datalad-test.org/other"
    provider_name = "datalad-test.org"

    # `Providers` code is old and only considers a dataset root based on PWD
    # for config lookup. contextmanager below can be removed once the
    # provider/credential system is redesigned.
    with chpwd(ds.path):
        gitcred = GitCredentialInterface(url=url1, repo=ds)

        # There's nothing set up yet, helper should return empty
        gitcred.fill()
        eq_(gitcred['username'], '')
        eq_(gitcred['password'], '')

        # store new credentials
        # Note, that `Providers.enter_new()` currently uses user-level config
        # files for storage only. TODO: make that an option!
        # To not mess with existing ones, fail if it already exists:
        cfg_file = Path(Providers._get_providers_dirs()['user']) \
            / f"{provider_name}.cfg"
        assert_false(cfg_file.exists())

        # Make sure we clean up
        from datalad.tests import _TEMP_PATHS_GENERATED
        _TEMP_PATHS_GENERATED.append(str(cfg_file))

        # Give credentials to git and ask it to store them:
        gitcred = GitCredentialInterface(url=url1, username="dl-user",
                                         password="dl-pwd", repo=ds)
        gitcred.approve()
        assert_true(cfg_file.exists())

        providers = Providers.from_config_files()
        p1 = providers.get_provider(url=url1, only_nondefault=True)
        assert_is_instance(p1.credential, UserPassword)
        eq_(p1.credential.get('user'), 'dl-user')
        eq_(p1.credential.get('password'), 'dl-pwd')

        # default regex should be host only, so matching url2, too
        p2 = providers.get_provider(url=url2, only_nondefault=True)
        assert_is_instance(p2.credential, UserPassword)
        eq_(p2.credential.get('user'), 'dl-user')
        eq_(p2.credential.get('password'), 'dl-pwd')

        # git, too, should now find it for both URLs
        gitcred = GitCredentialInterface(url=url1, repo=ds)
        gitcred.fill()
        eq_(gitcred['username'], 'dl-user')
        eq_(gitcred['password'], 'dl-pwd')

        gitcred = GitCredentialInterface(url=url2, repo=ds)
        gitcred.fill()
        eq_(gitcred['username'], 'dl-user')
        eq_(gitcred['password'], 'dl-pwd')

        # Rejection must not currently lead to deleting anything, since we
        # would delete too broadly.
        gitcred.reject()
        assert_true(cfg_file.exists())
        gitcred = GitCredentialInterface(url=url1, repo=ds)
        gitcred.fill()
        eq_(gitcred['username'], 'dl-user')
        eq_(gitcred['password'], 'dl-pwd')
        dlcred = UserPassword(name=provider_name)
        eq_(dlcred.get('user'), 'dl-user')
        eq_(dlcred.get('password'), 'dl-pwd')
def __call__(keyfile=None, merge=False, force_update=False, bids=False, non_bids_dir='non-bids', dataset=None): ds = require_dataset(dataset, check_installed=True, purpose='update') repo = ds.repo if not keyfile: # will error out, if no config was given keyfile = repo.config.obtain('datalad.ukbiobank.keyfile') # prep for yield res = dict( action='ukb_update', path=ds.path, type='dataset', logger=lgr, refds=ds.path, ) if repo.dirty: yield dict( res, status='error', message="Refuse to operate on dirty dataset", ) return # check if we have 'ukbfetch' before we start fiddling with the dataset # and leave it in a mess for no reason try: subprocess.run( # pull version info ['ukbfetch', '-i'], capture_output=True, ) except Exception as e: raise RuntimeError( "Cannot execute 'ukbfetch'. Original error: {}".format(e)) # just to be nice, and to be able to check it out again, # when we are done initial_branch = repo.get_active_branch() initial_incoming = repo.get_hexsha('incoming') # make sure we are in incoming repo.call_git(['checkout', 'incoming']) # first wipe out all prev. downloaded zip files so we can detect # when some files are no longer available for fp in repo.pathobj.glob('[0-9]*_[0-9]*_[0-9]_[0-9].*'): fp.unlink() # a place to put the download logs # better be semi-persistent to ease inspection tmpdir = repo.pathobj / repo.get_git_dir(repo) / 'tmp' / 'ukb' tmpdir.mkdir(parents=True, exist_ok=True) # redownload, run with explicit mode, because we just deleted the # ZIP files and that is OK ds.run( cmd='ukbfetch -v -a{} -b.ukbbatch -o{}'.format( quote_cmdlinearg(keyfile), quote_cmdlinearg(str(tmpdir)), ), explicit=True, outputs=['.'], message="Update from UKbiobank", ) # TODO what if something broke before? needs force switch if not force_update and repo.get_hexsha() == initial_incoming: yield dict( res, status='notneeded', message='No new content available', ) repo.call_git(['checkout', initial_branch]) # TODO drop? return # onto extraction and transformation of downloaded content repo.call_git(['checkout', 'incoming-processed']) # mark the incoming change as merged # (but we do not actually want any branch content) repo.call_git(['merge', 'incoming', '--strategy=ours', 'incoming']) for fp in repo.get_content_info(ref='incoming-processed', eval_file_type=False): fp.unlink() subid = None if bids: from datalad_ukbiobank.ukb2bids import restructure_ukb2bids # get participant ID from batch file subid = list( repo.call_git_items_(["cat-file", "-p", "incoming:.ukbbatch" ]))[0].split(maxsplit=1)[0] # discover all zip files present in the last commit in 'incoming' for fp, props in repo.get_content_annexinfo( ref='incoming', eval_availability=False).items(): if fp.name.startswith('.'): # skip internals continue # we have to extract into per-instance directories, otherwise files # would conflict ids = fp.stem.split('_') if not len(ids) >= 3: raise RuntimeError( 'Unrecognized filename structure: {}'.format(fp)) extract_dir = repo.pathobj / 'instance-{}'.format(ids[2]) extract_dir.mkdir(exist_ok=True) if fp.suffix == '.zip': with chpwd(extract_dir): # extract and add their content AddArchiveContent.__call__( props['key'], key=True, annex=repo, # --use-current-dir due to # https://github.com/datalad/datalad/issues/3995 use_current_dir=True, allow_dirty=True, commit=False, ) else: # move into instance dir, and strip participant ID, and instance ID # but keep array index # e.g. 
            # 25747_3_0.adv -> instance-3/25747_0
            repo.call_git([
                'annex', 'fromkey', props['key'],
                str(extract_dir / ('_'.join(ids[1::2])
                                   + ''.join(fp.suffixes)))])
        if bids:
            yield from restructure_ukb2bids(
                ds,
                subid=subid,
                unrecognized_dir=Path('ses-{}'.format(ids[2])) / non_bids_dir,
                base_path=extract_dir,
                session=ids[2],
            )

    # save whatever the state is now, `save` will discover deletions
    # automatically and also commit them -- wonderful!
    ds.save(message="Track ZIP file content")

    yield dict(
        res,
        status='ok',
    )

    if not merge:
        return

    # and update active branch
    repo.call_git(['checkout', initial_branch])

    if initial_branch in ('incoming', 'incoming-processed'):
        yield dict(
            res,
            action='ukb_merge_update',
            status='impossible',
            message='Refuse to merge into incoming* branch',
        )
        return

    repo.call_git([
        'merge', '-m', "Merge update from UKbiobank", 'incoming-processed'])

    yield dict(
        res,
        action='ukb_merge_update',
        status='ok',
    )
    return
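# Illustrative sketch (not part of the module): the filename convention the
# loop above relies on. Downloaded files match the glob
# '[0-9]*_[0-9]*_[0-9]_[0-9].*', i.e. <participant>_<field>_<instance>_<array>
# plus a suffix; the instance ID selects the extraction directory, while the
# participant and instance IDs are stripped from the target name. The helper
# and the sample participant ID below are hypothetical and only mirror the
# string handling shown above.
from pathlib import PurePosixPath

def _example_ukb_target_path(filename):
    fp = PurePosixPath(filename)
    ids = fp.stem.split('_')
    extract_dir = 'instance-{}'.format(ids[2])
    target = '_'.join(ids[1::2]) + ''.join(fp.suffixes)
    return '{}/{}'.format(extract_dir, target)

# e.g. _example_ukb_target_path('1234567_25747_3_0.adv') -> 'instance-3/25747_0.adv'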