def postclonecfg_ria(ds, props):
    """Configure a dataset freshly cloned from a RIA store"""
    repo = ds.repo
    # RIA uses hashdir mixed, copying data to it via git-annex (if cloned
    # via ssh) would make it see a bare repo and establish a hashdir lower
    # annex object tree.
    # Moreover, we want the ORA remote to receive all data for the store, so
    # its objects could be moved into archives (the main point of a RIA
    # store).
    RIA_REMOTE_NAME = 'origin'  # don't hardcode everywhere
    ds.config.set(
        'remote.{}.annex-ignore'.format(RIA_REMOTE_NAME), 'true',
        where='local')

    # chances are that if this dataset came from a RIA store, its
    # subdatasets may live there too. Place a subdataset source candidate
    # config that makes get probe this RIA store when obtaining subdatasets
    ds.config.set(
        # we use the label 'origin' for this candidate in order to not have
        # to generate a complicated name from the actual source
        # specification. we pick a cost of 200 to sort it before datalad's
        # default candidates for non-RIA URLs, because they prioritize
        # hierarchical layouts that cannot be found in a RIA store
        'datalad.get.subdataset-source-candidate-200origin',
        # use the entire original URL up to the fragment, plus the dataset
        # ID placeholder. This should make things work with any store setup
        # we support (paths, ports, ...)
        props['source'].split('#', maxsplit=1)[0] + '#{id}',
        where='local')

    # setup publication dependency, if a corresponding special remote exists
    # and was enabled (there could be RIA stores that actually only have
    # repos)
    ora_remotes = [s for s in ds.siblings('query',
                                          result_renderer='disabled')
                   if s.get('annex-externaltype') == 'ora']
    if not ora_remotes and any(
            r.get('externaltype') == 'ora'
            for r in (repo.get_special_remotes().values()
                      if hasattr(repo, 'get_special_remotes')
                      else [])):
        # no ORA remote autoenabled, but configuration knows about at least
        # one. Let's check origin's config for datalad.ora-remote.uuid as
        # stored by create-sibling-ria and try enabling that one.
        lgr.debug("Found no autoenabled ORA special remote. Trying to look "
                  "it up in source config ...")

        # First figure out whether we cloned via SSH, HTTP or local path,
        # and then get origin's config file the same way:
        config_content = None
        scheme = props['giturl'].split(':', 1)[0]
        if scheme in ['http', 'https']:
            try:
                config_content = download_url(
                    "{}{}config".format(
                        props['giturl'],
                        '/' if not props['giturl'].endswith('/') else ''))
            except DownloadError as e:
                lgr.debug("Failed to get config file from source:\n%s",
                          exc_str(e))
        elif scheme == 'ssh':
            # TODO: switch the following to proper command abstraction:
            # SSHRemoteIO ignores the path part ATM. No remote CWD! (To be
            # changed with command abstractions.) So we need to get that
            # part to have a valid path to origin's config file:
            cfg_path = PurePosixPath(URL(props['giturl']).path) / 'config'
            op = SSHRemoteIO(props['giturl'])
            try:
                config_content = op.read_file(cfg_path)
            except RIARemoteError as e:
                lgr.debug("Failed to get config file from source: %s",
                          exc_str(e))
        elif scheme == 'file':
            # TODO: switch the following to proper command abstraction:
            op = LocalIO()
            cfg_path = Path(URL(props['giturl']).localpath) / 'config'
            try:
                config_content = op.read_file(cfg_path)
            except (RIARemoteError, OSError) as e:
                lgr.debug("Failed to get config file from source: %s",
                          exc_str(e))
        else:
            lgr.debug("Unknown URL scheme %s in %s. Can handle SSH, HTTP or "
                      "FILE scheme URLs.", scheme, props['source'])

        # And read it
        org_uuid = None
        if config_content:
            # TODO: We might be able to spare the saving to a file.
            #       "git config -f -" is not explicitly documented but
            #       happens to work and would read from stdin. Make sure we
            #       know this works for required git versions and on all
            #       platforms.
            with make_tempfile(content=config_content) as cfg_file:
                runner = GitWitlessRunner()
                try:
                    result = runner.run(
                        ['git', 'config', '-f', cfg_file,
                         'datalad.ora-remote.uuid'],
                        protocol=StdOutCapture
                    )
                    org_uuid = result['stdout'].strip()
                except CommandError as e:
                    # doesn't contain what we are looking for
                    lgr.debug("Found no UUID for ORA special remote at "
                              "'%s' (%s)", RIA_REMOTE_NAME, exc_str(e))

        # Now, enable it. If annex-init didn't fail to enable it as stored,
        # we wouldn't end up here, so enable with the store URL as suggested
        # by the URL we cloned from.
        if org_uuid:
            srs = repo.get_special_remotes()
            if org_uuid in srs.keys():
                # TODO: - Double-check autoenable value and only do this
                #         when true?
                #       - What if it still fails? -> Annex shouldn't change
                #         config in that case

                # we only need the store:
                new_url = props['source'].split('#')[0]
                try:
                    repo.enable_remote(srs[org_uuid]['name'],
                                       options=['url={}'.format(new_url)])
                    lgr.info("Reconfigured %s for %s",
                             srs[org_uuid]['name'], new_url)
                    # update ora_remotes for considering the publication
                    # dependency below
                    ora_remotes = [
                        s for s in ds.siblings('query',
                                               result_renderer='disabled')
                        if s.get('annex-externaltype', None) == 'ora']
                except CommandError as e:
                    lgr.debug("Failed to reconfigure ORA special remote: %s",
                              exc_str(e))
            else:
                lgr.debug("Unknown ORA special remote uuid at '%s': %s",
                          RIA_REMOTE_NAME, org_uuid)

    if ora_remotes:
        if len(ora_remotes) == 1:
            yield from ds.siblings('configure',
                                   name=RIA_REMOTE_NAME,
                                   publish_depends=ora_remotes[0]['name'],
                                   result_filter=None,
                                   result_renderer='disabled')
        else:
            lgr.warning("Found multiple ORA remotes. Couldn't decide which "
                        "one publishing to 'origin' should depend on: %s. "
                        "Consider running 'datalad siblings configure -s "
                        "origin --publish-depends ORAREMOTENAME' to set the "
                        "publication dependency manually.",
                        [r['name'] for r in ora_remotes])
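
# A minimal sketch (an illustration added for this excerpt, not part of
# DataLad) of how the subdataset source candidate template above is derived
# from a clone URL: everything up to the URL fragment is kept, and the
# literal '{id}' placeholder is appended for `get` to fill in with a
# subdataset's ID. The store URL is hypothetical.
def _demo_source_candidate():
    source = ("ria+ssh://store.example.org/srv/ria"
              "#6d69ca68-7e85-11e6-904b-002590f97d84")
    candidate = source.split('#', maxsplit=1)[0] + '#{id}'
    assert candidate == "ria+ssh://store.example.org/srv/ria#{id}"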
def __call__(
        url,
        name,
        *,  # note that `name` is required but not a posarg in CLI
        dataset=None,
        storage_name=None,
        alias=None,
        post_update_hook=False,
        shared=None,
        group=None,
        storage_sibling=True,
        existing='error',
        new_store_ok=False,
        trust_level=None,
        recursive=False,
        recursion_limit=None,
        disable_storage__=None,
        push_url=None):
    if disable_storage__ is not None:
        import warnings
        warnings.warn(
            "datalad-create-sibling-ria --no-storage-sibling "
            "is deprecated, use --storage-sibling off instead.",
            DeprecationWarning)
        # recode to new setup
        disable_storage__ = None
        storage_sibling = False

    if storage_sibling == 'only' and storage_name:
        lgr.warning(
            "Sibling name will be used for the storage sibling in "
            "storage-sibling-only mode, but a storage sibling name "
            "was provided")

    ds = require_dataset(
        dataset, check_installed=True, purpose='create RIA sibling(s)')
    res_kwargs = dict(
        ds=ds,
        action="create-sibling-ria",
        logger=lgr,
    )

    # parse target URL
    # Note: URL parsing is done twice ATM (for the top-level ds). This can't
    # be reduced to a single instance, since rewriting the URL based on
    # config could be different for subdatasets.
    try:
        ssh_host, base_path, rewritten_url = \
            verify_ria_url(push_url if push_url else url, ds.config)
    except ValueError as e:
        yield get_status_dict(
            status='error',
            message=str(e),
            **res_kwargs)
        return

    if ds.repo.get_hexsha() is None or ds.id is None:
        raise RuntimeError(
            "Repository at {} is not a DataLad dataset, "
            "run 'datalad create [--force]' first.".format(ds.path))

    if not storage_sibling and storage_name:
        lgr.warning(
            "Storage sibling setup disabled, but a storage sibling name "
            "was provided")

    if storage_sibling and not storage_name:
        storage_name = "{}-storage".format(name)

    if storage_sibling and name == storage_name:
        # leads to an unresolvable, circular dependency with publish-depends
        raise ValueError("sibling names must not be equal")

    if not isinstance(url, str):
        raise TypeError("url is not a string, but %s" % type(url))

    # Query existing siblings upfront in order to fail early on
    # existing=='error', since misconfiguration (particularly of special
    # remotes), discovered only when it fails in a subdataset later on, can
    # be quite painful.
    # TODO: messages - this is "create-sibling". Don't confuse existence of
    #       local remotes with existence of the actual remote sibling in
    #       wording
    if existing == 'error':
        failed = False
        for dpath, sname in _yield_ds_w_matching_siblings(
                ds,
                (name, storage_name),
                recursive=recursive,
                recursion_limit=recursion_limit):
            res = get_status_dict(
                status='error',
                message=(
                    "a sibling %r is already configured in dataset %r",
                    sname, dpath),
                type='sibling',
                name=sname,
                ds=ds,
                **res_kwargs,
            )
            failed = True
            yield res
        if failed:
            return

    # TODO: - URL parsing + store creation needs to be RF'ed based on
    #         command abstractions
    #       - more generally consider store creation a dedicated command or
    #         option
    io = SSHRemoteIO(ssh_host) if ssh_host else LocalIO()
    try:
        # determine the existence of a store by trying to read its layout.
        # Because this raises a FileNotFoundError if non-existent, we need
        # to catch it.
        io.read_file(Path(base_path) / 'ria-layout-version')
    except (FileNotFoundError, RIARemoteError,
            RemoteCommandFailedError) as e:
        if not new_store_ok:
            # we're instructed to only act in case of an existing RIA store
            res = get_status_dict(
                status='error',
                message="No store found at '{}'. Forgot "
                        "--new-store-ok?".format(Path(base_path)),
                **res_kwargs)
            yield res
            return
        log_progress(
            lgr.info, 'create-sibling-ria',
            'Creating a new RIA store at %s', Path(base_path),
        )
        create_store(io, Path(base_path), '1')

    yield from _create_sibling_ria(
        ds,
        url,
        push_url,
        name,
        storage_sibling,
        storage_name,
        alias,
        existing,
        shared,
        group,
        post_update_hook,
        trust_level,
        res_kwargs)

    if recursive:
        # Note: subdatasets can be treated independently, so go full
        # recursion when querying for them and _no_ recursion with the
        # actual call. Theoretically this can be parallelized.
        for subds in ds.subdatasets(state='present',
                                    recursive=True,
                                    recursion_limit=recursion_limit,
                                    return_type='generator',
                                    result_renderer='disabled',
                                    result_xfm='datasets'):
            yield from _create_sibling_ria(
                subds,
                url,
                push_url,
                name,
                storage_sibling,
                storage_name,
                None,  # subdatasets can't have the same alias as the parent
                existing,
                shared,
                group,
                post_update_hook,
                trust_level,
                res_kwargs)