def test_get_flexible_source_candidates():
    f = _get_flexible_source_candidates
    # for http and https (dummy transport) we should get /.git source added
    eq_(f('http://e.c'), ['http://e.c', 'http://e.c/.git'])
    eq_(f('http://e.c/s/p'), ['http://e.c/s/p', 'http://e.c/s/p/.git'])
    # for these, the candidates should be just the original address, since
    # git understands them just fine
    for s in ('http://e.c/.git',
              '/',
              'relative/path',
              'smallrelative',
              './neighbor',
              '../../look/into/parent/bedroom',
              'p:somewhere',
              'user@host:/full/path',
              ):
        eq_(f(s), [s])

    # now a few relative ones
    eq_(f('../r', '.'), ['../r'])
    eq_(f('../r', 'ssh://host/path'), ['ssh://host/r'])
    eq_(f('sub', 'ssh://host/path'), ['ssh://host/path/sub'])
    eq_(f('../r', 'http://e.c/p'), ['http://e.c/r', 'http://e.c/r/.git'])
    eq_(f('sub', 'http://e.c/p'),
        ['http://e.c/p/sub', 'http://e.c/p/sub/.git'])

    # tricky ones
    eq_(f('sub', 'http://e.c/p/.git'), ['http://e.c/p/sub/.git'])
    eq_(f('../s1/s2', 'http://e.c/p/.git'), ['http://e.c/s1/s2/.git'])

    # incorrect ones will stay incorrect
    eq_(f('../s1/s2', 'http://e.c/.git'), ['http://e.c/../s1/s2/.git'])

    # when the source is not relative, but base_url is specified as just the
    # destination path (not really a "base url" as the name suggests), it
    # should be ignored
    eq_(f('http://e.c/p', '/path'), ['http://e.c/p', 'http://e.c/p/.git'])
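
# For orientation, a minimal sketch of the expansion the test above
# exercises. This is NOT the actual _get_flexible_source_candidates()
# implementation; under assumptions drawn purely from the test expectations,
# it illustrates how a relative source is resolved against a base URL and how
# a '/.git' variant is appended for http(s) URLs. The '.git'-suffix
# relocation covered by the "tricky ones" above is deliberately not modeled.
import posixpath
from urllib.parse import urlparse, urlunparse


def _sketch_flexible_source_candidates(src, base_url=None):
    """Illustrative only: expand a clone source into candidate URLs."""
    if base_url and not urlparse(src).scheme and not src.startswith('/'):
        # relative source: resolve it against the base URL's path component
        parsed = urlparse(base_url)
        if parsed.scheme:
            path = posixpath.normpath(posixpath.join(parsed.path, src))
            candidates = [urlunparse(parsed._replace(path=path))]
        else:
            # base is a plain path, not a URL: keep the source as-is
            candidates = [src]
    else:
        # absolute path or full URL: git understands it directly
        candidates = [src]
    # for http(s), also try the bare-repository convention '<url>/.git'
    candidates += [
        c + '/.git' for c in candidates
        if urlparse(c).scheme in ('http', 'https') and not c.endswith('/.git')
    ]
    return candidates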
def clone_dataset(
        srcs,
        destds,
        reckless=None,
        description=None,
        result_props=None,
        cfg=None):
    """Internal helper to perform cloning without sanity checks (assumed done)

    This helper does not handle any saving of subdataset modification or
    adding in a superdataset.

    Parameters
    ----------
    srcs : list
      Any suitable clone source specifications (paths, URLs)
    destds : Dataset
      Dataset instance for the clone destination
    reckless : {None, 'auto', 'ephemeral', 'shared-...'}, optional
      Mode switch to put cloned dataset into unsafe/throw-away configurations,
      i.e. sacrifice data safety for performance or resource footprint. When
      None and `cfg` is specified, use the value of `datalad.clone.reckless`.
    description : str, optional
      Location description for the annex of the dataset clone (if there is
      any).
    result_props : dict, optional
      Default properties for any yielded result, passed on to
      get_status_dict().
    cfg : ConfigManager, optional
      Configuration for parent dataset. This will be queried instead of the
      global DataLad configuration.

    Yields
    ------
    dict
      DataLad result records
    """
    if not result_props:
        # in case the caller had no specific idea on how results should look
        # like, provide sensible defaults
        result_props = dict(
            action='install',
            logger=lgr,
            ds=destds,
        )

    if reckless is None and cfg:
        # if reckless is not explicitly given, but we operate on a
        # superdataset, query whether it has been instructed to operate in a
        # reckless mode, and inherit it for the coming clone
        reckless = cfg.get('datalad.clone.reckless', None)

    dest_path = destds.pathobj

    # decode all source candidate specifications
    candidate_sources = [decode_source_spec(s, cfg=cfg) for s in srcs]

    # now expand the candidate sources with additional variants of the decoded
    # giturl, while duplicating the other properties in the additional records
    # for simplicity. The hope is to overcome a few corner cases and be more
    # robust than git clone
    candidate_sources = [
        dict(props, giturl=s) for props in candidate_sources
        for s in _get_flexible_source_candidates(props['giturl'])
    ]

    # important test! based on this, `rmtree` will happen below after a
    # failed clone
    dest_path_existed = dest_path.exists()
    if dest_path_existed and any(dest_path.iterdir()):
        if destds.is_installed():
            # check if dest was cloned from the given source before;
            # this is where it was actually installed from
            track_name, track_url = _get_tracking_source(destds)
            try:
                # this will get us track_url in system native path
                # conventions, whenever it is a path (and not a URL).
                # this is needed to match it to any potentially incoming local
                # source path in the 'notneeded' test below
                track_path = str(Path(track_url))
            except Exception:
                # this should never happen, because Path() will let any
                # non-path stringification pass through unmodified, but we do
                # not want any potential crash due to pathlib behavior changes
                lgr.debug("Unexpected behavior of pathlib!")
                track_path = None
            for cand in candidate_sources:
                src = cand['giturl']
                if track_url == src \
                        or (not is_url(track_url)
                            and get_local_file_url(
                                track_url, compatibility='git') == src) \
                        or track_path == expanduser(src):
                    yield get_status_dict(
                        status='notneeded',
                        message=("dataset %s was already cloned from '%s'",
                                 destds, src),
                        **result_props)
                    return
        # anything else is an error
        yield get_status_dict(
            status='error',
            message='target path already exists and not empty, refuse to '
                    'clone into target path',
            **result_props)
        return

    log_progress(
        lgr.info,
        'cloneds',
        'Cloning dataset to %s', destds,
        total=len(candidate_sources),
        label='Clone attempt',
        unit=' Candidate locations',
    )
    # accumulate all error messages, keyed by each attempted URL
    error_msgs = OrderedDict()
    for cand in candidate_sources:
        log_progress(
            lgr.info,
            'cloneds',
            'Attempting to clone from %s to %s', cand['giturl'], dest_path,
            update=1,
            increment=True)
        clone_opts = {}

        if cand.get('version', None):
            clone_opts['branch'] = cand['version']
        try:
            # TODO for now GitRepo.clone() cannot handle Path instances, and
            # PY35 doesn't make it happen seamlessly
            GitRepo.clone(
                path=str(dest_path),
                url=cand['giturl'],
                clone_options=clone_opts,
                create=True)
        except CommandError as e:
            e_stderr = e.stderr
            error_msgs[cand['giturl']] = e
            lgr.debug("Failed to clone from URL: %s (%s)",
                      cand['giturl'], exc_str(e))
            if dest_path.exists():
                lgr.debug("Wiping out unsuccessful clone attempt at: %s",
                          dest_path)
                # We must not just rmtree since it might be curdir etc,
                # we should remove all files/directories under it
                # TODO stringification can be removed once pathlib compatible
                # or if PY35 is no longer supported
                rmtree(str(dest_path), children_only=dest_path_existed)

            if e_stderr and 'could not create work tree' in e_stderr.lower():
                # this cannot be fixed by trying another URL
                re_match = re.match(r".*fatal: (.*)$", e_stderr,
                                    flags=re.MULTILINE | re.DOTALL)
                # cancel progress bar
                log_progress(
                    lgr.info,
                    'cloneds',
                    'Completed clone attempts for %s', destds
                )
                yield get_status_dict(
                    status='error',
                    message=re_match.group(1).strip()
                    if re_match else "stderr: " + e_stderr,
                    **result_props)
                return
            # next candidate
            continue

        result_props['source'] = cand
        # do not bother with other sources if succeeded
        break

    log_progress(
        lgr.info,
        'cloneds',
        'Completed clone attempts for %s', destds
    )

    if not destds.is_installed():
        if len(error_msgs):
            if all(not e.stdout and not e.stderr
                   for e in error_msgs.values()):
                # there is nothing we can learn from the actual exception,
                # the exit code is uninformative, the command is predictable
                error_msg = "Failed to clone from all attempted sources: %s"
                error_args = list(error_msgs.keys())
            else:
                error_msg = "Failed to clone from any candidate source URL. " \
                            "Encountered errors per each url were:\n- %s"
                error_args = '\n- '.join(
                    '{}\n  {}'.format(url, exc_str(exc))
                    for url, exc in error_msgs.items()
                )
        else:
            # yoh: Not sure if we ever get here but I felt that there could
            # be a case when this might happen and original error would
            # not be sufficient to troubleshoot what is going on.
            error_msg = "Awkward error -- we failed to clone properly. " \
                        "Although no errors were encountered, target " \
                        "dataset at %s seems to be not fully installed. " \
                        "The 'successful' source was: %s"
            error_args = (destds.path, cand['giturl'])
        yield get_status_dict(
            status='error',
            message=(error_msg, error_args),
            **result_props)
        return

    if not cand.get("version"):
        postclone_check_head(destds)

    # act on --reckless=shared-...
    # must happen prior git-annex-init, where we can cheaply alter the repo
    # setup through safe re-init'ing
    if reckless and reckless.startswith('shared-'):
        lgr.debug('Reinit %s to enable shared access permissions', destds)
        destds.repo.call_git(['init', '--shared={}'.format(reckless[7:])])

    yield from postclonecfg_annexdataset(
        destds,
        reckless,
        description)

    # perform any post-processing that needs to know details of the clone
    # source
    if result_props['source']['type'] == 'ria':
        yield from postclonecfg_ria(destds, result_props['source'])

    if reckless:
        # store the reckless setting in the dataset to make it
        # known to later clones of subdatasets via get()
        destds.config.set(
            'datalad.clone.reckless', reckless,
            where='local',
            reload=True)

    # yield successful clone of the base dataset now, as any possible
    # subdataset clone down below will not alter the Git-state of the
    # parent
    yield get_status_dict(status='ok', **result_props)
def _get_flexible_source_candidates_for_submodule(ds, sm):
    """Assemble candidate locations from where to clone a submodule

    The following location candidates are considered. For each candidate a
    cost is given in parenthesis; lower values indicate higher priority:

    - URL of any configured superdataset remote that is known to have the
      desired submodule commit, with the submodule path appended to it.
      There can be more than one candidate (cost 500).

    - A URL or absolute path recorded in `.gitmodules` (cost 600).

    - In case `.gitmodules` contains a relative path instead of a URL, the
      URL of any configured superdataset remote that is known to have the
      desired submodule commit, with this relative path appended to it.
      There can be more than one candidate (cost 500).

    - In case `.gitmodules` contains a relative path as a URL, the absolute
      path of the superdataset, appended with this relative path (cost 900).

    Additional candidate URLs can be generated based on templates specified
    as configuration variables with the pattern
    `datalad.get.subdataset-source-candidate-<name>`, where `name` is an
    arbitrary identifier. If `name` starts with three digits (e.g.
    '400myserver') these will be interpreted as a cost, and the respective
    candidate will be sorted into the generated candidate list according to
    this cost. If no cost is given, a default of 700 is used.

    A template string assigned to such a variable can utilize the Python
    format mini language and may reference a number of properties that are
    inferred from the parent dataset's knowledge about the target subdataset.
    Properties include any submodule property specified in the respective
    `.gitmodules` record. For convenience, an existing `datalad-id` record is
    made available under the shortened name `id`.

    Additionally, the URL of any configured remote that contains the
    respective submodule commit is available as `remoteurl-<name>`
    properties, where `name` is the configured remote name.

    Lastly, all candidates are sorted according to their cost (lower values
    first), and duplicate URLs are stripped, while preserving the first item
    in the candidate list.

    Parameters
    ----------
    ds : Dataset
      Parent dataset of to-be-installed subdataset.
    sm : dict
      Submodule record as produced by `subdatasets()`.

    Returns
    -------
    list of dict
      Where each dict has keys 'cost' (int), 'name' (str), 'url' (str).
      Names are not unique and either derived from the name of the respective
      remote, template configuration variable, or 'local'.
    """
    # short cuts
    ds_repo = ds.repo
    sm_url = sm.get('gitmodule_url', None)
    sm_path = op.relpath(sm['path'], start=sm['parentds'])

    clone_urls = []

    # CANDIDATE: tracking remote of the current branch
    tracking_remote, tracking_branch = ds_repo.get_tracking_branch()
    candidate_remotes = [tracking_remote] if tracking_remote else []

    # if we have a remote, let's check the location of that remote
    # for the presence of the desired submodule
    last_commit = ds_repo.get_last_commit_hexsha(sm_path)
    if last_commit:
        # CANDIDATE: any remote that has the commit when the submodule was
        # last modified
        # ideally should also give preference to the remotes which have
        # the same branch checked out I guess
        candidate_remotes += list(
            _get_remotes_having_commit(ds_repo, last_commit))

    # prepare a dict to generate URL candidates from templates
    sm_candidate_props = {
        k[10:].replace('datalad-id', 'id'): v
        for k, v in sm.items()
        if k.startswith('gitmodule_')
    }

    for remote in unique(candidate_remotes):
        remote_url = ds_repo.get_remote_url(remote, push=False)

        # Directly on parent's ds url
        if remote_url:
            # make remotes and their URLs available to template rendering
            sm_candidate_props['remoteurl-{}'.format(remote)] = remote_url

            # attempt: submodule checkout at parent remote URL
            # We might need to quote sm_path portion, e.g. for spaces etc
            if isinstance(RI(remote_url), URL):
                sm_path_url = urlquote(sm_path)
            else:
                sm_path_url = sm_path

            clone_urls.extend(
                dict(cost=500, name=remote, url=url)
                for url in _get_flexible_source_candidates(
                    # alternate suffixes are tested by `clone` anyways
                    sm_path_url, remote_url, alternate_suffix=False))

            # attempt: provided (configured?) submodule URL
            # TODO: consider supporting DataLadRI here? or would confuse
            # git and we wouldn't want that (i.e. not allow pure git clone
            # --recursive)
            if sm_url:
                clone_urls.extend(
                    dict(cost=600, name=remote, url=url)
                    for url in _get_flexible_source_candidates(
                        sm_url, remote_url, alternate_suffix=False))

    cost_candidate_expr = re.compile('[0-9][0-9][0-9].*')
    candcfg_prefix = 'datalad.get.subdataset-source-candidate-'
    for name, tmpl in [(c[len(candcfg_prefix):], ds_repo.config[c])
                       for c in ds_repo.config.keys()
                       if c.startswith(candcfg_prefix)]:
        url = tmpl.format(**sm_candidate_props)
        # we don't want "flexible_source_candidates" here, this is
        # configuration that can be made arbitrarily precise from the
        # outside. Additional guesswork can only make it slower
        has_cost = cost_candidate_expr.match(name) is not None
        clone_urls.append(
            # assign a default cost, if a config doesn't have one
            dict(
                cost=int(name[:3]) if has_cost else 700,
                name=name[3:] if has_cost else name,
                url=url,
                from_config=True,
            ))

    # CANDIDATE: the actual configured gitmodule URL
    if sm_url:
        clone_urls.extend(
            dict(cost=900, name='local', url=url)
            for url in _get_flexible_source_candidates(
                sm_url, ds.path, alternate_suffix=False)
            # avoid inclusion of submodule location itself
            if url != sm['path'])

    # sort all candidates by their cost, thereby allowing a candidate
    # provided by configuration to purposefully sort before or after the
    # automatically generated candidates
    clone_urls = sorted(clone_urls, key=lambda x: x['cost'])

    # take out any duplicate source candidates
    # unique() takes out the duplicated at the tail end
    clone_urls = unique(clone_urls, lambda x: x['url'])

    return clone_urls