def _get_flexible_source_candidates_for_submodule(ds, sm_path, sm_url=None):
    """Retrieve candidate locations from where to install the submodule

    Even when a URL for the submodule is provided explicitly, URLs derived
    from the parent dataset's tracking-branch remote are tried first.

    Parameters
    ----------
    ds : Dataset
      Parent dataset of the to-be-installed submodule.
    sm_path : str
      Path of the submodule within the parent dataset.
    sm_url : str, optional
      URL recorded for the submodule (e.g. from .gitmodules).

    Returns
    -------
    list
      Unique candidate URLs, in order of preference.
    """
    candidates = []

    # the tracking remote of the current branch is our first candidate
    tracking_remote, _ = ds.repo.get_tracking_branch()
    candidate_remotes = [tracking_remote] if tracking_remote else []

    # if we have a remote, check whether it carries the commit that last
    # touched the submodule -- any remote that does is a candidate too
    try:
        last_commit = next(ds.repo._get_files_history(sm_path)).hexsha
        # ideally should also give preference to the remotes which have
        # the same branch checked out I guess
        candidate_remotes += list(
            ds.repo._get_remotes_having_commit(last_commit))
    except StopIteration:
        # no commit for it known yet, ... oh well
        pass

    for remote in unique(candidate_remotes):
        remote_url = ds.repo.get_remote_url(remote, push=False)
        if not remote_url:
            # remote without a fetch URL cannot contribute candidates
            continue
        # attempt: submodule checkout at parent remote URL.
        # The sm_path portion may need quoting (spaces etc), but only for
        # genuine URLs -- plain paths are used verbatim
        quoted_path = (
            urlquote(sm_path)
            if isinstance(RI(remote_url), URL)
            else sm_path)
        candidates.extend(
            _get_flexible_source_candidates(
                # alternate suffixes are tested by `clone` anyways
                quoted_path, remote_url, alternate_suffix=False))
        # attempt: provided (configured?) submodule URL
        # TODO: consider supporting DataLadRI here? or would confuse
        # git and we wouldn't want that (i.e. not allow pure git clone
        # --recursive)
        if sm_url:
            candidates.extend(
                _get_flexible_source_candidates(
                    sm_url, remote_url, alternate_suffix=False))

    # Do based on the ds.path as the last resort
    if sm_url:
        candidates.extend(
            _get_flexible_source_candidates(
                sm_url, ds.path, alternate_suffix=False))

    return unique(candidates)
def get_key_url(e, schema='http', versioned=True):
    """Generate an s3:// or http:// url given a key

    Parameters
    ----------
    e : object
      S3 key-like object with `.name`, `.bucket.name`, and `.version_id`
      attributes (a `name_urlquoted` attribute is set on it as a side
      effect, so the format template can reference it).
    schema : {'http', 's3'}
      URL schema to generate.
    versioned : bool
      If True, append a `?versionId=...` suffix -- but only when a
      version id is actually known.

    Returns
    -------
    str

    Raises
    ------
    ValueError
      If `schema` is neither 'http' nor 's3'.
    """
    e.name_urlquoted = urlquote(e.name)
    if schema == 'http':
        fmt = "http://{e.bucket.name}.s3.amazonaws.com/{e.name_urlquoted}"
    elif schema == 's3':
        fmt = "s3://{e.bucket.name}/{e.name_urlquoted}"
    else:
        raise ValueError(schema)
    # BUG FIX: previously a versioned URL was produced even when
    # e.version_id was None, yielding a bogus literal "?versionId=None"
    if versioned and e.version_id is not None:
        fmt += "?versionId={e.version_id}"
    return fmt.format(e=e)
def get_key_url(e, schema='http', versioned=True):
    """Generate an s3:// or http:// url given a key

    If a versioned URL is requested but `e.version_id` is None, no
    versionId suffix is added.

    Raises
    ------
    ValueError
      If `schema` is neither 'http' nor 's3'.
    """
    # TODO: here we would need to encode the name since urlquote actually
    # can't do that on its own... but then we should get a copy of the thing
    # so we could still do the .format....
    # ... = e.name.encode('utf-8')  # unicode isn't advised in URLs
    e.name_urlquoted = urlquote(e.name)
    if schema == 'http':
        fmt = "http://{e.bucket.name}.s3.amazonaws.com/{e.name_urlquoted}"
    elif schema == 's3':
        fmt = "s3://{e.bucket.name}/{e.name_urlquoted}"
    else:
        raise ValueError(schema)
    # BUG FIX: guard against version_id being None -- previously the URL
    # ended up with a bogus literal "?versionId=None"
    if versioned and e.version_id is not None:
        fmt += "?versionId={e.version_id}"
    return fmt.format(e=e)
def get_key_url(e, schema='http', versioned=True):
    """Generate an s3:// or http:// url given a key

    if versioned url is requested but version_id is None, no versionId
    suffix will be added

    Raises
    ------
    ValueError
      If `schema` is not one of the supported schemas.
    """
    # TODO: here we would need to encode the name since urlquote actually
    # can't do that on its own... but then we should get a copy of the thing
    # so we could still do the .format....
    # ... = e.name.encode('utf-8')  # unicode isn't advised in URLs
    e.name_urlquoted = urlquote(e.name)
    # dispatch on schema via a lookup table instead of an if/elif chain
    templates = {
        'http': "http://{e.bucket.name}.s3.amazonaws.com/{e.name_urlquoted}",
        's3': "s3://{e.bucket.name}/{e.name_urlquoted}",
    }
    if schema not in templates:
        raise ValueError(schema)
    fmt = templates[schema]
    # only append the suffix when a version id is actually known
    if versioned and e.version_id is not None:
        fmt += "?versionId={e.version_id}"
    return fmt.format(e=e)
def test_get_local_file_url():
    """Check path -> file:// URL conversion on both platforms.

    BUG FIX: the original spelled the cases as
    ``(relpath_cases) + (win_cases) if on_windows else (posix_cases)``.
    The conditional expression binds looser than ``+``, so on non-Windows
    the relative-path case was silently dropped, even though the ``else``
    branch below exists specifically to handle relative paths. The extra
    parentheses around the platform-dependent tuple restore the intent.
    """
    for path, url in (
        # relpaths are special-cased below
        ('test.txt', 'test.txt'),
    ) + ((
        ('C:\\Windows\\notepad.exe', 'file://C/Windows/notepad.exe'),
    ) if on_windows else (
        (OBSCURE_FILENAME, urlquote(OBSCURE_FILENAME)),
        ('/a', 'file:///a'),
        ('/a/b/c', 'file:///a/b/c'),
        ('/a~', 'file:///a~'),
        # there are no files with trailing slashes in the name
        #('/a b/', 'file:///a%20b/'),
        ('/a b/name', 'file:///a%20b/name'),
    )):
        # Yarik found no better way to trigger. .decode() isn't enough
        print("D: %s" % path)
        if isabs(path):
            eq_(get_local_file_url(path), url)
        else:
            # relative paths are resolved against the current directory
            eq_(get_local_file_url(path),
                '/'.join((get_local_file_url(os.getcwd()), url)))
def test_target_ssh_simple(origin, src_path, target_rootpath):
    """End-to-end check of `create_sibling` against a local ssh target.

    Exercises: initial sibling creation, failure on an existing target
    without replace, `existing='replace'`, explicit target/push URLs, and
    `existing='reconfigure'` (verified via file digests/mtimes).
    NOTE(review): relies on a working `ssh://localhost` setup -- confirm
    in CI environment.
    """
    # prepare src
    source = install(src_path, source=origin, result_xfm='datasets',
                     return_type='item-or-list')

    target_path = opj(target_rootpath, "basic")
    with swallow_logs(new_level=logging.ERROR) as cml:
        create_sibling(
            dataset=source,
            name="local_target",
            sshurl="ssh://localhost:22",
            target_dir=target_path,
            ui=True)
        assert_not_in('enableremote local_target failed', cml.out)

    GitRepo(target_path, create=False)  # raises if not a git repo
    assert_in("local_target", source.repo.get_remotes())
    # Both must be annex or git repositories
    src_is_annex = AnnexRepo.is_valid_repo(src_path)
    eq_(src_is_annex, AnnexRepo.is_valid_repo(target_path))
    # And target one should be known to have a known UUID within the source if annex
    if src_is_annex:
        annex = AnnexRepo(src_path)
        local_target_cfg = annex.repo.remotes["local_target"].config_reader.get
        # basic config in place
        eq_(local_target_cfg('annex-ignore'), 'false')
        ok_(local_target_cfg('annex-uuid'))

    # do it again without force, but use a different name to avoid initial checks
    # for existing remotes:
    with assert_raises(RuntimeError) as cm:
        assert_create_sshwebserver(
            dataset=source,
            name="local_target_alt",
            sshurl="ssh://localhost",
            target_dir=target_path)
    ok_(
        text_type(cm.exception).startswith(
            "Target path %s already exists. And it fails to rmdir"
            % target_path))
    if src_is_annex:
        target_description = AnnexRepo(target_path,
                                       create=False).get_description()
        assert_not_equal(target_description, None)
        assert_not_equal(target_description, target_path)
        # on yoh's laptop TMPDIR is under HOME, so things start to become
        # tricky since then target_path is shortened and we would need to know
        # remote $HOME. To not over-complicate and still test, test only for
        # the basename of the target_path
        ok_endswith(target_description, basename(target_path))
    # now, with force and correct url, which is also used to determine
    # target_dir
    # Note: on windows absolute path is not url conform. But this way it's easy
    # to test, that ssh path is correctly used.
    if not on_windows:
        # add random file under target_path, to explicitly test existing=replace
        # NOTE(review): the file handle is not closed explicitly here
        open(opj(target_path, 'random'), 'w').write('123')

        assert_create_sshwebserver(
            dataset=source,
            name="local_target",
            sshurl="ssh://localhost" + target_path,
            publish_by_default='master',
            existing='replace')
        eq_("ssh://localhost" + urlquote(target_path),
            source.repo.get_remote_url("local_target"))
        ok_(source.repo.get_remote_url("local_target", push=True) is None)

        # ensure target tree actually replaced by source
        assert_false(exists(opj(target_path, 'random')))

        if src_is_annex:
            annex = AnnexRepo(src_path)
            local_target_cfg = annex.repo.remotes[
                "local_target"].config_reader.get
            eq_(local_target_cfg('annex-ignore'), 'false')
            eq_(local_target_cfg('annex-uuid').count('-'), 4)  # valid uuid
            # should be added too, even if URL matches prior state
            eq_(local_target_cfg('push'), 'master')

        # again, by explicitly passing urls. Since we are on localhost, the
        # local path should work:
        cpkwargs = dict(
            dataset=source,
            name="local_target",
            sshurl="ssh://localhost",
            target_dir=target_path,
            target_url=target_path,
            target_pushurl="ssh://localhost" + target_path,
            ui=True,
        )
        assert_create_sshwebserver(existing='replace', **cpkwargs)
        if src_is_annex:
            target_description = AnnexRepo(target_path,
                                           create=False).get_description()
            eq_(target_description, target_path)

        eq_(target_path,
            source.repo.get_remote_url("local_target"))
        eq_("ssh://localhost" + target_path,
            source.repo.get_remote_url("local_target", push=True))

        _test_correct_publish(target_path)

        # now, push should work:
        publish(dataset=source, to="local_target")

        # and we should be able to 'reconfigure'
        def process_digests_mtimes(digests, mtimes):
            # Strip entries that legitimately differ between runs (hook logs,
            # metadata, annex leftovers) so digests/mtimes can be compared.
            # it should have triggered a hook, which would have created log and metadata files
            check_metadata = False
            for part in 'logs', 'metadata':
                metafiles = [
                    k for k in digests
                    if k.startswith(_path_('.git/datalad/%s/' % part))
                ]
                # This is in effect ONLY if we have "compatible" datalad installed on remote
                # end. ATM we don't have easy way to guarantee that AFAIK (yoh),
                # so let's not check/enforce (TODO)
                # assert(len(metafiles) >= 1)
                # we might have 2 logs if timestamps do not collide ;)
                # Let's actually do it to some degree
                if part == 'logs':
                    # always should have those:
                    assert (len(metafiles) >= 1)
                    with open(opj(target_path, metafiles[0])) as f:
                        if 'no datalad found' not in f.read():
                            check_metadata = True
                if part == 'metadata':
                    eq_(len(metafiles), bool(check_metadata))
                for f in metafiles:
                    digests.pop(f)
                    mtimes.pop(f)
            # and just pop some leftovers from annex
            for f in list(digests):
                if f.startswith('.git/annex/mergedrefs'):
                    digests.pop(f)
                    mtimes.pop(f)

        orig_digests, orig_mtimes = get_mtimes_and_digests(target_path)
        process_digests_mtimes(orig_digests, orig_mtimes)

        import time
        time.sleep(0.1)  # just so that mtimes change
        assert_create_sshwebserver(existing='reconfigure', **cpkwargs)
        digests, mtimes = get_mtimes_and_digests(target_path)
        process_digests_mtimes(digests, mtimes)

        assert_dict_equal(orig_digests, digests)  # nothing should change in terms of content

        # but some files should have been modified
        modified_files = {
            k for k in mtimes
            if orig_mtimes.get(k, 0) != mtimes.get(k, 0)
        }
        # collect which files were expected to be modified without incurring any changes
        ok_modified_files = {
            _path_('.git/hooks/post-update'),
            'index.html',
            # files which hook would manage to generate
            _path_('.git/info/refs'),
            '.git/objects/info/packs'
        }
        # on elderly git we don't change receive setting
        ok_modified_files.add(_path_('.git/config'))
        ok_modified_files.update(
            {f for f in digests if f.startswith(_path_('.git/datalad/web'))})
        # it seems that with some recent git behavior has changed a bit
        # and index might get touched
        if _path_('.git/index') in modified_files:
            ok_modified_files.add(_path_('.git/index'))
        assert_set_equal(modified_files, ok_modified_files)
def _get_flexible_source_candidates_for_submodule(ds, sm):
    """Assemble candidate locations from where to clone a submodule

    The following locations candidates are considered. For each candidate a
    cost is given in parenthesis, lower values indicate higher cost:

    - URL of any configured superdataset remote that is known to have the
      desired submodule commit, with the submodule path appended to it.
      There can be more than one candidate (cost 500).

    - A URL or absolute path recorded in `.gitmodules` (cost 600).

    - In case `.gitmodules` contains a relative path instead of a URL, the
      URL of any configured superdataset remote that is known to have the
      desired submodule commit, with this relative path appended to it.
      There can be more than one candidate (cost 500).

    - In case `.gitmodules` contains a relative path as a URL, the absolute
      path of the superdataset, appended with this relative path (cost 900).

    Additional candidate URLs can be generated based on templates specified
    as configuration variables with the pattern

      `datalad.get.subdataset-source-candidate-<name>`

    where `name` is an arbitrary identifier. If name starts with three digits
    (e.g. '400myserver') these will be interpreted as a cost, and the
    respective candidate will be sorted into the generated candidate list
    according to this cost. If no cost is given, a default of 700 is used.

    A template string assigned to such a variable can utilize the Python
    format mini language and may reference a number of properties that are
    inferred from the parent dataset's knowledge about the target
    subdataset. Properties include any submodule property specified in the
    respective `.gitmodules` record. For convenience, an existing
    `datalad-id` record is made available under the shortened name `id`.

    Additionally, the URL of any configured remote that contains the
    respective submodule commit is available as `remote-<name>` properties,
    where `name` is the configured remote name.

    Lastly, all candidates are sorted according to their cost (lower values
    first), and duplicate URLs are stripped, while preserving the first item
    in the candidate list.

    Parameters
    ----------
    ds : Dataset
      Parent dataset of to-be-installed subdataset.
    sm : dict
      Submodule record as produced by `subdatasets()`.

    Returns
    -------
    list of dict
      Where each dict has keys 'cost' (int), 'name' (str), 'url' (str).
      Names are not unique and either derived from the name of the
      respective remote, template configuration variable, or 'local'.
    """
    # short cuts
    ds_repo = ds.repo
    sm_url = sm.get('gitmodule_url', None)
    sm_path = op.relpath(sm['path'], start=sm['parentds'])

    clone_urls = []

    # CANDIDATE: tracking remote of the current branch
    tracking_remote, tracking_branch = ds_repo.get_tracking_branch()
    candidate_remotes = [tracking_remote] if tracking_remote else []

    # if we have a remote, let's check the location of that remote
    # for the presence of the desired submodule
    last_commit = ds_repo.get_last_commit_hexsha(sm_path)
    if last_commit:
        # CANDIDATE: any remote that has the commit when the submodule was
        # last modified
        # ideally should also give preference to the remotes which have
        # the same branch checked out I guess
        candidate_remotes += list(
            _get_remotes_having_commit(ds_repo, last_commit))

    # prepare a dict to generate URL candidates from templates.
    # Strips the 'gitmodule_' prefix (10 chars) from each submodule
    # property key; 'datalad-id' is shortened to 'id' for convenience.
    sm_candidate_props = {
        k[10:].replace('datalad-id', 'id'): v
        for k, v in sm.items()
        if k.startswith('gitmodule_')
    }

    for remote in unique(candidate_remotes):
        remote_url = ds_repo.get_remote_url(remote, push=False)
        # Directly on parent's ds url
        if remote_url:
            # make remotes and their URLs available to template rendering
            sm_candidate_props['remoteurl-{}'.format(remote)] = remote_url
            # attempt: submodule checkout at parent remote URL
            # We might need to quote sm_path portion, e.g. for spaces etc
            if isinstance(RI(remote_url), URL):
                sm_path_url = urlquote(sm_path)
            else:
                sm_path_url = sm_path

            clone_urls.extend(
                dict(cost=500, name=remote, url=url)
                for url in _get_flexible_source_candidates(
                    # alternate suffixes are tested by `clone` anyways
                    sm_path_url, remote_url, alternate_suffix=False))

            # attempt: provided (configured?) submodule URL
            # TODO: consider supporting DataLadRI here? or would confuse
            # git and we wouldn't want that (i.e. not allow pure git clone
            # --recursive)
            if sm_url:
                clone_urls.extend(
                    dict(cost=600, name=remote, url=url)
                    for url in _get_flexible_source_candidates(
                        sm_url, remote_url, alternate_suffix=False))

    # matches a candidate name that starts with a three-digit cost prefix
    cost_candidate_expr = re.compile('[0-9][0-9][0-9].*')
    candcfg_prefix = 'datalad.get.subdataset-source-candidate-'
    for name, tmpl in [(c[len(candcfg_prefix):], ds_repo.config[c])
                       for c in ds_repo.config.keys()
                       if c.startswith(candcfg_prefix)]:
        # NOTE(review): tmpl.format raises KeyError if the template
        # references a property not present in sm_candidate_props
        url = tmpl.format(**sm_candidate_props)
        # we don't want "flexible_source_candidates" here, this is
        # configuration that can be made arbitrarily precise from the
        # outside. Additional guesswork can only make it slower
        has_cost = cost_candidate_expr.match(name) is not None
        clone_urls.append(
            # assign a default cost, if a config doesn't have one
            dict(
                cost=int(name[:3]) if has_cost else 700,
                name=name[3:] if has_cost else name,
                url=url,
                from_config=True,
            ))

    # CANDIDATE: the actual configured gitmodule URL
    if sm_url:
        clone_urls.extend(
            dict(cost=900, name='local', url=url)
            for url in _get_flexible_source_candidates(
                sm_url, ds.path, alternate_suffix=False)
            # avoid inclusion of submodule location itself
            if url != sm['path'])

    # sort all candidates by their label, thereby allowing a
    # candidate provided by configuration to purposefully
    # sort before or after automatically generated configuration
    clone_urls = sorted(clone_urls, key=lambda x: x['cost'])
    # take out any duplicate source candidates
    # unique() takes out the duplicated at the tail end
    clone_urls = unique(clone_urls, lambda x: x['url'])
    return clone_urls
def test_target_ssh_simple(origin, src_path, target_rootpath):
    """End-to-end check of `create_sibling` against a local ssh target.

    Covers initial creation, failure on an existing target dir, replacement
    via `existing='replace'`, explicit target/push URLs, publishing, and
    `existing='reconfigure'` (verified by comparing file digests/mtimes).
    NOTE(review): relies on a working `ssh://localhost` setup -- confirm
    in CI environment.
    """
    # prepare src
    source = install(
        src_path, source=origin,
        result_xfm='datasets', return_type='item-or-list')

    target_path = opj(target_rootpath, "basic")
    with swallow_logs(new_level=logging.ERROR) as cml:
        create_sibling(
            dataset=source,
            name="local_target",
            sshurl="ssh://localhost",
            target_dir=target_path,
            ui=True)
        assert_not_in('enableremote local_target failed', cml.out)

    GitRepo(target_path, create=False)  # raises if not a git repo
    assert_in("local_target", source.repo.get_remotes())
    # Both must be annex or git repositories
    src_is_annex = AnnexRepo.is_valid_repo(src_path)
    eq_(src_is_annex, AnnexRepo.is_valid_repo(target_path))
    # And target one should be known to have a known UUID within the source if annex
    if src_is_annex:
        annex = AnnexRepo(src_path)
        local_target_cfg = annex.repo.remotes["local_target"].config_reader.get
        # basic config in place
        eq_(local_target_cfg('annex-ignore'), 'false')
        ok_(local_target_cfg('annex-uuid'))

    # do it again without force, but use a different name to avoid initial checks
    # for existing remotes:
    with assert_raises(RuntimeError) as cm:
        assert_create_sshwebserver(
            dataset=source,
            name="local_target_alt",
            sshurl="ssh://localhost",
            target_dir=target_path)
    ok_(text_type(cm.exception).startswith(
        "Target path %s already exists. And it fails to rmdir"
        % target_path))
    if src_is_annex:
        target_description = AnnexRepo(target_path,
                                       create=False).get_description()
        assert_not_equal(target_description, None)
        assert_not_equal(target_description, target_path)
        # on yoh's laptop TMPDIR is under HOME, so things start to become
        # tricky since then target_path is shortened and we would need to know
        # remote $HOME. To not over-complicate and still test, test only for
        # the basename of the target_path
        ok_endswith(target_description, basename(target_path))
    # now, with force and correct url, which is also used to determine
    # target_dir
    # Note: on windows absolute path is not url conform. But this way it's easy
    # to test, that ssh path is correctly used.
    if not on_windows:
        # add random file under target_path, to explicitly test existing=replace
        # NOTE(review): the file handle is not closed explicitly here
        open(opj(target_path, 'random'), 'w').write('123')

        assert_create_sshwebserver(
            dataset=source,
            name="local_target",
            sshurl="ssh://localhost" + target_path,
            publish_by_default='master',
            existing='replace')
        eq_("ssh://localhost" + urlquote(target_path),
            source.repo.get_remote_url("local_target"))
        ok_(source.repo.get_remote_url("local_target", push=True) is None)

        # ensure target tree actually replaced by source
        assert_false(exists(opj(target_path, 'random')))

        if src_is_annex:
            annex = AnnexRepo(src_path)
            local_target_cfg = annex.repo.remotes["local_target"].config_reader.get
            eq_(local_target_cfg('annex-ignore'), 'false')
            eq_(local_target_cfg('annex-uuid').count('-'), 4)  # valid uuid
            # should be added too, even if URL matches prior state
            eq_(local_target_cfg('push'), 'master')

        # again, by explicitly passing urls. Since we are on localhost, the
        # local path should work:
        cpkwargs = dict(
            dataset=source,
            name="local_target",
            sshurl="ssh://localhost",
            target_dir=target_path,
            target_url=target_path,
            target_pushurl="ssh://localhost" + target_path,
            ui=True,
        )
        assert_create_sshwebserver(existing='replace', **cpkwargs)
        if src_is_annex:
            target_description = AnnexRepo(target_path,
                                           create=False).get_description()
            eq_(target_description, target_path)

        eq_(target_path,
            source.repo.get_remote_url("local_target"))
        eq_("ssh://localhost" + target_path,
            source.repo.get_remote_url("local_target", push=True))

        _test_correct_publish(target_path)

        # now, push should work:
        publish(dataset=source, to="local_target")

        # and we should be able to 'reconfigure'
        def process_digests_mtimes(digests, mtimes):
            # Strip entries that legitimately differ between runs (hook logs,
            # metadata, annex leftovers) so digests/mtimes can be compared.
            # it should have triggered a hook, which would have created log and metadata files
            check_metadata = False
            for part in 'logs', 'metadata':
                metafiles = [k for k in digests
                             if k.startswith(_path_('.git/datalad/%s/' % part))]
                # This is in effect ONLY if we have "compatible" datalad installed on remote
                # end. ATM we don't have easy way to guarantee that AFAIK (yoh),
                # so let's not check/enforce (TODO)
                # assert(len(metafiles) >= 1)
                # we might have 2 logs if timestamps do not collide ;)
                # Let's actually do it to some degree
                if part == 'logs':
                    # always should have those:
                    assert (len(metafiles) >= 1)
                    with open(opj(target_path, metafiles[0])) as f:
                        if 'no datalad found' not in f.read():
                            check_metadata = True
                if part == 'metadata':
                    eq_(len(metafiles), bool(check_metadata))
                for f in metafiles:
                    digests.pop(f)
                    mtimes.pop(f)
            # and just pop some leftovers from annex
            for f in list(digests):
                if f.startswith('.git/annex/mergedrefs'):
                    digests.pop(f)
                    mtimes.pop(f)

        orig_digests, orig_mtimes = get_mtimes_and_digests(target_path)
        process_digests_mtimes(orig_digests, orig_mtimes)

        import time
        time.sleep(0.1)  # just so that mtimes change
        assert_create_sshwebserver(existing='reconfigure', **cpkwargs)
        digests, mtimes = get_mtimes_and_digests(target_path)
        process_digests_mtimes(digests, mtimes)

        assert_dict_equal(orig_digests, digests)  # nothing should change in terms of content

        # but some files should have been modified
        modified_files = {k for k in mtimes
                          if orig_mtimes.get(k, 0) != mtimes.get(k, 0)}
        # collect which files were expected to be modified without incurring any changes
        ok_modified_files = {
            _path_('.git/hooks/post-update'),
            'index.html',
            # files which hook would manage to generate
            _path_('.git/info/refs'),
            '.git/objects/info/packs'
        }
        # on elderly git we don't change receive setting
        ok_modified_files.add(_path_('.git/config'))
        ok_modified_files.update({f for f in digests
                                  if f.startswith(_path_('.git/datalad/web'))})
        # it seems that with some recent git behavior has changed a bit
        # and index might get touched
        if _path_('.git/index') in modified_files:
            ok_modified_files.add(_path_('.git/index'))
        assert_set_equal(modified_files, ok_modified_files)
def _get_flexible_source_candidates_for_submodule(ds, sm):
    """Assemble candidates from where to install a submodule

    Even if a URL for submodule is provided explicitly -- first tries urls under
    parent's module tracking branch remote.

    Additional candidate URLs can be generated based on templates specified
    as configuration variables with the pattern

      `datalad.get.subdataset-source-candidate-<name>`

    where `name` is an arbitrary identifier.

    A template string assigned to such a variable can utilize the Python
    format mini language and may reference a number of properties that are
    inferred from the parent dataset's knowledge about the target
    subdataset. Properties include any submodule property specified in the
    respective .gitmodules record. For convenience, an existing
    `datalad-id` record is made available under the shortened name `id`.

    Additionally, the URL of any configured remote that contains the
    respective submodule commit is available as `remote-<name>` properties,
    where `name` is the configured remote name.

    Parameters
    ----------
    ds : Dataset
      Parent dataset of to-be-installed subdataset.
    sm : dict
      Submodule record as produced by `subdatasets()`.

    Returns
    -------
    list of tuples
      Where each tuple consists of a name and a URL. Names are not unique
      and either derived from the name of the respective remote, template
      configuration variable, or 'local' for the candidate URL that was
      obtained from the .gitmodule record.
    """
    # short cuts
    ds_repo = ds.repo
    sm_url = sm.get('gitmodule_url', None)
    sm_path = op.relpath(sm['path'], start=sm['parentds'])

    clone_urls = []

    # CANDIDATE: tracking remote of the current branch
    tracking_remote, tracking_branch = ds_repo.get_tracking_branch()
    candidate_remotes = [tracking_remote] if tracking_remote else []

    # if we have a remote, let's check the location of that remote
    # for the presence of the desired submodule
    last_commit = ds_repo.get_last_commit_hexsha(sm_path)
    if last_commit:
        # CANDIDATE: any remote that has the commit when the submodule was
        # last modified
        # ideally should also give preference to the remotes which have
        # the same branch checked out I guess
        candidate_remotes += list(
            _get_remotes_having_commit(ds_repo, last_commit))

    # prepare a dict to generate URL candidates from templates; strips the
    # 'gitmodule_' prefix (10 chars), shortening 'datalad-id' to 'id'
    sm_candidate_props = {
        k[10:].replace('datalad-id', 'id'): v
        for k, v in sm.items()
        if k.startswith('gitmodule_')
    }

    for remote in unique(candidate_remotes):
        remote_url = ds_repo.get_remote_url(remote, push=False)
        # Directly on parent's ds url
        if remote_url:
            # make remotes and their URLs available to template rendering
            sm_candidate_props['remoteurl-{}'.format(remote)] = remote_url
            # attempt: submodule checkout at parent remote URL
            # We might need to quote sm_path portion, e.g. for spaces etc
            if isinstance(RI(remote_url), URL):
                sm_path_url = urlquote(sm_path)
            else:
                sm_path_url = sm_path

            clone_urls.extend(
                (remote, url)
                for url in _get_flexible_source_candidates(
                    # alternate suffixes are tested by `clone` anyways
                    sm_path_url, remote_url, alternate_suffix=False))

            # attempt: provided (configured?) submodule URL
            # TODO: consider supporting DataLadRI here? or would confuse
            # git and we wouldn't want that (i.e. not allow pure git clone
            # --recursive)
            if sm_url:
                clone_urls.extend(
                    (remote, url)
                    for url in _get_flexible_source_candidates(
                        sm_url, remote_url, alternate_suffix=False))

    # BUG FIX: previously the candidate name was taken as c[12:], which
    # strips only 'datalad.get.' and leaves the full
    # 'subdataset-source-candidate-' prefix glued onto the name. Strip
    # the entire configuration-variable prefix instead.
    candcfg_prefix = 'datalad.get.subdataset-source-candidate-'
    for name, tmpl in [
            (c[len(candcfg_prefix):], ds_repo.config[c])
            for c in ds_repo.config.keys()
            if c.startswith(candcfg_prefix)]:
        url = tmpl.format(**sm_candidate_props)
        # we don't want "flexible_source_candidates" here, this is
        # configuration that can be made arbitrarily precise from the
        # outside. Additional guesswork can only make it slower
        clone_urls.append((name, url))

    # CANDIDATE: the actual configured gitmodule URL
    if sm_url:
        clone_urls.extend(
            ('local', url)
            for url in _get_flexible_source_candidates(
                sm_url, ds.path, alternate_suffix=False)
            # avoid inclusion of submodule location itself
            if url != sm['path'])

    return unique(clone_urls, lambda x: x[1])