def _check_and_update_remote_server_info(ds, remote):
    # if we managed to copy to an "http" url we should try to trigger the
    # git update-server-info hook on the remote, if there was an ssh annexurl
    # defined for it. Apparently we do that already in create_sibling, but
    # here we need more checks and preparation
    remote_url = ds.repo.config.get('remote.%s.url' % remote, None)
    if remote_url:
        remote_url = RI(remote_url)
        if isinstance(remote_url, URL) and remote_url.scheme in ('http', 'https'):
            remote_annexurl = ds.repo.config.get('remote.%s.annexurl' % remote,
                                                 None)
            if remote_annexurl:
                remote_annexurl_ri = RI(remote_annexurl)
                if is_ssh(remote_annexurl_ri):
                    ssh = ssh_manager.get_connection(remote_annexurl_ri)
                    ssh('git -C {} update-server-info'.format(
                        sh_quote(remote_annexurl_ri.path)))
                    return True
                else:
                    lgr.debug(
                        "There is an annexurl defined, but it is not ssh: "
                        "%s, dunno if we could/should do anything",
                        remote_annexurl)
    return False
def _check_ri(ri, cls, exact_str=True, localpath=None, **fields):
    """just a helper to carry out a few checks on urls"""
    with swallow_logs(new_level=logging.DEBUG) as cml:
        ri_ = cls(**fields)
        murl = RI(ri)
        eq_(murl.__class__, cls)  # not just a subclass
        eq_(murl, ri_)
        if isinstance(ri, str):
            eq_(str(RI(ri)), ri)
        eq_(eval(repr(ri_)), ri)  # repr leads back to identical ri_
        eq_(ri, ri_)  # just in case ;) above should fail first if smth is wrong
        if not exact_str:
            assert_in('Parsed version of', cml.out)
    (eq_ if exact_str else neq_)(ri, str(ri_))  # that we can reconstruct it EXACTLY on our examples
    # and that we have access to all those fields
    nok_(set(fields).difference(set(cls._FIELDS)))
    for f, v in fields.items():
        eq_(getattr(ri_, f), v)

    if localpath:
        eq_(ri_.localpath, localpath)
        old_localpath = ri_.localpath  # for a test below
    else:
        # if not given -- must be a remote url, should raise exception
        with assert_raises(ValueError):
            ri_.localpath

    # do changes in the path persist?
    old_str = str(ri_)
    ri_.path = newpath = opj(ri_.path, 'sub')
    eq_(ri_.path, newpath)
    neq_(str(ri_), old_str)
    if localpath:
        eq_(ri_.localpath, opj(old_localpath, 'sub'))
def _get_flexible_source_candidates(src, base_url=None, alternate_suffix=True):
    """Get candidates to try cloning from.

    Primarily to mitigate the problem that git doesn't append /.git
    while cloning from non-bare repos over dummy protocol (http*).  Also to
    simplify creation of urls whenever a base url and a relative path within
    it are provided.

    Parameters
    ----------
    src : string or RI
      Full or relative (then considered within base_url if provided) path
    base_url : string or RI, optional
    alternate_suffix : bool
      Whether to generate URL candidates with and without '/.git' suffixes.

    Returns
    -------
    candidates : list of str
      List of RIs (path, url, ssh targets) to try to install from
    """
    candidates = []

    ri = RI(src)
    if isinstance(ri, PathRI) and not isabs(ri.path) and base_url:
        ri = RI(base_url)
        if ri.path.endswith('/.git'):
            base_path = ri.path[:-5]
            base_suffix = '.git'
        else:
            base_path = ri.path
            base_suffix = ''
        if isinstance(ri, PathRI):
            # this is a path, so stay native
            ri.path = normpath(opj(base_path, src, base_suffix))
        else:
            # we are handling a URL, use POSIX path conventions
            ri.path = posixpath.normpath(
                posixpath.join(base_path, src, base_suffix))

    src = str(ri)

    candidates.append(src)
    if alternate_suffix and isinstance(ri, URL):
        if ri.scheme in {'http', 'https'}:
            # additionally try to consider .git:
            if not src.rstrip('/').endswith('/.git'):
                candidates.append(
                    '{0}/.git'.format(src.rstrip('/')))

    # TODO:
    # We need to provide some error msg with InstallFailedError, since now
    # it just swallows everything.
    # yoh: not sure if this comment applies here, but could be still applicable
    # outside
    return candidates
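# A minimal usage sketch for the helper above (hostnames are made up; expected
# values were derived by tracing the code, not taken from the test suite):
#
#   _get_flexible_source_candidates('http://example.com/ds')
#   -> ['http://example.com/ds', 'http://example.com/ds/.git']
#
#   # a relative path is joined onto base_url with POSIX conventions, and a
#   # '/.git' base suffix is preserved at the end of the joined path:
#   _get_flexible_source_candidates('sub', base_url='http://example.com/ds/.git')
#   -> ['http://example.com/ds/sub/.git']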
def _get_flexible_source_candidates(src, base_url=None):
    """Get candidates to try cloning from.

    Primarily to mitigate the problem that git doesn't append /.git
    while cloning from non-bare repos over dummy protocol (http*).  Also to
    simplify creation of urls whenever a base url and a relative path within
    it are provided.

    Parameters
    ----------
    src : string or RI
      Full or relative (then considered within base_url if provided) path
    base_url : string or RI, optional

    Returns
    -------
    candidates : list of str
      List of RIs (path, url, ssh targets) to try to install from
    """
    candidates = []

    ri = RI(src)
    if isinstance(ri, PathRI) and not isabs(ri.path) and base_url:
        ri = RI(base_url)
        if ri.path.endswith('/.git'):
            base_path = ri.path[:-5]
            base_suffix = '.git'
        else:
            base_path = ri.path
            base_suffix = ''
        ri.path = normpath(opj(base_path, src, base_suffix))

    src = str(ri)

    candidates.append(src)
    if isinstance(ri, URL):
        if ri.scheme in {'http', 'https'}:
            # additionally try to consider .git:
            if not src.rstrip('/').endswith('/.git'):
                candidates.append('{0}/.git'.format(src.rstrip('/')))

    # TODO:
    # We need to provide some error msg with InstallFailedError, since now
    # it just swallows everything.
    # yoh: not sure if this comment applies here, but could be still applicable
    # outside
    return candidates
def resolve_path(path, ds=None):
    """Resolve a path specification (against a Dataset location)

    Any explicit path (absolute or relative) is returned as an absolute path.
    In case of an explicit relative path, the current working directory is
    used as a reference. Any non-explicit relative path is resolved against
    a dataset location, i.e. considered relative to the location of the
    dataset. If no dataset is provided, the current working directory is
    used.

    Returns
    -------
    Absolute path
    """
    # first make sure it's actually a valid path:
    from datalad.support.network import PathRI
    if not isinstance(RI(path), PathRI):
        raise ValueError("%s is not a valid path" % path)

    path = expandpath(path, force_absolute=False)
    if is_explicit_path(path):
        # normalize path consistently between two (explicit and implicit) cases
        return dlabspath(path, norm=True)

    # no dataset given, use CWD as reference
    # note: abspath would disregard symlink in CWD
    top_path = getpwd() \
        if ds is None \
        else ds.path if isinstance(ds, Dataset) else ds
    return normpath(opj(top_path, path))
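# A minimal illustration (hypothetical paths, traced from the logic above):
# explicit paths are absolutized/normalized as-is, while non-explicit relative
# paths are taken relative to the dataset, or to the CWD if no dataset is
# given.
#
#   resolve_path('/tmp/data')               # -> '/tmp/data'
#   resolve_path('./sub')                   # -> <CWD>/sub (explicit relative)
#   resolve_path('sub', ds=Dataset('/ds'))  # -> '/ds/sub'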
def _flyweight_id_from_args(cls, *args, **kwargs):
    if args:
        # to a certain degree we need to simulate an actual call to __init__
        # and make sure, passed arguments are fitting:
        # TODO: Figure out, whether there is a cleaner way to do this in a
        # generic fashion
        assert('path' not in kwargs)
        path = args[0]
        args = args[1:]
    elif 'path' in kwargs:
        path = kwargs.pop('path')
    else:
        raise TypeError("__init__() requires argument `path`")

    if path is None:
        lgr.debug("path is None. args: %s, kwargs: %s", args, kwargs)
        raise ValueError("path must not be None")

    # Custom handling for a few special abbreviations if defined by the class
    path_ = cls._flyweight_preproc_path(path)

    # mirror what is happening in __init__
    if isinstance(path, ut.PurePath):
        path = str(path)

    # Sanity check for argument `path`:
    # raise if we cannot deal with `path` at all or
    # if it is not a local thing:
    localpath = RI(path_).localpath

    path_postproc = cls._flyweight_postproc_path(localpath)

    kwargs['path'] = path_postproc
    return path_postproc, args, kwargs
def configure_origins(cfgds, probeds, label=None):
    """Configure any discoverable local dataset 'origin' sibling as a remote

    Parameters
    ----------
    cfgds : Dataset
      Dataset to receive the remote configurations
    probeds : Dataset
      Dataset to start looking for 'origin' remotes. May be identical with
      `cfgds`.
    label : int, optional
      Each discovered 'origin' will be configured as a remote under the name
      'origin-<label>'. If no label is given, '2' will be used by default,
      given that there is typically an 'origin' remote already.
    """
    if label is None:
        label = 1
    # let's look at the URL for that remote and see if it is a local
    # dataset
    origin_url = probeds.config.get('remote.origin.url')
    if not origin_url:
        # no origin, nothing to do
        return
    if not cfgds.config.obtain(
            'datalad.install.inherit-local-origin',
            default=True):
        # no inheritance wanted
        return
    if not isinstance(RI(origin_url), PathRI):
        # not a local path
        return

    # no need to reconfigure original/direct origin again
    if cfgds != probeds:
        # prevent duplicates
        known_remote_urls = set(
            cfgds.config.get(r + '.url', None)
            for r in cfgds.config.sections()
            if r.startswith('remote.')
        )
        if origin_url not in known_remote_urls:
            yield from cfgds.siblings(
                'configure',
                # no chance for conflict, can only be the second configured
                # remote
                name='origin-{}'.format(label),
                url=origin_url,
                # fetch to get all annex info
                fetch=True,
                result_renderer='disabled',
                on_failure='ignore',
            )
    # and dive deeper
    # given the clone source is a local dataset, we can have a
    # cheap look at it, and configure its own 'origin' as a remote
    # (if there is any), and benefit from additional annex availability
    yield from configure_origins(
        cfgds,
        Dataset(probeds.pathobj / origin_url),
        label=label + 1)
def get_connection(self, url):
    """Get a singleton, representing a shared ssh connection to `url`

    Parameters
    ----------
    url: str
      ssh url

    Returns
    -------
    SSHConnection
    """
    # parse url:
    from datalad.support.network import RI, is_ssh
    sshri = RI(url)

    if not is_ssh(sshri):
        raise ValueError(
            "Unsupported SSH URL: '{0}', use ssh://host/path or host:path "
            "syntax".format(url))

    # determine control master:
    ctrl_path = "%s/%s" % (self.socket_dir, sshri.hostname)
    if sshri.port:
        ctrl_path += ":%s" % sshri.port

    # do we know it already?
    if ctrl_path in self._connections:
        return self._connections[ctrl_path]
    else:
        c = SSHConnection(ctrl_path, sshri.hostname)
        self._connections[ctrl_path] = c
        return c
def _get_flexible_source_candidates_for_submodule(ds, sm_path, sm_url=None):
    """Retrieve candidates from where to install the submodule

    Even if a url for the submodule is provided explicitly -- urls under the
    parent's tracking branch remote are tried first.
    """
    clone_urls = []

    # should be our first candidate
    tracking_remote, tracking_branch = ds.repo.get_tracking_branch()
    candidate_remotes = [tracking_remote] if tracking_remote else []

    # if we have a remote, let's check the location of that remote
    # for the presence of the desired submodule
    try:
        last_commit = next(ds.repo._get_files_history(sm_path)).hexsha
        # ideally should also give preference to the remotes which have
        # the same branch checked out I guess
        candidate_remotes += list(
            ds.repo._get_remotes_having_commit(last_commit))
    except StopIteration:
        # no commit for it known yet, ... oh well
        pass

    for remote in unique(candidate_remotes):
        remote_url = ds.repo.get_remote_url(remote, push=False)

        # Directly on parent's ds url
        if remote_url:
            # attempt: submodule checkout at parent remote URL
            # We might need to quote sm_path portion, e.g. for spaces etc
            if isinstance(RI(remote_url), URL):
                sm_path_url = urlquote(sm_path)
            else:
                sm_path_url = sm_path

            clone_urls.extend(
                _get_flexible_source_candidates(
                    # alternate suffixes are tested by `clone` anyways
                    sm_path_url, remote_url, alternate_suffix=False))

            # attempt: provided (configured?) submodule URL
            # TODO: consider supporting DataLadRI here? or would confuse
            #  git and we wouldn't want that (i.e. not allow pure git clone
            #  --recursive)
            if sm_url:
                clone_urls += _get_flexible_source_candidates(
                    sm_url,
                    remote_url,
                    alternate_suffix=False
                )

    # Do based on the ds.path as the last resort
    if sm_url:
        clone_urls += _get_flexible_source_candidates(
            sm_url,
            ds.path,
            alternate_suffix=False)

    return unique(clone_urls)
def _get_git_url_from_source(source):
    """Return URL for cloning associated with a source specification

    For now just resolves DataLadRIs
    """
    # TODO: Probably RF this into RI.as_git_url(), that would be overridden
    # by subclasses or sth. like that
    if not isinstance(source, RI):
        source_ri = RI(source)
    else:
        source_ri = source
    if isinstance(source_ri, DataLadRI):
        # we have got our DataLadRI as the source, so expand it
        source = source_ri.as_git_url()
    else:
        source = str(source_ri)
    return source
def get_connection(self, url, use_remote_annex_bundle=True, force_ip=False):
    """Get a singleton, representing a shared ssh connection to `url`

    Parameters
    ----------
    url: str
      ssh url
    use_remote_annex_bundle : bool, optional
      Whether a git-annex bundle available on the remote should be used.
      This flag is part of the connection identity (it enters the connection
      hash below), so connections with different settings are not shared.
    force_ip : {False, 4, 6}
      Force the use of IPv4 or IPv6 addresses.

    Returns
    -------
    SSHConnection
    """
    # parse url:
    from datalad.support.network import RI, is_ssh
    if isinstance(url, RI):
        sshri = url
    else:
        if ':' not in url and '/' not in url:
            # it is just a hostname
            lgr.debug("Assuming %r is just a hostname for ssh connection",
                      url)
            url += ':'
        sshri = RI(url)

    if not is_ssh(sshri):
        raise ValueError("Unsupported SSH URL: '{0}', use "
                         "ssh://host/path or host:path syntax".format(url))

    from datalad import cfg
    identity_file = cfg.get("datalad.ssh.identityfile")

    conhash = get_connection_hash(
        sshri.hostname,
        port=sshri.port,
        identity_file=identity_file or "",
        username=sshri.username,
        bundled=use_remote_annex_bundle,
        force_ip=force_ip,
    )
    # determine control master:
    ctrl_path = self.socket_dir / conhash

    # do we know it already?
    if ctrl_path in self._connections:
        return self._connections[ctrl_path]
    else:
        c = SSHConnection(
            ctrl_path, sshri, identity_file=identity_file,
            use_remote_annex_bundle=use_remote_annex_bundle,
            force_ip=force_ip)
        self._connections[ctrl_path] = c
        return c
def test_is_url():
    ok_(is_url('file://localhost/some'))
    ok_(is_url('http://localhost'))
    ok_(is_url('ssh://me@localhost'))
    # in current understanding it is indeed a url, but an implicit 'ssh' one,
    # not just a useless scheme=weired with a hope to point to a netloc
    with swallow_logs():
        ok_(is_url('weired://'))
    nok_(is_url('relative'))
    nok_(is_url('/absolute'))
    ok_(is_url('like@sshlogin'))  # actually we do allow ssh:implicit urls ATM
    nok_(is_url(''))
    nok_(is_url(' '))
    nok_(is_url(123))  # stuff of other types wouldn't be considered a URL

    # we can pass an RI instance directly
    ok_(is_url(RI('file://localhost/some')))
    nok_(is_url(RI('relative')))
def test_is_ssh():
    ssh_locators = [
        "ssh://host",
        "ssh://host/some/where",
        "user@host:path/sp1",
        "user@host:/absolute/path/sp1",
        "host:path/sp1",
        "host:/absolute/path/sp1",
        "user@host"
    ]
    for ri in ssh_locators:
        ok_(is_ssh(ri), "not considered ssh (string): %s" % ri)
        ok_(is_ssh(RI(ri)), "not considered ssh (RI): %s" % ri)

    non_ssh_locators = [
        "file://path/to",
        "/abs/path",
        "../rel/path",
        "http://example.com",
        "git://host/user/proj",
        "s3://bucket/save/?key=891"
    ]
    for ri in non_ssh_locators:
        ok_(not is_ssh(ri), "considered ssh (string): %s" % ri)
        ok_(not is_ssh(RI(ri)), "considered ssh (RI): %s" % ri)
def _add_remote(
        ds, name, known_remotes, url, pushurl, fetch, description,
        as_common_datasrc, publish_depends, publish_by_default,
        annex_wanted, annex_required, annex_group, annex_groupwanted,
        inherit, get_annex_info,
        **res_kwargs):
    # TODO: allow for no url if 'inherit' and deduce from the super ds
    #       create-sibling already does it -- generalize/use
    #  Actually we could even inherit/deduce name from the super by checking
    #  which remote it is actively tracking in current branch... but maybe
    #  that would be too much magic

    # it seems that the only difference is that `add` should fail if a remote
    # already exists
    if (url is None and pushurl is None):
        raise InsufficientArgumentsError(
            """insufficient information to add a sibling
            (needs at least a dataset, and any URL).""")
    if url is None:
        url = pushurl

    if not name:
        urlri = RI(url)
        # use the hostname as default remote name
        name = urlri.hostname
        lgr.debug(
            "No sibling name given, use URL hostname '%s' as sibling name",
            name)

    if not name:
        raise InsufficientArgumentsError("no sibling name given")
    if name in known_remotes:
        yield get_status_dict(
            action='add-sibling',
            status='error',
            path=ds.path,
            type='sibling',
            name=name,
            message=("sibling is already known: %s, use `configure` instead?",
                     name),
            **res_kwargs)
        return
    # this remote is fresh: make it known
    # just minimalistic name and URL, the rest is coming from `configure`
    ds.repo.add_remote(name, url)
    known_remotes.append(name)

    # always copy signature from above to avoid bugs
    for r in _configure_remote(
            ds, name, known_remotes, url, pushurl, fetch, description,
            as_common_datasrc, publish_depends, publish_by_default,
            annex_wanted, annex_required, annex_group, annex_groupwanted,
            inherit, get_annex_info,
            **res_kwargs):
        if r['action'] == 'configure-sibling':
            r['action'] = 'add-sibling'
        yield r
def _flyweight_id_from_args(cls, *args, **kwargs):
    if args:
        # to a certain degree we need to simulate an actual call to __init__
        # and make sure, passed arguments are fitting:
        # TODO: Figure out, whether there is a cleaner way to do this in a
        # generic fashion
        assert('path' not in kwargs)
        path = args[0]
        args = args[1:]
    elif 'path' in kwargs:
        path = kwargs.pop('path')
    else:
        raise TypeError("__init__() requires argument `path`")

    if path is None:
        raise AttributeError

    # mirror what is happening in __init__
    if isinstance(path, ut.PurePath):
        path = text_type(path)

    # Custom handling for a few special abbreviations
    path_ = path
    if path == '^':
        # get the topmost dataset from current location. Note that 'zsh'
        # might have its ideas on what to do with ^, so better use as -d^
        path_ = Dataset(curdir).get_superdataset(topmost=True).path
    elif path == '///':
        # TODO: logic/UI on installing a default dataset could move here
        # from search?
        path_ = cfg.obtain('datalad.locations.default-dataset')
    if path != path_:
        lgr.debug("Resolved dataset alias %r to path %r", path, path_)

    # Sanity check for argument `path`:
    # raise if we cannot deal with `path` at all or
    # if it is not a local thing:
    path_ = RI(path_).localpath

    # we want an absolute path, but no resolved symlinks
    if not isabs(path_):
        path_ = opj(getpwd(), path_)

    # use canonical paths only:
    path_ = normpath(path_)
    kwargs['path'] = path_
    return path_, args, kwargs
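# Illustration of the alias handling above (hypothetical invocations):
#
#   Dataset('///')  # alias for cfg.obtain('datalad.locations.default-dataset')
#   Dataset('^')    # alias for the topmost superdataset of the CWD
#
# Either alias is resolved to an absolute, normalized path before it is used
# as the flyweight id, so it yields the same instance as constructing the
# Dataset from that path directly.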
def _import_dicom_tarball(target_ds, tarball, filename):

    # TODO: doesn't work for updates yet:
    # - branches are expected to not exist yet
    target_ds.repo.checkout('incoming', options=['-b'])
    target_ds.repo.init_remote(
        ARCHIVES_SPECIAL_REMOTE,
        options=[
            'encryption=none',
            'type=external',
            'externaltype=%s' % ARCHIVES_SPECIAL_REMOTE,
            'autoenable=true',
            'uuid={}'.format(
                DATALAD_SPECIAL_REMOTES_UUIDS[ARCHIVES_SPECIAL_REMOTE])
        ])

    if isinstance(RI(tarball), PathRI):
        shutil.copy2(tarball, op.join(target_ds.path, filename))
        target_ds.repo.add(filename)
    else:
        target_ds.repo.add_url_to_file(file_=filename, url=tarball,
                                       batch=False)

    target_ds.repo.commit(msg="Retrieved %s" % tarball)
    target_ds.repo.checkout('incoming-processed', options=['--orphan'])
    if target_ds.repo.dirty:
        target_ds.repo.remove('.', r=True, f=True)

    target_ds.repo.merge('incoming', options=["-s", "ours", "--no-commit"],
                         expect_stderr=True)
    target_ds.repo._git_custom_command([], "git read-tree -m -u incoming")

    from datalad.coreapi import add_archive_content

    # TODO: Reconsider value of --existing
    add_archive_content(archive=filename,
                        annex=target_ds.repo,
                        existing='archive-suffix',
                        delete=True,
                        commit=False,
                        allow_dirty=True)

    target_ds.repo.commit(msg="Extracted %s" % tarball)
    target_ds.repo.checkout('master')
    target_ds.repo.merge('incoming-processed',
                         options=["--allow-unrelated"])
def test_get_multiple_files(path, url, ds_dir):
    from os import listdir
    from datalad.support.network import RI

    file_list = [f for f in listdir(path) if not f.startswith('.')]

    # prepare urls:
    [RI(url + f) for f in file_list]

    # prepare origin
    origin = Dataset(path).create(force=True)
    origin.add(file_list)
    origin.save("initial")

    ds = install(
        ds_dir, source=path,
        result_xfm='datasets', return_type='item-or-list')

    # no content present:
    ok_(not any(ds.repo.file_has_content(file_list)))

    # get two plus an invalid one:
    result = ds.get(['file1.txt', 'file2.txt', 'not_existing.txt'],
                    on_failure='ignore')
    assert_status('impossible', [result[0]])
    assert_status(['ok', 'notneeded'], result[1:])
    # explicitly given not existing file was skipped:
    # (see test_get_invalid_call)
    eq_(set([basename(item.get('path')) for item in result[1:]]),
        {'file1.txt', 'file2.txt'})
    ok_(all(ds.repo.file_has_content(['file1.txt', 'file2.txt'])))

    # get all of them:
    result = ds.get(curdir)
    # there were two files left to get:
    eq_(
        set([
            basename(item.get('path'))
            for item in result
            if item['type'] == 'file'
        ]),
        {'file3.txt', 'file4.txt'})
    ok_(all(ds.repo.file_has_content(file_list)))
def _get_ds_remote_shared_setting(ds, name, ssh):
    """Figure out setting of sharedrepository for dataset's `name` remote"""
    shared = None
    try:
        current_super_url = CreateSibling._get_remote_url(ds, name)
        current_super_ri = RI(current_super_url)
        out, err = ssh('git -C {} config --get core.sharedrepository'.format(
            # TODO -- we might need to expanduser taking .user into account
            # but then it must be done also on remote side
            sh_quote(current_super_ri.path)))
        shared = out.strip()
        if err:
            lgr.warning("Got stderr while calling ssh: %s", err)
    except CommandError as e:
        lgr.debug(
            "Could not figure out remote shared setting of %s for %s due "
            "to %s", ds, name, exc_str(e))
        # could well be ok if e.g. not shared
        # TODO: more detailed analysis may be?
    return shared
def _prep_connection_args(self, url):
    # parse url:
    from datalad.support.network import RI, is_ssh
    if isinstance(url, RI):
        sshri = url
    else:
        if ':' not in url and '/' not in url:
            # it is just a hostname
            lgr.debug("Assuming %r is just a hostname for ssh connection",
                      url)
            url += ':'
        sshri = RI(url)
    if not is_ssh(sshri):
        raise ValueError("Unsupported SSH URL: '{0}', use "
                         "ssh://host/path or host:path syntax".format(url))
    from datalad import cfg
    identity_file = cfg.get("datalad.ssh.identityfile")
    return sshri, identity_file
def get_connection(self, url):
    """Get a singleton, representing a shared ssh connection to `url`

    Parameters
    ----------
    url: str
      ssh url

    Returns
    -------
    SSHConnection
    """
    # parse url:
    from datalad.support.network import RI, is_ssh
    if isinstance(url, RI):
        sshri = url
    else:
        if ':' not in url and '/' not in url:
            # it is just a hostname
            lgr.debug("Assuming %r is just a hostname for ssh connection",
                      url)
            url += ':'
        sshri = RI(url)

    if not is_ssh(sshri):
        raise ValueError("Unsupported SSH URL: '{0}', use "
                         "ssh://host/path or host:path syntax".format(url))

    conhash = get_connection_hash(sshri.hostname,
                                  port=sshri.port,
                                  username=sshri.username)
    # determine control master:
    ctrl_path = "%s/%s" % (self.socket_dir, conhash)

    # do we know it already?
    if ctrl_path in self._connections:
        return self._connections[ctrl_path]
    else:
        c = SSHConnection(ctrl_path, sshri)
        self._connections[ctrl_path] = c
        return c
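# Usage sketch (hypothetical host, not from the original source): the control
# path is derived only from hostname, port, and username, so different URL
# spellings of the same target share one connection singleton.
#
#   c1 = ssh_manager.get_connection('ssh://demo.example.org/data')
#   c2 = ssh_manager.get_connection('demo.example.org:data')
#   assert c1 is c2  # same connection hash -> same cached SSHConnection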
def _get_installationpath_from_url(url):
    """Returns a relative path derived from the trailing end of a URL

    This can be used to determine an installation path of a Dataset
    from a URL, analogous to what `git clone` does.
    """
    ri = RI(url)
    if isinstance(ri, (URL, DataLadRI)):  # decode only if URL
        path = ri.path.rstrip('/')
        path = urlunquote(path) if path else ri.hostname
        if '/' in path:
            path = path.split('/')
            if path[-1] == '.git':
                path = path[-2]
            else:
                path = path[-1]
    else:
        path = Path(url).parts[-1]
    if path.endswith('.git'):
        path = path[:-4]
    return path
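# Worked examples (traced from the code above; URLs are made up):
#
#   'http://example.com/ds'        -> 'ds'
#   'http://example.com/ds/.git'   -> 'ds'    (trailing '.git' component)
#   'http://example.com/repo.git'  -> 'repo'  ('.git' suffix stripped)
#   '/local/path/ds'               -> 'ds'    (plain path, last component)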
def _run_on_ds_ssh_remote(ds, name, ssh, cmd):
    """Given a dataset, and name of the remote, run command via ssh

    Parameters
    ----------
    cmd: str
      Will be .format()'ed given the `path` to the dataset on remote

    Returns
    -------
    out

    Raises
    ------
    CommandError
    """
    remote_url = CreateSibling._get_remote_url(ds, name)
    remote_ri = RI(remote_url)
    out, err = ssh(cmd.format(path=sh_quote(remote_ri.path)))
    if err:
        lgr.warning("Got stderr while calling ssh: %s", err)
    return out
def _test_url_quote_path(cls, clskwargs, target_url):
    path = '/ "\';a&b&cd `| '
    if not (cls is PathRI):
        clskwargs['hostname'] = hostname = 'example.com'
    url = cls(path=path, **clskwargs)
    eq_(url.path, path)
    if 'hostname' in clskwargs:
        eq_(url.hostname, hostname)
    # all nasty symbols should be quoted
    url_str = str(url)
    eq_(url_str, target_url)
    # no side-effects:
    eq_(url.path, path)
    if 'hostname' in clskwargs:
        eq_(url.hostname, hostname)

    # and figured out and unquoted
    url_ = RI(url_str)
    ok_(isinstance(url_, cls))
    eq_(url_.path, path)
    if 'hostname' in clskwargs:
        eq_(url.hostname, hostname)
def _get_flexible_source_candidates_for_submodule(ds, sm_path, sm_url=None):
    """Retrieve candidates from where to install the submodule

    Even if a url for the submodule is provided explicitly -- urls under the
    parent's tracking branch remote are tried first.
    """
    clone_urls = []

    # if we have a remote, let's check the location of that remote
    # for the presence of the desired submodule
    remote_name, remote_url = _get_tracking_source(ds)

    # Directly on parent's ds url
    if remote_url:
        # attempt: submodule checkout at parent remote URL
        # We might need to quote sm_path portion, e.g. for spaces etc
        if isinstance(RI(remote_url), URL):
            sm_path_url = urlquote(sm_path)
        else:
            sm_path_url = sm_path

        clone_urls.extend(
            _get_flexible_source_candidates(
                # alternate suffixes are tested by `clone` anyways
                sm_path_url, remote_url, alternate_suffix=False))

    # attempt: provided (configured?) submodule URL
    # TODO: consider supporting DataLadRI here? or would confuse
    #  git and we wouldn't want that (i.e. not allow pure git clone
    #  --recursive)
    if sm_url:
        clone_urls += _get_flexible_source_candidates(
            sm_url,
            remote_url if remote_url else ds.path,
            alternate_suffix=False)

    return unique(clone_urls)
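# Sketch with made-up values: for a parent tracked from 'ssh://server/ds'
# and a submodule at path 'sub m' whose configured url is
# 'https://hub.example.com/sub.git', tracing the code above yields roughly
#
#   ['ssh://server/ds/sub%20m', 'https://hub.example.com/sub.git']
#
# i.e. the submodule path is URL-quoted and joined onto the parent's remote
# first, followed by the explicitly configured url.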
def test_get_multiple_files(path, url, ds_dir):
    from os import listdir
    from datalad.support.network import RI

    file_list = [f for f in listdir(path) if not f.startswith('.')]

    # prepare urls:
    urls = [RI(url + f) for f in file_list]

    # prepare origin
    origin = Dataset(path).create(force=True)
    origin.add(file_list)
    origin.save("initial")

    ds = install(ds_dir, source=path)

    # no content present:
    ok_(not any(ds.repo.file_has_content(file_list)))

    # get two plus an invalid one:
    with assert_raises(IncompleteResultsError) as cme:
        ds.get(['file1.txt', 'file2.txt', 'not_existing.txt'])
    result = cme.exception.results
    # explicitly given not existing file was skipped:
    # (see test_get_invalid_call)
    eq_(set([item.get('file') for item in result]),
        {'file1.txt', 'file2.txt'})
    ok_(all([x['success'] is True
             for x in result
             if x['file'] in ['file1.txt', 'file2.txt']]))
    ok_(all(ds.repo.file_has_content(['file1.txt', 'file2.txt'])))

    # get all of them:
    result = ds.get(curdir)
    # there were two files left to get:
    eq_(set([item.get('file') for item in result]),
        {'file3.txt', 'file4.txt'})
    ok_(all(ds.repo.file_has_content(file_list)))
def postclonecfg_annexdataset(ds, reckless, description=None):
    """If ds "knows annex" -- annex init it, set into reckless etc

    Provides additional tune-up to what is possibly an annex repo, e.g.
    "enables" reckless mode, sets up description
    """
    # in any case check whether we need to annex-init the installed thing:
    if not knows_annex(ds.path):
        # not for us
        return

    # init annex when traces of a remote annex can be detected
    if reckless == 'auto':
        lgr.debug(
            "Instruct annex to hardlink content in %s from local "
            "sources, if possible (reckless)", ds.path)
        ds.config.set(
            'annex.hardlink', 'true',
            where='local', reload=True)

    lgr.debug("Initializing annex repo at %s", ds.path)
    # Note, that we cannot enforce annex-init via AnnexRepo().
    # If such an instance already exists, its __init__ will not be executed.
    # Therefore do quick test once we have an object and decide whether to
    # call its _init().
    #
    # Additionally, call init if we need to add a description (see #1403),
    # since AnnexRepo.__init__ can only do it with create=True
    repo = AnnexRepo(ds.path, init=True)
    if not repo.is_initialized() or description:
        repo._init(description=description)
    if reckless == 'auto' or (reckless and reckless.startswith('shared-')):
        repo.call_annex(['untrust', 'here'])
    elif reckless == 'ephemeral':
        # with ephemeral we declare 'here' as 'dead' right away, whenever
        # we symlink origin's annex, since availability from 'here' should
        # not be propagated for an ephemeral clone when we publish back to
        # origin.
        # This will cause stuff like this for a locally present annexed file:
        # % git annex whereis d1
        # whereis d1 (0 copies) failed
        # BUT this works:
        # % git annex find . --not --in here
        # % git annex find . --in here
        # d1

        # we don't want annex copy-to origin
        ds.config.set(
            'remote.origin.annex-ignore', 'true',
            where='local')

        ds.repo.set_remote_dead('here')

        if check_symlink_capability(ds.repo.dot_git / 'dl_link_test',
                                    ds.repo.dot_git / 'dl_target_test'):
            # symlink the annex to avoid needless copies in an ephemeral clone
            annex_dir = ds.repo.dot_git / 'annex'
            origin_annex_url = ds.config.get("remote.origin.url", None)
            origin_git_path = None
            if origin_annex_url:
                try:
                    # Deal with file:// scheme URLs as well as plain paths.
                    # If origin isn't local, we have nothing to do.
                    origin_git_path = Path(RI(origin_annex_url).localpath)

                    # we are local; check for a bare repo first to not mess w/
                    # the path
                    if GitRepo(origin_git_path, create=False).bare:
                        # origin is a bare repo -> use path as is
                        pass
                    elif origin_git_path.name != '.git':
                        origin_git_path /= '.git'
                except ValueError:
                    # Note, that accessing localpath on a non-local RI throws
                    # ValueError rather than resulting in an AttributeError.
                    # TODO: Warning level okay or is info level sufficient?
                    # Note, that setting annex-dead is independent of
                    # symlinking .git/annex. It might still make sense to
                    # have an ephemeral clone that doesn't propagate its avail.
                    # info. Therefore don't fail altogether.
                    lgr.warning("reckless=ephemeral mode: origin doesn't seem "
                                "local: %s\nno symlinks being used",
                                origin_annex_url)

            if origin_git_path:
                # TODO make sure that we do not delete any unique data
                rmtree(str(annex_dir)) \
                    if not annex_dir.is_symlink() else annex_dir.unlink()
                annex_dir.symlink_to(origin_git_path / 'annex',
                                     target_is_directory=True)
        else:
            # TODO: What log level is appropriate? Note that setting
            # annex-dead above is independent of the symlinking.
            lgr.warning("reckless=ephemeral mode: Unable to create symlinks "
                        "on this file system.")

    srs = {True: [], False: []}  # special remotes by "autoenable" key
    remote_uuids = None  # might be necessary to discover known UUIDs

    repo_config = repo.config
    # Note: The purpose of this function is to inform the user. So if
    # something looks misconfigured, we'll warn and move on to the next item.
    for uuid, config in repo.get_special_remotes().items():
        sr_name = config.get('name', None)
        if sr_name is None:
            lgr.warning(
                'Ignoring special remote %s because it does not have a name. '
                'Known information: %s',
                uuid, config)
            continue
        sr_autoenable = config.get('autoenable', False)
        try:
            sr_autoenable = ensure_bool(sr_autoenable)
        except ValueError:
            lgr.warning(
                'Failed to process "autoenable" value %r for sibling %s in '
                'dataset %s as bool. '
                'You might need to enable it later manually and/or fix it up '
                'to avoid this message in the future.',
                sr_autoenable, sr_name, ds.path)
            continue

        # If it looks like a type=git special remote, make sure we have up to
        # date information. See gh-2897.
        if sr_autoenable and repo_config.get("remote.{}.fetch".format(sr_name)):
            try:
                repo.fetch(remote=sr_name)
            except CommandError as exc:
                lgr.warning("Failed to fetch type=git special remote %s: %s",
                            sr_name, exc_str(exc))

        # determine whether there is a registered remote with matching UUID
        if uuid:
            if remote_uuids is None:
                remote_uuids = {
                    # Check annex-config-uuid first. For sameas annex remotes,
                    # this will point to the UUID for the configuration (i.e.
                    # the key returned by get_special_remotes) rather than the
                    # shared UUID.
                    (repo_config.get('remote.%s.annex-config-uuid' % r) or
                     repo_config.get('remote.%s.annex-uuid' % r))
                    for r in repo.get_remotes()
                }
            if uuid not in remote_uuids:
                srs[sr_autoenable].append(sr_name)

    if srs[True]:
        lgr.debug(
            "configuration for %s %s added because of autoenable,"
            " but no UUIDs for them yet known for dataset %s",
            # since we are only at debug level, we could call things their
            # proper names
            single_or_plural("special remote",
                             "special remotes", len(srs[True]), True),
            ", ".join(srs[True]),
            ds.path
        )

    if srs[False]:
        # if has no auto-enable special remotes
        lgr.info(
            'access to %s %s not auto-enabled, enable with:\n'
            '\t\tdatalad siblings -d "%s" enable -s %s',
            # but since humans might read it, we better confuse them with our
            # own terms!
            single_or_plural("dataset sibling",
                             "dataset siblings", len(srs[False]), True),
            ", ".join(srs[False]),
            ds.path,
            srs[False][0] if len(srs[False]) == 1 else "SIBLING",
        )

    # we have just cloned the repo, so it has 'origin', configure any
    # reachable origin of origins
    yield from configure_origins(ds, ds)
def decode_source_spec(spec, cfg=None):
    """Decode information from a clone source specification

    Parameters
    ----------
    spec : str
      Any supported clone source specification
    cfg : ConfigManager, optional
      Configuration will be queried from the instance (i.e. from a particular
      dataset). If None is given, the global DataLad configuration will be
      queried.

    Returns
    -------
    dict
      The value of each decoded property is stored under its own key in this
      dict. By default the following keys are returned: 'type', a
      specification type label {'giturl', 'dataladri', 'ria'}; 'source' the
      original source specification; 'giturl' a URL for the source that is a
      suitable source argument for git-clone; 'version' a version identifier,
      if present (None otherwise); 'default_destpath' a relative path that
      can be used as a clone destination.
    """
    if cfg is None:
        from datalad import cfg
    # standard property dict composition
    props = dict(
        source=spec,
        version=None,
    )

    # Git never gets to see these URLs, so let's manually apply any
    # rewrite configuration Git might know about.
    # Note: We need to rewrite before parsing, otherwise parsing might go
    # wrong. This is particularly true for insteadOf labels replacing even
    # the URL scheme.
    spec = cfg.rewrite_url(spec)
    # common starting point is a RI instance, support for accepting an RI
    # instance is kept for backward-compatibility reasons
    source_ri = RI(spec) if not isinstance(spec, RI) else spec

    # scenario switch, each case must set 'giturl' at the very minimum
    if isinstance(source_ri, DataLadRI):
        # we have got our DataLadRI as the source, so expand it
        props['type'] = 'dataladri'
        props['giturl'] = source_ri.as_git_url()
    elif isinstance(source_ri, URL) and source_ri.scheme.startswith('ria+'):
        # parse a RIA URI
        dsid, version = source_ri.fragment.split('@', maxsplit=1) \
            if '@' in source_ri.fragment else (source_ri.fragment, None)
        uuid_regex = r'[a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-' \
                     r'[a-f0-9]{4}-[a-f0-9]{12}'
        if re.match(uuid_regex, dsid):
            trace = '{}/{}'.format(dsid[:3], dsid[3:])
            default_destpath = dsid
        elif dsid.startswith('~'):
            trace = 'alias/{}'.format(dsid[1:])
            default_destpath = dsid[1:]
        else:
            raise ValueError(
                'RIA URI not recognized, no valid dataset ID or other '
                'supported scheme: {}'.format(spec))
        # now we cancel the fragment in the original URL, but keep everything
        # else in order to be able to support the various combinations of
        # ports, paths, and everything else
        source_ri.fragment = ''
        # strip the custom protocol and go with standard one
        source_ri.scheme = source_ri.scheme[4:]
        # take any existing path, and add trace to dataset within the store
        source_ri.path = '{urlpath}{urldelim}{trace}'.format(
            urlpath=source_ri.path if source_ri.path else '',
            urldelim='' if not source_ri.path or source_ri.path.endswith('/')
            else '/',
            trace=trace,
        )
        props.update(
            type='ria',
            giturl=str(source_ri),
            version=version,
            default_destpath=default_destpath,
        )
    else:
        # let's assume that anything else is a URI that Git can handle
        props['type'] = 'giturl'
        # use original input verbatim
        props['giturl'] = spec

    if 'default_destpath' not in props:
        # if we still have no good idea on where a dataset could be cloned
        # to if no path was given, do something similar to git clone and
        # derive the path from the source
        props['default_destpath'] = _get_installationpath_from_url(
            props['giturl'])

    return props
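# Hedged example of the RIA branch above (hostname and alias are made up;
# values were derived by tracing the code):
#
#   decode_source_spec('ria+ssh://store.example.org/path#~myds@v1')
#
# should produce roughly:
#
#   {'source': 'ria+ssh://store.example.org/path#~myds@v1',
#    'version': 'v1',
#    'type': 'ria',
#    'giturl': 'ssh://store.example.org/path/alias/myds',
#    'default_destpath': 'myds'}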
def _configure_remote(
        ds, name, known_remotes, url, pushurl, fetch, description,
        as_common_datasrc, publish_depends, publish_by_default,
        annex_wanted, annex_required, annex_group, annex_groupwanted,
        inherit, get_annex_info,
        **res_kwargs):
    result_props = dict(
        action='configure-sibling',
        path=ds.path,
        type='sibling',
        name=name,
        **res_kwargs)
    if name is None:
        result_props['status'] = 'error'
        result_props['message'] = 'need sibling `name` for configuration'
        yield result_props
        return

    if name != 'here':
        # do all configure steps that are not meaningful for the 'here'
        # sibling AKA the local repo
        if name not in known_remotes:
            # this remote is fresh: make it known
            # just minimalistic name and URL, the rest is coming from
            # `configure`
            ds.repo.add_remote(name, url)
            known_remotes.append(name)
        elif url:
            # not new, override URL if given
            ds.repo.set_remote_url(name, url)

        # make sure we have a configured fetch expression at this point
        fetchvar = 'remote.{}.fetch'.format(name)
        if fetchvar not in ds.repo.config:
            # place default fetch refspec in config
            # same as `git remote add` would have added
            ds.repo.config.add(
                fetchvar,
                '+refs/heads/*:refs/remotes/{}/*'.format(name),
                where='local')

        if pushurl:
            ds.repo.set_remote_url(name, pushurl, push=True)

        if publish_depends:
            # Check if all `deps` remotes are known to the `repo`
            unknown_deps = set(
                assure_list(publish_depends)).difference(known_remotes)
            if unknown_deps:
                result_props['status'] = 'error'
                result_props['message'] = (
                    'unknown sibling(s) specified as publication '
                    'dependency: %s',
                    unknown_deps)
                yield result_props
                return

        # define config var name for potential publication dependencies
        depvar = 'remote.{}.datalad-publish-depends'.format(name)
        # and default pushes
        dfltvar = "remote.{}.push".format(name)

        if fetch:
            # fetch the remote so we are up to date
            for r in Update.__call__(
                    dataset=res_kwargs['refds'],
                    path=[dict(path=ds.path, type='dataset')],
                    sibling=name,
                    merge=False,
                    recursive=False,
                    on_failure='ignore',
                    return_type='generator',
                    result_xfm=None):
                # fixup refds
                r.update(res_kwargs)
                yield r

        if inherit:
            # Adjust variables which we should inherit
            delayed_super = _DelayedSuper(ds.repo)
            publish_depends = _inherit_config_var(
                delayed_super, depvar, publish_depends)
            publish_by_default = _inherit_config_var(
                delayed_super, dfltvar, publish_by_default)
            # Copy relevant annex settings for the sibling.
            # This makes sense only if current AND super are annexes, which
            # is somewhat unfortunate, since it forbids having a pure git
            # super dataset
            if isinstance(ds.repo, AnnexRepo) and \
                    isinstance(delayed_super.repo, AnnexRepo):
                if annex_wanted is None:
                    annex_wanted = _inherit_annex_var(
                        delayed_super, name, 'wanted')
                if annex_required is None:
                    annex_required = _inherit_annex_var(
                        delayed_super, name, 'required')
                if annex_group is None:
                    # I think it might be worth inheriting group regardless
                    # what value is
                    #if annex_wanted in {'groupwanted', 'standard'}:
                    annex_group = _inherit_annex_var(
                        delayed_super, name, 'group')
                if annex_wanted == 'groupwanted' and annex_groupwanted is None:
                    # we better have a value for the expression for that group
                    annex_groupwanted = _inherit_annex_var(
                        delayed_super, name, 'groupwanted')

        if publish_depends:
            if depvar in ds.config:
                # config vars are incremental, so make sure we start from
                # scratch
                ds.config.unset(depvar, where='local', reload=False)
            for d in assure_list(publish_depends):
                lgr.info(
                    'Configure additional publication dependency on "%s"', d)
                ds.config.add(depvar, d, where='local', reload=False)
            ds.config.reload()

        if publish_by_default:
            if dfltvar in ds.config:
                ds.config.unset(dfltvar, where='local', reload=False)
            for refspec in assure_list(publish_by_default):
                lgr.info(
                    'Configure additional default publication refspec "%s"',
                    refspec)
                ds.config.add(dfltvar, refspec, 'local')
            ds.config.reload()

        assert isinstance(ds.repo, GitRepo)  # just against silly code
        if isinstance(ds.repo, AnnexRepo):
            # we need to check if added sibling is an annex, and try to
            # enable it; another part of the fix for #463 and #432
            try:
                if not ds.config.obtain(
                        'remote.{}.annex-ignore'.format(name),
                        default=False,
                        valtype=EnsureBool(),
                        store=False):
                    ds.repo.enable_remote(name)
            except CommandError as exc:
                # TODO yield
                # this is unlikely to ever happen, now done for AnnexRepo
                # instances only
                lgr.info("Failed to enable annex remote %s, "
                         "could be a pure git" % name)
                lgr.debug("Exception was: %s" % exc_str(exc))
        if as_common_datasrc:
            ri = RI(url)
            if isinstance(ri, URL) and ri.scheme in ('http', 'https'):
                # XXX what if there is already a special remote
                # of this name? Above check for remotes ignores special
                # remotes. we need to `git annex dead REMOTE` on reconfigure
                # before we can init a new one
                # XXX except it is not enough

                # make special remote of type=git (see #335)
                ds.repo._run_annex_command(
                    'initremote',
                    annex_options=[
                        as_common_datasrc,
                        'type=git',
                        'location={}'.format(url),
                        'autoenable=true'])
            else:
                yield dict(
                    status='impossible',
                    # result_props already carries `name` (and the other
                    # identifying fields); repeating name= here would raise
                    # a duplicate keyword argument error
                    message='cannot configure as a common data source, '
                            'URL protocol is not http or https',
                    **result_props)
    #
    # place configure steps that also work for 'here' below
    #
    if isinstance(ds.repo, AnnexRepo):
        for prop, var in (('wanted', annex_wanted),
                          ('required', annex_required),
                          ('group', annex_group)):
            if var is not None:
                ds.repo.set_preferred_content(
                    prop, var, '.' if name == 'here' else name)
        if annex_groupwanted:
            ds.repo.set_groupwanted(annex_group, annex_groupwanted)

    if description:
        if not isinstance(ds.repo, AnnexRepo):
            result_props['status'] = 'impossible'
            result_props['message'] = \
                'cannot set description of a plain Git repository'
            yield result_props
            return
        ds.repo._run_annex_command(
            'describe', annex_options=[name, description])

    # report all we know at once
    info = list(_query_remotes(
        ds, name, known_remotes, get_annex_info=get_annex_info))[0]
    info.update(dict(status='ok', **result_props))
    yield info
def __call__(sshurl, name=None, target_dir=None,
             target_url=None, target_pushurl=None,
             dataset=None,
             recursive=False,
             recursion_limit=None,
             existing='error',
             shared=None,
             group=None,
             ui=False,
             as_common_datasrc=None,
             publish_by_default=None,
             publish_depends=None,
             annex_wanted=None, annex_group=None, annex_groupwanted=None,
             inherit=False,
             since=None):
    #
    # nothing without a base dataset
    #
    ds = require_dataset(dataset, check_installed=True,
                         purpose='creating a sibling')
    refds_path = ds.path

    #
    # all checks that are possible before we start parsing the dataset
    #

    # possibly use sshurl to get the name in case if not specified
    if not sshurl:
        if not inherit:
            raise InsufficientArgumentsError(
                "needs at least an SSH URL, if no inherit option"
            )
        if name is None:
            raise ValueError(
                "Neither SSH URL, nor the name of sibling to inherit from "
                "was specified"
            )
        # It might well be that we already have this remote setup
        try:
            sshurl = CreateSibling._get_remote_url(ds, name)
        except Exception as exc:
            lgr.debug('%s does not know about url for %s: %s',
                      ds, name, exc_str(exc))
    elif inherit:
        raise ValueError(
            "For now, for clarity not allowing specifying a custom sshurl "
            "while inheriting settings"
        )
        # may be could be safely dropped -- still WiP

    if not sshurl:
        # TODO: may be more back up before _prep?
        super_ds = ds.get_superdataset()
        if not super_ds:
            raise ValueError(
                "Could not determine super dataset for %s to inherit URL"
                % ds
            )
        super_url = CreateSibling._get_remote_url(super_ds, name)
        # for now assuming hierarchical setup
        # (TODO: to be able to distinguish between the two, probably
        # needs storing datalad.*.target_dir to have %RELNAME in there)
        sshurl = slash_join(super_url, relpath(ds.path, super_ds.path))

    # check the login URL
    sshri = RI(sshurl)
    if not is_ssh(sshri):
        raise ValueError(
            "Unsupported SSH URL: '{0}', "
            "use ssh://host/path or host:path syntax".format(sshurl))

    if not name:
        # use the hostname as default remote name
        name = sshri.hostname
        lgr.debug(
            "No sibling name given, use URL hostname '%s' as sibling name",
            name)

    if since == '':
        # consider creating siblings only since the point of
        # the last update
        # XXX here we assume one to one mapping of names from local branches
        # to the remote
        active_branch = ds.repo.get_active_branch()
        since = '%s/%s' % (name, active_branch)

    #
    # parse the base dataset to find all subdatasets that need processing
    #
    to_process = []
    for ap in AnnotatePaths.__call__(
            dataset=refds_path,
            # only a single path!
            path=refds_path,
            recursive=recursive,
            recursion_limit=recursion_limit,
            action='create_sibling',
            # both next should not happen anyways
            unavailable_path_status='impossible',
            nondataset_path_status='error',
            modified=since,
            return_type='generator',
            on_failure='ignore'):
        if ap.get('status', None):
            # this is done
            yield ap
            continue
        if ap.get('type', None) != 'dataset' or \
                ap.get('state', None) == 'absent':
            # this can happen when there is `since`, but we have no
            # use for anything but datasets here
            continue
        checkds_remotes = Dataset(ap['path']).repo.get_remotes() \
            if ap.get('state', None) != 'absent' \
            else []
        if publish_depends:
            # make sure dependencies are valid
            # TODO: inherit -- we might want to automagically create
            # those dependents as well???
            unknown_deps = set(
                assure_list(publish_depends)).difference(checkds_remotes)
            if unknown_deps:
                ap['status'] = 'error'
                ap['message'] = (
                    'unknown sibling(s) specified as publication '
                    'dependency: %s',
                    unknown_deps)
                yield ap
                continue
        if name in checkds_remotes and existing in ('error', 'skip'):
            ap['status'] = 'error' if existing == 'error' else 'notneeded'
            ap['message'] = (
                "sibling '%s' already configured (specify alternative "
                "name, or force reconfiguration via --existing)",
                name)
            yield ap
            continue
        to_process.append(ap)

    if not to_process:
        # we ruled out all possibilities
        # TODO wait for gh-1218 and make better return values
        lgr.info("No datasets qualify for sibling creation. "
                 "Consider different settings for --existing "
                 "or --since if this is unexpected")
        return

    if target_dir is None:
        if sshri.path:
            target_dir = sshri.path
        else:
            target_dir = '.'

    # TODO: centralize and generalize template symbol handling
    replicate_local_structure = "%RELNAME" not in target_dir

    # request ssh connection:
    lgr.info("Connecting ...")
    assert(sshurl is not None)  # delayed sanity verification
    ssh = ssh_manager.get_connection(sshurl)
    if not ssh.get_annex_version():
        raise MissingExternalDependency(
            'git-annex',
            msg='on the remote system')

    #
    # all checks done and we have a connection, now do something
    #

    # loop over all datasets, ordered from top to bottom to make test
    # below valid (existing directories would cause the machinery to halt)
    # But we need to run post-update hook in depth-first fashion, so
    # would only collect first and then run (see gh #790)
    yielded = set()
    remote_repos_to_run_hook_for = []
    for currentds_ap in \
            sorted(to_process, key=lambda x: x['path'].count('/')):
        current_ds = Dataset(currentds_ap['path'])

        path = _create_dataset_sibling(
            name,
            current_ds,
            ds.path,
            ssh,
            replicate_local_structure,
            sshri,
            target_dir,
            target_url,
            target_pushurl,
            existing,
            shared,
            group,
            publish_depends,
            publish_by_default,
            ui,
            as_common_datasrc,
            annex_wanted,
            annex_group,
            annex_groupwanted,
            inherit
        )
        if not path:
            # nothing new was created
            # TODO is 'notneeded' appropriate in this case?
            currentds_ap['status'] = 'notneeded'
            # TODO explain status in 'message'
            yield currentds_ap
            yielded.add(currentds_ap['path'])
            continue
        remote_repos_to_run_hook_for.append((path, currentds_ap))

        # publish web-interface to root dataset on publication server
        if current_ds.path == ds.path and ui:
            lgr.info("Uploading web interface to %s", path)
            try:
                CreateSibling.upload_web_interface(path, ssh, shared, ui)
            except CommandError as e:
                currentds_ap['status'] = 'error'
                currentds_ap['message'] = (
                    "failed to push web interface to the remote datalad "
                    "repository (%s)",
                    exc_str(e))
                yield currentds_ap
                yielded.add(currentds_ap['path'])
                continue

    # in reverse order would be depth first
    lgr.info("Running post-update hooks in all created siblings")
    # TODO: add progressbar
    for path, currentds_ap in remote_repos_to_run_hook_for[::-1]:
        # Trigger the hook
        lgr.debug("Running hook for %s (if exists and executable)", path)
        try:
            ssh("cd {} "
                "&& ( [ -x hooks/post-update ] && hooks/post-update || : )"
                "".format(sh_quote(_path_(path, ".git"))))
        except CommandError as e:
            currentds_ap['status'] = 'error'
            currentds_ap['message'] = (
                "failed to run post-update hook under remote path %s (%s)",
                path, exc_str(e))
            yield currentds_ap
            yielded.add(currentds_ap['path'])
            continue
        if not currentds_ap['path'] in yielded:
            # if we were silent until now everything is just splendid
            currentds_ap['status'] = 'ok'
            yield currentds_ap
def test_add_source(path, url, ds_dir):
    raise SkipTest('functionality is not supported ATM')
    from os import listdir
    from datalad.support.network import RI

    urls = [RI(url + f) for f in listdir(path)]
    ds = Dataset(ds_dir).create()
    eq_(len(ds.repo.get_annexed_files()), 0)

    # add a remote source to git => fail:
    assert_raises(NotImplementedError, ds.add, source=urls[0], to_git=True)
    # annex add a remote source:
    ds.add(source=urls[0])
    eq_(len(ds.repo.get_annexed_files()), 1)

    # add two remote sources and give local names:
    ds.add(path=['local1.dat', 'local2.dat'], source=urls[1:3])
    annexed = ds.repo.get_annexed_files()
    eq_(len(annexed), 3)
    assert_in('local1.dat', annexed)
    assert_in('local2.dat', annexed)

    # add a second source for one of them
    ds.add(path='local1.dat', source=urls[3])
    eq_(len(annexed), 3)
    whereis_dict = ds.repo.whereis('local1.dat', output='full')
    reg_urls = [whereis_dict[uuid]['urls'] for uuid in whereis_dict
                if not whereis_dict[uuid]['here']]
    eq_(len(reg_urls), 1)  # one remote for 'local1.dat', that is not "here"
    eq_({str(urls[1]), str(urls[3])}, set(reg_urls[0]))

    # just to be sure compare to 'local2.dat':
    whereis_dict = ds.repo.whereis('local2.dat', output='full')
    reg_urls = [whereis_dict[uuid]['urls'] for uuid in whereis_dict
                if not whereis_dict[uuid]['here']]
    eq_(len(reg_urls), 1)  # one remote for 'local2.dat', that is not "here"
    eq_([urls[2]], reg_urls[0])

    # provide more paths than sources:
    # report failure on non-existing 'local4.dat':
    result = ds.add(path=['local3.dat', 'local4.dat'], source=urls[4])
    ok_(all([r['success'] is False and r['note'] == 'not found'
             for r in result if r['file'] == 'local4.dat']))
    with open(opj(ds.path, 'local4.dat'), 'w') as f:
        f.write('local4 content')

    ds.add(path=['local3.dat', 'local4.dat'], source=urls[4])
    annexed = ds.repo.get_annexed_files()
    eq_(len(annexed), 5)
    assert_in('local3.dat', annexed)
    assert_in('local4.dat', annexed)

    # 'local3.dat' has a remote source
    whereis_dict = ds.repo.whereis('local3.dat', output='full')
    reg_urls = [whereis_dict[uuid]['urls'] for uuid in whereis_dict
                if not whereis_dict[uuid]['here']]
    eq_(len(reg_urls), 1)  # one remote for 'local3.dat', that is not "here"
    eq_([urls[4]], reg_urls[0])

    # 'local4.dat' has no remote source
    whereis_dict = ds.repo.whereis('local4.dat', output='full')
    reg_urls = [whereis_dict[uuid]['urls'] for uuid in whereis_dict
                if not whereis_dict[uuid]['here']]
    eq_(len(reg_urls), 0)

    # provide more sources than paths:
    ds.add('local5.dat', source=urls[5:])
    annexed = ds.repo.get_annexed_files()
    assert_in('local5.dat', annexed)
    eq_(len(annexed), 5 + len(urls[5:]))

    # Note: local4.dat didn't come from an url,
    # but 'local1.dat' consumes two urls
    eq_(len(annexed), len(urls))
    # all files annexed (-2 for '.git' and '.datalad'):
    eq_(len(annexed), len(listdir(ds.path)) - 2)