def test_publish_file_handle(origin, src_path, dst_path):
    """Publish a single annexed file handle to a sibling.

    Installs a dataset from ``origin`` into ``src_path``, publishes only
    'test-annex.dat' to a plain repo at ``dst_path``, and verifies that the
    dataset itself was not pushed while the file content became retrievable
    from the 'target' remote.
    """
    # prepare the source dataset (recursively, to cover subdatasets)
    ds = install(path=src_path, source=origin, recursive=True)
    # TODO: For now, circumnavigate the detached head issue.
    # Figure out, what to do.
    for sub in ds.get_dataset_handles(recursive=True):
        AnnexRepo(opj(src_path, sub), init=True,
                  create=True).git_checkout("master")
    ds.repo.get('test-annex.dat')

    # create plain git at target:
    target_repo = AnnexRepo(dst_path, create=True)
    # actually not needed for this test, but provide same setup as
    # everywhere else:
    target_repo.git_checkout("TMP", "-b")
    ds.repo.git_remote_add("target", dst_path)

    # directly publish a file handle, not the dataset itself:
    result = publish(dataset=ds, dest="target", path="test-annex.dat")
    eq_(result, opj(ds.path, 'test-annex.dat'))

    # only the file was published, not the dataset itself:
    assert_not_in("master", target_repo.git_get_branches())
    eq_(Dataset(dst_path).get_dataset_handles(), [])
    assert_not_in("test.dat", target_repo.git_get_files())

    # content is now available from 'target':
    assert_in("target",
              ds.repo.annex_whereis('test-annex.dat',
                                    output="descriptions"))
    ds.repo.annex_drop('test-annex.dat')
    eq_(ds.repo.file_has_content(['test-annex.dat']), [False])
    ds.repo._run_annex_command('get',
                               annex_options=['test-annex.dat',
                                              '--from=target'])
    eq_(ds.repo.file_has_content(['test-annex.dat']), [True])
def test_publish_file_handle(origin, src_path, dst_path):
    """Check that publishing a lone file handle ships content, not the dataset.

    After publishing 'test-annex.dat' to a plain target repository, the
    target must have no 'master' branch and no subdatasets, while the
    annexed content must be retrievable again from the 'target' remote.
    """
    # prepare src
    src_ds = install(path=src_path, source=origin, recursive=True)
    # TODO: For now, circumnavigate the detached head issue.
    # Figure out, what to do.
    for handle in src_ds.get_dataset_handles(recursive=True):
        subrepo = AnnexRepo(opj(src_path, handle), init=True, create=True)
        subrepo.git_checkout("master")
    src_ds.repo.get('test-annex.dat')

    # create plain git at target:
    pub_target = AnnexRepo(dst_path, create=True)
    # actually not needed for this test, but provide same setup as
    # everywhere else:
    pub_target.git_checkout("TMP", "-b")
    src_ds.repo.git_remote_add("target", dst_path)

    # directly publish a file handle, not the dataset itself:
    published = publish(dataset=src_ds, dest="target", path="test-annex.dat")
    eq_(published, opj(src_ds.path, 'test-annex.dat'))

    # only file was published, not the dataset itself:
    assert_not_in("master", pub_target.git_get_branches())
    eq_(Dataset(dst_path).get_dataset_handles(), [])
    assert_not_in("test.dat", pub_target.git_get_files())

    # content is now available from 'target':
    locations = src_ds.repo.annex_whereis('test-annex.dat',
                                          output="descriptions")
    assert_in("target", locations)
    src_ds.repo.annex_drop('test-annex.dat')
    eq_(src_ds.repo.file_has_content(['test-annex.dat']), [False])
    src_ds.repo._run_annex_command(
        'get', annex_options=['test-annex.dat', '--from=target'])
    eq_(src_ds.repo.file_has_content(['test-annex.dat']), [True])
class Dataset(object):
    """Representation of a dataset rooted at a filesystem path.

    Lazily discovers the underlying (annex or plain git) repository and
    offers operations on the dataset's subdataset handles and state.
    """

    __slots__ = ['_path', '_repo']

    def __init__(self, path):
        # normalize to an absolute path; the repo is discovered lazily
        self._path = abspath(path)
        self._repo = None

    def __repr__(self):
        return "<Dataset path=%s>" % self.path

    @property
    def path(self):
        """path to the dataset"""
        return self._path

    @property
    def repo(self):
        """Get an instance of the version control system/repo for this
        dataset, or None if there is none yet.

        If creating an instance of GitRepo is guaranteed to be really cheap
        this could also serve as a test whether a repo is present.

        Returns
        -------
        GitRepo
        """
        if self._repo is None:
            with swallow_logs():
                # prefer an annex repo; fall back to plain git; leave None if
                # neither is present at the path
                try:
                    self._repo = AnnexRepo(self._path, create=False, init=False)
                except (InvalidGitRepositoryError, NoSuchPathError,
                        RuntimeError):
                    try:
                        self._repo = GitRepo(self._path, create=False)
                    except (InvalidGitRepositoryError, NoSuchPathError):
                        pass
        elif not isinstance(self._repo, AnnexRepo):
            # repo was initially set to be self._repo but might become AnnexRepo
            # at a later moment, so check if it didn't happen
            if 'git-annex' in self._repo.git_get_branches():
                # we acquired git-annex branch
                self._repo = AnnexRepo(self._repo.path, create=False)
        return self._repo

    def register_sibling(self, name, url, publish_url=None, verify=None):
        """Register the location of a sibling dataset under a given name.

        Optionally, different URLs can be given for retrieving information from
        the sibling and for publishing information to it.
        This is a cheap operation that does not confirm that at the given
        location an actual sibling dataset is available, unless verify is set.
        The value "dataset" verifies, that at the given URL an accessible
        dataset is available and the value "sibling" furthermore verifies, that
        this dataset shares at least one commit with self.

        Parameters
        ----------
        name
        url
        publish_url
        verify
          None | "dataset" | "sibling"
        """
        repo = self.repo

        if verify is not None:
            raise NotImplementedError("TODO: verify not implemented yet")

        if name not in repo.git_get_remotes():
            # Add remote
            repo.git_remote_add(name, url)
            if publish_url is not None:
                # set push url:
                repo._git_custom_command(
                    '',
                    ["git", "remote", "set-url", "--push", name, publish_url])
            lgr.info("Added remote '%s':\n %s (pull)\n%s (push)." %
                     (name, url, publish_url if publish_url else url))
        else:
            # BUGFIX: both messages used a '%s' placeholder but never supplied
            # the remote name, so a literal '%s' was emitted to the user
            lgr.warning("Remote '%s' already exists. Ignore." % name)
            raise ValueError("'%s' already exists. Couldn't register sibling."
                             % name)

    def get_dataset_handles(self, pattern=None, fulfilled=None, absolute=False,
                            recursive=False):
        """Get names/paths of all known dataset_handles (subdatasets),
        optionally matching a specific name pattern.

        Parameters
        ----------
        pattern : None
          Not implemented
        fulfilled : None or bool
          If not None, return either only present or absent datasets.
        absolute : bool
          If True, absolute paths will be returned.
        recursive : bool
          If True, recurse into all subdatasets and report their dataset
          handles too.

        Returns
        -------
        list(Dataset paths) or None
          None is returned if there is no repository instance yet. For an
          existing repository with no subdatasets an empty list is returned.
        """
        if pattern is not None:
            raise NotImplementedError

        repo = self.repo
        if repo is None:
            return
        # check whether we have anything in the repo. if not go home early
        if not repo.repo.head.is_valid():
            return []
        try:
            submodules = repo.get_submodules()
        except InvalidGitRepositoryError:
            # this happens when we access a repository with a submodule that
            # has no commits, hence doesn't appear in the index and
            # 'git submodule status' also doesn't list it
            return []
        # filter if desired
        if fulfilled is None:
            submodules = [sm.path for sm in submodules]
        else:
            submodules = [sm.path for sm in submodules
                          if sm.module_exists() == fulfilled]
        # expand list with child submodules. keep all paths relative to parent
        # and convert jointly at the end
        if recursive:
            rsm = []
            for sm in submodules:
                rsm.append(sm)
                sdspath = opj(self._path, sm)
                rsm.extend(
                    [opj(sm, sdsh)
                     for sdsh in Dataset(sdspath).get_dataset_handles(
                         pattern=pattern, fulfilled=fulfilled, absolute=False,
                         recursive=recursive)])
            submodules = rsm

        if absolute:
            return [opj(self._path, sm) for sm in submodules]
        else:
            return submodules

    # def get_file_handles(self, pattern=None, fulfilled=None):
    #     """Get paths to all known file_handles, optionally matching a
    #     specific name pattern.
    #
    #     If fulfilled is True, only paths to fullfiled handles are returned,
    #     if False, only paths to unfulfilled handles are returned.
    #
    #     Parameters
    #     ----------
    #     pattern: str
    #     fulfilled: bool
    #
    #     Returns
    #     -------
    #     list of str
    #       (paths)
    #     """
    #     raise NotImplementedError("TODO")

    # TODO maybe needs to get its own interface
    def remember_state(self, message, auto_add_changes=True, version=None):
        """Commit the current state of the dataset.

        Parameters
        ----------
        message: str
          commit message
        auto_add_changes: bool
          if True, stage all changes in the working tree before committing
        version: str
          if given, tag the resulting commit with this version label
        """
        if not self.is_installed():
            raise RuntimeError(
                "cannot remember a state when a dataset is not yet installed")
        repo = self.repo
        if auto_add_changes:
            repo.annex_add('.')
        repo.commit(message)
        if version:
            repo._git_custom_command('', 'git tag "{0}"'.format(version))

    def recall_state(self, whereto):
        """Something that can be used to checkout a particular state
        (tag, commit) to "undo" a change or switch to a otherwise desired
        previous state.

        Parameters
        ----------
        whereto: str
        """
        if not self.is_installed():
            # message previously said "remember" — copy-paste from
            # remember_state()
            raise RuntimeError(
                "cannot recall a state when a dataset is not yet installed")
        self.repo.git_checkout(whereto)

    def is_installed(self):
        """Returns whether a dataset is installed.

        A dataset is installed when a repository for it exists on the
        filesystem.

        Returns
        -------
        bool
        """
        return self.path is not None and self.repo is not None
class Dataset(object):
    """Representation of a dataset rooted at a filesystem path.

    Lazily discovers the underlying (annex or plain git) repository and
    offers operations on the dataset's subdataset handles and state.
    """

    __slots__ = ['_path', '_repo']

    def __init__(self, path):
        # normalize to an absolute path; the repo is discovered lazily
        self._path = abspath(path)
        self._repo = None

    def __repr__(self):
        return "<Dataset path=%s>" % self.path

    @property
    def path(self):
        """path to the dataset"""
        return self._path

    @property
    def repo(self):
        """Get an instance of the version control system/repo for this
        dataset, or None if there is none yet.

        If creating an instance of GitRepo is guaranteed to be really cheap
        this could also serve as a test whether a repo is present.

        Returns
        -------
        GitRepo
        """
        if self._repo is None:
            with swallow_logs():
                # prefer an annex repo; fall back to plain git; leave None if
                # neither is present at the path
                try:
                    self._repo = AnnexRepo(self._path, create=False, init=False)
                except (InvalidGitRepositoryError, NoSuchPathError,
                        RuntimeError):
                    try:
                        self._repo = GitRepo(self._path, create=False)
                    except (InvalidGitRepositoryError, NoSuchPathError):
                        pass
        elif not isinstance(self._repo, AnnexRepo):
            # repo was initially set to be self._repo but might become AnnexRepo
            # at a later moment, so check if it didn't happen
            if 'git-annex' in self._repo.git_get_branches():
                # we acquired git-annex branch
                self._repo = AnnexRepo(self._repo.path, create=False)
        return self._repo

    def register_sibling(self, name, url, publish_url=None, verify=None):
        """Register the location of a sibling dataset under a given name.

        Optionally, different URLs can be given for retrieving information from
        the sibling and for publishing information to it.
        This is a cheap operation that does not confirm that at the given
        location an actual sibling dataset is available, unless verify is set.
        The value "dataset" verifies, that at the given URL an accessible
        dataset is available and the value "sibling" furthermore verifies, that
        this dataset shares at least one commit with self.

        Parameters
        ----------
        name
        url
        publish_url
        verify
          None | "dataset" | "sibling"
        """
        repo = self.repo

        if verify is not None:
            raise NotImplementedError("TODO: verify not implemented yet")

        if name not in repo.git_get_remotes():
            # Add remote
            repo.git_remote_add(name, url)
            if publish_url is not None:
                # set push url:
                repo._git_custom_command(
                    '',
                    ["git", "remote", "set-url", "--push", name, publish_url])
            lgr.info("Added remote '%s':\n %s (pull)\n%s (push)." %
                     (name, url, publish_url if publish_url else url))
        else:
            # BUGFIX: both messages used a '%s' placeholder but never supplied
            # the remote name, so a literal '%s' was emitted to the user
            lgr.warning("Remote '%s' already exists. Ignore." % name)
            raise ValueError("'%s' already exists. Couldn't register sibling."
                             % name)

    def get_dataset_handles(self, pattern=None, fulfilled=None, absolute=False,
                            recursive=False):
        """Get names/paths of all known dataset_handles (subdatasets),
        optionally matching a specific name pattern.

        Parameters
        ----------
        pattern : None
          Not implemented
        fulfilled : None or bool
          If not None, return either only present or absent datasets.
        absolute : bool
          If True, absolute paths will be returned.
        recursive : bool
          If True, recurse into all subdatasets and report their dataset
          handles too.

        Returns
        -------
        list(Dataset paths) or None
          None is returned if there is no repository instance yet. For an
          existing repository with no subdatasets an empty list is returned.
        """
        if pattern is not None:
            raise NotImplementedError

        repo = self.repo
        if repo is None:
            return
        # check whether we have anything in the repo. if not go home early
        if not repo.repo.head.is_valid():
            return []
        try:
            submodules = repo.get_submodules()
        except InvalidGitRepositoryError:
            # this happens when we access a repository with a submodule that
            # has no commits, hence doesn't appear in the index and
            # 'git submodule status' also doesn't list it
            return []
        # filter if desired
        if fulfilled is None:
            submodules = [sm.path for sm in submodules]
        else:
            submodules = [sm.path for sm in submodules
                          if sm.module_exists() == fulfilled]
        # expand list with child submodules. keep all paths relative to parent
        # and convert jointly at the end
        if recursive:
            rsm = []
            for sm in submodules:
                rsm.append(sm)
                sdspath = opj(self._path, sm)
                rsm.extend(
                    [opj(sm, sdsh)
                     for sdsh in Dataset(sdspath).get_dataset_handles(
                         pattern=pattern, fulfilled=fulfilled, absolute=False,
                         recursive=recursive)])
            submodules = rsm

        if absolute:
            return [opj(self._path, sm) for sm in submodules]
        else:
            return submodules

    # def get_file_handles(self, pattern=None, fulfilled=None):
    #     """Get paths to all known file_handles, optionally matching a
    #     specific name pattern.
    #
    #     If fulfilled is True, only paths to fullfiled handles are returned,
    #     if False, only paths to unfulfilled handles are returned.
    #
    #     Parameters
    #     ----------
    #     pattern: str
    #     fulfilled: bool
    #
    #     Returns
    #     -------
    #     list of str
    #       (paths)
    #     """
    #     raise NotImplementedError("TODO")

    # TODO maybe needs to get its own interface
    def remember_state(self, message, auto_add_changes=True, version=None):
        """Commit the current state of the dataset.

        Parameters
        ----------
        message: str
          commit message
        auto_add_changes: bool
          if True, stage all changes in the working tree before committing
        version: str
          if given, tag the resulting commit with this version label
        """
        if not self.is_installed():
            raise RuntimeError(
                "cannot remember a state when a dataset is not yet installed")
        repo = self.repo
        if auto_add_changes:
            repo.annex_add('.')
        repo.commit(message)
        if version:
            repo._git_custom_command('', 'git tag "{0}"'.format(version))

    def recall_state(self, whereto):
        """Something that can be used to checkout a particular state
        (tag, commit) to "undo" a change or switch to a otherwise desired
        previous state.

        Parameters
        ----------
        whereto: str
        """
        if not self.is_installed():
            # message previously said "remember" — copy-paste from
            # remember_state()
            raise RuntimeError(
                "cannot recall a state when a dataset is not yet installed")
        self.repo.git_checkout(whereto)

    def is_installed(self):
        """Returns whether a dataset is installed.

        A dataset is installed when a repository for it exists on the
        filesystem.

        Returns
        -------
        bool
        """
        return self.path is not None and self.repo is not None