def get_used_cache(self, *args, **kwargs):
    from dvc.objects.db import NamedCache

    cache = NamedCache()
    for out in self.filter_outs(kwargs.get("filter_info")):
        cache.update(out.get_used_cache(*args, **kwargs))

    return cache

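# Hypothetical usage sketch (not part of the DVC codebase) for the per-stage
# aggregation above; `stage` is assumed to be an already-loaded dvc.stage.Stage.
# Indexing the returned NamedCache by scheme and iterating it yields the hash
# values, the same access pattern the tests further below rely on.
stage_cache = stage.get_used_cache()
for checksum in stage_cache["local"]:
    print("stage requires cache object", checksum)
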
def collect_used_dir_cache(
    self, remote=None, force=False, jobs=None, filter_info=None
):
    """Get a list of `info`s related to the given directory.

    - Pull the directory entry from the remote cache if it was changed.

    Example:

        Given the following commands:

        $ echo "foo" > directory/foo
        $ echo "bar" > directory/bar
        $ dvc add directory

        It will return a NamedCache like:

            nc = NamedCache()
            nc.add(self.scheme, 'c157a79031e1', 'directory/foo')
            nc.add(self.scheme, 'd3b07384d113', 'directory/bar')
    """
    cache = NamedCache()

    try:
        self.get_dir_cache(jobs=jobs, remote=remote)
    except DvcException:
        logger.debug(f"failed to pull cache for '{self}'")

    try:
        objects.check(self.odb, self.odb.get(self.hash_info))
    except (FileNotFoundError, ObjectFormatError):
        msg = (
            "Missing cache for directory '{}'. "
            "Cache for files inside will be lost. "
            "Would you like to continue? Use '-f' to force."
        )
        if not force and not prompt.confirm(msg.format(self.path_info)):
            raise CollectCacheError(
                "unable to fully collect used cache"
                " without cache for directory '{}'".format(self)
            )
        return cache

    path = str(self.path_info)
    filter_path = str(filter_info) if filter_info else None
    for entry_key, entry_obj in self.obj:
        entry_path = os.path.join(path, *entry_key)
        if (
            not filter_path
            or entry_path == filter_path
            or entry_path.startswith(filter_path + os.sep)
        ):
            cache.add(self.scheme, entry_obj.hash_info.value, entry_path)

    return cache

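# A minimal, self-contained sketch (not DVC code) of the `filter_info` prefix
# check used above: an entry is kept when there is no filter, when it equals
# the filter path, or when it lives under it. The os.sep guard keeps
# "directory/foo" from matching "directory/foobar".
import os


def _matches(entry_path, filter_path):
    return (
        not filter_path
        or entry_path == filter_path
        or entry_path.startswith(filter_path + os.sep)
    )


assert _matches(os.path.join("directory", "foo"), os.path.join("directory", "foo"))
assert _matches(os.path.join("directory", "foo"), "directory")
assert not _matches(os.path.join("directory", "foobar"), os.path.join("directory", "foo"))
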
def get_used_cache(self, used_run_cache, *args, **kwargs):
    from dvc.objects.db import NamedCache

    cache = NamedCache()

    for key, value in used_run_cache:
        entry = self._load_cache(key, value)
        if not entry:
            continue
        stage = self._create_stage(entry)
        cache.update(stage.get_used_cache(*args, **kwargs))
    return cache

def get_used_cache(self, **kwargs):
    """Get a dumpd of the given `out`, with an entry including the branch.

    The `used_cache` of an output is no more than its `info`.

    In case that the given output is a directory, it will also include
    the `info` of its files.
    """
    if not self.use_cache:
        return NamedCache()

    if self.stage.is_repo_import:
        cache = NamedCache()
        (dep,) = self.stage.deps
        cache.external[dep.repo_pair].add(dep.def_path)
        return cache

    if not self.hash_info:
        msg = (
            "Output '{}'({}) is missing version info. "
            "Cache for it will not be collected. "
            "Use `dvc repro` to get your pipeline up to date.".format(
                self, self.stage
            )
        )
        if self.exists:
            msg += (
                "\n"
                "You can also use `dvc commit {stage.addressing}` "
                "to associate existing '{out}' with {stage}.".format(
                    out=self, stage=self.stage
                )
            )
        logger.warning(msg)
        return NamedCache()

    ret = NamedCache.make(self.scheme, self.hash_info.value, str(self))

    if not self.is_dir_checksum:
        return ret

    ret.add_child_cache(
        self.hash_info.value, self.collect_used_dir_cache(**kwargs)
    )

    return ret

def test_used_cache(tmp_dir, dvc, path):
    from dvc.objects.db import NamedCache

    tmp_dir.dvc_gen({"dir": {"subdir": {"file": "file"}, "other": "other"}})
    expected = NamedCache.make(
        "local", "70922d6bf66eb073053a82f77d58c536.dir", "dir"
    )
    expected.add_child_cache(
        "70922d6bf66eb073053a82f77d58c536.dir",
        NamedCache.make(
            "local",
            "8c7dd922ad47494fc02c388e12c00eac",
            os.path.join("dir", "subdir", "file"),
        ),
    )

    used_cache = dvc.used_cache([path])
    assert (
        used_cache._items == expected._items
        and used_cache.external == expected.external
    )

def test_status_download_optimization(mocker, dvc):
    """When comparing the status to pull a remote cache,
    and the desired files to fetch are already on the local cache,
    don't check the existence of the desired files on the remote cache.
    """
    odb = LocalObjectDB(LocalFileSystem())

    infos = NamedCache()
    infos.add("local", "acbd18db4cc2f85cedef654fccc4a4d8", "foo")
    infos.add("local", "37b51d194a7513e45b56f6524f2d51f2", "bar")

    local_exists = list(infos["local"])
    mocker.patch.object(odb, "hashes_exist", return_value=local_exists)

    other_remote = mocker.Mock()
    other_remote.url = "other_remote"
    other_remote.hashes_exist.return_value = []
    other_remote.index = RemoteIndexNoop()

    other_remote.status(odb, infos, download=True)

    assert other_remote.hashes_exist.call_count == 0

def get_dir_cache(self, **kwargs):
    if not self.is_dir_checksum:
        raise DvcException("cannot get dir cache for file checksum")

    try:
        objects.check(self.odb, self.odb.get(self.hash_info))
    except (FileNotFoundError, ObjectFormatError):
        self.repo.cloud.pull(
            NamedCache.make("local", self.hash_info.value, str(self)),
            show_checksums=False,
            **kwargs,
        )

    try:
        self.obj = objects.load(self.odb, self.hash_info)
    except (FileNotFoundError, ObjectFormatError):
        self.obj = None

    return self.obj

def used_cache(
    self,
    targets=None,
    all_branches=False,
    with_deps=False,
    all_tags=False,
    all_commits=False,
    all_experiments=False,
    remote=None,
    force=False,
    jobs=None,
    recursive=False,
    used_run_cache=None,
    revs=None,
):
    """Get the stages related to the given target and collect
    the `info` of its outputs.

    This is useful to know what files from the cache are _in use_
    (namely, a file described as an output on a stage).

    The scope is, by default, the working directory, but you can use
    `all_branches`/`all_tags`/`all_commits`/`all_experiments` to expand
    the scope.

    Returns:
        A dictionary with Schemes (representing output's location) mapped
        to items containing the output's `dumpd` names and the output's
        children (if the given output is a directory).
    """
    from dvc.objects.db import NamedCache

    cache = NamedCache()

    for branch in self.brancher(
        revs=revs,
        all_branches=all_branches,
        all_tags=all_tags,
        all_commits=all_commits,
        all_experiments=all_experiments,
    ):
        targets = targets or [None]

        pairs = cat(
            self.stage.collect_granular(
                target, recursive=recursive, with_deps=with_deps
            )
            for target in targets
        )

        suffix = f"({branch})" if branch else ""
        for stage, filter_info in pairs:
            used_cache = stage.get_used_cache(
                remote=remote,
                force=force,
                jobs=jobs,
                filter_info=filter_info,
            )
            cache.update(used_cache, suffix=suffix)

    if used_run_cache:
        used_cache = self.stage_cache.get_used_cache(
            used_run_cache,
            remote=remote,
            force=force,
            jobs=jobs,
        )
        cache.update(used_cache)

    return cache

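# Hypothetical usage sketch for used_cache() above; `repo` is assumed to be a
# dvc.repo.Repo instance. The returned NamedCache is the structure that gc()
# below feeds into odb.gc() via scheme_keys().
used = repo.used_cache(all_branches=True, all_tags=True)
keep = set(used.scheme_keys("local"))  # hash values that must be preserved
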
def test_cloud(tmp_dir, dvc, remote):  # pylint:disable=unused-argument
    (stage,) = tmp_dir.dvc_gen("foo", "foo")
    out = stage.outs[0]
    cache = out.cache_path
    md5 = out.hash_info.value
    info = out.get_used_cache()

    (stage_dir,) = tmp_dir.dvc_gen(
        {
            "data_dir": {
                "data_sub_dir": {"data_sub": "data_sub"},
                "data": "data",
                "empty": "",
            }
        }
    )
    out_dir = stage_dir.outs[0]
    cache_dir = out_dir.cache_path
    name_dir = str(out_dir)
    md5_dir = out_dir.hash_info.value
    info_dir = NamedCache.make(out_dir.scheme, md5_dir, name_dir)

    # Check status
    status = dvc.cloud.status(info, show_checksums=True)
    expected = {md5: {"name": md5, "status": STATUS_NEW}}
    assert status == expected

    status_dir = dvc.cloud.status(info_dir, show_checksums=True)
    expected = {md5_dir: {"name": md5_dir, "status": STATUS_NEW}}
    assert status_dir == expected

    # Move cache and check status
    # See issue https://github.com/iterative/dvc/issues/4383 for details
    backup_dir = dvc.odb.local.cache_dir + ".backup"
    move(dvc.odb.local.cache_dir, backup_dir)

    status = dvc.cloud.status(info, show_checksums=True)
    expected = {md5: {"name": md5, "status": STATUS_MISSING}}
    assert status == expected

    status_dir = dvc.cloud.status(info_dir, show_checksums=True)
    expected = {md5_dir: {"name": md5_dir, "status": STATUS_MISSING}}
    assert status_dir == expected

    # Restore original cache
    remove(dvc.odb.local.cache_dir)
    move(backup_dir, dvc.odb.local.cache_dir)

    # Push and check status
    dvc.cloud.push(info)
    assert os.path.exists(cache)
    assert os.path.isfile(cache)

    dvc.cloud.push(info_dir)
    assert os.path.isfile(cache_dir)

    status = dvc.cloud.status(info, show_checksums=True)
    expected = {md5: {"name": md5, "status": STATUS_OK}}
    assert status == expected

    status_dir = dvc.cloud.status(info_dir, show_checksums=True)
    expected = {md5_dir: {"name": md5_dir, "status": STATUS_OK}}
    assert status_dir == expected

    # Remove and check status
    remove(dvc.odb.local.cache_dir)

    status = dvc.cloud.status(info, show_checksums=True)
    expected = {md5: {"name": md5, "status": STATUS_DELETED}}
    assert status == expected

    status_dir = dvc.cloud.status(info_dir, show_checksums=True)
    expected = {md5_dir: {"name": md5_dir, "status": STATUS_DELETED}}
    assert status_dir == expected

    # Pull and check status
    dvc.cloud.pull(info)
    assert os.path.exists(cache)
    assert os.path.isfile(cache)
    with open(cache) as fd:
        assert fd.read() == "foo"

    dvc.cloud.pull(info_dir)
    assert os.path.isfile(cache_dir)

    status = dvc.cloud.status(info, show_checksums=True)
    expected = {md5: {"name": md5, "status": STATUS_OK}}
    assert status == expected

    status_dir = dvc.cloud.status(info_dir, show_checksums=True)
    expected = {md5_dir: {"name": md5_dir, "status": STATUS_OK}}
    assert status_dir == expected

def gc(
    self,
    all_branches=False,
    cloud=False,
    remote=None,
    with_deps=False,
    all_tags=False,
    all_commits=False,
    all_experiments=False,
    force=False,
    jobs=None,
    repos=None,
    workspace=False,
):
    # require `workspace` to be true to come into effect.
    # assume `workspace` to be enabled if any of `all_tags`, `all_commits`,
    # `all_experiments` or `all_branches` are enabled.
    _raise_error_if_all_disabled(
        workspace=workspace,
        all_tags=all_tags,
        all_commits=all_commits,
        all_branches=all_branches,
        all_experiments=all_experiments,
    )

    from contextlib import ExitStack

    from dvc.repo import Repo

    if not repos:
        repos = []
    all_repos = [Repo(path) for path in repos]

    with ExitStack() as stack:
        for repo in all_repos:
            stack.enter_context(repo.lock)
            stack.enter_context(repo.state)

        used = NamedCache()
        for repo in all_repos + [self]:
            used.update(
                repo.used_cache(
                    all_branches=all_branches,
                    with_deps=with_deps,
                    all_tags=all_tags,
                    all_commits=all_commits,
                    all_experiments=all_experiments,
                    remote=remote,
                    force=force,
                    jobs=jobs,
                )
            )

    for scheme, odb in self.odb.by_scheme():
        if not odb:
            continue
        removed = odb.gc(set(used.scheme_keys(scheme)), jobs=jobs)
        if not removed:
            logger.info(f"No unused '{scheme}' cache to remove.")

    if not cloud:
        return

    remote = self.cloud.get_remote(remote, "gc -c")
    removed = remote.gc(set(used.scheme_keys(Schemes.LOCAL)), jobs=jobs)
    if not removed:
        logger.info("No unused cache to remove from remote.")

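# Hypothetical usage sketch for the gc() entry point above; `repo` is assumed
# to be a dvc.repo.Repo instance and "myremote" a configured remote name. At
# least one scope flag must be enabled, otherwise _raise_error_if_all_disabled()
# aborts before anything is removed.
repo.gc(workspace=True)                    # collect against the workspace only
repo.gc(all_branches=True, all_tags=True)  # widen the scope to every branch/tag
repo.gc(workspace=True, cloud=True, remote="myremote")  # also gc the remote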