def used_cache(
    self,
    targets=None,
    all_branches=False,
    with_deps=False,
    all_tags=False,
    all_commits=False,
    remote=None,
    force=False,
    jobs=None,
    recursive=False,
):
    """Gather the cache entries that the stages of `targets` refer to.

    Walks every revision yielded by `self.brancher(...)` (the selection is
    controlled by `all_branches`/`all_tags`/`all_commits`), collects the
    `(stage, filter_info)` pairs for each target via `collect_granular`,
    and merges each stage's `get_used_cache(...)` result into a single
    accumulator.

    When `targets` is empty or None, a single `None` target is used.

    Returns:
        A `NamedCache` holding everything collected. Entries gathered on
        a revision with a truthy name are merged with the suffix
        "(<revision>)"; otherwise the suffix is empty.
    """
    from dvc.cache import NamedCache

    collected = NamedCache()
    revisions = self.brancher(
        all_branches=all_branches,
        all_tags=all_tags,
        all_commits=all_commits,
    )

    for rev in revisions:
        targets = targets or [None]

        pairs = cat(
            self.collect_granular(
                target, recursive=recursive, with_deps=with_deps
            )
            for target in targets
        )

        label = "({})".format(rev) if rev else ""

        for stage, filter_info in pairs:
            stage_cache = stage.get_used_cache(
                remote=remote,
                force=force,
                jobs=jobs,
                filter_info=filter_info,
            )
            collected.update(stage_cache, suffix=label)

    return collected
def get_used_cache(self, used_run_cache, *args, **kwargs):
    """Merge the used cache of every loadable run-cache entry.

    Each `(key, value)` pair from `used_run_cache` is loaded with
    `self._load_cache`; pairs that load to a falsy entry are skipped.
    A stage is built from each remaining entry via `self._create_stage`
    and its `get_used_cache(*args, **kwargs)` result is merged in.

    Returns:
        A `NamedCache` with the merged results.
    """
    from dvc.cache import NamedCache

    merged = NamedCache()
    for key, value in used_run_cache:
        loaded = self._load_cache(key, value)
        if loaded:
            restored_stage = self._create_stage(loaded)
            merged.update(restored_stage.get_used_cache(*args, **kwargs))
    return merged
def _collect_used_dir_cache(self, remote=None, force=False, jobs=None):
    """Get the cache entries related to the given directory.

    - Pull the directory entry from the remote cache if it was changed.

    If the directory entry is still missing after the pull attempt, the
    user is asked to confirm (unless `force` is set). On refusal a
    `DvcException` is raised; otherwise an empty `NamedCache` is returned,
    so every successful path returns the same type.

    Example:

        Given the following commands:

        $ echo "foo" > directory/foo
        $ echo "bar" > directory/bar
        $ dvc add directory

        It will return a NamedCache like:

        nc = NamedCache()
        nc.add(self.scheme, 'c157a79031e1', 'directory/foo')
        nc.add(self.scheme, 'd3b07384d113', 'directory/bar')

    Args:
        remote: forwarded to `self.repo.cloud.pull`.
        force: when truthy, skip the confirmation prompt.
        jobs: forwarded to `self.repo.cloud.pull`.

    Returns:
        A `NamedCache` with one entry per item of `self.dir_cache`, or an
        empty one when the directory cache is unavailable.

    Raises:
        DvcException: the directory cache is missing and the user
            declined to continue.
    """
    if self.cache.changed_cache_file(self.checksum):
        try:
            self.repo.cloud.pull(
                NamedCache.make("local", self.checksum, str(self)),
                jobs=jobs,
                remote=remote,
                show_checksums=False,
            )
        except DvcException:
            # Best effort: the second check below decides what to do.
            logger.debug("failed to pull cache for '{}'".format(self))

    if self.cache.changed_cache_file(self.checksum):
        msg = (
            "Missing cache for directory '{}'. "
            "Cache for files inside will be lost. "
            "Would you like to continue? Use '-f' to force."
        )
        if not force and not prompt.confirm(msg.format(self.path_info)):
            raise DvcException(
                "unable to fully collect used cache"
                " without cache for directory '{}'".format(self)
            )
        # Previously a bare list was returned here; return an empty
        # NamedCache so callers always receive the same type.
        return NamedCache()

    cache = NamedCache()
    for entry in self.dir_cache:
        checksum = entry[self.remote.PARAM_CHECKSUM]
        path_info = self.path_info / entry[self.remote.PARAM_RELPATH]
        cache.add(self.scheme, checksum, str(path_info))

    return cache
def get_used_cache(self, **kwargs):
    """Build the `NamedCache` describing what this output uses.

    - Outputs that do not use the cache yield an empty `NamedCache`.
    - Outputs of a repo-import stage record the single dependency's
      `def_path` under `external[dep.repo_pair]`.
    - Outputs without hash info log a warning and yield an empty
      `NamedCache`.
    - Otherwise the output's own hash is recorded, plus (for directory
      checksums) the child cache returned by
      `collect_used_dir_cache(**kwargs)`.
    """
    if not self.use_cache:
        return NamedCache()

    if self.stage.is_repo_import:
        imported = NamedCache()
        (dep,) = self.stage.deps
        imported.external[dep.repo_pair].add(dep.def_path)
        return imported

    if not self.hash_info:
        parts = [
            "Output '{}'({}) is missing version info. "
            "Cache for it will not be collected. "
            "Use `dvc repro` to get your pipeline up to date.".format(
                self, self.stage
            )
        ]
        if self.exists:
            # The file is present, so committing it is an alternative.
            parts.append(
                "\n"
                "You can also use `dvc commit {stage.addressing}` "
                "to associate existing '{out}' with {stage}.".format(
                    out=self, stage=self.stage
                )
            )
        logger.warning("".join(parts))
        return NamedCache()

    used = NamedCache.make(self.scheme, self.hash_info.value, str(self))

    if self.is_dir_checksum:
        used.add_child_cache(
            self.hash_info.value,
            self.collect_used_dir_cache(**kwargs),
        )

    return used
def test_used_cache(tmp_dir, dvc, path):
    """`dvc.used_cache([path])` reports the dir entry and the nested file."""
    from dvc.cache import NamedCache

    tmp_dir.dvc_gen({"dir": {"subdir": {"file": "file"}, "other": "other"}})

    dir_checksum = "70922d6bf66eb073053a82f77d58c536.dir"
    expected = NamedCache.make("local", dir_checksum, "dir")
    nested_file = NamedCache.make(
        "local",
        "8c7dd922ad47494fc02c388e12c00eac",
        os.path.join("dir", "subdir", "file"),
    )
    expected.add_child_cache(dir_checksum, nested_file)

    actual = dvc.used_cache([path])

    assert actual._items == expected._items
    assert actual.external == expected.external
def get_dir_cache(self, **kwargs):
    """Return `self.dir_cache`, pulling the directory entry first if needed.

    Extra keyword arguments are forwarded to `self.repo.cloud.pull`.

    Raises:
        DvcException: this output does not carry a directory checksum.
    """
    if not self.is_dir_checksum:
        raise DvcException("cannot get dir cache for file checksum")

    # Only hit the remote when the local cache file is reported as changed.
    needs_pull = self.cache.changed_cache_file(self.checksum)
    if needs_pull:
        wanted = NamedCache.make("local", self.checksum, str(self))
        self.repo.cloud.pull(wanted, show_checksums=False, **kwargs)

    return self.dir_cache
def test_status_download_optimization(mocker, dvc):
    """Skip the remote existence check when everything is cached locally.

    When computing the status for a download and every wanted file is
    already reported by the local cache, `cache_exists` must not be
    called on the other remote.
    """
    remote = RemoteLOCAL(dvc, {})

    infos = NamedCache()
    wanted = (
        ("acbd18db4cc2f85cedef654fccc4a4d8", "foo"),
        ("37b51d194a7513e45b56f6524f2d51f2", "bar"),
    )
    for checksum, name in wanted:
        infos.add("local", checksum, name)

    # Pretend every wanted checksum already exists locally.
    local_exists = list(infos["local"])
    mocker.patch.object(remote, "cache_exists", return_value=local_exists)

    other_remote = mocker.Mock()
    other_remote.url = "other_remote"
    other_remote.cache_exists.return_value = []

    remote.status(infos, other_remote, download=True)

    assert other_remote.cache_exists.call_count == 0
def get_used_cache(self, used_run_cache, *args, **kwargs):
    """Merge the used cache of every loadable run-cache entry.

    Each `(key, value)` pair is loaded with `self._load_cache`; falsy
    entries are skipped. For the rest, a `PipelineStage` at "dvc.yaml" is
    created from the entry's `cmd`, `deps` paths and `outs` paths, filled
    from the entry with `StageLoader.fill_from_lock`, and its
    `get_used_cache(*args, **kwargs)` result is merged in.

    Returns:
        A `NamedCache` with the merged results.
    """
    from dvc.cache import NamedCache
    from dvc.stage import create_stage, PipelineStage

    collected = NamedCache()
    for key, value in used_run_cache:
        entry = self._load_cache(key, value)
        if entry:
            command = entry["cmd"]
            dep_paths = [dep["path"] for dep in entry["deps"]]
            out_paths = [out["path"] for out in entry["outs"]]
            stage = create_stage(
                PipelineStage,
                repo=self.repo,
                path="dvc.yaml",
                cmd=command,
                deps=dep_paths,
                outs=out_paths,
            )
            StageLoader.fill_from_lock(stage, entry)
            collected.update(stage.get_used_cache(*args, **kwargs))
    return collected
def _fetch_external(self, repo_url, repo_rev, files, jobs):
    """Fetch `files` from the external repository at `repo_url`/`repo_rev`.

    For an `ExternalRepo`, outputs found by relative path are pulled
    through that repo's cloud into this repo's local cache directory;
    names with no matching output are handed to `_git_to_cache` instead.
    For any other repo object, every name goes to `_git_to_cache`.

    Returns:
        A `(downloaded, failed)` tuple of counters. A `CloneError` is
        logged and counted as one failure.
    """
    from dvc.external_repo import external_repo, ExternalRepo

    downloaded = 0
    failed = 0

    try:
        with external_repo(repo_url, repo_rev) as repo:
            if isinstance(repo, ExternalRepo):
                # Only names without a DVC output end up here.
                git_files = []
                repo.cache.local.cache_dir = self.cache.local.cache_dir

                with repo.state:
                    used = NamedCache()
                    for name in files:
                        try:
                            out = repo.find_out_by_relpath(name)
                        except OutputNotFoundError:
                            # try to add to cache if they are git-tracked
                            # files
                            git_files.append(name)
                            continue
                        used.update(out.get_used_cache())

                    try:
                        downloaded += repo.cloud.pull(used, jobs=jobs)
                    except DownloadError as exc:
                        failed += exc.amount
            else:
                # Not a DVC repo: every requested file is git-tracked.
                git_files = files

            git_downloaded, git_failed = _git_to_cache(
                self.cache.local, repo.root_dir, git_files
            )
            downloaded += git_downloaded
            failed += git_failed
    except CloneError:
        failed += 1
        logger.exception(
            "failed to fetch data for '{}'".format(", ".join(files))
        )

    return downloaded, failed
def get_dir_cache(self, **kwargs):
    """Return `self.dir_cache`, pulling and reloading the dir object first.

    The cached object for `self.hash_info` is checked; if it is missing
    or malformed, it is pulled via `self.repo.cloud.pull` (extra keyword
    arguments are forwarded). The object is then loaded; if that fails,
    `self.hash_info.dir_info` is reset to None.

    Raises:
        DvcException: this output does not carry a directory checksum.
    """
    if not self.is_dir_checksum:
        raise DvcException("cannot get dir cache for file checksum")

    unavailable = (FileNotFoundError, objects.ObjectFormatError)

    try:
        cached_obj = self.cache.get(self.hash_info)
        objects.check(self.cache, cached_obj)
    except unavailable:
        wanted = NamedCache.make("local", self.hash_info.value, str(self))
        self.repo.cloud.pull(wanted, show_checksums=False, **kwargs)

    try:
        objects.load(self.cache, self.hash_info)
        assert self.hash_info.dir_info
    except unavailable:
        # Still not loadable after the pull attempt.
        self.hash_info.dir_info = None

    return self.dir_cache
def _test_cloud(self):
    """Exercise status/push/pull on the configured cloud for a file and a dir.

    Adds `self.FOO` and `self.DATA_DIR`, then checks that the reported
    status goes NEW -> OK (after push) -> DELETED (after removing the
    local cache dir) -> OK (after pull), for both the file and the
    directory entry.
    """
    self._setup_cloud()

    stages = self.dvc.add(self.FOO)
    self.assertEqual(len(stages), 1)
    stage = stages[0]
    self.assertIsNotNone(stage)
    out = stage.outs[0]
    cache = out.cache_path
    md5 = out.checksum
    info = out.get_used_cache()

    stages = self.dvc.add(self.DATA_DIR)
    self.assertEqual(len(stages), 1)
    stage_dir = stages[0]
    self.assertIsNotNone(stage_dir)
    out_dir = stage_dir.outs[0]
    cache_dir = out_dir.cache_path
    name_dir = str(out_dir)
    md5_dir = out_dir.checksum
    info_dir = NamedCache.make(out_dir.scheme, md5_dir, name_dir)

    with self.cloud.repo.state:
        # Check status
        status = self.cloud.status(info, show_checksums=True)
        expected = {md5: {"name": md5, "status": STATUS_NEW}}
        self.assertEqual(status, expected)

        status_dir = self.cloud.status(info_dir, show_checksums=True)
        expected = {md5_dir: {"name": md5_dir, "status": STATUS_NEW}}
        self.assertEqual(status_dir, expected)

        # Push and check status
        self.cloud.push(info)
        self.assertTrue(os.path.exists(cache))
        self.assertTrue(os.path.isfile(cache))

        self.cloud.push(info_dir)
        self.assertTrue(os.path.isfile(cache_dir))

        status = self.cloud.status(info, show_checksums=True)
        expected = {md5: {"name": md5, "status": STATUS_OK}}
        self.assertEqual(status, expected)

        status_dir = self.cloud.status(info_dir, show_checksums=True)
        expected = {md5_dir: {"name": md5_dir, "status": STATUS_OK}}
        self.assertEqual(status_dir, expected)

        # Remove and check status
        remove(self.dvc.cache.local.cache_dir)

        status = self.cloud.status(info, show_checksums=True)
        expected = {md5: {"name": md5, "status": STATUS_DELETED}}
        self.assertEqual(status, expected)

        status_dir = self.cloud.status(info_dir, show_checksums=True)
        expected = {md5_dir: {"name": md5_dir, "status": STATUS_DELETED}}
        self.assertEqual(status_dir, expected)

        # Pull and check status
        self.cloud.pull(info)
        self.assertTrue(os.path.exists(cache))
        self.assertTrue(os.path.isfile(cache))
        with open(cache, "r") as fd:
            self.assertEqual(fd.read(), self.FOO_CONTENTS)

        self.cloud.pull(info_dir)
        self.assertTrue(os.path.isfile(cache_dir))

        status = self.cloud.status(info, show_checksums=True)
        expected = {md5: {"name": md5, "status": STATUS_OK}}
        self.assertEqual(status, expected)

        status_dir = self.cloud.status(info_dir, show_checksums=True)
        expected = {md5_dir: {"name": md5_dir, "status": STATUS_OK}}
        # assertEqual, not assertTrue: assertTrue would treat `expected`
        # as the failure message and pass for any non-empty status.
        self.assertEqual(status_dir, expected)
def test_cloud(tmp_dir, dvc, remote):
    """Status goes NEW -> OK -> DELETED -> OK across push, remove and pull."""
    (stage,) = tmp_dir.dvc_gen("foo", "foo")
    out = stage.outs[0]
    cache = out.cache_path
    md5 = out.checksum
    info = out.get_used_cache()

    (stage_dir,) = tmp_dir.dvc_gen(
        {
            "data_dir": {
                "data_sub_dir": {"data_sub": "data_sub"},
                "data": "data",
            }
        }
    )
    out_dir = stage_dir.outs[0]
    cache_dir = out_dir.cache_path
    name_dir = str(out_dir)
    md5_dir = out_dir.checksum
    info_dir = NamedCache.make(out_dir.scheme, md5_dir, name_dir)

    def check_statuses(wanted):
        # File entry first, then the directory entry.
        for used, checksum in ((info, md5), (info_dir, md5_dir)):
            reported = dvc.cloud.status(used, show_checksums=True)
            expected = {checksum: {"name": checksum, "status": wanted}}
            assert reported == expected

    with dvc.state:
        # Check status
        check_statuses(STATUS_NEW)

        # Push and check status
        dvc.cloud.push(info)
        assert os.path.exists(cache)
        assert os.path.isfile(cache)

        dvc.cloud.push(info_dir)
        assert os.path.isfile(cache_dir)

        check_statuses(STATUS_OK)

        # Remove and check status
        remove(dvc.cache.local.cache_dir)
        check_statuses(STATUS_DELETED)

        # Pull and check status
        dvc.cloud.pull(info)
        assert os.path.exists(cache)
        assert os.path.isfile(cache)
        with open(cache) as fd:
            assert fd.read() == "foo"

        dvc.cloud.pull(info_dir)
        assert os.path.isfile(cache_dir)

        check_statuses(STATUS_OK)
def _collect_used_dir_cache(self, remote=None, force=False, jobs=None, filter_info=None):
    """Collect cache entries for the files inside this directory output.

    - Pull the directory entry from the remote cache if it was changed.
    - If it is still missing, ask for confirmation (skipped when `force`
      is set); on refusal raise `CollectCacheError`, otherwise return an
      empty `NamedCache`.
    - When `filter_info` is given, keep only the entry whose path equals
      it, or entries located underneath it.

    Example:

        Given the following commands:

        $ echo "foo" > directory/foo
        $ echo "bar" > directory/bar
        $ dvc add directory

        It will return a NamedCache like:

        nc = NamedCache()
        nc.add(self.scheme, 'c157a79031e1', 'directory/foo')
        nc.add(self.scheme, 'd3b07384d113', 'directory/bar')
    """
    cache = NamedCache()

    if self.cache.changed_cache_file(self.checksum):
        try:
            self.repo.cloud.pull(
                NamedCache.make("local", self.checksum, str(self)),
                jobs=jobs,
                remote=remote,
                show_checksums=False,
            )
        except DvcException:
            logger.debug("failed to pull cache for '{}'".format(self))

    if self.cache.changed_cache_file(self.checksum):
        msg = (
            "Missing cache for directory '{}'. "
            "Cache for files inside will be lost. "
            "Would you like to continue? Use '-f' to force."
        )
        if force or prompt.confirm(msg.format(self.path_info)):
            return cache
        raise CollectCacheError(
            "unable to fully collect used cache"
            " without cache for directory '{}'".format(self)
        )

    base_path = str(self.path_info)
    wanted_path = str(filter_info) if filter_info else None
    on_windows = os.name == "nt"

    for entry in self.dir_cache:
        checksum = entry[self.remote.PARAM_CHECKSUM]
        relpath = entry[self.remote.PARAM_RELPATH]
        if on_windows:
            # Entries use "/" separators; convert to the native one.
            relpath = relpath.replace("/", os.sep)
        full_path = os.path.join(base_path, relpath)

        if (
            wanted_path
            and full_path != wanted_path
            and not full_path.startswith(wanted_path + os.sep)
        ):
            continue

        cache.add(self.scheme, checksum, full_path)

    return cache
def test_cloud(tmp_dir, dvc, remote):  # pylint:disable=unused-argument
    """Status tracks NEW, MISSING, OK and DELETED across cache operations."""
    (stage,) = tmp_dir.dvc_gen("foo", "foo")
    out = stage.outs[0]
    cache = out.cache_path
    md5 = out.checksum
    info = out.get_used_cache()

    (stage_dir,) = tmp_dir.dvc_gen(
        {
            "data_dir": {
                "data_sub_dir": {"data_sub": "data_sub"},
                "data": "data",
            }
        }
    )
    out_dir = stage_dir.outs[0]
    cache_dir = out_dir.cache_path
    name_dir = str(out_dir)
    md5_dir = out_dir.checksum
    info_dir = NamedCache.make(out_dir.scheme, md5_dir, name_dir)

    def check_statuses(wanted):
        # File entry first, then the directory entry.
        for used, checksum in ((info, md5), (info_dir, md5_dir)):
            reported = dvc.cloud.status(used, show_checksums=True)
            expected = {checksum: {"name": checksum, "status": wanted}}
            assert reported == expected

    with dvc.state:
        # Check status
        check_statuses(STATUS_NEW)

        # Move cache and check status
        # See issue https://github.com/iterative/dvc/issues/4383 for details
        backup_dir = dvc.cache.local.cache_dir + ".backup"
        move(dvc.cache.local.cache_dir, backup_dir)
        check_statuses(STATUS_MISSING)

        # Restore original cache:
        remove(dvc.cache.local.cache_dir)
        move(backup_dir, dvc.cache.local.cache_dir)

        # Push and check status
        dvc.cloud.push(info)
        assert os.path.exists(cache)
        assert os.path.isfile(cache)

        dvc.cloud.push(info_dir)
        assert os.path.isfile(cache_dir)

        check_statuses(STATUS_OK)

        # Remove and check status
        remove(dvc.cache.local.cache_dir)
        check_statuses(STATUS_DELETED)

        # Pull and check status
        dvc.cloud.pull(info)
        assert os.path.exists(cache)
        assert os.path.isfile(cache)
        with open(cache) as fd:
            assert fd.read() == "foo"

        dvc.cloud.pull(info_dir)
        assert os.path.isfile(cache_dir)

        check_statuses(STATUS_OK)
def gc(
    self,
    all_branches=False,
    cloud=False,
    remote=None,
    with_deps=False,
    all_tags=False,
    all_commits=False,
    force=False,
    jobs=None,
    repos=None,
    workspace=False,
):
    """Garbage-collect cache entries that no selected revision uses.

    The used cache is gathered from this repo and from every repo path in
    `repos` (each opened as a `Repo`, with its lock and state held while
    collecting). `_do_gc` is then run for the local cache, for each of
    the s3/gs/ssh/hdfs/azure caches that is configured, and, when `cloud`
    is set, for the remote resolved from `remote`.
    """
    # require `workspace` to be true to come into effect.
    # assume `workspace` to be enabled if any of `all_tags`, `all_commits`,
    # or `all_branches` are enabled.
    _raise_error_if_all_disabled(
        workspace=workspace,
        all_tags=all_tags,
        all_commits=all_commits,
        all_branches=all_branches,
    )

    from contextlib import ExitStack
    from dvc.repo import Repo

    all_repos = [Repo(path) for path in repos] if repos else []

    with ExitStack() as stack:
        for repo in all_repos:
            stack.enter_context(repo.lock)
            stack.enter_context(repo.state)

        used = NamedCache()
        for repo in all_repos + [self]:
            repo_used = repo.used_cache(
                all_branches=all_branches,
                with_deps=with_deps,
                all_tags=all_tags,
                all_commits=all_commits,
                remote=remote,
                force=force,
                jobs=jobs,
            )
            used.update(repo_used)

    # The local cache is always collected; the others only when configured.
    _do_gc("local", self.cache.local.gc, used, jobs)

    for scheme in ("s3", "gs", "ssh", "hdfs", "azure"):
        scheme_cache = getattr(self.cache, scheme)
        if scheme_cache:
            _do_gc(scheme, scheme_cache.gc, used, jobs)

    if cloud:
        cloud_remote = self.cloud.get_remote(remote, "gc -c")
        _do_gc("remote", cloud_remote.gc, used, jobs)
def gc(
    self,
    all_branches=False,
    cloud=False,
    remote=None,
    with_deps=False,
    all_tags=False,
    all_commits=False,
    force=False,
    jobs=None,
    repos=None,
    workspace=False,
):
    """Garbage-collect cache entries that no selected revision uses.

    The used cache is gathered from this repo and from every repo path in
    `repos` (each opened as a `Repo`, with its lock and state held while
    collecting). Every truthy cache yielded by `self.cache.by_scheme()`
    is then collected against the used keys of its scheme; when `cloud`
    is set, the remote resolved from `remote` is collected against the
    used local-scheme keys. An info message is logged whenever a
    collection removes nothing.
    """
    # require `workspace` to be true to come into effect.
    # assume `workspace` to be enabled if any of `all_tags`, `all_commits`,
    # or `all_branches` are enabled.
    _raise_error_if_all_disabled(
        workspace=workspace,
        all_tags=all_tags,
        all_commits=all_commits,
        all_branches=all_branches,
    )

    from contextlib import ExitStack
    from dvc.repo import Repo

    all_repos = [Repo(path) for path in (repos or [])]

    with ExitStack() as stack:
        for repo in all_repos:
            stack.enter_context(repo.lock)
            stack.enter_context(repo.state)

        used = NamedCache()
        for repo in all_repos + [self]:
            repo_used = repo.used_cache(
                all_branches=all_branches,
                with_deps=with_deps,
                all_tags=all_tags,
                all_commits=all_commits,
                remote=remote,
                force=force,
                jobs=jobs,
            )
            used.update(repo_used)

    for scheme, cache in self.cache.by_scheme():
        if cache:
            in_use = set(used.scheme_keys(scheme))
            if not cache.gc(in_use, jobs=jobs):
                logger.info(f"No unused '{scheme}' cache to remove.")

    if cloud:
        cloud_remote = self.cloud.get_remote(remote, "gc -c")
        in_use_local = set(used.scheme_keys(Schemes.LOCAL))
        if not cloud_remote.gc(in_use_local, jobs=jobs):
            logger.info("No unused cache to remove from remote.")