Пример #1
0
    def used_cache(
        self,
        targets=None,
        all_branches=False,
        with_deps=False,
        all_tags=False,
        all_commits=False,
        remote=None,
        force=False,
        jobs=None,
        recursive=False,
    ):
        """Collect the cache entries used by the stages behind `targets`.

        For every revision yielded by `self.brancher(...)`, the stages
        matching each target are gathered with `self.collect_granular` and
        each stage's `get_used_cache(...)` result is merged into a single
        `NamedCache`.

        By default only the working directory is inspected; pass
        `all_branches`/`all_tags`/`all_commits` to widen the scope.

        Args:
            targets: iterable of targets; a falsy value is replaced by
                `[None]`, i.e. a single `collect_granular(None, ...)` call.
            with_deps, recursive: forwarded to `self.collect_granular`.
            remote, force, jobs: forwarded to each stage's `get_used_cache`.

        Returns:
            A `NamedCache` holding everything collected; entries coming from
            a named revision are added with the suffix "(<revision>)".
        """
        from dvc.cache import NamedCache

        collected = NamedCache()
        revisions = self.brancher(
            all_branches=all_branches,
            all_tags=all_tags,
            all_commits=all_commits,
        )

        for revision in revisions:
            targets = targets or [None]

            stage_filters = cat(
                self.collect_granular(
                    item, recursive=recursive, with_deps=with_deps)
                for item in targets)

            # The working tree yields a falsy revision and gets no suffix.
            label = "({})".format(revision) if revision else ""
            for stage, filter_info in stage_filters:
                collected.update(
                    stage.get_used_cache(
                        remote=remote,
                        force=force,
                        jobs=jobs,
                        filter_info=filter_info,
                    ),
                    suffix=label,
                )

        return collected
Пример #2
0
    def get_used_cache(self, used_run_cache, *args, **kwargs):
        """Merge the used cache of every loadable run-cache entry.

        Each `(key, value)` pair of `used_run_cache` is loaded through
        `self._load_cache`; pairs that load to a falsy value are skipped.
        A stage is rebuilt from every remaining entry with
        `self._create_stage` and its `get_used_cache(*args, **kwargs)` result
        is folded into the returned `NamedCache`.
        """
        from dvc.cache import NamedCache

        merged = NamedCache()

        for key, value in used_run_cache:
            loaded = self._load_cache(key, value)
            if loaded:
                rebuilt = self._create_stage(loaded)
                merged.update(rebuilt.get_used_cache(*args, **kwargs))

        return merged
Пример #3
0
    def _collect_used_dir_cache(self, remote=None, force=False, jobs=None):
        """Get a NamedCache of the entries related to the given directory.

        - Pull the directory entry from the remote cache if it was changed.

        Example:

            Given the following commands:

            $ echo "foo" > directory/foo
            $ echo "bar" > directory/bar
            $ dvc add directory

            It will return a NamedCache like:

            nc = NamedCache()
            nc.add(self.scheme, 'c157a79031e1', 'directory/foo')
            nc.add(self.scheme, 'd3b07384d113', 'directory/bar')

        Args:
            remote: forwarded to `self.repo.cloud.pull`.
            force: when truthy, skip the confirmation prompt shown if the
                directory cache is still missing after the pull attempt.
            jobs: forwarded to `self.repo.cloud.pull`.

        Returns:
            A `NamedCache` with one entry per item of `self.dir_cache`. When
            the directory cache is still missing and the caller forced or
            confirmed continuing, an empty `NamedCache` is returned.

        Raises:
            DvcException: the directory cache is missing, `force` is falsy
                and the user declined the prompt.
        """
        if self.cache.changed_cache_file(self.checksum):
            try:
                self.repo.cloud.pull(
                    NamedCache.make("local", self.checksum, str(self)),
                    jobs=jobs,
                    remote=remote,
                    show_checksums=False,
                )
            except DvcException:
                # Best effort: the second check below handles a failed pull.
                logger.debug("failed to pull cache for '{}'".format(self))

        if self.cache.changed_cache_file(self.checksum):
            msg = ("Missing cache for directory '{}'. "
                   "Cache for files inside will be lost. "
                   "Would you like to continue? Use '-f' to force.")
            if not force and not prompt.confirm(msg.format(self.path_info)):
                raise DvcException(
                    "unable to fully collect used cache"
                    " without cache for directory '{}'".format(self))
            # Always hand back a NamedCache (never a bare list) so callers
            # can treat every return value of this method the same way.
            return NamedCache()

        cache = NamedCache()
        for entry in self.dir_cache:
            checksum = entry[self.remote.PARAM_CHECKSUM]
            path_info = self.path_info / entry[self.remote.PARAM_RELPATH]
            cache.add(self.scheme, checksum, str(path_info))

        return cache
Пример #4
0
    def get_used_cache(self, **kwargs):
        """Build the `NamedCache` describing what this output uses.

        - Outputs with `use_cache` disabled contribute an empty cache.
        - For a repo-import stage, the single dependency's `def_path` is
          recorded under `external[dep.repo_pair]` instead.
        - Outputs without `hash_info` log a warning and contribute an empty
          cache.
        - Otherwise the output's own hash is recorded; for a directory
          checksum the result of `collect_used_dir_cache(**kwargs)` is
          attached as a child cache.
        """
        if not self.use_cache:
            return NamedCache()

        if self.stage.is_repo_import:
            imported = NamedCache()
            (dep,) = self.stage.deps
            imported.external[dep.repo_pair].add(dep.def_path)
            return imported

        if not self.hash_info:
            missing_template = (
                "Output '{}'({}) is missing version info. "
                "Cache for it will not be collected. "
                "Use `dvc repro` to get your pipeline up to date."
            )
            warning = missing_template.format(self, self.stage)
            if self.exists:
                commit_hint = (
                    "\n"
                    "You can also use `dvc commit {stage.addressing}` "
                    "to associate existing '{out}' with {stage}."
                )
                warning += commit_hint.format(out=self, stage=self.stage)
            logger.warning(warning)
            return NamedCache()

        own_cache = NamedCache.make(
            self.scheme, self.hash_info.value, str(self)
        )

        if self.is_dir_checksum:
            # Attach the directory's entries under the directory hash.
            own_cache.add_child_cache(
                self.hash_info.value, self.collect_used_dir_cache(**kwargs),
            )

        return own_cache
Пример #5
0
def test_used_cache(tmp_dir, dvc, path):
    """`dvc.used_cache([path])` reports the dir entry plus its nested file."""
    from dvc.cache import NamedCache

    tmp_dir.dvc_gen({"dir": {"subdir": {"file": "file"}, "other": "other"}})

    dir_checksum = "70922d6bf66eb073053a82f77d58c536.dir"
    expected = NamedCache.make("local", dir_checksum, "dir")
    nested_file = NamedCache.make(
        "local",
        "8c7dd922ad47494fc02c388e12c00eac",
        os.path.join("dir", "subdir", "file"),
    )
    expected.add_child_cache(dir_checksum, nested_file)

    used_cache = dvc.used_cache([path])
    assert (
        used_cache._items == expected._items
        and used_cache.external == expected.external
    )
Пример #6
0
 def get_dir_cache(self, **kwargs):
     """Return `self.dir_cache`, pulling the directory entry first if needed.

     Raises `DvcException` when this output does not have a directory
     checksum. When `self.cache.changed_cache_file(self.checksum)` is true,
     the directory entry is pulled through `self.repo.cloud.pull`, with
     `**kwargs` forwarded to that call.
     """
     if not self.is_dir_checksum:
         raise DvcException("cannot get dir cache for file checksum")

     needs_pull = self.cache.changed_cache_file(self.checksum)
     if needs_pull:
         self.repo.cloud.pull(
             NamedCache.make("local", self.checksum, str(self)),
             show_checksums=False,
             **kwargs,
         )

     return self.dir_cache
Пример #7
0
def test_status_download_optimization(mocker, dvc):
    """When comparing the status to pull a remote cache,
        And the desired files to fetch are already on the local cache,
        Don't check the existence of the desired files on the remote cache
    """
    local_remote = RemoteLOCAL(dvc, {})

    wanted = NamedCache()
    for checksum, name in (
        ("acbd18db4cc2f85cedef654fccc4a4d8", "foo"),
        ("37b51d194a7513e45b56f6524f2d51f2", "bar"),
    ):
        wanted.add("local", checksum, name)

    # Report every wanted checksum as already present locally.
    already_local = list(wanted["local"])
    mocker.patch.object(
        local_remote, "cache_exists", return_value=already_local
    )

    upstream = mocker.Mock()
    upstream.url = "other_remote"
    upstream.cache_exists.return_value = []

    local_remote.status(wanted, upstream, download=True)

    assert upstream.cache_exists.call_count == 0
Пример #8
0
    def get_used_cache(self, used_run_cache, *args, **kwargs):
        """Merge the used cache of every loadable run-cache entry.

        Each `(key, value)` pair is loaded with `self._load_cache`; falsy
        results are skipped. For the rest, a `PipelineStage` is created at
        path "dvc.yaml" from the entry's `cmd`, dep paths and out paths,
        filled via `StageLoader.fill_from_lock`, and its
        `get_used_cache(*args, **kwargs)` result is merged into the returned
        `NamedCache`.
        """
        from dvc.cache import NamedCache
        from dvc.stage import create_stage, PipelineStage

        merged = NamedCache()

        for key, value in used_run_cache:
            lock_entry = self._load_cache(key, value)
            if lock_entry:
                command = lock_entry["cmd"]
                dep_paths = [dep["path"] for dep in lock_entry["deps"]]
                out_paths = [out["path"] for out in lock_entry["outs"]]
                stage = create_stage(
                    PipelineStage,
                    repo=self.repo,
                    path="dvc.yaml",
                    cmd=command,
                    deps=dep_paths,
                    outs=out_paths,
                )
                StageLoader.fill_from_lock(stage, lock_entry)
                merged.update(stage.get_used_cache(*args, **kwargs))

        return merged
Пример #9
0
def _fetch_external(self, repo_url, repo_rev, files, jobs):
    """Fetch `files` from the external repository `repo_url` at `repo_rev`.

    When the opened repo is an `ExternalRepo`, its local cache directory is
    pointed at this repo's local cache directory, the used cache of every
    file that resolves to an output is collected, and that cache is pulled
    once. Files for which no output is found are handed to `_git_to_cache`,
    as are all files when the repo is not an `ExternalRepo`.

    Args:
        repo_url: passed to `external_repo`.
        repo_rev: passed to `external_repo`.
        files: relative paths to fetch.
        jobs: forwarded to `repo.cloud.pull`.

    Returns:
        A `(downloaded, failed)` tuple of counters. A `CloneError` is logged
        and counted as one failure instead of being raised.
    """
    from dvc.external_repo import external_repo, ExternalRepo

    failed, downloaded = 0, 0
    try:
        with external_repo(repo_url, repo_rev) as repo:
            is_dvc_repo = isinstance(repo, ExternalRepo)
            # gather git-only tracked files if dvc repo
            git_files = [] if is_dvc_repo else files
            if is_dvc_repo:
                repo.cache.local.cache_dir = self.cache.local.cache_dir
                with repo.state:
                    cache = NamedCache()
                    for name in files:
                        try:
                            out = repo.find_out_by_relpath(name)
                        except OutputNotFoundError:
                            # try to add to cache if they are git-tracked files
                            git_files.append(name)
                        else:
                            cache.update(out.get_used_cache())

                    # Pull once, after the whole cache has been collected.
                    # Pulling inside the loop re-pulled the growing cache for
                    # every file and counted the same failure repeatedly.
                    try:
                        downloaded += repo.cloud.pull(cache, jobs=jobs)
                    except DownloadError as exc:
                        failed += exc.amount

            d, f = _git_to_cache(self.cache.local, repo.root_dir, git_files)
            downloaded += d
            failed += f
    except CloneError:
        failed += 1
        logger.exception("failed to fetch data for '{}'".format(
            ", ".join(files)))

    return downloaded, failed
Пример #10
0
    def get_dir_cache(self, **kwargs):
        """Return `self.dir_cache`, pulling and loading the dir object first.

        Raises `DvcException` when this output does not have a directory
        checksum. If `objects.check` on the cached object raises
        `FileNotFoundError` or `objects.ObjectFormatError`, the directory
        entry is pulled through `self.repo.cloud.pull` with `**kwargs`
        forwarded. The object is then loaded; if loading raises one of the
        same two errors, `self.hash_info.dir_info` is reset to `None`.
        """
        if not self.is_dir_checksum:
            raise DvcException("cannot get dir cache for file checksum")

        recoverable = (FileNotFoundError, objects.ObjectFormatError)

        try:
            cached_obj = self.cache.get(self.hash_info)
            objects.check(self.cache, cached_obj)
        except recoverable:
            dir_entry = NamedCache.make(
                "local", self.hash_info.value, str(self)
            )
            self.repo.cloud.pull(dir_entry, show_checksums=False, **kwargs)

        try:
            objects.load(self.cache, self.hash_info)
            assert self.hash_info.dir_info
        except recoverable:
            self.hash_info.dir_info = None

        return self.dir_cache
Пример #11
0
    def _test_cloud(self):
        """Exercise status/push/pull on the cloud for a file and a directory.

        After `self._setup_cloud()`, `self.FOO` and `self.DATA_DIR` are
        added, and `self.cloud.status` is checked for both at each step:
        STATUS_NEW before pushing, STATUS_OK after pushing, STATUS_DELETED
        after removing the local cache directory, and STATUS_OK again after
        pulling.
        """
        self._setup_cloud()

        stages = self.dvc.add(self.FOO)
        self.assertEqual(len(stages), 1)
        stage = stages[0]
        self.assertIsNotNone(stage)
        out = stage.outs[0]
        cache = out.cache_path
        md5 = out.checksum
        info = out.get_used_cache()

        stages = self.dvc.add(self.DATA_DIR)
        self.assertEqual(len(stages), 1)
        stage_dir = stages[0]
        self.assertIsNotNone(stage_dir)
        out_dir = stage_dir.outs[0]
        cache_dir = out_dir.cache_path
        name_dir = str(out_dir)
        md5_dir = out_dir.checksum
        info_dir = NamedCache.make(out_dir.scheme, md5_dir, name_dir)

        with self.cloud.repo.state:
            # Check status
            status = self.cloud.status(info, show_checksums=True)
            expected = {md5: {"name": md5, "status": STATUS_NEW}}
            self.assertEqual(status, expected)

            status_dir = self.cloud.status(info_dir, show_checksums=True)
            expected = {md5_dir: {"name": md5_dir, "status": STATUS_NEW}}
            self.assertEqual(status_dir, expected)

            # Push and check status
            self.cloud.push(info)
            self.assertTrue(os.path.exists(cache))
            self.assertTrue(os.path.isfile(cache))

            self.cloud.push(info_dir)
            self.assertTrue(os.path.isfile(cache_dir))

            status = self.cloud.status(info, show_checksums=True)
            expected = {md5: {"name": md5, "status": STATUS_OK}}
            self.assertEqual(status, expected)

            status_dir = self.cloud.status(info_dir, show_checksums=True)
            expected = {md5_dir: {"name": md5_dir, "status": STATUS_OK}}
            self.assertEqual(status_dir, expected)

            # Remove and check status
            remove(self.dvc.cache.local.cache_dir)

            status = self.cloud.status(info, show_checksums=True)
            expected = {md5: {"name": md5, "status": STATUS_DELETED}}
            self.assertEqual(status, expected)

            status_dir = self.cloud.status(info_dir, show_checksums=True)
            expected = {md5_dir: {"name": md5_dir, "status": STATUS_DELETED}}
            self.assertEqual(status_dir, expected)

            # Pull and check status
            self.cloud.pull(info)
            self.assertTrue(os.path.exists(cache))
            self.assertTrue(os.path.isfile(cache))
            with open(cache, "r") as fd:
                self.assertEqual(fd.read(), self.FOO_CONTENTS)

            self.cloud.pull(info_dir)
            self.assertTrue(os.path.isfile(cache_dir))

            status = self.cloud.status(info, show_checksums=True)
            expected = {md5: {"name": md5, "status": STATUS_OK}}
            self.assertEqual(status, expected)

            status_dir = self.cloud.status(info_dir, show_checksums=True)
            expected = {md5_dir: {"name": md5_dir, "status": STATUS_OK}}
            # assertEqual, not assertTrue: assertTrue(status_dir, expected)
            # uses `expected` as the failure message and passes for any
            # non-empty dict.
            self.assertEqual(status_dir, expected)
Пример #12
0
def test_cloud(tmp_dir, dvc, remote):
    """Walk a file and a directory through status/push/remove/pull.

    The reported cloud status of both is checked at each step:
    STATUS_NEW -> (push) STATUS_OK -> (remove local cache) STATUS_DELETED
    -> (pull) STATUS_OK.
    """
    (file_stage,) = tmp_dir.dvc_gen("foo", "foo")
    file_out = file_stage.outs[0]
    cache = file_out.cache_path
    md5 = file_out.checksum
    info = file_out.get_used_cache()

    dir_layout = {
        "data_dir": {
            "data_sub_dir": {"data_sub": "data_sub"},
            "data": "data",
        }
    }
    (dir_stage,) = tmp_dir.dvc_gen(dir_layout)
    dir_out = dir_stage.outs[0]
    cache_dir = dir_out.cache_path
    name_dir = str(dir_out)
    md5_dir = dir_out.checksum
    info_dir = NamedCache.make(dir_out.scheme, md5_dir, name_dir)

    def check_statuses(state):
        # File first, then directory; each is queried and asserted in turn.
        for used, checksum in ((info, md5), (info_dir, md5_dir)):
            reported = dvc.cloud.status(used, show_checksums=True)
            assert reported == {checksum: {"name": checksum, "status": state}}

    with dvc.state:
        # Check status
        check_statuses(STATUS_NEW)

        # Push and check status
        dvc.cloud.push(info)
        assert os.path.exists(cache)
        assert os.path.isfile(cache)

        dvc.cloud.push(info_dir)
        assert os.path.isfile(cache_dir)

        check_statuses(STATUS_OK)

        # Remove and check status
        remove(dvc.cache.local.cache_dir)
        check_statuses(STATUS_DELETED)

        # Pull and check status
        dvc.cloud.pull(info)
        assert os.path.exists(cache)
        assert os.path.isfile(cache)
        with open(cache) as fd:
            assert fd.read() == "foo"

        dvc.cloud.pull(info_dir)
        assert os.path.isfile(cache_dir)

        check_statuses(STATUS_OK)
Пример #13
0
    def _collect_used_dir_cache(self,
                                remote=None,
                                force=False,
                                jobs=None,
                                filter_info=None):
        """Collect a NamedCache of the files inside this directory output.

        The directory entry is pulled from the cloud first when the local
        cache reports it as changed; a failed pull is only logged.

        Example:

            Given the following commands:

            $ echo "foo" > directory/foo
            $ echo "bar" > directory/bar
            $ dvc add directory

            It will return a NamedCache like:

            nc = NamedCache()
            nc.add(self.scheme, 'c157a79031e1', 'directory/foo')
            nc.add(self.scheme, 'd3b07384d113', 'directory/bar')

        When `filter_info` is given, only the entry whose path equals it, or
        entries located underneath it, are included.

        If the directory cache is still missing after the pull attempt, an
        empty NamedCache is returned when `force` is set or the user
        confirms the prompt; otherwise `CollectCacheError` is raised.
        """
        collected = NamedCache()

        if self.cache.changed_cache_file(self.checksum):
            try:
                self.repo.cloud.pull(
                    NamedCache.make("local", self.checksum, str(self)),
                    jobs=jobs,
                    remote=remote,
                    show_checksums=False,
                )
            except DvcException:
                logger.debug("failed to pull cache for '{}'".format(self))

        if self.cache.changed_cache_file(self.checksum):
            question = ("Missing cache for directory '{}'. "
                        "Cache for files inside will be lost. "
                        "Would you like to continue? Use '-f' to force.")
            if force or prompt.confirm(question.format(self.path_info)):
                return collected
            raise CollectCacheError(
                "unable to fully collect used cache"
                " without cache for directory '{}'".format(self))

        base_path = str(self.path_info)
        wanted_path = str(filter_info) if filter_info else None
        on_windows = os.name == "nt"
        for item in self.dir_cache:
            checksum = item[self.remote.PARAM_CHECKSUM]
            relpath = item[self.remote.PARAM_RELPATH]
            if on_windows:
                # Relpaths use "/" separators; convert to the native one.
                relpath = relpath.replace("/", os.sep)
            full_path = os.path.join(base_path, relpath)
            if (wanted_path and full_path != wanted_path
                    and not full_path.startswith(wanted_path + os.sep)):
                continue
            collected.add(self.scheme, checksum, full_path)

        return collected
Пример #14
0
def test_cloud(tmp_dir, dvc, remote):  # pylint:disable=unused-argument
    """Walk a file and a directory through every cloud status.

    Statuses checked, in order: STATUS_NEW, STATUS_MISSING (local cache
    moved away), STATUS_OK (after push), STATUS_DELETED (local cache
    removed) and STATUS_OK again (after pull).
    """
    (file_stage,) = tmp_dir.dvc_gen("foo", "foo")
    file_out = file_stage.outs[0]
    cache = file_out.cache_path
    md5 = file_out.checksum
    info = file_out.get_used_cache()

    dir_layout = {
        "data_dir": {
            "data_sub_dir": {"data_sub": "data_sub"},
            "data": "data",
        }
    }
    (dir_stage,) = tmp_dir.dvc_gen(dir_layout)
    dir_out = dir_stage.outs[0]
    cache_dir = dir_out.cache_path
    name_dir = str(dir_out)
    md5_dir = dir_out.checksum
    info_dir = NamedCache.make(dir_out.scheme, md5_dir, name_dir)

    def check_statuses(state):
        # File first, then directory; each is queried and asserted in turn.
        for used, checksum in ((info, md5), (info_dir, md5_dir)):
            reported = dvc.cloud.status(used, show_checksums=True)
            assert reported == {checksum: {"name": checksum, "status": state}}

    with dvc.state:
        # Check status
        check_statuses(STATUS_NEW)

        # Move cache and check status
        # See issue https://github.com/iterative/dvc/issues/4383 for details
        backup_dir = dvc.cache.local.cache_dir + ".backup"
        move(dvc.cache.local.cache_dir, backup_dir)
        check_statuses(STATUS_MISSING)

        # Restore original cache:
        remove(dvc.cache.local.cache_dir)
        move(backup_dir, dvc.cache.local.cache_dir)

        # Push and check status
        dvc.cloud.push(info)
        assert os.path.exists(cache)
        assert os.path.isfile(cache)

        dvc.cloud.push(info_dir)
        assert os.path.isfile(cache_dir)

        check_statuses(STATUS_OK)

        # Remove and check status
        remove(dvc.cache.local.cache_dir)
        check_statuses(STATUS_DELETED)

        # Pull and check status
        dvc.cloud.pull(info)
        assert os.path.exists(cache)
        assert os.path.isfile(cache)
        with open(cache) as fd:
            assert fd.read() == "foo"

        dvc.cloud.pull(info_dir)
        assert os.path.isfile(cache_dir)

        check_statuses(STATUS_OK)
Пример #15
0
def gc(
    self,
    all_branches=False,
    cloud=False,
    remote=None,
    with_deps=False,
    all_tags=False,
    all_commits=False,
    force=False,
    jobs=None,
    repos=None,
    workspace=False,
):
    """Garbage-collect cache entries not used by this repo or by `repos`.

    The used cache is gathered from every repo in `repos` plus `self` via
    `used_cache(...)`, then `_do_gc` is run for the local cache, for each of
    the s3/gs/ssh/hdfs/azure caches that is truthy, and, when `cloud` is
    set, for the remote returned by `self.cloud.get_remote(remote, "gc -c")`.
    """
    # require `workspace` to be true to come into effect.
    # assume `workspace` to be enabled if any of `all_tags`, `all_commits`,
    # or `all_branches` are enabled.
    _raise_error_if_all_disabled(
        workspace=workspace,
        all_tags=all_tags,
        all_commits=all_commits,
        all_branches=all_branches,
    )

    from contextlib import ExitStack
    from dvc.repo import Repo

    extra_repos = [Repo(path) for path in repos] if repos else []

    with ExitStack() as stack:
        # Hold the lock and state of every extra repo while collecting.
        for other in extra_repos:
            stack.enter_context(other.lock)
            stack.enter_context(other.state)

        used = NamedCache()
        for current in extra_repos + [self]:
            in_use = current.used_cache(
                all_branches=all_branches,
                with_deps=with_deps,
                all_tags=all_tags,
                all_commits=all_commits,
                remote=remote,
                force=force,
                jobs=jobs,
            )
            used.update(in_use)

    _do_gc("local", self.cache.local.gc, used, jobs)

    # Optional per-scheme caches, collected only when configured (truthy).
    for scheme in ("s3", "gs", "ssh", "hdfs", "azure"):
        scheme_cache = getattr(self.cache, scheme)
        if scheme_cache:
            _do_gc(scheme, scheme_cache.gc, used, jobs)

    if cloud:
        _do_gc("remote", self.cloud.get_remote(remote, "gc -c").gc, used, jobs)
Пример #16
0
def gc(
    self,
    all_branches=False,
    cloud=False,
    remote=None,
    with_deps=False,
    all_tags=False,
    all_commits=False,
    force=False,
    jobs=None,
    repos=None,
    workspace=False,
):
    """Garbage-collect cache entries not used by this repo or by `repos`.

    The used cache is gathered from every repo in `repos` plus `self` via
    `used_cache(...)`. Each truthy cache yielded by `self.cache.by_scheme()`
    is then collected against the used keys of its scheme; when `cloud` is
    set, the remote from `self.cloud.get_remote(remote, "gc -c")` is
    collected against the used local-scheme keys. An info message is logged
    whenever a `gc` call returns a falsy value.
    """
    # require `workspace` to be true to come into effect.
    # assume `workspace` to be enabled if any of `all_tags`, `all_commits`,
    # or `all_branches` are enabled.
    _raise_error_if_all_disabled(
        workspace=workspace,
        all_tags=all_tags,
        all_commits=all_commits,
        all_branches=all_branches,
    )

    from contextlib import ExitStack

    from dvc.repo import Repo

    extra_repos = [Repo(path) for path in (repos or [])]

    with ExitStack() as stack:
        # Hold the lock and state of every extra repo while collecting.
        for other in extra_repos:
            stack.enter_context(other.lock)
            stack.enter_context(other.state)

        used = NamedCache()
        for current in extra_repos + [self]:
            in_use = current.used_cache(
                all_branches=all_branches,
                with_deps=with_deps,
                all_tags=all_tags,
                all_commits=all_commits,
                remote=remote,
                force=force,
                jobs=jobs,
            )
            used.update(in_use)

    for scheme, scheme_cache in self.cache.by_scheme():
        if scheme_cache and not scheme_cache.gc(
            set(used.scheme_keys(scheme)), jobs=jobs
        ):
            logger.info(f"No unused '{scheme}' cache to remove.")

    if cloud:
        cloud_remote = self.cloud.get_remote(remote, "gc -c")
        local_keys = set(used.scheme_keys(Schemes.LOCAL))
        if not cloud_remote.gc(local_keys, jobs=jobs):
            logger.info("No unused cache to remove from remote.")