Example #1
    def __init__(
        self,
        root_dir,
        url,
        scm=None,
        rev=None,
        for_write=False,
        cache_dir=None,
        cache_types=None,
        **kwargs,
    ):
        self.root_dir = os.path.realpath(root_dir)
        self.scm = scm

        self.url = url
        self.for_write = for_write
        self.cache_dir = cache_dir or self._get_cache_dir()
        self.cache_types = cache_types

        self.rev = rev
        self.tree_confs = kwargs

        self.config = {"cache": {"dir": self.cache_dir}}
        self.cache = Cache(self)
        if cache_types:
            self.cache.local.cache_types = cache_types

        self.state = StateNoop()
Example #2
    def __init__(self, root_dir):
        from dvc.logger import Logger
        from dvc.config import Config
        from dvc.state import LinkState, State
        from dvc.lock import Lock
        from dvc.scm import SCM
        from dvc.cache import Cache
        from dvc.data_cloud import DataCloud
        from dvc.updater import Updater
        from dvc.prompt import Prompt

        self.root_dir = os.path.abspath(os.path.realpath(root_dir))
        self.dvc_dir = os.path.join(self.root_dir, self.DVC_DIR)

        self.config = Config(self.dvc_dir)
        self.scm = SCM(self.root_dir)
        self.lock = Lock(self.dvc_dir)
        # NOTE: storing state and link_state in the repository itself to avoid
        # any possible state corruption in 'shared cache dir' scenario.
        self.state = State(self)
        self.link_state = LinkState(self)

        core = self.config._config[Config.SECTION_CORE]
        self.logger = Logger(core.get(Config.SECTION_CORE_LOGLEVEL, None))

        self.cache = Cache(self)
        self.cloud = DataCloud(self, config=self.config._config)
        self.updater = Updater(self.dvc_dir)
        self.prompt = Prompt()

        self._ignore()

        self.updater.check()
Example #3
    def init(root_dir=os.curdir):
        """
        Initialize a DVC project in the given directory.

        Args:
            root_dir: Path to project's root directory.

        Returns:
            Project instance.

        Raises:
            KeyError: Raises an exception.
        """
        root_dir = os.path.abspath(root_dir)
        dvc_dir = os.path.join(root_dir, Project.DVC_DIR)
        os.mkdir(dvc_dir)

        config = Config.init(dvc_dir)
        cache = Cache.init(dvc_dir)
        state = State.init(root_dir, dvc_dir)
        lock = Lock(dvc_dir)

        scm = SCM(root_dir)
        scm.ignore_list([cache.cache_dir, state.state_file, lock.lock_file])

        ignore_file = os.path.join(dvc_dir, scm.ignore_file())
        scm.add([config.config_file, ignore_file])

        return Project(root_dir)
Example #4
def test_import_url_dir(tmp_dir, dvc, workspace, stage_md5, dir_md5):
    workspace.gen({"dir": {"file": "file", "subdir": {"subfile": "subfile"}}})

    # remove external cache to make sure that we don't need it to import dirs
    with dvc.config.edit() as conf:
        del conf["cache"]
    dvc.cache = Cache(dvc)

    assert not (tmp_dir / "dir").exists()  # sanity check
    dvc.imp_url("remote://workspace/dir")
    assert set(os.listdir(tmp_dir / "dir")) == {"file", "subdir"}
    assert (tmp_dir / "dir" / "file").read_text() == "file"
    assert list(os.listdir(tmp_dir / "dir" / "subdir")) == ["subfile"]
    assert (tmp_dir / "dir" / "subdir" / "subfile").read_text() == "subfile"

    assert (tmp_dir / "dir.dvc").read_text() == (
        f"md5: {stage_md5}\n"
        "frozen: true\n"
        "deps:\n"
        f"- md5: {dir_md5}\n"
        "  path: remote://workspace/dir\n"
        "outs:\n"
        "- md5: b6dcab6ccd17ca0a8bf4a215a37d14cc.dir\n"
        "  path: dir\n")

    assert dvc.status() == {}
Example #5
    def __init__(self, root_dir=None, scm=None, rev=None):
        from dvc.state import State, StateNoop
        from dvc.lock import make_lock
        from dvc.scm import SCM
        from dvc.cache import Cache
        from dvc.data_cloud import DataCloud
        from dvc.repo.metrics import Metrics
        from dvc.repo.plots import Plots
        from dvc.repo.params import Params
        from dvc.scm.tree import WorkingTree
        from dvc.utils.fs import makedirs
        from dvc.stage.cache import StageCache

        if scm:
            # use GitTree instead of WorkingTree as default repo tree instance
            tree = scm.get_tree(rev)
            self.root_dir = self.find_root(root_dir, tree)
            self.scm = scm
            self.tree = tree
            self.state = StateNoop()
        else:
            root_dir = self.find_root(root_dir)
            self.root_dir = os.path.abspath(os.path.realpath(root_dir))

        self.dvc_dir = os.path.join(self.root_dir, self.DVC_DIR)
        self.config = Config(self.dvc_dir)

        if not scm:
            no_scm = self.config["core"].get("no_scm", False)
            self.scm = SCM(self.root_dir, no_scm=no_scm)
            self.tree = WorkingTree(self.root_dir)

        self.tmp_dir = os.path.join(self.dvc_dir, "tmp")
        self.index_dir = os.path.join(self.tmp_dir, "index")
        makedirs(self.index_dir, exist_ok=True)

        hardlink_lock = self.config["core"].get("hardlink_lock", False)
        self.lock = make_lock(
            os.path.join(self.tmp_dir, "lock"),
            tmp_dir=self.tmp_dir,
            hardlink_lock=hardlink_lock,
            friendly=True,
        )

        self.cache = Cache(self)
        self.cloud = DataCloud(self)

        if not scm:
            # NOTE: storing state and link_state in the repository itself to
            # avoid any possible state corruption in 'shared cache dir'
            # scenario.
            self.state = State(self.cache.local)

        self.stage_cache = StageCache(self)

        self.metrics = Metrics(self)
        self.plots = Plots(self)
        self.params = Params(self)

        self._ignore()
Example #6
    def __init__(self, root_dir=None):
        from dvc.config import Config
        from dvc.state import State
        from dvc.lock import Lock
        from dvc.scm import SCM
        from dvc.cache import Cache
        from dvc.data_cloud import DataCloud
        from dvc.updater import Updater

        root_dir = self.find_root(root_dir)

        self.root_dir = os.path.abspath(os.path.realpath(root_dir))
        self.dvc_dir = os.path.join(self.root_dir, self.DVC_DIR)

        self.config = Config(self.dvc_dir)

        self.scm = SCM(self.root_dir, project=self)
        self.lock = Lock(self.dvc_dir)
        # NOTE: storing state and link_state in the repository itself to avoid
        # any possible state corruption in 'shared cache dir' scenario.
        self.state = State(self, self.config.config)

        core = self.config.config[Config.SECTION_CORE]

        logger.set_level(core.get(Config.SECTION_CORE_LOGLEVEL))

        self.cache = Cache(self)
        self.cloud = DataCloud(self, config=self.config.config)
        self.updater = Updater(self.dvc_dir)

        self.files_to_git_add = []

        self._ignore()

        self.updater.check()
Example #7
    def __init__(self, root_dir):
        self.root_dir = os.path.abspath(os.path.realpath(root_dir))
        self.dvc_dir = os.path.join(self.root_dir, self.DVC_DIR)

        self.config = Config(self.dvc_dir)
        self.scm = SCM(self.root_dir)
        self.lock = Lock(self.dvc_dir)
        self.cache = Cache(
            self.root_dir,
            self.dvc_dir,
            cache_dir=self.config._config[Config.SECTION_CACHE].get(
                Config.SECTION_CACHE_DIR, None),
            cache_type=self.config._config[Config.SECTION_CACHE].get(
                Config.SECTION_CACHE_TYPE, None),
        )
        self.state = State(self.dvc_dir)
        self.logger = Logger(
            self.config._config[Config.SECTION_CORE].get(
                Config.SECTION_CORE_LOGLEVEL, None))
        self.cloud = DataCloud(cache=self.cache, config=self.config._config)
        self.updater = Updater(self.dvc_dir)

        self._ignore()

        self.updater.check()
Example #8
def test_windows_should_add_when_cache_on_different_drive(
        tmp_dir, dvc, temporary_windows_drive):
    dvc.config["cache"]["dir"] = temporary_windows_drive
    dvc.cache = Cache(dvc)

    (stage, ) = tmp_dir.dvc_gen({"file": "file"})
    cache_path = stage.outs[0].cache_path

    assert path_isin(cache_path, temporary_windows_drive)
    assert os.path.isfile(cache_path)
    assert filecmp.cmp("file", cache_path)
Example #9
def test_cache_type_is_properly_overridden(tmp_dir, erepo_dir):
    with erepo_dir.chdir():
        erepo_dir.dvc.config.set(Config.SECTION_CACHE,
                                 Config.SECTION_CACHE_TYPE, "symlink")
        erepo_dir.dvc.cache = Cache(erepo_dir.dvc)
        erepo_dir.scm_add([erepo_dir.dvc.config.config_file],
                          "set cache type to symlinks")
        erepo_dir.dvc_gen("file", "contents", "create file")
    assert System.is_symlink(erepo_dir / "file")

    Repo.get(fspath(erepo_dir), "file", "file_imported")

    assert not System.is_symlink("file_imported")
    assert (tmp_dir / "file_imported").read_text() == "contents"
Example #10
def test_cache_type_is_properly_overridden(tmp_dir, erepo_dir):
    with erepo_dir.chdir():
        with erepo_dir.dvc.config.edit() as conf:
            conf["cache"]["type"] = "symlink"
        erepo_dir.dvc.cache = Cache(erepo_dir.dvc)
        erepo_dir.scm_add([erepo_dir.dvc.config.files["repo"]],
                          "set cache type to symlinks")
        erepo_dir.dvc_gen("file", "contents", "create file")
    assert System.is_symlink(erepo_dir / "file")

    Repo.get(fspath(erepo_dir), "file", "file_imported")

    assert not System.is_symlink("file_imported")
    assert (tmp_dir / "file_imported").read_text() == "contents"
Example #11
    def __init__(self, root_dir=None):
        from dvc.state import State
        from dvc.lock import make_lock
        from dvc.scm import SCM
        from dvc.cache import Cache
        from dvc.data_cloud import DataCloud
        from dvc.repo.metrics import Metrics
        from dvc.scm.tree import WorkingTree
        from dvc.repo.tag import Tag
        from dvc.utils import makedirs

        root_dir = self.find_root(root_dir)

        self.root_dir = os.path.abspath(os.path.realpath(root_dir))
        self.dvc_dir = os.path.join(self.root_dir, self.DVC_DIR)

        self.config = Config(self.dvc_dir)

        self.scm = SCM(self.root_dir)

        self.tree = CleanTree(WorkingTree(self.root_dir))

        self.tmp_dir = os.path.join(self.dvc_dir, "tmp")
        makedirs(self.tmp_dir, exist_ok=True)

        hardlink_lock = self.config.config["core"].get("hardlink_lock", False)
        self.lock = make_lock(
            os.path.join(self.dvc_dir, "lock"),
            tmp_dir=os.path.join(self.dvc_dir, "tmp"),
            hardlink_lock=hardlink_lock,
            friendly=True,
        )

        # NOTE: storing state and link_state in the repository itself to avoid
        # any possible state corruption in 'shared cache dir' scenario.
        self.state = State(self, self.config.config)

        core = self.config.config[Config.SECTION_CORE]

        level = core.get(Config.SECTION_CORE_LOGLEVEL)
        if level:
            logger.setLevel(level.upper())

        self.cache = Cache(self)
        self.cloud = DataCloud(self)

        self.metrics = Metrics(self)
        self.tag = Tag(self)

        self._ignore()
Example #12
    def __init__(self, root_dir=None):
        from dvc.state import State
        from dvc.lock import make_lock
        from dvc.scm import SCM
        from dvc.cache import Cache
        from dvc.data_cloud import DataCloud
        from dvc.repo.metrics import Metrics
        from dvc.repo.params import Params
        from dvc.scm.tree import WorkingTree
        from dvc.utils.fs import makedirs
        from dvc.stage.cache import StageCache

        root_dir = self.find_root(root_dir)

        self.root_dir = os.path.abspath(os.path.realpath(root_dir))
        self.dvc_dir = os.path.join(self.root_dir, self.DVC_DIR)

        self.config = Config(self.dvc_dir)

        no_scm = self.config["core"].get("no_scm", False)
        self.scm = SCM(self.root_dir, no_scm=no_scm)

        self.tree = WorkingTree(self.root_dir)

        self.tmp_dir = os.path.join(self.dvc_dir, "tmp")
        self.index_dir = os.path.join(self.tmp_dir, "index")
        makedirs(self.index_dir, exist_ok=True)

        hardlink_lock = self.config["core"].get("hardlink_lock", False)
        self.lock = make_lock(
            os.path.join(self.tmp_dir, "lock"),
            tmp_dir=self.tmp_dir,
            hardlink_lock=hardlink_lock,
            friendly=True,
        )

        # NOTE: storing state and link_state in the repository itself to avoid
        # any possible state corruption in 'shared cache dir' scenario.
        self.state = State(self)

        self.cache = Cache(self)
        self.cloud = DataCloud(self)

        self.stage_cache = StageCache(self.cache.local.cache_dir)

        self.metrics = Metrics(self)
        self.params = Params(self)

        self._ignore()
Example #13
def test_shared_stage_cache(tmp_dir, dvc, run_copy):
    import stat

    from dvc.cache import Cache

    tmp_dir.gen("foo", "foo")

    with dvc.config.edit() as config:
        config["cache"]["shared"] = "group"

    dvc.cache = Cache(dvc)

    assert not os.path.exists(dvc.cache.local.cache_dir)

    run_copy("foo", "bar", name="copy-foo-bar")

    parent_cache_dir = os.path.join(
        dvc.stage_cache.cache_dir,
        "88",
    )
    cache_dir = os.path.join(
        parent_cache_dir,
        "883395068439203a9de3d1e1649a16e9027bfd1ab5dab4f438d321c4a928b328",
    )
    cache_file = os.path.join(
        cache_dir,
        "e42b7ebb9bc5ac4bccab769c8d1338914dad25d7ffecc8671dbd4581bad4aa15",
    )

    # sanity check
    assert os.path.isdir(cache_dir)
    assert os.listdir(cache_dir) == [os.path.basename(cache_file)]
    assert os.path.isfile(cache_file)

    def _mode(path):
        return stat.S_IMODE(os.stat(path).st_mode)

    if os.name == "nt":
        dir_mode = 0o777
        file_mode = 0o666
    else:
        dir_mode = 0o2775
        file_mode = 0o664

    assert _mode(dvc.cache.local.cache_dir) == dir_mode
    assert _mode(dvc.stage_cache.cache_dir) == dir_mode
    assert _mode(parent_cache_dir) == dir_mode
    assert _mode(cache_dir) == dir_mode
    assert _mode(cache_file) == file_mode
Example #14
def test_destroy(tmp_dir, dvc, run_copy):
    from dvc.dvcfile import PIPELINE_FILE, PIPELINE_LOCK

    dvc.config["cache"]["type"] = ["symlink"]
    dvc.cache = Cache(dvc)

    tmp_dir.dvc_gen("file", "text")
    tmp_dir.dvc_gen({"dir": {"file": "lorem", "subdir/file": "ipsum"}})

    run_copy("file", "file2", single_stage=True)
    run_copy("file2", "file3", name="copy-file2-file3")
    run_copy("file3", "file4", name="copy-file3-file4")

    dvc.destroy()

    # Remove all the files related to DVC
    for path in [
            ".dvc",
            "file.dvc",
            "file2.dvc",
            "dir.dvc",
            PIPELINE_FILE,
            PIPELINE_LOCK,
    ]:
        assert not (tmp_dir / path).exists()

    # Leave the rest of the files
    for path in [
            "file",
            "file2",
            "file3",
            "file4",
            "dir/file",
            "dir/subdir/file",
    ]:
        assert (tmp_dir / path).is_file()

    # Make sure that data was unprotected after `destroy`
    for path in [
            "file",
            "file2",
            "file3",
            "file4",
            "dir",
            "dir/file",
            "dir/subdir",
            "dir/subdir/file",
    ]:
        assert not System.is_symlink(tmp_dir / path)
Example #15
    def test_find_cache(self):
        fname1 = os.path.basename(self.cache1)
        fname1_md5 = self.cache1_md5
        fname2 = os.path.basename(self.cache2)
        fname2_md5 = self.cache2_md5
        fname3 = 'non_existing'

        System.hardlink(self.cache1, fname1)
        System.hardlink(self.cache2, fname2)

        cache = Cache(self.dvc.dvc_dir).find_cache([fname1, fname2, fname3])

        expected = {fname1: fname1_md5, fname2: fname2_md5}

        self.assertEqual(len(cache), 2)
        self.assertEqual(cache, expected)
Example #16
def test_cache_type_is_properly_overridden(tmp_dir, scm, dvc, erepo_dir):
    with erepo_dir.chdir():
        erepo_dir.dvc.config.set(Config.SECTION_CACHE,
                                 Config.SECTION_CACHE_TYPE, "symlink")
        erepo_dir.dvc.cache = Cache(erepo_dir.dvc)
        erepo_dir.scm_add(
            [erepo_dir.dvc.config.config_file],
            "set source repo cache type to symlink",
        )
        erepo_dir.dvc_gen("foo", "foo content", "create foo")
    assert System.is_symlink(erepo_dir / "foo")

    dvc.imp(fspath(erepo_dir), "foo", "foo_imported")

    assert not System.is_symlink("foo_imported")
    assert (tmp_dir / "foo_imported").read_text() == "foo content"
    assert scm.repo.git.check_ignore("foo_imported")
Example #17
def test_cache_type_is_properly_overridden(tmp_dir, scm, dvc, erepo_dir):
    with erepo_dir.chdir():
        with erepo_dir.dvc.config.edit() as conf:
            conf["cache"]["type"] = "symlink"
        erepo_dir.dvc.cache = Cache(erepo_dir.dvc)
        erepo_dir.scm_add(
            [erepo_dir.dvc.config.files["repo"]],
            "set source repo cache type to symlink",
        )
        erepo_dir.dvc_gen("foo", "foo content", "create foo")
    assert System.is_symlink(erepo_dir / "foo")

    dvc.imp(os.fspath(erepo_dir), "foo", "foo_imported")

    assert not System.is_symlink("foo_imported")
    assert (tmp_dir / "foo_imported").read_text() == "foo content"
    assert scm.is_ignored("foo_imported")
Example #18
    def __init__(self, root_dir=None):
        from dvc.config import Config
        from dvc.state import State
        from dvc.lock import Lock
        from dvc.scm import SCM
        from dvc.cache import Cache
        from dvc.data_cloud import DataCloud
        from dvc.updater import Updater
        from dvc.repo.metrics import Metrics
        from dvc.scm.tree import WorkingTree
        from dvc.repo.tag import Tag
        from dvc.repo.pkg import Pkg

        root_dir = self.find_root(root_dir)

        self.root_dir = os.path.abspath(os.path.realpath(root_dir))
        self.dvc_dir = os.path.join(self.root_dir, self.DVC_DIR)

        self.config = Config(self.dvc_dir)

        self.tree = WorkingTree(self.root_dir)

        self.scm = SCM(self.root_dir, repo=self)
        self.lock = Lock(self.dvc_dir)
        # NOTE: storing state and link_state in the repository itself to avoid
        # any possible state corruption in 'shared cache dir' scenario.
        self.state = State(self, self.config.config)

        core = self.config.config[Config.SECTION_CORE]

        level = core.get(Config.SECTION_CORE_LOGLEVEL)
        if level:
            logger.setLevel(level.upper())

        self.cache = Cache(self)
        self.cloud = DataCloud(self, config=self.config.config)
        self.updater = Updater(self.dvc_dir)

        self.metrics = Metrics(self)
        self.tag = Tag(self)
        self.pkg = Pkg(self)

        self._ignore()

        self.updater.check()
Example #19
def test_shared_cache(tmp_dir, dvc, group, dir_mode):
    if group:
        with dvc.config.edit() as conf:
            conf["cache"].update({"shared": "group"})
    dvc.cache = Cache(dvc)

    tmp_dir.dvc_gen(
        {"file": "file content", "dir": {"file2": "file 2 content"}}
    )

    for root, dnames, fnames in os.walk(dvc.cache.local.cache_dir):
        for dname in dnames:
            path = os.path.join(root, dname)
            assert stat.S_IMODE(os.stat(path).st_mode) == dir_mode

        for fname in fnames:
            path = os.path.join(root, fname)
            assert stat.S_IMODE(os.stat(path).st_mode) == 0o444
Example #20
def test_destroy(tmp_dir, dvc):
    dvc.config["cache"]["type"] = ["symlink"]
    dvc.cache = Cache(dvc)

    tmp_dir.dvc_gen("file", "text")
    tmp_dir.dvc_gen({"dir": {"file": "lorem", "subdir/file": "ipsum"}})

    dvc.destroy()

    # Remove all the files related to DVC
    for path in [".dvc", "file.dvc", "dir.dvc"]:
        assert not (tmp_dir / path).exists()

    # Leave the rest of the files
    for path in ["file", "dir/file", "dir/subdir/file"]:
        assert (tmp_dir / path).is_file()

    # Make sure that data was unprotected after `destroy`
    for path in ["file", "dir", "dir/file", "dir/subdir", "dir/subdir/file"]:
        assert not System.is_symlink(fspath(tmp_dir / path))
Example #21
def workspace(tmp_dir, dvc, request):
    from dvc.cache import Cache

    cloud = request.param

    assert cloud

    tmp_dir.add_remote(name="workspace", config=cloud.config, default=False)
    tmp_dir.add_remote(name="cache",
                       url="remote://workspace/cache",
                       default=False)

    scheme = getattr(cloud, "scheme", "local")
    if scheme != "http":
        with dvc.config.edit() as conf:
            conf["cache"][scheme] = "cache"

        dvc.cache = Cache(dvc)

    return cloud
Example #22
    def __init__(self, root_dir):
        self.root_dir = os.path.abspath(os.path.realpath(root_dir))
        self.dvc_dir = os.path.join(self.root_dir, self.DVC_DIR)

        self.config = Config(self.dvc_dir)
        self.scm = SCM(self.root_dir)
        self.lock = Lock(self.dvc_dir)
        # NOTE: storing state and link_state in the repository itself to avoid
        # any possible state corruption in 'shared cache dir' scenario.
        self.state = State(self)
        self.link_state = LinkState(self)
        self.logger = Logger(self.config._config[Config.SECTION_CORE].get(
            Config.SECTION_CORE_LOGLEVEL, None))
        self.cache = Cache(self)
        self.cloud = DataCloud(self, config=self.config._config)
        self.updater = Updater(self.dvc_dir)

        self._ignore()

        self.updater.check()
Example #23
def test_shared_cache(tmp_dir, dvc, group):
    from dvc.utils.fs import umask

    if group:
        with dvc.config.edit() as conf:
            conf["cache"].update({"shared": "group"})
    dvc.cache = Cache(dvc)
    cache_dir = dvc.cache.local.cache_dir

    assert not os.path.exists(cache_dir)

    tmp_dir.dvc_gen(
        {"file": "file content", "dir": {"file2": "file 2 content"}}
    )

    actual = {}
    for root, dnames, fnames in os.walk(cache_dir):
        for name in dnames + fnames:
            path = os.path.join(root, name)
            actual[path] = oct(stat.S_IMODE(os.stat(path).st_mode))

    file_mode = oct(0o444)
    dir_mode = oct(0o2775 if group else (0o777 & ~umask))

    expected = {
        os.path.join(cache_dir, "17"): dir_mode,
        os.path.join(cache_dir, "17", "4eaa1dd94050255b7b98a7e1924b31.dir"):
        file_mode,
        os.path.join(cache_dir, "97"): dir_mode,
        os.path.join(cache_dir, "97", "e17781c198500e2766ea56bd697c03"):
        file_mode,
        os.path.join(cache_dir, "d1"): dir_mode,
        os.path.join(cache_dir, "d1", "0b4c3ff123b26dc068d43a8bef2d23"):
        file_mode,
    }

    assert expected == actual
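
The expected paths above follow the local cache layout assumed throughout these examples: the md5 of the content is split into a two-character directory prefix plus the remaining hex digits, and directory objects carry a .dir suffix. A small stdlib-only illustration of that layout rule (the helper name and the printed path are assumptions for this sketch, not part of any DVC API):

import hashlib
import os


def local_cache_path(cache_dir, content, is_dir=False):
    # <cache_dir>/<first 2 hex chars of md5>/<remaining 30 chars>[.dir]
    md5 = hashlib.md5(content).hexdigest()
    suffix = ".dir" if is_dir else ""
    return os.path.join(cache_dir, md5[:2], md5[2:] + suffix)


print(local_cache_path("/tmp/cache", b"file content"))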
Example #24
def test_shared_cache(tmp_dir, dvc, protected, dir_mode, file_mode):
    dvc.config.set("cache", "shared", "group")
    dvc.config.set("cache", "protected", str(protected))
    dvc.cache = Cache(dvc)

    tmp_dir.dvc_gen(
        {"file": "file content", "dir": {"file2": "file 2 content"}}
    )

    for root, dnames, fnames in os.walk(dvc.cache.local.cache_dir):
        for dname in dnames:
            path = os.path.join(root, dname)
            assert stat.S_IMODE(os.stat(path).st_mode) == dir_mode

        for fname in fnames:
            path = os.path.join(root, fname)
            assert stat.S_IMODE(os.stat(path).st_mode) == file_mode
Example #25
    def __init__(
        self,
        root_dir=None,
        scm=None,
        rev=None,
        subrepos=False,
        uninitialized=False,
    ):
        from dvc.cache import Cache
        from dvc.data_cloud import DataCloud
        from dvc.lock import LockNoop, make_lock
        from dvc.repo.experiments import Experiments
        from dvc.repo.metrics import Metrics
        from dvc.repo.params import Params
        from dvc.repo.plots import Plots
        from dvc.scm import SCM
        from dvc.stage.cache import StageCache
        from dvc.state import State, StateNoop
        from dvc.tree.local import LocalTree
        from dvc.utils.fs import makedirs

        try:
            tree = scm.get_tree(rev) if rev else None
            self.root_dir = self.find_root(root_dir, tree)
            self.dvc_dir = os.path.join(self.root_dir, self.DVC_DIR)
            self.tmp_dir = os.path.join(self.dvc_dir, "tmp")
            makedirs(self.tmp_dir, exist_ok=True)
        except NotDvcRepoError:
            if not uninitialized:
                raise
            self.root_dir = SCM(root_dir or os.curdir).root_dir
            self.dvc_dir = None
            self.tmp_dir = None

        tree_kwargs = dict(use_dvcignore=True, dvcignore_root=self.root_dir)
        if scm:
            self.tree = scm.get_tree(rev, **tree_kwargs)
        else:
            self.tree = LocalTree(self, {"url": self.root_dir}, **tree_kwargs)

        self.config = Config(self.dvc_dir, tree=self.tree)
        self._scm = scm

        # used by RepoTree to determine if it should traverse subrepos
        self.subrepos = subrepos

        self.cache = Cache(self)
        self.cloud = DataCloud(self)

        if scm or not self.dvc_dir:
            self.lock = LockNoop()
            self.state = StateNoop()
        else:
            self.lock = make_lock(
                os.path.join(self.tmp_dir, "lock"),
                tmp_dir=self.tmp_dir,
                hardlink_lock=self.config["core"].get("hardlink_lock", False),
                friendly=True,
            )

            # NOTE: storing state and link_state in the repository itself to
            # avoid any possible state corruption in 'shared cache dir'
            # scenario.
            self.state = State(self)
            self.stage_cache = StageCache(self)

            try:
                self.experiments = Experiments(self)
            except NotImplementedError:
                self.experiments = None

            self._ignore()

        self.metrics = Metrics(self)
        self.plots = Plots(self)
        self.params = Params(self)
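
In this constructor, uninitialized=True allows the object to be built outside a DVC repository: NotDvcRepoError is swallowed, dvc_dir and tmp_dir stay None, and the lock and state fall back to the no-op implementations. A hedged usage sketch, assuming a DVC version whose Repo signature matches the example above:

from dvc.repo import Repo

# open a git repository that has no .dvc/ directory; because uninitialized=True
# the NotDvcRepoError is caught, dvc_dir stays None, and LockNoop/StateNoop are used
repo = Repo(root_dir=".", uninitialized=True)
print(repo.root_dir, repo.dvc_dir)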
Example #26
    def test_get(self):
        cache = Cache(self.dvc).local.get(self.cache1_md5)
        self.assertEqual(cache, self.cache1)
Example #27
    def test_all(self):
        md5_list = list(Cache(self.dvc).local.all())
        self.assertEqual(len(md5_list), 2)
        self.assertIn(self.cache1_md5, md5_list)
        self.assertIn(self.cache2_md5, md5_list)
Example #28
    def __init__(
        self,
        root_dir=None,
        scm=None,
        rev=None,
        subrepos=False,
        uninitialized=False,
    ):
        from dvc.cache import Cache
        from dvc.data_cloud import DataCloud
        from dvc.lock import LockNoop, make_lock
        from dvc.repo.metrics import Metrics
        from dvc.repo.params import Params
        from dvc.repo.plots import Plots
        from dvc.repo.stage import StageLoad
        from dvc.stage.cache import StageCache
        from dvc.state import State, StateNoop
        from dvc.tree.local import LocalTree

        self.root_dir, self.dvc_dir, self.tmp_dir = self._get_repo_dirs(
            root_dir=root_dir, scm=scm, rev=rev, uninitialized=uninitialized
        )

        tree_kwargs = {"use_dvcignore": True, "dvcignore_root": self.root_dir}
        if scm:
            self.tree = scm.get_tree(rev, **tree_kwargs)
        else:
            self.tree = LocalTree(self, {"url": self.root_dir}, **tree_kwargs)

        self.config = Config(self.dvc_dir, tree=self.tree)
        self._scm = scm

        # used by RepoTree to determine if it should traverse subrepos
        self.subrepos = subrepos

        self.cache = Cache(self)
        self.cloud = DataCloud(self)
        self.stage = StageLoad(self)

        if scm or not self.dvc_dir:
            self.lock = LockNoop()
            self.state = StateNoop()
        else:
            self.lock = make_lock(
                os.path.join(self.tmp_dir, "lock"),
                tmp_dir=self.tmp_dir,
                hardlink_lock=self.config["core"].get("hardlink_lock", False),
                friendly=True,
            )

            # NOTE: storing state and link_state in the repository itself to
            # avoid any possible state corruption in 'shared cache dir'
            # scenario.
            self.state = State(self)
            self.stage_cache = StageCache(self)

            self._ignore()

        self.metrics = Metrics(self)
        self.plots = Plots(self)
        self.params = Params(self)
        self.stage_collection_error_handler = None
        self._lock_depth = 0
Example #29
class Project(object):
    DVC_DIR = '.dvc'

    def __init__(self, root_dir):
        self.root_dir = os.path.abspath(os.path.realpath(root_dir))
        self.dvc_dir = os.path.join(self.root_dir, self.DVC_DIR)

        self.scm = SCM(self.root_dir)
        self.lock = Lock(self.dvc_dir)
        self.cache = Cache(self.dvc_dir)
        self.state = State(self.root_dir, self.dvc_dir)
        self.config = Config(self.dvc_dir)
        self.logger = Logger(self.config._config[Config.SECTION_CORE].get(
            Config.SECTION_CORE_LOGLEVEL, None))
        self.cloud = DataCloud(cache=self.cache,
                               state=self.state,
                               config=self.config._config)

    @staticmethod
    def init(root_dir=os.curdir):
        """
        Initialize a DVC project in the given directory.

        Args:
            root_dir: Path to project's root directory.

        Returns:
            Project instance.

        Raises:
            KeyError: Raises an exception.
        """
        root_dir = os.path.abspath(root_dir)
        dvc_dir = os.path.join(root_dir, Project.DVC_DIR)
        os.mkdir(dvc_dir)

        config = Config.init(dvc_dir)
        cache = Cache.init(dvc_dir)
        state = State.init(root_dir, dvc_dir)
        lock = Lock(dvc_dir)

        scm = SCM(root_dir)
        scm.ignore_list([cache.cache_dir, state.state_file, lock.lock_file])

        ignore_file = os.path.join(dvc_dir, scm.ignore_file())
        scm.add([config.config_file, ignore_file])

        return Project(root_dir)

    def to_dvc_path(self, path):
        return os.path.relpath(path, self.root_dir)

    def add(self, fname):
        out = os.path.basename(fname)
        stage_fname = out + Stage.STAGE_FILE_SUFFIX
        cwd = os.path.dirname(os.path.abspath(fname))
        stage = Stage.loads(project=self,
                            cmd=None,
                            deps=[],
                            outs=[out],
                            fname=stage_fname,
                            cwd=cwd)

        stage.save()
        stage.dump()
        return stage

    def remove(self, target):
        if not Stage.is_stage_file(target):
            raise StageNotFoundError(target)

        stage = Stage.load(self, target)
        for out in stage.outs:
            out.remove()

        return stage

    def run(self,
            cmd=None,
            deps=[],
            outs=[],
            outs_no_cache=[],
            fname=Stage.STAGE_FILE,
            cwd=os.curdir,
            no_exec=False):
        stage = Stage.loads(project=self,
                            fname=fname,
                            cmd=cmd,
                            cwd=cwd,
                            outs=outs,
                            outs_no_cache=outs_no_cache,
                            deps=deps)
        if not no_exec:
            stage.run()
        stage.dump()
        return stage

    def _reproduce_stage(self, stages, node, force):
        stage = stages[node].reproduce(force=force)
        if not stage:
            return []
        stage.dump()
        return [stage]

    def reproduce(self, target, recursive=True, force=False):
        stages = nx.get_node_attributes(self.graph(), 'stage')
        node = os.path.relpath(os.path.abspath(target), self.root_dir)
        if node not in stages:
            raise StageNotFoundError(target)

        if recursive:
            return self._reproduce_stages(stages, node, force)

        return self._reproduce_stage(stages, node, force)

    def _reproduce_stages(self, stages, node, force):
        result = []
        for n in nx.dfs_postorder_nodes(self.graph(), node):
            try:
                result += self._reproduce_stage(stages, n, force)
            except Exception as ex:
                raise ReproductionError(stages[n].relpath, ex)
        return result

    def _remove_untracked_hardlinks(self):
        untracked = self.scm.untracked_files()
        cache = dict((System.inode(c), c) for c in self.cache.all())
        for file in untracked:
            inode = System.inode(file)
            if inode not in cache.keys():
                continue

            Logger.info(u'Remove \'{}\''.format(file))
            os.remove(file)

            dir = os.path.dirname(file)
            if len(dir) != 0 and not os.listdir(dir):
                Logger.info(u'Remove empty directory \'{}\''.format(dir))
                os.removedirs(dir)

    def checkout(self):
        self._remove_untracked_hardlinks()
        for stage in self.stages():
            stage.checkout()

    def _used_cache(self, target=None):
        cache_set = set()

        if target:
            stages = [Stage.load(self, target)]
        else:
            stages = self.stages()

        for stage in stages:
            for out in stage.outs:
                if not out.use_cache or not out.cache:
                    continue
                cache_set |= set([out.cache])
                if out.is_dir_cache(out.cache) and os.path.isfile(out.cache):
                    dir_cache = out.dir_cache()
                    cache_set |= set(dir_cache.values())

        return list(cache_set)

    def gc(self):
        clist = self._used_cache()
        for cache in self.cache.all():
            if cache in clist:
                continue
            os.unlink(cache)
            self.logger.info(u'\'{}\' was removed'.format(
                self.to_dvc_path(cache)))

    def push(self, target=None, jobs=1, remote=None):
        return self.cloud.push(self._used_cache(target), jobs, remote=remote)

    def fetch(self, target=None, jobs=1, remote=None):
        return self.cloud.pull(self._used_cache(target), jobs, remote=remote)

    def pull(self, target=None, jobs=1, remote=None):
        ret = self.fetch(target, jobs, remote=remote)
        self.checkout()
        return ret

    def _local_status(self, target=None):
        status = {}

        if target:
            stages = [Stage.load(self, target)]
        else:
            stages = self.stages()

        for stage in stages:
            status.update(stage.status())

        return status

    def _cloud_status(self, target=None, jobs=1, remote=None):
        status = {}
        for target, ret in self.cloud.status(self._used_cache(target),
                                             jobs,
                                             remote=remote):
            if ret == cloud.STATUS_UNKNOWN or ret == cloud.STATUS_OK:
                continue

            prefix_map = {
                cloud.STATUS_DELETED: 'deleted',
                cloud.STATUS_MODIFIED: 'modified',
                cloud.STATUS_NEW: 'new',
            }

            path = os.path.relpath(target, self.cache.cache_dir)

            status[path] = prefix_map[ret]

        return status

    def status(self, target=None, jobs=1, cloud=False, remote=None):
        if cloud:
            return self._cloud_status(target, jobs, remote=remote)
        return self._local_status(target)

    def graph(self):
        G = nx.DiGraph()

        for stage in self.stages():
            node = os.path.relpath(stage.path, self.root_dir)
            G.add_node(node, stage=stage)
            for dep in stage.deps:
                dep_stage = dep.stage()
                if not dep_stage:
                    continue
                dep_node = os.path.relpath(dep_stage.path, self.root_dir)
                G.add_node(dep_node, stage=dep_stage)
                G.add_edge(node, dep_node)

        return G

    def stages(self):
        stages = []
        for root, dirs, files in os.walk(self.root_dir):
            for fname in files:
                path = os.path.join(root, fname)
                if not Stage.is_stage_file(path):
                    continue
                stages.append(Stage.load(self, path))
        return stages

    def outs(self):
        outs = []
        for stage in self.stages():
            outs += stage.outs
        return outs
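
A short sketch of how this early Project API might be driven end to end, using only the methods defined above (the data file name is illustrative, and the 0.x-era dvc modules are assumed to be importable):

import os

# initialize a project in the current directory, then track and restore a file
project = Project.init(os.curdir)   # creates .dvc/, config, cache, state, lock
stage = project.add("data.csv")     # writes data.csv.dvc and caches the output
project.checkout()                  # re-create workspace links from the cache
print(project.status())             # per _local_status above, empty when clean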
Example #30
    def __init__(self, root_dir=None, scm=None, rev=None):
        from dvc.state import State, StateNoop
        from dvc.lock import make_lock
        from dvc.scm import SCM
        from dvc.cache import Cache
        from dvc.data_cloud import DataCloud
        from dvc.repo.experiments import Experiments
        from dvc.repo.metrics import Metrics
        from dvc.repo.plots import Plots
        from dvc.repo.params import Params
        from dvc.tree.local import LocalRemoteTree
        from dvc.utils.fs import makedirs
        from dvc.stage.cache import StageCache

        if scm:
            tree = scm.get_tree(rev)
            self.root_dir = self.find_root(root_dir, tree)
            self.scm = scm
            self.tree = scm.get_tree(rev,
                                     use_dvcignore=True,
                                     dvcignore_root=self.root_dir)
            self.state = StateNoop()
        else:
            root_dir = self.find_root(root_dir)
            self.root_dir = os.path.abspath(os.path.realpath(root_dir))
            self.tree = LocalRemoteTree(
                self,
                {"url": self.root_dir},
                use_dvcignore=True,
                dvcignore_root=self.root_dir,
            )

        self.dvc_dir = os.path.join(self.root_dir, self.DVC_DIR)
        self.config = Config(self.dvc_dir, tree=self.tree)

        if not scm:
            no_scm = self.config["core"].get("no_scm", False)
            self.scm = SCM(self.root_dir, no_scm=no_scm)

        self.tmp_dir = os.path.join(self.dvc_dir, "tmp")
        self.index_dir = os.path.join(self.tmp_dir, "index")
        makedirs(self.index_dir, exist_ok=True)

        hardlink_lock = self.config["core"].get("hardlink_lock", False)
        self.lock = make_lock(
            os.path.join(self.tmp_dir, "lock"),
            tmp_dir=self.tmp_dir,
            hardlink_lock=hardlink_lock,
            friendly=True,
        )

        self.cache = Cache(self)
        self.cloud = DataCloud(self)

        if not scm:
            # NOTE: storing state and link_state in the repository itself to
            # avoid any possible state corruption in 'shared cache dir'
            # scenario.
            self.state = State(self.cache.local)

        self.stage_cache = StageCache(self)

        self.metrics = Metrics(self)
        self.plots = Plots(self)
        self.params = Params(self)

        try:
            self.experiments = Experiments(self)
        except NotImplementedError:
            self.experiments = None

        self._ignore()