def test_cache(tmp_dir, dvc):
    cache1_md5 = "123"
    cache2_md5 = "234"
    cache1 = os.path.join(
        dvc.odb.local.cache_dir,
        cache1_md5[0:2],
        cache1_md5[2:],
    )
    cache2 = os.path.join(
        dvc.odb.local.cache_dir,
        cache2_md5[0:2],
        cache2_md5[2:],
    )
    tmp_dir.gen({cache1: "1", cache2: "2"})

    assert os.path.exists(cache1)
    assert os.path.exists(cache2)

    odb = ODBManager(dvc)

    md5_list = list(odb.local.all())
    assert len(md5_list) == 2
    assert cache1_md5 in md5_list
    assert cache2_md5 in md5_list

    odb_cache1 = odb.local.oid_to_path(cache1_md5)
    odb_cache2 = odb.local.oid_to_path(cache2_md5)
    assert os.fspath(odb_cache1) == cache1
    assert os.fspath(odb_cache2) == cache2
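# A minimal standalone sketch (not part of the test suite) of the
# content-addressed layout that `test_cache` exercises: an oid maps to
# `<cache_dir>/<first two hex chars>/<remaining chars>`. The helper name
# `_sketch_oid_to_path` is hypothetical; DVC's own mapping is the
# `odb.local.oid_to_path` call asserted above.
def _sketch_oid_to_path(cache_dir, oid):
    # e.g. "123" -> <cache_dir>/12/3
    return os.path.join(cache_dir, oid[0:2], oid[2:])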
def test_cache_link_type(tmp_dir, scm, dvc):
    with dvc.config.edit() as conf:
        conf["cache"]["type"] = "reflink,copy"
    dvc.odb = ODBManager(dvc)

    stages = tmp_dir.dvc_gen({"foo": "foo"})
    assert len(stages) == 1
    assert (tmp_dir / "foo").read_text().strip() == "foo"
def test_remote_cache_references(tmp_dir, dvc):
    with dvc.config.edit() as conf:
        conf["remote"]["storage"] = {"url": "ssh://user@localhost:23"}
        conf["remote"]["cache"] = {"url": "remote://storage/tmp"}
        conf["cache"]["ssh"] = "cache"

    dvc.odb = ODBManager(dvc)

    assert dvc.odb.ssh.path == "/tmp"
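# Note on the assertion above (an inference from the config values, not a
# quote of DVC's resolver): "remote://storage/tmp" nests the path "tmp"
# under the "storage" remote ("ssh://user@localhost:23"), so the SSH cache
# resolves to the absolute path "/tmp" on that host.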
def test_windows_should_add_when_cache_on_different_drive(
    tmp_dir, dvc, temporary_windows_drive
):
    dvc.config["cache"]["dir"] = temporary_windows_drive
    dvc.odb = ODBManager(dvc)

    (stage,) = tmp_dir.dvc_gen({"file": "file"})
    cache_path = stage.outs[0].cache_path

    assert path_isin(cache_path, temporary_windows_drive)
    assert os.path.isfile(cache_path)
    # assert the comparison result, otherwise the check is a no-op
    assert filecmp.cmp("file", cache_path)
def test_cache_type_is_properly_overridden(tmp_dir, erepo_dir):
    with erepo_dir.chdir():
        with erepo_dir.dvc.config.edit() as conf:
            conf["cache"]["type"] = "symlink"
        erepo_dir.dvc.odb = ODBManager(erepo_dir.dvc)
        erepo_dir.scm_add(
            [erepo_dir.dvc.config.files["repo"]],
            "set cache type to symlinks",
        )
        erepo_dir.dvc_gen("file", "contents", "create file")
    assert system.is_symlink(erepo_dir / "file")

    Repo.get(os.fspath(erepo_dir), "file", "file_imported")

    assert not system.is_symlink("file_imported")
    assert (tmp_dir / "file_imported").read_text() == "contents"
def test_destroy(tmp_dir, dvc, run_copy):
    dvc.config["cache"]["type"] = ["symlink"]
    dvc.odb = ODBManager(dvc)

    tmp_dir.dvc_gen("file", "text")
    tmp_dir.dvc_gen({"dir": {"file": "lorem", "subdir/file": "ipsum"}})

    run_copy("file", "file2", single_stage=True)
    run_copy("file2", "file3", name="copy-file2-file3")
    run_copy("file3", "file4", name="copy-file3-file4")

    dvc.destroy()

    # Remove all the files related to DVC
    for path in [
        ".dvc",
        ".dvcignore",
        "file.dvc",
        "file2.dvc",
        "dir.dvc",
        PIPELINE_FILE,
        PIPELINE_LOCK,
    ]:
        assert not (tmp_dir / path).exists()

    # Leave the rest of the files
    for path in [
        "file",
        "file2",
        "file3",
        "file4",
        "dir/file",
        "dir/subdir/file",
    ]:
        assert (tmp_dir / path).is_file()

    # Make sure that data was unprotected after `destroy`
    for path in [
        "file",
        "file2",
        "file3",
        "file4",
        "dir",
        "dir/file",
        "dir/subdir",
        "dir/subdir/file",
    ]:
        assert not system.is_symlink(tmp_dir / path)
def test_cache_type_is_properly_overridden(tmp_dir, scm, dvc, erepo_dir):
    with erepo_dir.chdir():
        with erepo_dir.dvc.config.edit() as conf:
            conf["cache"]["type"] = "symlink"
        erepo_dir.dvc.odb = ODBManager(erepo_dir.dvc)
        erepo_dir.scm_add(
            [erepo_dir.dvc.config.files["repo"]],
            "set source repo cache type to symlink",
        )
        erepo_dir.dvc_gen("foo", "foo content", "create foo")
    assert system.is_symlink(erepo_dir / "foo")

    dvc.imp(os.fspath(erepo_dir), "foo", "foo_imported")

    assert not system.is_symlink("foo_imported")
    assert (tmp_dir / "foo_imported").read_text() == "foo content"
    assert scm.is_ignored("foo_imported")
def test_shared_stage_cache(tmp_dir, dvc, run_copy):
    import stat

    from dvc.odbmgr import ODBManager

    tmp_dir.gen("foo", "foo")

    with dvc.config.edit() as config:
        config["cache"]["shared"] = "group"

    dvc.odb = ODBManager(dvc)

    assert not os.path.exists(dvc.odb.local.cache_dir)

    run_copy("foo", "bar", name="copy-foo-bar")

    parent_cache_dir = os.path.join(dvc.stage_cache.cache_dir, "88")
    cache_dir = os.path.join(
        parent_cache_dir,
        "883395068439203a9de3d1e1649a16e9027bfd1ab5dab4f438d321c4a928b328",
    )
    cache_file = os.path.join(
        cache_dir,
        "e42b7ebb9bc5ac4bccab769c8d1338914dad25d7ffecc8671dbd4581bad4aa15",
    )

    # sanity check
    assert os.path.isdir(cache_dir)
    assert os.listdir(cache_dir) == [os.path.basename(cache_file)]
    assert os.path.isfile(cache_file)

    def _mode(path):
        return stat.S_IMODE(os.stat(path).st_mode)

    if os.name == "nt":
        dir_mode = 0o777
        file_mode = 0o666
    else:
        dir_mode = 0o2775
        file_mode = 0o664

    assert _mode(dvc.odb.local.cache_dir) == dir_mode
    assert _mode(dvc.stage_cache.cache_dir) == dir_mode
    assert _mode(parent_cache_dir) == dir_mode
    assert _mode(cache_dir) == dir_mode
    assert _mode(cache_file) == file_mode
def _make_workspace(name, typ="local"): from dvc.odbmgr import ODBManager cloud = make_cloud(typ) # pylint: disable=W0621 tmp_dir.add_remote(name=name, config=cloud.config, default=False) tmp_dir.add_remote(name=f"{name}-cache", url="remote://workspace/cache", default=False) scheme = getattr(cloud, "scheme", "local") if scheme != "http": with dvc.config.edit() as conf: conf["cache"][scheme] = f"{name}-cache" dvc.odb = ODBManager(dvc) return cloud
def test_shared_cache(tmp_dir, dvc, group):
    from dvc.fs import system

    if group:
        with dvc.config.edit() as conf:
            conf["cache"].update({"shared": "group"})
    dvc.odb = ODBManager(dvc)
    cache_dir = dvc.odb.local.cache_dir

    assert not os.path.exists(cache_dir)

    tmp_dir.dvc_gen(
        {"file": "file content", "dir": {"file2": "file 2 content"}}
    )

    actual = {}
    for root, dnames, fnames in os.walk(cache_dir):
        for name in dnames + fnames:
            path = os.path.join(root, name)
            actual[path] = oct(stat.S_IMODE(os.stat(path).st_mode))

    file_mode = oct(0o444)
    dir_mode = oct(0o2775 if group else (0o777 & ~system.umask))

    expected = {
        os.path.join(cache_dir, "17"): dir_mode,
        os.path.join(
            cache_dir, "17", "4eaa1dd94050255b7b98a7e1924b31"
        ): file_mode,
        os.path.join(
            cache_dir, "17", "4eaa1dd94050255b7b98a7e1924b31.dir"
        ): file_mode,
        os.path.join(cache_dir, "97"): dir_mode,
        os.path.join(
            cache_dir, "97", "e17781c198500e2766ea56bd697c03"
        ): file_mode,
        os.path.join(cache_dir, "d1"): dir_mode,
        os.path.join(
            cache_dir, "d1", "0b4c3ff123b26dc068d43a8bef2d23"
        ): file_mode,
    }

    assert expected == actual
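# Worked example (an assumption-labeled sketch, not test code) of the
# `dir_mode` arithmetic in `test_shared_cache`: without `shared: group`,
# directories get the default 0o777 masked by the process umask, e.g. a
# hypothetical umask of 0o022 yields 0o777 & ~0o022 == 0o755 (rwxr-xr-x).
# With `shared: group`, the fixed 0o2775 adds the setgid bit so new cache
# entries inherit the cache directory's group.
def _sketch_default_dir_mode(umask=0o022):
    # the test reads the real value from `dvc.fs.system.umask`
    return 0o777 & ~umask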
def test_import_dir(self, tmp_dir, dvc, workspace, stage_md5, dir_md5):
    from dvc.odbmgr import ODBManager

    workspace.gen(
        {"dir": {"file": "file", "subdir": {"subfile": "subfile"}}}
    )

    # remove external cache to make sure that we don't need it
    # to import dirs
    with dvc.config.edit() as conf:
        del conf["cache"]
    dvc.odb = ODBManager(dvc)

    assert not (tmp_dir / "dir").exists()  # sanity check
    dvc.imp_url("remote://workspace/dir")
    assert set(os.listdir(tmp_dir / "dir")) == {"file", "subdir"}
    assert (tmp_dir / "dir" / "file").read_text() == "file"
    assert list(os.listdir(tmp_dir / "dir" / "subdir")) == ["subfile"]
    assert (tmp_dir / "dir" / "subdir" / "subfile").read_text() == "subfile"

    assert dvc.status() == {}

    if stage_md5 is not None and dir_md5 is not None:
        assert (tmp_dir / "dir.dvc").read_text() == (
            f"md5: {stage_md5}\n"
            "frozen: true\n"
            "deps:\n"
            f"- md5: {dir_md5}\n"
            "  size: 11\n"
            "  nfiles: 2\n"
            "  path: remote://workspace/dir\n"
            "outs:\n"
            "- md5: b6dcab6ccd17ca0a8bf4a215a37d14cc.dir\n"
            "  size: 11\n"
            "  nfiles: 2\n"
            "  path: dir\n"
        )
def test_external_cache_dir(tmp_dir, dvc, make_tmp_dir):
    cache_dir = make_tmp_dir("cache")
    with dvc.config.edit() as conf:
        conf["cache"]["dir"] = cache_dir.fs_path

    assert not os.path.exists(dvc.odb.local.cache_dir)
    dvc.odb = ODBManager(dvc)

    tmp_dir.dvc_gen({"foo": "foo"})
    tmp_dir.dvc_gen(
        {
            "data_dir": {
                "data": "data_dir/data",
                "data_sub_dir": {
                    "data_sub": "data_dir/data_sub_dir/data_sub"
                },
            }
        }
    )

    assert not os.path.exists(".dvc/cache")
    assert len(os.listdir(cache_dir)) != 0
def test_cmd_cache_relative_path(tmp_dir, scm, dvc, make_tmp_dir):
    cache_dir = make_tmp_dir("cache")
    dname = relpath(cache_dir)
    ret = main(["cache", "dir", dname])
    assert ret == 0

    dvc.config.load()
    dvc.odb = ODBManager(dvc)

    # NOTE: we are in the repo's root and config is in .dvc/, so
    # dir path written to config should be just one level above.
    rel = os.path.join("..", dname)
    config = configobj.ConfigObj(dvc.config.files["repo"])
    assert config["cache"]["dir"] == rel.replace("\\", "/")

    tmp_dir.dvc_gen({"foo": "foo"})

    subdirs = os.listdir(cache_dir)
    assert len(subdirs) == 1
    files = os.listdir(os.path.join(cache_dir, subdirs[0]))
    assert len(files) == 1
def __init__(
    self,
    root_dir=None,
    fs=None,
    rev=None,
    subrepos=False,
    uninitialized=False,
    config=None,
    url=None,
    repo_factory=None,
    scm=None,
):
    from dvc.config import Config
    from dvc.data_cloud import DataCloud
    from dvc.fs import GitFileSystem, localfs
    from dvc.lock import LockNoop, make_lock
    from dvc.odbmgr import ODBManager
    from dvc.repo.metrics import Metrics
    from dvc.repo.params import Params
    from dvc.repo.plots import Plots
    from dvc.repo.stage import StageLoad
    from dvc.scm import SCM
    from dvc.stage.cache import StageCache
    from dvc_data.hashfile.state import State, StateNoop

    self.url = url
    self._fs_conf = {"repo_factory": repo_factory}
    self._fs = fs or localfs
    self._scm = scm

    if rev and not fs:
        self._scm = scm = SCM(root_dir or os.curdir)
        root_dir = "/"
        self._fs = GitFileSystem(scm=self._scm, rev=rev)

    self.root_dir, self.dvc_dir, self.tmp_dir = self._get_repo_dirs(
        root_dir=root_dir,
        fs=self.fs,
        uninitialized=uninitialized,
        scm=scm,
    )

    self.config = Config(self.dvc_dir, fs=self.fs, config=config)
    self._uninitialized = uninitialized

    # used by DvcFileSystem to determine if it should traverse subrepos
    self.subrepos = subrepos

    self.cloud = DataCloud(self)
    self.stage = StageLoad(self)

    if isinstance(self.fs, GitFileSystem) or not self.dvc_dir:
        self.lock = LockNoop()
        self.state = StateNoop()
        self.odb = ODBManager(self)
        self.tmp_dir = None
    else:
        from dvc.utils.fs import makedirs

        makedirs(self.tmp_dir, exist_ok=True)

        self.lock = make_lock(
            os.path.join(self.tmp_dir, "lock"),
            tmp_dir=self.tmp_dir,
            hardlink_lock=self.config["core"].get("hardlink_lock", False),
            friendly=True,
        )

        state_db_dir = self._get_database_dir("state")
        self.state = State(self.root_dir, state_db_dir, self.dvcignore)
        self.odb = ODBManager(self)

        self.stage_cache = StageCache(self)

        self._ignore()

    self.metrics = Metrics(self)
    self.plots = Plots(self)
    self.params = Params(self)
    self.stage_collection_error_handler: Optional[
        Callable[[str, Exception], None]
    ] = None
    self._lock_depth = 0
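# Hedged usage sketch connecting the tests above to `Repo.__init__`:
# `self.odb` is constructed from `self.config`, so a config edit made at
# runtime is not picked up until the manager is rebuilt, which is why the
# tests reassign `dvc.odb` after `config.edit()`. Assumes an initialized
# DVC repo in the current directory.
#
#     repo = Repo(os.curdir)
#     with repo.config.edit() as conf:
#         conf["cache"]["type"] = "symlink"
#     repo.odb = ODBManager(repo)  # pick up the new cache settings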