def __init__(
    self,
    root_dir,
    url,
    scm=None,
    rev=None,
    for_write=False,
    cache_dir=None,
    cache_types=None,
    **kwargs,
):
    self.root_dir = os.path.realpath(root_dir)
    self.scm = scm
    self.url = url
    self.for_write = for_write
    self.cache_dir = cache_dir or self._get_cache_dir()
    self.cache_types = cache_types
    self.rev = rev
    self.tree_confs = kwargs

    self.config = {"cache": {"dir": self.cache_dir}}
    self.cache = Cache(self)
    if cache_types:
        self.cache.local.cache_types = cache_types

    self.state = StateNoop()
def __init__(self, root_dir):
    from dvc.logger import Logger
    from dvc.config import Config
    from dvc.state import LinkState, State
    from dvc.lock import Lock
    from dvc.scm import SCM
    from dvc.cache import Cache
    from dvc.data_cloud import DataCloud
    from dvc.updater import Updater
    from dvc.prompt import Prompt

    self.root_dir = os.path.abspath(os.path.realpath(root_dir))
    self.dvc_dir = os.path.join(self.root_dir, self.DVC_DIR)

    self.config = Config(self.dvc_dir)

    self.scm = SCM(self.root_dir)
    self.lock = Lock(self.dvc_dir)
    # NOTE: storing state and link_state in the repository itself to avoid
    # any possible state corruption in 'shared cache dir' scenario.
    self.state = State(self)
    self.link_state = LinkState(self)

    core = self.config._config[Config.SECTION_CORE]
    self.logger = Logger(core.get(Config.SECTION_CORE_LOGLEVEL, None))

    self.cache = Cache(self)
    self.cloud = DataCloud(self, config=self.config._config)
    self.updater = Updater(self.dvc_dir)
    self.prompt = Prompt()

    self._ignore()

    self.updater.check()
def init(root_dir=os.curdir):
    """
    Initialize a DVC project in the given directory.

    Args:
        root_dir: Path to the project's root directory.

    Returns:
        Project instance.

    Raises:
        KeyError: Raises an exception.
    """
    root_dir = os.path.abspath(root_dir)
    dvc_dir = os.path.join(root_dir, Project.DVC_DIR)
    os.mkdir(dvc_dir)

    config = Config.init(dvc_dir)
    cache = Cache.init(dvc_dir)
    state = State.init(root_dir, dvc_dir)
    lock = Lock(dvc_dir)

    scm = SCM(root_dir)
    scm.ignore_list([cache.cache_dir,
                     state.state_file,
                     lock.lock_file])

    ignore_file = os.path.join(dvc_dir, scm.ignore_file())
    scm.add([config.config_file, ignore_file])

    return Project(root_dir)
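A minimal usage sketch for init() above; the path is hypothetical, and it assumes the directory is already a Git repository (SCM(root_dir) is constructed inside init()):

# Hypothetical usage: initialize DVC in an existing Git repo and inspect
# what init() wired up. '/path/to/repo' is an assumed placeholder.
project = Project.init('/path/to/repo')
print(project.root_dir)  # resolved absolute project root
print(project.dvc_dir)   # '<root>/.dvc', created by os.mkdir() in init()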
def test_import_url_dir(tmp_dir, dvc, workspace, stage_md5, dir_md5):
    workspace.gen({"dir": {"file": "file", "subdir": {"subfile": "subfile"}}})

    # remove external cache to make sure that we don't need it to import dirs
    with dvc.config.edit() as conf:
        del conf["cache"]
    dvc.cache = Cache(dvc)

    assert not (tmp_dir / "dir").exists()  # sanity check
    dvc.imp_url("remote://workspace/dir")
    assert set(os.listdir(tmp_dir / "dir")) == {"file", "subdir"}
    assert (tmp_dir / "dir" / "file").read_text() == "file"
    assert list(os.listdir(tmp_dir / "dir" / "subdir")) == ["subfile"]
    assert (tmp_dir / "dir" / "subdir" / "subfile").read_text() == "subfile"

    assert (tmp_dir / "dir.dvc").read_text() == (
        f"md5: {stage_md5}\n"
        "frozen: true\n"
        "deps:\n"
        f"- md5: {dir_md5}\n"
        "  path: remote://workspace/dir\n"
        "outs:\n"
        "- md5: b6dcab6ccd17ca0a8bf4a215a37d14cc.dir\n"
        "  path: dir\n"
    )

    assert dvc.status() == {}
def __init__(self, root_dir=None, scm=None, rev=None):
    from dvc.state import State, StateNoop
    from dvc.lock import make_lock
    from dvc.scm import SCM
    from dvc.cache import Cache
    from dvc.data_cloud import DataCloud
    from dvc.repo.metrics import Metrics
    from dvc.repo.plots import Plots
    from dvc.repo.params import Params
    from dvc.scm.tree import WorkingTree
    from dvc.utils.fs import makedirs
    from dvc.stage.cache import StageCache

    if scm:
        # use GitTree instead of WorkingTree as default repo tree instance
        tree = scm.get_tree(rev)
        self.root_dir = self.find_root(root_dir, tree)
        self.scm = scm
        self.tree = tree
        self.state = StateNoop()
    else:
        root_dir = self.find_root(root_dir)
        self.root_dir = os.path.abspath(os.path.realpath(root_dir))

    self.dvc_dir = os.path.join(self.root_dir, self.DVC_DIR)
    self.config = Config(self.dvc_dir)

    if not scm:
        no_scm = self.config["core"].get("no_scm", False)
        self.scm = SCM(self.root_dir, no_scm=no_scm)
        self.tree = WorkingTree(self.root_dir)

    self.tmp_dir = os.path.join(self.dvc_dir, "tmp")
    self.index_dir = os.path.join(self.tmp_dir, "index")
    makedirs(self.index_dir, exist_ok=True)

    hardlink_lock = self.config["core"].get("hardlink_lock", False)
    self.lock = make_lock(
        os.path.join(self.tmp_dir, "lock"),
        tmp_dir=self.tmp_dir,
        hardlink_lock=hardlink_lock,
        friendly=True,
    )

    self.cache = Cache(self)
    self.cloud = DataCloud(self)

    if not scm:
        # NOTE: storing state and link_state in the repository itself to
        # avoid any possible state corruption in 'shared cache dir'
        # scenario.
        self.state = State(self.cache.local)

    self.stage_cache = StageCache(self)

    self.metrics = Metrics(self)
    self.plots = Plots(self)
    self.params = Params(self)

    self._ignore()
def __init__(self, root_dir=None):
    from dvc.config import Config
    from dvc.state import State
    from dvc.lock import Lock
    from dvc.scm import SCM
    from dvc.cache import Cache
    from dvc.data_cloud import DataCloud
    from dvc.updater import Updater

    root_dir = self.find_root(root_dir)

    self.root_dir = os.path.abspath(os.path.realpath(root_dir))
    self.dvc_dir = os.path.join(self.root_dir, self.DVC_DIR)

    self.config = Config(self.dvc_dir)

    self.scm = SCM(self.root_dir, project=self)
    self.lock = Lock(self.dvc_dir)
    # NOTE: storing state and link_state in the repository itself to avoid
    # any possible state corruption in 'shared cache dir' scenario.
    self.state = State(self, self.config.config)

    core = self.config.config[Config.SECTION_CORE]

    logger.set_level(core.get(Config.SECTION_CORE_LOGLEVEL))

    self.cache = Cache(self)
    self.cloud = DataCloud(self, config=self.config.config)
    self.updater = Updater(self.dvc_dir)

    self.files_to_git_add = []

    self._ignore()

    self.updater.check()
def __init__(self, root_dir):
    self.root_dir = os.path.abspath(os.path.realpath(root_dir))
    self.dvc_dir = os.path.join(self.root_dir, self.DVC_DIR)

    self.config = Config(self.dvc_dir)
    self.scm = SCM(self.root_dir)
    self.lock = Lock(self.dvc_dir)
    self.cache = Cache(
        self.root_dir,
        self.dvc_dir,
        cache_dir=self.config._config[Config.SECTION_CACHE].get(
            Config.SECTION_CACHE_DIR, None
        ),
        cache_type=self.config._config[Config.SECTION_CACHE].get(
            Config.SECTION_CACHE_TYPE, None
        ),
    )
    self.state = State(self.dvc_dir)
    self.logger = Logger(
        self.config._config[Config.SECTION_CORE].get(
            Config.SECTION_CORE_LOGLEVEL, None
        )
    )
    self.cloud = DataCloud(cache=self.cache, config=self.config._config)
    self.updater = Updater(self.dvc_dir)

    self._ignore()

    self.updater.check()
def test_windows_should_add_when_cache_on_different_drive(
    tmp_dir, dvc, temporary_windows_drive
):
    dvc.config["cache"]["dir"] = temporary_windows_drive
    dvc.cache = Cache(dvc)

    (stage,) = tmp_dir.dvc_gen({"file": "file"})
    cache_path = stage.outs[0].cache_path

    assert path_isin(cache_path, temporary_windows_drive)
    assert os.path.isfile(cache_path)
    assert filecmp.cmp("file", cache_path)
def test_cache_type_is_properly_overridden(tmp_dir, erepo_dir):
    with erepo_dir.chdir():
        erepo_dir.dvc.config.set(
            Config.SECTION_CACHE, Config.SECTION_CACHE_TYPE, "symlink"
        )
        erepo_dir.dvc.cache = Cache(erepo_dir.dvc)
        erepo_dir.scm_add(
            [erepo_dir.dvc.config.config_file], "set cache type to symlinks"
        )
        erepo_dir.dvc_gen("file", "contents", "create file")
    assert System.is_symlink(erepo_dir / "file")

    Repo.get(fspath(erepo_dir), "file", "file_imported")

    assert not System.is_symlink("file_imported")
    assert (tmp_dir / "file_imported").read_text() == "contents"
def test_cache_type_is_properly_overridden(tmp_dir, erepo_dir):
    with erepo_dir.chdir():
        with erepo_dir.dvc.config.edit() as conf:
            conf["cache"]["type"] = "symlink"
        erepo_dir.dvc.cache = Cache(erepo_dir.dvc)
        erepo_dir.scm_add(
            [erepo_dir.dvc.config.files["repo"]], "set cache type to symlinks"
        )
        erepo_dir.dvc_gen("file", "contents", "create file")
    assert System.is_symlink(erepo_dir / "file")

    Repo.get(fspath(erepo_dir), "file", "file_imported")

    assert not System.is_symlink("file_imported")
    assert (tmp_dir / "file_imported").read_text() == "contents"
def __init__(self, root_dir=None):
    from dvc.state import State
    from dvc.lock import make_lock
    from dvc.scm import SCM
    from dvc.cache import Cache
    from dvc.data_cloud import DataCloud
    from dvc.repo.metrics import Metrics
    from dvc.scm.tree import WorkingTree
    from dvc.repo.tag import Tag
    from dvc.utils import makedirs

    root_dir = self.find_root(root_dir)

    self.root_dir = os.path.abspath(os.path.realpath(root_dir))
    self.dvc_dir = os.path.join(self.root_dir, self.DVC_DIR)

    self.config = Config(self.dvc_dir)

    self.scm = SCM(self.root_dir)

    self.tree = CleanTree(WorkingTree(self.root_dir))

    self.tmp_dir = os.path.join(self.dvc_dir, "tmp")
    makedirs(self.tmp_dir, exist_ok=True)

    hardlink_lock = self.config.config["core"].get("hardlink_lock", False)
    self.lock = make_lock(
        os.path.join(self.dvc_dir, "lock"),
        tmp_dir=os.path.join(self.dvc_dir, "tmp"),
        hardlink_lock=hardlink_lock,
        friendly=True,
    )

    # NOTE: storing state and link_state in the repository itself to avoid
    # any possible state corruption in 'shared cache dir' scenario.
    self.state = State(self, self.config.config)

    core = self.config.config[Config.SECTION_CORE]

    level = core.get(Config.SECTION_CORE_LOGLEVEL)
    if level:
        logger.setLevel(level.upper())

    self.cache = Cache(self)
    self.cloud = DataCloud(self)

    self.metrics = Metrics(self)
    self.tag = Tag(self)

    self._ignore()
def __init__(self, root_dir=None):
    from dvc.state import State
    from dvc.lock import make_lock
    from dvc.scm import SCM
    from dvc.cache import Cache
    from dvc.data_cloud import DataCloud
    from dvc.repo.metrics import Metrics
    from dvc.repo.params import Params
    from dvc.scm.tree import WorkingTree
    from dvc.utils.fs import makedirs
    from dvc.stage.cache import StageCache

    root_dir = self.find_root(root_dir)

    self.root_dir = os.path.abspath(os.path.realpath(root_dir))
    self.dvc_dir = os.path.join(self.root_dir, self.DVC_DIR)

    self.config = Config(self.dvc_dir)

    no_scm = self.config["core"].get("no_scm", False)
    self.scm = SCM(self.root_dir, no_scm=no_scm)

    self.tree = WorkingTree(self.root_dir)

    self.tmp_dir = os.path.join(self.dvc_dir, "tmp")
    self.index_dir = os.path.join(self.tmp_dir, "index")
    makedirs(self.index_dir, exist_ok=True)

    hardlink_lock = self.config["core"].get("hardlink_lock", False)
    self.lock = make_lock(
        os.path.join(self.tmp_dir, "lock"),
        tmp_dir=self.tmp_dir,
        hardlink_lock=hardlink_lock,
        friendly=True,
    )

    # NOTE: storing state and link_state in the repository itself to avoid
    # any possible state corruption in 'shared cache dir' scenario.
    self.state = State(self)

    self.cache = Cache(self)
    self.cloud = DataCloud(self)

    self.stage_cache = StageCache(self.cache.local.cache_dir)

    self.metrics = Metrics(self)
    self.params = Params(self)

    self._ignore()
def test_shared_stage_cache(tmp_dir, dvc, run_copy):
    import stat

    from dvc.cache import Cache

    tmp_dir.gen("foo", "foo")

    with dvc.config.edit() as config:
        config["cache"]["shared"] = "group"

    dvc.cache = Cache(dvc)

    assert not os.path.exists(dvc.cache.local.cache_dir)

    run_copy("foo", "bar", name="copy-foo-bar")

    parent_cache_dir = os.path.join(dvc.stage_cache.cache_dir, "88")
    cache_dir = os.path.join(
        parent_cache_dir,
        "883395068439203a9de3d1e1649a16e9027bfd1ab5dab4f438d321c4a928b328",
    )
    cache_file = os.path.join(
        cache_dir,
        "e42b7ebb9bc5ac4bccab769c8d1338914dad25d7ffecc8671dbd4581bad4aa15",
    )

    # sanity check
    assert os.path.isdir(cache_dir)
    assert os.listdir(cache_dir) == [os.path.basename(cache_file)]
    assert os.path.isfile(cache_file)

    def _mode(path):
        return stat.S_IMODE(os.stat(path).st_mode)

    if os.name == "nt":
        dir_mode = 0o777
        file_mode = 0o666
    else:
        dir_mode = 0o2775
        file_mode = 0o664

    assert _mode(dvc.cache.local.cache_dir) == dir_mode
    assert _mode(dvc.stage_cache.cache_dir) == dir_mode
    assert _mode(parent_cache_dir) == dir_mode
    assert _mode(cache_dir) == dir_mode
    assert _mode(cache_file) == file_mode
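For context on the _mode helper above, a self-contained sketch of how stat.S_IMODE isolates permission bits (the path here is arbitrary):

import os
import stat

# S_IMODE strips the file-type bits but keeps setuid/setgid/sticky, which
# is why the group-shared cache's 0o2775 directories compare equal above.
mode = stat.S_IMODE(os.stat(".").st_mode)
print(oct(mode))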
def test_destroy(tmp_dir, dvc, run_copy):
    from dvc.dvcfile import PIPELINE_FILE, PIPELINE_LOCK

    dvc.config["cache"]["type"] = ["symlink"]
    dvc.cache = Cache(dvc)

    tmp_dir.dvc_gen("file", "text")
    tmp_dir.dvc_gen({"dir": {"file": "lorem", "subdir/file": "ipsum"}})

    run_copy("file", "file2", single_stage=True)
    run_copy("file2", "file3", name="copy-file2-file3")
    run_copy("file3", "file4", name="copy-file3-file4")

    dvc.destroy()

    # Remove all the files related to DVC
    for path in [
        ".dvc",
        "file.dvc",
        "file2.dvc",
        "dir.dvc",
        PIPELINE_FILE,
        PIPELINE_LOCK,
    ]:
        assert not (tmp_dir / path).exists()

    # Leave the rest of the files
    for path in [
        "file",
        "file2",
        "file3",
        "file4",
        "dir/file",
        "dir/subdir/file",
    ]:
        assert (tmp_dir / path).is_file()

    # Make sure that data was unprotected after `destroy`
    for path in [
        "file",
        "file2",
        "file3",
        "file4",
        "dir",
        "dir/file",
        "dir/subdir",
        "dir/subdir/file",
    ]:
        assert not System.is_symlink(tmp_dir / path)
def test_find_cache(self):
    fname1 = os.path.basename(self.cache1)
    fname1_md5 = self.cache1_md5
    fname2 = os.path.basename(self.cache2)
    fname2_md5 = self.cache2_md5
    fname3 = 'non_existing'

    System.hardlink(self.cache1, fname1)
    System.hardlink(self.cache2, fname2)

    cache = Cache(self.dvc.dvc_dir).find_cache([fname1, fname2, fname3])

    expected = {fname1: fname1_md5, fname2: fname2_md5}

    self.assertEqual(len(cache), 2)
    self.assertEqual(cache, expected)
def test_cache_type_is_properly_overridden(tmp_dir, scm, dvc, erepo_dir):
    with erepo_dir.chdir():
        erepo_dir.dvc.config.set(
            Config.SECTION_CACHE, Config.SECTION_CACHE_TYPE, "symlink"
        )
        erepo_dir.dvc.cache = Cache(erepo_dir.dvc)
        erepo_dir.scm_add(
            [erepo_dir.dvc.config.config_file],
            "set source repo cache type to symlink",
        )
        erepo_dir.dvc_gen("foo", "foo content", "create foo")
    assert System.is_symlink(erepo_dir / "foo")

    dvc.imp(fspath(erepo_dir), "foo", "foo_imported")

    assert not System.is_symlink("foo_imported")
    assert (tmp_dir / "foo_imported").read_text() == "foo content"
    assert scm.repo.git.check_ignore("foo_imported")
def test_cache_type_is_properly_overridden(tmp_dir, scm, dvc, erepo_dir):
    with erepo_dir.chdir():
        with erepo_dir.dvc.config.edit() as conf:
            conf["cache"]["type"] = "symlink"
        erepo_dir.dvc.cache = Cache(erepo_dir.dvc)
        erepo_dir.scm_add(
            [erepo_dir.dvc.config.files["repo"]],
            "set source repo cache type to symlink",
        )
        erepo_dir.dvc_gen("foo", "foo content", "create foo")
    assert System.is_symlink(erepo_dir / "foo")

    dvc.imp(os.fspath(erepo_dir), "foo", "foo_imported")

    assert not System.is_symlink("foo_imported")
    assert (tmp_dir / "foo_imported").read_text() == "foo content"
    assert scm.is_ignored("foo_imported")
def __init__(self, root_dir=None):
    from dvc.config import Config
    from dvc.state import State
    from dvc.lock import Lock
    from dvc.scm import SCM
    from dvc.cache import Cache
    from dvc.data_cloud import DataCloud
    from dvc.updater import Updater
    from dvc.repo.metrics import Metrics
    from dvc.scm.tree import WorkingTree
    from dvc.repo.tag import Tag
    from dvc.repo.pkg import Pkg

    root_dir = self.find_root(root_dir)

    self.root_dir = os.path.abspath(os.path.realpath(root_dir))
    self.dvc_dir = os.path.join(self.root_dir, self.DVC_DIR)

    self.config = Config(self.dvc_dir)

    self.tree = WorkingTree(self.root_dir)

    self.scm = SCM(self.root_dir, repo=self)
    self.lock = Lock(self.dvc_dir)
    # NOTE: storing state and link_state in the repository itself to avoid
    # any possible state corruption in 'shared cache dir' scenario.
    self.state = State(self, self.config.config)

    core = self.config.config[Config.SECTION_CORE]

    level = core.get(Config.SECTION_CORE_LOGLEVEL)
    if level:
        logger.setLevel(level.upper())

    self.cache = Cache(self)
    self.cloud = DataCloud(self, config=self.config.config)
    self.updater = Updater(self.dvc_dir)

    self.metrics = Metrics(self)
    self.tag = Tag(self)
    self.pkg = Pkg(self)

    self._ignore()

    self.updater.check()
def test_shared_cache(tmp_dir, dvc, group, dir_mode):
    if group:
        with dvc.config.edit() as conf:
            conf["cache"].update({"shared": "group"})
    dvc.cache = Cache(dvc)

    tmp_dir.dvc_gen(
        {"file": "file content", "dir": {"file2": "file 2 content"}}
    )

    for root, dnames, fnames in os.walk(dvc.cache.local.cache_dir):
        for dname in dnames:
            path = os.path.join(root, dname)
            assert stat.S_IMODE(os.stat(path).st_mode) == dir_mode

        for fname in fnames:
            path = os.path.join(root, fname)
            assert stat.S_IMODE(os.stat(path).st_mode) == 0o444
def test_destroy(tmp_dir, dvc):
    dvc.config["cache"]["type"] = ["symlink"]
    dvc.cache = Cache(dvc)

    tmp_dir.dvc_gen("file", "text")
    tmp_dir.dvc_gen({"dir": {"file": "lorem", "subdir/file": "ipsum"}})

    dvc.destroy()

    # Remove all the files related to DVC
    for path in [".dvc", "file.dvc", "dir.dvc"]:
        assert not (tmp_dir / path).exists()

    # Leave the rest of the files
    for path in ["file", "dir/file", "dir/subdir/file"]:
        assert (tmp_dir / path).is_file()

    # Make sure that data was unprotected after `destroy`
    for path in ["file", "dir", "dir/file", "dir/subdir", "dir/subdir/file"]:
        assert not System.is_symlink(fspath(tmp_dir / path))
def workspace(tmp_dir, dvc, request):
    from dvc.cache import Cache

    cloud = request.param

    assert cloud

    tmp_dir.add_remote(name="workspace", config=cloud.config, default=False)
    tmp_dir.add_remote(
        name="cache", url="remote://workspace/cache", default=False
    )

    scheme = getattr(cloud, "scheme", "local")
    if scheme != "http":
        with dvc.config.edit() as conf:
            conf["cache"][scheme] = "cache"

        dvc.cache = Cache(dvc)

    return cloud
def __init__(self, root_dir):
    self.root_dir = os.path.abspath(os.path.realpath(root_dir))
    self.dvc_dir = os.path.join(self.root_dir, self.DVC_DIR)

    self.config = Config(self.dvc_dir)

    self.scm = SCM(self.root_dir)
    self.lock = Lock(self.dvc_dir)
    # NOTE: storing state and link_state in the repository itself to avoid
    # any possible state corruption in 'shared cache dir' scenario.
    self.state = State(self)
    self.link_state = LinkState(self)

    self.logger = Logger(
        self.config._config[Config.SECTION_CORE].get(
            Config.SECTION_CORE_LOGLEVEL, None
        )
    )

    self.cache = Cache(self)
    self.cloud = DataCloud(self, config=self.config._config)
    self.updater = Updater(self.dvc_dir)

    self._ignore()

    self.updater.check()
def test_shared_cache(tmp_dir, dvc, group):
    from dvc.utils.fs import umask

    if group:
        with dvc.config.edit() as conf:
            conf["cache"].update({"shared": "group"})
    dvc.cache = Cache(dvc)
    cache_dir = dvc.cache.local.cache_dir

    assert not os.path.exists(cache_dir)

    tmp_dir.dvc_gen(
        {"file": "file content", "dir": {"file2": "file 2 content"}}
    )

    actual = {}
    for root, dnames, fnames in os.walk(cache_dir):
        for name in dnames + fnames:
            path = os.path.join(root, name)
            actual[path] = oct(stat.S_IMODE(os.stat(path).st_mode))

    file_mode = oct(0o444)
    dir_mode = oct(0o2775 if group else (0o777 & ~umask))

    expected = {
        os.path.join(cache_dir, "17"): dir_mode,
        os.path.join(
            cache_dir, "17", "4eaa1dd94050255b7b98a7e1924b31.dir"
        ): file_mode,
        os.path.join(cache_dir, "97"): dir_mode,
        os.path.join(
            cache_dir, "97", "e17781c198500e2766ea56bd697c03"
        ): file_mode,
        os.path.join(cache_dir, "d1"): dir_mode,
        os.path.join(
            cache_dir, "d1", "0b4c3ff123b26dc068d43a8bef2d23"
        ): file_mode,
    }

    assert expected == actual
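The 0o777 & ~umask expectation above follows standard umask arithmetic; a standalone sketch (reading the process umask via set-and-restore is an assumption about the environment, not part of the test):

import os

# os.umask() installs a new mask and returns the old one, so setting it
# to 0 and immediately restoring it is the usual way to read it.
mask = os.umask(0)
os.umask(mask)

# A directory requested with mode 0o777 is created as 0o777 & ~mask,
# e.g. 0o755 under the common 0o022 umask -- the non-group branch above.
print(oct(0o777 & ~mask))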
def test_shared_cache(tmp_dir, dvc, protected, dir_mode, file_mode):
    dvc.config.set("cache", "shared", "group")
    dvc.config.set("cache", "protected", str(protected))
    dvc.cache = Cache(dvc)

    tmp_dir.dvc_gen(
        {"file": "file content", "dir": {"file2": "file 2 content"}}
    )

    for root, dnames, fnames in os.walk(dvc.cache.local.cache_dir):
        for dname in dnames:
            path = os.path.join(root, dname)
            assert stat.S_IMODE(os.stat(path).st_mode) == dir_mode

        for fname in fnames:
            path = os.path.join(root, fname)
            assert stat.S_IMODE(os.stat(path).st_mode) == file_mode
def __init__(
    self,
    root_dir=None,
    scm=None,
    rev=None,
    subrepos=False,
    uninitialized=False,
):
    from dvc.cache import Cache
    from dvc.data_cloud import DataCloud
    from dvc.lock import LockNoop, make_lock
    from dvc.repo.experiments import Experiments
    from dvc.repo.metrics import Metrics
    from dvc.repo.params import Params
    from dvc.repo.plots import Plots
    from dvc.scm import SCM
    from dvc.stage.cache import StageCache
    from dvc.state import State, StateNoop
    from dvc.tree.local import LocalTree
    from dvc.utils.fs import makedirs

    try:
        tree = scm.get_tree(rev) if rev else None
        self.root_dir = self.find_root(root_dir, tree)
        self.dvc_dir = os.path.join(self.root_dir, self.DVC_DIR)
        self.tmp_dir = os.path.join(self.dvc_dir, "tmp")
        makedirs(self.tmp_dir, exist_ok=True)
    except NotDvcRepoError:
        if not uninitialized:
            raise
        self.root_dir = SCM(root_dir or os.curdir).root_dir
        self.dvc_dir = None
        self.tmp_dir = None

    tree_kwargs = dict(use_dvcignore=True, dvcignore_root=self.root_dir)
    if scm:
        self.tree = scm.get_tree(rev, **tree_kwargs)
    else:
        self.tree = LocalTree(self, {"url": self.root_dir}, **tree_kwargs)

    self.config = Config(self.dvc_dir, tree=self.tree)
    self._scm = scm

    # used by RepoTree to determine if it should traverse subrepos
    self.subrepos = subrepos

    self.cache = Cache(self)
    self.cloud = DataCloud(self)

    if scm or not self.dvc_dir:
        self.lock = LockNoop()
        self.state = StateNoop()
    else:
        self.lock = make_lock(
            os.path.join(self.tmp_dir, "lock"),
            tmp_dir=self.tmp_dir,
            hardlink_lock=self.config["core"].get("hardlink_lock", False),
            friendly=True,
        )

        # NOTE: storing state and link_state in the repository itself to
        # avoid any possible state corruption in 'shared cache dir'
        # scenario.
        self.state = State(self)
        self.stage_cache = StageCache(self)

        try:
            self.experiments = Experiments(self)
        except NotImplementedError:
            self.experiments = None

        self._ignore()

    self.metrics = Metrics(self)
    self.plots = Plots(self)
    self.params = Params(self)
def test_get(self):
    cache = Cache(self.dvc).local.get(self.cache1_md5)
    self.assertEqual(cache, self.cache1)
def test_all(self):
    md5_list = list(Cache(self.dvc).local.all())

    self.assertEqual(len(md5_list), 2)
    self.assertIn(self.cache1_md5, md5_list)
    self.assertIn(self.cache2_md5, md5_list)
def __init__(
    self,
    root_dir=None,
    scm=None,
    rev=None,
    subrepos=False,
    uninitialized=False,
):
    from dvc.cache import Cache
    from dvc.data_cloud import DataCloud
    from dvc.lock import LockNoop, make_lock
    from dvc.repo.metrics import Metrics
    from dvc.repo.params import Params
    from dvc.repo.plots import Plots
    from dvc.repo.stage import StageLoad
    from dvc.stage.cache import StageCache
    from dvc.state import State, StateNoop
    from dvc.tree.local import LocalTree

    self.root_dir, self.dvc_dir, self.tmp_dir = self._get_repo_dirs(
        root_dir=root_dir, scm=scm, rev=rev, uninitialized=uninitialized
    )

    tree_kwargs = {"use_dvcignore": True, "dvcignore_root": self.root_dir}
    if scm:
        self.tree = scm.get_tree(rev, **tree_kwargs)
    else:
        self.tree = LocalTree(self, {"url": self.root_dir}, **tree_kwargs)

    self.config = Config(self.dvc_dir, tree=self.tree)
    self._scm = scm

    # used by RepoTree to determine if it should traverse subrepos
    self.subrepos = subrepos

    self.cache = Cache(self)
    self.cloud = DataCloud(self)
    self.stage = StageLoad(self)

    if scm or not self.dvc_dir:
        self.lock = LockNoop()
        self.state = StateNoop()
    else:
        self.lock = make_lock(
            os.path.join(self.tmp_dir, "lock"),
            tmp_dir=self.tmp_dir,
            hardlink_lock=self.config["core"].get("hardlink_lock", False),
            friendly=True,
        )

        # NOTE: storing state and link_state in the repository itself to
        # avoid any possible state corruption in 'shared cache dir'
        # scenario.
        self.state = State(self)
        self.stage_cache = StageCache(self)

        self._ignore()

    self.metrics = Metrics(self)
    self.plots = Plots(self)
    self.params = Params(self)
    self.stage_collection_error_handler = None
    self._lock_depth = 0
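A hedged instantiation sketch for the constructor above, assuming it belongs to dvc's Repo class (as the surrounding snippets suggest) and that SCM comes from dvc.scm; the paths and revision are hypothetical placeholders:

# Open the working tree of an existing DVC project (assumed path).
repo = Repo("/path/to/project")

# Or pin the repo to a committed revision: per the constructor, passing
# scm/rev swaps in an SCM-backed tree plus LockNoop/StateNoop.
scm = SCM("/path/to/project")
repo_at_rev = Repo("/path/to/project", scm=scm, rev="HEAD~1")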
class Project(object):
    DVC_DIR = '.dvc'

    def __init__(self, root_dir):
        self.root_dir = os.path.abspath(os.path.realpath(root_dir))
        self.dvc_dir = os.path.join(self.root_dir, self.DVC_DIR)

        self.scm = SCM(self.root_dir)
        self.lock = Lock(self.dvc_dir)
        self.cache = Cache(self.dvc_dir)
        self.state = State(self.root_dir, self.dvc_dir)
        self.config = Config(self.dvc_dir)
        self.logger = Logger(self.config._config[Config.SECTION_CORE].get(
            Config.SECTION_CORE_LOGLEVEL, None))
        self.cloud = DataCloud(cache=self.cache,
                               state=self.state,
                               config=self.config._config)

    @staticmethod
    def init(root_dir=os.curdir):
        """
        Initialize a DVC project in the given directory.

        Args:
            root_dir: Path to the project's root directory.

        Returns:
            Project instance.

        Raises:
            KeyError: Raises an exception.
        """
        root_dir = os.path.abspath(root_dir)
        dvc_dir = os.path.join(root_dir, Project.DVC_DIR)
        os.mkdir(dvc_dir)

        config = Config.init(dvc_dir)
        cache = Cache.init(dvc_dir)
        state = State.init(root_dir, dvc_dir)
        lock = Lock(dvc_dir)

        scm = SCM(root_dir)
        scm.ignore_list([cache.cache_dir,
                         state.state_file,
                         lock.lock_file])

        ignore_file = os.path.join(dvc_dir, scm.ignore_file())
        scm.add([config.config_file, ignore_file])

        return Project(root_dir)

    def to_dvc_path(self, path):
        return os.path.relpath(path, self.root_dir)

    def add(self, fname):
        out = os.path.basename(fname)
        stage_fname = out + Stage.STAGE_FILE_SUFFIX
        cwd = os.path.dirname(os.path.abspath(fname))
        stage = Stage.loads(project=self,
                            cmd=None,
                            deps=[],
                            outs=[out],
                            fname=stage_fname,
                            cwd=cwd)
        stage.save()
        stage.dump()
        return stage

    def remove(self, target):
        if not Stage.is_stage_file(target):
            raise StageNotFoundError(target)

        stage = Stage.load(self, target)
        for out in stage.outs:
            out.remove()

        return stage

    def run(self,
            cmd=None,
            deps=[],
            outs=[],
            outs_no_cache=[],
            fname=Stage.STAGE_FILE,
            cwd=os.curdir,
            no_exec=False):
        stage = Stage.loads(project=self,
                            fname=fname,
                            cmd=cmd,
                            cwd=cwd,
                            outs=outs,
                            outs_no_cache=outs_no_cache,
                            deps=deps)
        if not no_exec:
            stage.run()
        stage.dump()
        return stage

    def _reproduce_stage(self, stages, node, force):
        stage = stages[node].reproduce(force=force)
        if not stage:
            return []
        stage.dump()
        return [stage]

    def reproduce(self, target, recursive=True, force=False):
        stages = nx.get_node_attributes(self.graph(), 'stage')
        node = os.path.relpath(os.path.abspath(target), self.root_dir)
        if node not in stages:
            raise StageNotFoundError(target)

        if recursive:
            return self._reproduce_stages(stages, node, force)

        return self._reproduce_stage(stages, node, force)

    def _reproduce_stages(self, stages, node, force):
        result = []
        for n in nx.dfs_postorder_nodes(self.graph(), node):
            try:
                result += self._reproduce_stage(stages, n, force)
            except Exception as ex:
                raise ReproductionError(stages[n].relpath, ex)
        return result

    def _remove_untracked_hardlinks(self):
        untracked = self.scm.untracked_files()
        cache = dict((System.inode(c), c) for c in self.cache.all())
        for file in untracked:
            inode = System.inode(file)
            if inode not in cache.keys():
                continue

            Logger.info(u'Remove \'{}\''.format(file))
            os.remove(file)

            dir = os.path.dirname(file)
            if len(dir) != 0 and not os.listdir(dir):
                Logger.info(u'Remove empty directory \'{}\''.format(dir))
                os.removedirs(dir)

    def checkout(self):
        self._remove_untracked_hardlinks()
        for stage in self.stages():
            stage.checkout()

    def _used_cache(self, target=None):
        cache_set = set()

        if target:
            stages = [Stage.load(self, target)]
        else:
            stages = self.stages()

        for stage in stages:
            for out in stage.outs:
                if not out.use_cache or not out.cache:
                    continue
                cache_set |= set([out.cache])
                if out.is_dir_cache(out.cache) and os.path.isfile(out.cache):
                    dir_cache = out.dir_cache()
                    cache_set |= set(dir_cache.values())

        return list(cache_set)

    def gc(self):
        clist = self._used_cache()
        for cache in self.cache.all():
            if cache in clist:
                continue
            os.unlink(cache)
            self.logger.info(u'\'{}\' was removed'.format(
                self.to_dvc_path(cache)))

    def push(self, target=None, jobs=1, remote=None):
        return self.cloud.push(self._used_cache(target), jobs, remote=remote)

    def fetch(self, target=None, jobs=1, remote=None):
        return self.cloud.pull(self._used_cache(target), jobs, remote=remote)

    def pull(self, target=None, jobs=1, remote=None):
        ret = self.fetch(target, jobs, remote=remote)
        self.checkout()
        return ret

    def _local_status(self, target=None):
        status = {}

        if target:
            stages = [Stage.load(self, target)]
        else:
            stages = self.stages()

        # NOTE: iterate over the stages selected above (iterating over
        # self.stages() here would silently ignore `target`).
        for stage in stages:
            status.update(stage.status())

        return status

    def _cloud_status(self, target=None, jobs=1, remote=None):
        status = {}
        for target, ret in self.cloud.status(
                self._used_cache(target), jobs, remote=remote):
            if ret == cloud.STATUS_UNKNOWN or ret == cloud.STATUS_OK:
                continue

            prefix_map = {
                cloud.STATUS_DELETED: 'deleted',
                cloud.STATUS_MODIFIED: 'modified',
                cloud.STATUS_NEW: 'new',
            }

            path = os.path.relpath(target, self.cache.cache_dir)
            status[path] = prefix_map[ret]
        return status

    def status(self, target=None, jobs=1, cloud=False, remote=None):
        if cloud:
            return self._cloud_status(target, jobs, remote=remote)
        return self._local_status(target)

    def graph(self):
        G = nx.DiGraph()

        for stage in self.stages():
            node = os.path.relpath(stage.path, self.root_dir)
            G.add_node(node, stage=stage)
            for dep in stage.deps:
                dep_stage = dep.stage()
                if not dep_stage:
                    continue
                dep_node = os.path.relpath(dep_stage.path, self.root_dir)
                G.add_node(dep_node, stage=dep_stage)
                G.add_edge(node, dep_node)

        return G

    def stages(self):
        stages = []
        for root, dirs, files in os.walk(self.root_dir):
            for fname in files:
                path = os.path.join(root, fname)
                if not Stage.is_stage_file(path):
                    continue
                stages.append(Stage.load(self, path))
        return stages

    def outs(self):
        outs = []
        for stage in self.stages():
            outs += stage.outs
        return outs
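The reproduce/_reproduce_stages pair above relies on nx.dfs_postorder_nodes visiting dependencies before dependents; a toy networkx sketch of that ordering (the stage names are made up):

import networkx as nx

# Edges point from a stage to the stage that produces its dependency,
# mirroring graph() above.
G = nx.DiGraph()
G.add_edge("train.dvc", "prepare.dvc")
G.add_edge("evaluate.dvc", "train.dvc")

# Postorder DFS yields ['prepare.dvc', 'train.dvc', 'evaluate.dvc']:
# each stage is visited only after everything it depends on.
print(list(nx.dfs_postorder_nodes(G, "evaluate.dvc")))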
def __init__(self, root_dir=None, scm=None, rev=None):
    from dvc.state import State, StateNoop
    from dvc.lock import make_lock
    from dvc.scm import SCM
    from dvc.cache import Cache
    from dvc.data_cloud import DataCloud
    from dvc.repo.experiments import Experiments
    from dvc.repo.metrics import Metrics
    from dvc.repo.plots import Plots
    from dvc.repo.params import Params
    from dvc.tree.local import LocalRemoteTree
    from dvc.utils.fs import makedirs
    from dvc.stage.cache import StageCache

    if scm:
        tree = scm.get_tree(rev)
        self.root_dir = self.find_root(root_dir, tree)
        self.scm = scm
        self.tree = scm.get_tree(
            rev, use_dvcignore=True, dvcignore_root=self.root_dir
        )
        self.state = StateNoop()
    else:
        root_dir = self.find_root(root_dir)
        self.root_dir = os.path.abspath(os.path.realpath(root_dir))
        self.tree = LocalRemoteTree(
            self,
            {"url": self.root_dir},
            use_dvcignore=True,
            dvcignore_root=self.root_dir,
        )

    self.dvc_dir = os.path.join(self.root_dir, self.DVC_DIR)
    self.config = Config(self.dvc_dir, tree=self.tree)

    if not scm:
        no_scm = self.config["core"].get("no_scm", False)
        self.scm = SCM(self.root_dir, no_scm=no_scm)

    self.tmp_dir = os.path.join(self.dvc_dir, "tmp")
    self.index_dir = os.path.join(self.tmp_dir, "index")
    makedirs(self.index_dir, exist_ok=True)

    hardlink_lock = self.config["core"].get("hardlink_lock", False)
    self.lock = make_lock(
        os.path.join(self.tmp_dir, "lock"),
        tmp_dir=self.tmp_dir,
        hardlink_lock=hardlink_lock,
        friendly=True,
    )

    self.cache = Cache(self)
    self.cloud = DataCloud(self)

    if not scm:
        # NOTE: storing state and link_state in the repository itself to
        # avoid any possible state corruption in 'shared cache dir'
        # scenario.
        self.state = State(self.cache.local)

    self.stage_cache = StageCache(self)

    self.metrics = Metrics(self)
    self.plots = Plots(self)
    self.params = Params(self)

    try:
        self.experiments = Experiments(self)
    except NotImplementedError:
        self.experiments = None

    self._ignore()