def init(root_dir=os.curdir, no_scm=False):
    """Initiate dvc project in directory.

    Args:
        root_dir: Path to project's root directory.
        no_scm: If True, skip the requirement that root_dir is tracked
            by a supported scm tool.

    Returns:
        Project instance.

    Raises:
        InitError: If root_dir is not tracked by any supported scm tool
            (e.g. git) and no_scm is False.
    """
    root_dir = os.path.abspath(root_dir)
    dvc_dir = os.path.join(root_dir, Project.DVC_DIR)

    scm = SCM(root_dir)
    # SCM() presumably returns the plain `Base` fallback when no supported
    # scm is detected, while real scms subclass it — so an exact type check
    # is intended here; `is` is the correct identity comparison for types.
    if type(scm) is Base and not no_scm:
        msg = "{} is not tracked by any supported scm tool(e.g. git).".format(
            root_dir)
        raise InitError(msg)

    os.mkdir(dvc_dir)

    config = Config.init(dvc_dir)
    Cache.init(root_dir, dvc_dir)
    State.init(dvc_dir)
    proj = Project(root_dir)

    # Stage the new config (and scm ignore file, if any) for commit.
    scm.add([config.config_file])
    if scm.ignore_file():
        scm.add([os.path.join(dvc_dir, scm.ignore_file())])

    return proj
def test_update(self):
    """state.update() must return the file's md5 and track a rewrite.

    Note: the originally-present `mtime`/`inode` locals were assigned but
    never read, so they have been removed.
    """
    path = os.path.join(self.dvc.root_dir, self.FOO)
    md5 = file_md5(path)[0]

    state = State(self.dvc, self.dvc.config._config)

    with state:
        entry_md5 = state.update(path)
        self.assertEqual(entry_md5, md5)

        # Sleep some time to simulate realistic behavior.
        # Some filesystems have a bad date resolution for
        # mtime(i.e. 1sec for HFS) that cause problems with
        # our 'state' system not being able to distinguish
        # files that were modified within that delta.
        time.sleep(1)

        os.unlink(path)
        with open(path, 'w+') as fd:
            fd.write('1')

        md5 = file_md5(path)[0]

        entry_md5 = state.update(path)
        self.assertEqual(entry_md5, md5)
def __init__(self, root_dir=None):
    """Open the project at (or above) root_dir.

    Args:
        root_dir: Starting directory for project discovery; when None,
            `find_root` searches from the current directory.
    """
    # Local imports — presumably to avoid import cycles; confirm.
    from dvc.config import Config
    from dvc.state import State
    from dvc.lock import Lock
    from dvc.scm import SCM
    from dvc.cache import Cache
    from dvc.data_cloud import DataCloud
    from dvc.updater import Updater

    root_dir = self.find_root(root_dir)

    # Resolve symlinks so all derived paths are canonical.
    self.root_dir = os.path.abspath(os.path.realpath(root_dir))
    self.dvc_dir = os.path.join(self.root_dir, self.DVC_DIR)

    self.config = Config(self.dvc_dir)

    self.scm = SCM(self.root_dir, project=self)
    self.lock = Lock(self.dvc_dir)

    # NOTE: storing state and link_state in the repository itself to avoid
    # any possible state corruption in 'shared cache dir' scenario.
    self.state = State(self, self.config.config)

    # Log level comes from the [core] config section.
    core = self.config.config[Config.SECTION_CORE]
    logger.set_level(core.get(Config.SECTION_CORE_LOGLEVEL))

    self.cache = Cache(self)
    self.cloud = DataCloud(self, config=self.config.config)
    self.updater = Updater(self.dvc_dir)

    # Paths collected here are staged to git later by other methods.
    self.files_to_git_add = []

    self._ignore()

    self.updater.check()
def __init__(self, project, config):
    """Initialize the cache from the given config section.

    Args:
        project: Project instance this cache belongs to.
        config: Config mapping providing the cache url and cache type.
    """
    self.project = project
    self.link_state = project.link_state

    self.cache_dir = config.get(Config.SECTION_REMOTE_URL, None)
    self.cache_type = config.get(Config.SECTION_CACHE_TYPE, None)

    # Create the cache directory on first use.
    # PEP 8: comparisons to None use `is not`, not `!=`.
    if self.cache_dir is not None and not os.path.exists(self.cache_dir):
        os.mkdir(self.cache_dir)

    self.state = State(self.cache_dir)
def test_update(self):
    """add() inserts a fresh entry; update() overwrites every field."""
    target = os.path.join(self.dvc.root_dir, '1')
    first = ('1', 1, 1)   # (md5, mtime, inode)
    second = ('2', 2, 2)

    state = State(self.dvc.root_dir, self.dvc.dvc_dir)

    # Nothing stored yet for this path.
    self.assertIsNone(state.get(target))

    state.add(target, *first)
    entry = state.get(target)
    self.assertIsInstance(entry, StateEntry)
    self.assertEqual(entry.path, target)
    self.assertEqual((entry.md5, entry.mtime, entry.inode), first)

    state.update(target, *second)
    entry = state.get(target)
    self.assertIsInstance(entry, StateEntry)
    self.assertEqual(entry.path, target)
    self.assertEqual((entry.md5, entry.mtime, entry.inode), second)
def init(root_dir=os.curdir):
    """
    Initiate dvc project in directory.

    Args:
        root_dir: Path to project's root directory.

    Returns:
        Project instance.
    """
    root_dir = os.path.abspath(root_dir)
    dvc_dir = os.path.join(root_dir, Project.DVC_DIR)

    os.mkdir(dvc_dir)

    config = Config.init(dvc_dir)
    cache = Cache.init(dvc_dir)
    state = State.init(root_dir, dvc_dir)
    lock = Lock(dvc_dir)

    # Keep dvc's internal files out of scm, then stage the config and
    # the updated ignore file.
    scm = SCM(root_dir)
    scm.ignore_list([cache.cache_dir, state.state_file, lock.lock_file])

    ignore_file = os.path.join(dvc_dir, scm.ignore_file())
    scm.add([config.config_file, ignore_file])

    return Project(root_dir)
def __init__(self, root_dir, dvc_dir, cache_dir=None, cache_type=None):
    """Set up the cache directory along with its state db and lock."""
    self.cache_type = cache_type

    chosen = cache_dir or self.CACHE_DIR
    if os.path.isabs(chosen):
        self.cache_dir = chosen
    else:
        # Relative cache dirs are resolved against the dvc directory.
        self.cache_dir = os.path.abspath(
            os.path.realpath(os.path.join(dvc_dir, chosen)))

    if not os.path.exists(self.cache_dir):
        os.mkdir(self.cache_dir)

    self.state = State(self.cache_dir)
    self.link_state = LinkState(root_dir, dvc_dir)
    self.lock = Lock(self.cache_dir, name=self.CACHE_DIR_LOCK)
def __init__(self, root_dir):
    """Open the project rooted at root_dir; ends with an updater check."""
    # Local imports — presumably to avoid import cycles; confirm.
    from dvc.logger import Logger
    from dvc.config import Config
    from dvc.state import LinkState, State
    from dvc.lock import Lock
    from dvc.scm import SCM
    from dvc.cache import Cache
    from dvc.data_cloud import DataCloud
    from dvc.updater import Updater
    from dvc.prompt import Prompt

    # Resolve symlinks so all derived paths are canonical.
    self.root_dir = os.path.abspath(os.path.realpath(root_dir))
    self.dvc_dir = os.path.join(self.root_dir, self.DVC_DIR)

    self.config = Config(self.dvc_dir)

    self.scm = SCM(self.root_dir)
    self.lock = Lock(self.dvc_dir)
    # NOTE: storing state and link_state in the repository itself to avoid
    # any possible state corruption in 'shared cache dir' scenario.
    self.state = State(self)
    self.link_state = LinkState(self)

    # Log level comes from the [core] config section; None means default.
    core = self.config._config[Config.SECTION_CORE]
    self.logger = Logger(core.get(Config.SECTION_CORE_LOGLEVEL, None))

    self.cache = Cache(self)
    self.cloud = DataCloud(self, config=self.config._config)
    self.updater = Updater(self.dvc_dir)
    self.prompt = Prompt()

    self._ignore()

    self.updater.check()
def __init__(self, root_dir=None, scm=None, rev=None):
    """Open a repo, either from the working tree or from a git revision.

    Args:
        root_dir: Starting directory for repo discovery.
        scm: Optional scm instance; when given, the repo is read through
            the scm tree at `rev` and gets a no-op state.
        rev: Revision to read from when `scm` is given.
    """
    from dvc.state import State, StateNoop
    from dvc.lock import make_lock
    from dvc.scm import SCM
    from dvc.cache import Cache
    from dvc.data_cloud import DataCloud
    from dvc.repo.metrics import Metrics
    from dvc.repo.plots import Plots
    from dvc.repo.params import Params
    from dvc.scm.tree import WorkingTree
    from dvc.utils.fs import makedirs
    from dvc.stage.cache import StageCache

    if scm:
        # use GitTree instead of WorkingTree as default repo tree instance
        tree = scm.get_tree(rev)
        self.root_dir = self.find_root(root_dir, tree)
        self.scm = scm
        self.tree = tree
        # Read-only view of a revision: no on-disk state tracking.
        self.state = StateNoop()
    else:
        root_dir = self.find_root(root_dir)
        self.root_dir = os.path.abspath(os.path.realpath(root_dir))
        self.tree = WorkingTree(self.root_dir)

    self.dvc_dir = os.path.join(self.root_dir, self.DVC_DIR)
    self.config = Config(self.dvc_dir, tree=self.tree)

    if not scm:
        no_scm = self.config["core"].get("no_scm", False)
        self.scm = SCM(self.root_dir, no_scm=no_scm)

    self.tmp_dir = os.path.join(self.dvc_dir, "tmp")
    self.index_dir = os.path.join(self.tmp_dir, "index")
    makedirs(self.index_dir, exist_ok=True)

    hardlink_lock = self.config["core"].get("hardlink_lock", False)
    self.lock = make_lock(
        os.path.join(self.tmp_dir, "lock"),
        tmp_dir=self.tmp_dir,
        hardlink_lock=hardlink_lock,
        friendly=True,
    )

    self.cache = Cache(self)
    self.cloud = DataCloud(self)

    if not scm:
        # NOTE: storing state and link_state in the repository itself to
        # avoid any possible state corruption in 'shared cache dir'
        # scenario.
        self.state = State(self.cache.local)

    self.stage_cache = StageCache(self)

    self.metrics = Metrics(self)
    self.plots = Plots(self)
    self.params = Params(self)

    self._ignore()
def test(self):
    """_mtime_and_size returns stringified values; a directory's size is
    the sum of the directory entry plus its contents."""
    file_time, file_size = State._mtime_and_size(self.DATA)
    dir_time, dir_size = State._mtime_and_size(self.DATA_DIR)

    expected_file_size = os.path.getsize(self.DATA)
    expected_dir_size = sum(
        os.path.getsize(p)
        for p in (self.DATA_DIR, self.DATA,
                  self.DATA_SUB_DIR, self.DATA_SUB))

    for value in (file_time, file_size, dir_time, dir_size):
        self.assertIs(type(value), str)

    self.assertEqual(file_size, str(expected_file_size))
    self.assertEqual(dir_size, str(expected_dir_size))
def test_update(self):
    """update() must track the md5 across a file rewrite."""
    path = os.path.join(self.dvc.root_dir, self.FOO)
    expected = file_md5(path)[0]

    state = State(self.dvc, self.dvc.config.config)

    with state:
        self.assertEqual(state.update(path), expected)

        # Replace the file contents entirely.
        os.unlink(path)
        with open(path, "a") as fd:
            fd.write("1")

        expected = file_md5(path)[0]
        self.assertEqual(state.update(path), expected)
def __init__(self, project, config):
    """Initialize the cache from the given config section.

    Args:
        project: Project instance this cache belongs to.
        config: Config mapping providing cache url (with an AWS
            storagepath fallback) and cache type(s).
    """
    self.project = project
    self.link_state = project.link_state

    # Fall back to the legacy AWS storagepath setting when no remote
    # url is configured.
    storagepath = config.get(Config.SECTION_AWS_STORAGEPATH, None)
    self.cache_dir = config.get(Config.SECTION_REMOTE_URL, storagepath)

    types = config.get(Config.SECTION_CACHE_TYPE, None)
    if types:
        # Accept either a list or a comma-separated string of types.
        if isinstance(types, str):
            types = [t.strip() for t in types.split(',')]
        self.cache_types = types
    else:
        self.cache_types = self.CACHE_TYPES

    # PEP 8: comparisons to None use `is not`, not `!=`.
    if self.cache_dir is not None and not os.path.exists(self.cache_dir):
        os.mkdir(self.cache_dir)

    self.state = State(self.cache_dir)
def __init__(self, root_dir):
    """Open the project rooted at root_dir."""
    # Resolve symlinks so all derived paths are canonical.
    self.root_dir = os.path.abspath(os.path.realpath(root_dir))
    self.dvc_dir = os.path.join(self.root_dir, self.DVC_DIR)
    self.scm = SCM(self.root_dir)
    self.lock = Lock(self.dvc_dir)
    self.cache = Cache(self.dvc_dir)
    self.state = State(self.root_dir, self.dvc_dir)
    self.config = Config(self.dvc_dir)
    # NOTE(review): Logger here receives the whole parsed config dict,
    # unlike sibling variants that pass only a log level — confirm that
    # this Logger signature expects the full config.
    self.logger = Logger(self.config._config)
    self.cloud = DataCloud(self.cache, self.config._config)
def test_update(self):
    """update() must record md5/mtime/inode, before and after a rewrite."""
    path = os.path.join(self.dvc.root_dir, self.FOO)

    def file_stats(p):
        # One-line helper: current (md5, mtime, inode) of p.
        return file_md5(p)[0], os.path.getmtime(p), System.inode(p)

    md5, mtime, inode = file_stats(path)

    state = State(self.dvc.root_dir, self.dvc.dvc_dir)

    state.update(path)
    entry = state.get(path)
    self.assertIsInstance(entry, StateEntry)
    self.assertEqual((entry.md5, entry.mtime, entry.inode),
                     (md5, mtime, inode))

    # Rewrite the file (clearing read-only first, for Windows).
    os.chmod(path, stat.S_IWRITE)
    os.unlink(path)
    with open(path, 'w+') as fd:
        fd.write('1')

    md5, mtime, inode = file_stats(path)

    entry = state.update(path)
    self.assertIsInstance(entry, StateEntry)
    self.assertEqual((entry.md5, entry.mtime, entry.inode),
                     (md5, mtime, inode))
def test_state(tmp_dir, dvc):
    """Saved hash entries are returned until the file changes on disk."""
    tmp_dir.gen("foo", "foo content")
    path = tmp_dir / "foo"

    def current_hash():
        return HashInfo("md5", file_md5(path, dvc.fs))

    state = State(dvc.root_dir, dvc.tmp_dir, dvc.dvcignore)

    expected = current_hash()
    state.save(path, dvc.fs, expected)
    assert state.get(path, dvc.fs)[1] == expected

    # Rewriting the file makes the stored entry stale.
    path.unlink()
    path.write_text("1")
    assert state.get(path, dvc.fs) == (None, None)

    expected = current_hash()
    state.save(path, dvc.fs, expected)
    assert state.get(path, dvc.fs)[1] == expected
def __init__(self, root_dir):
    """Open the project rooted at root_dir."""
    # Resolve symlinks so all derived paths are canonical.
    self.root_dir = os.path.abspath(os.path.realpath(root_dir))
    self.dvc_dir = os.path.join(self.root_dir, self.DVC_DIR)
    self.scm = SCM(self.root_dir)
    self.lock = Lock(self.dvc_dir)
    self.cache = Cache(self.dvc_dir)
    self.state = State(self.root_dir, self.dvc_dir)
    self.config = Config(self.dvc_dir)
    # Log level comes from the [core] config section; None means default.
    self.logger = Logger(self.config._config[Config.SECTION_CORE].get(
        Config.SECTION_CORE_LOGLEVEL, None))
    self.cloud = DataCloud(cache=self.cache,
                           state=self.state,
                           config=self.config._config)
def __init__(self, root_dir=None):
    """Locate and open the repo at (or above) root_dir."""
    from dvc.state import State
    from dvc.lock import make_lock
    from dvc.scm import SCM
    from dvc.cache import Cache
    from dvc.data_cloud import DataCloud
    from dvc.repo.metrics import Metrics
    from dvc.scm.tree import WorkingTree
    from dvc.repo.tag import Tag
    from dvc.utils import makedirs

    root_dir = self.find_root(root_dir)

    # Resolve symlinks so all derived paths are canonical.
    self.root_dir = os.path.abspath(os.path.realpath(root_dir))
    self.dvc_dir = os.path.join(self.root_dir, self.DVC_DIR)

    self.config = Config(self.dvc_dir)

    self.scm = SCM(self.root_dir)

    self.tree = CleanTree(WorkingTree(self.root_dir))

    self.tmp_dir = os.path.join(self.dvc_dir, "tmp")
    makedirs(self.tmp_dir, exist_ok=True)

    hardlink_lock = self.config.config["core"].get("hardlink_lock", False)
    self.lock = make_lock(
        os.path.join(self.dvc_dir, "lock"),
        tmp_dir=os.path.join(self.dvc_dir, "tmp"),
        hardlink_lock=hardlink_lock,
        friendly=True,
    )

    # NOTE: storing state and link_state in the repository itself to avoid
    # any possible state corruption in 'shared cache dir' scenario.
    self.state = State(self, self.config.config)

    # Log level comes from the [core] config section.
    core = self.config.config[Config.SECTION_CORE]

    level = core.get(Config.SECTION_CORE_LOGLEVEL)
    if level:
        logger.setLevel(level.upper())

    self.cache = Cache(self)
    self.cloud = DataCloud(self)

    self.metrics = Metrics(self)
    self.tag = Tag(self)

    self._ignore()
def __init__(self, root_dir=None):
    """Locate and open the repo at (or above) root_dir."""
    from dvc.state import State
    from dvc.lock import make_lock
    from dvc.scm import SCM
    from dvc.cache import Cache
    from dvc.data_cloud import DataCloud
    from dvc.repo.metrics import Metrics
    from dvc.repo.params import Params
    from dvc.scm.tree import WorkingTree
    from dvc.utils.fs import makedirs
    from dvc.stage.cache import StageCache

    root_dir = self.find_root(root_dir)

    # Resolve symlinks so all derived paths are canonical.
    self.root_dir = os.path.abspath(os.path.realpath(root_dir))
    self.dvc_dir = os.path.join(self.root_dir, self.DVC_DIR)

    self.config = Config(self.dvc_dir)

    no_scm = self.config["core"].get("no_scm", False)
    self.scm = SCM(self.root_dir, no_scm=no_scm)

    self.tree = WorkingTree(self.root_dir)

    self.tmp_dir = os.path.join(self.dvc_dir, "tmp")
    self.index_dir = os.path.join(self.tmp_dir, "index")
    makedirs(self.index_dir, exist_ok=True)

    hardlink_lock = self.config["core"].get("hardlink_lock", False)
    self.lock = make_lock(
        os.path.join(self.tmp_dir, "lock"),
        tmp_dir=self.tmp_dir,
        hardlink_lock=hardlink_lock,
        friendly=True,
    )

    # NOTE: storing state and link_state in the repository itself to avoid
    # any possible state corruption in 'shared cache dir' scenario.
    self.state = State(self)

    self.cache = Cache(self)
    self.cloud = DataCloud(self)

    self.stage_cache = StageCache(self.cache.local.cache_dir)

    self.metrics = Metrics(self)
    self.params = Params(self)

    self._ignore()
def test_state(tmp_dir, dvc):
    """Saved hashes are returned until the file changes on disk."""
    tmp_dir.gen("foo", "foo content")
    path = tmp_dir / "foo"
    path_info = PathInfo(path)

    def current_hash():
        return HashInfo("md5", file_md5(path, dvc.fs))

    state = State(dvc.root_dir, dvc.tmp_dir)

    expected = current_hash()
    state.save(path_info, dvc.fs, expected)
    assert state.get(path_info, dvc.fs) == expected

    # Rewriting the file makes the stored entry stale.
    path.unlink()
    path.write_text("1")
    assert state.get(path_info, dvc.fs) is None

    expected = current_hash()
    state.save(path_info, dvc.fs, expected)
    assert state.get(path_info, dvc.fs) == expected
def __init__(self, root_dir):
    """Open the project rooted at root_dir; ends with an updater check."""
    # Resolve symlinks so all derived paths are canonical.
    self.root_dir = os.path.abspath(os.path.realpath(root_dir))
    self.dvc_dir = os.path.join(self.root_dir, self.DVC_DIR)

    self.config = Config(self.dvc_dir)
    self.scm = SCM(self.root_dir)
    self.lock = Lock(self.dvc_dir)

    # Cache location/type are taken from the [cache] config section.
    cache_section = self.config._config[Config.SECTION_CACHE]
    self.cache = Cache(
        self.root_dir,
        self.dvc_dir,
        cache_dir=cache_section.get(Config.SECTION_CACHE_DIR, None),
        cache_type=cache_section.get(Config.SECTION_CACHE_TYPE, None))

    self.state = State(self.dvc_dir)

    # Log level comes from the [core] config section; None means default.
    core_section = self.config._config[Config.SECTION_CORE]
    self.logger = Logger(
        core_section.get(Config.SECTION_CORE_LOGLEVEL, None))

    self.cloud = DataCloud(cache=self.cache, config=self.config._config)
    self.updater = Updater(self.dvc_dir)

    self._ignore()

    self.updater.check()
def test_transforms_inode(self, get_inode_mock):
    """Inodes above sqlite's signed-int max are transformed on store but
    remain retrievable through get_state_record_for_inode."""
    state = State(self.dvc, self.dvc.config.config)
    inode = state.MAX_INT + 2
    self.assertNotEqual(state._to_sqlite(inode), inode)

    path = os.path.join(self.dvc.root_dir, self.FOO)
    get_inode_mock.side_effect = self.mock_get_inode(path, inode)

    with state:
        state.update(path)
        self.assertIsNotNone(state.get_state_record_for_inode(inode))
def __init__(self, root_dir=None):
    """Locate and open the repo at (or above) root_dir; ends with an
    updater check."""
    # Local imports — presumably to avoid import cycles; confirm.
    from dvc.config import Config
    from dvc.state import State
    from dvc.lock import Lock
    from dvc.scm import SCM
    from dvc.cache import Cache
    from dvc.data_cloud import DataCloud
    from dvc.updater import Updater
    from dvc.repo.metrics import Metrics
    from dvc.scm.tree import WorkingTree
    from dvc.repo.tag import Tag
    from dvc.repo.pkg import Pkg

    root_dir = self.find_root(root_dir)

    # Resolve symlinks so all derived paths are canonical.
    self.root_dir = os.path.abspath(os.path.realpath(root_dir))
    self.dvc_dir = os.path.join(self.root_dir, self.DVC_DIR)

    self.config = Config(self.dvc_dir)

    self.tree = WorkingTree(self.root_dir)

    self.scm = SCM(self.root_dir, repo=self)
    self.lock = Lock(self.dvc_dir)
    # NOTE: storing state and link_state in the repository itself to avoid
    # any possible state corruption in 'shared cache dir' scenario.
    self.state = State(self, self.config.config)

    # Log level comes from the [core] config section.
    core = self.config.config[Config.SECTION_CORE]

    level = core.get(Config.SECTION_CORE_LOGLEVEL)
    if level:
        logger.setLevel(level.upper())

    self.cache = Cache(self)
    self.cloud = DataCloud(self, config=self.config.config)
    self.updater = Updater(self.dvc_dir)

    self.metrics = Metrics(self)
    self.tag = Tag(self)
    self.pkg = Pkg(self)

    self._ignore()

    self.updater.check()
def test_transforms_inode(self, get_inode_mock):
    """save() must handle inodes above sqlite's signed-int max."""
    state = State(self.dvc, self.dvc.config.config)
    inode = state.MAX_INT + 2
    self.assertNotEqual(state._to_sqlite(inode), inode)

    path = os.path.join(self.dvc.root_dir, self.FOO)
    md5 = file_md5(path)[0]
    get_inode_mock.side_effect = self.mock_get_inode(path, inode)

    with state:
        state.save({"scheme": "local", "path": path}, md5)
        self.assertIsNotNone(state.get_state_record_for_inode(inode))
def test_get_state_record_for_inode(get_inode_mock, dvc_repo, repo_dir):
    """Records saved under oversized inodes stay retrievable by raw inode."""
    state = State(dvc_repo, dvc_repo.config.config)
    inode = state.MAX_INT + 2
    assert state._to_sqlite(inode) != inode

    path = os.path.join(dvc_repo.root_dir, repo_dir.FOO)
    md5 = file_md5(path)[0]
    get_inode_mock.side_effect = mock_get_inode(inode)

    with state:
        state.save(PathInfo(path), md5)
        assert state.get_state_record_for_inode(inode) is not None
def test_update(self):
    """get() returns the saved md5 until the file changes, then None."""
    path = os.path.join(self.dvc.root_dir, self.FOO)
    path_info = {"scheme": "local", "path": path}
    expected = file_md5(path)[0]

    state = State(self.dvc, self.dvc.config.config)

    with state:
        state.save(path_info, expected)
        self.assertEqual(state.get(path_info), expected)

        # Rewrite the file; the saved entry becomes stale.
        os.unlink(path)
        with open(path, "a") as fd:
            fd.write("1")

        self.assertTrue(state.get(path_info) is None)

        expected = file_md5(path)[0]
        state.save(path_info, expected)
        self.assertEqual(state.get(path_info), expected)
def test_state(dvc_repo, repo_dir):
    """get() returns the saved md5 until the file changes, then None."""
    path = os.path.join(dvc_repo.root_dir, repo_dir.FOO)
    path_info = PathInfo(path)
    expected = file_md5(path)[0]

    state = State(dvc_repo, dvc_repo.config.config)

    with state:
        state.save(path_info, expected)
        assert state.get(path_info) == expected

        # Rewrite the file; the saved entry becomes stale.
        os.unlink(path)
        with open(path, "a") as fd:
            fd.write("1")

        assert state.get(path_info) is None

        expected = file_md5(path)[0]
        state.save(path_info, expected)
        assert state.get(path_info) == expected
def test_state(tmp_dir, dvc):
    """get() returns the saved md5 until the file changes, then None."""
    tmp_dir.gen("foo", "foo content")
    path = tmp_dir / "foo"
    path_info = PathInfo(path)
    expected = file_md5(path)[0]

    state = State(dvc, dvc.config.config)

    with state:
        state.save(path_info, expected)
        assert state.get(path_info) == expected

        # Rewrite the file; the saved entry becomes stale.
        path.unlink()
        path.write_text("1")

        assert state.get(path_info) is None

        expected = file_md5(path)[0]
        state.save(path_info, expected)
        assert state.get(path_info) == expected
def __init__(self, root_dir):
    """Open the project rooted at root_dir; ends with an updater check."""
    # Resolve symlinks so all derived paths are canonical.
    self.root_dir = os.path.abspath(os.path.realpath(root_dir))
    self.dvc_dir = os.path.join(self.root_dir, self.DVC_DIR)

    self.config = Config(self.dvc_dir)

    self.scm = SCM(self.root_dir)
    self.lock = Lock(self.dvc_dir)
    # NOTE: storing state and link_state in the repository itself to avoid
    # any possible state corruption in 'shared cache dir' scenario.
    self.state = State(self)
    self.link_state = LinkState(self)

    # Log level comes from the [core] config section; None means default.
    self.logger = Logger(self.config._config[Config.SECTION_CORE].get(
        Config.SECTION_CORE_LOGLEVEL, None))

    self.cache = Cache(self)
    self.cloud = DataCloud(self, config=self.config._config)
    self.updater = Updater(self.dvc_dir)

    self._ignore()

    self.updater.check()
def test_get_state_record_for_inode(get_inode_mock, tmp_dir, dvc):
    """Records saved under oversized inodes stay retrievable by raw inode."""
    tmp_dir.gen("foo", "foo content")

    state = State(dvc, dvc.config.config)
    inode = state.MAX_INT + 2
    assert state._to_sqlite(inode) != inode

    foo = tmp_dir / "foo"
    md5 = file_md5(foo)[0]
    get_inode_mock.side_effect = mock_get_inode(inode)

    with state:
        state.save(PathInfo(foo), md5)
        assert state.get_state_record_for_inode(inode) is not None
def __init__(
    self,
    root_dir=None,
    scm=None,
    rev=None,
    subrepos=False,
    uninitialized=False,
):
    """Open a repo, optionally through an scm revision or uninitialized.

    Args:
        root_dir: Starting directory for repo discovery.
        scm: Optional scm instance to read the repo through.
        rev: Revision to read when an scm tree is used.
        subrepos: Whether RepoTree should traverse subrepos.
        uninitialized: When True, tolerate a missing .dvc directory and
            fall back to the scm root.
    """
    from dvc.cache import Cache
    from dvc.data_cloud import DataCloud
    from dvc.lock import LockNoop, make_lock
    from dvc.repo.experiments import Experiments
    from dvc.repo.metrics import Metrics
    from dvc.repo.params import Params
    from dvc.repo.plots import Plots
    from dvc.scm import SCM
    from dvc.stage.cache import StageCache
    from dvc.state import State, StateNoop
    from dvc.tree.local import LocalTree
    from dvc.utils.fs import makedirs

    try:
        tree = scm.get_tree(rev) if rev else None
        self.root_dir = self.find_root(root_dir, tree)
        self.dvc_dir = os.path.join(self.root_dir, self.DVC_DIR)
        self.tmp_dir = os.path.join(self.dvc_dir, "tmp")
        makedirs(self.tmp_dir, exist_ok=True)
    except NotDvcRepoError:
        if not uninitialized:
            raise
        # No .dvc directory found: anchor at the scm root instead and
        # leave the dvc-specific dirs unset.
        self.root_dir = SCM(root_dir or os.curdir).root_dir
        self.dvc_dir = None
        self.tmp_dir = None

    tree_kwargs = dict(use_dvcignore=True, dvcignore_root=self.root_dir)
    if scm:
        self.tree = scm.get_tree(rev, **tree_kwargs)
    else:
        self.tree = LocalTree(self, {"url": self.root_dir}, **tree_kwargs)

    self.config = Config(self.dvc_dir, tree=self.tree)
    self._scm = scm

    # used by RepoTree to determine if it should traverse subrepos
    self.subrepos = subrepos

    self.cache = Cache(self)
    self.cloud = DataCloud(self)

    if scm or not self.dvc_dir:
        # Read-only (revision) or uninitialized repo: no locking/state.
        self.lock = LockNoop()
        self.state = StateNoop()
    else:
        self.lock = make_lock(
            os.path.join(self.tmp_dir, "lock"),
            tmp_dir=self.tmp_dir,
            hardlink_lock=self.config["core"].get("hardlink_lock", False),
            friendly=True,
        )

        # NOTE: storing state and link_state in the repository itself to
        # avoid any possible state corruption in 'shared cache dir'
        # scenario.
        self.state = State(self)
        self.stage_cache = StageCache(self)

        try:
            self.experiments = Experiments(self)
        except NotImplementedError:
            self.experiments = None

        self._ignore()

    self.metrics = Metrics(self)
    self.plots = Plots(self)
    self.params = Params(self)