def _open(path, repo=None, rev=None, remote=None, mode="r", encoding=None):
    """Open ``path`` from ``repo`` at ``rev`` and yield the open file object.

    Generator/context-manager style: the repo and the file are both closed
    when the caller's ``with`` block exits.
    """
    # Single `with` statement: both context managers are entered left to
    # right, exactly as the original nested form did.
    with Repo.open(
        repo, rev=rev, subrepos=True, uninitialized=True
    ) as _repo, _repo.open_by_relpath(
        path, remote=remote, mode=mode, encoding=encoding
    ) as fd:
        yield fd
def setUp(self):
    """Switch the cache to hardlinks and reload the repo for each test."""
    super().setUp()
    # Inline the exit-code check instead of a `ret` temporary.
    self.assertEqual(main(["config", "cache.type", "hardlink"]), 0)
    # Re-open the repo so the changed cache.type config takes effect.
    self.dvc = DvcRepo(".")
def test_absolute_file_outside_git_repo(tmp_dir, erepo_dir):
    """Fetching an absolute path from a git-only erepo must fail."""
    # Drop the .dvc directory from the git index so the repo is git-only.
    erepo_dir.scm.repo.index.remove([erepo_dir.dvc.dvc_dir], r=True)
    erepo_dir.scm.commit("remove dvc")

    url = fspath(erepo_dir)
    with pytest.raises(PathMissingError):
        Repo.get(url, "/root/")
def exp_dvc(self):
    """Return clone dvc Repo instance."""
    # Imported lazily to avoid a circular import at module load time
    # — presumably; TODO confirm against the module's other imports.
    from dvc.repo import Repo

    repo_dir = self.exp_dvc_dir
    return Repo(repo_dir)
def stages():
    """Return the relpaths of every stage currently in the repo."""
    # Set comprehension rather than set(generator).
    repo = Repo(fspath(tmp_dir))
    return {stage.relpath for stage in repo.stages}
def test_api_init(scm):
    """Init through the Python API must create the .dvc directory."""
    repo = DvcRepo.init()
    repo.close()
    assert os.path.isdir(DvcRepo.DVC_DIR)
def test_init_no_scm_api(tmp_dir):
    """`init(no_scm=True)` creates .dvc and records no_scm in config."""
    repo = DvcRepo.init(no_scm=True)

    dvc_dir = tmp_dir / DvcRepo.DVC_DIR
    assert dvc_dir.is_dir()
    assert repo.config["core"]["no_scm"]
def test_api(self):
    """Init through the API must raise InitError in this setup."""
    # Callable form of assertRaises instead of the context-manager form.
    self.assertRaises(InitError, DvcRepo.init)
def reproduce(
    cls,
    dvc_dir: str,
    queue: "Queue",
    rev: str,
    cwd: Optional[str] = None,
    name: Optional[str] = None,
) -> Tuple[bool, Optional[str]]:
    """Run dvc repro and return the result.

    Returns tuple of (exp_hash, force) where exp_hash is the experiment
    hash (or None on error) and force is a bool specifying whether or not
    this experiment should force overwrite any existing duplicates.
    """
    unchanged = []

    # Report (rev, pid) back to the parent process as soon as we start.
    queue.put((rev, os.getpid()))

    def filter_pipeline(stages):
        # Collect the pipeline stages that repro reports as unchanged.
        unchanged.extend(
            [stage for stage in stages if isinstance(stage, PipelineStage)]
        )

    result = None
    force = False
    # BUG FIX: pre-initialize names referenced in `finally` — previously
    # a failure in Repo() / getcwd() raised NameError from the cleanup,
    # masking the original exception.
    scm = None
    old_cwd = None

    try:
        dvc = Repo(dvc_dir)
        old_cwd = os.getcwd()
        new_cwd = cwd if cwd else dvc.root_dir
        os.chdir(new_cwd)
        # BUG FIX: log the directory actually used (`cwd` may be None).
        logger.debug("Running repro in '%s'", new_cwd)

        args_path = os.path.join(dvc.tmp_dir, BaseExecutor.PACKED_ARGS_FILE)
        if os.path.exists(args_path):
            args, kwargs = BaseExecutor.unpack_repro_args(args_path)
            remove(args_path)
        else:
            args = []
            kwargs = {}
        force = kwargs.get("force", False)

        # NOTE: for checkpoint experiments we handle persist outs slightly
        # differently than normal:
        #
        # - checkpoint out may not yet exist if this is the first time this
        #   experiment has been run, this is not an error condition for
        #   experiments
        # - at the start of a repro run, we need to remove the persist out
        #   and restore it to its last known (committed) state (which may
        #   be removed/does not yet exist) so that our executor workspace
        #   is not polluted with the (persistent) out from an unrelated
        #   experiment run
        dvc.checkout(force=True, quiet=True)

        # We cannot use dvc.scm to make commits inside the executor since
        # cached props are not picklable.
        scm = Git()
        checkpoint_func = partial(cls.checkpoint_callback, scm, name)
        stages = dvc.reproduce(
            *args,
            on_unchanged=filter_pipeline,
            checkpoint_func=checkpoint_func,
            **kwargs,
        )

        exp_hash = cls.hash_exp(stages)
        exp_rev = cls.commit(scm, exp_hash, exp_name=name)
        if scm.get_ref(EXEC_CHECKPOINT):
            scm.set_ref(EXEC_CHECKPOINT, exp_rev)
        # BUG FIX: `result` was never assigned, so the function always
        # returned (None, force) even on success, contradicting the
        # documented contract above.
        result = exp_hash
    except UnchangedExperimentError:
        # Nothing changed — not an error; return (None, force).
        pass
    finally:
        if scm is not None:
            scm.close()
            del scm
        if old_cwd:
            os.chdir(old_cwd)

    # ideally we would return stages here like a normal repro() call, but
    # stages is not currently picklable and cannot be returned across
    # multiprocessing calls
    return result, force
def _list_files(repo, path=None):
    """Return the set of paths reported by `Repo.ls` for `repo`/`path`."""
    # Set comprehension instead of set(map(itemgetter("path"), ...)).
    return {entry["path"] for entry in Repo.ls(os.fspath(repo), path)}
def test_api(self):
    """Initialize via the Python API, then run the shared init checks."""
    DvcRepo.init()
    self._test_init()
def _ls(path):
    """List `path` within the external repo fixture."""
    root = os.fspath(erepo_dir)
    return Repo.ls(root, path)
def gc(
    self,
    all_branches=False,
    cloud=False,
    remote=None,
    with_deps=False,
    all_tags=False,
    all_commits=False,
    force=False,
    jobs=None,
    repos=None,
    workspace=False,
):
    """Remove unused cache entries from this repo (and optionally remote).

    Used cache is collected from this repo plus every repo listed in
    `repos`; anything not referenced by the selected revisions
    (workspace / branches / tags / commits) is garbage collected.
    """
    # require `workspace` to be true to come into effect.
    # assume `workspace` to be enabled if any of `all_tags`, `all_commits`,
    # or `all_branches` are enabled.
    _raise_error_if_all_disabled(
        workspace=workspace,
        all_tags=all_tags,
        all_commits=all_commits,
        all_branches=all_branches,
    )

    from contextlib import ExitStack

    from dvc.repo import Repo

    all_repos = []
    if repos:
        # Additional repos whose used cache must also be preserved.
        all_repos = [Repo(path) for path in repos]

    with ExitStack() as stack:
        # Hold every extra repo's lock and state open while collecting
        # the used cache, so the set cannot change underneath us.
        for repo in all_repos:
            stack.enter_context(repo.lock)
            stack.enter_context(repo.state)

        used = NamedCache()
        for repo in all_repos + [self]:
            used.update(
                repo.used_cache(
                    all_branches=all_branches,
                    with_deps=with_deps,
                    all_tags=all_tags,
                    all_commits=all_commits,
                    remote=remote,
                    force=force,
                    jobs=jobs,
                ))

    # Collect each configured cache location, local first.
    _do_gc("local", self.cache.local.gc, used, jobs)

    if self.cache.s3:
        _do_gc("s3", self.cache.s3.gc, used, jobs)

    if self.cache.gs:
        _do_gc("gs", self.cache.gs.gc, used, jobs)

    if self.cache.ssh:
        _do_gc("ssh", self.cache.ssh.gc, used, jobs)

    if self.cache.hdfs:
        _do_gc("hdfs", self.cache.hdfs.gc, used, jobs)

    if self.cache.azure:
        _do_gc("azure", self.cache.azure.gc, used, jobs)

    if cloud:
        # `-c`: also collect the default (or named) remote.
        _do_gc("remote", self.cloud.get_remote(remote, "gc -c").gc, used, jobs)
def setUp(self):
    """Configure the cache to use hardlinks before each test."""
    # Modernized from super(TestShouldNotCheckoutUponCorruptedLocal-
    # HardlinkCache, self) to zero-argument super(), matching the other
    # setUp implementations in this file.
    super().setUp()
    ret = main(["config", "cache.type", "hardlink"])
    self.assertEqual(ret, 0)
    # Re-open the repo so the new cache.type config takes effect.
    self.dvc = DvcRepo(".")
def run(self):
    """Print the repo root (relative to the current dir); always succeed."""
    root = Repo.find_root()
    logger.info(relpath(root))
    return 0
def test_run_without_cmd(kwargs):
    """`run` with no command must raise with the exact error message."""
    with pytest.raises(InvalidArgumentError) as exc:
        Repo().run(**kwargs)

    # Same comparison, actual-first operand order.
    assert str(exc.value) == "command is not specified"
class TestGCMultipleDvcRepos(TestDvcGit):
    """gc() must keep cache entries still used by other repos in `repos`."""

    def _check_cache(self, num):
        # Count every file under .dvc/cache and compare against `num`.
        total = 0
        for root, dirs, files in os.walk(os.path.join(".dvc", "cache")):
            total += len(files)

        self.assertEqual(total, num)

    def setUp(self):
        super(TestGCMultipleDvcRepos, self).setUp()
        # Second git+dvc repo in a separate temp dir...
        self.additional_path = TestDir.mkdtemp()
        self.additional_git = Repo.init(self.additional_path)
        self.additional_dvc = DvcRepo.init(self.additional_path)

        cache_path = os.path.join(self._root_dir, ".dvc", "cache")

        # ...configured (via config.local) to share the main repo's cache,
        # so both repos reference the same cache directory.
        config_path = os.path.join(
            self.additional_path, ".dvc", "config.local"
        )
        cfg = configobj.ConfigObj()
        cfg.filename = config_path
        cfg["cache"] = {"dir": cache_path}
        cfg.write()

        # Re-instantiate so the shared-cache config is actually loaded.
        self.additional_dvc = DvcRepo(self.additional_path)

    def test(self):
        # ADD FILE ONLY IN MAIN PROJECT
        fname = "only_in_first"
        with open(fname, "w+") as fobj:
            fobj.write("only in main repo")
        stages = self.dvc.add(fname)
        self.assertEqual(len(stages), 1)

        # ADD FILE IN MAIN PROJECT THAT IS ALSO IN SECOND PROJECT
        fname = "in_both"
        with open(fname, "w+") as fobj:
            fobj.write("in both repos")
        stages = self.dvc.add(fname)
        self.assertEqual(len(stages), 1)

        # NOTE: chdir matters here — the second repo's files are added
        # from inside its own working directory.
        cwd = os.getcwd()
        os.chdir(self.additional_path)

        # ADD FILE ONLY IN SECOND PROJECT
        fname = os.path.join(self.additional_path, "only_in_second")
        with open(fname, "w+") as fobj:
            fobj.write("only in additional repo")
        stages = self.additional_dvc.add(fname)
        self.assertEqual(len(stages), 1)

        # ADD FILE IN SECOND PROJECT THAT IS ALSO IN MAIN PROJECT
        fname = os.path.join(self.additional_path, "in_both")
        with open(fname, "w+") as fobj:
            fobj.write("in both repos")
        stages = self.additional_dvc.add(fname)
        self.assertEqual(len(stages), 1)

        os.chdir(cwd)

        # 3 unique contents -> 3 cache files in the shared cache.
        self._check_cache(3)

        # gc aware of the second repo must keep everything it uses.
        self.dvc.gc(repos=[self.additional_path])
        self._check_cache(3)

        # gc unaware of it drops the entry only the second repo used.
        self.dvc.gc()
        self._check_cache(2)
def gc(
    self,
    all_branches=False,
    cloud=False,
    remote=None,
    with_deps=False,
    all_tags=False,
    all_commits=False,
    all_experiments=False,
    force=False,
    jobs=None,
    repos=None,
    workspace=False,
):
    """Remove unused objects from the local ODBs (and optionally remote).

    Used objects are collected from this repo and every repo in `repos`
    for the selected revisions; everything else is garbage collected.
    """
    # require `workspace` to be true to come into effect.
    # assume `workspace` to be enabled if any of `all_tags`, `all_commits`,
    # `all_experiments` or `all_branches` are enabled.
    _raise_error_if_all_disabled(
        workspace=workspace,
        all_tags=all_tags,
        all_commits=all_commits,
        all_branches=all_branches,
        all_experiments=all_experiments,
    )

    from contextlib import ExitStack

    from dvc.data.db import get_index
    from dvc.data.gc import gc as ogc
    from dvc.repo import Repo

    if not repos:
        repos = []
    all_repos = [Repo(path) for path in repos]

    used_obj_ids = set()
    with ExitStack() as stack:
        # Hold every extra repo's lock while collecting used objects so
        # the set cannot change underneath us.
        for repo in all_repos:
            stack.enter_context(repo.lock)

        for repo in all_repos + [self]:
            for obj_ids in repo.used_objs(
                all_branches=all_branches,
                with_deps=with_deps,
                all_tags=all_tags,
                all_commits=all_commits,
                all_experiments=all_experiments,
                remote=remote,
                force=force,
                jobs=jobs,
            ).values():
                used_obj_ids.update(obj_ids)

    # Sweep each configured local ODB scheme.
    for scheme, odb in self.odb.by_scheme():
        if not odb:
            continue
        removed = ogc(odb, used_obj_ids, jobs=jobs)
        if not removed:
            logger.info(f"No unused '{scheme}' cache to remove.")

    if not cloud:
        return

    # `-c`: also collect the (default or named) remote ODB, invalidating
    # its cached index when anything was actually removed.
    odb = self.cloud.get_remote_odb(remote, "gc -c")
    removed = ogc(odb, used_obj_ids, jobs=jobs)
    if removed:
        get_index(odb).clear()
    else:
        logger.info("No unused cache to remove from remote.")
def test_init_no_scm_fail_api(tmp_dir):
    """Init must raise InitError in this setup."""
    # Callable form of pytest.raises instead of the context-manager form.
    pytest.raises(InitError, DvcRepo.init)
def test_ls_repo_with_removed_dvc_dir_with_path_file(tmp_dir, dvc, scm):
    """`ls` of a single output file still works without the .dvc dir."""
    create_dvc_pipeline(tmp_dir, dvc)

    target = os.path.join("out", "file")
    listing = Repo.ls(os.fspath(tmp_dir), target)
    match_files(listing, ((("file", ), True), ))
def test_gen_dvcignore(tmp_dir):
    """init must generate a .dvcignore with the exact stock header."""
    DvcRepo.init(no_scm=True)
    expected = ("# Add patterns of files dvc should ignore, which could improve\n"
                "# the performance. Learn more at\n"
                "# https://dvc.org/doc/user-guide/dvcignore\n")
    # Same comparison, actual-first operand order.
    assert (tmp_dir / ".dvcignore").read_text() == expected
def test_ls_not_existed_url():
    """`ls` of a URL that cannot be cloned must raise CloneError."""
    from time import time

    # Time-stamped name that is guaranteed not to exist.
    dirname = f"__not_existed_{time()}"
    with pytest.raises(CloneError):
        Repo.ls(dirname, recursive=True)
def _scm_in_use():
    """Return the class name of the SCM in use, or None outside a repo."""
    try:
        root = Repo.find_root()
        return type(SCM(root_dir=root)).__name__
    except NotDvcRepoError:
        # Not inside a DVC repo — implicitly return None.
        pass
class TestReproExternalHTTP(TestReproExternalBase):
    """Import / run / pull against ad-hoc local HTTP file servers."""

    _external_cache_id = None

    @staticmethod
    def get_remote(port):
        # URL of a StaticFileServer bound to localhost:`port`.
        return "http://localhost:{}/".format(port)

    @property
    def local_cache(self):
        return os.path.join(self.dvc.dvc_dir, "cache")

    def test(self):
        # Import
        with StaticFileServer() as httpd:
            import_url = urljoin(self.get_remote(httpd.server_port), self.FOO)
            import_output = "imported_file"
            import_stage = self.dvc.imp_url(import_url, import_output)

        self.assertTrue(os.path.exists(import_output))
        self.assertTrue(filecmp.cmp(import_output, self.FOO, shallow=False))

        self.dvc.remove("imported_file.dvc")

        # Import again from a server that also reports Content-MD5.
        with StaticFileServer(handler_class=ContentMD5Handler) as httpd:
            import_url = urljoin(self.get_remote(httpd.server_port), self.FOO)
            import_output = "imported_file"
            import_stage = self.dvc.imp_url(import_url, import_output)

        self.assertTrue(os.path.exists(import_output))
        self.assertTrue(filecmp.cmp(import_output, self.FOO, shallow=False))

        # Run --deps
        with StaticFileServer() as httpd:
            remote = self.get_remote(httpd.server_port)

            cache_id = str(uuid.uuid4())
            cache = urljoin(remote, cache_id)

            ret1 = main(["remote", "add", "mycache", cache])
            ret2 = main(["remote", "add", "myremote", remote])
            self.assertEqual(ret1, 0)
            self.assertEqual(ret2, 0)

            # Reload so the newly added remotes are visible.
            self.dvc = DvcRepo(".")

            run_dependency = urljoin(remote, self.BAR)
            run_output = "remote_file"
            cmd = 'open("{}", "w+")'.format(run_output)

            with open("create-output.py", "w") as fd:
                fd.write(cmd)

            run_stage = self.dvc.run(
                deps=[run_dependency],
                outs=[run_output],
                cmd="python create-output.py",
            )
            self.assertTrue(run_stage is not None)
            self.assertTrue(os.path.exists(run_output))

            # Pull
            self.dvc.remove(import_stage.path, outs_only=True)
            self.assertFalse(os.path.exists(import_output))

            # Move the local cache into the served directory so the pull
            # must fetch through the "mycache" HTTP remote.
            shutil.move(self.local_cache, cache_id)
            self.assertFalse(os.path.exists(self.local_cache))

            self.dvc.pull([import_stage.path], remote="mycache")
            self.assertTrue(os.path.exists(import_output))
def setUp(self):
    """Initialize a DVC repo in the test root and commit the init."""
    # Modernized from super(TestDvcFixture, self) to zero-argument
    # super(), matching the other setUp implementations in this file.
    super().setUp()
    self.dvc = DvcRepo.init(self._root_dir)
    self.dvc.scm.commit("init dvc")
class TestReproExternalBase(TestDvc):
    """Shared import/run/repro scenario for external (cloud) locations.

    Subclasses override `should_test`, `scheme`, `bucket`, `cmd` and
    `write` for a concrete storage backend; the base is skipped.
    """

    @staticmethod
    def should_test():
        # Base class is never run directly; subclasses opt in.
        return False

    @property
    def cache_scheme(self):
        return self.scheme

    @property
    def cache_type(self):
        return "copy"

    @property
    def scheme(self):
        # Overridden by subclasses (e.g. "s3", "gs", ...).
        return None

    @property
    def scheme_sep(self):
        return "://"

    @property
    def sep(self):
        return "/"

    def check_already_cached(self, stage):
        """Re-run `stage` and assert only checkout happens (no download/run)."""
        stage.outs[0].remove()

        patch_download = patch.object(stage.deps[0],
                                      "download",
                                      wraps=stage.deps[0].download)

        patch_checkout = patch.object(stage.outs[0],
                                      "checkout",
                                      wraps=stage.outs[0].checkout)

        patch_run = patch.object(stage, "_run", wraps=stage._run)

        with self.dvc.lock, self.dvc.state:
            with patch_download as mock_download:
                with patch_checkout as mock_checkout:
                    with patch_run as mock_run:
                        # Temporarily unlock so run() is allowed.
                        stage.locked = False
                        stage.run()
                        stage.locked = True

                        # Everything was cached: no command execution,
                        # no download — just a checkout from cache.
                        mock_run.assert_not_called()
                        mock_download.assert_not_called()
                        mock_checkout.assert_called_once()

    @patch("dvc.prompt.confirm", return_value=True)
    def test(self, mock_prompt):
        if not self.should_test():
            raise SkipTest("Test {} is disabled".format(
                self.__class__.__name__))

        # External cache location, unique per run.
        cache = (self.scheme + self.scheme_sep + self.bucket + self.sep +
                 str(uuid.uuid4()))

        ret = main(["config", "cache." + self.cache_scheme, "myrepo"])
        self.assertEqual(ret, 0)
        ret = main(["remote", "add", "myrepo", cache])
        self.assertEqual(ret, 0)
        ret = main(["remote", "modify", "myrepo", "type", self.cache_type])
        self.assertEqual(ret, 0)

        remote_name = "myremote"
        remote_key = str(uuid.uuid4())
        remote = (self.scheme + self.scheme_sep + self.bucket + self.sep +
                  remote_key)

        ret = main(["remote", "add", remote_name, remote])
        self.assertEqual(ret, 0)
        ret = main(["remote", "modify", remote_name, "type", self.cache_type])
        self.assertEqual(ret, 0)

        # Reload so the new remote/cache config is in effect.
        self.dvc = DvcRepo(".")

        foo_key = remote_key + self.sep + self.FOO
        bar_key = remote_key + self.sep + self.BAR

        foo_path = (self.scheme + self.scheme_sep + self.bucket + self.sep +
                    foo_key)
        bar_path = (self.scheme + self.scheme_sep + self.bucket + self.sep +
                    bar_key)

        # Using both plain and remote notation
        out_foo_path = "remote://" + remote_name + "/" + self.FOO
        out_bar_path = bar_path

        self.write(self.bucket, foo_key, self.FOO_CONTENTS)

        import_stage = self.dvc.imp_url(out_foo_path, "import")

        self.assertTrue(os.path.exists("import"))
        self.assertTrue(filecmp.cmp("import", self.FOO, shallow=False))
        self.assertEqual(self.dvc.status([import_stage.path]), {})
        self.check_already_cached(import_stage)

        import_remote_stage = self.dvc.imp_url(out_foo_path,
                                               out_foo_path + "_imported")
        self.assertEqual(self.dvc.status([import_remote_stage.path]), {})

        cmd_stage = self.dvc.run(
            outs=[out_bar_path],
            deps=[out_foo_path],
            cmd=self.cmd(foo_path, bar_path),
        )

        self.assertEqual(self.dvc.status([cmd_stage.path]), {})
        self.assertEqual(self.dvc.status(), {})
        self.check_already_cached(cmd_stage)

        # Change the external dependency; statuses must become dirty.
        self.write(self.bucket, foo_key, self.BAR_CONTENTS)

        self.assertNotEqual(self.dvc.status(), {})

        self.dvc.update(import_stage.path)
        self.assertTrue(os.path.exists("import"))
        self.assertTrue(filecmp.cmp("import", self.BAR, shallow=False))
        self.assertEqual(self.dvc.status([import_stage.path]), {})

        self.dvc.update(import_remote_stage.path)
        self.assertEqual(self.dvc.status([import_remote_stage.path]), {})

        stages = self.dvc.reproduce(cmd_stage.path)
        self.assertEqual(len(stages), 1)
        self.assertEqual(self.dvc.status([cmd_stage.path]), {})

        # gc must not remove anything still referenced.
        self.assertEqual(self.dvc.status(), {})
        self.dvc.gc()
        self.assertEqual(self.dvc.status(), {})

        # Removing outs and checking out from cache restores clean status.
        self.dvc.remove(cmd_stage.path, outs_only=True)
        self.assertNotEqual(self.dvc.status([cmd_stage.path]), {})

        self.dvc.checkout([cmd_stage.path], force=True)
        self.assertEqual(self.dvc.status([cmd_stage.path]), {})
def test_absolute_file_outside_repo(tmp_dir, erepo_dir):
    """Getting an absolute path outside the erepo must fail."""
    url = fspath(erepo_dir)
    with pytest.raises(PathMissingError):
        Repo.get(url, "/root/")
def test(self, mock_prompt):
    """End-to-end import/run/update/repro/gc cycle on external storage."""
    if not self.should_test():
        raise SkipTest("Test {} is disabled".format(
            self.__class__.__name__))

    # External cache location, unique per run.
    cache = (self.scheme + self.scheme_sep + self.bucket + self.sep +
             str(uuid.uuid4()))

    ret = main(["config", "cache." + self.cache_scheme, "myrepo"])
    self.assertEqual(ret, 0)
    ret = main(["remote", "add", "myrepo", cache])
    self.assertEqual(ret, 0)
    ret = main(["remote", "modify", "myrepo", "type", self.cache_type])
    self.assertEqual(ret, 0)

    remote_name = "myremote"
    remote_key = str(uuid.uuid4())
    remote = (self.scheme + self.scheme_sep + self.bucket + self.sep +
              remote_key)

    ret = main(["remote", "add", remote_name, remote])
    self.assertEqual(ret, 0)
    ret = main(["remote", "modify", remote_name, "type", self.cache_type])
    self.assertEqual(ret, 0)

    # Reload so the new remote/cache config is in effect.
    self.dvc = DvcRepo(".")

    foo_key = remote_key + self.sep + self.FOO
    bar_key = remote_key + self.sep + self.BAR

    foo_path = (self.scheme + self.scheme_sep + self.bucket + self.sep +
                foo_key)
    bar_path = (self.scheme + self.scheme_sep + self.bucket + self.sep +
                bar_key)

    # Using both plain and remote notation
    out_foo_path = "remote://" + remote_name + "/" + self.FOO
    out_bar_path = bar_path

    self.write(self.bucket, foo_key, self.FOO_CONTENTS)

    import_stage = self.dvc.imp_url(out_foo_path, "import")

    self.assertTrue(os.path.exists("import"))
    self.assertTrue(filecmp.cmp("import", self.FOO, shallow=False))
    self.assertEqual(self.dvc.status([import_stage.path]), {})
    self.check_already_cached(import_stage)

    import_remote_stage = self.dvc.imp_url(out_foo_path,
                                           out_foo_path + "_imported")
    self.assertEqual(self.dvc.status([import_remote_stage.path]), {})

    cmd_stage = self.dvc.run(
        outs=[out_bar_path],
        deps=[out_foo_path],
        cmd=self.cmd(foo_path, bar_path),
    )

    self.assertEqual(self.dvc.status([cmd_stage.path]), {})
    self.assertEqual(self.dvc.status(), {})
    self.check_already_cached(cmd_stage)

    # Change the external dependency; statuses must become dirty.
    self.write(self.bucket, foo_key, self.BAR_CONTENTS)

    self.assertNotEqual(self.dvc.status(), {})

    self.dvc.update(import_stage.path)
    self.assertTrue(os.path.exists("import"))
    self.assertTrue(filecmp.cmp("import", self.BAR, shallow=False))
    self.assertEqual(self.dvc.status([import_stage.path]), {})

    self.dvc.update(import_remote_stage.path)
    self.assertEqual(self.dvc.status([import_remote_stage.path]), {})

    stages = self.dvc.reproduce(cmd_stage.path)
    self.assertEqual(len(stages), 1)
    self.assertEqual(self.dvc.status([cmd_stage.path]), {})

    # gc must not remove anything still referenced.
    self.assertEqual(self.dvc.status(), {})
    self.dvc.gc()
    self.assertEqual(self.dvc.status(), {})

    # Removing outs and checking out from cache restores clean status.
    self.dvc.remove(cmd_stage.path, outs_only=True)
    self.assertNotEqual(self.dvc.status([cmd_stage.path]), {})

    self.dvc.checkout([cmd_stage.path], force=True)
    self.assertEqual(self.dvc.status([cmd_stage.path]), {})
def test_unknown_path(tmp_dir, erepo_dir):
    """Getting a nonexistent path must raise PathMissingError."""
    # Callable form of pytest.raises instead of the context-manager form.
    pytest.raises(
        PathMissingError, Repo.get, fspath(erepo_dir), "a_non_existing_file"
    )
def test_get_a_dvc_file(tmp_dir, erepo_dir):
    """Getting a .dvc file directly must raise GetDVCFileError."""
    # Callable form of pytest.raises instead of the context-manager form.
    pytest.raises(
        GetDVCFileError, Repo.get, os.fspath(erepo_dir), "some_file.dvc"
    )