def test_get_hash_cached_granular(tmp_dir, dvc, mocker):
    tmp_dir.dvc_gen(
        {"dir": {"foo": "foo", "bar": "bar", "subdir": {"data": "data"}}}
    )
    fs = RepoFileSystem(dvc)
    subdir = PathInfo(tmp_dir) / "dir" / "subdir"
    assert fs.info(subdir).get("md5") is None
    assert stage(dvc.odb.local, subdir, fs, "md5").hash_info == HashInfo(
        "md5", "af314506f1622d107e0ed3f14ec1a3b5.dir"
    )
    assert fs.info(subdir / "data").get("md5") is None
    assert stage(
        dvc.odb.local, subdir / "data", fs, "md5"
    ).hash_info == HashInfo("md5", "8d777f385d3dfec8815d20f7496026dc")
    (tmp_dir / "dir" / "subdir" / "data").unlink()
    assert (
        fs.info(subdir / "data")["md5"] == "8d777f385d3dfec8815d20f7496026dc"
    )

def test_dir_hash_should_be_key_order_agnostic(tmp_dir, dvc):
    from dvc.objects.stage import stage
    from dvc.objects.tree import Tree

    tmp_dir.gen({"data": {"1": "1 content", "2": "2 content"}})
    path_info = PathInfo("data")

    tree = Tree.from_list(
        [{"relpath": "1", "md5": "1"}, {"relpath": "2", "md5": "2"}]
    )
    tree.digest()
    with patch("dvc.objects.stage._get_tree_obj", return_value=tree):
        hash1 = stage(
            dvc.odb.local, path_info, dvc.odb.local.fs, "md5"
        ).hash_info

    tree = Tree.from_list(
        [{"md5": "1", "relpath": "1"}, {"md5": "2", "relpath": "2"}]
    )
    tree.digest()
    with patch("dvc.objects.stage._get_tree_obj", return_value=tree):
        hash2 = stage(
            dvc.odb.local, path_info, dvc.odb.local.fs, "md5"
        ).hash_info

    assert hash1 == hash2

def test_get_hash_cached_dir(tmp_dir, dvc, mocker):
    tmp_dir.dvc_gen(
        {"dir": {"foo": "foo", "bar": "bar", "subdir": {"data": "data"}}}
    )
    fs = RepoFileSystem(dvc)
    expected = "8761c4e9acad696bee718615e23e22db.dir"
    assert fs.info(PathInfo(tmp_dir) / "dir").get("md5") is None
    assert stage(
        dvc.odb.local, PathInfo(tmp_dir) / "dir", fs, "md5"
    ).hash_info == HashInfo("md5", expected)

    shutil.rmtree(tmp_dir / "dir")
    assert fs.info(PathInfo(tmp_dir) / "dir")["md5"] == expected
    assert stage(
        dvc.odb.local, PathInfo(tmp_dir) / "dir", fs, "md5"
    ).hash_info == HashInfo("md5", expected)

def test_dir_hash_should_be_key_order_agnostic(tmp_dir, dvc):
    from dvc.objects.stage import stage
    from dvc.objects.tree import Tree

    tmp_dir.gen({"data": {"1": "1 content", "2": "2 content"}})
    path = (tmp_dir / "data").fs_path

    tree = Tree.from_list(
        [{"relpath": "1", "md5": "1"}, {"relpath": "2", "md5": "2"}]
    )
    tree.digest()
    with patch("dvc.objects.stage._stage_tree", return_value=(None, tree)):
        _, _, obj = stage(dvc.odb.local, path, dvc.odb.local.fs, "md5")
        hash1 = obj.hash_info

    tree = Tree.from_list(
        [{"md5": "1", "relpath": "1"}, {"md5": "2", "relpath": "2"}]
    )
    tree.digest()
    with patch("dvc.objects.stage._stage_tree", return_value=(None, tree)):
        _, _, obj = stage(dvc.odb.local, path, dvc.odb.local.fs, "md5")
        hash2 = obj.hash_info

    assert hash1 == hash2

def test_dir_hash_should_be_key_order_agnostic(tmp_dir, dvc):
    from dvc.objects.stage import stage

    tmp_dir.gen({"data": {"1": "1 content", "2": "2 content"}})
    path_info = PathInfo("data")

    dir_info = DirInfo.from_list(
        [{"relpath": "1", "md5": "1"}, {"relpath": "2", "md5": "2"}]
    )
    with patch("dvc.objects.stage._collect_dir", return_value=dir_info):
        hash1 = stage(dvc.odb.local, path_info, dvc.odb.local.fs).hash_info

    dir_info = DirInfo.from_list(
        [{"md5": "1", "relpath": "1"}, {"md5": "2", "relpath": "2"}]
    )
    with patch("dvc.objects.stage._collect_dir", return_value=dir_info):
        hash2 = stage(dvc.odb.local, path_info, dvc.odb.local.fs).hash_info

    assert hash1 == hash2

def test_fetch_external_repo_jobs(tmp_dir, scm, mocker, dvc, local_remote):
    tmp_dir.dvc_gen(
        {
            "dir1": {
                "file1": "file1",
                "file2": "file2",
                "file3": "file3",
                "file4": "file4",
            },
        },
        commit="init",
    )
    dvc.push()

    with external_repo(str(tmp_dir)) as repo:
        spy = mocker.spy(repo.cloud, "pull")

        obj = stage(
            dvc.odb.local,
            PathInfo(repo.root_dir) / "dir1",
            repo.repo_fs,
            follow_subrepos=False,
            jobs=3,
        )
        save(dvc.odb.local, obj, jobs=3)

        run_jobs = tuple(spy.call_args_list[0])[1].get("jobs")
        assert run_jobs == 3

def test_get_hash_dirty_file(tmp_dir, dvc):
    tmp_dir.dvc_gen("file", "file")
    (tmp_dir / "file").write_text("something")

    fs = RepoFileSystem(dvc)
    assert fs.info(PathInfo(tmp_dir) / "file").get("md5") is None
    actual = stage(dvc.odb.local, PathInfo(tmp_dir) / "file", fs).hash_info
    expected = HashInfo("md5", "437b930db84b8079c2dd804a71936b5f")
    assert actual == expected

    (tmp_dir / "file").unlink()
    assert (
        fs.info(PathInfo(tmp_dir) / "file")["md5"]
        == "8c7dd922ad47494fc02c388e12c00eac"
    )
    actual = stage(dvc.odb.local, PathInfo(tmp_dir) / "file", fs).hash_info
    expected = HashInfo("md5", "8c7dd922ad47494fc02c388e12c00eac")
    assert actual == expected

def _changed(path_info, fs, obj, cache):
    logger.trace("checking if '%s'('%s') has changed.", path_info, obj)

    try:
        check(cache, obj)
    except (FileNotFoundError, ObjectFormatError):
        logger.debug(
            "cache for '%s'('%s') has changed.", path_info, obj.hash_info
        )
        return True

    try:
        actual = stage(cache, path_info, fs, obj.hash_info.name).hash_info
    except FileNotFoundError:
        logger.debug("'%s' doesn't exist.", path_info)
        return True

    if obj.hash_info != actual:
        # note: argument order must match the message slots
        # (hash, path, actual)
        logger.debug(
            "hash value '%s' for '%s' has changed (actual '%s').",
            obj.hash_info,
            path_info,
            actual,
        )
        return True

    logger.trace("'%s' hasn't changed.", path_info)
    return False

def download(self, to, jobs=None):
    from dvc.checkout import checkout
    from dvc.config import NoRemoteError
    from dvc.exceptions import NoOutputOrStageError
    from dvc.objects import save
    from dvc.objects.stage import stage

    odb = self.repo.odb.local

    with self._make_repo(cache_dir=odb.cache_dir) as repo:
        if self.def_repo.get(self.PARAM_REV_LOCK) is None:
            self.def_repo[self.PARAM_REV_LOCK] = repo.get_rev()

        path_info = PathInfo(repo.root_dir) / self.def_path
        try:
            repo.fetch([path_info.fspath], jobs=jobs, recursive=True)
        except (NoOutputOrStageError, NoRemoteError):
            pass

        obj = stage(
            odb,
            path_info,
            repo.repo_fs,
            jobs=jobs,
            follow_subrepos=False,
        )
        save(odb, obj, jobs=jobs)

    checkout(to.path_info, to.fs, obj, odb)

def _diff(
    path_info,
    fs,
    obj,
    cache,
    relink=False,
    dvcignore: Optional[DvcIgnoreFilter] = None,
):
    old = None
    try:
        _, old = stage(
            cache,
            path_info,
            fs,
            obj.hash_info.name if obj else cache.fs.PARAM_CHECKSUM,
            dry_run=True,
            dvcignore=dvcignore,
        )
    except FileNotFoundError:
        pass

    diff = odiff(old, obj, cache)

    if relink:
        diff.modified.extend(diff.unchanged)

    return diff

def test_staging_file(tmp_dir, dvc):
    from dvc.objects import check
    from dvc.objects.stage import stage
    from dvc.objects.transfer import transfer

    tmp_dir.gen("foo", "foo")
    fs = LocalFileSystem()

    local_odb = dvc.odb.local
    staging_odb, obj = stage(local_odb, tmp_dir / "foo", fs, "md5")

    assert not local_odb.exists(obj.hash_info)
    assert staging_odb.exists(obj.hash_info)

    with pytest.raises(FileNotFoundError):
        check(local_odb, obj)
    check(staging_odb, obj)

    transfer(staging_odb, local_odb, {obj.hash_info}, move=True)
    check(local_odb, obj)
    with pytest.raises(FileNotFoundError):
        check(staging_odb, obj)

    path_info = local_odb.hash_to_path_info(obj.hash_info.value)
    assert fs.exists(path_info)

def transfer(
    self, from_fs, from_info, jobs=None, update=False, no_progress_bar=False
):
    # When running import-url --to-remote / add --to-remote/-o ..., we
    # assume it is unlikely that the odb already contains the majority of
    # the hashes, so we transfer everything as is (even if a file might
    # already be in the cache) and don't waste an upload on scanning the
    # layout of the source location. But when doing update --to-remote,
    # there is a high probability that the odb already contains some of
    # the hashes, so we first calculate all the hashes (without
    # transferring anything) and then only update the missing cache files.
    upload = not (update and from_fs.isdir(from_info))
    jobs = jobs or min((from_fs.jobs, self.odb.fs.jobs))
    obj = stage(
        self.odb,
        from_info,
        from_fs,
        "md5",
        upload=upload,
        jobs=jobs,
        no_progress_bar=no_progress_bar,
    )
    save(self.odb, obj, jobs=jobs, move=upload)
    return obj.hash_info

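# The decision above between blind upload and hash-then-update reduces to a
# single predicate. A minimal standalone sketch of that heuristic, assuming
# only that `from_fs` exposes an isdir() method (names here are
# illustrative, not DVC's actual API):
def _should_upload(update, from_fs, from_info):
    # Blind upload is the cheap default; only when updating an existing
    # directory is it worth hashing first and transferring just the
    # missing cache files.
    return not (update and from_fs.isdir(from_info))
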
def test_get_hash_dirty_dir(tmp_dir, dvc):
    tmp_dir.dvc_gen({"dir": {"foo": "foo", "bar": "bar"}})
    (tmp_dir / "dir" / "baz").write_text("baz")

    fs = RepoFileSystem(dvc)
    actual = stage(dvc.odb.local, PathInfo(tmp_dir) / "dir", fs).hash_info
    expected = HashInfo("md5", "ba75a2162ca9c29acecb7957105a0bc2.dir")
    assert actual == expected
    assert actual.dir_info.nfiles == 3

def test_get_hash_cached_file(tmp_dir, dvc, mocker):
    tmp_dir.dvc_gen({"foo": "foo"})
    fs = RepoFileSystem(repo=dvc)
    expected = "acbd18db4cc2f85cedef654fccc4a4d8"
    assert fs.info(PathInfo(tmp_dir) / "foo").get("md5") is None
    _, _, obj = stage(dvc.odb.local, PathInfo(tmp_dir) / "foo", fs, "md5")
    assert obj.hash_info == HashInfo("md5", expected)
    (tmp_dir / "foo").unlink()
    assert fs.info(PathInfo(tmp_dir) / "foo")["md5"] == expected

def test_get_hash_dirty_dir(tmp_dir, dvc):
    tmp_dir.dvc_gen({"dir": {"foo": "foo", "bar": "bar"}})
    (tmp_dir / "dir" / "baz").write_text("baz")

    fs = DvcFileSystem(dvc)
    expected = "5ea40360f5b4ec688df672a4db9c17d1.dir"
    assert fs.info(PathInfo(tmp_dir) / "dir").get("md5") == expected
    assert stage(
        dvc.odb.local, PathInfo(tmp_dir) / "dir", fs, "md5"
    ).hash_info == HashInfo("md5", expected)

def test_get_hash_granular(tmp_dir, dvc):
    tmp_dir.dvc_gen(
        {"dir": {"foo": "foo", "bar": "bar", "subdir": {"data": "data"}}}
    )
    fs = DvcFileSystem(repo=dvc)
    subdir = PathInfo(tmp_dir) / "dir" / "subdir"
    assert fs.info(subdir).get("md5") is None
    _, _, obj = stage(dvc.odb.local, subdir, fs, "md5", dry_run=True)
    assert obj.hash_info == HashInfo(
        "md5", "af314506f1622d107e0ed3f14ec1a3b5.dir"
    )
    assert (
        fs.info(subdir / "data")["md5"] == "8d777f385d3dfec8815d20f7496026dc"
    )
    _, _, obj = stage(
        dvc.odb.local, subdir / "data", fs, "md5", dry_run=True
    )
    assert obj.hash_info == HashInfo(
        "md5", "8d777f385d3dfec8815d20f7496026dc"
    )

def test_get_hash_dirty_file(tmp_dir, dvc):
    tmp_dir.dvc_gen("file", "file")
    (tmp_dir / "file").write_text("something")

    fs = DvcFileSystem(dvc)
    expected = "8c7dd922ad47494fc02c388e12c00eac"
    assert fs.info(PathInfo(tmp_dir) / "file").get("md5") == expected
    assert stage(
        dvc.odb.local, PathInfo(tmp_dir) / "file", fs, "md5"
    ).hash_info == HashInfo("md5", expected)

def test_get_hash_dirty_dir(tmp_dir, dvc):
    tmp_dir.dvc_gen({"dir": {"foo": "foo", "bar": "bar"}})
    (tmp_dir / "dir" / "baz").write_text("baz")
    clean_staging()

    fs = RepoFileSystem(repo=dvc)
    _, meta, obj = stage(dvc.odb.local, PathInfo(tmp_dir) / "dir", fs, "md5")
    assert obj.hash_info == HashInfo(
        "md5", "ba75a2162ca9c29acecb7957105a0bc2.dir"
    )
    assert meta.nfiles == 3

def _get_used_and_obj(
    self, obj_only=False, **kwargs
) -> Tuple[Dict[Optional["ObjectDB"], Set["HashInfo"]], "HashFile"]:
    from dvc.config import NoRemoteError
    from dvc.exceptions import NoOutputOrStageError, PathMissingError
    from dvc.objects.stage import stage
    from dvc.objects.tree import Tree

    local_odb = self.repo.odb.local
    locked = kwargs.pop("locked", True)
    with self._make_repo(
        locked=locked, cache_dir=local_odb.cache_dir
    ) as repo:
        used_obj_ids = defaultdict(set)
        rev = repo.get_rev()
        if locked and self.def_repo.get(self.PARAM_REV_LOCK) is None:
            self.def_repo[self.PARAM_REV_LOCK] = rev

        path_info = PathInfo(repo.root_dir) / str(self.def_path)

        if not obj_only:
            try:
                for odb, obj_ids in repo.used_objs(
                    [os.fspath(path_info)],
                    force=True,
                    jobs=kwargs.get("jobs"),
                    recursive=True,
                ).items():
                    if odb is None:
                        odb = repo.cloud.get_remote_odb()
                        odb.read_only = True
                    self._check_circular_import(odb, obj_ids)
                    used_obj_ids[odb].update(obj_ids)
            except (NoRemoteError, NoOutputOrStageError):
                pass

        try:
            staging, staged_obj = stage(
                local_odb,
                path_info,
                repo.repo_fs,
                local_odb.fs.PARAM_CHECKSUM,
            )
        except FileNotFoundError as exc:
            raise PathMissingError(
                self.def_path, self.def_repo[self.PARAM_URL]
            ) from exc

        staging = copy(staging)
        staging.read_only = True

        self._staged_objs[rev] = staged_obj
        used_obj_ids[staging].add(staged_obj.hash_info)
        if isinstance(staged_obj, Tree):
            used_obj_ids[staging].update(
                entry.hash_info for _, entry in staged_obj
            )
        return used_obj_ids, staged_obj

def test_get_hash_dirty_dir(tmp_dir, dvc):
    tmp_dir.dvc_gen({"dir": {"foo": "foo", "bar": "bar"}})
    (tmp_dir / "dir" / "baz").write_text("baz")

    fs = DvcFileSystem(repo=dvc)
    expected = "5ea40360f5b4ec688df672a4db9c17d1.dir"
    assert fs.info((tmp_dir / "dir").fs_path).get("md5") == expected
    _, _, obj = stage(
        dvc.odb.local, (tmp_dir / "dir").fs_path, fs, "md5", dry_run=True
    )
    assert obj.hash_info == HashInfo("md5", expected)

def _get_hash(self, locked=True):
    from dvc.objects.stage import stage

    with self._make_repo(locked=locked) as repo:
        path_info = PathInfo(repo.root_dir) / self.def_path
        return stage(
            self.repo.odb.local,
            path_info,
            repo.repo_fs,
            self.repo.odb.local.fs.PARAM_CHECKSUM,
        ).hash_info

def _get_hash(self, locked=True):
    from dvc.objects.stage import stage

    with self._make_repo(locked=locked) as repo:
        path_info = PathInfo(repo.root_dir) / self.def_path
        return stage(
            self.repo.odb.local,
            path_info,
            repo.repo_fs,
            follow_subrepos=False,
        ).hash_info

def test_get_hash_dirty_file(tmp_dir, dvc):
    tmp_dir.dvc_gen("file", "file")
    (tmp_dir / "file").write_text("something")

    fs = DvcFileSystem(repo=dvc)
    expected = "8c7dd922ad47494fc02c388e12c00eac"
    assert fs.info((tmp_dir / "file").fs_path).get("md5") == expected
    _, _, obj = stage(
        dvc.odb.local, (tmp_dir / "file").fs_path, fs, "md5", dry_run=True
    )
    assert obj.hash_info == HashInfo("md5", expected)

def test_get_hash_dirty_file(tmp_dir, dvc):
    from dvc.objects import check
    from dvc.objects.errors import ObjectFormatError
    from dvc.objects.stage import get_file_hash

    tmp_dir.dvc_gen("file", "file")
    file_hash_info = HashInfo("md5", "8c7dd922ad47494fc02c388e12c00eac")

    (tmp_dir / "file").write_text("something")
    something_hash_info = HashInfo("md5", "437b930db84b8079c2dd804a71936b5f")

    clean_staging()

    # file is modified in workspace
    # get_file_hash(file) should return workspace hash, not DVC cached hash
    fs = RepoFileSystem(repo=dvc)
    assert fs.info(PathInfo(tmp_dir) / "file").get("md5") is None
    staging, _, obj = stage(
        dvc.odb.local, PathInfo(tmp_dir) / "file", fs, "md5"
    )
    assert obj.hash_info == something_hash_info
    check(staging, obj)

    # file is removed in workspace
    # any staged object referring to modified workspace obj is now invalid
    (tmp_dir / "file").unlink()
    with pytest.raises(ObjectFormatError):
        check(staging, obj)

    # get_file_hash(file) should return DVC cached hash
    assert fs.info(PathInfo(tmp_dir) / "file")["md5"] == file_hash_info.value
    _, hash_info = get_file_hash(
        PathInfo(tmp_dir) / "file", fs, "md5", state=dvc.state
    )
    assert hash_info == file_hash_info

    # tmp_dir/file can be staged even though it is missing in workspace
    # since repofs will use the DVC cached hash (and refer to the local
    # cache object)
    _, _, obj = stage(dvc.odb.local, PathInfo(tmp_dir) / "file", fs, "md5")
    assert obj.hash_info == file_hash_info

def _fetch_external(self, repo_url, repo_rev, files, jobs):
    from dvc.external_repo import external_repo
    from dvc.objects import save
    from dvc.objects.stage import stage
    from dvc.path_info import PathInfo
    from dvc.scm.base import CloneError

    failed = 0
    results = []

    def cb(result):
        results.append(result)

    odb = self.odb.local
    try:
        with external_repo(
            repo_url, repo_rev, cache_dir=odb.cache_dir
        ) as repo:
            root = PathInfo(repo.root_dir)
            for path in files:
                path_info = root / path
                try:
                    used = repo.used_cache(
                        [os.fspath(path_info)],
                        force=True,
                        jobs=jobs,
                        recursive=True,
                    )
                    cb(repo.cloud.pull(used, jobs))
                except (NoOutputOrStageError, NoRemoteError):
                    pass
                obj = stage(
                    odb,
                    path_info,
                    repo.repo_fs,
                    "md5",
                    jobs=jobs,
                    follow_subrepos=False,
                )
                save(odb, obj, jobs=jobs, download_callback=cb)
    except CloneError:
        failed += 1
        logger.exception(
            "failed to fetch data for '{}'".format(", ".join(files))
        )

    return sum(results), failed

def test_get_hash_mixed_dir(tmp_dir, scm, dvc):
    tmp_dir.gen({"dir": {"foo": "foo", "bar": "bar"}})
    tmp_dir.dvc.add(os.path.join("dir", "foo"))
    tmp_dir.scm.add(
        [
            os.path.join("dir", "bar"),
            os.path.join("dir", ".gitignore"),
            os.path.join("dir", "foo.dvc"),
        ]
    )
    tmp_dir.scm.commit("add dir")

    fs = RepoFileSystem(dvc)
    actual = stage(dvc.odb.local, PathInfo(tmp_dir) / "dir", fs).hash_info
    expected = HashInfo("md5", "e1d9e8eae5374860ae025ec84cfd85c7.dir")
    assert actual == expected

def test_get_hash_mixed_dir(tmp_dir, scm, dvc):
    tmp_dir.gen({"dir": {"foo": "foo", "bar": "bar"}})
    tmp_dir.dvc.add(os.path.join("dir", "foo"))
    tmp_dir.scm.add(
        [
            os.path.join("dir", "bar"),
            os.path.join("dir", ".gitignore"),
            os.path.join("dir", "foo.dvc"),
        ]
    )
    tmp_dir.scm.commit("add dir")
    clean_staging()

    fs = RepoFileSystem(repo=dvc)
    _, _, obj = stage(dvc.odb.local, PathInfo(tmp_dir) / "dir", fs, "md5")
    assert obj.hash_info == HashInfo(
        "md5", "e1d9e8eae5374860ae025ec84cfd85c7.dir"
    )

def download(self, to, jobs=None):
    from dvc.checkout import checkout
    from dvc.objects import save
    from dvc.objects.stage import stage

    odb = self.repo.odb.local

    with self._make_repo(cache_dir=odb.cache_dir) as repo:
        if self.def_repo.get(self.PARAM_REV_LOCK) is None:
            self.def_repo[self.PARAM_REV_LOCK] = repo.get_rev()

        path_info = PathInfo(repo.root_dir) / self.def_path
        obj = stage(
            odb,
            path_info,
            repo.repo_fs,
            jobs=jobs,
            follow_subrepos=False,
        )
        save(odb, obj, jobs=jobs)

    checkout(to.path_info, to.fs, obj, odb)

def get_used_objs(
    self, **kwargs
) -> Dict[Optional["ObjectDB"], Set["HashFile"]]:
    from dvc.config import NoRemoteError
    from dvc.exceptions import NoOutputOrStageError, PathMissingError
    from dvc.objects.db.git import GitObjectDB
    from dvc.objects.stage import stage

    local_odb = self.repo.odb.local
    locked = kwargs.pop("locked", True)
    with self._make_repo(
        locked=locked, cache_dir=local_odb.cache_dir
    ) as repo:
        used_objs = defaultdict(set)
        rev = repo.get_rev()
        if locked and self.def_repo.get(self.PARAM_REV_LOCK) is None:
            self.def_repo[self.PARAM_REV_LOCK] = rev

        path_info = PathInfo(repo.root_dir) / str(self.def_path)
        try:
            for odb, objs in repo.used_objs(
                [os.fspath(path_info)],
                force=True,
                jobs=kwargs.get("jobs"),
                recursive=True,
            ).items():
                if odb is None:
                    odb = repo.cloud.get_remote().odb
                self._check_circular_import(odb)
                used_objs[odb].update(objs)
        except (NoRemoteError, NoOutputOrStageError):
            pass

        try:
            staged_obj = stage(
                local_odb,
                path_info,
                repo.repo_fs,
                local_odb.fs.PARAM_CHECKSUM,
            )
        except FileNotFoundError as exc:
            raise PathMissingError(
                self.def_path, self.def_repo[self.PARAM_URL]
            ) from exc

        self._staged_objs[rev] = staged_obj
        git_odb = GitObjectDB(repo.repo_fs, repo.root_dir)
        used_objs[git_odb].add(staged_obj)
        return used_objs

def test_subrepos_are_ignored(tmp_dir, erepo_dir):
    subrepo = erepo_dir / "dir" / "subrepo"
    make_subrepo(subrepo, erepo_dir.scm)
    with erepo_dir.chdir():
        erepo_dir.dvc_gen("dir/foo", "foo", commit="foo")
        erepo_dir.scm_gen("dir/bar", "bar", commit="bar")

    with subrepo.chdir():
        subrepo.dvc_gen({"file": "file"}, commit="add files on subrepo")

    with external_repo(os.fspath(erepo_dir)) as repo:
        repo.repo_fs.download(
            PathInfo(repo.root_dir) / "dir",
            PathInfo(tmp_dir / "out"),
            follow_subrepos=False,
        )
        expected_files = {
            "foo": "foo",
            "bar": "bar",
            ".gitignore": "/foo\n",
        }
        assert (tmp_dir / "out").read_text() == expected_files

        # clear cache to test saving to cache
        cache_dir = tmp_dir / repo.odb.local.cache_dir
        remove(cache_dir)
        clean_staging()
        makedirs(cache_dir)

        staging, _, obj = stage(
            repo.odb.local,
            PathInfo(repo.root_dir) / "dir",
            repo.repo_fs,
            "md5",
            dvcignore=repo.dvcignore,
        )
        transfer(
            staging,
            repo.odb.local,
            {obj.hash_info},
            shallow=False,
            move=True,
        )
        assert set(cache_dir.glob("??/*")) == {
            cache_dir / "e1" / "d9e8eae5374860ae025ec84cfd85c7.dir",
            cache_dir / "37" / "b51d194a7513e45b56f6524f2d51f2",
            cache_dir / "94" / "7d2b84e5aa88170e80dff467a5bfb6",
            cache_dir / "ac" / "bd18db4cc2f85cedef654fccc4a4d8",
        }