def test_dir_hash_should_be_key_order_agnostic(tmp_dir, dvc):
    """The directory hash must not depend on the key order of its entries.

    ``_collect_dir`` is patched to return the same two entries, once with
    ``relpath`` listed before ``md5`` and once the other way round; both
    runs of ``get_hash`` have to produce equal results.
    """
    from dvc.objects.stage import get_hash

    tmp_dir.gen({"data": {"1": "1 content", "2": "2 content"}})
    path_info = PathInfo("data")

    entry_orderings = (
        [{"relpath": "1", "md5": "1"}, {"relpath": "2", "md5": "2"}],
        [{"md5": "1", "relpath": "1"}, {"md5": "2", "relpath": "2"}],
    )

    computed = []
    for entries in entry_orderings:
        dir_info = DirInfo.from_list(entries)
        with patch("dvc.objects.stage._collect_dir", return_value=dir_info):
            computed.append(get_hash(path_info, dvc.odb.local.fs, "md5"))

    hash1, hash2 = computed
    assert hash1 == hash2
def _changed(path_info, fs, obj, cache):
    """Report whether the entry at ``path_info`` differs from ``obj``.

    Args:
        path_info: location of the entry to inspect.
        fs: filesystem on which ``path_info`` is hashed.
        obj: object whose ``hash_info`` is the expected hash.
        cache: passed to ``check`` together with ``obj``.

    Returns:
        True when ``check`` raises ``FileNotFoundError`` or
        ``ObjectFormatError``, when hashing the path raises
        ``FileNotFoundError``, or when the computed hash differs from
        ``obj.hash_info``; False otherwise.
    """
    logger.trace("checking if '%s'('%s') has changed.", path_info, obj)

    try:
        check(cache, obj)
    except (FileNotFoundError, ObjectFormatError):
        logger.debug(
            "cache for '%s'('%s') has changed.", path_info, obj.hash_info
        )
        return True

    try:
        actual = get_hash(path_info, fs, obj.hash_info.name)
    except FileNotFoundError:
        logger.debug("'%s' doesn't exist.", path_info)
        return True

    if obj.hash_info != actual:
        # Arguments must follow the placeholder order of the message:
        # expected hash, then path, then the hash actually computed.
        logger.debug(
            "hash value '%s' for '%s' has changed (actual '%s').",
            obj.hash_info,
            path_info,
            actual,
        )
        return True

    logger.trace("'%s' hasn't changed.", path_info)
    return False
def test_get_hash_cached_dir(tmp_dir, dvc, mocker):
    """``get_hash`` yields the same ``.dir`` hash before and after rmtree.

    While the directory is still in the workspace ``fs.info`` reports no
    ``md5``; once it has been removed, ``fs.info`` reports the expected one.
    """
    tmp_dir.dvc_gen(
        {"dir": {"foo": "foo", "bar": "bar", "subdir": {"data": "data"}}}
    )
    fs = RepoFileSystem(dvc)
    expected = "8761c4e9acad696bee718615e23e22db.dir"
    dir_path = PathInfo(tmp_dir) / "dir"

    # Directory present in the workspace.
    assert fs.info(dir_path).get("md5") is None
    assert get_hash(dir_path, fs, "md5") == HashInfo("md5", expected)

    # Directory removed from the workspace.
    shutil.rmtree(tmp_dir / "dir")
    assert fs.info(dir_path)["md5"] == expected
    assert get_hash(dir_path, fs, "md5") == HashInfo("md5", expected)
def _get_hash(self, locked=True):
    """Return the md5 hash of ``self.def_path`` within the made repo.

    ``locked`` is forwarded to ``self._make_repo``. The path is resolved
    against the repo's ``root_dir`` and hashed through ``repo.repo_fs``
    with ``follow_subrepos=False``.
    """
    from dvc.objects.stage import get_hash

    with self._make_repo(locked=locked) as repo:
        target = PathInfo(repo.root_dir) / self.def_path
        result = get_hash(target, repo.repo_fs, "md5", follow_subrepos=False)
        return result
def get_hash(self):
    """Return the hash info for ``self.path_info``.

    With ``use_cache`` set, the result is the ``hash_info`` of the object
    returned by ``ostage``. Otherwise the module-level ``get_hash`` is
    called with the filesystem's ``PARAM_CHECKSUM`` and the repo's local
    odb.
    """
    if self.use_cache:
        staged = ostage(self.odb, self.path_info, self.fs)
        return staged.hash_info

    return get_hash(
        self.path_info,
        self.fs,
        self.fs.PARAM_CHECKSUM,
        self.repo.odb.local,
    )
def test_get_hash_dirty_dir(tmp_dir, dvc):
    """``DvcFileSystem`` reports the expected hash for a dir with an added file.

    A ``baz`` file is written into the directory after ``dvc_gen``; both
    ``fs.info`` and ``get_hash`` must still return the expected ``.dir`` md5.
    """
    tmp_dir.dvc_gen({"dir": {"foo": "foo", "bar": "bar"}})
    extra_file = tmp_dir / "dir" / "baz"
    extra_file.write_text("baz")

    fs = DvcFileSystem(dvc)
    expected = "5ea40360f5b4ec688df672a4db9c17d1.dir"

    reported = fs.info(PathInfo(tmp_dir) / "dir").get("md5")
    assert reported == expected

    computed = get_hash(PathInfo(tmp_dir) / "dir", fs, "md5")
    assert computed == HashInfo("md5", expected)
def test_get_hash_dirty_file(tmp_dir, dvc):
    """``DvcFileSystem`` reports the expected md5 for an overwritten file.

    The file is rewritten after ``dvc_gen``; both ``fs.info`` and
    ``get_hash`` must return the expected md5 value.
    """
    tmp_dir.dvc_gen("file", "file")
    (tmp_dir / "file").write_text("something")

    fs = DvcFileSystem(dvc)
    expected = "8c7dd922ad47494fc02c388e12c00eac"

    reported = fs.info(PathInfo(tmp_dir) / "file").get("md5")
    assert reported == expected

    computed = get_hash(PathInfo(tmp_dir) / "file", fs, "md5")
    assert computed == HashInfo("md5", expected)
def test_get_hash_dirty_dir(tmp_dir, dvc):
    """``get_hash`` over ``RepoFileSystem`` sees a file added after ``dvc_gen``.

    The resulting hash must equal the expected ``.dir`` md5 and its
    ``dir_info`` must count three files.
    """
    tmp_dir.dvc_gen({"dir": {"foo": "foo", "bar": "bar"}})
    extra_file = tmp_dir / "dir" / "baz"
    extra_file.write_text("baz")

    fs = RepoFileSystem(dvc)
    dir_path = PathInfo(tmp_dir) / "dir"

    actual = get_hash(dir_path, fs, "md5")

    assert actual == HashInfo("md5", "ba75a2162ca9c29acecb7957105a0bc2.dir")
    assert actual.dir_info.nfiles == 3
def test_get_hash_dirty_file(tmp_dir, dvc):
    """Check ``get_hash`` over ``RepoFileSystem`` for a rewritten, then deleted file.

    After the file is rewritten, ``fs.info`` carries no ``md5`` and
    ``get_hash`` returns one expected value; after the file is unlinked,
    ``fs.info`` carries an ``md5`` and ``get_hash`` returns that same value.
    """
    tmp_dir.dvc_gen("file", "file")
    (tmp_dir / "file").write_text("something")
    fs = RepoFileSystem(dvc)

    # Rewritten file still present in the workspace.
    assert fs.info(PathInfo(tmp_dir) / "file").get("md5") is None
    dirty_hash = get_hash(PathInfo(tmp_dir) / "file", fs, "md5")
    assert dirty_hash == HashInfo("md5", "437b930db84b8079c2dd804a71936b5f")

    # File removed from the workspace.
    (tmp_dir / "file").unlink()
    reported = fs.info(PathInfo(tmp_dir) / "file")["md5"]
    assert reported == "8c7dd922ad47494fc02c388e12c00eac"
    removed_hash = get_hash(PathInfo(tmp_dir) / "file", fs, "md5")
    assert removed_hash == HashInfo("md5", "8c7dd922ad47494fc02c388e12c00eac")
def test_get_hash_cached_granular(tmp_dir, dvc, mocker):
    """Check hashes for a subdirectory and a file nested in a generated dir.

    While present in the workspace, neither ``subdir`` nor ``subdir/data``
    has an ``md5`` in ``fs.info``, yet ``get_hash`` returns the expected
    values. After ``data`` is unlinked, ``fs.info`` reports its md5.
    """
    tmp_dir.dvc_gen(
        {"dir": {"foo": "foo", "bar": "bar", "subdir": {"data": "data"}}}
    )
    fs = RepoFileSystem(dvc)
    subdir = PathInfo(tmp_dir) / "dir" / "subdir"
    data_md5 = "8d777f385d3dfec8815d20f7496026dc"

    assert fs.info(subdir).get("md5") is None
    subdir_hash = get_hash(subdir, fs, "md5")
    assert subdir_hash == HashInfo(
        "md5", "af314506f1622d107e0ed3f14ec1a3b5.dir"
    )

    assert fs.info(subdir / "data").get("md5") is None
    data_hash = get_hash(subdir / "data", fs, "md5")
    assert data_hash == HashInfo("md5", data_md5)

    (tmp_dir / "dir" / "subdir" / "data").unlink()
    assert fs.info(subdir / "data")["md5"] == data_md5
def test_get_hash_cached_file(tmp_dir, dvc, mocker):
    """Check the hash of a generated file before and after it is unlinked.

    While the file is present, ``fs.info`` has no ``md5`` but ``get_hash``
    returns the expected value; after unlinking, ``fs.info`` reports it.
    """
    tmp_dir.dvc_gen({"foo": "foo"})
    fs = RepoFileSystem(dvc)
    expected = "acbd18db4cc2f85cedef654fccc4a4d8"

    assert fs.info(PathInfo(tmp_dir) / "foo").get("md5") is None
    computed = get_hash(PathInfo(tmp_dir) / "foo", fs, "md5")
    assert computed == HashInfo("md5", expected)

    (tmp_dir / "foo").unlink()
    reported = fs.info(PathInfo(tmp_dir) / "foo")["md5"]
    assert reported == expected
def test_get_hash_mixed_dir(tmp_dir, scm, dvc):
    """``get_hash`` handles a directory holding both DVC- and SCM-added files.

    ``dir/foo`` is added through DVC, while ``dir/bar``, ``dir/.gitignore``
    and ``dir/foo.dvc`` are added and committed through SCM; the directory
    hash must equal the expected ``.dir`` md5.
    """
    tmp_dir.gen({"dir": {"foo": "foo", "bar": "bar"}})
    tmp_dir.dvc.add(os.path.join("dir", "foo"))

    scm_files = [
        os.path.join("dir", name) for name in ("bar", ".gitignore", "foo.dvc")
    ]
    tmp_dir.scm.add(scm_files)
    tmp_dir.scm.commit("add dir")

    fs = RepoFileSystem(dvc)
    actual = get_hash(PathInfo(tmp_dir) / "dir", fs, "md5")

    assert actual == HashInfo("md5", "e1d9e8eae5374860ae025ec84cfd85c7.dir")
def _remove(path_info, fs, cache, force=False):
    """Remove ``path_info`` from ``fs``, asking first if it is not in ``cache``.

    Does nothing when the path does not exist. With ``force`` the path is
    removed straight away. Otherwise the path is hashed, and the matching
    object is loaded from ``cache`` and checked; if that raises
    ``FileNotFoundError`` or ``ObjectFormatError`` the user is prompted.

    Raises:
        ConfirmRemoveError: the user declined the confirmation prompt.
    """
    if not fs.exists(path_info):
        return

    if not force:
        hash_info = get_hash(path_info, fs, fs.PARAM_CHECKSUM)
        try:
            cached_obj = load(cache, hash_info)
            check(cache, cached_obj)
        except (FileNotFoundError, ObjectFormatError):
            question = (
                f"file/directory '{path_info}' is going to be removed. "
                "Are you sure you want to proceed?"
            )
            confirmed = prompt.confirm(question)
            if not confirmed:
                raise ConfirmRemoveError(str(path_info))

    # Reached when forced, when the cache check passed, or after confirmation.
    fs.remove(path_info)
def get_url(path, repo=None, rev=None, remote=None):
    """
    Return the storage URL of a data file or directory tracked in a DVC repo.

    For Git repos, HEAD is used unless `rev` is given. The default remote
    is tried unless `remote` is given.

    Raises OutputNotFoundError if the path is not tracked by DVC.

    NOTE: The actual existence of the file or directory in the remote
    storage is not checked by this function.
    """
    with Repo.open(repo, rev=rev, subrepos=True, uninitialized=True) as _repo:
        target = PathInfo(_repo.root_dir) / path

        missing = PathMissingError(path, repo)
        with reraise(FileNotFoundError, missing):
            meta = _repo.repo_fs.metadata(target)

        if not meta.is_dvc:
            raise OutputNotFoundError(path, repo)

        cloud = meta.repo.cloud
        checksum = get_hash(target, _repo.repo_fs, "md5").value
        return cloud.get_url_for(remote, checksum=checksum)
def _to_checksum(output):
    """Return the checksum value for ``output``.

    When ``on_working_fs`` (taken from the enclosing scope) is truthy, the
    md5 is computed from ``output.path_info`` on the repo's local odb
    filesystem; otherwise the value already held in ``output.hash_info``
    is returned.
    """
    if not on_working_fs:
        return output.hash_info.value

    computed = get_hash(output.path_info, repo.odb.local.fs, "md5")
    return computed.value