def test_walk_onerror(tmp_dir, dvc):
    """Check that ``walk`` only surfaces errors through ``onerror``.

    For a path that does not exist and for a path that is a file rather
    than a directory, walking without ``onerror`` must complete without
    raising, while an ``onerror`` callback that re-raises its argument
    must make the walk fail with ``OSError``.
    """

    def reraise(error):
        raise error

    tmp_dir.dvc_gen("foo", "foo")
    fs = DvcFileSystem(dvc)

    # "dir" does not exist; "foo" exists but is not a directory.
    for bad_path in ("dir", "foo"):
        list(fs.walk(bad_path))

        with pytest.raises(OSError):
            list(fs.walk(bad_path, onerror=reraise))
def dvcfs(self):
    """Return a new ``DvcFileSystem`` constructed with ``repo=self``."""
    # Imported at call time rather than at module level, as in the original.
    from dvc.fs.dvc import DvcFileSystem

    filesystem = DvcFileSystem(repo=self)
    return filesystem
def test_get_hash_file(tmp_dir, dvc):
    """``info`` on a DVC-generated file exposes the expected ``md5`` value."""
    tmp_dir.dvc_gen({"foo": "foo"})
    fs = DvcFileSystem(repo=dvc)

    foo_path = (tmp_dir / "foo").fs_path
    foo_info = fs.info(foo_path)

    assert foo_info["md5"] == "acbd18db4cc2f85cedef654fccc4a4d8"
def test_isdvc(tmp_dir, dvc):
    """``isdvc`` is true for the file passed to ``dvc.add`` and false otherwise."""
    tmp_dir.gen({"foo": "foo", "bar": "bar"})
    dvc.add("foo")  # only "foo" is added; "bar" stays a plain file

    fs = DvcFileSystem(repo=dvc)

    foo_is_dvc = fs.isdvc("foo")
    assert foo_is_dvc

    bar_is_dvc = fs.isdvc("bar")
    assert not bar_is_dvc
def test_get_key(tmp_dir, dvc, path, key):
    """The wrapped filesystem's ``_get_key`` maps ``path`` to ``key``."""
    dvc_fs = DvcFileSystem(repo=dvc)

    actual_key = dvc_fs.fs._get_key(path)

    assert actual_key == key
def test_walk_not_a_dir(tmp_dir, dvc):
    """Walking a path that is a file, not a directory, must not raise."""
    tmp_dir.dvc_gen("foo", "foo")
    fs = DvcFileSystem(repo=dvc)

    # Exhaust the walk; the test passes as long as nothing is raised.
    list(fs.walk("foo"))
def test_walk_missing(tmp_dir, dvc):
    """Walking a path that was never created must not raise."""
    fs = DvcFileSystem(repo=dvc)

    # Exhaust the walk; the test passes as long as nothing is raised.
    list(fs.walk("dir"))
def test_get_hash_file(tmp_dir, dvc):
    """``info`` on a DVC-generated file exposes the expected ``md5`` value."""
    tmp_dir.dvc_gen({"foo": "foo"})
    fs = DvcFileSystem(dvc)

    foo_info = fs.info(PathInfo(tmp_dir) / "foo")

    assert foo_info["md5"] == "acbd18db4cc2f85cedef654fccc4a4d8"
def test_isdir_isfile(tmp_dir, dvc):
    """Exercise ``isdir``/``isfile``/``isdvc`` before and after ``dvc.add``.

    The first group of assertions runs on plain workspace files. The
    second group runs after the entries were added to DVC and their
    workspace copies were deleted.
    """
    workspace_layout = {
        "datafile": "data",
        "datadir": {"foo": "foo", "bar": "bar"},
        "subdir": {
            "baz": "baz",
            "data": {"abc": "abc", "xyz": "xyz"},
        },
    }
    tmp_dir.gen(workspace_layout)

    fs = DvcFileSystem(repo=dvc)

    # Before anything is added to DVC.
    assert fs.isdir("datadir")
    assert not fs.isfile("datadir")
    assert not fs.isdvc("datadir")
    assert not fs.isdir("datafile")
    assert fs.isfile("datafile")
    assert not fs.isdvc("datafile")

    to_track = [
        "datadir",
        "datafile",
        os.path.join("subdir", "baz"),
        os.path.join("subdir", "data"),
    ]
    dvc.add(to_track)

    # Delete the workspace copies of everything that was just added.
    shutil.rmtree(tmp_dir / "datadir")
    shutil.rmtree(tmp_dir / "subdir" / "data")
    (tmp_dir / "datafile").unlink()
    (tmp_dir / "subdir" / "baz").unlink()

    assert fs.isdir("datadir")
    assert not fs.isfile("datadir")
    assert fs.isdvc("datadir")
    assert not fs.isdir("datafile")
    assert fs.isfile("datafile")
    assert fs.isdvc("datafile")

    # "subdir" itself was not added, only entries below it.
    assert fs.isdir("subdir")
    assert not fs.isfile("subdir")
    assert not fs.isdvc("subdir")
    assert fs.isfile("subdir/baz")
    assert fs.isdir("subdir/data")
def test_dvcfs_no_subrepos(tmp_dir, dvc, scm):
    """A ``DvcFileSystem`` built without ``subrepos`` ignores nested repos.

    The walk from "/" must list exactly the outer repo's entries, and
    lookups for paths inside the nested repo must all come back false.
    """
    tmp_dir.scm_gen(
        {"dir": {"repo.txt": "file to confuse DvcFileSystem"}},
        commit="dir/repo.txt",
    )
    tmp_dir.dvc_gen({"lorem": "lorem"}, commit="add foo")

    nested = tmp_dir / "dir" / "repo"
    make_subrepo(nested, scm)
    with nested.chdir():
        nested.dvc_gen({"foo": "foo", "dir1": {"bar": "bar"}}, commit="FOO")
        nested.scm_gen({"ipsum": "ipsum"}, commit="BAR")

    # using fs that does not have dvcignore
    dvc._reset()
    fs = DvcFileSystem(repo=dvc)

    expected = [
        "/.dvcignore",
        "/.gitignore",
        "/lorem",
        "/lorem.dvc",
        "/dir",
        "/dir/repo.txt",
    ]

    actual = []
    for root, dirs, files in fs.walk("/", dvcfiles=True):
        actual.extend(posixpath.join(root, entry) for entry in dirs + files)

    # Same entries, and no duplicates hidden by the set comparison.
    assert set(actual) == set(expected)
    assert len(actual) == len(expected)

    assert fs.isfile("lorem") is True
    assert fs.isfile("dir/repo/foo") is False
    assert fs.isdir("dir/repo") is False
    assert fs.isdir("dir") is True

    assert fs.isdvc("lorem") is True
    assert fs.isdvc("dir/repo/dir1") is False

    assert fs.exists("dir/repo.txt") is True
    assert fs.exists("repo/ipsum") is False
def test_subrepos(tmp_dir, scm, dvc, mocker):
    """With ``subrepos=True``, lookups are answered by the matching subrepo.

    ``_get_repo`` on the wrapped filesystem is patched with a wrapper
    that asserts the returned repo has the expected ``root_dir`` while
    paths under each subrepo are queried.
    """
    tmp_dir.scm_gen(
        {"dir": {"repo.txt": "file to confuse DvcFileSystem"}},
        commit="dir/repo.txt",
    )

    subrepo1 = tmp_dir / "dir" / "repo"
    subrepo2 = tmp_dir / "dir" / "repo2"
    for subrepo in [subrepo1, subrepo2]:
        make_subrepo(subrepo, scm)

    with subrepo1.chdir():
        subrepo1.dvc_gen({"foo": "foo", "dir1": {"bar": "bar"}}, commit="FOO")
    with subrepo2.chdir():
        subrepo2.dvc_gen(
            {"lorem": "lorem", "dir2": {"ipsum": "ipsum"}}, commit="BAR"
        )

    dvc._reset()
    fs = DvcFileSystem(repo=dvc, subrepos=True)

    def assert_fs_belongs_to_repo(expected_repo):
        # Grab the current `_get_repo` now, i.e. before `patch.object`
        # below replaces it, so the wrapper delegates to the real one.
        real_get_repo = fs.fs._get_repo

        def checked_get_repo(*args, **kwargs):
            found = real_get_repo(*args, **kwargs)
            assert found.root_dir == expected_repo.root_dir
            return found

        return checked_get_repo

    with mock.patch.object(
        fs.fs,
        "_get_repo",
        side_effect=assert_fs_belongs_to_repo(subrepo1.dvc),
    ):
        assert fs.exists("dir/repo/foo") is True
        assert fs.exists("dir/repo/bar") is False

        assert fs.isfile("dir/repo/foo") is True
        assert fs.isfile("dir/repo/dir1/bar") is True
        assert fs.isfile("dir/repo/dir1") is False

        assert fs.isdir("dir/repo/dir1") is True
        assert fs.isdir("dir/repo/dir1/bar") is False
        assert fs.isdvc("dir/repo/foo") is True

    with mock.patch.object(
        fs.fs,
        "_get_repo",
        side_effect=assert_fs_belongs_to_repo(subrepo2.dvc),
    ):
        assert fs.exists("dir/repo2/lorem") is True
        assert fs.exists("dir/repo2/ipsum") is False

        assert fs.isfile("dir/repo2/lorem") is True
        assert fs.isfile("dir/repo2/dir2/ipsum") is True
        assert fs.isfile("dir/repo2/dir2") is False

        assert fs.isdir("dir/repo2/dir2") is True
        assert fs.isdir("dir/repo2/dir2/ipsum") is False
        assert fs.isdvc("dir/repo2/lorem") is True
def test_exists_isdir_isfile_dirty(tmp_dir, dvc):
    """Check ``exists``/``isfile``/``isdir`` against a modified workspace.

    The first group of assertions runs after the DVC-generated entries
    were deleted from the workspace. The second group runs after a file
    was created where the directory used to be, and a directory where
    the file used to be.
    """
    tmp_dir.dvc_gen(
        {"datafile": "data", "datadir": {"foo": "foo", "bar": "bar"}}
    )
    fs = DvcFileSystem(repo=dvc)

    # Delete both entries from the workspace.
    shutil.rmtree(tmp_dir / "datadir")
    (tmp_dir / "datafile").unlink()

    assert fs.exists("datafile")
    assert fs.exists("datadir")
    assert fs.exists("datadir/foo")

    assert fs.isfile("datafile")
    assert not fs.isfile("datadir")
    assert fs.isfile("datadir/foo")

    assert not fs.isdir("datafile")
    assert fs.isdir("datadir")
    assert not fs.isdir("datadir/foo")

    # NOTE: creating file instead of dir and dir instead of file
    swapped_layout = {
        "datadir": "data",
        "datafile": {"foo": "foo", "bar": "bar"},
    }
    tmp_dir.gen(swapped_layout)

    assert fs.exists("datafile")
    assert fs.exists("datadir")
    assert not fs.exists("datadir/foo")
    assert fs.exists("datafile/foo")

    assert not fs.isfile("datafile")
    assert fs.isfile("datadir")
    assert not fs.isfile("datadir/foo")
    assert fs.isfile("datafile/foo")

    assert fs.isdir("datafile")
    assert not fs.isdir("datadir")
    assert not fs.isdir("datadir/foo")
    assert not fs.isdir("datafile/foo")
def diff(self, a_rev="HEAD", b_rev=None, targets=None):
    """
    Compare the paths and hashes of two revisions.

    When `b_rev` is not given, `a_rev` is compared against the workspace.
    DVC has no concept of an `index`, so this differs from `git diff`,
    but the interface is kept the same: `dvc diff` is equivalent to
    `dvc diff HEAD`.

    Returns a dict with the keys "added", "deleted", "modified",
    "renamed" and "not in cache", or an empty dict when the repo has no
    commits or every one of those lists is empty.

    Raises PathMissingError for a target that is reported missing in
    both `a_rev` and `b_rev`.
    """
    if self.scm.no_commits:
        return {}

    from dvc.fs.dvc import DvcFileSystem

    dvcfs = DvcFileSystem(repo=self)
    b_rev = b_rev or "workspace"

    checksums_by_rev = {}
    missing_by_rev = {}
    for rev in self.brancher(revs=[a_rev, b_rev]):
        # brancher always returns workspace, but workspace paths/checksums
        # are only needed when b_rev was None
        if rev == "workspace" and rev != b_rev:
            continue

        targets_paths = None
        if targets is not None:
            # convert targets to paths, and capture any missing targets
            targets_paths, missing_for_rev = _targets_to_paths(dvcfs, targets)
            missing_by_rev[rev] = missing_for_rev

        checksums_by_rev[rev] = _paths_checksums(self, targets_paths)

    if targets is not None:
        # a target missing on both sides of the diff is an error
        missing_in_both = set(missing_by_rev[a_rev]) & set(
            missing_by_rev[b_rev]
        )
        for target in missing_in_both:
            raise PathMissingError(target, self)

    old = checksums_by_rev[a_rev]
    new = checksums_by_rev[b_rev]

    # Compare paths between the old and new fs.
    # set() efficiently converts dict keys to a set
    old_paths = set(old)
    new_paths = set(new)

    added = sorted(new_paths - old_paths)
    deleted_or_missing = old_paths - new_paths

    # missing status is only applicable when diffing local workspace
    # against a commit
    missing = (
        sorted(_filter_missing(dvcfs, deleted_or_missing))
        if b_rev == "workspace"
        else []
    )
    deleted = sorted(deleted_or_missing - set(missing))
    modified = sorted(old_paths & new_paths)

    # A file that was both changed and renamed shows up as one deleted
    # and one added record. Covering that case would need a hashing
    # function that produces a rolling/chunking hash.
    renamed = _calculate_renamed(new, old, added, deleted)
    for entry in renamed:
        added.remove(entry["path"]["new"])
        deleted.remove(entry["path"]["old"])

    ret = {
        "added": [{"path": path, "hash": new[path]} for path in added],
        "deleted": [{"path": path, "hash": old[path]} for path in deleted],
        "modified": [
            {"path": path, "hash": {"old": old[path], "new": new[path]}}
            for path in modified
            if old[path] != new[path]
        ],
        "renamed": renamed,
        "not in cache": [
            {"path": path, "hash": old[path]} for path in missing
        ],
    }

    if not any(ret.values()):
        return {}
    return ret
def dvcfs(self):
    """Return a new ``DvcFileSystem`` for this object.

    The filesystem is constructed with ``repo=self`` and
    ``subrepos=self.subrepos``; the entries of ``self._fs_conf`` are
    forwarded as additional keyword arguments.
    """
    # Imported at call time rather than at module level, as in the original.
    from dvc.fs.dvc import DvcFileSystem

    filesystem = DvcFileSystem(
        repo=self,
        subrepos=self.subrepos,
        **self._fs_conf,
    )
    return filesystem