def test_order(dvc):
    """Compare the single-stage lockfile data against the expected sections.

    The expected mapping lists ``cmd``, ``deps``, ``params`` and ``outs``
    in that order.
    """
    stage = create_stage(
        PipelineStage,
        dvc,
        deps=["input"],
        outs=["output"],
        params=["foo-param"],
        **kwargs,
    )
    params, deps = split_params_deps(stage)

    # Give every dependency/output a hash so it is written to the lockfile.
    deps[0].hash_info = HashInfo("md5", "md-five")
    params[0].hash_info = HashInfo("params", {"foo-param": "value"})
    stage.outs[0].hash_info = HashInfo("md5", "md5-output")

    expected = OrderedDict(
        [
            ("cmd", "command"),
            ("deps", [{"path": "input", "md5": "md-five"}]),
            ("params", {"params.yaml": {"foo-param": "value"}}),
            ("outs", [{"path": "output", "md5": "md5-output"}]),
        ]
    )
    assert to_single_stage_lockfile(stage) == expected
def test_get_hash_cached_dir(tmp_dir, dvc, mocker):
    """Check which hashing method is used before and after removing "dir".

    While the directory is in the workspace the repo tree's
    ``get_file_hash`` is called and the DVC tree's ``get_dir_hash`` is not;
    once the directory is removed the opposite holds, and the reported
    hash stays the same.
    """
    tmp_dir.dvc_gen(
        {"dir": {"foo": "foo", "bar": "bar", "subdir": {"data": "data"}}}
    )
    tree = RepoTree(dvc)
    get_file_hash_spy = mocker.spy(tree, "get_file_hash")
    dvc_tree_spy = mocker.spy(tree._dvctrees[dvc.root_dir], "get_dir_hash")

    dir_path = PathInfo(tmp_dir) / "dir"
    expected = HashInfo("md5", "8761c4e9acad696bee718615e23e22db.dir")

    # First pass: the directory is present in the workspace.
    with dvc.state:
        assert tree.get_hash(dir_path) == expected
    assert get_file_hash_spy.called
    assert not dvc_tree_spy.called
    get_file_hash_spy.reset_mock()

    # Second pass: the workspace copy is gone.
    shutil.rmtree(tmp_dir / "dir")
    with dvc.state:
        assert tree.get_hash(dir_path) == expected
    assert not get_file_hash_spy.called
    assert dvc_tree_spy.called
def test_get_hash_cached_granular(tmp_dir, dvc, mocker):
    """Check md5 reporting for a sub-directory and a file inside "dir".

    While the files exist in the workspace, ``fs.info`` reports no md5 for
    either path and staging computes the expected hashes. After the file is
    unlinked, ``fs.info`` reports its md5.
    """
    tmp_dir.dvc_gen(
        {"dir": {"foo": "foo", "bar": "bar", "subdir": {"data": "data"}}}
    )
    fs = RepoFileSystem(dvc)
    subdir = PathInfo(tmp_dir) / "dir" / "subdir"
    data_md5 = "8d777f385d3dfec8815d20f7496026dc"

    assert fs.info(subdir).get("md5") is None
    staged_subdir = stage(dvc.odb.local, subdir, fs, "md5")
    assert staged_subdir.hash_info == HashInfo(
        "md5", "af314506f1622d107e0ed3f14ec1a3b5.dir"
    )

    assert fs.info(subdir / "data").get("md5") is None
    staged_data = stage(dvc.odb.local, subdir / "data", fs, "md5")
    assert staged_data.hash_info == HashInfo("md5", data_md5)

    # Remove the workspace file and look the md5 up again.
    (tmp_dir / "dir" / "subdir" / "data").unlink()
    assert fs.info(subdir / "data")["md5"] == data_md5
def test_get_hash_cached_dir(tmp_dir, dvc, mocker):
    """Check the directory hash before and after removing "dir".

    With the directory in the workspace ``fs.info`` reports no md5 and
    staging yields the expected ``.dir`` hash. After the directory is
    removed, ``fs.info`` reports that md5 and staging yields the same hash.
    """
    tmp_dir.dvc_gen(
        {"dir": {"foo": "foo", "bar": "bar", "subdir": {"data": "data"}}}
    )
    fs = RepoFileSystem(dvc)
    expected = "8761c4e9acad696bee718615e23e22db.dir"

    assert fs.info(PathInfo(tmp_dir) / "dir").get("md5") is None
    staged = stage(dvc.odb.local, PathInfo(tmp_dir) / "dir", fs, "md5")
    assert staged.hash_info == HashInfo("md5", expected)

    shutil.rmtree(tmp_dir / "dir")

    assert fs.info(PathInfo(tmp_dir) / "dir")["md5"] == expected
    staged = stage(dvc.odb.local, PathInfo(tmp_dir) / "dir", fs, "md5")
    assert staged.hash_info == HashInfo("md5", expected)
def test_status_download_optimization(mocker, dvc):
    """Remote existence checks are skipped when the local cache has it all.

    When comparing the status to pull from a remote cache, and the files to
    fetch are already in the local cache, the existence of those files on
    the remote cache should not be checked.
    """
    odb = LocalObjectDB(LocalFileSystem(), PathInfo("."))
    md5_values = (
        "acbd18db4cc2f85cedef654fccc4a4d8",
        "37b51d194a7513e45b56f6524f2d51f2",
    )
    objs = {
        HashFile(None, odb.fs, HashInfo("md5", md5)) for md5 in md5_values
    }

    # Make the local object database report every requested hash as present.
    local_exists = [obj.hash_info.value for obj in objs]
    mocker.patch.object(odb, "hashes_exist", return_value=local_exists)

    other_remote = mocker.Mock()
    other_remote.url = "other_remote"
    other_remote.hashes_exist.return_value = []
    other_remote.index = RemoteIndexNoop()

    # NOTE(review): other_remote is a Mock, so status() below is a mocked
    # call as well and cannot invoke hashes_exist - confirm this still
    # exercises the real status logic.
    other_remote.status(odb, objs, download=True)

    assert other_remote.hashes_exist.call_count == 0
def test_used_objs(tmp_dir, scm, dvc, run_copy, rev):
    """Check ``index.used_objs`` for several target selections."""
    from dvc.hash_info import HashInfo

    dvc.config["core"]["autostage"] = True
    tmp_dir.dvc_gen({"dir": {"subdir": {"file": "file"}}, "foo": "foo"})
    run_copy("foo", "bar", name="copy-foo-bar")
    scm.commit("commit")

    index = get_index(dvc, rev)

    bar_obj = HashInfo(
        name="md5",
        value="acbd18db4cc2f85cedef654fccc4a4d8",
        obj_name="bar",
    )
    file_obj = HashInfo(
        name="md5",
        value="8c7dd922ad47494fc02c388e12c00eac",
        obj_name="dir/subdir/file",
    )
    dir_obj = HashInfo(
        name="md5",
        value="d28c9e28591aeb7e303dc6772ffa6f6b.dir",
        obj_name="dir",
    )
    everything = {bar_obj, file_obj, dir_obj}

    assert index.used_objs() == {None: everything}
    assert index.used_objs("dir") == {None: {file_obj, dir_obj}}
    assert index.used_objs(".", recursive=True) == {None: everything}
    assert index.used_objs("copy-foo-bar", with_deps=True) == {
        None: {bar_obj}
    }
def get_file_hash(self, path_info):
    """Return the hash of ``path_info`` taken from its tracked output.

    Raises:
        OutputNotFoundError: if the lookup does not yield exactly one
            output for ``path_info``.
    """
    matches = self._find_outs(path_info, strict=False)
    if len(matches) != 1:
        raise OutputNotFoundError

    out = matches[0]
    is_dir = out.is_dir_checksum
    hash_name = out.tree.PARAM_CHECKSUM
    if is_dir:
        # The output is a directory: resolve the checksum of the entry
        # at path_info inside it.
        hash_value = self._get_granular_checksum(path_info, out)
    else:
        hash_value = out.checksum
    return HashInfo(hash_name, hash_value)
def test_fill_from_lock_deps_outs(dvc, lock_data):
    """``fill_from_lock`` sets hashes on deps and outs that had none."""
    stage = create_stage(
        PipelineStage, dvc, PIPELINE_FILE, deps=["foo"], outs=["bar"]
    )

    # Nothing has a hash before the lock data is applied.
    assert not any(
        entry.hash_info for entry in chain(stage.deps, stage.outs)
    )

    StageLoader.fill_from_lock(stage, lock_data)

    dep_hash = stage.deps[0].hash_info
    out_hash = stage.outs[0].hash_info
    assert dep_hash == HashInfo("md5", "foo_checksum")
    assert out_hash == HashInfo("md5", "bar_checksum")
def test_load_stage_with_params(dvc, stage_data, lock_data):
    """A loaded stage exposes params, deps and outs with their lock hashes."""
    lock_data["params"] = {"params.yaml": {"lorem": "ipsum"}}
    stage_data["params"] = ["lorem"]

    dvcfile = Dvcfile(dvc, PIPELINE_FILE)
    stage = StageLoader.load_stage(dvcfile, "stage-1", stage_data, lock_data)
    params, deps = split_params_deps(stage)

    # Paths as declared in the stage definition.
    assert deps[0].def_path == "foo"
    assert stage.outs[0].def_path == "bar"
    assert params[0].def_path == "params.yaml"

    # Hashes as recorded in the lock data.
    assert params[0].hash_info == HashInfo("params", {"lorem": "ipsum"})
    assert deps[0].hash_info == HashInfo("md5", "foo_checksum")
    assert stage.outs[0].hash_info == HashInfo("md5", "bar_checksum")
def test_load_stage(dvc, stage_data, lock_data):
    """Loading "stage-1" yields the expected attributes, dep and out."""
    dvcfile = Dvcfile(dvc, PIPELINE_FILE)
    stage = StageLoader.load_stage(dvcfile, "stage-1", stage_data, lock_data)

    assert stage.wdir == os.path.abspath(os.curdir)
    assert stage.name == "stage-1"
    assert stage.cmd == "command"
    assert stage.path == os.path.abspath(PIPELINE_FILE)

    first_dep = stage.deps[0]
    assert first_dep.def_path == "foo"
    assert first_dep.hash_info == HashInfo("md5", "foo_checksum")

    first_out = stage.outs[0]
    assert first_out.def_path == "bar"
    assert first_out.hash_info == HashInfo("md5", "bar_checksum")
def test_fill_from_lock_use_appropriate_checksum(dvc, lock_data):
    """An s3 dependency gets its "etag" hash; the local out keeps md5."""
    s3_dep = "s3://dvc-temp/foo"
    stage = create_stage(
        PipelineStage, dvc, PIPELINE_FILE, deps=[s3_dep], outs=["bar"]
    )
    lock_data["deps"] = [{"path": s3_dep, "etag": "e-tag"}]

    StageLoader.fill_from_lock(stage, lock_data)

    dep_hash = stage.deps[0].hash_info
    out_hash = stage.outs[0].hash_info
    assert dep_hash == HashInfo("etag", "e-tag")
    assert out_hash == HashInfo("md5", "bar_checksum")
def test_used_objs(tmp_dir, dvc, path):
    """``dvc.used_objs([path])`` yields the dir hash and the nested file hash."""
    tmp_dir.dvc_gen({"dir": {"subdir": {"file": "file"}, "other": "other"}})

    expected = {
        HashInfo("md5", "70922d6bf66eb073053a82f77d58c536.dir"),
        HashInfo("md5", "8c7dd922ad47494fc02c388e12c00eac"),
    }

    # Merge the object ids of every group into a single set.
    used = set().union(*dvc.used_objs([path]).values())

    assert used == expected
def test(self):
    """Load dir caches with invalid JSON and with a JSON object payload.

    Invalid JSON must raise ``DirCacheError``; the result of loading the
    JSON object payload is passed to ``self._do_test``.
    """
    local_cache = self.dvc.cache.local

    # Case 1: the cache file is not JSON at all.
    broken_hash = "123.dir"
    broken_path = os.fspath(local_cache.tree.hash_to_path_info(broken_hash))
    self.create(broken_path, "<clearly>not,json")
    with pytest.raises(DirCacheError):
        local_cache.load_dir_cache(HashInfo("md5", broken_hash))

    # Case 2: the cache file is valid JSON holding an object.
    object_hash = "234.dir"
    object_path = os.fspath(local_cache.tree.hash_to_path_info(object_hash))
    self.create(object_path, '{"a": "b"}')
    loaded = local_cache.load_dir_cache(HashInfo("md5", object_hash))
    self._do_test(loaded)
def test_get_hash_cached_granular(tmp_dir, dvc, mocker):
    """Hash a sub-directory and a file in it; the DVC tree must be consulted."""
    tmp_dir.dvc_gen(
        {"dir": {"foo": "foo", "bar": "bar", "subdir": {"data": "data"}}}
    )
    tree = RepoTree(dvc)
    dvc_tree_spy = mocker.spy(tree._dvctrees[dvc.root_dir], "get_file_hash")
    subdir = PathInfo(tmp_dir) / "dir" / "subdir"

    subdir_hash = tree.get_hash(subdir)
    assert subdir_hash == HashInfo(
        "md5", "af314506f1622d107e0ed3f14ec1a3b5.dir"
    )

    data_hash = tree.get_hash(subdir / "data")
    assert data_hash == HashInfo("md5", "8d777f385d3dfec8815d20f7496026dc")

    assert dvc_tree_spy.called
def test_fill_from_lock_missing_checksums(dvc, lock_data):
    """The first dep/out get lock hashes; the second dep/out end up without."""
    stage = create_stage(
        PipelineStage,
        dvc,
        PIPELINE_FILE,
        deps=["foo", "foo1"],
        outs=["bar", "bar1"],
    )

    StageLoader.fill_from_lock(stage, lock_data)

    assert stage.deps[0].hash_info == HashInfo("md5", "foo_checksum")
    assert stage.outs[0].hash_info == HashInfo("md5", "bar_checksum")

    # The second dependency and output must be left without a hash.
    assert not stage.deps[1].hash_info
    assert not stage.outs[1].hash_info
def test(self):
    """``load`` raises ``ObjectFormatError`` for both bad ``.dir`` payloads.

    The payloads are a string that is not JSON and a JSON object.
    """
    from dvc.objects import load

    bad_payloads = (
        ("123.dir", "<clearly>not,json"),
        ("234.dir", '{"a": "b"}'),
    )
    for dir_hash, content in bad_payloads:
        fname = os.fspath(self.dvc.odb.local.hash_to_path(dir_hash))
        self.create(fname, content)
        with pytest.raises(ObjectFormatError):
            load(self.dvc.odb.local, HashInfo("md5", dir_hash))
def test_cache_load_bad_dir_cache(tmp_dir, dvc):
    """``load`` raises ``ObjectFormatError`` for both bad ``.dir`` payloads.

    The payloads are a string that is not JSON and a JSON object.
    """
    from dvc.data import load

    bad_payloads = (
        ("123.dir", "<clearly>not,json"),
        ("234.dir", '{"a": "b"}'),
    )
    for dir_hash, content in bad_payloads:
        fname = os.fspath(dvc.odb.local.hash_to_path(dir_hash))
        tmp_dir.gen({fname: content})
        with pytest.raises(ObjectFormatError):
            load(dvc.odb.local, HashInfo("md5", dir_hash))
def test_get_hash_granular(tmp_dir, dvc):
    """Check md5 info and dry-run staging for "dir/subdir" and its file.

    ``fs.info`` reports no md5 for the sub-directory but does report one
    for the file; staging yields the expected hash for both.
    """
    tmp_dir.dvc_gen(
        {"dir": {"foo": "foo", "bar": "bar", "subdir": {"data": "data"}}}
    )
    fs = DvcFileSystem(repo=dvc)
    subdir = "dir/subdir"
    data_md5 = "8d777f385d3dfec8815d20f7496026dc"

    assert fs.info(subdir).get("md5") is None
    _, _, subdir_obj = stage(dvc.odb.local, subdir, fs, "md5", dry_run=True)
    assert subdir_obj.hash_info == HashInfo(
        "md5", "af314506f1622d107e0ed3f14ec1a3b5.dir"
    )

    data = posixpath.join(subdir, "data")
    assert fs.info(data)["md5"] == data_md5
    _, _, data_obj = stage(dvc.odb.local, data, fs, "md5", dry_run=True)
    assert data_obj.hash_info == HashInfo("md5", data_md5)
def test_fill_from_lock_with_missing_sections(dvc, lock_data):
    """Entries of a section absent from the lock data end up without a hash.

    The same stage is filled twice: first from lock data without "deps",
    then from lock data without "outs".
    """
    stage = create_stage(
        PipelineStage, dvc, PIPELINE_FILE, deps=["foo"], outs=["bar"]
    )

    # Lock data lacking the "deps" section.
    without_deps = deepcopy(lock_data)
    del without_deps["deps"]
    StageLoader.fill_from_lock(stage, without_deps)
    assert not stage.deps[0].hash_info
    assert stage.outs[0].hash_info == HashInfo("md5", "bar_checksum")

    # Lock data lacking the "outs" section.
    without_outs = deepcopy(lock_data)
    del without_outs["outs"]
    StageLoader.fill_from_lock(stage, without_outs)
    assert stage.deps[0].hash_info == HashInfo("md5", "foo_checksum")
    assert not stage.outs[0].hash_info
def test_lock_outs_order(dvc, typ):
    """Outputs declared as input1, input0 are expected as input0, input1."""
    stage = create_stage(
        PipelineStage, dvc, **{typ: ["input1", "input0"]}, **kwargs
    )
    stage.outs[0].hash_info = HashInfo("md5", "md-one1")
    stage.outs[1].hash_info = HashInfo("md5", "md-zer0")

    expected_outs = [
        OrderedDict([("path", "input0"), ("md5", "md-zer0")]),
        OrderedDict([("path", "input1"), ("md5", "md-one1")]),
    ]
    expected = OrderedDict([("cmd", "command"), ("outs", expected_outs)])

    assert to_single_stage_lockfile(stage) == expected
def get_file_hash(self, path_info):
    """Build a ``HashInfo`` for ``path_info`` from the object's e_tag.

    The e_tag has surrounding double quotes stripped; the object's
    ``content_length`` is stored as the size.
    """
    with self._get_obj(path_info) as obj:
        hash_name = self.PARAM_CHECKSUM
        etag = obj.e_tag.strip('"')
        length = obj.content_length
        return HashInfo(hash_name, etag, size=length)
def get_file_hash(self, path_info):
    """Return the ``HashInfo`` of the file at ``path_info``.

    The value is the first element returned by ``file_md5``. The file size
    is attached only when the resulting hash info is truthy.
    """
    hash_name = self.PARAM_CHECKSUM
    digest = file_md5(path_info)[0]
    hash_info = HashInfo(hash_name, digest)

    if not hash_info:
        return hash_info

    hash_info.size = os.path.getsize(path_info)
    return hash_info
def get(self, path_info, fs):
    """Look up the saved md5 for a local path.

    Args:
        path_info: a ``str`` or an object whose ``scheme`` is "local".
        fs: filesystem the path belongs to; anything that is not a
            ``LocalFileSystem`` yields ``None``.

    Returns:
        HashInfo or None: the saved md5 together with the size, or None
            when the path does not exist, there is no entry for its inode,
            or the saved mtime/size differ from the current ones.
    """
    if not isinstance(fs, LocalFileSystem):
        return None

    assert isinstance(path_info, str) or path_info.scheme == "local"
    path = os.fspath(path_info)

    # NOTE: os.path.exists is used instead of LocalFileSystem.exists,
    # because the latter uses lexists() and returns True for broken
    # symlinks, which cannot be stat()-ed in get_mtime_and_size.
    if not os.path.exists(path):
        return None

    mtime, size = get_mtime_and_size(path, self.fs)
    inode = get_inode(path)

    # An entry is indexed as (mtime, size, md5).
    entry = self.md5s.get(inode)
    if entry and entry[0] == mtime and entry[1] == size:
        return HashInfo("md5", entry[2], size=int(size))

    return None
def test_lock_deps(dvc):
    """A stage with one hashed dependency locks as ``cmd`` plus ``deps``."""
    stage = create_stage(PipelineStage, dvc, deps=["input"], **kwargs)
    stage.deps[0].hash_info = HashInfo("md5", "md-five")

    expected_dep = OrderedDict([("path", "input"), ("md5", "md-five")])
    expected = OrderedDict([("cmd", "command"), ("deps", [expected_dep])])

    assert to_single_stage_lockfile(stage) == expected
def md5(self, path_info):
    """Return an md5 ``HashInfo`` with size for ``path_info.path``.

    Both values are obtained through the ssh connection opened for
    ``path_info``.
    """
    with self.ssh(path_info) as ssh:
        digest = ssh.md5(path_info.path)
        nbytes = ssh.getsize(path_info.path)
        return HashInfo("md5", digest, size=nbytes)
def test_get_hash_file(tmp_dir, dvc):
    """``DvcTree.get_hash`` returns the md5 of the tracked file "foo"."""
    tmp_dir.dvc_gen({"foo": "foo"})
    tree = DvcTree(dvc)

    actual = tree.get_hash(PathInfo(tmp_dir) / "foo")

    assert actual == HashInfo("md5", "acbd18db4cc2f85cedef654fccc4a4d8")
def test_load_stage_with_metrics_and_plots(dvc, stage_data, lock_data, typ):
    """Outputs declared under the ``typ`` key load with path and hash."""
    # Move the "outs" definition under the key named by typ.
    stage_data[typ] = stage_data.pop("outs")

    dvcfile = Dvcfile(dvc, PIPELINE_FILE)
    stage = StageLoader.load_stage(dvcfile, "stage-1", stage_data, lock_data)

    first_out = stage.outs[0]
    assert first_out.def_path == "bar"
    assert first_out.hash_info == HashInfo("md5", "bar_checksum")
def _get_tree_obj(path_info, fs, name, odb, state, upload, **kwargs):
    """Return the tree object for ``path_info``, loading or building it.

    If ``fs.info`` already reports a ``name`` hash, the tree is loaded from
    ``odb``; when that raises ``FileNotFoundError`` (or no hash is
    reported) the tree is built and added to ``odb``. For hash names other
    than "md5" the stored raw object is hashed again with ``name`` and the
    tree is re-added under that hash.
    """
    from .tree import Tree

    known_value = fs.info(path_info).get(name)
    if known_value:
        known_hash = HashInfo(name, known_value)
        try:
            return Tree.load(odb, known_hash)
        except FileNotFoundError:
            # Not in the object database; fall through and build it.
            pass

    tree = _build_tree(path_info, fs, name, odb, state, upload, **kwargs)
    odb.add(tree.path_info, tree.fs, tree.hash_info)

    if name == "md5":
        return tree

    # NOTE: used only for external outputs. Initial reasoning was to be
    # able to validate .dir files right in the workspace (e.g. check s3
    # etag), but could be dropped for manual validation with regular md5,
    # that would be universal for all clouds.
    raw_obj = odb.get(tree.hash_info)
    rehashed = get_file_hash(raw_obj.path_info, raw_obj.fs, name, state)
    tree.hash_info.name = rehashed.name
    tree.hash_info.value = rehashed.value
    if not tree.hash_info.value.endswith(".dir"):
        tree.hash_info.value += ".dir"
    odb.add(tree.path_info, tree.fs, tree.hash_info)

    return tree
def _get_file_hash(path_info, fs, name): info = fs.info(path_info) if name in info: assert not info[name].endswith(".dir") return HashInfo(name, info[name], size=info["size"]) func = getattr(fs, name, None) if func: return func(path_info) if name == "md5": return HashInfo( name, file_md5(path_info, fs), size=fs.getsize(path_info) ) raise NotImplementedError
def test(self):
    """Load dir caches with invalid JSON and with a JSON object payload.

    Invalid JSON must raise ``DirCacheError``; the JSON object payload
    must load as a ``DirInfo`` whose ``nfiles`` is 0.
    """
    local_cache = self.dvc.cache.local

    # Case 1: the cache file is not JSON at all.
    broken_hash = "123.dir"
    broken_path = os.fspath(local_cache.tree.hash_to_path_info(broken_hash))
    self.create(broken_path, "<clearly>not,json")
    with pytest.raises(DirCacheError):
        local_cache.load_dir_cache(HashInfo("md5", broken_hash))

    # Case 2: the cache file is valid JSON holding an object.
    object_hash = "234.dir"
    object_path = os.fspath(local_cache.tree.hash_to_path_info(object_hash))
    self.create(object_path, '{"a": "b"}')
    dir_info = local_cache.load_dir_cache(HashInfo("md5", object_hash))

    self.assertTrue(isinstance(dir_info, DirInfo))
    self.assertEqual(dir_info.nfiles, 0)