def test_shallow_clone_tag(erepo_dir):
    """Check the clone made for a tag is shallow, then unshallowed.

    The first ``external_repo`` call (rev ``v1``) must clone with
    ``shallow_branch="v1"``; the second call (rev ``master``) must not
    clone again, and the ``CLONES`` entry must no longer be shallow.
    """
    with erepo_dir.chdir():
        erepo_dir.dvc_gen("file", "foo", commit="init")
        erepo_dir.scm.tag("v1")
        erepo_dir.dvc_gen("file", "bar", commit="update file")

    url = os.fspath(erepo_dir)

    with patch.object(Git, "clone", wraps=Git.clone) as mock_clone:
        with external_repo(url, rev="v1") as repo, repo.open_by_relpath(
            "file"
        ) as fobj:
            assert fobj.read() == "foo"

        mock_clone.assert_called_with(
            url, ANY, shallow_branch="v1", progress=ANY
        )
        _, is_shallow = CLONES[url]
        assert is_shallow

        with external_repo(url, rev="master") as repo, repo.open_by_relpath(
            "file"
        ) as fobj:
            assert fobj.read() == "bar"

        # The cached clone is reused rather than cloned a second time.
        assert mock_clone.call_count == 1
        _, is_shallow = CLONES[url]
        assert not is_shallow
def test_shallow_clone_branch(erepo_dir, mocker):
    """Check a branch is cloned shallowly and later fetched unshallow.

    After the first access, the ``CLONES`` entry is forced to "shallow"
    and ``Git.fetch`` is patched, so the second access (default rev) is
    expected to call ``fetch(unshallow=True)``.
    """
    with erepo_dir.chdir():
        with erepo_dir.branch("branch", new=True):
            erepo_dir.dvc_gen(
                "file", "branch", commit="create file on branch"
            )
        erepo_dir.dvc_gen("file", "master", commit="create file on master")

    url = os.fspath(erepo_dir)
    clone_spy = mocker.spy(Git, "clone")

    with external_repo(url, rev="branch") as repo, repo.open_by_relpath(
        "file"
    ) as fobj:
        assert fobj.read() == "branch"

    clone_spy.assert_called_with(
        url, ANY, shallow_branch="branch", progress=ANY
    )

    # Mark the cached clone as shallow before the second access.
    clone_path, _ = CLONES[url]
    CLONES[url] = (clone_path, True)

    mock_fetch = mocker.patch.object(Git, "fetch")
    with external_repo(url) as repo, repo.open_by_relpath("file") as fobj:
        assert fobj.read() == "master"
    mock_fetch.assert_called_with(unshallow=True)
def test_shallow_clone_branch(erepo_dir):
    """Check the clone made for a branch is shallow, then unshallowed.

    The first ``external_repo`` call (rev ``branch``) must clone with
    ``shallow_branch="branch"``; the second call (default rev) must not
    clone again, and the ``CLONES`` entry must no longer be shallow.
    """
    with erepo_dir.chdir():
        with erepo_dir.branch("branch", new=True):
            erepo_dir.dvc_gen(
                "file", "branch", commit="create file on branch"
            )
        erepo_dir.dvc_gen("file", "master", commit="create file on master")

    url = os.fspath(erepo_dir)

    with patch.object(Git, "clone", wraps=Git.clone) as mock_clone:
        with external_repo(url, rev="branch") as repo, repo.open_by_relpath(
            "file"
        ) as fobj:
            assert fobj.read() == "branch"

        mock_clone.assert_called_with(
            url, ANY, shallow_branch="branch", progress=ANY
        )
        _, is_shallow = CLONES[url]
        assert is_shallow

        with external_repo(url) as repo, repo.open_by_relpath(
            "file"
        ) as fobj:
            assert fobj.read() == "master"

        # The cached clone is reused rather than cloned a second time.
        assert mock_clone.call_count == 1
        _, is_shallow = CLONES[url]
        assert not is_shallow
def test_source_change(erepo_dir):
    """Check that a new commit in the source repo is seen on re-open."""
    url = os.fspath(erepo_dir)

    with external_repo(url) as repo:
        rev_before = repo.scm.get_rev()

    erepo_dir.scm_gen("file", "text", commit="a change")

    with external_repo(url) as repo:
        rev_after = repo.scm.get_rev()

    assert rev_before != rev_after
def test_known_sha(erepo_dir):
    """Open an external repo by explicit SHAs (HEAD and its parent).

    The test has no assertions; it passes as long as neither
    ``external_repo`` call raises.
    """
    url = "file://{}".format(erepo_dir)

    with external_repo(url) as repo:
        head_rev = repo.scm.get_rev()
        parent_rev = repo.scm.resolve_rev("HEAD^")

    # Hits cache
    with external_repo(url, head_rev):
        pass

    # No clone, no pull, copies a repo, checks out the known sha
    with external_repo(url, parent_rev):
        pass
def test_known_sha(erepo_dir):
    """Open an external repo by explicit SHAs (HEAD and its parent).

    An extra "init" commit is made first, before ``HEAD^`` is resolved.
    The test has no assertions; it passes as long as neither
    ``external_repo`` call raises.
    """
    erepo_dir.scm.commit("init")
    url = f"file://{erepo_dir}"

    with external_repo(url) as repo:
        head_rev = repo.scm.get_rev()
        parent_rev = repo.scm.resolve_rev("HEAD^")

    # Hits cache
    with external_repo(url, head_rev):
        pass

    # No clone, no pull, copies a repo, checks out the known sha
    with external_repo(url, parent_rev):
        pass
def ls(url, target=None, rev=None, recursive=None, outs_only=False):
    """List paths found under ``target`` in the repo at ``url``.

    Outputs are collected when the opened repo is a ``Repo`` instance;
    plain files are added unless ``outs_only`` is set.

    Returns:
        A sorted list of unique paths, relative to the target (the
        target itself is reported by its name).

    Raises:
        PathMissingError: ``target`` was given but nothing was found.
    """
    from dvc.external_repo import external_repo
    from dvc.repo import Repo
    from dvc.utils import relpath

    with external_repo(url, rev) as repo:
        target_path_info = _get_target_path_info(repo, target)

        found = []
        if isinstance(repo, Repo):
            found += _ls_outs_repo(repo, target_path_info, recursive)
        if not outs_only:
            found += _ls_files_repo(target_path_info, recursive)

        if target and not found:
            raise PathMissingError(target, repo, output_only=outs_only)

        def prettify(path_info):
            # The target itself is shown by name, everything else
            # relative to it.
            if path_info == target_path_info:
                return path_info.name
            return relpath(path_info, target_path_info)

        return sorted({prettify(item) for item in found})
def test_subrepos_are_ignored(tmp_dir, erepo_dir):
    """Check ``get_external``/``fetch_external`` skip a nested subrepo.

    Only ``foo``, ``bar`` and ``.gitignore`` from ``dir`` are expected
    in the output; the subrepo's own file is not.
    """
    subrepo = erepo_dir / "dir" / "subrepo"
    make_subrepo(subrepo, erepo_dir.scm)

    with erepo_dir.chdir():
        erepo_dir.dvc_gen("dir/foo", "foo", commit="foo")
        erepo_dir.scm_gen("dir/bar", "bar", commit="bar")

    with subrepo.chdir():
        subrepo.dvc_gen({"file": "file"}, commit="add files on subrepo")

    with external_repo(os.fspath(erepo_dir)) as repo:
        repo.get_external("dir", "out")
        expected_files = {"foo": "foo", "bar": "bar", ".gitignore": "/foo\n"}
        assert (tmp_dir / "out").read_text() == expected_files

        expected_hash = HashInfo(
            "md5", "e1d9e8eae5374860ae025ec84cfd85c7.dir"
        )
        dir_path = os.path.join(repo.root_dir, "dir")
        actual_hash = repo.repo_tree.get_hash(
            dir_path, follow_subrepos=False
        )
        assert actual_hash == expected_hash

        # clear cache to test `fetch_external` again
        cache_dir = tmp_dir / repo.cache.local.cache_dir
        remove(cache_dir)
        makedirs(cache_dir)

        expected_result = (len(expected_files), 0, [expected_hash])
        assert repo.fetch_external(["dir"]) == expected_result
def test_hook_is_called(tmp_dir, erepo_dir, mocker):
    """Check ``repo_factory`` is called once per subrepo during a walk."""
    subrepo_paths = [
        "subrepo1",
        "subrepo2",
        os.path.join("dir", "subrepo3"),
        os.path.join("dir", "subrepo4"),
        "subrepo5",
        os.path.join("subrepo5", "subrepo6"),
    ]
    subrepos = [erepo_dir / rel for rel in subrepo_paths]
    for sub in subrepos:
        make_subrepo(sub, erepo_dir.scm)

    for repo in [*subrepos, erepo_dir]:
        with repo.chdir():
            repo.scm_gen("foo", "foo", commit=f"git add {repo}/foo")
            repo.dvc_gen("bar", "bar", commit=f"dvc add {repo}/bar")

    with external_repo(str(erepo_dir)) as repo:
        spy = mocker.spy(repo.repo_fs.fs, "repo_factory")

        # Drain the walk so every subrepo is visited.
        for _ in repo.repo_fs.walk("", ignore_subrepos=False):
            pass
        assert spy.call_count == len(subrepos)

        expected_calls = [
            call(
                os.path.join(repo.root_dir, rel),
                fs=repo.fs,
                repo_factory=repo.repo_fs.fs.repo_factory,
            )
            for rel in subrepo_paths
        ]
        spy.assert_has_calls(expected_calls, any_order=True)
def _fetch_external(self, repo_url, repo_rev, files, jobs):
    """Save ``files`` from an external repo into the local cache.

    Args:
        repo_url: URL passed to ``external_repo``.
        repo_rev: revision passed to ``external_repo``.
        files: paths, relative to the external repo root, to save.
        jobs: forwarded to ``save`` as ``jobs``.

    Returns:
        ``(downloaded, failed)``: the sum of the values reported to the
        download callback, and 1 if a ``CloneError`` occurred, else 0.
    """
    from dvc.external_repo import external_repo
    from dvc.path_info import PathInfo
    from dvc.scm.base import CloneError

    failed = 0
    downloaded = []

    def on_download(result):
        downloaded.append(result)

    local_cache = self.cache.local
    try:
        with external_repo(
            repo_url, repo_rev, cache_dir=local_cache.cache_dir
        ) as repo:
            repo_root = PathInfo(repo.root_dir)
            for rel_path in files:
                self.cache.local.save(
                    repo_root / rel_path,
                    repo.repo_tree,
                    None,
                    jobs=jobs,
                    download_callback=on_download,
                    follow_subrepos=False,
                )
    except CloneError:
        failed += 1
        joined = ", ".join(files)
        logger.exception("failed to fetch data for '{}'".format(joined))

    return sum(downloaded), failed
def test_fetch_external_repo_jobs(tmp_dir, scm, mocker, dvc, local_remote):
    """Check ``jobs`` reaches ``repo.cloud.pull`` when staging and saving."""
    dir_contents = {
        "file1": "file1",
        "file2": "file2",
        "file3": "file3",
        "file4": "file4",
    }
    tmp_dir.dvc_gen({"dir1": dir_contents}, commit="init")
    dvc.push()

    with external_repo(str(tmp_dir)) as repo:
        pull_spy = mocker.spy(repo.cloud, "pull")

        staged = stage(
            dvc.odb.local,
            PathInfo(repo.root_dir) / "dir1",
            repo.repo_fs,
            follow_subrepos=False,
            jobs=3,
        )
        save(dvc.odb.local, staged, jobs=3)

        # Keyword arguments of the first recorded ``pull`` call.
        first_call_kwargs = tuple(pull_spy.call_args_list[0])[1]
        assert first_call_kwargs.get("jobs") == 3
def test_relative_remote(erepo_dir, tmp_dir):
    """Check a relative remote URL is absolute once opened externally."""
    # these steps reproduce the script on this issue:
    # https://github.com/iterative/dvc/issues/2756
    with erepo_dir.chdir():
        erepo_dir.dvc_gen("file", "contents", commit="create file")

    upstream_url = relpath(tmp_dir, erepo_dir)
    with erepo_dir.dvc.config.edit() as conf:
        conf["remote"]["upstream"] = {"url": upstream_url}
        conf["core"]["remote"] = "upstream"

    config_file = erepo_dir.dvc.config.files["repo"]
    erepo_dir.scm_add(config_file, commit="Update dvc config")
    erepo_dir.dvc.push()

    # Drop the local copies of the data before re-opening the repo.
    (erepo_dir / "file").unlink()
    remove(erepo_dir.dvc.cache.local.cache_dir)

    with external_repo(fspath(erepo_dir)) as repo:
        assert os.path.isabs(repo.config["remote"]["upstream"]["url"])
        assert os.path.isdir(repo.config["remote"]["upstream"]["url"])

        with repo.open_by_relpath("file") as fobj:
            assert fobj.read() == "contents"
def _open(*args, **kwargs):
    """Open a repo, choosing the opener from the enclosing ``url``.

    All arguments are forwarded unchanged to whichever opener is used.
    """
    # NOTE: if original repo was an erepo (and has a URL),
    # we cannot use Repo.open() since it will skip erepo
    # cache/remote setup for local URLs
    opener = Repo.open if url is None else external_repo
    return opener(*args, **kwargs)
def _fetch_external(self, repo_url, repo_rev, files):
    """Pull the cache used by ``files`` from an external repo.

    Args:
        repo_url: URL passed to ``external_repo``.
        repo_rev: revision passed to ``external_repo``.
        files: relative paths of outputs to look up in the external repo.

    Returns:
        ``(downloaded, failed)``. ``downloaded`` is the result of
        ``repo.cloud.pull`` on success, otherwise 0. ``failed`` counts
        outputs that were not found, plus ``DownloadError.amount`` or 1
        for a ``CloneError``.
    """
    failed = 0
    local_cache_dir = self.cache.local.cache_dir

    try:
        with external_repo(
            repo_url, repo_rev, cache_dir=local_cache_dir
        ) as repo, repo.state:
            used = NamedCache()
            for name in files:
                try:
                    out = repo.find_out_by_relpath(name)
                except OutputNotFoundError:
                    failed += 1
                    logger.exception(
                        "failed to fetch data for '{}'".format(name)
                    )
                else:
                    used.update(out.get_used_cache())

            try:
                return repo.cloud.pull(used), failed
            except DownloadError as exc:
                failed += exc.amount
    except CloneError:
        failed += 1
        joined = ", ".join(files)
        logger.exception("failed to fetch data for '{}'".format(joined))

    return 0, failed
def get(url, path, out=None, rev=None):
    """Pull ``path`` from the repo at ``url`` into ``out``.

    Raises:
        GetDVCFileError: the resolved output name is a valid DVC
            file name.
    """
    from dvc.dvcfile import is_valid_filename
    from dvc.external_repo import external_repo

    out = resolve_output(path, out)
    if is_valid_filename(out):
        raise GetDVCFileError()

    # Creating a directory right beside the output to make sure that they
    # are on the same filesystem, so we could take the advantage of
    # reflink and/or hardlink. Not using tempfile.TemporaryDirectory
    # because it will create a symlink to tmpfs, which defeats the purpose
    # and won't work with reflink/hardlink.
    out_parent = os.path.dirname(os.path.abspath(out))
    tmp_dir = os.path.join(out_parent, "." + str(shortuuid.uuid()))

    try:
        with external_repo(url=url, rev=rev) as repo:
            if hasattr(repo, "cache"):
                local_cache = repo.cache.local
                local_cache.cache_dir = tmp_dir

                # Try any links possible to avoid data duplication.
                #
                # Not using symlink, because we need to remove cache after
                # we are done, and to make that work we would have to copy
                # data over anyway before removing the cache, so we might
                # just copy it right away.
                #
                # Also, we can't use theoretical "move" link type here,
                # because the same cache file might be used a few times in
                # a directory.
                local_cache.cache_types = ["reflink", "hardlink", "copy"]

            repo.pull_to(path, PathInfo(out))
    finally:
        remove(tmp_dir)
def _make_repo(repo_url, rev=None):
    """Yield a repo object for ``repo_url``.

    An empty URL, or one without a scheme, is opened directly with
    ``Repo``; anything else goes through ``external_repo``.
    """
    is_local = not repo_url or urlparse(repo_url).scheme == ""
    if not is_local:
        with external_repo(url=repo_url, rev=rev) as repo:
            yield repo
        return

    assert rev is None, "Custom revision is not supported for local repo"
    yield Repo(repo_url)
def _make_repo(repo_url, rev=None):
    """Yield a repo object for ``repo_url``.

    An empty URL, or a path that exists on disk, is opened directly
    with ``Repo``; anything else goes through ``external_repo``.
    """
    is_local = not repo_url or os.path.exists(repo_url)
    if not is_local:
        with external_repo(url=repo_url, rev=rev) as repo:
            yield repo
        return

    assert rev is None, "Custom revision is not supported for local repo"
    yield Repo(repo_url)
def get(url, path, out=None, rev=None):
    """Fetch the output at ``path`` from ``url`` and check it out to ``out``.

    When ``out`` is not given, the basename of ``path`` is used.
    """
    if not out:
        out = os.path.basename(urlparse(path).path)

    # Creating a directory right beside the output to make sure that they
    # are on the same filesystem, so we could take the advantage of
    # reflink and/or hardlink. Not using tempfile.TemporaryDirectory
    # because it will create a symlink to tmpfs, which defeats the purpose
    # and won't work with reflink/hardlink.
    out_parent = os.path.dirname(os.path.abspath(out))
    tmp_dir = os.path.join(out_parent, "." + str(shortuuid.uuid()))

    try:
        with external_repo(cache_dir=tmp_dir, url=url, rev=rev) as repo:
            # Try any links possible to avoid data duplication.
            #
            # Not using symlink, because we need to remove cache after we
            # are done, and to make that work we would have to copy data
            # over anyway before removing the cache, so we might just copy
            # it right away.
            #
            # Also, we can't use theoretical "move" link type here, because
            # the same cache file might be used a few times in a directory.
            repo.config.set(
                Config.SECTION_CACHE,
                Config.SECTION_CACHE_TYPE,
                "reflink,hardlink,copy",
            )

            output = repo.find_out_by_relpath(path)
            repo.fetch(output.stage.path)

            # Point the output at the requested destination.
            output.path_info = PathInfo(os.path.abspath(out))
            with output.repo.state:
                output.checkout()
    finally:
        remove(tmp_dir)
def ls(
    url, target=None, rev=None, recursive=None, outs_only=False,
):
    """Methods for getting files and outputs for the repo.

    Args:
        url (str): the repo url
        target (str, optional): relative path into the repo
        rev (str, optional): SHA commit, branch or tag name
        recursive (bool, optional): recursively walk the repo
        outs_only (bool, optional): show only DVC-artifacts

    Returns:
        list of `entry`, sorted by path

    Raises:
        PathMissingError: `target` was given but nothing was found

    Notes:
        `entry` is a dictionary with structure
        {
            "path": str,
            "isout": bool,
            "isdir": bool,
            "isexec": bool,
        }
    """
    from dvc.external_repo import external_repo
    from dvc.repo import Repo
    from dvc.utils import relpath

    with external_repo(url, rev) as repo:
        target_path_info = _get_target_path_info(repo, target)

        nodes = []
        if isinstance(repo, Repo):
            nodes += _ls_outs_repo(repo, target_path_info, recursive)
        if not outs_only:
            nodes += _ls_files_repo(target_path_info, recursive)

        if target and not nodes:
            raise PathMissingError(target, repo, output_only=outs_only)

        # Deduplicate by path_info; a later node replaces an earlier one.
        unique_nodes = {node["path_info"]: node for node in nodes}

        def get_entry(fs_node):
            path_info = fs_node["path_info"]
            if path_info == target_path_info:
                shown_path = path_info.name
            else:
                shown_path = relpath(path_info, target_path_info)
            entry = {"path": shown_path}
            for flag in ("isout", "isdir", "isexec"):
                entry[flag] = fs_node.get(flag, False)
            return entry

        entries = [get_entry(node) for node in unique_nodes.values()]
        entries.sort(key=lambda entry: entry["path"])
        return entries
def get(url, path, out=None, rev=None):
    """Download ``path`` from the repo at ``url`` into ``out``.

    A path with no matching output (when it is not absolute), or an
    output that does not use the cache, is copied with
    ``_copy_git_file``. Otherwise the output's cache is pulled and
    checked out at ``out``.

    Raises:
        GetDVCFileError: the resolved output name is a valid stage
            file name.
        OutputNotFoundError: no output matches an absolute ``path``.
        UrlNotDvcRepoError: a ``NotDvcRepoError`` was raised while
            working with the repo.
    """
    out = resolve_output(path, out)
    if Stage.is_valid_filename(out):
        raise GetDVCFileError()

    # Creating a directory right beside the output to make sure that they
    # are on the same filesystem, so we could take the advantage of
    # reflink and/or hardlink. Not using tempfile.TemporaryDirectory
    # because it will create a symlink to tmpfs, which defeats the purpose
    # and won't work with reflink/hardlink.
    out_parent = os.path.dirname(os.path.abspath(out))
    tmp_dir = os.path.join(out_parent, "." + str(shortuuid.uuid()))

    try:
        with external_repo(cache_dir=tmp_dir, url=url, rev=rev) as repo:
            # Note: we need to replace state, because in case of getting DVC
            # dependency on CIFS or NFS filesystems, sqlite-based state
            # will be unable to obtain lock
            repo.state = StateNoop()

            # Try any links possible to avoid data duplication.
            #
            # Not using symlink, because we need to remove cache after we
            # are done, and to make that work we would have to copy data
            # over anyway before removing the cache, so we might just copy
            # it right away.
            #
            # Also, we can't use theoretical "move" link type here, because
            # the same cache file might be used a few times in a directory.
            repo.cache.local.cache_types = ["reflink", "hardlink", "copy"]

            found = None
            lookup_error = None
            try:
                found = repo.find_out_by_relpath(path)
            except OutputNotFoundError as exc:
                lookup_error = exc

            git_tracked = lookup_error and not os.path.isabs(path)
            uncached = found and not found.use_cache
            if git_tracked or uncached:
                _copy_git_file(repo, path, out, url)
                return

            if lookup_error:
                raise OutputNotFoundError(path)

            with repo.state:
                repo.cloud.pull(found.get_used_cache())

            # Point the output at the requested destination.
            found.path_info = PathInfo(os.path.abspath(out))
            with found.repo.state:
                found.checkout()
    except NotDvcRepoError:
        raise UrlNotDvcRepoError(url)
    finally:
        remove(tmp_dir)
def test_cache_reused(erepo_dir, mocker):
    """Check a second fetch, on a new branch, downloads nothing more."""
    with erepo_dir.chdir():
        erepo_dir.dvc_gen("file", "text", commit="add file")

    download_spy = mocker.spy(RemoteLOCAL, "download")

    # Use URL to prevent any fishy optimizations
    url = "file://{}".format(erepo_dir)

    with external_repo(url) as first_repo:
        first_repo.fetch()
        assert download_spy.mock.call_count == 1

    # Should not download second time
    erepo_dir.scm.branch("branch")
    with external_repo(url, "branch") as second_repo:
        second_repo.fetch()
        assert download_spy.mock.call_count == 1
def test_external_repo(erepo_dir):
    """Check two revisions are readable while cloning only once."""
    with erepo_dir.chdir():
        with erepo_dir.branch("branch", new=True):
            erepo_dir.dvc_gen(
                "file", "branch", commit="create file on branch"
            )
        erepo_dir.dvc_gen("file", "master", commit="create file on master")

    url = os.fspath(erepo_dir)

    with patch.object(Git, "clone", wraps=Git.clone) as clone_mock:
        with external_repo(url) as repo, repo.open_by_relpath(
            "file"
        ) as fobj:
            assert fobj.read() == "master"

        with external_repo(url, rev="branch") as repo, repo.open_by_relpath(
            "file"
        ) as fobj:
            assert fobj.read() == "branch"

        assert clone_mock.call_count == 1
def open(url, *args, **kwargs):
    """Yield a repo for ``url``.

    A path that exists on disk is opened directly with ``Repo``;
    anything else goes through ``external_repo``. Extra arguments are
    forwarded to whichever is used.
    """
    if not os.path.exists(url):
        from dvc.external_repo import external_repo

        with external_repo(url, *args, **kwargs) as repo:
            yield repo
    else:
        yield Repo(url, *args, **kwargs)
def get(url, path, out=None, rev=None, jobs=None):
    """Download ``path`` from the repo at ``url`` into ``out``.

    An absolute ``path`` is read through a ``DataFileSystem`` built on
    the repo; any other path goes through ``repo.dvcfs``. ``jobs`` is
    forwarded to ``fs.get`` as ``batch_size``.

    Raises:
        GetDVCFileError: the resolved output name is a valid DVC
            file name.
    """
    import shortuuid

    from dvc.dvcfile import is_valid_filename
    from dvc.external_repo import external_repo
    from dvc.fs.callbacks import Callback

    out = resolve_output(path, out)
    if is_valid_filename(out):
        raise GetDVCFileError()

    # Creating a directory right beside the output to make sure that they
    # are on the same filesystem, so we could take the advantage of
    # reflink and/or hardlink. Not using tempfile.TemporaryDirectory
    # because it will create a symlink to tmpfs, which defeats the purpose
    # and won't work with reflink/hardlink.
    out_parent = os.path.dirname(os.path.abspath(out))
    tmp_dir = os.path.join(out_parent, "." + str(shortuuid.uuid()))

    # Try any links possible to avoid data duplication.
    #
    # Not using symlink, because we need to remove cache after we
    # are done, and to make that work we would have to copy data
    # over anyway before removing the cache, so we might just copy
    # it right away.
    #
    # Also, we can't use theoretical "move" link type here, because
    # the same cache file might be used a few times in a directory.
    cache_types = ["reflink", "hardlink", "copy"]

    try:
        with external_repo(
            url=url, rev=rev, cache_dir=tmp_dir, cache_types=cache_types
        ) as repo:
            if not os.path.isabs(path):
                fs = repo.dvcfs
                fs_path = fs.from_os_path(path)
            else:
                from dvc.fs.data import DataFileSystem

                fs = DataFileSystem(repo=repo, workspace="local")
                fs_path = path

            progress = Callback.as_tqdm_callback(
                desc=f"Downloading {fs.path.name(path)}", unit="files",
            )
            with progress as cb:
                fs.get(
                    fs_path,
                    os.path.abspath(out),
                    batch_size=jobs,
                    callback=cb,
                )
    finally:
        remove(tmp_dir)
def test_external_repo(erepo):
    """Check two revisions are readable with a shared cache dir.

    Also checks that, without ``cache_dir``, the repo's cache lives
    inside its own root, and that only one clone happens overall.
    """
    url = erepo.root_dir
    # We will share cache dir, to fetch version file
    shared_cache = erepo.dvc.cache.local.cache_dir

    with patch.object(Git, "clone", wraps=Git.clone) as clone_mock:
        with external_repo(url, cache_dir=shared_cache) as repo:
            version_path = os.path.join(repo.root_dir, "version")
            with repo.open(version_path) as fobj:
                assert fobj.read() == "master"

        with external_repo(
            url, rev="branch", cache_dir=shared_cache
        ) as repo:
            version_path = os.path.join(repo.root_dir, "version")
            with repo.open(version_path) as fobj:
                assert fobj.read() == "branch"

        # Check cache_dir is unset
        with external_repo(url) as repo:
            assert path_isin(repo.cache.local.cache_dir, repo.root_dir)

        assert clone_mock.call_count == 1
def test_external_repo(erepo_dir, mocker):
    """Check two revisions are readable while cloning only once."""
    with erepo_dir.chdir():
        with erepo_dir.branch("branch", new=True):
            erepo_dir.dvc_gen(
                "file", "branch", commit="create file on branch"
            )
        erepo_dir.dvc_gen("file", "master", commit="create file on master")

    url = os.fspath(erepo_dir)
    clone_spy = mocker.spy(Git, "clone")

    with external_repo(url) as repo, repo.open_by_relpath("file") as fobj:
        assert fobj.read() == "master"

    with external_repo(url, rev="branch") as repo, repo.open_by_relpath(
        "file"
    ) as fobj:
        assert fobj.read() == "branch"

    assert clone_spy.call_count == 1
def _make_repo(repo_url=None, rev=None):
    """Yield a repo object for ``repo_url`` (default: current directory).

    With no ``rev`` and a path that exists on disk, the repo is opened
    directly with ``Repo(..., subrepos=True)``. If that raises
    ``NotDvcRepoError``, or in any other case, it is opened through
    ``external_repo``.

    Args:
        repo_url: path or URL of the repo; falsy means ``os.getcwd()``.
        rev: revision forwarded to ``external_repo``.
    """
    repo_url = repo_url or os.getcwd()
    if rev is None and os.path.exists(repo_url):
        # Only guard the construction: keeping the ``yield`` outside the
        # ``try`` means a NotDvcRepoError raised by the consumer at the
        # yield point propagates instead of being swallowed and causing
        # a second yield below.
        try:
            local_repo = Repo(repo_url, subrepos=True)
        except NotDvcRepoError:
            pass  # fallthrough to external_repo
        else:
            yield local_repo
            return

    with external_repo(url=repo_url, rev=rev) as repo:
        yield repo
def test_cache_reused(erepo_dir, mocker, setup_remote):
    """Check a second fetch, on a new branch, downloads nothing more."""
    setup_remote(erepo_dir.dvc)
    with erepo_dir.chdir():
        erepo_dir.dvc_gen("file", "text", commit="add file")
    erepo_dir.dvc.push()

    download_spy = mocker.spy(LocalRemote, "download")

    # Use URL to prevent any fishy optimizations
    url = f"file://{erepo_dir}"

    with external_repo(url) as first_repo:
        first_repo.fetch()
        assert download_spy.mock.call_count == 1

    # Should not download second time
    erepo_dir.scm.branch("branch")
    with external_repo(url, "branch") as second_repo:
        second_repo.fetch()
        assert download_spy.mock.call_count == 1
def test_cache_reused(erepo_dir, mocker, local_cloud):
    """Check a second fetch, on a new branch, uploads nothing more.

    Transfers are counted by spying on ``LocalFileSystem.upload_fobj``.
    """
    erepo_dir.add_remote(config=local_cloud.config)
    with erepo_dir.chdir():
        erepo_dir.dvc_gen("file", "text", commit="add file")
    erepo_dir.dvc.push()

    download_spy = mocker.spy(LocalFileSystem, "upload_fobj")

    # Use URL to prevent any fishy optimizations
    url = f"file://{erepo_dir}"

    with external_repo(url) as first_repo:
        first_repo.fetch()
        assert download_spy.mock.call_count == 1

    # Should not download second time
    erepo_dir.scm.branch("branch")
    with external_repo(url, "branch") as second_repo:
        second_repo.fetch()
        assert download_spy.mock.call_count == 1
def ls(
    url, path=None, rev=None, recursive=None, outs_only=False,
):
    """Methods for getting files and outputs for the repo.

    Args:
        url (str): the repo url
        path (str, optional): relative path into the repo
        rev (str, optional): SHA commit, branch or tag name
        recursive (bool, optional): recursively walk the repo
        outs_only (bool, optional): show only DVC-artifacts

    Returns:
        list of `entry`, sorted by path

    Raises:
        PathMissingError: `path` was given but nothing was found

    Notes:
        `entry` is a dictionary with structure
        {
            "path": str,
            "isout": bool,
            "isdir": bool,
            "isexec": bool,
        }
    """
    from dvc.external_repo import external_repo
    from dvc.repo import Repo

    with external_repo(url, rev) as repo:
        path_info = PathInfo(repo.root_dir)
        if path:
            path_info /= path

        if isinstance(repo, Repo):
            ret = _ls(repo, path_info, recursive, True)
        else:
            ret = {}

        if outs_only:
            nondvc = {}
        else:
            nondvc = _ls(repo, path_info, recursive, False)
        ret.update(nondvc)

        if path and not ret:
            raise PathMissingError(path, repo, output_only=outs_only)

        # Store each key on its info dict as "path", then sort by it.
        ret_list = []
        for entry_path, info in ret.items():
            info["path"] = entry_path
            ret_list.append(info)
        return sorted(ret_list, key=lambda entry: entry["path"])