def get(url, path, out=None, rev=None):
    """Fetch ``path`` from the repository at ``url`` and place it at ``out``.

    The destination is computed by ``resolve_output(path, out)``. ``rev`` is
    handed to ``external_repo`` unchanged.

    A uniquely named hidden directory is created next to the destination to
    serve as the cache location; it is removed before returning, whether or
    not the transfer succeeded.

    Raises:
        GetDVCFileError: if ``is_valid_filename`` accepts the destination.
    """
    from dvc.external_repo import external_repo
    from dvc.dvcfile import is_valid_filename

    out = resolve_output(path, out)
    if is_valid_filename(out):
        raise GetDVCFileError()

    # The scratch cache is a sibling of the output so that both are on the
    # same filesystem and reflink/hardlink can be used.
    # tempfile.TemporaryDirectory is avoided on purpose: it would create a
    # symlink to tmpfs, which defeats the purpose and does not work with
    # reflink/hardlink.
    parent_dir = os.path.dirname(os.path.abspath(out))
    scratch_name = "." + str(shortuuid.uuid())
    scratch_dir = os.path.join(parent_dir, scratch_name)

    try:
        with external_repo(url=url, rev=rev) as repo:
            if hasattr(repo, "cache"):
                repo.cache.local.cache_dir = scratch_dir

                # Prefer any link type that avoids duplicating data.
                #
                # Symlinks are excluded: the cache is deleted afterwards, so
                # the data would have to be copied before that anyway, and
                # copying right away is simpler.
                #
                # A hypothetical "move" link type is excluded too, because
                # one cache file may be used several times in a directory.
                repo.cache.local.cache_types = ["reflink", "hardlink", "copy"]

            repo.pull_to(path, PathInfo(out))
    finally:
        remove(scratch_dir)
def imp_url(self, url, out=None, fname=None, erepo=None, locked=True):
    """Create, run and dump a stage that imports ``url`` as ``out``.

    Args:
        url: source to import; it becomes the stage's only dependency.
        out: destination; resolved through ``resolve_output(url, out)`` and
            then ``resolve_paths``.
        fname: path of the stage file. When falsy, the path returned by
            ``resolve_paths`` is used instead.
        erepo: passed through to ``Stage.create``.
        locked: value assigned to ``stage.locked`` before the stage is
            dumped.

    Returns:
        The created stage, or ``None`` when ``Stage.create`` returns
        ``None``.
    """
    from dvc.dvcfile import Dvcfile
    from dvc.stage import Stage

    out = resolve_output(url, out)
    path, wdir, out = resolve_paths(self, out)

    # NOTE: when user is importing something from within their own repository
    # the URL is stored relative to the stage's working directory.
    #
    # The rewrite is limited to imports without ``erepo``.
    # NOTE(review): with ``erepo`` set, ``url`` presumably names a path inside
    # that external repository, so a local file that happens to exist under
    # the same name must not cause the URL to be rewritten.
    if (
        erepo is None
        and os.path.exists(url)
        and path_isin(os.path.abspath(url), self.root_dir)
    ):
        url = relpath(url, wdir)

    stage = Stage.create(
        self,
        fname or path,
        wdir=wdir,
        deps=[url],
        outs=[out],
        erepo=erepo,
    )

    if stage is None:
        return None

    dvcfile = Dvcfile(self, stage.path)
    dvcfile.overwrite_with_prompt(force=True)

    self.check_modified_graph([stage])

    stage.run()

    stage.locked = locked

    dvcfile.dump(stage)

    return stage
def imp_url(self, url, out=None, fname=None, erepo=None, locked=True):
    """Build a command-less stage importing ``url``, run it and dump it.

    The output name comes from ``resolve_output(url, out)``. ``fname`` and
    ``erepo`` are forwarded to ``Stage.create``; ``locked`` is stored on the
    stage before it is dumped.

    Returns:
        The stage, or ``None`` when ``Stage.create`` yields ``None``.
    """
    from dvc.stage import Stage

    target = resolve_output(url, out)

    create_kwargs = {
        "cmd": None,
        "deps": [url],
        "outs": [target],
        "fname": fname,
        "erepo": erepo,
        "accompany_outs": True,
    }
    stage = Stage.create(self, **create_kwargs)
    if stage is None:
        return None

    self.check_modified_graph([stage])
    stage.run()

    stage.locked = locked
    stage.dump()
    return stage
def imp_url(
    self,
    url,
    out=None,
    fname=None,
    erepo=None,
    frozen=True,
    no_exec=False,
    desc=None,
    jobs=None,
):
    """Create a stage that imports ``url`` and write its stage file.

    Args:
        url: source to import; the stage's single dependency.
        out: destination, resolved via ``resolve_output`` and
            ``resolve_paths``.
        fname: stage file path; the resolved path is used when falsy.
        erepo: forwarded to ``create_stage``. When it is not ``None`` the
            URL is never rewritten relative to the working directory.
        frozen: value assigned to ``stage.frozen`` before dumping.
        no_exec: when truthy, ``stage.ignore_outs()`` is called instead of
            running the stage.
        desc: when truthy, stored as ``desc`` on the stage's first output.
        jobs: forwarded to ``stage.run``.

    Returns:
        The stage, or ``None`` if ``stage.can_be_skipped`` is truthy.

    Raises:
        OutputDuplicationError: re-raised from the graph check with this
            stage removed from the reported stages.
    """
    from dvc.dvcfile import Dvcfile
    from dvc.stage import Stage, create_stage, restore_meta

    resolved_out = resolve_output(url, out)
    path, wdir, out_path = resolve_paths(self, resolved_out)

    # NOTE: the user may be importing something that lives inside their own
    # repository; in that case the URL is kept relative to ``wdir``.
    imports_own_file = (
        erepo is None
        and os.path.exists(url)
        and path_isin(os.path.abspath(url), self.root_dir)
    )
    if imports_own_file:
        url = relpath(url, wdir)

    stage = create_stage(
        Stage,
        self,
        fname or path,
        wdir=wdir,
        deps=[url],
        outs=[out_path],
        erepo=erepo,
    )
    restore_meta(stage)
    if stage.can_be_skipped:
        return None

    if desc:
        stage.outs[0].desc = desc

    dvcfile = Dvcfile(self, stage.path)
    dvcfile.remove()

    try:
        self.check_modified_graph([stage])
    except OutputDuplicationError as exc:
        raise OutputDuplicationError(exc.output, set(exc.stages) - {stage})

    if not no_exec:
        stage.run(jobs=jobs)
    else:
        stage.ignore_outs()

    stage.frozen = frozen
    dvcfile.dump(stage)
    return stage
def get(url, path, out=None, rev=None):
    """Download ``path`` from the repository at ``url`` to ``out``.

    The destination is resolved with ``resolve_output(path, out)``. The
    repository is opened through ``external_repo`` with a scratch cache
    directory located next to the destination; that directory is removed on
    exit in every case.

    When the output lookup fails for a non-absolute ``path``, or the output
    found does not use the cache, the work is delegated to
    ``_copy_git_file``. Otherwise the output's cache is pulled and the output
    is checked out at the absolute destination.

    Raises:
        GetDVCFileError: if the destination is a valid stage filename.
        OutputNotFoundError: if the lookup failed and the ``_copy_git_file``
            branch was not taken.
        UrlNotDvcRepoError: if ``NotDvcRepoError`` is raised while working
            with the repository.
    """
    out = resolve_output(path, out)
    if Stage.is_valid_filename(out):
        raise GetDVCFileError()

    # The scratch cache is created beside the output so both are on the same
    # filesystem and reflink/hardlink can be used.
    # tempfile.TemporaryDirectory is not used because it would create a
    # symlink to tmpfs, which defeats the purpose and does not work with
    # reflink/hardlink.
    out_dir = os.path.dirname(os.path.abspath(out))
    scratch_dir = os.path.join(out_dir, "." + str(shortuuid.uuid()))

    try:
        with external_repo(cache_dir=scratch_dir, url=url, rev=rev) as repo:
            # Note: the state has to be replaced, because when getting a DVC
            # dependency on CIFS or NFS filesystems the sqlite-based state
            # is unable to obtain a lock.
            repo.state = StateNoop()

            # Prefer any link type that avoids duplicating data.
            #
            # Symlinks are excluded: the cache is removed afterwards, so the
            # data would have to be copied first anyway, and copying right
            # away is simpler.
            #
            # A hypothetical "move" link type is excluded too, because the
            # same cache file may be used several times in a directory.
            repo.cache.local.cache_types = ["reflink", "hardlink", "copy"]

            output, output_error = None, None
            try:
                output = repo.find_out_by_relpath(path)
            except OutputNotFoundError as ex:
                output_error = ex

            is_git_file = output_error and not os.path.isabs(path)
            is_not_cached = output and not output.use_cache
            needs_git_copy = is_git_file or is_not_cached
            if needs_git_copy:
                _copy_git_file(repo, path, out, url)
                return

            if output_error:
                raise OutputNotFoundError(path)

            with repo.state:
                repo.cloud.pull(output.get_used_cache())

            destination = os.path.abspath(out)
            output.path_info = PathInfo(destination)

            with output.repo.state:
                output.checkout()

    except NotDvcRepoError:
        raise UrlNotDvcRepoError(url)
    finally:
        remove(scratch_dir)
def get(url, path, out=None, rev=None, jobs=None):
    """Download ``path`` from the repository at ``url`` to ``out``.

    Args:
        url: repository location, passed to ``external_repo``.
        path: path to fetch. An absolute path is served through a
            ``DataFileSystem`` created with ``workspace="local"``; any other
            path goes through ``repo.dvcfs``.
        out: destination, resolved with ``resolve_output(path, out)``.
        rev: passed to ``external_repo``.
        jobs: passed to the filesystem's ``get`` as ``batch_size``.

    Raises:
        GetDVCFileError: if ``is_valid_filename`` accepts the destination.

    The scratch cache directory created next to the destination is removed
    before returning, whether or not the download succeeded.
    """
    import shortuuid

    from dvc.dvcfile import is_valid_filename
    from dvc.external_repo import external_repo
    from dvc.fs.callbacks import Callback

    out = resolve_output(path, out)
    if is_valid_filename(out):
        raise GetDVCFileError()

    # The scratch cache is created beside the output so both are on the same
    # filesystem and reflink/hardlink can be used.
    # tempfile.TemporaryDirectory is not used because it would create a
    # symlink to tmpfs, which defeats the purpose and does not work with
    # reflink/hardlink.
    out_dir = os.path.dirname(os.path.abspath(out))
    scratch_dir = os.path.join(out_dir, "." + str(shortuuid.uuid()))

    # Prefer any link type that avoids duplicating data.
    #
    # Symlinks are excluded: the cache is removed afterwards, so the data
    # would have to be copied first anyway, and copying right away is
    # simpler.
    #
    # A hypothetical "move" link type is excluded too, because the same cache
    # file may be used several times in a directory.
    cache_types = ["reflink", "hardlink", "copy"]

    try:
        with external_repo(
            url=url,
            rev=rev,
            cache_dir=scratch_dir,
            cache_types=cache_types,
        ) as repo:
            if not os.path.isabs(path):
                fs = repo.dvcfs
                fs_path = fs.from_os_path(path)
            else:
                from dvc.fs.data import DataFileSystem

                fs = DataFileSystem(repo=repo, workspace="local")
                fs_path = path

            progress = Callback.as_tqdm_callback(
                desc=f"Downloading {fs.path.name(path)}",
                unit="files",
            )
            with progress as cb:
                fs.get(
                    fs_path,
                    os.path.abspath(out),
                    batch_size=jobs,
                    callback=cb,
                )
    finally:
        remove(scratch_dir)
def get_url(url, out=None):
    """Download ``url`` to ``out``.

    Args:
        url: source location. When it exists as a local path it is converted
            to an absolute path first.
        out: destination; resolved through ``resolve_output(url, out)`` and
            made absolute.

    Raises:
        dep.DoesNotExistError: if the dependency loaded for ``url`` reports
            that it does not exist. Checking up front gives the caller a
            clear error before the download is attempted.
    """
    out = resolve_output(url, out)

    if os.path.exists(url):
        url = os.path.abspath(url)

    # Normalize the destination to an absolute path.
    out = os.path.abspath(out)

    # ``loads_from`` returns a sequence; exactly one item is expected for the
    # single path passed in, which the unpacking enforces.
    (dep,) = dependency.loads_from(None, [url])
    (out,) = output.loads_from(None, [out], use_cache=False)

    if not dep.exists:
        raise dep.DoesNotExistError(dep)

    dep.download(out)
def get_url(url, out=None):
    """Download ``url`` to ``out``.

    The destination is resolved with ``resolve_output(url, out)`` and made
    absolute. A ``url`` that exists as a local path is made absolute as well.

    Raises:
        dep.DoesNotExistError: if the dependency loaded for ``url`` reports
            that it does not exist.
    """
    destination = resolve_output(url, out)

    if os.path.exists(url):
        url = os.path.abspath(url)
    destination = os.path.abspath(destination)

    # Each loader is given a single path; the one-element unpacking fails if
    # anything other than exactly one object comes back.
    [source] = dependency.loads_from(None, [url])
    [target] = output.loads_from(None, [destination], use_cache=False)

    if not source.exists:
        raise source.DoesNotExistError(source)

    source.download(target)
def get_url(url, out=None, jobs=None):
    """Download ``url`` to ``out``.

    Args:
        url: source location; made absolute when it exists as a local path.
        out: destination; resolved with ``resolve_output(url, out)`` and made
            absolute.
        jobs: forwarded to the dependency's ``download``.

    Raises:
        dep.DoesNotExistError: if the dependency loaded for ``url`` reports
            that it does not exist.
    """
    import dvc.dependency as dependency
    import dvc.output as output
    from dvc.utils import resolve_output

    destination = resolve_output(url, out)

    if os.path.exists(url):
        url = os.path.abspath(url)
    destination = os.path.abspath(destination)

    # Each loader is given a single path; the one-element unpacking fails if
    # anything other than exactly one object comes back.
    [source] = dependency.loads_from(None, [url])
    [target] = output.loads_from(None, [destination], use_cache=False)

    if not source.exists:
        raise source.DoesNotExistError(source)

    source.download(target, jobs=jobs)
def get(url, path, out=None, rev=None):
    """Download ``path`` from the repository at ``url`` to ``out``.

    If an output is found for ``path`` and it uses the cache, it is obtained
    through ``_get_cached``. Otherwise a non-absolute ``path`` is copied from
    the repository's root directory with ``_copy``.

    Raises:
        GetDVCFileError: if the resolved destination is a valid stage
            filename.
        PathMissingError: if ``OutputNotFoundError`` or ``FileNotFoundError``
            is raised, which includes an absolute ``path`` that has no
            cached output.
        UrlNotDvcRepoError: if ``NotDvcRepoError`` is raised.

    The scratch cache directory created next to the destination is removed
    in every case.
    """
    out = resolve_output(path, out)
    if Stage.is_valid_filename(out):
        raise GetDVCFileError()

    # The scratch cache is created beside the output so both are on the same
    # filesystem and reflink/hardlink can be used.
    # tempfile.TemporaryDirectory is not used because it would create a
    # symlink to tmpfs, which defeats the purpose and does not work with
    # reflink/hardlink.
    out_dir = os.path.dirname(os.path.abspath(out))
    scratch_dir = os.path.join(out_dir, "." + str(shortuuid.uuid()))

    try:
        with external_repo(cache_dir=scratch_dir, url=url, rev=rev) as repo:
            # Prefer any link type that avoids duplicating data.
            #
            # Symlinks are excluded: the cache is removed afterwards, so the
            # data would have to be copied first anyway, and copying right
            # away is simpler.
            #
            # A hypothetical "move" link type is excluded too, because the
            # same cache file may be used several times in a directory.
            repo.cache.local.cache_types = ["reflink", "hardlink", "copy"]

            try:
                output = repo.find_out_by_relpath(path)
            except OutputNotFoundError:
                output = None

            if output and output.use_cache:
                _get_cached(repo, output, out)
            elif os.path.isabs(path):
                # Either an uncached out with absolute path or a user error
                raise FileNotFoundError
            else:
                source = os.path.join(repo.root_dir, path)
                _copy(source, out)

    except (OutputNotFoundError, FileNotFoundError):
        raise PathMissingError(path, url)
    except NotDvcRepoError:
        raise UrlNotDvcRepoError(url)
    finally:
        remove(scratch_dir)
def get(url, path, out=None, rev=None):
    """Download ``path`` from the repository at ``url`` to ``out``.

    The output for ``path`` is looked up first; if it uses the cache it is
    obtained through ``_get_cached`` and the function returns. In all other
    cases (non-cached output, ``NotDvcRepoError`` or
    ``NoOutputInExternalRepoError``) a non-absolute ``path`` is copied out of
    the directory returned by ``cached_clone``.

    Raises:
        GetDVCFileError: if the resolved destination is a valid stage
            filename.
        PathMissingError: if ``OutputNotFoundError`` or ``FileNotFoundError``
            is raised, which includes an absolute ``path`` that was not
            served from the cache.

    The scratch cache directory created next to the destination is removed
    in every case.
    """
    out = resolve_output(path, out)
    if Stage.is_valid_filename(out):
        raise GetDVCFileError()

    # The scratch cache is created beside the output so both are on the same
    # filesystem and reflink/hardlink can be used.
    # tempfile.TemporaryDirectory is not used because it would create a
    # symlink to tmpfs, which defeats the purpose and does not work with
    # reflink/hardlink.
    out_dir = os.path.dirname(os.path.abspath(out))
    scratch_dir = os.path.join(out_dir, "." + str(shortuuid.uuid()))

    try:
        try:
            with external_repo(
                cache_dir=scratch_dir, url=url, rev=rev
            ) as repo:
                # Prefer any link type that avoids duplicating data.
                #
                # Symlinks are excluded: the cache is removed afterwards, so
                # the data would have to be copied first anyway, and copying
                # right away is simpler.
                #
                # A hypothetical "move" link type is excluded too, because
                # the same cache file may be used several times in a
                # directory.
                repo.cache.local.cache_types = ["reflink", "hardlink", "copy"]

                found = repo.find_out_by_relpath(path)
                if found.use_cache:
                    _get_cached(repo, found, out)
                    return
                # The output is not cached: continue below and try to copy
                # it from git.
        except (NotDvcRepoError, NoOutputInExternalRepoError):
            # Either this is not a DVC repository or, possibly, the path is
            # not tracked by DVC. Continue below and try to copy from git.
            pass

        if os.path.isabs(path):
            raise FileNotFoundError

        clone_dir = cached_clone(url, rev=rev)
        source = os.path.join(clone_dir, path)
        fs_copy(source, out)
    except (OutputNotFoundError, FileNotFoundError):
        raise PathMissingError(path, url)
    finally:
        remove(scratch_dir)
def imp_url(self, url, out=None, fname=None, erepo=None, frozen=True):
    """Create, run and dump a stage that imports ``url`` as ``out``.

    Args:
        url: source to import; the stage's single dependency.
        out: destination, resolved via ``resolve_output`` and
            ``resolve_paths``.
        fname: stage file path; the resolved path is used when falsy.
        erepo: forwarded to ``create_stage``. When it is not ``None`` the
            URL is never rewritten relative to the working directory.
        frozen: value assigned to ``stage.frozen`` before dumping.

    Returns:
        The stage, or ``None`` when ``create_stage`` returns ``None``.

    Raises:
        OutputDuplicationError: re-raised from the graph check with this
            stage removed from the reported stages.
    """
    from dvc.dvcfile import Dvcfile
    from dvc.stage import Stage, create_stage

    resolved_out = resolve_output(url, out)
    path, wdir, out_path = resolve_paths(self, resolved_out)

    # NOTE: the user may be importing something that lives inside their own
    # repository; in that case the URL is kept relative to ``wdir``.
    imports_own_file = (
        erepo is None
        and os.path.exists(url)
        and path_isin(os.path.abspath(url), self.root_dir)
    )
    if imports_own_file:
        url = relpath(url, wdir)

    stage = create_stage(
        Stage,
        self,
        fname or path,
        wdir=wdir,
        deps=[url],
        outs=[out_path],
        erepo=erepo,
    )
    if stage is None:
        return None

    dvcfile = Dvcfile(self, stage.path)
    dvcfile.remove_with_prompt(force=True)

    try:
        self.check_modified_graph([stage])
    except OutputDuplicationError as exc:
        raise OutputDuplicationError(exc.output, set(exc.stages) - {stage})

    stage.run()
    stage.frozen = frozen

    dvcfile.dump(stage)
    return stage
def imp_url(
    self,
    url,
    out=None,
    fname=None,
    erepo=None,
    frozen=True,
    no_exec=False,
    remote=None,
    to_remote=False,
    desc=None,
    jobs=None,
):
    """Create a stage that imports ``url`` and write its stage file.

    Args:
        url: source to import; the stage's single dependency.
        out: destination, resolved via ``resolve_output`` and
            ``resolve_paths``.
        fname: stage file path; the resolved path is used when falsy.
        erepo: forwarded to ``create_stage``. When it is not ``None`` the
            URL is never rewritten relative to the working directory.
        frozen: value assigned to ``stage.frozen`` before dumping.
        no_exec: when truthy, ``stage.ignore_outs()`` is called and nothing
            is run or transferred.
        remote: passed to ``self.cloud.get_remote_odb`` in the ``to_remote``
            branch.
        to_remote: when truthy (and ``no_exec`` is falsy), the first output
            transfers ``url`` into the remote object database, then the
            dependencies are saved and the stage md5 is recomputed.
        desc: when truthy, stored as ``desc`` on the stage's first output.
        jobs: forwarded to ``transfer`` or ``stage.run``.

    Returns:
        The stage.

    Raises:
        InvalidArgumentError: if ``to_remote`` is combined with ``no_exec``,
            or ``remote`` is given without ``to_remote``.
        OutputDuplicationError: re-raised from the graph check with this
            stage removed from the reported stages.
    """
    from dvc.dvcfile import Dvcfile
    from dvc.stage import Stage, create_stage, restore_meta

    resolved_out = resolve_output(url, out)
    path, wdir, out_path = resolve_paths(
        self,
        resolved_out,
        always_local=to_remote and not resolved_out,
    )

    if to_remote and no_exec:
        raise InvalidArgumentError(
            "--no-exec can't be combined with --to-remote"
        )
    if not to_remote and remote:
        raise InvalidArgumentError(
            "--remote can't be used without --to-remote"
        )

    # NOTE: the user may be importing something that lives inside their own
    # repository; in that case the URL is kept relative to ``wdir``.
    imports_own_file = (
        erepo is None
        and os.path.exists(url)
        and path_isin(os.path.abspath(url), self.root_dir)
    )
    if imports_own_file:
        url = relpath(url, wdir)

    stage = create_stage(
        Stage,
        self,
        fname or path,
        wdir=wdir,
        deps=[url],
        outs=[out_path],
        erepo=erepo,
    )
    restore_meta(stage)

    if desc:
        stage.outs[0].desc = desc

    dvcfile = Dvcfile(self, stage.path)
    dvcfile.remove()

    try:
        self.index.add(stage).check_graph()
    except OutputDuplicationError as exc:
        raise OutputDuplicationError(exc.output, set(exc.stages) - {stage})

    if no_exec:
        stage.ignore_outs()
    elif to_remote:
        remote_odb = self.cloud.get_remote_odb(remote, "import-url")
        stage.outs[0].transfer(url, odb=remote_odb, jobs=jobs)
        stage.save_deps()
        stage.md5 = stage.compute_md5()
    else:
        stage.run(jobs=jobs)

    stage.frozen = frozen
    dvcfile.dump(stage)
    return stage
def test_resolve_output(inp, out, is_dir, expected, mocker):
    """Check ``resolve_output(inp, out)`` with ``os.path.isdir`` patched.

    ``os.path.isdir`` is replaced so that it always returns ``is_dir``; the
    value returned by ``resolve_output`` must equal ``expected``.
    """
    mocker.patch("os.path.isdir", return_value=is_dir)

    assert resolve_output(inp, out) == expected