def _resume_checkpoint(self, *args, resume_rev: Optional[str] = None, **kwargs):
    """Resume an existing (checkpoint) experiment.

    The experiment branch for ``resume_rev`` is looked up together with its
    baseline, and the run is then queued through ``_stash_exp``.

    Args:
        resume_rev: revision of the checkpoint to resume from (required).
        *args, **kwargs: forwarded to ``_stash_exp``; a truthy
            ``kwargs["params"]`` allows multiple matching branches.

    Returns:
        Whatever ``_stash_exp`` returns.

    Raises:
        DvcException: no branch was found for ``resume_rev``.
        MultipleBranchError: re-raised when the matching branches do not
            share exactly one baseline.
    """
    assert resume_rev
    branch: Optional[str] = None
    try:
        branch = self.get_branch_by_rev(
            resume_rev, allow_multiple=bool(kwargs.get("params", None))
        )
        if not branch:
            raise DvcException(
                "Could not find checkpoint experiment " f"'{resume_rev[:7]}'"
            )
        baseline_rev = self._get_baseline(branch)
    except MultipleBranchError as exc:
        # Several branches matched: only acceptable when they all agree on
        # one baseline. `branch` stays None in that case.
        candidates = {
            ref.baseline_sha for ref in exc.ref_infos if ref.baseline_sha
        }
        if len(candidates) != 1:
            raise
        baseline_rev = candidates.pop()

    logger.debug(
        "Checkpoint run from '%s' with baseline '%s'",
        resume_rev[:7],
        baseline_rev,
    )
    return self._stash_exp(
        *args,
        resume_rev=resume_rev,
        baseline_rev=baseline_rev,
        branch=branch,
        **kwargs,
    )
def move(self, from_path, to_path):
    """Move the output ``from_path`` to ``to_path`` and update its stage file.

    Every stage whose outputs contain ``from_path`` is updated. When the
    stage file is named after the moved output, the stage file is renamed
    to follow the new output name.

    Raises:
        MoveNotDataSourceError: the owning stage is not a data source.
        DvcException: no stage has ``from_path`` as an output.
    """
    import dvc.output as Output

    from_out = Output.loads_from(Stage(self, cwd=os.curdir), [from_path])[0]

    found = False
    self._files_to_git_add = []
    suffix = Stage.STAGE_FILE_SUFFIX
    with self.state:
        for stage in self.stages():
            for out in stage.outs:
                if out.path != from_out.path:
                    continue

                if not stage.is_data_source:
                    raise MoveNotDataSourceError(stage.relpath)

                found = True
                to_out = Output.loads_from(
                    out.stage, [to_path], out.cache, out.metric
                )[0]
                out.move(to_out)

                # Strip the stage-file suffix as a suffix. str.rstrip()
                # treats its argument as a set of characters and would
                # also eat trailing characters of the base name itself.
                stage_base = os.path.basename(stage.path)
                if suffix and stage_base.endswith(suffix):
                    stage_base = stage_base[: -len(suffix)]

                stage_dir = os.path.dirname(stage.path)
                from_base = os.path.basename(from_path)
                to_base = os.path.basename(to_path)
                if stage_base == from_base:
                    # Stage file is named after the output: rename it too.
                    os.unlink(stage.path)
                    path = to_base + suffix
                    stage.path = os.path.join(stage_dir, path)

                stage.dump()

    self._remind_to_git_add()

    if not found:
        msg = 'Unable to find dvcfile with output \'{}\''
        raise DvcException(msg.format(from_path))
def new(
    self,
    *args,
    checkpoint: Optional[bool] = False,
    checkpoint_continue: Optional[str] = None,
    branch: Optional[str] = None,
    **kwargs,
):
    """Create a new experiment.

    Experiment will be reproduced and checked out into the user's
    workspace.

    Args:
        checkpoint: passed to ``_stash_exp`` as ``allow_unchanged``.
        checkpoint_continue: revision of a checkpoint experiment to
            continue; its containing branch replaces ``branch`` and
            ``apply_workspace`` is forced to False.
        branch: experiment branch whose tip is used as the baseline.

    Returns:
        The stash revision produced by ``_stash_exp``.

    Raises:
        DvcException: ``checkpoint_continue`` is not on any branch.
        UnchangedExperimentError: re-raised from ``_stash_exp``.
    """
    if checkpoint_continue:
        continue_rev = self.scm.resolve_rev(checkpoint_continue)
        branch = self._get_branch_containing(continue_rev)
        if not branch:
            raise DvcException(
                "Could not find checkpoint experiment "
                f"'{checkpoint_continue}'"
            )
        logger.debug(
            "Continuing checkpoint experiment '%s'", checkpoint_continue
        )
        kwargs["apply_workspace"] = False

    if not branch:
        rev = self.repo.scm.get_rev()
    else:
        rev = self.scm.resolve_rev(branch)
        logger.debug(
            "Using '%s' (tip of branch '%s') as baseline", rev, branch
        )
    self._scm_checkout(rev)

    try:
        stash_rev = self._stash_exp(
            *args, branch=branch, allow_unchanged=checkpoint, **kwargs
        )
    except UnchangedExperimentError as exc:
        logger.info("Reproducing existing experiment '%s'.", rev[:7])
        raise exc

    logger.debug(
        "Stashed experiment '%s' for future execution.", stash_rev[:7]
    )
    return stash_rev
def resolve_paths(repo, out, always_local=False):
    """Work out the dvcfile path, working directory and output for ``out``.

    Returns:
        Tuple ``(path, wdir, out)``: ``path`` is the dvcfile location
        (``wdir`` joined with the output base name plus the dvcfile
        suffix), ``wdir`` the chosen working directory and ``out`` the
        output as it should be recorded.

    Raises:
        DvcException: ``out`` lies inside a symlinked directory of the
            repo, or is itself a symlinked directory.
    """
    from urllib.parse import urlparse

    from dvc.fs.local import localfs

    from ..dvcfile import DVC_FILE_SUFFIX
    from ..exceptions import DvcException
    from ..system import System
    from .fs import contains_symlink_up_to

    abspath = os.path.abspath(out)
    parent_dir = os.path.dirname(abspath)
    base = os.path.basename(os.path.normpath(out))

    scheme = urlparse(out).scheme
    if os.name == "nt":
        drive_letter = os.path.splitdrive(abspath)[0][0].lower()
        if scheme == drive_letter:
            # urlparse interprets windows drive letters as URL scheme
            scheme = ""

    if scheme or not localfs.path.isin_or_eq(abspath, repo.root_dir):
        # URL-like outputs and paths outside the repo: use the cwd.
        wdir = os.getcwd()
    elif contains_symlink_up_to(parent_dir, repo.root_dir) or (
        os.path.isdir(abspath) and System.is_symlink(abspath)
    ):
        link = format_link(
            "https://dvc.org/doc/user-guide/troubleshooting#add-symlink"
        )
        msg = (
            "Cannot add files inside symlinked directories to DVC. "
            "See {} for more information."
        ).format(link)
        raise DvcException(msg)
    else:
        wdir = parent_dir
        out = base

    if always_local:
        out = base

    dvcfile_path = os.path.join(wdir, base + DVC_FILE_SUFFIX)
    return (dvcfile_path, wdir, out)
def checkout_exp(self, rev, **kwargs):
    """Checkout an experiment to the user's workspace.

    The diff between the experiment commit and its baseline is written to
    a temporary patch file and applied in reverse to the workspace. Any
    dirty workspace state is stashed around the patching. A DVC checkout
    runs afterwards when the patch was non-empty.

    Args:
        rev: experiment revision to check out.
        **kwargs: forwarded to the DVC checkout; for checkpoint branches
            ``allow_missing`` and ``quiet`` are forced to True.

    Raises:
        DvcException: the patch could not be applied.
    """
    from git.exc import GitCommandError

    from dvc.repo.checkout import checkout as dvc_checkout

    baseline_rev = self._check_baseline(rev)
    self._scm_checkout(rev)

    branch = self._get_branch_containing(rev)
    # A revision may not be on any experiment branch; avoid matching None.
    m = self.BRANCH_RE.match(branch) if branch else None
    if m and m.group("checkpoint"):
        kwargs.update({"allow_missing": True, "quiet": True})

    # Only the name is needed: close the handle right away so it is not
    # leaked and git can write to the file on every platform.
    with tempfile.NamedTemporaryFile(delete=False) as fobj:
        tmp = fobj.name
    try:
        self.scm.repo.head.commit.diff(
            baseline_rev, patch=True, full_index=True, binary=True, output=tmp
        )
    except Exception:
        # Don't leave the patch file behind when the diff itself fails.
        remove(tmp)
        raise

    dirty = self.repo.scm.is_dirty(untracked_files=True)
    if dirty:
        logger.debug("Stashing workspace changes.")
        self.repo.scm.repo.git.stash("push", "--include-untracked")

    try:
        if os.path.getsize(tmp):
            logger.debug("Patching local workspace")
            self.repo.scm.repo.git.apply(tmp, reverse=True)
            need_checkout = True
        else:
            need_checkout = False
    except GitCommandError as exc:
        raise DvcException("failed to apply experiment changes.") from exc
    finally:
        remove(tmp)
        if dirty:
            self._unstash_workspace()

    if need_checkout:
        dvc_checkout(self.repo, **kwargs)
def _exec(self, ssh, cmd):
    """Run ``cmd`` through ``ssh`` and return its decoded stdout.

    stdout and stderr are drained while the command runs, polling the
    channel with ``select`` using ``self.timeout``.

    Returns:
        The command's stdout decoded as UTF-8.

    Raises:
        DvcException: the command exited with a non-zero status; the
            message carries the captured stderr.
    """
    import select

    stdin, stdout, stderr = ssh.exec_command(cmd)
    channel = stdout.channel

    # Nothing is sent to the command: close our writing side up front.
    stdin.close()
    channel.shutdown_write()

    stdout_chunks = []
    stderr_chunks = []
    while (not channel.closed
           or channel.recv_ready()
           or channel.recv_stderr_ready()):
        got_chunk = False
        readq, _, _ = select.select([stdout.channel], [], [], self.timeout)
        for c in readq:
            if c.recv_ready():
                stdout_chunks.append(stdout.channel.recv(len(c.in_buffer)))
                got_chunk = True

            if c.recv_stderr_ready():
                stderr_len = len(c.in_stderr_buffer)
                s = stderr.channel.recv_stderr(stderr_len)
                stderr_chunks.append(s)
                got_chunk = True

        # Command finished and both streams are drained: stop polling.
        if not got_chunk \
           and stdout.channel.exit_status_ready() \
           and not stderr.channel.recv_stderr_ready() \
           and not stdout.channel.recv_ready():
            stdout.channel.shutdown_read()
            stdout.channel.close()
            break

    stdout.close()
    stderr.close()

    if stdout.channel.recv_exit_status() != 0:
        # Chunks are bytes, exactly like stdout below: join as bytes and
        # decode once instead of joining them with a str separator.
        err = b''.join(stderr_chunks).decode('utf-8', 'replace')
        msg = 'SSH command \'{}\' failed: {}'.format(cmd, err)
        raise DvcException(msg)

    return b''.join(stdout_chunks).decode('utf-8')
def reflink(source, link_name):
    """Create a reflink of ``source`` at ``link_name``.

    Dispatches to the ``System`` helper for the current platform
    (Windows, Darwin or Linux).

    Raises:
        DvcException: the platform is not one of the above, the helper
            raised ``IOError``, or it returned a non-zero status.
    """
    import platform

    source, link_name = fspath(source), fspath(link_name)

    helper_names = {
        "Windows": "_reflink_windows",
        "Darwin": "_reflink_darwin",
        "Linux": "_reflink_linux",
    }
    helper_name = helper_names.get(platform.system())

    status = -1
    if helper_name is not None:
        try:
            status = getattr(System, helper_name)(source, link_name)
        except IOError:
            status = -1

    if status != 0:
        raise DvcException("reflink is not supported")
def __init__(
    self, stage, path, info=None, remote=None, cache=True, metric=False
):
    """Initialise the output/dependency for ``path`` within ``stage``.

    ``cache`` and ``metric`` are ignored (forced to False) when the class
    is flagged ``IS_DEPENDENCY``.

    Raises:
        DvcException: caching is requested but the repo has no cache
            configured for this remote's scheme.
    """
    self.stage = stage
    self.repo = stage.repo
    self.url = path
    self.info = info
    self.remote = remote or self.REMOTE(self.repo, {})

    if self.IS_DEPENDENCY:
        self.use_cache = False
        self.metric = False
    else:
        self.use_cache = cache
        self.metric = metric

    if not self.use_cache:
        return

    scheme = self.REMOTE.scheme
    if getattr(self.repo.cache, scheme) is None:
        raise DvcException(
            "no cache location setup for '{}' outputs.".format(scheme)
        )
def test_nested_exceptions(self, caplog):
    """A chained exception is logged as "<outer>: <inner>" plus the traceback."""
    with caplog.at_level(logging.DEBUG, logger="dvc"):
        try:
            raise Exception("first")
        except Exception as exc:
            try:
                # Chain explicitly so the first error is the cause of the second.
                raise DvcException("second") from exc
            except DvcException:
                # Capture the traceback here, where the exception being
                # logged is the one currently handled.
                stack_trace = traceback.format_exc()
                logger.exception("message")

        # Expected layout: coloured header with both messages joined, then
        # the traceback framed by two 60-dash separator lines.
        expected = ("{red}ERROR{nc}: message - second: first\n"
                    "{red}{line}{nc}\n"
                    "{stack_trace}"
                    "{red}{line}{nc}\n".format(line="-" * 60,
                                               stack_trace=stack_trace,
                                               **colors))
        assert expected == formatter.format(caplog.records[0])
        # Both links of the chain must appear in the captured traceback.
        assert "Exception: first" in stack_trace
        assert "dvc.exceptions.DvcException: second" in stack_trace
def _filter_fields(data_points, filename, revision, fields=None, **kwargs):
    """Reduce every data point to the requested ``fields``.

    Args:
        data_points: iterable of mappings.
        filename, revision: only used in the error message.
        fields: set of keys to keep; when falsy, ``data_points`` is
            returned untouched.

    Returns:
        A new list of shallow copies holding only ``fields``, or the
        original ``data_points`` when no fields are requested.

    Raises:
        DvcException: a data point lacks at least one requested field.
    """
    if not fields:
        return data_points
    assert isinstance(fields, set)

    filtered = []
    for point in data_points:
        trimmed = copy(point)
        present = set(point.keys())
        if not fields <= present:
            raise DvcException(
                "Could not find fields: '{}' for '{}' at '{}'.".format(
                    ", ".join(fields), filename, revision))
        # Drop everything that was not asked for from the copy only.
        for unwanted in present - fields:
            del trimmed[unwanted]
        filtered.append(trimmed)
    return filtered
def run(self):
    """Destroy the DVC repo after confirmation.

    Unless ``self.args.force`` is set, the user is asked to confirm.

    Returns:
        0 on success, 1 when the user declined or destroying failed (the
        failure is logged).
    """
    question = (
        "This will destroy all information about your pipelines,"
        " all data files, as well as cache in .dvc/cache."
        "\n"
        "Are you sure you want to continue?"
    )
    try:
        confirmed = self.args.force or prompt.confirm(question)
        if not confirmed:
            raise DvcException(
                "cannot destroy without a confirmation from the user."
                " Use `-f` to force."
            )
        self.repo.destroy()
    except Exception:  # noqa, pylint: disable=broad-except
        logger.exception("failed to destroy DVC")
        return 1
    return 0
def reproduce(self, interactive=False, **kwargs):
    """Re-run this stage if it changed, or if ``force`` is in ``kwargs``.

    Args:
        interactive: ask for confirmation before running.
        **kwargs: forwarded to ``run``; ``force`` skips the change check.

    Returns:
        ``self`` when the stage was run, ``None`` when it was skipped.

    Raises:
        DvcException: the user declined the interactive confirmation.
    """
    forced = kwargs.get("force", False)
    if not forced and not self.changed():
        if isinstance(self, PipelineStage) or not self.is_data_source:
            logger.info("Stage '%s' didn't change, skipping", self.addressing)
        else:
            logger.info("'%s' didn't change, skipping", self.addressing)
        return None

    question = ("Going to reproduce {stage}. "
                "Are you sure you want to continue?".format(stage=self))
    if interactive:
        if not prompt.confirm(question):
            raise DvcException("reproduction aborted by the user")

    self.run(**kwargs)

    logger.debug(f"{self} was reproduced")
    return self
def link(self, cache, path):
    """Link the cache file ``cache`` into the workspace at ``path``.

    Cache types from ``self.cache_types`` are tried in order. A type that
    fails is removed from the list, so it is not retried on later calls.

    Raises:
        DvcException: every configured cache type failed.
    """
    assert os.path.isfile(cache)

    dname = os.path.dirname(path)
    # A bare file name has an empty dirname and os.makedirs('') raises;
    # exist_ok covers another process creating the directory meanwhile.
    if dname and not os.path.exists(dname):
        os.makedirs(dname, exist_ok=True)

    i = len(self.cache_types)
    while i > 0:
        try:
            self.CACHE_TYPE_MAP[self.cache_types[0]](cache, path)
            return
        except Exception:
            msg = 'Cache type \'{}\' is not supported'.format(
                self.cache_types[0])
            Logger.debug(msg)
            # Forget the unsupported type and fall through to the next.
            del self.cache_types[0]
            i -= 1

    raise DvcException('No possible cache types left to try out.')
def md5(self, path):
    """Return the MD5 hex digest of the remote file ``path``.

    Use different md5 commands depending on the OS:

     - Linux's `md5sum` prints "<digest>  <file>", so the first token is
       the digest.
     - Darwin's `md5` prints BSD-style output, so the last token is the
       digest.

    Example (BSD style):
          MD5 (foo.txt) = f3d220a856b52aabbf294351e8a24300

    Raises:
        DvcException: ``self.uname`` is neither Linux nor Darwin.
    """
    import shlex

    # The path is interpolated into a shell command line: quote it so
    # spaces and shell metacharacters stay part of the file name.
    quoted = shlex.quote(path)

    if self.uname == "Linux":
        md5 = self.execute("md5sum " + quoted).split()[0]
    elif self.uname == "Darwin":
        md5 = self.execute("md5 " + quoted).split()[-1]
    else:
        raise DvcException("'{}' is not supported as a SSH remote".format(
            self.uname))

    assert len(md5) == 32
    return md5
def git_object_by_path(self, path):
    """Return the git object at ``path`` in the tree of ``self.rev``.

    Returns:
        The root tree when ``path`` resolves to the working dir itself,
        the object found by walking the path components, or ``None`` when
        some component is missing.

    Raises:
        DvcException: ``self.rev`` does not exist in the git repo.
    """
    import git

    rel = relpath(os.path.realpath(path), self.git.working_dir)
    # The path must not escape the git working directory.
    assert rel.split(os.sep, 1)[0] != ".."

    try:
        node = self.git.tree(self.rev)
    except git.exc.BadName as exc:
        raise DvcException("revision '{}' not found in git '{}'".format(
            self.rev, os.path.relpath(self.git.working_dir))) from exc

    if not rel or rel == ".":
        return node

    for component in rel.split(os.sep):
        if not self._is_tree_and_contains(node, component):
            # there is no tree for specified path
            return None
        node = node[component]
    return node
def link(self, src, link):
    """Link ``src`` to ``link`` using the first cache type that works.

    Only ``self.cache_type`` is tried when it is set, otherwise every
    entry of ``self.CACHE_TYPES`` in order. On success the link is
    recorded through ``self.link_state.update``.

    Raises:
        DvcException: the last candidate type failed (carrying that
            failure as ``cause``), or there were no types to try.
    """
    dname = os.path.dirname(link)
    # A bare file name has an empty dirname and os.makedirs('') raises;
    # exist_ok covers another process creating the directory meanwhile.
    if dname and not os.path.exists(dname):
        os.makedirs(dname, exist_ok=True)

    if self.cache_type is not None:
        types = [self.cache_type]
    else:
        types = self.CACHE_TYPES

    last = len(types) - 1
    for idx, typ in enumerate(types):
        try:
            self.CACHE_TYPE_MAP[typ](src, link)
            self.link_state.update(link)
            return
        except Exception as exc:
            msg = 'Cache type \'{}\' is not supported'.format(typ)
            Logger.debug(msg)
            # Compare positions, not values: a duplicate of the last
            # entry must not abort the loop early.
            if idx == last:
                raise DvcException(msg, cause=exc)

    # Only reachable with an empty type list; fail loudly instead of
    # returning without having linked anything.
    raise DvcException('No possible cache types left to try out.')
def get_dir_cache(self, **kwargs):
    """Load and return the directory object for this output's hash.

    When the object is missing or malformed in the odb, it is pulled via
    ``self.repo.cloud.pull`` first (``kwargs`` are forwarded).

    Returns:
        The loaded object, also stored on ``self.obj``; ``None`` when it
        still cannot be loaded.

    Raises:
        DvcException: the output does not have a directory checksum.
    """
    if not self.is_dir_checksum:
        raise DvcException("cannot get dir cache for file checksum")

    odb, hash_info = self.odb, self.hash_info
    try:
        objects.check(odb, odb.get(hash_info))
    except (FileNotFoundError, ObjectFormatError):
        named_cache = NamedCache.make("local", self.hash_info.value, str(self))
        self.repo.cloud.pull(named_cache, show_checksums=False, **kwargs)

    try:
        loaded = objects.load(self.odb, self.hash_info)
    except (FileNotFoundError, ObjectFormatError):
        loaded = None
    self.obj = loaded

    return self.obj
def link(self, cache, path):
    """Link the cache file ``cache`` into the workspace at ``path``.

    An empty cache file is materialised as a fresh empty file. Otherwise
    cache types from ``self.cache_types`` are tried in order. A type that
    raises ``DvcException`` is removed from the list, so it is not
    retried later. When ``self.protected`` is set the result is made
    read-only.

    Raises:
        DvcException: every configured cache type failed.
    """
    assert os.path.isfile(cache)

    dname = os.path.dirname(path)
    # A bare file name has an empty dirname and os.makedirs('') raises;
    # exist_ok covers another process creating the directory meanwhile.
    if dname and not os.path.exists(dname):
        os.makedirs(dname, exist_ok=True)

    # NOTE: just create an empty file for an empty cache
    if os.path.getsize(cache) == 0:
        open(path, 'w+').close()

        msg = "Created empty file: {} -> {}".format(
            os.path.relpath(cache),
            os.path.relpath(path),
        )
        Logger.debug(msg)
        return

    i = len(self.cache_types)
    while i > 0:
        try:
            self.CACHE_TYPE_MAP[self.cache_types[0]](cache, path)

            if self.protected:
                os.chmod(path, stat.S_IREAD | stat.S_IRGRP | stat.S_IROTH)

            msg = "Created {}'{}': {} -> {}".format(
                'protected ' if self.protected else '',
                self.cache_types[0],
                os.path.relpath(cache),
                os.path.relpath(path))
            Logger.debug(msg)
            return

        except DvcException as exc:
            msg = 'Cache type \'{}\' is not supported: {}'
            Logger.debug(msg.format(self.cache_types[0], str(exc)))
            # Forget the unsupported type and fall through to the next.
            del self.cache_types[0]
            i -= 1

    raise DvcException('No possible cache types left to try out.')
def __init__(self, **config):
    """Configure the GDrive filesystem from ``config``.

    Reads the remote URL and the ``gdrive_*`` options, validates them via
    ``_validate_config`` and works out where service-account and user
    credentials are stored.

    Raises:
        DvcException: the URL has no bucket part.
    """
    super().__init__(**config)

    self.path_info = self.PATH_CLS(config["url"])
    if not self.path_info.bucket:
        help_link = format_link("https://man.dvc.org/remote/add")
        raise DvcException(
            "Empty GDrive URL '{}'. Learn more at {}".format(
                config["url"], help_link
            )
        )

    self._bucket = self.path_info.bucket
    self._path = self.path_info.path

    option = config.get
    self._trash_only = option("gdrive_trash_only")
    self._use_service_account = option("gdrive_use_service_account")
    self._service_account_user_email = option(
        "gdrive_service_account_user_email"
    )
    self._service_account_json_file_path = option(
        "gdrive_service_account_json_file_path"
    )
    self._client_id = option("gdrive_client_id")
    self._client_secret = option("gdrive_client_secret")
    self._validate_config()

    tmp_dir = config["gdrive_credentials_tmp_dir"]
    assert tmp_dir

    self._gdrive_service_credentials_path = tmp_fname(
        os.path.join(tmp_dir, "")
    )

    # Credentials supplied through the environment go to a temp file name;
    # otherwise use the configured file or the default under tmp_dir.
    if os.getenv(GDriveFileSystem.GDRIVE_CREDENTIALS_DATA):
        user_credentials = tmp_fname(os.path.join(tmp_dir, ""))
    else:
        user_credentials = config.get(
            "gdrive_user_credentials_file",
            os.path.join(tmp_dir, self.DEFAULT_USER_CREDENTIALS_FILE),
        )
    self._gdrive_user_credentials_path = user_credentials
def _load(self):
    """Check the state database version and make sure its tables exist.

    The schema version is read from ``PRAGMA user_version``. A database
    older than ``self.VERSION`` has its tables dropped and recreated;
    finally ``user_version`` is set to ``self.VERSION``.

    Raises:
        DvcException: the database was written by a newer state version
            than this dvc supports.
    """
    from dvc import VERSION

    cmd = "PRAGMA user_version;"
    self.c.execute(cmd)
    ret = self.c.fetchall()
    assert len(ret) == 1
    assert len(ret[0]) == 1
    assert isinstance(ret[0][0], int)
    self.version = ret[0][0]

    if self.version > self.VERSION:
        msg = "You are using an old version '{}' of dvc that is using " \
              "state file version '{}' which is not compatible with the " \
              "state file version '{}' that is used in this project. " \
              "Please upgrade right now!"
        raise DvcException(msg.format(VERSION,
                                      self.VERSION,
                                      self.version))
    elif self.version < self.VERSION:
        msg = "State file version '{}' is too old. " \
              "Reformatting to the current version '{}'."
        self.project.logger.warn(msg.format(self.version, self.VERSION))
        # Drop everything; the tables are recreated just below.
        cmd = "DROP TABLE IF EXISTS {};"
        self.c.execute(cmd.format(self.STATE_TABLE))
        self.c.execute(cmd.format(self.STATE_INFO_TABLE))
        self.c.execute(cmd.format(self.LINK_STATE_TABLE))

    # Check that the state file is indeed a database
    cmd = "CREATE TABLE IF NOT EXISTS {} ({})"
    self.c.execute(cmd.format(self.STATE_TABLE,
                              self.STATE_TABLE_LAYOUT))
    self.c.execute(cmd.format(self.STATE_INFO_TABLE,
                              self.STATE_INFO_TABLE_LAYOUT))
    self.c.execute(cmd.format(self.LINK_STATE_TABLE,
                              self.LINK_STATE_TABLE_LAYOUT))

    # Seed the info table with a zero counter row when it is empty.
    cmd = "INSERT OR IGNORE INTO {} (count) SELECT 0 " \
          "WHERE NOT EXISTS (SELECT * FROM {})"
    self.c.execute(cmd.format(self.STATE_INFO_TABLE,
                              self.STATE_INFO_TABLE))

    cmd = "PRAGMA user_version = {};"
    self.c.execute(cmd.format(self.VERSION))
def reproduce(
    self, force=False, dry=False, interactive=False, no_commit=False
):
    """Re-run this stage when it changed or when ``force`` is set.

    Args:
        force: run even if nothing changed.
        dry, no_commit: forwarded to ``run``.
        interactive: ask for confirmation before running.

    Returns:
        ``self`` when the stage was run, ``None`` when it was skipped.

    Raises:
        DvcException: the user declined the interactive confirmation.
    """
    unchanged = not self.changed()
    if unchanged and not force:
        return None

    question = (
        "Going to reproduce '{stage}'. "
        "Are you sure you want to continue?".format(stage=self.relpath)
    )
    if interactive:
        if not prompt.confirm(question):
            raise DvcException("reproduction aborted by the user")

    logger.info("Reproducing '{stage}'".format(stage=self.relpath))
    self.run(dry=dry, no_commit=no_commit, force=force)
    logger.debug("'{stage}' was reproduced".format(stage=self.relpath))

    return self
def reproduce(self, force=False, dry=False, interactive=False):
    """Re-run this stage when it changed or when ``force`` is set.

    Args:
        force: run even if nothing changed.
        dry: forwarded to ``run``; also keeps existing outputs in place.
        interactive: ask for confirmation before touching anything.

    Returns:
        ``self`` when the stage was run, ``None`` when it was skipped.

    Raises:
        DvcException: the user declined the interactive confirmation.
    """
    if not self.changed() and not force:
        return None

    # Ask first: removing outputs is destructive and must not happen when
    # the user then aborts the reproduction.
    msg = "Going to reproduce '{stage}'. Are you sure you want to continue?".format(
        stage=self.relpath)
    if interactive and not prompt.confirm(msg):
        raise DvcException("reproduction aborted by the user")

    if (self.cmd or self.is_import) and not self.locked and not dry:
        # Removing outputs only if we actually have command to reproduce
        self.remove_outs(ignore_remove=False)

    logger.info("Reproducing '{stage}'".format(stage=self.relpath))

    self.run(dry=dry)

    logger.debug("'{stage}' was reproduced".format(stage=self.relpath))

    return self
def _is_outs_only(self, target):
    """Decide whether only the outputs of ``target`` should be removed.

    Returns:
        True when ``--purge`` was not given; False when purging is forced
        or confirmed by the user.

    Raises:
        DvcException: purge was requested but the user did not confirm.
    """
    options = self.args
    if not options.purge:
        return True
    if options.force:
        return False

    question = (
        u'Are you sure you want to remove {} with its outputs?'
        .format(target)
    )
    if not self.project.prompt.prompt(question):
        raise DvcException(
            u'Cannot purge without a confirmation from the user.'
            u" Use '-f' to force."
        )
    return False
def hadoop_fs(self, cmd, user=None):
    """Run ``hadoop fs -<cmd>`` in a shell and return its decoded stdout.

    Args:
        cmd: the ``hadoop fs`` sub-command with its arguments, without
            the leading dash.
        user: when given, exported as ``HADOOP_USER_NAME`` for the call.

    Raises:
        DvcException: the command exited with a non-zero status; the
            message carries the command and its stderr.
    """
    # NOTE(review): the command goes through a shell as a single string,
    # so `cmd` and `user` must already be safely quoted by the caller.
    cmd = 'hadoop fs -' + cmd
    if user:
        cmd = 'HADOOP_USER_NAME={} '.format(user) + cmd

    # NOTE: close_fds doesn't work with redirected stdin/stdout/stderr.
    # See https://github.com/iterative/dvc/issues/1197.
    close_fds = (os.name != 'nt')

    p = Popen(cmd,
              shell=True,
              close_fds=close_fds,
              executable=os.getenv('SHELL'),
              env=fix_env(os.environ),
              stdin=PIPE,
              stdout=PIPE,
              stderr=PIPE)
    out, err = p.communicate()
    if p.returncode != 0:
        # stderr arrives as bytes: decode it like stdout so the message
        # does not show a raw b'...' repr.
        err = err.decode('utf-8', 'replace')
        raise DvcException('HDFS command failed: {}: {}'.format(cmd, err))
    return out.decode('utf-8')
def remove(self, path_info):
    """Delete the Dropbox file addressed by ``path_info``.

    Raises:
        DvcException: the API reported a path lookup or path write error.
        dropbox.exceptions.ApiError: any other API error, re-raised.
    """
    import dropbox

    path = path_info_to_dropbox_path(path_info)
    logger.debug("Removing {0}".format(path))
    try:
        self.client.files_delete_v2(path)
    except dropbox.exceptions.ApiError as ex:
        path_problem = ex.error.is_path_lookup() or ex.error.is_path_write()
        if not path_problem:
            raise
        hint = (
            "No write access for '{}':\n\n"
            "1. Confirm the file exists and you can write it.\n"
            "2. Make sure that credentials in '{}'\n"
            "   are correct for this remote e.g. "
            "use the `dropbox_user_credentials_file` config\n"
            "   option if you use multiple Dropbox remotes with "
            "different email accounts.\n\nDetails"
        )
        raise DvcException(
            hint.format(path, FileCredProvider.DEFAULT_FILE)
        ) from ex
def __init__(
    self,
    selected: Optional[Iterable[str]],
    tmp_dir: StrPath,
    **kwargs,
) -> None:
    """Register the machine backend classes named in ``selected``.

    Args:
        selected: backend keys to enable; every key of ``self.DEFAULT``
            is used when falsy.
        tmp_dir: stored on the instance as ``self.tmp_dir``.
        **kwargs: stored on the instance as ``self.kwargs``.

    Raises:
        DvcException: a selected key has no class in ``self.DEFAULT``.
    """
    selected = selected or list(self.DEFAULT)

    self.backends: Dict[str, "BackendCls"] = {}
    for name in selected:
        backend_cls = self.DEFAULT.get(name)
        if backend_cls is None:
            raise DvcException(
                f"'dvc machine' backend '{name}' is missing required "
                "dependencies. Install them with:\n"
                f"\tpip install dvc[{name}]")
        self.backends[name] = backend_cls

    self.initialized: Dict[str, "BaseMachineBackend"] = {}
    self.tmp_dir = tmp_dir
    self.kwargs = kwargs
def reproduce(self, force=False, dry=False, interactive=False):
    """Re-run this stage when it changed or when ``force`` is set.

    Args:
        force: run even if nothing changed.
        dry: forwarded to ``run``; also keeps existing outputs in place.
        interactive: ask for confirmation before touching anything.

    Returns:
        ``self`` when the stage was run, ``None`` when it was skipped.

    Raises:
        DvcException: the user declined the interactive confirmation.
    """
    if not self.changed() and not force:
        return None

    # Ask first: removing outputs is destructive and must not happen when
    # the user then aborts the reproduction.
    msg = "Going to reproduce '{}'. Are you sure you want to continue?"
    msg = msg.format(self.relpath)
    if interactive and not prompt(msg):
        raise DvcException('Reproduction aborted by the user')

    if (self.cmd or self.is_import) and not self.locked and not dry:
        # Removing outputs only if we actually have command to reproduce
        self.remove_outs(ignore_remove=False)

    self.project.logger.info(u'Reproducing \'{}\''.format(self.relpath))

    self.run(dry=dry)

    msg = u'\'{}\' was reproduced'.format(self.relpath)
    self.project.logger.debug(msg)

    return self
def init_drive(self):
    """Read the Google Drive client settings from ``self.config``.

    Sets ``client_id``, ``client_secret`` and
    ``gdrive_user_credentials_path`` on the instance.

    Raises:
        DvcException: the client id or the client secret is not set.
    """
    settings = self.config
    self.client_id = settings.get(Config.SECTION_GDRIVE_CLIENT_ID, None)
    self.client_secret = settings.get(
        Config.SECTION_GDRIVE_CLIENT_SECRET, None
    )
    if not (self.client_id and self.client_secret):
        raise DvcException(
            "Please specify Google Drive's client id and "
            "secret in DVC's config. Learn more at "
            "https://man.dvc.org/remote/add."
        )

    # Credentials supplied through the environment go to a temp file name;
    # otherwise use the configured file or the default in the repo tmp dir.
    if os.getenv(RemoteGDrive.GDRIVE_USER_CREDENTIALS_DATA):
        credentials_path = tmp_fname(os.path.join(self.repo.tmp_dir, ""))
    else:
        credentials_path = self.config.get(
            Config.SECTION_GDRIVE_USER_CREDENTIALS_FILE,
            os.path.join(
                self.repo.tmp_dir, self.DEFAULT_USER_CREDENTIALS_FILE
            ),
        )
    self.gdrive_user_credentials_path = credentials_path
def get_dir_cache(self, **kwargs):
    """Return the directory object for this output's hash.

    When the object is missing from the odb it is pulled through
    ``self.repo.cloud.pull`` (using ``self.remote`` when set). An object
    already held in ``self.obj`` is returned without reloading.

    Returns:
        The cached or freshly loaded object; ``None`` when loading fails.

    Raises:
        DvcException: the output does not have a directory checksum.
    """
    if not self.is_dir_checksum:
        raise DvcException("cannot get dir cache for file checksum")

    entry = self.odb.get(self.hash_info)
    try:
        objects.check(self.odb, entry)
    except FileNotFoundError:
        if self.remote:
            kwargs["remote"] = self.remote
        self.repo.cloud.pull([entry.hash_info], **kwargs)

    if self.obj:
        return self.obj

    try:
        loaded = objects.load(self.odb, self.hash_info)
    except (FileNotFoundError, ObjectFormatError):
        loaded = None
    self.obj = loaded

    return self.obj
def _generate_download_url(self, path_info):
    """Return a temporary Dropbox download link for ``path_info``.

    Raises:
        DvcException: the API reported that the path was not found.
        dropbox.exceptions.ApiError: any other API error, re-raised.
    """
    import dropbox

    path = path_info_to_dropbox_path(path_info)
    try:
        # expires in 4 hrs
        link_result = self.client.files_get_temporary_link(path)
        return link_result.link
    except dropbox.exceptions.ApiError as ex:
        not_found = ex.error.is_path() and ex.error.get_path().is_not_found()
        if not not_found:
            raise
        hint = (
            "Path not found for '{}':\n\n"
            "1. Confirm the file exists and you can access it.\n"
            "2. Make sure that credentials in '{}'\n"
            "   are correct for this remote e.g. "
            "use the `dropbox_user_credentials_file` config\n"
            "   option if you use multiple Dropbox remotes with "
            "different email accounts.\n\nDetails"
        )
        raise DvcException(
            hint.format(path, FileCredProvider.DEFAULT_FILE)
        ) from ex