def _run(self):
    """Run the stage command in a shell and fail if it does not succeed.

    The command is executed with `shell=True` in `self.wdir`, using the
    environment returned by `fix_env(os.environ)`. While waiting for the
    command, SIGINT is ignored in this process (main thread only) and the
    previous handler is restored afterwards.

    Raises:
        StageCmdFailedError: if the command exits with a non-zero code.
    """
    self._check_missing_deps()

    # On Windows let Popen pick its default shell.
    executable = os.getenv("SHELL") if os.name != "nt" else None
    self._warn_if_fish(executable)

    # signal.signal() may only be called from the main thread.
    main_thread = isinstance(
        threading.current_thread(), threading._MainThread
    )

    p = subprocess.Popen(
        self.cmd,
        cwd=self.wdir,
        shell=True,
        env=fix_env(os.environ),
        executable=executable,
        close_fds=True,
    )

    old_handler = None
    try:
        # NOTE: start ignoring SIGINT only after the child has been
        # spawned. An ignored disposition set before the spawn would be
        # inherited by the child, which then could not be interrupted.
        if main_thread:
            old_handler = signal.signal(signal.SIGINT, signal.SIG_IGN)
        p.communicate()
    finally:
        # signal.signal() returns None when the previous handler was not
        # installed from Python; there is nothing to restore in that case.
        if old_handler is not None:
            signal.signal(signal.SIGINT, old_handler)

    if p.returncode != 0:
        raise StageCmdFailedError(self)
def daemon(args):
    """Launch a `dvc daemon` command in a detached process.

    Args:
        args (list): list of arguments to append to `dvc daemon` command.

    Raises:
        NotImplementedError: if `os.name` is neither "nt" nor "posix".
    """
    # When `is_binary()` is false the command goes through `-m dvc`.
    launcher = [sys.executable]
    if not is_binary():
        launcher += ["-m", "dvc"]
    cmd = launcher + ["daemon", "-q"] + args

    env = fix_env()
    # PYTHONPATH is set to the directory two levels above this source file.
    this_file = os.path.abspath(inspect.stack()[0][1])
    parent_dir = os.path.dirname(os.path.dirname(this_file))
    env[cast_bytes_py2("PYTHONPATH")] = cast_bytes_py2(parent_dir)

    logger.debug("Trying to spawn '{}' with env '{}'".format(cmd, env))

    platform = os.name
    if platform not in ("nt", "posix"):
        raise NotImplementedError

    if platform == "nt":
        _spawn_windows(cmd, env)
    else:
        _spawn_posix(cmd, env)

    logger.debug("Spawned '{}'".format(cmd))
def run(self, dry=False):
    """Run the stage, or only log what would be done when `dry` is set.

    Depending on the stage this verifies outputs (locked stage or data
    source), downloads the first dependency into the first output
    (import), or executes `self.cmd` in a shell. Unless `dry` is set, the
    stage is saved afterwards.

    Args:
        dry (bool): when true, only log the action and change nothing.

    Raises:
        StageCmdFailedError: if the command exits with a non-zero code.
    """
    log = self.project.logger.info

    if self.locked:
        log(u'Verifying outputs in locked stage \'{}\''.format(self.relpath))
        if not dry:
            self.check_missing_outputs()
    elif self.is_import:
        log(u'Importing \'{}\' -> \'{}\''.format(self.deps[0].path,
                                                 self.outs[0].path))
        if not dry:
            self.deps[0].download(self.outs[0].path_info)
    elif self.is_data_source:
        log(u'Verifying data sources in \'{}\''.format(self.relpath))
        if not dry:
            self.check_missing_outputs()
    else:
        log(u'Running command:\n\t{}'.format(self.cmd))
        if not dry:
            self._check_missing_deps()
            # SHELL is only honoured off Windows; on Windows Popen picks
            # its default shell.
            executable = os.getenv('SHELL') if os.name != 'nt' else None
            p = subprocess.Popen(self.cmd,
                                 cwd=self.cwd,
                                 shell=True,
                                 env=fix_env(os.environ),
                                 executable=executable)
            p.communicate()
            if p.returncode != 0:
                raise StageCmdFailedError(self)

    if not dry:
        self.save()
def __init__(self, root_dir=os.curdir, search_parent_directories=True):
    """Git class constructor.

    Requires `Repo` class from `git` module (from gitpython package).

    Args:
        root_dir: path handed to the parent constructor and to `git.Repo`.
        search_parent_directories: forwarded to `git.Repo` unchanged.

    Raises:
        SCMError: if `git.Repo` reports that `root_dir` is not a git
            repository.
    """
    super().__init__(root_dir)

    import git
    from git.exc import InvalidGitRepositoryError

    try:
        self.repo = git.Repo(
            root_dir, search_parent_directories=search_parent_directories
        )
    except InvalidGitRepositoryError:
        raise SCMError("{} is not a git repository".format(root_dir))

    # NOTE: fixing LD_LIBRARY_PATH for binary built by PyInstaller.
    # http://pyinstaller.readthedocs.io/en/stable/runtime-information.html
    fixed_env = fix_env(None)
    self.repo.git.update_environment(
        LD_LIBRARY_PATH=fixed_env.get("LD_LIBRARY_PATH")
    )

    self.ignored_paths = []
    self.files_to_track = set()
def _spawn_posix(cmd):
    """Run `cmd` from a double-forked child process.

    The calling process returns right after the first fork. The first
    child starts a new session, resets the umask, forks again and exits.
    The second child closes its standard streams, runs `cmd` to completion
    and exits.

    Args:
        cmd (list): command passed to `Popen` with `shell=False`.
    """
    # NOTE: using os._exit instead of sys.exit, because dvc built
    # with PyInstaller has trouble with SystemExit exception and throws
    # errors such as "[26338] Failed to execute script __main__"
    try:
        first_pid = os.fork()
    except OSError:
        logger.error("failed at first fork")
        os._exit(1)  # pylint: disable=protected-access

    if first_pid > 0:
        # Original caller: nothing more to do.
        return

    os.setsid()
    os.umask(0)

    try:
        second_pid = os.fork()
    except OSError:
        logger.error("failed at second fork")
        os._exit(1)  # pylint: disable=protected-access

    if second_pid > 0:
        os._exit(0)  # pylint: disable=protected-access

    for stream in (sys.stdin, sys.stdout, sys.stderr):
        stream.close()

    proc = Popen(cmd, env=fix_env(), close_fds=True, shell=False)
    proc.communicate()

    os._exit(0)  # pylint: disable=protected-access
def test_fix_env_pyenv(path, orig):
    """Check that `fix_env` maps PATH `path` to `orig` with pyenv vars set.

    `path` and `orig` are supplied by the caller (presumably a pytest
    parametrization -- not visible here).
    """
    env = {"PATH": path}
    env.update(
        {
            "PYENV_ROOT": "/pyenv",
            "PYENV_VERSION": "3.7.2",
            "PYENV_DIR": "/some/dir",
            "PYENV_HOOK_PATH": "/some/hook/path",
        }
    )

    fixed = fix_env(env)

    assert fixed["PATH"] == orig
def clone(
    url: str,
    to_path: str,
    rev: Optional[str] = None,
    shallow_branch: Optional[str] = None,
):
    """Clone `url` into `to_path` and optionally check out `rev`.

    Args:
        url: repository location handed to `git.Repo.clone_from`.
        to_path: destination directory of the clone.
        rev: revision to check out after cloning; skipped when falsy.
        shallow_branch: when given, only that branch is cloned, with
            depth 1.

    Raises:
        CloneError: if the clone command fails.
        RevError: if checking out `rev` fails.

    Nothing is returned; the `GitPythonBackend` created here is used only
    for the checkout.
    """
    import git

    ld_key = "LD_LIBRARY_PATH"

    env = fix_env(None)
    if is_binary():
        # In fix_env, we delete LD_LIBRARY_PATH key if it was empty before
        # PyInstaller modified it. GitPython, in git.Repo.clone_from, uses
        # env to update its own internal state. When there is no key in
        # env, this value is not updated and GitPython re-uses
        # LD_LIBRARY_PATH that has been set by PyInstaller.
        # See [1] for more info.
        # [1] https://github.com/gitpython-developers/GitPython/issues/924
        env.setdefault(ld_key, "")

    # Extra clone options only apply to shallow clones.
    shallow_opts = {}
    if shallow_branch is not None:
        shallow_opts = {"branch": shallow_branch, "depth": 1}

    try:
        if shallow_branch is not None and os.path.exists(url):
            # git disables --depth for local clones unless file:// url
            # scheme is used
            url = f"file://{url}"

        with TqdmGit(desc="Cloning", unit="obj") as pbar:
            tmp_repo = git.Repo.clone_from(
                url,
                to_path,
                env=env,  # needed before we can fix it in __init__
                no_single_branch=True,
                progress=pbar.update_git,
                **shallow_opts,
            )
        tmp_repo.close()
    except git.exc.GitCommandError as exc:  # pylint: disable=no-member
        raise CloneError(url, to_path) from exc

    # NOTE: using our wrapper to make sure that env is fixed in __init__
    repo = GitPythonBackend(to_path)

    if not rev:
        return

    try:
        repo.checkout(rev)
    except git.exc.GitCommandError as exc:  # pylint: disable=no-member
        raise RevError(
            "failed to access revision '{}' for repo '{}'".format(rev, url)
        ) from exc
def cmd_run(stage, *args, checkpoint_func=None, **kwargs):
    """Run every command of `stage`, one after another.

    Args:
        stage: object providing `wdir` and `cmd` (a string or a list of
            strings).
        *args: accepted but not used.
        checkpoint_func: when truthy, the checkpoint environment is added
            to the child environment; it is also handed to
            `checkpoint_monitor`.
        **kwargs: accepted but ignored; the `Popen` keyword arguments are
            rebuilt from scratch below.

    Raises:
        CheckpointKilledError: if a command returned non-zero and the
            `killed` event was set.
        StageCmdFailedError: if a command returned non-zero otherwise.
    """
    kwargs = {"cwd": stage.wdir, "env": fix_env(None), "close_fds": True}
    cmd = stage.cmd if isinstance(stage.cmd, list) else [stage.cmd]
    if checkpoint_func:
        # indicate that checkpoint cmd is being run inside DVC
        kwargs["env"].update(_checkpoint_env(stage))

    if os.name == "nt":
        kwargs["shell"] = True
        executable = None
    else:
        # NOTE: when you specify `shell=True`, `Popen` [1] will default to
        # `/bin/sh` on *nix and will add ["/bin/sh", "-c"] to your command.
        # But we actually want to run the same shell that we are running
        # from right now, which is usually determined by the `SHELL` env
        # var. So instead, we compose our command on our own, making sure
        # to include special flags to prevent shell from reading any
        # configs and modifying env, which may change the behavior or the
        # command we are running. See [2] for more info.
        #
        # [1] https://github.com/python/cpython/blob/3.7/Lib/subprocess.py
        #     #L1426
        # [2] https://github.com/iterative/dvc/issues/2506
        #     #issuecomment-535396799
        kwargs["shell"] = False
        executable = os.getenv("SHELL") or "/bin/sh"
        warn_if_fish(executable)

    # signal.signal() may only be called from the main thread.
    main_thread = isinstance(
        threading.current_thread(),
        threading._MainThread,  # pylint: disable=protected-access
    )

    for _cmd in cmd:
        logger.info("$ %s", _cmd)
        old_handler = None
        p = None
        killed = threading.Event()

        try:
            p = subprocess.Popen(_make_cmd(executable, _cmd), **kwargs)
            # SIGINT is ignored only after the child has been spawned.
            if main_thread:
                old_handler = signal.signal(signal.SIGINT, signal.SIG_IGN)

            with checkpoint_monitor(stage, checkpoint_func, p, killed):
                p.communicate()
        finally:
            # NOTE: compare against None instead of relying on truthiness:
            # signal.SIG_DFL is falsy (its value is 0), so a plain
            # `if old_handler:` would leave SIGINT ignored forever when the
            # previous disposition was the default one.
            if old_handler is not None:
                signal.signal(signal.SIGINT, old_handler)

        retcode = None if p is None else p.returncode
        if retcode != 0:
            if killed.is_set():
                raise CheckpointKilledError(_cmd, retcode)
            raise StageCmdFailedError(_cmd, retcode)
def _run(self):
    """Execute `self.cmd` in a shell and wait for it to finish.

    Raises:
        StageCmdFailedError: if the command exits with a non-zero code.
    """
    self._check_missing_deps()

    # SHELL is only consulted off Windows.
    shell_path = None if os.name == 'nt' else os.getenv('SHELL')
    self._check_if_fish(shell_path)

    proc = subprocess.Popen(self.cmd,
                            cwd=self.cwd,
                            shell=True,
                            env=fix_env(os.environ),
                            executable=shell_path)
    proc.communicate()

    if proc.returncode == 0:
        return

    raise StageCmdFailedError(self)
def _spawn_windows(cmd):
    """Start `cmd` with the detached / new-process-group creation flags.

    The call waits on the spawned process via `communicate()`.

    Args:
        cmd (list): command passed to `Popen` with `shell=False`.
    """
    from subprocess import STARTUPINFO, STARTF_USESHOWWINDOW

    startupinfo = STARTUPINFO()
    startupinfo.dwFlags |= STARTF_USESHOWWINDOW

    popen_kwargs = {
        "env": fix_env(),
        "close_fds": True,
        "shell": False,
        "creationflags": CREATE_NEW_PROCESS_GROUP | DETACHED_PROCESS,
        "startupinfo": startupinfo,
    }

    proc = Popen(cmd, **popen_kwargs)
    proc.communicate()
def __init__(self, root_dir=os.curdir):
    """Open the git repository located at `root_dir`.

    Raises:
        SCMError: if `git.Repo` reports that `root_dir` is not a git
            repository.
    """
    super(Git, self).__init__(root_dir)

    import git
    from git.exc import InvalidGitRepositoryError

    try:
        self.repo = git.Repo(root_dir)
    except InvalidGitRepositoryError:
        message = '{} is not a git repository'.format(root_dir)
        raise SCMError(message)

    # NOTE: fixing LD_LIBRARY_PATH for binary built by PyInstaller.
    # http://pyinstaller.readthedocs.io/en/stable/runtime-information.html
    fixed_env = fix_env(None)
    self.repo.git.update_environment(
        LD_LIBRARY_PATH=fixed_env.get('LD_LIBRARY_PATH'))
def _run(self):
    """Run the stage command and fail if it does not succeed.

    On Windows the command is run with `shell=True`. Elsewhere it is run
    as `[$SHELL, <no-config flags>, "-c", self.cmd]`, falling back to
    `/bin/sh` when SHELL is unset or empty. While waiting for the child,
    SIGINT is ignored in this process (main thread only) and the previous
    handler is restored afterwards.

    Raises:
        StageCmdFailedError: if the command exits with a non-zero code.
    """
    self._check_missing_deps()

    kwargs = {"cwd": self.wdir, "env": fix_env(None), "close_fds": True}

    if os.name == "nt":
        kwargs["shell"] = True
        cmd = self.cmd
    else:
        # NOTE: when you specify `shell=True`, `Popen` [1] will default to
        # `/bin/sh` on *nix and will add ["/bin/sh", "-c"] to your command.
        # But we actually want to run the same shell that we are running
        # from right now, which is usually determined by the `SHELL` env
        # var. So instead, we compose our command on our own, making sure
        # to include special flags to prevent shell from reading any
        # configs and modifying env, which may change the behavior or the
        # command we are running. See [2] for more info.
        #
        # [1] https://github.com/python/cpython/blob/3.7/Lib/subprocess.py
        #     #L1426
        # [2] https://github.com/iterative/dvc/issues/2506
        #     #issuecomment-535396799
        kwargs["shell"] = False
        executable = os.getenv("SHELL") or "/bin/sh"

        self._warn_if_fish(executable)

        opts = {"zsh": ["--no-rcs"], "bash": ["--noprofile", "--norc"]}
        name = os.path.basename(executable).lower()
        cmd = [executable] + opts.get(name, []) + ["-c", self.cmd]

    # signal.signal() may only be called from the main thread.
    main_thread = isinstance(
        threading.current_thread(), threading._MainThread
    )
    old_handler = None
    p = None

    try:
        p = subprocess.Popen(cmd, **kwargs)
        # SIGINT is ignored only after the child has been spawned.
        if main_thread:
            old_handler = signal.signal(signal.SIGINT, signal.SIG_IGN)
        p.communicate()
    finally:
        # NOTE: compare against None instead of relying on truthiness:
        # signal.SIG_DFL is falsy (its value is 0), so a plain
        # `if old_handler:` would leave SIGINT ignored forever when the
        # previous disposition was the default one.
        if old_handler is not None:
            signal.signal(signal.SIGINT, old_handler)

    if (p is None) or (p.returncode != 0):
        raise StageCmdFailedError(self)
def hadoop_fs(self, cmd, user=None):
    """Run `hadoop fs -<cmd>` in a shell and return its decoded stdout.

    Args:
        cmd (str): sub-command and arguments appended to `hadoop fs -`.
        user (str): when truthy, the command is prefixed with
            `HADOOP_USER_NAME=<user> `.

    Returns:
        str: standard output of the command decoded as UTF-8.

    Raises:
        DvcException: if the command exits with a non-zero code.
    """
    cmd = 'hadoop fs -' + cmd
    if user:
        cmd = 'HADOOP_USER_NAME={} '.format(user) + cmd

    # NOTE: on Windows close_fds cannot be combined with redirected
    # stdin/stdout/stderr (Popen raises ValueError on Python < 3.7), and
    # SHELL is not used as the executable there.
    on_windows = os.name == 'nt'
    close_fds = not on_windows
    executable = None if on_windows else os.getenv('SHELL')

    p = Popen(cmd,
              shell=True,
              close_fds=close_fds,
              executable=executable,
              env=fix_env(os.environ),
              stdin=PIPE,
              stdout=PIPE,
              stderr=PIPE)
    out, err = p.communicate()
    if p.returncode != 0:
        raise DvcException('HDFS command failed: {}: {}'.format(cmd, err))
    return out.decode('utf-8')
def _checksum(self, path_info, **kwargs):
    """Return the checksum reported for `path_info.url`, or None.

    None is returned when the command yields no result or when its output
    does not match `CHECKSUM_REGEX`. `kwargs` is accepted but not used.
    """
    # PyArrow doesn't natively support retrieving the
    # checksum, so we have to use hadoop fs
    output = self._run_command(
        f"checksum {path_info.url}",
        env=fix_env(os.environ),
        user=path_info.user,
    )

    found = None if output is None else CHECKSUM_REGEX.match(output)
    if found is None:
        return None

    return found.group("checksum")
def daemon(args):
    """Launch a `dvc daemon` command in a detached process.

    Nothing is launched when the environment variable named by
    `DVC_DAEMON` is already set to a non-empty value.

    Args:
        args (list): list of arguments to append to `dvc daemon` command.
    """
    already_in_daemon = os.environ.get(DVC_DAEMON)
    if already_in_daemon:
        logger.debug("skipping launching a new daemon.")
        return

    cmd = ["daemon", "-q"] + args

    env = fix_env()
    # PYTHONPATH is set to the directory two levels above this source file.
    this_file = os.path.abspath(inspect.stack()[0][1])
    env["PYTHONPATH"] = os.path.dirname(os.path.dirname(this_file))
    # Mark the child environment so the check above triggers there.
    env[DVC_DAEMON] = "1"

    _spawn(cmd, env)
def _checksum(self, path, **kwargs):
    """Return the checksum reported for `path`, or None.

    None is returned when the command yields no result or when its output
    does not match `CHECKSUM_REGEX`. `kwargs` is accepted but not used.
    """
    # PyArrow doesn't natively support retrieving the
    # checksum, so we have to use hadoop fs
    url = self.unstrip_protocol(path)
    output = self._run_command(
        f"checksum {url}",
        env=fix_env(os.environ),
        user=self.fs_args["user"],
    )

    found = None if output is None else CHECKSUM_REGEX.match(output)
    if found is None:
        return None

    return found.group("checksum")
def clone(url, to_path, rev=None):
    """Clone `url` into `to_path` and optionally check out `rev`.

    Args:
        url: repository location handed to `git.Repo.clone_from`.
        to_path: destination directory of the clone.
        rev: revision to check out after cloning; skipped when falsy.

    Returns:
        Git: wrapper opened on `to_path`.

    Raises:
        CloneError: if the clone command fails.
        RevError: if checking out `rev` fails.
    """
    import git

    ld_key = "LD_LIBRARY_PATH"

    env = fix_env(None)
    if is_binary():
        # In fix_env, we delete LD_LIBRARY_PATH key if it was empty before
        # PyInstaller modified it. GitPython, in git.Repo.clone_from, uses
        # env to update its own internal state. When there is no key in
        # env, this value is not updated and GitPython re-uses
        # LD_LIBRARY_PATH that has been set by PyInstaller.
        # See [1] for more info.
        # [1] https://github.com/gitpython-developers/GitPython/issues/924
        env.setdefault(ld_key, "")

    try:
        with TqdmGit(desc="Cloning", unit="obj") as pbar:
            tmp_repo = git.Repo.clone_from(
                url,
                to_path,
                env=env,  # needed before we can fix it in __init__
                no_single_branch=True,
                progress=pbar.update_git,
            )
        tmp_repo.close()
    except git.exc.GitCommandError as exc:  # pylint: disable=no-member
        raise CloneError(url, to_path) from exc

    # NOTE: using our wrapper to make sure that env is fixed in __init__
    repo = Git(to_path)

    if not rev:
        return repo

    try:
        repo.checkout(rev)
    except git.exc.GitCommandError as exc:  # pylint: disable=no-member
        raise RevError(
            "failed to access revision '{}' for repo '{}'".format(rev, url)
        ) from exc

    return repo
def __init__(  # pylint:disable=W0231
    self, root_dir=os.curdir, search_parent_directories=True
):
    """Open the git repository at `root_dir`.

    The parent constructor is intentionally not called here (see the
    pylint W0231 suppression on the signature).

    Args:
        root_dir: path handed to `git.Repo`.
        search_parent_directories: forwarded to `git.Repo` unchanged.

    Raises:
        SCMError: if `git.Repo` reports that `root_dir` is not a git
            repository.
    """
    import git
    from git.exc import InvalidGitRepositoryError

    try:
        self.repo = git.Repo(
            root_dir, search_parent_directories=search_parent_directories
        )
    except InvalidGitRepositoryError:
        raise SCMError("{} is not a git repository".format(root_dir))

    # NOTE: fixing LD_LIBRARY_PATH for binary built by PyInstaller.
    # http://pyinstaller.readthedocs.io/en/stable/runtime-information.html
    fixed_env = fix_env(None)
    self.repo.git.update_environment(
        LD_LIBRARY_PATH=fixed_env.get("LD_LIBRARY_PATH")
    )
def shell_command(self, cmd, user=None):
    """Run `cmd` in a shell and return its stdout decoded as UTF-8.

    Args:
        cmd (str): command line to execute.
        user: accepted but not used by this implementation.

    Raises:
        RemoteCmdError: if the command exits with a non-zero code.
    """
    on_windows = os.name == "nt"

    # NOTE: close_fds doesn't work with redirected stdin/stdout/stderr.
    # See https://github.com/iterative/dvc/issues/1197.
    proc = Popen(
        cmd,
        shell=True,
        close_fds=not on_windows,
        executable=None if on_windows else os.getenv("SHELL"),
        env=fix_env(os.environ),
        stdin=PIPE,
        stdout=PIPE,
        stderr=PIPE,
    )
    stdout, stderr = proc.communicate()

    if proc.returncode == 0:
        return stdout.decode("utf-8")

    raise RemoteCmdError(self.scheme, cmd, proc.returncode, stderr)
def __init__(self, root_dir=os.curdir, repo=None):
    """Open the git repository located at `root_dir`.

    Args:
        root_dir: path handed to the parent constructor and to `git.Repo`.
        repo: forwarded to the parent constructor unchanged.

    Raises:
        SCMError: if `git.Repo` reports that `root_dir` is not a git
            repository.
    """
    super(Git, self).__init__(root_dir, repo=repo)

    import git
    from git.exc import InvalidGitRepositoryError

    try:
        self.git = git.Repo(root_dir)
    except InvalidGitRepositoryError:
        raise SCMError("{} is not a git repository".format(root_dir))

    # NOTE: fixing LD_LIBRARY_PATH for binary built by PyInstaller.
    # http://pyinstaller.readthedocs.io/en/stable/runtime-information.html
    fixed_env = fix_env(None)
    self.git.git.update_environment(
        LD_LIBRARY_PATH=fixed_env.get("LD_LIBRARY_PATH")
    )

    self.ignored_paths = []
    self.files_to_track = []
def prepare_kwargs(stage, checkpoint_func=None):
    """Build the `Popen` keyword arguments used to run `stage`.

    Returns:
        dict: `cwd`, `env` (the fixed environment updated with
        `stage.env(...)`), `close_fds=True` and `shell`, which is True
        only on Windows.
    """
    wdir = stage.wdir
    env = fix_env(None)
    env.update(stage.env(checkpoint_func=checkpoint_func))

    # NOTE: when you specify `shell=True`, `Popen` [1] will default to
    # `/bin/sh` on *nix and will add ["/bin/sh", "-c"] to your command.
    # But we actually want to run the same shell that we are running
    # from right now, which is usually determined by the `SHELL` env
    # var. So instead, we compose our command on our own, making sure
    # to include special flags to prevent shell from reading any
    # configs and modifying env, which may change the behavior or the
    # command we are running. See [2] for more info.
    #
    # [1] https://github.com/python/cpython/blob/3.7/Lib/subprocess.py
    #     #L1426
    # [2] https://github.com/iterative/dvc/issues/2506
    #     #issuecomment-535396799
    use_shell = os.name == "nt"

    return {
        "cwd": wdir,
        "env": env,
        "close_fds": True,
        "shell": use_shell,
    }
def cmd_run(stage, *args, **kwargs):
    """Run the command of `stage` and fail if it does not succeed.

    On Windows the command is run with `shell=True`. Elsewhere the
    argument list is built by `_nix_cmd` from `$SHELL` (falling back to
    `/bin/sh`) and run with `shell=False`. While waiting for the child,
    SIGINT is ignored in this process (main thread only) and the previous
    handler is restored afterwards.

    Args:
        stage: object providing `wdir` and `cmd`.
        *args: accepted but not used.
        **kwargs: accepted but ignored; the `Popen` keyword arguments are
            rebuilt from scratch below.

    Raises:
        StageCmdFailedError: if the command exits with a non-zero code.
    """
    kwargs = {"cwd": stage.wdir, "env": fix_env(None), "close_fds": True}

    if os.name == "nt":
        kwargs["shell"] = True
        cmd = stage.cmd
    else:
        # NOTE: when you specify `shell=True`, `Popen` [1] will default to
        # `/bin/sh` on *nix and will add ["/bin/sh", "-c"] to your command.
        # But we actually want to run the same shell that we are running
        # from right now, which is usually determined by the `SHELL` env
        # var. So instead, we compose our command on our own, making sure
        # to include special flags to prevent shell from reading any
        # configs and modifying env, which may change the behavior or the
        # command we are running. See [2] for more info.
        #
        # [1] https://github.com/python/cpython/blob/3.7/Lib/subprocess.py
        #     #L1426
        # [2] https://github.com/iterative/dvc/issues/2506
        #     #issuecomment-535396799
        kwargs["shell"] = False
        executable = os.getenv("SHELL") or "/bin/sh"
        warn_if_fish(executable)
        cmd = _nix_cmd(executable, stage.cmd)

    # signal.signal() may only be called from the main thread.
    main_thread = isinstance(threading.current_thread(), threading._MainThread)
    old_handler = None
    p = None

    try:
        p = subprocess.Popen(cmd, **kwargs)
        # SIGINT is ignored only after the child has been spawned.
        if main_thread:
            old_handler = signal.signal(signal.SIGINT, signal.SIG_IGN)
        p.communicate()
    finally:
        # NOTE: compare against None instead of relying on truthiness:
        # signal.SIG_DFL is falsy (its value is 0), so a plain
        # `if old_handler:` would leave SIGINT ignored forever when the
        # previous disposition was the default one.
        if old_handler is not None:
            signal.signal(signal.SIGINT, old_handler)

    retcode = None if p is None else p.returncode
    if retcode != 0:
        raise StageCmdFailedError(stage.cmd, retcode)
def hadoop_fs(self, cmd, user=None):
    """Run `hadoop fs -<cmd>` in a shell and return its decoded stdout.

    Args:
        cmd (str): sub-command and arguments appended to `hadoop fs -`.
        user (str): when truthy, the command is prefixed with
            `HADOOP_USER_NAME=<user> `.

    Returns:
        str: standard output of the command decoded as UTF-8.

    Raises:
        DvcException: if the command exits with a non-zero code.
    """
    cmd = 'hadoop fs -' + cmd
    if user:
        cmd = 'HADOOP_USER_NAME={} '.format(user) + cmd

    on_windows = os.name == 'nt'

    # NOTE: close_fds doesn't work with redirected stdin/stdout/stderr.
    # See https://github.com/iterative/dvc/issues/1197.
    close_fds = not on_windows

    # SHELL is only used as the executable off Windows; on Windows Popen
    # picks its default shell.
    executable = None if on_windows else os.getenv('SHELL')

    p = Popen(cmd,
              shell=True,
              close_fds=close_fds,
              executable=executable,
              env=fix_env(os.environ),
              stdin=PIPE,
              stdout=PIPE,
              stderr=PIPE)
    out, err = p.communicate()
    if p.returncode != 0:
        raise DvcException('HDFS command failed: {}: {}'.format(cmd, err))
    return out.decode('utf-8')
def daemon(args):
    """Launch a `dvc daemon` command in a detached process.

    Nothing is launched when the environment variable named by
    `DVC_DAEMON` is already set to a non-empty value.

    Args:
        args (list): list of arguments to append to `dvc daemon` command.
    """
    already_in_daemon = os.environ.get(DVC_DAEMON)
    if already_in_daemon:
        logger.debug("skipping launching a new daemon.")
        return

    # When `is_binary()` is false the current script path is passed to the
    # interpreter as well.
    launcher = [sys.executable]
    if not is_binary():
        launcher += [sys.argv[0]]
    cmd = launcher + ["daemon", "-q"] + args

    env = fix_env()
    # PYTHONPATH is set to the directory two levels above this source file.
    this_file = os.path.abspath(inspect.stack()[0][1])
    parent_dir = os.path.dirname(os.path.dirname(this_file))
    env[cast_bytes_py2("PYTHONPATH")] = cast_bytes_py2(parent_dir)
    # Mark the child environment so the check above triggers there.
    env[cast_bytes_py2(DVC_DAEMON)] = cast_bytes_py2("1")

    _spawn(cmd, env)
def clone(url, to_path, rev=None):
    """Clone `url` into `to_path` and optionally check out `rev`.

    Args:
        url: repository location handed to `git.Repo.clone_from`.
        to_path: destination directory of the clone.
        rev: revision to check out after cloning; skipped when falsy.

    Returns:
        Git: wrapper opened on `to_path`.

    Raises:
        CloneError: if the clone command fails.
        RevError: if checking out `rev` fails.
    """
    import git

    ld_key = "LD_LIBRARY_PATH"

    env = fix_env(None)
    needs_empty_ld_path = is_binary() and ld_key not in env.keys()
    if needs_empty_ld_path:
        # In fix_env, we delete LD_LIBRARY_PATH key if it was empty before
        # PyInstaller modified it. GitPython, in git.Repo.clone_from, uses
        # env to update its own internal state. When there is no key in
        # env, this value is not updated and GitPython re-uses
        # LD_LIBRARY_PATH that has been set by PyInstaller.
        # See [1] for more info.
        # [1] https://github.com/gitpython-developers/GitPython/issues/924
        env[cast_bytes_py2(ld_key)] = ""

    try:
        tmp_repo = git.Repo.clone_from(
            url,
            to_path,
            env=env,  # needed before we can fix it in __init__
            no_single_branch=True,
        )
        tmp_repo.close()
    except git.exc.GitCommandError as exc:
        raise CloneError(url, to_path, exc)

    # NOTE: using our wrapper to make sure that env is fixed in __init__
    repo = Git(to_path)

    if not rev:
        return repo

    try:
        repo.checkout(rev)
    except git.exc.GitCommandError as exc:
        raise RevError(url, rev, exc)

    return repo
def _hadoop_fs(cmd, user=None):
    """Run `hadoop fs -<cmd>` in a shell and return its decoded stdout.

    Args:
        cmd (str): sub-command and arguments appended to `hadoop fs -`.
        user (str): when truthy, the command is prefixed with
            `HADOOP_USER_NAME=<user> `.

    Returns:
        str: standard output of the command decoded as UTF-8.

    Raises:
        RemoteCmdError: if the command exits with a non-zero code.
    """
    full_cmd = "hadoop fs -" + cmd
    if user:
        full_cmd = f"HADOOP_USER_NAME={user} " + full_cmd

    on_windows = os.name == "nt"

    # NOTE: close_fds doesn't work with redirected stdin/stdout/stderr.
    # See https://github.com/iterative/dvc/issues/1197.
    proc = subprocess.Popen(
        full_cmd,
        shell=True,
        close_fds=not on_windows,
        executable=None if on_windows else os.getenv("SHELL"),
        env=fix_env(os.environ),
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    stdout, stderr = proc.communicate()

    if proc.returncode == 0:
        return stdout.decode("utf-8")

    raise RemoteCmdError("hdfs", full_cmd, proc.returncode, stderr)
def clone(url, to_path, rev=None):
    """Clone `url` into `to_path` and optionally check out `rev`.

    Args:
        url: repository location handed to `git.Repo.clone_from`.
        to_path: destination directory of the clone.
        rev: revision to check out after cloning; skipped when falsy.

    Returns:
        Git: wrapper opened on `to_path`.

    Raises:
        CloneError: if the clone command fails.
        RevError: if checking out `rev` fails.
    """
    import git

    clone_options = {
        "env": fix_env(None),  # needed before we can fix it in __init__
        "no_single_branch": True,
    }

    try:
        tmp_repo = git.Repo.clone_from(url, to_path, **clone_options)
        tmp_repo.close()
    except git.exc.GitCommandError as exc:
        raise CloneError(url, to_path, exc)

    # NOTE: using our wrapper to make sure that env is fixed in __init__
    repo = Git(to_path)

    if not rev:
        return repo

    try:
        repo.checkout(rev)
    except git.exc.GitCommandError as exc:
        raise RevError(url, rev, exc)

    return repo
def test_fix_env_pyenv(path, orig):
    """Check that `fix_env` maps PATH `path` to `orig` with PYENV_ROOT set.

    `path` and `orig` are supplied by the caller (presumably a pytest
    parametrization -- not visible here).
    """
    env = {"PATH": path}
    env["PYENV_ROOT"] = "/pyenv"

    fixed = fix_env(env)

    assert fixed["PATH"] == orig