def _init_headless(run, cloud=True):
    """Start the background W&B process for ``run`` and hook up output capture.

    The function exports the program name into the environment, opens a
    ``wandb_socket.Server`` for the child to report back on, spawns
    ``internal_cli.py`` with a JSON description of the session and waits for
    the child to respond. Once the child is up, stdout/stderr redirectors are
    created, shutdown is registered with ``atexit``, the module-level ``join``
    is pointed at a closure that finishes the run, and finally the
    redirectors are activated.

    Args:
        run: The run object; ``set_environment``, ``socket``, ``dir`` and
            ``close_files`` are used here.
        cloud: Forwarded to the child process in the ``'cloud'`` field of
            its JSON arguments.

    Raises:
        LaunchError: If the child did not report success, or the wait was
            interrupted with Ctrl-C. The child is killed before raising.
    """
    global join
    global _user_process_finished_called

    detected_program = util.get_program()
    if detected_program:
        # A value already present in the environment wins over detection.
        os.environ[env.PROGRAM] = os.getenv(env.PROGRAM) or detected_program
    child_env = dict(os.environ)
    run.set_environment(child_env)

    sock_server = wandb_socket.Server()
    run.socket = sock_server
    exit_hooks = ExitHooks()
    exit_hooks.hook()

    on_windows = platform.system() == "Windows"
    if not on_windows:
        out_master, out_slave = io_wrap.wandb_pty(resize=False)
        err_master, err_slave = io_wrap.wandb_pty(resize=False)
    else:
        try:
            import win32api
            # Make sure CTRL_C_EVENT is not being ignored.
            # https://docs.microsoft.com/en-us/windows/console/setconsolectrlhandler
            # https://stackoverflow.com/questions/1364173/stopping-python-using-ctrlc
            win32api.SetConsoleCtrlHandler(None, False)
        except ImportError:
            termerror(
                "Install the win32api library with `pip install pypiwin32`")
        # PTYs don't work on Windows, so these pipes are created but left
        # unused and stdout is mirrored to run.dir/output.log instead. There
        # should be a way to make pipes work; see links in compat/windows.
        out_master, out_slave = os.pipe()
        err_master, err_slave = os.pipe()

    cli_script = os.path.join(os.path.dirname(__file__), 'internal_cli.py')
    cli_args = {
        'command': 'headless',
        'pid': os.getpid(),
        'stdout_master_fd': out_master,
        'stderr_master_fd': err_master,
        'cloud': cloud,
        'port': sock_server.port,
    }

    if not six.PY2 and not on_windows:
        spawn_opts = {'pass_fds': [out_master, err_master]}
    else:
        # TODO(adrian): close_fds=False is bad for security. It is set so the
        # PTY FDs can be passed to the wandb process. We should use
        # subprocess32, which has pass_fds.
        spawn_opts = {'close_fds': False}

    # TODO(adrian): ensure we use *exactly* the same python interpreter
    # TODO(adrian): make wandb the foreground process so we don't give
    # up terminal control until syncing is finished.
    # https://stackoverflow.com/questions/30476971/is-the-child-process-in-foreground-or-background-on-fork-in-c
    child_cmd = [sys.executable, cli_script, json.dumps(cli_args)]
    child = subprocess.Popen(child_cmd, env=child_env, **spawn_opts)
    termlog('Tracking run with wandb version {}'.format(__version__))

    # The child owns the master ends now; drop this process's copies.
    os.close(out_master)
    os.close(err_master)

    # Wait on the socket for the wandb process to report that it is ready.
    # NOTE(review): 30 is presumably a timeout in seconds -- confirm against
    # wandb_socket.Server.listen.
    try:
        ready, _ignored = sock_server.listen(30)
    except KeyboardInterrupt:
        ready = False
    else:
        if not ready:
            termerror('W&B process (PID {}) did not respond'.format(
                child.pid))

    if not ready:
        child.kill()
        # Give the child up to 20 polls, 0.1s apart, to actually exit.
        polls_left = 20
        while polls_left:
            polls_left -= 1
            time.sleep(0.1)
            if child.poll() is not None:
                break
        if child.poll() is None:
            termerror('Failed to kill wandb process, PID {}'.format(
                child.pid))
        # TODO attempt to upload a debug log
        log_path = GLOBAL_LOG_FNAME.replace(os.getcwd() + os.sep, "")
        raise LaunchError(
            "W&B process failed to launch, see: {}".format(log_path))

    def _log_file_redirectors():
        # Build redirectors that mirror stdout/stderr into run.dir/output.log.
        log_file = open(os.path.join(run.dir, "output.log"), "wb")
        return (io_wrap.WindowsRedirector(sys.stdout, log_file),
                io_wrap.WindowsRedirector(sys.stderr, log_file))

    if on_windows:
        out_redirect, err_redirect = _log_file_redirectors()
    else:
        out_slave_file = os.fdopen(out_slave, 'wb')
        err_slave_file = os.fdopen(err_slave, 'wb')
        try:
            out_redirect = io_wrap.FileRedirector(sys.stdout, out_slave_file)
            err_redirect = io_wrap.FileRedirector(sys.stderr, err_slave_file)
        except (ValueError, AttributeError):
            # stdout / stderr aren't real files; fall back to the log file.
            out_redirect, err_redirect = _log_file_redirectors()

    # TODO(adrian): we should register this right after starting the wandb
    # process to make sure we shut down the W&B process eg. if there's an
    # exception in the code above
    atexit.register(_user_process_finished, sock_server, exit_hooks, child,
                    out_redirect, err_redirect)

    def _wandb_join(exit_code=None):
        global _global_run_stack
        shutdown_async_log_thread()
        run.close_files()
        if exit_code is not None:
            exit_hooks.exit_code = exit_code
        _user_process_finished(sock_server, exit_hooks, child,
                               out_redirect, err_redirect)
        if len(_global_run_stack) > 0:
            _global_run_stack.pop()

    join = _wandb_join
    _user_process_finished_called = False

    # Redirect output last of all so we don't miss out on error messages.
    out_redirect.redirect()
    if not env.is_debug():
        err_redirect.redirect()
def setup(self):
    """Populate ``self.data`` with metadata about the current process.

    Records the working directory and program name (consulting
    ``env.NOTEBOOK_NAME`` or ``wandb.jupyter.notebook_metadata()`` when no
    program is found and ``wandb._get_python_type()`` is not ``"python"``).
    Unless ``env.DISABLE_CODE`` is set, it runs ``_setup_code_git`` and
    ``_setup_code_program``, bounded by a 25 second ``SIGALRM`` where alarms
    are usable. It then adds start time, host/user/executable (omitted for
    anonymous users, for whom ``email`` and ``root`` are removed), OS, Python
    version, the ``env.get_docker()`` value, GPU name and count, CPU count,
    CUDA version, command-line arguments and the ``"running"`` state.
    """
    class TimeOutException(Exception):
        pass

    def alarm_handler(signum, frame):
        raise TimeOutException()

    self.data["root"] = os.getcwd()
    program = os.getenv(env.PROGRAM) or util.get_program()
    if program:
        self.data["program"] = program
    else:
        self.data["program"] = '<python with no main file>'
        if wandb._get_python_type() != "python":
            if os.getenv(env.NOTEBOOK_NAME):
                self.data["program"] = os.getenv(env.NOTEBOOK_NAME)
            else:
                meta = wandb.jupyter.notebook_metadata()
                if meta.get("path"):
                    if "fileId=" in meta["path"]:
                        self.data["colab"] = (
                            "https://colab.research.google.com/drive/"
                            + meta["path"].split("fileId=")[1])
                        self.data["program"] = meta["name"]
                    else:
                        self.data["program"] = meta["path"]
                        self.data["root"] = meta["root"]

    if not os.getenv(env.DISABLE_CODE):
        logger.debug("code probe starting")
        in_jupyter = wandb._get_python_type() != "python"
        # Windows doesn't support alarm() and jupyter could call this in a
        # thread context, so those cases probe without a time limit.
        if platform.system() == "Windows" or not hasattr(
                signal, 'SIGALRM') or in_jupyter:
            logger.debug("non time limited probe of code")
            self._setup_code_git()
            self._setup_code_program()
        else:
            old_alarm = None
            try:
                try:
                    old_alarm = signal.signal(signal.SIGALRM, alarm_handler)
                    signal.alarm(25)
                    self._setup_code_git()
                    self._setup_code_program()
                finally:
                    # Always cancel the pending alarm, even on error.
                    signal.alarm(0)
            except TimeOutException:
                logger.debug("timeout waiting for setup_code")
            finally:
                # signal.SIG_DFL is falsy, so a truthiness test would skip
                # restoring the default handler and leave alarm_handler
                # installed. Only None (nothing saved, or a handler that was
                # not installed from Python) cannot be restored.
                if old_alarm is not None:
                    signal.signal(signal.SIGALRM, old_alarm)
        logger.debug("code probe done")

    self.data["startedAt"] = datetime.utcfromtimestamp(
        wandb.START_TIME).isoformat()
    try:
        username = getpass.getuser()
    except (KeyError, OSError):
        # getuser() can fail in restricted environments like chroot jails or
        # docker containers: KeyError on older Pythons, OSError on 3.13+.
        # Fall back to the numeric user id in these cases.
        username = str(os.getuid())

    # Host names, usernames, emails, the root directory, and executable
    # paths are sensitive for anonymous users.
    if self._api.settings().get('anonymous') != 'true':
        self.data["host"] = os.environ.get(env.HOST, socket.gethostname())
        self.data["username"] = os.getenv(env.USERNAME, username)
        self.data["executable"] = sys.executable
    else:
        self.data.pop("email", None)
        self.data.pop("root", None)

    self.data["os"] = platform.platform(aliased=True)
    self.data["python"] = platform.python_version()

    docker = env.get_docker()
    if docker:
        self.data["docker"] = docker

    try:
        pynvml.nvmlInit()
        gpu_name = pynvml.nvmlDeviceGetName(
            pynvml.nvmlDeviceGetHandleByIndex(0))
        # Some NVML bindings return bytes and others str; only decode bytes.
        if isinstance(gpu_name, bytes):
            gpu_name = gpu_name.decode("utf8")
        self.data["gpu"] = gpu_name
        self.data["gpu_count"] = pynvml.nvmlDeviceGetCount()
    except pynvml.NVMLError:
        # Best effort: no usable NVIDIA driver/device, so skip GPU info.
        pass

    try:
        self.data["cpu_count"] = multiprocessing.cpu_count()
    except NotImplementedError:
        pass

    # TODO: we should use the cuda library to collect this
    if os.path.exists("/usr/local/cuda/version.txt"):
        with open("/usr/local/cuda/version.txt") as f:
            # The last space-separated token of the file is taken as the version.
            self.data["cuda"] = f.read().split(" ")[-1].strip()

    self.data["args"] = sys.argv[1:]
    self.data["state"] = "running"