def jupyter_login(force=True, api=None):
    """Attempt to login from a jupyter environment

    If force=False, we'll only attempt to auto-login, otherwise we'll prompt the user

    Args:
        force (bool, optional): prompt the user for a key when auto-login fails
        api (optional): an API client; defaults to the global run's client when available

    Returns:
        The result of util.prompt_api_key for the resolved key.

    Raises:
        LaunchError: if no api client can be resolved, or when running on
            Databricks (which requires manual key configuration).
    """
    def get_api_key_from_browser():
        # Returns (key, anonymous): the API key (or None) and whether the key
        # was created via the anonymous-usage flow.
        key, anonymous = None, False
        if 'google.colab' in sys.modules:
            key = jupyter.attempt_colab_login(api.app_url)
        elif 'databricks_cli' in sys.modules and 'dbutils' in sys.modules:
            # Databricks does not seem to support getpass() so we need to fail
            # early and prompt the user to configure the key manually for now.
            termerror(
                "Databricks requires api_key to be configured manually, instructions at: http://docs.wandb.com/integrations/databricks"
            )
            raise LaunchError(
                "Databricks integration requires api_key to be configured.")
        if not key and os.environ.get(env.ALLOW_ANONYMOUS) == "true":
            key = api.create_anonymous_api_key()
            anonymous = True
        if not key and force:
            try:
                termerror(
                    "Not authenticated. Copy a key from https://app.wandb.ai/authorize"
                )
                key = getpass.getpass("API Key: ").strip()
            except NotImplementedError:
                # Some notebook frontends can't take stdin; their getpass
                # raises NotImplementedError. Degrade gracefully instead of
                # crashing the login attempt.
                termerror(
                    "Can't accept input in this environment, you should set WANDB_API_KEY or call wandb.login(key='YOUR_API_KEY')"
                )
        return key, anonymous

    api = api or (run.api if run else None)
    if not api:
        raise LaunchError("Internal error: api required for jupyter login")
    return util.prompt_api_key(api, browser_callback=get_api_key_from_browser)
def get_api_key_from_browser(signup=False):
    # Resolve an API key interactively, returning (key, anonymous).
    #
    # NOTE(review): `api`, `force`, `env`, and the term helpers are free
    # variables here — this function is written to be nested inside a login
    # routine that provides them; confirm the enclosing scope before reuse.
    # `signup` is accepted but unused in this body.
    key, anonymous = None, False
    if 'google.colab' in sys.modules:
        # Colab exposes a browser-based auth flow.
        key = jupyter.attempt_colab_login(api.app_url)
    elif 'databricks_cli' in sys.modules and 'dbutils' in sys.modules:
        # Databricks does not seem to support getpass() so we need to fail
        # early and prompt the user to configure the key manually for now.
        termerror(
            "Databricks requires api_key to be configured manually, instructions at: http://docs.wandb.com/integrations/databricks"
        )
        raise LaunchError(
            "Databricks integration requires api_key to be configured.")
    # For jupyter we default to not allowing anonymous
    if not key and os.environ.get(env.ANONYMOUS, "never") != "never":
        key = api.create_anonymous_api_key()
        anonymous = True
    if not key and force:
        try:
            termerror(
                "Not authenticated. Copy a key from https://app.wandb.ai/authorize"
            )
            key = getpass.getpass("API Key: ").strip()
        except NotImplementedError:
            # Frontends without stdin support raise NotImplementedError from
            # getpass; report instead of crashing.
            termerror(
                "Can't accept input in this environment, you should set WANDB_API_KEY or call wandb.login(key='YOUR_API_KEY')"
            )
    return key, anonymous
def get_api_key_from_browser():
    """Resolve an API key for the current notebook environment.

    Returns a ``(key, anonymous)`` pair; ``anonymous`` is True when the key
    was minted through the anonymous-usage flow.
    """
    anonymous = False
    key = None

    in_colab = 'google.colab' in sys.modules
    in_databricks = 'databricks_cli' in sys.modules and 'dbutils' in sys.modules

    if in_colab:
        key = jupyter.attempt_colab_login(api.app_url)
    elif in_databricks:
        # Databricks does not seem to support getpass() so we need to fail
        # early and prompt the user to configure the key manually for now.
        termerror(
            "Databricks requires api_key to be configured manually, instructions at: http://docs.wandb.com/integrations/databricks"
        )
        raise LaunchError(
            "Databricks integration requires api_key to be configured.")

    if not key:
        # Fall back to an anonymous key when explicitly allowed.
        if os.environ.get(env.ALLOW_ANONYMOUS) == "true":
            key = api.create_anonymous_api_key()
            anonymous = True

    if not key and force:
        termerror(
            "Not authenticated. Copy a key from https://app.wandb.ai/authorize"
        )
        key = getpass.getpass("API Key: ").strip()

    return key, anonymous
def init(job_type=None, dir=None, config=None, project=None, entity=None, reinit=None, tags=None,
         group=None, allow_val_change=False, resume=False, force=False, tensorboard=False,
         sync_tensorboard=False, name=None, notes=None, id=None, magic=None):
    """Initialize W&B

    If called from within Jupyter, initializes a new run and waits for a call to
    `wandb.log` to begin pushing metrics.  Otherwise, spawns a new process
    to communicate with W&B.

    Args:
        job_type (str, optional): The type of job running, defaults to 'train'
        config (dict, argparse, or tf.FLAGS, optional): The hyper parameters to store with the run
        project (str, optional): The project to push metrics to
        entity (str, optional): The entity to push metrics to
        dir (str, optional): An absolute path to a directory where metadata will be stored
        group (str, optional): A unique string shared by all runs in a given group
        tags (list, optional): A list of tags to apply to the run
        id (str, optional): A globally unique (per project) identifier for the run
        name (str, optional): A display name which does not have to be unique
        notes (str, optional): A multiline string associated with the run
        reinit (bool, optional): Allow multiple calls to init in the same process
        resume (bool, str, optional): Automatically resume this run if run from the same machine,
            you can also pass a unique run_id
        sync_tensorboard (bool, optional): Synchronize wandb logs to tensorboard or tensorboardX
        force (bool, optional): Force authentication with wandb, defaults to False
        magic (bool, dict, or str, optional): magic configuration as bool, dict, json string,
            yaml filename

    Returns:
        A wandb.run object for metric and config logging.
    """
    trigger.call('on_init', **locals())
    global run
    global __stage_dir__

    # We allow re-initialization when we're in Jupyter or explicity opt-in to it.
    in_jupyter = _get_python_type() != "python"
    if reinit or (in_jupyter and reinit != False):
        reset_env(exclude=env.immutable_keys())
        run = None

    # TODO: deprecate tensorboard
    # NOTE(review): `and` binds tighter than `or`, so this reads as
    # `tensorboard or (sync_tensorboard and not-yet-patched)` — confirm that
    # `tensorboard=True` is really meant to re-patch even if already patched.
    if tensorboard or sync_tensorboard and len(patched["tensorboard"]) == 0:
        util.get_module("wandb.tensorboard").patch()

    # Infer grouping/job metadata from managed environments (SageMaker, TFJob).
    sagemaker_config = util.parse_sm_config()
    tf_config = util.parse_tfjob_config()
    if group == None:
        group = os.getenv(env.RUN_GROUP)
    if job_type == None:
        job_type = os.getenv(env.JOB_TYPE)
    if sagemaker_config:
        # Set run_id and potentially grouping if we're in SageMaker
        run_id = os.getenv('TRAINING_JOB_NAME')
        if run_id:
            os.environ[env.RUN_ID] = '-'.join([
                run_id, os.getenv('CURRENT_HOST', socket.gethostname())])
        conf = json.load(
            open("/opt/ml/input/config/resourceconfig.json"))
        if group == None and len(conf["hosts"]) > 1:
            group = os.getenv('TRAINING_JOB_NAME')
        # Set secret variables
        if os.path.exists("secrets.env"):
            for line in open("secrets.env", "r"):
                key, val = line.strip().split('=', 1)
                os.environ[key] = val
    elif tf_config:
        cluster = tf_config.get('cluster')
        job_name = tf_config.get('task', {}).get('type')
        task_index = tf_config.get('task', {}).get('index')
        if job_name is not None and task_index is not None:
            # TODO: set run_id for resuming?
            run_id = cluster[job_name][task_index].rsplit(":")[0]
            if job_type == None:
                job_type = job_name
            if group == None and len(cluster.get("worker", [])) > 0:
                group = cluster[job_name][0].rsplit("-"+job_name, 1)[0]
    image = util.image_id_from_k8s()
    if image:
        os.environ[env.DOCKER] = image

    # Mirror explicit arguments into the environment so the child/headless
    # process and Run construction pick them up.
    if project:
        os.environ[env.PROJECT] = project
    if entity:
        os.environ[env.ENTITY] = entity
    if group:
        os.environ[env.RUN_GROUP] = group
    if job_type:
        os.environ[env.JOB_TYPE] = job_type
    if tags:
        os.environ[env.TAGS] = ",".join(tags)
    if id:
        os.environ[env.RUN_ID] = id
        if name is None:
            # We do this because of https://github.com/wandb/core/issues/2170
            # to ensure that the run's name is explicitly set to match its
            # id. If we don't do this and the id is eight characters long, the
            # backend will set the name to a generated human-friendly value.
            #
            # In any case, if the user is explicitly setting `id` but not
            # `name`, their id is probably a meaningful string that we can
            # use to label the run.
            name = os.environ.get(env.NAME, id)  # environment variable takes precedence over this.
    if name:
        os.environ[env.NAME] = name
    if notes:
        os.environ[env.NOTES] = notes
    if magic is not None and magic is not False:
        if isinstance(magic, dict):
            os.environ[env.MAGIC] = json.dumps(magic)
        elif isinstance(magic, str):
            os.environ[env.MAGIC] = magic
        elif isinstance(magic, bool):
            pass
        else:
            termwarn("wandb.init called with invalid magic parameter type", repeat=False)
        from wandb import magic_impl
        magic_impl.magic_install()
    if dir:
        os.environ[env.DIR] = dir
        util.mkdir_exists_ok(wandb_dir())

    # Resume handling: `True` means auto-resume; a string is treated as a
    # run id to resume (legacy behavior, see TODO below).
    resume_path = os.path.join(wandb_dir(), wandb_run.RESUME_FNAME)
    if resume == True:
        os.environ[env.RESUME] = "auto"
    elif resume:
        os.environ[env.RESUME] = os.environ.get(env.RESUME, "allow")
        # TODO: remove allowing resume as a string in the future
        os.environ[env.RUN_ID] = id or resume
    elif os.path.exists(resume_path):
        os.remove(resume_path)
    if os.environ.get(env.RESUME) == 'auto' and os.path.exists(resume_path):
        if not os.environ.get(env.RUN_ID):
            os.environ[env.RUN_ID] = json.load(open(resume_path))["run_id"]

    # the following line is useful to ensure that no W&B logging happens in the user
    # process that might interfere with what they do
    # logging.basicConfig(format='user process %(asctime)s - %(name)s - %(levelname)s - %(message)s')

    # If a thread calls wandb.init() it will get the same Run object as
    # the parent. If a child process with distinct memory space calls
    # wandb.init(), it won't get an error, but it will get a result of
    # None.
    # This check ensures that a child process can safely call wandb.init()
    # after a parent has (only the parent will create the Run object).
    # This doesn't protect against the case where the parent doesn't call
    # wandb.init but two children do.
    if run or os.getenv(env.INITED):
        return run

    if __stage_dir__ is None:
        __stage_dir__ = "wandb"
        util.mkdir_exists_ok(wandb_dir())

    # SIGQUIT drops into the debugger; signal module lacks SIGQUIT on some
    # platforms (e.g. Windows), hence the AttributeError guard.
    try:
        signal.signal(signal.SIGQUIT, _debugger)
    except AttributeError:
        pass

    try:
        run = wandb_run.Run.from_environment_or_defaults()
    except IOError as e:
        termerror('Failed to create run directory: {}'.format(e))
        raise LaunchError("Could not write to filesystem.")

    run.set_environment()

    def set_global_config(run):
        global config  # because we already have a local config
        config = run.config
    set_global_config(run)
    global summary
    summary = run.summary

    # set this immediately after setting the run and the config. if there is an
    # exception after this it'll probably break the user script anyway
    os.environ[env.INITED] = '1'

    # we do these checks after setting the run and the config because users scripts
    # may depend on those things
    if sys.platform == 'win32' and run.mode != 'clirun':
        termerror(
            'To use wandb on Windows, you need to run the command "wandb run python <your_train_script>.py"')
        return run

    # Dispatch on environment / run mode to pick the sync strategy.
    if in_jupyter:
        _init_jupyter(run)
    elif run.mode == 'clirun':
        pass
    elif run.mode == 'run':
        api = InternalApi()
        # let init_jupyter handle this itself
        if not in_jupyter and not api.api_key:
            termlog(
                "W&B is a tool that helps track and visualize machine learning experiments")
            if force:
                termerror(
                    "No credentials found. Run \"wandb login\" or \"wandb off\" to disable wandb")
            else:
                if run.check_anonymous():
                    _init_headless(run)
                else:
                    termlog(
                        "No credentials found. Run \"wandb login\" to visualize your metrics")
                    run.mode = "dryrun"
                    _init_headless(run, False)
        else:
            _init_headless(run)
    elif run.mode == 'dryrun':
        termlog(
            'Dry run mode, not syncing to the cloud.')
        _init_headless(run, False)
    else:
        termerror(
            'Invalid run mode "%s". Please unset WANDB_MODE.'
            % run.mode)
        raise LaunchError("The WANDB_MODE environment variable is invalid.")

    # set the run directory in the config so it actually gets persisted
    run.config.set_run_dir(run.dir)

    if sagemaker_config:
        run.config.update(sagemaker_config)
        allow_val_change = True
    if config:
        run.config.update(config, allow_val_change=allow_val_change)

    # Access history to ensure resumed is set when resuming
    run.history
    # Load the summary to support resuming
    run.summary.load()

    atexit.register(run.close_files)

    return run
def _init_headless(run, cloud=True):
    """Spawn the headless W&B child process and redirect stdout/stderr to it.

    Starts `internal_cli.py headless` as a subprocess, hands it the slave
    ends of two PTYs (pipes on Windows), waits for it to signal readiness
    over a local socket, and installs output redirectors plus the module-level
    `join` hook. `cloud=False` disables syncing in the child.

    Raises:
        LaunchError: if the child process does not respond within 30 seconds.
    """
    global join
    global _user_process_finished_called

    environ = dict(os.environ)
    run.set_environment(environ)

    server = wandb_socket.Server()
    run.socket = server
    hooks = ExitHooks()
    hooks.hook()

    if sys.platform == "win32":
        # PTYs don't work in windows so we use pipes.
        stdout_master_fd, stdout_slave_fd = os.pipe()
        stderr_master_fd, stderr_slave_fd = os.pipe()
    else:
        stdout_master_fd, stdout_slave_fd = io_wrap.wandb_pty(resize=False)
        stderr_master_fd, stderr_slave_fd = io_wrap.wandb_pty(resize=False)

    # Arguments serialized to JSON and passed on the child's command line.
    headless_args = {
        'command': 'headless',
        'pid': os.getpid(),
        'stdout_master_fd': stdout_master_fd,
        'stderr_master_fd': stderr_master_fd,
        'cloud': cloud,
        'port': server.port
    }
    internal_cli_path = os.path.join(
        os.path.dirname(__file__), 'internal_cli.py')

    if six.PY2:
        # TODO(adrian): close_fds=False is bad for security. we set
        # it so we can pass the PTY FDs to the wandb process. We
        # should use subprocess32, which has pass_fds.
        popen_kwargs = {'close_fds': False}
    else:
        popen_kwargs = {'pass_fds': [stdout_master_fd, stderr_master_fd]}

    # TODO(adrian): ensure we use *exactly* the same python interpreter
    # TODO(adrian): make wandb the foreground process so we don't give
    # up terminal control until syncing is finished.
    # https://stackoverflow.com/questions/30476971/is-the-child-process-in-foreground-or-background-on-fork-in-c
    wandb_process = subprocess.Popen([sys.executable, internal_cli_path, json.dumps(
        headless_args)], env=environ, **popen_kwargs)
    termlog('Started W&B process version {} with PID {}'.format(
        __version__, wandb_process.pid))
    # The child now owns the master ends; close our copies so EOF propagates.
    os.close(stdout_master_fd)
    os.close(stderr_master_fd)

    # Listen on the socket waiting for the wandb process to be ready
    try:
        success, message = server.listen(30)
    except KeyboardInterrupt:
        success = False
    else:
        if not success:
            termerror('W&B process (PID {}) did not respond'.format(
                wandb_process.pid))

    if not success:
        # Kill the unresponsive child and poll up to ~2s for it to exit.
        wandb_process.kill()
        for i in range(20):
            time.sleep(0.1)
            if wandb_process.poll() is not None:
                break
        if wandb_process.poll() is None:
            termerror('Failed to kill wandb process, PID {}'.format(
                wandb_process.pid))
        # TODO attempt to upload a debug log
        path = GLOBAL_LOG_FNAME.replace(os.getcwd()+os.sep, "")
        raise LaunchError(
            "W&B process failed to launch, see: {}".format(path))

    stdout_slave = os.fdopen(stdout_slave_fd, 'wb')
    stderr_slave = os.fdopen(stderr_slave_fd, 'wb')
    stdout_redirector = io_wrap.FileRedirector(sys.stdout, stdout_slave)
    stderr_redirector = io_wrap.FileRedirector(sys.stderr, stderr_slave)

    # TODO(adrian): we should register this right after starting the wandb process to
    # make sure we shut down the W&B process eg. if there's an exception in the code
    # above
    atexit.register(_user_process_finished, server, hooks,
                    wandb_process, stdout_redirector, stderr_redirector)

    def _wandb_join():
        # Explicit shutdown entry point exposed as the module-level `join`.
        _user_process_finished(server, hooks,
                               wandb_process, stdout_redirector, stderr_redirector)
    join = _wandb_join
    _user_process_finished_called = False

    # redirect output last of all so we don't miss out on error messages
    stdout_redirector.redirect()
    if not env.is_debug():
        stderr_redirector.redirect()
def _init_headless(run, cloud=True):
    """Spawn the headless W&B child process and redirect stdout/stderr to it.

    Starts `internal_cli.py headless` as a subprocess, hands it the slave
    ends of two PTYs (unused pipes on Windows, where output is instead
    mirrored to run.dir/output.log), waits for readiness over a local socket,
    and installs output redirectors plus the module-level `join` hook.
    `cloud=False` disables syncing in the child.

    Raises:
        LaunchError: if the child process does not respond within 30 seconds.
    """
    global join
    global _user_process_finished_called

    # Record the user program path so the child can report it.
    program = util.get_program()
    if program:
        os.environ[env.PROGRAM] = os.getenv(env.PROGRAM) or program
    environ = dict(os.environ)
    run.set_environment(environ)

    server = wandb_socket.Server()
    run.socket = server
    hooks = ExitHooks()
    hooks.hook()

    if platform.system() == "Windows":
        try:
            import win32api
            # Make sure we are not ignoring CTRL_C_EVENT
            # https://docs.microsoft.com/en-us/windows/console/setconsolectrlhandler
            # https://stackoverflow.com/questions/1364173/stopping-python-using-ctrlc
            win32api.SetConsoleCtrlHandler(None, False)
        except ImportError:
            termerror(
                "Install the win32api library with `pip install pypiwin32`")

        # PTYs don't work in windows so we create these unused pipes and
        # mirror stdout to run.dir/output.log. There should be a way to make
        # pipes work, but I haven't figured it out. See links in compat/windows
        stdout_master_fd, stdout_slave_fd = os.pipe()
        stderr_master_fd, stderr_slave_fd = os.pipe()
    else:
        stdout_master_fd, stdout_slave_fd = io_wrap.wandb_pty(resize=False)
        stderr_master_fd, stderr_slave_fd = io_wrap.wandb_pty(resize=False)

    # Arguments serialized to JSON and passed on the child's command line.
    headless_args = {
        'command': 'headless',
        'pid': os.getpid(),
        'stdout_master_fd': stdout_master_fd,
        'stderr_master_fd': stderr_master_fd,
        'cloud': cloud,
        'port': server.port
    }
    internal_cli_path = os.path.join(os.path.dirname(__file__), 'internal_cli.py')

    if six.PY2 or platform.system() == "Windows":
        # TODO(adrian): close_fds=False is bad for security. we set
        # it so we can pass the PTY FDs to the wandb process. We
        # should use subprocess32, which has pass_fds.
        popen_kwargs = {'close_fds': False}
    else:
        popen_kwargs = {'pass_fds': [stdout_master_fd, stderr_master_fd]}

    # TODO(adrian): ensure we use *exactly* the same python interpreter
    # TODO(adrian): make wandb the foreground process so we don't give
    # up terminal control until syncing is finished.
    # https://stackoverflow.com/questions/30476971/is-the-child-process-in-foreground-or-background-on-fork-in-c
    wandb_process = subprocess.Popen(
        [sys.executable, internal_cli_path, json.dumps(headless_args)],
        env=environ, **popen_kwargs)
    termlog('Tracking run with wandb version {}'.format(__version__))
    # The child now owns the master ends; close our copies so EOF propagates.
    os.close(stdout_master_fd)
    os.close(stderr_master_fd)

    # Listen on the socket waiting for the wandb process to be ready
    try:
        success, _ = server.listen(30)
    except KeyboardInterrupt:
        success = False
    else:
        if not success:
            termerror('W&B process (PID {}) did not respond'.format(
                wandb_process.pid))

    if not success:
        # Kill the unresponsive child and poll up to ~2s for it to exit.
        wandb_process.kill()
        for _ in range(20):
            time.sleep(0.1)
            if wandb_process.poll() is not None:
                break
        if wandb_process.poll() is None:
            termerror('Failed to kill wandb process, PID {}'.format(
                wandb_process.pid))
        # TODO attempt to upload a debug log
        path = GLOBAL_LOG_FNAME.replace(os.getcwd() + os.sep, "")
        raise LaunchError("W&B process failed to launch, see: {}".format(path))

    if platform.system() == "Windows":
        output = open(os.path.join(run.dir, "output.log"), "wb")
        stdout_redirector = io_wrap.WindowsRedirector(sys.stdout, output)
        stderr_redirector = io_wrap.WindowsRedirector(sys.stderr, output)
    else:
        stdout_slave = os.fdopen(stdout_slave_fd, 'wb')
        stderr_slave = os.fdopen(stderr_slave_fd, 'wb')
        try:
            stdout_redirector = io_wrap.FileRedirector(sys.stdout, stdout_slave)
            stderr_redirector = io_wrap.FileRedirector(sys.stderr, stderr_slave)
        except (ValueError, AttributeError):
            # stdout / err aren't files
            output = open(os.path.join(run.dir, "output.log"), "wb")
            stdout_redirector = io_wrap.WindowsRedirector(sys.stdout, output)
            stderr_redirector = io_wrap.WindowsRedirector(sys.stderr, output)

    # TODO(adrian): we should register this right after starting the wandb process to
    # make sure we shut down the W&B process eg. if there's an exception in the code
    # above
    atexit.register(_user_process_finished, server, hooks,
                    wandb_process, stdout_redirector, stderr_redirector)

    def _wandb_join(exit_code=None):
        # Explicit shutdown entry point exposed as the module-level `join`.
        global _global_run_stack
        shutdown_async_log_thread()
        run.close_files()
        if exit_code is not None:
            hooks.exit_code = exit_code
        _user_process_finished(server, hooks,
                               wandb_process, stdout_redirector, stderr_redirector)
        if len(_global_run_stack) > 0:
            _global_run_stack.pop()
    join = _wandb_join
    _user_process_finished_called = False

    # redirect output last of all so we don't miss out on error messages
    stdout_redirector.redirect()
    if not env.is_debug():
        stderr_redirector.redirect()