def write_netrc(host, entity, key):
    """Add our host and key to .netrc"""
    key_prefix, key_suffix = key.split('-', 1) if '-' in key else ('', key)
    if len(key_suffix) != 40:
        wandb.termlog(
            'API-key must be exactly 40 characters long: {} ({} chars)'.format(
                key_suffix, len(key_suffix)))
        return None
    try:
        normalized_host = host.split("/")[-1].split(":")[0]
        wandb.termlog("Appending key for {} to your netrc file: {}".format(
            normalized_host, os.path.expanduser('~/.netrc')))
        machine_line = 'machine %s' % normalized_host
        path = os.path.expanduser('~/.netrc')
        orig_lines = None
        try:
            with open(path) as f:
                orig_lines = f.read().strip().split('\n')
        except (IOError, OSError):
            pass
        with open(path, 'w') as f:
            if orig_lines:
                # delete this machine from the file if it's already there.
                skip = 0
                for line in orig_lines:
                    if machine_line in line:
                        skip = 2
                    elif skip:
                        skip -= 1
                    else:
                        f.write('%s\n' % line)
            f.write(textwrap.dedent("""\
                machine {host}
                  login {entity}
                  password {key}
                """).format(host=normalized_host, entity=entity, key=key))
        os.chmod(os.path.expanduser('~/.netrc'),
                 stat.S_IRUSR | stat.S_IWUSR)
        return True
    except IOError:
        wandb.termerror("Unable to read ~/.netrc")
        return None
def process_chunks(self, chunks):
    chunk_id = self._chunk_id
    # TODO: chunk_id is getting reset on each request...
    self._chunk_id += len(chunks)
    chunk_data = []
    for chunk in chunks:
        if len(chunk.data) > util.MAX_LINE_SIZE:
            msg = "Metric data exceeds maximum size of {} ({})".format(
                util.to_human_size(util.MAX_LINE_SIZE),
                util.to_human_size(len(chunk.data)),
            )
            wandb.termerror(msg, repeat=False)
            util.sentry_message(msg)
        else:
            chunk_data.append(chunk.data)
    return {
        "offset": chunk_id,
        "content": chunk_data,
    }
def objective(phi, grad=0):
    # search on hypersphere surface in polar coordinates - map back to cartesian
    cx = centroid + polar_to_cartesian(phi, R)
    try:
        cx2d = dimensionality_reduction.transform([cx])[0]
        error = self.decision_boundary_distance(cx)
        if penalize_known:
            # slight penalty for being too close to already known decision
            # boundary keypoints
            db_distances = [
                euclidean(cx2d, self.decision_boundary_points_2d[k])
                for k in range(len(self.decision_boundary_points_2d))
            ]
            error += (1e-8 * ((self.mean_2d_dist - np.min(db_distances)) /
                              self.mean_2d_dist) ** 2)
        return error
    except Exception as ex:
        wandb.termerror("Error in objective function: {}".format(ex))
        return np.infty
def _save_model(self, epoch):
    if wandb.run.disabled:
        return
    if self.verbose > 0:
        print(
            "Epoch %05d: %s improved from %0.5f to %0.5f,"
            " saving model to %s"
            % (epoch, self.monitor, self.best, self.current, self.filepath)
        )
    try:
        if self.save_weights_only:
            self.model.save_weights(self.filepath, overwrite=True)
        else:
            self.model.save(self.filepath, overwrite=True)
    # Was getting `RuntimeError: Unable to create link` in TF 1.13.1
    # also saw `TypeError: can't pickle _thread.RLock objects`
    except (ImportError, RuntimeError, TypeError) as e:
        wandb.termerror("Can't save model, h5py returned error: %s" % e)
        self.save_model = False
def _setup(self):
    logger.debug("Agent._setup()")
    self._init()
    parts = dict(entity=self._entity, project=self._project, name=self._sweep_path)
    err = util.parse_sweep_id(parts)
    if err:
        wandb.termerror(err)
        return
    entity = parts.get("entity") or self._entity
    project = parts.get("project") or self._project
    sweep_id = parts.get("name") or self._sweep_id
    if sweep_id:
        os.environ[wandb.env.SWEEP_ID] = sweep_id
    if entity:
        wandb.env.set_entity(entity)
    if project:
        wandb.env.set_project(project)
    if sweep_id:
        self._sweep_id = sweep_id
    self._register()
def apply(self) -> None:
    """Call require_* method for supported features."""
    last_message: str = ""
    for feature_item in self._features:
        full_feature = feature_item.split("@", 2)[0]
        feature = full_feature.split(":", 2)[0]
        func_str = "require_{}".format(feature.replace("-", "_"))
        func = getattr(self, func_str, None)
        if not func:
            last_message = "require() unsupported requirement: {}".format(feature)
            wandb.termwarn(last_message)
            continue
        func()
    if last_message:
        wandb.termerror(
            "Supported wandb.require() features can be found at: http://wandb.me/library-require"
        )
        raise RequireError(last_message)
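# --- usage sketch (not part of the snippet above) ---
# A minimal example of how the feature list consumed by apply() is typically populated,
# assuming the public wandb.require() entry point; the feature names are illustrative only.
# An unsupported name produces a termwarn() per item and one RequireError after the loop.
def _example_require_usage():
    import wandb

    wandb.require("service")          # dispatched to require_service()
    wandb.require(["service@beta"])   # text after "@" is stripped before dispatch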
def test_fitted(model):
    np = util.get_module("numpy", required="Logging plots requires numpy")
    pd = util.get_module("pandas", required="Logging dataframes requires pandas")
    scipy = util.get_module("scipy", required="Logging scipy matrices requires scipy")
    scikit = util.get_module(
        "sklearn", required="Logging plots matrices requires scikit-learn")
    try:
        model.predict(np.zeros((7, 3)))
    except scikit.exceptions.NotFittedError:
        wandb.termerror("Please fit the model before passing it in.")
        return False
    except AttributeError:
        # Some clustering models (LDA, PCA, Agglomerative) don't implement ``predict``
        try:
            scikit.utils.validation.check_is_fitted(
                model,
                [
                    "coef_",
                    "estimator_",
                    "labels_",
                    "n_clusters_",
                    "children_",
                    "components_",
                    "n_components_",
                    "n_iter_",
                    "n_batch_iter_",
                    "explained_variance_",
                    "singular_values_",
                    "mean_",
                ],
                all_or_any=any,
            )
            return True
        except scikit.exceptions.NotFittedError:
            wandb.termerror("Please fit the model before passing it in.")
            return False
    except Exception:
        # Assume it's fitted, since ``NotFittedError`` wasn't raised
        return True
def patch(save=None, tensorboardX=None, pytorch=None):
    if len(wandb.patched["tensorboard"]) > 0:
        raise ValueError(
            "Tensorboard already patched, remove sync_tensorboard=True from wandb.init "
            "or only call wandb.tensorboard.patch once."
        )
    wandb.util.get_module("tensorboard", required="Please install tensorboard package")
    c_writer = wandb.util.get_module(TENSORBOARD_C_MODULE)
    tb_writer = wandb.util.get_module(TENSORBOARD_PYTORCH_MODULE)
    if c_writer:
        _patch_tensorflow2(writer=c_writer, module=TENSORBOARD_C_MODULE, save=save)
    elif tb_writer:
        _patch_nontensorflow(writer=tb_writer, module=TENSORBOARD_PYTORCH_MODULE, save=save)
    else:
        wandb.termerror("Unsupported tensorboard configuration")
def agent_run(args):
    """A version of `wandb run` that the agent uses to run things."""
    run = wandb.wandb_run.Run.from_environment_or_defaults()
    run.enable_logging()

    # TODO: better failure handling
    root = run.api.git.root
    # handle non-git directories
    if not root:
        root = os.path.abspath(os.getcwd())
        host = socket.gethostname()
        remote_url = 'file://{}{}'.format(host, root)

    run.save(program=args['program'])
    env = dict(os.environ)
    run.set_environment(env)

    try:
        rm = wandb.run_manager.RunManager(run, agent_run=True)
    except wandb.run_manager.Error:
        exc_type, exc_value, exc_traceback = sys.exc_info()
        wandb.termerror(
            'An Exception was raised during setup, see %s for full traceback.' %
            util.get_log_file_path())
        wandb.termerror(str(exc_value))
        if 'permission' in str(exc_value):
            wandb.termerror(
                'Are you sure you provided the correct API key to "wandb login"?')
        lines = traceback.format_exception(exc_type, exc_value, exc_traceback)
        logging.error('\n'.join(lines))
    else:
        rm.run_user_process(args['program'], args['args'], env)
def run(ctx, program, args, id, resume, dir, configs, message, name, notes,
        show, tags, run_group, job_type):
    wandb.ensure_configured()
    if configs:
        config_paths = configs.split(',')
    else:
        config_paths = []
    config = Config(config_paths=config_paths,
                    wandb_dir=dir or wandb.wandb_dir())
    tags = [tag for tag in tags.split(",") if tag] if tags else None

    # populate run parameters from env if not specified
    id = id or os.environ.get(env.RUN_ID)
    message = message or os.environ.get(env.DESCRIPTION)
    tags = tags or env.get_tags()
    run_group = run_group or os.environ.get(env.RUN_GROUP)
    job_type = job_type or os.environ.get(env.JOB_TYPE)
    name = name or os.environ.get(env.NAME)
    notes = notes or os.environ.get(env.NOTES)
    resume = resume or os.environ.get(env.RESUME)

    run = wandb_run.Run(run_id=id,
                        mode='clirun',
                        config=config,
                        description=message,
                        program=program,
                        tags=tags,
                        group=run_group,
                        job_type=job_type,
                        name=name,
                        notes=notes,
                        resume=resume)
    run.enable_logging()

    environ = dict(os.environ)
    if configs:
        environ[env.CONFIG_PATHS] = configs
    if show:
        environ[env.SHOW_RUN] = 'True'

    if not run.api.api_key:
        util.prompt_api_key(run.api, input_callback=click.prompt)

    try:
        rm = run_manager.RunManager(run)
        rm.init_run(environ)
    except run_manager.Error:
        exc_type, exc_value, exc_traceback = sys.exc_info()
        wandb.termerror(
            'An Exception was raised during setup, see %s for full traceback.' %
            util.get_log_file_path())
        wandb.termerror(str(exc_value))
        if 'permission' in str(exc_value):
            wandb.termerror(
                'Are you sure you provided the correct API key to "wandb login"?')
        lines = traceback.format_exception(exc_type, exc_value, exc_traceback)
        logger.error('\n'.join(lines))
        sys.exit(1)
    rm.run_user_process(program, args, environ)
def sync_spell(self, run, env=None):
    """Syncs this run with spell"""
    try:
        env = env or os.environ
        run.config._set_wandb("spell_url", env.get("SPELL_RUN_URL"))
        run.config.persist()
        try:
            url = run.get_url()
        except CommError as e:
            wandb.termerror("Unable to register run with spell.run: %s" % e.message)
            return False
        return requests.put(
            env.get("SPELL_API_URL", "https://api.spell.run") + "/wandb_url",
            json={"access_token": env.get("WANDB_ACCESS_TOKEN"), "url": url},
            timeout=2,
        )
    except requests.RequestException:
        return False
def _handle_event(self, event):
    if isinstance(event, upload_job.EventJobDone):
        job = event.job
        job.join()
        if job.artifact_id:
            if event.success:
                self._artifacts[job.artifact_id]['pending_count'] -= 1
                self._maybe_commit_artifact(job.artifact_id)
            else:
                termerror(
                    'Uploading artifact file failed. Artifact won\'t be committed.')
        self._running_jobs.pop(job.save_name)
        # If we have any pending jobs, start one now
        if self._pending_jobs:
            event = self._pending_jobs.pop(0)
            self._start_upload_job(event)
    elif isinstance(event, RequestCommitArtifact):
        self._artifacts[event.artifact_id]['commit_requested'] = True
        if event.on_commit:
            self._artifacts[event.artifact_id]['commit_callbacks'].add(event.on_commit)
        self._maybe_commit_artifact(event.artifact_id)
    elif isinstance(event, RequestUpload):
        if event.artifact_id is not None:
            if event.artifact_id not in self._artifacts:
                self._artifacts[event.artifact_id] = {
                    'pending_count': 0,
                    'commit_requested': False,
                    'commit_callbacks': set(),
                }
            self._artifacts[event.artifact_id]['pending_count'] += 1
        if len(self._running_jobs) == self._max_jobs:
            self._pending_jobs.append(event)
        else:
            self._start_upload_job(event)
    else:
        raise Exception('Programming error: unhandled event: %s' % str(event))
def _run_cmd(
    self, cmd: List[str], output_only: Optional[bool] = False
) -> Optional[Union["subprocess.Popen[bytes]", bytes]]:
    """Runs the command and returns a popen object or the stdout of the command.

    Arguments:
        cmd: The command to run
        output_only: If true just return the stdout bytes
    """
    try:
        env = os.environ
        popen = subprocess.Popen(cmd, env=env, stdout=subprocess.PIPE)
        if output_only:
            popen.wait()
            if popen.stdout is not None:
                return popen.stdout.read()
        return popen
    except subprocess.CalledProcessError as e:
        wandb.termerror("Command failed: {}".format(e))
        return None
def push(self):
    try:
        size = os.path.getsize(self.save_path)
    except OSError:
        size = 0
    self._progress[self.label] = {
        'total': size,
        'uploaded': 0,
        'failed': False
    }
    try:
        with open(self.save_path, 'rb') as f:
            self._api.push({self.save_name: f},
                           progress=lambda _, t: self.progress(t))
    except Exception as e:
        self._progress[self.label]['uploaded'] = 0
        self._progress[self.label]['failed'] = True
        wandb.util.sentry_exc(e)
        wandb.termerror('Error uploading "{}": {}, {}'.format(
            self.save_name, type(e).__name__, e))
def sweep(ctx, controller, verbose, config_yaml):
    click.echo('Creating sweep from: %s' % config_yaml)
    try:
        yaml_file = open(config_yaml)
    except (OSError, IOError):
        wandb.termerror('Couldn\'t open sweep file: %s' % config_yaml)
        return
    try:
        config = util.load_yaml(yaml_file)
    except yaml.YAMLError as err:
        wandb.termerror('Error in configuration file: %s' % err)
        return
    if config is None:
        wandb.termerror('Configuration file is empty')
        return

    is_local = config.get('controller', {}).get('type') == 'local'
    if is_local:
        tuner = wandb_controller.controller()
        err = tuner._validate(config)
        if err:
            wandb.termerror('Error in sweep file: %s' % err)
            return
    else:
        if controller:
            wandb.termerror(
                'Option "controller" only permitted for controller type "local"')
            return

    sweep_id = api.upsert_sweep(config)
    print('Create sweep with ID:', sweep_id)
    sweep_url = wandb_controller._get_sweep_url(api, sweep_id)
    if sweep_url:
        print('Sweep URL:', sweep_url)
    if controller:
        click.echo('Starting wandb controller...')
        tuner = wandb_controller.controller(sweep_id)
        tuner.run(verbose=verbose)
def test_missing(**kwargs):
    test_passed = True
    for k, v in kwargs.items():
        # Missing/empty params/datapoint arrays
        if v is None:
            wandb.termerror("%s is None. Please try again." % (k))
            test_passed = False
        if (k == "X") or (k == "X_test"):
            if isinstance(v, scipy.sparse.csr.csr_matrix):
                v = v.toarray()
            elif isinstance(v, (pd.DataFrame, pd.Series)):
                v = v.to_numpy()
            elif isinstance(v, list):
                v = np.asarray(v)

            # Warn the user about missing values
            missing = 0
            missing = np.count_nonzero(pd.isnull(v))
            if missing > 0:
                wandb.termwarn("%s contains %d missing values. " % (k, missing))
                test_passed = False

            # Ensure the dataset contains only integers
            non_nums = 0
            if v.ndim == 1:
                non_nums = sum(1 for val in v
                               if (not isinstance(val, (int, float, complex))
                                   and not isinstance(val, np.number)))
            else:
                non_nums = sum(1 for sl in v for val in sl
                               if (not isinstance(val, (int, float, complex))
                                   and not isinstance(val, np.number)))
            if non_nums > 0:
                wandb.termerror(
                    "%s contains values that are not numbers. Please vectorize, "
                    "label encode or one hot encode %s and call the plotting "
                    "function again." % (k, k))
                test_passed = False
    return test_passed
def _prompt_api_key(self) -> Tuple[Optional[str], ApiKeyStatus]:
    api = Api(self._settings)
    while True:
        try:
            key = apikey.prompt_api_key(
                self._settings,
                api=api,
                no_offline=self._settings.force if self._settings else None,
                no_create=self._settings.force if self._settings else None,
            )
        except ValueError as e:
            # invalid key provided, try again
            wandb.termerror(e.args[0])
            continue
        except TimeoutError:
            wandb.termlog("W&B disabled due to login timeout.")
            return None, ApiKeyStatus.DISABLED
        if key is False:
            return None, ApiKeyStatus.NOTTY
        if not key:
            return None, ApiKeyStatus.OFFLINE
        return key, ApiKeyStatus.VALID
def history(self, samples=500, keys=None, x_axis="_step", pandas=True, stream="default"):
    """
    Returns sampled history metrics for a run. This is simpler and faster if you
    are ok with the history records being sampled.

    Args:
        samples (int, optional): The number of samples to return
        pandas (bool, optional): Return a pandas dataframe
        keys (list, optional): Only return metrics for specific keys
        x_axis (str, optional): Use this metric as the xAxis defaults to _step
        stream (str, optional): "default" for metrics, "system" for machine metrics

    Returns:
        If pandas=True returns a `pandas.DataFrame` of history metrics.
        If pandas=False returns a list of dicts of history metrics.
    """
    if keys and stream != "default":
        wandb.termerror("stream must be default when specifying keys")
        return []
    elif keys:
        lines = self._sampled_history(keys=keys, x_axis=x_axis, samples=samples)
    else:
        lines = self._full_history(samples=samples, stream=stream)
    if pandas:
        pandas = util.get_module("pandas")
        if pandas:
            lines = pandas.DataFrame.from_records(lines)
        else:
            print("Unable to load pandas, call history with pandas=False")
    return lines
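# --- usage sketch (not part of the snippet above) ---
# How history() is typically reached through the public API client; the
# entity/project/run path below is a placeholder.
def _example_history_usage():
    import wandb

    api = wandb.Api()
    run = api.run("my-entity/my-project/abc123")
    df = run.history(samples=100, keys=["loss"])       # sampled metrics as a DataFrame
    rows = run.history(pandas=False, stream="system")  # machine metrics as a list of dicts
    return df, rows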
def test_types(**kwargs):
    test_passed = True
    for k, v in kwargs.items():
        # check for incorrect types
        if ((k == 'X') or (k == 'X_test') or (k == 'y') or (k == 'y_test')
                or (k == 'y_true') or (k == 'y_probas')):
            # FIXME: do this individually
            if not isinstance(
                    v, (collections.Sequence, collections.Iterable,
                        np.ndarray, np.generic, pd.DataFrame, pd.Series, list)):
                wandb.termerror("%s is not an array. Please try again." % (k))
                test_passed = False
        # check for classifier types
        if (k == 'model'):
            if ((not sklearn.base.is_classifier(v))
                    and (not sklearn.base.is_regressor(v))):
                wandb.termerror(
                    "%s is not a classifier or regressor. Please try again." % (k))
                test_passed = False
        elif (k == 'clf' or k == 'binary_clf'):
            if (not (sklearn.base.is_classifier(v))):
                wandb.termerror("%s is not a classifier. Please try again." % (k))
                test_passed = False
        elif (k == 'regressor'):
            if (not sklearn.base.is_regressor(v)):
                wandb.termerror("%s is not a regressor. Please try again." % (k))
                test_passed = False
        elif (k == 'clusterer'):
            if (not (getattr(v, "_estimator_type", None) == "clusterer")):
                wandb.termerror("%s is not a clusterer. Please try again." % (k))
                test_passed = False
    return test_passed
def write_netrc(host, entity, key):
    """Add our host and key to .netrc"""
    key_prefix, key_suffix = key.split("-", 1) if "-" in key else ("", key)
    if len(key_suffix) != 40:
        wandb.termerror(
            "API-key must be exactly 40 characters long: {} ({} chars)".format(
                key_suffix, len(key_suffix)
            )
        )
        return None
    try:
        normalized_host = host.rstrip("/").split("/")[-1].split(":")[0]
        if normalized_host != "localhost" and "." not in normalized_host:
            wandb.termerror("Host must be a url in the form https://some.address.com")
            return None
        wandb.termlog(
            "Appending key for {} to your netrc file: {}".format(
                normalized_host, os.path.expanduser("~/.netrc")
            )
        )
        machine_line = "machine %s" % normalized_host
        path = os.path.expanduser("~/.netrc")
        orig_lines = None
        try:
            with open(path) as f:
                orig_lines = f.read().strip().split("\n")
        except IOError:
            pass
        with open(path, "w") as f:
            if orig_lines:
                # delete this machine from the file if it's already there.
                skip = 0
                for line in orig_lines:
                    # we fix invalid netrc files with an empty host that we wrote before
                    # verifying host...
                    if line == "machine " or machine_line in line:
                        skip = 2
                    elif skip:
                        skip -= 1
                    else:
                        f.write("%s\n" % line)
            f.write(
                textwrap.dedent(
                    """\
                    machine {host}
                      login {entity}
                      password {key}
                    """
                ).format(host=normalized_host, entity=entity, key=key)
            )
        os.chmod(os.path.expanduser("~/.netrc"), stat.S_IRUSR | stat.S_IWUSR)
        return True
    except IOError:
        wandb.termerror("Unable to read ~/.netrc")
        return None
def run(ctx, program, args, id, resume, dir, configs, message, name, notes,
        show, tags, run_group, job_type):
    wandb.ensure_configured()
    if configs:
        config_paths = configs.split(',')
    else:
        config_paths = []
    config = Config(config_paths=config_paths,
                    wandb_dir=dir or wandb.wandb_dir())
    tags = [tag for tag in tags.split(",") if tag] if tags else None
    run = wandb_run.Run(run_id=id,
                        mode='clirun',
                        config=config,
                        description=message,
                        program=program,
                        tags=tags,
                        group=run_group,
                        job_type=job_type,
                        name=name,
                        notes=notes,
                        resume=resume)
    run.enable_logging()

    environ = dict(os.environ)
    if configs:
        environ[env.CONFIG_PATHS] = configs
    if show:
        environ[env.SHOW_RUN] = 'True'
    run.check_anonymous()

    try:
        rm = run_manager.RunManager(run)
        rm.init_run(environ)
    except run_manager.Error:
        exc_type, exc_value, exc_traceback = sys.exc_info()
        wandb.termerror(
            'An Exception was raised during setup, see %s for full traceback.' %
            util.get_log_file_path())
        wandb.termerror(str(exc_value))
        if 'permission' in str(exc_value):
            wandb.termerror(
                'Are you sure you provided the correct API key to "wandb login"?')
        lines = traceback.format_exception(exc_type, exc_value, exc_traceback)
        logger.error('\n'.join(lines))
        sys.exit(1)
    rm.run_user_process(program, args, environ)
def sweep(ctx, config_yaml):
    click.echo('Creating sweep from: %s' % config_yaml)
    try:
        yaml_file = open(config_yaml)
    except (OSError, IOError):
        wandb.termerror('Couldn\'t open sweep file: %s' % config_yaml)
        return
    try:
        config = util.load_yaml(yaml_file)
    except yaml.YAMLError as err:
        wandb.termerror('Error in configuration file: %s' % err)
        return
    if config is None:
        wandb.termerror('Configuration file is empty')
        return
    sweep_id = api.upsert_sweep(config)
    print('Create sweep with ID:', sweep_id)
def agent_run(args):
    """A version of `wandb run` that the agent uses to run things."""
    run = wandb.wandb_run.Run.from_environment_or_defaults()
    api = wandb.api.Api()
    api.set_current_run_id(run.id)

    # TODO: better failure handling
    root = api.git.root
    remote_url = api.git.remote_url
    host = socket.gethostname()
    # handle non-git directories
    if not root:
        root = os.path.abspath(os.getcwd())
        remote_url = 'file://%s%s' % (host, root)

    upsert_result = api.upsert_run(id=run.storage_id,
                                   name=run.id,
                                   project=api.settings("project"),
                                   entity=api.settings("entity"),
                                   config=run.config.as_dict(),
                                   description=run.description,
                                   host=host,
                                   program_path=args['program'],
                                   repo=remote_url,
                                   sweep_name=run.sweep_id)
    run.storage_id = upsert_result['id']
    env = dict(os.environ)
    run.set_environment(env)

    try:
        rm = wandb.run_manager.RunManager(api, run)
    except wandb.run_manager.Error:
        exc_type, exc_value, exc_traceback = sys.exc_info()
        wandb.termerror(
            'An Exception was raised during setup, see %s for full traceback.' %
            util.get_log_file_path())
        wandb.termerror(str(exc_value))
        if 'permission' in str(exc_value):
            wandb.termerror(
                'Are you sure you provided the correct API key to "wandb login"?')
        lines = traceback.format_exception(exc_type, exc_value, exc_traceback)
        logger.error('\n'.join(lines))
    else:
        rm.run_user_process(args['program'], args['args'], env)
def run(ctx, program, args, id, resume, dir, configs, message, show):
    api.ensure_configured()
    if configs:
        config_paths = configs.split(',')
    else:
        config_paths = []
    config = Config(config_paths=config_paths,
                    wandb_dir=dir or wandb.wandb_dir())
    run = wandb_run.Run(run_id=id,
                        mode='clirun',
                        config=config,
                        description=message,
                        program=program,
                        resume=resume)
    api.set_current_run_id(run.id)

    env = dict(os.environ)
    if configs:
        env['WANDB_CONFIG_PATHS'] = configs
    if show:
        env['WANDB_SHOW_RUN'] = 'True'

    try:
        rm = run_manager.RunManager(api, run)
        rm.init_run(env)
    except run_manager.Error:
        exc_type, exc_value, exc_traceback = sys.exc_info()
        wandb.termerror(
            'An Exception was raised during setup, see %s for full traceback.' %
            util.get_log_file_path())
        wandb.termerror(str(exc_value))
        if 'permission' in str(exc_value):
            wandb.termerror(
                'Are you sure you provided the correct API key to "wandb login"?')
        lines = traceback.format_exception(exc_type, exc_value, exc_traceback)
        logger.error('\n'.join(lines))
        sys.exit(1)
    rm.run_user_process(program, args, env)
def init(
    job_type=None,
    dir=None,
    config=None,
    project=None,
    entity=None,
    reinit=None,
    tags=None,
    group=None,
    name=None,
    notes=None,
    magic=None,
    config_exclude_keys=None,
    config_include_keys=None,
    anonymous=None,
    mode=None,
    allow_val_change=None,
    resume=None,
    force=None,
    tensorboard=None,  # alias for sync_tensorboard
    sync_tensorboard=None,
    monitor_gym=None,
    save_code=None,
    id=None,
    settings=None,
):
    """Initialize W&B

    Spawns a new process to start or resume a run locally and communicate
    with a wandb server. Should be called before any calls to wandb.log.

    Arguments:
        job_type (str, optional): The type of job running, defaults to 'train'
        dir (str, optional): An absolute path to a directory where metadata will be stored.
        config (dict, argparse, or absl.flags, str, optional):
            Sets the config parameters (typically hyperparameters) to store with the run.
            See also wandb.config.
            If dict, argparse or absl.flags: will load the key value pairs into the runs
                config object.
            If str: will look for a yaml file that includes config parameters and load them
                into the run's config object.
        project (str, optional): W&B Project.
        entity (str, optional): W&B Entity.
        reinit (bool, optional): Allow multiple calls to init in the same process.
        tags (list, optional): A list of tags to apply to the run.
        group (str, optional): A unique string shared by all runs in a given group.
        name (str, optional): A display name for the run which does not have to be unique.
        notes (str, optional): A multiline string associated with the run.
        magic (bool, dict, or str, optional): magic configuration as bool, dict, json
            string, yaml filename.
        config_exclude_keys (list, optional): string keys to exclude storing in W&B when
            specifying config.
        config_include_keys (list, optional): string keys to include storing in W&B when
            specifying config.
        anonymous (str, optional): Can be "allow", "must", or "never". Controls whether
            anonymous logging is allowed. Defaults to never.
        mode (str, optional): Can be "online", "offline" or "disabled". Defaults to online.
        allow_val_change (bool, optional): allow config values to be changed after setting.
            Defaults to true in jupyter and false otherwise.
        resume (bool, str, optional): Sets the resuming behavior. Should be one of:
            "allow", "must", "never", "auto" or None. Defaults to None. Cases:
            - "auto" (or True): automatically resume the previous run on the same machine
                if the previous run crashed, otherwise starts a new run.
            - "allow": if id is set with init(id="UNIQUE_ID") or WANDB_RUN_ID="UNIQUE_ID"
                and it is identical to a previous run, wandb will automatically resume the
                run with the id. Otherwise wandb will start a new run.
            - "never": if id is set with init(id="UNIQUE_ID") or WANDB_RUN_ID="UNIQUE_ID"
                and it is identical to a previous run, wandb will crash.
            - "must": if id is set with init(id="UNIQUE_ID") or WANDB_RUN_ID="UNIQUE_ID"
                and it is identical to a previous run, wandb will automatically resume the
                run with the id. Otherwise wandb will crash.
            - None: never resumes - if a run has a duplicate run_id the previous run is
                overwritten.
            See https://docs.wandb.com/library/advanced/resuming for more detail.
        force (bool, optional): If true, will cause script to crash if user can't or isn't
            logged in to a wandb server. If false, will cause script to run in offline
            modes if user can't or isn't logged in to a wandb server. Defaults to false.
        sync_tensorboard (bool, optional): Synchronize wandb logs from tensorboard or
            tensorboardX and saves the relevant events file. Defaults to false.
        monitor_gym (bool, optional): automatically logs videos of environment when using
            OpenAI Gym (see https://docs.wandb.com/library/integrations/openai-gym)
            Defaults to false.
        save_code (bool, optional): Save the entrypoint or jupyter session history source
            code.
        id (str, optional): A globally unique (per project) identifier for the run. This is
            primarily used for resuming.

    Examples:
        Basic usage
        ```
        wandb.init()
        ```
        Launch multiple runs from the same script
        ```
        for x in range(10):
            with wandb.init(project="my-projo") as run:
                for y in range(100):
                    run.log({"metric": x+y})
        ```

    Raises:
        Exception: if problem.

    Returns:
        A `Run` object.
    """
    assert not wandb._IS_INTERNAL_PROCESS
    kwargs = dict(locals())
    error_seen = None
    except_exit = None
    try:
        wi = _WandbInit()
        wi.setup(kwargs)
        except_exit = wi.settings._except_exit
        try:
            run = wi.init()
            except_exit = wi.settings._except_exit
        except (KeyboardInterrupt, Exception) as e:
            if not isinstance(e, KeyboardInterrupt):
                sentry_exc(e)
            if not (
                wandb.wandb_agent._is_running() and isinstance(e, KeyboardInterrupt)
            ):
                getcaller()
            assert logger
            if wi.settings.problem == "fatal":
                raise
            if wi.settings.problem == "warn":
                pass
            # TODO(jhr): figure out how to make this RunDummy
            run = None
    except UsageError:
        raise
    except KeyboardInterrupt as e:
        assert logger
        logger.warning("interrupted", exc_info=e)
        raise e
    except Exception as e:
        error_seen = e
        traceback.print_exc()
        assert logger
        logger.error("error", exc_info=e)
        # Need to build delay into this sentry capture because our exit hooks
        # mess with sentry's ability to send out errors before the program ends.
        sentry_exc(e, delay=True)
        # reraise(*sys.exc_info())
        # six.raise_from(Exception("problem"), e)
    finally:
        if error_seen:
            wandb.termerror("Abnormal program exit")
            if except_exit:
                os._exit(-1)
            six.raise_from(Exception("problem"), error_seen)
    return run
def from_directory(cls, directory, project=None, entity=None, run_id=None,
                   api=None, ignore_globs=None):
    api = api or InternalApi()
    run_id = run_id or util.generate_id()
    run = Run(run_id=run_id, dir=directory)

    run_name = None
    project_from_meta = None
    snap = DirectorySnapshot(directory)
    meta = next((p for p in snap.paths if METADATA_FNAME in p), None)
    if meta:
        meta = json.load(open(meta))
        run_name = meta.get("name")
        project_from_meta = meta.get("project")

    project = project or project_from_meta or api.settings(
        "project") or run.auto_project_name(api=api)
    if project is None:
        raise ValueError("You must specify project")
    api.set_current_run_id(run_id)
    api.set_setting("project", project)
    if entity:
        api.set_setting("entity", entity)
    res = api.upsert_run(name=run_id, project=project, entity=entity,
                         display_name=run_name)
    entity = res["project"]["entity"]["name"]
    wandb.termlog("Syncing {} to:".format(directory))
    try:
        wandb.termlog(res["displayName"] + " " + run.get_url(api))
    except CommError as e:
        wandb.termwarn(e.message)

    file_api = api.get_file_stream_api()
    file_api.start()
    paths = [
        os.path.relpath(abs_path, directory)
        for abs_path in snap.paths if os.path.isfile(abs_path)
    ]
    if ignore_globs:
        paths = set(paths)
        for g in ignore_globs:
            paths = paths - set(fnmatch.filter(paths, g))
        paths = list(paths)

    run_update = {"id": res["id"]}
    tfevents = sorted([p for p in snap.paths if ".tfevents." in p])
    history = next((p for p in snap.paths if HISTORY_FNAME in p), None)
    event = next((p for p in snap.paths if EVENTS_FNAME in p), None)
    config = next((p for p in snap.paths if CONFIG_FNAME in p), None)
    user_config = next((p for p in snap.paths if USER_CONFIG_FNAME in p), None)
    summary = next((p for p in snap.paths if SUMMARY_FNAME in p), None)

    if history:
        wandb.termlog("Uploading history metrics")
        file_api.stream_file(history)
        snap.paths.remove(history)
    elif len(tfevents) > 0:
        from wandb import tensorflow as wbtf
        wandb.termlog("Found tfevents file, converting...")
        summary = {}
        for path in tfevents:
            filename = os.path.basename(path)
            namespace = path.replace(filename, "").replace(directory, "").strip(os.sep)
            summary.update(
                wbtf.stream_tfevents(path, file_api, run, namespace=namespace))
        for path in glob.glob(os.path.join(directory, "media/**/*"), recursive=True):
            if os.path.isfile(path):
                paths.append(path)
    else:
        wandb.termerror("No history or tfevents files found, only syncing files")

    if event:
        file_api.stream_file(event)
        snap.paths.remove(event)
    if config:
        run_update["config"] = util.load_yaml(open(config))
    elif user_config:
        # TODO: half-baked support for config.json
        run_update["config"] = {k: {"value": v}
                                for k, v in six.iteritems(user_config)}

    if isinstance(summary, dict):
        # TODO: summary should already have data_types converted here...
        run_update["summary_metrics"] = util.json_dumps_safer(summary)
    elif summary:
        run_update["summary_metrics"] = open(summary).read()

    if meta:
        if meta.get("git"):
            run_update["commit"] = meta["git"].get("commit")
            run_update["repo"] = meta["git"].get("remote")
        if meta.get("host"):
            run_update["host"] = meta["host"]
        run_update["program_path"] = meta["program"]
        run_update["job_type"] = meta.get("jobType")
        run_update["notes"] = meta.get("notes")
    else:
        run_update["host"] = run.host

    wandb.termlog("Updating run and uploading files")
    api.upsert_run(**run_update)
    pusher = FilePusher(api)
    for k in paths:
        path = os.path.abspath(os.path.join(directory, k))
        pusher.update_file(k, path)
        pusher.file_changed(k, path)
    pusher.finish()
    pusher.print_status()
    file_api.finish(0)
    # Remove temporary media images generated from tfevents
    if history is None and os.path.exists(os.path.join(directory, "media")):
        shutil.rmtree(os.path.join(directory, "media"))
    wandb.termlog("Finished!")
    return run
def _run_jobs_from_queue(self):  # noqa: C901
    global _INSTANCES
    _INSTANCES += 1
    try:
        waiting = False
        count = 0
        while True:
            if self._exit_flag:
                return
            try:
                try:
                    job = self._queue.get(timeout=5)
                    if self._exit_flag:
                        logger.debug("Exiting main loop due to exit flag.")
                        wandb.termlog("Sweep Agent: Exiting.")
                        return
                except queue.Empty:
                    if not waiting:
                        logger.debug("Paused.")
                        wandb.termlog("Sweep Agent: Waiting for job.")
                        waiting = True
                    time.sleep(5)
                    if self._exit_flag:
                        logger.debug("Exiting main loop due to exit flag.")
                        wandb.termlog("Sweep Agent: Exiting.")
                        return
                    continue
                if waiting:
                    logger.debug("Resumed.")
                    wandb.termlog("Job received.")
                    waiting = False
                count += 1
                run_id = job.run_id
                if self._run_status[run_id] == RunStatus.STOPPED:
                    continue
                logger.debug("Spawning new thread for run {}.".format(run_id))
                thread = threading.Thread(target=self._run_job, args=(job,))
                self._run_threads[run_id] = thread
                thread.start()
                self._run_status[run_id] = RunStatus.RUNNING
                thread.join()
                logger.debug("Thread joined for run {}.".format(run_id))
                if self._run_status[run_id] == RunStatus.RUNNING:
                    self._run_status[run_id] = RunStatus.DONE
                elif self._run_status[run_id] == RunStatus.ERRORED:
                    exc = self._exceptions[run_id]
                    logger.error("Run {} errored: {}".format(run_id, repr(exc)))
                    wandb.termerror("Run {} errored: {}".format(run_id, repr(exc)))
                    if os.getenv(wandb.env.AGENT_DISABLE_FLAPPING) == "true":
                        self._exit_flag = True
                        return
                    elif (time.time() - self._start_time < self.FLAPPING_MAX_SECONDS) and (
                            len(self._exceptions) >= self.FLAPPING_MAX_FAILURES):
                        msg = "Detected {} failed runs in the first {} seconds, killing sweep.".format(
                            self.FLAPPING_MAX_FAILURES, self.FLAPPING_MAX_SECONDS)
                        logger.error(msg)
                        wandb.termerror(msg)
                        wandb.termlog(
                            "To disable this check set WANDB_AGENT_DISABLE_FLAPPING=true")
                        self._exit_flag = True
                        return
                    if (self._max_initial_failures < len(self._exceptions)
                            and len(self._exceptions) >= count):
                        msg = "Detected {} failed runs in a row at start, killing sweep.".format(
                            self._max_initial_failures)
                        logger.error(msg)
                        wandb.termerror(msg)
                        wandb.termlog(
                            "To change this value set WANDB_AGENT_MAX_INITIAL_FAILURES=val")
                        self._exit_flag = True
                        return
                if self._count and self._count == count:
                    logger.debug("Exiting main loop because max count reached.")
                    self._exit_flag = True
                    return
            except KeyboardInterrupt:
                logger.debug("Ctrl + C detected. Stopping sweep.")
                wandb.termlog("Ctrl + C detected. Stopping sweep.")
                self._exit()
                return
            except Exception as e:
                if self._exit_flag:
                    logger.debug("Exiting main loop due to exit flag.")
                    wandb.termlog("Sweep Agent: Killed.")
                    return
                else:
                    raise e
    finally:
        _INSTANCES -= 1
def init(
    job_type=None,
    dir=None,
    config=None,
    project=None,
    entity=None,
    reinit=None,
    tags=None,
    group=None,
    name=None,
    notes=None,
    magic=None,
    config_exclude_keys=None,
    config_include_keys=None,
    anonymous=None,
    mode=None,
    allow_val_change=None,
    resume=None,
    force=None,
    tensorboard=None,  # alias for sync_tensorboard
    sync_tensorboard=None,
    monitor_gym=None,
    save_code=None,
    id=None,
    settings=None,
):
    """
    Start a new tracked run with `wandb.init()`.

    In an ML training pipeline, you could add `wandb.init()` to the beginning of your
    training script as well as your evaluation script, and each piece would be tracked
    as a run in W&B.

    `wandb.init()` spawns a new background process to log data to a run, and it also
    syncs data to wandb.ai by default so you can see live visualizations.
    Call `wandb.init()` to start a run before logging data with `wandb.log()`.

    `wandb.init()` returns a run object, and you can also access the run object with
    wandb.run.

    Arguments:
        project: (str, optional) The name of the project where you're sending the new
            run. If the project is not specified, the run is put in an "Uncategorized"
            project.
        entity: (str, optional) An entity is a username or team name where you're
            sending runs. This entity must exist before you can send runs there, so make
            sure to create your account or team in the UI before starting to log runs.
            If you don't specify an entity, the run will be sent to your default entity,
            which is usually your username. Change your default entity in
            [Settings](wandb.ai/settings) under "default location to create new projects".
        config: (dict, argparse, absl.flags, str, optional)
            This sets wandb.config, a dictionary-like object for saving inputs to your
            job, like hyperparameters for a model or settings for a data preprocessing
            job. The config will show up in a table in the UI that you can use to group,
            filter, and sort runs. Keys should not contain `.` in their names, and values
            should be under 10 MB.
            If dict, argparse or absl.flags: will load the key value pairs into the
                wandb.config object.
            If str: will look for a yaml file by that name, and load config from that
                file into the wandb.config object.
        save_code: (bool, optional) Turn this on to save the main script or notebook to
            W&B. This is valuable for improving experiment reproducibility and to diff
            code across experiments in the UI. By default this is off, but you can flip
            the default behavior to "on" in [Settings](wandb.ai/settings).
        group: (str, optional) Specify a group to organize individual runs into a larger
            experiment. For example, you might be doing cross validation, or you might
            have multiple jobs that train and evaluate a model against different test
            sets. Group gives you a way to organize runs together into a larger whole,
            and you can toggle this on and off in the UI. For more details, see
            [Grouping](docs.wandb.com/library/grouping).
        job_type: (str, optional) Specify the type of run, which is useful when you're
            grouping runs together into larger experiments using group. For example, you
            might have multiple jobs in a group, with job types like train and eval.
            Setting this makes it easy to filter and group similar runs together in the
            UI so you can compare apples to apples.
        tags: (list, optional) A list of strings, which will populate the list of tags
            on this run in the UI. Tags are useful for organizing runs together, or
            applying temporary labels like "baseline" or "production". It's easy to add
            and remove tags in the UI, or filter down to just runs with a specific tag.
        name: (str, optional) A short display name for this run, which is how you'll
            identify this run in the UI. By default we generate a random two-word name
            that lets you easily cross-reference runs from the table to charts. Keeping
            these run names short makes the chart legends and tables easier to read. If
            you're looking for a place to save your hyperparameters, we recommend saving
            those in config.
        notes: (str, optional) A longer description of the run, like a -m commit message
            in git. This helps you remember what you were doing when you ran this run.
        dir: (str, optional) An absolute path to a directory where metadata will be
            stored. When you call download() on an artifact, this is the directory where
            downloaded files will be saved. By default this is the ./wandb directory.
        resume: (bool, str, optional) Sets the resuming behavior. Options: "allow",
            "must", "never", "auto" or None. Defaults to None. Cases:
            - None (default): If the new run has the same ID as a previous run, this run
                overwrites that data.
            - "auto" (or True): if the previous run on this machine crashed,
                automatically resume it. Otherwise, start a new run.
            - "allow": if id is set with init(id="UNIQUE_ID") or WANDB_RUN_ID="UNIQUE_ID"
                and it is identical to a previous run, wandb will automatically resume
                the run with that id. Otherwise, wandb will start a new run.
            - "never": if id is set with init(id="UNIQUE_ID") or WANDB_RUN_ID="UNIQUE_ID"
                and it is identical to a previous run, wandb will crash.
            - "must": if id is set with init(id="UNIQUE_ID") or WANDB_RUN_ID="UNIQUE_ID"
                and it is identical to a previous run, wandb will automatically resume
                the run with the id. Otherwise wandb will crash.
            See https://docs.wandb.com/library/advanced/resuming for more.
        reinit: (bool, optional) Allow multiple wandb.init() calls in the same process.
            (default: False)
        magic: (bool, dict, or str, optional) The bool controls whether we try to
            auto-instrument your script, capturing basic details of your run without you
            having to add more wandb code. (default: False)
            You can also pass a dict, json string, or yaml filename.
        config_exclude_keys: (list, optional) string keys to exclude from `wandb.config`.
        config_include_keys: (list, optional) string keys to include in wandb.config.
        anonymous: (str, optional) Controls anonymous data logging. Options:
            - "never" (default): requires you to link your W&B account before tracking
                the run so you don't accidentally create an anonymous run.
            - "allow": lets a logged-in user track runs with their account, but lets
                someone who is running the script without a W&B account see the charts
                in the UI.
            - "must": sends the run to an anonymous account instead of to a signed-up
                user account.
        mode: (str, optional) Can be "online", "offline" or "disabled". Defaults to
            online.
        allow_val_change: (bool, optional) Whether to allow config values to change after
            setting the keys once. By default we throw an exception if a config value is
            overwritten. If you want to track something like a varying learning_rate at
            multiple times during training, use wandb.log() instead. (default: False in
            scripts, True in Jupyter)
        force: (bool, optional) If True, this crashes the script if a user isn't logged
            in to W&B. If False, this will let the script run in offline mode if a user
            isn't logged in to W&B. (default: False)
        sync_tensorboard: (bool, optional) Synchronize wandb logs from tensorboard or
            tensorboardX and saves the relevant events file. (default: False)
        monitor_gym: (bool, optional) automatically logs videos of environment when using
            OpenAI Gym. (default: False)
            See https://docs.wandb.com/library/integrations/openai-gym
        id: (str, optional) A unique ID for this run, used for resuming. It must be
            unique in the project, and if you delete a run you can't reuse the ID. Use
            the name field for a short descriptive name, or config for saving
            hyperparameters to compare across runs. The ID cannot contain special
            characters.
            See https://docs.wandb.com/library/resuming

    Examples:
        Basic usage
        ```
        wandb.init()
        ```
        Launch multiple runs from the same script
        ```
        for x in range(10):
            with wandb.init(project="my-projo") as run:
                for y in range(100):
                    run.log({"metric": x+y})
        ```

    Raises:
        Exception: if problem.

    Returns:
        A `Run` object.
    """
    wandb._assert_is_user_process()
    kwargs = dict(locals())
    error_seen = None
    except_exit = None
    try:
        wi = _WandbInit()
        wi.setup(kwargs)
        except_exit = wi.settings._except_exit
        try:
            run = wi.init()
            except_exit = wi.settings._except_exit
        except (KeyboardInterrupt, Exception) as e:
            if not isinstance(e, KeyboardInterrupt):
                sentry_exc(e)
            if not (
                wandb.wandb_agent._is_running() and isinstance(e, KeyboardInterrupt)
            ):
                getcaller()
            assert logger
            if wi.settings.problem == "fatal":
                raise
            if wi.settings.problem == "warn":
                pass
            # TODO(jhr): figure out how to make this RunDummy
            run = None
    except UsageError:
        raise
    except KeyboardInterrupt as e:
        assert logger
        logger.warning("interrupted", exc_info=e)
        raise e
    except Exception as e:
        error_seen = e
        traceback.print_exc()
        assert logger
        logger.error("error", exc_info=e)
        # Need to build delay into this sentry capture because our exit hooks
        # mess with sentry's ability to send out errors before the program ends.
        sentry_exc(e, delay=True)
        # reraise(*sys.exc_info())
        # six.raise_from(Exception("problem"), e)
    finally:
        if error_seen:
            wandb.termerror("Abnormal program exit")
            if except_exit:
                os._exit(-1)
            six.raise_from(Exception("problem"), error_seen)
    return run
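# --- usage sketch (not part of the snippet above) ---
# Illustrates the resume behavior documented in the init() docstring: pinning an id and
# passing resume="allow" picks up a previous run with that id, or starts a new one.
# The project and id values below are placeholders.
def _example_resume_usage():
    import wandb

    run = wandb.init(project="my-project", id="unique-run-id", resume="allow")
    run.log({"loss": 0.25})
    run.finish()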
def wandb_internal(
    settings,
    record_q,
    result_q,
):
    """Internal process function entrypoint.

    Read from record queue and dispatch work to various threads.

    Arguments:
        settings: dictionary of configuration parameters.
        record_q: records to be handled
        result_q: for sending results back
    """
    # mark this process as internal
    wandb._set_internal_process()
    started = time.time()

    # register the exit handler only when wandb_internal is called, not on import
    @atexit.register
    def handle_exit(*args):
        logger.info("Internal process exited")

    # Lets make sure we dont modify settings so use a static object
    _settings = settings_static.SettingsStatic(settings)
    if _settings.log_internal:
        configure_logging(_settings.log_internal, _settings._log_level)

    parent_pid = os.getppid()
    pid = os.getpid()

    logger.info(
        "W&B internal server running at pid: %s, started at: %s",
        pid,
        datetime.fromtimestamp(started),
    )

    publish_interface = interface.BackendSender(record_q=record_q)

    stopped = threading.Event()
    threads = []

    send_record_q = queue.Queue()
    record_sender_thread = SenderThread(
        settings=_settings,
        record_q=send_record_q,
        result_q=result_q,
        stopped=stopped,
        interface=publish_interface,
        debounce_interval_ms=30000,
    )
    threads.append(record_sender_thread)

    write_record_q = queue.Queue()
    record_writer_thread = WriterThread(
        settings=_settings,
        record_q=write_record_q,
        result_q=result_q,
        stopped=stopped,
        writer_q=write_record_q,
    )
    threads.append(record_writer_thread)

    record_handler_thread = HandlerThread(
        settings=_settings,
        record_q=record_q,
        result_q=result_q,
        stopped=stopped,
        sender_q=send_record_q,
        writer_q=write_record_q,
        interface=publish_interface,
    )
    threads.append(record_handler_thread)

    process_check = ProcessCheck(settings=_settings, pid=parent_pid)

    for thread in threads:
        thread.start()

    interrupt_count = 0
    while not stopped.is_set():
        try:
            # wait for stop event
            while not stopped.is_set():
                time.sleep(1)
                if process_check.is_dead():
                    logger.error("Internal process shutdown.")
                    stopped.set()
        except KeyboardInterrupt:
            interrupt_count += 1
            logger.warning("Internal process interrupt: {}".format(interrupt_count))
        finally:
            if interrupt_count >= 2:
                logger.error("Internal process interrupted.")
                stopped.set()

    for thread in threads:
        thread.join()

    for thread in threads:
        exc_info = thread.get_exception()
        if exc_info:
            logger.error("Thread {}:".format(thread.name), exc_info=exc_info)
            print("Thread {}:".format(thread.name), file=sys.stderr)
            traceback.print_exception(*exc_info)
            sentry_exc(exc_info, delay=True)
            wandb.termerror("Internal wandb error: file data was not synced")
            sys.exit(-1)
def push(self):
    try:
        size = os.path.getsize(self.save_path)
    except OSError:
        size = 0

    if self.save_fn:
        # Retry logic must happen in save_fn currently
        try:
            deduped = self.save_fn(
                lambda _, t: self._stats.update_uploaded_file(self.save_path, t))
        except Exception as e:
            self._stats.update_failed_file(self.save_path)
            logger.exception("Failed to upload file: %s", self.save_path)
            wandb.util.sentry_exc(e)
            message = str(e)
            # TODO: this is usually XML, but could be JSON
            if hasattr(e, "response"):
                message = e.response.content
            wandb.termerror('Error uploading "{}": {}, {}'.format(
                self.save_path, type(e).__name__, message))
            return False

        if deduped:
            logger.info("Skipped uploading %s", self.save_path)
            self._stats.set_file_deduped(self.save_path)
        else:
            logger.info("Uploaded file %s", self.save_path)
        return True

    if self.md5:
        # This is the new artifact manifest upload flow, in which we create the
        # database entry for the manifest file before creating it. This is used for
        # artifact L0 files. Which now is only artifact_manifest.json
        response = self._api.create_artifact_manifest(
            self.save_name, self.md5, self.artifact_id)
        upload_url = response["uploadUrl"]
        upload_headers = response["uploadHeaders"]
    else:
        # The classic file upload flow. We get a signed url and upload the file
        # then the backend handles the cloud storage metadata callback to create the
        # file entry. This flow has aged like a fine wine.
        project = self._api.get_project()
        _, upload_headers, result = self._api.upload_urls(project, [self.save_name])
        file_info = result[self.save_name]
        upload_url = file_info["url"]

    if upload_url is None:
        logger.info("Skipped uploading %s", self.save_path)
        self._stats.set_file_deduped(self.save_name)
    else:
        extra_headers = {}
        for upload_header in upload_headers:
            key, val = upload_header.split(":", 1)
            extra_headers[key] = val
        # Copied from push TODO(artifacts): clean up
        # If the upload URL is relative, fill it in with the base URL,
        # since its a proxied file store like the on-prem VM.
        if upload_url.startswith("/"):
            upload_url = "{}{}".format(self._api.api_url, upload_url)
        try:
            with open(self.save_path, "rb") as f:
                self._api.upload_file_retry(
                    upload_url,
                    f,
                    lambda _, t: self.progress(t),
                    extra_headers=extra_headers,
                )
            logger.info("Uploaded file %s", self.save_path)
        except Exception as e:
            self._stats.update_failed_file(self.save_name)
            logger.exception("Failed to upload file: %s", self.save_path)
            wandb.util.sentry_exc(e)
            wandb.termerror('Error uploading "{}": {}, {}'.format(
                self.save_name, type(e).__name__, e))
            return False
    return True