示例#1
0
def write_netrc(host, entity, key):
    """Add our host and key to the user's ``~/.netrc`` file.

    Any existing lines for the same machine are dropped and a fresh
    ``machine`` / ``login`` / ``password`` stanza is appended. Afterwards the
    file mode is set to owner read/write only.

    Args:
        host: URL or host name; the last ``/``-separated component, with any
            ``:port`` suffix removed, is used as the machine name.
        entity: Value written on the ``login`` line.
        key: Value written on the ``password`` line. The part after the first
            ``-`` (or the whole key if it has no ``-``) must be exactly 40
            characters long.

    Returns:
        True on success; None if the key length check fails or an IOError is
        raised while rewriting the file.
    """
    # Only the part after an optional "<prefix>-" is length-checked.
    key_suffix = key.split('-', 1)[-1]
    if len(key_suffix) != 40:
        wandb.termlog(
            'API-key must be exactly 40 characters long: {} ({} chars)'.format(
                key_suffix, len(key_suffix)))
        return None
    try:
        # Strip trailing slashes first: "https://host/" would otherwise
        # normalize to an empty machine name and write a broken entry.
        normalized_host = host.rstrip("/").split("/")[-1].split(":")[0]
        path = os.path.expanduser('~/.netrc')
        wandb.termlog("Appending key for {} to your netrc file: {}".format(
            normalized_host, path))
        machine_line = 'machine %s' % normalized_host
        orig_lines = None
        try:
            with open(path) as f:
                orig_lines = f.read().strip().split('\n')
        except (IOError, OSError):
            # No readable file yet: there is simply nothing to preserve.
            pass
        with open(path, 'w') as f:
            if orig_lines:
                # delete this machine from the file if it's already there.
                # A matching "machine" line and the two lines after it
                # (login / password) are skipped.
                skip = 0
                for line in orig_lines:
                    if machine_line in line:
                        skip = 2
                    elif skip:
                        skip -= 1
                    else:
                        f.write('%s\n' % line)
            f.write(
                textwrap.dedent("""\
            machine {host}
              login {entity}
              password {key}
            """).format(host=normalized_host, entity=entity, key=key))
        os.chmod(path, stat.S_IRUSR | stat.S_IWUSR)
        return True
    except IOError:
        wandb.termerror("Unable to read ~/.netrc")
        return None
    def process_chunks(self, chunks):
        """Bundle chunk payloads into a dict, leaving out oversized ones.

        The running ``self._chunk_id`` counter is advanced by the number of
        chunks received (including any that are dropped). A chunk whose
        ``data`` is longer than ``util.MAX_LINE_SIZE`` is reported through
        ``wandb.termerror`` and ``util.sentry_message`` and not included.

        Returns:
            A dict with ``"offset"`` (the counter value before this call) and
            ``"content"`` (list of accepted ``chunk.data`` values).
        """
        first_offset = self._chunk_id
        # TODO: chunk_id is getting reset on each request...
        self._chunk_id += len(chunks)
        accepted = []
        for item in chunks:
            payload_size = len(item.data)
            if payload_size <= util.MAX_LINE_SIZE:
                accepted.append(item.data)
                continue
            msg = "Metric data exceeds maximum size of {} ({})".format(
                util.to_human_size(util.MAX_LINE_SIZE),
                util.to_human_size(payload_size),
            )
            wandb.termerror(msg, repeat=False)
            util.sentry_message(msg)

        return dict(offset=first_offset, content=accepted)
示例#3
0
文件: utils.py 项目: gampx/client
 def objective(phi, grad=0):
     """Return the decision-boundary distance for the point described by ``phi``.

     ``phi`` is converted with ``polar_to_cartesian(phi, R)`` and offset by
     ``centroid``. When ``penalize_known`` is set, a small penalty (scaled by
     1e-8) is added that grows as the 2D projection of the point gets closer
     to the nearest already-known decision boundary point.

     Args:
         phi: Polar coordinates passed to ``polar_to_cartesian``.
         grad: Not used in the body (presumably required by the optimizer's
             callback signature -- confirm against the caller).

     Returns:
         The error value, or ``np.inf`` if anything in the evaluation raises.
     """
     # search on hypersphere surface in polar coordinates - map back to cartesian
     cx = centroid + polar_to_cartesian(phi, R)
     try:
         cx2d = dimensionality_reduction.transform([cx])[0]
         error = self.decision_boundary_distance(cx)
         if penalize_known:
             # slight penalty for being too close to already known decision boundary
             # keypoints
             db_distances = [
                 euclidean(cx2d, known_point)
                 for known_point in self.decision_boundary_points_2d
             ]
             error += (1e-8 *
                       ((self.mean_2d_dist - np.min(db_distances)) /
                        self.mean_2d_dist)**2)
         return error
     except Exception as ex:
         # Format the exception into the message: termerror is called with a
         # single string everywhere else in this code base.
         wandb.termerror("Error in objective function: %s" % ex)
         # np.infty was an alias removed in NumPy 2.0; np.inf is the same value.
         return np.inf
示例#4
0
    def _save_model(self, epoch):
        """Write the model (or only its weights) to ``self.filepath``.

        Does nothing when ``wandb.run.disabled`` is set. With
        ``self.verbose > 0`` an improvement message is printed first. If
        saving raises ImportError, RuntimeError or TypeError the error is
        reported and ``self.save_model`` is switched off.

        Args:
            epoch: Epoch number shown in the verbose message.
        """
        if wandb.run.disabled:
            return
        if self.verbose > 0:
            details = (epoch, self.monitor, self.best, self.current,
                       self.filepath)
            print(
                "Epoch %05d: %s improved from %0.5f to %0.5f,"
                " saving model to %s" % details
            )

        try:
            writer = (self.model.save_weights
                      if self.save_weights_only else self.model.save)
            writer(self.filepath, overwrite=True)
        # Was getting `RuntimeError: Unable to create link` in TF 1.13.1
        # also saw `TypeError: can't pickle _thread.RLock objects`
        except (ImportError, RuntimeError, TypeError) as err:
            wandb.termerror("Can't save model, h5py returned error: %s" % err)
            self.save_model = False
示例#5
0
文件: pyagent.py 项目: vwxyzjn/client
 def _setup(self):
     """Resolve entity / project / sweep id, export them, then register.

     The current entity, project and sweep path are handed to
     ``util.parse_sweep_id`` in a dict; if it returns an error the error is
     printed and setup stops. Otherwise the dict entries are read back
     (falling back to the instance values), pushed into the wandb
     environment, and ``self._register()`` is called.
     """
     logger.debug("Agent._setup()")
     self._init()
     sweep_parts = {
         "entity": self._entity,
         "project": self._project,
         "name": self._sweep_path,
     }
     problem = util.parse_sweep_id(sweep_parts)
     if problem:
         wandb.termerror(problem)
         return
     resolved_entity = sweep_parts.get("entity") or self._entity
     resolved_project = sweep_parts.get("project") or self._project
     resolved_sweep = sweep_parts.get("name") or self._sweep_id
     if resolved_sweep:
         os.environ[wandb.env.SWEEP_ID] = resolved_sweep
     if resolved_entity:
         wandb.env.set_entity(resolved_entity)
     if resolved_project:
         wandb.env.set_project(resolved_project)
     if resolved_sweep:
         self._sweep_id = resolved_sweep
     self._register()
示例#6
0
    def apply(self) -> None:
        """Invoke the matching ``require_*`` method for each requested feature.

        For every entry in ``self._features`` the text before the first ``@``
        and then before the first ``:`` is taken as the feature name; dashes
        become underscores to form the method name. Unknown features produce
        a warning and processing continues.

        Raises:
            RequireError: after all features were processed, if at least one
                was unsupported (carrying the message of the last one).
        """
        failure: str = ""
        for requested in self._features:
            name = requested.split("@", 2)[0].split(":", 2)[0]
            handler = getattr(
                self, "require_{}".format(name.replace("-", "_")), None)
            if handler:
                handler()
                continue
            failure = "require() unsupported requirement: {}".format(name)
            wandb.termwarn(failure)

        if not failure:
            return
        wandb.termerror(
            "Supported wandb.require() features can be found at: http://wandb.me/library-require"
        )
        raise RequireError(failure)
示例#7
0
def test_fitted(model):
    """Report whether ``model`` appears to be a fitted scikit-learn estimator.

    The check calls ``model.predict`` on a 7x3 array of zeros. A
    ``NotFittedError`` means "not fitted"; any other exception (for example
    a shape mismatch) is taken as evidence that the model is fitted. Models
    without ``predict`` are checked with ``check_is_fitted`` against a list
    of common fitted attributes instead.

    Args:
        model: The estimator to probe.

    Returns:
        True if the model looks fitted, False (after printing an error)
        otherwise.
    """
    np = util.get_module("numpy", required="Logging plots requires numpy")
    # pandas and scipy are not used below; the calls are kept because
    # get_module is invoked with ``required=`` for each of them.
    pd = util.get_module("pandas",
                         required="Logging dataframes requires pandas")
    scipy = util.get_module("scipy",
                            required="Logging scipy matrices requires scipy")
    scikit = util.get_module(
        "sklearn", required="Logging plots matrices requires scikit-learn")
    try:
        model.predict(np.zeros((7, 3)))
    except scikit.exceptions.NotFittedError:
        wandb.termerror("Please fit the model before passing it in.")
        return False
    except AttributeError:
        # Some clustering models (LDA, PCA, Agglomerative) don't implement ``predict``
        try:
            scikit.utils.validation.check_is_fitted(
                model,
                [
                    "coef_",
                    "estimator_",
                    "labels_",
                    "n_clusters_",
                    "children_",
                    "components_",
                    "n_components_",
                    "n_iter_",
                    "n_batch_iter_",
                    "explained_variance_",
                    "singular_values_",
                    "mean_",
                ],
                all_or_any=any,
            )
            return True
        except scikit.exceptions.NotFittedError:
            wandb.termerror("Please fit the model before passing it in.")
            return False
    except Exception:
        # Assume it's fitted, since ``NotFittedError`` wasn't raised
        return True
    else:
        # predict() succeeded outright, so the model is fitted. Without this
        # branch the function fell through and returned None (falsy).
        return True
def patch(save=None, tensorboardX=None, pytorch=None):
    """Patch whichever tensorboard writer module can be loaded.

    The module named by ``TENSORBOARD_C_MODULE`` is preferred; if it cannot
    be loaded the one named by ``TENSORBOARD_PYTORCH_MODULE`` is used. If
    neither is available an error is printed and nothing is patched.

    Args:
        save: Forwarded to the patch helper.
        tensorboardX: Not used in the body.
        pytorch: Not used in the body.

    Raises:
        ValueError: if ``wandb.patched["tensorboard"]`` is already non-empty.
    """
    already_patched = wandb.patched["tensorboard"]
    if len(already_patched) > 0:
        raise ValueError(
            "Tensorboard already patched, remove sync_tensorboard=True from wandb.init or only call wandb.tensorboard.patch once."
        )

    wandb.util.get_module("tensorboard",
                          required="Please install tensorboard package")
    c_writer = wandb.util.get_module(TENSORBOARD_C_MODULE)
    tb_writer = wandb.util.get_module(TENSORBOARD_PYTORCH_MODULE)

    if c_writer:
        patcher = _patch_tensorflow2
        writer, module = c_writer, TENSORBOARD_C_MODULE
    elif tb_writer:
        patcher = _patch_nontensorflow
        writer, module = tb_writer, TENSORBOARD_PYTORCH_MODULE
    else:
        wandb.termerror("Unsupported tensorboard configuration")
        return
    patcher(writer=writer, module=module, save=save)
示例#9
0
def agent_run(args):
    """A version of `wandb run` that the agent uses to run things.

    Builds a run from the environment, saves it with the program name and
    hands the user program to a ``RunManager``. If the manager cannot be
    created the error is printed and logged, and the program is not run.

    Args:
        args: Mapping providing ``'program'`` and ``'args'`` entries.
    """
    run = wandb.wandb_run.Run.from_environment_or_defaults()
    run.enable_logging()

    # TODO: better failure handling
    # NOTE(review): root, host and remote_url are not used further down in
    # this function; the lookups are kept unchanged in case reading
    # run.api.git.root matters elsewhere -- confirm before removing.
    root = run.api.git.root
    # handle non-git directories
    if not root:
        root = os.path.abspath(os.getcwd())
        host = socket.gethostname()
        remote_url = 'file://{}{}'.format(host, root)

    run.save(program=args['program'])
    env = dict(os.environ)
    run.set_environment(env)

    try:
        rm = wandb.run_manager.RunManager(run, agent_run=True)
    except wandb.run_manager.Error:
        exc_type, exc_value, exc_traceback = sys.exc_info()
        wandb.termerror(
            'An Exception was raised during setup, see %s for full traceback.'
            % util.get_log_file_path())
        # termerror is given strings everywhere else; pass the message text,
        # not the exception object.
        wandb.termerror(str(exc_value))
        if 'permission' in str(exc_value):
            wandb.termerror(
                'Are you sure you provided the correct API key to "wandb login"?'
            )
        lines = traceback.format_exception(exc_type, exc_value, exc_traceback)
        logging.error('\n'.join(lines))
    else:
        rm.run_user_process(args['program'], args['args'], env)
示例#10
0
def run(ctx, program, args, id, resume, dir, configs, message, name, notes,
        show, tags, run_group, job_type):
    """Create a 'clirun' run and launch the user program under a RunManager.

    Run parameters that were not given are filled in from the environment.
    If the run has no API key the user is prompted for one. When the run
    manager fails to initialise, the error is printed, the traceback is
    logged and the process exits with status 1.

    Args:
        ctx: Not used in the body (presumably the click context -- confirm).
        program: Program to run.
        args: Arguments forwarded to the program.
        id, resume, message, name, notes, run_group, job_type: Run
            parameters, each falling back to an environment variable.
        dir: wandb directory; defaults to ``wandb.wandb_dir()``.
        configs: Comma-separated config file paths.
        show: When truthy, ``env.SHOW_RUN`` is set for the child process.
        tags: Comma-separated tags; empty pieces are dropped.
    """
    wandb.ensure_configured()
    config_paths = configs.split(',') if configs else []
    config = Config(config_paths=config_paths,
                    wandb_dir=dir or wandb.wandb_dir())
    if tags:
        tags = [piece for piece in tags.split(",") if piece]
    else:
        tags = None

    # populate run parameters from env if not specified
    id = id or os.environ.get(env.RUN_ID)
    message = message or os.environ.get(env.DESCRIPTION)
    tags = tags or env.get_tags()
    run_group = run_group or os.environ.get(env.RUN_GROUP)
    job_type = job_type or os.environ.get(env.JOB_TYPE)
    name = name or os.environ.get(env.NAME)
    notes = notes or os.environ.get(env.NOTES)
    resume = resume or os.environ.get(env.RESUME)

    run_kwargs = dict(
        run_id=id,
        mode='clirun',
        config=config,
        description=message,
        program=program,
        tags=tags,
        group=run_group,
        job_type=job_type,
        name=name,
        notes=notes,
        resume=resume,
    )
    run = wandb_run.Run(**run_kwargs)
    run.enable_logging()

    environ = dict(os.environ)
    if configs:
        environ[env.CONFIG_PATHS] = configs
    if show:
        environ[env.SHOW_RUN] = 'True'

    if not run.api.api_key:
        util.prompt_api_key(run.api, input_callback=click.prompt)

    try:
        rm = run_manager.RunManager(run)
        rm.init_run(environ)
    except run_manager.Error:
        failure = sys.exc_info()
        reason = str(failure[1])
        wandb.termerror(
            'An Exception was raised during setup, see %s for full traceback.'
            % util.get_log_file_path())
        wandb.termerror(reason)
        if 'permission' in reason:
            wandb.termerror(
                'Are you sure you provided the correct API key to "wandb login"?'
            )
        logger.error('\n'.join(traceback.format_exception(*failure)))
        sys.exit(1)
    rm.run_user_process(program, args, environ)
示例#11
0
 def sync_spell(self, run, env=None):
     """Record the spell URL on the run and send the run's URL to spell.

     Args:
         run: Run whose config is updated and whose URL is sent.
         env: Mapping to read the SPELL_* / WANDB_ACCESS_TOKEN values from;
             ``os.environ`` is used when it is falsy.

     Returns:
         The ``requests.put`` response, or False if the run URL could not be
         obtained (``CommError``) or the request raised
         ``requests.RequestException``.
     """
     try:
         environ = env or os.environ
         run.config._set_wandb("spell_url", environ.get("SPELL_RUN_URL"))
         run.config.persist()
         try:
             run_url = run.get_url()
         except CommError as err:
             wandb.termerror("Unable to register run with spell.run: %s" %
                             err.message)
             return False
         endpoint = (environ.get("SPELL_API_URL", "https://api.spell.run") +
                     "/wandb_url")
         payload = {
             "access_token": environ.get("WANDB_ACCESS_TOKEN"),
             "url": run_url
         }
         return requests.put(endpoint, json=payload, timeout=2)
     except requests.RequestException:
         return False
示例#12
0
 def _handle_event(self, event):
     """Process a single event according to its type.

     Three event types are handled: ``upload_job.EventJobDone``,
     ``RequestCommitArtifact`` and ``RequestUpload``.

     Raises:
         Exception: for any other event type.
     """
     if isinstance(event, upload_job.EventJobDone):
         job = event.job
         job.join()
         if job.artifact_id:
             if event.success:
                 # One fewer outstanding file for this artifact; then let
                 # _maybe_commit_artifact look at the updated state.
                 self._artifacts[job.artifact_id]['pending_count'] -= 1
                 self._maybe_commit_artifact(job.artifact_id)
             else:
                 # On failure pending_count is left untouched.
                 termerror(
                     'Uploading artifact file failed. Artifact won\'t be committed.'
                 )
         # The job is finished either way, so it no longer counts as running.
         self._running_jobs.pop(job.save_name)
         # If we have any pending jobs, start one now
         if self._pending_jobs:
             event = self._pending_jobs.pop(0)
             self._start_upload_job(event)
     elif isinstance(event, RequestCommitArtifact):
         # NOTE(review): assumes an entry for event.artifact_id already
         # exists in self._artifacts (entries are created in the
         # RequestUpload branch below) -- confirm against callers.
         self._artifacts[event.artifact_id]['commit_requested'] = True
         if event.on_commit:
             self._artifacts[event.artifact_id]['commit_callbacks'].add(
                 event.on_commit)
         self._maybe_commit_artifact(event.artifact_id)
     elif isinstance(event, RequestUpload):
         if event.artifact_id is not None:
             # First file seen for this artifact: create its bookkeeping entry.
             if event.artifact_id not in self._artifacts:
                 self._artifacts[event.artifact_id] = {
                     'pending_count': 0,
                     'commit_requested': False,
                     'commit_callbacks': set(),
                 }
             self._artifacts[event.artifact_id]['pending_count'] += 1
         # Queue the upload when the running-job count equals the limit,
         # otherwise start it right away.
         if len(self._running_jobs) == self._max_jobs:
             self._pending_jobs.append(event)
         else:
             self._start_upload_job(event)
     else:
         raise Exception('Programming error: unhandled event: %s' %
                         str(event))
示例#13
0
    def _run_cmd(
        self,
        cmd: List[str],
        output_only: Optional[bool] = False
    ) -> Optional[Union["subprocess.Popen[bytes]", bytes]]:
        """Runs the command and returns a popen object or the stdout of the command.

        Arguments:
        cmd: The command to run
        output_only: If true just return the stdout bytes
        """
        try:
            env = os.environ
            popen = subprocess.Popen(cmd, env=env, stdout=subprocess.PIPE)
            if output_only:
                popen.wait()
                if popen.stdout is not None:
                    return popen.stdout.read()
            return popen
        except subprocess.CalledProcessError as e:
            wandb.termerror("Command failed: {}".format(e))
            return None
示例#14
0
    def push(self):
        """Upload ``self.save_path`` through ``self._api.push`` and track progress.

        A progress record for ``self.label`` is created first (total size in
        bytes, or 0 if the size cannot be read). If anything in the upload
        raises, the record is marked failed and the error is reported via
        ``wandb.util.sentry_exc`` and ``wandb.termerror``; the exception is
        not re-raised.
        """
        try:
            total_bytes = os.path.getsize(self.save_path)
        except OSError:
            total_bytes = 0

        self._progress[self.label] = dict(
            total=total_bytes, uploaded=0, failed=False)
        try:
            with open(self.save_path, 'rb') as handle:
                self._api.push(
                    {self.save_name: handle},
                    progress=lambda _, t: self.progress(t))
        except Exception as err:
            record = self._progress[self.label]
            record['uploaded'] = 0
            record['failed'] = True
            wandb.util.sentry_exc(err)
            wandb.termerror('Error uploading "{}": {}, {}'.format(
                self.save_name, type(err).__name__, err))
示例#15
0
def sweep(ctx, controller, verbose, config_yaml):
    """Create a sweep from a YAML file and optionally run a local controller.

    Each validation failure (unreadable file, YAML error, empty config,
    invalid local-controller config, ``controller`` requested for a
    non-local sweep) prints an error and returns without creating anything.

    Args:
        ctx: Not used in the body (presumably the click context -- confirm).
        controller: When truthy, start a controller after creating the sweep;
            only accepted when the config's controller type is "local".
        verbose: Forwarded to the controller's ``run``.
        config_yaml: Path of the sweep configuration file.
    """
    click.echo('Creating sweep from: %s' % config_yaml)
    try:
        yaml_file = open(config_yaml)
    except (OSError, IOError):
        wandb.termerror('Couldn\'t open sweep file: %s' % config_yaml)
        return
    # Close the handle as soon as parsing is done (it used to be leaked).
    with yaml_file:
        try:
            config = util.load_yaml(yaml_file)
        except yaml.YAMLError as err:
            wandb.termerror('Error in configuration file: %s' % err)
            return
    if config is None:
        wandb.termerror('Configuration file is empty')
        return

    is_local = config.get('controller', {}).get('type') == 'local'
    if is_local:
        tuner = wandb_controller.controller()
        err = tuner._validate(config)
        if err:
            wandb.termerror('Error in sweep file: %s' % err)
            return
    else:
        if controller:
            wandb.termerror(
                'Option "controller" only permitted for controller type "local"'
            )
            return
    sweep_id = api.upsert_sweep(config)
    print('Create sweep with ID:', sweep_id)
    sweep_url = wandb_controller._get_sweep_url(api, sweep_id)
    if sweep_url:
        print('Sweep URL:', sweep_url)
    if controller:
        click.echo('Starting wandb controller...')
        tuner = wandb_controller.controller(sweep_id)
        tuner.run(verbose=verbose)
示例#16
0
def test_missing(**kwargs):
    """Check keyword arguments for None values and, for X / X_test, bad data.

    Every argument that is None is reported. Arguments named ``X`` or
    ``X_test`` are additionally converted to a numpy array (from a CSR
    matrix, DataFrame, Series or list) and checked for missing values and
    for elements that are not numbers.

    Returns:
        True if no problem was found, False otherwise.
    """
    test_passed = True
    for k, v in kwargs.items():
        # Missing/empty params/datapoint arrays
        if v is None:
            wandb.termerror("%s is None. Please try again." % (k))
            test_passed = False
            # Nothing else can be inspected on None; falling through used to
            # crash on ``v.ndim`` for X / X_test.
            continue
        if k in ("X", "X_test"):
            # scipy.sparse.csr_matrix is the public name of the class; the
            # scipy.sparse.csr sub-namespace is deprecated.
            if isinstance(v, scipy.sparse.csr_matrix):
                v = v.toarray()
            elif isinstance(v, (pd.DataFrame, pd.Series)):
                v = v.to_numpy()
            elif isinstance(v, list):
                v = np.asarray(v)

            # Warn the user about missing values
            missing = np.count_nonzero(pd.isnull(v))
            if missing > 0:
                wandb.termwarn("%s contains %d missing values. " %
                               (k, missing))
                test_passed = False
            # Ensure the dataset contains only numbers
            if v.ndim == 1:
                non_nums = sum(1 for val in v
                               if (not isinstance(val, (int, float, complex))
                                   and not isinstance(val, np.number)))
            else:
                non_nums = sum(1 for sl in v for val in sl
                               if (not isinstance(val, (int, float, complex))
                                   and not isinstance(val, np.number)))
            if non_nums > 0:
                wandb.termerror(
                    "%s contains values that are not numbers. Please vectorize, label encode or one hot encode %s and call the plotting function again."
                    % (k, k))
                test_passed = False
    return test_passed
示例#17
0
 def _prompt_api_key(self) -> Tuple[Optional[str], ApiKeyStatus]:
     """Prompt for an API key until a usable answer is obtained.

     A ``ValueError`` from the prompt (invalid key) prints the error and
     asks again.

     Returns:
         ``(key, ApiKeyStatus.VALID)`` when a key was entered, otherwise
         ``(None, status)`` where status is DISABLED (prompt timed out),
         NOTTY (prompt returned False) or OFFLINE (any other falsy result).
     """
     api = Api(self._settings)
     while True:
         try:
             forced = self._settings.force if self._settings else None
             key = apikey.prompt_api_key(
                 self._settings,
                 api=api,
                 no_offline=forced,
                 no_create=forced,
             )
         except ValueError as err:
             # invalid key provided, try again
             wandb.termerror(err.args[0])
             continue
         except TimeoutError:
             wandb.termlog("W&B disabled due to login timeout.")
             return None, ApiKeyStatus.DISABLED
         else:
             if key is False:
                 return None, ApiKeyStatus.NOTTY
             if not key:
                 return None, ApiKeyStatus.OFFLINE
             return key, ApiKeyStatus.VALID
示例#18
0
    def history(self,
                samples=500,
                keys=None,
                x_axis="_step",
                pandas=True,
                stream="default"):
        """
        Returns sampled history metrics for a run.  This is simpler and faster if you are ok with
        the history records being sampled.

        Args:
            samples (int, optional): The number of samples to return
            pandas (bool, optional): Return a pandas dataframe
            keys (list, optional): Only return metrics for specific keys
            x_axis (str, optional): Use this metric as the xAxis defaults to _step
            stream (str, optional): "default" for metrics, "system" for machine metrics

        Returns:
            If pandas=True returns a `pandas.DataFrame` of history metrics
            (or the plain list, after printing a notice, when pandas cannot
            be loaded).
            If pandas=False returns a list of dicts of history metrics.
            An empty list if keys are given together with a non-default stream.
        """
        if keys and stream != "default":
            wandb.termerror("stream must be default when specifying keys")
            return []

        if keys:
            records = self._sampled_history(keys=keys,
                                            x_axis=x_axis,
                                            samples=samples)
        else:
            records = self._full_history(samples=samples, stream=stream)

        if not pandas:
            return records

        pandas_module = util.get_module("pandas")
        if pandas_module:
            return pandas_module.DataFrame.from_records(records)
        print("Unable to load pandas, call history with pandas=False")
        return records
示例#19
0
def test_types(**kwargs):
    """Check that keyword arguments have the type their name implies.

    Data arguments (``X``, ``X_test``, ``y``, ``y_test``, ``y_true``,
    ``y_probas``) must be array-like; ``model`` must be a classifier or
    regressor; ``clf`` / ``binary_clf`` a classifier; ``regressor`` a
    regressor; ``clusterer`` must have ``_estimator_type == "clusterer"``.
    Each violation is reported through ``wandb.termerror``.

    Returns:
        True if every check passed, False otherwise.
    """
    # The ABC aliases ``collections.Sequence`` / ``collections.Iterable``
    # were removed in Python 3.10; they live in ``collections.abc``.
    try:
        from collections.abc import Iterable, Sequence
    except ImportError:  # Python 2
        from collections import Iterable, Sequence

    test_passed = True
    for k, v in kwargs.items():
        # check for incorrect types
        if ((k == 'X') or (k == 'X_test') or (k == 'y') or (k == 'y_test')
                or (k == 'y_true') or (k == 'y_probas')):
            # FIXME: do this individually
            if not isinstance(
                    v, (Sequence, Iterable, np.ndarray,
                        np.generic, pd.DataFrame, pd.Series, list)):
                wandb.termerror("%s is not an array. Please try again." % (k))
                test_passed = False
        # check for classifier types
        if (k == 'model'):
            if ((not sklearn.base.is_classifier(v))
                    and (not sklearn.base.is_regressor(v))):
                wandb.termerror(
                    "%s is not a classifier or regressor. Please try again." %
                    (k))
                test_passed = False
        elif (k == 'clf' or k == 'binary_clf'):
            if (not (sklearn.base.is_classifier(v))):
                wandb.termerror("%s is not a classifier. Please try again." %
                                (k))
                test_passed = False
        elif (k == 'regressor'):
            if (not sklearn.base.is_regressor(v)):
                wandb.termerror("%s is not a regressor. Please try again." %
                                (k))
                test_passed = False
        elif (k == 'clusterer'):
            if (not (getattr(v, "_estimator_type", None) == "clusterer")):
                wandb.termerror("%s is not a clusterer. Please try again." %
                                (k))
                test_passed = False
    return test_passed
示例#20
0
def write_netrc(host, entity, key):
    """Store the credentials for ``host`` in the user's ``~/.netrc``.

    Existing lines for the same machine (and stray ``machine `` lines with an
    empty host) are removed together with the two lines that follow them,
    then a new stanza is appended and the file mode is set to owner
    read/write.

    Args:
        host: URL of the server; the machine name is its last path component
            without trailing slashes or ``:port``. It must be ``localhost``
            or contain a dot.
        entity: Value for the ``login`` line.
        key: Value for the ``password`` line; the part after the first ``-``
            (or the whole key) must be 40 characters long.

    Returns:
        True on success, None if validation fails or an IOError occurs.
    """
    suffix = key.split("-", 1)[1] if "-" in key else key
    suffix_len = len(suffix)
    if suffix_len != 40:
        wandb.termerror(
            "API-key must be exactly 40 characters long: {} ({} chars)".format(
                suffix, suffix_len
            )
        )
        return None
    try:
        normalized_host = host.rstrip("/").split("/")[-1].split(":")[0]
        if not (normalized_host == "localhost" or "." in normalized_host):
            wandb.termerror("Host must be a url in the form https://some.address.com")
            return None
        netrc_path = os.path.expanduser("~/.netrc")
        wandb.termlog(
            "Appending key for {} to your netrc file: {}".format(
                normalized_host, netrc_path
            )
        )
        machine_line = "machine %s" % normalized_host
        previous = None
        try:
            with open(netrc_path) as reader:
                previous = reader.read().strip().split("\n")
        except IOError:
            pass

        kept = []
        if previous:
            # delete this machine from the file if it's already there.
            to_skip = 0
            for line in previous:
                # we fix invalid netrc files with an empty host that we wrote before
                # verifying host...
                if line == "machine " or machine_line in line:
                    to_skip = 2
                elif to_skip:
                    to_skip -= 1
                else:
                    kept.append("%s\n" % line)

        stanza = textwrap.dedent(
            """\
            machine {host}
              login {entity}
              password {key}
            """
        ).format(host=normalized_host, entity=entity, key=key)
        with open(netrc_path, "w") as writer:
            writer.writelines(kept)
            writer.write(stanza)
        os.chmod(netrc_path, stat.S_IRUSR | stat.S_IWUSR)
        return True
    except IOError:
        wandb.termerror("Unable to read ~/.netrc")
        return None
示例#21
0
def run(ctx, program, args, id, resume, dir, configs, message, name, notes,
        show, tags, run_group, job_type):
    """Create a 'clirun' run and launch the user program under a RunManager.

    When the run manager fails to initialise, the error is printed, the
    traceback is logged and the process exits with status 1.

    Args:
        ctx: Not used in the body (presumably the click context -- confirm).
        program: Program to run.
        args: Arguments forwarded to the program.
        id, resume, message, name, notes, run_group, job_type: Passed to the
            ``Run`` constructor.
        dir: wandb directory; defaults to ``wandb.wandb_dir()``.
        configs: Comma-separated config file paths.
        show: When truthy, ``env.SHOW_RUN`` is set for the child process.
        tags: Comma-separated tags; empty pieces are dropped.
    """
    wandb.ensure_configured()
    config_paths = configs.split(',') if configs else []
    config = Config(config_paths=config_paths,
                    wandb_dir=dir or wandb.wandb_dir())
    if tags:
        tags = [piece for piece in tags.split(",") if piece]
    else:
        tags = None
    run_kwargs = dict(
        run_id=id,
        mode='clirun',
        config=config,
        description=message,
        program=program,
        tags=tags,
        group=run_group,
        job_type=job_type,
        name=name,
        notes=notes,
        resume=resume,
    )
    run = wandb_run.Run(**run_kwargs)
    run.enable_logging()

    environ = dict(os.environ)
    if configs:
        environ[env.CONFIG_PATHS] = configs
    if show:
        environ[env.SHOW_RUN] = 'True'
    run.check_anonymous()

    try:
        rm = run_manager.RunManager(run)
        rm.init_run(environ)
    except run_manager.Error:
        failure = sys.exc_info()
        reason = str(failure[1])
        wandb.termerror(
            'An Exception was raised during setup, see %s for full traceback.'
            % util.get_log_file_path())
        wandb.termerror(reason)
        if 'permission' in reason:
            wandb.termerror(
                'Are you sure you provided the correct API key to "wandb login"?'
            )
        logger.error('\n'.join(traceback.format_exception(*failure)))
        sys.exit(1)
    rm.run_user_process(program, args, environ)
示例#22
0
def sweep(ctx, config_yaml):
    """Create a sweep from a YAML configuration file.

    An unreadable file, a YAML error or an empty configuration prints an
    error and returns without creating a sweep.

    Args:
        ctx: Not used in the body (presumably the click context -- confirm).
        config_yaml: Path of the sweep configuration file.
    """
    click.echo('Creating sweep from: %s' % config_yaml)
    try:
        yaml_file = open(config_yaml)
    except (OSError, IOError):
        wandb.termerror('Couldn\'t open sweep file: %s' % config_yaml)
        return
    # Close the handle as soon as parsing is done (it used to be leaked).
    with yaml_file:
        try:
            config = util.load_yaml(yaml_file)
        except yaml.YAMLError as err:
            wandb.termerror('Error in configuration file: %s' % err)
            return
    if config is None:
        wandb.termerror('Configuration file is empty')
        return
    sweep_id = api.upsert_sweep(config)
    print('Create sweep with ID:', sweep_id)
示例#23
0
def agent_run(args):
    """A version of `wandb run` that the agent uses to run things.

    Builds a run from the environment, upserts it through the API (using the
    git remote URL, or a ``file://host/path`` URL outside a git checkout) and
    hands the user program to a ``RunManager``. If the manager cannot be
    created the error is printed and logged, and the program is not run.

    Args:
        args: Mapping providing ``'program'`` and ``'args'`` entries.
    """
    run = wandb.wandb_run.Run.from_environment_or_defaults()

    api = wandb.api.Api()
    api.set_current_run_id(run.id)

    # TODO: better failure handling
    root = api.git.root
    remote_url = api.git.remote_url
    host = socket.gethostname()
    # handle non-git directories
    if not root:
        root = os.path.abspath(os.getcwd())
        remote_url = 'file://%s%s' % (host, root)

    upsert_result = api.upsert_run(
        id=run.storage_id,
        name=run.id,
        project=api.settings("project"),
        entity=api.settings("entity"),
        config=run.config.as_dict(),
        description=run.description,
        host=host,
        program_path=args['program'],
        repo=remote_url,
        sweep_name=run.sweep_id)
    run.storage_id = upsert_result['id']
    env = dict(os.environ)
    run.set_environment(env)

    try:
        rm = wandb.run_manager.RunManager(api, run)
    except wandb.run_manager.Error:
        exc_type, exc_value, exc_traceback = sys.exc_info()
        wandb.termerror('An Exception was raised during setup, see %s for full traceback.' %
                        util.get_log_file_path())
        # termerror is given strings everywhere else; pass the message text,
        # not the exception object.
        wandb.termerror(str(exc_value))
        if 'permission' in str(exc_value):
            wandb.termerror(
                'Are you sure you provided the correct API key to "wandb login"?')
        lines = traceback.format_exception(
            exc_type, exc_value, exc_traceback)
        logger.error('\n'.join(lines))
    else:
        rm.run_user_process(args['program'], args['args'], env)
示例#24
0
文件: cli.py 项目: connorhough/client
def run(ctx, program, args, id, resume, dir, configs, message, show):
    """Create a 'clirun' run and launch the user program under a RunManager.

    When the run manager fails to initialise, the error is printed, the
    traceback is logged and the process exits with status 1.

    Args:
        ctx: Not used in the body (presumably the click context -- confirm).
        program: Program to run.
        args: Arguments forwarded to the program.
        id, resume, message: Passed to the ``Run`` constructor.
        dir: wandb directory; defaults to ``wandb.wandb_dir()``.
        configs: Comma-separated config file paths.
        show: When truthy, ``WANDB_SHOW_RUN`` is set for the child process.
    """
    api.ensure_configured()
    config_paths = configs.split(',') if configs else []
    config = Config(config_paths=config_paths,
                    wandb_dir=dir or wandb.wandb_dir())
    run_kwargs = dict(
        run_id=id,
        mode='clirun',
        config=config,
        description=message,
        program=program,
        resume=resume,
    )
    run = wandb_run.Run(**run_kwargs)

    api.set_current_run_id(run.id)

    env = dict(os.environ)
    if configs:
        env['WANDB_CONFIG_PATHS'] = configs
    if show:
        env['WANDB_SHOW_RUN'] = 'True'

    try:
        rm = run_manager.RunManager(api, run)
        rm.init_run(env)
    except run_manager.Error:
        failure = sys.exc_info()
        reason = str(failure[1])
        wandb.termerror(
            'An Exception was raised during setup, see %s for full traceback.'
            % util.get_log_file_path())
        wandb.termerror(reason)
        if 'permission' in reason:
            wandb.termerror(
                'Are you sure you provided the correct API key to "wandb login"?'
            )
        logger.error('\n'.join(traceback.format_exception(*failure)))
        sys.exit(1)

    rm.run_user_process(program, args, env)
示例#25
0
def init(
    job_type = None,
    dir=None,
    config = None,
    project = None,
    entity = None,
    reinit = None,
    tags = None,
    group = None,
    name = None,
    notes = None,
    magic = None,
    config_exclude_keys=None,
    config_include_keys=None,
    anonymous = None,
    mode = None,
    allow_val_change = None,
    resume = None,
    force = None,
    tensorboard=None,  # alias for sync_tensorboard
    sync_tensorboard=None,
    monitor_gym=None,
    save_code=None,
    id=None,
    settings = None,
):
    """Initialize W&B.

    Spawns a new process to start or resume a run locally and communicate with a
    wandb server. Should be called before any calls to wandb.log.

    All arguments are captured as a dict and handed to `_WandbInit.setup()`;
    the run itself is produced by `_WandbInit.init()`.

    Arguments:
        job_type (str, optional): The type of job running, defaults to 'train'
        dir (str, optional): An absolute path to a directory where metadata will
            be stored.
        config (dict, argparse, or absl.flags, str, optional):
            Sets the config parameters (typically hyperparameters) to store with the
            run. See also wandb.config.
            If dict, argparse or absl.flags: will load the key value pairs into
                the run's config object.
            If str: will look for a yaml file that includes config parameters and
                load them into the run's config object.
        project (str, optional): W&B Project.
        entity (str, optional): W&B Entity.
        reinit (bool, optional): Allow multiple calls to init in the same process.
        tags (list, optional): A list of tags to apply to the run.
        group (str, optional): A unique string shared by all runs in a given group.
        name (str, optional): A display name for the run which does not have to be
            unique.
        notes (str, optional): A multiline string associated with the run.
        magic (bool, dict, or str, optional): magic configuration as bool, dict,
            json string, yaml filename.
        config_exclude_keys (list, optional): string keys to exclude storing in W&B
            when specifying config.
        config_include_keys (list, optional): string keys to include storing in W&B
            when specifying config.
        anonymous (str, optional): Can be "allow", "must", or "never". Controls
            whether anonymous logging is allowed.  Defaults to never.
        mode (str, optional): Can be "online", "offline" or "disabled". Defaults to
            online.
        allow_val_change (bool, optional): allow config values to be changed after
            setting. Defaults to true in jupyter and false otherwise.
        resume (bool, str, optional): Sets the resuming behavior. Should be one of:
            "allow", "must", "never", "auto" or None. Defaults to None.
            Cases:
            - "auto" (or True): if the previous run on the same machine crashed,
                automatically resume it. Otherwise start a new run.
            - "allow": if id is set with init(id="UNIQUE_ID") or WANDB_RUN_ID="UNIQUE_ID"
                and it is identical to a previous run, wandb will automatically resume the
                run with the id. Otherwise wandb will start a new run.
            - "never": if id is set with init(id="UNIQUE_ID") or WANDB_RUN_ID="UNIQUE_ID"
                and it is identical to a previous run, wandb will crash.
            - "must": if id is set with init(id="UNIQUE_ID") or WANDB_RUN_ID="UNIQUE_ID"
                and it is identical to a previous run, wandb will automatically resume the
                run with the id. Otherwise wandb will crash.
            - None: never resumes - if a run has a duplicate run_id the previous run is
                overwritten.
            See https://docs.wandb.com/library/advanced/resuming for more detail.
        force (bool, optional): If true, will cause script to crash if user can't or isn't
            logged in to a wandb server.  If false, will cause script to run in offline
            modes if user can't or isn't logged in to a wandb server. Defaults to false.
        tensorboard (bool, optional): Alias for sync_tensorboard.
        sync_tensorboard (bool, optional): Synchronize wandb logs from tensorboard or
            tensorboardX and saves the relevant events file. Defaults to false.
        monitor_gym (bool, optional): automatically logs videos of environment when
            using OpenAI Gym (see https://docs.wandb.com/library/integrations/openai-gym)
            Defaults to false.
        save_code (bool, optional): Save the entrypoint or jupyter session history
            source code.
        id (str, optional): A globally unique (per project) identifier for the run. This
            is primarily used for resuming.
        settings (optional): Forwarded to `_WandbInit.setup()` with the other
            arguments; presumably a settings object or dict of overrides -
            TODO confirm against `_WandbInit.setup`.

    Examples:
        Basic usage
        ```
        wandb.init()
        ```

        Launch multiple runs from the same script
        ```
        for x in range(10):
            with wandb.init(project="my-projo") as run:
                for y in range(100):
                    run.log({"metric": x+y})
        ```

    Raises:
        UsageError: re-raised unchanged.
        KeyboardInterrupt: re-raised after a warning is logged, when it occurs
            during setup, or during init while `settings.problem` is "fatal".
        Exception: `Exception("problem")` chained from the original error for
            any other exception that reaches the outer handler. If
            `settings._except_exit` is truthy the process is terminated with
            `os._exit(-1)` instead of raising.

    Returns:
        The object returned by `_WandbInit.init()` (a `Run`), or None when
        `_WandbInit.init()` raised and `settings.problem` is not "fatal".
    """
    assert not wandb._IS_INTERNAL_PROCESS
    # Snapshot the call arguments. This has to run before any other local name
    # is bound, otherwise that name would end up in the kwargs given to setup().
    kwargs = dict(locals())
    error_seen = None
    except_exit = None
    try:
        wi = _WandbInit()
        wi.setup(kwargs)
        except_exit = wi.settings._except_exit
        try:
            run = wi.init()
            # Re-read the flag once init() has completed.
            except_exit = wi.settings._except_exit
        except (KeyboardInterrupt, Exception) as e:
            # Everything except a KeyboardInterrupt is sent to sentry.
            if not isinstance(e, KeyboardInterrupt):
                sentry_exc(e)
            if not (
                wandb.wandb_agent._is_running() and isinstance(e, KeyboardInterrupt)
            ):
                getcaller()
            assert logger
            # "fatal" propagates the error to the outer handlers below.
            if wi.settings.problem == "fatal":
                raise
            # No-op branch: "warn" currently falls through like any other
            # non-fatal value.
            if wi.settings.problem == "warn":
                pass
            # TODO(jhr): figure out how to make this RunDummy
            run = None
    except UsageError:
        raise
    except KeyboardInterrupt as e:
        assert logger
        logger.warning("interrupted", exc_info=e)
        raise e
    except Exception as e:
        # Remember the error; the finally block decides how to terminate.
        error_seen = e
        traceback.print_exc()
        assert logger
        logger.error("error", exc_info=e)
        # Need to build delay into this sentry capture because our exit hooks
        # mess with sentry's ability to send out errors before the program ends.
        sentry_exc(e, delay=True)
        # reraise(*sys.exc_info())
        # six.raise_from(Exception("problem"), e)
    finally:
        # Only acts when the generic handler above recorded an error.
        if error_seen:
            wandb.termerror("Abnormal program exit")
            if except_exit:
                os._exit(-1)
            six.raise_from(Exception("problem"), error_seen)
    return run
示例#26
0
文件: wandb_run.py 项目: gampx/client
    def from_directory(cls,
                       directory,
                       project=None,
                       entity=None,
                       run_id=None,
                       api=None,
                       ignore_globs=None):
        """Build a run from the files found in ``directory`` and upload it.

        The directory is scanned once with ``DirectorySnapshot``. Well-known
        files (metadata, history, events, config, summary, ``.tfevents.``
        files) are located by substring match on the snapshot paths and drive
        what is streamed and what goes into the final ``upsert_run`` call; all
        regular files (minus ``ignore_globs`` matches) are then pushed.

        Presumably decorated as a classmethod outside this view; ``cls`` is
        not used in the body.

        Args:
            directory: Path of the run directory to sync.
            project: Project name. Falls back to the metadata file's
                ``project``, then ``api.settings("project")``, then
                ``run.auto_project_name(api=api)``.
            entity: Entity name; stored on the API settings when truthy.
            run_id: Run id to use; a new one is generated when falsy.
            api: API object to use; an ``InternalApi`` is created when falsy.
            ignore_globs: Optional iterable of ``fnmatch`` patterns; matching
                relative paths are not uploaded.

        Returns:
            The ``Run`` instance created for ``directory``.

        Raises:
            ValueError: If no project could be determined.
        """
        api = api or InternalApi()
        run_id = run_id or util.generate_id()
        run = Run(run_id=run_id, dir=directory)

        run_name = None
        project_from_meta = None
        snap = DirectorySnapshot(directory)
        meta = None
        meta_path = next((p for p in snap.paths if METADATA_FNAME in p), None)
        if meta_path:
            # Close the metadata file deterministically instead of leaking it.
            with open(meta_path) as f:
                meta = json.load(f)
            run_name = meta.get("name")
            project_from_meta = meta.get("project")

        project = project or project_from_meta or api.settings(
            "project") or run.auto_project_name(api=api)
        if project is None:
            raise ValueError("You must specify project")
        api.set_current_run_id(run_id)
        api.set_setting("project", project)
        if entity:
            api.set_setting("entity", entity)
        res = api.upsert_run(name=run_id,
                             project=project,
                             entity=entity,
                             display_name=run_name)
        # From here on use the entity name reported back by the server.
        entity = res["project"]["entity"]["name"]
        wandb.termlog("Syncing {} to:".format(directory))
        try:
            wandb.termlog(res["displayName"] + " " + run.get_url(api))
        except CommError as e:
            wandb.termwarn(e.message)

        file_api = api.get_file_stream_api()
        file_api.start()
        paths = [
            os.path.relpath(abs_path, directory) for abs_path in snap.paths
            if os.path.isfile(abs_path)
        ]
        if ignore_globs:
            paths = set(paths)
            for g in ignore_globs:
                paths = paths - set(fnmatch.filter(paths, g))
            paths = list(paths)
        run_update = {"id": res["id"]}
        tfevents = sorted([p for p in snap.paths if ".tfevents." in p])
        history = next((p for p in snap.paths if HISTORY_FNAME in p), None)
        event = next((p for p in snap.paths if EVENTS_FNAME in p), None)
        config = next((p for p in snap.paths if CONFIG_FNAME in p), None)
        user_config = next((p for p in snap.paths if USER_CONFIG_FNAME in p),
                           None)
        summary = next((p for p in snap.paths if SUMMARY_FNAME in p), None)
        if history:
            wandb.termlog("Uploading history metrics")
            file_api.stream_file(history)
            snap.paths.remove(history)
        elif len(tfevents) > 0:
            from wandb import tensorflow as wbtf
            wandb.termlog("Found tfevents file, converting...")
            # The summary is rebuilt from the converted tfevents data, so it
            # becomes a dict here instead of a file path.
            summary = {}
            for path in tfevents:
                filename = os.path.basename(path)
                # Directory of the tfevents file relative to ``directory``.
                namespace = path.replace(filename,
                                         "").replace(directory,
                                                     "").strip(os.sep)
                summary.update(
                    wbtf.stream_tfevents(path,
                                         file_api,
                                         run,
                                         namespace=namespace))
            for path in glob.glob(os.path.join(directory, "media/**/*"),
                                  recursive=True):
                if os.path.isfile(path):
                    paths.append(path)
        else:
            wandb.termerror(
                "No history or tfevents files found, only syncing files")
        if event:
            file_api.stream_file(event)
            snap.paths.remove(event)
        if config:
            with open(config) as f:
                run_update["config"] = util.load_yaml(f)
        elif user_config:
            # TODO: half-baked support for config.json
            # ``user_config`` is a file path, so its contents must be loaded
            # before the key/value pairs can be iterated.
            # NOTE(review): assumes the file is JSON, per the TODO above.
            with open(user_config) as f:
                user_config_values = json.load(f)
            run_update["config"] = {
                k: {
                    "value": v
                }
                for k, v in six.iteritems(user_config_values)
            }
        if isinstance(summary, dict):
            # TODO: summary should already have data_types converted here...
            run_update["summary_metrics"] = util.json_dumps_safer(summary)
        elif summary:
            with open(summary) as f:
                run_update["summary_metrics"] = f.read()
        if meta:
            if meta.get("git"):
                run_update["commit"] = meta["git"].get("commit")
                run_update["repo"] = meta["git"].get("remote")
            if meta.get("host"):
                run_update["host"] = meta["host"]
            run_update["program_path"] = meta["program"]
            run_update["job_type"] = meta.get("jobType")
            run_update["notes"] = meta.get("notes")
        else:
            run_update["host"] = run.host

        wandb.termlog("Updating run and uploading files")
        api.upsert_run(**run_update)
        pusher = FilePusher(api)
        for k in paths:
            path = os.path.abspath(os.path.join(directory, k))
            pusher.update_file(k, path)
            pusher.file_changed(k, path)
        pusher.finish()
        pusher.print_status()
        file_api.finish(0)
        # Remove temporary media images generated from tfevents
        if history is None and os.path.exists(os.path.join(directory,
                                                           "media")):
            shutil.rmtree(os.path.join(directory, "media"))
        wandb.termlog("Finished!")
        return run
示例#27
0
 def _run_jobs_from_queue(self):  # noqa:C901
     """Pull jobs off ``self._queue`` and run them one at a time.

     Each job is executed by ``self._run_job`` on its own thread, which is
     joined before the next job is fetched. The loop ends when
     ``self._exit_flag`` is set, when ``self._count`` jobs have been taken
     from the queue, on Ctrl+C, or when the failure checks below trip.
     The module-level ``_INSTANCES`` counter is incremented for the duration
     of the call and always decremented on the way out.
     """
     global _INSTANCES
     _INSTANCES += 1
     try:
         idle = False
         jobs_seen = 0
         while not self._exit_flag:
             try:
                 try:
                     job = self._queue.get(timeout=5)
                 except queue.Empty:
                     # Announce the idle state only once per idle period.
                     if not idle:
                         logger.debug("Paused.")
                         wandb.termlog("Sweep Agent: Waiting for job.")
                         idle = True
                     time.sleep(5)
                     if self._exit_flag:
                         logger.debug("Exiting main loop due to exit flag.")
                         wandb.termlog("Sweep Agent: Exiting.")
                         return
                     continue
                 # The flag may have been set while we were blocked on get().
                 if self._exit_flag:
                     logger.debug("Exiting main loop due to exit flag.")
                     wandb.termlog("Sweep Agent: Exiting.")
                     return
                 if idle:
                     logger.debug("Resumed.")
                     wandb.termlog("Job received.")
                     idle = False
                 jobs_seen += 1
                 run_id = job.run_id
                 # Jobs already marked as stopped are dropped, but still counted.
                 if self._run_status[run_id] == RunStatus.STOPPED:
                     continue
                 logger.debug(
                     "Spawning new thread for run {}.".format(run_id))
                 worker = threading.Thread(target=self._run_job, args=(job, ))
                 self._run_threads[run_id] = worker
                 worker.start()
                 self._run_status[run_id] = RunStatus.RUNNING
                 worker.join()
                 logger.debug("Thread joined for run {}.".format(run_id))
                 if self._run_status[run_id] == RunStatus.RUNNING:
                     # Status untouched while the thread ran: mark it done.
                     self._run_status[run_id] = RunStatus.DONE
                 elif self._run_status[run_id] == RunStatus.ERRORED:
                     exc = self._exceptions[run_id]
                     failure_text = "Run {} errored: {}".format(
                         run_id, repr(exc))
                     logger.error(failure_text)
                     wandb.termerror(failure_text)
                     # NOTE(review): with this variable set to "true" the loop
                     # stops after the first errored run, while the hint printed
                     # below presents it as disabling the check - confirm intent.
                     if os.getenv(
                             wandb.env.AGENT_DISABLE_FLAPPING) == "true":
                         self._exit_flag = True
                         return
                     # Too many failures shortly after start-up.
                     if (time.time() - self._start_time <
                             self.FLAPPING_MAX_SECONDS) and (
                                 len(self._exceptions) >=
                                 self.FLAPPING_MAX_FAILURES):
                         msg = "Detected {} failed runs in the first {} seconds, killing sweep.".format(
                             self.FLAPPING_MAX_FAILURES,
                             self.FLAPPING_MAX_SECONDS)
                         logger.error(msg)
                         wandb.termerror(msg)
                         wandb.termlog(
                             "To disable this check set WANDB_AGENT_DISABLE_FLAPPING=true"
                         )
                         self._exit_flag = True
                         return
                     failure_count = len(self._exceptions)
                     # At least as many recorded failures as jobs taken so far.
                     if (self._max_initial_failures < failure_count
                             and failure_count >= jobs_seen):
                         msg = "Detected {} failed runs in a row at start, killing sweep.".format(
                             self._max_initial_failures)
                         logger.error(msg)
                         wandb.termerror(msg)
                         wandb.termlog(
                             "To change this value set WANDB_AGENT_MAX_INITIAL_FAILURES=val"
                         )
                         self._exit_flag = True
                         return
                 if self._count and self._count == jobs_seen:
                     logger.debug(
                         "Exiting main loop because max count reached.")
                     self._exit_flag = True
                     return
             except KeyboardInterrupt:
                 logger.debug("Ctrl + C detected. Stopping sweep.")
                 wandb.termlog("Ctrl + C detected. Stopping sweep.")
                 self._exit()
                 return
             except Exception as e:
                 # Errors are only swallowed when an exit was already requested.
                 if not self._exit_flag:
                     raise e
                 logger.debug("Exiting main loop due to exit flag.")
                 wandb.termlog("Sweep Agent: Killed.")
                 return
     finally:
         _INSTANCES -= 1
示例#28
0
def init(
    job_type = None,
    dir=None,
    config = None,
    project = None,
    entity = None,
    reinit = None,
    tags = None,
    group = None,
    name = None,
    notes = None,
    magic = None,
    config_exclude_keys=None,
    config_include_keys=None,
    anonymous = None,
    mode = None,
    allow_val_change = None,
    resume = None,
    force = None,
    tensorboard=None,  # alias for sync_tensorboard
    sync_tensorboard=None,
    monitor_gym=None,
    save_code=None,
    id=None,
    settings = None,
):
    """
    Start a new tracked run with `wandb.init()`.

    In an ML training pipeline, you could add `wandb.init()`
    to the beginning of your training script as well as your evaluation
    script, and each piece would be tracked as a run in W&B.

    `wandb.init()` spawns a new background process to log data to a run, and it
    also syncs data to wandb.ai by default so you can see live visualizations.
    Call `wandb.init()` to start a run before logging data with `wandb.log()`.

    `wandb.init()` returns a run object, and you can also access the run object
    with wandb.run.

    Arguments:
        project: (str, optional) The name of the project where you're sending
            the new run. If the project is not specified, the run is put in an
            "Uncategorized" project.
        entity: (str, optional) An entity is a username or team name where
            you're sending runs. This entity must exist before you can send runs
            there, so make sure to create your account or team in the UI before
            starting to log runs.
            If you don't specify an entity, the run will be sent to your default
            entity, which is usually your username. Change your default entity
            in [Settings](wandb.ai/settings) under "default location to create
            new projects".
        config: (dict, argparse, absl.flags, str, optional)
            This sets wandb.config, a dictionary-like object for saving inputs
            to your job, like hyperparameters for a model or settings for a data
            preprocessing job. The config will show up in a table in the UI that
            you can use to group, filter, and sort runs. Keys should not contain
            `.` in their names, and values should be under 10 MB.
            If dict, argparse or absl.flags: will load the key value pairs into
                the wandb.config object.
            If str: will look for a yaml file by that name, and load config from
                that file into the wandb.config object.
        save_code: (bool, optional) Turn this on to save the main script or
            notebook to W&B. This is valuable for improving experiment
            reproducibility and to diff code across experiments in the UI. By
            default this is off, but you can flip the default behavior to "on"
            in [Settings](wandb.ai/settings).
        group: (str, optional) Specify a group to organize individual runs into
            a larger experiment. For example, you might be doing cross
            validation, or you might have multiple jobs that train and evaluate
            a model against different test sets. Group gives you a way to
            organize runs together into a larger whole, and you can toggle this
            on and off in the UI. For more details, see
            [Grouping](docs.wandb.com/library/grouping).
        job_type: (str, optional) Specify the type of run, which is useful when
            you're grouping runs together into larger experiments using group.
            For example, you might have multiple jobs in a group, with job types
            like train and eval. Setting this makes it easy to filter and group
            similar runs together in the UI so you can compare apples to apples.
        tags: (list, optional) A list of strings, which will populate the list
            of tags on this run in the UI. Tags are useful for organizing runs
            together, or applying temporary labels like "baseline" or
            "production". It's easy to add and remove tags in the UI, or filter
            down to just runs with a specific tag.
        name: (str, optional) A short display name for this run, which is how
            you'll identify this run in the UI. By default we generate a random
            two-word name that lets you easily cross-reference runs from the
            table to charts. Keeping these run names short makes the chart
            legends and tables easier to read. If you're looking for a place to
            save your hyperparameters, we recommend saving those in config.
        notes: (str, optional) A longer description of the run, like a -m commit
            message in git. This helps you remember what you were doing when you
            ran this run.
        dir: (str, optional) An absolute path to a directory where metadata will
            be stored. When you call download() on an artifact, this is the
            directory where downloaded files will be saved. By default this is
            the ./wandb directory.
        resume: (bool, str, optional) Sets the resuming behavior. Options:
            "allow", "must", "never", "auto" or None. Defaults to None.
            Cases:
            - None (default): If the new run has the same ID as a previous run,
                this run overwrites that data.
            - "auto" (or True): if the previous run on this machine crashed,
                automatically resume it. Otherwise, start a new run.
            - "allow": if id is set with init(id="UNIQUE_ID") or
                WANDB_RUN_ID="UNIQUE_ID" and it is identical to a previous run,
                wandb will automatically resume the run with that id. Otherwise,
                wandb will start a new run.
            - "never": if id is set with init(id="UNIQUE_ID") or
                WANDB_RUN_ID="UNIQUE_ID" and it is identical to a previous run,
                wandb will crash.
            - "must": if id is set with init(id="UNIQUE_ID") or
                WANDB_RUN_ID="UNIQUE_ID" and it is identical to a previous run,
                wandb will automatically resume the run with the id. Otherwise
                wandb will crash.
            See https://docs.wandb.com/library/advanced/resuming for more.
        reinit: (bool, optional) Allow multiple wandb.init() calls in the same
            process. (default: False)
        magic: (bool, dict, or str, optional) The bool controls whether we try to
            auto-instrument your script, capturing basic details of your run
            without you having to add more wandb code. (default: False)
            You can also pass a dict, json string, or yaml filename.
        config_exclude_keys: (list, optional) string keys to exclude from
            `wandb.config`.
        config_include_keys: (list, optional) string keys to include in
            `wandb.config`.
        anonymous: (str, optional) Controls anonymous data logging. Options:
            - "never" (default): requires you to link your W&B account before
                tracking the run so you don't accidentally create an anonymous
                run.
            - "allow": lets a logged-in user track runs with their account, but
                lets someone who is running the script without a W&B account see
                the charts in the UI.
            - "must": sends the run to an anonymous account instead of to a
                signed-up user account.
        mode: (str, optional) Can be "online", "offline" or "disabled". Defaults to
            online.
        allow_val_change: (bool, optional) Whether to allow config values to
            change after setting the keys once. By default we throw an exception
            if a config value is overwritten. If you want to track something
            like a varying learning_rate at multiple times during training, use
            wandb.log() instead. (default: False in scripts, True in Jupyter)
        force: (bool, optional) If True, this crashes the script if a user isn't
            logged in to W&B. If False, this will let the script run in offline
            mode if a user isn't logged in to W&B. (default: False)
        tensorboard: (bool, optional) Alias for sync_tensorboard.
        sync_tensorboard: (bool, optional) Synchronize wandb logs from tensorboard or
            tensorboardX and saves the relevant events file. (default: False)
        monitor_gym: (bool, optional) automatically logs videos of environment when
            using OpenAI Gym. (default: False)
            See https://docs.wandb.com/library/integrations/openai-gym
        id: (str, optional) A unique ID for this run, used for Resuming. It must
            be unique in the project, and if you delete a run you can't reuse
            the ID. Use the name field for a short descriptive name, or config
            for saving hyperparameters to compare across runs. The ID cannot
            contain special characters.
            See https://docs.wandb.com/library/resuming
        settings: (optional) Forwarded to `_WandbInit.setup()` with the other
            arguments; presumably a settings object or dict of overrides -
            TODO confirm against `_WandbInit.setup`.


    Examples:
        Basic usage
        ```
        wandb.init()
        ```

        Launch multiple runs from the same script
        ```
        for x in range(10):
            with wandb.init(project="my-projo") as run:
                for y in range(100):
                    run.log({"metric": x+y})
        ```

    Raises:
        UsageError: re-raised unchanged.
        KeyboardInterrupt: re-raised after a warning is logged, when it occurs
            during setup, or during init while `settings.problem` is "fatal".
        Exception: `Exception("problem")` chained from the original error for
            any other exception that reaches the outer handler. If
            `settings._except_exit` is truthy the process is terminated with
            `os._exit(-1)` instead of raising.

    Returns:
        The object returned by `_WandbInit.init()` (a `Run`), or None when
        `_WandbInit.init()` raised and `settings.problem` is not "fatal".
    """
    wandb._assert_is_user_process()
    # Snapshot the call arguments. This has to run before any other local name
    # is bound, otherwise that name would end up in the kwargs given to setup().
    kwargs = dict(locals())
    error_seen = None
    except_exit = None
    try:
        wi = _WandbInit()
        wi.setup(kwargs)
        except_exit = wi.settings._except_exit
        try:
            run = wi.init()
            # Re-read the flag once init() has completed.
            except_exit = wi.settings._except_exit
        except (KeyboardInterrupt, Exception) as e:
            # Everything except a KeyboardInterrupt is sent to sentry.
            if not isinstance(e, KeyboardInterrupt):
                sentry_exc(e)
            if not (
                wandb.wandb_agent._is_running() and isinstance(e, KeyboardInterrupt)
            ):
                getcaller()
            assert logger
            # "fatal" propagates the error to the outer handlers below.
            if wi.settings.problem == "fatal":
                raise
            # No-op branch: "warn" currently falls through like any other
            # non-fatal value.
            if wi.settings.problem == "warn":
                pass
            # TODO(jhr): figure out how to make this RunDummy
            run = None
    except UsageError:
        raise
    except KeyboardInterrupt as e:
        assert logger
        logger.warning("interrupted", exc_info=e)
        raise e
    except Exception as e:
        # Remember the error; the finally block decides how to terminate.
        error_seen = e
        traceback.print_exc()
        assert logger
        logger.error("error", exc_info=e)
        # Need to build delay into this sentry capture because our exit hooks
        # mess with sentry's ability to send out errors before the program ends.
        sentry_exc(e, delay=True)
        # reraise(*sys.exc_info())
        # six.raise_from(Exception("problem"), e)
    finally:
        # Only acts when the generic handler above recorded an error.
        if error_seen:
            wandb.termerror("Abnormal program exit")
            if except_exit:
                os._exit(-1)
            six.raise_from(Exception("problem"), error_seen)
    return run
示例#29
0
文件: internal.py 项目: nbardy/client
def wandb_internal(
    settings,
    record_q,
    result_q,
):
    """Run the W&B internal process until it is asked to stop.

    Starts the sender, writer and handler threads, then idles until the shared
    stop event is set, the parent-process check reports the parent dead, or
    two keyboard interrupts have been received. Once the threads are joined,
    the first thread that recorded an exception is reported and the process
    exits with status -1.

    Arguments:
        settings: dictionary of configuration parameters.
        record_q: records to be handled
        result_q: for sending results back

    """
    # mark this process as internal
    wandb._set_internal_process()
    start_time = time.time()

    # register the exit handler only when wandb_internal is called, not on import
    def handle_exit(*args):
        logger.info("Internal process exited")

    atexit.register(handle_exit)

    # Wrap the settings in a static object so nothing below can modify them
    static_settings = settings_static.SettingsStatic(settings)
    if static_settings.log_internal:
        configure_logging(static_settings.log_internal, static_settings._log_level)

    parent_pid = os.getppid()
    own_pid = os.getpid()

    logger.info(
        "W&B internal server running at pid: %s, started at: %s",
        own_pid,
        datetime.fromtimestamp(start_time),
    )

    publisher = interface.BackendSender(record_q=record_q)
    stop_event = threading.Event()

    # Queues feeding the sender and writer threads; the handler thread below
    # is given both of them.
    sender_queue = queue.Queue()
    writer_queue = queue.Queue()

    # Construction (and later start/join) order: sender, writer, handler.
    workers = [
        SenderThread(
            settings=static_settings,
            record_q=sender_queue,
            result_q=result_q,
            stopped=stop_event,
            interface=publisher,
            debounce_interval_ms=30000,
        ),
        WriterThread(
            settings=static_settings,
            record_q=writer_queue,
            result_q=result_q,
            stopped=stop_event,
            writer_q=writer_queue,
        ),
        HandlerThread(
            settings=static_settings,
            record_q=record_q,
            result_q=result_q,
            stopped=stop_event,
            sender_q=sender_queue,
            writer_q=writer_queue,
            interface=publisher,
        ),
    ]

    parent_watch = ProcessCheck(settings=static_settings, pid=parent_pid)

    for worker in workers:
        worker.start()

    interrupts = 0
    while not stop_event.is_set():
        try:
            # wait for stop event, polling the parent process once a second
            while not stop_event.is_set():
                time.sleep(1)
                if not parent_watch.is_dead():
                    continue
                logger.error("Internal process shutdown.")
                stop_event.set()
        except KeyboardInterrupt:
            # A single interrupt is only logged; the wait loop is re-entered.
            interrupts += 1
            logger.warning(
                "Internal process interrupt: {}".format(interrupts))
        finally:
            # The second interrupt stops the process.
            if interrupts >= 2:
                logger.error("Internal process interrupted.")
                stop_event.set()

    for worker in workers:
        worker.join()

    for worker in workers:
        failure = worker.get_exception()
        if not failure:
            continue
        header = "Thread {}:".format(worker.name)
        logger.error(header, exc_info=failure)
        print(header, file=sys.stderr)
        traceback.print_exception(*failure)
        sentry_exc(failure, delay=True)
        wandb.termerror("Internal wandb error: file data was not synced")
        # Exits on the first failed thread found; later threads are not inspected.
        sys.exit(-1)
示例#30
0
    def push(self):
        """Upload the file at `self.save_path` and record the outcome in `self._stats`.

        Three flows, checked in this order:

        1. `self.save_fn` is set: the upload (including any retrying) is
           delegated to it. It receives a progress callback and its return
           value is treated as "deduped" (truthy means nothing was uploaded).
        2. `self.md5` is set: an artifact manifest entry is created through
           the API, which supplies the upload URL and headers.
        3. Otherwise: an upload URL is requested for `self.save_name`.

        In flows 2 and 3 the file is then sent with `upload_file_retry`, unless
        no upload URL came back, in which case the file is recorded as deduped.

        Returns:
            True when the file was uploaded or skipped as deduped; False when
            `save_fn` or the upload itself raised. Those failures are logged,
            sent to sentry and printed instead of being re-raised. Exceptions
            from the API calls that fetch the upload URL are not caught here.
        """
        if self.save_fn:
            # Retry logic must happen in save_fn currently
            try:
                deduped = self.save_fn(
                    lambda _, t: self._stats.update_uploaded_file(self.save_path, t)
                )
            except Exception as e:
                self._stats.update_failed_file(self.save_path)
                logger.exception("Failed to upload file: %s", self.save_path)
                wandb.util.sentry_exc(e)
                message = str(e)
                # TODO: this is usually XML, but could be JSON
                # An exception can carry `response = None` (e.g. no reply was
                # ever received). Only read the body when a response exists, so
                # the original failure is reported instead of an AttributeError
                # raised from inside this handler. Compare against None
                # explicitly: a response object may be falsy for error statuses.
                response = getattr(e, "response", None)
                if response is not None:
                    message = response.content
                wandb.termerror('Error uploading "{}": {}, {}'.format(
                    self.save_path,
                    type(e).__name__, message))
                return False

            if deduped:
                logger.info("Skipped uploading %s", self.save_path)
                self._stats.set_file_deduped(self.save_path)
            else:
                logger.info("Uploaded file %s", self.save_path)
            return True

        if self.md5:
            # This is the new artifact manifest upload flow, in which we create the
            # database entry for the manifest file before creating it. This is used for
            # artifact L0 files. Which now is only artifact_manifest.json
            response = self._api.create_artifact_manifest(
                self.save_name, self.md5, self.artifact_id)
            upload_url = response["uploadUrl"]
            upload_headers = response["uploadHeaders"]
        else:
            # The classic file upload flow. We get a signed url and upload the file
            # then the backend handles the cloud storage metadata callback to create the
            # file entry. This flow has aged like a fine wine.
            project = self._api.get_project()
            _, upload_headers, result = self._api.upload_urls(
                project, [self.save_name])
            file_info = result[self.save_name]
            upload_url = file_info["url"]

        if upload_url is None:
            # No URL to upload to: recorded as deduped, nothing is sent.
            logger.info("Skipped uploading %s", self.save_path)
            self._stats.set_file_deduped(self.save_name)
        else:
            # Headers arrive as "Name:value" strings; split on the first colon
            # only so values that themselves contain colons stay intact.
            extra_headers = {}
            for upload_header in upload_headers:
                key, val = upload_header.split(":", 1)
                extra_headers[key] = val
            # Copied from push TODO(artifacts): clean up
            # If the upload URL is relative, fill it in with the base URL,
            # since its a proxied file store like the on-prem VM.
            if upload_url.startswith("/"):
                upload_url = "{}{}".format(self._api.api_url, upload_url)
            try:
                with open(self.save_path, "rb") as f:
                    self._api.upload_file_retry(
                        upload_url,
                        f,
                        lambda _, t: self.progress(t),
                        extra_headers=extra_headers,
                    )
                logger.info("Uploaded file %s", self.save_path)
            except Exception as e:
                self._stats.update_failed_file(self.save_name)
                logger.exception("Failed to upload file: %s", self.save_path)
                wandb.util.sentry_exc(e)
                wandb.termerror('Error uploading "{}": {}, {}'.format(
                    self.save_name,
                    type(e).__name__, e))
                return False
        return True