Example #1
    def handle_result(self, results: List[Dict], **info):
        result = results[0]
        # Use TRAINING_ITERATION for step but remove it so it is not logged.
        step = result.pop(TRAINING_ITERATION)
        flat_result = flatten_dict(result, delimiter="/")
        path = ["ray", "train"]

        # same logic as in ray.tune.logger.TBXLogger
        for attr, value in flat_result.items():
            full_attr = "/".join(path + [attr])
            if isinstance(value, self.VALID_SUMMARY_TYPES) and not np.isnan(value):
                self._file_writer.add_scalar(full_attr, value, global_step=step)
            elif (isinstance(value, list) and len(value) > 0) or (
                isinstance(value, np.ndarray) and value.size > 0
            ):

                # Must be video
                if isinstance(value, np.ndarray) and value.ndim == 5:
                    self._file_writer.add_video(
                        full_attr, value, global_step=step, fps=20
                    )
                    continue

                try:
                    self._file_writer.add_histogram(full_attr, value, global_step=step)
                # In case TensorboardX still doesn't think it's a valid value
                # (e.g. `[[]]`), warn and move on.
                except (ValueError, TypeError):
                    if log_once("invalid_tbx_value"):
                        warnings.warn(
                            "You are trying to log an invalid value ({}={}) "
                            "via {}!".format(full_attr, value, type(self).__name__)
                        )
        self._file_writer.flush()
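This method assumes a surrounding callback class that owns a tensorboardX `SummaryWriter` (`self._file_writer`) and a `VALID_SUMMARY_TYPES` tuple of scalar types, plus a few Ray helpers. A minimal sketch of those assumed pieces follows; the module paths and the host class layout are assumptions for illustration, not the actual Ray implementation:

import warnings
from typing import Dict, List

import numpy as np
from tensorboardX import SummaryWriter

# Assumed import locations for the Ray helpers used above:
from ray.tune.result import TRAINING_ITERATION
from ray.tune.utils import flatten_dict
from ray.util.debug import log_once


class TBXResultLogger:
    # Hypothetical host class. Scalar types that are passed to add_scalar;
    # everything else is treated as a histogram or video candidate.
    VALID_SUMMARY_TYPES = (int, float, np.float32, np.float64, np.int32, np.int64)

    def __init__(self, logdir: str):
        # Writer that handle_result() writes to and flushes per reported result.
        self._file_writer = SummaryWriter(logdir)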
Example #2
    def training_loop(self) -> None:
        config = self.train_kwargs.copy()

        dmatrices = self._get_dmatrices(dmatrix_params=self.dmatrix_params)
        train_dmatrix = dmatrices[TRAIN_DATASET_KEY]
        evals_result = {}

        init_model = None
        if self.resume_from_checkpoint:
            init_model, _ = self._load_checkpoint(self.resume_from_checkpoint)

        config.setdefault("verbose_eval", False)
        config.setdefault("callbacks", [])

        if not any(
                isinstance(cb, (self._tune_callback_report_cls,
                                self._tune_callback_checkpoint_cls))
                for cb in config["callbacks"]):
            # Only add our own callback if it hasn't been added before
            checkpoint_frequency = (
                self.run_config.checkpoint_config.checkpoint_frequency)
            if checkpoint_frequency > 0:
                callback = self._tune_callback_checkpoint_cls(
                    filename=MODEL_KEY, frequency=checkpoint_frequency)
            else:
                callback = self._tune_callback_report_cls()

            config["callbacks"] += [callback]

        config[self._init_model_arg_name] = init_model

        model = self._train(
            params=self.params,
            dtrain=train_dmatrix,
            evals_result=evals_result,
            evals=[(dmatrix, k) for k, dmatrix in dmatrices.items()],
            ray_params=self._ray_params,
            **config,
        )

        checkpoint_at_end = self.run_config.checkpoint_config.checkpoint_at_end
        if checkpoint_at_end is None:
            checkpoint_at_end = True

        if checkpoint_at_end:
            # We need to call tune.report to save checkpoints, so we report
            # the last received metrics (possibly again).
            result_dict = flatten_dict(evals_result, delimiter="-")
            for k in list(result_dict):
                result_dict[k] = result_dict[k][-1]

            with tune.checkpoint_dir(
                    step=self._model_iteration(model)) as cp_dir:
                self._save_model(model, path=os.path.join(cp_dir, MODEL_KEY))
                tune.report(**result_dict)
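For context, the checkpoint-at-end branch flattens the nested xgboost-style `evals_result` dict and keeps only the last value of each metric series before reporting. A small standalone illustration of that transformation (the sample metrics are made up and the `flatten_dict` import location is an assumption):

from ray.tune.utils import flatten_dict  # assumed import location

# Shape of evals_result as produced by xgboost-style training:
evals_result = {
    "train": {"logloss": [0.65, 0.52, 0.41]},
    "valid": {"logloss": [0.70, 0.61, 0.55]},
}

result_dict = flatten_dict(evals_result, delimiter="-")
# {"train-logloss": [0.65, 0.52, 0.41], "valid-logloss": [0.70, 0.61, 0.55]}

# Keep only the most recent value of each metric, as in the snippet above.
for k in list(result_dict):
    result_dict[k] = result_dict[k][-1]
# {"train-logloss": 0.41, "valid-logloss": 0.55}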