def handle_result(self, results: List[Dict], **info):
    result = results[0]
    # Use TRAINING_ITERATION for step but remove it so it is not logged.
    step = result.pop(TRAINING_ITERATION)

    flat_result = flatten_dict(result, delimiter="/")
    path = ["ray", "train"]

    # Same logic as in ray.tune.logger.TBXLogger.
    for attr, value in flat_result.items():
        full_attr = "/".join(path + [attr])
        if isinstance(value, self.VALID_SUMMARY_TYPES) and not np.isnan(value):
            self._file_writer.add_scalar(full_attr, value, global_step=step)
        elif (isinstance(value, list) and len(value) > 0) or (
            isinstance(value, np.ndarray) and value.size > 0
        ):
            # Must be video
            if isinstance(value, np.ndarray) and value.ndim == 5:
                self._file_writer.add_video(
                    full_attr, value, global_step=step, fps=20
                )
                continue

            try:
                self._file_writer.add_histogram(
                    full_attr, value, global_step=step
                )
            # In case TensorboardX still doesn't think it's a valid value
            # (e.g. `[[]]`), warn and move on.
            except (ValueError, TypeError):
                if log_once("invalid_tbx_value"):
                    warnings.warn(
                        "You are trying to log an invalid value ({}={}) "
                        "via {}!".format(full_attr, value, type(self).__name__)
                    )

    self._file_writer.flush()
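# ---------------------------------------------------------------------------
# A minimal, self-contained sketch (not part of the source above) of how the
# dispatch logic in `handle_result` routes flattened result values to the
# different TensorBoard summary types. `VALID_SUMMARY_TYPES` below is an
# assumption mirroring the usual numeric types such a callback checks against;
# the real attribute lives on the callback class.
# ---------------------------------------------------------------------------
import numpy as np

VALID_SUMMARY_TYPES = (int, float, np.float32, np.float64, np.int32, np.int64)

def classify(value):
    """Return which SummaryWriter method the dispatch logic above would pick."""
    if isinstance(value, VALID_SUMMARY_TYPES) and not np.isnan(value):
        return "add_scalar"
    if (isinstance(value, list) and len(value) > 0) or (
        isinstance(value, np.ndarray) and value.size > 0
    ):
        # 5-D arrays (e.g. batch x time x channel x height x width) are videos.
        if isinstance(value, np.ndarray) and value.ndim == 5:
            return "add_video"
        return "add_histogram"
    return "skipped"

print(classify(0.93))                          # add_scalar
print(classify([0.1, 0.2, 0.4]))               # add_histogram
print(classify(np.zeros((1, 8, 3, 16, 16))))   # add_video
print(classify(float("nan")))                  # skipped (NaN scalars are dropped)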
def training_loop(self) -> None:
    config = self.train_kwargs.copy()

    dmatrices = self._get_dmatrices(
        dmatrix_params=self.dmatrix_params,
    )
    train_dmatrix = dmatrices[TRAIN_DATASET_KEY]
    evals_result = {}

    init_model = None
    if self.resume_from_checkpoint:
        init_model, _ = self._load_checkpoint(self.resume_from_checkpoint)

    config.setdefault("verbose_eval", False)
    config.setdefault("callbacks", [])

    if not any(
        isinstance(
            cb,
            (self._tune_callback_report_cls, self._tune_callback_checkpoint_cls),
        )
        for cb in config["callbacks"]
    ):
        # Only add our own callback if it hasn't been added before.
        checkpoint_frequency = (
            self.run_config.checkpoint_config.checkpoint_frequency
        )
        if checkpoint_frequency > 0:
            callback = self._tune_callback_checkpoint_cls(
                filename=MODEL_KEY, frequency=checkpoint_frequency
            )
        else:
            callback = self._tune_callback_report_cls()

        config["callbacks"] += [callback]

    config[self._init_model_arg_name] = init_model

    model = self._train(
        params=self.params,
        dtrain=train_dmatrix,
        evals_result=evals_result,
        evals=[(dmatrix, k) for k, dmatrix in dmatrices.items()],
        ray_params=self._ray_params,
        **config,
    )

    checkpoint_at_end = self.run_config.checkpoint_config.checkpoint_at_end
    if checkpoint_at_end is None:
        checkpoint_at_end = True

    if checkpoint_at_end:
        # We need to call tune.report to save checkpoints, so we report
        # the last received metrics (possibly again).
        result_dict = flatten_dict(evals_result, delimiter="-")
        for k in list(result_dict):
            result_dict[k] = result_dict[k][-1]

        with tune.checkpoint_dir(
            step=self._model_iteration(model)
        ) as cp_dir:
            self._save_model(model, path=os.path.join(cp_dir, MODEL_KEY))

        tune.report(**result_dict)
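# ---------------------------------------------------------------------------
# A minimal sketch (not part of the source above) of the metric handling in
# the `checkpoint_at_end` block: xgboost-style `evals_result` histories are
# flattened with a "-" delimiter and reduced to their last value before being
# reported. `flatten_dict` here is a simplified stand-in for the helper the
# trainer imports; the metric names and numbers are made up for illustration.
# ---------------------------------------------------------------------------
def flatten_dict(d, delimiter="/"):
    """Simplified stand-in for the flatten_dict helper used above."""
    flat = {}
    for outer_key, inner in d.items():
        if isinstance(inner, dict):
            for inner_key, value in flatten_dict(inner, delimiter).items():
                flat[outer_key + delimiter + inner_key] = value
        else:
            flat[outer_key] = inner
    return flat

evals_result = {
    "train": {"logloss": [0.61, 0.48, 0.39]},
    "valid": {"logloss": [0.64, 0.55, 0.51]},
}

result_dict = flatten_dict(evals_result, delimiter="-")
for k in list(result_dict):
    # Keep only the last value of each metric series, as in the loop above.
    result_dict[k] = result_dict[k][-1]

print(result_dict)  # {'train-logloss': 0.39, 'valid-logloss': 0.51}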