def __init__(self, optimizer: dy.Trainer, skip_noisy: bool = False) -> None:
  self.optimizer = optimizer
  self.skip_noisy = skip_noisy
  if skip_noisy:
    # Keep a rolling mean/stddev of the gradient log-norm so that outlier
    # (noisy) updates can be detected and skipped later on.
    self.rolling_stats = utils.RollingStatistic()
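# The check below relies on the rolling statistics exposing update(), .average and
# .stddev over a fixed-size window. The class below is an illustrative stand-in for
# that interface, not the project's own utils.RollingStatistic; its name and the
# window size of 100 are assumptions.
class _RollingStatisticSketch:
  """Minimal rolling mean/stddev over the last `window_size` values."""
  def __init__(self, window_size: int = 100) -> None:
    self.window_size = window_size
    self.vals = []
    self.average = None
    self.stddev = None

  def update(self, new_val: float) -> None:
    self.vals.append(new_val)
    if len(self.vals) > self.window_size:
      self.vals.pop(0)
    # Only report statistics once the window is full, mirroring the
    # "too few statistics" guard in check_gradients_noisy() below.
    if len(self.vals) == self.window_size:
      mean = sum(self.vals) / self.window_size
      var = sum((v - mean) ** 2 for v in self.vals) / self.window_size
      self.average = mean
      self.stddev = var ** 0.5
    else:
      self.average = None
      self.stddev = None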
def grad_log_norm(self) -> float:
  if getattr(self, "rolling_stats", None) is None:
    self.rolling_stats = utils.RollingStatistic()
  # Accumulate the squared L2 norm of all gradients across every subcollection.
  sq_norm = 0.0
  for subcol in ParamManager.param_col.subcols.values():
    for param in subcol.parameters_list():
      cur_grads = param.grad_as_array()
      sq_norm += np.sum(np.square(cur_grads))
  # Return the log of the global gradient norm; the log scale keeps the rolling
  # statistics well-behaved when norms vary over orders of magnitude.
  return np.log(np.sqrt(sq_norm))
def grad_log_norm(self) -> float:
  if getattr(self, "rolling_stats", None) is None:
    self.rolling_stats = utils.RollingStatistic()
  # Variant that iterates over named_parameters() and reads gradients through the
  # tensor-tools wrapper (tt); parameters without a gradient yet are skipped.
  sq_norm = 0.0
  for subcol in ParamManager.param_col.subcols.values():
    for _, param in subcol.named_parameters():
      if param.grad is not None:
        cur_grads = tt.npvalue(param.grad)
        sq_norm += np.sum(np.square(cur_grads))
  return np.log(np.sqrt(sq_norm))
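# Worked example of the quantity both grad_log_norm() variants above compute: the
# log of the global L2 norm over all gradient arrays. Purely illustrative; the toy
# gradient arrays are made up and this helper is not part of the class.
def _grad_log_norm_example() -> float:
  grads = [np.array([0.1, -0.2]), np.array([[0.3, 0.0], [0.05, -0.1]])]
  sq_norm = sum(np.sum(np.square(g)) for g in grads)  # 0.05 + 0.1025 = 0.1525
  return float(np.log(np.sqrt(sq_norm)))              # log(sqrt(0.1525)) ≈ -0.94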
def check_gradients_noisy(self) -> bool:
  if getattr(self, "rolling_stats", None) is None:
    self.rolling_stats = utils.RollingStatistic()
  log_norm = self.grad_log_norm()
  if settings.USE_TENSORBOARD:
    tee.tensorboard_writer.add_scalars(name="grad",
                                       tag_scalar_dict={"norm": np.exp(log_norm)},
                                       global_step=self.global_step)
  self.rolling_stats.update(log_norm)
  if self.rolling_stats.average is None:
    # Too few observations so far to compute reliable statistics; never skip.
    return False
  else:
    # Flag the gradient as noisy if its log-norm falls outside a
    # 4-standard-deviation band around the rolling average.
    req_min = self.rolling_stats.average - 4*self.rolling_stats.stddev
    req_max = self.rolling_stats.average + 4*self.rolling_stats.stddev
    return not (req_min < log_norm < req_max)
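# Illustrative sketch of how the noise check above is typically consumed by an
# update step: skip the parameter update when the gradient log-norm is an outlier.
# This is an assumed usage pattern, not the project's actual update() method, and
# the print() call is a stand-in for proper logging.
def _example_update_step(opt) -> None:
  """`opt` is an instance of the optimizer wrapper whose methods are defined above."""
  if opt.skip_noisy and opt.check_gradients_noisy():
    # Gradient log-norm fell outside the 4-sigma band: drop this update.
    print("skipping noisy update")
  else:
    opt.optimizer.update()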