def optimize(self, layer_list, epoch):
    """
    Apply the learning rule to all the layers and update the states.

    Arguments:
        layer_list (list): a list of Layer objects to optimize.
        epoch (int): the current epoch, needed for the Schedule object.
    """
    lrate = self.schedule.get_learning_rate(self.learning_rate, epoch)
    param_list = get_param_list(layer_list)

    scale_factor = self.clip_gradient_norm(param_list, self.gradient_clip_norm)

    for (param, grad), states in param_list:
        param.rounding = self.stochastic_round
        if len(states) == 0:
            states.append(self.be.zeros_like(grad))
            states.append(self.be.zeros_like(grad))

        grad = grad / self.be.bsz
        grad = self.clip_gradient_value(grad, self.gradient_clip_value)

        velocity = states[0]
        velocity_backup = states[-1]
        velocity_backup[:] = velocity

        velocity[:] = (self.momentum_coef * velocity -
                       lrate * (scale_factor * grad + self.wdecay * param))
        param[:] = (param + velocity * (1 + self.momentum_coef) -
                    self.momentum_coef * velocity_backup)
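# For reference, a compact restatement of the update implemented above
# (a sketch, not additional library code): with momentum coefficient m,
# learning rate a, weight decay w, and scaled gradient g,
#
#     v_new = m * v_old - a * (g + w * p)
#     p     = p + (1 + m) * v_new - m * v_old
#
# which is the Nesterov-style "look-ahead" form of SGD with momentum.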
def optimize(self, layer_list, epoch):
    """
    Apply the learning rule to all the layers and update the states.

    Arguments:
        layer_list (list): a list of Layer objects to optimize.
        epoch (int): the current epoch.
    """
    lrate, epsilon, decay = (self.learning_rate, self.epsilon, self.decay_rate)
    param_list = get_param_list(layer_list)

    scale_factor = self.clip_gradient_norm(param_list, self.gradient_clip_norm)

    for (param, grad), states in param_list:
        param.rounding = self.stochastic_round
        if len(states) == 0:
            # states[0] holds the running average of squared gradients,
            # states[1] holds the velocity
            states.append(self.be.zeros_like(grad))
            states.append(self.be.zeros_like(grad))

        grad = grad / self.be.bsz
        grad = self.clip_gradient_value(grad, self.gradient_clip_value)

        # update the running average of squared gradients
        state = states[0]
        state[:] = decay * state + self.be.square(grad) * (1.0 - decay)

        # update the velocity
        velocity = states[1]
        temp_velocity = - (scale_factor * grad * lrate) / self.be.sqrt(state + epsilon)
        velocity[:] = self.momentum * velocity + temp_velocity

        param[:] = param + self.momentum * velocity + temp_velocity
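# For reference, the update implemented above (a sketch): with decay rate d,
# running average of squared gradients s, learning rate a, small constant e,
# and momentum coefficient m,
#
#     s    = d * s + (1 - d) * g^2
#     step = -a * g / sqrt(s + e)
#     v    = m * v + step
#     p    = p + m * v + step
#
# i.e. an RMSProp-scaled gradient step combined with a Nesterov-style
# momentum correction.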
def clip_param_in_layers(layer_list, abs_bound=None):
    """
    Element-wise clip all parameter tensors to between ``-abs_bound`` and
    ``+abs_bound`` in a list of layers.

    Arguments:
        layer_list (list): List of layers.
        abs_bound (float, optional): Value to element-wise clip parameters.
                                     Defaults to None, in which case no
                                     clipping is applied.
    """
    param_list = get_param_list(layer_list)
    for (param, grad), states in param_list:
        if abs_bound:
            param[:] = param.backend.clip(param, -abs(abs_bound), abs(abs_bound))
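# Minimal usage sketch (the ``model`` object and its ``layers`` attribute are
# assumed here for illustration, not taken from this module):
#
#     clip_param_in_layers(model.layers, abs_bound=1.0)
#
# clips every parameter tensor of every layer into [-1.0, 1.0] in place.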
def optimize(self, layer_list, epoch):
    """
    Apply the wrapped optimizer, then renormalize any parameter whose
    kernel norm (convolution layers) or column norm (other layers)
    exceeds the configured maximum.

    Arguments:
        layer_list (list): a list of Layer objects to optimize.
        epoch (int): the current epoch.
    """
    self.optimizer.optimize(layer_list, epoch=epoch)
    param_list = get_param_list(layer_list)

    for idx, ((param, grad), states) in enumerate(param_list):
        if layer_list[idx].name in ['BiasLayer']:
            continue
        if layer_list[idx].name in ['ConvolutionLayer']:
            max_norm = self.max_kern_norm
            axes = 0
        else:
            max_norm = self.max_col_norm
            axes = 1
        norm = self.be.empty_like(param)
        norm[:] = self.be.sqrt(self.be.sum(self.be.square(param), axis=axes))
        target_norm = self.be.clip(norm, 0., max_norm)
        param[:] = param * target_norm / (norm + 1e-7)
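# Effect of the constraint above (a sketch): after the wrapped optimizer runs,
# each kernel (convolution layers) or column (other layers) whose L2 norm
# exceeds max_norm is rescaled to have norm max_norm; parameters already
# within the bound are left essentially unchanged (up to the 1e-7 stabilizer).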
def optimize(self, layer_list, epoch):
    """
    Apply the learning rule to all the layers and update the states.

    Arguments:
        layer_list (list): a list of Layer objects to optimize.
        epoch (int): the current epoch, needed for the Schedule object.
    """
    param_list = get_param_list(layer_list)

    scale_factor = self.clip_gradient_norm(param_list, self.gradient_clip_norm)
    lrate = self.schedule.get_learning_rate(self.learning_rate, epoch)

    for i, params in enumerate(param_list):
        if layer_list[i].use_fast_weights:
            fast_params = get_fast_params(layer_list[i])
            self._update_params_fast(params, fast_params, lrate, scale_factor)
        else:
            self._update_params(params, lrate, scale_factor)
def params(layers):
    """Yield the parameter tensor of each layer in ``layers``."""
    for (param, grad), states in get_param_list(layers):
        yield param
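# Usage sketch (a ``model`` object with a ``layers`` list is assumed here
# for illustration):
#
#     n_tensors = sum(1 for p in params(model.layers))
#
# iterates over the parameter tensors only, skipping gradients and states.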