Example #1
    def optimize(self, layer_list, epoch):
        """
        Apply the learning rule to all the layers and update the states.

        Arguments:
            layer_list (list): a list of Layer objects to optimize.
            epoch (int): the current epoch, needed for the Schedule object.
        """
        lrate = self.schedule.get_learning_rate(self.learning_rate, epoch)
        param_list = get_param_list(layer_list)

        scale_factor = self.clip_gradient_norm(param_list, self.gradient_clip_norm)

        for (param, grad), states in param_list:
            param.rounding = self.stochastic_round
            if len(states) == 0:
                states.append(self.be.zeros_like(grad))
                states.append(self.be.zeros_like(grad))
            grad = grad / self.be.bsz
            grad = self.clip_gradient_value(grad, self.gradient_clip_value)

            velocity = states[0]
            velocity_backup = states[-1]

            velocity_backup[:] = velocity
            velocity[:] = (self.momentum_coef * velocity -
                           lrate * (scale_factor * grad + self.wdecay * param))
            param[:] = (param + velocity * (1 + self.momentum_coef) -
                        self.momentum_coef * velocity_backup)
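
The loop above performs a Nesterov-style momentum update on backend tensors. Below is a rough NumPy sketch of the same update rule; nesterov_momentum_step and its arguments (wdecay, scale_factor, etc.) are illustrative stand-ins, not part of the original API.

import numpy as np

def nesterov_momentum_step(param, grad, velocity, lrate, momentum_coef,
                           wdecay=0.0, scale_factor=1.0):
    # Remember the old velocity so the Nesterov correction can subtract it.
    velocity_backup = velocity.copy()
    velocity[:] = (momentum_coef * velocity
                   - lrate * (scale_factor * grad + wdecay * param))
    # Look ahead along the new velocity direction (Nesterov momentum).
    param[:] = (param + velocity * (1 + momentum_coef)
                - momentum_coef * velocity_backup)
    return param, velocity

# Toy usage with made-up shapes and hyperparameters.
param = np.ones(4)
grad = np.full(4, 0.5)
velocity = np.zeros(4)
param, velocity = nesterov_momentum_step(param, grad, velocity,
                                         lrate=0.1, momentum_coef=0.9)
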
Example #2
    def optimize(self, layer_list, epoch):
        lrate, epsilon, decay = (self.learning_rate, self.epsilon, self.decay_rate)
        param_list = get_param_list(layer_list)

        scale_factor = self.clip_gradient_norm(param_list, self.gradient_clip_norm)

        for (param, grad), states in param_list:

            param.rounding = self.stochastic_round
            if len(states) == 0:
                states.append(self.be.zeros_like(grad))
                states.append(self.be.zeros_like(grad))

            grad = grad / self.be.bsz
            grad = self.clip_gradient_value(grad, self.gradient_clip_value)

            # update state
            state = states[0]
            state[:] = decay * state + self.be.square(grad) * (1.0 - decay)

            # update velocity
            velocity = states[1]
            temp_velocity = - (scale_factor * grad * lrate) / self.be.sqrt(state + epsilon)
            velocity[:] = self.momentum * velocity + temp_velocity

            param[:] = param + self.momentum * velocity + temp_velocity
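
This loop combines an RMSProp running average of squared gradients with a momentum term. A minimal NumPy sketch of that update rule, with all names chosen for illustration only:

import numpy as np

def rmsprop_momentum_step(param, grad, state, velocity, lrate, decay,
                          momentum, epsilon=1e-6, scale_factor=1.0):
    # Exponential moving average of the squared gradient.
    state[:] = decay * state + (1.0 - decay) * np.square(grad)
    # RMS-scaled gradient step, combined with the previous velocity.
    temp_velocity = -(scale_factor * grad * lrate) / np.sqrt(state + epsilon)
    velocity[:] = momentum * velocity + temp_velocity
    # The parameter update applies the momentum term one extra time,
    # mirroring the lookahead used in the loop above.
    param[:] = param + momentum * velocity + temp_velocity
    return param, state, velocity

# Toy usage with made-up values.
param = np.zeros(3)
grad = np.array([0.2, -0.1, 0.4])
state = np.zeros(3)
velocity = np.zeros(3)
rmsprop_momentum_step(param, grad, state, velocity,
                      lrate=0.01, decay=0.95, momentum=0.9)
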
Example #3
    def clip_param_in_layers(layer_list, abs_bound=None):
        """
        Element-wise clip all parameter tensors to between
        ``-abs_bound`` and ``+abs_bound`` in a list of layers.

        Arguments:
            layer_list (list): List of layers
            abs_bound (float, optional): Value to element-wise clip the
                                         parameters to. Defaults to None.
        """
        param_list = get_param_list(layer_list)
        for (param, grad), states in param_list:
            if abs_bound:
                param[:] = param.backend.clip(param, -abs(abs_bound), abs(abs_bound))
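
Since the clipping is purely element-wise, the same effect can be sketched with plain NumPy arrays; clip_params below is a hypothetical helper, not part of the library:

import numpy as np

def clip_params(param_arrays, abs_bound=None):
    # Clip each parameter array into [-|abs_bound|, +|abs_bound|] in place.
    if abs_bound is None:
        return param_arrays
    bound = abs(abs_bound)
    for param in param_arrays:
        np.clip(param, -bound, bound, out=param)
    return param_arrays

weights = [np.array([-3.0, 0.5, 7.0]), np.array([[2.5, -4.0]])]
clip_params(weights, abs_bound=2.0)  # every entry now lies in [-2.0, 2.0]
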
Example #4
    def optimize(self, layer_list, epoch):
        self.optimizer.optimize(layer_list, epoch=epoch)
        param_list = get_param_list(layer_list)

        for idx, ((param, grad), states) in enumerate(param_list):
            if layer_list[idx].name in ['BiasLayer']:
                continue
            if layer_list[idx].name in ['ConvolutionLayer']:
                max_norm = self.max_kern_norm
                axes = 0
            else:
                max_norm = self.max_col_norm
                axes = 1

            norm = self.be.empty_like(param)
            norm[:] = self.be.sqrt(self.be.sum(self.be.square(param), axis=axes))
            target_norm = self.be.clip(norm, 0., max_norm)
            param[:] = param * target_norm / (norm + 1e-7)
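
After the wrapped optimizer runs, this pass rescales any weight tensor whose L2 norm along the chosen axis exceeds the configured maximum (max_kern_norm for ConvolutionLayer, max_col_norm otherwise), skipping bias layers. A hedged NumPy sketch of the renormalization step, with made-up names and shapes:

import numpy as np

def apply_max_norm(param, max_norm, axis=1, eps=1e-7):
    # L2 norm along the chosen axis; keepdims lets it broadcast back over param.
    norm = np.sqrt(np.sum(np.square(param), axis=axis, keepdims=True))
    # Norms below max_norm are unchanged; larger ones are scaled down to max_norm.
    target_norm = np.clip(norm, 0.0, max_norm)
    return param * target_norm / (norm + eps)

W = 4.0 * np.random.randn(5, 3)
W = apply_max_norm(W, max_norm=2.0, axis=1)
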
Example #5
    def optimize(self, layer_list, epoch):
        """
        Apply the learning rule to all the layers and update the states.

        Arguments:
            layer_list (list): a list of Layer objects to optimize.
            epoch (int): the current epoch, needed for the Schedule object.
        """
        param_list = get_param_list(layer_list)

        scale_factor = self.clip_gradient_norm(param_list, self.gradient_clip_norm)
        lrate = self.schedule.get_learning_rate(self.learning_rate, epoch)

        for i, params in enumerate(param_list):
            if layer_list[i].use_fast_weights:
                fast_params = get_fast_params(layer_list[i])
                self._update_params_fast(params, fast_params, lrate, scale_factor)
            else:
                self._update_params(params, lrate, scale_factor)
Example #6
def params(layers):
    for (param, grad), states in get_param_list(layers):
        yield param
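
All of the loops above assume get_param_list returns entries of the form ((param, grad), states). The snippet below uses a hand-built stand-in for that structure to show how such a generator is consumed; the names, shapes, and values are made up for illustration:

import numpy as np

# Hypothetical stand-in for the output of get_param_list(layers).
fake_param_list = [
    ((np.zeros((3, 2)), np.zeros((3, 2))), []),  # (param, grad), optimizer states
    ((np.zeros(4), np.zeros(4)), []),
]

def iter_params(param_list):
    # Same idea as params(layers) above, but over a pre-built list.
    for (param, grad), states in param_list:
        yield param

total = sum(p.size for p in iter_params(fake_param_list))
print(total)  # 10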