Example #1
    def call(self, inputs):
        inputs, weights = inputs

        weights = weights / tf.reduce_sum(weights)  # Normalize sample weights
        weights_expand = tf.expand_dims(weights, axis=1)

        mean, variance = tf.nn.weighted_moments(
            inputs, [0], weights_expand)  # Compute weighted mean and variance

        counter = K.update_add(
            self.counter, K.ones_like(self.counter)
        )  # Count number of times the data passes through the model
        init = K.sign(
            counter - K.ones_like(counter)
        )  # Indicator is 0 on the first (initializing) pass, 1 afterwards

        mean = K.update(
            self.mean, init * self.mean +
            (1.0 - init) * mean)  # Store the batch mean on the first pass
        variance = K.update(
            self.variance, init * self.variance + (1.0 - init) *
            variance)  # Store the batch variance on the first pass

        mean_expand = tf.expand_dims(mean, axis=0)
        variance_expand = tf.expand_dims(variance, axis=0)

        outputs = (inputs - mean_expand) / tf.sqrt(
            variance_expand + self.epsilon)  # Normalize the inputs

        return outputs
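For intuition, here is a minimal NumPy sketch of the weighted normalization this layer computes on its first pass; shapes and values are toy assumptions, not the layer's API:

import numpy as np

inputs = np.random.randn(8, 3)                   # (batch, features)
weights = np.random.rand(8)
weights = weights / weights.sum()                # normalize sample weights

mean = (weights[:, None] * inputs).sum(axis=0)                    # weighted mean
variance = (weights[:, None] * (inputs - mean) ** 2).sum(axis=0)  # weighted variance

outputs = (inputs - mean) / np.sqrt(variance + 1e-7)
print(outputs.shape)  # (8, 3)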
Example #2
    def get_updates(self, loss, params):
        grads = self.get_gradients(loss, params)
        self.updates = [K.update_add(self.iterations, 1)]

        lr = self.learning_rate
        if self.initial_decay > 0:
            lr = lr * (1. / (1. + self.decay *
                             K.cast(self.iterations, K.dtype(self.decay))))

        t = K.cast(self.iterations, K.floatx()) + 1
        lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /
                     (1. - K.pow(self.beta_1, t)))

        ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        if self.amsgrad:
            vhats = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        else:
            vhats = [K.zeros(1) for _ in params]
        self.weights = [self.iterations] + ms + vs + vhats

        for p, g, m, v, vhat in zip(params, grads, ms, vs, vhats):

            # Learning rate multipliers
            if self.multipliers:
                multiplier = [
                    mult for mult in self.multipliers if mult in p.name
                ]
            else:
                multiplier = None
            if multiplier:
                new_lr_t = lr_t * self.multipliers[multiplier[0]]
                if self.debug_verbose:
                    print('Setting {} to learning rate {}'.format(
                        multiplier[0], new_lr_t))
                    print(K.get_value(new_lr_t))
            else:
                new_lr_t = lr_t
                if self.debug_verbose:
                    print('No change in learning rate {}'.format(p.name))
                    print(K.get_value(new_lr_t))
            m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
            v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)
            if self.amsgrad:
                vhat_t = K.maximum(vhat, v_t)
                p_t = p - new_lr_t * m_t / (K.sqrt(vhat_t) + self.epsilon)
                self.updates.append(K.update(vhat, vhat_t))
            else:
                p_t = p - new_lr_t * m_t / (K.sqrt(v_t) + self.epsilon)

            self.updates.append(K.update(m, m_t))
            self.updates.append(K.update(v, v_t))
            new_p = p_t

            # Apply constraints.
            if getattr(p, 'constraint', None) is not None:
                new_p = p.constraint(new_p)

            self.updates.append(K.update(p, new_p))
        return self.updates
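The multiplier lookup above matches dictionary keys as substrings of `p.name`. A tiny pure-Python sketch of that matching, with hypothetical names:

multipliers = {'dense_1': 0.1, 'conv2d': 2.0}   # hypothetical layer prefixes
param_name = 'dense_1/kernel:0'                 # hypothetical variable name

matched = [key for key in multipliers if key in param_name]
lr_scale = multipliers[matched[0]] if matched else 1.0
print(lr_scale)  # 0.1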
Example #3
 def get_updates(self, loss, params):
   sync_cond = K.equal((self.iterations + 1) // self.sync_period *
                       self.sync_period, (self.iterations + 1))
   if TF_KERAS:
     slow_params = [K.variable(K.get_value(p), name='sp_{}'.format(i))
                    for i, p in enumerate(params)]
     self.updates = self.optimizer.get_updates(loss, params)
     slow_updates = []
     for p, sp in zip(params, slow_params):
       sp_t = sp + self.slow_step * (p - sp)
       slow_updates.append(K.update(sp, K.switch(
           sync_cond,
           sp_t,
           sp,
       )))
       slow_updates.append(K.update_add(p, K.switch(
           sync_cond,
           sp_t - p,
           K.zeros_like(p),
       )))
   else:
     slow_params = {p.name: K.variable(K.get_value(
         p), name='sp_{}'.format(i)) for i, p in enumerate(params)}
     update_names = ['update', 'update_add', 'update_sub']
     original_updates = [getattr(K, name) for name in update_names]
     setattr(K, 'update', lambda x, new_x: ('update', x, new_x))
     setattr(K, 'update_add', lambda x, new_x: ('update_add', x, new_x))
     setattr(K, 'update_sub', lambda x, new_x: ('update_sub', x, new_x))
     self.updates = self.optimizer.get_updates(loss, params)
     for name, original_update in zip(update_names, original_updates):
       setattr(K, name, original_update)
     slow_updates = []
     for i, update in enumerate(self.updates):
       if isinstance(update, tuple):
         name, x, new_x, adjusted = update + (update[-1],)
         update_func = getattr(K, name)
         if name == 'update_add':
           adjusted = x + new_x
         if name == 'update_sub':
           adjusted = x - new_x
         if x.name not in slow_params:
           self.updates[i] = update_func(x, new_x)
         else:
           slow_param = slow_params[x.name]
           slow_param_t = slow_param + \
               self.slow_step * (adjusted - slow_param)
           slow_updates.append(K.update(slow_param, K.switch(
               sync_cond,
               slow_param_t,
               slow_param,
           )))
           self.updates[i] = K.update(x, K.switch(
               sync_cond,
               slow_param_t,
               adjusted,
           ))
     slow_params = list(slow_params.values())
   self.updates += slow_updates
   self.weights = self.optimizer.weights + slow_params
   return self.updates
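A scalar NumPy sketch of the Lookahead rule implemented above, assuming toy hyperparameters: every `sync_period` steps the slow weights move toward the fast weights by `slow_step`, and the fast weights are reset to the slow copy:

import numpy as np

slow_step, sync_period = 0.5, 5
fast = np.array([1.0])
slow = fast.copy()
for t in range(1, 21):
    fast = fast - 0.1 * 2.0 * fast                 # inner optimizer step on f(x) = x^2
    if t % sync_period == 0:
        slow = slow + slow_step * (fast - slow)    # slow weight update
        fast = slow.copy()                         # synchronize fast weights
print(fast, slow)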
Example #4
        def _resource_apply_op(self, grad, var, indices=None):
            # Update condition
            cond = K.equal(self.iterations % self.grad_accum_steps, 0)
            # Fetch the accumulated gradient slot
            ag = self.get_slot(var, 'ag')

            old_update = K.update

            def new_update(x, new_x):
                new_x = K.switch(cond, new_x, x)
                return old_update(x, new_x)

            K.update = new_update
            ag_t = ag / self.grad_accum_steps
            op = super(new_optimizer, self)._resource_apply_op(ag_t, var)
            K.update = old_update

            # Accumulate gradients
            with tf.control_dependencies([op]):
                ag_t = K.switch(cond, K.zeros_like(ag), ag)
                with tf.control_dependencies([K.update(ag, ag_t)]):
                    if indices is None:
                        ag_t = K.update(ag, ag + grad)
                    else:
                        ag_t = self._resource_scatter_add(ag, indices, grad)

            return ag_t
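A plain-Python sketch of the accumulate-then-apply pattern above, with assumed toy values: gradients are summed over `grad_accum_steps` batches, the averaged gradient is applied once, and the accumulator is reset:

grad_accum_steps, lr = 4, 0.1
accum, param = 0.0, 1.0
for step in range(1, 13):
    grad = 2.0 * param                            # per-batch gradient of param**2
    accum += grad                                 # accumulate
    if step % grad_accum_steps == 0:
        param -= lr * accum / grad_accum_steps    # apply averaged gradient
        accum = 0.0                               # reset the accumulator
print(param)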
Example #5
File: adax.py  Project: sunnyhuma171/adax
    def _resource_apply(self, grad, var, indices=None):
        # Prepare variables
        var_dtype = var.dtype.base_dtype
        lr_t = self._decayed_lr(var_dtype)
        m = self.get_slot(var, 'm')
        v = self.get_slot(var, 'v')
        beta_1_t = self._get_hyper('beta_1', var_dtype)
        beta_2_t = self._get_hyper('beta_2', var_dtype)
        epsilon_t = K.cast(self.epsilon, var_dtype)
        local_step = K.cast(self.iterations + 1, var_dtype)

        # Update formulas
        if indices is None:
            m_t = K.update(m, beta_1_t * m + (1 - beta_1_t) * grad)
            v_t = K.update(v, (1 + beta_2_t) * v + beta_2_t * grad**2)
        else:
            mv_ops = [
                K.update(m, beta_1_t * m),
                K.update(v, (1 + beta_2_t) * v)
            ]
            with tf.control_dependencies(mv_ops):
                m_t = self._resource_scatter_add(
                    m, indices, (1 - beta_1_t) * grad
                )
                v_t = self._resource_scatter_add(v, indices, beta_2_t * grad**2)

        # Return the update op
        with tf.control_dependencies([m_t, v_t]):
            v_t = v_t / (K.pow(1.0 + beta_2_t, local_step) - 1.0)
            var_t = var - lr_t * m_t / (K.sqrt(v_t) + self.epsilon)
            return K.update(var, var_t)
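A scalar NumPy sketch of the AdaX rule above, using the paper's suggested coefficients as assumptions: the second moment grows as (1 + beta_2) * v + beta_2 * g**2 and is corrected by (1 + beta_2)**t - 1:

import numpy as np

beta_1, beta_2, lr, eps = 0.9, 1e-4, 1e-3, 1e-8
m = v = 0.0
p = 1.0
for t in range(1, 4):
    g = 2.0 * p                                   # gradient of p**2
    m = beta_1 * m + (1 - beta_1) * g
    v = (1 + beta_2) * v + beta_2 * g ** 2
    v_hat = v / ((1 + beta_2) ** t - 1)           # bias correction
    p -= lr * m / (np.sqrt(v_hat) + eps)
print(p)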
Example #6
    def get_updates(self, loss, params):
        assert len(params) == len(self.multipliers)
        grads = self.get_gradients(loss, params)
        self.updates = [K.update_add(self.iterations, 1)]

        lr = self.lr
        if self.initial_decay > 0:
            lr *= (1. /
                   (1. +
                    self.decay * K.cast(self.iterations, K.dtype(self.decay))))
        # momentum
        shapes = [K.int_shape(p) for p in params]
        moments = [K.zeros(shape) for shape in shapes]
        self.weights = [self.iterations] + moments
        for p, g, m, mult in zip(params, grads, moments, self.multipliers):
            v = self.momentum * m - (lr * mult) * g  # velocity
            self.updates.append(K.update(m, v))

            if self.nesterov:
                new_p = p + self.momentum * v - lr * g
            else:
                new_p = p + v

            # Apply constraints.
            if getattr(p, 'constraint', None) is not None:
                new_p = p.constraint(new_p)

            self.updates.append(K.update(p, new_p))
        return self.updates
Example #7
    def get_updates(self, loss, params):
        grads = self.get_gradients(loss, params)
        self.updates = [K.update_add(self.iterations, 1)]

        lr = self.lr
        if self.initial_decay > 0:
            lr = lr * (1. / (1. + self.decay * K.cast(self.iterations,
                                                      K.dtype(self.decay))))

        t = K.cast(self.iterations, K.floatx()) + 1
        lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /
                     (1. - K.pow(self.beta_1, t)))

        ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        vhats = [K.zeros(1) for _ in params]
        self.weights = [self.iterations] + ms + vs + vhats

        for p, g, m, v, vhat in zip(params, grads, ms, vs, vhats):
            g2 = K.square(g)
            m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
            v_t = v - (1. - self.beta_2) * K.sign(v - g2) * g2
            p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon)

            self.updates.append(K.update(m, m_t))
            self.updates.append(K.update(v, v_t))
            new_p = p_t

            # Apply constraints.
            if getattr(p, 'constraint', None) is not None:
                new_p = p.constraint(new_p)

            self.updates.append(K.update(p, new_p))
        return self.updates
Example #8
    def get_updates(self, loss, params):
        grads = self.get_gradients(loss, params)
        accumulators = [
            K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params
        ]
        self.weights = accumulators
        self.updates = [K.update_add(self.iterations, 1)]

        lr = self.lr
        if self.initial_decay > 0:
            lr = lr * (1. / (1. + self.decay *
                             K.cast(self.iterations, K.dtype(self.decay))))

        for i, (p, g, a) in enumerate(zip(params, grads, accumulators)):
            # update accumulator
            rho = (0.5 if i == 0 and self.e2efs_layer is not None
                   and not self.lr_momentum else self.rho)
            i_lr = (self.e2efs_lr if i == 0 and self.e2efs_layer is not None
                    and not self.lr_momentum else lr)
            new_a = rho * a + (1. - rho) * K.square(g)
            self.updates.append(K.update(a, new_a))
            new_p = p - i_lr * g / (K.sqrt(new_a) + self.epsilon)

            # Apply constraints.
            if getattr(p, 'constraint', None) is not None:
                new_p = p.constraint(new_p)

            self.updates.append(K.update(p, new_p))
        return self.updates
Example #9
    def get_updates(self, loss, params):
        grads = self.get_gradients(loss, params)
        self.updates = [K.update_add(self.iterations, 1)]

        lr = self.lr
        adam_lr = self.adam_lr
        if self.initial_decay > 0:
            lr = lr * (1. / (1. + self.decay *
                             K.cast(self.iterations, K.dtype(self.decay))))
            adam_lr = adam_lr * (1. / (1. + self.decay * K.cast(
                self.iterations, K.dtype(self.decay))))

        t = K.cast(self.iterations, K.floatx()) + 1
        adam_lr_t = adam_lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /
                               (1. - K.pow(self.beta_1, t)))

        # momentum
        shapes = [K.int_shape(p) for p in params]
        moments = [K.zeros(shape) for shape in shapes]
        if self.amsgrad:
            vhats = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        else:
            vhats = [K.zeros(1) for _ in params]
        self.ms = K.zeros(K.int_shape(params[0]), dtype=K.dtype(params[0]))
        self.vs = K.zeros(K.int_shape(params[0]), dtype=K.dtype(params[0]))
        self.weights = ([self.iterations] + moments + vhats +
                        [self.ms, self.vs])
        for i, (p, g, m, vhat) in enumerate(zip(params, grads, moments,
                                                vhats)):
            v = self.momentum * m - lr * g  # velocity
            self.updates.append(K.update(m, v))

            if self.nesterov:
                new_p = p + self.momentum * v - lr * g
            else:
                new_p = p + v

            if i == 0 and self.e2efs_layer is not None:
                nnz = K.sum(K.cast(K.greater(p, 0.), K.floatx()))
                m_t = (self.beta_1 * self.ms) + (1. - self.beta_1) * g
                v_t = (self.beta_2 *
                       self.vs) + (1. - self.beta_2) * K.square(g)
                if self.amsgrad:
                    vhat_t = K.maximum(vhat, v_t)
                    p_t = p - adam_lr_t * m_t / (K.sqrt(vhat_t) + K.epsilon())
                    self.updates.append(K.update(vhat, vhat_t))
                else:
                    p_t = p - adam_lr_t * m_t / (K.sqrt(v_t) + K.epsilon())

                self.updates.append(K.update(self.ms, m_t))
                self.updates.append(K.update(self.vs, v_t))
                new_p = K.switch(K.less_equal(nnz, self.e2efs_layer.units),
                                 new_p, p_t)

            # Apply constraints.
            if getattr(p, 'constraint', None) is not None:
                new_p = p.constraint(new_p)

            self.updates.append(K.update(p, new_p))
        return self.updates
Example #10
    def get_updates(self, params, loss):
        grads = self.get_gradients(loss, params)
        shapes = [K.get_variable_shape(p) for p in params]
        alphas = [
            K.variable(K.ones(shape) * self.init_alpha) for shape in shapes
        ]
        old_grads = [K.zeros(shape) for shape in shapes]
        self.weights = alphas + old_grads
        self.updates = []

        for p, grad, old_grad, alpha in zip(params, grads, old_grads, alphas):
            grad = K.sign(grad)
            new_alpha = K.switch(
                K.greater(grad * old_grad, 0),
                K.minimum(alpha * self.scale_up, self.max_alpha),
                K.switch(K.less(grad * old_grad, 0),
                         K.maximum(alpha * self.scale_down, self.min_alpha),
                         alpha))

            grad = K.switch(K.less(grad * old_grad, 0), K.zeros_like(grad),
                            grad)
            new_p = p - grad * new_alpha

            # Apply constraints.
            if getattr(p, 'constraint', None) is not None:
                new_p = p.constraint(new_p)
            self.updates.append(K.update(p, new_p))
            self.updates.append(K.update(alpha, new_alpha))
            self.updates.append(K.update(old_grad, grad))

        return self.updates
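A scalar sketch of the Rprop behaviour above, with assumed factors and bounds: the per-weight step grows when consecutive gradient signs agree, shrinks when they flip, and a flipped step is skipped:

scale_up, scale_down = 1.2, 0.5
max_alpha, min_alpha = 50.0, 1e-6
alpha, old_sign, p = 0.1, 0.0, 1.0
for _ in range(10):
    sign = 1.0 if p > 0 else -1.0        # sign of the gradient of |p|
    if sign * old_sign > 0:
        alpha = min(alpha * scale_up, max_alpha)
    elif sign * old_sign < 0:
        alpha = max(alpha * scale_down, min_alpha)
        sign = 0.0                       # zero the gradient on a sign flip
    p -= sign * alpha
    old_sign = sign
print(p, alpha)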
Example #11
    def __call__(self, y_true,
                 y_pred):  # y_true, y_pred shape: (BS, row, col, ch * 2)
        data_true, generator_true = y_true[:, :, :, 0:3], y_true[:, :, :, 3:6]
        data_pred, generator_pred = y_pred[:, :, :, 0:3], y_pred[:, :, :, 3:6]
        loss_data = K.mean(K.abs(data_true - data_pred), axis=[1, 2, 3])
        loss_generator = K.mean(K.abs(generator_true - generator_pred),
                                axis=[1, 2, 3])
        ret = loss_data - self.k_var * loss_generator

        # for updating values in each epoch, use `updates` mechanism
        # DiscriminatorModel collects Loss Function's updates attributes
        mean_loss_data = K.mean(loss_data)
        mean_loss_gen = K.mean(loss_generator)

        # update K
        new_k = self.k_var + self.lambda_k * (self.gamma * mean_loss_data -
                                              mean_loss_gen)
        new_k = K.clip(new_k, 0, 1)
        self.updates.append(K.update(self.k_var, new_k))

        # calculate M-Global
        m_global = mean_loss_data + K.abs(self.gamma * mean_loss_data -
                                          mean_loss_gen)
        self.updates.append(K.update(self.m_global_var, m_global))

        # let loss_real_x mean_loss_data
        self.updates.append(K.update(self.loss_real_x_var, mean_loss_data))

        # let loss_gen_x mean_loss_gen
        self.updates.append(K.update(self.loss_gen_x_var, mean_loss_gen))

        return ret
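A toy sketch of the BEGAN bookkeeping above, with assumed values: k is nudged by lambda_k toward the balance gamma * L_data - L_gen and clipped to [0, 1], and the convergence measure M_global is tracked:

lambda_k, gamma = 0.001, 0.5
k, loss_data, loss_gen = 0.0, 0.8, 0.3

k = k + lambda_k * (gamma * loss_data - loss_gen)
k = min(max(k, 0.0), 1.0)                          # clip to [0, 1]
m_global = loss_data + abs(gamma * loss_data - loss_gen)
print(k, m_global)  # 0.0001, 0.9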
Example #12
    def get_updates(self, loss, params):
        grads = self.get_gradients(loss, params)
        accumulators = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        self.weights = accumulators
        self.updates = [K.update_add(self.iterations, 1)]

        lr = self.lr
        if self.initial_decay > 0:
            lr = lr * (1. / (1. + self.decay * K.cast(self.iterations,
                                                      K.dtype(self.decay))))

        for p, g, a in zip(params, grads, accumulators):
            # update accumulator
            new_a = self.rho * a + (1. - self.rho) * K.square(g)
            self.updates.append(K.update(a, new_a))
            new_p = p - lr * g / (K.sqrt(new_a) + self.epsilon)

            # Apply constraints.
            if getattr(p, 'constraint', None) is not None:
                new_p = p.constraint(new_p)
            
            clptrkey = set_pattern_find(p.name, self.clips.keys())
            if self.clips_val and clptrkey:
                c = K.eval(self.clips[clptrkey])
                if self.verbose > 0:
                    print("Clipping variable", p.name, " to ", c)
                new_p = K.clip(new_p, c[0], c[1])

            self.updates.append(K.update(p, new_p))
        return self.updates
Example #13
    def get_updates(self, loss, params):
        grads = self.get_gradients(loss, params)
        self.updates = [K.update_add(self.iterations, 1)]

        lr = self.lr
        if self.initial_decay > 0:
            lr = lr * (1. / (1. + self.decay *
                             K.cast(self.iterations, K.dtype(self.decay))))

        t = K.cast(self.iterations, K.floatx()) + 1

        # Applies bounds on actual learning rate
        step_size = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /
                          (1. - K.pow(self.beta_1, t)))

        final_lr = self.final_lr * lr / self.base_lr
        lower_bound = final_lr * (1. - 1. / (self.gamma * t + 1.))
        upper_bound = final_lr * (1. + 1. / (self.gamma * t))

        ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        if self.amsbound:
            vhats = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        else:
            vhats = [K.zeros(1) for _ in params]
        self.weights = [self.iterations] + ms + vs + vhats

        for p, g, m, v, vhat in zip(params, grads, ms, vs, vhats):
            # apply weight decay
            if self.weight_decay != 0.:
                g += self.weight_decay * K.stop_gradient(p)

            m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
            v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)

            if self.amsbound:
                vhat_t = K.maximum(vhat, v_t)
                denom = (K.sqrt(vhat_t) + self.epsilon)
                self.updates.append(K.update(vhat, vhat_t))
            else:
                denom = (K.sqrt(v_t) + self.epsilon)

            # Compute the bounds
            step_size_p = step_size * K.ones_like(denom)
            step_size_p_bound = step_size_p / denom
            bounded_lr_t = m_t * K.minimum(
                K.maximum(step_size_p_bound, lower_bound), upper_bound)

            p_t = p - bounded_lr_t

            self.updates.append(K.update(m, m_t))
            self.updates.append(K.update(v, v_t))
            new_p = p_t

            # Apply constraints.
            if getattr(p, 'constraint', None) is not None:
                new_p = p.constraint(new_p)

            self.updates.append(K.update(p, new_p))
        return self.updates
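A NumPy sketch of the AdaBound clipping above, assuming toy values: the element-wise step `step_size / denom` is clipped between bounds that tighten toward `final_lr` as t grows:

import numpy as np

lr, final_lr, gamma = 0.001, 0.1, 1e-3
denom = np.sqrt(1e-4)                     # stand-in for sqrt(v_t) + epsilon
for t in (1.0, 100.0, 10000.0):
    lower = final_lr * (1.0 - 1.0 / (gamma * t + 1.0))
    upper = final_lr * (1.0 + 1.0 / (gamma * t))
    step = np.clip(lr / denom, lower, upper)
    print(t, lower, upper, step)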
Example #14
    def get_updates(self, loss, params):
        grads = self.get_gradients(loss, params)
        self.updates = [K.update_add(self.iterations, 1)]

        lr = self.learning_rate
        if self.initial_decay > 0:
            lr *= (1. / (1. + self.decay * K.cast(self.iterations,
                                                  K.dtype(self.decay))))
        # momentum
        shapes = [K.int_shape(p) for p in params]
        moments = [K.zeros(shape) for shape in shapes]
        self.weights = [self.iterations] + moments
        for p, g, m in zip(params, grads, moments):

            matched_layer = [x for x in self.lr_multipliers.keys() if x in p.name]
            if matched_layer:
                new_lr = lr * self.lr_multipliers[matched_layer[0]]
            else:
                new_lr = lr

            v = self.momentum * m - new_lr * g  # velocity
            self.updates.append(K.update(m, v))

            if self.nesterov:
                new_p = p + self.momentum * v - new_lr * g
            else:
                new_p = p + v

            # Apply constraints.
            if getattr(p, 'constraint', None) is not None:
                new_p = p.constraint(new_p)

            self.updates.append(K.update(p, new_p))
        return self.updates
Example #15
    def get_updates(self, loss, params):
        """ Obtain the optimizer loss updates.

        Parameters
        ----------
        loss: list
            List of tensors

        params: list
            List of tensors

        Returns
        -------
        list
            List of tensors
        """
        grads = self.get_gradients(loss, params)
        self.updates = [K.update_add(self.iterations, 1)]

        lr = self.lr
        if self.initial_decay > 0:
            lr = lr * (1. / (1. + self.decay *
                             K.cast(self.iterations, K.dtype(self.decay))))

        t = K.cast(self.iterations, K.floatx()) + 1
        lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /
                     (1. - K.pow(self.beta_1, t)))

        # Pass off to CPU if requested
        if self.cpu_mode:
            with K.tf.device("/cpu:0"):
                ms, vs, vhats = self._update_1(params)
        else:
            ms, vs, vhats = self._update_1(params)

        self.weights = [self.iterations] + ms + vs + vhats

        for p, g, m, v, vhat in zip(params, grads, ms, vs, vhats):
            m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
            v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)
            if self.amsgrad:
                vhat_t = K.maximum(vhat, v_t)
                p_t = p - lr_t * m_t / (K.sqrt(vhat_t) + self.epsilon)
                self.updates.append(K.update(vhat, vhat_t))
            else:
                p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon)

            self.updates.append(K.update(m, m_t))
            self.updates.append(K.update(v, v_t))
            new_p = p_t

            # Apply constraints.
            if getattr(p, 'constraint', None) is not None:
                new_p = p.constraint(new_p)

            self.updates.append(K.update(p, new_p))
        return self.updates
Example #16
 def update_func():
     update_params = [
         K.update(q, p * self.slow_ratio + q * self.fast_ratio)
         for p, q in zip(self.slow_params, params)
     ]
     with tf.control_dependencies(update_params):
         reset_slow_params = [
             K.update(p, q) for p, q in zip(self.slow_params, params)
         ]
     return tf.group(*(reset_slow_params + update_iter))
Example #17
    def get_updates(self, loss, params):
        grads = self.get_gradients(loss, params)
        self.updates = [K.update_add(self.iterations, 1)]

        lr = self.lr
        if self.initial_decay > 0:
            lr = lr * (1. / (1. + self.decay *
                             K.cast(self.iterations, K.dtype(self.decay))))

        t = K.cast(self.iterations, K.floatx()) + 1
        '''Bias corrections according to the Adam paper
        '''
        lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /
                     (1. - K.pow(self.beta_1, t)))

        ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        self.weights = [self.iterations] + ms + vs

        for p, g, m, v in zip(params, grads, ms, vs):

            ####################################################
            # Add a lr multiplier for vars outside excluded_vars
            if p.name in self.excluded_vars:
                multiplied_lr_t = lr_t
            else:
                multiplied_lr_t = lr_t * self.lr_mult
            ###################################################

            m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
            v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)
            '''Schedule multiplier eta_t = 1 for simple AdamW
            According to the AdamW paper, eta_t can be fixed, decay, or 
            also be used for warm restarts (AdamWR to come). 
            '''
            eta_t = 1.
            p_t = p - eta_t * (multiplied_lr_t * m_t /
                               (K.sqrt(v_t) + self.epsilon))
            if self.weight_decay != 0:
                '''Normalized weight decay according to the AdamW paper
                '''
                w_d = self.weight_decay * K.sqrt(
                    self.batch_size / (self.samples_per_epoch * self.epochs))
                p_t = p_t - eta_t * (w_d * p)

            self.updates.append(K.update(m, m_t))
            self.updates.append(K.update(v, v_t))
            new_p = p_t

            # Apply constraints.
            if getattr(p, 'constraint', None) is not None:
                new_p = p.constraint(new_p)

            self.updates.append(K.update(p, new_p))
        return self.updates
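A one-liner sketch of the normalized weight decay computed above, per the AdamW paper, with assumed toy values for the batch size, samples per epoch, and epoch count:

import math

weight_decay = 0.025
batch_size, samples_per_epoch, epochs = 32, 50000, 10
w_d = weight_decay * math.sqrt(batch_size / (samples_per_epoch * epochs))
print(w_d)  # ~0.0002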
Example #18
def add_weightnorm_param_updates(updates, new_V_param, new_g_param, W,
                                 V_scaler):
    ps = K.get_variable_shape(new_V_param)
    norm_axes = [i for i in range(len(ps) - 1)]

    # update W and V_scaler
    new_V_norm = tf.sqrt(tf.reduce_sum(tf.square(new_V_param), norm_axes))
    new_V_scaler = new_g_param / new_V_norm
    new_W = tf.reshape(new_V_scaler, [1] * len(norm_axes) + [-1]) * new_V_param
    updates.append(K.update(W, new_W))
    updates.append(K.update(V_scaler, new_V_scaler))
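A NumPy sketch of the weight-normalization identity maintained above, with assumed shapes: W = g * V / ||V||, the norm taken over every axis except the last:

import numpy as np

V = np.random.randn(3, 4)                    # direction
g = 2.0 * np.ones(4)                         # per-unit scale
V_norm = np.sqrt((V ** 2).sum(axis=0))
V_scaler = g / V_norm
W = V_scaler[None, :] * V
print(np.sqrt((W ** 2).sum(axis=0)))         # equals g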
Example #19
    def get_updates(self, loss, params):
        grads = self.get_gradients(loss, params)
        self.updates = [K.update_add(self.iterations, 1)]
        self.weights = [self.iterations]
        lr = self.learning_rate

        for i, (p, g) in enumerate(zip(params, grads)):
            g2 = K.square(g) + self.epsilon1
            shape, dtype = K.int_shape(p), K.dtype(p)
            factored_shape = self.factored_shape(shape)
            if factored_shape is None:
                # Define the second-moment accumulator
                v = K.zeros(shape, dtype=dtype, name='v_' + str(i))
                self.weights.append(v)
                # Define its update
                v_t = self.beta2 * v + (1.0 - self.beta2) * g2
                self.updates.append(K.update(v, v_t))
            else:
                # Define factored accumulators
                shape1, axis1, shape2, axis2 = factored_shape
                vr = K.zeros(shape1, dtype=dtype, name='vr_' + str(i))
                vc = K.zeros(shape2, dtype=dtype, name='vc_' + str(i))
                self.weights.extend([vr, vc])
                # Define their updates
                g2r = K.mean(g2, axis=axis1, keepdims=True)
                g2c = K.mean(g2, axis=axis2, keepdims=True)
                vr_t = self.beta2 * vr + (1.0 - self.beta2) * g2r
                vc_t = self.beta2 * vc + (1.0 - self.beta2) * g2c
                self.updates.extend([K.update(vr, vr_t), K.update(vc, vc_t)])
                # Recompose the full second moment
                v_t = vr_t * vc_t / K.mean(vr_t, axis=axis2, keepdims=True)
            # Base update
            u = g / K.sqrt(v_t)
            # Clip the update
            if self.clipping_threshold is not None:
                u_rms = self.rms(u)
                d = self.clipping_threshold
                u = u / K.maximum(1.0, u_rms / d)
            # Smooth the update with momentum
            if self.beta1 > 0.0:
                # Define the momentum accumulator
                m = K.zeros(shape, dtype=dtype, name='m_' + str(i))
                self.weights.append(m)
                # Define its update
                m_t = self.beta1 * m + (1.0 - self.beta1) * u
                self.updates.append(K.update(m, m_t))
                u = m_t
            # Rescale by the parameter scale
            if self.multiply_by_parameter_scale:
                u = u * K.maximum(self.rms(p), self.epsilon2)
            # Apply the parameter update
            self.updates.append(K.update(p, p - lr * u))

        return self.updates
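A NumPy sketch of the factored second moment recomposed above, with toy shapes: the full matrix is approximated from row and column statistics as vr * vc / mean(vr):

import numpy as np

g2 = np.random.rand(3, 4)                    # squared gradients
vr = g2.mean(axis=1, keepdims=True)          # row statistics, shape (3, 1)
vc = g2.mean(axis=0, keepdims=True)          # column statistics, shape (1, 4)
v_approx = vr * vc / vr.mean(axis=0, keepdims=True)
print(v_approx.shape)                        # (3, 4)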
Example #20
    def get_updates(self, loss, params):
        # Mostly the same code as Adam class, with added multiplier variables.
        # Keras code from:
        # https://github.com/tensorflow/tensorflow/blob/r1.12/tensorflow/python/keras/optimizers.py#L456
        grads = self.get_gradients(loss, params)
        self.updates = [K.update_add(self.iterations, 1)]

        lr = self.lr
        if self.initial_decay > 0:
            lr = lr * (
                1.0 / (1.0 + self.decay * K.cast(self.iterations, K.dtype(self.decay)))
            )

        t = K.cast(self.iterations, K.floatx()) + 1
        lr_t = lr * (
            K.sqrt(1.0 - K.pow(self.beta_2, t)) / (1.0 - K.pow(self.beta_1, t))
        )

        ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        if self.amsgrad:
            vhats = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        else:
            vhats = [K.zeros(1) for _ in params]
        self.weights = [self.iterations] + ms + vs + vhats

        for p, g, m, v, vhat in zip(params, grads, ms, vs, vhats):
            layername = p.name.split("/", 1)[0]
            mult = self.multipliers.get(layername, 1.0)

            m_t = (self.beta_1 * m) + (1.0 - self.beta_1) * g
            v_t = (self.beta_2 * v) + (1.0 - self.beta_2) * K.square(g)

            if self.amsgrad:
                vhat_t = K.maximum(vhat, v_t)
                p_t = p - mult * lr_t * m_t / (K.sqrt(vhat_t) + self.epsilon)
                self.updates.append(K.update(vhat, vhat_t))
            else:
                p_t = p - mult * lr_t * m_t / (K.sqrt(v_t) + self.epsilon)

            self.updates.append(K.update(m, m_t))
            self.updates.append(K.update(v, v_t))
            new_p = p_t

            # Apply constraints.
            if getattr(p, "constraint", None) is not None:
                new_p = p.constraint(new_p)

            self.updates.append(K.update(p, new_p))
        return self.updates
Example #21
        def south_good(loss, pred, lossS, tape, velocities):
            gradients = tape.gradient(lossS, trainable_vars)

            grad_mag_sq = tf.reduce_sum(
                [tf.reduce_sum(K.square(g)) for g in gradients])
            norm = 1.0 / grad_mag_sq

            vprimes = []
            for (v, g, accum) in zip(velocities, gradients,
                                     self._gradient_ms_accums):
                vflat, gflat = K.flatten(v), -K.flatten(g)
                vprime = K.reshape(2 * K.sum(vflat * gflat) * norm * g,
                                   v.shape) - v
                vprimes.append(vprime)

                new_accum = (self.galilean_gradient_momentum * accum +
                             (1 - self.galilean_gradient_momentum) *
                             tf.reduce_mean(K.square(g)))
                K.update(accum, new_accum)

            # x0 - v (S) -> x0 + vprime (E)
            for (w, v, vprime) in zip(trainable_vars, velocities, vprimes):
                K.update_add(w, v + vprime)

            predE = self(x, training=True)
            lossE = K.mean(loss_fun(y, predE))

            # x0 + vprime (E) -> x0 - vprime (W)
            for (w, v, vprime) in zip(trainable_vars, velocities, vprimes):
                K.update_add(w, -2 * vprime)

            predW = self(x, training=True)
            lossW = K.mean(loss_fun(y, predW))

            go_east = tf.math.logical_and(lossE <= loss0, lossW > loss0)
            go_west = tf.math.logical_and(lossW <= loss0, lossE > loss0)

            # x0 - vprime (W) -> x0
            for (w, vprime) in zip(trainable_vars, vprimes):
                K.update_add(w, vprime)

            east_good_f = functools.partial(east_good, vprimes)
            west_good_f = functools.partial(west_good, vprimes)
            fallback_case_f = functools.partial(fallback_case, velocities)
            new_v, gradient_calcs = tf.cond(
                go_east, east_good_f,
                lambda: tf.cond(go_west, west_good_f, fallback_case_f))

            return loss, pred, new_v, gradient_calcs
Example #22
    def get_updates(self, params, loss):
        grads = self.get_gradients(loss, params)
        shapes = [K.shape(p) for p in params]
        alphas = [
            K.variable(K.ones(shape) * self.init_alpha) for shape in shapes
        ]
        old_grads = [K.zeros(shape) for shape in shapes]
        prev_weight_deltas = [K.zeros(shape) for shape in shapes]
        self.weights = alphas + old_grads
        self.updates = []

        for param, grad, old_grad, prev_weight_delta, alpha in zip(
                params, grads, old_grads, prev_weight_deltas, alphas):
            # equation 4
            new_alpha = K.switch(
                K.greater(grad * old_grad, 0),
                K.minimum(alpha * self.scale_up, self.max_alpha),
                K.switch(K.less(grad * old_grad, 0),
                         K.maximum(alpha * self.scale_down, self.min_alpha),
                         alpha))

            # equation 5
            new_delta = K.switch(
                K.greater(grad, 0), -new_alpha,
                K.switch(K.less(grad, 0), new_alpha, K.zeros_like(new_alpha)))

            # equation 7
            weight_delta = K.switch(K.less(grad * old_grad, 0),
                                    -prev_weight_delta, new_delta)

            # equation 6
            new_param = param + weight_delta

            # reset gradient_{t-1} to 0 if gradient sign changed (so that we do
            # not "double punish", see paragraph after equation 7)
            grad = K.switch(K.less(grad * old_grad, 0), K.zeros_like(grad),
                            grad)

            # Apply constraints
            #if param in constraints:
            #    c = constraints[param]
            #    new_param = c(new_param)

            self.updates.append(K.update(param, new_param))
            self.updates.append(K.update(alpha, new_alpha))
            self.updates.append(K.update(old_grad, grad))
            self.updates.append(K.update(prev_weight_delta, weight_delta))

        return self.updates
Example #23
 def update_function():
     with tensorflow.control_dependencies(orig_get_updates(
             loss, params)):
         reset_grads = [
             K.update(p, K.zeros(K.int_shape(p), dtype=K.dtype(p)))
             for p in self.accumulate_gradient_accumulators
         ]
         if ema_decay > 0:
             reset_grads += [K.update_add(self.total_iterations, 1)]
             reset_grads += [
                 K.update(e_p, (e_p * ema_decay) + (1 - ema_decay) * p)
                 for e_p, p in zip(self.params_ema, params)
             ]
     return tensorflow.group(*(reset_grads +
                               [updates_accumulated_iterations]))
Example #24
    def get_updates(self, params, loss):
        grads = self.get_gradients(loss, params)
        self.updates = [K.update_add(self.iterations, 1)]

        lr = self.lr
        if self.initial_decay > 0:
            lr *= (1. / (1. + self.decay * self.iterations))

        t = self.iterations + 1
        lr_t = lr * K.sqrt(1. - K.pow(self.beta_2, t)) / (
            1. - K.pow(self.beta_1, t))

        shapes = [K.get_variable_shape(p) for p in params]
        ms = [K.zeros(shape) for shape in shapes]
        vs = [K.zeros(shape) for shape in shapes]
        f = K.variable(0)
        d = K.variable(1)
        self.weights = [self.iterations] + ms + vs + [f, d]

        cond = K.greater(t, K.variable(1))
        small_delta_t = K.switch(K.greater(loss, f), self.small_k + 1,
                                 1. / (self.big_K + 1))
        big_delta_t = K.switch(K.greater(loss, f), self.big_K + 1,
                               1. / (self.small_k + 1))

        c_t = K.minimum(K.maximum(small_delta_t, loss / (f + self.epsilon)),
                        big_delta_t)
        f_t = c_t * f
        r_t = K.abs(f_t - f) / (K.minimum(f_t, f))
        d_t = self.beta_3 * d + (1 - self.beta_3) * r_t

        f_t = K.switch(cond, f_t, loss)
        d_t = K.switch(cond, d_t, K.variable(1.))

        self.updates.append(K.update(f, f_t))
        self.updates.append(K.update(d, d_t))

        for p, g, m, v in zip(params, grads, ms, vs):
            m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
            v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)
            p_t = p - lr_t * m_t / (d_t * K.sqrt(v_t) + self.epsilon)

            self.updates.append(K.update(m, m_t))
            self.updates.append(K.update(v, v_t))

            new_p = p_t
            self.updates.append(K.update(p, new_p))
        return self.updates
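A scalar sketch of the Eve feedback computed above, with assumed coefficients: the smoothed relative change of the loss yields d_t, which rescales the Adam denominator:

beta_3, big_K, small_k, eps = 0.999, 10.0, 0.1, 1e-8
f, d, loss = 1.0, 1.0, 0.8

lo = small_k + 1.0 if loss > f else 1.0 / (big_K + 1.0)
hi = big_K + 1.0 if loss > f else 1.0 / (small_k + 1.0)
c = min(max(lo, loss / (f + eps)), hi)
f_t = c * f                                  # smoothed loss target
r = abs(f_t - f) / min(f_t, f)               # relative change
d = beta_3 * d + (1.0 - beta_3) * r
print(f_t, d)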
Example #25
    def get_updates(self, loss, params):
        grads = self.get_gradients(loss, params)
        self.updates = [K.update_add(self.iterations, 1)]

        lr = self.lr
        if self.initial_decay > 0:
            lr = lr * (1. / (1. + self.decay * K.cast(self.iterations,
                                                      K.dtype(self.decay))))

        t = K.cast(self.iterations, K.floatx()) + 1
        lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /
                     (1. - K.pow(self.beta_1, t)))

        ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        if self.amsgrad:
            vhats = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        else:
            vhats = [K.zeros(1) for _ in params]
        self.weights = [self.iterations] + ms + vs + vhats

        for p, g, m, v, vhat in zip(params, grads, ms, vs, vhats):
            m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
            v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)
            if self.amsgrad:
                vhat_t = K.maximum(vhat, v_t)
                p_t = p - lr_t * m_t / (K.sqrt(vhat_t) + self.epsilon)
                self.updates.append(K.update(vhat, vhat_t))
            else:
                p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon)

            self.updates.append(K.update(m, m_t))
            self.updates.append(K.update(v, v_t))
            new_p = p_t

            # Apply constraints.
            if getattr(p, 'constraint', None) is not None:
                new_p = p.constraint(new_p)
                
            clptrkey = set_pattern_find(p.name, self.clips.keys())
            if self.clips_val and clptrkey:
                c = K.eval(self.clips[clptrkey])
                if self.verbose > 0:
                    print("Clipping variable", p.name, " to ", c)
                new_p = K.clip(new_p, c[0], c[1])

            self.updates.append(K.update(p, new_p))
        return self.updates
Example #26
    def get_updates(self, loss, params):
        grads = self.get_gradients(loss, params)
        self.updates = [K.update_add(self.iterations, 1)]

        lr = self.lr
        if self.initial_decay > 0:
            lr = lr * (1. / (1. + self.decay *
                             K.cast(self.iterations, K.dtype(self.decay))))

        t = K.cast(self.iterations, K.floatx()) + 1

        ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        self.weights = [self.iterations] + ms + vs

        for p, g, m, v in zip(params, grads, ms, vs):
            m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
            v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)

            m_t_hat = m_t / (1. - K.pow(self.beta_1, t))
            v_t_hat = v_t / (1. - K.pow(self.beta_2, t))

            p_dash = m_t_hat / (K.sqrt(v_t_hat + self.epsilon))

            if self.weight_decay > 0.:
                wd = self.weight_decay * p
                p_dash = p_dash + wd

            r1 = K.sqrt(K.sum(K.square(p)))
            r2 = K.sqrt(K.sum(K.square(p_dash)))

            r = tf.where(tf.greater(r1, 0.),
                         tf.where(tf.greater(r2, 0.), r1 / r2, 1.0), 1.0)
            # r = r1 / r2
            eta = r * lr

            p_t = p - eta * p_dash

            self.updates.append(K.update(m, m_t))
            self.updates.append(K.update(v, v_t))
            new_p = p_t

            # Apply constraints.
            if getattr(p, 'constraint', None) is not None:
                new_p = p.constraint(new_p)

            self.updates.append(K.update(p, new_p))
        return self.updates
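A NumPy sketch of the trust-ratio scaling applied above, with toy tensors: the layer-wise rate is lr * ||p|| / ||update|| when both norms are positive, else just lr:

import numpy as np

lr = 0.001
p = np.random.randn(3, 3)
update = 0.01 * np.random.randn(3, 3)        # stand-in for m_hat / sqrt(v_hat) + wd * p
r1 = np.sqrt((p ** 2).sum())
r2 = np.sqrt((update ** 2).sum())
r = r1 / r2 if r1 > 0 and r2 > 0 else 1.0
eta = r * lr                                 # trust-ratio-scaled learning rate
print(eta)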
Example #27
 def update_function():
     with tf.control_dependencies(orig_get_updates(loss, params)):
         reset_grads = [
             K.update(p, K.zeros(K.int_shape(p), dtype=K.dtype(p)))
             for p in self.accumulated_grads
         ]
     return tf.group(*(reset_grads + [iters]))
Example #28
    def get_updates(self, loss, params):
        grads = self.get_gradients(loss, params)
        self.updates = [(self.iterations, self.iterations + 1)]

        t = self.iterations + 1
        lr_t = self.lr * K.sqrt(1. - K.pow(self.beta_2, t)) / (
            1. - K.pow(self.beta_1, t))

        ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        gs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        self.weights = ms + vs

        flag = K.equal(t % self.accum_iters, 0)
        flag = K.cast(flag, dtype='float32')

        for p, g, m, v, gg in zip(params, grads, ms, vs, gs):

            gg_t = (1 - flag) * (gg + g)
            m_t = (self.beta_1 *
                   m) + (1. - self.beta_1) * (gg + flag * g) / self.accum_iters
            v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(
                (gg + flag * g) / self.accum_iters)
            p_t = p - flag * lr_t * m_t / (K.sqrt(v_t) + self.epsilon)

            self.updates.append((m, flag * m_t + (1 - flag) * m))
            self.updates.append((v, flag * v_t + (1 - flag) * v))
            self.updates.append((gg, gg_t))

            # apply constraints.
            new_p = p_t
            if getattr(p, 'constraint', None) is not None:
                new_p = p.constraint(new_p)
            self.updates.append(K.update(p, new_p))
        return self.updates
Example #29
    def call(self, inputs, **kwargs):
        inputs, memory_length = inputs
        memory_length = K.cast(memory_length[0][0], 'int32')
        batch_size = K.cast(K.shape(inputs)[0], 'int32')
        seq_len = K.cast(K.shape(inputs)[1], 'int32')

        # Build new memory
        pad = K.tile(inputs[0:1, ...], (self.batch_size - batch_size, 1, 1))
        padded = K.concatenate([inputs, pad], axis=0)              # (self.batch_size, seq_len, output_dim)
        new_memory = K.concatenate([self.memory, padded], axis=1)  # (self.batch_size, self.memory_len + seq_len, ...)
        new_memory = tf.slice(                                     # (self.batch_size, self.memory_len, output_dim)
            new_memory,
            (0, seq_len, 0),
            (self.batch_size, self.memory_len + self.target_len, self.output_dim),
        )
        self.add_update(K.update(self.memory, new_memory), inputs)

        # Build output
        old_memory = tf.slice(                                     # (batch_size, memory_length, output_dim)
            new_memory,
            (0, K.maximum(0, self.memory_len + self.target_len - seq_len - memory_length), 0),
            (batch_size, K.minimum(self.memory_len, memory_length), self.output_dim),
        )

        return old_memory
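A NumPy sketch of the memory rotation performed above, with assumed sizes: the new activations are appended along the time axis and the buffer is sliced back to a fixed length:

import numpy as np

memory_len, seq_len, dim = 5, 3, 2
memory = np.zeros((1, memory_len, dim))
inputs = np.random.randn(1, seq_len, dim)

new_memory = np.concatenate([memory, inputs], axis=1)   # (1, memory_len + seq_len, dim)
new_memory = new_memory[:, seq_len:, :]                 # keep the newest memory_len steps
print(new_memory.shape)                                 # (1, 5, 2)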
Example #30
 def _assign(self, x, y):
     if self._tf1:
         result = K.update(x, y)
     else:
         result = state_ops.assign(x, y, use_locking=self._use_locking)
     self._updates.append(result)
     return result