Example #1
 def _add(self, x, y):
     if self._tf1:
         result = K.update_add(x, y)
     else:
         result = state_ops.assign_add(x, y, use_locking=self._use_locking)
     self._updates.append(result)
     return result
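For context, a minimal sketch of what either branch does (illustrative, not part of the example): K.update_add is a thin wrapper that adds a value to a variable in place, much like state_ops.assign_add.

from tensorflow.keras import backend as K

x = K.variable(1.0)
inc = K.update_add(x, K.constant(2.0))  # adds 2.0 to x in place
print(K.get_value(x))                   # 3.0 under TF2 eager execution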
Example #2
    def get_updates(self, loss, params):
        grads = self.get_gradients(loss, params)
        accumulators = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        self.weights = accumulators
        self.updates = [K.update_add(self.iterations, 1)]

        lr = self.lr
        if self.initial_decay > 0:
            lr = lr * (1. / (1. + self.decay * K.cast(self.iterations,
                                                      K.dtype(self.decay))))

        for p, g, a in zip(params, grads, accumulators):
            # update accumulator
            new_a = self.rho * a + (1. - self.rho) * K.square(g)
            self.updates.append(K.update(a, new_a))
            new_p = p - lr * g / (K.sqrt(new_a) + self.epsilon)

            # Apply constraints.
            if getattr(p, 'constraint', None) is not None:
                new_p = p.constraint(new_p)
            
            clptrkey = set_pattern_find(p.name, self.clips.keys())
            if self.clips_val and clptrkey:
                c = K.eval(self.clips[clptrkey])
                if self.verbose > 0:
                    print("Clipping variable", p.name, " to ", c)
                new_p = K.clip(new_p, c[0], c[1])

            self.updates.append(K.update(p, new_p))
        return self.updates
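set_pattern_find is not defined in this snippet. A plausible sketch, assuming the keys of self.clips are regex patterns matched against variable names (the helper below is hypothetical and only illustrates the lookup the clipping branch relies on):

import re

def set_pattern_find(name, patterns):
    # Return the first pattern that matches the variable name, else None.
    for pattern in patterns:
        if re.search(pattern, name):
            return pattern
    return None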
Example #3
    def get_updates(self, loss, params):
        assert len(params) == len(self.multipliers)
        grads = self.get_gradients(loss, params)
        self.updates = [K.update_add(self.iterations, 1)]

        lr = self.lr
        if self.initial_decay > 0:
            lr *= (1. /
                   (1. +
                    self.decay * K.cast(self.iterations, K.dtype(self.decay))))
        # momentum
        shapes = [K.int_shape(p) for p in params]
        moments = [K.zeros(shape) for shape in shapes]
        self.weights = [self.iterations] + moments
        for p, g, m, mult in zip(params, grads, moments, self.multipliers):
            v = self.momentum * m - (lr * mult) * g  # velocity
            self.updates.append(K.update(m, v))

            if self.nesterov:
                new_p = p + self.momentum * v - lr * g
            else:
                new_p = p + v

            # Apply constraints.
            if getattr(p, 'constraint', None) is not None:
                new_p = p.constraint(new_p)

            self.updates.append(K.update(p, new_p))
        return self.updates
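The assert at the top requires self.multipliers to be a list aligned one-to-one with params; the constructor is not shown. A hedged usage sketch (the class name is hypothetical):

# Each multiplier scales the learning rate of the matching parameter only.
multipliers = [1.0, 0.1]  # e.g. full lr for the kernel, a tenth for the bias
# opt = MultiplierSGD(lr=0.01, momentum=0.9, multipliers=multipliers)
# model.compile(optimizer=opt, loss='mse')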
Example #4
    def get_updates(self, loss, params):
        grads = self.get_gradients(loss, params)
        self.updates = [K.update_add(self.iterations, 1)]

        lr = self.lr
        if self.initial_decay > 0:
            lr = lr * (1. / (1. + self.decay * K.cast(self.iterations,
                                                      K.dtype(self.decay))))

        t = K.cast(self.iterations, K.floatx()) + 1
        lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /
                     (1. - K.pow(self.beta_1, t)))

        ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        vhats = [K.zeros(1) for _ in params]
        self.weights = [self.iterations] + ms + vs + vhats

        for p, g, m, v, vhat in zip(params, grads, ms, vs, vhats):
            g2 = K.square(g)
            m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
            v_t = v - (1. - self.beta_2) * K.sign(v - g2) * g2
            p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon)

            self.updates.append(K.update(m, m_t))
            self.updates.append(K.update(v, v_t))
            new_p = p_t

            # Apply constraints.
            if getattr(p, 'constraint', None) is not None:
                new_p = p.constraint(new_p)

            self.updates.append(K.update(p, new_p))
        return self.updates
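The second-moment rule here is not Adam's exponential moving average: v moves toward g^2 by a step whose direction depends on sign(v - g^2), which matches the Yogi update (assuming this example is a Yogi port):

v_t = v_{t-1} - (1 - \beta_2)\,\mathrm{sign}(v_{t-1} - g_t^2)\,g_t^2, \qquad
p_t = p_{t-1} - \frac{\eta_t\, m_t}{\sqrt{v_t} + \epsilon}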
Example #5
    def get_updates(self, loss, params):
        grads = self.get_gradients(loss, params)
        self.updates = [K.update_add(self.iterations, 1)]

        lr = self.lr
        adam_lr = self.adam_lr
        if self.initial_decay > 0:
            lr = lr * (1. / (1. + self.decay *
                             K.cast(self.iterations, K.dtype(self.decay))))
            adam_lr = adam_lr * (1. / (1. + self.decay * K.cast(
                self.iterations, K.dtype(self.decay))))

        t = K.cast(self.iterations, K.floatx()) + 1
        adam_lr_t = adam_lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /
                               (1. - K.pow(self.beta_1, t)))

        # momentum
        shapes = [K.int_shape(p) for p in params]
        moments = [K.zeros(shape) for shape in shapes]
        if self.amsgrad:
            vhats = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        else:
            vhats = [K.zeros(1) for _ in params]
        self.ms = K.zeros(K.int_shape(params[0]), dtype=K.dtype(params[0]))
        self.vs = K.zeros(K.int_shape(params[0]), dtype=K.dtype(params[0]))
        self.weights = ([self.iterations] + moments + vhats +
                        [self.ms, self.vs])
        for i, (p, g, m, vhat) in enumerate(zip(params, grads, moments,
                                                vhats)):
            v = self.momentum * m - lr * g  # velocity
            self.updates.append(K.update(m, v))

            if self.nesterov:
                new_p = p + self.momentum * v - lr * g
            else:
                new_p = p + v

            if i == 0 and self.e2efs_layer is not None:
                nnz = K.sum(K.cast(K.greater(p, 0.), K.floatx()))
                m_t = (self.beta_1 * self.ms) + (1. - self.beta_1) * g
                v_t = (self.beta_2 *
                       self.vs) + (1. - self.beta_2) * K.square(g)
                if self.amsgrad:
                    vhat_t = K.maximum(vhat, v_t)
                    p_t = p - adam_lr_t * m_t / (K.sqrt(vhat_t) + K.epsilon())
                    self.updates.append(K.update(vhat, vhat_t))
                else:
                    p_t = p - adam_lr_t * m_t / (K.sqrt(v_t) + K.epsilon())

                self.updates.append(K.update(self.ms, m_t))
                self.updates.append(K.update(self.vs, v_t))
                new_p = K.switch(K.less_equal(nnz, self.e2efs_layer.units),
                                 new_p, p_t)

            # Apply constraints.
            if getattr(p, 'constraint', None) is not None:
                new_p = p.constraint(new_p)

            self.updates.append(K.update(p, new_p))
        return self.updates
Example #6
    def call(self, inputs):
        inputs, weights = inputs

        weights = weights / tf.reduce_sum(weights)  # Normalize sample weights
        weights_expand = tf.expand_dims(weights, axis=1)

        mean, variance = tf.nn.weighted_moments(
            inputs, [0], weights_expand)  # Compute weighted mean and variance

        counter = K.update_add(
            self.counter, K.ones_like(self.counter)
        )  # Count number of times the data passes through the model
        init = K.sign(
            counter - K.ones_like(counter)
        )  # Indicator is 0 on the first pass (initialization), 1 afterwards

        mean = K.update(
            self.mean, init * self.mean +
            (1.0 - init) * mean)  # Store the batch mean only on the first pass
        variance = K.update(
            self.variance, init * self.variance + (1.0 - init) *
            variance)  # Store the batch variance only on the first pass

        mean_expand = tf.expand_dims(mean, axis=0)
        variance_expand = tf.expand_dims(variance, axis=0)

        outputs = (inputs - mean_expand) / tf.sqrt(
            variance_expand + self.epsilon)  # Normalize the inputs

        return outputs
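With the sample weights normalized to sum to one, tf.nn.weighted_moments computes

\mu = \sum_i w_i x_i, \qquad \sigma^2 = \sum_i w_i (x_i - \mu)^2,

and the layer then standardizes the inputs with the stored statistics, which are frozen after the first pass.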
Example #7
        def north_bad(loss, pred, velocities):
            # check south loss
            for (w, v) in zip(trainable_vars, velocities):
                # x0 + v (N) -> x0 - v (S)
                K.update_add(w, -2 * v)

            # the base algorithm would check the gradient at x0 or
            # at x0 + v, but we are checking it at x0 - v here
            with tf.GradientTape() as tape:
                predS = self(x, training=True)
                lossS = K.mean(loss_fun(y, predS))

            south_good_f = functools.partial(south_good, loss, pred, lossS,
                                             tape, velocities)
            south_bad_f = functools.partial(south_bad, loss, pred, velocities)
            return tf.cond(lossS <= loss0, south_good_f, south_bad_f)
Example #8
    def get_updates(self, loss, params):
        grads = self.get_gradients(loss, params)
        accumulators = [
            K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params
        ]
        self.weights = accumulators
        self.updates = [K.update_add(self.iterations, 1)]

        lr = self.lr
        if self.initial_decay > 0:
            lr = lr * (1. / (1. + self.decay *
                             K.cast(self.iterations, K.dtype(self.decay))))

        for i, (p, g, a) in enumerate(zip(params, grads, accumulators)):
            # update accumulator
            rho = 0.5 if i == 0 and self.e2efs_layer is not None and not self.lr_momentum else self.rho
            i_lr = self.e2efs_lr if i == 0 and self.e2efs_layer is not None and not self.lr_momentum else lr
            new_a = rho * a + (1. - rho) * K.square(g)
            self.updates.append(K.update(a, new_a))
            new_p = p - i_lr * g / (K.sqrt(new_a) + self.epsilon)

            # Apply constraints.
            if getattr(p, 'constraint', None) is not None:
                new_p = p.constraint(new_p)

            self.updates.append(K.update(p, new_p))
        return self.updates
Example #9
 def get_updates(self, loss, params):
   sync_cond = K.equal((self.iterations + 1) // self.sync_period *
                       self.sync_period, (self.iterations + 1))
   if TF_KERAS:
     slow_params = [K.variable(K.get_value(p), name='sp_{}'.format(i))
                    for i, p in enumerate(params)]
     self.updates = self.optimizer.get_updates(loss, params)
     slow_updates = []
     for p, sp in zip(params, slow_params):
       sp_t = sp + self.slow_step * (p - sp)
       slow_updates.append(K.update(sp, K.switch(
           sync_cond,
           sp_t,
           sp,
       )))
       slow_updates.append(K.update_add(p, K.switch(
           sync_cond,
           sp_t - p,
           K.zeros_like(p),
       )))
   else:
     slow_params = {p.name: K.variable(K.get_value(
         p), name='sp_{}'.format(i)) for i, p in enumerate(params)}
     update_names = ['update', 'update_add', 'update_sub']
     original_updates = [getattr(K, name) for name in update_names]
     setattr(K, 'update', lambda x, new_x: ('update', x, new_x))
     setattr(K, 'update_add', lambda x, new_x: ('update_add', x, new_x))
     setattr(K, 'update_sub', lambda x, new_x: ('update_sub', x, new_x))
     self.updates = self.optimizer.get_updates(loss, params)
     for name, original_update in zip(update_names, original_updates):
       setattr(K, name, original_update)
     slow_updates = []
     for i, update in enumerate(self.updates):
       if isinstance(update, tuple):
         name, x, new_x, adjusted = update + (update[-1],)
         update_func = getattr(K, name)
         if name == 'update_add':
           adjusted = x + new_x
         if name == 'update_sub':
           adjusted = x - new_x
         if x.name not in slow_params:
           self.updates[i] = update_func(x, new_x)
         else:
           slow_param = slow_params[x.name]
           slow_param_t = slow_param + \
               self.slow_step * (adjusted - slow_param)
           slow_updates.append(K.update(slow_param, K.switch(
               sync_cond,
               slow_param_t,
               slow_param,
           )))
           self.updates[i] = K.update(x, K.switch(
               sync_cond,
               slow_param_t,
               adjusted,
           ))
     slow_params = list(slow_params.values())
   self.updates += slow_updates
   self.weights = self.optimizer.weights + slow_params
   return self.updates
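A hedged usage sketch of this Lookahead-style wrapper (class and constructor names are assumptions in the keras-lookahead style; the snippet does not show them):

# Assumed interface: wrap an inner optimizer; every sync_period steps the slow
# weights move toward the fast weights by slow_step and are copied back.
# opt = Lookahead(Adam(1e-3), sync_period=5, slow_step=0.5)
# model.compile(optimizer=opt, loss='categorical_crossentropy')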
Example #10
    def get_updates(self, loss, params):
        grads = self.get_gradients(loss, params)
        self.updates = [K.update_add(self.iterations, 1)]

        lr = self.lr
        if self.initial_decay > 0:
            lr = lr * (1. / (1. + self.decay *
                             K.cast(self.iterations, K.dtype(self.decay))))

        t = K.cast(self.iterations, K.floatx()) + 1

        # Applies bounds on actual learning rate
        step_size = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /
                          (1. - K.pow(self.beta_1, t)))

        final_lr = self.final_lr * lr / self.base_lr
        lower_bound = final_lr * (1. - 1. / (self.gamma * t + 1.))
        upper_bound = final_lr * (1. + 1. / (self.gamma * t))

        ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        if self.amsbound:
            vhats = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        else:
            vhats = [K.zeros(1) for _ in params]
        self.weights = [self.iterations] + ms + vs + vhats

        for p, g, m, v, vhat in zip(params, grads, ms, vs, vhats):
            # apply weight decay
            if self.weight_decay != 0.:
                g += self.weight_decay * K.stop_gradient(p)

            m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
            v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)

            if self.amsbound:
                vhat_t = K.maximum(vhat, v_t)
                denom = (K.sqrt(vhat_t) + self.epsilon)
                self.updates.append(K.update(vhat, vhat_t))
            else:
                denom = (K.sqrt(v_t) + self.epsilon)

            # Compute the bounds
            step_size_p = step_size * K.ones_like(denom)
            step_size_p_bound = step_size_p / denom
            bounded_lr_t = m_t * K.minimum(
                K.maximum(step_size_p_bound, lower_bound), upper_bound)

            p_t = p - bounded_lr_t

            self.updates.append(K.update(m, m_t))
            self.updates.append(K.update(v, v_t))
            new_p = p_t

            # Apply constraints.
            if getattr(p, 'constraint', None) is not None:
                new_p = p.constraint(new_p)

            self.updates.append(K.update(p, new_p))
        return self.updates
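The bound computation follows AdaBound: with \eta^{*} = final_lr * lr / base_lr, the per-element effective step size is clipped between two schedules that both converge to \eta^{*}:

\eta_l(t) = \eta^{*}\Bigl(1 - \frac{1}{\gamma t + 1}\Bigr), \qquad
\eta_u(t) = \eta^{*}\Bigl(1 + \frac{1}{\gamma t}\Bigr), \qquad
p_t = p_{t-1} - m_t \cdot \mathrm{Clip}\Bigl(\frac{\alpha_t}{\sqrt{\hat{v}_t} + \epsilon},\ \eta_l(t),\ \eta_u(t)\Bigr)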
Example #11
    def get_updates(self, loss, params):
        grads = self.get_gradients(loss, params)
        self.updates = [K.update_add(self.iterations, 1)]

        lr = self.learning_rate
        if self.initial_decay > 0:
            lr = lr * (1. / (1. + self.decay *
                             K.cast(self.iterations, K.dtype(self.decay))))

        t = K.cast(self.iterations, K.floatx()) + 1
        lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /
                     (1. - K.pow(self.beta_1, t)))

        ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        if self.amsgrad:
            vhats = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        else:
            vhats = [K.zeros(1) for _ in params]
        self.weights = [self.iterations] + ms + vs + vhats

        for p, g, m, v, vhat in zip(params, grads, ms, vs, vhats):

            # Learning rate multipliers
            if self.multipliers:
                multiplier = [
                    mult for mult in self.multipliers if mult in p.name
                ]
            else:
                multiplier = None
            if multiplier:
                new_lr_t = lr_t * self.multipliers[multiplier[0]]
                if self.debug_verbose:
                    print('Setting {} to learning rate {}'.format(
                        multiplier[0], new_lr_t))
                    print(K.get_value(new_lr_t))
            else:
                new_lr_t = lr_t
                if self.debug_verbose:
                    print('No change in learning rate {}'.format(p.name))
                    print(K.get_value(new_lr_t))
            m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
            v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)
            if self.amsgrad:
                vhat_t = K.maximum(vhat, v_t)
                p_t = p - new_lr_t * m_t / (K.sqrt(vhat_t) + self.epsilon)
                self.updates.append(K.update(vhat, vhat_t))
            else:
                p_t = p - new_lr_t * m_t / (K.sqrt(v_t) + self.epsilon)

            self.updates.append(K.update(m, m_t))
            self.updates.append(K.update(v, v_t))
            new_p = p_t

            # Apply constraints.
            if getattr(p, 'constraint', None) is not None:
                new_p = p.constraint(new_p)

            self.updates.append(K.update(p, new_p))
        return self.updates
Example #12
        def loop_body(i, loss, pred, velocities, gradient_steps):
            # set x <- x0 + v
            for (w, v) in zip(trainable_vars, velocities):
                K.update_add(w, v)

            predN = self(x, training=True)
            lossN = K.mean(loss_fun(y, predN))

            north_good_f = functools.partial(north_good, lossN, predN,
                                             velocities)
            north_bad_f = functools.partial(north_bad, loss, pred, velocities)
            (loss, pred, velocities,
             delta_gradients) = tf.cond(lossN <= loss0, north_good_f,
                                        north_bad_f)

            return (i + 1, loss, pred, velocities,
                    gradient_steps + delta_gradients)
Example #13
 def get_updates(self, loss, params):
     self.updates = [
         K.update_add(self.iterations, 1),
         K.update_add(self.optimizer.iterations, K.cast(self.cond,
                                                        'int64')),
     ]
     # gradient accumulation
     self.accum_grads = [
         K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params
     ]
     grads = self.get_gradients(loss, params)
     for g, ag in zip(grads, self.accum_grads):
         self.updates.append(K.update(ag, K.switch(self.cond, g, ag + g)))
     # inheriting updates of original optimizer
     self.updates.extend(self.optimizer.get_updates(loss, params)[1:])
     self.weights.extend(self.optimizer.weights)
     return self.updates
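self.cond is not defined in this snippet; presumably it is a boolean tensor that fires once per accumulation cycle, so the accumulator is reset to the fresh gradient on that step and summed into otherwise. A minimal sketch of such a condition (names are hypothetical):

from tensorflow.keras import backend as K

def make_accum_cond(iterations, accum_steps):
    # True once every accum_steps iterations.
    return K.equal(iterations % accum_steps, 0)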
Example #14
    def __call__(self, gradients):
        """Accumulates :obj:`gradients` on the current replica."""
        if len(self._gradients) == 0:
            self._gradients.extend([
                tf.Variable(
                    tf.zeros_like(gradient),
                    trainable=False,
                    synchronization=tf.VariableSynchronization.ON_READ,
                    aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA)
                if gradient is not None else gradient for gradient in gradients
            ])

        for accum_gradient, gradient in zip(self._gradients, gradients):
            if accum_gradient is not None and gradient is not None:
                K.update_add(
                    accum_gradient,
                    _multiply_gradient(gradient, self._accum_grad_scale))
Example #15
    def get_updates(self, loss, params):
        """ Obtain the optimizer loss updates.

        Parameters
        ----------
        loss: list
            List of tensors

        params: list
            List of tensors

        Returns
        -------
        list
            List of tensors
        """
        grads = self.get_gradients(loss, params)
        self.updates = [K.update_add(self.iterations, 1)]

        lr = self.lr
        if self.initial_decay > 0:
            lr = lr * (1. / (1. + self.decay *
                             K.cast(self.iterations, K.dtype(self.decay))))

        t = K.cast(self.iterations, K.floatx()) + 1
        lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /
                     (1. - K.pow(self.beta_1, t)))

        # Pass off to CPU if requested
        if self.cpu_mode:
            with K.tf.device("/cpu:0"):
                ms, vs, vhats = self._update_1(params)
        else:
            ms, vs, vhats = self._update_1(params)

        self.weights = [self.iterations] + ms + vs + vhats

        for p, g, m, v, vhat in zip(params, grads, ms, vs, vhats):
            m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
            v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)
            if self.amsgrad:
                vhat_t = K.maximum(vhat, v_t)
                p_t = p - lr_t * m_t / (K.sqrt(vhat_t) + self.epsilon)
                self.updates.append(K.update(vhat, vhat_t))
            else:
                p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon)

            self.updates.append(K.update(m, m_t))
            self.updates.append(K.update(v, v_t))
            new_p = p_t

            # Apply constraints.
            if getattr(p, 'constraint', None) is not None:
                new_p = p.constraint(new_p)

            self.updates.append(K.update(p, new_p))
        return self.updates
Example #16
    def get_updates(self, loss, params):
        grads = self.get_gradients(loss, params)
        self.updates = [K.update_add(self.iterations, 1)]

        lr = self.lr
        if self.initial_decay > 0:
            lr = lr * (1. / (1. + self.decay *
                             K.cast(self.iterations, K.dtype(self.decay))))

        t = K.cast(self.iterations, K.floatx()) + 1
        '''Bias corrections according to the Adam paper
        '''
        lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /
                     (1. - K.pow(self.beta_1, t)))

        ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        self.weights = [self.iterations] + ms + vs

        for p, g, m, v in zip(params, grads, ms, vs):

            ####################################################
            # Add a lr multiplier for vars outside excluded_vars
            if p.name in self.excluded_vars:
                multiplied_lr_t = lr_t
            else:
                multiplied_lr_t = lr_t * self.lr_mult
            ###################################################

            m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
            v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)
            '''Schedule multiplier eta_t = 1 for simple AdamW.
            According to the AdamW paper, eta_t can be fixed, decayed, or
            used for warm restarts (AdamWR to come).
            '''
            eta_t = 1.
            p_t = p - eta_t * (multiplied_lr_t * m_t /
                               (K.sqrt(v_t) + self.epsilon))
            if self.weight_decay != 0:
                '''Normalized weight decay according to the AdamW paper
                '''
                w_d = self.weight_decay * K.sqrt(
                    self.batch_size / (self.samples_per_epoch * self.epochs))
                p_t = p_t - eta_t * (w_d * p)

            self.updates.append(K.update(m, m_t))
            self.updates.append(K.update(v, v_t))
            new_p = p_t

            # Apply constraints.
            if getattr(p, 'constraint', None) is not None:
                new_p = p.constraint(new_p)

            self.updates.append(K.update(p, new_p))
        return self.updates
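The normalized weight decay follows the AdamW paper: with batch size b, samples_per_epoch B, and epochs T, the raw decay \lambda is rescaled by the total number of update steps in the run,

w_d = \lambda \sqrt{\frac{b}{B\,T}}, \qquad p_t \leftarrow p_t - \eta_t\, w_d\, p,

which the code applies after the Adam step.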
Example #17
    def get_updates(self, loss, params):
        grads = self.get_gradients(loss, params)
        self.updates = [K.update_add(self.iterations, 1)]
        self.weights = [self.iterations]
        lr = self.learning_rate

        for i, (p, g) in enumerate(zip(params, grads)):
            g2 = K.square(g) + self.epsilon1
            shape, dtype = K.int_shape(p), K.dtype(p)
            factored_shape = self.factored_shape(shape)
            if factored_shape is None:
                # define the accumulator variable
                v = K.zeros(shape, dtype=dtype, name='v_' + str(i))
                self.weights.append(v)
                # define its update
                v_t = self.beta2 * v + (1.0 - self.beta2) * g2
                self.updates.append(K.update(v, v_t))
            else:
                # define the factored accumulators
                shape1, axis1, shape2, axis2 = factored_shape
                vr = K.zeros(shape1, dtype=dtype, name='vr_' + str(i))
                vc = K.zeros(shape2, dtype=dtype, name='vc_' + str(i))
                self.weights.extend([vr, vc])
                # define their updates
                g2r = K.mean(g2, axis=axis1, keepdims=True)
                g2c = K.mean(g2, axis=axis2, keepdims=True)
                vr_t = self.beta2 * vr + (1.0 - self.beta2) * g2r
                vc_t = self.beta2 * vc + (1.0 - self.beta2) * g2c
                self.updates.extend([K.update(vr, vr_t), K.update(vc, vc_t)])
                # recompose the full second-moment matrix
                v_t = vr_t * vc_t / K.mean(vr_t, axis=axis2, keepdims=True)
            # main update term
            u = g / K.sqrt(v_t)
            # clip the update
            if self.clipping_threshold is not None:
                u_rms = self.rms(u)
                d = self.clipping_threshold
                u = u / K.maximum(1.0, u_rms / d)
            # momentum on the update
            if self.beta1 > 0.0:
                # define the momentum variable
                m = K.zeros(shape, dtype=dtype, name='m_' + str(i))
                self.weights.append(m)
                # define its update
                m_t = self.beta1 * m + (1.0 - self.beta1) * u
                self.updates.append(K.update(m, m_t))
                u = m_t
            # scale the update by the parameter scale
            if self.multiply_by_parameter_scale:
                u = u * K.maximum(self.rms(p), self.epsilon2)
            # apply the parameter update
            self.updates.append(K.update(p, p - lr * u))

        return self.updates
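In the factored branch only row and column accumulators are stored, and the full second-moment matrix is recomposed as in Adafactor (with broadcasting over the factored axes):

\hat{V}_t = \frac{v_r\, v_c}{\mathrm{mean}(v_r)}

which is exactly what the v_t line above computes.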
Example #18
 def get_updates(self, loss, params):
     """
     Build the graph nodes that accumulate gradients.
     """
     self.updates = []
     grads = self.get_gradients(loss, params)
     for param, grad in zip(params, grads):
         shape = K.int_shape(param)
         var = K.zeros(shape)
         self._vars.append(var)
         self.updates.append(K.update_add(var, grad))
     return self.updates
Example #19
 def __call__(self, y_true, y_pred):
     '''Update precision computation.
     # Arguments
         y_true: Tensor, batch_wise labels
         y_pred: Tensor, batch_wise predictions
     # Returns
         Overall precision for the epoch at the completion of the batch.
     '''
     # Batch
     y_true, y_pred = _slice_by_class(y_true, y_pred, self.class_ind)
     true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
     pred_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
     # Current
     current_true_positives = self.true_positives * 1
     current_pred_positives = self.pred_positives * 1
     # Updates
     updates = [K.update_add(self.true_positives, true_positives),
                K.update_add(self.pred_positives, pred_positives)]
     self.add_update(updates, inputs=[y_true, y_pred])
      # Compute precision
     return (current_true_positives + true_positives) / \
            (current_pred_positives + pred_positives + K.epsilon())
Example #20
    def updated_get_updates(self, loss, params):
        self.accumulate_gradient_accumulators = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        updates_accumulated_iterations = K.update_add(self.accumulated_iterations, 1)
        new_grads = orig_get_gradients(loss, params)
        if not accumulate_sum_or_mean:
            new_grads = [g / K.cast(self.update_params_frequency, K.dtype(g)) for g in new_grads]
        self.updated_grads = [K.update_add(p, g) for p, g in zip(self.accumulate_gradient_accumulators, new_grads)]

        def update_function():
            with tensorflow.control_dependencies(orig_get_updates(loss, params)):
                reset_grads = [K.update(p, K.zeros(K.int_shape(p), dtype=K.dtype(p))) for p in
                               self.accumulate_gradient_accumulators]
            return tensorflow.group(*(reset_grads + [updates_accumulated_iterations]))

        def just_store_function():
            return tensorflow.group(*[updates_accumulated_iterations])

        update_switch = K.equal((updates_accumulated_iterations) % self.update_params_frequency, 0)

        with tensorflow.control_dependencies(self.updated_grads):
            self.updates = [K.switch(update_switch, update_function, just_store_function)]
            return self.updates
Example #21
    def get_updates(self, loss, params):
        # Mostly the same code as Adam class, with added multiplier variables.
        # Keras code from:
        # https://github.com/tensorflow/tensorflow/blob/r1.12/tensorflow/python/keras/optimizers.py#L456
        grads = self.get_gradients(loss, params)
        self.updates = [K.update_add(self.iterations, 1)]

        lr = self.lr
        if self.initial_decay > 0:
            lr = lr * (
                1.0 / (1.0 + self.decay * K.cast(self.iterations, K.dtype(self.decay)))
            )

        t = K.cast(self.iterations, K.floatx()) + 1
        lr_t = lr * (
            K.sqrt(1.0 - K.pow(self.beta_2, t)) / (1.0 - K.pow(self.beta_1, t))
        )

        ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        if self.amsgrad:
            vhats = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        else:
            vhats = [K.zeros(1) for _ in params]
        self.weights = [self.iterations] + ms + vs + vhats

        for p, g, m, v, vhat in zip(params, grads, ms, vs, vhats):
            layername = p.name.split("/", 1)[0]
            mult = self.multipliers.get(layername, 1.0)

            m_t = (self.beta_1 * m) + (1.0 - self.beta_1) * g
            v_t = (self.beta_2 * v) + (1.0 - self.beta_2) * K.square(g)

            if self.amsgrad:
                vhat_t = K.maximum(vhat, v_t)
                p_t = p - mult * lr_t * m_t / (K.sqrt(vhat_t) + self.epsilon)
                self.updates.append(K.update(vhat, vhat_t))
            else:
                p_t = p - mult * lr_t * m_t / (K.sqrt(v_t) + self.epsilon)

            self.updates.append(K.update(m, m_t))
            self.updates.append(K.update(v, v_t))
            new_p = p_t

            # Apply constraints.
            if getattr(p, "constraint", None) is not None:
                new_p = p.constraint(new_p)

            self.updates.append(K.update(p, new_p))
        return self.updates
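A hedged usage sketch: multipliers maps the first path component of a variable name (e.g. 'dense_1' from 'dense_1/kernel:0') to a learning-rate factor, defaulting to 1.0. The class name below is hypothetical; the constructor is not shown:

# opt = LayerMultiplierAdam(lr=1e-3, multipliers={'dense_1': 0.1, 'conv2d': 0.5})
# model.compile(optimizer=opt, loss='mse')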
Example #22
 def update_function():
     with tensorflow.control_dependencies(orig_get_updates(
             loss, params)):
         reset_grads = [
             K.update(p, K.zeros(K.int_shape(p), dtype=K.dtype(p)))
             for p in self.accumulate_gradient_accumulators
         ]
         if ema_decay > 0:
             reset_grads += [K.update_add(self.total_iterations, 1)]
             reset_grads += [
                 K.update(e_p, (e_p * ema_decay) + (1 - ema_decay) * p)
                 for e_p, p in zip(self.params_ema, params)
             ]
     return tensorflow.group(*(reset_grads +
                               [updates_accumulated_iterations]))
Example #23
    def get_updates(self, params, loss):
        grads = self.get_gradients(loss, params)
        self.updates = [K.update_add(self.iterations, 1)]

        lr = self.lr
        if self.initial_decay > 0:
            lr *= (1. / (1. + self.decay * self.iterations))

        t = self.iterations + 1
        lr_t = lr * K.sqrt(1. - K.pow(self.beta_2, t)) / (
            1. - K.pow(self.beta_1, t))

        shapes = [K.get_variable_shape(p) for p in params]
        ms = [K.zeros(shape) for shape in shapes]
        vs = [K.zeros(shape) for shape in shapes]
        f = K.variable(0)
        d = K.variable(1)
        self.weights = [self.iterations] + ms + vs + [f, d]

        cond = K.greater(t, K.variable(1))
        small_delta_t = K.switch(K.greater(loss, f), self.small_k + 1,
                                 1. / (self.big_K + 1))
        big_delta_t = K.switch(K.greater(loss, f), self.big_K + 1,
                               1. / (self.small_k + 1))

        c_t = K.minimum(K.maximum(small_delta_t, loss / (f + self.epsilon)),
                        big_delta_t)
        f_t = c_t * f
        r_t = K.abs(f_t - f) / (K.minimum(f_t, f))
        d_t = self.beta_3 * d + (1 - self.beta_3) * r_t

        f_t = K.switch(cond, f_t, loss)
        d_t = K.switch(cond, d_t, K.variable(1.))

        self.updates.append(K.update(f, f_t))
        self.updates.append(K.update(d, d_t))

        for p, g, m, v in zip(params, grads, ms, vs):
            m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
            v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)
            p_t = p - lr_t * m_t / (d_t * K.sqrt(v_t) + self.epsilon)

            self.updates.append(K.update(m, m_t))
            self.updates.append(K.update(v, v_t))

            new_p = p_t
            self.updates.append(K.update(p, new_p))
        return self.updates
Example #24
    def get_updates(self, loss, params):
        grads = self.get_gradients(loss, params)
        self.updates = [K.update_add(self.iterations, 1)]

        lr = self.lr
        if self.initial_decay > 0:
            lr = lr * (1. / (1. + self.decay * K.cast(self.iterations,
                                                      K.dtype(self.decay))))

        t = K.cast(self.iterations, K.floatx()) + 1
        lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /
                     (1. - K.pow(self.beta_1, t)))

        ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        if self.amsgrad:
            vhats = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        else:
            vhats = [K.zeros(1) for _ in params]
        self.weights = [self.iterations] + ms + vs + vhats

        for p, g, m, v, vhat in zip(params, grads, ms, vs, vhats):
            m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
            v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)
            if self.amsgrad:
                vhat_t = K.maximum(vhat, v_t)
                p_t = p - lr_t * m_t / (K.sqrt(vhat_t) + self.epsilon)
                self.updates.append(K.update(vhat, vhat_t))
            else:
                p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon)

            self.updates.append(K.update(m, m_t))
            self.updates.append(K.update(v, v_t))
            new_p = p_t

            # Apply constraints.
            if getattr(p, 'constraint', None) is not None:
                new_p = p.constraint(new_p)
                
            clptrkey = set_pattern_find(p.name, self.clips.keys())
            if self.clips_val and clptrkey:
                c = K.eval(self.clips[clptrkey])
                if self.verbose > 0:
                    print("Clipping variable", p.name, " to ", c)
                new_p = K.clip(new_p, c[0], c[1])

            self.updates.append(K.update(p, new_p))
        return self.updates
Example #25
    def get_updates(self, loss, params):
        grads = self.get_gradients(loss, params)
        self.updates = [K.update_add(self.iterations, 1)]

        lr = self.lr
        if self.initial_decay > 0:
            lr = lr * (1. / (1. + self.decay *
                             K.cast(self.iterations, K.dtype(self.decay))))

        t = K.cast(self.iterations, K.floatx()) + 1

        ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        self.weights = [self.iterations] + ms + vs

        for p, g, m, v in zip(params, grads, ms, vs):
            m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
            v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)

            m_t_hat = m_t / (1. - K.pow(self.beta_1, t))
            v_t_hat = v_t / (1. - K.pow(self.beta_2, t))

            p_dash = m_t_hat / (K.sqrt(v_t_hat + self.epsilon))

            if self.weight_decay > 0.:
                wd = self.weight_decay * p
                p_dash = p_dash + wd

            r1 = K.sqrt(K.sum(K.square(p)))
            r2 = K.sqrt(K.sum(K.square(p_dash)))

            r = tf.where(tf.greater(r1, 0.),
                         tf.where(tf.greater(r2, 0.), r1 / r2, 1.0), 1.0)
            # r = r1 / r2
            eta = r * lr

            p_t = p - eta * p_dash

            self.updates.append(K.update(m, m_t))
            self.updates.append(K.update(v, v_t))
            new_p = p_t

            # Apply constraints.
            if getattr(p, 'constraint', None) is not None:
                new_p = p.constraint(new_p)

            self.updates.append(K.update(p, new_p))
        return self.updates
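The r1/r2 ratio is a LAMB-style layer-wise trust ratio (assuming that is what this port implements): with update direction \hat{p} = \hat{m}_t / \sqrt{\hat{v}_t + \epsilon} + \lambda p,

r = \frac{\lVert p \rVert_2}{\lVert \hat{p} \rVert_2}, \qquad
p_t = p_{t-1} - r\,\eta\,\hat{p},

where r falls back to 1 whenever either norm is zero.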
Example #26
    def get_updates(self, loss, params):
        grads = self.get_gradients(loss, params)
        self.updates = [K.update_add(self.iterations, 1)]

        lr = self.lr

        t = K.cast(self.iterations, K.floatx()) + 1

        ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        self.weights = [self.iterations] + ms + vs

        for p, g, m, v in zip(params, grads, ms, vs):
            m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
            v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)

            m_t_hat = m_t / (1. - K.pow(self.beta_1, t))
            v_t_hat = v_t / (1. - K.pow(self.beta_2, t))

            p_dash = m_t_hat / (K.sqrt(v_t_hat + self.epsilon))

            if self._do_use_weight_decay(p.name):
                wd = self.weight_decay * p
                p_dash = p_dash + wd

            r1 = linalg_ops.norm(p, ord=2)
            r2 = linalg_ops.norm(p_dash, ord=2)

            r = array_ops.where(
                math_ops.greater(r1, 0),
                array_ops.where(math_ops.greater(r2, 0), (r1 / r2), 1.0), 1.0)

            # r = r1 / r2
            eta = r * lr

            p_t = p - eta * p_dash

            self.updates.append(K.update(m, m_t))
            self.updates.append(K.update(v, v_t))
            new_p = p_t

            # Apply constraints.
            if getattr(p, 'constraint', None) is not None:
                new_p = p.constraint(new_p)

            self.updates.append(K.update(p, new_p))
        return self.updates
Example #27
    def call(self, inputs, training=None):
        x = inputs
        assert not isinstance(x, list)

        # Compute the minibatch statistics
        mean, var = self._moments(x)
        sigma = K.sqrt(var + self.epsilon)

        # If in training phase set rmax, dmax large so that we use the moving
        # averages to do the normalization
        rmax = K.in_train_phase(self.rmax, K.constant(1e5), training)
        dmax = K.in_train_phase(self.dmax, K.constant(1e5), training)

        # Compute the corrections based on rmax, dmax
        r = K.stop_gradient(
            self._clip(sigma / self.moving_sigma, 1. / rmax, rmax))
        d = K.stop_gradient(
            self._clip((mean - self.moving_mean) / self.moving_sigma, -dmax,
                       dmax))

        # Actually do the normalization and the rescaling
        xnorm = ((x - mean) / sigma) * r + d
        y = self.gamma * xnorm + self.beta

        # Add the moving average updates
        self.add_update([
            K.moving_average_update(self.moving_mean, mean, self.momentum),
            K.moving_average_update(self.moving_sigma, sigma, self.momentum)
        ], x)

        # Add the r, d updates
        rmax_prog = K.minimum(1., self.steps / self.rmax_dur)
        dmax_prog = K.minimum(1., self.steps / self.dmax_dur)
        self.add_update([
            K.update_add(self.steps, 1),
            K.update(self.rmax,
                     self.rmax_0 + rmax_prog * (self.rmax_inf - self.rmax_0)),
            K.update(self.dmax,
                     self.dmax_0 + dmax_prog * (self.dmax_inf - self.dmax_0))
        ])

        # Fix the output's uses learning phase
        y._uses_learning_phase = rmax._uses_learning_phase

        return y
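The r and d corrections are the Batch Renormalization terms, clipped to [1/rmax, rmax] and [-dmax, dmax] respectively and held constant through stop_gradient:

r = \mathrm{clip}\Bigl(\frac{\sigma_B}{\sigma_{\mathrm{mov}}}\Bigr), \qquad
d = \mathrm{clip}\Bigl(\frac{\mu_B - \mu_{\mathrm{mov}}}{\sigma_{\mathrm{mov}}}\Bigr), \qquad
\hat{x} = \frac{x - \mu_B}{\sigma_B}\, r + d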
Example #28
    def get_updates(self, loss, params):
        grads = self.get_gradients(loss, params)
        self.updates = [K.update_add(self.iterations, 1)]

        lr = self.lr
        if self.initial_decay > 0:
            lr *= (1. / (1. + self.decay * self.iterations))

        t = self.iterations + 1
        lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /
                     (1. - K.pow(self.beta_1, t)))

        ms = [K.zeros(K.get_variable_shape(p), dtype=K.dtype(p)) for p in params]
        vs = [K.zeros(K.get_variable_shape(p), dtype=K.dtype(p)) for p in params]
        self.weights = [self.iterations] + ms + vs
        
        # Multiplier for weights [0,2,4,6,...] and bias [1,3,5,7,...]
        if self.lr_multipliers is not None and \
                len(params) != len(self.lr_multipliers):
            raise Exception("Check Multipliers !")
        count_multipliers = 0
        
        for p, g, m, v in zip(params, grads, ms, vs):

            # Multiplier for weights [0,2,4,6,...] and bias [1,3,5,7,...]
            if self.lr_multipliers is None:
                new_lr = lr_t     
            else:
                new_lr = lr_t * self.lr_multipliers[count_multipliers]
                count_multipliers += 1
                           
            m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
            v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)
            p_t = p - new_lr * m_t / (K.sqrt(v_t) + self.epsilon)

            self.updates.append(K.update(m, m_t))
            self.updates.append(K.update(v, v_t))
            new_p = p_t

            # Apply constraints.
            if getattr(p, 'constraint', None) is not None:
                new_p = p.constraint(new_p)

            self.updates.append(K.update(p, new_p))
        return self.updates
Example #29
    def get_updates(self, loss, params):
        grads = self.get_gradients(loss, params)
        self.updates = [K.update_add(self.iterations, 1)]

        lr = self.lr
        if self.initial_decay > 0:
            lr = lr * (1. / (1. + self.decay *
                             K.cast(self.iterations, K.dtype(self.decay))))

        t = K.cast(self.iterations, K.floatx()) + 1
        lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /
                     (1. - K.pow(self.beta_1, t)))

        ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        if self.amsgrad:
            vhats = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        else:
            vhats = [K.zeros(1) for _ in params]
        self.weights = [self.iterations] + ms + vs + vhats

        for i, (p, g, m, v,
                vhat) in enumerate(zip(params, grads, ms, vs, vhats)):
            beta_1 = 0.5 if i == 0 and self.e2efs_layer is not None and not self.lr_momentum else self.beta_1
            beta_2 = 0. if i == -1 and self.e2efs_layer is not None and not self.lr_momentum else self.beta_2
            m_t = (beta_1 * m) + (1. - beta_1) * g
            v_t = (beta_2 * v) + (1. - beta_2) * K.square(g)
            if self.amsgrad:
                vhat_t = K.maximum(vhat, v_t)
                p_t = p - lr_t * m_t / (K.sqrt(vhat_t) + self.epsilon)
                self.updates.append(K.update(vhat, vhat_t))
            else:
                p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon)

            self.updates.append(K.update(m, m_t))
            self.updates.append(K.update(v, v_t))
            new_p = p_t

            # Apply constraints.
            if getattr(p, 'constraint', None) is not None:
                new_p = p.constraint(new_p)

            self.updates.append(K.update(p, new_p))
        return self.updates
Example #30
File: adax.py  Project: sunnyhuma171/adax
    def get_updates(self, loss, params):
        grads = self.get_gradients(loss, params)
        self.updates = [K.update_add(self.iterations, 1)]

        lr = self.learning_rate
        if self.initial_decay > 0.0:
            lr = lr * (
                1.0 / (
                    1.0 +
                    self.decay * K.cast(self.iterations, K.dtype(self.decay))
                )
            )

        t = K.cast(self.iterations, K.floatx()) + 1
        lr_t = lr * K.sqrt(K.pow(1.0 + self.beta_2, t) - 1.0)

        ms = [
            K.zeros(K.int_shape(p), dtype=K.dtype(p), name='m_' + str(i))
            for (i, p) in enumerate(params)
        ]
        vs = [
            K.zeros(K.int_shape(p), dtype=K.dtype(p), name='v_' + str(i))
            for (i, p) in enumerate(params)
        ]

        self.weights = [self.iterations] + ms + vs

        for p, g, m, v in zip(params, grads, ms, vs):
            m_t = self.beta_1 * m + (1.0 - self.beta_1) * g
            v_t = (1.0 + self.beta_2) * v + self.beta_2 * K.square(g)
            p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon)

            self.updates.append(K.update(m, m_t))
            self.updates.append(K.update(v, v_t))
            new_p = p_t

            # Apply constraints.
            if getattr(p, 'constraint', None) is not None:
                new_p = p.constraint(new_p)

            self.updates.append(K.update(p, new_p))

        return self.updates
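AdaX, which this project implements, replaces Adam's exponential moving average of g^2 with an exponentially growing accumulator and the matching bias correction, exactly as the v_t and lr_t lines above compute:

v_t = (1 + \beta_2)\,v_{t-1} + \beta_2\, g_t^2, \qquad
\eta_t = \eta\,\sqrt{(1 + \beta_2)^t - 1}, \qquad
p_t = p_{t-1} - \frac{\eta_t\, m_t}{\sqrt{v_t} + \epsilon}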