Example #1
 def _apply_dense(self, grad, var, state):
     rms = state.get_slot(var, "rms")
     mom = state.get_slot(var, "momentum")
     if self._centered:
         mg = state.get_slot(var, "mg")
         return training_ops.apply_centered_rms_prop(
             var,
             mg,
             rms,
             mom,
             state.get_hyper("learning_rate", var.dtype.base_dtype),
             state.get_hyper("rho", var.dtype.base_dtype),
             state.get_hyper("momentum", var.dtype.base_dtype),
             # epsilon is now the rms initial value and is not added to the
             # denominator anymore, hence calling the kernel op with epsilon=0.
             0,
             grad,
             use_locking=self._use_locking).op
     else:
         return training_ops.apply_rms_prop(
             var,
             rms,
             mom,
             state.get_hyper("learning_rate", var.dtype.base_dtype),
             state.get_hyper("rho", var.dtype.base_dtype),
             state.get_hyper("momentum", var.dtype.base_dtype),
             0,
             grad,
             use_locking=self._use_locking).op
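
For reference, training_ops.apply_rms_prop in the example above implements the standard dense RMSProp recurrence. In Examples 1 and 3 the epsilon hyperparameter is folded into the initial value of the rms slot, which is why the kernel op is called with epsilon=0; the centered variant used by apply_centered_rms_prop is sketched after Example #5 below. The following is a minimal NumPy sketch of the plain update, with illustrative names and scalar hyperparameters rather than the kernel op's actual signature:

 import numpy as np

 def rms_prop_step(var, rms, mom, grad, lr, rho, momentum, epsilon):
     # rms <- rho * rms + (1 - rho) * grad^2
     rms = rho * rms + (1.0 - rho) * grad * grad
     # mom <- momentum * mom + lr * grad / sqrt(rms + epsilon)
     mom = momentum * mom + lr * grad / np.sqrt(rms + epsilon)
     # var <- var - mom
     var = var - mom
     return var, rms, mom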
Example #2
 def _apply_dense(self, grad, var):
     rms = self.get_slot(var, "rms")
     mom = self.get_slot(var, "momentum")
     if self._centered:
         mg = self.get_slot(var, "mg")
         return training_ops.apply_centered_rms_prop(
             var,
             mg,
             rms,
             mom,
             math_ops.cast(self._learning_rate_tensor,
                           var.dtype.base_dtype),
             math_ops.cast(self._decay_tensor, var.dtype.base_dtype),
             math_ops.cast(self._momentum_tensor, var.dtype.base_dtype),
             math_ops.cast(self._epsilon_tensor, var.dtype.base_dtype),
             grad,
             use_locking=self._use_locking).op
     else:
         return training_ops.apply_rms_prop(
             var,
             rms,
             mom,
             math_ops.cast(self._learning_rate_tensor,
                           var.dtype.base_dtype),
             math_ops.cast(self._decay_tensor, var.dtype.base_dtype),
             math_ops.cast(self._momentum_tensor, var.dtype.base_dtype),
             math_ops.cast(self._epsilon_tensor, var.dtype.base_dtype),
             grad,
             use_locking=self._use_locking).op
Example #3
 def _apply_dense(self, grad, var, state):
   rms = state.get_slot(var, "rms")
   mom = state.get_slot(var, "momentum")
   if self._centered:
     mg = state.get_slot(var, "mg")
     return training_ops.apply_centered_rms_prop(
         var,
         mg,
         rms,
         mom,
         state.get_hyper("learning_rate", var.dtype.base_dtype),
         state.get_hyper("rho", var.dtype.base_dtype),
         state.get_hyper("momentum", var.dtype.base_dtype),
         # epsilon is now the rms initial value and is not added to the
         # denominator anymore, hence calling the kernel op with epsilon=0.
         0,
         grad,
         use_locking=self._use_locking).op
   else:
     return training_ops.apply_rms_prop(
         var,
         rms,
         mom,
         state.get_hyper("learning_rate", var.dtype.base_dtype),
         state.get_hyper("rho", var.dtype.base_dtype),
         state.get_hyper("momentum", var.dtype.base_dtype),
         0,
         grad,
         use_locking=self._use_locking).op
Example #4
 def _apply_dense(self, grad, var):
   rms = self.get_slot(var, "rms")
   mom = self.get_slot(var, "momentum")
   if self._centered:
     mg = self.get_slot(var, "mg")
     return training_ops.apply_centered_rms_prop(
         var,
         mg,
         rms,
         mom,
         math_ops.cast(self._learning_rate_tensor, var.dtype.base_dtype),
         math_ops.cast(self._decay_tensor, var.dtype.base_dtype),
         math_ops.cast(self._momentum_tensor, var.dtype.base_dtype),
         math_ops.cast(self._epsilon_tensor, var.dtype.base_dtype),
         grad,
         use_locking=self._use_locking).op
   else:
     return training_ops.apply_rms_prop(
         var,
         rms,
         mom,
         math_ops.cast(self._learning_rate_tensor, var.dtype.base_dtype),
         math_ops.cast(self._decay_tensor, var.dtype.base_dtype),
         math_ops.cast(self._momentum_tensor, var.dtype.base_dtype),
         math_ops.cast(self._epsilon_tensor, var.dtype.base_dtype),
         grad,
         use_locking=self._use_locking).op
Example #5
 def _apply_dense(self, grad, var, state):
     rms = state.get_slot(var, "rms")
     mom = state.get_slot(var, "momentum")
     if self._centered:
         mg = state.get_slot(var, "mg")
         return training_ops.apply_centered_rms_prop(
             var,
             mg,
             rms,
             mom,
             state.get_hyper("learning_rate", var.dtype.base_dtype),
             state.get_hyper("decay", var.dtype.base_dtype),
             state.get_hyper("momentum", var.dtype.base_dtype),
             state.get_hyper("epsilon", var.dtype.base_dtype),
             grad,
             use_locking=self._use_locking).op
     else:
         return training_ops.apply_rms_prop(
             var,
             rms,
             mom,
             state.get_hyper("learning_rate", var.dtype.base_dtype),
             state.get_hyper("decay", var.dtype.base_dtype),
             state.get_hyper("momentum", var.dtype.base_dtype),
             state.get_hyper("epsilon", var.dtype.base_dtype),
             grad,
             use_locking=self._use_locking).op
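
When the centered flag is set, apply_centered_rms_prop additionally maintains a running mean of the gradient (the mg slot) and centers the second-moment estimate before taking the square root. A minimal NumPy sketch of that centered update, again with illustrative names and scalar hyperparameters:

 import numpy as np

 def centered_rms_prop_step(var, mg, rms, mom, grad, lr, rho, momentum, epsilon):
     # mg  <- rho * mg + (1 - rho) * grad
     mg = rho * mg + (1.0 - rho) * grad
     # rms <- rho * rms + (1 - rho) * grad^2
     rms = rho * rms + (1.0 - rho) * grad * grad
     # mom <- momentum * mom + lr * grad / sqrt(rms - mg^2 + epsilon)
     mom = momentum * mom + lr * grad / np.sqrt(rms - mg * mg + epsilon)
     # var <- var - mom
     var = var - mom
     return var, mg, rms, mom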
Example #6
 def _apply_dense(self, grad, var, state):
   rms = state.get_slot(var, "rms")
   mom = state.get_slot(var, "momentum")
   if self._centered:
     mg = state.get_slot(var, "mg")
     return training_ops.apply_centered_rms_prop(
         var,
         mg,
         rms,
         mom,
         state.get_hyper("learning_rate", var.dtype.base_dtype),
         state.get_hyper("decay", var.dtype.base_dtype),
         state.get_hyper("momentum", var.dtype.base_dtype),
         state.get_hyper("epsilon", var.dtype.base_dtype),
         grad,
         use_locking=self._use_locking).op
   else:
     return training_ops.apply_rms_prop(
         var,
         rms,
         mom,
         state.get_hyper("learning_rate", var.dtype.base_dtype),
         state.get_hyper("decay", var.dtype.base_dtype),
         state.get_hyper("momentum", var.dtype.base_dtype),
         state.get_hyper("epsilon", var.dtype.base_dtype),
         grad,
         use_locking=self._use_locking).op
Example #7
 def _apply_dense(self, variable, grad):
     rms = self._get_slot(variable, 'rms')
     momentum = self._get_slot(variable, 'momentum')
     return training_ops.apply_rms_prop(variable, rms, momentum,
                                        self._learning_rate_tensor,
                                        self._decay_tensor,
                                        self._momentum_tensor,
                                        self._epsilon_tensor,
                                        grad, use_locking=False).op
Example #8
 def _apply_dense(self, grad, var):
   rms = self.get_slot(var, "rms")
   mom = self.get_slot(var, "momentum")
   return training_ops.apply_rms_prop(
       var, rms, mom,
       self._learning_rate_tensor,
       self._decay_tensor,
       self._momentum_tensor,
       self._epsilon_tensor,
       grad, use_locking=self._use_locking).op
Example #9
 def _apply_dense(self, grad, var):
   rms = self.get_slot(var, "rms")
   mom = self.get_slot(var, "momentum")
   return training_ops.apply_rms_prop(
       var, rms, mom,
       self._learning_rate_tensor,
       self._decay_tensor,
       self._momentum_tensor,
       self._epsilon_tensor,
       grad, use_locking=self._use_locking).op
Example #10
 def applyDense(self, grad, var):
     rms = self.getSlot(var, "rms")
     mom = self.getSlot(var, "momentum")
     return training_ops.apply_rms_prop(var,
                                        rms,
                                        mom,
                                        self.learningRateTensor,
                                        self.decayTensor,
                                        self.momentumTensor,
                                        self.epsilonTensor,
                                        grad,
                                        use_locking=False).op
Example #11
 def _apply_dense(self, grad, var):
   rms = self.get_slot(var, "rms")  # e.g. <tf.Variable 'navigation/W_fc/RMSPropApplier:0' shape=(8192, 512) dtype=float32_ref>
   mom = self.get_slot(var, "momentum")  # get the momentum slot
   # ms  <- rho * ms_{t-1} + (1 - rho) * grad * grad
   # mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
   # var <- var - mom
   return training_ops.apply_rms_prop(  # apply the gradients directly to the master network's variables
     var, rms, mom,
     self._learning_rate_tensor,
     self._decay_tensor,
     self._momentum_tensor,
     self._epsilon_tensor,
     grad,
     use_locking=False).op
 def _apply_dense(self, grad, var):
     """
     Apply the gradient and momentum update. Note that synchronization between
     the local AC workers is not handled here.
     TODO: add multi-thread synchronization later.
     :param grad:
     :param var:
     :return:
     """
     rms = self.get_slot(var, "rms")
     mom = self.get_slot(var, "momentum")
     return training_ops.apply_rms_prop(var,
                                        rms,
                                        mom,
                                        self._learning_rate_tensor,
                                        self._decay_tensor,
                                        self._momentum_tensor,
                                        self._epsilon_tensor,
                                        grad,
                                        use_locking=False).op
    def _apply_dense(self, grad, var):
        rms = self.get_slot(var, "rms")
        mom = self.get_slot(var, "momentum")
        eps = self.get_slot(var, 'eps')
        tf.summary.scalar('grad_norm', tf.norm(grad))
        # debug_here()
        if 'orthogonal_stiefel' in var.name and 'bias' not in var.name:
            with tf.variable_scope("orthogonal_update"):
                 print('Applying an orthogonality-preserving step to', var.name)
                # apply the rms update rule.
                new_rms = self._decay_tensor * rms + (1. - self._decay_tensor) \
                    * tf.square(grad)
                rms_assign_op = tf.assign(rms, new_rms)
                # scale the gradient.
                if self._nat_grad_normalization:
                    grad = grad / (tf.sqrt(rms) + eps)
                # the update should preserve orthogonality.
                grad_shape = tf.Tensor.get_shape(grad).as_list()
                # W_new_lst = []
                eye = tf.eye(grad_shape[0], dtype=tf.float32)
                G = grad
                W = var
                # Reunitarize after n steps.
                if self._qr_steps is not None:
                    W = tf.cond(tf.equal(tf.mod(self._global_step_tensor,
                                         self._qr_steps), 0),
                                lambda: self.re_unitarize(W), lambda: W)
                # A = tf.matmul(tf.transpose(G), W) - tf.matmul(tf.transpose(W), G)
                A = tf.matmul(G, tf.transpose(W)) - tf.matmul(W, tf.transpose(G))
                cayleyDenom = eye + (self._learning_rate_tensor/2.0) * A
                cayleyNumer = eye - (self._learning_rate_tensor/2.0) * A
                C = tf.matmul(tf.matrix_inverse(cayleyDenom), cayleyNumer)
                W_new = tf.matmul(C, W)
                if self._debug:
                    # self._summary_A(A)
                    self._summary_C(C)
                    self._summary_W(W)
                var_update_op = tf.assign(var, W_new)
                return tf.group(*[var_update_op, rms_assign_op])
        elif 'unitary_stiefel' in var.name and 'bias' not in var.name:
            with tf.variable_scope("unitary_update"):
                 print('Applying a unitarity-preserving step to', var.name)
                # apply the rms update rule.
                new_rms = self._decay_tensor * rms + (1. - self._decay_tensor) \
                    * tf.square(grad)
                rms_assign_op = tf.assign(rms, new_rms)
                # scale the gradient.
                if self._nat_grad_normalization:
                    grad = grad / (tf.sqrt(new_rms) + eps)
                # do an update step, which preserves unitary structure.
                # checking shapes.
                grad_shape = tf.Tensor.get_shape(grad).as_list()
                assert grad_shape[0] == grad_shape[1]
                eye = tf.eye(grad_shape[0], dtype=tf.complex64)
                G = tf.complex(grad[:, :, 0], grad[:, :, 1])
                W = tf.complex(var[:, :, 0], var[:, :, 1])

                # Reunitarize after n steps.
                if self._qr_steps is not None:
                    W = tf.cond(tf.equal(tf.mod(self._global_step_tensor,
                                         self._qr_steps), 0),
                                lambda: self.re_unitarize(W), lambda: W)

                A = tf.matmul(G, tf.conj(tf.transpose(W))) \
                    - tf.matmul(W, tf.conj(tf.transpose(G)))
                 # A is skew-Hermitian by construction, so the Cayley transform below is unitary.
                 learning_rate_scale = tf.complex(self._learning_rate_tensor/2.0,
                                                  tf.zeros_like(self._learning_rate_tensor))
                 cayleyDenom = eye + learning_rate_scale * A
                 cayleyNumer = eye - learning_rate_scale * A
                C = tf.matmul(tf.matrix_inverse(cayleyDenom), cayleyNumer)
                W_new = tf.matmul(C, W)
                if self._debug:
                    # self._summary_A(A)
                    self._summary_C(C)
                    self._summary_W(W)
                # debug_here()
                W_new_re = tf.real(W_new)
                W_new_img = tf.imag(W_new)
                W_array = tf.stack([W_new_re, W_new_img], -1)
                var_update_op = tf.assign(var, W_array)
                return tf.group(*[var_update_op, rms_assign_op])
         else:
             # The RMSprop branch below is disabled by this flag, so plain gradient
             # descent is applied instead; a separate flag avoids clobbering the
             # rms slot variable.
             use_rms = False
             if use_rms:
                if 1:
                    # tensorflow default.
                     print('Applying standard rmsprop to', var.name)
                    return training_ops.apply_rms_prop(
                        var, rms, mom,
                        tf.cast(self._learning_rate_tensor, var.dtype.base_dtype),
                        tf.cast(self._decay_tensor, var.dtype.base_dtype),
                        tf.cast(self._momentum_tensor, var.dtype.base_dtype),
                        tf.cast(self._epsilon_tensor, var.dtype.base_dtype),
                        grad, use_locking=False).op
                else:
                    # My rmsprop implementation.
                    new_rms = self._decay_tensor * rms \
                        + (1. - self._decay_tensor) * tf.square(grad)
                    rms_assign_op = tf.assign(rms, new_rms)
                    W_new = var - self._learning_rate_tensor * grad \
                        / (tf.sqrt(new_rms) + eps)
                    var_update_op = tf.assign(var, W_new)
                    return tf.group(*[var_update_op, rms_assign_op])
            else:
                 print('Applying default gradient descent to', var.name)
                return training_ops.apply_gradient_descent(
                    var,
                    tf.cast(self._learning_rate_tensor, var.dtype.base_dtype),
                    grad,
                    use_locking=False).op
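
The orthogonality- and unitarity-preserving branches in the last example rely on a Cayley transform: with A = G W^T - W G^T (skew-symmetric, or skew-Hermitian in the complex case), the update W_new = inverse(I + (lr/2) A) (I - (lr/2) A) W keeps the columns of W orthonormal. The following is a minimal NumPy sketch of that retraction, with illustrative names and assuming a real-valued W; it shows only the update the optimizer performs, not the optimizer itself:

 import numpy as np

 def cayley_update(W, G, lr):
     # A = G W^T - W G^T is skew-symmetric, so C = (I + lr/2 A)^-1 (I - lr/2 A)
     # is orthogonal and C @ W keeps orthonormal columns.
     eye = np.eye(W.shape[0])
     A = G @ W.T - W @ G.T
     C = np.linalg.solve(eye + (lr / 2.0) * A, eye - (lr / 2.0) * A)
     return C @ W

 # Illustrative check that orthonormality is preserved.
 rng = np.random.default_rng(0)
 W, _ = np.linalg.qr(rng.standard_normal((6, 3)))   # start with orthonormal columns
 G = rng.standard_normal((6, 3))                    # stand-in gradient
 W_new = cayley_update(W, G, lr=0.1)
 assert np.allclose(W_new.T @ W_new, np.eye(3), atol=1e-8)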