def _testTypesForAdam(self, var, m, v, grad, use_gpu):
    self.setUp()
    with self.test_session(use_gpu=use_gpu):
        var_t = variables.Variable(var)
        m_t = variables.Variable(m)
        v_t = variables.Variable(v)

        t = 1
        beta1 = np.array(0.9, dtype=var.dtype)
        beta2 = np.array(0.999, dtype=var.dtype)
        beta1_power = beta1**t
        beta2_power = beta2**t
        lr = np.array(0.001, dtype=var.dtype)
        epsilon = np.array(1e-8, dtype=var.dtype)
        beta1_t = constant_op.constant(beta1, self._toType(var.dtype), [])
        beta2_t = constant_op.constant(beta2, self._toType(var.dtype), [])
        beta1_power_t = variables.Variable(beta1_power)
        beta2_power_t = variables.Variable(beta2_power)
        lr_t = constant_op.constant(lr, self._toType(var.dtype), [])
        epsilon_t = constant_op.constant(epsilon, self._toType(var.dtype), [])
        variables.initialize_all_variables().run()

        self.assertAllEqual(var, var_t.eval())
        new_var, _, _ = self._adamUpdateNumpy(var, grad, t, m, v, lr, beta1,
                                              beta2, epsilon)
        apply_adam = training_ops.apply_adam(var_t, m_t, v_t, beta1_power_t,
                                             beta2_power_t, lr_t, beta1_t,
                                             beta2_t, epsilon_t, grad)
        out = apply_adam.eval()
        self.assertShapeEqual(out, apply_adam)
        self.assertAllClose(new_var, out)

def _apply_dense(self, grad, var): m = self.get_slot(var, "m") v = self.get_slot(var, "v") return training_ops.apply_adam( var, m, v, self._beta1_power, self._beta2_power, self._lr_t, self._beta1_t, self._beta2_t, self._epsilon_t, grad, use_locking=self._use_locking).op
def _testTypesForAdam(self, var, m, v, grad, use_gpu):
    self.setUp()
    with self.test_session(use_gpu=use_gpu):
        var_t = variables.Variable(var)
        m_t = variables.Variable(m)
        v_t = variables.Variable(v)

        t = 1
        beta1 = np.array(0.9, dtype=var.dtype)
        beta2 = np.array(0.999, dtype=var.dtype)
        beta1_power = beta1**t
        beta2_power = beta2**t
        lr = np.array(0.001, dtype=var.dtype)
        epsilon = np.array(1e-8, dtype=var.dtype)
        beta1_t = constant_op.constant(beta1, self._toType(var.dtype), [])
        beta2_t = constant_op.constant(beta2, self._toType(var.dtype), [])
        beta1_power_t = variables.Variable(beta1_power)
        beta2_power_t = variables.Variable(beta2_power)
        lr_t = constant_op.constant(lr, self._toType(var.dtype), [])
        epsilon_t = constant_op.constant(epsilon, self._toType(var.dtype), [])
        variables.global_variables_initializer().run()

        self.assertAllCloseAccordingToType(var, var_t.eval())
        new_var, _, _ = self._adamUpdateNumpy(var, grad, t, m, v, lr, beta1,
                                              beta2, epsilon)
        apply_adam = training_ops.apply_adam(var_t, m_t, v_t, beta1_power_t,
                                             beta2_power_t, lr_t, beta1_t,
                                             beta2_t, epsilon_t, grad)
        out = apply_adam.eval()
        self.assertShapeEqual(out, apply_adam)
        self.assertAllCloseAccordingToType(new_var, out)

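# The two tests above compare the fused kernel against a NumPy reference
# named _adamUpdateNumpy, whose body is not shown in this section. A minimal
# sketch of that reference, assuming the standard Adam update (Kingma & Ba,
# 2015) in the folded bias-correction form that the ApplyAdam kernel
# documents:
def _adamUpdateNumpy(self, param, g_t, t, m, v, lr, beta1, beta2, epsilon):
    lr_t = lr * np.sqrt(1 - beta2**t) / (1 - beta1**t)
    m_t = beta1 * m + (1 - beta1) * g_t
    v_t = beta2 * v + (1 - beta2) * g_t * g_t
    param_t = param - lr_t * m_t / (np.sqrt(v_t) + epsilon)
    return param_t, m_t, v_t
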
def _apply_dense(self, grad, var): m = self.get_slot(var, "m") v = self.get_slot(var, "v") # for BNN kernel # origin version clipping weight method is new_w = old_w + scale*(new_w - old_w) # and adam update function is new_w = old_w - lr_t * m_t / (sqrt(v_t) + epsilon) # so subtitute adam function into weight clipping # new_w = old_w - (scale * lr_t * m_t) / (sqrt(v_t) + epsilon) scale = self._weight_scale[var.name] / 4 return training_ops.apply_adam(var, m, v, math_ops.cast(self._beta1_power, var.dtype.base_dtype), math_ops.cast(self._beta2_power, var.dtype.base_dtype), math_ops.cast(self._lr_t * scale, var.dtype.base_dtype), math_ops.cast(self._beta1_t, var.dtype.base_dtype), math_ops.cast(self._beta2_t, var.dtype.base_dtype), math_ops.cast(self._epsilon_t, var.dtype.base_dtype), grad, use_locking=self._use_locking).op
def _apply_dense(self, grad, var): m = self.get_slot(var, "m") v = self.get_slot(var, "v") beta1_power, beta2_power = self._get_beta_accumulators() clip_bounds = 3 * tf.sqrt(v / (1 - beta2_power)) + 0.1 grad = tf.clip_by_value(grad, -clip_bounds, clip_bounds) # Clip gradients by 3 std return training_ops.apply_adam(var, m, v, math_ops.cast(beta1_power, var.dtype.base_dtype), math_ops.cast(beta2_power, var.dtype.base_dtype), math_ops.cast(self._lr_t, var.dtype.base_dtype), math_ops.cast(self._beta1_t, var.dtype.base_dtype), math_ops.cast(self._beta2_t, var.dtype.base_dtype), math_ops.cast(self._epsilon_t, var.dtype.base_dtype), grad, use_locking=self._use_locking).op
def _apply_dense(self, grad, var): m = self.get_slot(var, "m") v = self.get_slot(var, "v") beta1_power, beta2_power, _ = self._get_beta_accumulators() beta1_power = math_ops.cast(beta1_power, var.dtype.base_dtype) beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype) beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype) return tf.cond( self.condition, lambda: training_ops.apply_adam( var, m, v, beta1_power, math_ops.cast(beta2_power, var.dtype.base_dtype), math_ops.cast(self.rectified_lr, var.dtype.base_dtype ), # instead of _lr_t beta1_t, beta2_t, math_ops.cast(self._epsilon_t, var.dtype.base_dtype), grad, use_locking=self._use_locking).op, lambda: self._apply_dense_without_v(var, m, v, beta1_power, beta1_t, beta2_t, grad), )
def _apply_dense(self, grad, var): m = self.get_slot(var, "m") v = self.get_slot(var, "v") return training_ops.apply_adam( var, m, v, math_ops.cast(self._beta1_power, var.dtype.base_dtype), math_ops.cast(self._beta2_power, var.dtype.base_dtype), math_ops.cast(self._lr_t, var.dtype.base_dtype), math_ops.cast(self._beta1_t, var.dtype.base_dtype), math_ops.cast(self._beta2_t, var.dtype.base_dtype), math_ops.cast(self._epsilon_t, var.dtype.base_dtype), grad, use_locking=self._use_locking).op
def _apply_dense(self, grad, var, state):
    m = state.get_slot(var, "m")
    v = state.get_slot(var, "v")
    beta1_power, beta2_power = self._get_beta_accumulators(state)
    return training_ops.apply_adam(
        var, m, v,
        math_ops.cast(beta1_power, var.dtype.base_dtype),
        math_ops.cast(beta2_power, var.dtype.base_dtype),
        state.get_hyper("learning_rate", var.dtype.base_dtype),
        state.get_hyper("beta1", var.dtype.base_dtype),
        state.get_hyper("beta2", var.dtype.base_dtype),
        state.get_hyper("epsilon", var.dtype.base_dtype),
        grad,
        use_locking=self._use_locking).op

def _apply_dense_in_action(self, grad, var):
    m = self.get_slot(var, "m")
    v = self.get_slot(var, "v")
    beta1_power, beta2_power = self._get_beta_accumulators()
    return training_ops.apply_adam(
        var, m, v,
        tf.cast(beta1_power, var.dtype.base_dtype),
        tf.cast(beta2_power, var.dtype.base_dtype),
        tf.cast(self._lr_t, var.dtype.base_dtype),
        tf.cast(self._beta1_t, var.dtype.base_dtype),
        tf.cast(self._beta2_t, var.dtype.base_dtype),
        tf.cast(self._epsilon_t, var.dtype.base_dtype),
        grad,
        use_locking=self._use_locking).op

def _apply_dense(self, grad, var): m = self.get_slot(var, "m") v = self.get_slot(var, "v") beta1_power, beta2_power = self._get_beta_accumulators() return training_ops.apply_adam( var, m, v, math_ops.cast(beta1_power, var.dtype.base_dtype), math_ops.cast(beta2_power, var.dtype.base_dtype), math_ops.cast(self._lr_t, var.dtype.base_dtype), math_ops.cast(self._beta1_t, var.dtype.base_dtype), math_ops.cast(self._beta2_t, var.dtype.base_dtype), math_ops.cast(self._epsilon_t, var.dtype.base_dtype), grad, use_locking=self._use_locking, use_nesterov=True).op
def _apply_dense(self, grad, var): m = self.get_slot(var, "m") v = self.get_slot(var, "v") beta1_power, beta2_power = self._get_beta_accumulators() ops = self._get_ops_tester() ops_up = ops.assign_add(1) return control_flow_ops.group(*[ training_ops.apply_adam( var, m, v, math_ops.cast(beta1_power, var.dtype.base_dtype), math_ops.cast(beta2_power, var.dtype.base_dtype), math_ops.cast(self._lr_t, var.dtype.base_dtype), math_ops.cast(self._beta1_t, var.dtype.base_dtype), math_ops.cast(self._beta2_t, var.dtype.base_dtype), math_ops.cast(self._epsilon_t, var.dtype.base_dtype), grad, use_locking=self._use_locking).op, ops_up ])
def adam_apply_dense(self, grad, var):
    m = self.get_slot(var, "m")
    v = self.get_slot(var, "v")
    beta1_power, beta2_power = self._get_beta_accumulators()
    # grad = tf.Print(grad, ["A"])  # debug print, left disabled
    return training_ops.apply_adam(
        var, m, v,
        math_ops.cast(beta1_power, var.dtype.base_dtype),
        math_ops.cast(beta2_power, var.dtype.base_dtype),
        math_ops.cast(self._lr_t, var.dtype.base_dtype),
        math_ops.cast(self._beta1_t, var.dtype.base_dtype),
        math_ops.cast(self._beta2_t, var.dtype.base_dtype),
        math_ops.cast(self._epsilon_t, var.dtype.base_dtype),
        grad,
        use_locking=self._use_locking).op

def _apply_dense(self, grad, var): m = self.get_slot(var, "m") v = self.get_slot(var, "v") decayed_var = var if self._do_use_weight_decay(self._get_variable_name(var.name)): decayed_var = self._weight_decay_rate * var return training_ops.apply_adam(decayed_var, m, v, tf.cast(self._beta1, var.dtype.base_dtype), tf.cast(self._beta2, var.dtype.base_dtype), tf.cast(self._learning_rate, var.dtype.base_dtype), tf.cast(self._beta1, var.dtype.base_dtype), tf.cast(self._beta2, var.dtype.base_dtype), tf.cast(self._epsilon, var.dtype.base_dtype), grad, use_locking=self._use_locking).op