def testBasicWithLearningRateInverseTimeDecay(self):
    """Compares Adam driven by an InverseTimeDecay schedule with NumPy.

    For each of half/float32/float64, three optimizer steps are executed.
    After every step the TensorFlow variables are compared against the
    values produced by `adam_update_numpy`, which is fed the learning rate
    `learning_rate / (1 + decay * t)` for step index `t`.
    """
    # Hyper-parameters are plain Python numbers shared by every dtype.
    base_lr = 0.001
    decay_rate = 0.5
    beta_1, beta_2, epsilon = 0.9, 0.999, 1e-7

    tested_dtypes = [dtypes.half, dtypes.float32, dtypes.float64]
    for idx, dtype in enumerate(tested_dtypes):
        with self.cached_session(use_gpu=True):
            np_dtype = dtype.as_numpy_dtype

            # Reference (NumPy) state: parameters, gradients, Adam moments.
            m0 = v0 = m1 = v1 = 0.0
            var0_np = np.array([1.0, 2.0], dtype=np_dtype)
            grads0_np = np.array([0.1, 0.1], dtype=np_dtype)
            var1_np = np.array([3.0, 4.0], dtype=np_dtype)
            grads1_np = np.array([0.01, 0.01], dtype=np_dtype)

            # TensorFlow counterparts; the index keeps variable names
            # distinct across the dtype iterations.
            var0 = resource_variable_ops.ResourceVariable(
                var0_np, name="var0_%d" % idx)
            var1 = resource_variable_ops.ResourceVariable(
                var1_np, name="var1_%d" % idx)
            grads0 = constant_op.constant(grads0_np)
            grads1 = constant_op.constant(grads1_np)

            lr_schedule = learning_rate_schedule.InverseTimeDecay(
                base_lr, decay_steps=1.0, decay_rate=decay_rate)
            opt = adam.Adam(
                learning_rate=lr_schedule,
                beta_1=beta_1,
                beta_2=beta_2,
                epsilon=epsilon)
            grads_and_vars = zip([grads0, grads1], [var0, var1])
            update = opt.apply_gradients(grads_and_vars)

            self.evaluate(variables.global_variables_initializer())

            # Run 3 steps of Adam, checking the parameters after each one.
            for t in range(3):
                self.evaluate(update)

                # Learning rate the reference implementation uses at step t.
                lr_np = base_lr / (1 + decay_rate * t)
                var0_np, m0, v0 = adam_update_numpy(
                    var0_np, grads0_np, t, m0, v0, lr=lr_np)
                var1_np, m1, v1 = adam_update_numpy(
                    var1_np, grads1_np, t, m1, v1, lr=lr_np)

                # Validate updated params
                self.assertAllCloseAccordingToType(
                    var0_np, self.evaluate(var0))
                self.assertAllCloseAccordingToType(
                    var1_np, self.evaluate(var1))
def testBasicWithLearningRateInverseTimeDecaySerializeAndDeserialize(self):
    """Runs the shared SGD decay check on a config round-tripped optimizer.

    For each dtype an SGD optimizer using an InverseTimeDecay schedule is
    serialized with `get_config`, rebuilt with `from_config`, and the rebuilt
    instance is handed to `_test_basic_sgd_with_learning_rate_decay`.
    """
    for dtype in (dtypes.half, dtypes.float32, dtypes.float64):
        schedule = learning_rate_schedule.InverseTimeDecay(
            3.0, decay_steps=1.0, decay_rate=0.5)
        original_config = gradient_descent.SGD(
            learning_rate=schedule).get_config()
        # Only the deserialized optimizer is exercised from here on.
        restored_sgd = gradient_descent.SGD.from_config(original_config)
        self._test_basic_sgd_with_learning_rate_decay(restored_sgd, dtype)
def testConfigWithLearningRateDecay(self):
    """Checks SGD keeps its learning-rate schedule across config round trips.

    For an InverseTimeDecay and a PiecewiseConstantDecay schedule, the
    'learning_rate' hyper-parameter evaluated at step 10 must match the
    schedule itself, both for the original optimizer and for one rebuilt via
    `from_config`. The comparison is repeated after `minimize` has been
    called on the original optimizer.
    """
    step = 10

    def lr_at_step(optimizer):
        # Value of the optimizer's 'learning_rate' hyper-parameter at `step`.
        return optimizer._get_hyper('learning_rate')(step)

    with test_util.use_gpu():
        var0 = variables.Variable([[1.0], [2.0]], dtype=dtypes.float32)
        schedules = [
            learning_rate_schedule.InverseTimeDecay(
                0.5, decay_steps=1.0, decay_rate=0.1),
            learning_rate_schedule.PiecewiseConstantDecay(
                [5], [1., .5]),
        ]
        for decay_schedule in schedules:
            opt = gradient_descent.SGD(decay_schedule)
            opt2 = gradient_descent.SGD.from_config(opt.get_config())

            # Assert both are equal float values.
            self.assertAllEqual(decay_schedule(step), lr_at_step(opt))
            self.assertAllEqual(decay_schedule(step), lr_at_step(opt2))

            def loss():
                return 3 * var0

            # Learning rate variable is created when calling minimize.
            opt.minimize(loss, [var0])
            self.evaluate(variables.global_variables_initializer())

            # Round-trip again now that minimize has run.
            opt3 = gradient_descent.SGD.from_config(opt.get_config())
            self.assertAllEqual(
                self.evaluate(lr_at_step(opt)), lr_at_step(opt3))
def testConfigWithLearningRateDecay(self):
    """Checks an InverseTimeDecay schedule survives SGD get/from_config.

    The 'learning_rate' hyper-parameter evaluated at step 10 is compared
    with the schedule for the original optimizer and a deserialized copy,
    then compared once more between the original and a fresh copy after
    `minimize` has been called.
    """
    with self.cached_session():
        step = 10
        decay_schedule = learning_rate_schedule.InverseTimeDecay(
            0.5, decay_steps=1.0, decay_rate=0.1)

        opt = gradient_descent.SGD(decay_schedule)
        opt2 = gradient_descent.SGD.from_config(opt.get_config())

        # Assert both are equal float values.
        for optimizer in (opt, opt2):
            self.assertAllEqual(
                decay_schedule(step),
                optimizer._get_hyper('learning_rate')(step))

        var0 = variables.Variable([[1.0], [2.0]], dtype=dtypes.float32)

        def loss():
            return 3 * var0

        # Learning rate variable created when calling minimize.
        opt.minimize(loss, [var0])
        self.evaluate(variables.global_variables_initializer())

        post_minimize_config = opt.get_config()
        opt3 = gradient_descent.SGD.from_config(post_minimize_config)
        lr_before = self.evaluate(opt._get_hyper('learning_rate')(step))
        lr_after = opt3._get_hyper('learning_rate')(step)
        self.assertAllEqual(lr_before, lr_after)
def testAggregationMethod(self):
    """Checks SGD `minimize` with EXPERIMENTAL_ACCUMULATE_N aggregation.

    For each dtype, one optimizer step with learning rate 3.0 is applied to
    `cost = 5 * var0 + 3 * var1` and the updated variables are validated.
    Afterwards the optimizer's learning rate is replaced by an
    InverseTimeDecay schedule and one more step is run (no values are
    asserted for that last step).
    """
    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
        with self.cached_session():
            var0 = variables.Variable([1.0, 2.0], dtype=dtype)
            var1 = variables.Variable([3.0, 4.0], dtype=dtype)

            def loss():
                # Callable form of `cost`, needed by the eager branch below.
                return 5 * var0 + 3 * var1  # pylint: disable=cell-var-from-loop

            cost = 5 * var0 + 3 * var1
            global_step = variables.Variable(
                array_ops.zeros([], dtypes.int64), name='global_step')
            sgd_op = sgd.SGD(3.0)
            opt_op = sgd_op.minimize(
                cost,
                global_step, [var0, var1],
                aggregation_method=gradients_impl.AggregationMethod.
                EXPERIMENTAL_ACCUMULATE_N)

            variables.global_variables_initializer().run()
            # Fetch params to validate initial values
            self.assertAllClose([1.0, 2.0], var0.eval())
            self.assertAllClose([3.0, 4.0], var1.eval())

            # Run 1 step of sgd through optimizer
            opt_op.run()

            # Validate updated params
            # var0 = [1., 2.] - 3.0 * [5, 5]
            self.assertAllClose([-14., -13.], var0.eval())
            # var1 = [3., 4.] - 3.0 * [3, 3]
            self.assertAllClose([-6., -5.], var1.eval())

            # The schedule must be installed on the optimizer instance
            # (`sgd_op`); `sgd` is only the namespace that provides `SGD`.
            sgd_op.learning_rate = learning_rate_schedule.InverseTimeDecay(
                0.5, decay_steps=1.0, decay_rate=0.5)
            if context.executing_eagerly():
                # NOTE(review): `var_list` is passed by keyword because the
                # positional layout of `minimize` used above places
                # `global_step` second - confirm against the optimizer API.
                sgd_op.minimize(loss, var_list=[var0, var1])
            else:
                self.evaluate(opt_op)
def testStaircase(self, serialize):
    """Checks InverseTimeDecay with `staircase=True` against the formula.

    The schedule is evaluated at steps 0..k and compared with
    `initial_lr / (1 + decay_rate * (step // k))`.

    Args:
      serialize: Forwarded to `_maybe_serialized` together with the schedule.
    """
    base_lr, period, rate = 0.1, 10, 0.96

    global_step = resource_variable_ops.ResourceVariable(0)
    schedule = _maybe_serialized(
        learning_rate_schedule.InverseTimeDecay(
            base_lr, period, rate, staircase=True),
        serialize)

    self.evaluate(variables.global_variables_initializer())
    for i in range(period + 1):
        # Only whole multiples of `period` contribute in staircase mode.
        completed_periods = i // period
        expected = base_lr / (1 + rate * completed_periods)
        self.assertAllClose(
            self.evaluate(schedule(global_step)), expected, 1e-6)
        self.evaluate(global_step.assign_add(1))
def testDecay(self, serialize):
    """Checks continuous InverseTimeDecay against the closed-form formula.

    The schedule is evaluated at steps 0..k and compared with
    `initial_lr / (1 + step / k * decay_rate)`.

    Args:
      serialize: Forwarded to `_maybe_serialized` together with the schedule.
    """
    base_lr, period, rate = 0.1, 10, 0.96

    global_step = variables.Variable(0)
    schedule = _maybe_serialized(
        learning_rate_schedule.InverseTimeDecay(base_lr, period, rate),
        serialize)

    self.evaluate(variables.global_variables_initializer())
    for i in range(period + 1):
        progress = i / period
        expected = base_lr / (1 + progress * rate)
        self.assertAllClose(
            self.evaluate(schedule(global_step)), expected, 1e-6)
        self.evaluate(global_step.assign_add(1))
def testBasicWithLearningRateInverseTimeDecay(self):
    """Compares Adagrad driven by an InverseTimeDecay schedule with NumPy.

    For float32 and float64, three optimizer steps are run. After each step
    the variables are compared with `adagrad_update_numpy`, which receives
    the learning rate `learning_rate / (1 + decay * t)` and accumulators
    that start at 0.1.
    """
    base_lr = 3.0
    decay_rate = 0.5

    for dtype in (dtypes.float32, dtypes.float64):
        with self.cached_session():
            np_dtype = dtype.as_numpy_dtype

            # NumPy reference values.
            var0_np = np.array([1.0, 2.0], dtype=np_dtype)
            var1_np = np.array([3.0, 4.0], dtype=np_dtype)
            grads0_np = np.array([0.1, 0.1], dtype=np_dtype)
            grads1_np = np.array([0.01, 0.01], dtype=np_dtype)

            var0 = resource_variable_ops.ResourceVariable(var0_np)
            var1 = resource_variable_ops.ResourceVariable(var1_np)
            grads0 = constant_op.constant(grads0_np)
            grads1 = constant_op.constant(grads1_np)

            lr_schedule = learning_rate_schedule.InverseTimeDecay(
                base_lr, decay_steps=1.0, decay_rate=decay_rate)
            ada_opt = adagrad.Adagrad(lr_schedule)

            accum0_np = np.array([0.1, 0.1], dtype=np_dtype)
            accum1_np = np.array([0.1, 0.1], dtype=np_dtype)

            # In graph mode the update op is built once and re-run per step.
            graph_mode = not context.executing_eagerly()
            if graph_mode:
                ada_update = ada_opt.apply_gradients(
                    zip([grads0, grads1], [var0, var1]))
                self.evaluate(variables.global_variables_initializer())

            # Fetch params to validate initial values
            v0_val, v1_val = self.evaluate([var0, var1])
            self.assertAllClose([1.0, 2.0], v0_val)
            self.assertAllClose([3.0, 4.0], v1_val)

            # Run 3 steps of adagrad
            for t in range(3):
                if graph_mode:
                    self.evaluate(ada_update)
                else:
                    ada_opt.apply_gradients(
                        zip([grads0, grads1], [var0, var1]))

                lr_np = base_lr / (1 + decay_rate * t)
                var0_np, accum0_np = adagrad_update_numpy(
                    var0_np, accum0_np, grads0_np, lr_np)
                var1_np, accum1_np = adagrad_update_numpy(
                    var1_np, accum1_np, grads1_np, lr_np)

                self.assertAllCloseAccordingToType(
                    var0_np, self.evaluate(var0))
                self.assertAllCloseAccordingToType(
                    var1_np, self.evaluate(var1))
def testAdaptiveLearningRate(self):
    """Exercises changing the SGD learning rate between steps (skipped).

    The test is skipped unconditionally by the first statement; the body
    below it is kept for when the test is fixed. It runs one SGD step at
    learning rate 1.0, one at 0.5, and one after installing an
    InverseTimeDecay schedule (the last step has no assertions).
    """
    self.skipTest('broken test to be fixed')

    all_dtypes = [
        dtypes.half, dtypes.float32, dtypes.float64, dtypes.complex64,
        dtypes.complex128
    ]
    for dtype in all_dtypes:
        var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype)
        var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype)

        def loss():
            return 5 * var0 + 3 * var1  # pylint: disable=cell-var-from-loop

        sgd = gradient_descent.SGD(1.0)

        self.evaluate(variables.global_variables_initializer())
        # Fetch params to validate initial values
        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
        self.assertAllClose([3.0, 4.0], self.evaluate(var1))

        # Run 1 step of sgd through optimizer
        opt_op = sgd.minimize(loss, [var0, var1])
        self.evaluate(variables.global_variables_initializer())
        self.evaluate(opt_op)

        def step_again():
            # Eager mode has no reusable op, so minimize is called anew;
            # graph mode re-runs the op built above.
            # pylint: disable=cell-var-from-loop
            if context.executing_eagerly():
                sgd.minimize(loss, [var0, var1])
            else:
                self.evaluate(opt_op)

        # Validate updated params
        # var0 = [1., 2.] - 1.0 * [5, 5]
        self.assertAllClose([-4., -3.], self.evaluate(var0))
        # var1 = [3., 4.] - 1.0 * [3, 3]
        self.assertAllClose([0., 1.], self.evaluate(var1))

        sgd.learning_rate = 0.5
        step_again()
        # Validate updated params
        # var0 = [-4., -3.] - 0.5 * [5, 5]
        self.assertAllClose([-6.5, -5.5], self.evaluate(var0))
        # var1 = [0., 1.] - 0.5 * [3, 3]
        self.assertAllClose([-1.5, -0.5], self.evaluate(var1))

        sgd.learning_rate = learning_rate_schedule.InverseTimeDecay(
            0.5, decay_steps=1.0, decay_rate=0.5)
        step_again()
def testAdaptiveLearningRate(self):
    """Checks that SGD honours a learning rate changed between steps.

    For every dtype in `_DATA_TYPES`: one step at learning rate 1.0 and one
    at 0.5 are validated numerically; a final step is run after installing
    an InverseTimeDecay schedule, without asserting the resulting values.
    """
    for dtype in _DATA_TYPES:
        with self.test_session():
            var0 = variables.Variable([1.0, 2.0], dtype=dtype)
            var1 = variables.Variable([3.0, 4.0], dtype=dtype)
            trainables = [var0, var1]

            def loss():
                return 5 * var0 + 3 * var1  # pylint: disable=cell-var-from-loop

            sgd = gradient_descent.SGD(1.0)

            self.evaluate(variables.global_variables_initializer())
            # Fetch params to validate initial values
            self.assertAllClose([1.0, 2.0], self.evaluate(var0))
            self.assertAllClose([3.0, 4.0], self.evaluate(var1))

            # Run 1 step of sgd through optimizer
            opt_op = sgd.minimize(loss, [var0, var1])
            self.evaluate(variables.global_variables_initializer())
            self.evaluate(opt_op)
            # Validate updated params
            # var0 = [1., 2.] - 1.0 * [5, 5]
            self.assertAllClose([-4., -3.], self.evaluate(var0))
            # var1 = [3., 4.] - 1.0 * [3, 3]
            self.assertAllClose([0., 1.], self.evaluate(var1))

            # Second step with a halved, still constant, learning rate.
            sgd.learning_rate = 0.5
            if not context.executing_eagerly():
                self.evaluate(opt_op)
            else:
                sgd.minimize(loss, trainables)
            # Validate updated params
            # var0 = [-4., -3.] - 0.5 * [5, 5]
            self.assertAllClose([-6.5, -5.5], self.evaluate(var0))
            # var1 = [0., 1.] - 0.5 * [3, 3]
            self.assertAllClose([-1.5, -0.5], self.evaluate(var1))

            # Third step after swapping in a schedule object; only checks
            # that the step executes.
            sgd.learning_rate = learning_rate_schedule.InverseTimeDecay(
                0.5, decay_steps=1.0, decay_rate=0.5)
            if not context.executing_eagerly():
                self.evaluate(opt_op)
            else:
                sgd.minimize(loss, trainables)
def testLearningRateDecayUsedInTwoFunctions(self):
    """Uses one Adam optimizer with a decay schedule in two `function`s.

    Two separately decorated functions each call `minimize` on the same
    optimizer (for different variables) and are then invoked once each; the
    test has no explicit assertions and passes if both calls complete.
    """
    # Both variables intentionally share the name 'var'.
    a = variables.Variable([1., 2.], name='var')
    b = variables.Variable([1.], name='var')

    shared_opt = adam.Adam(
        learning_rate=learning_rate_schedule.InverseTimeDecay(
            0.5, decay_steps=1.0, decay_rate=0.5))

    def loss_a():
        return 3 * a

    def loss_b():
        return 2 * b

    @def_function.function
    def fn_a():
        shared_opt.minimize(loss_a, [a])
        return a

    @def_function.function
    def fn_b():
        shared_opt.minimize(loss_b, [b])
        return b

    fn_a()
    fn_b()
def testDenseWithLearningRateInverseTimeDecay(self):
    """Compares RMSprop driven by an InverseTimeDecay schedule with NumPy.

    Two optimizer steps are run. After each step the "rms" slots, the
    "momentum" slots (only when momentum > 0) and the variables are compared
    with `self._rmsprop_update_numpy`, which is fed the learning rate
    `learning_rate / (1 + decay * t)`.
    """
    # Optimizer configuration.
    learning_rate = 0.01
    rho = 0.9
    momentum = 0.0
    epsilon = 1e-7
    centered = False
    decay = 0.5

    # NumPy reference parameters and gradients.
    var0_np = np.array([1.0, 2.0])
    grads0_np = np.array([0.1, 0.2])
    var1_np = np.array([3.0, 4.0])
    grads1_np = np.array([0.01, 0.2])

    var0 = resource_variable_ops.ResourceVariable(var0_np)
    var1 = resource_variable_ops.ResourceVariable(var1_np)
    grads0 = constant_op.constant(grads0_np)
    grads1 = constant_op.constant(grads1_np)

    lr_schedule = learning_rate_schedule.InverseTimeDecay(
        learning_rate, decay_steps=1.0, decay_rate=decay)
    opt = rmsprop.RMSprop(
        learning_rate=lr_schedule,
        rho=rho,
        momentum=momentum,
        epsilon=epsilon,
        centered=centered)

    update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
    self.evaluate(variables.global_variables_initializer())

    rms0 = opt.get_slot(var0, "rms")
    self.assertIsNotNone(rms0)
    rms1 = opt.get_slot(var1, "rms")
    self.assertIsNotNone(rms1)

    # Momentum slots are only looked up when momentum is in use.
    uses_momentum = momentum > 0.
    mom0 = mom1 = None
    if uses_momentum:
        mom0 = opt.get_slot(var0, "momentum")
        mom1 = opt.get_slot(var1, "momentum")

    # Reference optimizer state, all starting from zero.
    mg0_np = np.array([0.0, 0.0])
    mg1_np = np.array([0.0, 0.0])
    rms0_np = np.array([0.0, 0.0])
    rms1_np = np.array([0.0, 0.0])
    mom0_np = np.array([0.0, 0.0])
    mom1_np = np.array([0.0, 0.0])

    # Fetch params to validate initial values
    self.assertAllClose([1.0, 2.0], self.evaluate(var0))
    self.assertAllClose([3.0, 4.0], self.evaluate(var1))

    # Run 2 steps of RMSprop
    for t in range(2):
        self.evaluate(update)

        lr = learning_rate / (1 + decay * t)
        step_args = (lr, rho, momentum, epsilon, centered)
        var0_np, mg0_np, rms0_np, mom0_np = self._rmsprop_update_numpy(
            var0_np, grads0_np, mg0_np, rms0_np, mom0_np, *step_args)
        var1_np, mg1_np, rms1_np, mom1_np = self._rmsprop_update_numpy(
            var1_np, grads1_np, mg1_np, rms1_np, mom1_np, *step_args)

        # Validate updated params
        self.assertAllCloseAccordingToType(rms0_np, self.evaluate(rms0))
        self.assertAllCloseAccordingToType(rms1_np, self.evaluate(rms1))
        if uses_momentum:
            self.assertAllCloseAccordingToType(mom0_np, self.evaluate(mom0))
            self.assertAllCloseAccordingToType(mom1_np, self.evaluate(mom1))
        self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
        self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
def inverse_time_decay(learning_rate,
                       global_step,
                       decay_steps,
                       decay_rate,
                       staircase=False,
                       name=None):
    """Applies inverse time decay to the initial learning rate.

    When training a model, it is often recommended to lower the learning rate
    as the training progresses. This function applies an inverse decay
    function to a provided initial learning rate. It requires a `global_step`
    value to compute the decayed learning rate. You can just pass a
    TensorFlow variable that you increment at each training step.

    The function returns the decayed learning rate. It is computed as:

    ```python
    decayed_learning_rate = learning_rate / (1 + decay_rate * global_step /
    decay_steps)
    ```

    or, if `staircase` is `True`, as:

    ```python
    decayed_learning_rate = learning_rate / (1 + decay_rate * floor(global_step
    / decay_steps))
    ```

    Example: decay 1/t with a rate of 0.5:

    ```python
    ...
    global_step = tf.Variable(0, trainable=False)
    learning_rate = 0.1
    decay_steps = 1.0
    decay_rate = 0.5
    learning_rate = tf.compat.v1.train.inverse_time_decay(learning_rate,
    global_step,
    decay_steps, decay_rate)

    # Passing global_step to minimize() will increment it at each step.
    learning_step = (
        tf.compat.v1.train.GradientDescentOptimizer(learning_rate)
        .minimize(...my loss..., global_step=global_step)
    )
    ```

    Args:
      learning_rate: A scalar `float32` or `float64` `Tensor` or a Python
        number. The initial learning rate.
      global_step: A Python number. Global step to use for the decay
        computation. Must not be negative.
      decay_steps: How often to apply decay.
      decay_rate: A Python number. The decay rate.
      staircase: Whether to apply decay in a discrete staircase, as opposed
        to continuous, fashion.
      name: String. Optional name of the operation. Defaults to
        'InverseTimeDecay'.

    Returns:
      A scalar `Tensor` of the same type as `learning_rate`. The decayed
      learning rate. Under eager execution a callable is returned instead
      (see the compatibility note below).

    Raises:
      ValueError: if `global_step` is not supplied.
        NOTE(review): this wrapper performs no such check itself; any error
        would have to come from the schedule call - confirm.

    @compatibility(eager)
    When eager execution is enabled, this function returns a function which
    in turn returns the decayed learning rate Tensor. This can be useful for
    changing the learning rate value across different invocations of
    optimizer functions.
    @end_compatibility
    """
    schedule = learning_rate_schedule.InverseTimeDecay(
        learning_rate, decay_steps, decay_rate, staircase=staircase, name=name)

    if context.executing_eagerly():
        # Defer evaluation: each call re-reads `global_step`.
        return functools.partial(schedule, global_step)
    # Graph mode: evaluate the schedule once at `global_step`.
    return schedule(global_step)