def test_sharing():
    """Apply one LAMB optimizer three times and compare with the numpy update.

    For every dtype returned by ``_dtypes_to_test`` two variables are updated
    with constant gradients.  Before each step the beta accumulators are
    checked against ``0.9 ** (t + 1)`` and ``0.999 ** (t + 1)``; after each
    step the variables are compared with ``lamb_update_numpy``.
    """
    for dtype in _dtypes_to_test(use_gpu=test_utils.is_gpu_available()):
        np_dtype = dtype.as_numpy_dtype

        # Numpy-side state: parameters, gradients and moment accumulators.
        var0_np = np.array([1.0, 2.0], dtype=np_dtype)
        grads0_np = np.array([0.1, 0.1], dtype=np_dtype)
        var1_np = np.array([3.0, 4.0], dtype=np_dtype)
        grads1_np = np.array([0.01, 0.01], dtype=np_dtype)
        m0 = v0 = m1 = v1 = 0.0

        # TensorFlow-side counterparts.
        var0 = tf.Variable(var0_np)
        var1 = tf.Variable(var1_np)
        grads0 = tf.constant(grads0_np)
        grads1 = tf.constant(grads1_np)
        opt = lamb.LAMB()

        # The variables must still hold their initial values.
        np.testing.assert_allclose(np.asanyarray([1.0, 2.0]), var0.numpy())
        np.testing.assert_allclose(np.asanyarray([3.0, 4.0]), var1.numpy())

        for step in range(3):
            beta_1_power, beta_2_power = get_beta_accumulators(opt, dtype)
            test_utils.assert_allclose_according_to_type(
                0.9 ** (step + 1), beta_1_power
            )
            test_utils.assert_allclose_according_to_type(
                0.999 ** (step + 1), beta_2_power
            )

            opt.apply_gradients(zip([grads0, grads1], [var0, var1]))

            var0_np, m0, v0 = lamb_update_numpy(var0_np, grads0_np, step, m0, v0)
            var1_np, m1, v1 = lamb_update_numpy(var1_np, grads1_np, step, m1, v1)

            # Optimizer result and numpy result must agree after every step.
            test_utils.assert_allclose_according_to_type(var0_np, var0.numpy())
            test_utils.assert_allclose_according_to_type(var1_np, var1.numpy())
def test_exclude_weight_decay():
    """Check ``_do_use_weight_decay`` when ``"var1"`` is listed for exclusion.

    A variable named ``var0`` must use weight decay, while variables named
    ``var1`` and ``var1_weight`` must not.
    """
    opt = lamb.LAMB(
        0.01,
        weight_decay=0.01,
        exclude_from_weight_decay=["var1"],
    )

    decayed = tf.Variable([], name="var0")
    assert opt._do_use_weight_decay(decayed)

    excluded_exact = tf.Variable([], name="var1")
    assert not opt._do_use_weight_decay(excluded_exact)

    excluded_prefixed = tf.Variable([], name="var1_weight")
    assert not opt._do_use_weight_decay(excluded_prefixed)
def test_exclude_weight_decay():
    """Check ``_do_use_weight_decay`` on plain names with ``"var1"`` excluded.

    The name ``var0`` must use weight decay, while ``var1`` and
    ``var1_weight`` must not.
    """
    opt = lamb.LAMB(
        0.01,
        weight_decay_rate=0.01,
        exclude_from_weight_decay=["var1"],
    )

    uses_decay = opt._do_use_weight_decay
    assert uses_decay("var0")
    assert not uses_decay("var1")
    assert not uses_decay("var1_weight")
def testBasicWithLearningRateInverseTimeDecay(self):
    """Compare LAMB driven by an InverseTimeDecay schedule with the numpy update.

    For every dtype from ``self._DtypesToTest`` two variables are updated for
    three steps (graph or eager mode).  After each step they are compared with
    ``lamb_update_numpy`` called with ``lr = learning_rate / (1 + decay * t)``.
    """
    dtypes = self._DtypesToTest(use_gpu=tf.test.is_gpu_available())
    for i, dtype in enumerate(dtypes):
        np_dtype = dtype.as_numpy_dtype

        # Numpy-side state: parameters, gradients and moment accumulators.
        var0_np = np.array([1.0, 2.0], dtype=np_dtype)
        grads0_np = np.array([0.1, 0.1], dtype=np_dtype)
        var1_np = np.array([3.0, 4.0], dtype=np_dtype)
        grads1_np = np.array([0.01, 0.01], dtype=np_dtype)
        m0 = v0 = m1 = v1 = 0.0

        var0 = tf.Variable(var0_np, name="var0_%d" % i)
        var1 = tf.Variable(var1_np, name="var1_%d" % i)
        grads0 = tf.constant(grads0_np)
        grads1 = tf.constant(grads1_np)

        learning_rate = 0.001
        decay = 0.5
        lr_schedule = tf.keras.optimizers.schedules.InverseTimeDecay(
            learning_rate, decay_steps=1.0, decay_rate=decay
        )
        opt = lamb.LAMB(
            learning_rate=lr_schedule,
            beta_1=0.9,
            beta_2=0.999,
            epsilon=1e-7,
        )

        # In graph mode the update op is built once and re-run every step.
        if not tf.executing_eagerly():
            update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))

        self.evaluate(tf.compat.v1.global_variables_initializer())

        for step in range(3):
            if tf.executing_eagerly():
                opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
            else:
                self.evaluate(update)

            # Learning rate the numpy update uses for this step.
            lr_np = learning_rate / (1 + decay * step)

            var0_np, m0, v0 = lamb_update_numpy(
                var0_np, grads0_np, step, m0, v0, lr=lr_np
            )
            var1_np, m1, v1 = lamb_update_numpy(
                var1_np, grads1_np, step, m1, v1, lr=lr_np
            )

            self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
            self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
def test_serialization():
    """Round-trip a LAMB optimizer through Keras serialize/deserialize.

    The optimizer is built with a weight decay rate and an exclusion list; the
    config of the deserialized optimizer must equal the original config.
    """
    original = lamb.LAMB(
        1e-4,
        weight_decay_rate=0.01,
        exclude_from_weight_decay=["var1"],
    )
    serialized = tf.keras.optimizers.serialize(original)
    restored = tf.keras.optimizers.deserialize(serialized)
    assert restored.get_config() == original.get_config()
def test_basic_with_learning_rate_decay():
    """Compare LAMB using ``decay`` and ``weight_decay`` with the numpy update.

    For every dtype from ``_dtypes_to_test`` two variables are updated for
    three steps.  After each step they are compared with ``lamb_update_numpy``
    called with ``lr = learning_rate / (1 + decay * t)`` and the same
    ``lamb_wd`` value that was passed to the optimizer as ``weight_decay``.
    """
    dtypes = _dtypes_to_test(use_gpu=test_utils.is_gpu_available())
    for i, dtype in enumerate(dtypes):
        np_dtype = dtype.as_numpy_dtype

        # Numpy-side state: parameters, gradients and moment accumulators.
        var0_np = np.array([1.0, 2.0], dtype=np_dtype)
        grads0_np = np.array([0.1, 0.1], dtype=np_dtype)
        var1_np = np.array([3.0, 4.0], dtype=np_dtype)
        grads1_np = np.array([0.01, 0.01], dtype=np_dtype)
        m0 = v0 = m1 = v1 = 0.0

        var0 = tf.Variable(var0_np, name="var0_%d" % i)
        var1 = tf.Variable(var1_np, name="var1_%d" % i)
        grads0 = tf.constant(grads0_np)
        grads1 = tf.constant(grads1_np)

        # Values shared by the optimizer and the numpy update.
        learning_rate = 0.001
        decay = 0.5
        lamb_wd = 0.01

        opt = lamb.LAMB(
            learning_rate=learning_rate,
            beta_1=0.9,
            beta_2=0.999,
            epsilon=1e-7,
            weight_decay=lamb_wd,
            decay=decay,
        )

        for step in range(3):
            opt.apply_gradients(zip([grads0, grads1], [var0, var1]))

            # Learning rate the numpy update uses for this step.
            lr_np = learning_rate / (1 + decay * step)

            var0_np, m0, v0 = lamb_update_numpy(
                var0_np, grads0_np, step, m0, v0, lr=lr_np, lamb_wd=lamb_wd
            )
            var1_np, m1, v1 = lamb_update_numpy(
                var1_np, grads1_np, step, m1, v1, lr=lr_np, lamb_wd=lamb_wd
            )

            test_utils.assert_allclose_according_to_type(var0_np, var0.numpy())
            test_utils.assert_allclose_according_to_type(var1_np, var1.numpy())
def doTestBasic(self, use_callable_params=False):
    """Run three LAMB steps per dtype and compare with the numpy update.

    Before each step the beta accumulators are checked against
    ``0.9 ** (t + 1)`` and ``0.999 ** (t + 1)``; after each step both
    variables are compared with ``lamb_update_numpy``.

    Args:
        use_callable_params: When true, the learning rate is handed to the
            optimizer as a zero-argument callable returning ``0.001``;
            otherwise the plain float ``0.001`` is passed.
    """
    for i, dtype in enumerate(
        self._DtypesToTest(use_gpu=tf.test.is_gpu_available())
    ):
        # Initialize variables for numpy implementation.
        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)

        var0 = tf.Variable(var0_np, name="var0_%d" % i)
        var1 = tf.Variable(var1_np, name="var1_%d" % i)
        grads0 = tf.constant(grads0_np)
        grads1 = tf.constant(grads1_np)

        # Only the learning rate is forwarded to the optimizer, so it is the
        # only parameter that needs a callable and a plain variant.
        learning_rate = (lambda: 0.001) if use_callable_params else 0.001

        opt = lamb.LAMB(learning_rate=learning_rate)

        # In graph mode the update op is built once and re-run every step.
        if not tf.executing_eagerly():
            update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))

        self.evaluate(tf.compat.v1.global_variables_initializer())

        # Run 3 steps of LAMB
        for t in range(3):
            beta_1_power, beta_2_power = get_beta_accumulators(opt, dtype)
            self.assertAllCloseAccordingToType(
                0.9 ** (t + 1), self.evaluate(beta_1_power)
            )
            self.assertAllCloseAccordingToType(
                0.999 ** (t + 1), self.evaluate(beta_2_power)
            )

            if not tf.executing_eagerly():
                self.evaluate(update)
            else:
                opt.apply_gradients(zip([grads0, grads1], [var0, var1]))

            var0_np, m0, v0 = lamb_update_numpy(var0_np, grads0_np, t, m0, v0)
            var1_np, m1, v1 = lamb_update_numpy(var1_np, grads1_np, t, m1, v1)

            # Validate updated params
            self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
            self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
def testSharing(self):
    """Check that two update ops built from one LAMB optimizer share its state.

    In graph mode two update ops are created from the same optimizer and run
    alternately (even steps run the first, odd steps the second); in eager
    mode the optimizer is applied directly.  Before each step the beta
    accumulators are checked against ``0.9 ** (t + 1)`` and
    ``0.999 ** (t + 1)``; after each step both variables are compared with
    ``lamb_update_numpy``.
    """
    for dtype in self._DtypesToTest(use_gpu=tf.test.is_gpu_available()):
        # Initialize variables for numpy implementation.
        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)

        var0 = tf.Variable(var0_np)
        var1 = tf.Variable(var1_np)
        grads0 = tf.constant(grads0_np)
        grads1 = tf.constant(grads1_np)
        opt = lamb.LAMB()

        if not tf.executing_eagerly():
            update1 = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
            update2 = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))

        self.evaluate(tf.compat.v1.global_variables_initializer())

        # Fetch params to validate initial values
        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
        self.assertAllClose([3.0, 4.0], self.evaluate(var1))

        # Run 3 steps, alternating between the two update ops in graph mode.
        for t in range(3):
            beta_1_power, beta_2_power = get_beta_accumulators(opt, dtype)
            self.assertAllCloseAccordingToType(
                0.9 ** (t + 1), self.evaluate(beta_1_power)
            )
            self.assertAllCloseAccordingToType(
                0.999 ** (t + 1), self.evaluate(beta_2_power)
            )

            if not tf.executing_eagerly():
                # Run the op through self.evaluate, like the other tests in
                # this file, instead of Operation.run(), so the step does not
                # rely on an implicit default session being installed.
                self.evaluate(update1 if t % 2 == 0 else update2)
            else:
                opt.apply_gradients(zip([grads0, grads1], [var0, var1]))

            var0_np, m0, v0 = lamb_update_numpy(var0_np, grads0_np, t, m0, v0)
            var1_np, m1, v1 = lamb_update_numpy(var1_np, grads1_np, t, m1, v1)

            # Validate updated params
            self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
            self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
def testSparse(self):
    """Compare LAMB on ``tf.IndexedSlices`` gradients with the numpy update.

    The gradients handed to the optimizer contain only rows 0 and 2 of the
    dense numpy gradients (row 1 of the dense gradients is 0.0).  Before each
    step the beta accumulators are checked against ``0.9 ** (t + 1)`` and
    ``0.999 ** (t + 1)``; after each step both variables are compared with
    ``lamb_update_numpy`` fed the dense gradients.
    """

    def to_indexed_slices(dense_grad):
        # Keep rows 0 and 2 of a length-3 dense gradient.
        indices = np.array([0, 2], dtype=np.int32)
        return tf.IndexedSlices(
            tf.constant(dense_grad[indices]),
            tf.constant(indices),
            tf.constant([3]),
        )

    for dtype in self._DtypesToTest(use_gpu=tf.test.is_gpu_available()):
        np_dtype = dtype.as_numpy_dtype

        # Numpy-side state: parameters, dense gradients and moments.
        var0_np = np.array([1.0, 1.0, 2.0], dtype=np_dtype)
        grads0_np = np.array([0.1, 0.0, 0.1], dtype=np_dtype)
        var1_np = np.array([3.0, 3.0, 4.0], dtype=np_dtype)
        grads1_np = np.array([0.01, 0.0, 0.01], dtype=np_dtype)
        m0 = v0 = m1 = v1 = 0.0

        var0 = tf.Variable(var0_np)
        var1 = tf.Variable(var1_np)
        grads0 = to_indexed_slices(grads0_np)
        grads1 = to_indexed_slices(grads1_np)
        opt = lamb.LAMB()

        # In graph mode the update op is built once and re-run every step.
        if not tf.executing_eagerly():
            update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))

        self.evaluate(tf.compat.v1.global_variables_initializer())

        # The variables must still hold their initial values.
        self.assertAllClose([1.0, 1.0, 2.0], self.evaluate(var0))
        self.assertAllClose([3.0, 3.0, 4.0], self.evaluate(var1))

        for step in range(3):
            beta_1_power, beta_2_power = get_beta_accumulators(opt, dtype)
            self.assertAllCloseAccordingToType(
                0.9 ** (step + 1), self.evaluate(beta_1_power)
            )
            self.assertAllCloseAccordingToType(
                0.999 ** (step + 1), self.evaluate(beta_2_power)
            )

            if tf.executing_eagerly():
                opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
            else:
                self.evaluate(update)

            var0_np, m0, v0 = lamb_update_numpy(var0_np, grads0_np, step, m0, v0)
            var1_np, m1, v1 = lamb_update_numpy(var1_np, grads1_np, step, m1, v1)

            self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
            self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
def test_minimize_mean_square_loss_with_weight_decay():
    """Minimize a mean-square loss with LAMB (weight decay 0.01) for 200 steps.

    Afterwards ``w`` must equal the target ``[0.4, 0.2, -0.5]`` within
    ``rtol=1e-2`` and ``atol=1e-2``.
    """
    target = [0.4, 0.2, -0.5]
    w = tf.Variable([0.1, -0.2, -0.1])
    x = tf.constant(target)

    def loss():
        squared_error = tf.square(x - w)
        return tf.reduce_mean(squared_error)

    opt = lamb.LAMB(0.02, weight_decay=0.01)

    num_steps = 200
    for _ in range(num_steps):
        opt.minimize(loss, [w])

    np.testing.assert_allclose(
        w.numpy(),
        np.asanyarray(target),
        rtol=1e-2,
        atol=1e-2,
    )
def testMinimizeMeanSquareLossWithWeightDecay(self):
    """Minimize a mean-square loss with LAMB (weight decay rate 0.01).

    Runs 200 minimization steps in graph or eager mode and checks that ``w``
    ends up equal to ``[0.4, 0.2, -0.5]`` within ``rtol=1e-2`` and
    ``atol=1e-2``.
    """
    w = tf.Variable([0.1, -0.2, -0.1])
    x = tf.constant([0.4, 0.2, -0.5])

    def loss():
        return tf.reduce_mean(tf.square(x - w))

    opt = lamb.LAMB(0.02, weight_decay_rate=0.01)

    # In graph mode the minimize op is built once and re-run every step.
    if not tf.executing_eagerly():
        op = opt.minimize(loss, [w])

    # Evaluate the initializer exactly once, after the minimize op has been
    # built; the second back-to-back evaluation was redundant.
    self.evaluate(tf.compat.v1.global_variables_initializer())

    # Run 200 steps
    for _ in range(200):
        if tf.executing_eagerly():
            opt.minimize(loss, [w])
        else:
            self.evaluate(op)

    # Validate updated params
    self.assertAllClose(
        self.evaluate(w), [0.4, 0.2, -0.5], rtol=1e-2, atol=1e-2
    )
def test_resource():
    """Compare LAMB given a callable learning rate with the numpy update.

    For every dtype from ``_dtypes_to_test`` two variables are updated for
    three steps.  Before each step the beta accumulators are checked against
    ``0.9 ** (t + 1)`` and ``0.999 ** (t + 1)``; after each step the variables
    are compared with ``lamb_update_numpy``.
    """

    def learning_rate():
        return 0.001

    dtypes = _dtypes_to_test(use_gpu=test_utils.is_gpu_available())
    for i, dtype in enumerate(dtypes):
        np_dtype = dtype.as_numpy_dtype

        # Numpy-side state: parameters, gradients and moment accumulators.
        var0_np = np.array([1.0, 2.0], dtype=np_dtype)
        grads0_np = np.array([0.1, 0.1], dtype=np_dtype)
        var1_np = np.array([3.0, 4.0], dtype=np_dtype)
        grads1_np = np.array([0.01, 0.01], dtype=np_dtype)
        m0 = v0 = m1 = v1 = 0.0

        var0 = tf.Variable(var0_np, name="var0_%d" % i)
        var1 = tf.Variable(var1_np, name="var1_%d" % i)
        grads0 = tf.constant(grads0_np)
        grads1 = tf.constant(grads1_np)

        opt = lamb.LAMB(learning_rate=learning_rate)

        for step in range(3):
            beta_1_power, beta_2_power = get_beta_accumulators(opt, dtype)
            test_utils.assert_allclose_according_to_type(
                0.9 ** (step + 1), beta_1_power
            )
            test_utils.assert_allclose_according_to_type(
                0.999 ** (step + 1), beta_2_power
            )

            opt.apply_gradients(zip([grads0, grads1], [var0, var1]))

            var0_np, m0, v0 = lamb_update_numpy(var0_np, grads0_np, step, m0, v0)
            var1_np, m1, v1 = lamb_update_numpy(var1_np, grads1_np, step, m1, v1)

            # Optimizer result and numpy result must agree after every step.
            test_utils.assert_allclose_according_to_type(var0_np, var0.numpy())
            test_utils.assert_allclose_according_to_type(var1_np, var1.numpy())
def test_weight_decay_rate_deprecation():
    """Check that passing ``weight_decay_rate`` emits a deprecation warning.

    The value must still show up in the optimizer config under the
    ``"weight_decay"`` key.
    """
    with pytest.deprecated_call():
        opt = lamb.LAMB(0.01, weight_decay_rate=0.01)

    assert opt.get_config()["weight_decay"] == 0.01
def test_exclude_layer_adaptation():
    """Check ``_do_layer_adaptation`` when ``"var1"`` is listed for exclusion.

    A variable named ``var0`` must get layer adaptation, while variables named
    ``var1`` and ``var1_weight`` must not.
    """
    opt = lamb.LAMB(0.01, exclude_from_layer_adaptation=["var1"])

    adapted = tf.Variable([], name="var0")
    assert opt._do_layer_adaptation(adapted)

    excluded_exact = tf.Variable([], name="var1")
    assert not opt._do_layer_adaptation(excluded_exact)

    excluded_prefixed = tf.Variable([], name="var1_weight")
    assert not opt._do_layer_adaptation(excluded_prefixed)
def test_get_config(self):
    """The config of a LAMB optimizer reports the learning rate it was given."""
    expected_lr = 1e-4
    config = lamb.LAMB(expected_lr).get_config()
    self.assertEqual(config['learning_rate'], expected_lr)
def test_get_config():
    """The config of a LAMB optimizer reports the learning rate it was given."""
    expected_lr = 1e-4
    config = lamb.LAMB(expected_lr).get_config()
    assert config["learning_rate"] == expected_lr
def test_exclude_layer_adaptation():
    """Check ``_do_layer_adaptation`` on plain names with ``"var1"`` excluded.

    The name ``var0`` must get layer adaptation, while ``var1`` and
    ``var1_weight`` must not.
    """
    opt = lamb.LAMB(0.01, exclude_from_layer_adaptation=["var1"])

    adapts = opt._do_layer_adaptation
    assert adapts("var0")
    assert not adapts("var1")
    assert not adapts("var1_weight")
def test_serialization():
    """Round-trip a LAMB optimizer through Keras serialize/deserialize.

    The config of the deserialized optimizer must equal the original config.
    """
    original = lamb.LAMB(1e-4)
    serialized = tf.keras.optimizers.serialize(original)
    restored = tf.keras.optimizers.deserialize(serialized)
    assert restored.get_config() == original.get_config()