Example #1
def test_sharing():
    for dtype in _dtypes_to_test(use_gpu=test_utils.is_gpu_available()):
        # Initialize variables for numpy implementation.
        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)

        var0 = tf.Variable(var0_np)
        var1 = tf.Variable(var1_np)
        grads0 = tf.constant(grads0_np)
        grads1 = tf.constant(grads1_np)
        opt = lamb.LAMB()

        # Fetch params to validate initial values
        np.testing.assert_allclose(np.asanyarray([1.0, 2.0]), var0.numpy())
        np.testing.assert_allclose(np.asanyarray([3.0, 4.0]), var1.numpy())

        # Run 3 steps of LAMB, sharing one optimizer across both variables.
        for t in range(3):
            beta_1_power, beta_2_power = get_beta_accumulators(opt, dtype)
            test_utils.assert_allclose_according_to_type(
                0.9**(t + 1), beta_1_power)
            test_utils.assert_allclose_according_to_type(
                0.999**(t + 1), beta_2_power)

            opt.apply_gradients(zip([grads0, grads1], [var0, var1]))

            var0_np, m0, v0 = lamb_update_numpy(var0_np, grads0_np, t, m0, v0)
            var1_np, m1, v1 = lamb_update_numpy(var1_np, grads1_np, t, m1, v1)

            # Validate updated params
            test_utils.assert_allclose_according_to_type(var0_np, var0.numpy())
            test_utils.assert_allclose_according_to_type(var1_np, var1.numpy())
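Every example in this section validates the optimizer against a NumPy reference named lamb_update_numpy, which lives elsewhere in the test module and is not shown here. The following is only a rough sketch of what such a reference typically computes, following the LAMB update rule (Adam-style moments with bias correction, optional decoupled weight decay via lamb_wd, and a per-layer trust ratio); the exact signature, defaults, and epsilon value are assumptions, not the module's actual helper.

import numpy as np


def lamb_update_numpy(param, g_t, t, m, v, lr=0.001,
                      lamb_wd=0.0, beta1=0.9, beta2=0.999, epsilon=1e-6):
    # Adam-style first and second moment estimates.
    m_t = beta1 * m + (1 - beta1) * g_t
    v_t = beta2 * v + (1 - beta2) * g_t * g_t

    # Bias correction (t is the zero-based step index).
    m_t_hat = m_t / (1 - beta1 ** (t + 1))
    v_t_hat = v_t / (1 - beta2 ** (t + 1))
    update = m_t_hat / (np.sqrt(v_t_hat) + epsilon)

    # Decoupled weight decay, used by the examples that pass lamb_wd.
    update += lamb_wd * param

    # Layer-wise trust ratio ||w|| / ||update||, guarded against zero norms.
    w_norm = np.linalg.norm(param, ord=2)
    g_norm = np.linalg.norm(update, ord=2)
    ratio = np.where(w_norm > 0,
                     np.where(g_norm > 0, w_norm / g_norm, 1.0), 1.0)

    param_t = param - ratio * lr * update
    return param_t, m_t, v_t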
Example #2
def test_exclude_weight_decay():
    opt = lamb.LAMB(0.01,
                    weight_decay=0.01,
                    exclude_from_weight_decay=["var1"])
    assert opt._do_use_weight_decay(tf.Variable([], name="var0"))
    assert not opt._do_use_weight_decay(tf.Variable([], name="var1"))
    assert not opt._do_use_weight_decay(tf.Variable([], name="var1_weight"))
Example #3
def test_exclude_weight_decay():
    opt = lamb.LAMB(0.01,
                    weight_decay_rate=0.01,
                    exclude_from_weight_decay=["var1"])
    assert opt._do_use_weight_decay("var0")
    assert not opt._do_use_weight_decay("var1")
    assert not opt._do_use_weight_decay("var1_weight")
Example #4
    def testBasicWithLearningRateInverseTimeDecay(self):
        for i, dtype in enumerate(
                self._DtypesToTest(use_gpu=tf.test.is_gpu_available())):
            # Initialize variables for numpy implementation.
            m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
            var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
            grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
            var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
            grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)

            var0 = tf.Variable(var0_np, name="var0_%d" % i)
            var1 = tf.Variable(var1_np, name="var1_%d" % i)
            grads0 = tf.constant(grads0_np)
            grads1 = tf.constant(grads1_np)

            learning_rate = 0.001
            decay = 0.5
            lr_schedule = tf.keras.optimizers.schedules.InverseTimeDecay(
                learning_rate, decay_steps=1.0, decay_rate=decay)
            beta_1 = 0.9
            beta_2 = 0.999
            epsilon = 1e-7

            opt = lamb.LAMB(learning_rate=lr_schedule,
                            beta_1=beta_1,
                            beta_2=beta_2,
                            epsilon=epsilon)

            if not tf.executing_eagerly():
                update = opt.apply_gradients(
                    zip([grads0, grads1], [var0, var1]))
                self.evaluate(tf.compat.v1.global_variables_initializer())

            # Run 3 steps of LAMB
            for t in range(3):
                if not tf.executing_eagerly():
                    self.evaluate(update)
                else:
                    opt.apply_gradients(zip([grads0, grads1], [var0, var1]))

                lr_np = learning_rate / (1 + decay * t)

                var0_np, m0, v0 = lamb_update_numpy(var0_np,
                                                    grads0_np,
                                                    t,
                                                    m0,
                                                    v0,
                                                    lr=lr_np)
                var1_np, m1, v1 = lamb_update_numpy(var1_np,
                                                    grads1_np,
                                                    t,
                                                    m1,
                                                    v1,
                                                    lr=lr_np)

                # Validate updated params
                self.assertAllCloseAccordingToType(var0_np,
                                                   self.evaluate(var0))
                self.assertAllCloseAccordingToType(var1_np,
                                                   self.evaluate(var1))
Example #5
def test_serialization():
    optimizer = lamb.LAMB(1e-4,
                          weight_decay_rate=0.01,
                          exclude_from_weight_decay=["var1"])
    config = tf.keras.optimizers.serialize(optimizer)
    new_optimizer = tf.keras.optimizers.deserialize(config)
    assert new_optimizer.get_config() == optimizer.get_config()
Example #6
def test_basic_with_learning_rate_decay():
    for i, dtype in enumerate(
            _dtypes_to_test(use_gpu=test_utils.is_gpu_available())):
        # Initialize variables for numpy implementation.
        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)

        var0 = tf.Variable(var0_np, name="var0_%d" % i)
        var1 = tf.Variable(var1_np, name="var1_%d" % i)
        grads0 = tf.constant(grads0_np)
        grads1 = tf.constant(grads1_np)

        learning_rate = 0.001
        beta_1 = 0.9
        beta_2 = 0.999
        epsilon = 1e-7
        decay = 0.5
        lamb_wd = 0.01

        opt = lamb.LAMB(
            learning_rate=learning_rate,
            beta_1=beta_1,
            beta_2=beta_2,
            epsilon=epsilon,
            weight_decay=lamb_wd,
            decay=decay,
        )

        # Run 3 steps of LAMB
        for t in range(3):
            opt.apply_gradients(zip([grads0, grads1], [var0, var1]))

            lr_np = learning_rate / (1 + decay * t)

            var0_np, m0, v0 = lamb_update_numpy(var0_np,
                                                grads0_np,
                                                t,
                                                m0,
                                                v0,
                                                lr=lr_np,
                                                lamb_wd=lamb_wd)
            var1_np, m1, v1 = lamb_update_numpy(var1_np,
                                                grads1_np,
                                                t,
                                                m1,
                                                v1,
                                                lr=lr_np,
                                                lamb_wd=lamb_wd)

            # Validate updated params
            test_utils.assert_allclose_according_to_type(var0_np, var0.numpy())
            test_utils.assert_allclose_according_to_type(var1_np, var1.numpy())
Example #7
    def doTestBasic(self, use_callable_params=False):
        for i, dtype in enumerate(
                self._DtypesToTest(use_gpu=tf.test.is_gpu_available())):
            # Initialize variables for numpy implementation.
            m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
            var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
            grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
            var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
            grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)

            var0 = tf.Variable(var0_np, name="var0_%d" % i)
            var1 = tf.Variable(var1_np, name="var1_%d" % i)
            grads0 = tf.constant(grads0_np)
            grads1 = tf.constant(grads1_np)

            learning_rate = lambda: 0.001
            beta1 = lambda: 0.9
            beta2 = lambda: 0.999
            epsilon = lambda: 1e-8
            if not use_callable_params:
                learning_rate = learning_rate()
                beta1 = beta1()
                beta2 = beta2()
                epsilon = epsilon()

            opt = lamb.LAMB(learning_rate=learning_rate)
            if not tf.executing_eagerly():
                update = opt.apply_gradients(
                    zip([grads0, grads1], [var0, var1]))
                self.evaluate(tf.compat.v1.global_variables_initializer())

            # Run 3 steps of LAMB
            for t in range(3):
                beta_1_power, beta_2_power = get_beta_accumulators(opt, dtype)
                self.assertAllCloseAccordingToType(0.9**(t + 1),
                                                   self.evaluate(beta_1_power))
                self.assertAllCloseAccordingToType(0.999**(t + 1),
                                                   self.evaluate(beta_2_power))
                if not tf.executing_eagerly():
                    self.evaluate(update)
                else:
                    opt.apply_gradients(zip([grads0, grads1], [var0, var1]))

                var0_np, m0, v0 = lamb_update_numpy(var0_np, grads0_np, t, m0,
                                                    v0)
                var1_np, m1, v1 = lamb_update_numpy(var1_np, grads1_np, t, m1,
                                                    v1)

                # Validate updated params
                self.assertAllCloseAccordingToType(var0_np,
                                                   self.evaluate(var0))
                self.assertAllCloseAccordingToType(var1_np,
                                                   self.evaluate(var1))
Example #8
    def testSharing(self):
        for dtype in self._DtypesToTest(use_gpu=tf.test.is_gpu_available()):
            # Initialize variables for numpy implementation.
            m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
            var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
            grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
            var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
            grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)

            var0 = tf.Variable(var0_np)
            var1 = tf.Variable(var1_np)
            grads0 = tf.constant(grads0_np)
            grads1 = tf.constant(grads1_np)
            opt = lamb.LAMB()

            if not tf.executing_eagerly():
                update1 = opt.apply_gradients(
                    zip([grads0, grads1], [var0, var1]))
                update2 = opt.apply_gradients(
                    zip([grads0, grads1], [var0, var1]))
                self.evaluate(tf.compat.v1.global_variables_initializer())

            # Fetch params to validate initial values
            self.assertAllClose([1.0, 2.0], self.evaluate(var0))
            self.assertAllClose([3.0, 4.0], self.evaluate(var1))

            # Run 3 steps of intertwined LAMB1 and LAMB2.
            for t in range(3):
                beta_1_power, beta_2_power = get_beta_accumulators(opt, dtype)
                self.assertAllCloseAccordingToType(0.9**(t + 1),
                                                   self.evaluate(beta_1_power))
                self.assertAllCloseAccordingToType(0.999**(t + 1),
                                                   self.evaluate(beta_2_power))

                if not tf.executing_eagerly():
                    if t % 2 == 0:
                        update1.run()
                    else:
                        update2.run()
                else:
                    opt.apply_gradients(zip([grads0, grads1], [var0, var1]))

                var0_np, m0, v0 = lamb_update_numpy(var0_np, grads0_np, t, m0,
                                                    v0)
                var1_np, m1, v1 = lamb_update_numpy(var1_np, grads1_np, t, m1,
                                                    v1)

                # Validate updated params
                self.assertAllCloseAccordingToType(var0_np,
                                                   self.evaluate(var0))
                self.assertAllCloseAccordingToType(var1_np,
                                                   self.evaluate(var1))
Example #9
    def testSparse(self):
        for dtype in self._DtypesToTest(use_gpu=tf.test.is_gpu_available()):
            # Initialize variables for numpy implementation.
            m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
            var0_np = np.array([1.0, 1.0, 2.0], dtype=dtype.as_numpy_dtype)
            grads0_np = np.array([0.1, 0.0, 0.1], dtype=dtype.as_numpy_dtype)
            var1_np = np.array([3.0, 3.0, 4.0], dtype=dtype.as_numpy_dtype)
            grads1_np = np.array([0.01, 0.0, 0.01], dtype=dtype.as_numpy_dtype)

            var0 = tf.Variable(var0_np)
            var1 = tf.Variable(var1_np)
            grads0_np_indices = np.array([0, 2], dtype=np.int32)
            grads0 = tf.IndexedSlices(
                tf.constant(grads0_np[grads0_np_indices]),
                tf.constant(grads0_np_indices),
                tf.constant([3]),
            )
            grads1_np_indices = np.array([0, 2], dtype=np.int32)
            grads1 = tf.IndexedSlices(
                tf.constant(grads1_np[grads1_np_indices]),
                tf.constant(grads1_np_indices),
                tf.constant([3]),
            )
            opt = lamb.LAMB()
            if not tf.executing_eagerly():
                update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
                self.evaluate(tf.compat.v1.global_variables_initializer())

            # Fetch params to validate initial values
            self.assertAllClose([1.0, 1.0, 2.0], self.evaluate(var0))
            self.assertAllClose([3.0, 3.0, 4.0], self.evaluate(var1))

            # Run 3 steps of LAMB
            for t in range(3):
                beta_1_power, beta_2_power = get_beta_accumulators(opt, dtype)
                self.assertAllCloseAccordingToType(
                    0.9 ** (t + 1), self.evaluate(beta_1_power)
                )
                self.assertAllCloseAccordingToType(
                    0.999 ** (t + 1), self.evaluate(beta_2_power)
                )
                if not tf.executing_eagerly():
                    self.evaluate(update)
                else:
                    opt.apply_gradients(zip([grads0, grads1], [var0, var1]))

                var0_np, m0, v0 = lamb_update_numpy(var0_np, grads0_np, t, m0, v0)
                var1_np, m1, v1 = lamb_update_numpy(var1_np, grads1_np, t, m1, v1)

                # Validate updated params
                self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
                self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
Example #10
def test_minimize_mean_square_loss_with_weight_decay():
    w = tf.Variable([0.1, -0.2, -0.1])
    x = tf.constant([0.4, 0.2, -0.5])

    def loss():
        return tf.reduce_mean(tf.square(x - w))

    opt = lamb.LAMB(0.02, weight_decay=0.01)

    # Run 200 steps
    for _ in range(200):
        opt.minimize(loss, [w])
    # Validate updated params
    np.testing.assert_allclose(w.numpy(),
                               np.asanyarray([0.4, 0.2, -0.5]),
                               rtol=1e-2,
                               atol=1e-2)
Example #11
    def testMinimizeMeanSquareLossWithWeightDecay(self):
        w = tf.Variable([0.1, -0.2, -0.1])
        x = tf.constant([0.4, 0.2, -0.5])
        loss = lambda: tf.reduce_mean(tf.square(x - w))
        opt = lamb.LAMB(0.02, weight_decay_rate=0.01)

        if not tf.executing_eagerly():
            op = opt.minimize(loss, [w])
            self.evaluate(tf.compat.v1.global_variables_initializer())

        # Run 200 steps
        for _ in range(200):
            if tf.executing_eagerly():
                opt.minimize(loss, [w])
            else:
                self.evaluate(op)
        # Validate updated params
        self.assertAllClose(self.evaluate(w), [0.4, 0.2, -0.5],
                            rtol=1e-2,
                            atol=1e-2)
Example #12
def test_resource():
    for i, dtype in enumerate(
            _dtypes_to_test(use_gpu=test_utils.is_gpu_available())):
        # Initialize variables for numpy implementation.
        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)

        var0 = tf.Variable(var0_np, name="var0_%d" % i)
        var1 = tf.Variable(var1_np, name="var1_%d" % i)
        grads0 = tf.constant(grads0_np)
        grads1 = tf.constant(grads1_np)

        def learning_rate():
            return 0.001

        opt = lamb.LAMB(learning_rate=learning_rate)

        # Run 3 steps of LAMB
        for t in range(3):
            beta_1_power, beta_2_power = get_beta_accumulators(opt, dtype)
            test_utils.assert_allclose_according_to_type(
                0.9**(t + 1), beta_1_power)
            test_utils.assert_allclose_according_to_type(
                0.999**(t + 1), beta_2_power)

            opt.apply_gradients(zip([grads0, grads1], [var0, var1]))

            var0_np, m0, v0 = lamb_update_numpy(var0_np, grads0_np, t, m0, v0)
            var1_np, m1, v1 = lamb_update_numpy(var1_np, grads1_np, t, m1, v1)

            # Validate updated params
            test_utils.assert_allclose_according_to_type(var0_np, var0.numpy())
            test_utils.assert_allclose_according_to_type(var1_np, var1.numpy())
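The helpers _dtypes_to_test (or self._DtypesToTest in the TestCase-style examples) and get_beta_accumulators are likewise test-module utilities that these snippets assume. Below is a minimal sketch of how they could be implemented, assuming the optimizer stores hyperparameters the way tf.keras.optimizers optimizers do; the dtype selection and the private _get_hyper access are assumptions rather than the actual helpers.

import tensorflow as tf


def _dtypes_to_test(use_gpu):
    # Dtypes to exercise; the exact CPU/GPU split here is an assumption.
    if use_gpu:
        return [tf.dtypes.float32, tf.dtypes.float64]
    return [tf.dtypes.half, tf.dtypes.float32, tf.dtypes.float64]


def get_beta_accumulators(opt, dtype):
    # Reconstruct beta_1**step and beta_2**step from the optimizer's
    # iteration counter and hyperparameters.
    local_step = tf.cast(opt.iterations + 1, dtype)
    beta_1_t = tf.cast(opt._get_hyper("beta_1"), dtype)
    beta_2_t = tf.cast(opt._get_hyper("beta_2"), dtype)
    return tf.pow(beta_1_t, local_step), tf.pow(beta_2_t, local_step)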
Example #13
def test_weight_decay_rate_deprecation():
    with pytest.deprecated_call():
        opt = lamb.LAMB(0.01, weight_decay_rate=0.01)
        config = opt.get_config()
        assert config["weight_decay"] == 0.01
Example #14
def test_exclude_layer_adaptation():
    opt = lamb.LAMB(0.01, exclude_from_layer_adaptation=["var1"])
    assert opt._do_layer_adaptation(tf.Variable([], name="var0"))
    assert not opt._do_layer_adaptation(tf.Variable([], name="var1"))
    assert not opt._do_layer_adaptation(tf.Variable([], name="var1_weight"))
Example #15
    def test_get_config(self):
        opt = lamb.LAMB(1e-4)
        config = opt.get_config()
        self.assertEqual(config["learning_rate"], 1e-4)
Example #16
def test_get_config():
    opt = lamb.LAMB(1e-4)
    config = opt.get_config()
    assert config["learning_rate"] == 1e-4
Example #17
def test_exclude_layer_adaptation():
    opt = lamb.LAMB(0.01, exclude_from_layer_adaptation=["var1"])
    assert opt._do_layer_adaptation("var0")
    assert not opt._do_layer_adaptation("var1")
    assert not opt._do_layer_adaptation("var1_weight")
Example #18
def test_serialization():
    optimizer = lamb.LAMB(1e-4)
    config = tf.keras.optimizers.serialize(optimizer)
    new_optimizer = tf.keras.optimizers.deserialize(config)
    assert new_optimizer.get_config() == optimizer.get_config()
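Since the serialization examples round-trip the optimizer through tf.keras.optimizers.serialize/deserialize, the same LAMB instance can be dropped into an ordinary Keras training setup. The snippet below is a minimal usage sketch; the model, input shape, and excluded-variable names are placeholders and not part of the tests above.

import tensorflow as tf

model = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation="relu", input_shape=(32,)),
    tf.keras.layers.Dense(1),
])

# Configured like the tests: a small learning rate, decoupled weight decay,
# and name-based exclusion of some variables from weight decay.
optimizer = lamb.LAMB(
    learning_rate=1e-3,
    weight_decay=0.01,
    exclude_from_weight_decay=["bias"],
)

model.compile(optimizer=optimizer, loss="mse")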