Example #1
    def testBasicMatrix(self, use_resource_var):
        """Check update when gradient is a matrix."""
        size = [10, 5]
        init_var_np = np.zeros(size)
        grad_np = np.random.rand(size[0], size[1])
        grad_np_2 = np.random.rand(size[0], size[1])

        with self.cached_session() as sess:
            global_step = variables.Variable(0,
                                             dtype=dtypes.int64,
                                             use_resource=use_resource_var)
            var = variables.Variable(init_var_np,
                                     dtype=dtypes.float32,
                                     use_resource=use_resource_var)
            grad = constant_op.constant(grad_np, dtype=dtypes.float32)
            grad_2 = constant_op.constant(grad_np_2, dtype=dtypes.float32)

            opt = shampoo.ShampooOptimizer(global_step)
            update = opt.apply_gradients(zip([grad], [var]),
                                         global_step=global_step)
            update_2 = opt.apply_gradients(zip([grad_2], [var]),
                                           global_step=global_step)
            variables.global_variables_initializer().run()

            init_val = sess.run(var)
            self.assertAllCloseAccordingToType(init_var_np, init_val)

            # Run a step of Shampoo
            update.run()
            new_val = sess.run(var)

            # let us compute this in numpy
            # Update rule is var = var - lr * mat_g1^{-0.25} * grad * mat_g2^{-0.25}
            # lr = 1
            mat_g1 = np.dot(grad_np, grad_np.transpose())
            mat_left = np_power(mat_g1 + 0.1 * np.eye(size[0]), -0.25)
            mat_g2 = np.dot(grad_np.transpose(), grad_np)
            mat_right = np_power(mat_g2 + 0.1 * np.eye(size[1]), -0.25)
            new_val_np = init_var_np - np.dot(np.dot(mat_left, grad_np),
                                              mat_right)

            self.assertAllCloseAccordingToType(new_val_np,
                                               new_val,
                                               atol=TOLERANCE,
                                               rtol=TOLERANCE)

            # Run another step of Shampoo
            update_2.run()
            new_val = sess.run(var)

            mat_g1 += np.dot(grad_np_2, grad_np_2.transpose())
            mat_left = np_power(mat_g1 + 0.1 * np.eye(size[0]), -0.25)
            mat_g2 += np.dot(grad_np_2.transpose(), grad_np_2)
            mat_right = np_power(mat_g2 + 0.1 * np.eye(size[1]), -0.25)
            new_val_np -= np.dot(np.dot(mat_left, grad_np_2), mat_right)

            self.assertAllCloseAccordingToType(new_val_np,
                                               new_val,
                                               atol=TOLERANCE,
                                               rtol=TOLERANCE)
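
These snippets reference np_power, TOLERANCE, and RIDGE_EPSILON without defining them, and rely on TF 1.x internal modules for variables, constant_op, dtypes, array_ops, and ops. Below is a minimal sketch of the assumed scaffolding; the constant values and import paths are assumptions based on the contrib-era TF 1.x layout, not taken from the snippets themselves.

import numpy as np

# TF 1.x internal modules the snippets use (paths assumed):
from tensorflow.contrib.opt.python.training import shampoo
from tensorflow.python.framework import constant_op
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import variables

TOLERANCE = 1e-3       # assumed test tolerance
RIDGE_EPSILON = 1e-4   # assumed ridge term added before taking matrix roots


def np_power(mat_g, alpha):
    """Computes mat_g^alpha for a square, symmetric PSD matrix via SVD."""
    mat_u, diag_d, mat_v = np.linalg.svd(mat_g)
    return np.dot(np.dot(mat_u, np.diag(np.power(diag_d, alpha))), mat_v)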
Example #2
    def testBasicVector(self, use_resource_var):
        """Similar to the full Adagrad update."""

        size = 20
        init_var_np = np.zeros(size)
        grad_np = np.random.rand(size)
        grad_np_2 = np.random.rand(size)

        with self.cached_session() as sess:
            global_step = variables.VariableV1(0,
                                               dtype=dtypes.int64,
                                               use_resource=use_resource_var)
            var = variables.VariableV1(init_var_np,
                                       dtype=dtypes.float32,
                                       use_resource=use_resource_var)
            grad = constant_op.constant(grad_np, dtype=dtypes.float32)
            grad_2 = constant_op.constant(grad_np_2, dtype=dtypes.float32)

            opt = shampoo.ShampooOptimizer(global_step)
            update = opt.apply_gradients(zip([grad], [var]),
                                         global_step=global_step)
            update_2 = opt.apply_gradients(zip([grad_2], [var]),
                                           global_step=global_step)
            variables.global_variables_initializer().run()

            init_val = sess.run(var)
            self.assertAllCloseAccordingToType(init_var_np, init_val)

            # Run a step of Shampoo
            update.run()
            new_val = sess.run(var)

            # let us compute this in numpy
            # Update rule is var = var - lr * mat_g^{-0.5} * grad
            # lr = 1
            mat_g = np.outer(grad_np, grad_np) / grad_np.shape[0]
            mat_h = np_power(mat_g + RIDGE_EPSILON * np.eye(size), -0.5)
            new_val_np = init_var_np - np.dot(mat_h, grad_np)

            self.assertAllCloseAccordingToType(new_val_np,
                                               new_val,
                                               atol=TOLERANCE,
                                               rtol=TOLERANCE)

            # Run another step of Shampoo
            update_2.run()
            new_val = sess.run(var)

            mat_g += np.outer(grad_np_2, grad_np_2) / grad_np.shape[0]
            mat_h = np_power(mat_g + RIDGE_EPSILON * np.eye(size), -0.5)
            new_val_np -= np.dot(mat_h, grad_np_2)

            self.assertAllCloseAccordingToType(new_val_np,
                                               new_val,
                                               atol=TOLERANCE,
                                               rtol=TOLERANCE)
Example #3
    def testLargeVector(self, use_resource_var):
        """This is just the diagonal Adagrad update."""

        size = 2000
        init_var_np = np.zeros(size)
        grad_np = np.random.rand(size)
        grad_np_2 = np.random.rand(size)

        with self.cached_session() as sess:
            global_step = variables.VariableV1(0,
                                               dtype=dtypes.int64,
                                               use_resource=use_resource_var)
            var = variables.VariableV1(init_var_np,
                                       dtype=dtypes.float32,
                                       use_resource=use_resource_var)
            grad = constant_op.constant(grad_np, dtype=dtypes.float32)
            grad_2 = constant_op.constant(grad_np_2, dtype=dtypes.float32)

            opt = shampoo.ShampooOptimizer(global_step)
            update = opt.apply_gradients(zip([grad], [var]),
                                         global_step=global_step)
            update_2 = opt.apply_gradients(zip([grad_2], [var]),
                                           global_step=global_step)
            variables.global_variables_initializer().run()

            init_val = sess.run(var)
            self.assertAllCloseAccordingToType(init_var_np, init_val)

            # Run a step of Shampoo
            update.run()
            new_val = sess.run(var)

            # let us compute this in numpy
            # Update rule is var = var - lr * gg^{-0.5} * grad
            # lr = 1
            mat_g = (grad_np * grad_np)
            new_val_np = init_var_np - np.power(mat_g, -0.5) * grad_np

            self.assertAllCloseAccordingToType(new_val_np,
                                               new_val,
                                               atol=TOLERANCE,
                                               rtol=TOLERANCE)
            # Run another step of Shampoo
            update_2.run()
            new_val = sess.run(var)

            mat_g += (grad_np_2 * grad_np_2)
            new_val_np -= np.power(mat_g, -0.5) * grad_np_2

            self.assertAllCloseAccordingToType(new_val_np,
                                               new_val,
                                               atol=TOLERANCE,
                                               rtol=TOLERANCE)
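
The large-vector path above is exactly element-wise (diagonal) Adagrad with learning rate 1. A standalone numpy sketch of just that update rule, independent of the test scaffolding:

import numpy as np

g1, g2 = np.random.rand(2000), np.random.rand(2000)

w = np.zeros(2000)
acc = g1 * g1                    # running sum of squared gradients
w -= np.power(acc, -0.5) * g1    # step 1: w -= acc^{-1/2} * g, lr = 1
acc += g2 * g2
w -= np.power(acc, -0.5) * g2    # step 2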
Example #4
    def _testDelayedPrecondUpdate(self, use_iterative_root):
        """Update the squared sum every nth step, drop the other steps.

        Args:
          use_iterative_root: use iterative power method or SVD to find nth roots.
        """
        size = [10, 5, 7]
        init_var_np = np.zeros(size).astype(np.float32)
        iterations = 100
        grad_np = np.random.rand(iterations, size[0], size[1],
                                 size[2]).astype(np.float32)
        svd_interval = 20
        precond_update_interval = 5
        mat_g1_a = np.eye(size[0])
        mat_g1 = np.zeros_like(mat_g1_a)
        mat_g2_a = np.eye(size[1])
        mat_g2 = np.zeros_like(mat_g2_a)
        mat_g3_a = np.eye(size[2])
        mat_g3 = np.zeros_like(mat_g3_a)

        with self.test_session() as sess:
            global_step = variables.Variable(0, dtype=dtypes.int64)
            var = variables.Variable(init_var_np, dtype=dtypes.float32)
            grad = array_ops.placeholder(dtypes.float32, shape=size)

            opt = shampoo.ShampooOptimizer(
                global_step,
                svd_interval=svd_interval,
                precond_update_interval=precond_update_interval,
                use_iterative_root=use_iterative_root)
            update = opt.apply_gradients(zip([grad], [var]),
                                         global_step=global_step)
            variables.global_variables_initializer().run()

            init_val = sess.run(var)
            self.assertAllCloseAccordingToType(init_var_np, init_val)
            new_val_np = init_var_np.copy()  # copy: updated in place below

            # Run n steps of Shampoo
            for i in range(iterations):
                _ = sess.run(update, feed_dict={grad: grad_np[i]})
                new_val = sess.run(var)

                # let us compute this in numpy
                # Update rule is var = var - lr * Prod_i mat_g_i^{-0.5/3} grad
                # lr = 1
                if (i + 1) % precond_update_interval == 0:
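                    # Statistics for the skipped steps are approximated by
                    # scaling this step's Gram matrices by the interval length
                    # (the "* precond_update_interval" factor below).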
                    mat_g1 += (np.tensordot(
                        grad_np[i], grad_np[i], axes=([1, 2], [1, 2])) *
                               precond_update_interval)
                    mat_g2 += (np.tensordot(
                        grad_np[i], grad_np[i], axes=([0, 2], [0, 2])) *
                               precond_update_interval)
                    mat_g3 += (np.tensordot(
                        grad_np[i], grad_np[i], axes=([0, 1], [0, 1])) *
                               precond_update_interval)

                if (i + 1) % svd_interval == 0:
                    mat_g1_a = np_power(mat_g1 + 0.1 * np.eye(size[0]),
                                        -0.5 / 3.0)
                    mat_g2_a = np_power(mat_g2 + 0.1 * np.eye(size[1]),
                                        -0.5 / 3.0)
                    mat_g3_a = np_power(mat_g3 + 0.1 * np.eye(size[2]),
                                        -0.5 / 3.0)

                precond_grad = np.tensordot(grad_np[i],
                                            mat_g1_a,
                                            axes=([0], [0]))
                precond_grad = np.tensordot(precond_grad,
                                            mat_g2_a,
                                            axes=([0], [0]))
                precond_grad = np.tensordot(precond_grad,
                                            mat_g3_a,
                                            axes=([0], [0]))
                new_val_np -= precond_grad

                self.assertAllCloseAccordingToType(new_val_np,
                                                   new_val,
                                                   atol=TOLERANCE,
                                                   rtol=TOLERANCE)
Example #5
    def _testBasicTensorWithMomentum(self, use_iterative_root):
        """Check update with momentum when gradient is a tensor.

        Args:
          use_iterative_root: use iterative power method or SVD to find nth roots.
        """
        size = [10, 5, 7]
        init_var_np = np.zeros(size)
        grad_np = np.random.rand(size[0], size[1], size[2])
        grad_np_2 = np.random.rand(size[0], size[1], size[2])
        gbar_decay = 0.9
        gbar_weight = 0.1

        with self.test_session() as sess:
            global_step = variables.Variable(0, dtype=dtypes.int64)
            var = variables.Variable(init_var_np, dtype=dtypes.float32)
            grad = constant_op.constant(grad_np, dtype=dtypes.float32)
            grad_2 = constant_op.constant(grad_np_2, dtype=dtypes.float32)

            opt = shampoo.ShampooOptimizer(
                global_step,
                gbar_decay=gbar_decay,
                gbar_weight=gbar_weight,
                use_iterative_root=use_iterative_root)
            update = opt.apply_gradients(zip([grad], [var]),
                                         global_step=global_step)
            update_2 = opt.apply_gradients(zip([grad_2], [var]),
                                           global_step=global_step)
            variables.global_variables_initializer().run()

            # Run a step of Shampoo
            update.run()
            new_val = sess.run(var)

            # let us compute this in numpy
            # Update rule is var = var - lr * Prod_i mat_g_i^{-0.5/3} grad
            # lr = 1
            mat_g1 = np.tensordot(grad_np, grad_np, axes=([1, 2], [1, 2]))
            mat_g1_a = np_power(mat_g1 + 0.1 * np.eye(size[0]), -0.5 / 3.0)
            mat_g2 = np.tensordot(grad_np, grad_np, axes=([0, 2], [0, 2]))
            mat_g2_a = np_power(mat_g2 + 0.1 * np.eye(size[1]), -0.5 / 3.0)
            mat_g3 = np.tensordot(grad_np, grad_np, axes=([0, 1], [0, 1]))
            mat_g3_a = np_power(mat_g3 + 0.1 * np.eye(size[2]), -0.5 / 3.0)

            gbar_np = gbar_weight * grad_np
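            # The momentum buffer starts at zero, so after one step it is just
            # gbar_weight * grad; later steps follow
            # gbar = gbar_decay * gbar + gbar_weight * grad (see second step).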
            precond_grad = np.tensordot(gbar_np, mat_g1_a, axes=([0], [0]))
            precond_grad = np.tensordot(precond_grad,
                                        mat_g2_a,
                                        axes=([0], [0]))
            precond_grad = np.tensordot(precond_grad,
                                        mat_g3_a,
                                        axes=([0], [0]))
            new_val_np = init_var_np - precond_grad

            self.assertAllCloseAccordingToType(new_val_np,
                                               new_val,
                                               atol=TOLERANCE,
                                               rtol=TOLERANCE)

            # Run another step of Shampoo
            update_2.run()
            new_val = sess.run(var)

            mat_g1 += np.tensordot(grad_np_2, grad_np_2, axes=([1, 2], [1, 2]))
            mat_g1_a = np_power(mat_g1 + 0.1 * np.eye(size[0]), -0.5 / 3.0)
            mat_g2 += np.tensordot(grad_np_2, grad_np_2, axes=([0, 2], [0, 2]))
            mat_g2_a = np_power(mat_g2 + 0.1 * np.eye(size[1]), -0.5 / 3.0)
            mat_g3 += np.tensordot(grad_np_2, grad_np_2, axes=([0, 1], [0, 1]))
            mat_g3_a = np_power(mat_g3 + 0.1 * np.eye(size[2]), -0.5 / 3.0)

            gbar_np_2 = gbar_decay * gbar_np + gbar_weight * grad_np_2
            precond_grad = np.tensordot(gbar_np_2, mat_g1_a, axes=([0], [0]))
            precond_grad = np.tensordot(precond_grad,
                                        mat_g2_a,
                                        axes=([0], [0]))
            precond_grad = np.tensordot(precond_grad,
                                        mat_g3_a,
                                        axes=([0], [0]))
            new_val_np -= precond_grad

            self.assertAllCloseAccordingToType(new_val_np,
                                               new_val,
                                               atol=TOLERANCE,
                                               rtol=TOLERANCE)
Example #6
    def _testSparseUpdateSmall(self, use_iterative_root):
        """Gradient is of type IndexSlices, but the first dimension is small.

    We create dense gradient and do the full update with SVD etc.

    Args:
      use_iterative_root: use iterative power method or SVD to find nth roots.
    """

        size = [100, 3, 5]
        sample_size = 10
        init_var_np = np.zeros(size)
        grad_indices = np.sort(
            np.random.choice(np.arange(size[0]), sample_size, replace=False))
        grad_np = np.random.rand(sample_size, size[1], size[2])

        with self.test_session() as sess:
            global_step = variables.Variable(0, dtype=dtypes.int64)
            var = variables.Variable(init_var_np, dtype=dtypes.float32)
            grad = ops.IndexedSlices(
                constant_op.constant(grad_np, dtype=dtypes.float32),
                constant_op.constant(grad_indices), constant_op.constant(size))

            opt = shampoo.ShampooOptimizer(
                global_step, use_iterative_root=use_iterative_root)
            update = opt.apply_gradients(zip([grad], [var]),
                                         global_step=global_step)
            variables.global_variables_initializer().run()

            init_val = sess.run(var)
            self.assertAllCloseAccordingToType(init_var_np, init_val)

            # Run a step of Shampoo
            update.run()
            new_val = sess.run(var)

            # let us compute this in numpy
            # Update rule is var = var - lr * Prod_i mat_g_i^{-0.5/3} grad
            # lr = 1
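            # The sparse gradient is densified first: with only 100 rows, the
            # first dimension is small enough for the full tensor update.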
            grad_dense = np.zeros_like(init_var_np)
            grad_dense[grad_indices] = grad_np

            mat_g1 = np.tensordot(grad_dense,
                                  grad_dense,
                                  axes=([1, 2], [1, 2]))
            mat_g1_a = np_power(mat_g1 + 0.1 * np.eye(size[0]), -0.5 / 3.0)
            mat_g2 = np.tensordot(grad_dense,
                                  grad_dense,
                                  axes=([0, 2], [0, 2]))
            mat_g2_a = np_power(mat_g2 + 0.1 * np.eye(size[1]), -0.5 / 3.0)
            mat_g3 = np.tensordot(grad_dense,
                                  grad_dense,
                                  axes=([0, 1], [0, 1]))
            mat_g3_a = np_power(mat_g3 + 0.1 * np.eye(size[2]), -0.5 / 3.0)

            precond_grad = np.tensordot(grad_dense, mat_g1_a, axes=([0], [0]))
            precond_grad = np.tensordot(precond_grad,
                                        mat_g2_a,
                                        axes=([0], [0]))
            precond_grad = np.tensordot(precond_grad,
                                        mat_g3_a,
                                        axes=([0], [0]))
            new_val_np = init_var_np - precond_grad

            self.assertAllCloseAccordingToType(new_val_np,
                                               new_val,
                                               atol=TOLERANCE,
                                               rtol=TOLERANCE)
Example #7
    def testSparseUpdateLarge(self):
        """Check update when gradient is of type IndexSlices.

    We do diagonal updates for the first dimension, unless it is very small.
    """

        size = [2000, 3]
        sample_size_1 = 100
        init_var_np = np.zeros(size)
        grad_indices = np.sort(
            np.random.choice(np.arange(size[0]), sample_size_1, replace=False))
        grad_np = np.random.rand(sample_size_1, size[1])

        sample_size_2 = 7
        grad_indices_2 = np.sort(
            np.random.choice(np.arange(size[0]), sample_size_2, replace=False))
        grad_np_2 = np.random.rand(sample_size_2, size[1])

        with self.test_session() as sess:
            global_step = variables.Variable(0, dtype=dtypes.int64)
            var = variables.Variable(init_var_np, dtype=dtypes.float32)
            grad = ops.IndexedSlices(
                constant_op.constant(grad_np, dtype=dtypes.float32),
                constant_op.constant(grad_indices), constant_op.constant(size))
            grad_2 = ops.IndexedSlices(
                constant_op.constant(grad_np_2, dtype=dtypes.float32),
                constant_op.constant(grad_indices_2),
                constant_op.constant(size))

            opt = shampoo.ShampooOptimizer(global_step)
            update = opt.apply_gradients(zip([grad], [var]),
                                         global_step=global_step)
            update_2 = opt.apply_gradients(zip([grad_2], [var]),
                                           global_step=global_step)
            variables.global_variables_initializer().run()

            init_val = sess.run(var)
            self.assertAllCloseAccordingToType(init_var_np, init_val)

            # Run a step of Shampoo
            update.run()
            new_val = sess.run(var)

            # let us compute this in numpy
            # Update rule is var = var - lr * mat_left * grad * mat_right
            # where mat_left * grad is just an element-wise product,
            # with broadcasting
            # lr = 1
            # In this case the update lr * mat_left * grad * mat_right is
            # of size 100 x 3 (sample_size_1 x size[1]), so only the
            # touched rows of var need to be updated.

            mat_g1 = np.sum(grad_np * grad_np, axis=1, keepdims=True)
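            # mat_g1_acc is the per-row accumulator for the whole 2000-row
            # variable; only the sampled rows change each step. On step 1 it
            # equals mat_g1 on grad_indices (it starts at zero), so mat_left
            # is computed from mat_g1 directly.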
            mat_g1_acc = np.zeros((size[0], 1))
            mat_g1_acc[grad_indices] += mat_g1
            mat_left = np.power(mat_g1 + 0.1, -0.25)
            mat_g2 = np.dot(grad_np.transpose(), grad_np)
            mat_right = np_power(mat_g2 + 0.1 * np.eye(size[1]), -0.25)
            new_val_np = init_var_np.copy()  # copy: updated in place below
            new_val_np[grad_indices, :] -= np.dot(grad_np * mat_left,
                                                  mat_right)

            self.assertAllCloseAccordingToType(new_val_np,
                                               new_val,
                                               atol=TOLERANCE,
                                               rtol=TOLERANCE)

            # Run another step of Shampoo
            update_2.run()
            new_val = sess.run(var)

            mat_g1 = np.sum(grad_np_2 * grad_np_2, axis=1, keepdims=True)
            mat_g1_acc[grad_indices_2] += mat_g1
            mat_left = np.power(mat_g1_acc[grad_indices_2] + 0.1, -0.25)
            mat_g2 += np.dot(grad_np_2.transpose(), grad_np_2)
            mat_right = np_power(mat_g2 + 0.1 * np.eye(size[1]), -0.25)
            new_val_np[grad_indices_2, :] -= np.dot(grad_np_2 * mat_left,
                                                    mat_right)

            self.assertAllCloseAccordingToType(new_val_np,
                                               new_val,
                                               atol=TOLERANCE,
                                               rtol=TOLERANCE)
Example #8
    def testLargeMatrix(self):
        """Gradient is a matrix, one of whose dimensions is large.

    We do diagonal updates for large dimensions.
    """

        size = [2000, 3]
        init_var_np = np.zeros(size)
        grad_np = np.random.rand(size[0], size[1])
        grad_np_2 = np.random.rand(size[0], size[1])

        with self.test_session() as sess:
            global_step = variables.Variable(0, dtype=dtypes.int64)
            var = variables.Variable(init_var_np, dtype=dtypes.float32)
            grad = constant_op.constant(grad_np, dtype=dtypes.float32)
            grad_2 = constant_op.constant(grad_np_2, dtype=dtypes.float32)

            opt = shampoo.ShampooOptimizer(global_step)
            update = opt.apply_gradients(zip([grad], [var]),
                                         global_step=global_step)
            update_2 = opt.apply_gradients(zip([grad_2], [var]),
                                           global_step=global_step)
            variables.global_variables_initializer().run()

            init_val = sess.run(var)
            self.assertAllCloseAccordingToType(init_var_np, init_val)

            # Run a step of Shampoo
            update.run()
            new_val = sess.run(var)

            # let us compute this in numpy
            # Update rule is var = var - lr * mat_left * grad * mat_right
            # where mat_left * grad is just an element-wise product,
            # with broadcasting
            # lr = 1
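            # The large (2000-row) dimension gets a diagonal preconditioner:
            # mat_left is one scalar per row. The small 3-column dimension
            # gets a full matrix preconditioner, mat_right.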

            mat_g1 = np.sum(grad_np * grad_np, axis=1, keepdims=True)
            mat_left = np.power(mat_g1 + 0.1, -0.25)
            mat_g2 = np.dot(grad_np.transpose(), grad_np)
            mat_right = np_power(mat_g2 + 0.1 * np.eye(size[1]), -0.25)
            new_val_np = init_var_np - np.dot(grad_np * mat_left, mat_right)

            self.assertAllCloseAccordingToType(new_val_np,
                                               new_val,
                                               atol=TOLERANCE,
                                               rtol=TOLERANCE)

            # Run another step of Shampoo
            update_2.run()
            new_val = sess.run(var)

            mat_g1 += np.sum(grad_np_2 * grad_np_2, axis=1, keepdims=True)
            mat_left = np.power(mat_g1 + 0.1, -0.25)
            mat_g2 += np.dot(grad_np_2.transpose(), grad_np_2)
            mat_right = np_power(mat_g2 + 0.1 * np.eye(size[1]), -0.25)
            new_val_np -= np.dot(grad_np_2 * mat_left, mat_right)

            self.assertAllCloseAccordingToType(new_val_np,
                                               new_val,
                                               atol=TOLERANCE,
                                               rtol=TOLERANCE)
Example #9
    def _testBasicTensor(self, use_iterative_root, use_resource_var):
        """Check update when gradient is a tensor.

        Args:
          use_iterative_root: use iterative power method or SVD to find nth roots.
          use_resource_var: use resource var as variables.
        """
        size = [10, 5, 7]
        init_var_np = np.zeros(size)
        grad_np = np.random.rand(size[0], size[1], size[2])
        grad_np_2 = np.random.rand(size[0], size[1], size[2])

        with self.cached_session() as sess:
            global_step = variables.Variable(0,
                                             dtype=dtypes.int64,
                                             use_resource=use_resource_var)
            var = variables.Variable(init_var_np,
                                     dtype=dtypes.float32,
                                     use_resource=use_resource_var)
            grad = constant_op.constant(grad_np, dtype=dtypes.float32)
            grad_2 = constant_op.constant(grad_np_2, dtype=dtypes.float32)

            opt = shampoo.ShampooOptimizer(
                global_step, use_iterative_root=use_iterative_root)
            update = opt.apply_gradients(zip([grad], [var]),
                                         global_step=global_step)
            update_2 = opt.apply_gradients(zip([grad_2], [var]),
                                           global_step=global_step)
            variables.global_variables_initializer().run()

            init_val = sess.run(var)
            self.assertAllCloseAccordingToType(init_var_np, init_val)

            # Run a step of Shampoo
            update.run()
            new_val = sess.run(var)

            # let us compute this in numpy
            # Update rule is var = var - lr * Prod_i mat_g_i^{-0.5/3} grad
            # lr = 1
            mat_g1 = (np.tensordot(grad_np, grad_np, axes=([1, 2], [1, 2])) /
                      grad_np.shape[0])
            mat_g1_a = np_power(mat_g1 + RIDGE_EPSILON * np.eye(size[0]),
                                -0.5 / 3.0)
            mat_g2 = (np.tensordot(grad_np, grad_np, axes=([0, 2], [0, 2])) /
                      grad_np.shape[1])
            mat_g2_a = np_power(mat_g2 + RIDGE_EPSILON * np.eye(size[1]),
                                -0.5 / 3.0)
            mat_g3 = (np.tensordot(grad_np, grad_np, axes=([0, 1], [0, 1])) /
                      grad_np.shape[2])
            mat_g3_a = np_power(mat_g3 + RIDGE_EPSILON * np.eye(size[2]),
                                -0.5 / 3.0)

            precond_grad = np.tensordot(grad_np, mat_g1_a, axes=([0], [0]))
            precond_grad = np.tensordot(precond_grad, mat_g2_a, axes=([0], [0]))
            precond_grad = np.tensordot(precond_grad, mat_g3_a, axes=([0], [0]))
            new_val_np = init_var_np - precond_grad

            self.assertAllCloseAccordingToType(new_val_np,
                                               new_val,
                                               atol=TOLERANCE,
                                               rtol=TOLERANCE)

            # Run another step of Shampoo
            update_2.run()
            new_val = sess.run(var)

            mat_g1 += (np.tensordot(grad_np_2, grad_np_2, axes=([1, 2], [1, 2]))
                       / grad_np_2.shape[0])
            mat_g1_a = np_power(mat_g1 + RIDGE_EPSILON * np.eye(size[0]),
                                -0.5 / 3.0)
            mat_g2 += (np.tensordot(grad_np_2, grad_np_2, axes=([0, 2], [0, 2]))
                       / grad_np_2.shape[1])
            mat_g2_a = np_power(mat_g2 + RIDGE_EPSILON * np.eye(size[1]),
                                -0.5 / 3.0)
            mat_g3 += (np.tensordot(grad_np_2, grad_np_2, axes=([0, 1], [0, 1]))
                       / grad_np_2.shape[2])
            mat_g3_a = np_power(mat_g3 + RIDGE_EPSILON * np.eye(size[2]),
                                -0.5 / 3.0)

            precond_grad = np.tensordot(grad_np_2, mat_g1_a, axes=([0], [0]))
            precond_grad = np.tensordot(precond_grad, mat_g2_a, axes=([0], [0]))
            precond_grad = np.tensordot(precond_grad, mat_g3_a, axes=([0], [0]))
            new_val_np -= precond_grad

            self.assertAllCloseAccordingToType(new_val_np,
                                               new_val,
                                               atol=TOLERANCE,
                                               rtol=TOLERANCE)
Example #10
    def _testDelayedSVD(self, use_iterative_root, use_resource_var):
        """Performing the SVD every nth step.

        Args:
          use_iterative_root: use iterative power method or SVD to find nth roots.
          use_resource_var: use resource var as variables.
        """
        size = [10, 5, 7]
        init_var_np = np.zeros(size).astype(np.float32)
        iterations = 20
        svd_interval = 5
        grad_np = np.random.rand(iterations, size[0], size[1],
                                 size[2]).astype(np.float32)
        mat_g1_a = np.eye(size[0])
        mat_g1 = np.zeros_like(mat_g1_a)
        mat_g2_a = np.eye(size[1])
        mat_g2 = np.zeros_like(mat_g2_a)
        mat_g3_a = np.eye(size[2])
        mat_g3 = np.zeros_like(mat_g3_a)

        with self.cached_session() as sess:
            global_step = variables.VariableV1(0,
                                               dtype=dtypes.int64,
                                               use_resource=use_resource_var)
            var = variables.VariableV1(init_var_np,
                                       dtype=dtypes.float32,
                                       use_resource=use_resource_var)
            grad = array_ops.placeholder(dtypes.float32, shape=size)

            opt = shampoo.ShampooOptimizer(
                global_step,
                svd_interval=svd_interval,
                use_iterative_root=use_iterative_root)
            update = opt.apply_gradients(zip([grad], [var]),
                                         global_step=global_step)
            variables.global_variables_initializer().run()

            init_val = sess.run(var)
            self.assertAllCloseAccordingToType(init_var_np, init_val)
            new_val_np = init_var_np.copy()  # copy: updated in place below

            # Run n steps of Shampoo
            for i in range(iterations):
                _ = sess.run(update, feed_dict={grad: grad_np[i]})
                new_val = sess.run(var)

                # let us compute this in numpy
                # Update rule is var = var - lr * Prod_i mat_g_i^{-0.5/3} grad
                # lr = 1
                mat_g1 += np.tensordot(
                    grad_np[i], grad_np[i],
                    axes=([1, 2], [1, 2])) / grad_np[i].shape[0]
                mat_g2 += np.tensordot(
                    grad_np[i], grad_np[i],
                    axes=([0, 2], [0, 2])) / grad_np[i].shape[1]
                mat_g3 += np.tensordot(
                    grad_np[i], grad_np[i],
                    axes=([0, 1], [0, 1])) / grad_np[i].shape[2]
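                # Between refreshes the stale roots mat_g{1,2,3}_a are reused,
                # even though the Gram matrices accumulate every step.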
                if (i + 1) % svd_interval == 0:
                    mat_g1_a = np_power(
                        mat_g1 + RIDGE_EPSILON * np.eye(size[0]), -0.5 / 3.0)
                    mat_g2_a = np_power(
                        mat_g2 + RIDGE_EPSILON * np.eye(size[1]), -0.5 / 3.0)
                    mat_g3_a = np_power(
                        mat_g3 + RIDGE_EPSILON * np.eye(size[2]), -0.5 / 3.0)

                precond_grad = np.tensordot(grad_np[i],
                                            mat_g1_a,
                                            axes=([0], [0]))
                precond_grad = np.tensordot(precond_grad,
                                            mat_g2_a,
                                            axes=([0], [0]))
                precond_grad = np.tensordot(precond_grad,
                                            mat_g3_a,
                                            axes=([0], [0]))
                new_val_np -= precond_grad

                self.assertAllCloseAccordingToType(new_val_np,
                                                   new_val,
                                                   atol=TOLERANCE,
                                                   rtol=TOLERANCE)
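
Finally, a minimal end-to-end usage sketch of the optimizer outside the test harness. This is a hypothetical toy problem under TF 1.x; the learning_rate argument is an assumption (only global_step, gbar_decay, gbar_weight, svd_interval, precond_update_interval, and use_iterative_root appear in the snippets above).

import numpy as np
import tensorflow as tf
from tensorflow.contrib.opt.python.training import shampoo

# Toy least-squares problem: minimize ||X w - y||^2.
x_np = np.random.rand(100, 10).astype(np.float32)
y_np = np.random.rand(100, 1).astype(np.float32)

global_step = tf.Variable(0, dtype=tf.int64, trainable=False)
w = tf.Variable(tf.zeros([10, 1]))
loss = tf.reduce_mean(tf.square(tf.matmul(tf.constant(x_np), w) - y_np))

# The gradient w.r.t. w is a 10 x 1 matrix, so Shampoo uses two
# preconditioners, as in the matrix examples above.
opt = shampoo.ShampooOptimizer(global_step, learning_rate=0.1)
train_op = opt.minimize(loss, global_step=global_step)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for _ in range(50):
        _, loss_val = sess.run([train_op, loss])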