Example #1
    def applyOptimizer(self, opt, dtype, steps=5, is_sparse=False):
        if is_sparse:
            var0 = tf.Variable([[0.0], [0.0]], dtype=dtype)
            var1 = tf.Variable([[0.0], [0.0]], dtype=dtype)
            grads0 = tf.IndexedSlices(
                tf.constant([0.1], shape=[1, 1], dtype=dtype),
                tf.constant([0]), tf.constant([2, 1]))
            grads1 = tf.IndexedSlices(
                tf.constant([0.02], shape=[1, 1], dtype=dtype),
                tf.constant([1]), tf.constant([2, 1]))
        else:
            var0 = tf.Variable([0.0, 0.0], dtype=dtype)
            var1 = tf.Variable([0.0, 0.0], dtype=dtype)
            grads0 = tf.constant([0.1, 0.2], dtype=dtype)
            grads1 = tf.constant([0.01, 0.02], dtype=dtype)

        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
        self.evaluate(tf.compat.v1.global_variables_initializer())

        v0_val, v1_val = self.evaluate([var0, var1])
        if is_sparse:
            self.assertAllCloseAccordingToType([[0.0], [0.0]], v0_val)
            self.assertAllCloseAccordingToType([[0.0], [0.0]], v1_val)
        else:
            self.assertAllCloseAccordingToType([0.0, 0.0], v0_val)
            self.assertAllCloseAccordingToType([0.0, 0.0], v1_val)

        # Run the optimizer for a few steps
        for _ in range(steps):
            update.run()

        v0_val, v1_val = self.evaluate([var0, var1])
        return v0_val, v1_val
Example #2
    def update_step(self, gradient, variable):
        """Update step given gradient and the associated model variable."""
        if self._var_key(variable) not in self._index_dict:
            raise KeyError(
                f'Optimizer cannot recognize variable {variable.name}, '
                f'this usually means you are calling an optimizer '
                f'previously used on a different model. Please try '
                f'creating a new optimizer instance.')
        lr = tf.cast(self.learning_rate, variable.dtype)

        var_key = self._var_key(variable)
        velocity = self._velocities[self._index_dict[var_key]]
        momentum = None
        if self.momentum > 0:
            momentum = self._momentums[self._index_dict[var_key]]
        average_grad = None
        if self.centered:
            average_grad = self._average_gradients[self._index_dict[var_key]]

        rho = self.rho

        if isinstance(gradient, tf.IndexedSlices):
            # Sparse gradients.
            velocity.assign(rho * velocity)
            velocity.scatter_add(
                tf.IndexedSlices(
                    tf.square(gradient.values) * (1 - rho), gradient.indices))
            if self.centered:
                average_grad.assign(rho * average_grad)
                average_grad.scatter_add(
                    tf.IndexedSlices(
                        tf.square(gradient.values) * (1 - rho),
                        gradient.indices))
                velocity.assign_add(-tf.square(average_grad))
            velocity_value = tf.gather(velocity, gradient.indices)
            transformed_grad = tf.IndexedSlices(
                gradient.values / (tf.sqrt(velocity_value) + self.epsilon),
                gradient.indices)

            if self.momentum > 0:
                momentum.assign(self.momentum * momentum)
                momentum.scatter_add(transformed_grad)
                variable.assign_add(-lr * momentum)
            else:
                variable.scatter_add(
                    tf.IndexedSlices(-lr * transformed_grad.values,
                                     transformed_grad.indices))
        else:
            # Dense gradients.
            velocity.assign(rho * velocity + (1 - rho) * tf.square(gradient))
            if self.centered:
                average_grad.assign(rho * average_grad +
                                    (1 - rho) * tf.square(gradient))
                velocity.assign_add(-tf.square(average_grad))
            transformed_grad = gradient / (tf.sqrt(velocity) + self.epsilon)
            if self.momentum > 0:
                momentum.assign(self.momentum * momentum + transformed_grad)
                variable.assign_add(-lr * momentum)
            else:
                variable.assign_add(-lr * transformed_grad)
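As a quick eager-mode smoke test of the sparse branch above, something like the following should work; it is a hypothetical illustration (the values are made up), assuming the stock Keras RMSprop with its defaults (momentum=0, centered=False):

import tensorflow as tf

var = tf.Variable([[1.0], [2.0]])
grad = tf.IndexedSlices(tf.constant([[0.1]]), tf.constant([0]), tf.constant([2, 1]))
opt = tf.keras.optimizers.RMSprop(learning_rate=0.1)
opt.apply_gradients([(grad, var)])  # only row 0 of `var` changes; row 1 stays [2.0]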
Example #3
  def testSparseBasic(self):
    if tf.executing_eagerly():
      return

    for dtype in [tf.half, tf.float32, tf.float64]:
      with self.cached_session():
        var0 = tf.Variable([[1.1], [2.1]], dtype=dtype)
        var1 = tf.Variable([[3.], [4.]], dtype=dtype)
        grads0 = tf.IndexedSlices(
            tf.constant([0.1], shape=[1, 1], dtype=dtype),
            tf.constant([0]), tf.constant([2, 1]))
        grads1 = tf.IndexedSlices(
            tf.constant([0.01], shape=[1, 1], dtype=dtype),
            tf.constant([1]), tf.constant([2, 1]))
        decay_rate = 0.9
        sgd_op = tfp.optimizer.StochasticGradientLangevinDynamics(
            3., preconditioner_decay_rate=decay_rate).apply_gradients(
                zip([grads0, grads1], [var0, var1]))
        self.evaluate(tf1.global_variables_initializer())
        # Fetch params to validate initial values
        self.assertAllCloseAccordingToType([[1.1], [2.1]], self.evaluate(var0))
        self.assertAllCloseAccordingToType([[3.], [4.]], self.evaluate(var1))
        # Run 1 step of sgd
        self.evaluate(sgd_op)
        # Validate updated params
        grads_scaled = (0.5 * 0.1 /
                        np.sqrt(decay_rate + (1. - decay_rate) * 0.1**2 + 1e-8))
        # Note that `tfp.math.diag_jacobian(xs=var, ys=grad)` returns a zero
        # tensor.
        self.assertAllCloseAccordingToType([[1.1 - 3. * grads_scaled], [2.1]],
                                           self.evaluate(var0))
        grads_scaled = (0.5 * 0.01 / np.sqrt(
            decay_rate + (1. - decay_rate) * 0.01**2 + 1e-8))
        self.assertAllCloseAccordingToType(
            [[3. - 3. * 0], [4. - 3. * grads_scaled]], self.evaluate(var1))
Example #4
    def testSparseBasicWithLearningRateDecay(self):
        # TODO(tanzheny, omalleyt): Fix test in eager mode.
        with tf.Graph().as_default():
            for dtype in [tf.half, tf.float32, tf.float64]:
                var0 = tf.Variable([[1.0], [2.0]], dtype=dtype)
                var1 = tf.Variable([[3.0], [4.0]], dtype=dtype)
                grads0 = tf.IndexedSlices(
                    tf.constant([0.1], shape=[1, 1], dtype=dtype),
                    tf.constant([0]), tf.constant([2, 1]))
                grads1 = tf.IndexedSlices(
                    tf.constant([0.01], shape=[1, 1], dtype=dtype),
                    tf.constant([1]), tf.constant([2, 1]))
                sgd_op = gradient_descent.SGD(3.0, decay=0.5).apply_gradients(
                    zip([grads0, grads1], [var0, var1]))
                self.evaluate(tf.compat.v1.global_variables_initializer())
                # Run 2 steps of sgd
                self.evaluate(sgd_op)
                # Validate updated params
                self.assertAllCloseAccordingToType([[1.0 - 3.0 * 0.1], [2.0]],
                                                   self.evaluate(var0))
                self.assertAllCloseAccordingToType([[3.0], [4.0 - 3.0 * 0.01]],
                                                   self.evaluate(var1))

                self.evaluate(sgd_op)
                # Validate updated params
                self.assertAllCloseAccordingToType(
                    [[1.0 - 3.0 * 0.1 - 2.0 * 0.1], [2.0]],
                    self.evaluate(var0))
                self.assertAllCloseAccordingToType(
                    [[3.0], [4.0 - 3.0 * 0.01 - 2.0 * 0.01]],
                    self.evaluate(var1))
Example #5
    def testSparseRepeatedIndices(self):
        # TODO(tanzheny, omalleyt): Fix test in eager mode.
        with tf.Graph().as_default():
            for dtype in _DATA_TYPES:
                var_np = np.array([[1.0], [2.0]], dtype=dtype.as_numpy_dtype)

                repeated_index_update_var = tf.Variable(var_np, dtype=dtype)
                aggregated_update_var = tf.Variable(var_np, dtype=dtype)
                grad_repeated_index = tf.IndexedSlices(
                    tf.constant([0.1, 0.1], shape=[2, 1], dtype=dtype),
                    tf.constant([1, 1]), tf.constant([2, 1]))
                grad_aggregated = tf.IndexedSlices(
                    tf.constant([0.2], shape=[1, 1], dtype=dtype),
                    tf.constant([1]), tf.constant([2, 1]))
                repeated_update = adagrad.Adagrad(3.0).apply_gradients([
                    (grad_repeated_index, repeated_index_update_var)
                ])
                aggregated_update = adagrad.Adagrad(3.0).apply_gradients([
                    (grad_aggregated, aggregated_update_var)
                ])
                self.evaluate(tf.compat.v1.global_variables_initializer())
                self.assertAllClose(self.evaluate(aggregated_update_var),
                                    self.evaluate(repeated_index_update_var))
                for _ in range(3):
                    self.evaluate(repeated_update)
                    self.evaluate(aggregated_update)
                    self.assertAllClose(
                        self.evaluate(aggregated_update_var),
                        self.evaluate(repeated_index_update_var))
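The test above relies on repeated indices being summed into a single slice before the Adagrad update, so two 0.1 gradients at index 1 behave like one 0.2 gradient. A tiny numpy sketch of that aggregation step (the helper name is hypothetical, not part of the test):

import numpy as np

def aggregate_duplicate_indices(values, indices, dim0):
    # Sum all slices that share an index, mirroring scatter-style accumulation.
    agg = np.zeros((dim0,) + values.shape[1:], dtype=values.dtype)
    np.add.at(agg, indices, values)
    return agg

aggregate_duplicate_indices(np.array([[0.1], [0.1]]), np.array([1, 1]), 2)  # -> [[0.0], [0.2]]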
Example #6
  def _resource_apply_sparse(self, grad, var, indices, apply_state=None):
    var_device, var_dtype = var.device, var.dtype.base_dtype
    coefficients = ((apply_state or {}).get((var_device, var_dtype)) or
                    self._fallback_apply_state(var_device, var_dtype))

    # m_t = beta1 * m + (1 - beta1) * g_t
    m = self.get_slot(var, 'm')
    m_scaled_g_values = grad * coefficients['one_minus_beta_1_t']
    m.assign(m * coefficients['beta_1_t'])
    m.scatter_add(tf.IndexedSlices(m_scaled_g_values, indices))

    # v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
    v = self.get_slot(var, 'v')
    v_scaled_g_values = (grad * grad) * coefficients['one_minus_beta_2_t']
    v.assign(v * coefficients['beta_2_t'])
    v.scatter_add(tf.IndexedSlices(v_scaled_g_values, indices))

    if not self.amsgrad:
      var.assign_sub(coefficients['lr'] * m /
                     (tf.sqrt(v) + coefficients['epsilon']))
    else:
      v_hat = self.get_slot(var, 'vhat')
      v_hat.assign(tf.maximum(v_hat, v))
      var.assign_sub(coefficients['lr'] * m /
                     (tf.sqrt(v_hat) + coefficients['epsilon']))
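For reference, the moment recurrences in the comments above can be mirrored in numpy; this is a hypothetical sketch (not part of the optimizer), assuming 1-D slots and scalar coefficients:

import numpy as np

def sparse_adam_moments(m, v, grad_values, indices, beta_1=0.9, beta_2=0.999):
    # m_t = beta1 * m, then add (1 - beta1) * g_t only at the updated indices
    m = m * beta_1
    np.add.at(m, indices, (1 - beta_1) * grad_values)
    # v_t = beta2 * v, then add (1 - beta2) * g_t^2 only at the updated indices
    v = v * beta_2
    np.add.at(v, indices, (1 - beta_2) * grad_values * grad_values)
    return m, v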
Example #7
    def update_step(self, gradient, variable):
        """Update step given gradient and the associated model variable."""
        if self._var_key(variable) not in self._index_dict:
            raise KeyError(
                f'Optimizer cannot recognize variable {variable.name}, '
                f'this usually means you are calling an optimizer '
                f'previously used on a different model. Please try '
                f'creating a new optimizer instance.')
        lr = tf.cast(self.learning_rate, variable.dtype)
        local_step = tf.cast(self.iterations + 1, variable.dtype)
        beta_1_power = tf.pow(tf.cast(self.beta_1, variable.dtype), local_step)

        var_key = self._var_key(variable)
        m = self._m[self._index_dict[var_key]]
        u = self._u[self._index_dict[var_key]]

        if isinstance(gradient, tf.IndexedSlices):
            # Sparse gradients.
            indices = gradient.indices
            m.assign_add(-m * (1 - self.beta_1))
            m.scatter_add(
                tf.IndexedSlices(gradient.values * (1 - self.beta_1), indices))
            u.assign(u * self.beta_2)
            u_slice = tf.gather(u, indices)
            u_slice_incremental = tf.maximum(u_slice, tf.abs(
                gradient.values)) - u_slice
            u.scatter_add(tf.IndexedSlices(u_slice_incremental, indices))
            variable.assign_sub(
                (lr * m) / ((1 - beta_1_power) * (u + self.epsilon)))
        else:
            # Dense gradients.
            m.assign_add((gradient - m) * (1 - self.beta_1))
            u.assign(tf.maximum(self.beta_2 * u, tf.abs(gradient)))
            variable.assign_sub(
                (lr * m) / ((1 - beta_1_power) * (u + self.epsilon)))
Example #8
    def update_step(self, gradient, variable):
        """Update step given gradient and the associated model variable."""
        lr = tf.cast(self.learning_rate, variable.dtype)
        local_step = tf.cast(self.iterations + 1, variable.dtype)
        beta_1_power = tf.pow(tf.cast(self.beta_1, variable.dtype), local_step)

        var_key = self._var_key(variable)
        m = self._m[self._index_dict[var_key]]
        u = self._u[self._index_dict[var_key]]

        if isinstance(gradient, tf.IndexedSlices):
            # Sparse gradients.
            indices = gradient.indices
            m.assign_add(-m * (1 - self.beta_1))
            m.scatter_add(
                tf.IndexedSlices(gradient.values * (1 - self.beta_1), indices))
            u.assign(u * self.beta_2)
            u_slice = tf.gather(u, indices)
            u_slice_incremental = (
                tf.maximum(u_slice, tf.abs(gradient.values)) - u_slice)
            u.scatter_add(tf.IndexedSlices(u_slice_incremental, indices))
            variable.assign_sub(
                (lr * m) / ((1 - beta_1_power) * (u + self.epsilon)))
        else:
            # Dense gradients.
            m.assign_add((gradient - m) * (1 - self.beta_1))
            u.assign(tf.maximum(self.beta_2 * u, tf.abs(gradient)))
            variable.assign_sub(
                (lr * m) / ((1 - beta_1_power) * (u + self.epsilon)))
Example #9
 def testSparseRepeatedIndices(self):
     # TODO(tanzheny, omalleyt): Fix test in eager mode.
     for dtype in [tf.half, tf.float32, tf.float64]:
         with tf.Graph().as_default(), self.cached_session():
             repeated_index_update_var = tf.Variable([[1.0], [2.0]],
                                                     dtype=dtype)
             aggregated_update_var = tf.Variable([[1.0], [2.0]],
                                                 dtype=dtype)
             grad_repeated_index = tf.IndexedSlices(
                 tf.constant([0.1, 0.1], shape=[2, 1], dtype=dtype),
                 tf.constant([1, 1]), tf.constant([2, 1]))
             grad_aggregated = tf.IndexedSlices(
                 tf.constant([0.2], shape=[1, 1], dtype=dtype),
                 tf.constant([1]), tf.constant([2, 1]))
             repeated_update = adamax.Adamax().apply_gradients([
                 (grad_repeated_index, repeated_index_update_var)
             ])
             aggregated_update = adamax.Adamax().apply_gradients([
                 (grad_aggregated, aggregated_update_var)
             ])
             self.evaluate(tf.compat.v1.global_variables_initializer())
             self.assertAllClose(aggregated_update_var,
                                 repeated_index_update_var.eval())
             for _ in range(3):
                 repeated_update.run()
                 aggregated_update.run()
                 self.assertAllClose(aggregated_update_var,
                                     repeated_index_update_var.eval())
Example #10
    def update_step(self, gradient, variable):
        """Update step given gradient and the associated model variable."""
        var_dtype = variable.dtype
        lr = tf.cast(self.learning_rate, var_dtype)
        local_step = tf.cast(self.iterations + 1, var_dtype)
        next_step = tf.cast(self.iterations + 2, var_dtype)
        decay = tf.cast(0.96, var_dtype)
        beta_1 = tf.cast(self.beta_1, var_dtype)
        beta_2 = tf.cast(self.beta_2, var_dtype)
        u_t = beta_1 * (1.0 - 0.5 * (tf.pow(decay, local_step)))
        u_t_1 = beta_1 * (1.0 - 0.5 * (tf.pow(decay, next_step)))

        def get_cached_u_product():
            return self._u_product

        def compute_new_u_product():
            u_product_t = self._u_product * u_t
            self._u_product.assign(u_product_t)
            self._u_product_counter += 1
            return u_product_t

        u_product_t = tf.cond(
            self._u_product_counter == (self.iterations + 2),
            true_fn=get_cached_u_product,
            false_fn=compute_new_u_product,
        )
        u_product_t_1 = u_product_t * u_t_1
        beta_2_power = tf.pow(beta_2, local_step)

        var_key = self._var_key(variable)
        m = self._momentums[self._index_dict[var_key]]
        v = self._velocities[self._index_dict[var_key]]

        if isinstance(gradient, tf.IndexedSlices):
            # Sparse gradients.
            m.assign_add(-m * (1 - beta_1))
            m.scatter_add(
                tf.IndexedSlices(gradient.values * (1 - beta_1),
                                 gradient.indices))
            v.assign_add(-v * (1 - beta_2))
            v.scatter_add(
                tf.IndexedSlices(
                    tf.square(gradient.values) * (1 - beta_2),
                    gradient.indices))
            m_hat = u_t_1 * m / (1 - u_product_t_1) + (1 - u_t) * gradient / (
                1 - u_product_t)
            v_hat = v / (1 - beta_2_power)

            variable.assign_sub((m_hat * lr) / (tf.sqrt(v_hat) + self.epsilon))
        else:
            # Dense gradients.
            m.assign_add((gradient - m) * (1 - beta_1))
            v.assign_add((tf.square(gradient) - v) * (1 - beta_2))
            m_hat = u_t_1 * m / (1 - u_product_t_1) + (1 - u_t) * gradient / (
                1 - u_product_t)
            v_hat = v / (1 - beta_2_power)

            variable.assign_sub((m_hat * lr) / (tf.sqrt(v_hat) + self.epsilon))
Example #11
    def testResourceSparse(self):
        # TODO(tanzheny, omalleyt): Fix test in eager mode.
        for dtype in [tf.half, tf.float32, tf.float64]:
            with tf.Graph().as_default(), self.cached_session():
                # Initialize variables for numpy implementation.
                zero_slots = lambda: np.zeros((3), dtype=dtype.as_numpy_dtype)  # pylint: disable=cell-var-from-loop
                m0, v0, m1, v1 = (
                    zero_slots(),
                    zero_slots(),
                    zero_slots(),
                    zero_slots(),
                )
                var0_np = np.array([1.0, 2.0, 3.0], dtype=dtype.as_numpy_dtype)
                grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
                var1_np = np.array([4.0, 5.0, 6.0], dtype=dtype.as_numpy_dtype)
                grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)

                var0 = tf.Variable(var0_np)
                var1 = tf.Variable(var1_np)

                grads0_np_indices = np.array([0, 1], dtype=np.int32)
                grads0 = tf.IndexedSlices(
                    tf.constant(grads0_np),
                    tf.constant(grads0_np_indices),
                    tf.constant([3]),
                )
                grads1_np_indices = np.array([2, 1], dtype=np.int32)
                grads1 = tf.IndexedSlices(
                    tf.constant(grads1_np),
                    tf.constant(grads1_np_indices),
                    tf.constant([3]),
                )
                opt = adamax.Adamax()
                update = opt.apply_gradients(
                    zip([grads0, grads1], [var0, var1]))
                self.evaluate(tf.compat.v1.global_variables_initializer())

                # Fetch params to validate initial values
                self.assertAllClose([1.0, 2.0, 3.0], var0)
                self.assertAllClose([4.0, 5.0, 6.0], var1)

                beta1_power = get_beta_accumulators(opt, dtype)

                # Run 3 steps of Adamax
                for t in range(3):
                    self.assertAllCloseAccordingToType(0.9**(t + 1),
                                                       beta1_power)
                    update.run()

                    var0_np, m0, v0 = adamax_sparse_update_numpy(
                        var0_np, grads0_np_indices, grads0_np, t, m0, v0)
                    var1_np, m1, v1 = adamax_sparse_update_numpy(
                        var1_np, grads1_np_indices, grads1_np, t, m1, v1)

                    # Validate updated params
                    self.assertAllCloseAccordingToType(var0_np, var0)
                    self.assertAllCloseAccordingToType(var1_np, var1)
Example #12
    def update_step(self, gradient, variable):
        """Update step given gradient and the associated model variable."""
        lr = tf.cast(self.learning_rate, variable.dtype)

        var_key = self._var_key(variable)
        velocity = self._velocities[self._index_dict[var_key]]
        momentum = None
        if self.momentum > 0:
            momentum = self._momentums[self._index_dict[var_key]]
        average_grad = None
        if self.centered:
            average_grad = self._average_gradients[self._index_dict[var_key]]

        rho = self.rho

        if isinstance(gradient, tf.IndexedSlices):
            # Sparse gradients.
            velocity.assign(rho * velocity)
            velocity.scatter_add(
                tf.IndexedSlices(
                    tf.square(gradient.values) * (1 - rho), gradient.indices))
            if self.centered:
                average_grad.assign(rho * average_grad)
                average_grad.scatter_add(
                    tf.IndexedSlices(
                        tf.square(gradient.values) * (1 - rho),
                        gradient.indices))
                velocity.assign_add(-tf.square(average_grad))
            velocity_value = tf.gather(velocity, gradient.indices)
            transformed_grad = tf.IndexedSlices(
                gradient.values / (tf.sqrt(velocity_value) + self.epsilon),
                gradient.indices,
            )

            if self.momentum > 0:
                momentum.assign(self.momentum * momentum)
                momentum.scatter_add(transformed_grad)
                variable.assign_add(-lr * momentum)
            else:
                variable.scatter_add(
                    tf.IndexedSlices(-lr * transformed_grad.values,
                                     transformed_grad.indices))
        else:
            # Dense gradients.
            velocity.assign(rho * velocity + (1 - rho) * tf.square(gradient))
            if self.centered:
                average_grad.assign(rho * average_grad +
                                    (1 - rho) * tf.square(gradient))
                velocity.assign_add(-tf.square(average_grad))
            transformed_grad = gradient / (tf.sqrt(velocity) + self.epsilon)
            if self.momentum > 0:
                momentum.assign(self.momentum * momentum + transformed_grad)
                variable.assign_add(-lr * momentum)
            else:
                variable.assign_add(-lr * transformed_grad)
Example #13
    def update_step(self, gradient, variable):
        """Update step given gradient and the associated model variable."""
        beta_1_power = None
        beta_2_power = None
        lr = tf.cast(self.learning_rate, variable.dtype)
        local_step = tf.cast(self.iterations + 1, variable.dtype)
        beta_1_power = tf.pow(tf.cast(self.beta_1, variable.dtype), local_step)
        beta_2_power = tf.pow(tf.cast(self.beta_2, variable.dtype), local_step)

        var_key = self._var_key(variable)
        m = self._momentums[self._index_dict[var_key]]
        v = self._velocities[self._index_dict[var_key]]

        alpha = lr * tf.sqrt(1 - beta_2_power) / (1 - beta_1_power)

        # Apply step weight decay
        if (
            self.weight_decay != 0
            and variable not in self._exclude_from_weight_decay
        ):
            wd = tf.cast(self.weight_decay, variable.dtype)
            variable.assign_sub(variable * wd)

        if isinstance(gradient, tf.IndexedSlices):
            # Sparse gradients.
            m.assign_add(-m * (1 - self.beta_1))
            m.scatter_add(
                tf.IndexedSlices(
                    gradient.values * (1 - self.beta_1), gradient.indices
                )
            )
            v.assign_add(-v * (1 - self.beta_2))
            v.scatter_add(
                tf.IndexedSlices(
                    tf.square(gradient.values) * (1 - self.beta_2),
                    gradient.indices,
                )
            )
            if self.amsgrad:
                v_hat = self._velocity_hats[self._index_dict[var_key]]
                v_hat.assign(tf.maximum(v_hat, v))
                v = v_hat
            variable.assign_sub((m * alpha) / (tf.sqrt(v) + self.epsilon))
        else:
            # Dense gradients.
            m.assign_add((gradient - m) * (1 - self.beta_1))
            v.assign_add((tf.square(gradient) - v) * (1 - self.beta_2))
            if self.amsgrad:
                v_hat = self._velocity_hats[self._index_dict[var_key]]
                v_hat.assign(tf.maximum(v_hat, v))
                v = v_hat
            variable.assign_sub((m * alpha) / (tf.sqrt(v) + self.epsilon))
Example #14
    def update_step(self, gradient, variable):
        """Update step given gradient and the associated model variable."""
        if self._var_key(variable) not in self._index_dict:
            raise KeyError(
                f'Optimizer cannot recognize variable {variable.name}, '
                f'this usually means you are calling an optimizer '
                f'previously used on a different model. Please try '
                f'creating a new optimizer instance.')
        beta_1_power = None
        beta_2_power = None
        lr = tf.cast(self.learning_rate, variable.dtype)
        local_step = tf.cast(self.iterations + 1, variable.dtype)
        beta_1_power = tf.pow(tf.cast(self.beta_1, variable.dtype), local_step)
        beta_2_power = tf.pow(tf.cast(self.beta_2, variable.dtype), local_step)

        var_key = self._var_key(variable)
        m = self._momentums[self._index_dict[var_key]]
        v = self._velocities[self._index_dict[var_key]]

        alpha = (lr * tf.sqrt(1 - beta_2_power) / (1 - beta_1_power))

        # Apply step weight decay
        if self.weight_decay != 0:
            wd = tf.cast(self.weight_decay, variable.dtype)
            variable.assign_sub(variable * wd * lr)

        if isinstance(gradient, tf.IndexedSlices):
            # Sparse gradients.
            m.assign_add(-m * (1 - self.beta_1))
            m.scatter_add(
                tf.IndexedSlices(gradient.values * (1 - self.beta_1),
                                 gradient.indices))
            v.assign_add(-v * (1 - self.beta_2))
            v.scatter_add(
                tf.IndexedSlices(
                    tf.square(gradient.values) * (1 - self.beta_2),
                    gradient.indices))
            if self.amsgrad:
                v_hat = self._velocity_hats[self._index_dict[var_key]]
                v_hat.assign(tf.maximum(v_hat, v))
                v = v_hat
            variable.assign_sub((m * alpha) / (tf.sqrt(v) + self.epsilon))
        else:
            # Dense gradients.
            m.assign_add((gradient - m) * (1 - self.beta_1))
            v.assign_add((tf.square(gradient) - v) * (1 - self.beta_2))
            if self.amsgrad:
                v_hat = self._velocity_hats[self._index_dict[var_key]]
                v_hat.assign(tf.maximum(v_hat, v))
                v = v_hat
            variable.assign_sub((m * alpha) / (tf.sqrt(v) + self.epsilon))
Example #15
    def testSparseBasic(self):
        # TODO(tanzheny, omalleyt): Fix test in eager mode.
        with tf.Graph().as_default():
            for dtype in _DATA_TYPES:
                var0_np = np.array([1.0, 1.0, 2.0], dtype=dtype.as_numpy_dtype)
                grads0_np = np.array([0.1, 0, 0.1], dtype=dtype.as_numpy_dtype)
                var1_np = np.array([3.0, 3.0, 4.0], dtype=dtype.as_numpy_dtype)
                grads1_np = np.array([0.01, 0, 0.01],
                                     dtype=dtype.as_numpy_dtype)

                var0 = tf.Variable(var0_np)
                var1 = tf.Variable(var1_np)
                grads0_np_indices = np.array([0, 2], dtype=np.int32)
                grads0 = tf.IndexedSlices(
                    tf.constant(grads0_np[grads0_np_indices]),
                    tf.constant(grads0_np_indices), tf.constant([3]))
                grads1_np_indices = np.array([0, 2], dtype=np.int32)
                grads1 = tf.IndexedSlices(
                    tf.constant(grads1_np[grads1_np_indices]),
                    tf.constant(grads1_np_indices), tf.constant([3]))
                learning_rate = 3.0
                ada_opt = adagrad.Adagrad(learning_rate)
                ada_update = ada_opt.apply_gradients(
                    zip([grads0, grads1], [var0, var1]))
                self.evaluate(tf.compat.v1.global_variables_initializer())

                # Fetch params to validate initial values
                self.assertAllClose([1.0, 1.0, 2.0], self.evaluate(var0))
                self.assertAllClose([3.0, 3.0, 4.0], self.evaluate(var1))

                accum0_np = np.array([0.1, 0.1, 0.1],
                                     dtype=dtype.as_numpy_dtype)
                accum1_np = np.array([0.1, 0.1, 0.1],
                                     dtype=dtype.as_numpy_dtype)

                # Run 3 steps of Adagrad
                for _ in range(3):
                    self.evaluate(ada_update)

                    var0_np, accum0_np = sparse_adagrad_update_numpy(
                        var0_np, accum0_np, grads0_np_indices,
                        grads0_np[grads0_np_indices], learning_rate)
                    var1_np, accum1_np = sparse_adagrad_update_numpy(
                        var1_np, accum1_np, grads1_np_indices,
                        grads1_np[grads1_np_indices], learning_rate)
                    self.assertAllCloseAccordingToType(var0_np,
                                                       self.evaluate(var0))
                    self.assertAllCloseAccordingToType(var1_np,
                                                       self.evaluate(var1))
Example #16
  def testSparseWithAmsgrad(self):
    # dtypes.half does not work on gpu + eager.
    for dtype in [tf.float32, tf.float64]:
      with self.cached_session():
        m0 = np.array([[0.0], [0.0]])
        v0 = np.array([[0.0], [0.0]])
        v0hat = np.array([[0.0], [0.0]])
        indices_np = np.array([1])
        indices = tf.constant(indices_np, dtype=tf.int32)
        var0_np = np.array([[1.0], [2.0]], dtype=dtype.as_numpy_dtype)
        repeated_index_update_var = tf.Variable(var0_np, dtype=dtype)
        aggregated_update_var = tf.Variable(var0_np, dtype=dtype)
        grads0_np = np.array([[0.2]], dtype=dtype.as_numpy_dtype)
        grad_repeated_index = tf.IndexedSlices(
            tf.constant([0.1, 0.1], shape=[2, 1], dtype=dtype),
            tf.constant([1, 1]), tf.constant([2, 1]))
        grad_aggregated = tf.IndexedSlices(grads0_np, indices,
                                            tf.constant([2, 1]))
        opt_repeated = adam.NonFusedAdam(amsgrad=True)
        opt_aggregated = adam.NonFusedAdam(amsgrad=True)
        if not tf.executing_eagerly():
          repeated_update = opt_repeated.apply_gradients(
              [(grad_repeated_index, repeated_index_update_var)])
          aggregated_update = opt_aggregated.apply_gradients(
              [(grad_aggregated, aggregated_update_var)])
        self.evaluate(tf.compat.v1.global_variables_initializer())
        self.assertAllClose(
            self.evaluate(aggregated_update_var),
            self.evaluate(repeated_index_update_var))
        for t in range(3):
          if not tf.executing_eagerly():
            self.evaluate(repeated_update)
            self.evaluate(aggregated_update)
          else:
            opt_repeated.apply_gradients(
                [(grad_repeated_index, repeated_index_update_var)])
            opt_aggregated.apply_gradients(
                [(grad_aggregated, aggregated_update_var)])

          var0_np, m0, v0, v0hat = adam_sparse_update_numpy_amsgrad(
              var0_np, indices_np, grads0_np, t, m0, v0, v0hat)

          # Validate updated params
          self.assertAllCloseAccordingToType(
              var0_np, self.evaluate(aggregated_update_var))
          self.assertAllCloseAccordingToType(
              self.evaluate(aggregated_update_var),
              self.evaluate(repeated_index_update_var))
Example #17
  def update_step(self, grad, variable):
    """Update step given gradient and the associated model variable."""
    if self._var_key(variable) not in self._index_dict:
      raise KeyError(f'Optimizer cannot recognize variable {variable.name}, '
                     f'this usually means you are calling an optimizer '
                     f'previously used on a different model. Please try '
                     f'creating a new optimizer instance.')
    lr = tf.cast(self.learning_rate, variable.dtype)

    var_key = self._var_key(variable)
    rho = self.rho
    accumulated_grad = self._accumulated_grads[self._index_dict[var_key]]
    accumulated_delta_var = self._accumulated_delta_vars[
        self._index_dict[var_key]]

    def rms(x):
      return tf.sqrt(x + self.epsilon)

    if isinstance(grad, tf.IndexedSlices):
      # Sparse gradients.
      accumulated_grad.assign_add((rho - 1) * accumulated_grad)
      accumulated_grad.scatter_add(tf.IndexedSlices(
          (1 - rho) * tf.square(grad.values), grad.indices))
      delta_var = -rms(accumulated_delta_var) * grad / rms(accumulated_grad)
      accumulated_delta_var.assign(rho * accumulated_delta_var +
                                   (1 - rho) * delta_var * delta_var)
    else:
      # Dense gradients.
      accumulated_grad.assign(rho * accumulated_grad + (1 - rho) * grad * grad)
      delta_var = -rms(accumulated_delta_var) * grad / rms(accumulated_grad)
      accumulated_delta_var.assign(rho * accumulated_delta_var +
                                   (1 - rho) * delta_var * delta_var)
    variable.assign_add(lr * delta_var)
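The same recurrences written as a dense numpy reference; a minimal sketch (helper name hypothetical), assuming a dense gradient, which is effectively what the sparse path above produces once the IndexedSlices enters dense arithmetic:

import numpy as np

def adadelta_step(var, grad, acc_grad, acc_delta, lr, rho, epsilon):
    # E[g^2]_t = rho * E[g^2]_{t-1} + (1 - rho) * g_t^2
    acc_grad = rho * acc_grad + (1 - rho) * grad * grad
    # delta_t = -RMS(accumulated delta) * g_t / RMS(accumulated grad)
    delta = -np.sqrt(acc_delta + epsilon) * grad / np.sqrt(acc_grad + epsilon)
    # E[dx^2]_t = rho * E[dx^2]_{t-1} + (1 - rho) * delta_t^2
    acc_delta = rho * acc_delta + (1 - rho) * delta * delta
    return var + lr * delta, acc_grad, acc_delta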
Example #18
def run_sparse_sample(iterations, expected, optimizer):
    var_0 = tf.Variable([1.0, 2.0])
    var_1 = tf.Variable([3.0, 4.0])

    grad_0 = tf.IndexedSlices(tf.constant([0.1]), tf.constant([0]),
                              tf.constant([2]))
    grad_1 = tf.IndexedSlices(tf.constant([0.04]), tf.constant([1]),
                              tf.constant([2]))

    grads_and_vars = list(zip([grad_0, grad_1], [var_0, var_1]))

    for _ in range(iterations):
        optimizer.apply_gradients(grads_and_vars)

    np.testing.assert_allclose(var_0.read_value(), expected[0], atol=2e-4)
    np.testing.assert_allclose(var_1.read_value(), expected[1], atol=2e-4)
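A hypothetical call of this helper with a plain Keras SGD; the learning rate and expected values below are illustrative, computed as var -= lr * grad applied only at the updated rows:

import tensorflow as tf

run_sparse_sample(
    iterations=1,
    expected=[[1.0 - 0.01 * 0.1, 2.0], [3.0, 4.0 - 0.01 * 0.04]],
    optimizer=tf.keras.optimizers.SGD(learning_rate=0.01))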
Example #19
    def update_step(self, grad, variable):
        """Update step given gradient and the associated model variable."""
        lr = tf.cast(self.learning_rate, variable.dtype)

        var_key = self._var_key(variable)
        rho = self.rho
        accumulated_grad = self._accumulated_grads[self._index_dict[var_key]]
        accumulated_delta_var = self._accumulated_delta_vars[
            self._index_dict[var_key]]

        def rms(x):
            return tf.sqrt(x + self.epsilon)

        if isinstance(grad, tf.IndexedSlices):
            # Sparse gradients.
            accumulated_grad.assign_add((rho - 1) * accumulated_grad)
            accumulated_grad.scatter_add(
                tf.IndexedSlices((1 - rho) * tf.square(grad.values),
                                 grad.indices))
            delta_var = (-rms(accumulated_delta_var) * grad /
                         rms(accumulated_grad))
            accumulated_delta_var.assign(rho * accumulated_delta_var +
                                         (1 - rho) * delta_var * delta_var)
        else:
            # Dense gradients.
            accumulated_grad.assign(rho * accumulated_grad +
                                    (1 - rho) * grad * grad)
            delta_var = (-rms(accumulated_delta_var) * grad /
                         rms(accumulated_grad))
            accumulated_delta_var.assign(rho * accumulated_delta_var +
                                         (1 - rho) * delta_var * delta_var)
        variable.assign_add(lr * delta_var)
Example #20
    def update_step(self, gradient, variable):
        """Update step given gradient and the associated model variable."""
        lr = tf.cast(self.learning_rate, variable.dtype)
        m = None
        var_key = self._var_key(variable)
        if self.momentum != 0:
            momentum = tf.cast(self.momentum, variable.dtype)
            m = self.momentums[self._index_dict[var_key]]

        # TODO(b/204321487): Add nesterov acceleration.
        if isinstance(gradient, tf.IndexedSlices):
            # Sparse gradients.
            add_value = tf.IndexedSlices(-gradient.values * lr,
                                         gradient.indices)
            if m is not None:
                m.assign(m * momentum)
                m.scatter_add(add_value)
                if self.nesterov:
                    variable.scatter_add(add_value)
                    variable.assign_add(m * momentum)
                else:
                    variable.assign_add(m)
            else:
                variable.scatter_add(add_value)
        else:
            # Dense gradients
            if m is not None:
                m.assign(-gradient * lr + m * momentum)
                if self.nesterov:
                    variable.assign_add(-gradient * lr + m * momentum)
                else:
                    variable.assign_add(m)
            else:
                variable.assign_add(-gradient * lr)
Example #21
 def testSparseStability(self):
     # TODO(tanzheny, omalleyt): Fix test in eager mode.
     with tf.Graph().as_default():
         for dtype in [tf.half]:
             shape = [1, 6]
             var0_np = np.array([[
                 0.00872496, -0.106952, 0.110467, 0.226505, -0.0147257,
                 -0.0105945
             ]],
                                dtype=dtype.as_numpy_dtype)
             var0 = tf.Variable(var0_np)
             grads0_np = np.array([[
                 -5.91278e-05, 5.31673e-05, -2.5779e-06, 4.29153e-05,
                 -8.4877e-05, -9.48906e-05
             ]],
                                  dtype=dtype.as_numpy_dtype)
             grads0 = tf.IndexedSlices(tf.constant(grads0_np),
                                       tf.constant([0]), tf.constant(shape))
             ada_opt = adagrad.Adagrad(1.0)
             ada_update = ada_opt.apply_gradients(zip([grads0], [var0]))
             slot0 = ada_opt.get_slot(var0, "accumulator")
             init = tf.compat.v1.global_variables_initializer()
             for _ in range(100):
                 self.evaluate(init)
                 self.evaluate(ada_update)
                 self.assertAllCloseAccordingToType(
                     np.array([[0.1, 0.1, 0.1, 0.1, 0.1, 0.1]]),
                     self.evaluate(slot0))
                 self.assertAllCloseAccordingToType(
                     np.array([[
                         0.00891194, -0.10712013, 0.11047515, 0.22636929,
                         -0.0144573, -0.01029443
                     ]]), self.evaluate(var0))
Example #22
  def testSparse(self):
    # TODO(tanzheny, omalleyt): Fix test in eager mode.
    sparse_epsilon = 1e-7
    for dtype in [tf.half, tf.float32, tf.float64]:
      with tf.Graph().as_default(), self.cached_session():
        # Initialize variables for numpy implementation.
        m0, v0, m1, v1, mcache = 0.0, 0.0, 0.0, 0.0, 1.0
        var0_np = np.array([1.0, 1.0, 2.0], dtype=dtype.as_numpy_dtype)
        grads0_np = np.array([0.1, 0, 0.1], dtype=dtype.as_numpy_dtype)
        var1_np = np.array([3.0, 3.0, 4.0], dtype=dtype.as_numpy_dtype)
        grads1_np = np.array([0.01, 0, 0.01], dtype=dtype.as_numpy_dtype)

        var0 = tf.Variable(var0_np)
        var1 = tf.Variable(var1_np)
        grads0_np_indices = np.array([0, 2], dtype=np.int32)
        grads0 = tf.IndexedSlices(
            tf.constant(grads0_np[grads0_np_indices]),
            tf.constant(grads0_np_indices), tf.constant([3]))
        grads1_np_indices = np.array([0, 2], dtype=np.int32)
        grads1 = tf.IndexedSlices(
            tf.constant(grads1_np[grads1_np_indices]),
            tf.constant(grads1_np_indices), tf.constant([3]))
        opt = nadam.Nadam(epsilon=sparse_epsilon)
        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
        self.evaluate(tf.compat.v1.global_variables_initializer())

        # Fetch params to validate initial values
        self.assertAllClose([1.0, 1.0, 2.0], var0)
        self.assertAllClose([3.0, 3.0, 4.0], var1)

        beta1_power, beta2_power = get_beta_accumulators(opt, dtype)

        # Run 3 steps of Nadam
        for t in range(3):
          self.assertAllCloseAccordingToType(0.9**(t + 1), beta1_power)
          self.assertAllCloseAccordingToType(0.999**(t + 1), beta2_power)
          update.run()

          mcache = update_m_cache(mcache, t)
          var0_np, m0, v0 = nadam_update_numpy(
              var0_np, grads0_np, t, m0, v0, mcache, epsilon=sparse_epsilon)
          var1_np, m1, v1 = nadam_update_numpy(
              var1_np, grads1_np, t, m1, v1, mcache, epsilon=sparse_epsilon)

          # Validate updated params
          self.assertAllCloseAccordingToType(var0_np, var0)
          self.assertAllCloseAccordingToType(var1_np, var1)
Example #23
 def testIndexedSlices(self):
   dtype = tf.int64
   iss = tf.IndexedSlices(values=tf.ones([2, 3], dtype=dtype),
                          indices=tf.constant([1, 9]),
                          dense_shape=[10, 3])
   a = array_ops.array(iss, copy=False)
   expected = tf.scatter_nd([[1], [9]], tf.ones([2, 3], dtype=dtype), [10, 3])
   self.assertAllEqual(expected, a)
Example #24
def _multiply_gradient(gradient, scale):
    """Multiply a (possibly sparse) gradient by the given scale factor."""
    scale = tf.cast(scale, gradient.dtype)
    if isinstance(gradient, tf.IndexedSlices):
        return tf.IndexedSlices(gradient.values * scale,
                                gradient.indices,
                                dense_shape=gradient.dense_shape)
    else:
        return gradient * scale
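An illustrative use of the helper on both gradient kinds (values made up):

dense = tf.constant([2.0, 4.0])
sparse = tf.IndexedSlices(tf.constant([[2.0]]), tf.constant([1]), tf.constant([3, 1]))
_multiply_gradient(dense, 0.5)          # -> [1.0, 2.0]
_multiply_gradient(sparse, 0.5).values  # -> [[1.0]]; indices and dense_shape are preserved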
Example #25
 def select(self, step):
     """Returns the index of the selected representation for a training step."""
     if step - self.last_selection_step >= self.sample_freq:
         self.current_selection.assign(self._select())
         self.last_selection_step.assign(step)
     # Increment the counter for the newly selected item.
     self.selection_counter.scatter_add(
         tf.IndexedSlices(1, self.current_selection))
     return self.current_selection.numpy()
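The scatter_add call above increments a single counter slot by passing an IndexedSlices with a scalar value and a scalar index; the same pattern in isolation (names and shapes hypothetical):

counts = tf.Variable(tf.zeros([4], dtype=tf.int32))
selection = tf.constant(2)
counts.scatter_add(tf.IndexedSlices(1, selection))  # counts is now [0, 0, 1, 0]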
Example #26
    def testFtrlWithL1_L2_L2ShrinkageSparse(self):
        """Tests the new FTRL op with support for l2 shrinkage on sparse
        grads."""
        # TODO(tanzheny, omalleyt): Fix test in eager mode.
        for dtype in [tf.half, tf.float32]:
            with tf.Graph().as_default(), self.cached_session():
                var0 = tf.Variable([[1.0], [2.0]], dtype=dtype)
                var1 = tf.Variable([[4.0], [3.0]], dtype=dtype)
                grads0 = tf.IndexedSlices(
                    tf.constant([0.1], shape=[1, 1], dtype=dtype),
                    tf.constant([0]),
                    tf.constant([2, 1]),
                )
                grads1 = tf.IndexedSlices(
                    tf.constant([0.02], shape=[1, 1], dtype=dtype),
                    tf.constant([1]),
                    tf.constant([2, 1]),
                )

                opt = ftrl.Ftrl(
                    3.0,
                    initial_accumulator_value=0.1,
                    l1_regularization_strength=0.001,
                    l2_regularization_strength=2.0,
                    l2_shrinkage_regularization_strength=0.1,
                )
                update = opt.apply_gradients(
                    zip([grads0, grads1], [var0, var1]))
                self.evaluate(tf.compat.v1.global_variables_initializer())

                v0_val, v1_val = self.evaluate([var0, var1])
                self.assertAllCloseAccordingToType([[1.0], [2.0]], v0_val)
                self.assertAllCloseAccordingToType([[4.0], [3.0]], v1_val)

                # Run 10 steps of FTRL
                for _ in range(10):
                    update.run()

                v0_val, v1_val = self.evaluate([var0, var1])
                self.assertAllCloseAccordingToType([[-0.22578995], [2.0]],
                                                   v0_val)
                self.assertAllCloseAccordingToType([[4.0], [-0.13229476]],
                                                   v1_val)
Example #27
 def testGetUnscaledSparseGradients(self, opt_cls):
   opt = create_sgd(opt_cls)
   opt = create_lso(opt, dynamic=False, initial_scale=2)
   sparse_scaled_grad = tf.IndexedSlices(
       tf.convert_to_tensor([[4., 2.], [8., 5.]]),
       tf.convert_to_tensor([1, 3], dtype='int32'),
       dense_shape=tf.convert_to_tensor([5, 2], dtype='int32'))
   sparse_grad = opt.get_unscaled_gradients([sparse_scaled_grad])[0]
   self.assertIsInstance(sparse_grad, tf.IndexedSlices)
   self.assertAllEqual([[2., 1.], [4., 2.5]],
                       self.evaluate(sparse_grad.values))
Example #28
    def testSparseBasic(self):
        for dtype in [tf.half, tf.float32, tf.float64]:
            with self.cached_session():
                var0 = tf.Variable([[1.1], [2.1]], dtype=dtype)
                var1 = tf.Variable([[3.0], [4.0]], dtype=dtype)
                grads0 = tf.IndexedSlices(
                    tf.constant([0.1], shape=[1, 1], dtype=dtype),
                    tf.constant([0]), tf.constant([2, 1]))
                grads1 = tf.IndexedSlices(
                    tf.constant([0.01], shape=[1, 1], dtype=dtype),
                    tf.constant([1]), tf.constant([2, 1]))
                decay_rate = 0.1
                batch_size = 2
                total_num_examples = 10
                sgd_optimizer = tfp.optimizer.VariationalSGD(
                    batch_size,
                    total_num_examples,
                    max_learning_rate=3.0,
                    burnin=0,
                    preconditioner_decay_rate=decay_rate)
                if not tf.executing_eagerly():
                    sgd_op = sgd_optimizer.apply_gradients(
                        zip([grads0, grads1], [var0, var1]))

                self.evaluate(tf1.global_variables_initializer())
                # Fetch params to validate initial values
                self.assertAllCloseAccordingToType([[1.1], [2.1]],
                                                   self.evaluate(var0))
                self.assertAllCloseAccordingToType([[3.0], [4.0]],
                                                   self.evaluate(var1))
                # Run 1 step of sgd
                if not tf.executing_eagerly():
                    self.evaluate(sgd_op)
                else:
                    sgd_optimizer.apply_gradients(
                        zip([grads0, grads1], [var0, var1]))
                # Validate updated params
                self.assertAllCloseAccordingToType([[1.1 - 3. * 0.1], [2.1]],
                                                   self.evaluate(var0))
                self.assertAllCloseAccordingToType(
                    [[3. - 3. * 0], [4. - 3. * 0.01]], self.evaluate(var1))
Example #29
    def _resource_apply_sparse(self, grad, var, indices):
        max_learning_rate = tf.where(
            self.iterations < tf.cast(self._burnin, tf.int64),
            self._burnin_max_learning_rate, self._max_learning_rate)

        learn_rate = tf.clip_by_value(
            self._get_coordinatewise_learning_rate(
                tf.IndexedSlices(grad, indices), var), 0.,
            tf.cast(max_learning_rate, var.dtype))
        delta = grad * learn_rate

        return self._resource_scatter_add(var, indices, -delta)
Example #30
 def testGetUnscaledSparseGradients(self):
     opt = gradient_descent.SGD(2.0)
     opt = loss_scale_optimizer.LossScaleOptimizer(opt,
                                                   dynamic=False,
                                                   initial_scale=2)
     sparse_scaled_grad = tf.IndexedSlices(
         tf.convert_to_tensor([[4., 2.], [8., 5.]]),
         tf.convert_to_tensor([1, 3], dtype='int32'),
         dense_shape=tf.convert_to_tensor([5, 2], dtype='int32'))
     sparse_grad = opt.get_unscaled_gradients([sparse_scaled_grad])[0]
     self.assertIsInstance(sparse_grad, tf.IndexedSlices)
     self.assertAllEqual([[2., 1.], [4., 2.5]],
                         self.evaluate(sparse_grad.values))