Example #1
# Excerpted from Optax (optax/_src/alias.py): it relies on the `base`, `combine`
# and `transform` submodules, the `ScalarOrSchedule` type alias, and the private
# `_scale_by_learning_rate` helper defined in the same module.
def adagrad(learning_rate: ScalarOrSchedule,
            initial_accumulator_value: float = 0.1,
            eps: float = 1e-7) -> base.GradientTransformation:
    """The Adagrad optimizer.

  Adagrad is an algorithm for gradient based optimisation that anneals the
  learning rate for each parameter during the course of training.

  WARNING: Adagrad's main limit is the monotonic accumulation of squared
  gradients in the denominator: since all terms are >0, the sum keeps growing
  during training and the learning rate eventually becomes vanishingly small.

  References:
    [Duchi et al, 2011](https://jmlr.org/papers/v12/duchi11a.html)

  Args:
    learning_rate: this is a fixed global scaling factor.
    initial_accumulator_value: initialisation for the accumulator.
    eps: a small constant applied to denominator inside of the square root
      (as in RMSProp) to avoid dividing by zero when rescaling.

  Returns:
    the corresponding `GradientTransformation`.
  """
    return combine.chain(
        transform.scale_by_rss(
            initial_accumulator_value=initial_accumulator_value, eps=eps),
        _scale_by_learning_rate(learning_rate),
    )
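
For context, the transformation built above is used through Optax's standard init/update API. The following is a minimal usage sketch assuming the public `optax.adagrad` alias; the toy quadratic loss, parameter shape and step count are illustrative choices, not part of the excerpt.

import jax
import jax.numpy as jnp
import optax

def loss_fn(params):
    # Toy quadratic objective with an arbitrary target vector.
    return jnp.sum((params - jnp.array([1.0, -2.0, 3.0])) ** 2)

params = jnp.zeros(3)
optimizer = optax.adagrad(learning_rate=0.1)  # the chain defined above
opt_state = optimizer.init(params)

for _ in range(100):
    grads = jax.grad(loss_fn)(params)
    # `update` rescales gradients using the accumulated squared-gradient statistics.
    updates, opt_state = optimizer.update(grads, opt_state)
    params = optax.apply_updates(params, updates)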
Example #2
# A variant of the same alias with a plain float learning rate, applied directly
# via `transform.scale`; `combine`, `transform` and `GradientTransformation`
# come from Optax as in Example #1.
def adagrad(learning_rate: float,
            initial_accumulator_value: float = 0.1,
            eps: float = 1e-7) -> GradientTransformation:
    return combine.chain(
        transform.scale_by_rss(
            initial_accumulator_value=initial_accumulator_value, eps=eps),
        transform.scale(-learning_rate),
    )
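
Both examples reduce to the same per-parameter update rule, which also makes the docstring's warning concrete: the accumulator of squared gradients never decreases, so the effective step size shrinks monotonically. Below is a simplified textbook-Adagrad sketch for illustration only; it is not the library's internal `scale_by_rss` code, and the function name and defaults are assumptions.

import jax.numpy as jnp

def adagrad_step(params, grads, sum_sq, lr=0.1, eps=1e-7):
    # Accumulate squared gradients; `sum_sq` can only grow over time.
    sum_sq = sum_sq + grads ** 2
    # Per-parameter step size: shrinks as the accumulator grows.
    step = lr / jnp.sqrt(sum_sq + eps)
    return params - step * grads, sum_sq

# `sum_sq` would start at the value of `initial_accumulator_value` above, e.g.:
# sum_sq = jnp.full_like(params, initial_accumulator_value)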