Example #1
def testDefaultNoisyLinearCosine(self, serialize):
     num_training_steps = 1000
     initial_lr = 1.0
     for step in range(0, 1500, 250):
         # No numerical check because of noise
         decayed_lr = learning_rate_schedule.NoisyLinearCosineDecay(
             initial_lr, num_training_steps)
         decayed_lr = _maybe_serialized(decayed_lr, serialize)
         # Cannot be deterministically tested
         self.evaluate(decayed_lr(step))
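The `_maybe_serialized` helper referenced by these tests is not shown here. A minimal sketch of a plausible implementation, assuming it simply round-trips the schedule through the Keras schedule (de)serialization API when the `serialize` flag is set (the helper's name and behavior are inferred, not taken from the library):
```python
import tensorflow as tf

def _maybe_serialized(lr_decay, serialize_and_deserialize):
    # Hypothetical helper: when the flag is set, round-trip the schedule
    # through the Keras serialization API so the test also covers
    # (de)serialization; otherwise return the schedule unchanged.
    if serialize_and_deserialize:
        serialized = tf.keras.optimizers.schedules.serialize(lr_decay)
        return tf.keras.optimizers.schedules.deserialize(serialized)
    return lr_decay
```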
Example #2
def testNonDefaultNoisyLinearCosine(self, serialize):
     num_training_steps = 1000
     initial_lr = 1.0
     for step in range(0, 1500, 250):
         # No numerical check because of noise
         decayed_lr = learning_rate_schedule.NoisyLinearCosineDecay(
             initial_lr,
             num_training_steps,
             initial_variance=0.5,
             variance_decay=0.1,
             alpha=0.1,
             beta=1e-4,
             num_periods=5)
         decayed_lr = _maybe_serialized(decayed_lr, serialize)
         # Cannot be deterministically tested
         self.evaluate(decayed_lr(step))
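Outside the test harness, a schedule configured with these non-default parameters can be handed straight to a Keras optimizer. A sketch, assuming the class is exported as `tf.keras.experimental.NoisyLinearCosineDecay` (the public alias in the TF releases these tests target; newer releases may have moved or removed it):
```python
import tensorflow as tf

# Assumed public alias of the schedule used in these tests; adjust the
# import path for your TensorFlow release.
lr_schedule = tf.keras.experimental.NoisyLinearCosineDecay(
    initial_learning_rate=1.0,
    decay_steps=1000,
    initial_variance=0.5,
    variance_decay=0.1,
    alpha=0.1,
    beta=1e-4,
    num_periods=5)

# Keras optimizers accept a LearningRateSchedule directly and evaluate it
# at the current optimizer iteration on every update.
optimizer = tf.keras.optimizers.SGD(learning_rate=lr_schedule)
```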
Example #3
def noisy_linear_cosine_decay(learning_rate,
                              global_step,
                              decay_steps,
                              initial_variance=1.0,
                              variance_decay=0.55,
                              num_periods=0.5,
                              alpha=0.0,
                              beta=0.001,
                              name=None):
    """Applies noisy linear cosine decay to the learning rate.

  See [Bello et al., ICML 2017] Neural Optimizer Search with RL.
  https://arxiv.org/abs/1709.07417

  For the idea of warm starts here controlled by `num_periods`,
  see [Loshchilov & Hutter, ICLR 2017] SGDR: Stochastic Gradient Descent
  with Warm Restarts. https://arxiv.org/abs/1608.03983

  Note that linear cosine decay is more aggressive than cosine decay and
  larger initial learning rates can typically be used.

  When training a model, it is often recommended to lower the learning rate as
  the training progresses.  This function applies a noisy linear
  cosine decay function to a provided initial learning rate.
  It requires a `global_step` value to compute the decayed learning rate.
  You can just pass a TensorFlow variable that you increment at each
  training step.

  The function returns the decayed learning rate.  It is computed as:
  ```python
  global_step = min(global_step, decay_steps)
  linear_decay = (decay_steps - global_step) / decay_steps
  cosine_decay = 0.5 * (
      1 + cos(pi * 2 * num_periods * global_step / decay_steps))
  decayed = (alpha + linear_decay + eps_t) * cosine_decay + beta
  decayed_learning_rate = learning_rate * decayed
  ```
  where `eps_t` is zero-centered Gaussian noise with variance
  `initial_variance / (1 + global_step) ** variance_decay`.

  Example usage:
  ```python
  decay_steps = 1000
  lr_decayed = noisy_linear_cosine_decay(
    learning_rate, global_step, decay_steps)
  ```

  Args:
    learning_rate: A scalar `float32` or `float64` Tensor or a Python number.
      The initial learning rate.
    global_step: A scalar `int32` or `int64` `Tensor` or a Python number. Global
      step to use for the decay computation.
    decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number. Number
      of steps to decay over.
    initial_variance: initial variance for the noise. See computation above.
    variance_decay: decay for the noise's variance. See computation above.
    num_periods: Number of periods in the cosine part of the decay. See
      computation above.
    alpha: See computation above.
    beta: See computation above.
    name: String.  Optional name of the operation.  Defaults to
      'NoisyLinearCosineDecay'.

  Returns:
    A scalar `Tensor` of the same type as `learning_rate`.  The decayed
    learning rate.
  Raises:
    ValueError: if `global_step` is not supplied.

  @compatibility(eager)
  When eager execution is enabled, this function returns a function which in
  turn returns the decayed learning rate Tensor. This can be useful for changing
  the learning rate value across different invocations of optimizer functions.
  @end_compatibility
  """
    decayed_lr = learning_rate_schedule.NoisyLinearCosineDecay(
        learning_rate,
        decay_steps,
        initial_variance=initial_variance,
        variance_decay=variance_decay,
        num_periods=num_periods,
        alpha=alpha,
        beta=beta,
        name=name)

    if not context.executing_eagerly():
        decayed_lr = decayed_lr(global_step)
    else:
        decayed_lr = functools.partial(decayed_lr, global_step)
    return decayed_lr
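To make the docstring formula above concrete, here is a standalone NumPy sketch that evaluates the same expression step by step. It mirrors the documented math only, not the library implementation, and the noise term makes individual values non-reproducible by design:
```python
import numpy as np

def noisy_linear_cosine_decay_value(learning_rate, global_step, decay_steps,
                                    initial_variance=1.0, variance_decay=0.55,
                                    num_periods=0.5, alpha=0.0, beta=0.001,
                                    rng=np.random.default_rng(0)):
    # Direct transcription of the docstring formula for a single step.
    step = min(global_step, decay_steps)
    linear_decay = (decay_steps - step) / decay_steps
    cosine_decay = 0.5 * (1 + np.cos(np.pi * 2 * num_periods * step / decay_steps))
    variance = initial_variance / (1 + step) ** variance_decay
    eps_t = rng.normal(0.0, np.sqrt(variance))  # zero-centered Gaussian noise
    decayed = (alpha + linear_decay + eps_t) * cosine_decay + beta
    return learning_rate * decayed

# Example: decayed values at a few steps of a 1000-step schedule.
for step in range(0, 1500, 250):
    print(step, noisy_linear_cosine_decay_value(1.0, step, 1000))
```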
Example #4
def noisy_linear_cosine_decay(learning_rate,
                              global_step,
                              decay_steps,
                              k_decay=1.0,
                              initial_variance=1.0,
                              variance_decay=0.55,
                              num_periods=0.5,
                              alpha=0.0,
                              beta=0.001,
                              name=None):
    """Applies noisy linear cosine decay to the learning rate.

  Note that linear cosine decay is more aggressive than cosine decay and
  larger initial learning rates can typically be used.

  When training a model, it is often recommended to lower the learning rate as
  the training progresses.  This function applies a noisy linear
  cosine decay function to a provided initial learning rate.
  It requires a `global_step` value to compute the decayed learning rate.
  You can just pass a TensorFlow variable that you increment at each
  training step.

  The function returns the decayed learning rate.  It is computed as:
  ```python
  global_step = min(global_step, decay_steps)
  linear_decay = (decay_steps - global_step) / decay_steps
  cosine_decay = 0.5 * (
      1 + cos(pi * 2 * num_periods * (global_step / decay_steps) ** k_decay))
  decayed = (alpha + linear_decay + eps_t) * cosine_decay + beta
  decayed_learning_rate = learning_rate * decayed
  ```
  where `eps_t` is zero-centered Gaussian noise with variance
  `initial_variance / (1 + global_step) ** variance_decay`.

  Example usage:
  ```python
  decay_steps = 1000
  lr_decayed = noisy_linear_cosine_decay(
    learning_rate, global_step, decay_steps)
  ```

  Args:
    learning_rate: A scalar `float32` or `float64` Tensor or a Python number.
      The initial learning rate.
    global_step: A scalar `int32` or `int64` `Tensor` or a Python number. Global
      step to use for the decay computation.
    decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number. Number
      of steps to decay over.
    k_decay: A scalar `float32` or `float64` `Tensor` or a Python number. The k value
      of the polynomial used by the k-decay method. Defaults to 1.0.
    initial_variance: initial variance for the noise. See computation above.
    variance_decay: decay for the noise's variance. See computation above.
    num_periods: Number of periods in the cosine part of the decay. See
      computation above.
    alpha: See computation above.
    beta: See computation above.
    name: String.  Optional name of the operation.  Defaults to
      'NoisyLinearCosineDecay'.

  Returns:
    A scalar `Tensor` of the same type as `learning_rate`.  The decayed
    learning rate.
  Raises:
    ValueError: if `global_step` is not supplied.

  References:
    Neural Optimizer Search with Reinforcement Learning:
      [Bello et al., 2017](http://proceedings.mlr.press/v70/bello17a.html)
      ([pdf](http://proceedings.mlr.press/v70/bello17a/bello17a.pdf))
    Stochastic Gradient Descent with Warm Restarts:
      [Loshchilov et al., 2017](https://openreview.net/forum?id=Skq89Scxx&noteId=Skq89Scxx)
      ([pdf](https://openreview.net/pdf?id=Skq89Scxx))
    k-decay: A New Method For Learning Rate Schedule:
      [Tao Zhang, Wei Li, 2020]
      ([pdf](https://arxiv.org/abs/2004.05909))

  @compatibility(eager)
  When eager execution is enabled, this function returns a function which in
  turn returns the decayed learning rate Tensor. This can be useful for changing
  the learning rate value across different invocations of optimizer functions.
  @end_compatibility
  """
    decayed_lr = learning_rate_schedule.NoisyLinearCosineDecay(
        learning_rate,
        decay_steps,
        k_decay=k_decay,
        initial_variance=initial_variance,
        variance_decay=variance_decay,
        num_periods=num_periods,
        alpha=alpha,
        beta=beta,
        name=name)

    if not context.executing_eagerly():
        decayed_lr = decayed_lr(global_step)
    else:
        decayed_lr = functools.partial(decayed_lr, global_step)
    return decayed_lr
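The only difference from Example #3 is the `k_decay` exponent applied to the progress ratio inside the cosine term. A small NumPy sketch of just that cosine factor, following the docstring formula above, shows how different `k_decay` values reshape the schedule (`k_decay=1.0` reproduces the plain linear cosine behavior):
```python
import numpy as np

def cosine_factor(step, decay_steps, num_periods=0.5, k_decay=1.0):
    # Cosine part of the k-decay variant: raising the progress ratio to the
    # power k_decay changes how fast the cosine argument advances over training.
    progress = min(step, decay_steps) / decay_steps
    return 0.5 * (1 + np.cos(np.pi * 2 * num_periods * progress ** k_decay))

# Compare the cosine factor at a few steps for several k_decay values.
for k in (0.5, 1.0, 2.0):
    print(k, [round(cosine_factor(s, 1000, k_decay=k), 3)
              for s in (0, 250, 500, 750, 1000)])
```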