Example #1
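# These examples assume imports along the following lines (a sketch, not taken
# verbatim from any one source; AdamWeightDecay, WarmUp, and the other schedule
# helpers referenced below are defined alongside create_optimizer in each
# source module and are not reproduced here):
from absl import logging

import tensorflow as tf
import tensorflow_addons.optimizers as tfa_optimizers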
def create_optimizer(
    init_lr,
    num_train_steps,
    num_warmup_steps,
    learning_rate_type="polynomial",
    adam_beta_2=0.999,
    adam_epsilon=1e-06,
    weight_decay_rate=0.0,
    optimizer_type="adamw",
):
    """Creates an optimizer with learning rate schedule."""

    if optimizer_type == "adafactor":
        return AdafactorOptimizer(learning_rate=init_lr)
    # Implements linear decay of the learning rate.
    if learning_rate_type == "linear":
        if num_warmup_steps:
            learning_rate_fn = WarmUp_Linear(
                initial_learning_rate=init_lr,
                num_training_steps=num_train_steps,
                warmup_steps=num_warmup_steps,
            )

    else:
        learning_rate_fn = tf.keras.optimizers.schedules.PolynomialDecay(
            initial_learning_rate=init_lr,
            decay_steps=num_train_steps,
            end_learning_rate=0.0,
        )
        if num_warmup_steps:
            learning_rate_fn = WarmUp(
                initial_learning_rate=init_lr,
                decay_schedule_fn=learning_rate_fn,
                warmup_steps=num_warmup_steps,
            )

    if optimizer_type == "adamw":
        logging.info("using Adamw optimizer")
        optimizer = AdamWeightDecay(
            learning_rate=learning_rate_fn,
            weight_decay_rate=weight_decay_rate,
            beta_1=0.9,
            beta_2=adam_beta_2,
            epsilon=adam_epsilon,
            exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"],
        )
    elif optimizer_type == "lamb":
        logging.info("using Lamb optimizer")
        optimizer = tfa_optimizers.LAMB(
            learning_rate=learning_rate_fn,
            weight_decay_rate=weight_decay_rate,
            beta_1=0.9,
            beta_2=0.999,
            epsilon=1e-6,
            exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"],
        )
    else:
        raise ValueError("Unsupported optimizer type: ", optimizer_type)

    return optimizer, learning_rate_fn
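# A minimal usage sketch for the factory above; the hyperparameter values are
# illustrative, not taken from the source.
optimizer, lr_fn = create_optimizer(
    init_lr=3e-5,
    num_train_steps=10000,
    num_warmup_steps=1000,
    optimizer_type="adamw",
)
# The pair can then be handed to Keras training, e.g.
# model.compile(optimizer=optimizer, loss=..., metrics=[...]).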
Example #2
    def test_lamb_optimizer(self):
        params = {'optimizer': {'type': 'lamb'}}
        expected_optimizer_config = tfa_optimizers.LAMB().get_config()
        opt_config = optimization_config.OptimizationConfig(params)
        opt_factory = optimizer_factory.OptimizerFactory(opt_config)
        lr = opt_factory.build_learning_rate()
        optimizer = opt_factory.build_optimizer(lr)

        self.assertIsInstance(optimizer, tfa_optimizers.LAMB)
        self.assertEqual(expected_optimizer_config, optimizer.get_config())
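# A minimal sketch of the scaffolding the test method above assumes; the class
# name and import paths are assumptions patterned on the TF Model Garden layout,
# not taken from the source.
import tensorflow as tf
import tensorflow_addons.optimizers as tfa_optimizers
from official.modeling.optimization import optimizer_factory
from official.modeling.optimization.configs import optimization_config


class OptimizerFactoryTest(tf.test.TestCase):
    pass  # test_lamb_optimizer above would be defined inside this class


if __name__ == '__main__':
    tf.test.main()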
Example #3
def create_optimizer(init_lr,
                     num_train_steps,
                     num_warmup_steps,
                     optimizer,
                     use_lr_schedule,
                     use_bias_correction_for_adamw=False):
    """Creates an optimizer with learning rate schedule.

    Extended based on official.nlp.optimization.create_optimizer
    :param init_lr Initial learning rate
    :param num_train_steps Number of training steps
    :param num_warmup_steps Number of warmup steps
    :param optimizer Type of optimizer
    :param use_lr_schedule Whether to use learning rate scheudling such as warm up and decay
    :param use_bias_correction_for_adamw Whether to use bias correction in AdamWeightDecay optimzer
    """
    lr_schedule = init_lr
    if use_lr_schedule:
        # Implements linear decay of the learning rate.
        lr_schedule = tf.keras.optimizers.schedules.PolynomialDecay(
            initial_learning_rate=init_lr,
            decay_steps=num_train_steps,
            power=1.0,
            end_learning_rate=0.0)

        if num_warmup_steps:
            lr_schedule = WarmUp(initial_learning_rate=init_lr,
                                 decay_schedule_fn=lr_schedule,
                                 warmup_steps=num_warmup_steps)

    optimizer_dct = {
        'sgd':
        tf.keras.optimizers.SGD(learning_rate=lr_schedule),
        'adam':
        tf.keras.optimizers.Adam(learning_rate=lr_schedule),
        'adamw':
        AdamWeightDecay(
            learning_rate=lr_schedule,
            weight_decay_rate=0.01,
            beta_1=0.9,
            beta_2=0.999,
            epsilon=1e-6,
            exclude_from_weight_decay=['LayerNorm', 'layer_norm', 'bias'],
            use_bias_correction=use_bias_correction_for_adamw),
        'lamb':
        tfa_optimizers.LAMB(
            learning_rate=lr_schedule,
            weight_decay_rate=0.01,
            beta_1=0.9,
            beta_2=0.999,
            epsilon=1e-6,
            exclude_from_weight_decay=['LayerNorm', 'layer_norm', 'bias'])
    }

    return optimizer_dct[optimizer]
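# Hypothetical call of the variant above; with use_lr_schedule=False the raw
# init_lr is passed straight to the chosen optimizer (values are illustrative).
# Note that the dict builds all four optimizers eagerly and then returns one of
# them; this is typically cheap because optimizer state is only materialized
# once gradients are applied.
opt = create_optimizer(
    init_lr=1e-3,
    num_train_steps=5000,
    num_warmup_steps=0,
    optimizer='sgd',
    use_lr_schedule=False,
)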
Example #4
def create_optimizer(init_lr,
                     num_train_steps,
                     num_warmup_steps,
                     optimizer_type="adam"):
    """Creates an optimizer with learning rate schedule."""
    # Implements linear decay of the learning rate.
    if optimizer_type == "adam":
        power = 1.0
        decayed_learning_rate_at_crossover_point = init_lr * (
            (1.0 - float(num_warmup_steps) / float(num_train_steps))**power)
    else:
        power = 0.5
        decayed_learning_rate_at_crossover_point = init_lr
    init_lr = init_lr * (init_lr / decayed_learning_rate_at_crossover_point)
    print(
        'decayed_learning_rate_at_crossover_point = %e, adjusted_init_lr = %e'
        % (decayed_learning_rate_at_crossover_point, init_lr))
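    # Worked example of the adjustment above (illustrative numbers, not from the
    # source): with init_lr = 1e-4, num_warmup_steps = 1000, num_train_steps =
    # 10000 and power = 1.0, the crossover value is 1e-4 * (1 - 0.1) = 9e-5 and
    # the adjusted init_lr is 1e-4 * (1e-4 / 9e-5) ~= 1.111e-4. The decayed
    # schedule below then evaluates to exactly the requested 1e-4 at step 1000,
    # i.e. at the point where warmup hands over to the decay. For the non-adam
    # branch the crossover equals init_lr, so init_lr is left unchanged.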

    learning_rate_fn = tf.keras.optimizers.schedules.PolynomialDecay(
        initial_learning_rate=init_lr,
        decay_steps=num_train_steps,
        end_learning_rate=0.0,
        power=power)
    if num_warmup_steps:
        learning_rate_fn = WarmUp(initial_learning_rate=init_lr,
                                  decay_schedule_fn=learning_rate_fn,
                                  warmup_steps=num_warmup_steps)
    if optimizer_type == 'adam':
        optimizer = AdamWeightDecay(
            learning_rate=learning_rate_fn,
            weight_decay_rate=0.01,
            beta_1=0.9,
            beta_2=0.999,
            epsilon=1e-6,
            exclude_from_weight_decay=['LayerNorm', 'layer_norm', 'bias'])
    else:
        # Dummy entry: prevents LAMB from defaulting exclude_from_layer_adaptation
        # to exclude_from_weight_decay when the argument is None.
        skip_list = ['None']
        optimizer = tfa_optimizers.LAMB(
            learning_rate=learning_rate_fn,
            weight_decay_rate=0.01,
            beta_1=0.9,
            beta_2=0.999,
            epsilon=1e-6,
            exclude_from_weight_decay=['LayerNorm', 'layer_norm', 'bias'],
            exclude_from_layer_adaptation=skip_list)
    return optimizer
Example #5
def create_optimizer(init_lr, num_train_steps, num_warmup_steps, weight_decay_rate=0.01,
                     layerwise_lr_decay=-1, n_transformer_layers=None, clip_norm=1.0,
                     optimizer="adam", skip_adaptive=False, power=1.0, beta_1=0.9, beta_2=0.999, end_lr=0.0):
    """Creates an optimizer with learning rate schedule."""
    # Implements linear decay of the learning rate.
    learning_rate_fn = tf.keras.optimizers.schedules.PolynomialDecay(
        initial_learning_rate=init_lr, decay_steps=num_train_steps - num_warmup_steps, end_learning_rate=end_lr, power=power
    )
    if num_warmup_steps:
        learning_rate_fn = WarmUp(
            initial_learning_rate=init_lr, decay_schedule_fn=learning_rate_fn, warmup_steps=num_warmup_steps
        )
    layer_decay = None
    if layerwise_lr_decay > 0 and n_transformer_layers is not None:
        layer_decay = _get_layer_decay(layerwise_lr_decay, n_transformer_layers)

    if optimizer == "adam":
        optimizer = AdamWeightDecay(
            learning_rate=learning_rate_fn,
            weight_decay_rate=weight_decay_rate,
            layer_decay=layer_decay,
            beta_1=beta_1,
            beta_2=beta_2,
            epsilon=1e-6,
            exclude_from_weight_decay=["layer_norm", "bias", "LayerNorm"],
            clip_norm=clip_norm,
        )
    else:
        if skip_adaptive:
            skip_list = ["layer_norm", "bias", "LayerNorm"]
        else:
            # Dummy entry so LAMB does not fall back to exclude_from_weight_decay
            # for layer adaptation when exclude_from_layer_adaptation is None.
            skip_list = ["None"]
        log("Skip list for LAMB {}".format(skip_list))

        optimizer = tfa_optimizers.LAMB(
            learning_rate=learning_rate_fn,
            weight_decay_rate=weight_decay_rate,
            beta_1=beta_1,
            beta_2=beta_2,
            epsilon=1e-6,
            exclude_from_weight_decay=["layer_norm", "bias", "LayerNorm"],
            exclude_from_layer_adaptation=skip_list,
        )

    return optimizer
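# Hypothetical call of the variant above with layer-wise learning-rate decay
# enabled; the values and the 12-layer depth are illustrative only.
opt = create_optimizer(
    init_lr=2e-5,
    num_train_steps=100000,
    num_warmup_steps=10000,
    layerwise_lr_decay=0.8,
    n_transformer_layers=12,
    optimizer="adam",
)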
Example #6
def create_optimizer(init_lr,
                     num_train_steps,
                     num_warmup_steps,
                     end_lr=0.0,
                     optimizer_type="adamw"):
    """Creates an optimizer with learning rate schedule."""
    # Implements linear decay of the learning rate.
    lr_schedule = tf.keras.optimizers.schedules.PolynomialDecay(
        initial_learning_rate=init_lr,
        decay_steps=num_train_steps,
        end_learning_rate=end_lr,
    )
    if num_warmup_steps:
        lr_schedule = WarmUp(
            initial_learning_rate=init_lr,
            decay_schedule_fn=lr_schedule,
            warmup_steps=num_warmup_steps,
        )

    if optimizer_type == "adamw":
        logging.info("using Adamw optimizer")
        optimizer = AdamWeightDecay(
            learning_rate=lr_schedule,
            weight_decay_rate=0.01,
            beta_1=0.9,
            beta_2=0.999,
            epsilon=1e-6,
            exclude_from_weight_decay=["layer_norm", "bias"],
        )
    elif optimizer_type == "lamb":
        logging.info("using Lamb optimizer")
        optimizer = tfa_optimizers.LAMB(
            learning_rate=lr_schedule,
            weight_decay_rate=0.01,
            beta_1=0.9,
            beta_2=0.999,
            epsilon=1e-6,
            exclude_from_weight_decay=["layer_norm", "bias"],
        )
    else:
        raise ValueError("Unsupported optimizer type: ", optimizer_type)

    return optimizer
Example #7
def create_optimizer(init_lr,
                     num_train_steps,
                     num_warmup_steps,
                     end_lr=0.0,
                     optimizer_type='adamw',
                     beta_1=0.9,
                     poly_power=1.0):
  """Creates an optimizer with learning rate schedule."""
  # Implements linear decay of the learning rate.
  lr_schedule = tf.keras.optimizers.schedules.PolynomialDecay(
      initial_learning_rate=init_lr,
      decay_steps=num_train_steps,
      end_learning_rate=end_lr,
      power=poly_power)
  if num_warmup_steps:
    lr_schedule = WarmUp(
        initial_learning_rate=init_lr,
        decay_schedule_fn=lr_schedule,
        warmup_steps=num_warmup_steps)

  if optimizer_type == 'adamw':
    logging.info('using Adamw optimizer')
    optimizer = AdamWeightDecay(
        learning_rate=lr_schedule,
        weight_decay_rate=0.01,
        beta_1=beta_1,
        beta_2=0.999,
        epsilon=1e-6,
        exclude_from_weight_decay=['LayerNorm', 'layer_norm', 'bias'])
  elif optimizer_type == 'lamb':
    logging.info('using Lamb optimizer')
    optimizer = tfa_optimizers.LAMB(
        learning_rate=lr_schedule,
        weight_decay_rate=0.01,
        beta_1=beta_1,
        beta_2=0.999,
        epsilon=1e-6,
        exclude_from_weight_decay=['LayerNorm', 'layer_norm', 'bias'])
  else:
    raise ValueError(f'Unsupported optimizer type: {optimizer_type}')

  return optimizer