def create_optimizer(init_lr,
                     num_train_steps,
                     num_warmup_steps,
                     min_lr_ratio=0.0,
                     adam_epsilon=1e-8,
                     weight_decay_rate=0.0):
  """Creates an optimizer with learning rate schedule."""
  # Implements linear decay of the learning rate.
  learning_rate_fn = tf.keras.optimizers.schedules.PolynomialDecay(
      initial_learning_rate=init_lr,
      decay_steps=num_train_steps - num_warmup_steps,
      end_learning_rate=init_lr * min_lr_ratio)
  if num_warmup_steps:
    learning_rate_fn = WarmUp(
        initial_learning_rate=init_lr,
        decay_schedule_fn=learning_rate_fn,
        warmup_steps=num_warmup_steps)
  if weight_decay_rate > 0.0:
    logging.info(
        "Using AdamWeightDecay with adam_epsilon=%.9f weight_decay_rate=%.3f",
        adam_epsilon, weight_decay_rate)
    optimizer = optimization.AdamWeightDecay(
        learning_rate=learning_rate_fn,
        weight_decay_rate=weight_decay_rate,
        beta_1=0.9,
        beta_2=0.999,
        epsilon=adam_epsilon,
        exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"],
        include_in_weight_decay=["r_s_bias", "r_r_bias", "r_w_bias"])
  else:
    logging.info("Using Adam with adam_epsilon=%.9f", adam_epsilon)
    optimizer = tf.keras.optimizers.Adam(
        learning_rate=learning_rate_fn, epsilon=adam_epsilon)
  return optimizer, learning_rate_fn
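# Hedged usage sketch for `create_optimizer` above. It assumes the module-level
# imports are in place and that `WarmUp` and `optimization.AdamWeightDecay`
# resolve to the TF Model Garden NLP optimization utilities (e.g.
# `official.nlp.optimization`); the step values probed below are illustrative.
opt, lr_schedule = create_optimizer(
    init_lr=2e-5,
    num_train_steps=10000,
    num_warmup_steps=1000,
    weight_decay_rate=0.01)
# The returned schedule is a callable over the global step, so it can be
# probed directly, e.g. during warmup and at the end of decay (eager mode):
print(float(lr_schedule(500)), float(lr_schedule(10000)))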
def test_adam_weight_decay_optimizer(self):
  params = {'optimizer': {'type': 'adamw'}}
  expected_optimizer_config = nlp_optimization.AdamWeightDecay().get_config()
  opt_config = optimization_config.OptimizationConfig(params)
  opt_factory = optimizer_factory.OptimizerFactory(opt_config)
  lr = opt_factory.build_learning_rate()
  optimizer = opt_factory.build_optimizer(lr)
  self.assertIsInstance(optimizer, nlp_optimization.AdamWeightDecay)
  self.assertEqual(expected_optimizer_config, optimizer.get_config())
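# Hedged sketch of the same factory path outside a test. The import aliases and
# the 'learning_rate' block are assumptions (the test above only specifies the
# 'optimizer' block); they follow the TF Model Garden layout, e.g.:
#   from official.modeling.optimization import optimizer_factory
#   from official.modeling.optimization.configs import optimization_config
#   from official.nlp import optimization as nlp_optimization
params = {
    'optimizer': {'type': 'adamw'},
    'learning_rate': {
        'type': 'polynomial',
        'polynomial': {'initial_learning_rate': 2e-5, 'decay_steps': 10000},
    },
}
opt_config = optimization_config.OptimizationConfig(params)
opt_factory = optimizer_factory.OptimizerFactory(opt_config)
optimizer = opt_factory.build_optimizer(opt_factory.build_learning_rate())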
def fit(model,
        strategy,
        train_dataset,
        model_dir,
        init_checkpoint_path=None,
        evaluate_fn=None,
        learning_rate=1e-5,
        learning_rate_polynomial_decay_rate=1.,
        weight_decay_rate=1e-1,
        num_warmup_steps=5000,
        num_decay_steps=51000,
        num_epochs=6):
  """Train and evaluate."""
  hparams = dict(
      learning_rate=learning_rate,
      num_decay_steps=num_decay_steps,
      num_warmup_steps=num_warmup_steps,
      num_epochs=num_epochs,
      weight_decay_rate=weight_decay_rate,
      dropout_rate=FLAGS.dropout_rate,
      attention_dropout_rate=FLAGS.attention_dropout_rate,
      label_smoothing=FLAGS.label_smoothing)
  logging.info(hparams)

  # Linear warmup into a polynomial decay of the learning rate.
  learning_rate_schedule = nlp_optimization.WarmUp(
      learning_rate,
      tf.keras.optimizers.schedules.PolynomialDecay(
          learning_rate,
          num_decay_steps,
          end_learning_rate=0.,
          power=learning_rate_polynomial_decay_rate),
      num_warmup_steps)

  with strategy.scope():
    optimizer = nlp_optimization.AdamWeightDecay(
        learning_rate_schedule,
        weight_decay_rate=weight_decay_rate,
        epsilon=1e-6,
        exclude_from_weight_decay=['LayerNorm', 'layer_norm', 'bias'])
    model.compile(optimizer, loss=modeling.SpanOrCrossEntropyLoss())

  def init_fn(init_checkpoint_path):
    # Warm-start only the encoder from the initial checkpoint.
    ckpt = tf.train.Checkpoint(encoder=model.encoder)
    ckpt.restore(init_checkpoint_path).assert_existing_objects_matched()

  with worker_context():
    ckpt_manager = tf.train.CheckpointManager(
        tf.train.Checkpoint(model=model, optimizer=optimizer),
        model_dir,
        max_to_keep=None,
        init_fn=(functools.partial(init_fn, init_checkpoint_path)
                 if init_checkpoint_path else None))
    with strategy.scope():
      ckpt_manager.restore_or_initialize()

    val_summary_writer = tf.summary.create_file_writer(
        os.path.join(model_dir, 'val'))
    best_exact_match = 0.
    # One checkpoint is saved per epoch, so resume at the epoch implied by the
    # number of existing checkpoints.
    for epoch in range(len(ckpt_manager.checkpoints), num_epochs):
      model.fit(
          train_dataset,
          callbacks=[
              tf.keras.callbacks.TensorBoard(model_dir, write_graph=False),
          ])
      ckpt_path = ckpt_manager.save()
      if evaluate_fn is None:
        continue
      metrics = evaluate_fn()
      logging.info('Epoch %d: %s', epoch + 1, metrics)
      # Export the best model so far (by exact match) as a SavedModel.
      if best_exact_match < metrics['exact_match']:
        best_exact_match = metrics['exact_match']
        model.save(os.path.join(model_dir, 'export'), include_optimizer=False)
        logging.info('Exporting %s as SavedModel.', ckpt_path)
      with val_summary_writer.as_default():
        for name, data in metrics.items():
          tf.summary.scalar(name, data, epoch + 1)
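# Hedged usage sketch for `fit` above. Every builder referenced here
# (`build_model`, `build_train_dataset`, `build_evaluate_fn`) is hypothetical;
# only the keyword arguments mirror the signature in the source.
strategy = tf.distribute.MirroredStrategy()
with strategy.scope():
  model = build_model()  # hypothetical: constructs the encoder-based model
train_dataset = build_train_dataset()  # hypothetical: a batched tf.data.Dataset
fit(model,
    strategy,
    train_dataset,
    model_dir='/tmp/model_dir',
    init_checkpoint_path=None,  # or a path to a pretrained encoder checkpoint
    evaluate_fn=build_evaluate_fn(model),  # hypothetical; may be left as None
    num_epochs=6)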