def _create_optimizer(self):
    """Build the Adam optimizer, wrapping it for fp16 loss scaling if needed."""
    params = self.params
    # TODO(b/139414679): Explore the difference between using
    # LearningRateSchedule and callback for GPU runs, and try to merge them.
    schedule = optimizer.LearningRateSchedule(
        params["learning_rate"],
        params["hidden_size"],
        params["learning_rate_warmup_steps"])
    # TPU runs drive the learning rate through the schedule object; other
    # runs pass the base rate directly (rate updates happen elsewhere).
    learning_rate = schedule if self.use_tpu else params["learning_rate"]
    adam = tf.keras.optimizers.Adam(
        learning_rate,
        params["optimizer_adam_beta1"],
        params["optimizer_adam_beta2"],
        epsilon=params["optimizer_adam_epsilon"])

    if params["dtype"] == tf.float16:
        # Wrap with dynamic loss scaling so fp16 gradients do not underflow.
        adam = tf.keras.mixed_precision.experimental.LossScaleOptimizer(
            adam,
            loss_scale=flags_core.get_loss_scale(
                self.flags_obj, default_for_fp16="dynamic"))
    if self.flags_obj.fp16_implementation == "graph_rewrite":
        # Note: when flags_obj.fp16_implementation == "graph_rewrite", dtype as
        # determined by flags_core.get_tf_dtype(flags_obj) would be 'float32'
        # which will ensure tf.compat.v2.keras.mixed_precision and
        # tf.train.experimental.enable_mixed_precision_graph_rewrite do not
        # double up.
        adam = tf.train.experimental.enable_mixed_precision_graph_rewrite(adam)
    return adam
def train_and_eval(self):
    """Trains the model with Tarantella data parallelism, then evaluates it.

    Compiles `self.train_model` with an Adam optimizer driven by a warmup
    learning-rate schedule, trains for `train_epochs` epochs in windows of
    `epochs_between_evals`, and runs `self.eval()` on the master rank.

    Returns:
      dict of training stats from `misc.build_stats`, updated with eval
      stats on the master rank; an empty dict on non-master ranks before
      any master-only code runs.
    """
    lr_schedule = optimizer.LearningRateSchedule(self.params["learning_rate"],
                                                 self.params["hidden_size"],
                                                 self.params["learning_rate_warmup_steps"])
    opt = tf.keras.optimizers.Adam(lr_schedule,
                                   self.params["optimizer_adam_beta1"],
                                   self.params["optimizer_adam_beta2"],
                                   epsilon=self.params["optimizer_adam_epsilon"])
    self.train_model.compile(opt)
    self.train_model.summary()

    # create train dataset; sharded manually per rank via num_ranks/rank
    # (presumably each rank reads a disjoint shard — verify in data_pipeline)
    train_ds = data_pipeline.train_input_fn(self.params,
                                            shuffle_seed = 42,
                                            num_ranks = tnt.get_size(),
                                            rank = tnt.get_rank())

    # enable global callbacks (run on every rank)
    callbacks = []
    if self.flags_obj.enable_tensorboard and self.flags_obj.model_dir:
        callbacks.append(tf.keras.callbacks.TensorBoard(log_dir=self.flags_obj.model_dir))

    # enable logging callbacks only on the master rank
    if self.flags_obj.enable_time_history:
        time_callback = keras_utils.TimeHistory(self.params["batch_size"],
                                                self.params["num_sentences"],
                                                logdir = None)
        # run_on_all_ranks=False restricts timing output to the master rank
        tnt_time_callback = tnt.keras.callbacks.Callback(time_callback,
                                                         aggregate_logs = False,
                                                         run_on_all_ranks = False)
        callbacks.append(tnt_time_callback)

    # print messages only once
    if tnt.is_master_rank():
        logging.info("Start train")

    stats = {}
    # Train in windows of `epochs_between_evals` epochs; `initial_epoch` /
    # `epochs` keep Keras epoch numbering continuous across windows.
    for epoch in range(0, self.params["train_epochs"], self.params["epochs_between_evals"]):
        # as our dataset is distributed manually, disable the automatic Tarantella distribution
        history = self.train_model.fit(train_ds,
                                       callbacks = callbacks,
                                       tnt_distribute_dataset = False,
                                       initial_epoch = epoch,
                                       # min(...) clamps the last window so we never
                                       # exceed train_epochs total epochs
                                       epochs = epoch + min(self.params["epochs_between_evals"],
                                                            self.params["train_epochs"]-epoch),
                                       verbose = 2)
        if tnt.is_master_rank():
            logging.info("Train history: {}".format(history.history))
            # stats reflect only the most recent window's history
            stats = misc.build_stats(history, callbacks)

    # evaluation is master-only; other ranks return their (empty) stats
    if tnt.is_master_rank():
        eval_stats = self.eval()
        stats.update(eval_stats)
    return stats
def _create_optimizer(self):
    """Construct the Adam optimizer and apply mixed-precision configuration."""
    p = self.params
    schedule = optimizer.LearningRateSchedule(
        p["learning_rate"], p["hidden_size"], p["learning_rate_warmup_steps"])
    adam = tf.keras.optimizers.Adam(
        schedule,
        p["optimizer_adam_beta1"],
        p["optimizer_adam_beta2"],
        epsilon=p["optimizer_adam_epsilon"])
    # Loss scaling is only applied when running in float16; configure_optimizer
    # handles the wrapping based on use_float16.
    loss_scale = flags_core.get_loss_scale(
        self.flags_obj, default_for_fp16="dynamic")
    return performance.configure_optimizer(
        adam,
        use_float16=p["dtype"] == tf.float16,
        loss_scale=loss_scale)