def _preprocess_callbacks(self, callbacks):
  """Adapt user-provided Keras callbacks for distributed execution.

  Modifies `callbacks` in place:
    * `ModelCheckpoint` is wrapped in a `TntModelCheckpoint`, which knows
      about both the underlying and the distributed optimizer.
    * `LearningRateScheduler` is silenced on non-master ranks unless
      output on all devices is enabled.
    * `TensorBoard` either gets a per-rank log directory, or is removed
      on non-master ranks.

  Args:
    callbacks: list of `tf.keras.callbacks.Callback` instances or None.
  """
  if callbacks is None:
    return

  remove_tensorboard_indices = []
  for index, callback in enumerate(callbacks):
    if isinstance(callback, tf.keras.callbacks.ModelCheckpoint):
      callbacks[index] = TntModelCheckpoint(
          keras_model_checkpoint=callback,
          underlying_optimizer=self.orig_optimizer,
          distributed_optimizer=self.dist_optimizer)
    elif isinstance(callback, tf.keras.callbacks.LearningRateScheduler):
      if not tarantella.global_tnt_config.output_on_all_devices:
        if not tarantella.is_master_rank():
          callback.verbose = 0
    elif isinstance(callback, tf.keras.callbacks.TensorBoard):
      if tarantella.global_tnt_config.tensorboard_on_all_devices:
        callback.log_dir += '/rank_{}'.format(self.rank)
      else:
        if not tarantella.is_master_rank():
          # collect *all* TensorBoard callbacks to drop; the previous code
          # only remembered the last index, leaving earlier ones behind
          remove_tensorboard_indices.append(index)

  # delete from the back, so earlier indices stay valid
  for index in reversed(remove_tensorboard_indices):
    del callbacks[index]
def train_and_eval(self):
  """Trains the model.

  Builds the Adam optimizer with a warmup learning-rate schedule, trains
  for `train_epochs` epochs in chunks of `epochs_between_evals`, and runs
  evaluation on the master rank at the end.

  Returns:
    dict of training statistics; on the master rank it also contains the
    evaluation statistics (other ranks return train stats only).
  """
  lr_schedule = optimizer.LearningRateSchedule(self.params["learning_rate"],
                                               self.params["hidden_size"],
                                               self.params["learning_rate_warmup_steps"])
  opt = tf.keras.optimizers.Adam(lr_schedule,
                                 self.params["optimizer_adam_beta1"],
                                 self.params["optimizer_adam_beta2"],
                                 epsilon=self.params["optimizer_adam_epsilon"])
  self.train_model.compile(opt)
  self.train_model.summary()

  # create train dataset
  # NOTE(review): the dataset is sharded manually per rank via
  # `num_ranks`/`rank` — presumably so Tarantella must not re-distribute it
  train_ds = data_pipeline.train_input_fn(self.params,
                                          shuffle_seed = 42,
                                          num_ranks = tnt.get_size(),
                                          rank = tnt.get_rank())

  # enable global callbacks
  callbacks = []
  if self.flags_obj.enable_tensorboard and self.flags_obj.model_dir:
    callbacks.append(tf.keras.callbacks.TensorBoard(log_dir=self.flags_obj.model_dir))

  # enable logging callbacks only on the master rank
  if self.flags_obj.enable_time_history:
    time_callback = keras_utils.TimeHistory(self.params["batch_size"],
                                            self.params["num_sentences"],
                                            logdir = None)
    # run_on_all_ranks=False restricts the TimeHistory callback to the master
    tnt_time_callback = tnt.keras.callbacks.Callback(time_callback,
                                                     aggregate_logs = False,
                                                     run_on_all_ranks = False)
    callbacks.append(tnt_time_callback)

  # print messages only once
  if tnt.is_master_rank():
    logging.info("Start train")

  stats = {}
  for epoch in range(0, self.params["train_epochs"], self.params["epochs_between_evals"]):
    # as our dataset is distributed manually, disable the automatic Tarantella distribution
    # `epochs` is capped so the final chunk never exceeds `train_epochs`
    history = self.train_model.fit(train_ds,
                                   callbacks = callbacks,
                                   tnt_distribute_dataset = False,
                                   initial_epoch = epoch,
                                   epochs = epoch + min(self.params["epochs_between_evals"],
                                                        self.params["train_epochs"] - epoch),
                                   verbose = 2)
    if tnt.is_master_rank():
      logging.info("Train history: {}".format(history.history))
    # keep only the stats of the most recent chunk
    stats = misc.build_stats(history, callbacks)

  if tnt.is_master_rank():
    eval_stats = self.eval()
    stats.update(eval_stats)
  return stats
def setup_save_path(request):
  """Pytest fixture: yield a shared log directory, removed after the test.

  The directory is created by the master rank only; barriers ensure all
  ranks see a consistent state before using and before deleting it.
  """
  barrier = tnt.Barrier()
  barrier.execute()
  # save logs in a shared directory accessible to all ranks
  save_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                          "test_callbacks")
  if tnt.is_master_rank():
    os.makedirs(save_dir, exist_ok=True)
  # wait until the master rank has created the directory; previously the
  # fixture yielded immediately, so other ranks could start writing into a
  # path that did not exist yet
  barrier.execute()
  yield save_dir

  # clean up
  barrier.execute()
  if tnt.is_master_rank():
    shutil.rmtree(save_dir, ignore_errors=True)
def __init__(self, keras_model_checkpoint, underlying_optimizer, distributed_optimizer):
  """Wrap an existing `tf.keras.callbacks.ModelCheckpoint` for Tarantella.

  Copies the configuration of `keras_model_checkpoint` onto this instance,
  and keeps references to both the original (non-distributed) optimizer and
  the Tarantella-distributed one.

  Args:
    keras_model_checkpoint: configured `tf.keras.callbacks.ModelCheckpoint`.
    underlying_optimizer: the user's original optimizer.
    distributed_optimizer: the Tarantella-wrapped optimizer.
  """
  super(TntModelCheckpoint, self).__init__(keras_model_checkpoint.filepath)
  self.underlying_optimizer = underlying_optimizer
  self.distributed_optimizer = distributed_optimizer

  # set member variables from ModelCheckpoint instance
  self.validation_data = keras_model_checkpoint.validation_data
  self.model = keras_model_checkpoint.model
  self._chief_worker_only = keras_model_checkpoint._chief_worker_only
  # logs passed to this callback are raw TF logs (no numpy conversion needed)
  self._supports_tf_logs = True
  self.monitor = keras_model_checkpoint.monitor
  self.filepath = keras_model_checkpoint.filepath
  self.save_best_only = keras_model_checkpoint.save_best_only
  self.save_weights_only = keras_model_checkpoint.save_weights_only
  self.save_freq = keras_model_checkpoint.save_freq
  self.epochs_since_last_save = keras_model_checkpoint.epochs_since_last_save
  self._batches_seen_since_last_saving = keras_model_checkpoint._batches_seen_since_last_saving
  # deliberately reset instead of copied — counting restarts under Tarantella
  self._last_batch_seen = 0
  self.load_weights_on_restart = keras_model_checkpoint.load_weights_on_restart
  self.period = keras_model_checkpoint.period
  self.monitor_op = keras_model_checkpoint.monitor_op
  self.best = keras_model_checkpoint.best

  # only master rank should save and thus print messages
  self.verbose = keras_model_checkpoint.verbose if tarantella.is_master_rank() else 0
def test_csv_logger_callback(self, setup_save_path, model_config, number_epochs):
  """CSV logs of the Tarantella model must match a reference Keras run."""
  train_dataset, val_dataset = train_val_dataset_generator()
  ref_train_dataset, ref_val_dataset = train_val_dataset_generator()
  base_path = os.path.join(setup_save_path, "training")

  tnt_model_runner, reference_model_runner = gen_model_runners(model_config)
  fit_kwargs = {'epochs': number_epochs, 'verbose': 0, 'shuffle': False}

  tnt_csv_path = base_path + '_tnt.csv'
  tnt_model_runner.model.fit(train_dataset,
                             validation_data=val_dataset,
                             callbacks=[tf.keras.callbacks.CSVLogger(tnt_csv_path)],
                             **fit_kwargs)

  result = [True]
  if tnt.is_master_rank():
    # the reference model runs only on the master rank
    ref_csv_path = base_path + '_ref.csv'
    reference_model_runner.model.fit(ref_train_dataset,
                                     validation_data=ref_val_dataset,
                                     callbacks=[tf.keras.callbacks.CSVLogger(ref_csv_path)],
                                     **fit_kwargs)
    tnt_metrics = util.get_metric_values_from_file(tnt_csv_path)
    ref_metrics = util.get_metric_values_from_file(ref_csv_path)
    result = np.allclose(tnt_metrics, ref_metrics, atol=1e-6)
  util.assert_on_all_ranks(result)
def save(self, filepath, tnt_save_all_devices=False, **kwargs):
  """Save the model on the master rank, or on every rank if requested."""
  if tnt_save_all_devices or tarantella.is_master_rank():
    self._save(filepath, kwargs)
  if not tnt_save_all_devices:
    # every rank must be able to load the model after this call returns
    self.barrier.synchronize()
def init():
  """Set up per-rank logging and assign GPUs to this rank."""
  logging_config.setup_logging(logger,
                               tnt.global_tnt_config.log_level,
                               tnt.get_rank(),
                               tnt.is_master_rank(),
                               tnt.global_tnt_config.log_on_all_devices)
  # the number of GPUs per node can be specified either as default
  # configuration value or a `TNT_GPUS_PER_NODE` environment variable
  ngpus_per_node = tnt.global_tnt_config.gpus_per_node
  setup_gpus(tnt.get_rank(), ngpus=ngpus_per_node)
def _save_to_file(self, tnt_save_all_devices, save_function, filepath, **kwargs):
  """Run `save_function` on the appropriate rank(s), synchronizing afterwards."""
  if tnt_save_all_devices or tnt.is_master_rank():
    save_function(filepath, kwargs)
  if not tnt_save_all_devices:
    # make sure that every rank can load the model after function exit
    self.barrier.execute()
def assert_identical_tnt_and_ref_history(tnt_history, ref_history):
  """Check (on the master rank) that both training histories agree per metric."""
  outcome = [True]
  if tnt.is_master_rank():
    per_key_matches = [all(np.isclose(tnt_history.history[key],
                                      ref_history.history[key],
                                      atol=1e-6))
                       for key in ref_history.history.keys()]
    outcome = [all(per_key_matches)]
  util.assert_on_all_ranks(outcome)
def save_setup(request):
  """Pytest fixture: yield save-path settings; remove the directory afterwards."""
  all_devices = request.param
  # save model in a shared directory accessible to all ranks
  base_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                          "test_save_model")
  # per-rank directory when every rank saves its own copy
  save_dir = base_dir + str(tnt.get_rank()) if all_devices else base_dir
  yield {'save_dir': save_dir, 'all_devices': all_devices}

  # clean up
  if all_devices or tnt.is_master_rank():
    shutil.rmtree(save_dir, ignore_errors=True)
def test_early_stopping_callback(self, model_config, number_epochs):
  """Early stopping must halt the Tarantella and reference models alike."""
  monitor_metric = 'val_loss'
  early_stopping = tf.keras.callbacks.EarlyStopping(monitor=monitor_metric,
                                                    min_delta=0.1,
                                                    patience=1)
  tnt_history, reference_history = train_tnt_and_ref_models_with_callbacks(
      [early_stopping], model_config, number_epochs)

  # Expect both models to run same number of epochs
  result = [True]
  if tnt.is_master_rank():
    tnt_epochs = len(tnt_history.history[monitor_metric])
    ref_epochs = len(reference_history.history[monitor_metric])
    result = (tnt_epochs == ref_epochs)
  util.assert_on_all_ranks(result)
def test_optimizers_compare_to_reference(self, model_config, optimizer,
                                         micro_batch_size, nbatches,
                                         number_epochs):
  """Training with a distributed optimizer must match the reference model."""
  tnt_history, ref_history = train_tnt_and_reference_models(
      model_config, optimizer, micro_batch_size, nbatches, number_epochs)

  result = [True, True]
  if tnt.is_master_rank():
    loss_matches = np.allclose(tnt_history.history['loss'],
                               ref_history.history['loss'],
                               atol=1e-4)
    # `metric` is presumably defined at module scope in this test file — confirm
    metric_matches = np.allclose(tnt_history.history[metric],
                                 ref_history.history[metric],
                                 atol=1e-6)
    result = [loss_matches, metric_matches]
  util.assert_on_all_ranks(result)
def test_compare_accuracy_against_reference(self, model_runners, micro_batch_size,
                                            number_epochs, nbatches, test_nbatches,
                                            remainder_samples_per_batch,
                                            last_incomplete_batch_size):
  """Final loss/accuracy of the distributed model must match the reference."""
  # both models consume identically-built (unshuffled) MNIST datasets
  dataset_kwargs = dict(nbatches=nbatches,
                        test_nbatches=test_nbatches,
                        micro_batch_size=micro_batch_size,
                        shuffle=False,
                        remainder_samples_per_batch=remainder_samples_per_batch,
                        last_incomplete_batch_size=last_incomplete_batch_size)
  train_dataset, test_dataset = util.train_test_mnist_datasets(**dataset_kwargs)
  ref_train_dataset, ref_test_dataset = util.train_test_mnist_datasets(**dataset_kwargs)
  tnt_model_runner, reference_model_runner = model_runners

  reference_model_runner.train_model(ref_train_dataset, number_epochs)
  tnt_model_runner.train_model(train_dataset, number_epochs)

  tnt_loss_accuracy = tnt_model_runner.evaluate_model(test_dataset)
  ref_loss_accuracy = reference_model_runner.evaluate_model(ref_test_dataset)

  rank = tnt.get_rank()
  logging.getLogger().info(
      f"[Rank {rank}] Tarantella[loss, accuracy] = {tnt_loss_accuracy}")
  logging.getLogger().info(
      f"[Rank {rank}] Reference [loss, accuracy] = {ref_loss_accuracy}")

  result = [True, True]
  if tnt.is_master_rank():
    result = [
        np.isclose(tnt_loss_accuracy[0], ref_loss_accuracy[0],
                   atol=1e-2),  # losses might not be identical
        np.isclose(tnt_loss_accuracy[1], ref_loss_accuracy[1], atol=1e-6)
    ]
  util.assert_on_all_ranks(result)
def test_progbar_logger_callback_inference(self, model_config, number_epochs,
                                           use_explicit_progbarlogger, verbose,
                                           exec_type, capsys):
  """Progbar output during evaluate/predict matches the reference model,
  and non-master ranks stay completely silent.

  Note: `number_epochs` is unused here — this test covers inference only.
  """
  (train_dataset, test_dataset) = train_val_dataset_generator()
  (ref_train_dataset, ref_test_dataset) = train_val_dataset_generator()
  # optionally add an explicit ProgbarLogger (otherwise Keras implicitly adds one)
  tnt_callbacks = [tf.keras.callbacks.ProgbarLogger(count_mode='steps')] if use_explicit_progbarlogger else []
  ref_callbacks = [tf.keras.callbacks.ProgbarLogger(count_mode='steps')] if use_explicit_progbarlogger else []
  tnt_model_runner, ref_model_runner = gen_model_runners(model_config)

  if exec_type == 'evaluate':
    tnt_model_runner.model.evaluate(test_dataset, callbacks=tnt_callbacks, verbose=verbose)
  elif exec_type == 'predict':
    tnt_model_runner.model.predict(test_dataset, callbacks=tnt_callbacks, verbose=verbose)
  # capture stdout/stderr of the Tarantella run *before* the reference runs,
  # so the two outputs cannot mix
  tnt_captured = capsys.readouterr()
  tnt_metrics = util.get_metrics_from_stdout(tnt_captured.out,
                                             tnt_model_runner.model.metrics_names)

  if exec_type == 'evaluate':
    ref_model_runner.model.evaluate(ref_test_dataset, callbacks=ref_callbacks, verbose=verbose)
  elif exec_type == 'predict':
    ref_model_runner.model.predict(ref_test_dataset, callbacks=ref_callbacks, verbose=verbose)
  ref_captured = capsys.readouterr()
  ref_metrics = util.get_metrics_from_stdout(ref_captured.out,
                                             ref_model_runner.model.metrics_names)

  if tnt.is_master_rank():
    # master rank: printed metric values must match the reference
    result = all(np.isclose(tnt_metrics, ref_metrics, atol=1e-6))
  else:
    # non-master ranks must not print anything at all
    result = all([tnt_captured.out == "", tnt_captured.err == ""])
  util.assert_on_all_ranks(result)
def test_tensorboard_callback(self, setup_save_path, model_config, number_epochs):
  """The TensorBoard callback must create train/validation log directories."""
  train_dataset, val_dataset = train_val_dataset_generator()
  tnt_model_runner, _ = gen_model_runners(model_config)

  tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=setup_save_path)
  tnt_model_runner.model.fit(train_dataset,
                             validation_data=val_dataset,
                             epochs=number_epochs,
                             callbacks=[tensorboard_callback])

  result = [True]
  if tnt.is_master_rank():
    expected_dirs = (os.path.join(setup_save_path, "train"),
                     os.path.join(setup_save_path, "validation"))
    result = [all(os.path.isdir(directory) for directory in expected_dirs)]
  util.assert_on_all_ranks(result)
def test_sgd_momentum_compare_to_reference(self, model_config, nesterov, momentum,
                                           micro_batch_size, nbatches, number_epochs):
  """SGD with momentum/Nesterov must train identically to the reference model."""
  sgd_kwargs = {'learning_rate': 0.01,
                'momentum': momentum,
                'nesterov': nesterov}
  tnt_history, ref_history = train_tnt_and_reference_models(
      model_config, keras.optimizers.SGD, micro_batch_size, nbatches,
      number_epochs, sgd_kwargs)

  result = [True, True]
  if tnt.is_master_rank():
    result = [np.allclose(tnt_history.history['loss'],
                          ref_history.history['loss'],
                          atol=1e-4),
              # `metric` is presumably defined at module scope — confirm
              np.allclose(tnt_history.history[metric],
                          ref_history.history[metric],
                          atol=1e-6)]
  util.assert_on_all_ranks(result)
def summary(self, *args, **kwargs):
  """Print the Keras model summary, honoring the configured output policy."""
  show_summary = (tarantella.global_tnt_config.output_on_all_devices
                  or tarantella.is_master_rank())
  if show_summary:
    self.model.summary(*args, **kwargs)
def _set_verbose_all_ranks(self, exec_type, args_dict):
  """Fill in the default `verbose` level and silence non-master ranks."""
  # fall back to the TF default verbosity for this execution type
  args_dict.setdefault('verbose', self.tf_default_verbose[exec_type])
  # unless output is enabled everywhere, only the master rank may print
  if not (tarantella.global_tnt_config.output_on_all_devices
          or tarantella.is_master_rank()):
    args_dict['verbose'] = 0