Example #1
    def _preprocess_callbacks(self, callbacks):
        if callbacks is not None:
            remove_tensorboard_index = None

            for index, callback in enumerate(callbacks):
                if isinstance(callback, tf.keras.callbacks.ModelCheckpoint):
                    tnt_callback = TntModelCheckpoint(
                        keras_model_checkpoint=callback,
                        underlying_optimizer=self.orig_optimizer,
                        distributed_optimizer=self.dist_optimizer)
                    callbacks[index] = tnt_callback

                elif isinstance(callback,
                                tf.keras.callbacks.LearningRateScheduler):
                    if not tarantella.global_tnt_config.output_on_all_devices:
                        if not tarantella.is_master_rank():
                            callback.verbose = 0

                elif isinstance(callback, tf.keras.callbacks.TensorBoard):
                    if tarantella.global_tnt_config.tensorboard_on_all_devices:
                        callback.log_dir += '/rank_{}'.format(self.rank)
                    else:
                        if not tarantella.is_master_rank():
                            remove_tensorboard_index = index

            if remove_tensorboard_index is not None:
                del callbacks[remove_tensorboard_index]
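In `_preprocess_callbacks` above, the TensorBoard branch either gives every rank its own log directory or drops the callback on non-master ranks. The same per-rank log-directory idea, sketched for user code under the assumption that `tarantella` is importable as `tnt` and exposes `get_rank()` as in the later examples:

import tensorflow as tf
import tarantella as tnt

# one TensorBoard log subdirectory per rank, mirroring the
# `log_dir += '/rank_{}'.format(self.rank)` branch above
log_dir = "logs/rank_{}".format(tnt.get_rank())
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir)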
Example #2
  def train_and_eval(self):
    """Trains the model."""
    lr_schedule = optimizer.LearningRateSchedule(self.params["learning_rate"], self.params["hidden_size"],
                                                 self.params["learning_rate_warmup_steps"])
    opt = tf.keras.optimizers.Adam(lr_schedule,
                                   self.params["optimizer_adam_beta1"],
                                   self.params["optimizer_adam_beta2"],
                                   epsilon=self.params["optimizer_adam_epsilon"])
    self.train_model.compile(opt)
    self.train_model.summary()

    # create train dataset
    train_ds = data_pipeline.train_input_fn(self.params,
                                            shuffle_seed = 42,
                                            num_ranks = tnt.get_size(),
                                            rank = tnt.get_rank())

    # enable global callbacks
    callbacks = []
    if self.flags_obj.enable_tensorboard and self.flags_obj.model_dir:
      callbacks.append(tf.keras.callbacks.TensorBoard(log_dir=self.flags_obj.model_dir))

    # enable logging callbacks only on the master rank
    if self.flags_obj.enable_time_history:
      time_callback = keras_utils.TimeHistory(self.params["batch_size"],
                                              self.params["num_sentences"],
                                              logdir = None)
      tnt_time_callback = tnt.keras.callbacks.Callback(time_callback,
                                                       aggregate_logs = False,
                                                       run_on_all_ranks = False)
      callbacks.append(tnt_time_callback)

    # print messages only once
    if tnt.is_master_rank():
      logging.info("Start train")

    stats = {}
    for epoch in range(0, self.params["train_epochs"], self.params["epochs_between_evals"]):
      # as our dataset is distributed manually, disable the automatic Tarantella distribution
      history = self.train_model.fit(train_ds,
                                     callbacks = callbacks,
                                     tnt_distribute_dataset = False,
                                     initial_epoch = epoch,
                                     epochs = epoch + min(self.params["epochs_between_evals"],
                                                          self.params["train_epochs"]-epoch),
                                     verbose = 2)

      if tnt.is_master_rank():
        logging.info("Train history: {}".format(history.history))
        stats = misc.build_stats(history, callbacks)
        eval_stats = self.eval()
        stats.update(eval_stats)

    return stats
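`train_input_fn` above receives `num_ranks` and `rank` because the dataset is sharded manually, which is also why `fit` is called with `tnt_distribute_dataset = False`. A minimal sketch of such manual sharding with `tf.data`, assuming only the `tnt.get_size()` and `tnt.get_rank()` calls already used above:

import tensorflow as tf
import tarantella as tnt

def shard_for_this_rank(dataset: tf.data.Dataset) -> tf.data.Dataset:
  # each rank keeps every num_ranks-th element, starting at its own index
  return dataset.shard(num_shards=tnt.get_size(), index=tnt.get_rank())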
Example #3
def setup_save_path(request):
    barrier = tnt.Barrier()
    barrier.execute()
    # save logs in a shared directory accessible to all ranks
    save_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                            "test_callbacks")
    if tnt.is_master_rank():
        os.makedirs(save_dir, exist_ok=True)
    yield save_dir

    # clean up
    barrier.execute()
    if tnt.is_master_rank():
        shutil.rmtree(save_dir, ignore_errors=True)
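`setup_save_path` above is a pytest fixture: the `yield` hands the shared directory to the test, the lines after it run as teardown, and the `barrier.execute()` calls keep the ranks in step around setup and cleanup. A hypothetical test consuming it (test name and file contents are illustrative only):

import os
import tarantella as tnt

def test_logs_land_in_shared_dir(setup_save_path):
    log_file = os.path.join(setup_save_path, "log.txt")
    if tnt.is_master_rank():
        with open(log_file, "w") as f:
            f.write("done")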
Example #4
    def __init__(self, keras_model_checkpoint, underlying_optimizer,
                 distributed_optimizer):
        super(TntModelCheckpoint,
              self).__init__(keras_model_checkpoint.filepath)
        self.underlying_optimizer = underlying_optimizer
        self.distributed_optimizer = distributed_optimizer

        # set member variables from ModelCheckpoint instance
        self.validation_data = keras_model_checkpoint.validation_data
        self.model = keras_model_checkpoint.model
        self._chief_worker_only = keras_model_checkpoint._chief_worker_only
        self._supports_tf_logs = True
        self.monitor = keras_model_checkpoint.monitor
        self.filepath = keras_model_checkpoint.filepath
        self.save_best_only = keras_model_checkpoint.save_best_only
        self.save_weights_only = keras_model_checkpoint.save_weights_only
        self.save_freq = keras_model_checkpoint.save_freq
        self.epochs_since_last_save = keras_model_checkpoint.epochs_since_last_save
        self._batches_seen_since_last_saving = keras_model_checkpoint._batches_seen_since_last_saving
        self._last_batch_seen = 0
        self.load_weights_on_restart = keras_model_checkpoint.load_weights_on_restart
        self.period = keras_model_checkpoint.period
        self.monitor_op = keras_model_checkpoint.monitor_op
        self.best = keras_model_checkpoint.best

        # only master rank should save and thus print messages
        self.verbose = (keras_model_checkpoint.verbose
                        if tarantella.is_master_rank() else 0)
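The field-by-field copy above could be written more compactly as a loop over attribute names; a purely illustrative alternative, not the library's actual code:

# copy the ModelCheckpoint state over in one pass
for attr in ('validation_data', 'monitor', 'filepath', 'save_best_only',
             'save_weights_only', 'save_freq', 'epochs_since_last_save',
             'load_weights_on_restart', 'period', 'monitor_op', 'best'):
    setattr(self, attr, getattr(keras_model_checkpoint, attr))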
Example #5
    def test_csv_logger_callback(self, setup_save_path, model_config,
                                 number_epochs):
        (train_dataset, val_dataset) = train_val_dataset_generator()
        (ref_train_dataset, ref_val_dataset) = train_val_dataset_generator()

        filename = os.path.join(setup_save_path, "training")
        tnt_model_runner, reference_model_runner = gen_model_runners(
            model_config)
        param_dict = {'epochs': number_epochs, 'verbose': 0, 'shuffle': False}

        tnt_filename = filename + '_tnt.csv'
        tnt_model_runner.model.fit(
            train_dataset,
            validation_data=val_dataset,
            callbacks=[tf.keras.callbacks.CSVLogger(tnt_filename)],
            **param_dict)

        result = [True]
        if tnt.is_master_rank():
            ref_filename = filename + '_ref.csv'
            reference_model_runner.model.fit(
                ref_train_dataset,
                validation_data=ref_val_dataset,
                callbacks=[tf.keras.callbacks.CSVLogger(ref_filename)],
                **param_dict)

            tnt_metrics = util.get_metric_values_from_file(tnt_filename)
            ref_metrics = util.get_metric_values_from_file(ref_filename)
            result = [np.allclose(tnt_metrics, ref_metrics, atol=1e-6)]
        util.assert_on_all_ranks(result)
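`util.get_metric_values_from_file` is assumed to parse the numeric columns that `CSVLogger` writes, one row per epoch; a hypothetical stand-in for comparison purposes:

import csv

def get_metric_values_from_file(filename):
    # CSVLogger writes a header row, then one row per epoch:
    # epoch, metric_1, metric_2, ...
    with open(filename, newline='') as f:
        rows = list(csv.DictReader(f))
    return [[float(value) for key, value in row.items() if key != 'epoch']
            for row in rows]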
Example #6
 def save(self, filepath, tnt_save_all_devices=False, **kwargs):
     if tnt_save_all_devices:
         self._save(filepath, **kwargs)
     else:
         if tarantella.is_master_rank():
             self._save(filepath, **kwargs)
     # make sure every rank can load the model after the function exits
     self.barrier.synchronize()
Example #7
def init():
    logging_config.setup_logging(logger, tnt.global_tnt_config.log_level,
                                 tnt.get_rank(), tnt.is_master_rank(),
                                 tnt.global_tnt_config.log_on_all_devices)

    # the number of GPUs per node can be specified either as a default
    # configuration value or via the `TNT_GPUS_PER_NODE` environment variable
    devices_per_node = tnt.global_tnt_config.gpus_per_node
    setup_gpus(tnt.get_rank(), ngpus=devices_per_node)
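`setup_gpus` is assumed to pin each rank to one of the node's GPUs; a hedged sketch of one way to do that with TensorFlow's device configuration API (the real function may differ):

import tensorflow as tf

def setup_gpus(rank, ngpus=None):
    gpus = tf.config.list_physical_devices('GPU')
    if ngpus and gpus:
        # make exactly one GPU visible to this rank
        tf.config.set_visible_devices(gpus[rank % ngpus], 'GPU')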
Example #8
 def _save_to_file(self, tnt_save_all_devices, save_function,
                   filepath, **kwargs):
   if tnt_save_all_devices:
     save_function(filepath, **kwargs)
   else:
     if tnt.is_master_rank():
       save_function(filepath, **kwargs)
   # make sure that every rank can load the model after the function exits
   self.barrier.execute()
Example #9
def assert_identical_tnt_and_ref_history(tnt_history, ref_history):
    result = [True]
    if tnt.is_master_rank():
        for key in ref_history.history.keys():
            result += [
                all(
                    np.isclose(tnt_history.history[key],
                               ref_history.history[key],
                               atol=1e-6))
            ]
        result = [all(result)]
    util.assert_on_all_ranks(result)
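`util.assert_on_all_ranks` appears in nearly every test here. One plausible reading, sketched hypothetically below, is that each rank asserts its local result so a failure on any rank fails the distributed run; the real helper may additionally exchange results across ranks first:

def assert_on_all_ranks(result):
    # accept either a single bool or a list of per-check bools
    if isinstance(result, (list, tuple)):
        assert all(result)
    else:
        assert result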
Example #10
def save_setup(request):
    # parametrized pytest fixture: `request.param` supplies the save-all-devices flag
    save_all_devices = request.param
    # save model in a shared directory accessible to all ranks
    save_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                            "test_save_model")
    if save_all_devices:
        save_dir = save_dir + str(tnt.get_rank())

    yield {'save_dir': save_dir, 'all_devices': save_all_devices}

    # clean up
    if save_all_devices or tnt.is_master_rank():
        shutil.rmtree(save_dir, ignore_errors=True)
Example #11
    def test_early_stopping_callback(self, model_config, number_epochs):
        monitor_metric = 'val_loss'
        callbacks = [
            tf.keras.callbacks.EarlyStopping(monitor=monitor_metric,
                                             min_delta=0.1,
                                             patience=1)
        ]
        tnt_history, reference_history = train_tnt_and_ref_models_with_callbacks(
            callbacks, model_config, number_epochs)

        # Expect both models to run for the same number of epochs
        result = [True]
        if tnt.is_master_rank():
            result = [
                len(tnt_history.history[monitor_metric]) ==
                len(reference_history.history[monitor_metric])
            ]
        util.assert_on_all_ranks(result)
Example #12
 def test_optimizers_compare_to_reference(self, model_config, optimizer,
                                          micro_batch_size, nbatches,
                                          number_epochs):
     tnt_history, ref_history = train_tnt_and_reference_models(
         model_config, optimizer, micro_batch_size, nbatches, number_epochs)
     result = [True, True]
      if tnt.is_master_rank():
          # `metric` is assumed to be defined at module level in the original test file
          result = [
             np.allclose(tnt_history.history['loss'],
                         ref_history.history['loss'],
                         atol=1e-4),
             np.allclose(tnt_history.history[metric],
                         ref_history.history[metric],
                         atol=1e-6)
         ]
     util.assert_on_all_ranks(result)
Example #13
    def test_compare_accuracy_against_reference(self, model_runners,
                                                micro_batch_size,
                                                number_epochs, nbatches,
                                                test_nbatches,
                                                remainder_samples_per_batch,
                                                last_incomplete_batch_size):
        (train_dataset, test_dataset) = util.train_test_mnist_datasets(
            nbatches=nbatches,
            test_nbatches=test_nbatches,
            micro_batch_size=micro_batch_size,
            shuffle=False,
            remainder_samples_per_batch=remainder_samples_per_batch,
            last_incomplete_batch_size=last_incomplete_batch_size)
        (ref_train_dataset, ref_test_dataset) = util.train_test_mnist_datasets(
            nbatches=nbatches,
            test_nbatches=test_nbatches,
            micro_batch_size=micro_batch_size,
            shuffle=False,
            remainder_samples_per_batch=remainder_samples_per_batch,
            last_incomplete_batch_size=last_incomplete_batch_size)
        tnt_model_runner, reference_model_runner = model_runners

        reference_model_runner.train_model(ref_train_dataset, number_epochs)
        tnt_model_runner.train_model(train_dataset, number_epochs)

        tnt_loss_accuracy = tnt_model_runner.evaluate_model(test_dataset)
        ref_loss_accuracy = reference_model_runner.evaluate_model(
            ref_test_dataset)

        rank = tnt.get_rank()
        logging.getLogger().info(
            f"[Rank {rank}] Tarantella[loss, accuracy] = {tnt_loss_accuracy}")
        logging.getLogger().info(
            f"[Rank {rank}] Reference [loss, accuracy] = {ref_loss_accuracy}")

        result = [True, True]
        if tnt.is_master_rank():
            result = [
                np.isclose(tnt_loss_accuracy[0],
                           ref_loss_accuracy[0],
                           atol=1e-2),  # losses might not be identical
                np.isclose(tnt_loss_accuracy[1],
                           ref_loss_accuracy[1],
                           atol=1e-6)
            ]
        util.assert_on_all_ranks(result)
Example #14
    def test_progbar_logger_callback_inference(self, model_config,
                                               number_epochs,
                                               use_explicit_progbarlogger,
                                               verbose, exec_type, capsys):
        (train_dataset, test_dataset) = train_val_dataset_generator()
        (ref_train_dataset, ref_test_dataset) = train_val_dataset_generator()

        tnt_callbacks = [tf.keras.callbacks.ProgbarLogger(
            count_mode='steps')] if use_explicit_progbarlogger else []
        ref_callbacks = [tf.keras.callbacks.ProgbarLogger(
            count_mode='steps')] if use_explicit_progbarlogger else []

        tnt_model_runner, ref_model_runner = gen_model_runners(model_config)

        if exec_type == 'evaluate':
            tnt_model_runner.model.evaluate(test_dataset,
                                            callbacks=tnt_callbacks,
                                            verbose=verbose)
        elif exec_type == 'predict':
            tnt_model_runner.model.predict(test_dataset,
                                           callbacks=tnt_callbacks,
                                           verbose=verbose)
        tnt_captured = capsys.readouterr()
        tnt_metrics = util.get_metrics_from_stdout(
            tnt_captured.out, tnt_model_runner.model.metrics_names)

        if exec_type == 'evaluate':
            ref_model_runner.model.evaluate(ref_test_dataset,
                                            callbacks=ref_callbacks,
                                            verbose=verbose)
        elif exec_type == 'predict':
            ref_model_runner.model.predict(ref_test_dataset,
                                           callbacks=ref_callbacks,
                                           verbose=verbose)
        ref_captured = capsys.readouterr()
        ref_metrics = util.get_metrics_from_stdout(
            ref_captured.out, ref_model_runner.model.metrics_names)

        if tnt.is_master_rank():
            result = all(np.isclose(tnt_metrics, ref_metrics, atol=1e-6))
        else:
            result = all([tnt_captured.out == "", tnt_captured.err == ""])
        util.assert_on_all_ranks(result)
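`util.get_metrics_from_stdout` is assumed to recover the last printed value of each metric from the progress-bar output captured by `capsys`; a hypothetical sketch matching lines such as `- loss: 0.1234 - accuracy: 0.9876`:

import re

def get_metrics_from_stdout(captured_out, metrics_names):
    values = []
    for name in metrics_names:
        # \b keeps 'loss' from matching inside 'val_loss'
        pattern = r'\b{}: (\d+\.\d+)'.format(re.escape(name))
        matches = re.findall(pattern, captured_out)
        values.append(float(matches[-1]))
    return values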
Example #15
    def test_tensorboard_callback(self, setup_save_path, model_config,
                                  number_epochs):
        (train_dataset, val_dataset) = train_val_dataset_generator()
        tnt_model_runner, _ = gen_model_runners(model_config)

        tnt_model_runner.model.fit(
            train_dataset,
            validation_data=val_dataset,
            epochs=number_epochs,
            callbacks=[
                tf.keras.callbacks.TensorBoard(log_dir=setup_save_path)
            ])
        result = [True]
        if tnt.is_master_rank():
            result = [
                os.path.isdir(os.path.join(setup_save_path, "train")),
                os.path.isdir(os.path.join(setup_save_path, "validation"))
            ]
            result = [all(result)]
        util.assert_on_all_ranks(result)
Example #16
 def test_sgd_momentum_compare_to_reference(self, model_config, nesterov,
                                            momentum, micro_batch_size,
                                            nbatches, number_epochs):
     optimizer = keras.optimizers.SGD
     optimizer_kwargs = {
         'learning_rate': 0.01,
         'momentum': momentum,
         'nesterov': nesterov
     }
     tnt_history, ref_history = train_tnt_and_reference_models(
         model_config, optimizer, micro_batch_size, nbatches, number_epochs,
         optimizer_kwargs)
     result = [True, True]
      if tnt.is_master_rank():
          # `metric` is assumed to be defined at module level in the original test file
          result = [
             np.allclose(tnt_history.history['loss'],
                         ref_history.history['loss'],
                         atol=1e-4),
             np.allclose(tnt_history.history[metric],
                         ref_history.history[metric],
                         atol=1e-6)
         ]
     util.assert_on_all_ranks(result)
Example #17
 def summary(self, *args, **kwargs):
     if tarantella.global_tnt_config.output_on_all_devices:
         self.model.summary(*args, **kwargs)
     else:
         if tarantella.is_master_rank():
             self.model.summary(*args, **kwargs)
Example #18
 def _set_verbose_all_ranks(self, exec_type, args_dict):
     if 'verbose' not in args_dict:
         args_dict['verbose'] = self.tf_default_verbose[exec_type]
     if not tarantella.global_tnt_config.output_on_all_devices:
         if not tarantella.is_master_rank():
             args_dict['verbose'] = 0
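Examples #17 and #18 share one gating pattern: act on every rank only when `output_on_all_devices` is set, otherwise only on the master rank. The pattern, reduced to a hedged helper sketch built from the two predicates shown above:

import tarantella

def run_on_output_ranks(fn, *args, **kwargs):
    # run fn wherever output is wanted: on all ranks, or on the master rank only
    if (tarantella.global_tnt_config.output_on_all_devices
            or tarantella.is_master_rank()):
        return fn(*args, **kwargs)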