Пример #1
0
    def test_csv_logger_callback(self, setup_save_path, model_config,
                                 number_epochs):
        """Compare the CSVLogger output of the tnt model with the reference.

        The tnt model is fitted on every rank and logs to
        ``<setup_save_path>/training_tnt.csv``. On the master rank the
        reference model is fitted with the same settings and logs to
        ``training_ref.csv``; the metric values read back from both files
        must agree within an absolute tolerance of 1e-6.
        """
        train_data, val_data = train_val_dataset_generator()
        ref_train_data, ref_val_data = train_val_dataset_generator()

        csv_prefix = os.path.join(setup_save_path, "training")
        tnt_model_runner, reference_model_runner = gen_model_runners(
            model_config)
        fit_kwargs = dict(epochs=number_epochs, verbose=0, shuffle=False)

        tnt_filename = csv_prefix + '_tnt.csv'
        tnt_logger = tf.keras.callbacks.CSVLogger(tnt_filename)
        tnt_model_runner.model.fit(train_data,
                                   validation_data=val_data,
                                   callbacks=[tnt_logger],
                                   **fit_kwargs)

        # Ranks other than the master do not compare anything.
        outcome = [True]
        if tnt.is_master_rank():
            ref_filename = csv_prefix + '_ref.csv'
            ref_logger = tf.keras.callbacks.CSVLogger(ref_filename)
            reference_model_runner.model.fit(ref_train_data,
                                             validation_data=ref_val_data,
                                             callbacks=[ref_logger],
                                             **fit_kwargs)

            tnt_values = util.get_metric_values_from_file(tnt_filename)
            ref_values = util.get_metric_values_from_file(ref_filename)
            outcome = np.allclose(tnt_values, ref_values, atol=1e-6)
        util.assert_on_all_ranks(outcome)
Пример #2
0
def assert_identical_tnt_and_ref_history(tnt_history, ref_history):
    """Assert that two training histories agree on every reference metric.

    On the master rank, each entry of ``ref_history.history`` is compared
    element-wise (absolute tolerance 1e-6) with the entry of the same key in
    ``tnt_history.history``. The combined verdict is handed to
    ``util.assert_on_all_ranks``; other ranks pass ``[True]``.
    """
    outcome = [True]
    if tnt.is_master_rank():
        per_metric_ok = [
            all(np.isclose(tnt_history.history[name], ref_values, atol=1e-6))
            for name, ref_values in ref_history.history.items()
        ]
        outcome = [all(per_metric_ok)]
    util.assert_on_all_ranks(outcome)
Пример #3
0
    def test_early_stopping_callback(self, model_config, number_epochs):
        """Check that EarlyStopping halts the tnt and reference runs alike.

        Both models are trained with an EarlyStopping callback monitoring
        ``val_loss``; on the master rank the number of recorded ``val_loss``
        entries must be the same in both histories.
        """
        monitor_metric = 'val_loss'
        early_stopping = tf.keras.callbacks.EarlyStopping(
            monitor=monitor_metric, min_delta=0.1, patience=1)
        tnt_history, reference_history = train_tnt_and_ref_models_with_callbacks(
            [early_stopping], model_config, number_epochs)

        # One history entry per completed epoch, so equal lengths mean both
        # models ran the same number of epochs.
        same_epoch_count = [True]
        if tnt.is_master_rank():
            tnt_epochs = len(tnt_history.history[monitor_metric])
            ref_epochs = len(reference_history.history[monitor_metric])
            same_epoch_count = tnt_epochs == ref_epochs
        util.assert_on_all_ranks(same_epoch_count)
 def test_optimizers_compare_to_reference(self, model_config, optimizer,
                                          micro_batch_size, nbatches,
                                          number_epochs):
     """Compare tnt and reference training histories for an optimizer.

     On the master rank the ``'loss'`` series must match within 1e-4 and the
     ``metric`` series within 1e-6 (absolute tolerances).
     """
     tnt_history, ref_history = train_tnt_and_reference_models(
         model_config, optimizer, micro_batch_size, nbatches, number_epochs)

     checks = [True, True]
     if tnt.is_master_rank():
         tnt_log = tnt_history.history
         ref_log = ref_history.history
         loss_ok = np.allclose(tnt_log['loss'], ref_log['loss'], atol=1e-4)
         # NOTE(review): `metric` is not defined in this method; presumably a
         # module-level name -- confirm.
         metric_ok = np.allclose(tnt_log[metric], ref_log[metric], atol=1e-6)
         checks = [loss_ok, metric_ok]
     util.assert_on_all_ranks(checks)
    def test_compare_accuracy_against_reference(self, model_runners,
                                                micro_batch_size,
                                                number_epochs, nbatches,
                                                test_nbatches,
                                                remainder_samples_per_batch,
                                                last_incomplete_batch_size):
        """Train both runners and compare their evaluation results.

        Two dataset pairs are built with identical arguments, one for each
        runner. After training, each runner's evaluation result is logged
        and, on the master rank, the first two entries are compared: index 0
        within 1e-2 and index 1 within 1e-6 (per the log messages these are
        loss and accuracy).
        """
        # Both dataset pairs are created from exactly the same arguments.
        dataset_kwargs = dict(
            nbatches=nbatches,
            test_nbatches=test_nbatches,
            micro_batch_size=micro_batch_size,
            shuffle=False,
            remainder_samples_per_batch=remainder_samples_per_batch,
            last_incomplete_batch_size=last_incomplete_batch_size)
        train_dataset, test_dataset = util.train_test_mnist_datasets(
            **dataset_kwargs)
        ref_train_dataset, ref_test_dataset = util.train_test_mnist_datasets(
            **dataset_kwargs)
        tnt_model_runner, reference_model_runner = model_runners

        reference_model_runner.train_model(ref_train_dataset, number_epochs)
        tnt_model_runner.train_model(train_dataset, number_epochs)

        tnt_loss_accuracy = tnt_model_runner.evaluate_model(test_dataset)
        ref_loss_accuracy = reference_model_runner.evaluate_model(
            ref_test_dataset)

        rank = tnt.get_rank()
        root_logger = logging.getLogger()
        root_logger.info(
            f"[Rank {rank}] Tarantella[loss, accuracy] = {tnt_loss_accuracy}")
        root_logger.info(
            f"[Rank {rank}] Reference [loss, accuracy] = {ref_loss_accuracy}")

        outcome = [True, True]
        if tnt.is_master_rank():
            # Losses might not be identical, hence the looser tolerance.
            loss_ok = np.isclose(tnt_loss_accuracy[0],
                                 ref_loss_accuracy[0],
                                 atol=1e-2)
            accuracy_ok = np.isclose(tnt_loss_accuracy[1],
                                     ref_loss_accuracy[1],
                                     atol=1e-6)
            outcome = [loss_ok, accuracy_ok]
        util.assert_on_all_ranks(outcome)
Пример #6
0
    def test_progbar_logger_callback_inference(self, model_config,
                                               number_epochs,
                                               use_explicit_progbarlogger,
                                               verbose, exec_type, capsys):
        """Compare progress-bar output of tnt and reference inference runs.

        Depending on ``exec_type`` the models run ``evaluate`` or ``predict``
        and the captured stdout is parsed for metric values. The master rank
        requires the parsed metrics to match within 1e-6; every other rank
        requires that the tnt run produced no stdout/stderr output at all.
        """
        _, test_dataset = train_val_dataset_generator()
        _, ref_test_dataset = train_val_dataset_generator()

        # Each model gets its own ProgbarLogger instance when one is requested.
        tnt_callbacks, ref_callbacks = [], []
        if use_explicit_progbarlogger:
            tnt_callbacks.append(
                tf.keras.callbacks.ProgbarLogger(count_mode='steps'))
            ref_callbacks.append(
                tf.keras.callbacks.ProgbarLogger(count_mode='steps'))

        tnt_model_runner, ref_model_runner = gen_model_runners(model_config)

        def run_and_capture(runner, dataset, callbacks):
            # Run the requested inference call, then collect whatever was
            # printed since the previous capture and parse metrics from it.
            if exec_type == 'evaluate':
                runner.model.evaluate(dataset,
                                      callbacks=callbacks,
                                      verbose=verbose)
            elif exec_type == 'predict':
                runner.model.predict(dataset,
                                     callbacks=callbacks,
                                     verbose=verbose)
            captured = capsys.readouterr()
            metrics = util.get_metrics_from_stdout(captured.out,
                                                   runner.model.metrics_names)
            return captured, metrics

        tnt_captured, tnt_metrics = run_and_capture(tnt_model_runner,
                                                    test_dataset,
                                                    tnt_callbacks)
        _, ref_metrics = run_and_capture(ref_model_runner, ref_test_dataset,
                                         ref_callbacks)

        if tnt.is_master_rank():
            result = all(np.isclose(tnt_metrics, ref_metrics, atol=1e-6))
        else:
            result = tnt_captured.out == "" and tnt_captured.err == ""
        util.assert_on_all_ranks(result)
Пример #7
0
    def test_tensorboard_callback(self, setup_save_path, model_config,
                                  number_epochs):
        """Check that fitting with a TensorBoard callback creates log dirs.

        After training the tnt model with ``log_dir=setup_save_path``, the
        master rank expects ``train`` and ``validation`` sub-directories to
        exist under that path.
        """
        train_data, val_data = train_val_dataset_generator()
        tnt_model_runner, _ = gen_model_runners(model_config)

        tensorboard = tf.keras.callbacks.TensorBoard(log_dir=setup_save_path)
        tnt_model_runner.model.fit(train_data,
                                   validation_data=val_data,
                                   epochs=number_epochs,
                                   callbacks=[tensorboard])

        outcome = [True]
        if tnt.is_master_rank():
            dirs_exist = [
                os.path.isdir(os.path.join(setup_save_path, subdir))
                for subdir in ("train", "validation")
            ]
            outcome = [all(dirs_exist)]
        util.assert_on_all_ranks(outcome)
 def test_sgd_momentum_compare_to_reference(self, model_config, nesterov,
                                            momentum, micro_batch_size,
                                            nbatches, number_epochs):
     """Compare tnt and reference training histories for SGD with momentum.

     SGD is configured with a learning rate of 0.01 and the given
     ``momentum`` / ``nesterov`` settings. On the master rank the ``'loss'``
     series must match within 1e-4 and the ``metric`` series within 1e-6
     (absolute tolerances).
     """
     optimizer = keras.optimizers.SGD
     optimizer_kwargs = dict(learning_rate=0.01,
                             momentum=momentum,
                             nesterov=nesterov)
     tnt_history, ref_history = train_tnt_and_reference_models(
         model_config, optimizer, micro_batch_size, nbatches, number_epochs,
         optimizer_kwargs)

     checks = [True, True]
     if tnt.is_master_rank():
         tnt_log = tnt_history.history
         ref_log = ref_history.history
         loss_ok = np.allclose(tnt_log['loss'], ref_log['loss'], atol=1e-4)
         # NOTE(review): `metric` is not defined in this method; presumably a
         # module-level name -- confirm.
         metric_ok = np.allclose(tnt_log[metric], ref_log[metric], atol=1e-6)
         checks = [loss_ok, metric_ok]
     util.assert_on_all_ranks(checks)