def test_dict_varying_values(self):
    """Allreduce a dict mixing a list of 2D arrays with a single 3D array.

    Each reduced entry is expected to equal the matching input multiplied
    by `tnt.get_size()`.
    """
    fill_2d = 3.29
    fill_3d = 17.0
    input_2D_array = np.full(shape=(4, 5), fill_value=fill_2d, dtype=np.float32)
    input_3D_array = np.full(shape=(2, 15, 4), fill_value=fill_3d, dtype=np.float64)

    # The list holds three references to the same 2D array.
    input_dict = {
        "list_of_tensors": [input_2D_array] * 3,
        "single_tensor": input_3D_array,
    }

    # Expected values are computed before the collective call.
    expected_output_2D_array = tnt.get_size() * input_2D_array
    expected_output_3D_array = tnt.get_size() * input_3D_array

    allreducer = tnt.TensorAllreducer(input_dict)
    output_dict = allreducer.allreduce(input_dict)

    assert isinstance(output_dict, dict)
    assert len(output_dict) == 2

    reduced_list = output_dict["list_of_tensors"]
    assert len(reduced_list) == 3
    assert all(
        np.array_equal(reduced, expected_output_2D_array)
        for reduced in reduced_list)
    assert np.array_equal(output_dict["single_tensor"],
                          expected_output_3D_array)
def _create_tnt_model(cls, model: tf.keras.Model,
                      parallel_strategy: tnt.ParallelStrategy = tnt.ParallelStrategy.ALL if TF_DEFAULT_PIPELINING_FLAG
                      else tnt.ParallelStrategy.DATA,
                      num_pipeline_stages: int = 1):
    """Wrap `model` according to the requested parallel strategy.

    Args:
        model: the Keras model to parallelize.
        parallel_strategy: flags selecting pipelining and/or data parallelism.
            Pipelining is switched off (with a warning) for
            `tf.keras.Sequential` models.
        num_pipeline_stages: forwarded to `pm.PartitionedModel`.

    Returns:
        The input model, wrapped in `pm.PartitionedModel` when pipelining is
        enabled and then in `dpm.DataParallelModel` when data parallelism is
        enabled. If neither flag is set, the model is returned unchanged.

    Raises:
        ValueError: pipelining is requested without data parallelism and the
            number of partitions differs from `tnt.get_size()`.
    """
    replica_group = tnt.Group()

    if (tnt.ParallelStrategy.PIPELINING in parallel_strategy) and isinstance(model, tf.keras.Sequential):
        # `Logger.warn` is a deprecated alias; `warning` is the supported name.
        logger.warning(
            "Cannot pipeline a `tf.keras.Sequential` model; disabling model parallelism."
        )
        # Clear the PIPELINING flag (it is known to be set at this point).
        parallel_strategy = parallel_strategy ^ tnt.ParallelStrategy.PIPELINING

    logger.info(f"Creating parallel model using {parallel_strategy}.")

    if tnt.ParallelStrategy.PIPELINING in parallel_strategy:
        rank = tnt.get_rank()
        partition_generator = pgen.GraphPartitionGenerator(model)
        rank_mapper = rmapper.RankMapper(
            num_ranks=tnt.get_size(),
            pipeline_graph=partition_generator.get_pipeline_graph())
        pipeline_group = rank_mapper.get_pipelining_group_for_rank(rank)

        logger.info(
            f"[Pipelining] Creating pipelined model with {pipeline_group.size} partitions."
        )
        # get my partition
        model = pm.PartitionedModel(
            model=model,
            group=pipeline_group,
            partition_generator=partition_generator,
            rank_mapper=rank_mapper,
            num_pipeline_stages=num_pipeline_stages)

        if tnt.ParallelStrategy.DATA in parallel_strategy:
            replica_group = rank_mapper.get_replica_group_for_rank(rank)
        else:
            # Pure pipelining: every rank must own exactly one partition.
            if pipeline_group.size != tnt.get_size():
                raise ValueError(
                    f"Provided model has only {pipeline_group.size} partitions; use {pipeline_group.size} ranks or a different parallel strategy."
                )

    if tnt.ParallelStrategy.DATA in parallel_strategy:
        # replicate my partition across the data parallel group
        logger.info(
            f"[DataParallel] Replicating local model across ranks {replica_group.group}."
        )
        model = dpm.DataParallelModel(model=model, group=replica_group)

    return model
def __init__(self, model):
    """Store `model` and reset all distributed-training bookkeeping.

    Raises:
        RuntimeError: if `tarantella.global_context` is falsy, i.e. the
            library has not been initialized.
    """
    if not tarantella.global_context:
        raise RuntimeError(
            """Cannot initialize a Model before the Tarantella library. Please call "tarantella.init()" first. """)

    # Rank layout as reported by the library.
    self.rank = tarantella.get_rank()
    self.comm_size = tarantella.get_size()

    # Wrapped model and its lifecycle flags.
    self.model = model
    self.input_shapes = None
    self.done_broadcast = False
    self.compiled = False
    self.broadcaster = None
    self.barrier = tarantella.Barrier()

    # Original compile-time arguments; unset until populated elsewhere.
    self.orig_optimizer = None
    self.orig_loss = None
    self.orig_metrics = None
    self.orig_loss_weights = None
    self.orig_sample_weight_mode = None
    self.orig_weighted_metrics = None

    self.dist_optimizer = None
    self.default_shuffle_seed = 42

    # support for TF 2.0 -- 2.3
    self.tf_default_verbose = dict(fit=1, evaluate=1, predict=0)
def to_microbatched(model, micro_batch_size, num_micro_batches, num_batches, num_test_batches):
    """Build the micro-batched model builder and datasets for this rank.

    The partition handled by the current rank is looked up through a
    `RankMapper`, then wrapped by the core, shared and micro-batched
    model builders in turn.

    Returns:
        A tuple `(microbatched_model_builder, ds)`, where `ds` is the result
        of `load_microbatched_datasets` for this rank's partition.
    """
    rank = tnt.get_rank()
    partition_generator = pgen.GraphPartitionGenerator(model)
    rank_mapper = rmapper.RankMapper(
        num_ranks=tnt.get_size(),
        pipeline_graph=partition_generator.get_pipeline_graph())

    # Describe the partition assigned to this rank.
    partition_id = rank_mapper.get_partition_for_rank(rank)
    partition_graph = partition_generator.get_partition_graph(partition_id)
    partition_info = pinfo.PartitionInfo(partition_id=partition_id,
                                         partition_graph=partition_graph)

    core_model = cm_builder.CoreModelBuilder(
        model, partition_id, partition_graph).get_model()

    pipeline_communicator = tnt.PipelineCommunicator(
        rank_mapper.get_connections_for_rank(rank), num_micro_batches)

    shared_model = shared.SharedModelBuilder(
        partition_info, core_model, pipeline_communicator,
        micro_batch_size).get_model()

    microbatched_model_builder = microbatched.MicrobatchedModelBuilder(
        partition_info, shared_model, micro_batch_size, num_micro_batches)

    ds = load_microbatched_datasets(micro_batch_size, num_micro_batches,
                                    num_batches, num_test_batches,
                                    partition_info)
    # The communicator is set up only after the datasets have been loaded.
    pipeline_communicator.setup_infrastructure(micro_batch_size)
    return microbatched_model_builder, ds
def test_dict_of_tensors(self, input_dict):
    """Allreduce a dict of tensors and check both type and value of results.

    Every output value must be a TensorFlow tensor, and the output dict must
    equal the input dict with each value multiplied by `tnt.get_size()`.
    """
    expected_dict = {k: v * tnt.get_size() for k, v in input_dict.items()}

    allreducer = tnt.TensorAllreducer(input_dict)
    output_dict = allreducer.allreduce(input_dict)

    # Asserting on a dict only checks that it is non-empty; the individual
    # `tf.is_tensor` results must be folded with `all`.
    assert all(tf.is_tensor(v) for v in output_dict.values())
    assert output_dict == expected_dict
def test_single_array_identical_inputs(self, array_length, dtype):
    """Allreduce a column of ones; expect every entry to be `tnt.get_size()`."""
    column_shape = (array_length, 1)
    input_array = np.ones(shape=column_shape, dtype=dtype)
    # Expected result is computed before the collective call.
    expected_output_array = input_array * tnt.get_size()

    reducer = tnt.TensorAllreducer(input_array)
    output_array = reducer.allreduce(input_array)

    assert isinstance(output_array, np.ndarray)
    assert np.array_equal(output_array, expected_output_array)
def test_train(self, batch_size, num_batches, number_epochs):
    """Train the hand-partitioned pipeline and compare it with a reference.

    The reference model is trained on `master_rank` only; every rank trains
    its own micro-batched partition. Histories are compared on
    `master_rank`.
    """
    assert tnt.get_size() == number_partitions
    micro_batch_size = batch_size // num_micro_batches

    ### CREATE MODEL
    pipeline_communicator = get_pipeline_communicator(num_micro_batches)
    pipeline_communicator.setup_infrastructure(micro_batch_size)

    core_model = get_partitioned_core_model()
    shared_model = get_partitioned_shared_model(core_model,
                                                pipeline_communicator,
                                                micro_batch_size)
    microbatched_model = get_partitioned_model(shared_model)

    ### LOAD DATASETS
    partition_info = get_partition_info(core_model)
    microbatched_ds = load_microbatched_datasets(micro_batch_size,
                                                 num_micro_batches,
                                                 num_batches, 0,
                                                 partition_info)
    reference_ds = load_reference_datasets(batch_size, num_batches, 0)

    ### MODEL COMPILE/TRAIN (on each rank individually)
    fit_params = dict(epochs=number_epochs, shuffle=False, verbose=0)
    sgd = keras.optimizers.SGD(learning_rate)

    # single rank model
    if rank == master_rank:
        print("\nTraining reference model")
        reference_model = get_reference_model()
        reference_model.compile(
            optimizer=sgd,
            loss=keras.losses.SparseCategoricalCrossentropy(),
            metrics=[keras.metrics.SparseCategoricalAccuracy()])
        reference_history = reference_model.fit(reference_ds["train"],
                                                **fit_params)

    # pipelined model
    if rank == p_0_rank:
        # The first partition only produces edge outputs, so all of its
        # losses are ZeroLoss.
        partition_losses = {
            "p_0_m_0_edge_output_0": tnt_losses.ZeroLoss(),
            "p_0_m_0_edge_output_1": tnt_losses.ZeroLoss(),
            "p_0_m_1_edge_output_0": tnt_losses.ZeroLoss(),
            "p_0_m_1_edge_output_1": tnt_losses.ZeroLoss(),
            "p_0_seq_output": tnt_losses.ZeroLoss(),
        }
        partition_loss_weights = None
        partition_metrics = None

    if rank == p_1_rank:
        # Each micro-batch contributes an equal share of the real loss.
        per_micro_batch_weight = 1. / num_micro_batches
        partition_losses = {
            "p_1_m_0_real_output_0": keras.losses.SparseCategoricalCrossentropy(),
            "p_1_m_1_real_output_0": keras.losses.SparseCategoricalCrossentropy(),
            "p_1_seq_output": tnt_losses.ZeroLoss(),
        }
        partition_loss_weights = {
            "p_1_m_0_real_output_0": per_micro_batch_weight,
            "p_1_m_1_real_output_0": per_micro_batch_weight,
            "p_1_seq_output": 0.,
        }
        partition_metrics = {
            "p_1_m_0_real_output_0": keras.metrics.SparseCategoricalAccuracy(),
            "p_1_m_1_real_output_0": keras.metrics.SparseCategoricalAccuracy(),
        }

    microbatched_model.compile(optimizer=sgd,
                               loss=partition_losses,
                               loss_weights=partition_loss_weights,
                               metrics=partition_metrics)
    pipeline_history = microbatched_model.fit(microbatched_ds["train"],
                                              **fit_params)

    if rank == master_rank:
        check_histories_match(reference_history, pipeline_history,
                              num_micro_batches)
def test_tensor_numeric(self, input_value, dtype):
    """Allreduce a constant tensor; expect `input_value * tnt.get_size()`."""
    expected_value = input_value * tnt.get_size()
    # Local name chosen so it does not shadow the `input` builtin.
    input_tensor = tf.constant(input_value, dtype=dtype)

    reducer = tnt.TensorAllreducer(input_tensor)
    output = reducer.allreduce(input_tensor)

    assert tf.is_tensor(output)
    assert output == expected_value
def test_tensor_from_list(self):
    """Allreduce a 2x5 tensor built from nested lists."""
    rows = [[1, 2, 3, 4, 5],
            [.2, .3, .4, .5, .6]]
    input_list = tf.constant(rows)
    expected_output_list = input_list * tnt.get_size()

    reducer = tnt.TensorAllreducer(input_list)
    output = reducer.allreduce(input_list)

    assert tf.is_tensor(output)
    assert np.all(output == expected_output_list)
def test_train(self, num_micro_batches, batch_size, num_batches,
               num_test_batches, number_epochs):
    """Train/evaluate the builder-based pipeline against a reference model.

    The reference model runs on `master_rank` only. Training history,
    validation history and evaluation results are compared on that rank.
    """
    assert tnt.get_size() == number_partitions
    fit_params = dict(epochs=number_epochs, shuffle=False, verbose=0)
    micro_batch_size = batch_size // num_micro_batches

    # create pipelined model and load datasets
    pipeline_communicator = get_pipeline_communicator(num_micro_batches)
    pipeline_communicator.setup_infrastructure(micro_batch_size)

    core_model = get_partitioned_core_model()
    partition_info = get_partition_info(core_model)

    shared_model = shared.SharedModelBuilder(
        partition_info, core_model, pipeline_communicator,
        micro_batch_size).get_model()

    microbatched_model_builder = microbatched.MicrobatchedModelBuilder(
        partition_info, shared_model, micro_batch_size, num_micro_batches)
    microbatched_model = microbatched_model_builder.get_model()

    microbatched_ds = load_microbatched_datasets(micro_batch_size,
                                                 num_micro_batches,
                                                 num_batches,
                                                 num_test_batches,
                                                 partition_info)

    # reference model
    if rank == master_rank:
        reference_model = get_reference_model()
        reference_model.compile(**get_reference_compile_params())

        reference_ds = load_reference_datasets(batch_size, num_batches,
                                               num_test_batches)
        reference_history = reference_model.fit(
            reference_ds["train"],
            validation_data=reference_ds["val"],
            **fit_params)
        reference_result = reference_model.evaluate(reference_ds["test"],
                                                    verbose=0)

    # pipelined model
    compile_params = get_microbatched_compile_params(microbatched_model_builder)
    microbatched_model.compile(**compile_params)
    pipeline_history = microbatched_model.fit(
        microbatched_ds["train"],
        validation_data=microbatched_ds["val"],
        **fit_params)
    pipeline_result = microbatched_model.evaluate(microbatched_ds["test"],
                                                  verbose=0)

    if rank == master_rank:
        check_histories_match(reference_history, pipeline_history,
                              num_micro_batches)
        check_validation_histories_match(reference_history, pipeline_history,
                                         num_micro_batches)
        check_predictions_match(reference_result, pipeline_result,
                                num_micro_batches)
def test_single_value(self):
    """Allreduce each rank's id as a float; expect the sum 0 + 1 + ... + (size - 1)."""
    inputs = float(tnt.get_rank())
    expected_output = sum(range(tnt.get_size()))

    reducer = tnt.TensorAllreducer(inputs)
    output = reducer.allreduce(inputs)

    assert isinstance(output, float)
    assert expected_output == output
def train_and_eval(self):
    """Compile the training model, fit it in chunks of epochs and evaluate.

    The optimizer is Adam driven by a `LearningRateSchedule` built from
    `self.params`. Training proceeds in steps of
    `self.params["epochs_between_evals"]` epochs until
    `self.params["train_epochs"]` is reached.

    Returns:
        The `stats` dict: empty unless filled on the master rank from
        `misc.build_stats` and updated with the result of `self.eval()`.
    """
    # NOTE(review): block nesting was reconstructed from a whitespace-collapsed
    # source; in particular, confirm that evaluation runs inside the epoch loop.
    lr_schedule = optimizer.LearningRateSchedule(self.params["learning_rate"], self.params["hidden_size"],
                                                 self.params["learning_rate_warmup_steps"])
    opt = tf.keras.optimizers.Adam(lr_schedule,
                                   self.params["optimizer_adam_beta1"],
                                   self.params["optimizer_adam_beta2"],
                                   epsilon=self.params["optimizer_adam_epsilon"])
    self.train_model.compile(opt)
    self.train_model.summary()

    # create train dataset
    # (rank count and rank id are passed explicitly to the input function)
    train_ds = data_pipeline.train_input_fn(self.params,
                                            shuffle_seed = 42,
                                            num_ranks = tnt.get_size(),
                                            rank = tnt.get_rank())

    # enable global callbacks
    callbacks = []
    if self.flags_obj.enable_tensorboard and self.flags_obj.model_dir:
        callbacks.append(tf.keras.callbacks.TensorBoard(log_dir=self.flags_obj.model_dir))

    # enable logging callbacks only on the master rank
    if self.flags_obj.enable_time_history:
        time_callback = keras_utils.TimeHistory(self.params["batch_size"],
                                                self.params["num_sentences"],
                                                logdir = None)
        # Wrapped with `run_on_all_ranks = False`, matching the comment above.
        tnt_time_callback = tnt.keras.callbacks.Callback(time_callback,
                                                         aggregate_logs = False,
                                                         run_on_all_ranks = False)
        callbacks.append(tnt_time_callback)

    # print messages only once
    if tnt.is_master_rank():
        logging.info("Start train")

    stats = {}
    for epoch in range(0, self.params["train_epochs"], self.params["epochs_between_evals"]):
        # as our dataset is distributed manually, disable the automatic Tarantella distribution
        # The `min(...)` caps the final chunk so training stops at `train_epochs`.
        history = self.train_model.fit(train_ds,
                                       callbacks = callbacks,
                                       tnt_distribute_dataset = False,
                                       initial_epoch = epoch,
                                       epochs = epoch + min(self.params["epochs_between_evals"],
                                                            self.params["train_epochs"]-epoch),
                                       verbose = 2)

        if tnt.is_master_rank():
            logging.info("Train history: {}".format(history.history))
            stats = misc.build_stats(history, callbacks)

        if tnt.is_master_rank():
            eval_stats = self.eval()
            stats.update(eval_stats)

    return stats
def test_array_inf(self, array_length, index):
    """Inject `inf` at `index` on one rank; the reduced entry must be infinite."""
    # All ranks obtain the same injection rank from the helper.
    injection_rank = util.same_random_int_all_ranks(0, tnt.get_size())
    input_array = np.ones(shape=(array_length, 1), dtype=np.float32)

    is_injecting = tnt.get_rank() == injection_rank
    if is_injecting:
        input_array[index] = math.inf

    reducer = tnt.TensorAllreducer(input_array)
    output_array = reducer.allreduce(input_array)

    assert np.isinf(output_array[index])
def test_single_array_identical_inputs(self, array_length):
    """Allgather a column of ones from every rank.

    The expected result is a 1D array of ones holding
    `array_length * tnt.get_size()` elements.
    """
    input_array = np.ones(shape=(array_length, 1), dtype=np.float32)
    gathered_length = array_length * tnt.get_size()
    expected_output_array = np.ones(shape=(gathered_length), dtype=np.float32)

    gatherer = tnt.TensorAllgatherer(input_array)
    output_array = gatherer.allgather(input_array)

    assert isinstance(output_array, np.ndarray)
    assert np.array_equal(output_array, expected_output_array)
def test_nd_tensor(self, input_shape, dtype):
    """Allreduce an n-dimensional tensor of ones of the given shape and dtype."""
    ones = np.ones(shape=input_shape, dtype=dtype)
    expected_output_array = ones * tnt.get_size()
    inputs = tf.constant(ones)

    reducer = tnt.TensorAllreducer(inputs)
    output = reducer.allreduce(inputs)

    assert tf.is_tensor(output)
    assert np.array_equal(output.numpy(), expected_output_array)
def test_scalar_identical_inputs(self):
    """Allgather the same scalar from every rank.

    The expected result is a 1D integer array with one copy of the scalar
    per rank.
    """
    scalar = 50
    expected_output_array = np.full(shape=(tnt.get_size()),
                                    fill_value=scalar, dtype=int)

    gatherer = tnt.TensorAllgatherer(scalar)
    output_array = gatherer.allgather(scalar)

    assert isinstance(output_array, np.ndarray)
    assert np.array_equal(output_array, expected_output_array)
def train_val_dataset_generator():
    """Load one batch each of MNIST training and validation data.

    The batch size is 64 samples per rank (`64 * tnt.get_size()`).

    Returns:
        A tuple `(train_dataset, val_dataset)`; the third value returned by
        `util.load_dataset` is discarded.
    """
    micro_batch_size = 64
    nbatches = 1
    batch_size = micro_batch_size * tnt.get_size()
    nsamples = nbatches * batch_size

    datasets = util.load_dataset(mnist.load_mnist_dataset,
                                 train_size=nsamples,
                                 train_batch_size=batch_size,
                                 val_size=nsamples,
                                 val_batch_size=batch_size)
    train_dataset, val_dataset, _ = datasets
    return train_dataset, val_dataset
def test_dict_many_keys(self, length):
    """Allreduce a dict with `length` keys that all map to the same float."""
    input_value = 4.2
    key_names = ("key " + str(i) for i in range(length))
    input_dict = dict.fromkeys(key_names, input_value)
    expected_output_value = input_value * tnt.get_size()

    reducer = tnt.TensorAllreducer(input_dict)
    output_dict = reducer.allreduce(input_dict)

    assert isinstance(output_dict, dict)
    assert len(input_dict) == len(output_dict)
    assert all(reduced == expected_output_value
               for reduced in output_dict.values())
def test_single_array_different_inputs(self, array_length):
    """Allreduce arrays filled with each rank's id.

    Every reduced entry is expected to be 0 + 1 + ... + (size - 1).
    """
    input_array = np.full(shape=(array_length, 1),
                          fill_value=tnt.get_rank(), dtype=np.float32)
    rank_sum = sum(range(tnt.get_size()))
    expected_output_array = np.full(input_array.shape, rank_sum,
                                    dtype=np.float32)

    reducer = tnt.TensorAllreducer(input_array)
    output_array = reducer.allreduce(input_array)

    assert isinstance(output_array, np.ndarray)
    assert np.array_equal(output_array, expected_output_array)
def test_list_of_tensors_identical_inputs(self, list_length):
    """Allreduce a list holding `list_length` references to one tensor."""
    input_array = tf.constant([1, 2, 3])
    input_list = [input_array] * list_length
    expected_output_array = input_array * tnt.get_size()

    reducer = tnt.TensorAllreducer(input_list)
    output_list = reducer.allreduce(input_list)

    assert isinstance(output_list, list)
    assert all(
        np.array_equal(reduced, expected_output_array)
        for reduced in output_list)
def test_list_of_arrays_identical_inputs(self, list_length, dtype):
    """Allreduce a list holding `list_length` references to one array of ones."""
    array_length = 50
    input_array = np.ones(shape=(array_length, 1), dtype=dtype)
    input_list = [input_array] * list_length
    expected_output_array = input_array * tnt.get_size()

    reducer = tnt.TensorAllreducer(input_list)
    output_list = reducer.allreduce(input_list)

    assert isinstance(output_list, list)
    assert all(
        np.array_equal(reduced, expected_output_array)
        for reduced in output_list)
def test_single_array(self, array_shape):
    """Broadcast a seeded random array from the last rank to all ranks.

    Every rank generates the same array from a fixed seed, so the broadcast
    result can be compared against the local copy.
    """
    np.random.seed(42)
    input_array = np.random.random_sample(array_shape).astype('float32')

    rank = tnt.get_rank()
    root_rank = tnt.get_size() - 1
    broadcaster = tnt.TensorBroadcaster(input_array, root_rank)
    expected_output_array = input_array

    # Only the root passes data in; the other ranks just receive.
    output_array = (broadcaster.broadcast(input_array)
                    if rank == root_rank
                    else broadcaster.broadcast())

    result = (output_array == expected_output_array).all()
    assert isinstance(output_array, np.ndarray)
    assert result
def test_list_of_arrays_identical_inputs_diff_types(self):
    """Allreduce a list of arrays with mixed lengths and float precisions."""
    input_array_float = np.ones(shape=(238, 1), dtype=np.float32)
    input_array_double = np.ones(shape=(42, 1), dtype=np.double)
    another_input_array_float = np.ones(shape=(99, 1), dtype=np.float32)
    input_list = [
        input_array_float,
        input_array_double,
        another_input_array_float,
    ]
    expected_output_list = [array * tnt.get_size() for array in input_list]

    reducer = tnt.TensorAllreducer(input_list)
    output_list = reducer.allreduce(input_list)

    assert isinstance(output_list, list)
    # Outputs are compared pairwise, in order, with their expected values.
    output_expected_pairs = zip(output_list, expected_output_list)
    assert all(np.array_equal(reduced, expected)
               for (reduced, expected) in output_expected_pairs)
def __init__(self, dataset, num_ranks=tnt.get_size(), rank=tnt.get_rank(), shuffle_seed=42):
    """Record the rank layout and decompose `dataset` into its transformations.

    Args:
        dataset: the dataset passed to
            `ops_helpers.gen_dataset_transformations`.
        num_ranks: number of ranks; the default is evaluated once, when this
            function is defined.
        rank: id of the current rank; the default is likewise evaluated at
            definition time.
        shuffle_seed: seed stored on the instance as `self.shuffle_seed`.
    """
    self.num_ranks = num_ranks
    self.rank = rank
    self.shuffle_seed = shuffle_seed

    decomposed = ops_helpers.gen_dataset_transformations(dataset)
    self.base_dataset, self.dataset_transformations = decomposed
    self.batching_info = ops_helpers.get_batching_info(
        self.dataset_transformations)

    # convenience attributes computed when the dataset is distributed among ranks
    self._dataset = None
    self._num_samples = None
    self._micro_batch_size = None
def check_histories_match(reference_history, pipeline_history, num_micro_batches, prefix = ""):
    """Assert that a pipelined training history matches the reference one.

    Args:
        reference_history: history object of the reference model.
        pipeline_history: history object of the micro-batched model.
        num_micro_batches: number of micro-batches whose metrics are averaged.
        prefix: string prepended to the loss and metric keys.

    Raises:
        AssertionError: if the losses differ, or if the per-epoch metric
            averaged over micro-batches differs from the reference metric.
    """
    loss_name = prefix + 'loss'
    metric_name = 'sparse_categorical_accuracy'
    output_id = 0
    partition_id = tnt.get_size() - 1  # compute metric only on the last partition

    reference_losses = reference_history.history[loss_name]
    num_epochs = len(reference_losses)

    # check loss matches
    # The comparison covers the whole history, so it is done once rather than
    # once per epoch. The guard keeps the check skipped for an empty
    # reference history.
    if num_epochs > 0:
        assert np.allclose(reference_losses, pipeline_history.history[loss_name])

    # check metrics match
    reference_metrics = reference_history.history[prefix + metric_name]
    for i in range(num_epochs):
        reference_metric_value = reference_metrics[i]
        # Average this epoch's metric over all micro-batches.
        pipeline_metric_value = sum(
            pipeline_history.history[f"{prefix}p_{partition_id}_m_{m}"
                                     f"_real_output_{output_id}_{metric_name}"][i]
            for m in range(num_micro_batches)) / num_micro_batches
        assert np.allclose(reference_metric_value, pipeline_metric_value)
def train_test_mnist_datasets(nbatches=1,
                              val_nbatches=0,
                              test_nbatches=0,
                              micro_batch_size=64,
                              shuffle=True,
                              remainder_samples_per_batch=0,
                              last_incomplete_batch_size=0,
                              drop_remainder=False):
    """Load MNIST train/test datasets sized relative to the number of ranks.

    The batch size is
    `micro_batch_size * tnt.get_size() + remainder_samples_per_batch`; the
    training set holds `nbatches` such batches plus
    `last_incomplete_batch_size` extra samples.

    Args:
        nbatches: number of full training batches.
        val_nbatches: accepted for interface compatibility but not used.
        test_nbatches: number of full test batches.
        micro_batch_size: samples per rank in each batch.
        shuffle: forwarded to `load_train_test_dataset`.
        remainder_samples_per_batch: extra samples added to every batch.
        last_incomplete_batch_size: extra samples appended to the training set.
        drop_remainder: forwarded to `load_train_test_dataset`.

    Returns:
        Whatever `load_train_test_dataset` returns for these sizes.
    """
    batch_size = micro_batch_size * tnt.get_size() + remainder_samples_per_batch
    nsamples = nbatches * batch_size + last_incomplete_batch_size
    test_nsamples = test_nbatches * batch_size
    # NOTE(review): a validation sample count used to be derived from
    # `val_nbatches` here but was never passed on. Confirm whether
    # `load_train_test_dataset` should receive a validation size.
    return load_train_test_dataset(mnist.load_mnist_dataset,
                                   train_size=nsamples,
                                   train_batch_size=batch_size,
                                   test_size=test_nsamples,
                                   test_batch_size=batch_size,
                                   shuffle=shuffle,
                                   drop_remainder=drop_remainder)
def reduce_gradients(self, gradients_and_weights):
    """Add Allreduce ops for every gradient and return the reduced gradients.

    Args:
        gradients_and_weights: iterable of `(gradient, weight)` pairs. It is
            materialized into a list first because it is traversed twice; a
            one-shot iterable such as `zip(...)` would otherwise leave the
            second pass empty.

    Returns:
        A list with one entry per input pair, in input order. For TF
        versions up to 2.3 each entry is the reduced gradient; otherwise
        each entry is a `(reduced_gradient, weight)` tuple.
    """
    gradients_and_weights = list(gradients_and_weights)

    # Each gradient is divided by this count before the Allreduce is started.
    number_partial_sums = tnt.get_size()

    gradients_to_reduce = list()
    for grad, weight in gradients_and_weights:
        # add an Allreduce operation for each gradient
        grad_id = self.weight_to_index[weight.name]
        grad = grad / number_partial_sums
        output_grad = tnt_ops.start_allreduce_op(
            grad,
            tensor_id=grad_id,
            tnt_synchcomm=self.comm.get_raw_ptr())
        gradients_to_reduce.append(output_grad)

    # Create barrier op in the Tensorflow graph to make sure all
    # the Allreduce operations on gradients have started.
    # This ensures that the graph execution does not get delayed by waiting
    # for gradients to be reduced as long as there are remaining computations
    # in the backward pass.
    temp_gradients = tnt_ops.barrier_op(
        gradients_to_reduce,
        Tout=[tf.float32] * len(gradients_to_reduce))

    # Add individual ops that wait for each gradient to be reduced before updating
    # the weights.
    # These ops are executed only after the backward pass has been completed.
    reduced_gradients = list()
    for idx, (_, weight) in enumerate(gradients_and_weights):
        # gradient tensors obtained after barrier are listed in the same order
        # as the initial `gradients_and_weights`
        gradient = temp_gradients[idx]
        grad_id = self.weight_to_index[weight.name]

        output_grad = tnt_ops.finish_allreduce_op(
            gradient,
            tensor_id=grad_id,
            Tout=tf.float32,
            tnt_synchcomm=self.comm.get_raw_ptr())

        if version_utils.tf_version_below_equal('2.3'):
            reduced_gradients.append(output_grad)
        else:
            reduced_gradients.append((output_grad, weight))
    return reduced_gradients
def test_cifar_alexnet(self, keras_model, optimizer, micro_batch_size, nbatches, ntest_batches):
    """Train a Tarantella-wrapped model on CIFAR and expect accuracy above 0.5."""
    batch_size = micro_batch_size * tnt.get_size()
    nsamples = nbatches * batch_size
    number_epochs, lr = cifar.get_hyperparams(optimizer)

    train_dataset, test_dataset = util.load_train_test_dataset(
        cifar.load_cifar_dataset,
        train_size=nsamples,
        train_batch_size=batch_size,
        test_size=ntest_batches * batch_size,
        test_batch_size=batch_size)

    # SGD additionally gets a momentum term; other optimizers only the
    # learning rate.
    optimizer_kwargs = {"learning_rate": lr}
    if optimizer.__name__ == 'SGD':
        optimizer_kwargs["momentum"] = 0.9
    keras_optimizer = optimizer(**optimizer_kwargs)

    model = tnt.Model(keras_model())
    model.compile(keras_optimizer,
                  loss=keras.losses.SparseCategoricalCrossentropy(),
                  metrics=[keras.metrics.SparseCategoricalAccuracy()])
    model.fit(train_dataset, epochs=number_epochs, verbose=0)

    results = model.evaluate(test_dataset)
    # `results[1]` is the value checked against the 0.5 threshold.
    util.check_accuracy_greater(results[1], 0.5)
def test_train(self, model_generator, num_micro_batches, micro_batch_size,
               num_batches, num_test_batches, number_epochs):
    """Compare an automatically partitioned pipeline with a reference model.

    The reference model is trained and evaluated only on the last rank;
    every rank builds and trains its own micro-batched partition.
    """
    batch_size = micro_batch_size * num_micro_batches
    fit_params = dict(epochs=number_epochs, shuffle=False, verbose=0)
    rank = tnt.get_rank()
    master_rank = tnt.get_size() - 1  # the last partition will be assigned to rank (nranks-1)

    # reference model
    if rank == master_rank:
        reference_ds = load_reference_datasets(batch_size, num_batches,
                                               num_test_batches)
        reference_model = model_generator()
        reference_model.compile(**get_reference_compile_params())
        reference_history = reference_model.fit(
            reference_ds["train"],
            validation_data=reference_ds["val"],
            **fit_params)
        reference_result = reference_model.evaluate(reference_ds["test"],
                                                    verbose=0)

    # pipelined model
    model = model_generator()
    microbatched_model_builder, microbatched_ds = to_microbatched(
        model, micro_batch_size, num_micro_batches, num_batches,
        num_test_batches)
    microbatched_model = microbatched_model_builder.get_model()
    microbatched_model.summary()

    compile_params = get_microbatched_compile_params(microbatched_model_builder)
    microbatched_model.compile(**compile_params)
    pipeline_history = microbatched_model.fit(
        microbatched_ds["train"],
        validation_data=microbatched_ds["val"],
        **fit_params)
    pipeline_result = microbatched_model.evaluate(microbatched_ds["test"],
                                                  verbose=0)

    if rank == master_rank:
        # Both histories are printed before the comparisons run.
        print (reference_history.history)
        print (pipeline_history.history)
        check_histories_match(reference_history, pipeline_history,
                              num_micro_batches)
        check_validation_histories_match(reference_history, pipeline_history,
                                         num_micro_batches)
        check_predictions_match(reference_result, pipeline_result,
                                num_micro_batches)
np.isclose(tnt_history.history[key], ref_history.history[key], atol=1e-6)) ] result = [all(result)] util.assert_on_all_ranks(result) @pytest.mark.parametrize("model_config", [ base_runner.ModelConfig(mnist.fc_model_generator), base_runner.ModelConfig(mnist.subclassed_model_generator), pytest.param( base_runner.ModelConfig(mnist.fc_model_generator, tnt.ParallelStrategy.PIPELINING), marks=pytest.mark.skipif( tnt.get_size() != 1, reason="Cannot run multi-rank, model has only one partition")), ]) class TestTarantellaCallbacks: @pytest.mark.parametrize("number_epochs", [5]) def test_learning_rate_scheduler_callback(self, model_config, number_epochs): callbacks = [ tf.keras.callbacks.LearningRateScheduler( schedule=(lambda epoch, lr: 0.1 * lr), verbose=1) ] tnt_history, reference_history = train_tnt_and_ref_models_with_callbacks( callbacks, model_config, number_epochs) assert_identical_tnt_and_ref_history(tnt_history, reference_history) @pytest.mark.parametrize("number_epochs", [5])