def test_dict_varying_values(self):
        """Allreduce a dict holding a list of 2D arrays and a single 3D array.

        The input does not depend on the rank; every reduced array is expected
        to equal its input multiplied by `tnt.get_size()`.
        """
        array_2d = np.full(shape=(4, 5), fill_value=3.29, dtype=np.float32)
        array_3d = np.full(shape=(2, 15, 4), fill_value=17.0, dtype=np.float64)
        input_dict = {
            "list_of_tensors": [array_2d, array_2d, array_2d],
            "single_tensor": array_3d,
        }

        expected_2d = tnt.get_size() * array_2d
        expected_3d = tnt.get_size() * array_3d

        allreducer = tnt.TensorAllreducer(input_dict)
        output_dict = allreducer.allreduce(input_dict)

        # the container structure must survive the reduction
        assert isinstance(output_dict, dict)
        assert len(output_dict) == 2
        reduced_list = output_dict["list_of_tensors"]
        assert len(reduced_list) == 3

        # element values
        for reduced in reduced_list:
            assert np.array_equal(reduced, expected_2d)
        assert np.array_equal(output_dict["single_tensor"], expected_3d)
예제 #2
0
    def _create_tnt_model(cls, model: tf.keras.Model,
                          parallel_strategy: tnt.ParallelStrategy = tnt.ParallelStrategy.ALL if TF_DEFAULT_PIPELINING_FLAG \
                                                                                             else tnt.ParallelStrategy.DATA,
                          num_pipeline_stages: int = 1):
        """Wrap `model` into the parallel model selected by `parallel_strategy`.

        Args:
            model: Keras model to parallelize.
            parallel_strategy: `tnt.ParallelStrategy` flags selecting pipelining
                and/or data parallelism. Pipelining is dropped (with a warning)
                for `tf.keras.Sequential` models.
            num_pipeline_stages: forwarded unchanged to `pm.PartitionedModel`.

        Returns:
            `model`, wrapped in a `pm.PartitionedModel` when pipelining is
            active and additionally in a `dpm.DataParallelModel` when data
            parallelism is active.

        Raises:
            ValueError: if pipelining is requested without data parallelism
                and the number of partitions differs from the number of ranks.
        """
        replica_group = tnt.Group()

        if (tnt.ParallelStrategy.PIPELINING
                in parallel_strategy) and isinstance(model,
                                                     tf.keras.Sequential):
            # `warning` replaces the deprecated `warn` alias
            logger.warning(
                "Cannot pipeline a `tf.keras.Sequential` model; disabling model parallelism."
            )
            # PIPELINING is known to be set here, so XOR clears exactly that flag
            parallel_strategy = parallel_strategy ^ tnt.ParallelStrategy.PIPELINING

        logger.info(f"Creating parallel model using {parallel_strategy}.")
        if tnt.ParallelStrategy.PIPELINING in parallel_strategy:
            rank = tnt.get_rank()

            partition_generator = pgen.GraphPartitionGenerator(model)
            rank_mapper = rmapper.RankMapper(
                num_ranks=tnt.get_size(),
                pipeline_graph=partition_generator.get_pipeline_graph())
            pipeline_group = rank_mapper.get_pipelining_group_for_rank(rank)

            logger.info(
                f"[Pipelining] Creating pipelined model with {pipeline_group.size} partitions."
            )
            # get my partition
            model = pm.PartitionedModel(
                model=model,
                group=pipeline_group,
                partition_generator=partition_generator,
                rank_mapper=rank_mapper,
                num_pipeline_stages=num_pipeline_stages)
            if tnt.ParallelStrategy.DATA in parallel_strategy:
                replica_group = rank_mapper.get_replica_group_for_rank(rank)
            elif pipeline_group.size != tnt.get_size():
                # pure pipelining requires the pipeline group size to equal
                # the number of ranks
                raise ValueError(
                    f"Provided model has only {pipeline_group.size} partitions; use {pipeline_group.size} ranks or a different parallel strategy."
                )

        if tnt.ParallelStrategy.DATA in parallel_strategy:
            # replicate my partition across the data parallel group
            logger.info(
                f"[DataParallel] Replicating local model across ranks {replica_group.group}."
            )
            model = dpm.DataParallelModel(model=model, group=replica_group)
        return model
예제 #3
0
    def __init__(self, model):
        """Store `model` and reset all bookkeeping attributes.

        Raises:
            RuntimeError: if `tarantella.global_context` is falsy.
        """
        if not tarantella.global_context:
            raise RuntimeError(
                """Cannot initialize a Model before the Tarantella library.
      Please call "tarantella.init()" first.
      """)

        # position of this process among all ranks
        self.rank = tarantella.get_rank()
        self.comm_size = tarantella.get_size()

        # wrapped model and its state flags
        self.model = model
        self.input_shapes = None
        self.done_broadcast = False
        self.compiled = False
        self.broadcaster = None
        self.barrier = tarantella.Barrier()

        # `orig_*` placeholders, unset until assigned elsewhere
        self.orig_optimizer = self.orig_loss = self.orig_metrics = None
        self.orig_loss_weights = self.orig_sample_weight_mode = None
        self.orig_weighted_metrics = None

        self.dist_optimizer = None
        self.default_shuffle_seed = 42

        # support for TF 2.0 -- 2.3
        self.tf_default_verbose = dict(fit=1, evaluate=1, predict=0)
예제 #4
0
def to_microbatched(model, micro_batch_size, num_micro_batches, num_batches, num_test_batches):
  """Build the microbatched model builder and datasets for this rank's partition.

  Returns:
    A tuple `(microbatched_model_builder, datasets)`; the datasets are the
    result of `load_microbatched_datasets` for the local partition.
  """
  my_rank = tnt.get_rank()
  generator = pgen.GraphPartitionGenerator(model)
  mapper = rmapper.RankMapper(num_ranks = tnt.get_size(),
                              pipeline_graph = generator.get_pipeline_graph())

  # describe the partition assigned to this rank
  part_id = mapper.get_partition_for_rank(my_rank)
  part_graph = generator.get_partition_graph(part_id)
  part_info = pinfo.PartitionInfo(partition_id = part_id,
                                  partition_graph = part_graph)

  # core model -> shared model -> microbatched model builder
  core_model = cm_builder.CoreModelBuilder(model, part_id, part_graph).get_model()

  communicator = tnt.PipelineCommunicator(mapper.get_connections_for_rank(my_rank),
                                          num_micro_batches)
  shared_model = shared.SharedModelBuilder(part_info, core_model,
                                           communicator, micro_batch_size).get_model()

  builder = microbatched.MicrobatchedModelBuilder(part_info, shared_model,
                                                  micro_batch_size, num_micro_batches)
  datasets = load_microbatched_datasets(micro_batch_size, num_micro_batches,
                                        num_batches, num_test_batches, part_info)
  communicator.setup_infrastructure(micro_batch_size)
  return builder, datasets
    def test_dict_of_tensors(self, input_dict):
        """Allreduce a dict of tensors and verify type and value of each entry.

        Args:
            input_dict: mapping whose values support multiplication by an int
                (presumably supplied by a fixture or parametrization; confirm
                against the test setup).
        """
        expected_dict = {k: v * tnt.get_size() for k, v in input_dict.items()}

        allreducer = tnt.TensorAllreducer(input_dict)
        output_dict = allreducer.allreduce(input_dict)

        # A non-empty dict is always truthy, so asserting on a dict of flags
        # would pass even if no value were a tensor; check every flag instead.
        assert all(tf.is_tensor(v) for v in output_dict.values())
        assert output_dict == expected_dict
    def test_single_array_identical_inputs(self, array_length, dtype):
        """Allreduce a column of ones; expect each entry to equal `tnt.get_size()`."""
        ones_column = np.ones(shape=(array_length, 1), dtype=dtype)
        expected = ones_column * tnt.get_size()

        reduced = tnt.TensorAllreducer(ones_column).allreduce(ones_column)

        assert isinstance(reduced, np.ndarray)
        assert np.array_equal(reduced, expected)
예제 #7
0
  def test_train(self, batch_size, num_batches, number_epochs):
    """Train a reference model and the microbatched pipeline model, then compare histories.

    The reference model is trained only where `rank == master_rank`; the
    microbatched model is compiled and fitted on every rank with losses,
    loss weights and metrics chosen per partition. The histories are
    compared on the master rank.

    `number_partitions`, `num_micro_batches`, `learning_rate`, `rank`,
    `master_rank`, `p_0_rank` and `p_1_rank` are not defined in this method;
    they are read from an enclosing scope.
    """
    assert tnt.get_size() == number_partitions
    micro_batch_size = batch_size // num_micro_batches

    ### CREATE MODEL
    pipeline_communicator = get_pipeline_communicator(num_micro_batches)
    pipeline_communicator.setup_infrastructure(micro_batch_size)

    core_model = get_partitioned_core_model()
    shared_model = get_partitioned_shared_model(core_model, pipeline_communicator, micro_batch_size)
    microbatched_model = get_partitioned_model(shared_model)

    ### LOAD DATASETS
    partition_info = get_partition_info(core_model)
    # the fourth argument (number of test batches) is 0: this test only trains
    microbatched_ds = load_microbatched_datasets(micro_batch_size, num_micro_batches,
                                                 num_batches, 0, partition_info)
    reference_ds = load_reference_datasets(batch_size, num_batches, 0)

    ### MODEL COMPILE/TRAIN (on each rank individually)
    # single rank model
    fit_params = {'epochs' : number_epochs, 'shuffle' : False, 'verbose' : 0}
    sgd = keras.optimizers.SGD(learning_rate)
    if rank == master_rank:
      print("\nTraining reference model")
      reference_model = get_reference_model()
      reference_model.compile(optimizer = sgd,
                              loss = keras.losses.SparseCategoricalCrossentropy(),
                              metrics = [keras.metrics.SparseCategoricalAccuracy()])
      reference_history = reference_model.fit(reference_ds["train"],
                                              **fit_params)

    # pipelined model
    # NOTE(review): `partition_losses`, `partition_loss_weights` and
    # `partition_metrics` are only bound when `rank` equals `p_0_rank` or
    # `p_1_rank`; any other rank would hit a NameError at `compile` below.
    if rank == p_0_rank:
      # partition 0: every listed output gets a ZeroLoss, no weights or metrics
      partition_losses = {"p_0_m_0_edge_output_0" : tnt_losses.ZeroLoss(),
                          "p_0_m_0_edge_output_1" : tnt_losses.ZeroLoss(),
                          "p_0_m_1_edge_output_0" : tnt_losses.ZeroLoss(),
                          "p_0_m_1_edge_output_1" : tnt_losses.ZeroLoss(),
                          "p_0_seq_output" : tnt_losses.ZeroLoss()}
      partition_loss_weights = None
      partition_metrics = None
    if rank == p_1_rank:
      # partition 1: real outputs use the cross-entropy loss, each weighted by
      # 1/num_micro_batches; the sequential output gets a ZeroLoss with weight 0
      partition_losses = {"p_1_m_0_real_output_0" : keras.losses.SparseCategoricalCrossentropy(),
                          "p_1_m_1_real_output_0" : keras.losses.SparseCategoricalCrossentropy(),
                          "p_1_seq_output" : tnt_losses.ZeroLoss()}
      partition_loss_weights = {"p_1_m_0_real_output_0" : 1./num_micro_batches,
                                "p_1_m_1_real_output_0" : 1./num_micro_batches,
                                "p_1_seq_output" : 0.}
      partition_metrics = {"p_1_m_0_real_output_0" : keras.metrics.SparseCategoricalAccuracy(),
                           "p_1_m_1_real_output_0" : keras.metrics.SparseCategoricalAccuracy()}

    microbatched_model.compile(optimizer = sgd,
                               loss = partition_losses,
                               loss_weights = partition_loss_weights,
                               metrics = partition_metrics)
    pipeline_history = microbatched_model.fit(microbatched_ds["train"],
                                              **fit_params)
    # `reference_history` exists only on the master rank
    if rank == master_rank:
      check_histories_match(reference_history, pipeline_history, num_micro_batches)
    def test_tensor_numeric(self, input_value, dtype):
        """Allreduce a constant tensor; expect `input_value * tnt.get_size()`."""
        expected_value = input_value * tnt.get_size()
        tensor = tf.constant(input_value, dtype=dtype)

        reduced = tnt.TensorAllreducer(tensor).allreduce(tensor)

        assert tf.is_tensor(reduced)
        assert reduced == expected_value
    def test_tensor_from_list(self):
        """Allreduce a 2x5 tensor built from nested lists and compare element-wise."""
        rows = [[1, 2, 3, 4, 5], [.2, .3, .4, .5, .6]]
        tensor = tf.constant(rows)
        expected = tensor * tnt.get_size()

        reduced = tnt.TensorAllreducer(tensor).allreduce(tensor)

        assert tf.is_tensor(reduced)
        assert np.all(reduced == expected)
예제 #10
0
    def test_train(self, num_micro_batches, batch_size, num_batches,
                   num_test_batches, number_epochs):
        """Fit and evaluate the pipelined model and compare it with a reference.

        The reference model is trained and evaluated only where
        `rank == master_rank`; the pipelined model runs on every rank.
        Training histories, validation histories and evaluation results are
        compared on the master rank.
        """
        assert tnt.get_size() == number_partitions
        micro_batch_size = batch_size // num_micro_batches
        fit_params = dict(epochs=number_epochs, shuffle=False, verbose=0)

        # pipelined model: communicator -> core -> shared -> microbatched
        communicator = get_pipeline_communicator(num_micro_batches)
        communicator.setup_infrastructure(micro_batch_size)
        core_model = get_partitioned_core_model()
        partition_info = get_partition_info(core_model)

        shared_model = shared.SharedModelBuilder(
            partition_info, core_model, communicator,
            micro_batch_size).get_model()
        builder = microbatched.MicrobatchedModelBuilder(
            partition_info, shared_model, micro_batch_size, num_micro_batches)
        pipelined_model = builder.get_model()
        pipelined_ds = load_microbatched_datasets(micro_batch_size,
                                                  num_micro_batches,
                                                  num_batches,
                                                  num_test_batches,
                                                  partition_info)

        on_master = rank == master_rank

        # reference run (master rank only)
        if on_master:
            reference_model = get_reference_model()
            reference_model.compile(**get_reference_compile_params())

            reference_ds = load_reference_datasets(batch_size, num_batches,
                                                   num_test_batches)
            reference_history = reference_model.fit(
                reference_ds["train"],
                validation_data=reference_ds["val"],
                **fit_params)
            reference_result = reference_model.evaluate(reference_ds["test"],
                                                        verbose=0)

        # pipelined run (all ranks)
        pipelined_model.compile(**get_microbatched_compile_params(builder))
        pipeline_history = pipelined_model.fit(
            pipelined_ds["train"],
            validation_data=pipelined_ds["val"],
            **fit_params)
        pipeline_result = pipelined_model.evaluate(pipelined_ds["test"],
                                                   verbose=0)

        if on_master:
            check_histories_match(reference_history, pipeline_history,
                                  num_micro_batches)
            check_validation_histories_match(reference_history,
                                             pipeline_history,
                                             num_micro_batches)
            check_predictions_match(reference_result, pipeline_result,
                                    num_micro_batches)
    def test_single_value(self):
        """Allreduce the rank as a float; expect the sum of all rank ids."""
        rank_value = float(tnt.get_rank())
        expected = sum(range(tnt.get_size()))

        reduced = tnt.TensorAllreducer(rank_value).allreduce(rank_value)

        assert isinstance(reduced, float)
        assert expected == reduced
  def train_and_eval(self):
    """Compile the training model, fit it round by round and evaluate on the master rank.

    Training proceeds in rounds of `epochs_between_evals` epochs until
    `train_epochs` is reached. After each round the master rank logs the
    history, rebuilds the statistics and merges in the result of `self.eval()`.

    Returns:
      The statistics of the last round on the master rank; an empty dict on
      other ranks or when no round was executed.
    """
    schedule = optimizer.LearningRateSchedule(self.params["learning_rate"],
                                              self.params["hidden_size"],
                                              self.params["learning_rate_warmup_steps"])
    adam = tf.keras.optimizers.Adam(schedule,
                                    self.params["optimizer_adam_beta1"],
                                    self.params["optimizer_adam_beta2"],
                                    epsilon=self.params["optimizer_adam_epsilon"])
    self.train_model.compile(adam)
    self.train_model.summary()

    # training data; rank and number of ranks are handed to the input function
    train_ds = data_pipeline.train_input_fn(self.params,
                                            shuffle_seed = 42,
                                            num_ranks = tnt.get_size(),
                                            rank = tnt.get_rank())

    callbacks = []
    # TensorBoard callback is passed to `fit` as is
    if self.flags_obj.enable_tensorboard and self.flags_obj.model_dir:
      tensorboard = tf.keras.callbacks.TensorBoard(log_dir=self.flags_obj.model_dir)
      callbacks.append(tensorboard)

    # the timing callback is wrapped with `run_on_all_ranks = False`
    if self.flags_obj.enable_time_history:
      timer = keras_utils.TimeHistory(self.params["batch_size"],
                                      self.params["num_sentences"],
                                      logdir = None)
      callbacks.append(tnt.keras.callbacks.Callback(timer,
                                                    aggregate_logs = False,
                                                    run_on_all_ranks = False))

    # print messages only once
    if tnt.is_master_rank():
      logging.info("Start train")

    stats = {}
    for first_epoch in range(0, self.params["train_epochs"], self.params["epochs_between_evals"]):
      # the last round may be shorter than `epochs_between_evals`
      round_length = min(self.params["epochs_between_evals"],
                         self.params["train_epochs"] - first_epoch)
      # as our dataset is distributed manually, disable the automatic Tarantella distribution
      history = self.train_model.fit(train_ds,
                                     callbacks = callbacks,
                                     tnt_distribute_dataset = False,
                                     initial_epoch = first_epoch,
                                     epochs = first_epoch + round_length,
                                     verbose = 2)

      if tnt.is_master_rank():
        logging.info("Train history: {}".format(history.history))
        stats = misc.build_stats(history, callbacks)

      if tnt.is_master_rank():
        stats.update(self.eval())

    return stats
    def test_array_inf(self, array_length, index):
        """Inject `inf` at `index` on one rank; expect `inf` there after allreduce."""
        chosen_rank = util.same_random_int_all_ranks(0, tnt.get_size())
        ones_column = np.ones(shape=(array_length, 1), dtype=np.float32)
        is_injecting = tnt.get_rank() == chosen_rank
        if is_injecting:
            ones_column[index] = math.inf

        reduced = tnt.TensorAllreducer(ones_column).allreduce(ones_column)

        assert np.isinf(reduced[index])
    def test_single_array_identical_inputs(self, array_length):
        """Allgather a column of ones; expect a flat array of `array_length * tnt.get_size()` ones."""
        ones_column = np.ones(shape=(array_length, 1), dtype=np.float32)
        expected = np.ones(shape=(array_length * tnt.get_size()),
                           dtype=np.float32)

        gathered = tnt.TensorAllgatherer(ones_column).allgather(ones_column)

        assert isinstance(gathered, np.ndarray)
        assert np.array_equal(gathered, expected)
    def test_nd_tensor(self, input_shape, dtype):
        """Allreduce a tensor of ones with shape `input_shape`; compare via NumPy."""
        ones = np.ones(shape=input_shape, dtype=dtype)
        expected = ones * tnt.get_size()
        tensor = tf.constant(ones)

        reduced = tnt.TensorAllreducer(tensor).allreduce(tensor)

        assert tf.is_tensor(reduced)
        assert np.array_equal(reduced.numpy(), expected)
    def test_scalar_identical_inputs(self):
        """Allgather the scalar 50; expect one copy per rank in a NumPy array."""
        scalar = 50

        # one entry per rank, each holding the scalar
        expected = np.ones(shape=(tnt.get_size()), dtype=int) * scalar

        gathered = tnt.TensorAllgatherer(scalar).allgather(scalar)

        assert isinstance(gathered, np.ndarray)
        assert np.array_equal(gathered, expected)
예제 #17
0
def train_val_dataset_generator():
    """Return MNIST train and validation datasets holding a single batch each.

    The batch size is 64 times `tnt.get_size()`; the third value returned by
    `util.load_dataset` is discarded.
    """
    micro_batch_size, nbatches = 64, 1
    batch_size = micro_batch_size * tnt.get_size()
    nsamples = nbatches * batch_size

    datasets = util.load_dataset(mnist.load_mnist_dataset,
                                 train_size=nsamples,
                                 train_batch_size=batch_size,
                                 val_size=nsamples,
                                 val_batch_size=batch_size)
    train_dataset, val_dataset, _ = datasets
    return train_dataset, val_dataset
    def test_dict_many_keys(self, length):
        """Allreduce a dict with `length` keys that all map to 4.2."""
        input_value = 4.2
        input_dict = {"key " + str(i): input_value for i in range(length)}
        expected_value = input_value * tnt.get_size()

        output_dict = tnt.TensorAllreducer(input_dict).allreduce(input_dict)

        assert isinstance(output_dict, dict)
        assert len(input_dict) == len(output_dict)
        for reduced_value in output_dict.values():
            assert reduced_value == expected_value
    def test_single_array_different_inputs(self, array_length):
        """Allreduce an array filled with the rank; expect the sum of all rank ids."""
        rank_filled = np.empty(shape=(array_length, 1), dtype=np.float32)
        rank_filled.fill(tnt.get_rank())

        rank_sum = sum(range(tnt.get_size()))
        expected = np.empty(rank_filled.shape, dtype=np.float32)
        expected.fill(rank_sum)

        reduced = tnt.TensorAllreducer(rank_filled).allreduce(rank_filled)

        assert isinstance(reduced, np.ndarray)
        assert np.array_equal(reduced, expected)
    def test_list_of_tensors_identical_inputs(self, list_length):
        """Allreduce a list that repeats the tensor [1, 2, 3] `list_length` times."""
        tensor = tf.constant([1, 2, 3])
        input_list = [tensor] * list_length
        expected = tensor * tnt.get_size()

        output_list = tnt.TensorAllreducer(input_list).allreduce(input_list)

        assert isinstance(output_list, list)
        for reduced in output_list:
            assert np.array_equal(reduced, expected)
    def test_list_of_arrays_identical_inputs(self, list_length, dtype):
        """Allreduce a list that repeats one (50, 1) array of ones `list_length` times."""
        ones_column = np.ones(shape=(50, 1), dtype=dtype)
        input_list = [ones_column] * list_length
        expected = ones_column * tnt.get_size()

        output_list = tnt.TensorAllreducer(input_list).allreduce(input_list)

        assert isinstance(output_list, list)
        for reduced in output_list:
            assert np.array_equal(reduced, expected)
예제 #22
0
    def test_single_array(self, array_shape):
        """Broadcast a seeded random array from the last rank and compare it.

        The seed is fixed, so each rank generates the same array locally and
        uses it as the expected result.
        """
        np.random.seed(42)
        expected = np.random.random_sample(array_shape).astype('float32')
        my_rank = tnt.get_rank()
        root_rank = tnt.get_size() - 1
        broadcaster = tnt.TensorBroadcaster(expected, root_rank)

        # only the root passes data; every other rank calls without arguments
        received = (broadcaster.broadcast(expected)
                    if my_rank == root_rank else broadcaster.broadcast())

        all_equal = (received == expected).all()
        assert isinstance(received, np.ndarray)
        assert all_equal
    def test_list_of_arrays_identical_inputs_diff_types(self):
        """Allreduce a list mixing float32 and double arrays of different lengths."""
        input_list = [
            np.ones(shape=(238, 1), dtype=np.float32),
            np.ones(shape=(42, 1), dtype=np.double),
            np.ones(shape=(99, 1), dtype=np.float32),
        ]
        expected_list = [array * tnt.get_size() for array in input_list]

        output_list = tnt.TensorAllreducer(input_list).allreduce(input_list)

        assert isinstance(output_list, list)
        for reduced, expected in zip(output_list, expected_list):
            assert np.array_equal(reduced, expected)
예제 #24
0
    def __init__(self,
                 dataset,
                 num_ranks=None,
                 rank=None,
                 shuffle_seed=42):
        """Record the distribution settings and analyze `dataset`.

        Args:
            dataset: dataset handed to
                `ops_helpers.gen_dataset_transformations`.
            num_ranks: number of ranks to distribute over; `None` (default)
                means `tnt.get_size()`.
            rank: rank of this process; `None` (default) means
                `tnt.get_rank()`.
            shuffle_seed: seed stored in `self.shuffle_seed`.
        """
        # Resolve the defaults at call time: expressions in the signature are
        # evaluated only once, when the function is defined.
        self.num_ranks = tnt.get_size() if num_ranks is None else num_ranks
        self.rank = tnt.get_rank() if rank is None else rank
        self.shuffle_seed = shuffle_seed

        self.base_dataset, self.dataset_transformations = \
               ops_helpers.gen_dataset_transformations(dataset)
        self.batching_info = ops_helpers.get_batching_info(
            self.dataset_transformations)

        # convenience attributes computed when the dataset is distributed among ranks
        self._dataset = None
        self._num_samples = None
        self._micro_batch_size = None
예제 #25
0
def check_histories_match(reference_history, pipeline_history, num_micro_batches, prefix = ""):
  """Assert that the pipeline history matches the reference history.

  Args:
    reference_history: object whose `history` dict holds `<prefix>loss` and
      `<prefix>sparse_categorical_accuracy` lists.
    pipeline_history: object whose `history` dict holds `<prefix>loss` and
      one accuracy list per micro-batch of the last partition.
    num_micro_batches: number of micro-batches whose metrics are averaged.
    prefix: prepended to every history key (e.g. for validation entries).

  Raises:
    AssertionError: if losses or averaged metrics are not close.
  """
  loss_name = prefix + 'loss'
  metric_name = 'sparse_categorical_accuracy'
  output_id = 0
  partition_id = tnt.get_size() - 1 # compute metric only on the last partition

  # The loss lists are compared as a whole, so a single check covers every
  # epoch; it also runs when the reference history is empty.
  reference_losses = reference_history.history[loss_name]
  assert np.allclose(reference_losses, pipeline_history.history[loss_name])

  for i in range(len(reference_losses)):
    # check metrics match: average the per-micro-batch values of epoch `i`
    reference_metric_value = reference_history.history[prefix + metric_name][i]
    pipeline_metric_value = sum(
        pipeline_history.history[f"{prefix}p_{partition_id}_m_{m}"
                                 f"_real_output_{output_id}_{metric_name}"][i]
        for m in range(num_micro_batches)) / num_micro_batches
    assert np.allclose(reference_metric_value, pipeline_metric_value)
예제 #26
0
def train_test_mnist_datasets(nbatches=1,
                              val_nbatches=0,
                              test_nbatches=0,
                              micro_batch_size=64,
                              shuffle=True,
                              remainder_samples_per_batch=0,
                              last_incomplete_batch_size=0,
                              drop_remainder=False):
    """Load MNIST train/test datasets sized relative to the number of ranks.

    The batch size is
    `micro_batch_size * tnt.get_size() + remainder_samples_per_batch`.

    Args:
        nbatches: number of full training batches.
        val_nbatches: accepted for interface compatibility but has no effect;
            no validation size is forwarded to `load_train_test_dataset`.
            NOTE(review): confirm whether a validation split was intended.
        test_nbatches: number of full test batches.
        micro_batch_size: samples per rank in each batch.
        shuffle: forwarded to `load_train_test_dataset`.
        remainder_samples_per_batch: extra samples added to every batch.
        last_incomplete_batch_size: extra training samples beyond the full
            batches.
        drop_remainder: forwarded to `load_train_test_dataset`.

    Returns:
        The result of `load_train_test_dataset` for the computed sizes.
    """
    batch_size = micro_batch_size * tnt.get_size() + remainder_samples_per_batch
    nsamples = nbatches * batch_size + last_incomplete_batch_size
    test_nsamples = test_nbatches * batch_size

    return load_train_test_dataset(mnist.load_mnist_dataset,
                                   train_size=nsamples,
                                   train_batch_size=batch_size,
                                   test_size=test_nsamples,
                                   test_batch_size=batch_size,
                                   shuffle=shuffle,
                                   drop_remainder=drop_remainder)
예제 #27
0
    def reduce_gradients(self, gradients_and_weights):
        """Add Allreduce ops for every gradient and return the reduced gradients.

        Each gradient is divided by `tnt.get_size()` before its Allreduce is
        started. A barrier op follows all start ops, and one finish op per
        gradient yields the reduced value.

        Args:
            gradients_and_weights: (gradient, weight) pairs. The collection is
                iterated twice, so it must be re-iterable; each `weight.name`
                must be a key of `self.weight_to_index`.

        Returns:
            A list of reduced gradient tensors when
            `version_utils.tf_version_below_equal('2.3')` holds, otherwise a
            list of `(reduced_gradient, weight)` tuples.
        """
        gradients_to_reduce = list()
        for grad, weight in gradients_and_weights:
            # add an Allreduce operation for each gradient
            grad_id = self.weight_to_index[weight.name]
            number_partial_sums = tnt.get_size()
            # scale before reducing (presumably so the reduced sum is an average; confirm)
            grad = grad / number_partial_sums
            output_grad = tnt_ops.start_allreduce_op(
                grad, tensor_id=grad_id, tnt_synchcomm=self.comm.get_raw_ptr())
            gradients_to_reduce.append(output_grad)

        # Create barrier op in the Tensorflow graph to make sure all
        # the Allreduce operations on gradients have started.
        # This ensures that the graph execution does not get delayed by waiting
        # for gradients to be reduced as long as there are remaining computations
        # in the backward pass.
        # NOTE(review): the output types are hard-coded to tf.float32 here and
        # in the finish op below; confirm that gradients are always float32.
        temp_gradients = tnt_ops.barrier_op(gradients_to_reduce,
                                            Tout=[tf.float32] *
                                            len(gradients_to_reduce))

        # Add individual ops that wait for each gradient to be reduced before updating
        # the weights.
        # These ops are executed only after the backward pass has been completed.
        reduced_gradients = list()
        for idx, (_, weight) in enumerate(gradients_and_weights):
            # gradient tensors obtained after barrier are listed in the same order
            # as the initial `gradients_and_weights`
            gradient = temp_gradients[idx]
            grad_id = self.weight_to_index[weight.name]

            output_grad = tnt_ops.finish_allreduce_op(
                gradient,
                tensor_id=grad_id,
                Tout=tf.float32,
                tnt_synchcomm=self.comm.get_raw_ptr())
            # the shape of the returned entries depends on the TF version
            if version_utils.tf_version_below_equal('2.3'):
                reduced_gradients.append(output_grad)
            else:
                reduced_gradients.append((output_grad, weight))
        return reduced_gradients
예제 #28
0
  def test_cifar_alexnet(self, keras_model, optimizer, micro_batch_size, nbatches, ntest_batches):
    """Train a `tnt.Model` on CIFAR data and require an accuracy above 0.5.

    The epoch count and learning rate come from `cifar.get_hyperparams`;
    an optimizer class named 'SGD' additionally gets a momentum of 0.9.
    """
    batch_size = micro_batch_size * tnt.get_size()
    nsamples = nbatches * batch_size
    number_epochs, lr = cifar.get_hyperparams(optimizer)
    train_dataset, test_dataset = util.load_train_test_dataset(
        cifar.load_cifar_dataset,
        train_size = nsamples,
        train_batch_size = batch_size,
        test_size = ntest_batches * batch_size,
        test_batch_size = batch_size)

    optimizer_kwargs = {'learning_rate': lr}
    if optimizer.__name__ == 'SGD':
      optimizer_kwargs['momentum'] = 0.9
    keras_optimizer = optimizer(**optimizer_kwargs)

    model = tnt.Model(keras_model())
    model.compile(keras_optimizer,
                  loss = keras.losses.SparseCategoricalCrossentropy(),
                  metrics = [keras.metrics.SparseCategoricalAccuracy()])
    model.fit(train_dataset, epochs = number_epochs, verbose = 0)

    # entry 1 of the evaluation result is compared against the threshold
    results = model.evaluate(test_dataset)
    util.check_accuracy_greater(results[1], 0.5)
예제 #29
0
  def test_train(self, model_generator, num_micro_batches, micro_batch_size,
                 num_batches, num_test_batches, number_epochs):
    """Compare a microbatched pipeline model with a reference model.

    The reference model is trained and evaluated only on the last rank; the
    pipelined model built by `to_microbatched` runs on every rank. Histories
    and evaluation results are printed and compared on that last rank.
    """
    batch_size = micro_batch_size * num_micro_batches
    fit_params = dict(epochs = number_epochs, shuffle = False, verbose = 0)
    rank = tnt.get_rank()
    master_rank = tnt.get_size() - 1  # the last partition will be assigned to rank (nranks-1)
    on_master = rank == master_rank

    # reference run
    if on_master:
      reference_ds = load_reference_datasets(batch_size, num_batches, num_test_batches)
      reference_model = model_generator()

      reference_model.compile(**get_reference_compile_params())
      reference_history = reference_model.fit(reference_ds["train"],
                                              validation_data = reference_ds["val"],
                                              **fit_params)
      reference_result = reference_model.evaluate(reference_ds["test"], verbose = 0)

    # pipelined run
    builder, pipelined_ds = to_microbatched(model_generator(), micro_batch_size,
                                            num_micro_batches, num_batches, num_test_batches)
    pipelined_model = builder.get_model()
    pipelined_model.summary()

    pipelined_model.compile(**get_microbatched_compile_params(builder))
    pipeline_history = pipelined_model.fit(pipelined_ds["train"],
                                           validation_data = pipelined_ds["val"],
                                           **fit_params)
    pipeline_result = pipelined_model.evaluate(pipelined_ds["test"], verbose = 0)

    if on_master:
      print(reference_history.history)
      print(pipeline_history.history)
      check_histories_match(reference_history, pipeline_history, num_micro_batches)
      check_validation_histories_match(reference_history, pipeline_history, num_micro_batches)
      check_predictions_match(reference_result, pipeline_result, num_micro_batches)
예제 #30
0
                    np.isclose(tnt_history.history[key],
                               ref_history.history[key],
                               atol=1e-6))
            ]
        result = [all(result)]
    util.assert_on_all_ranks(result)


@pytest.mark.parametrize("model_config", [
    base_runner.ModelConfig(mnist.fc_model_generator),
    base_runner.ModelConfig(mnist.subclassed_model_generator),
    pytest.param(
        base_runner.ModelConfig(mnist.fc_model_generator,
                                tnt.ParallelStrategy.PIPELINING),
        marks=pytest.mark.skipif(
            tnt.get_size() != 1,
            reason="Cannot run multi-rank, model has only one partition")),
])
class TestTarantellaCallbacks:
    @pytest.mark.parametrize("number_epochs", [5])
    def test_learning_rate_scheduler_callback(self, model_config,
                                              number_epochs):
        """Train with a LearningRateScheduler that scales the rate by 0.1 per call.

        The histories returned for the Tarantella and reference models must
        be identical.
        """
        lr_scheduler = tf.keras.callbacks.LearningRateScheduler(
            schedule=(lambda epoch, lr: 0.1 * lr), verbose=1)
        tnt_history, reference_history = train_tnt_and_ref_models_with_callbacks(
            [lr_scheduler], model_config, number_epochs)
        assert_identical_tnt_and_ref_history(tnt_history, reference_history)

    @pytest.mark.parametrize("number_epochs", [5])