def evaluate(self,
             x = None,
             y = None,
             callbacks = None,
             tnt_micro_batch_size = None,
             tnt_distribute_dataset = True,
             **kwargs):
  self._setup_for_execution('evaluate', x, y, kwargs)
  processed_callbacks = utilities._preprocess_callbacks(callbacks, self.group,
                                                        parallel_strategy = tnt.ParallelStrategy.DATA,
                                                        exec_type = 'evaluate',
                                                        verbose = kwargs.get('verbose', None))

  if tnt_distribute_dataset:
    test_dataset = tnt.data.Dataset(dataset = x,
                                    num_ranks = self.group.size,
                                    rank = self.group.to_group_rank(self.rank),
                                    shuffle_seed = self.default_shuffle_seed)
    x = test_dataset.distribute_dataset_across_ranks(
          user_micro_batch_size = tnt_micro_batch_size,
          is_training = False)
    self._validate_micro_batch_size_for_batch_normalization(test_dataset.micro_batch_size)
  else:
    logger.info("Automatic dataset distribution is disabled.")
  return self.model.evaluate(x, callbacks = processed_callbacks, **kwargs)
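# ---------------------------------------------------------------------------
# Usage sketch (illustration only, not part of the library): how the
# data-parallel `evaluate` above is typically driven. It assumes the method is
# exposed through `tnt.Model` and that the script is launched on several ranks
# (e.g. `tarantella -- python evaluate_example.py`); the model and data below
# are toy placeholders, and version-specific initialization details are omitted.
import numpy as np
import tensorflow as tf
import tarantella as tnt

keras_model = tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape = (8,))])
model = tnt.Model(keras_model)
model.compile(optimizer = "sgd", loss = "mse")

# `batch(64)` defines the *global* batch size; `evaluate` splits it into
# per-rank micro-batches unless `tnt_distribute_dataset = False`
samples = tf.data.Dataset.from_tensor_slices(
            (np.random.rand(256, 8).astype(np.float32),
             np.random.rand(256, 1).astype(np.float32))).batch(64)
results = model.evaluate(samples,
                         tnt_micro_batch_size = None,
                         tnt_distribute_dataset = True)
# ---------------------------------------------------------------------------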
def fit(self,
        x = None,
        y = None,
        callbacks = None,
        validation_data = None,
        **kwargs):
  logger.info("[PartitionedModel] fit.")
  self._configure_rebuild(dataset = x)
  self._build_model_and_compile_if_necessary()
  processed_callbacks = utilities._preprocess_callbacks(callbacks, self.group,
                                                        parallel_strategy = tnt.ParallelStrategy.PIPELINING,
                                                        exec_type = 'fit',
                                                        verbose = kwargs.get('verbose', None))
  ds = self._get_microbatched_dataset(dataset = x,
                                      nano_batch_size = self.nano_batch_size,
                                      num_pipeline_stages = self.num_pipeline_stages)
  distributed_validation_data = None
  if validation_data:
    distributed_validation_data = self._get_microbatched_dataset(dataset = validation_data,
                                                                 nano_batch_size = self.nano_batch_size,
                                                                 num_pipeline_stages = self.num_pipeline_stages)
  return self.model.fit(x = ds,
                        callbacks = processed_callbacks,
                        validation_data = distributed_validation_data,
                        **kwargs)
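# ---------------------------------------------------------------------------
# Conceptual sketch (simplified, not the actual `_get_microbatched_dataset`):
# for pipelining, each global batch is re-split into smaller "nano-batches"
# that are fed through the pipeline stages back to back. A rough tf.data
# approximation of that re-batching step, with illustrative sizes; the real
# helper is expected to do more than this (e.g. handle per-stage inputs and
# outputs).
import tensorflow as tf

def rebatch_into_nano_batches(dataset, nano_batch_size):
  # undo the user's batching, then batch again at nano-batch granularity
  return dataset.unbatch().batch(nano_batch_size, drop_remainder = False)

batched = tf.data.Dataset.range(32).batch(8)           # global batch size 8
nano_batched = rebatch_into_nano_batches(batched, 2)   # 4 nano-batches per batch
# ---------------------------------------------------------------------------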
def evaluate(self,
             x = None,
             y = None,
             callbacks = None,
             **kwargs):
  self._configure_rebuild(dataset = x)
  self._build_model_and_compile_if_necessary()
  processed_callbacks = utilities._preprocess_callbacks(callbacks, self.group,
                                                        parallel_strategy = tnt.ParallelStrategy.PIPELINING,
                                                        exec_type = 'evaluate',
                                                        verbose = kwargs.get('verbose', None))
  ds = self._get_microbatched_dataset(dataset = x,
                                      nano_batch_size = self.nano_batch_size,
                                      num_pipeline_stages = self.num_pipeline_stages)
  return_dict = kwargs.pop('return_dict', None)
  test_loss_metrics = self.model.evaluate(x = ds,
                                          callbacks = processed_callbacks,
                                          return_dict = False,
                                          **kwargs)
  # expose only the metrics defined by the user, averaged over pipeline stages
  user_visible_loss_metrics = putil.extract_user_visible_metrics(
                                dict(zip(self.model.metrics_names, test_loss_metrics)))
  if len(user_visible_loss_metrics) == 1:
    return user_visible_loss_metrics[0]

  metrics_dict = putil.avg_metrics_over_pipeline_stages(user_visible_loss_metrics)
  if return_dict:
    return metrics_dict
  # flatten the dictionary into a list of values, mirroring Keras' behavior
  # when `return_dict` is not requested
  metrics_values = []
  for metric in metrics_dict.values():
    metrics_values += metric if isinstance(metric, list) else [metric]
  return metrics_values
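# ---------------------------------------------------------------------------
# Worked illustration (toy values, not library code): how the dictionary
# produced by `avg_metrics_over_pipeline_stages` is flattened when
# `return_dict` is not requested, mirroring the loop at the end of `evaluate`
# above. The metric names and numbers are hypothetical.
averaged = {"loss": 0.42, "accuracy": [0.91, 0.88]}
flat = []
for metric in averaged.values():
  flat += metric if isinstance(metric, list) else [metric]
assert flat == [0.42, 0.91, 0.88]
# ---------------------------------------------------------------------------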
def predict(self,
            x = None,
            callbacks = None,
            **kwargs):
  self._configure_rebuild(dataset = x)
  self._build_model_and_compile_if_necessary()
  processed_callbacks = utilities._preprocess_callbacks(callbacks, self.group,
                                                        parallel_strategy = tnt.ParallelStrategy.PIPELINING,
                                                        exec_type = 'predict',
                                                        verbose = kwargs.get('verbose', None))
  ds = self._get_microbatched_dataset(dataset = x,
                                      nano_batch_size = self.nano_batch_size,
                                      num_pipeline_stages = self.num_pipeline_stages)
  predictions = self.model.predict(x = ds,
                                   callbacks = processed_callbacks,
                                   **kwargs)
  # only the group master rank (the rank running the last partition) holds the
  # model outputs; all other ranks implicitly return None
  if tnt.is_group_master_rank(self.group):
    return predictions
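# ---------------------------------------------------------------------------
# Usage sketch (illustration only): because `predict` above only returns the
# outputs on the group master rank, callers typically guard on the result.
# `model` and `samples` are assumed placeholders for an already constructed
# pipelined model and an input dataset.
predictions = model.predict(samples)
if predictions is not None:
  print("predictions shape:", predictions.shape)   # only on the master rank
# ---------------------------------------------------------------------------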
def fit(self,
        x = None,
        y = None,
        callbacks = None,
        validation_data = None,
        tnt_micro_batch_size = None,
        tnt_validation_micro_batch_size = None,
        tnt_distribute_dataset = True,
        tnt_distribute_validation_dataset = True,
        **kwargs):
  self._setup_for_execution('fit', x, y, kwargs)
  processed_callbacks = utilities._preprocess_callbacks(callbacks, self.group,
                                                        parallel_strategy = tnt.ParallelStrategy.DATA,
                                                        exec_type = 'fit',
                                                        verbose = kwargs.get('verbose', None))

  if tnt_distribute_dataset:
    # Distribute the dataset into micro-batches among ranks, taking into account
    # all possible cases of splitting the dataset:
    #
    # 1. Batch size
    #    a. `batch_size` is a multiple of the number of ranks
    #       => identical `micro_batch_size` for all ranks
    #    b. `batch_size` is not a multiple of the number of ranks
    #       => different ranks have different `micro_batch_size`s and locally
    #          computed gradients need to be scaled by a factor to account for
    #          the differences
    #    c. `batch_size` < number of ranks
    #       => raise an error
    #
    # 2. Last batch within an epoch
    #    a. the last batch in the dataset is incomplete, but the dataset is
    #       batched with `drop_remainder = True`
    #       => the last batch is dropped
    #    b. the last batch in the dataset is incomplete with `drop_remainder = False`
    #       - the number of samples in the last batch is smaller than `num_ranks`
    #         => pad the dataset with zeroed samples to ensure that each rank has
    #            one sample, so that all ranks see the same number of iterations
    #            in an epoch; the fake samples are filtered out from the final
    #            gradient computation by assigning them `micro_batch_size = 0`
    #       - the number of samples in the last batch is >= `num_ranks`
    #         => the last batch can be considered a new `batch_size`, which is
    #            handled as above (in 1.), both for computing the
    #            `micro_batch_size` and the `scaling_factor`
    distributed_x = tnt.data.Dataset(dataset = x,
                                     num_ranks = self.group.size,
                                     rank = self.group.to_group_rank(self.rank),
                                     shuffle_seed = self.default_shuffle_seed)
    x = distributed_x.distribute_dataset_across_ranks(
          user_micro_batch_size = tnt_micro_batch_size,
          is_training = True)
    self._validate_micro_batch_size_for_batch_normalization(distributed_x.micro_batch_size)

    # if different ranks have different micro-batch sizes, the gradients need rescaling
    dataset_callback = distributed_x.get_gradient_scaling_callback()
    if dataset_callback:
      processed_callbacks.append(dataset_callback)
  else:
    logger.info("Automatic dataset distribution is disabled. "
                "Make sure the dataset is sharded manually across ranks.")

  # Always switch off Keras-side shuffling; the distributed dataset is already
  # shuffled consistently across ranks using `default_shuffle_seed`.
  kwargs["shuffle"] = False

  if validation_data:
    if tnt_distribute_validation_dataset:
      distributed_validation_data = tnt.data.Dataset(dataset = validation_data,
                                                     num_ranks = self.group.size,
                                                     rank = self.group.to_group_rank(self.rank),
                                                     shuffle_seed = self.default_shuffle_seed)
      validation_data = distributed_validation_data.distribute_dataset_across_ranks(
                          user_micro_batch_size = tnt_validation_micro_batch_size,
                          is_training = False)
      self._validate_micro_batch_size_for_batch_normalization(distributed_validation_data.micro_batch_size)
    else:
      logger.info("Automatic distribution for the validation dataset is disabled.")

  return self.model.fit(x = x,
                        validation_data = validation_data,
                        callbacks = processed_callbacks,
                        **kwargs)
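# ---------------------------------------------------------------------------
# Worked illustration (toy numbers, not library code): how a global batch can
# be split into per-rank micro-batches for cases 1.a-1.c enumerated in the
# comment inside `fit` above. The helper is a simplified stand-in for the
# decisions `tnt.data.Dataset.distribute_dataset_across_ranks` has to make,
# and the scaling factors show one plausible choice that keeps the globally
# averaged gradient correct when ranks hold unequal micro-batches.
def split_batch_across_ranks(batch_size, num_ranks):
  if batch_size < num_ranks:                       # case 1.c
    raise ValueError("batch_size must be at least the number of ranks")
  base = batch_size // num_ranks
  remainder = batch_size % num_ranks
  # case 1.a: remainder == 0 -> identical micro-batches, no gradient rescaling
  # case 1.b: the first `remainder` ranks get one extra sample; their local
  #           gradients are weighted more to preserve the global batch average
  micro_batch_sizes = [base + 1 if rank < remainder else base
                       for rank in range(num_ranks)]
  scaling_factors = [mbs * num_ranks / batch_size for mbs in micro_batch_sizes]
  return micro_batch_sizes, scaling_factors

print(split_batch_across_ranks(64, 4))   # ([16, 16, 16, 16], [1.0, 1.0, 1.0, 1.0])
print(split_batch_across_ranks(10, 4))   # ([3, 3, 2, 2], [1.2, 1.2, 0.8, 0.8])
# ---------------------------------------------------------------------------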