def _compute_validation_metrics(self) -> workload.Response:
    """Run one validation pass and return the reduced validation metrics.

    Puts every model in ``self.context.models`` into eval mode, fires the
    validation callbacks, and then evaluates either batch-by-batch (when
    ``_evaluate_batch_defined()`` is true) or through
    ``trial.evaluate_full_dataset`` (chief only). Metrics from the context's
    custom reducers are merged into the result.

    Returns:
        ``{}`` on non-chief workers; on the chief, a dict with the keys
        ``"num_inputs"`` and ``"validation_metrics"``.

    Raises:
        Whatever the ``check.*`` helpers raise when the validation loader is
        empty, when metric names differ between batches, or when the trial
        returns something that is not a dict (the exception type is defined
        outside this block).
    """
    self.context.reset_reducers()

    # Set the behavior of certain layers (e.g., dropout) that are
    # different between training and inference.
    for model in self.context.models:
        model.eval()

    step_start_time = time.time()

    # Deprecated hook: only invoked (with a warning) when a callback overrides it.
    for callback in self.callbacks.values():
        if util.is_overridden(callback.on_validation_step_start, pytorch.PyTorchCallback):
            logging.warning(
                "on_validation_step_start is now deprecated, "
                "please use on_validation_start instead"
            )
            callback.on_validation_step_start()

    for callback in self.callbacks.values():
        callback.on_validation_start()

    num_inputs = 0
    metrics = {}  # type: Dict[str, Any]

    if self._evaluate_batch_defined():
        keys = None
        batch_metrics = []

        self.validation_loader = cast(torch.utils.data.DataLoader, self.validation_loader)
        # A non-empty loader also guarantees `idx` is bound after the loop below.
        check.gt(len(self.validation_loader), 0)

        for callback in self.callbacks.values():
            callback.on_validation_epoch_start()

        for idx, batch in enumerate(self.validation_loader):
            if self.context.experimental._auto_to_device:
                batch = self.context.to_device(batch)
            num_inputs += self.trial.get_batch_length(batch)

            # Older trials may define evaluate_batch() without `batch_idx`.
            if has_param(self.trial.evaluate_batch, "batch_idx", 2):
                vld_metrics = self.trial.evaluate_batch(batch=batch, batch_idx=idx)
            else:
                vld_metrics = self.trial.evaluate_batch(batch=batch)  # type: ignore

            # Validate the return type *before* touching `.keys()`, so a
            # non-dict return produces the descriptive error below instead of
            # an AttributeError.
            check.is_instance(
                vld_metrics,
                dict,
                "evaluate_batch() must return a "
                "dictionary of string names to Tensor "
                "metrics",
            )

            # Verify validation metric names are the same across batches.
            if keys is None:
                keys = vld_metrics.keys()
            else:
                check.eq(
                    keys,
                    vld_metrics.keys(),
                    "Validation metric names must match across all batches of data.",
                )

            # TODO: For performance perform -> cpu() only at the end of validation.
            batch_metrics.append(pytorch._convert_metrics_to_numpy(vld_metrics))
            if self.env.test_mode:
                break

        for callback in self.callbacks.values():
            callback.on_validation_epoch_end(batch_metrics)

        metrics = pytorch._reduce_metrics(
            self.context.distributed,
            batch_metrics=batch_metrics,
            keys=keys,
            metrics_reducers=pytorch._prepare_metrics_reducers(
                self.trial.evaluation_reducer(), keys=keys
            ),
        )

        # Gather a list of per-worker (num_inputs, num_batches) tuples.
        input_counts = self.context.distributed.gather((num_inputs, idx + 1))
        if self.context.distributed.rank == 0:
            assert input_counts is not None
            # Reshape and sum.
            num_inputs, num_batches = [sum(n) for n in zip(*input_counts)]
    else:
        check.true(self._evaluate_full_dataset_defined())
        self.validation_loader = cast(torch.utils.data.DataLoader, self.validation_loader)
        # Only the chief evaluates the full dataset; other workers keep `metrics == {}`.
        if self.is_chief:
            metrics = self.trial.evaluate_full_dataset(data_loader=self.validation_loader)

            check.is_instance(
                metrics,
                dict,
                f"evaluate_full_dataset() must return a dictionary, got {type(metrics)}.",
            )

            metrics = pytorch._convert_metrics_to_numpy(metrics)
            num_inputs = self.context.get_per_slot_batch_size() * len(self.validation_loader)

    # Merge in the results of the context's custom reducers.
    metrics.update(
        pytorch._convert_metrics_to_numpy(self.context.reduce_metrics(for_training=False))
    )

    # The end-of-validation callbacks run on every worker, so when any callback
    # overrides one of them the metrics are broadcast from rank 0 first.
    if self.context.distributed.size > 1 and any(
        util.is_overridden(c.on_validation_end, pytorch.PyTorchCallback)
        or util.is_overridden(c.on_validation_step_end, pytorch.PyTorchCallback)
        for c in self.callbacks.values()
    ):
        logging.debug(
            "Broadcasting metrics to all worker processes to execute a "
            "validation step end callback"
        )
        metrics = hvd.broadcast_object(metrics, root_rank=0)

    for callback in self.callbacks.values():
        if util.is_overridden(callback.on_validation_step_end, pytorch.PyTorchCallback):
            logging.warning(
                "on_validation_step_end is now deprecated, please use on_validation_end instead"
            )
            callback.on_validation_step_end(metrics)

    for callback in self.callbacks.values():
        callback.on_validation_end(metrics)

    if not self.is_chief:
        return {}

    # Skip reporting timings if evaluate_full_dataset() was defined. This is far less common
    # than evaluate_batch() and we can't know how the user processed their validation data.
    if self._evaluate_batch_defined():
        step_duration = time.time() - step_start_time
        # NOTE(review): `num_batches` is only bound on distributed rank 0 above;
        # this presumably relies on the chief being rank 0 -- confirm.
        logging.info(
            det.util.make_timing_log("validated", step_duration, num_inputs, num_batches)
        )

    return {"num_inputs": num_inputs, "validation_metrics": metrics}
def _compute_validation_metrics(self) -> workload.Response:
    """Run one validation pass and return the averaged validation metrics.

    Puts every model in ``self.context.models`` into eval mode, fires the
    validation callbacks, calls ``trial.evaluate_batch`` once per validation
    batch, and reduces the collected per-batch metrics with
    ``pytorch.Reducer.AVG``. Metrics from the context's custom reducers are
    merged into the result.

    Returns:
        ``{}`` on non-chief workers; on the chief, a dict with the keys
        ``"num_inputs"`` and ``"validation_metrics"``.

    Raises:
        det.errors.InvalidExperimentException: if ``evaluate_batch`` returns
            something other than a dict on a metric-reporting rank, or if the
            metric names differ between batches.
    """
    self.context.reset_reducers()

    # Set the behavior of certain layers (e.g., dropout) that are
    # different between training and inference.
    for model in self.context.models:
        model.eval()

    step_start_time = time.time()

    # Deprecated hook: only invoked (with a warning) when a callback overrides it.
    for callback in self.callbacks.values():
        if util.is_overridden(callback.on_validation_step_start, pytorch.PyTorchCallback):
            logging.warning(
                "on_validation_step_start is now deprecated, "
                "please use on_validation_start instead"
            )
            callback.on_validation_step_start()

    for callback in self.callbacks.values():
        callback.on_validation_start()

    num_inputs = 0
    keys = None
    batch_metrics = []

    for callback in self.callbacks.values():
        callback.on_validation_epoch_start()

    # Compare against None explicitly: truth-testing a sized object calls its
    # __len__, which may raise for loaders without a length and would also
    # treat an empty loader as "no loader".
    validation_iterator = (
        iter(self.validation_loader) if self.validation_loader is not None else None
    )
    for idx in range(cast(int, self.num_validation_batches)):
        num_inputs += cast(int, self.validation_batch_size)
        # Note that when using pipeline parallelism, each call to evaluate_batch will request
        # self.context.num_micro_batches_per_slot batches from the validation iterator.
        # This is why we set self.num_validation_batches differently for pipeline parallel
        # and no pipeline parallel when building the data loaders.
        vld_metrics = self.trial.evaluate_batch(validation_iterator, idx)
        if self.context._mpu.should_report_metrics:
            if not isinstance(vld_metrics, dict):
                raise det.errors.InvalidExperimentException(
                    "evaluate_batch must return a dictionary of string names "
                    "to Tensor metrics",
                )
            # Verify validation metric names are the same across batches.
            if keys is None:
                keys = vld_metrics.keys()
            elif keys != vld_metrics.keys():
                raise det.errors.InvalidExperimentException(
                    "Validation metric names must match across all batches of data.",
                )
            # TODO: For performance perform -> cpu() only at the end of validation.
            batch_metrics.append(pytorch._convert_metrics_to_numpy(vld_metrics))
        if self.env.test_mode:
            break

    # keys and list(keys) does not satisfy all cases because it will return dict_keys type if
    # keys is an empty dict. this will then break when passed to zmq_broadcast since it does
    # not know how to serialize dict_keys type.
    all_keys = self.context.distributed.gather(keys if keys is None else list(keys))
    if self.is_chief:
        # Ranks that do not report metrics contribute None; use the first real key list.
        all_keys = [k for k in all_keys if k is not None]
        keys = all_keys[0]
    # Every rank adopts the chief's key list so the reduction below agrees everywhere.
    keys = self.context.distributed.broadcast(keys)

    for callback in self.callbacks.values():
        callback.on_validation_epoch_end(batch_metrics)

    metrics = pytorch._reduce_metrics(
        self.context.distributed,
        batch_metrics=batch_metrics,
        keys=keys,
        metrics_reducers=pytorch._prepare_metrics_reducers(pytorch.Reducer.AVG, keys=keys),
    )
    # Merge in the results of the context's custom reducers.
    metrics.update(
        pytorch._convert_metrics_to_numpy(self.context.reduce_metrics(for_training=False))
    )

    # The end-of-validation callbacks run on every worker, so when any callback
    # overrides one of them the metrics are broadcast to all workers first.
    if self.context.distributed.size > 1 and any(
        util.is_overridden(c.on_validation_end, pytorch.PyTorchCallback)
        or util.is_overridden(c.on_validation_step_end, pytorch.PyTorchCallback)
        for c in self.callbacks.values()
    ):
        logging.debug(
            "Broadcasting metrics to all worker processes to execute a "
            "validation step end callback"
        )
        metrics = self.context.distributed.broadcast(metrics)

    for callback in self.callbacks.values():
        if util.is_overridden(callback.on_validation_step_end, pytorch.PyTorchCallback):
            logging.warning(
                "on_validation_step_end is now deprecated, please use on_validation_end instead"
            )
            callback.on_validation_step_end(metrics)

    for callback in self.callbacks.values():
        callback.on_validation_end(metrics)

    if not self.is_chief:
        return {}

    # Scale the local input count by the data-parallel world size for reporting.
    num_inputs *= self.context._mpu.data_parallel_world_size
    step_duration = time.time() - step_start_time
    logging.info(
        det.util.make_timing_log(
            "validated", step_duration, num_inputs, cast(int, self.num_validation_batches)
        )
    )
    self.metric_writer.on_validation_step_end(self.steps_completed, metrics)
    return {"num_inputs": num_inputs, "validation_metrics": metrics}
def _train_for_step(self, step_id: int, num_batches: int,
                    total_batches_processed: int) -> workload.Response:
    """Train on ``num_batches`` consecutive batches and return their metrics.

    Batch indices run from ``total_batches_processed`` (inclusive) to
    ``total_batches_processed + num_batches`` (exclusive). For each batch the
    method pulls data from ``self.training_iterator``, fires epoch-boundary
    callbacks, calls ``trial.train_batch``, steps LR schedulers, and converts
    tensor metrics to NumPy.

    Returns:
        ``{}`` on non-chief workers; on the chief, the dict built by
        ``det.util.make_metrics`` with custom-reducer results merged into its
        ``"avg_metrics"`` entry.
    """
    self.prof.set_training(True)
    check.gt(step_id, 0)
    step_start_time = time.time()
    self.context.reset_reducers()

    # Switch layers such as dropout to their training-time behavior.
    for model in self.context.models:
        model.train()

    per_batch_metrics = []  # type: List[Dict]
    num_inputs = 0

    first_batch_idx = total_batches_processed
    for batch_idx in range(first_batch_idx, first_batch_idx + num_batches):
        self.steps_completed += 1
        batch_start_time = time.time()
        self.prof.update_batch_idx(batch_idx)

        with self.prof.record_timing("dataloader_next", requires_sync=False):
            batch = next(self.training_iterator)
        batch_inputs = self.trial.get_batch_length(batch)
        num_inputs += batch_inputs

        if self.context.experimental._auto_to_device:
            with self.prof.record_timing("to_device", accumulate=True):
                batch = self.context.to_device(batch)

        self.context._current_batch_idx = batch_idx
        epoch_idx = self.get_epoch_idx(batch_idx)

        if self.context.is_epoch_start():
            for callback in self.callbacks.values():
                timer_name = (
                    f"callbacks.{callback.__class__.__name__}.on_training_epoch_start"
                )
                with self.prof.record_timing(timer_name):
                    epoch_start_hook = callback.on_training_epoch_start
                    # Hooks declared without parameters are the deprecated form.
                    if signature(epoch_start_hook).parameters:
                        epoch_start_hook(epoch_idx)
                    else:
                        logging.warning(
                            "on_training_epoch_start() without parameters is deprecated"
                            " since 0.17.8. Please add epoch_idx parameter."
                        )
                        epoch_start_hook()  # type: ignore[call-arg]

        self.context._loss_ids = {}

        train_kwargs = {"batch": batch, "epoch_idx": epoch_idx, "batch_idx": batch_idx}
        with self.prof.record_timing("train_batch", requires_sync=False):
            if self.context.profiler:
                # Run the batch under the configured profiler and advance it.
                with self.context.profiler as torch_profiler:
                    tr_metrics = self.trial.train_batch(**train_kwargs)
                    torch_profiler.step()
            else:
                tr_metrics = self.trial.train_batch(**train_kwargs)

        if self._should_update_scaler():
            self.context._scaler.update()

        # A bare tensor return is treated as the loss.
        if isinstance(tr_metrics, torch.Tensor):
            tr_metrics = {"loss": tr_metrics}
        check.is_instance(
            tr_metrics,
            dict,
            "train_batch() must return a dictionary "
            f"mapping string names to Tensor metrics, got {type(tr_metrics)}",
        )

        # Step learning rate of a pytorch.LRScheduler.
        with self.prof.record_timing("step_lr_schedulers"):
            for lr_scheduler in self.context.lr_schedulers:
                self._auto_step_lr_scheduler_per_batch(batch_idx, lr_scheduler)

        with self.prof.record_timing("from_device"):
            # Turn tensor metrics into NumPy values in place, so that
            # `det.util.encode_json` can handle them without depending on
            # PyTorch. Only existing keys are reassigned, which is safe while
            # iterating.
            for name, value in tr_metrics.items():
                if isinstance(value, torch.Tensor):
                    tr_metrics[name] = value.cpu().detach().numpy()

        batch_dur = time.time() - batch_start_time
        samples_per_second = batch_inputs / batch_dur * self.context.distributed.size
        self.prof.record_metric("samples_per_second", samples_per_second)
        per_batch_metrics.append(tr_metrics)

        if self.context.is_epoch_end():
            for callback in self.callbacks.values():
                timer_name = (
                    f"callbacks.{callback.__class__.__name__}.on_training_epoch_end"
                )
                with self.prof.record_timing(timer_name):
                    callback.on_training_epoch_end(epoch_idx)

    # Combine and average the per-batch metrics across all training processes.
    if self.context.distributed.size > 1 and self.context._average_training_metrics:
        with self.prof.record_timing("average_training_metrics"):
            per_batch_metrics = pytorch._combine_and_average_training_metrics(
                self.context.distributed, per_batch_metrics
            )
    num_inputs *= self.context.distributed.size
    metrics = det.util.make_metrics(num_inputs, per_batch_metrics)

    # Custom reducers bypass batch_metrics entirely; per-batch values are not
    # guaranteed to be meaningful for them.
    with self.prof.record_timing("reduce_metrics"):
        custom_reduced = self.context.reduce_metrics(for_training=True)
        metrics["avg_metrics"].update(pytorch._convert_metrics_to_numpy(custom_reduced))

    # Only the chief process reports training metrics.
    if self.is_chief:
        step_duration = time.time() - step_start_time
        logging.info(
            det.util.make_timing_log("trained", step_duration, num_inputs, num_batches)
        )
        return metrics
    return {}
def _train_for_step(self, step_id: int, num_batches: int,
                    total_batches_processed: int) -> workload.Response:
    """Train ``num_batches`` batches with DeepSpeed and return the step's metrics.

    DeepSpeed lets users configure train_batch_size,
    train_micro_batch_size_per_gpu, and gradient_accumulation_steps. They are
    tied together by
        train_batch_size = train_micro_batch_size * gradient_accumulation_steps
    so specifying any two determines the third.

    With pipeline parallel training, DeepSpeed interleaves
    gradient_accumulation_steps worth of micro batches inside a single
    train_batch/eval_batch call. With the default DeepSpeed model engine (no
    pipeline parallelism), the backward and optimizer step calls keep count of
    micro batches and update the model weights and lr scheduler whenever
    micro batches % gradient_accumulation_steps == 0.

    Because comparing runs with and without pipeline parallelism is a common
    goal, and DeepSpeed's PipelineEngine trains on gradient-accumulation-steps
    micro batches per call, gradient accumulation is performed automatically
    by default when pipeline parallelism is not enabled. That keeps the two
    setups comparable at a given batch idx. It can be turned off by setting
    context.disable_auto_grad_accumulation.

    Returns:
        ``{}`` on non-chief workers; on the chief, the dict built by
        ``det.util.make_metrics`` with custom-reducer results merged into its
        ``"avg_metrics"`` entry.

    Raises:
        det.errors.InvalidExperimentException: if ``train_batch`` returns
            something that is neither a tensor nor a dict on a
            metric-reporting rank.
    """
    self.prof.set_training(True)
    assert step_id > 0, "step_id should be greater than 0"
    step_start_time = time.time()
    self.context.reset_reducers()

    # Switch layers such as dropout to their training-time behavior.
    for model in self.context.models:
        model.train()

    per_batch_metrics = []  # type: List[Dict]
    num_inputs = 0

    first_batch_idx = total_batches_processed
    for batch_idx in range(first_batch_idx, first_batch_idx + num_batches):
        self.steps_completed += 1
        self.prof.update_batch_idx(batch_idx)
        batch_start_time = time.time()
        self.context._current_batch_idx = batch_idx

        if self.context.is_epoch_start():
            for callback in self.callbacks.values():
                timer_name = (
                    f"callbacks.{callback.__class__.__name__}.on_training_epoch_start"
                )
                with self.prof.record_timing(timer_name):
                    callback.on_training_epoch_start(self.get_epoch_idx(batch_idx))

        # The input count is an estimate: it is off when the user's data loader
        # does not yield batches of the micro batch size, and slightly off when
        # partial batches are possible. DeepSpeed's model engine makes the same
        # kind of assumptions in its own accounting and profiling.
        batch_inputs = (
            self.context.train_micro_batch_size_per_gpu
            * self.context.num_micro_batches_per_slot
        )
        num_inputs += batch_inputs

        # One train_batch call covers all micro batches under pipeline
        # parallelism or manual gradient accumulation; otherwise call it once
        # per micro batch.
        single_call = (
            self.context.use_pipeline_parallel or self.context._manual_grad_accumulation
        )
        num_train_batch_calls = (
            1 if single_call else self.context.num_micro_batches_per_slot
        )

        self.context._loss_ids = {}
        for _ in range(num_train_batch_calls):
            with self.prof.record_timing("train_batch", requires_sync=False, accumulate=True):
                tr_metrics = self.trial.train_batch(
                    self.training_iterator,
                    self.get_epoch_idx(batch_idx),
                    batch_idx,
                )
            if not self.context._mpu.should_report_metrics:
                continue

            # A bare tensor return is treated as the loss.
            if isinstance(tr_metrics, torch.Tensor):
                tr_metrics = {"loss": tr_metrics}
            if not isinstance(tr_metrics, dict):
                raise det.errors.InvalidExperimentException(
                    "train_batch must return a dictionary "
                    f"mapping string names to Tensor metrics, got {type(tr_metrics)}",
                )

            # Turn tensor metrics into NumPy values in place, so that
            # `det.util.encode_json` can handle them without depending on
            # PyTorch. Only existing keys are reassigned, which is safe while
            # iterating.
            for name, value in tr_metrics.items():
                if isinstance(value, torch.Tensor):
                    tr_metrics[name] = value.cpu().detach().numpy()
            per_batch_metrics.append(tr_metrics)

        # Sanity check: without pipeline parallelism, the engine must have
        # processed `num_micro_batches_per_slot` micro batches for this batch.
        model0 = self.context.models[0]
        if not isinstance(model0, deepspeed.PipelineEngine):
            assert (
                model0.micro_steps % self.context.num_micro_batches_per_slot == 0
            ), "did not train for gradient accumulation steps"

        batch_dur = time.time() - batch_start_time
        samples_per_second = (
            batch_inputs / batch_dur * self.context._mpu.data_parallel_world_size
        )
        self.prof.record_metric("samples_per_second", samples_per_second)

        if self.context.is_epoch_end():
            for callback in self.callbacks.values():
                timer_name = (
                    f"callbacks.{callback.__class__.__name__}.on_training_epoch_end"
                )
                with self.prof.record_timing(timer_name):
                    callback.on_training_epoch_end(self.get_epoch_idx(batch_idx))

    # Combine and average the per-batch metrics across all training processes.
    if self.context.distributed.size > 1 and self.context._average_training_metrics:
        with self.prof.record_timing("average_training_metrics"):
            per_batch_metrics = pytorch._combine_and_average_training_metrics(
                self.context.distributed, per_batch_metrics
            )
    num_inputs *= self.context._mpu.data_parallel_world_size
    metrics = det.util.make_metrics(num_inputs, per_batch_metrics)

    # Custom reducers bypass batch_metrics entirely; per-batch values are not
    # guaranteed to be meaningful for them.
    with self.prof.record_timing("reduce_metrics"):
        custom_reduced = self.context.reduce_metrics(for_training=True)
        metrics["avg_metrics"].update(pytorch._convert_metrics_to_numpy(custom_reduced))

    # Only the chief process reports training metrics.
    if not self.is_chief:
        return {}

    step_duration = time.time() - step_start_time
    logging.info(
        det.util.make_timing_log("trained", step_duration, num_inputs, num_batches)
    )
    # NOTE(review): set_training(False) is reached only on the chief; confirm
    # that non-chief workers are meant to stay in training mode.
    self.prof.set_training(False)
    self.metric_writer.on_train_step_end(
        self.steps_completed,
        metrics["avg_metrics"],
        metrics["batch_metrics"],
    )
    return metrics