def test_reducer() -> None:
    metrics = np.array([0.25, 0.5, 0.75, 1, 25.5, 1.9])
    assert np.around(_reduce_metrics(Reducer.AVG, metrics), decimals=2) == 4.98
    assert _reduce_metrics(Reducer.SUM, metrics) == 29.9
    assert _reduce_metrics(Reducer.MIN, metrics) == 0.25
    assert _reduce_metrics(Reducer.MAX, metrics) == 25.5

    batches_per_process = [1, 2, 5, 4, 5, 6]
    assert (
        np.around(_reduce_metrics(Reducer.AVG, metrics, batches_per_process), decimals=2) == 6.43
    )
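
# Illustrative only: a minimal sketch of the reduction behaviour the test above
# exercises, assuming Reducer.AVG is an unweighted mean unless per-process batch
# counts are supplied (in which case it is a batch-weighted mean), while SUM, MIN,
# and MAX ignore the counts. This stand-in (_reduce_metrics_sketch) is hypothetical
# and is not the library's actual pytorch._reduce_metrics implementation.
def _reduce_metrics_sketch(reducer, metrics, num_batches=None):
    if reducer == Reducer.AVG:
        if num_batches is not None:
            # Weight each per-process metric by the number of batches it covers.
            weights = np.asarray(num_batches, dtype=float)
            return float(np.sum(metrics * weights) / np.sum(weights))
        return float(np.mean(metrics))
    if reducer == Reducer.SUM:
        return float(np.sum(metrics))
    if reducer == Reducer.MIN:
        return float(np.min(metrics))
    return float(np.max(metrics))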
def _reduce_metrics(
    self, batch_metrics: List, keys: Any, metrics_reducers: Dict[str, pytorch.Reducer]
) -> Dict[str, Any]:
    metrics = {
        name: pytorch._reduce_metrics(
            reducer=metrics_reducers[name],
            metrics=np.stack([b[name] for b in batch_metrics], axis=0),
            num_batches=None,
        )
        for name in keys or []
    }

    if self.hvd_config.use:
        # If using horovod, combine metrics across all processes.
        # Only the chief process will receive all the metrics.
        self.validation_loader = cast(torch.utils.data.DataLoader, self.validation_loader)
        num_batches = len(self.validation_loader)
        combined_metrics, batches_per_process = self._combine_metrics_across_processes(
            metrics, num_batches
        )
        if self.is_chief:
            # Only the chief collects all the metrics.
            combined_metrics = self._convert_metrics_to_numpy(
                cast(Dict[str, Any], combined_metrics)
            )
            metrics = {
                name: pytorch._reduce_metrics(
                    reducer=metrics_reducers[name],
                    metrics=combined_metrics[name],
                    num_batches=batches_per_process,
                )
                for name in keys or []
            }
        else:
            return {}

    return metrics
def _compute_validation_metrics(self) -> workload.Response:
    self.context.reset_reducers()
    # Set the behavior of certain layers (e.g., dropout) that are
    # different between training and inference.
    for model in self.context.models:
        model.eval()

    step_start_time = time.time()

    for callback in self.callbacks.values():
        if util.is_overridden(callback.on_validation_step_start, pytorch.PyTorchCallback):
            logging.warning(
                "on_validation_step_start is now deprecated, "
                "please use on_validation_start instead"
            )
            callback.on_validation_step_start()

    for callback in self.callbacks.values():
        callback.on_validation_start()

    num_inputs = 0
    metrics = {}  # type: Dict[str, Any]

    if self._evaluate_batch_defined():
        keys = None
        batch_metrics = []

        self.validation_loader = cast(torch.utils.data.DataLoader, self.validation_loader)
        check.gt(len(self.validation_loader), 0)
        for callback in self.callbacks.values():
            callback.on_validation_epoch_start()

        for idx, batch in enumerate(self.validation_loader):
            if self.context.experimental._auto_to_device:
                batch = self.context.to_device(batch)
            num_inputs += self.trial.get_batch_length(batch)

            if has_param(self.trial.evaluate_batch, "batch_idx", 2):
                vld_metrics = self.trial.evaluate_batch(batch=batch, batch_idx=idx)
            else:
                vld_metrics = self.trial.evaluate_batch(batch=batch)  # type: ignore

            # Verify validation metric names are the same across batches.
            if keys is None:
                keys = vld_metrics.keys()
            else:
                check.eq(
                    keys,
                    vld_metrics.keys(),
                    "Validation metric names must match across all batches of data.",
                )
            check.is_instance(
                vld_metrics,
                dict,
                "validation_metrics() must return a "
                "dictionary of string names to Tensor "
                "metrics",
            )

            # TODO: For performance perform -> cpu() only at the end of validation.
            batch_metrics.append(pytorch._convert_metrics_to_numpy(vld_metrics))
            if self.env.test_mode:
                break

        for callback in self.callbacks.values():
            callback.on_validation_epoch_end(batch_metrics)

        metrics = pytorch._reduce_metrics(
            self.context.distributed,
            batch_metrics=batch_metrics,
            keys=keys,
            metrics_reducers=pytorch._prepare_metrics_reducers(
                self.trial.evaluation_reducer(), keys=keys
            ),
        )

        # Gather a list of per-worker (num_inputs, num_batches) tuples.
        input_counts = self.context.distributed.gather((num_inputs, idx + 1))
        if self.context.distributed.rank == 0:
            assert input_counts is not None
            # Reshape and sum.
            num_inputs, num_batches = [sum(n) for n in zip(*input_counts)]

    else:
        check.true(self._evaluate_full_dataset_defined())
        self.validation_loader = cast(torch.utils.data.DataLoader, self.validation_loader)
        if self.is_chief:
            metrics = self.trial.evaluate_full_dataset(data_loader=self.validation_loader)

            check.is_instance(
                metrics, dict, f"eval() must return a dictionary, got {type(metrics)}."
            )

            metrics = pytorch._convert_metrics_to_numpy(metrics)
            num_inputs = self.context.get_per_slot_batch_size() * len(self.validation_loader)

    metrics.update(
        pytorch._convert_metrics_to_numpy(self.context.reduce_metrics(for_training=False))
    )

    if self.context.distributed.size > 1 and any(
        map(
            lambda c: util.is_overridden(c.on_validation_end, pytorch.PyTorchCallback)
            or util.is_overridden(c.on_validation_step_end, pytorch.PyTorchCallback),
            self.callbacks.values(),
        )
    ):
        logging.debug(
            "Broadcasting metrics to all worker processes to execute a "
            "validation step end callback"
        )
        metrics = hvd.broadcast_object(metrics, root_rank=0)

    for callback in self.callbacks.values():
        if util.is_overridden(callback.on_validation_step_end, pytorch.PyTorchCallback):
            logging.warning(
                "on_validation_step_end is now deprecated, please use on_validation_end instead"
            )
            callback.on_validation_step_end(metrics)

    for callback in self.callbacks.values():
        callback.on_validation_end(metrics)

    if not self.is_chief:
        return {}

    # Skip reporting timings if evaluate_full_dataset() was defined. This is far less common
    # than evaluate_batch() and we can't know how the user processed their validation data.
    if self._evaluate_batch_defined():
        step_duration = time.time() - step_start_time
        logging.info(
            det.util.make_timing_log("validated", step_duration, num_inputs, num_batches)
        )

    return {"num_inputs": num_inputs, "validation_metrics": metrics}
def _compute_validation_metrics(self) -> workload.Response:
    self.context.reset_reducers()
    # Set the behavior of certain layers (e.g., dropout) that are
    # different between training and inference.
    for model in self.context.models:
        model.eval()

    step_start_time = time.time()

    for callback in self.callbacks.values():
        if util.is_overridden(callback.on_validation_step_start, pytorch.PyTorchCallback):
            logging.warning(
                "on_validation_step_start is now deprecated, "
                "please use on_validation_start instead"
            )
            callback.on_validation_step_start()

    for callback in self.callbacks.values():
        callback.on_validation_start()

    num_inputs = 0
    keys = None
    batch_metrics = []

    for callback in self.callbacks.values():
        callback.on_validation_epoch_start()

    validation_iterator = iter(self.validation_loader) if self.validation_loader else None
    for idx in range(cast(int, self.num_validation_batches)):
        num_inputs += cast(int, self.validation_batch_size)

        # Note that when using pipeline parallelism, each call to evaluate_batch will request
        # self.context.num_micro_batches_per_slot batches from the validation iterator.
        # This is why we set self.num_validation_batches differently for pipeline parallel
        # and no pipeline parallel when building the data loaders.
        vld_metrics = self.trial.evaluate_batch(validation_iterator, idx)

        if self.context._mpu.should_report_metrics:
            if not isinstance(vld_metrics, dict):
                raise det.errors.InvalidExperimentException(
                    "evaluate_batch must return a dictionary of string names "
                    "to Tensor metrics",
                )

            # Verify validation metric names are the same across batches.
            if keys is None:
                keys = vld_metrics.keys()
            else:
                if keys != vld_metrics.keys():
                    raise det.errors.InvalidExperimentException(
                        "Validation metric names must match across all batches of data.",
                    )

            # TODO: For performance perform -> cpu() only at the end of validation.
            batch_metrics.append(pytorch._convert_metrics_to_numpy(vld_metrics))

        if self.env.test_mode:
            break

    # keys and list(keys) does not satisfy all cases because it will return dict_keys type if
    # keys is an empty dict. This will then break when passed to zmq_broadcast since it does
    # not know how to serialize dict_keys type.
    all_keys = self.context.distributed.gather(keys if keys is None else list(keys))
    if self.is_chief:
        all_keys = [k for k in all_keys if k is not None]
        keys = all_keys[0]
    keys = self.context.distributed.broadcast(keys)

    for callback in self.callbacks.values():
        callback.on_validation_epoch_end(batch_metrics)

    metrics = pytorch._reduce_metrics(
        self.context.distributed,
        batch_metrics=batch_metrics,
        keys=keys,
        metrics_reducers=pytorch._prepare_metrics_reducers(pytorch.Reducer.AVG, keys=keys),
    )
    metrics.update(
        pytorch._convert_metrics_to_numpy(self.context.reduce_metrics(for_training=False))
    )

    if self.context.distributed.size > 1 and any(
        util.is_overridden(c.on_validation_end, pytorch.PyTorchCallback)
        or util.is_overridden(c.on_validation_step_end, pytorch.PyTorchCallback)
        for c in self.callbacks.values()
    ):
        logging.debug(
            "Broadcasting metrics to all worker processes to execute a "
            "validation step end callback"
        )
        metrics = self.context.distributed.broadcast(metrics)

    for callback in self.callbacks.values():
        if util.is_overridden(callback.on_validation_step_end, pytorch.PyTorchCallback):
            logging.warning(
                "on_validation_step_end is now deprecated, please use on_validation_end instead"
            )
            callback.on_validation_step_end(metrics)

    for callback in self.callbacks.values():
        callback.on_validation_end(metrics)

    if not self.is_chief:
        return {}

    num_inputs *= self.context._mpu.data_parallel_world_size
    step_duration = time.time() - step_start_time
    logging.info(
        det.util.make_timing_log(
            "validated", step_duration, num_inputs, cast(int, self.num_validation_batches)
        )
    )

    self.metric_writer.on_validation_step_end(self.steps_completed, metrics)

    return {"num_inputs": num_inputs, "validation_metrics": metrics}