def _average_training_metrics(
    self, per_batch_metrics: List[Dict[str, Any]]
) -> List[Dict[str, Any]]:
    """Average training metrics across GPUs"""
    check.true(self.hvd_config.use, "Can only average training metrics in multi-GPU training.")
    metrics_timeseries = util._list_to_dict(per_batch_metrics)

    # combined_timeseries is: dict[metric_name] -> 2d-array.
    # A measurement is accessed via combined_timeseries[metric_name][process_idx][batch_idx].
    combined_timeseries, _ = self._combine_metrics_across_processes(
        metrics_timeseries, num_batches=len(per_batch_metrics)
    )

    # Averaging collapses a single-element array into a bare scalar. Record which
    # metrics arrived as arrays so the chief can re-wrap the averaged values and
    # stay perfectly compatible with the non-averaging codepath.
    array_metrics = [
        name
        for name, value in per_batch_metrics[0].items()
        if isinstance(value, np.ndarray)
    ]

    if self.is_chief:
        # Only the chief holds the gathered metrics, so the cast is safe here.
        combined_timeseries = cast(Dict[str, List[List[Any]]], combined_timeseries)
        total_batches = len(per_batch_metrics)
        world_size = hvd.size()
        averaged: Dict[str, List] = {}
        for name in combined_timeseries.keys():
            averaged[name] = []
            for batch_idx in range(total_batches):
                # Collect this batch's measurement from every process, then
                # average only the non-None entries.
                gathered = np.array(
                    [
                        combined_timeseries[name][rank][batch_idx]
                        for rank in range(world_size)
                    ]
                )
                mean_value = np.mean(gathered[gathered != None])  # noqa: E711
                if name in array_metrics:
                    mean_value = np.array(mean_value)
                averaged[name].append(mean_value)
        per_batch_metrics = util._dict_to_list(averaged)
    return per_batch_metrics
def _combine_and_average_training_metrics(
    context: det.core.DistributedContext, per_batch_metrics: List[Dict[str, Any]]
) -> List[Dict[str, Any]]:
    """Gather per-batch training metrics from all ranks and average them on the chief."""
    assert context.size > 1, "Can only average training metrics in multi-GPU training."
    timeseries_by_name = util._list_to_dict(per_batch_metrics)

    # Gather metrics across ranks onto rank 0. The gathered structure is
    # dict[metric_name] -> 2d-array, indexed as
    # gathered_series[metric_name][process_idx][batch_idx].
    gathered_series, gathered_counts = _combine_metrics_across_processes(
        context, timeseries_by_name, num_batches=len(per_batch_metrics)
    )

    if context.rank != 0:
        # Non-chief ranks hand their metrics back untouched; only rank 0 averages.
        return per_batch_metrics

    # The casts are safe because the gather above materializes real values only
    # on the chief, which is where we are now.
    gathered_series = cast(Dict[str, List[List[Any]]], gathered_series)
    gathered_counts = cast(List[int], gathered_counts)
    return _average_training_metrics(gathered_series, gathered_counts)
def test_list_to_dict() -> None:
    """_list_to_dict merges a list of dicts into one dict of per-key value lists."""
    merged = _list_to_dict([{"a": 1}, {"b": 2}, {"a": 2}])
    assert merged == {"a": [1, 2], "b": [2]}