def _combine_metrics_across_processes(
    self, metrics: Dict[str, Any], num_batches: int
) -> Tuple[Optional[Dict[str, Any]], Optional[List[int]]]:
    """Collect the metrics of every training process on the chief.

    ``check.true`` is first applied to ``self.hvd_config.use``, so this is
    only meant to run when that flag is set.

    Args:
        metrics: This process's metrics, keyed by metric name.
        num_batches: The batch count reported by this process.

    Returns:
        On the chief, a ``(metrics_lists, batches_per_process)`` pair:
        ``metrics_lists`` maps each of the chief's metric names to a list
        holding the chief's value followed by one value per worker, and
        ``batches_per_process`` holds the chief's ``num_batches`` followed by
        one count per worker. Worker entries keep the order in which the
        barrier returned them. On any other process, ``(None, None)``.
    """
    check.true(self.hvd_config.use)

    if not self.is_chief:
        # Workers only ship their numbers to the chief; they get nothing back.
        self.train_process_comm_worker = cast(ipc.ZMQClient, self.train_process_comm_worker)
        report = ipc.MetricsInfo(metrics=metrics, num_batches=num_batches)
        self.train_process_comm_worker.barrier(message=report)
        return None, None

    # The chief receives the metric from every other training process.
    self.train_process_comm_chief = cast(ipc.ZMQServer, self.train_process_comm_chief)
    num_workers = hvd.size() - 1
    gathered = cast(
        List[ipc.MetricsInfo],
        self.train_process_comm_chief.barrier(num_connections=num_workers),
    )

    # The chief's own entry always comes first, then the workers' entries.
    metrics_lists = {
        name: [own_value] + [peer.metrics[name] for peer in gathered]
        for name, own_value in metrics.items()
    }  # type: Dict[str, Any]
    batches_per_process = [num_batches] + [
        peer.num_batches for peer in gathered
    ]  # type: List[int]
    return metrics_lists, batches_per_process
def _combine_metrics_across_processes(
    self, metrics: Dict[str, Any], num_batches: int
) -> Tuple[Optional[Dict[str, Any]], Optional[List[int]]]:
    """Collect the metrics of every training process on the chief.

    ``check.true`` is first applied to ``self.hvd_config.use``, so this is
    only meant to run when that flag is set.

    Args:
        metrics: This process's metrics, keyed by metric name.
        num_batches: The batch count reported by this process.

    Returns:
        On the chief, a ``(metrics_lists, batches_per_process)`` pair:
        ``metrics_lists`` maps each of the chief's metric names to a list
        holding the chief's value followed by one value per worker, and
        ``batches_per_process`` holds the chief's ``num_batches`` followed by
        one count per worker. Worker entries keep the order in which the
        gather returned them. On any other process, ``(None, None)``.
    """
    check.true(self.hvd_config.use)

    if not self.is_chief:
        self.train_process_comm_worker = cast(
            ipc.ZMQBroadcastClient, self.train_process_comm_worker
        )
        report = ipc.MetricsInfo(metrics=metrics, num_batches=num_batches)
        self.train_process_comm_worker.send(report)
        # Synchronize with the chief so that there is no risk of accidentally calling send()
        # for a future gather before all workers have called send() on this gather.
        self.train_process_comm_worker.recv()
        return None, None

    # The chief receives the metric from every other training process.
    self.train_process_comm_chief = cast(ipc.ZMQBroadcastServer, self.train_process_comm_chief)
    # A no-op callable is passed to the gather; its second return value is unused here.
    gathered, _ = self.train_process_comm_chief.gather_with_polling(lambda: None)
    # Answer the recv() each worker issues after its send(); the payload itself is ignored.
    self.train_process_comm_chief.broadcast(None)
    gathered = cast(List[ipc.MetricsInfo], gathered)

    # The chief's own entry always comes first, then the workers' entries.
    metrics_lists = {
        name: [own_value] + [peer.metrics[name] for peer in gathered]
        for name, own_value in metrics.items()
    }  # type: Dict[str, Any]
    batches_per_process = [num_batches] + [
        peer.num_batches for peer in gathered
    ]  # type: List[int]
    return metrics_lists, batches_per_process