def _mean_reduce(rank: int, world_size: int) -> None:
    _setup(rank, world_size)
    to_reduce = torch.tensor(rank + 1, dtype=torch.float).to(rank)
    actual = mean_reduce(to_reduce, world_size)
    # ranks hold the values 1..world_size, so the expected mean is (world_size + 1) / 2
    assert actual == torch.tensor((world_size + 1) / 2, dtype=torch.float).to(rank)
    _cleanup()
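# A minimal sketch (not from the original source) of the scaffolding the test
# above assumes: ``_setup``/``_cleanup`` for process-group management and a
# spawn-based driver. The address, port, and backend are assumptions here;
# ``nccl`` implies one GPU per process, matching the ``.to(rank)`` calls above.
import os

import torch.distributed as dist
import torch.multiprocessing as mp


def _setup(rank: int, world_size: int, backend: str = "nccl") -> None:
    # initialize the default process group for this worker
    os.environ["MASTER_ADDR"] = "127.0.0.1"
    os.environ["MASTER_PORT"] = "29500"
    dist.init_process_group(backend, rank=rank, world_size=world_size)


def _cleanup() -> None:
    # tear down the default process group
    dist.destroy_process_group()


def test_mean_reduce() -> None:
    # run ``_mean_reduce`` in two worker processes
    world_size = 2
    mp.spawn(_mean_reduce, args=(world_size,), nprocs=world_size, join=True)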
def mean_reduce_ddp_metrics(self, metrics: Dict) -> Dict:
    """Syncs ``metrics`` over ``world_size`` in the distributed mode."""
    metrics = {
        k: mean_reduce(
            torch.tensor(v, device=self.device),
            world_size=self.state.num_processes,
        )
        for k, v in metrics.items()
    }
    return metrics
def sync_tensor(self, tensor: torch.Tensor, mode: str):
    """Syncs ``tensor`` over ``world_size`` in distributed mode.

    Args:
        tensor: tensor to sync across the processes.
        mode: tensor synchronization type, should be one of ``"sum"`` or ``"mean"``.

    Returns:
        torch.Tensor with synchronized values.

    Raises:
        ValueError: if ``mode`` is not ``"sum"`` or ``"mean"``.
    """
    if mode not in {"sum", "mean"}:
        raise ValueError(f"Unknown sync mode '{mode}'")
    if mode == "sum":
        return sum_reduce(tensor)
    return mean_reduce(tensor, self.world_size)
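# A minimal sketch (not from the original source) of how the ``sum_reduce`` and
# ``mean_reduce`` helpers used by ``sync_tensor`` could be built on top of
# ``torch.distributed.all_reduce``; the exact library implementation may differ.
import torch
import torch.distributed as dist


def sum_reduce(tensor: torch.Tensor) -> torch.Tensor:
    # sum the tensor over all processes in the default process group
    cloned = tensor.clone()
    dist.all_reduce(cloned, op=dist.ReduceOp.SUM)
    return cloned


def mean_reduce(tensor: torch.Tensor, world_size: int) -> torch.Tensor:
    # sum over all processes, then divide by the number of processes
    reduced = sum_reduce(tensor)
    return reduced / world_size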
def mean_reduce_ddp_metrics(self, metrics: Dict) -> Dict:
    """Syncs ``metrics`` over ``world_size`` in the distributed mode."""
    if self.state.distributed_type in [
        DistributedType.MULTI_CPU,
        DistributedType.MULTI_GPU,
    ]:
        metrics = {
            k: mean_reduce(
                torch.tensor(v, device=self.device),
                world_size=self.state.num_processes,
            )
            for k, v in metrics.items()
        }
    elif self.state.distributed_type == DistributedType.TPU:
        # ``xm`` is ``torch_xla.core.xla_model`` and ``np`` is ``numpy``,
        # imported at module level.
        metrics = {
            k: xm.mesh_reduce(
                k, v.item() if isinstance(v, torch.Tensor) else v, np.mean
            )
            for k, v in metrics.items()
        }
    return metrics
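# A hypothetical usage example (not from the original source): averaging
# per-process epoch metrics before logging. ``engine`` stands in for an object
# exposing ``mean_reduce_ddp_metrics``; the metric names and values are
# illustrative only.
local_metrics = {"loss": 0.42, "accuracy": 0.87}  # values computed on this process
synced_metrics = engine.mean_reduce_ddp_metrics(local_metrics)
# every process now sees the same values, averaged over the participating processes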