def _sum_reduce(rank: int, world_size: int) -> None:
    """Worker process: check that ``sum_reduce`` sums values across ranks.

    Each process contributes ``rank + 1``, so the reduced result must equal
    1 + 2 + ... + world_size = world_size * (world_size + 1) / 2.
    """
    _setup(rank, world_size)
    contribution = torch.tensor(rank + 1, dtype=torch.float).to(rank)
    reduced = sum_reduce(contribution)
    expected = torch.tensor(
        (world_size * (world_size + 1)) // 2, dtype=torch.float
    ).to(rank)
    assert reduced == expected
    _cleanup()
def sync_tensor(self, tensor: torch.Tensor, mode: str = "mean") -> torch.Tensor:
    """Syncs ``tensor`` over ``world_size`` in distributed mode.

    Args:
        tensor: tensor to sync across the processes.
        mode: tensor synchronization type, should be one of 'sum' or 'mean'.
            Default is 'mean'.

    Returns:
        torch.Tensor with synchronized values.

    Raises:
        ValueError: if mode is out of ``sum`` or ``mean``
    """
    # Fix: the docstring promised a default of 'mean', but the parameter
    # previously had no default — callers had to pass mode explicitly.
    # Validate early so an unsupported mode fails loudly instead of
    # silently reaching one of the reduce helpers.
    if mode not in {"sum", "mean"}:
        raise ValueError(f"Unknown sync_type '{mode}'")
    if mode == "sum":
        return sum_reduce(tensor)
    # mode == "mean": sum across processes, then divide by the world size.
    return mean_reduce(tensor, self.world_size)