def async_send(self, tensors_compressed, name):
    """Broadcast each compressed tensor from every rank asynchronously.

    Returns a nested list of handles: one inner list per root rank,
    one handle per compressed tensor.
    """
    handles = []
    for root_rank in range(self.world_size):
        rank_handles = []
        for i, tensor_compressed in enumerate(tensors_compressed):
            # Name each op uniquely per (root_rank, tensor index) so that
            # concurrent broadcasts do not collide on the same key.
            rank_handles.append(
                broadcast_async(tensor_compressed, root_rank,
                                name + str(root_rank) + '_' + str(i)))
        handles.append(rank_handles)
    return handles
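# Hedged sketch (not part of the original snippet): one way the nested
# handle list from async_send could be drained. hvd.synchronize() is
# Horovod's standard call for waiting on an async handle; the method name
# wait_receive is a hypothetical placeholder, and `import horovod.torch
# as hvd` is assumed at module level.
def wait_receive(self, handles):
    outputs = []
    for rank_handles in handles:  # one inner list per root rank
        outputs.append([hvd.synchronize(h) for h in rank_handles])
    return outputs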
import torch
import horovod.torch as hvd


def test_allgather():
    torch.cuda.set_device(hvd.local_rank())
    rank = hvd.rank()
    tensor = torch.rand(10).float().cuda()
    print('rank: ', rank, ', tensor: ', tensor)
    # handle = hvd.allgather_async(tensor)
    # tensor = hvd.synchronize(handle)
    handle = hvd.broadcast_async(tensor, 0)
    # broadcast_async is not in-place: synchronize() returns the broadcast
    # result, so capture it instead of discarding it.
    tensor = hvd.synchronize(handle)
    print('---------')
    print('rank: ', rank, ', tensor: ', tensor)
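# Hedged sketch of a driver for the test above; hvd.init() must run before
# any collective op. Launch with one process per GPU, e.g.:
#   horovodrun -np 2 python test_broadcast.py
# (the file name is a hypothetical placeholder).
if __name__ == '__main__':
    hvd.init()
    test_allgather()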
def test_horovod_broadcast_duplicate_name_error(self):
    """Test that the broadcast raises an error if there are
    two concurrent operations with the same name."""
    hvd.init()
    size = hvd.size()

    # This test does not apply if there is only one worker.
    if size == 1:
        return

    dims = [17] * 3
    tensor = torch.FloatTensor(*dims)

    hvd.broadcast_async(tensor, root_rank=0, name='duplicate_name')
    try:
        for i in range(10):
            hvd.broadcast_async(tensor, root_rank=0, name='duplicate_name')
        assert False, 'hvd.broadcast_async did not throw error'
    except (torch.FatalError, ValueError):
        pass
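# Hedged sketch: the duplicate-name error above arises because Horovod keys
# in-flight operations by name, so the usual way to keep several broadcasts
# outstanding at once is to give each a unique name, mirroring the
# root_rank/index suffix scheme used in async_send. The 'bcast_' prefix is
# a hypothetical example.
handles = [hvd.broadcast_async(tensor, root_rank=0, name='bcast_' + str(i))
           for i in range(10)]
results = [hvd.synchronize(h) for h in handles]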
def broadcast(val, name=None):
    val = torch.tensor(val)
    # broadcast_async returns a handle; synchronize() waits for the op to
    # complete and returns the broadcast result, which the caller needs.
    handle = hvd.broadcast_async(val, root_rank=0, name=name)
    return hvd.synchronize(handle)
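# Hedged usage sketch: broadcasting a Python scalar from rank 0; every rank
# passes a value, but only rank 0's survives. The returned value is a torch
# tensor, so .item() recovers the scalar. The name 'initial_lr' is a
# hypothetical example.
lr = broadcast(0.01 if hvd.rank() == 0 else 0.0, name='initial_lr')
print('rank %d got lr=%f' % (hvd.rank(), lr.item()))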