Example #1
    def async_send(self, tensors_compressed, name):
        # Broadcast every compressed tensor from every rank, so that each
        # worker ends up with all ranks' tensors (an allgather built from
        # asynchronous broadcasts).
        handles = []
        for root_rank in range(self.world_size):
            rank_handles = []
            for i, tensor_compressed in enumerate(tensors_compressed):
                # Give each (root_rank, tensor index) pair a unique name so
                # Horovod can tell the concurrent broadcasts apart.
                rank_handles.append(
                    broadcast_async(tensor_compressed, root_rank,
                                    name + str(root_rank) + '_' + str(i)))
            handles.append(rank_handles)
        return handles
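
The handles returned above still have to be synchronized before the broadcast results can be used. A minimal companion sketch, not part of the original listing, assuming the method name wait_receive and an import of synchronize from horovod.torch alongside the broadcast_async import the method already relies on:

    def wait_receive(self, handles):
        # synchronize(handle) blocks until the broadcast completes and
        # returns the output tensor of the matching broadcast_async call.
        return [[synchronize(h) for h in rank_handles]
                for rank_handles in handles]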
Example #2
import torch
import horovod.torch as hvd


def test_allgather():
    # Assumes hvd.init() has already been called by the test runner.
    torch.cuda.set_device(hvd.local_rank())
    rank = hvd.rank()
    tensor = torch.rand(10).float().cuda()
    print('rank: ', rank, ', tensor: ', tensor)
    #handle = hvd.allgather_async(tensor)
    #tensor = hvd.synchronize(handle)
    handle = hvd.broadcast_async(tensor, 0)
    # broadcast_async is not in-place; synchronize() returns the broadcast result.
    tensor = hvd.synchronize(handle)
    print('---------')
    print('rank: ', rank, ', tensor: ', tensor)
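
Horovod also provides an in-place variant, hvd.broadcast_async_, which writes the result back into the input tensor, so no reassignment is needed. A minimal sketch under the same setup as the test above:

    handle = hvd.broadcast_async_(tensor, 0)
    hvd.synchronize(handle)  # tensor now holds rank 0's values on every worker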
Example #3
    def test_horovod_broadcast_duplicate_name_error(self):
        """Test that the broadcast raises an error if there are
        two concurrent operations with the same name."""
        hvd.init()
        size = hvd.size()

        # This test does not apply if there is only one worker.
        if size == 1:
            return

        dims = [17] * 3
        tensor = torch.FloatTensor(*dims)

        hvd.broadcast_async(tensor, root_rank=0, name='duplicate_name')
        try:
            for i in range(10):
                hvd.broadcast_async(tensor, root_rank=0, name='duplicate_name')
            assert False, 'hvd.broadcast_async did not throw error'
        except (torch.FatalError, ValueError):
            pass
def broadcast(val, name=None):
    # Wrap the value in a tensor, broadcast rank 0's copy, and return the result.
    val = torch.tensor(val)
    handle = hvd.broadcast_async(val, root_rank=0, name=name)
    return hvd.synchronize(handle)
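
A hypothetical call site for the helper above (the variable and name string are illustrative, not from the original listing); every worker passes its local value and gets rank 0's value back as a tensor:

    start_epoch = broadcast(start_epoch, name='start_epoch').item()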