Example #1
    def _test_all_gather_multigpu_helper(self, group, group_id, rank,
                                         rankToGPUMapping):
        for dest in group:
            tensors = [
                _build_tensor(dest + 1).cuda(device=i)
                for i in rankToGPUMapping[rank]
            ]

            # construct the expected output along with
            # placeholders to receive the all_gather results
            output_tensors = []
            expected_output = []
            output_per_gpu = [_build_tensor(dest + 1, -1)] * len(
                rankToGPUMapping[0]) * len(group)
            expected_per_gpu = [_build_tensor(dest + 1)] * len(
                rankToGPUMapping[0]) * len(group)
            for gpu in rankToGPUMapping[rank]:
                output_tensors.append(
                    [t.cuda(device=gpu) for t in output_per_gpu])
                expected_output.append(
                    [t.cuda(device=gpu) for t in expected_per_gpu])

            dist.all_gather_multigpu(output_tensors, tensors, group_id)
            self.assertEqual(output_tensors, expected_output)

        self._barrier()
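
The helper above encodes the shape contract of torch.distributed.all_gather_multigpu: with W ranks and G GPUs per rank, every rank passes G input tensors (one per local GPU) and G output lists, each holding W * G placeholder tensors. The standalone sketch below restates that contract outside the test harness; it is illustrative only, assumes an already-initialized NCCL process group in which every rank drives the same number of GPUs, and the function name gather_across_gpus is hypothetical.

import torch
import torch.distributed as dist

def gather_across_gpus(local_gpu_ids):
    rank = dist.get_rank()
    # one input tensor per local GPU, tagged with this rank's id
    input_tensor_list = [
        torch.full((4,), float(rank), device="cuda:%d" % gpu)
        for gpu in local_gpu_ids
    ]
    # one output list per local GPU; each list holds one placeholder slot
    # for every GPU participating in the gather (world_size * local GPUs)
    num_slots = dist.get_world_size() * len(local_gpu_ids)
    output_tensor_lists = [
        [torch.zeros(4, device="cuda:%d" % gpu) for _ in range(num_slots)]
        for gpu in local_gpu_ids
    ]
    dist.all_gather_multigpu(output_tensor_lists, input_tensor_list)
    return output_tensor_lists
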
Example #2
def all_gather_op(single_node_output):
    all_node_outputs = [
        torch.zeros_like(single_node_output).cuda() if torch.cuda.is_available()
        else torch.zeros_like(single_node_output)
        for rank in range(dist.get_world_size())
    ]
    if torch.cuda.is_available():
        single_node_output = single_node_output.cuda()
    dist.all_gather_multigpu(all_node_outputs, single_node_output)
    dist.barrier()
    return torch.cat(all_node_outputs)
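
Note that this wrapper passes a flat list of placeholders and a single input tensor, i.e. one tensor per process rather than the list-of-lists / list-of-tensors shapes used elsewhere on this page. Those flat shapes match torch.distributed.all_gather directly; a hedged equivalent using that collective is sketched below (the name all_gather_cat is hypothetical, and an initialized process group is assumed).

import torch
import torch.distributed as dist

def all_gather_cat(single_node_output):
    # move the local tensor to GPU when one is available
    if torch.cuda.is_available():
        single_node_output = single_node_output.cuda()
    # one placeholder per rank, allocated on the same device as the input
    placeholders = [
        torch.zeros_like(single_node_output)
        for _ in range(dist.get_world_size())
    ]
    dist.all_gather(placeholders, single_node_output)
    # concatenate every rank's contribution into a single tensor
    return torch.cat(placeholders)
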
Example #3
def all_gather_op(single_node_output, verbose=False):
    try:
        all_node_outputs = [
            torch.zeros_like(single_node_output).cuda() if torch.cuda.is_available()
            else torch.zeros_like(single_node_output)
            for rank in range(dist.get_world_size())
        ]
        if torch.cuda.is_available():
            single_node_output = single_node_output.cuda()
        dist.all_gather_multigpu(all_node_outputs, single_node_output)
        dist.barrier()
        return torch.cat(all_node_outputs)
    except AssertionError as e:
        if verbose:
            print(f'Distributed process group not initialized. Assuming 1 node. Error: {str(e)}')
        return single_node_output
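
A hypothetical single-process usage of the helper above: if no process group has been initialized, dist.get_world_size() raises and the except branch returns the local tensor unchanged, while under torchrun with an initialized group the call returns the concatenation of every rank's tensor. Note that recent PyTorch versions raise a RuntimeError rather than an AssertionError in the uninitialized case, so the except clause may need to be widened accordingly.

import torch

local_metrics = torch.tensor([0.25, 0.50])   # illustrative per-rank values
gathered = all_gather_op(local_metrics, verbose=True)
# single uninitialized process: `gathered` is just `local_metrics`
# initialized group of N ranks: `gathered` holds N * 2 values
print(gathered)
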
Example #4
    def _test_all_gather_multigpu_helper(self, group, group_id, rank,
                                         rank_to_GPU):
        for dest in group:
            tensors = [_build_tensor(dest + 1).cuda(device=i)
                       for i in rank_to_GPU[rank]]

            # construct the expected output along with
            # placeholders to receive the all_gather results
            output_tensors = []
            expected_output = []
            output_per_gpu = [_build_tensor(dest + 1, -1)] * len(rank_to_GPU[0]) * len(group)
            expected_per_gpu = [_build_tensor(dest + 1)] * len(rank_to_GPU[0]) * len(group)
            for gpu in rank_to_GPU[rank]:
                output_tensors.append([t.cuda(device=gpu) for t in output_per_gpu])
                expected_output.append([t.cuda(device=gpu) for t in expected_per_gpu])

            dist.all_gather_multigpu(output_tensors, tensors, group_id)
            self.assertEqual(output_tensors, expected_output)

        self._barrier()
Example #5
    def all_gather_multigpu(
        self, output_tensor_lists, input_tensor_list, async_op=False
    ):  # pragma: no cover
        return dist.all_gather_multigpu(
            output_tensor_lists, input_tensor_list, self.group, async_op
        )
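
For context, a hypothetical shape of the object this method lives on: a thin communicator that stores a process group as self.group (for example one created with dist.new_group()) and forwards collective calls to it. The class name Communicator and its constructor below are assumptions, not the original code.

import torch.distributed as dist

class Communicator:
    def __init__(self, ranks=None):
        # restrict collectives to a subset of ranks, or fall back to the
        # default WORLD group when no subset is given
        self.group = dist.new_group(ranks) if ranks is not None else dist.group.WORLD

    # ... the all_gather_multigpu wrapper shown above would be defined here,
    # forwarding to dist.all_gather_multigpu with self.group
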
Example #6
    assert_mean(tensor_list[1], 3.)

# ---------------- ALL_GATHER -----------------
# all_gather_multigpu semantics are quite complicated:
# https://pytorch.org/docs/stable/distributed.html#torch.distributed.all_gather_multigpu

tensor_list = get_tensor_list()
"""
Process 0: physical GPUs 0 and 1; output list device residence
    [[gpu0, gpu0, gpu0, gpu0], [gpu1, gpu1, gpu1, gpu1]]
    values [[1., 2., 3., 4.], [1., 2., 3., 4.]] from all GPUs across all procs
Process 1: physical GPUs 2 and 3; output list device residence
    [[gpu2, gpu2, gpu2, gpu2], [gpu3, gpu3, gpu3, gpu3]]
    values [[1., 2., 3., 4.], [1., 2., 3., 4.]] from all GPUs across all procs
"""
output_tensor_lists = [[new_tensor(i, value=0.) for _ in range(4)]
                       for i in local_gpu_ids]

dist.all_gather_multigpu(
    output_tensor_lists,
    tensor_list,
)
print('all_gather_multigpu rank ' + str(local_rank) + '\n' +
      '\n\t'.join(map(str, output_tensor_lists)))

for same_gpu_tensor_list in output_tensor_lists:
    assert_mean(same_gpu_tensor_list[0], 1.)
    assert_mean(same_gpu_tensor_list[1], 2.)
    assert_mean(same_gpu_tensor_list[2], 3.)
    assert_mean(same_gpu_tensor_list[3], 4.)
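
The snippet assumes helpers (get_tensor_list, new_tensor, assert_mean) and globals (local_rank, local_gpu_ids) defined earlier in the script. The definitions below are plausible reconstructions consistent with the all_gather assertions at the end, not the originals: two processes each drive two physical GPUs, and each input tensor is filled with global_gpu_index + 1.

import torch
import torch.distributed as dist

local_rank = dist.get_rank()                          # 0 or 1 in this two-process setup
local_gpu_ids = [2 * local_rank, 2 * local_rank + 1]  # proc 0 -> GPUs 0, 1; proc 1 -> GPUs 2, 3

def new_tensor(gpu_id, value):
    # small constant tensor placed on a specific physical GPU
    return torch.full((8,), value, device="cuda:%d" % gpu_id)

def get_tensor_list():
    # one input tensor per local GPU, valued 1..4 by global GPU index
    return [new_tensor(gpu, value=float(gpu + 1)) for gpu in local_gpu_ids]

def assert_mean(tensor, expected):
    assert abs(tensor.float().mean().item() - expected) < 1e-6
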