Example #1
    def test_reduce_add_coalesced(self):
        numel = 5
        num_bytes = numel * 8
        tensors = [
            torch.randn(numel).long().cuda(),
            torch.randn(numel).cuda(),
            torch.randn(numel).long().cuda(),
            torch.randn(numel).long().cuda(),
            torch.randn(numel * 2).int().cuda(),  # int32 is half the width of int64
            torch.randn(numel).cuda(),
        ]
        dup_tensors = [tensors, list(map(lambda t: t.cuda(1), tensors))]

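        # reference result: reduce each pair of per-device tensors individually with reduce_add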
        r_tensors = list(map(comm.reduce_add, zip(*dup_tensors)))
        for r, t in zip(r_tensors, tensors):
            self.assertEqual(r.get_device(), t.get_device())
            self.assertEqual(r, t * 2)
            self.assertIsInstance(r, type(t))

        rc_tensors = comm.reduce_add_coalesced(dup_tensors,
                                               buffer_size=num_bytes * 5 // 2)
        self.assertEqual(r_tensors, rc_tensors)
        for r, rc in zip(r_tensors, rc_tensors):
            self.assertEqual(rc.get_device(), r.get_device())
            self.assertIsInstance(rc, type(r))
Example #2
 def backward(ctx, *inputs):
     inputs = [i.data for i in inputs]
     inputs = [inputs[i:i + ctx.num_inputs]
              for i in range(0, len(inputs), ctx.num_inputs)]
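     # sum each per-device group of gradients onto the first target GPU,
     # then broadcast the reduced tensors back to every device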
     results = comm.reduce_add_coalesced(inputs, ctx.target_gpus[0])
     outputs = comm.broadcast_coalesced(results, ctx.target_gpus)
     return (None,) + tuple([Variable(t) for tensors in outputs for t in tensors])
Example #3
    def collect_gradients(self):

        if self.ngradev > 1:
            # In case some parameters are not used during the forward pass on some GPUs,
            # use `p.data.new_zeros(p.data.size()) if p.grad is None else p.grad` instead
            # of `p.grad`; in most cases, however, leaving it as-is will warn you when a
            # parameter was missed in the forward computation.
            grads = comm.reduce_add_coalesced(
                [[p.grad for p in filter_para_grad(net.parameters())]
                 for net in self.nets[:self.ngradev]], self.output_device)
            # if self.ngradev > 1 else [p.grad for p in filter_para_grad(self.nets[0].parameters())]
            for mp, grad in zip(filter_para_grad(self.module.parameters()), grads):
                mp.grad = grad
Example #4
        def hook(*unused):
            # reduce gradients across devices on a single machine
            if len(self.device_ids) > 1:

                # collect gradients from all copies
                all_grads = [[] for _ in range(len(self._module_copies))]
                for dev_idx, module in enumerate(self._module_copies):
                    for p in module.parameters():
                        if not p.requires_grad or p.grad is None:
                            continue
                        all_grads[dev_idx].append(p.grad.data)

                # reduce grads
                reduced_grads = reduce_add_coalesced(
                    all_grads, self.output_device,
                    self.nccl_reduce_bucket_size)

                # update grads with reduced grads
                for grad, reduced in zip(all_grads[0], reduced_grads):
                    grad.copy_(reduced)

                # clear the gradients and parameters across all replicas
                for module in self._module_copies[1:]:
                    for param in module.parameters():
                        if param.requires_grad:
                            param.grad = None
                            param.data.set_()
Example #5
    def collect_gradients(self):

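        # replicas that never produced a gradient for a parameter contribute zeros,
        # which keeps the coalesced groups aligned across devices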
        grads = comm.reduce_add_coalesced([[
            p.data.new_zeros(p.data.size()) if p.grad is None else p.grad
            for p in net.parameters()
        ] for net in self.nets], self.output_device)
        for mp, grad in zip(self.module.parameters(), grads):
            mp.grad = grad
Example #6
    def forward(ctx, destination, num_inputs, *grads):
        ctx.target_gpus = [
            grads[i].get_device() for i in range(0, len(grads), num_inputs)
        ]

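        # regroup the flat gradient list into one chunk of num_inputs tensors per source device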
        grads = [
            grads[i:i + num_inputs] for i in range(0, len(grads), num_inputs)
        ]
        return comm.reduce_add_coalesced(grads, destination)
Example #7
 def forward(ctx, num_inputs, *inputs):
     ctx.num_inputs = num_inputs
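     # the device of each chunk's first tensor identifies its source GPU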
     ctx.target_gpus = [inputs[i].get_device() for i in range(0, len(inputs), num_inputs)]
     inputs = [inputs[i:i + num_inputs] for i in range(0, len(inputs), num_inputs)]
     # sort before reduce sum
     inputs = sorted(inputs, key=lambda i: i[0].get_device())
     results = comm.reduce_add_coalesced(inputs, ctx.target_gpus[0])
     outputs = comm.broadcast_coalesced(results, ctx.target_gpus)
     return tuple([t for tensors in outputs for t in tensors])
Example #8
File: base.py  Project: lgstd/transformer
    def collect_gradients_func(self, func):

        if self.ngradev > 1:
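            # func picks the sub-module of each replica whose gradients are summed onto the output device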
            grads = comm.reduce_add_coalesced(
                [[p.grad for p in filter_para_grad(func(net).parameters())]
                 for net in self.nets[:self.ngradev]], self.output_device)
            for mp, grad in zip(
                    filter_para_grad(func(self.module).parameters()), grads):
                mp.grad = grad
Example #9
    def _test_reduce_add_coalesced(self, tensors, buffer_size):
        dup_tensors = [tensors, list(map(lambda t: t.cuda(1), tensors))]

        r_tensors = list(map(comm.reduce_add, zip(*dup_tensors)))
        for r, t in zip(r_tensors, tensors):
            self.assertEqual(r.get_device(), t.get_device())
            self.assertEqual(r, t * 2)
            self.assertIsInstance(r, type(t))

        rc_tensors = comm.reduce_add_coalesced(dup_tensors, buffer_size=buffer_size)
        self.assertEqual(r_tensors, rc_tensors)
        for r, rc in zip(r_tensors, rc_tensors):
            self.assertEqual(rc.get_device(), r.get_device())
            self.assertIsInstance(rc, type(r))
Example #10
        def hook(*unused):
            # reduce gradients across devices on a single machine
            if len(self.device_ids) > 1:

                # collect gradients from all copies
                all_grads = [[] for _ in range(len(self._module_copies))]
                for dev_idx, module in enumerate(self._module_copies):
                    for p in module.parameters():
                        if not p.requires_grad or p.grad is None:
                            continue
                        all_grads[dev_idx].append(p.grad.data)

                # reduce grads
                reduced_grads = reduce_add_coalesced(
                    all_grads, self.output_device,
                    self.nccl_reduce_bucket_size)

                # update grads with reduced grads
                for grad, reduced in zip(all_grads[0], reduced_grads):
                    grad.copy_(reduced)

                # clear the gradients and parameters across all replicas
                for module in self._module_copies[1:]:
                    for param in module.parameters():
                        if param.requires_grad:
                            param.grad = None
                            with torch.no_grad():
                                param.set_()

            if self.nprocs_per_node > 1:
                grads = []
                for p in self.module.parameters():
                    if not p.requires_grad or p.grad is None:
                        continue
                    p.grad.data.div_(
                        self.nprocs_per_node_device.type(p.grad.data.dtype))
                    grads.append(p.grad.data)

                communication_op = functools.partial(
                    dist.all_reduce, group=self.local_node_group)
                communicate(grads, communication_op)

            # convert model back to ps-numerator
            self.ps_numerator()
Example #11
 def backward(self):
     group_params = [[(n, m[n]) for n, p in self.param_group
                      if m[n].grad is not None] for m in self.named_params]
     grad_params = [g for g in group_params if len(g) > 0]
     assert all([len(g) == len(grad_params[0])
                 for g in grad_params]), [len(g) for g in grad_params]
     grad = [[p.grad for n, p in g] for g in grad_params]
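     # sum the gradients of every group onto the target device using coalesced buffers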
     reduced_grad = reduce_add_coalesced(grad, self.device)
     grads = dict([(n, g)
                   for ((n, p), g) in zip(grad_params[0], reduced_grad)])
     l2norm = 0
     for n, p in self.param_group:
         if n in grads:
             p.grad = grads[n].float() if grads[n].dtype == torch.half else grads[n]
             l2norm += p.grad.norm().item()**2
         else:
             assert p.grad is None, n
     return l2norm
Example #12
 def backward(self, *grad_outputs):
     grad_outputs = [grad_outputs[i:i + self.num_inputs]
                     for i in range(0, len(grad_outputs), self.num_inputs)]
     return comm.reduce_add_coalesced(grad_outputs, self.input_device)
Example #13
    def forward(ctx, destination, num_inputs, *grads):
        ctx.target_gpus = [grads[i].get_device() for i in range(0, len(grads), num_inputs)]

        grads = [grads[i:i + num_inputs]
                 for i in range(0, len(grads), num_inputs)]
        return comm.reduce_add_coalesced(grads, destination)
Example #14
    def collect_gradients(self):

        if self.optm_splt is not None:
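            # parameters are partitioned across devices (self.optm_splt); each
            # slice is reduced onto the device that owns it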
            if self.is_contiguous_parameters:
                for i, (net, device) in enumerate(zip(self.nets, self.device_ids)):
                    _dev_grads = [[
                        para.grad
                        for para in get_contiguous_parameters_m(_net, index=i)
                    ] for _net in self.nets[:self.ngradev]]
                    if i > 0:
                        _dev_grads.insert(
                            0,
                            _dev_grads.pop(i) if i < self.ngradev else [
                                para.grad for para in
                                get_contiguous_parameters_m(net, index=i)
                            ])
                    _dev_grads = comm.reduce_add_coalesced(_dev_grads, device)
                    for mp, grad in zip(
                            get_contiguous_parameters_m(net, index=i),
                            _dev_grads):
                        mp.grad.copy_(grad)
            else:
                grads = [[
                    para.grad for para in filter_para_grad(net.parameters())
                ] for net in self.nets[:self.ngradev]]
                for i, (net, device, (lind, rind)) in enumerate(
                        zip(self.nets, self.device_ids, self.optm_splt)):
                    _dev_grads = [gradu[lind:rind] for gradu in grads]
                    if i > 0:
                        _dev_grads.insert(
                            0,
                            _dev_grads.pop(i) if i < self.ngradev else [
                                _pg.new_zeros(_pg.size(), device=device)
                                for _pg in _dev_grads[0]
                            ])
                    _dev_grads = comm.reduce_add_coalesced(_dev_grads, device)
                    for mp, grad in zip(
                            range_parameter_iter(net,
                                                 lind,
                                                 rind,
                                                 func=filter_para_grad_iter),
                            _dev_grads):
                        mp.grad = grad
        elif self.ngradev > 1:
            if self.is_contiguous_parameters:
                grads = comm.reduce_add_coalesced([[
                    para.grad for para in get_all_contiguous_parameters_m(net)
                ] for net in self.nets[:self.ngradev]], self.output_device)
                for mp, grad in zip(
                        get_all_contiguous_parameters_m(self.module), grads):
                    mp.grad.copy_(grad)
            else:
                # In case some parameters are not used during the forward pass on some GPUs,
                # use `p.data.new_zeros(p.data.size()) if p.grad is None else p.grad` instead
                # of `p.grad`; in most cases, however, leaving it as-is will warn you when a
                # parameter was missed in the forward computation.
                grads = comm.reduce_add_coalesced(
                    [[para.grad for para in filter_para_grad(net.parameters())]
                     for net in self.nets[:self.ngradev]], self.output_device)
                # if self.ngradev > 1 else [p.grad for p in filter_para_grad(self.nets[0].parameters())]
                for mp, grad in zip(filter_para_grad(self.module.parameters()),
                                    grads):
                    mp.grad = grad