Example #1
    def allreduce_bucket(self, bucket):
        # Coalesce the bucket of gradient tensors into one contiguous buffer.
        tensor = flatten(bucket)

        tensor_to_allreduce = tensor

        # Optionally run the reduction in fp32 for numerical stability.
        if self.allreduce_always_fp32():
            tensor_to_allreduce = tensor.float()

        if self.postscale_gradients():
            # Pre-divide to keep the summed values small, then rescale
            # relative to the data-parallel world size after the reduction.
            if self.gradient_predivide_factor != 1.0:
                tensor_to_allreduce.mul_(1. / self.gradient_predivide_factor)

            dist.all_reduce(tensor_to_allreduce, group=self.data_parallel_group)

            if self.gradient_average:
                if self.gradient_predivide_factor != self.dp_world_size:
                    tensor_to_allreduce.mul_(self.gradient_predivide_factor /
                                             self.dp_world_size)
        else:
            # Average by pre-dividing before the reduction instead.
            tensor_to_allreduce.div_(self.dp_world_size)
            dist.all_reduce(tensor_to_allreduce, group=self.data_parallel_group)

        # Copy the fp32 result back into the original (possibly lower-precision) buffer.
        if self.allreduce_always_fp32() and tensor is not tensor_to_allreduce:
            tensor.copy_(tensor_to_allreduce)

        return tensor
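
For reference, a minimal standalone sketch of the same flatten, all-reduce, copy-back pattern, assuming torch.distributed has already been initialized with init_process_group; the helper name allreduce_and_average and the torch._utils flatten utilities are illustrative choices, not part of the example above.

import torch.distributed as dist
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors

def allreduce_and_average(grads, group=None):
    # Coalesce the per-parameter gradients into one contiguous buffer.
    flat = _flatten_dense_tensors(grads)
    # Sum across ranks, then divide to obtain the average.
    dist.all_reduce(flat, group=group)
    flat.div_(dist.get_world_size(group=group))
    # Scatter the averaged values back into the original gradient tensors.
    for grad, reduced in zip(grads, _unflatten_dense_tensors(flat, grads)):
        grad.copy_(reduced)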
Example #2
    def synchronize(self):
        synced = False
        if self.count_down == 0:
            # Launch all-reduces for any gradients whose hooks never fired.
            missing_p = self._requires_update - set(self._handles.keys())
            for p in missing_p:
                self._allreduce_tensor(p)

            if self._multi_node:
                # Wait on each outstanding asynchronous handle, decompress the
                # result, and average over the accumulation steps.
                for p, value in self._handles.items():
                    handle, ctx = value
                    output = synchronize(handle)
                    p.grad.set_(
                        self._compression.decompress(output, ctx) /
                        self.accumulation_step)
            else:
                # Single node: bucket tensors by type, all-reduce each bucket
                # as one coalesced buffer, then copy the averages back.
                buckets = OrderedDict()
                for tensor in self._handles.values():
                    tp = tensor.type()
                    if tp not in buckets:
                        buckets[tp] = []
                    buckets[tp].append(tensor)
                for tp in buckets:
                    bucket = buckets[tp]
                    coalesced = flatten(
                        bucket) / self.world_size / self.accumulation_step
                    torch.distributed.all_reduce_multigpu([coalesced])
                    for buf, reduced in zip(bucket,
                                            unflatten(coalesced, bucket)):
                        buf.copy_(reduced)
            self._handles.clear()
            synced = True
            self.count_down = self.accumulation_step

        self.count_down -= 1
        return synced
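
For context, a hedged sketch of how a synchronize() of this shape is usually driven under gradient accumulation; the training-loop names (loader, model, criterion, wrapper, optimizer) are assumptions, not taken from the example.

# Hypothetical driver loop: the wrapper counts calls and only reduces once
# every `accumulation_step` micro-batches, returning True on the call that reduced.
for inputs, targets in loader:
    loss = criterion(model(inputs), targets)
    loss.backward()                      # gradients accumulate locally between reductions
    if wrapper.synchronize():            # True only when the all-reduce actually ran
        optimizer.step()
        optimizer.zero_grad()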
Example #3
 def step_maybe_fp16_maybe_distributed(optim):
     if args.use_fp16:
         if args.distributed:
             for flat_master, allreduce_buffer in zip(
                     flat_master_buckets, ssd300.allreduce_buffers):
                 if allreduce_buffer is None:
                     raise RuntimeError("allreduce_buffer is None")
                 flat_master.grad = allreduce_buffer.float()
                 flat_master.grad.data.mul_(1. / static_loss_scale)
         else:
             for flat_master, model_bucket in zip(flat_master_buckets,
                                                  model_buckets):
                 flat_grad = apex_C.flatten(
                     [m.grad.data for m in model_bucket])
                 flat_master.grad = flat_grad.float()
                 flat_master.grad.data.mul_(1. / static_loss_scale)
     optim.step()
     if args.use_fp16:
         # Use multi-tensor scale instead of loop & individual parameter copies
         for model_bucket, flat_master in zip(model_buckets,
                                              flat_master_buckets):
             multi_tensor_applier(
                 amp_C.multi_tensor_scale, dummy_overflow_buf, [
                     apex_C.unflatten(flat_master.data, model_bucket),
                     model_bucket
                 ], 1.0)
Example #4
 def _get_flat_grads(self, out=None, has_grad=True):
     grads = self._get_grads(has_grad)
     # The commented-out block below is the manual flattening that
     # apex_C.flatten performs in a single fused call.
     #if out is None:
     #    grads_size = sum(g.numel() for g in grads)
     #    out = grads[0].new(grads_size).zero_()
     #offset = 0
     #for g in grads:
     #    numel = g.numel()
     #    out[offset:offset+numel].copy_(g.view(-1))
     #    offset += numel
     #return out[:offset]
     return apex_C.flatten(grads)
Example #5
def apply_flat_dist_call(bucket, call, extra_args=None):
    # Run a single collective on one coalesced buffer instead of one call per tensor.
    coalesced = flatten(bucket)

    if extra_args is not None:
        call(coalesced, *extra_args)
    else:
        call(coalesced)

    # all_reduce sums across ranks, so divide to get the average.
    if call is dist.all_reduce:
        coalesced /= dist.get_world_size()

    # Copy the result back into the original tensors.
    for buf, synced in zip(bucket, unflatten(coalesced, bucket)):
        buf.copy_(synced)
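
A hedged usage sketch for apply_flat_dist_call: group parameters by tensor type so each flattened buffer stays homogeneous, then broadcast every bucket from rank 0. The broadcast_params_from_rank0 name and the grouping code are illustrative, not part of the original.

from collections import OrderedDict

def broadcast_params_from_rank0(params):
    # flatten() needs type-homogeneous buckets, so group by tensor type first.
    buckets = OrderedDict()
    for p in params:
        buckets.setdefault(p.type(), []).append(p.data)
    # extra_args=(0,) supplies src=0, i.e. dist.broadcast(coalesced, 0).
    for bucket in buckets.values():
        apply_flat_dist_call(bucket, dist.broadcast, (0,))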
Example #6
def apply_flat_dist_call(bucket, call, extra_args=None):

    coalesced = flatten(bucket)

    if extra_args is not None:
        call(coalesced, *extra_args)
    else:
        call(coalesced)

    if call is dist.all_reduce:
        coalesced /= dist.get_world_size()

    for buf, synced in zip(bucket, unflatten(coalesced, bucket)):
        buf.copy_(synced)
def create_flat_master(model_buckets):
    # Ideally, we'd like to flatten the model params as well, and reset the float params' .data
    # attributes to point directly into the flattened master buffers.  However, my version that does
    # so is yielding CUDNN_STATUS_BAD_PARAM errors when running with distributed and nhwc.
    # I ended up making the safe choice of not altering what the params' .data members point to.
    check_type_split(model_buckets)

    flat_master_buckets = [apex_C.flatten([p.detach().clone().float() for p in model_bucket])
                           for model_bucket in model_buckets]

    for flat_master in flat_master_buckets:
        flat_master.requires_grad_()

    return flat_master_buckets
Example #8
    def allreduce_bucket(self, bucket):
        tensor = flatten(bucket)

        tensor_to_allreduce = tensor

        # Optionally run the reduction in fp32.
        if self.allreduce_always_fp32:
            tensor_to_allreduce = tensor.float()

        # Pre-divide by the split factor before summing across ranks...
        if self.gradient_average_split_factor != 1.0:
            tensor_to_allreduce.mul_(1. / self.gradient_average_split_factor)

        dist.all_reduce(tensor_to_allreduce)

        # ...then rescale so the net effect is a mean over world_size ranks.
        tensor_to_allreduce.mul_(self.gradient_average_split_factor /
                                 self.world_size)

        if self.allreduce_always_fp32 and tensor is not tensor_to_allreduce:
            tensor.copy_(tensor_to_allreduce)

        return tensor
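
A note on the scaling in Example #8: each rank multiplies its flattened gradients by 1 / gradient_average_split_factor, the all-reduce sums them across world_size ranks, and the final mul_ by gradient_average_split_factor / world_size turns that sum into a plain mean. For example, with a split factor of 8 and 64 ranks the values are scaled by 1/8, summed, then scaled by 8/64, which equals the sum divided by 64. Pre-dividing before the reduction keeps the intermediate sums smaller, which helps when the reduction runs in reduced precision.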
Example #9
 def step_maybe_fp16_maybe_distributed(optim):
     if args.use_fp16:
         if args.distributed:
             for flat_master, allreduce_buffer in zip(
                     flat_master_buckets, ssd300.allreduce_buffers):
                 if allreduce_buffer is None:
                     raise RuntimeError("allreduce_buffer is None")
                 flat_master.grad = allreduce_buffer.float()
                 flat_master.grad.data.mul_(1. / static_loss_scale)
         else:
             for flat_master, model_bucket in zip(flat_master_buckets,
                                                  model_buckets):
                 flat_grad = apex_C.flatten(
                     [m.grad.data for m in model_bucket])
                 flat_master.grad = flat_grad.float()
                 flat_master.grad.data.mul_(1. / static_loss_scale)
     optim.step()
     if args.use_fp16:
         # Copy each updated fp32 master shard back into the corresponding
         # fp16 model parameter, one tensor at a time.
         for model_bucket, flat_master in zip(model_buckets,
                                              flat_master_buckets):
             for model, master in zip(
                     model_bucket,
                     apex_C.unflatten(flat_master.data, model_bucket)):
                 model.data.copy_(master.data)