def allreduce_bucket(self, bucket):
    tensor = flatten(bucket)
    tensor_to_allreduce = tensor
    if self.allreduce_always_fp32():
        # Reduce in fp32 for accuracy even when the grads themselves are fp16.
        tensor_to_allreduce = tensor.float()
    if self.postscale_gradients():
        if self.gradient_predivide_factor != 1.0:
            tensor_to_allreduce.mul_(1. / self.gradient_predivide_factor)
        dist.all_reduce(tensor_to_allreduce, group=self.data_parallel_group)
        if self.gradient_average:
            if self.gradient_predivide_factor != self.dp_world_size:
                # Finish the averaging: the net scale is 1 / dp_world_size.
                tensor_to_allreduce.mul_(self.gradient_predivide_factor / self.dp_world_size)
    else:
        # Prescale path: divide before the reduction instead of after it.
        tensor_to_allreduce.div_(self.dp_world_size)
        dist.all_reduce(tensor_to_allreduce, group=self.data_parallel_group)
    if self.allreduce_always_fp32() and tensor is not tensor_to_allreduce:
        # Copy the fp32 result back into the original (possibly fp16) buffer.
        tensor.copy_(tensor_to_allreduce)
    return tensor
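The flatten/unflatten helpers used throughout these snippets are typically torch's _flatten_dense_tensors/_unflatten_dense_tensors. A minimal sketch of the coalesced round trip they enable, assuming an initialized process group (allreduce_and_copy_back is a hypothetical name):

import torch.distributed as dist
from torch._utils import _flatten_dense_tensors as flatten
from torch._utils import _unflatten_dense_tensors as unflatten

def allreduce_and_copy_back(bucket, group=None):
    # Coalesce the bucket into one contiguous buffer, reduce it with a single
    # collective, average, then scatter the results back into the originals.
    coalesced = flatten(bucket)
    dist.all_reduce(coalesced, group=group)
    coalesced.div_(dist.get_world_size(group=group))
    for buf, synced in zip(bucket, unflatten(coalesced, bucket)):
        buf.copy_(synced)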
def synchronize(self):
    synced = False
    if self.count_down == 0:
        # Launch allreduces for any grads whose backward hooks never fired.
        missing_p = self._requires_update - set(self._handles.keys())
        for p in missing_p:
            self._allreduce_tensor(p)
        if self._multi_node:
            for p, value in self._handles.items():
                handle, ctx = value
                # The module-level synchronize() waits on the async allreduce handle.
                output = synchronize(handle)
                p.grad.set_(self._compression.decompress(output, ctx)
                            / self.accumulation_step)
        else:
            # Single node: coalesce per-dtype buckets and reduce each in one call.
            buckets = OrderedDict()
            for tensor in self._handles.values():
                tp = tensor.type()
                if tp not in buckets:
                    buckets[tp] = []
                buckets[tp].append(tensor)
            for tp in buckets:
                bucket = buckets[tp]
                coalesced = flatten(bucket) / self.world_size / self.accumulation_step
                torch.distributed.all_reduce_multigpu([coalesced])
                for buf, synced_buf in zip(bucket, unflatten(coalesced, bucket)):
                    buf.copy_(synced_buf)
        self._handles.clear()
        synced = True
        self.count_down = self.accumulation_step
    self.count_down -= 1
    return synced
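How this plugs into a training loop with gradient accumulation: synchronize only reduces and returns True once every accumulation_step calls, so the optimizer steps only then. A hedged sketch, assuming a wrapper optimizer exposing the method above (loader and model are hypothetical):

for inputs, targets in loader:
    loss = model(inputs, targets)
    loss.backward()                    # per-parameter hooks launch async allreduces
    if optimizer.synchronize():        # True once every accumulation_step calls
        optimizer.step()
        optimizer.zero_grad()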
def step_maybe_fp16_maybe_distributed(optim):
    if args.use_fp16:
        if args.distributed:
            # Distributed: gradients were already allreduced into flat buffers.
            for flat_master, allreduce_buffer in zip(flat_master_buckets,
                                                     ssd300.allreduce_buffers):
                if allreduce_buffer is None:
                    raise RuntimeError("allreduce_buffer is None")
                flat_master.grad = allreduce_buffer.float()
                flat_master.grad.data.mul_(1. / static_loss_scale)
        else:
            for flat_master, model_bucket in zip(flat_master_buckets, model_buckets):
                flat_grad = apex_C.flatten([m.grad.data for m in model_bucket])
                flat_master.grad = flat_grad.float()
                flat_master.grad.data.mul_(1. / static_loss_scale)
    optim.step()
    if args.use_fp16:
        # Use multi-tensor scale instead of a loop of individual parameter copies.
        for model_bucket, flat_master in zip(model_buckets, flat_master_buckets):
            multi_tensor_applier(
                amp_C.multi_tensor_scale,
                dummy_overflow_buf,
                [apex_C.unflatten(flat_master.data, model_bucket), model_bucket],
                1.0)
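The copy-back uses apex's fused multi_tensor_scale kernel, which applies out = in * scale across whole lists of tensors in a handful of launches rather than one copy per parameter. A minimal sketch of that call in isolation, assuming apex is built with its CUDA extensions (copy_scaled is a hypothetical name):

import torch
import amp_C
from apex.multi_tensor_apply import multi_tensor_applier

dummy_overflow_buf = torch.cuda.IntTensor([0])

def copy_scaled(src_list, dst_list, scale=1.0):
    # Writes src * scale into dst, list-wise, in fused kernels.
    multi_tensor_applier(amp_C.multi_tensor_scale, dummy_overflow_buf,
                         [src_list, dst_list], scale)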
def _get_flat_grads(self, out=None, has_grad=True):
    # `out` is unused: apex_C.flatten allocates the flat buffer itself,
    # replacing the old manual offset-copy loop into a preallocated buffer.
    grads = self._get_grads(has_grad)
    return apex_C.flatten(grads)
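apex_C.flatten does in one fused call what the retired loop did by hand; functionally it matches concatenating the flattened gradients. A quick sanity check, assuming apex's apex_C extension is importable:

import torch
import apex_C

grads = [torch.randn(3, 4), torch.randn(5)]
flat = apex_C.flatten(grads)
# Pure-PyTorch reference: one explicit copy per tensor.
ref = torch.cat([g.contiguous().view(-1) for g in grads])
assert torch.equal(flat, ref)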
def apply_flat_dist_call(bucket, call, extra_args=None):
    # Coalesce the bucket so the collective runs once over one buffer.
    coalesced = flatten(bucket)
    if extra_args is not None:
        call(coalesced, *extra_args)
    else:
        call(coalesced)
    if call is dist.all_reduce:
        # all_reduce sums across ranks; divide to get the average.
        coalesced /= dist.get_world_size()
    for buf, synced in zip(bucket, unflatten(coalesced, bucket)):
        buf.copy_(synced)
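Typical uses are broadcasting initial weights from rank 0 (extra_args carries the src rank) and averaging gradients after backward; for example (model is hypothetical):

import torch.distributed as dist

params = [p.data for p in model.parameters()]
apply_flat_dist_call(params, dist.broadcast, (0,))    # sync weights from rank 0

grads = [p.grad.data for p in model.parameters() if p.grad is not None]
apply_flat_dist_call(grads, dist.all_reduce)          # summed, then averaged internally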
def create_flat_master(model_buckets):
    # Ideally, we'd like to flatten the model params as well, and reset the float
    # params' .data attributes to point directly into the flattened master buffers.
    # However, my version that does so is yielding CUDNN_STATUS_BAD_PARAM errors
    # when running with distributed and nhwc. I ended up making the safe choice of
    # not altering what the params' .data members point to.
    check_type_split(model_buckets)
    flat_master_buckets = [
        apex_C.flatten([p.detach().clone().float() for p in model_bucket])
        for model_bucket in model_buckets
    ]
    for flat_master in flat_master_buckets:
        flat_master.requires_grad_()
    return flat_master_buckets
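Downstream, the flat fp32 masters are what the optimizer owns, while the fp16 buckets keep serving the forward/backward pass. A hedged sketch, assuming model_buckets groups parameters of uniform dtype (what check_type_split presumably enforces):

import torch

model_buckets = [[p for p in model.parameters() if p.requires_grad]]  # one bucket for brevity
flat_master_buckets = create_flat_master(model_buckets)
optim = torch.optim.SGD(flat_master_buckets, lr=0.01)  # optimizer updates the fp32 masters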
def allreduce_bucket(self, bucket):
    tensor = flatten(bucket)
    tensor_to_allreduce = tensor
    if self.allreduce_always_fp32:
        tensor_to_allreduce = tensor.float()
    if self.gradient_average_split_factor != 1.0:
        # Pre-divide to keep fp16 values in range during the summation.
        tensor_to_allreduce.mul_(1. / self.gradient_average_split_factor)
    dist.all_reduce(tensor_to_allreduce)
    # Post-scale so the net effect is a plain average (1 / world_size).
    tensor_to_allreduce.mul_(self.gradient_average_split_factor / self.world_size)
    if self.allreduce_always_fp32 and tensor is not tensor_to_allreduce:
        tensor.copy_(tensor_to_allreduce)
    return tensor
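The pre-divide by gradient_average_split_factor and the post-multiply by split_factor / world_size compose to a plain 1/world_size average; splitting the division keeps fp16 values in range during the summation. A single-process check of the arithmetic:

import torch

world_size, split_factor = 8, 2.0
per_rank = [torch.randn(4) for _ in range(world_size)]

summed = sum(g / split_factor for g in per_rank)   # pre-scale, then sum (the allreduce)
scaled = summed * (split_factor / world_size)      # post-scale

mean = sum(per_rank) / world_size                  # plain average for comparison
assert torch.allclose(scaled, mean)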
def step_maybe_fp16_maybe_distributed(optim):
    if args.use_fp16:
        if args.distributed:
            for flat_master, allreduce_buffer in zip(flat_master_buckets,
                                                     ssd300.allreduce_buffers):
                if allreduce_buffer is None:
                    raise RuntimeError("allreduce_buffer is None")
                flat_master.grad = allreduce_buffer.float()
                flat_master.grad.data.mul_(1. / static_loss_scale)
        else:
            for flat_master, model_bucket in zip(flat_master_buckets, model_buckets):
                flat_grad = apex_C.flatten([m.grad.data for m in model_bucket])
                flat_master.grad = flat_grad.float()
                flat_master.grad.data.mul_(1. / static_loss_scale)
    optim.step()
    if args.use_fp16:
        # Copy updated fp32 masters back parameter by parameter
        # (the fused multi_tensor_scale variant above avoids this loop).
        for model_bucket, flat_master in zip(model_buckets, flat_master_buckets):
            for model, master in zip(model_bucket,
                                     apex_C.unflatten(flat_master.data, model_bucket)):
                model.data.copy_(master.data)
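Either variant replaces a bare optim.step() in the training loop. A hedged sketch of the surrounding step, assuming static loss scaling and the globals the function closes over (criterion, imgs, targets are hypothetical):

loss = criterion(ssd300(imgs), targets)
if args.use_fp16:
    (loss * static_loss_scale).backward()      # scale up so fp16 grads stay representable
else:
    loss.backward()
step_maybe_fp16_maybe_distributed(optim)       # unscales by 1/static_loss_scale, then steps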