def _test_all_reduce_helper(
    self,
    group,
    group_id,
    rank,
    op,
    master_value,
    worker_value,
    expected_value,
    cuda=False,
    rank_to_GPU=None,
):
    for src in group:
        # The source rank starts from master_value; all other ranks start
        # from worker_value. After all_reduce, every rank must hold the
        # expected reduced value.
        value = master_value if rank == src else worker_value
        tensor = _build_tensor(src + 1).fill_(value)
        if cuda:
            tensor = tensor.cuda(rank_to_GPU[rank][0])
        dist.all_reduce(tensor, op, group_id)
        self.assertEqual(tensor, _build_tensor(src + 1, expected_value))
    self._barrier()
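# A minimal, self-contained sketch (not from the original test suite) of the
# all_reduce semantics the helper above verifies: with op=SUM and two ranks
# filling 2 and 10, every rank ends up holding 12. The gloo backend, port,
# and world size below are illustrative assumptions.
import os

import torch
import torch.distributed as dist
import torch.multiprocessing as mp


def _demo_all_reduce(rank, world_size):
    os.environ["MASTER_ADDR"] = "127.0.0.1"
    os.environ["MASTER_PORT"] = "29500"
    dist.init_process_group("gloo", rank=rank, world_size=world_size)
    # Rank 0 plays the "master" role, every other rank the "worker" role.
    value = 2.0 if rank == 0 else 10.0
    tensor = torch.full((4,), value)
    dist.all_reduce(tensor, op=dist.ReduceOp.SUM)
    assert torch.equal(tensor, torch.full((4,), 12.0))
    dist.destroy_process_group()


if __name__ == "__main__":
    mp.spawn(_demo_all_reduce, args=(2,), nprocs=2)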
def _process_batch():
    dev_grad_batch, dev_events, job_event = queue.get()
    dev_coalesced = []
    # Coalesce the tensors on all devices and start a local reduction
    for dev_id, grad_batch, event, stream in zip(
            device_ids, dev_grad_batch, dev_events, reduction_streams):
        with torch.cuda.device(dev_id), torch.cuda.stream(stream):
            stream.wait_event(event)
            coalesced = _flatten_dense_tensors(grad_batch)
            dev_coalesced.append(coalesced)
    # Wait for all copies to complete before starting the NCCL kernel
    for stream in reduction_streams:
        stream.synchronize()
    nccl.reduce(dev_coalesced, root=0, streams=nccl_streams)

    # From now on we're only going to work on the first device (from device_ids)
    grad_batch = dev_grad_batch[0]
    coalesced = dev_coalesced[0]
    reduce_stream = reduction_streams[0]
    with torch.cuda.stream(reduce_stream):
        reduce_stream.wait_stream(nccl_streams[0])
        coalesced /= dist.get_world_size()
        dist.all_reduce(coalesced, group=group_id)
        for grad, reduced in zip(
                grad_batch, _unflatten_dense_tensors(coalesced, grad_batch)):
            grad.copy_(reduced)
    job_event.set()
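# A standalone sketch of the coalescing trick used above: gradients are
# flattened into one contiguous buffer, reduced as a single tensor, and the
# results are copied back in place. No process group is needed to see the
# reshaping; the doubling below stands in for the reduction.
import torch
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors

grads = [torch.randn(3, 4), torch.randn(10), torch.randn(2, 2, 2)]
coalesced = _flatten_dense_tensors(grads)  # one 1-D buffer of 30 elements
coalesced *= 2.0                           # stand-in for the reduction
for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
    buf.copy_(synced)                      # write results back in place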
def sync(self, iter_id):
    if iter_id % self.sync_frequency == 0:
        for p in self.model.parameters():
            dist.all_reduce(p.data, op=dist.ReduceOp.SUM, group=self.group)
            p.data /= self.world_size
        if iter_id % self.evaluate_frequency == 0:
            # For now the full model is transmitted for simplicity,
            # but it could be compressed later.
            self.update_frozen_lengths()
            self.last_model_copy = copy.deepcopy(self.model).cuda()
            # Update the round id and unfreeze the corresponding parameters.
            self.round_id += 1
            self.synchronization_mask = torch.where(
                self.defrozen_round_ids == self.round_id,
                torch.tensor(1).cuda().byte(),
                torch.tensor(0).cuda().byte())
            stable_ratio = 1 - float(sum(
                self.synchronization_mask.int())) / self.model_size
            # Adjust the synchronization frequency when necessary.
            print('at iteration:', iter_id, '; round id:', self.round_id,
                  '; stable ratio:', stable_ratio)
        return True
    return False
def reduce_loss_dict(loss_dict):
    """
    Reduce the loss dictionary from all processes so that every process
    has the averaged results. Returns a dict with the same fields as
    loss_dict, after reduction.
    """
    world_size = get_world_size()
    if world_size < 2:
        return loss_dict
    with torch.no_grad():
        loss_names = []
        all_losses = []
        for k, v in loss_dict.items():
            loss_names.append(k)
            all_losses.append(v)
        all_losses = torch.stack(all_losses, dim=0)
        # all_reduce leaves the summed losses on every rank, so each rank
        # divides by world_size to obtain the average.
        dist.all_reduce(all_losses)
        all_losses /= world_size
        reduced_losses = {k: v for k, v in zip(loss_names, all_losses)}
    return reduced_losses
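# Hypothetical usage of reduce_loss_dict inside a training step: the reduced
# values are only for logging, so the backward pass uses the local losses.
# `model`, `images`, and `targets` are assumed names from a surrounding
# training loop, not from the function above.
loss_dict = model(images, targets)   # e.g. {"loss_cls": ..., "loss_box": ...}
losses = sum(loss for loss in loss_dict.values())
loss_dict_reduced = reduce_loss_dict(loss_dict)
losses_reduced = sum(loss for loss in loss_dict_reduced.values())
# Log losses_reduced; optimize with the local, un-reduced `losses`.
losses.backward()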
def after_sync(self, model, iter_id):
    if iter_id % self.sync_frequency == 0:
        for p in model.parameters():
            dist.all_reduce(p.data, op=dist.ReduceOp.SUM, group=self.group)
            p.data /= self.world_size
def sync(self, iter_id):
    if self.phase > 0:
        for p in self.model.parameters():
            dist.all_reduce(p.data, op=dist.ReduceOp.SUM, group=self.group)
            p.data /= self.world_size
    elif iter_id % self.sync_frequency == 0:
        for p in self.model.parameters():
            dist.all_reduce(p.data, op=dist.ReduceOp.SUM, group=self.group)
            p.data /= self.world_size
        self.update_synchronization_mask()
        self.last_model_copy = copy.deepcopy(self.model)
def all_reduce(tensor, group=None, op=SUM_OP):
    if group is None:
        group = get_default_group()
    if _use_c10d[0]:
        return dist_c10d.all_reduce(tensor, op=op['c10d'], group=group)
    else:
        return dist_no_c10d.all_reduce(tensor, op=op['no_c10d'], group=group)
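# A hedged sketch of what SUM_OP presumably looks like for the wrapper above:
# a mapping from backend flavor to that backend's sum operator. The exact
# module aliases are assumptions inferred from the names the function uses,
# not confirmed definitions.
SUM_OP = {
    'c10d': dist_c10d.ReduceOp.SUM,
    'no_c10d': dist_no_c10d.reduce_op.SUM,
}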
def all_reduce(tensor, group=None):
    if group is None:
        group = get_default_group()
    if _use_c10d[0]:
        return dist_c10d.all_reduce(tensor, group=group)
    else:
        return dist_no_c10d.all_reduce(tensor, group=group)
def allreduce_params():
    if self.needs_reduction:
        self.needs_reduction = False
        # Bucket the gradients by tensor type so each bucket can be
        # flattened into a single contiguous tensor.
        buckets = defaultdict(list)
        for param in self.module.parameters():
            if param.requires_grad and param.grad is not None:
                tp = type(param.data)
                buckets[tp].append(param)
        for bucket in buckets.values():
            grads = [param.grad.data for param in bucket]
            coalesced = _flatten_dense_tensors(grads)
            dist.all_reduce(coalesced)
            coalesced /= dist.get_world_size()
            for buf, synced in zip(
                    grads, _unflatten_dense_tensors(coalesced, grads)):
                buf.copy_(synced)
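# A hedged sketch of how a closure like allreduce_params is usually wired up
# in this classic pattern: each parameter's grad hook queues the bucketed
# reduction to run once at the end of backward(). `self.module` and the
# hook-registration site are assumptions about the surrounding class.
from torch.autograd import Variable


def _register_reduction_hooks(self):
    for param in list(self.module.parameters()):
        def allreduce_hook(*unused):
            # Run allreduce_params once the whole backward pass finishes.
            Variable._execution_engine.queue_callback(allreduce_params)
        if param.requires_grad:
            param.register_hook(allreduce_hook)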
def sync(self, model, iter_id):
    if self.phase > 0:
        for p in model.parameters():
            dist.all_reduce(p.grad, op=dist.ReduceOp.SUM, group=self.group)
            p.grad /= self.world_size
        return
    grad = [p.grad for p in model.parameters()]
    flattened_grad = torch.tensor([])
    for g in grad:
        frag = g.view(-1)
        flattened_grad = torch.cat((flattened_grad, frag), 0)
    # Filter the gradient with the synchronization mask.
    filtered_grad = flattened_grad * self.synchronization_mask.float()
    self.local_ac_grad += filtered_grad
    valid_grad = filtered_grad
    if iter_id % self.sync_frequency == 0:
        print('sync now:', iter_id)
        # Squeeze the parameters to be communicated into one tensor.
        transmitted_grad = torch.masked_select(self.local_ac_grad,
                                               self.synchronization_mask)
        dist.all_reduce(transmitted_grad, op=dist.ReduceOp.SUM,
                        group=self.group)
        transmitted_grad /= self.world_size
        # Unsqueeze the transmitted gradient back to the full-length
        # flattened gradient.
        self.global_ac_grad = torch.zeros(self.flattened_shape)
        self.global_ac_grad[self.synchronization_mask] = transmitted_grad
        print('global_ac_grad[0:3]:', self.global_ac_grad[0:3])
        print('5081:', self.global_ac_grad[5081])
        # Update the synchronization mask and prepare the gradient.
        self.update_synchronization_mask()
        valid_grad = filtered_grad + self.global_ac_grad - self.local_ac_grad
        self.local_ac_grad = torch.zeros(self.flattened_shape)
    # Unwrap back to the per-parameter gradient shapes.
    for i, p in enumerate(model.parameters()):
        p.grad.data = valid_grad[
            self.frag_index_list[i][0]:self.frag_index_list[i][1]
        ].view(self.frag_shape_list[i])
    # Once no parameters remain unstable, switch to dense synchronization.
    if sum(self.synchronization_mask) == 0:
        self.phase = 1
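# A standalone sketch of the mask round-trip the sync above depends on:
# masked_select compresses the entries selected by a boolean mask, and
# indexing with the same mask scatters the (reduced) values back into a
# full-length buffer.
import torch

full = torch.tensor([0.1, 0.2, 0.3, 0.4])
mask = torch.tensor([1, 0, 1, 0]).bool()
compressed = torch.masked_select(full, mask)  # tensor([0.1, 0.3])
restored = torch.zeros(4)
restored[mask] = compressed                   # zeros wherever mask is 0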
def sync(self, iter_id):
    if self.phase > 0:
        for p in self.model.parameters():
            dist.all_reduce(p.data, op=dist.ReduceOp.SUM, group=self.group)
            p.data /= self.world_size
    elif iter_id == self.next_sync_iter_id:
        # For now the full model is transmitted for simplicity,
        # but it could be compressed later.
        for p in self.model.parameters():
            dist.all_reduce(p.data, op=dist.ReduceOp.SUM, group=self.group)
            p.data /= self.world_size
        self.update_frozen_lengths()
        self.last_model_copy = copy.deepcopy(self.model)
        # Update the round id and unfreeze the corresponding parameters.
        self.round_id += 1
        self.synchronization_mask = torch.where(
            self.defrozen_round_ids == self.round_id,
            torch.tensor(1).byte(),
            torch.tensor(0).byte())
        # Adjust the synchronization frequency when necessary.
        stable_ratio = 1 - float(sum(
            self.synchronization_mask.int())) / self.model_size
        if stable_ratio > self.comp_comm_ratio:
            self.sync_frequency = (self.sync_frequency + 1) // 2
        else:
            # If few parameters are stable, reduce the synchronization
            # frequency (i.e. lengthen the interval between syncs).
            self.sync_frequency += self.change_frequency_step
        self.next_sync_iter_id += self.sync_frequency
        print('at iteration:', iter_id, '; stable ratio:', stable_ratio,
              '; new synchronization frequency:', self.sync_frequency)
        return True
    return False
def all_reduce(model, world_size, group):
    for param in model.parameters():
        dist.all_reduce(param.data, op=dist.ReduceOp.SUM, group=group)
        param.data /= world_size
    return model
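# A self-contained sketch of the model-averaging pattern above, assuming the
# all_reduce helper just defined is in scope. Two gloo workers on localhost
# (an illustrative setup) start with different weights and agree on the mean
# after the sum-then-divide. Backend, port, and world size are assumptions.
import os

import torch
import torch.distributed as dist
import torch.multiprocessing as mp
import torch.nn as nn


def _demo_average(rank, world_size):
    os.environ["MASTER_ADDR"] = "127.0.0.1"
    os.environ["MASTER_PORT"] = "29501"
    dist.init_process_group("gloo", rank=rank, world_size=world_size)
    model = nn.Linear(4, 2)
    with torch.no_grad():
        model.weight.fill_(float(rank))  # rank 0 -> 0.0, rank 1 -> 1.0
    all_reduce(model, world_size, dist.group.WORLD)
    # Every rank now holds the mean weight value 0.5.
    assert torch.allclose(model.weight, torch.full_like(model.weight, 0.5))
    dist.destroy_process_group()


if __name__ == "__main__":
    mp.spawn(_demo_average, args=(2,), nprocs=2)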
def sync(self, model, iter_id):
    if self.phase > 0:
        for p in model.parameters():
            dist.all_reduce(p.grad, op=dist.ReduceOp.SUM, group=self.group)
            p.grad /= self.world_size
        return
    grad = [p.grad for p in model.parameters()]
    flattened_grad = torch.tensor([])
    for g in grad:
        frag = g.view(-1)
        flattened_grad = torch.cat((flattened_grad, frag), 0)
    # Filter the gradient with the fixed synchronization mask.
    filtered_grad = torch.zeros(self.flattened_shape)
    filtered_grad = torch.where(self.fixed_synchronization_mask > 0,
                                flattened_grad, filtered_grad)
    self.local_ac_grad += filtered_grad
    valid_grad = filtered_grad
    if iter_id % self.sync_frequency == 0:
        print('sync now:', iter_id, '; phase:', self.phase)
        # Squeeze the parameters to be communicated into one tensor.
        transmitted_grad = torch.masked_select(
            self.local_ac_grad, self.fixed_synchronization_mask)
        dist.all_reduce(transmitted_grad, op=dist.ReduceOp.SUM,
                        group=self.group)
        transmitted_grad /= self.world_size
        # Unsqueeze the transmitted gradient back to the full-length
        # flattened gradient.
        self.global_ac_grad = torch.zeros(self.flattened_shape)
        self.global_ac_grad[self.fixed_synchronization_mask] = transmitted_grad
        print('global_ac_grad[0:3]:', self.global_ac_grad[0:3])
        print('5081:', self.global_ac_grad[5081])
        # Update the synchronization mask and prepare the gradient.
        self.update_synchronization_mask()
        valid_grad = filtered_grad + self.global_ac_grad - self.local_ac_grad
        self.local_ac_grad = torch.zeros(self.flattened_shape)
        # Periodically reset the mask so every parameter is re-examined.
        if (iter_id // self.sync_frequency) % 50 == 0:
            self.synchronization_mask = torch.ones(
                self.flattened_shape).byte()
    # Unwrap back to the per-parameter gradient shapes.
    for i, p in enumerate(model.parameters()):
        p.grad.data = valid_grad[
            self.frag_index_list[i][0]:self.frag_index_list[i][1]
        ].view(self.frag_shape_list[i])
    if iter_id % self.sync_frequency == 0:
        print('stable ratio:', 1 - float(
            sum(self.synchronization_mask.int())) / self.flattened_shape[0])
        if self.phase == 0 and float(
                sum(self.synchronization_mask.int())
        ) / self.flattened_shape[0] < self.change_phase_threshold:
            # NOTE: as written this assignment is a no-op; the intent was
            # presumably to advance to phase 1, as in the variant above.
            self.phase = 0
        return True
    return False