def async_send(self, tensors_compressed, name):
    """
    :param tensors_compressed: list of flat tensors to communicate
    :param name: for the allgather operation
    :return: handles to synchronize, tensor sizes per rank
    """
    tensors_size = [t.numel() for t in tensors_compressed]  # sizes of this rank's tensors
    if self.compressor.tensors_size_are_same:
        tensors_size_ag = [tensors_size] * self.world_size  # list of tensor sizes per rank
        tensor_sizes = zip(*tensors_size_ag)  # transpose
    else:
        tensors_size = torch.tensor(tensors_size)  # TODO: set device
        gathered = allgather(tensors_size)  # tensor of tensor sizes per rank
        tensor_sizes = gathered.view([self.world_size, -1]).t().tolist()  # transpose, to list

    handles = []
    for tensor_compressed in tensors_compressed:
        handle = allgather_async(tensor_compressed)
        handles.append(handle)

    return handles, tensor_sizes
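# The matching receive side is not shown above. A minimal sketch, assuming a
# GRACE-style communicator and that `tensor_sizes` holds, per compressed
# tensor, the sizes contributed by each rank; the helper name is hypothetical.
import horovod.torch as hvd

def wait_receive(handles, tensor_sizes):
    tensors_per_rank = []
    for handle, sizes in zip(handles, tensor_sizes):
        gathered = hvd.synchronize(handle)               # concatenated over ranks
        tensors_per_rank.append(gathered.split(list(sizes), dim=0))
    return tensors_per_rank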
def async_send(self, tensors_compressed, ctx):
    if tensors_compressed is None:
        return

    handles = []
    """
    We use alltoall()+allgather() to implement the Parameter Server. For
    quantization compression algorithms, we allgather() the corresponding
    scalars so that each server can decompress the data.
    """
    name, numel = ctx
    if self.compressor.quantization and len(tensors_compressed) == 2:
        handle = alltoall_async(tensors_compressed[0],
                                splits=self.get_splits(tensors_compressed[0].numel()),
                                name=name)
        handles.append(handle)
        handle = allgather_async(tensors_compressed[1], name=name)
        handles.append(handle)
    else:
        for i, tensor_compressed in enumerate(tensors_compressed):
            handle = alltoall_async(tensor_compressed, name + str(i))
            handles.append(handle)

    # self.thread = threading.Thread(target=self.ps_synchronize, args=(handles, ctx))
    # self.thread.start()
    self.ps_synchronize(handles, ctx)
    return handles
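# `self.get_splits` is referenced but not shown; a plausible sketch, assuming
# it just partitions the flat tensor as evenly as possible across workers for
# the alltoall (this body is an assumption, not the original helper):
def get_splits(self, numel):
    base, rem = divmod(numel, self.world_size)
    # the first `rem` workers receive one extra element each
    return [base + 1 if i < rem else base for i in range(self.world_size)]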
def allgather_sync(self, tensors, ranks):
    nworkers = hvd.size()
    rank = hvd.rank()
    start = 0
    sub_ranks = ranks[start:start + nworkers]
    sub_tensors = tensors[start:start + nworkers]
    while len(sub_ranks) > 0:
        try:
            idx = sub_ranks.index(rank)
        except ValueError:
            idx = -1
        if idx < 0:
            # this worker owns no tensor in the current chunk; contribute a
            # zero-length tensor so the collective still matches up
            tensor = sub_tensors[0].new(0)
        else:
            tensor = sub_tensors[idx]
        handle = hvd.allgather_async(tensor.view(-1))
        sync_tensors = hvd.synchronize(handle)

        offset = 0
        for i, r in enumerate(sub_ranks):
            if idx < 0:
                continue
            original_t = sub_tensors[i]  # tensor owned by rank r in this chunk
            numel = original_t.numel()
            t = sync_tensors[offset:offset + numel]
            original_t.copy_(t.view(original_t.shape))
            offset += numel

        start += nworkers
        sub_ranks = ranks[start:start + nworkers]
        sub_tensors = tensors[start:start + nworkers]
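# Note: Horovod's allgather only requires tensors to agree in the non-first
# dimensions, so the zero-length `new(0)` placeholder above lets a worker that
# owns nothing in the current chunk still join the collective; its contribution
# simply adds no rows to the gathered result.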
def fsp_matrix_transfer(self):
    '''
    Obtain the feature maps of the bottlenecks (h*w*m), reshape them to
    (hw*m), then compute the FSP matrix (m*n) by batched matrix
    multiplication, allgather the matrices, and apply an L2 loss.
    :return: the averaged FSP transfer loss
    '''
    handles = []
    matrix_group = []
    for key in self.activation:
        if 'in' in key:
            fm_in = self.activation[key]
        if 'out' in key:
            fm_out = self.activation[key]
            fm_in = fm_in.view(fm_in.shape[0], fm_in.shape[1], -1)
            fm_out = fm_out.view(fm_out.shape[0], fm_out.shape[1], -1)
            fm_out = torch.transpose(fm_out, 1, 2)
            fsp_matrix = torch.bmm(fm_in, fm_out) / fm_in.shape[-1]
            matrix_group.append(fsp_matrix)
            fsp_matrix = fsp_matrix.unsqueeze(0)
            handle = hvd.allgather_async(fsp_matrix, key)
            handles.append(handle)

    fsp_loss = 0
    for idx, handle in enumerate(handles):
        rec_fsp = hvd.synchronize(handle)
        for i in range(0, hvd.size()):
            if i != self.task_id:
                fsp_loss += self.norm_loss(matrix_group[idx], rec_fsp[i])
    fsp_loss /= (hvd.size() - 1)
    self.log_dict['transfer_count'] += 1
    return fsp_loss
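# Shape sanity check for the FSP computation above, with made-up sizes
# (batch 8, m=64 input channels, n=128 output channels, 7x7 feature maps):
import torch

fm_in = torch.randn(8, 64, 7, 7).view(8, 64, -1)       # (b, m, h*w)
fm_out = torch.randn(8, 128, 7, 7).view(8, 128, -1)    # (b, n, h*w)
fsp = torch.bmm(fm_in, fm_out.transpose(1, 2)) / fm_in.shape[-1]
assert fsp.shape == (8, 64, 128)                        # one (m, n) matrix per sample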
def attention_transfer(self):
    def at(x):
        return F.normalize(x.pow(2).mean(1).view(x.size(0), -1))

    handles = []
    att_group = []
    for key in self.activation:
        at_out = at(self.activation[key])
        att_group.append(at_out)
        at_numpy = at_out.data.unsqueeze(0)
        handle = hvd.allgather_async(at_numpy, key)
        handles.append(handle)

    # self.norm_loss
    att_loss = 0
    for idx, handle in enumerate(handles):
        rec_att = hvd.synchronize(handle)
        # att_loss += self.norm_loss(att_group[idx], rec_att.mean(0).cuda(self.device))
        for i in range(0, hvd.size()):
            if i != self.task_id:
                att_loss += self.norm_loss(att_group[idx], rec_att[i].cuda(self.device))
    att_loss /= (hvd.size() - 1)
    self.log_dict['transfer_count'] += 1
    return att_loss
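# `at()` is the usual attention-transfer operator: square, average over
# channels, flatten, then L2-normalize each sample. A standalone check with
# made-up sizes:
import torch
import torch.nn.functional as F

def at(x):
    return F.normalize(x.pow(2).mean(1).view(x.size(0), -1))

a = at(torch.randn(8, 64, 7, 7))                      # (batch, channels, h, w)
assert a.shape == (8, 49)                             # one flat h*w map per sample
assert torch.allclose(a.norm(dim=1), torch.ones(8))   # unit L2 norm per sample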
def forward(ctx, tensor, name):
    ctx.dim = tensor.shape[0]
    # we try to put all sync ops in the forward pass
    ctx.all_dims = hvd.allgather(
        torch.tensor([ctx.dim], device=tensor.device)).view(hvd.size())
    handle = hvd.allgather_async(tensor, name)
    return hvd.synchronize(handle)
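# This forward looks like half of a torch.autograd.Function. A sketch of the
# complete differentiable allgather it might belong to; the backward below is
# an assumption, not the original code: it sums the gradient over ranks and
# keeps only this rank's rows, located via ctx.all_dims.
import torch
import horovod.torch as hvd

class AllGatherFn(torch.autograd.Function):  # hypothetical name
    @staticmethod
    def forward(ctx, tensor, name):
        ctx.dim = tensor.shape[0]
        ctx.all_dims = hvd.allgather(
            torch.tensor([ctx.dim], device=tensor.device)).view(hvd.size())
        handle = hvd.allgather_async(tensor, name)
        return hvd.synchronize(handle)

    @staticmethod
    def backward(ctx, grad_output):
        # every rank's input appears in every rank's output, so the input
        # gradient is the sum over ranks of the matching output slice
        grad = hvd.allreduce(grad_output, op=hvd.Sum)
        offset = int(ctx.all_dims[:hvd.rank()].sum())
        return grad[offset:offset + ctx.dim], None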
def forward(self, data):
    """
    Arguments:
        data: Tensor to be gathered across all processes. Note that this
            returns a Horovod handle, not the gathered tensor; the caller
            must complete the collective with hvd.synchronize().
    """
    return hvd.allgather_async(data, name=self.name)
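# Hypothetical usage of the module above, completing the queued collective:
#
#     handle = gather_module(data)          # queues the allgather, returns at once
#     output = hvd.synchronize(handle)      # blocks until the result is ready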
def async_send(self, tensors_compressed, ctx):
    if tensors_compressed is None:
        return

    handles = []
    for i, tensor_compressed in enumerate(tensors_compressed):
        handle = allgather_async(tensor_compressed, ctx[0] + str(i))
        handles.append(handle)
    return handles
def test_horovod_allgather_duplicate_name_error(self):
    """Test that the allgather raises an error if there are
    two concurrent operations with the same name."""
    hvd.init()
    size = hvd.size()

    # This test does not apply if there is only one worker.
    if size == 1:
        return

    dims = [17] * 3
    tensor = torch.FloatTensor(*dims)
    hvd.allgather_async(tensor, name='duplicate_name')
    try:
        for i in range(10):
            hvd.allgather_async(tensor, name='duplicate_name')
        assert False, 'hvd.allgather_async did not throw error'
    except (torch.FatalError, ValueError):
        pass
def test_allgather():
    torch.cuda.set_device(hvd.local_rank())
    rank = hvd.rank()
    tensor = torch.rand(10).float().cuda()
    print('rank: ', rank, ', tensor: ', tensor)
    handle = hvd.allgather_async(tensor)
    # tensor = hvd.synchronize(handle)
    # handle = hvd.broadcast_async(tensor, 0)
    hvd.synchronize(handle)
    comm.Barrier()  # MPI barrier so the prints don't interleave across ranks
    print('---------')
    print('rank: ', rank, ', tensor: ', tensor)
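# Running this requires `comm` to be an mpi4py communicator such as
# MPI.COMM_WORLD, and multiple processes, e.g. `horovodrun -np 2 python test.py`
# (the script name is illustrative). Note that the gathered tensor returned by
# hvd.synchronize(handle) is discarded, so the second print shows the original
# 10-element tensor; the test only checks that the collective completes.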
def weights_transfer(self):
    # transfer model weights
    weights = copy.deepcopy(self.network.state_dict())
    handles = []
    for name in weights:
        # TODO: need to consider bias
        if 'weight' in name:
            # print(self.task_id, 'send', name)
            handle = hvd.allgather_async(weights[name], name)
            handles.append(handle)

    hidx = 0
    for name, param in self.network.named_parameters():
        if 'weight' in name:
            # print(self.task_id, 'rec', name)
            rec_weights = hvd.synchronize(handles[hidx])
            hidx += 1
            # print(rec_weights.shape)
            n_num = param.shape[0]
            rec_weights = list(torch.split(rec_weights, n_num, 0))
            del rec_weights[self.task_id]
            # TODO: weights are concatenated in the first dim, 2*[64,3] --> [128,3]
            # logging.info(type(rec_weights), rec_weights.shape)

            # calculate IOM of each filter
            im_list = []
            for i in range(param.shape[0]):
                im_list.append(torch.sum(torch.abs(param[i])).data.cpu().numpy())
            im_list = np.array(im_list)
            # print('minimal weight sum is {} size {}'.format(im_list.min(), im_list.shape[0]))

            for i, im in enumerate(im_list):
                prob = 1 - stats.norm(0, 2).cdf(im)
                if np.random.rand() < prob:
                    random_sender = np.random.randint(0, len(rec_weights))
                    new_param = rec_weights[random_sender].clone()  # random pick
                    random_filter = np.random.randint(0, new_param.shape[0])
                    # TODO: give larger weights more chance
                    weights[name][i] = new_param[random_filter]
                    self.log_dict['transfer_count'] += 1
            # self.network.state_dict()[name].copy_(param.clone())

    # TODO: maybe modify the optimizer
    self.network.load_state_dict(weights)
    hvd.allreduce(torch.zeros(1), name='Barrier')
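# The replacement rule above maps a filter's L1 mass `im` to the survival
# function of N(0, 2), so low-importance filters are overwritten far more
# often. A quick check of the probabilities it yields (values approximate):
from scipy import stats

for im in (0.5, 2.0, 6.0):
    print(im, 1 - stats.norm(0, 2).cdf(im))   # ~0.40, ~0.16, ~0.0013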
def _allgather_factors(self):
    """Allgather the factors for all layers."""
    handles = []

    def _get_value_and_idx(sparse_tensor):
        tensor = sparse_tensor.data.view(-1)
        one_indexes = tensor != 0.0
        indexes = one_indexes.nonzero().data.squeeze().view(-1)
        values = tensor.data[indexes]
        return values, indexes.int()

    for i, m in enumerate(self.modules):
        module_name = self.module_names[i]

        A_values, A_indexes = self.m_sparseA[m]  # _get_value_and_idx(self.m_A[m].data)
        if A_values.numel() == 0:
            continue
        A_value_name = module_name + '_A_value'
        A_idx_name = module_name + '_A_idx'
        # h_value = hvd.allgather_async(A_values, A_value_name)
        # h_idx = hvd.allgather_async(A_indexes, A_idx_name)
        h_value = hvd.allgather_async(A_values)
        h_idx = hvd.allgather_async(A_indexes)

        G_values, G_indexes = self.m_sparseG[m]  # _get_value_and_idx(self.m_G[m].data)
        G_value_name = module_name + '_G_value'
        G_idx_name = module_name + '_G_idx'
        # h_value_G = hvd.allgather_async(G_values, G_value_name)
        # h_idx_G = hvd.allgather_async(G_indexes, G_idx_name)
        if G_values is not None and G_values.numel() > 0:
            h_value_G = hvd.allgather_async(G_values)
            h_idx_G = hvd.allgather_async(G_indexes)
            # carry the module with its handles so that layers skipped by the
            # `continue` above cannot misalign handles and modules
            handles.append((m, h_value, h_idx, h_value_G, h_idx_G))

    num_of_workers = hvd.size()

    def _decompress(values, indices, output):
        # assumes every worker contributed the same number of (value, index)
        # pairs; accumulate each worker's sparse update into the dense output
        numel = indices.numel()
        real_num_values = numel // num_of_workers
        for i in range(num_of_workers):
            tmp_values = values.data[i * real_num_values:(i + 1) * real_num_values]
            tmp_indices = indices.data[i * real_num_values:(i + 1) * real_num_values]
            output[tmp_indices] += tmp_values

    for module, h_value_A, h_idx_A, h_value_G, h_idx_G in handles:
        m_A = self.m_A[module].view(-1)
        m_A.fill_(0.0)
        m_G = self.m_G[module].view(-1)
        m_G.fill_(0.0)

        A_values = hvd.synchronize(h_value_A)
        A_indexes = hvd.synchronize(h_idx_A).long()
        _decompress(A_values, A_indexes, m_A)
        # print(A_indexes[0])
        # print(A_values[0])
        # m_A.scatter_add_(0, A_indexes, A_values)
        m_A.div_(hvd.size())

        G_values = hvd.synchronize(h_value_G)
        G_indexes = hvd.synchronize(h_idx_G).long()
        # print('G_I: ', G_indexes[0])
        # print('G_V: ', G_values[0])
        # m_G.scatter_add_(0, G_indexes, G_values)
        _decompress(G_values, G_indexes, m_G)
        m_G.div_(hvd.size())
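# Single-process sanity check of the sparse round trip: extracting
# (values, indexes) the way `_get_value_and_idx` does and replaying them
# through the accumulation used in `_decompress` (one worker) rebuilds the
# dense tensor exactly:
import torch

dense = torch.tensor([0.0, 1.5, 0.0, -2.0])
flat = dense.view(-1)
indexes = (flat != 0.0).nonzero().squeeze().view(-1)   # positions of nonzeros
values = flat[indexes]

out = torch.zeros_like(flat)
out[indexes] += values                                 # one worker's contribution
assert torch.equal(out, dense)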