def _allreduce_factors(self):
    """Allreduce the factors for all layers"""
    handles = []

    for m in self.modules:
        handles.append(hvd.allreduce_async_(self.m_A[m].data, op=hvd.Average))
        handles.append(hvd.allreduce_async_(self.m_G[m].data, op=hvd.Average))

    for handle in handles:
        hvd.synchronize(handle)
def allreduce_async_(self, name, tensor, op=hvd.Average):
    self.op = op
    if self.merge:
        if self.symmetric:
            # Communicate only the upper triangle of the symmetric factor.
            upper_indices = torch.triu_indices(tensor.shape[0], tensor.shape[0],
                                               device=tensor.device)
            comm_tensor = tensor[upper_indices[0], upper_indices[1]]
        else:
            comm_tensor = tensor
        if self.fp16:
            # Compress to fp16; keep the quantization error as a residual that
            # is added back before the next compression.
            if self.residual:
                if name not in self._residuals:
                    self._residuals[name] = comm_tensor.new_zeros(comm_tensor.shape)
                comm_tensor.add_(self._residuals[name])
            half_tensor = comm_tensor.half()
            if self.residual:
                self._residuals[name] = comm_tensor - half_tensor
            comm_tensor = half_tensor
        self._name_tensors[name] = (tensor, comm_tensor)
        new_name, new_tensor = self._tensor_group.push_tensor(name, comm_tensor)
        if new_tensor is not None:
            # A merged buffer is full; reduce it in a single call.
            current_stream = torch.cuda.current_stream()
            current_stream.synchronize()
            handle = hvd.allreduce_async_(new_tensor, op=hvd.Sum,
                                          name=self.prefix + new_name)
            self.handles.append(handle)
    else:
        if self.symmetric:
            upper_indices = torch.triu_indices(tensor.shape[0], tensor.shape[0],
                                               device=tensor.device)
            comm_tensor = tensor[upper_indices[0], upper_indices[1]]
        else:
            comm_tensor = tensor
        if self.fp16:
            if self.residual:
                if name not in self._residuals:
                    self._residuals[name] = comm_tensor.new_zeros(comm_tensor.shape)
                comm_tensor.add_(self._residuals[name])
            half_tensor = comm_tensor.half()
            if self.residual:
                self._residuals[name] = comm_tensor - half_tensor
            comm_tensor = half_tensor
        self._name_tensors[name] = (tensor, comm_tensor)
        handle = hvd.allreduce_async_(comm_tensor, op=hvd.Sum)
        self.handles.append(handle)
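# A hedged, standalone sketch of the symmetric packing used above and the
# matching unpacking a receiver would need once the allreduce has completed.
# The helper names (pack_symmetric / unpack_symmetric) are illustrative and
# are not part of the original class.
import torch

def pack_symmetric(factor):
    # keep only the upper triangle of a symmetric (n, n) factor
    n = factor.shape[0]
    rows, cols = torch.triu_indices(n, n, device=factor.device)
    return factor[rows, cols]

def unpack_symmetric(packed, n):
    # rebuild the full symmetric matrix from its packed upper triangle
    rows, cols = torch.triu_indices(n, n, device=packed.device)
    full = packed.new_zeros(n, n)
    full[rows, cols] = packed
    full[cols, rows] = packed  # mirror into the lower triangle
    return full

a = torch.randn(4, 4)
a = a @ a.t()  # symmetric test factor
assert torch.allclose(unpack_symmetric(pack_symmetric(a), 4), a)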
def _reduce_factors(self, eigen_ranks):
    """Allreduce the factors for all layers"""
    handles = []

    for m in self.modules:
        # name and the assigned eigen-decomposition ranks are looked up here
        # but not used in this allreduce path
        name = self.module_name_map[m]
        ranks_a, ranks_g = eigen_ranks[m]
        rank_a = ranks_a[0]
        rank_g = ranks_g[0]

        handles.append(hvd.allreduce_async_(self.m_A[m].data, op=hvd.Average))
        handles.append(hvd.allreduce_async_(self.m_G[m].data, op=hvd.Average))

    for handle in handles:
        hvd.synchronize(handle)
def _allreduce_eigendecomp(self):
    """Allreduce the eigendecompositions for all layers

    Note: we use `op=hvd.Sum` to simulate an allgather. Each rank will either
    compute the eigendecomposition for a factor or just return zeros, so we
    sum instead of averaging.
    """
    handles = []

    for m in self.modules:
        handles.append(hvd.allreduce_async_(self.m_QA[m].data, op=hvd.Sum))
        handles.append(hvd.allreduce_async_(self.m_QG[m].data, op=hvd.Sum))

    for handle in handles:
        hvd.synchronize(handle)
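# A minimal, hedged sketch of the Sum-as-allgather trick described in the
# docstring above: each rank fills in only the entries it owns and leaves the
# rest at zero, so a single Sum allreduce leaves every rank with the complete
# result. The buffer size and ownership rule below are illustrative only.
import horovod.torch as hvd
import torch

hvd.init()
num_entries = 8
result = torch.zeros(num_entries)

for idx in range(num_entries):
    if idx % hvd.size() == hvd.rank():
        # stand-in for an expensive local computation (e.g. an eigendecomposition)
        result[idx] = float(idx) ** 2

hvd.allreduce_(result, op=hvd.Sum)  # every rank now holds all entries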
def delayedupdate(self, val):
    # Consume the allreduce launched on the previous call (if any) before
    # launching a new one, so communication overlaps with the work done
    # between calls.
    if self.handle is None:
        self.sum += 0
    else:
        self.sum += hvd.synchronize(self.handle)
    self.handle = hvd.allreduce_async_(val.detach().cpu(), name=self.name)
    self.n += 1
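# A hedged, self-contained illustration of the one-step-delayed pattern used
# by delayedupdate above: the handle from the previous step is synchronized
# only right before the next allreduce is launched, so the reduction overlaps
# with whatever runs between calls. All names and values here are illustrative.
import horovod.torch as hvd
import torch

hvd.init()
handle, total = None, 0.0

for step in range(5):
    local_val = torch.tensor([float(step)])      # stand-in for a per-step metric
    if handle is not None:
        total += hvd.synchronize(handle).item()  # result of the previous step
    handle = hvd.allreduce_async_(local_val, name='delayed_metric')

total += hvd.synchronize(handle).item()          # drain the last outstanding handle
print('sum of averaged metrics:', total)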
def async_send(self, tensors_compressed, name):
    handles = []
    for i, tensor_compressed in enumerate(tensors_compressed):
        handles.append(
            allreduce_async_(tensor_compressed, self.compressor.average, name + str(i)))
    return handles
def allreduce_parameters(params):
    if isinstance(params, dict):
        params = sorted(params.items())
    elif isinstance(params, list):
        # support both named_parameters() and regular parameters()
        params = [p if isinstance(p, tuple) else (None, p) for p in params]
    else:
        raise ValueError('invalid params of type: %s' % type(params))

    # Run asynchronous allreduces.
    handles = []
    for name, p in params:
        handle = hvd.allreduce_async_(p, average=True, name=name)
        handles.append(handle)

    # Wait for completion.
    for handle in handles:
        hvd.synchronize(handle)
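# A small, hedged usage sketch for allreduce_parameters above: averaging a
# model's parameters across all Horovod workers, e.g. to periodically
# re-synchronize replicas. The model here is only a placeholder.
import horovod.torch as hvd
import torch

hvd.init()
model = torch.nn.Linear(10, 2)
allreduce_parameters(model.state_dict())  # every worker ends up with the averaged weights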
def maybe_allreduce_grads(model):
    if hvd.size() > 1:
        tstart_reduce = time.time()
        named_parameters = list(
            sorted(model.named_parameters(), key=lambda a: a[0]))
        grad_handles = []
        for name, p in named_parameters:
            if p.requires_grad:
                if p.grad is None:
                    p.grad = torch.zeros_like(p)
                with torch.no_grad():
                    grad_handles.append(hvd.allreduce_async_(p.grad, name=name))
        for handle in grad_handles:
            hvd.synchronize(handle)
        tlogger.record_tabular("TimeElapsedAllReduce", time.time() - tstart_reduce)
        if time.time() - tstart_reduce > 5:
            import socket
            tlogger.info(
                "Allreduce took more than 5 seconds for node {} (rank {})".format(
                    socket.gethostname(), hvd.rank()))
def get_eigen(model, inputs, targets, criterion, maxIter=50, tol=1e-3, comm=True):
    """Compute the top eigenvalue of the Hessian of the loss w.r.t. the model
    parameters, and the corresponding eigenvector, via power iteration.

    The model is switched to evaluation mode, otherwise the batch
    normalization layers would change. If you call this function during
    training, remember to switch the model back to training mode afterwards.
    """
    model.eval()

    outputs = model(inputs)
    loss = criterion(outputs, targets)
    loss.backward(create_graph=True)

    params, gradsH = get_params_grad(model)
    v = [torch.randn(p.size()) for p in params]
    v = normalization(v)
    if comm:
        # make sure every rank starts the power iteration from the same vector
        hvd.broadcast_parameters(v, root_rank=0)

    eigenvalue = None
    for it in range(maxIter):
        print(it)
        model.zero_grad()
        Hv = hessian_vector_product(gradsH, params, v)
        if comm:
            # average the Hessian-vector products across ranks
            handles = []
            for i in range(len(Hv)):
                handles.append(
                    hvd.allreduce_async_(
                        Hv[i], name='reduce random vector update {}'.format(i)))
            for handle in handles:
                hvd.synchronize(handle)
        eigenvalue_tmp = group_product(Hv, v).item()
        v = normalization(Hv)
        if eigenvalue is None:
            eigenvalue = eigenvalue_tmp
        else:
            if abs(eigenvalue - eigenvalue_tmp) < tol:
                if comm:
                    return eigenvalue_tmp, v
            else:
                eigenvalue = eigenvalue_tmp

    if not comm:
        # no communication during the iterations: average the per-rank
        # estimates once at the end
        print("{} is here".format(hvd.rank()))
        eigenvalue = torch.FloatTensor([eigenvalue])
        hvd.allreduce_(eigenvalue, name='eigenvalue')
        print("allreduced eigs for rank {}".format(hvd.rank()))
        eigenvalue = float(eigenvalue)
        if hvd.rank() == 0:
            print("No Communication eigenvalue approximated at {}".format(eigenvalue))
    return eigenvalue, v
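# Hedged sketches of two helpers that get_eigen relies on but that are not
# shown here (group_product and normalization). These are the usual
# power-iteration utilities; the actual implementations may differ.
import torch

def group_product(xs, ys):
    # inner product of two parameter lists, treated as one flattened vector
    return sum((x * y).sum() for x, y in zip(xs, ys))

def normalization(vs):
    # rescale the list of tensors so the concatenated vector has unit L2 norm
    norm = torch.sqrt(group_product(vs, vs)) + 1e-6
    return [v / norm for v in vs]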
# Benchmark: measure allreduce bandwidth for float32 buffers from 4 KB to 512 MB.
data_size = []
bwd = []
handles = {}

for i in range(18):
    size = 1024 * (2**i)
    p_g = torch.randn(size).cuda()
    torch.cuda.synchronize()
    begin_time = time.time()

    rept = 10
    if i < 10:
        rept = 10000  # repeat small messages more often for a stable timing

    for j in range(rept):
        handles[j] = hvd.allreduce_async_(p_g, average=False)
    for j in range(rept):
        synchronize(handles[j])
    torch.cuda.synchronize()
    end_time = time.time()

    if hvd.local_rank() == 0:
        print(size * 4 / 1024, "KB")
        bandwidth = size * 4 * rept / (end_time - begin_time) / 1e9  # GB/s
        print('allreduce bandwidth, ', bandwidth)
        data_size.append(size)
        bwd.append(bandwidth)

print(data_size)
print(bwd)
exit(0)