def _benchmark_communication(self):
    logger.info('Benchmarking communication performance...')
    comm_profiler = CommunicationProfiler(allreduce_async_, synchronize)
    sizes, times = comm_profiler.benchmark(num_iters=10)

    def _fit_linear_function(x, y):
        X = np.array(x).reshape((-1, 1)) * 4
        Y = np.array(y)
        model = LinearRegression()
        model.fit(X, Y)
        alpha = model.intercept_
        beta = model.coef_[0]
        return alpha, beta

    alpha, beta = _fit_linear_function(sizes, times)
    self.alpha = alpha
    self.beta = beta
    alpha_tensor = torch.ones(1) * alpha
    beta_tensor = torch.ones(1) * beta
    alpha_tensor = broadcast(alpha_tensor, root_rank=0)
    beta_tensor = broadcast(beta_tensor, root_rank=0)
    if rank() != 0:
        self.alpha = float(alpha_tensor[0])
        self.beta = float(beta_tensor[0])
    logger.info(
        '[rank:{}] Communication performance fitted with f(p)=a+b*p, where a={} and b={}'
        .format(rank(), self.alpha, self.beta))
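
# The fitted coefficients feed a linear cost model f(p) = alpha + beta * p
# (startup latency plus per-byte transfer time; sizes are multiplied by 4
# above to convert float elements to bytes). A minimal sketch of how the
# predictor used by the grouping code below could behave -- the real
# `utils.predict_allreduce_time_with_size` body is an assumption here:
def predict_allreduce_time_with_size_sketch(alpha, beta, size_bytes, num_workers):
    """Estimate allreduce wall time for a message of `size_bytes` bytes."""
    if size_bytes == 0:
        return 0.0
    return alpha + beta * size_bytes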
def _sparse_allreduce_async(self, p, name, density):
    stime = time.time()
    tensor = p.data.view(-1)
    tensor_compressed, ctx, selected_values = self._compression.compress(
        tensor, name, ratio=density)
    if False and rank() == 0 and self.train_iter % 200 == 0 and self.train_iter < 3000:
        grads = tensor.cpu().numpy()
        layer_idx = self._sequential_keys.index(name)
        np.save(
            '%s/r%d_gradients_iter_%d::%s::%d' %
            (self._gradient_path, rank(), self.train_iter, name, layer_idx),
            grads)
    indexes = ctx
    if indexes is None:
        handle = allgather_async(tensor_compressed, name)
        handle_idx = None  # quantization uses all indices
    else:
        handle = allgather_async(selected_values, name)
        handle_idx = allgather_async(indexes.int(), name + '_indexes')
    if self._profiling:
        utils.force_insert_item(self._compression_timers, name,
                                time.time() - stime)
    return (handle, handle_idx), ctx
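
# `compress` is assumed here to return (compressed_tensor, index_context,
# selected_values), where the context carries the selected indices for sparse
# methods and is None for quantizers. A minimal top-k sketch of that assumed
# contract (illustrative only, not the project's actual compressor):
class TopKCompressorSketch:
    def compress(self, tensor, name, ratio=0.01):
        # Keep the k largest-magnitude gradient entries.
        k = max(int(tensor.numel() * ratio), 1)
        _, indexes = torch.topk(tensor.abs(), k)
        values = tensor[indexes]
        return values, indexes, values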
def _print_profiling(self):
    if (self._profiling and rank() == 0 and len(self._allreduce_timers.keys()) > 0
            and self.train_iter % settings.DISPLAY == 0):
        # and len(self._allreduce_timers.get(list(self._allreduce_timers.keys())[0], [])) == settings.DISPLAY:
        cps = self._compression_timers  # compression times
        ars = self._allreduce_timers  # allreduce times
        ups = self._update_times  # update times
        r = rank()
        tcp = 0.0
        tar = 0.0
        tup = 0.0
        total = 0.0
        for k in ars:
            if len(cps) > 0:
                acp = np.mean(cps[k])
                tcp += acp
            aar = np.mean(ars[k])
            tar += aar
            aup = np.mean(ups[k])
            tup += aup
        total = tcp + tar + tup
        logger.info(
            '[%d]: Total compress: %f, allreduce: %f, update: %f, total: %f',
            r, tcp, tar, tup, total)
        # Ahmed - log micro-measurements of rank 0 to wandb
        if r == 0:
            self._tb.log('micro/compress_ms', tcp * 1000)
            self._tb.log('micro/comm_ms', tar * 1000)
            self._tb.log('micro/gradagg_ms', tup * 1000)
            self._tb.log('micro/total_ms', total * 1000)
        cps.clear()
        ars.clear()
        ups.clear()
def _print_profiling(self):
    # Note: list() around dict_keys is required in Python 3, where keys()
    # is not subscriptable.
    if (self._profiling and rank() == 0 and len(self._allreduce_timers.keys()) > 0
            and len(self._allreduce_timers.get(
                list(self._allreduce_timers.keys())[0], [])) == 40):
        cps = self._compression_timers  # compression times
        ars = self._allreduce_timers  # allreduce times
        ups = self._update_times  # update times
        r = rank()
        tcp = 0.0
        tar = 0.0
        tup = 0.0
        total = 0.0
        for k in cps:
            acp = np.mean(cps[k])
            tcp += acp
            aar = np.mean(ars[k])
            tar += aar
            aup = np.mean(ups[k])
            tup += aup
            # logger.info('[%d][%s]: %f, %f, %f', r, k, acp, aar, aup)
        total = tcp + tar + tup
        cps.clear()
        ars.clear()
        ups.clear()
def _allreduce_grad_async(self, p, name):
    tensor = p.data.view(-1)
    tensor_compressed, ctx = tensor, None  # self._compression.compress(tensor, name)
    if settings.LOGGING_GRADIENTS and rank() == 0:
        grads = tensor.cpu().numpy()
        np.save('%s/r%d_gradients_iter_%d' %
                (self._gradient_path, rank(), self.train_iter), grads)
    handle = allreduce_async_(tensor_compressed, average=True, name=name)
    return handle, ctx
def __init__(self, model, hvd_opt, num_steps=10**6):
    """Construct a new ScheduledOptimizer, which uses a Horovod optimizer
    under the hood for averaging gradients across all Horovod ranks.

    Args:
        model: The training model. ByteScheduler uses the model object to register hooks.
        hvd_opt: Optimizer to use for averaging gradients and applying updates.
        num_steps: The maximum number of training steps. ByteScheduler needs to
            know when to stop cross-iteration scheduling.

    Usage example:
    ```
    import bytescheduler.pytorch.horovod as bsc
    bsc.init()
    optimizer = hvd.DistributedOptimizer(optimizer, named_parameters, compression)
    optimizer = bsc.ScheduledOptimizer(model, optimizer, num_steps)
    ```
    """
    self._model = model
    self._opt = hvd_opt
    self._logger = logging.getLogger("ByteScheduler")
    self._logger.debug("hvd size {}, rank {}".format(size(), rank()))
    self._desc = "rank {}".format(rank())

    # Track training steps.
    self._step = 0
    self._final_step = num_steps

    # Use a lock to block the forward propagation of each parameter.
    self._locks = {}
    for param_group in self.param_groups:
        for p in param_group['params']:
            self._locks[p] = threading.Lock()

    # The closer to the input layer, the higher the priority.
    self._priority_indexes = {}
    priority = 0
    for p in model.parameters():
        self._priority_indexes[p] = priority
        priority += 1

    assert len(self._grad_accs) == 0
    if size() > 1:
        self._register_forward_hooks()
        self._register_hooks()

    # Poll whether a tensor is ready for allreduce or whether the allreduce has finished.
    self.event_queue = queue.Queue()
    self._poller = threading.Thread(target=self._poll, args=())
    self._poller.start()

    # Let rank 0 decide the communication order.
    self._immediate = False
    self._rank = rank()
    if self._rank != 0:
        self._immediate = True
    core.start(rank=self._rank, arch="allreduce")
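
# `_poll` is launched above but its body is not part of this excerpt. A
# minimal sketch of what such a poller loop might do -- drain
# (parameter, handle, ctx) events and release the per-parameter lock once the
# allreduce has finished. The event layout and the use of Horovod's
# `poll`/`synchronize` here are assumptions, not ByteScheduler's actual code:
def _poll_sketch(self):
    while True:
        p, handle, ctx = self.event_queue.get()
        if p is None:  # shutdown sentinel
            break
        if handle is not None and poll(handle):
            synchronize(handle)        # gradient for p is now averaged in place
            self._locks[p].release()   # unblock p's next forward pass
        else:
            self.event_queue.put((p, handle, ctx))  # not ready yet; retry later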
def _allreduce_grad_async(self, p, name):
    tensor = p.data.view(-1)
    stime = time.time()
    # print("Rank: %s Original Values: %s" % (rank(), tensor))
    tensor, ctx, selected_tensors = self._compression.compress(tensor, name)  # was: tensor, None
    # logger.info("Compression Time: %s" % (time.time() - stime))
    if settings.LOGGING_GRADIENTS and rank() == 0:
        grads = tensor.cpu().numpy()
        np.save('%s/r%d_gradients_iter_%d' %
                (self._gradient_path, rank(), self.train_iter), grads)
    # print("Rank: %s Selected Values: %s" % (rank(), selected_tensors))
    handle = allreduce_async_(selected_tensors, average=True, name=name)  # (tensor_compressed, average=True, name=name)
    return handle, None
def _allreduce_grad_async(self, p, name):
    tensor = p.data.view(-1)
    if False and rank() == 0 and self.train_iter % 200 == 0 and self.train_iter < 3000:
        grads = tensor.cpu().numpy()
        layer_idx = self._sequential_keys.index(name)
        np.save(
            '%s/r%d_gradients_iter_%d::%s::%d' %
            (self._gradient_path, rank(), self.train_iter, name, layer_idx),
            grads)
    allreduce_name = name
    if len(name) > 200:
        allreduce_name = name[0:100] + '...' + name[-100:]
    handle = allreduce_async_(tensor, average=True, name=allreduce_name)
    return handle, None
def _sparse_allreduce_async(self, p, name, density):
    stime = time.time()
    tensor = p.data.view(-1)
    tensor_compressed, ctx, selected_values = self._compression.compress(
        tensor, name, ratio=density)
    self._selected_num_gradients.append(int(ctx.numel()))
    if settings.LOGGING_GRADIENTS and rank() == 0:
        grads = tensor.cpu().numpy()
        np.save('%s/r%d_gradients_iter_%d' %
                (self._gradient_path, rank(), self.train_iter), grads)
    indexes = ctx
    handle = allgather_async(selected_values, name)
    handle_idx = allgather_async(indexes.int(), name + '_indexes')
    if self._profiling:
        utils.force_insert_item(self._compression_timers, name,
                                time.time() - stime)
    return (handle, handle_idx), ctx
def _benchmark_communication(self):
    # logger.info('Benchmarking communication performance...')
    comm_profiler = CommunicationProfiler(allreduce_async_, synchronize)
    sizes, times = comm_profiler.benchmark(num_iters=10)

    def _fit_linear_function(x, y):
        X = np.array(x).reshape((-1, 1)) * 4
        Y = np.array(y)
        model = LinearRegression()
        model.fit(X, Y)
        alpha = model.intercept_
        beta = model.coef_[0]
        # A = np.vstack([X, np.ones(len(X))]).T
        # beta, alpha = np.linalg.lstsq(A, Y, rcond=None)[0]
        return alpha, beta

    alpha, beta = _fit_linear_function(sizes, times)
    self.alpha = alpha
    self.beta = beta
    alpha_tensor = torch.ones(1) * alpha
    beta_tensor = torch.ones(1) * beta
    alpha_tensor = broadcast(alpha_tensor, root_rank=0)
    beta_tensor = broadcast(beta_tensor, root_rank=0)
    if rank() != 0:
        self.alpha = float(alpha_tensor[0])
        self.beta = float(beta_tensor[0])
def broadcast_object(obj, root_rank=0, name=None, process_set=global_process_set):
    """
    Serializes and broadcasts an object from root rank to all other processes.
    Typical usage is to broadcast the `optimizer.state_dict()`, for example:

    .. code-block:: python

        state_dict = broadcast_object(optimizer.state_dict(), 0)
        if hvd.rank() > 0:
            optimizer.load_state_dict(state_dict)

    Arguments:
        obj: An object capable of being serialized without losing any context.
        root_rank: The rank of the process from which parameters will be
            broadcasted to all other processes.
        name: Optional name to use during broadcast; defaults to the class type.
        process_set: Process set object to limit this operation to a subset of
            Horovod processes. Default is the global process set.

    Returns:
        The object that was broadcast from the `root_rank`.
    """
    if name is None:
        name = type(obj).__name__
    if rank() == root_rank:
        b = io.BytesIO()
        cloudpickle.dump(obj, b)
        t = torch.ByteTensor(bytearray(b.getvalue()))
        sz = torch.IntTensor([t.shape[0]])
        broadcast_(sz, root_rank, name + '.sz', process_set)
    else:
        sz = torch.IntTensor([0])
        broadcast_(sz, root_rank, name + '.sz', process_set)
        t = torch.ByteTensor(sz.tolist()[0])
    broadcast_(t, root_rank, name + '.t', process_set)
    if rank() != root_rank:
        buf = io.BytesIO(t.numpy().tobytes())
        obj = cloudpickle.load(buf)
    return obj
def increase_one_epoch(self):
    self.train_epoch += 1
    if rank() == 0:
        density = self.get_current_density()
        size = np.sum(self._sizes)
        k = max(int(size * density), 1)
        logger.info('Average number of selected gradients: %f, exact k: %d',
                    np.mean(self._selected_num_gradients), k)
        logger.info('The number of selected gradients: %s',
                    self._selected_num_gradients)
        self._selected_num_gradients = []
def reset(self):
    self.num_replicas = size()
    self.rank = rank()

    # Exclude any samples we have already processed this epoch.
    self.remaining_indices = [idx for idx in range(len(self.dataset))
                              if idx not in self.processed_indices]

    self.num_samples = int(math.ceil(len(self.remaining_indices) * 1.0 / self.num_replicas))
    self.total_size = self.num_samples * self.num_replicas
def _print_profiling(self):
    # Note: list() around dict_keys is required in Python 3, where keys()
    # is not subscriptable.
    if (self._profiling and rank() == 0 and len(self._allreduce_timers.keys()) > 0
            and len(self._allreduce_timers.get(
                list(self._allreduce_timers.keys())[0], [])) == 40):
        cps = self._compression_timers  # compression times
        ars = self._allreduce_timers  # allreduce times
        ups = self._update_times  # update times
        r = rank()
        tcp = 0.0
        tar = 0.0
        tup = 0.0
        total = 0.0
        for k in cps:
            acp = np.mean(cps[k])
            tcp += acp
            aar = np.mean(ars[k])
            tar += aar
            aup = np.mean(ups[k])
            tup += aup
        total = tcp + tar + tup
        logger.info('[%d]: Total compress: %f, allreduce: %f, update: %f, total: %f',
                    r, tcp, tar, tup, total)
        cps.clear()
        ars.clear()
        ups.clear()
def reset(self):
    self.num_replicas = size()
    self.rank = rank()

    # Exclude any samples we have already processed this epoch.
    all_indices = [idx for idx in range(len(self.dataset))]
    if self.shuffle:
        # Shuffle indices across workers deterministically in place.
        seed = self.seed + self.epoch
        random.Random(seed).shuffle(all_indices)
    self.remaining_indices = all_indices[self.processed_num:]

    self.num_samples = int(
        math.ceil(len(self.remaining_indices) * 1.0 / self.num_replicas))
    self.total_size = self.num_samples * self.num_replicas
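
# The matching `__iter__` is not shown in this excerpt. A minimal sketch of
# how an elastic sampler typically partitions `remaining_indices` across
# ranks -- pad to `total_size`, then stride by rank. This is an assumption
# modeled on torch.utils.data.distributed.DistributedSampler, not this
# class's actual code:
def __iter_sketch__(self):
    indices = list(self.remaining_indices)
    # Pad so every replica draws the same number of samples.
    indices += indices[:(self.total_size - len(indices))]
    assert len(indices) == self.total_size
    # Each rank takes every num_replicas-th index, starting at its own rank.
    return iter(indices[self.rank:self.total_size:self.num_replicas])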
def _init_logging():
    class MyLogger:
        def __init__(self, logpath):
            self.log_file = open(logpath, 'w+')

        def debug(self, msg):
            self.log_file.write(msg + '\n')

        def __del__(self):
            self.log_file.close()

    logdir = os.path.expanduser("~/horovod_logs/hooks")
    if not os.path.exists(logdir):
        os.makedirs(logdir)
    dt = datetime.fromtimestamp(time.time())
    timestamp = dt.strftime("%Y%m%d-%H%M%S")
    logging_file = os.path.join(logdir, "hook-{}-rank{}.log".format(timestamp, rank()))
    print(logging_file)
    logger = MyLogger(logging_file)
    return logger
def _generate_groups_mgwfbp(self):
    num_of_workers = size()
    p_alpha_beta_56Gbps = {
        64: (0.00080632079996292579, 1.8 * 3.2713239529771973e-10),
        32: (0.00040632079996292579, 1.5 * 3.2713239529771973e-10),
        16: (0.00023583677659915685 * 3, 4.0594787739537565e-10),
        8: (9.75367204301171e-05, 3.0568230536676206e-10),
        4: (4.204298980348825e-05, 2.0589360830118177e-10),
        2: (2.554691138304671e-06, 9.837548167872609e-11)
    }
    p_alpha_beta_10Gbps = {
        64: (0.0023476410788581382 * 3, 9.643300782166769e-10),
        32: (0.0013476410788581382 * 3, 8.643300782166769e-10),
        16: (0.0009080981007148093, 7.395651186836712e-10),
        8: (0.0005230272768511732, 8.570746975492128e-10),
        4: (4.204298980348825e-05, 2.0589360830118177e-10),
        2: (2.554691138304671e-06, 9.837548167872609e-11)
    }
    if self.alpha is not None:
        alpha, beta = self.alpha, self.beta
    else:
        if self._rdma:
            alpha, beta = p_alpha_beta_56Gbps[num_of_workers]
        else:
            alpha, beta = p_alpha_beta_10Gbps[num_of_workers]
    nbytes = 2 if self._fp16 else 4

    def __calculate_comm_start(tc, tb, taob, L):
        taoc = [0] * L
        taoc[L - 1] = taob[L - 1] + tb[L - 1]
        for l in range(L - 1)[::-1]:
            taoc[l] = max(taoc[l + 1] + tc[l + 1], taob[l] + tb[l])
        return taoc

    def __merge(taob, tc, p, l):
        tc[l] = 0
        p[l - 1] = p[l - 1] + p[l]
        p[l] = 0
        if self.size_commtime_dict is not None:
            tc[l - 1] = self.size_commtime_dict[l - 1]
        else:
            tc[l - 1] = utils.predict_allreduce_time_with_size(
                alpha, beta, p[l - 1] * nbytes, num_of_workers)

    sizes = [self._named_parameters[k].data.numel() for k in self._seq_layernames]
    seq_layernames = self._seq_layernames
    if not utils.check_unique(seq_layernames):
        raise ValueError
    self._sizes = sizes
    p = sizes[:]
    L = len(sizes)
    if self.size_commtime_dict is not None:
        tc = [self.size_commtime_dict[s] for s in sizes]
    else:
        tc = [utils.predict_allreduce_time_with_size(alpha, beta, s * nbytes, num_of_workers)
              for s in sizes]
    tb = list(self._layerwise_times)
    taob = [0] * L
    for l in range(0, L - 1)[::-1]:
        taob[l] = taob[l + 1] + tb[l + 1]
    taoc = __calculate_comm_start(tc, tb, taob, L)
    if rank() == 0:
        # logger.debug('seq_layernames: %s', seq_layernames)
        # logger.debug('tb: %s', tb)
        # logger.debug('taob: %s', taob)
        # logger.debug('sizes: %s', p)
        # logger.warn('tc sum: %f', np.sum(tc))
        # logger.warn('tc: %s', tc)
        # logger.warn('taoc: %s', taoc)
        pass
    groups = []
    group = []
    idx = 0
    key_groupidx_maps = {}
    l = L - 1
    key = seq_layernames[l]
    key_groupidx_maps[key] = idx
    for l in range(1, L)[::-1]:
        key = seq_layernames[l]
        group.append(key)
        key_groupidx_maps[key] = idx
        current_taob = taob[l - 1] + tb[l - 1]
        merged = False
        if current_taob < taoc[l] + tc[l]:
            if taoc[l] > current_taob:
                __merge(taob, tc, p, l)
                taoc = __calculate_comm_start(tc, tb, taob, L)
                merged = True
            else:
                t_wait = current_taob - taoc[l]
                t_saved = alpha
                if t_wait < t_saved:
                    __merge(taob, tc, p, l)
                    taoc = __calculate_comm_start(tc, tb, taob, L)
                    merged = True
        # if not merged and (key.find('bn') >= 0 or key.find('bias') >= 0):
        if not merged and p[l] < 8192:
            __merge(taob, tc, p, l)
            taoc = __calculate_comm_start(tc, tb, taob, L)
            merged = True
        if not merged:
            idx += 1
            groups.append(group)
            group = []
        # elif current_taob > taoc[l+1]+tc[l+1] and current_taob < taoc[l]+tc[l] and taoc[l]+alpha > current_taob:
        #     __merge(taob, tc, p, l)
        #     taoc = __calculate_comm_start(tc, tb, taob, L)
        # else:
        #     idx += 1
        #     groups.append(group)
        #     group = []
    l = 0
    key = seq_layernames[l]
    key_groupidx_maps[key] = idx
    group.append(key)
    if len(group) > 0:
        groups.append(group)
    if rank() == 0:
        # logger.info('Merged tc sum: %f', np.sum(tc))
        print('Merged sizes: ', p[::-1])
        print('# of parameters: ', np.sum(p[::-1]))
        # logger.info('Merged tb: %s', tb[::-1])
        # logger.info('Merged taob: %s', taob[::-1])
        # logger.info('Merged tc: %s', tc[::-1])
        # logger.info('Merged taoc: %s', taoc[::-1])
    return groups, key_groupidx_maps
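
# A toy, self-contained walk-through of the merge rule above with
# hypothetical numbers (three layers; backward runs from layer 2 down to
# layer 0). Merging a layer's allreduce into its predecessor saves one
# startup latency `alpha` at the cost of starting the combined message later:
def _merge_decision_demo():
    alpha, beta = 1e-4, 3e-10                  # fitted startup / per-byte cost
    sizes = [2_000_000, 4_096, 1_000_000]      # elements per layer, input to output
    nbytes = 4
    tc = [alpha + beta * s * nbytes for s in sizes]  # predicted comm time per layer
    tb = [5e-3, 1e-4, 4e-3]                    # measured backward time per layer
    taob = [tb[1] + tb[2], tb[2], 0.0]         # backward start times: taob[l] = taob[l+1] + tb[l+1]
    # Layer 1 has fewer than 8192 elements, so the code above always merges
    # it into layer 0, saving one alpha of startup cost:
    separate = tc[0] + tc[1]
    merged = alpha + beta * (sizes[0] + sizes[1]) * nbytes
    print('separate: %.6f s, merged: %.6f s (saved ~ alpha)' % (separate, merged))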
def _generate_groups_mgwfbp(self):
    num_of_workers = size()
    p_alpha_beta_56Gbps = {
        16: (0.00023583677659915685, 4.0594787739537565e-10),
        8: (9.75367204301171e-05, 3.0568230536676206e-10),
        4: (4.204298980348825e-05, 2.0589360830118177e-10),
        2: (2.554691138304671e-06, 9.837548167872609e-11)
    }
    p_alpha_beta_10Gbps = {
        16: (0.0009080981007148093, 7.395651186836712e-10),
        8: (0.0005230272768511732, 8.570746975492128e-10),
        4: (4.204298980348825e-05, 2.0589360830118177e-10),
        2: (2.554691138304671e-06, 9.837548167872609e-11)
    }
    if self.alpha is not None:
        alpha, beta = self.alpha, self.beta
    else:
        if settings.CONNECTION == '10GbE':
            alpha, beta = p_alpha_beta_10Gbps[num_of_workers]
        else:
            alpha, beta = p_alpha_beta_56Gbps[num_of_workers]
    nbytes = 2 if settings.FP16 else 4

    def __calculate_comm_start(tc, tb, taob, L):
        taoc = [0] * L
        taoc[L - 1] = taob[L - 1] + tb[L - 1]
        for l in range(L - 1)[::-1]:
            taoc[l] = max(taoc[l + 1] + tc[l + 1], taob[l] + tb[l])
        return taoc

    def __merge(taob, tc, p, l):
        tc[l] = 0
        p[l - 1] = p[l - 1] + p[l]
        p[l] = 0
        if self.size_commtime_dict is not None:
            tc[l - 1] = self.size_commtime_dict[l - 1]
        else:
            tc[l - 1] = utils.predict_allreduce_time_with_size(
                alpha, beta, p[l - 1] * nbytes, num_of_workers)

    sizes = [self._named_parameters[k].data.numel() for k in self._seq_layernames]
    seq_layernames = self._seq_layernames
    if not utils.check_unique(seq_layernames):
        raise ValueError
    self._sizes = sizes
    p = sizes[:]
    L = len(sizes)
    if self.size_commtime_dict is not None:
        tc = [self.size_commtime_dict[s] for s in sizes]
    else:
        tc = [utils.predict_allreduce_time_with_size(alpha, beta, s * nbytes, num_of_workers)
              for s in sizes]
    tb = list(self._layerwise_times)
    taob = [0] * L
    for l in range(0, L - 1)[::-1]:
        taob[l] = taob[l + 1] + tb[l + 1]
    taoc = __calculate_comm_start(tc, tb, taob, L)
    if rank() == 0:
        logger.info('tc sum: %f', np.sum(tc))
    groups = []
    group = []
    idx = 0
    key_groupidx_maps = {}
    l = L - 1
    key = seq_layernames[l]
    key_groupidx_maps[key] = idx
    for l in range(1, L)[::-1]:
        key = seq_layernames[l]
        group.append(key)
        key_groupidx_maps[key] = idx
        current_taob = taob[l - 1] + tb[l - 1]
        merged = False
        if current_taob < taoc[l] + tc[l]:
            if taoc[l] > current_taob:
                __merge(taob, tc, p, l)
                taoc = __calculate_comm_start(tc, tb, taob, L)
                merged = True
            else:
                t_wait = current_taob - taoc[l]
                t_saved = alpha
                if t_wait < t_saved:
                    __merge(taob, tc, p, l)
                    taoc = __calculate_comm_start(tc, tb, taob, L)
                    merged = True
        if not merged:
            idx += 1
            groups.append(group)
            group = []
    l = 0
    key = seq_layernames[l]
    key_groupidx_maps[key] = idx
    group.append(key)
    if len(group) > 0:
        groups.append(group)
    if rank() == 0:
        logger.info('Predicted non-overlapped time: %f',
                    taoc[0] + tc[0] - (taob[0] + tb[0]))
        logger.info('Predicted tb+tc= %f', taoc[0] + tc[0])
        logger.info('Merged tc sum: %f', np.sum(tc))
    return groups, key_groupidx_maps
def synchronize(self):
    global SPEED
    num_of_workers = size()
    ratio = 0
    i = 0
    for p, value in self._handles.items():
        name = self._merged_parameter_names.get(p)
        handle, ctx, density = value
        if self._sparse and density < 1:
            stime = time.time()
            handle_idx = None
            all_indexes = None
            if type(handle) is tuple:
                handle, handle_idx = handle[0], handle[1]
            output = synchronize(handle)
            if handle_idx is not None:
                all_indexes = synchronize(handle_idx)
            if self._profiling:
                utils.force_insert_item(self._allreduce_timers, name, time.time() - stime)
            stime = time.time()
            new_grad = p.data.view(-1)
            dectx = output, all_indexes, num_of_workers
            new_grad = self._compression.decompress(new_grad, dectx)
            if self._profiling:
                utils.force_insert_item(self._update_times, name, time.time() - stime)
        elif density == 1:
            stime = time.time()
            output = synchronize(handle)
            if self._profiling:
                utils.force_insert_item(self._allreduce_timers, name, time.time() - stime)
            stime = time.time()
            if self._norm_clip is not None:
                norm_clip = np.sqrt(1.0 / size()) * self._norm_clip
                norm_type = 2.0
                param_norm = output.norm(norm_type)
                total_norm = param_norm.item()
                clip_coef = norm_clip / (total_norm + 1e-6)
                if clip_coef < 1:
                    output.mul_(clip_coef)
            p.set_(output)
            if self._profiling:
                utils.force_insert_item(self._update_times, name, time.time() - stime)
        elif density > 1:
            # allgather instead of allreduce of the sparse tensor
            stime = time.time()
            output = synchronize(handle)
            if self._profiling:
                utils.force_insert_item(self._allreduce_timers, name, time.time() - stime)
            stime = time.time()
            new_grad = p.data.view(-1)
            new_grad.fill_(0.0)
            numel = output.size(0)
            real_num_values = numel // num_of_workers
            for i in range(num_of_workers):
                values = output.data[i * real_num_values:(i + 1) * real_num_values]
                new_grad += values
            new_grad /= num_of_workers
            if self._norm_clip is not None:
                norm_clip = np.sqrt(1.0 / size()) * self._norm_clip
                norm_type = 2.0
                param_norm = new_grad.norm(norm_type)
                total_norm = param_norm.item()
                clip_coef = norm_clip / (total_norm + 1e-6)
                if clip_coef < 1:
                    new_grad.mul_(clip_coef)
            p.set_(new_grad)
            if self._profiling:
                utils.force_insert_item(self._update_times, name, time.time() - stime)

        # Ahmed - track the number of elements
        if ctx is not None:
            ratio += ctx.numel() / p.data.numel()
        else:
            ratio += 1
        self._avg_ratio += ratio
        self._num_avg_sample += 1
        if density < 1:
            # Volume for allgather with compression (data + indexes).
            # TODO: should multiply by (1 - 1/num_of_workers) to remove the local node's portion.
            self._sum_volume += output.numel() * output.element_size() \
                + all_indexes.numel() * all_indexes.element_size()
        elif density == 1:
            # Volume for allreduce without compression.
            self._sum_volume += 2 * output.numel() * output.element_size()
        elif density == 2:
            # Volume for allgather without compression (data only).
            # TODO: should multiply by (1 - 1/num_of_workers) to remove the local node's portion.
            self._sum_volume += output.numel() * output.element_size()
        self._num_vol_sample += 1

    if rank() == 0 and self.train_iter % settings.DISPLAY == 0:
        self._tb.log('datavol/cum_vol_bytes', self._sum_volume)
        self._tb.log('datavol/avg_vol_bytes', self._sum_volume / self._num_vol_sample)
        if self._compression is not compressors['none']:  # and ratio > 0:
            # target_k = (self.model_elemnum * density)
            self._tb.log('compress/comp_ratio', ratio)
            self._tb.log('compress/est_compratio', ratio / density)
            self._tb.log('compress/avg_est_compratio',
                         (1.0 * self._avg_ratio / self._num_avg_sample) / density)
            if self.stages < 0:
                self._tb.log('compress/num_stages', self._compression.cur_stages)
            else:
                self._tb.log('compress/num_stages', self.stages)
            if self.stages == 0:
                self._tb.log('compress/first_ratio', self.iratio)
            else:
                self._tb.log('compress/first_ratio', self._compression.first_ratio)
    self._num_sample = 0
    self._sum_elems = 0

    if len(self._groups) != len(self._sequential_keys):
        for merged_p, value in self._handles.items():
            new_name = self._merged_parameter_names.get(merged_p)
            tensors = self._pull_from_buffer(new_name, merged_p)
            for n in tensors:
                p = self._named_parameters.get(n)
                p.grad.set_(tensors[n].data.type(p.grad.type()))
    self.train_iter += 1
    self._handles.clear()
    self._print_profiling()
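
# `decompress` above receives dectx = (allgathered values, allgathered
# indexes, num_of_workers). A minimal sketch of what a matching sparse
# decompressor could do -- scatter-add each worker's chunk, then average.
# This is an assumed implementation, not the project's actual decompress:
def decompress_sketch(new_grad, dectx):
    values, indexes, num_workers = dectx
    new_grad.fill_(0.0)
    chunk = values.numel() // num_workers  # each worker contributed one chunk
    for w in range(num_workers):
        idx = indexes[w * chunk:(w + 1) * chunk].long()
        new_grad[idx] += values[w * chunk:(w + 1) * chunk]
    return new_grad / num_workers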
def _generate_groups_mgwfbp(self):
    num_of_workers = size()
    alpha = 9.618801111215886e-08
    beta = 3.89407453e-13

    def __calculate_comm_start(tc, tb, taob, L):
        taoc = [0] * L
        taoc[L - 1] = taob[L - 1] + tb[L - 1]
        for l in range(L - 1)[::-1]:
            # Fixed: the earliest start is bounded by the previous comm's end
            # (taoc[l+1] + tc[l+1]), not tc[l+1] + tc[l+1], matching the
            # other copies of this function.
            taoc[l] = max(taoc[l + 1] + tc[l + 1], taob[l] + tb[l])
        return taoc

    def __merge(taob, tc, p, l):
        tc[l] = 0
        p[l - 1] = p[l - 1] + p[l]
        p[l] = 0
        tc[l - 1] = utils.predict_allreduce_time_with_size(
            alpha, beta, p[l - 1] * 4, num_of_workers)

    sizes = [self._named_parameters[k].data.numel()
             for k in self._seq_layernames][::-1]
    seq_layernames = self._seq_layernames[::-1]
    self._sizes = sizes
    p = sizes[:]
    L = len(sizes)
    tc = [utils.predict_allreduce_time_with_size(alpha, beta, s * 4, num_of_workers)
          for s in sizes]
    tb = list(self._layerwise_times[::-1])
    taob = [0]
    for t in tb[:-1]:
        taob.append(t + taob[-1])
    taob = taob[::-1]
    taoc = __calculate_comm_start(tc, tb, taob, L)
    if rank() == 0 and DEBUG:
        logger.debug('seq_layernames: %s', seq_layernames)
        logger.debug('tb: %s', tb)
        logger.debug('taob: %s', taob)
        logger.debug('sizes: %s', p)
        logger.debug('tc: %s', tc)
        logger.debug('taoc: %s', taoc)
    groups = []
    group = []
    idx = 0
    key_groupidx_maps = {}
    l = L - 1
    key = seq_layernames[l]
    key_groupidx_maps[key] = idx
    group.append(key)
    for l in range(1, L - 1)[::-1]:
        key = seq_layernames[l]
        group.append(key)
        key_groupidx_maps[key] = idx
        current_taob = taob[l - 2] if l >= 2 else taob[0]
        if current_taob - taoc[l] < alpha:
            __merge(taob, tc, p, l)
            taoc = __calculate_comm_start(tc, tb, taob, L)
        else:
            idx += 1
            groups.append(group)
            group = []
    l = 0
    key = seq_layernames[l]
    key_groupidx_maps[key] = idx
    group.append(key)
    if len(group) > 0:
        groups.append(group)
    return groups, key_groupidx_maps
def _generate_groups_mgwfbp(self):
    num_of_workers = size()
    p_alpha_beta = {
        16: (0.00010632079996292579, 1.5 * 3.2713239529771973e-10),
        8: (9.75367204301171e-05, 3.0568230536676206e-10),
        4: (4.204298980348825e-05, 2.0589360830118177e-10),
        2: (2.554691138304671e-06, 9.837548167872609e-11)
    }
    alpha, beta = p_alpha_beta[num_of_workers]

    def __calculate_comm_start(tc, tb, taob, L):
        taoc = [0] * L
        taoc[L - 1] = taob[L - 1] + tb[L - 1]
        for l in range(L - 1)[::-1]:
            taoc[l] = max(taoc[l + 1] + tc[l + 1], taob[l] + tb[l])
        return taoc

    def __merge(taob, tc, p, l):
        tc[l] = 0
        p[l - 1] = p[l - 1] + p[l]
        p[l] = 0
        tc[l - 1] = utils.predict_allreduce_time_with_size(
            alpha, beta, p[l - 1] * 4, num_of_workers)

    sizes = [self._named_parameters[k].data.numel() for k in self._seq_layernames]
    seq_layernames = self._seq_layernames
    self._sizes = sizes
    p = sizes[:]
    L = len(sizes)
    tc = [utils.predict_allreduce_time_with_size(alpha, beta, s * 4, num_of_workers)
          for s in sizes]
    tb = list(self._layerwise_times)
    taob = [0] * L
    for l in range(0, L - 1)[::-1]:
        taob[l] = taob[l + 1] + tb[l + 1]
    taoc = __calculate_comm_start(tc, tb, taob, L)
    if rank() == 0:
        logger.warn('tc sum: %f', np.sum(tc))
        logger.warn('tc: %s', tc)
        logger.warn('taoc: %s', taoc)
    groups = []
    group = []
    idx = 0
    key_groupidx_maps = {}
    l = L - 1
    key = seq_layernames[l]
    key_groupidx_maps[key] = idx
    group.append(key)
    for l in range(1, L - 1)[::-1]:
        key = seq_layernames[l]
        group.append(key)
        key_groupidx_maps[key] = idx
        current_taob = taob[l - 1] + tb[l - 1]
        if current_taob < taoc[l + 1] + tc[l + 1]:
            __merge(taob, tc, p, l)
            taoc = __calculate_comm_start(tc, tb, taob, L)
        elif current_taob > taoc[l + 1] + tc[l + 1] and \
                current_taob < taoc[l] + tc[l] and \
                taoc[l] + alpha > current_taob:
            __merge(taob, tc, p, l)
            taoc = __calculate_comm_start(tc, tb, taob, L)
        else:
            idx += 1
            groups.append(group)
            group = []
    l = 0
    key = seq_layernames[l]
    key_groupidx_maps[key] = idx
    group.append(key)
    logger.info('Predicted non-overlapped time: %f',
                taoc[0] + tc[0] - (taob[0] + tb[0]))
    logger.info('Predicted tb+tc= %f', taoc[0] + tc[0])
    if len(group) > 0:
        groups.append(group)
    return groups, key_groupidx_maps
def synchronize(self):
    num_of_workers = size()
    for p, value in self._handles.items():
        name = self._merged_parameter_names.get(p)
        handle, ctx, density = value
        if self._sparse and density < 1:
            stime = time.time()
            handle_idx = None
            all_indexes = None
            if type(handle) is tuple:
                handle, handle_idx = handle[0], handle[1]
            output = synchronize(handle)
            if handle_idx is not None:
                all_indexes = synchronize(handle_idx)
            if self._profiling:
                utils.force_insert_item(self._allreduce_timers, name, time.time() - stime)
            stime = time.time()
            new_grad = p.data.view(-1)
            new_grad.fill_(0.0)
            numel = output.size(0)
            real_num_values = numel // num_of_workers
            for i in range(num_of_workers):
                values_and_indexes = output.data[i * real_num_values:(i + 1) * real_num_values]
                if all_indexes is None:
                    # Values and indexes are packed into one tensor: values
                    # in the first half, indexes in the second half.
                    values = values_and_indexes[0:real_num_values // 2]
                    indexes = values_and_indexes[real_num_values // 2:].long()
                else:
                    values = values_and_indexes
                    indexes = all_indexes.data[i * real_num_values:(i + 1) * real_num_values].long()
                new_grad[indexes[0:indexes.numel() // 2]] += values[0:indexes.numel() // 2]
                new_grad[indexes[indexes.numel() // 2:]] += values[indexes.numel() // 2:]
            new_grad /= num_of_workers
            if self._profiling:
                utils.force_insert_item(self._update_times, name, time.time() - stime)
        else:
            stime = time.time()
            output = synchronize(handle)
            print("Rank: %s Mean after allreduce: %s" % (rank(), output))
            stime = time.time()
            output = self._compression.decompress(output)
            # logger.info("Decompression Time : %s" % (time.time() - stime))
            # print("Rank: %s Tensor Decompressed: %s" % (rank(), output))
            if self._profiling:
                utils.force_insert_item(self._allreduce_timers, name, time.time() - stime)
            stime = time.time()
            if self._norm_clip is not None:
                norm_clip = np.sqrt(1.0 / size()) * self._norm_clip
                norm_type = 2.0
                param_norm = output.norm(norm_type)
                total_norm = param_norm.item()
                clip_coef = norm_clip / (total_norm + 1e-6)
                if clip_coef < 1:
                    output.mul_(clip_coef)
            p.set_(output)
            if self._profiling:
                utils.force_insert_item(self._update_times, name, time.time() - stime)
    if len(self._groups) != len(self._sequential_keys):
        for merged_p, value in self._handles.items():
            new_name = self._merged_parameter_names.get(merged_p)
            tensors = self._pull_from_buffer(new_name, merged_p)
            for n in tensors:
                p = self._named_parameters.get(n)
                p.grad.set_(tensors[n].data.type(p.grad.type()))
    self.train_iter += 1
    self._handles.clear()
    self._print_profiling()