def aggregate(conf, master_model, fedavg_model, client_models, flatten_local_models):
    # perform the server momentum (either heavy-ball momentum or nesterov momentum).
    fl_aggregate = conf.fl_aggregate
    assert "server_momentum_factor" in fl_aggregate

    # start the server momentum acceleration.
    current_model_tb = TensorBuffer(list(fedavg_model.parameters()))
    previous_model_tb = TensorBuffer(list(master_model.parameters()))

    # get the update direction.
    update = previous_model_tb.buffer - current_model_tb.buffer

    # using server momentum for the update.
    if not hasattr(conf, "server_momentum_buffer"):
        conf.server_momentum_buffer = torch.zeros_like(update)
    conf.server_momentum_buffer.mul_(fl_aggregate["server_momentum_factor"]).add_(update)
    previous_model_tb.buffer.add_(-conf.server_momentum_buffer)

    # update the master_model (but will use the bn stats from the fedavg_model).
    master_model = fedavg_model
    _model_param = list(master_model.parameters())
    previous_model_tb.unpack(_model_param)

    # free the memory.
    torch.cuda.empty_cache()

    # a temp hack (only for debug reason).
    client_models = dict(
        (used_client_arch, master_model.cpu())
        for used_client_arch in conf.used_client_archs
    )
    return client_models
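# NOTE: a minimal sketch of the TensorBuffer helper assumed throughout this file
# (my own illustration; the project's actual class may differ). It flattens a list
# of tensors into one contiguous buffer that is cheap to send or all_reduce, and
# can unpack the buffer back into tensors of the original shapes.
import torch


class TensorBuffer:
    def __init__(self, tensors):
        # remember each tensor's shape and its slice inside the flat buffer.
        self._shapes = [t.shape for t in tensors]
        sizes = [t.nelement() for t in tensors]
        self._offsets = [0]
        for size in sizes:
            self._offsets.append(self._offsets[-1] + size)
        self.buffer = torch.cat([t.detach().reshape(-1) for t in tensors])

    def __len__(self):
        return len(self._shapes)

    def __getitem__(self, i):
        # a view into the flat buffer, reshaped to the original tensor shape.
        return self.buffer[self._offsets[i]:self._offsets[i + 1]].view(self._shapes[i])

    def __iter__(self):
        for i in range(len(self)):
            yield self[i]

    def unpack(self, tensors):
        # copy the (possibly updated) buffer back into the given tensors.
        for tensor, entry in zip(tensors, self):
            tensor.data.copy_(entry)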
def _receive_models_from_selected_clients(self, selected_client_ids):
    self.conf.logger.log(f"Master waits to receive the local models.")
    dist.barrier()

    # init the placeholders to recv the local models from workers.
    flatten_local_models = dict()
    for selected_client_id in selected_client_ids:
        arch = self.clientid2arch[selected_client_id]
        client_tb = TensorBuffer(
            list(self.client_models[arch].state_dict().values())
        )
        client_tb.buffer = torch.zeros_like(client_tb.buffer)
        flatten_local_models[selected_client_id] = client_tb

    # async to receive model from clients.
    reqs = []
    for client_id, world_id in zip(selected_client_ids, self.world_ids):
        req = dist.irecv(
            tensor=flatten_local_models[client_id].buffer, src=world_id
        )
        reqs.append(req)

    for req in reqs:
        req.wait()

    dist.barrier()
    self.conf.logger.log(f"Master received all local models.")
    return flatten_local_models
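# NOTE: a self-contained, hedged example (not part of the repo) of the flatten /
# send / recv-into-placeholder / unpack pattern used by the master and worker
# functions in this file. It assumes the TensorBuffer sketch shown earlier and a
# 2-process "gloo" group; addresses and ports are arbitrary placeholders.
import os
import torch
import torch.distributed as dist
import torch.multiprocessing as mp


def run(rank, world_size):
    os.environ["MASTER_ADDR"] = "127.0.0.1"
    os.environ["MASTER_PORT"] = "29500"
    dist.init_process_group("gloo", rank=rank, world_size=world_size)

    model = torch.nn.Linear(4, 2)
    if rank == 0:
        # master: flatten the state_dict and send it as a single buffer.
        flatten_model = TensorBuffer(list(model.state_dict().values()))
        dist.send(tensor=flatten_model.buffer, dst=1)
    else:
        # worker: allocate a zero placeholder with the same layout, receive, unpack.
        placeholder = TensorBuffer(list(model.state_dict().values()))
        placeholder.buffer = torch.zeros_like(placeholder.buffer)
        dist.recv(tensor=placeholder.buffer, src=0)
        placeholder.unpack(list(model.state_dict().values()))

    dist.destroy_process_group()


if __name__ == "__main__":
    mp.spawn(run, args=(2,), nprocs=2)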
def step(self, closure=None, **kargs):
    # Apply the gradients with the weight decay and momentum.
    with kargs["timer"]("grad.apply_grad", epoch=self.conf.epoch_):
        utils.apply_gradient(
            self.param_groups, self.state, apply_grad_to_model=True
        )

    with kargs["timer"]("grad.get_params", epoch=self.conf.epoch_):
        params, _ = comm.get_data(
            self.param_groups, self.param_names, is_get_grad=False
        )
        params_tb = TensorBuffer(params)

    with kargs["timer"]("grad.error_compensate", epoch=self.conf.epoch_):
        self.memory.buffer += params_tb.buffer

    with kargs["timer"]("grad.compress", epoch=self.conf.epoch_):
        sync_buffer = {"original_shapes": self.shapes, "params_tb": self.memory}
        local_compressed_params_tb = self.compressor.compress(sync_buffer)

    with kargs["timer"]("grad.update_memory", epoch=self.conf.epoch_):
        self.memory.buffer = self.memory.buffer - local_compressed_params_tb.buffer

    with kargs["timer"]("grad.sync", epoch=self.conf.epoch_):
        self.compressor.sync(sync_buffer)

    # update local model.
    with kargs["timer"]("grad.decompress", epoch=self.conf.epoch_):
        aggregated_info_tb = self.compressor.uncompress(
            sync_buffer, self.neighbors_info
        )
        params_tb.buffer += aggregated_info_tb.buffer
        params_tb.unpack(params)

    return sync_buffer["n_bits"]
def step(self, closure=None, **kargs):
    with kargs["timer"]("sync", epoch=self.conf.epoch_):
        # do the local update steps.
        with kargs["timer"]("local_update", epoch=self.conf.epoch_):
            utils.apply_gradient(
                self.param_groups, self.state, apply_grad_to_model=True
            )

        # enter the global sync if it satisfies the condition.
        if (
            self.conf.epoch_ < self.turn_on_local_step_from_epoch
            or self.conf.local_index % self.local_step == 0
        ):
            with kargs["timer"]("get_params", epoch=self.conf.epoch_):
                # get params.
                params, _ = comm.get_data(
                    self.param_groups, self.param_names, is_get_grad=False
                )
                params_tb = TensorBuffer(params)

            with kargs["timer"]("memory_and_compress", epoch=self.conf.epoch_):
                # get the params difference w.r.t. previous synced model.
                local_scale, local_sign = [], []
                for consensus_param, param, memory in zip(
                    self.consensus_params_tb, params_tb, self.memory_tb
                ):
                    memory.data.copy_(consensus_param - param + memory)

            # compress.
            with kargs["timer"]("directions", epoch=self.conf.epoch_):
                direction = exchange(self.memory_tb.buffer)  # signum

            with kargs["timer"]("memory_and_compress", epoch=self.conf.epoch_):
                for consensus_param, param, memory in zip(
                    self.consensus_params_tb, params_tb, self.memory_tb
                ):
                    _local_scale, _local_sign = scaled_sign(memory)
                    local_scale.append(_local_scale)
                    local_sign.append(_local_sign)
                    memory.data.copy_(memory - _local_scale * _local_sign)

            with kargs["timer"]("directions", epoch=self.conf.epoch_):
                global_direction = TB(self.memory_tb, direction)

            with kargs["timer"]("magnitudes", epoch=self.conf.epoch_):
                magnitudes_tb = TensorBuffer(local_scale)
                magnitudes_tb.buffer = self.world_aggregator._agg(
                    magnitudes_tb.buffer, "avg", distributed=self.conf.distributed
                )

            # unpack the synced info and update the consensus params.
            with kargs["timer"]("update_consensus", epoch=self.conf.epoch_):
                for update_magnitude, update_direction, consensus_param in zip(
                    magnitudes_tb, global_direction, self.consensus_params_tb
                ):
                    consensus_param.add_(
                        update_direction.mul(update_magnitude), alpha=-1.0
                    )

            # make the local models consistent by assigning the consensus params.
            self.consensus_params_tb.unpack(params)
            n_bits = get_n_bits(magnitudes_tb.buffer)
        else:
            n_bits = 0
    return n_bits
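# NOTE: a hedged guess at the `scaled_sign` helper used above and in the
# benchmark functions below (the name comes from the code; this body is my
# assumption). It splits a tensor into a scalar magnitude (the mean absolute
# value) and its element-wise sign, as in EF-signSGD-style compressors.
import torch


def scaled_sign(x):
    scale = x.norm(p=1) / x.nelement()
    sign = torch.sign(x)
    return scale, sign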
def compress(self, sync_buffer):
    # get the sign/magnitude for the tensor (to be transmitted).
    selected_values, selected_indices = [], []

    for half_param, hat_param in zip(
        sync_buffer["flatten_half_params"], sync_buffer["flatten_params"]
    ):
        _selected_values, _selected_indices = self.compressor_fn.compress(
            half_param - hat_param,
            self.comm_op,
            self.compress_ratio,
            self.is_biased,
        )
        selected_values.append(_selected_values)
        selected_indices.append(_selected_indices)

    # get selected shapes.
    selected_shapes = [len(_value) for _value in selected_values]

    # flatten selected values/indices.
    flatten_selected_values = TensorBuffer(selected_values)
    flatten_selected_indices = TensorBuffer(selected_indices)

    # get n_bits to transmit.
    n_bits = get_n_bits(flatten_selected_values.buffer) + get_n_bits(
        flatten_selected_indices.buffer
    )

    # update shared dict.
    sync_buffer["selected_shapes"] = selected_shapes
    sync_buffer["flatten_selected_values"] = flatten_selected_values
    sync_buffer["flatten_selected_indices"] = flatten_selected_indices
    sync_buffer["n_bits"] = n_bits
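# NOTE: a hedged sketch (name and behaviour assumed, not the repo's actual
# compressor_fn) of a top_k sparsifier matching the (values, indices) interface
# used by the sparsification-based compress functions above and below: it keeps
# the k largest-magnitude entries of the flattened tensor, where k is derived
# from compress_ratio.
import torch


def sparsify_top_k(tensor, compress_ratio=0.01):
    flat = tensor.reshape(-1)
    k = max(1, int(flat.nelement() * compress_ratio))
    _, indices = torch.topk(flat.abs(), k)
    return flat[indices], indices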
def step(self, closure=None, **kargs): # Apply the gradients with the weight decay and momentum. with kargs["timer"]("grad.apply_grad", epoch=self.conf.epoch_): utils.apply_gradient( self.param_groups, self.state, apply_grad_to_model=False ) # get flattened params. with kargs["timer"]("grad.get_params", epoch=self.conf.epoch_): params, _ = comm.get_data( self.param_groups, self.param_names, is_get_grad=False ) flatten_params = TensorBuffer(params) grads, _ = comm.get_data( self.param_groups, self.param_names, is_get_grad=True ) flatten_grads = TensorBuffer(grads) with kargs["timer"]("grad.get_extrapolated_model", epoch=self.conf.epoch_): flatten_updated_params = deepcopy(flatten_params) # get weighted hat params. flatten_updated_params.buffer = sum( [ _hat_params.buffer * self.neighbors_info[_rank] for _rank, _hat_params in self.neighbor_hat_params.items() ] ) # get updated local model (flatten params). with kargs["timer"]("grad.unflatten_to_update", epoch=self.conf.epoch_): flatten_updated_params.buffer.add_( flatten_grads.buffer, alpha=-self.param_groups[0]["lr"] ) flatten_updated_params.unpack(params) # get extrapolated model. flatten_updated_params.buffer = ( (1 - 0.5 * self.conf.local_index) * flatten_params.buffer + 0.5 * self.conf.local_index * flatten_updated_params.buffer ) # compress the model difference and sync. with kargs["timer"]("grad.compress", epoch=self.conf.epoch_): sync_buffer = { "original_shapes": self.shapes, "flatten_updated_params": flatten_updated_params, } self.compressor.compress(sync_buffer) with kargs["timer"]("grad.sync", epoch=self.conf.epoch_): self.compressor.sync(sync_buffer) with kargs["timer"]("grad.unflatten_to_update", epoch=self.conf.epoch_): self.compressor.uncompress( sync_buffer, self.neighbor_hat_params, self.conf.local_index ) return sync_buffer["n_bits"]
def step(self, closure=None, **kargs):
    if self.conf.is_centralized:
        with kargs["timer"]("sync/get_data", epoch=self.conf.epoch_):
            # Get data.
            grads, _ = comm.get_data(
                self.param_groups, self.param_names, is_get_grad=True
            )
            flatten_grads = TensorBuffer(grads)

        with kargs["timer"]("sync/sync", epoch=self.conf.epoch_):
            # Aggregate the gradients.
            flatten_grads.buffer = self.world_aggregator._agg(
                flatten_grads.buffer, op="avg", distributed=self.conf.distributed
            )

        with kargs["timer"]("sync/unflatten_grad", epoch=self.conf.epoch_):
            # unflatten grads.
            flatten_grads.unpack(grads)

        with kargs["timer"]("sync/apply_grad", epoch=self.conf.epoch_):
            utils.apply_gradient(
                self.param_groups, self.state, apply_grad_to_model=True
            )

        # Get n_bits to transmit.
        n_bits = get_n_bits(flatten_grads.buffer)
    else:
        with kargs["timer"]("sync/apply_grad", epoch=self.conf.epoch_):
            utils.apply_gradient(
                self.param_groups, self.state, apply_grad_to_model=True
            )

        with kargs["timer"]("sync/get_data", epoch=self.conf.epoch_):
            # first get and flatten all params.
            params, _ = comm.get_data(
                self.param_groups, self.param_names, is_get_grad=False
            )
            flatten_params = TensorBuffer(params)

        with kargs["timer"]("sync/sync", epoch=self.conf.epoch_):
            # prepare the sync.
            if self.conf.comm_device == "cpu":
                flatten_params.buffer.cpu().detach_()

            # then sync.
            flatten_params.buffer = self.decentralized_aggregator._agg(
                flatten_params.buffer, op="weighted"
            )

        with kargs["timer"]("sync/update_model", epoch=self.conf.epoch_):
            # finally unflatten.
            flatten_params.unpack(params)

        # Get n_bits to transmit.
        n_bits = get_n_bits(flatten_params.buffer)
    return n_bits
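# NOTE: a hedged sketch (my assumption, not the repo's aggregator) of what the
# decentralized "weighted" aggregation in the else-branch above could look like:
# every node exchanges its flat buffer with its neighbors and forms the
# mixing-weighted sum, where `neighbors_info` maps neighbor rank -> mixing weight
# and is expected to contain the node's own rank as well.
import torch
import torch.distributed as dist


def gossip_weighted_avg(buffer, neighbors_info, rank):
    recv_buffers = {
        r: torch.zeros_like(buffer) for r in neighbors_info if r != rank
    }
    send_reqs = [dist.isend(tensor=buffer, dst=r) for r in recv_buffers]
    recv_reqs = [dist.irecv(tensor=recv_buffers[r], src=r) for r in recv_buffers]
    for req in send_reqs + recv_reqs:
        req.wait()

    # weighted sum over the closed neighborhood (self included).
    mixed = neighbors_info[rank] * buffer
    for r, neighbor_buffer in recv_buffers.items():
        mixed += neighbors_info[r] * neighbor_buffer
    return mixed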
def init_neighbor_hat_params(self):
    params, self.shapes = comm.get_data(
        self.param_groups, self.param_names, is_get_grad=False
    )
    flatten_params = TensorBuffer(params)
    flatten_params.buffer = torch.zeros_like(flatten_params.buffer)

    # init the neighbor_params.
    self.neighbor_hat_params = {
        self.rank: deepcopy(flatten_params),
        "memory": deepcopy(flatten_params),
    }
def _init_neighbor_hat_params(conf, param_groups, param_names):
    params, params_shapes = comm.get_data(
        param_groups, param_names, is_get_grad=False
    )
    flatten_params = TensorBuffer(params)
    flatten_params.buffer = torch.zeros_like(flatten_params.buffer)

    # init the neighbor_params.
    return (
        {
            conf.graph.rank: deepcopy(flatten_params),
            "memory": deepcopy(flatten_params),
        },
        params_shapes,
    )
def compress(self, grads_tb):
    # get the sign/magnitude for the tensor (to be transmitted).
    sync_buffer = dict()

    # flatten selected values/indices.
    grad_norms_tb = TensorBuffer([grad.norm(p=1) for grad in grads_tb])
    signs, sign_size = self.compressor_fn.compress(grads_tb.buffer)

    # get compressed grad.
    synced_grads_tb = copy.deepcopy(grads_tb)
    for synced_grad, grad_norm, grad in zip(
        synced_grads_tb, grad_norms_tb, grads_tb
    ):
        synced_grad.data.copy_(grad_norm * torch.sign(grad) / grad.nelement())

    # get n_bits to transmit.
    n_bits = get_n_bits(grad_norms_tb.buffer) + get_n_bits(signs)

    # update shared dict.
    sync_buffer["grad_norms_tb"] = grad_norms_tb
    sync_buffer["grads_tb"] = grads_tb
    sync_buffer["synced_grads_tb"] = synced_grads_tb
    sync_buffer["signs"] = signs
    sync_buffer["sign_size"] = sign_size
    sync_buffer["n_bits"] = n_bits
    return sync_buffer
def compress(self, sync_buffer):
    # flatten selected values/indices.
    param_norms_tb = TensorBuffer(
        [param.norm(p=1) for param in sync_buffer["params_tb"]]
    )
    signs, sign_size = self.compressor_fn.compress(sync_buffer["params_tb"].buffer)

    # get compressed model.
    local_compressed_params_tb = deepcopy(sync_buffer["params_tb"])
    for local_compressed_param, param_norm, param in zip(
        local_compressed_params_tb, param_norms_tb, sync_buffer["params_tb"]
    ):
        local_compressed_param.data.copy_(
            param_norm * torch.sign(param) / param.nelement()
        )

    # get n_bits to transmit.
    n_bits = get_n_bits(param_norms_tb.buffer) + get_n_bits(signs)

    # update shared dict.
    sync_buffer["param_norms_tb"] = param_norms_tb
    sync_buffer["signs"] = signs
    sync_buffer["sign_size"] = sign_size
    sync_buffer["n_bits"] = n_bits
    return local_compressed_params_tb
def compress(self, sync_buffer):
    # get the sign/magnitude for the tensor (to be transmitted).
    quantized_values = []

    # compress and get compressed model.
    local_compressed_params_tb = deepcopy(sync_buffer["params_tb"])
    local_compressed_params_tb.buffer = torch.zeros_like(
        local_compressed_params_tb.buffer
    )
    for param, local_compressed_param in zip(
        sync_buffer["params_tb"], local_compressed_params_tb
    ):
        # quantize.
        _quantized_values = self.compressor_fn.compress(
            param, self.comm_op, self.quantize_level, self.is_biased
        )
        quantized_values.append(_quantized_values)

        # update the local compressed params.
        local_compressed_param.data.copy_(_quantized_values)

    # flatten selected values/indices.
    flatten_updates = TensorBuffer(quantized_values)

    # get n_bits to transmit.
    n_bits = get_n_bits(flatten_updates.buffer) * self.quantize_level / 32

    # update shared dict.
    sync_buffer["flatten_updates"] = flatten_updates
    sync_buffer["n_bits"] = n_bits
    return local_compressed_params_tb
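# NOTE: a hedged sketch (my assumption, not the repo's compressor_fn) of a
# QSGD-style quantizer matching the `quantize_level` / `is_biased` interface used
# by the quantization-based compress functions in this file: each value is mapped
# onto one of 2**quantize_level - 1 uniform levels of the tensor's max magnitude,
# with stochastic rounding when is_biased is False so the quantizer is unbiased
# in expectation.
import torch


def quantize(tensor, quantize_level=8, is_biased=False):
    s = 2 ** quantize_level - 1
    norm = tensor.abs().max()
    if norm == 0:
        return torch.zeros_like(tensor)
    scaled = tensor.abs() / norm * s
    lower = scaled.floor()
    if is_biased:
        # deterministic round-to-nearest.
        levels = scaled.round()
    else:
        # stochastic rounding: round up with probability equal to the fraction.
        levels = lower + (torch.rand_like(scaled) < (scaled - lower)).float()
    return tensor.sign() * levels / s * norm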
def decompress(self, sync_buffer):
    # decompress and update.
    for rank in range(self.world_size):
        if rank == self.rank:
            continue

        # get grad_norm and build its tensorbuffer.
        _grad_norms = comm.recover_device(
            sync_buffer["synced_grad_norms"][rank],
            device=sync_buffer["synced_grads_tb"].buffer.device,
        )
        grad_norms_tb = TensorBuffer(_grad_norms)

        # get signs and build its tensorbuffer.
        signs = comm.recover_device(
            sync_buffer["synced_signs"][rank],
            device=sync_buffer["synced_grads_tb"].buffer.device,
        )
        _signs = self.compressor_fn.uncompress(signs, sync_buffer["sign_size"])
        signs_tb = copy.deepcopy(sync_buffer["synced_grads_tb"])
        signs_tb.buffer = _signs

        # update grads.
        for grad_norm, sign, synced_grad in zip(
            grad_norms_tb, signs_tb, sync_buffer["synced_grads_tb"]
        ):
            _update = grad_norm * sign / synced_grad.nelement()
            synced_grad.add_(_update)

    # average grad.
    sync_buffer["synced_grads_tb"].buffer /= self.world_size * 1.0
    return sync_buffer["synced_grads_tb"]
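# NOTE: a hedged sketch of what `self.compressor_fn.compress` / `.uncompress`
# could look like for the sign-based compress/decompress pair above (function
# names and packing format are my assumptions, not the project's implementation):
# the signs of a flat tensor are packed into bits, and uncompress recovers a
# +/-1 tensor of the original size.
import torch

_BIT_WEIGHTS = torch.tensor([1, 2, 4, 8, 16, 32, 64, 128], dtype=torch.uint8)


def compress_signs(flat_tensor):
    # 1 where the entry is >= 0, 0 otherwise.
    bits = (flat_tensor >= 0).to(torch.uint8)
    sign_size = bits.nelement()
    # pad to a multiple of 8 and pack 8 bits per byte.
    padded = torch.cat([bits, bits.new_zeros((-sign_size) % 8)]).view(-1, 8)
    packed = (padded * _BIT_WEIGHTS).sum(dim=1).to(torch.uint8)
    return packed, sign_size


def uncompress_signs(packed, sign_size):
    bits = ((packed.unsqueeze(1) & _BIT_WEIGHTS) > 0).view(-1)[:sign_size]
    # map {0, 1} back to {-1, +1}.
    return bits.float() * 2 - 1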
def uncompress(self, sync_buffer, neighbors_info):
    aggregated_info_tb = deepcopy(sync_buffer["params_tb"])
    aggregated_info_tb.buffer = torch.zeros_like(aggregated_info_tb.buffer)

    # uncompress and update.
    for rank in neighbors_info.keys():
        param_norms = sync_buffer["synced_param_norms"][rank]
        signs = sync_buffer["synced_signs"][rank]

        # recover the message and the corresponding device.
        param_norms = comm.recover_device(
            param_norms, device=sync_buffer["params_tb"].buffer.device
        )
        signs = self.compressor_fn.uncompress(
            comm.recover_device(
                signs, device=sync_buffer["params_tb"].buffer.device
            ),
            sync_buffer["sign_size"],
        )

        # build the corresponding tensorbuffer.
        param_norms_tb = TensorBuffer(param_norms)
        signs_tb = deepcopy(sync_buffer["params_tb"])
        signs_tb.buffer = signs

        # accumulate information for the neighborhood.
        for _info, _param_norm, _sign in zip(
            aggregated_info_tb, param_norms_tb, signs_tb
        ):
            _info.add_(
                self.consensus_stepsize
                * (neighbors_info[rank] - (1 if rank == self.rank else 0))
                * (_param_norm / _sign.nelement() * _sign)
            )
    return aggregated_info_tb
def step(self, closure=None, **kargs): with kargs["timer"]("sync.local_update", epoch=self.conf.epoch_): utils.apply_gradient(self.param_groups, self.state, apply_grad_to_model=True) with kargs["timer"]("sync.sync_and_update", epoch=self.conf.epoch_): # enter the global sync if it satisfies the condition. if (self.conf.epoch_ < self.turn_on_local_step_from_epoch or self.conf.local_index % self.local_step == 0): # get parmas. params, _ = comm.get_data(self.param_groups, self.param_names, is_get_grad=False) params_tb = TensorBuffer(params) # get params_diff. param_diff = self.consensus_params_tb.buffer - params_tb.buffer # sync the directions. param_diff = self.world_aggregator._agg( param_diff, "avg", distributed=self.conf.distributed) # unpack the synced info and update the consensus params. self.consensus_params_tb.buffer.add_(-1.0, param_diff) # consistent the local models by assigning the consensus params. self.consensus_params_tb.unpack(params) # Get n_bits to transmit. n_bits = get_n_bits(param_diff) else: n_bits = 0 return n_bits
def _listen_to_master(self):
    # listen to master, related to the function `_activate_selected_clients` in `master.py`.
    msg = torch.zeros((3, self.conf.n_participated))
    dist.broadcast(tensor=msg, src=0)
    self.conf.graph.client_id, self.conf.graph.comm_round, self.n_local_epochs = (
        msg[:, self.conf.graph.rank - 1].to(int).cpu().numpy().tolist()
    )

    # once we receive the signal, we init for the local training.
    self.arch, self.model = create_model.define_model(
        self.conf, to_consistent_model=False, client_id=self.conf.graph.client_id
    )
    self.model_state_dict = self.model.state_dict()
    self.model_tb = TensorBuffer(list(self.model_state_dict.values()))
    self.metrics = create_metrics.Metrics(self.model, task="classification")
    dist.barrier()
def recover_params(
    param_groups, param_names, rank=None, neighbor_hat_params=None, get_hat_params=True
):
    # get flattened params.
    params, _ = comm.get_data(param_groups, param_names, is_get_grad=False)
    flatten_params = TensorBuffer(params)

    if get_hat_params:
        assert neighbor_hat_params is not None and rank is not None
        # recover the hat_params.
        flatten_hat_params = TensorBuffer(params)
        flatten_hat_params.buffer.data[:] = neighbor_hat_params[rank].buffer
        return params, flatten_params, flatten_hat_params
    else:
        return params, flatten_params
def step(self, closure=None, **kargs): with kargs["timer"]("sync.apply_grad", epoch=self.conf.epoch_): utils.apply_gradient(self.param_groups, self.state, apply_grad_to_model=False) with kargs["timer"]("sync.get_data", epoch=self.conf.epoch_): # Get data. grads, _ = comm.get_data(self.param_groups, self.param_names, is_get_grad=True) grads_tb = TensorBuffer(grads) with kargs["timer"]("sync.use_memory", epoch=self.conf.epoch_): # use memory. grads_tb.buffer.add_(self.memory_tb.buffer) with kargs["timer"]("sync.compress", epoch=self.conf.epoch_): # compress. sync_buffer = self.compressor.compress(grads_tb) with kargs["timer"]("sync.sync", epoch=self.conf.epoch_): self.compressor.sync(sync_buffer) with kargs["timer"]("sync.update_memory", epoch=self.conf.epoch_): # update memory. self.memory_tb.buffer = (grads_tb.buffer - sync_buffer["synced_grads_tb"].buffer) with kargs["timer"]("sync.decompress", epoch=self.conf.epoch_): sync_grads_tb = self.compressor.decompress(sync_buffer) with kargs["timer"]("sync.apply_grad", epoch=self.conf.epoch_): # appply the gradient but only with the gradient. params, _ = comm.get_data(self.param_groups, self.param_names, is_get_grad=False) params_tb = TensorBuffer(params) # apply the gradient. params_tb.buffer.add_(-self.param_groups[0]["lr"] * sync_grads_tb.buffer) # unpack. params_tb.unpack(params) return sync_buffer["n_bits"]
def _send_model_to_master(self):
    dist.barrier()
    self.conf.logger.log(
        f"Worker-{self.conf.graph.worker_id} (client-{self.conf.graph.client_id}) sending the model ({self.arch}) back to Master."
    )
    flatten_model = TensorBuffer(list(self.model.state_dict().values()))
    dist.send(tensor=flatten_model.buffer, dst=0)
    dist.barrier()
def init_neighbor_hat_params(self):
    params, self.shapes = comm.get_data(
        self.param_groups, self.param_names, is_get_grad=False
    )
    flatten_params = TensorBuffer(params)

    # init the neighbor_params.
    self.neighbor_hat_params = dict()
    for rank, _ in self.neighbors_info.items():
        self.neighbor_hat_params[rank] = deepcopy(flatten_params)
def compress(self, sync_buffer):
    # get the sign/magnitude for the tensor (to be transmitted).
    selected_values, selected_indices = [], []

    # compress and get compressed model.
    local_compressed_params_tb = deepcopy(sync_buffer["params_tb"])
    local_compressed_params_tb.buffer = torch.zeros_like(
        local_compressed_params_tb.buffer
    )
    for param, local_compressed_param in zip(
        sync_buffer["params_tb"], local_compressed_params_tb
    ):
        _selected_values, _selected_indices = self.compressor_fn.compress(
            param, self.comm_op, self.compress_ratio, self.is_biased
        )
        selected_values.append(_selected_values)
        selected_indices.append(_selected_indices)

        # update the local compressed params.
        local_compressed_param.data = local_compressed_param.data.view(-1)
        local_compressed_param.data[_selected_indices] = _selected_values
        local_compressed_param.data.view(*param.size())

    # get selected shapes.
    selected_shapes = [len(_value) for _value in selected_values]

    # flatten selected values/indices.
    flatten_selected_values = TensorBuffer(selected_values)
    flatten_selected_indices = TensorBuffer(selected_indices)

    # get n_bits to transmit.
    n_bits = get_n_bits(flatten_selected_values.buffer) + get_n_bits(
        flatten_selected_indices.buffer
    )

    # update shared dict.
    sync_buffer["selected_shapes"] = selected_shapes
    sync_buffer["flatten_selected_values"] = flatten_selected_values
    sync_buffer["flatten_selected_indices"] = flatten_selected_indices
    sync_buffer["n_bits"] = n_bits
    return local_compressed_params_tb
def benchmark1(tensors):
    timer = CUDATimer('baseline')
    local_scale, local_sign = [], []

    with timer('compression'):
        for tensor in tensors:
            _local_scale, _local_sign = scaled_sign(tensor)
            local_scale.append(_local_scale)
            local_sign.append(_local_sign)

    with timer('flattening'):
        magnitudes_tb = TensorBuffer(local_scale)
        directions_tb = TensorBuffer(local_sign)

    with timer('com'):
        dist.all_reduce(magnitudes_tb.buffer, op=dist.ReduceOp.SUM)
        magnitudes_tb.buffer /= 2
        dist.all_reduce(directions_tb.buffer, op=dist.ReduceOp.SUM)
        directions_tb.buffer /= 2

    timer.upload_raw(
        'microbenchmarking',
        {
            'microbenchmark': 'sign_sgd_com',
            'input': list(map(lambda t: t.size(), tensors)),
        },
    )
def aggregate(conf, master_model, fedavg_model, client_models, flatten_local_models):
    # perform the server Adam update.
    # Following the setup in the paper, we use a momentum of 0.9,
    # a numerical stability constant epsilon of 0.01,
    # and beta_2 set to 0.99.
    # The suggested server_lr in the original paper is 0.1.
    fl_aggregate = conf.fl_aggregate
    assert "server_lr" in fl_aggregate
    beta_2 = fl_aggregate["beta_2"] if "beta_2" in fl_aggregate else 0.99

    # start the server momentum acceleration.
    current_model_tb = TensorBuffer(list(fedavg_model.parameters()))
    previous_model_tb = TensorBuffer(list(master_model.parameters()))

    # get the update direction.
    update = previous_model_tb.buffer - current_model_tb.buffer

    # use the second-moment server buffer for the update.
    if not hasattr(conf, "second_server_momentum_buffer"):
        conf.second_server_momentum_buffer = torch.zeros_like(update)
    conf.second_server_momentum_buffer.mul_(beta_2).add_((1 - beta_2) * (update ** 2))
    previous_model_tb.buffer.add_(
        -fl_aggregate["server_lr"]
        * update
        / (torch.sqrt(conf.second_server_momentum_buffer) + 0.01)
    )

    # update the master_model (but will use the bn stats from the fedavg_model).
    master_model = fedavg_model
    _model_param = list(master_model.parameters())
    previous_model_tb.unpack(_model_param)

    # free the memory.
    torch.cuda.empty_cache()

    # a temp hack (only for debug reason).
    client_models = dict(
        (used_client_arch, master_model.cpu())
        for used_client_arch in conf.used_client_archs
    )
    return client_models
def compress(self, sync_buffer):
    # get the sign/magnitude for the tensor (to be transmitted).
    norms, updates = [], []
    for flatten_updated_param in sync_buffer["flatten_updated_params"]:
        _update = flatten_updated_param
        updates += [_update]
        norms += [_update.norm(p=1)]

    # flatten selected values/indices.
    flatten_norms = TensorBuffer(norms)
    flatten_updates = TensorBuffer(updates)
    signs, sign_size = self.compressor_fn.compress(flatten_updates.buffer)

    # get n_bits to transmit.
    n_bits = get_n_bits(flatten_norms.buffer) + get_n_bits(signs)

    # update shared dict.
    sync_buffer["flatten_norms"] = flatten_norms
    sync_buffer["flatten_updates"] = flatten_updates
    sync_buffer["signs"] = signs
    sync_buffer["sign_size"] = sign_size
    sync_buffer["n_bits"] = n_bits
def _send_model_to_selected_clients(self, selected_client_ids):
    # the master_model can be large; the client_models can be small and different.
    self.conf.logger.log(f"Master sends the models to workers.")
    for worker_rank, selected_client_id in enumerate(selected_client_ids, start=1):
        arch = self.clientid2arch[selected_client_id]
        client_model_state_dict = self.client_models[arch].state_dict()
        flatten_model = TensorBuffer(list(client_model_state_dict.values()))
        dist.send(tensor=flatten_model.buffer, dst=worker_rank)
        self.conf.logger.log(
            f"\tMaster sent the current model={arch} to process_id={worker_rank}."
        )
    dist.barrier()
def step(self, closure=None, **kargs): # do the local update steps. with kargs["timer"]("sync/get_data", epoch=self.conf.epoch_): # get parmas. params, _ = comm.get_data(self.param_groups, self.param_names, is_get_grad=False) params_tb = TensorBuffer(params) with kargs["timer"]("sync/apply_grad", epoch=self.conf.epoch_): # prepare the gradient (sign) utils.apply_gradient(self.param_groups, self.state, apply_grad_to_model=False) # get grads. grads, _ = comm.get_data(self.param_groups, self.param_names, is_get_grad=True) grads_tb = TensorBuffer(grads) # enter the global sync if it satisfies the condition. # get the params difference w.r.t. previous synced model. with kargs["timer"]("sync/compress", epoch=self.conf.epoch_): sync_buffer = self.compressor.compress(grads_tb) # sync and decompress. with kargs["timer"]("sync/sync_and_decompress", epoch=self.conf.epoch_): self.compressor.sync(sync_buffer) synced_updates_tb = self.compressor.decompress(sync_buffer) # unpack the synced info and update the consensus params. with kargs["timer"]("sync/apply_grad", epoch=self.conf.epoch_): params_tb.buffer -= self.param_groups[0][ "lr"] * synced_updates_tb.buffer params_tb.unpack(params) return sync_buffer["n_bits"]
def benchmark2(tensors):
    timer = CUDATimer('centralized_allreduce')
    local_compressed, local_scale = [], []

    with timer('compression'):
        for tensor in tensors:
            local_compressed.append(tensor.clone())
        for tensor in tensors:
            _local_scale, _local_sign = scaled_sign(tensor)
            # store local scales and local sign.
            local_scale.append(_local_scale)

    with timer('flattening'):
        magnitudes_tb = TensorBuffer(local_scale)
        # directions_tb = TensorBuffer(local_sign)
        compressed_tb = TensorBuffer(local_compressed)

    with timer('com'):
        centralized_allreduce(compressed_tb.buffer, timer)
        # print('diff after', compressed_tb.buffer - directions_tb.buffer)
        dist.all_reduce(magnitudes_tb.buffer, op=dist.ReduceOp.SUM)
        magnitudes_tb.buffer /= 2

    timer.upload_raw(
        'microbenchmarking',
        {
            'microbenchmark': 'sign_sgd_com',
            'input': list(map(lambda t: t.size(), tensors)),
        },
    )
def compress(self, sync_buffer):
    # get the sign/magnitude for the tensor (to be transmitted).
    quantized_values = []
    for flatten_updated_param in sync_buffer["flatten_updated_params"]:
        _quantized_values = self.compressor_fn.compress(
            flatten_updated_param, self.comm_op, self.quantize_level, self.is_biased
        )
        quantized_values.append(_quantized_values)

    # flatten selected values/indices.
    flatten_updates = TensorBuffer(quantized_values)

    # get n_bits to transmit.
    n_bits = get_n_bits(flatten_updates.buffer) * self.quantize_level / 32

    # update shared dict.
    sync_buffer["flatten_updates"] = flatten_updates
    sync_buffer["n_bits"] = n_bits
def step(self, closure=None, **kargs): # Apply the gradients with the weight decay and momentum. with kargs["timer"]("grad.apply_grad", epoch=self.conf.epoch_): utils.apply_gradient(self.param_groups, self.state, apply_grad_to_model=False) with kargs["timer"]("grad.get_grads", epoch=self.conf.epoch_): params, _ = comm.get_data(self.param_groups, self.param_names, is_get_grad=False) flatten_params = TensorBuffer(params) grads, _ = comm.get_data(self.param_groups, self.param_names, is_get_grad=True) flatten_grads = TensorBuffer(grads) # Get weighted hat params and apply the local gradient. with kargs["timer"]("grad.apply_local_gradient", epoch=self.conf.epoch_): flatten_half_params = deepcopy(flatten_params) flatten_half_params.buffer = (sum([ _hat_params.buffer * self.neighbors_info[_rank] for _rank, _hat_params in self.neighbor_hat_params.items() ]) - self.param_groups[0]["lr"] * flatten_grads.buffer) # compress the model difference and sync. with kargs["timer"]("grad.compress", epoch=self.conf.epoch_): sync_buffer = { "original_shapes": self.shapes, "flatten_half_params": flatten_half_params, "flatten_params": flatten_params, } self.compressor.compress(sync_buffer) with kargs["timer"]("grad.sync", epoch=self.conf.epoch_): self.compressor.sync(sync_buffer) # finally unflatten and update local model. with kargs["timer"]("grad.unflatten_to_update", epoch=self.conf.epoch_): self.compressor.uncompress(sync_buffer, self.neighbor_hat_params) flatten_params.buffer = self.neighbor_hat_params[ self.rank].buffer.clone() flatten_params.unpack(params) return sync_buffer["n_bits"]
def compress(self, sync_buffer):
    quantized_values = []
    for half_param, hat_param in zip(
        sync_buffer["flatten_params"], sync_buffer["flatten_hat_params"]
    ):
        _quantized_values = self.compressor_fn.compress(
            half_param - hat_param,
            self.comm_op,
            self.quantize_level,
            self.is_biased,
        )
        quantized_values.append(_quantized_values)

    # flatten selected values/indices.
    flatten_updates = TensorBuffer(quantized_values)

    # get n_bits to transmit.
    n_bits = get_n_bits(flatten_updates.buffer) * self.quantize_level / 32

    # update shared dict.
    sync_buffer["flatten_updates"] = flatten_updates
    sync_buffer["n_bits"] = n_bits