def __init__(self, params, lr=required, n_push=required, n_pull=required, model=required):
    """__init__

    :param params: iterable of parameters to optimize
    :param lr: learning rate
    :param n_push: push accumulated gradients to the server every n_push steps
    :param n_pull: request parameters from the server every n_pull steps
    :param model: nn.Module being trained
    """
    if lr is not required and lr < 0.0:
        raise ValueError("Invalid learning rate: {}".format(lr))
    defaults = dict(lr=lr)

    self.accumulated_gradients = torch.zeros(ravel_model_params(model).size())
    self.n_pull = n_pull
    self.n_push = n_push
    self.model = model

    # this sets the initial model parameters
    send_message(MessageCode.ParameterUpdate, ravel_model_params(self.model))
    self.idx = 0

    listener = DownpourListener(self.model)
    listener.start()

    super(DownpourSGD, self).__init__(params, defaults)

def __init__(self, params, lr=required, n_push=0, n_pull=0, model=required):
    """__init__

    :param params: iterable of parameters to optimize
    :param lr: learning rate
    :param n_push: gradient push frequency
    :param n_pull: parameter pull frequency
    :param model: nn.Module being trained
    """
    if lr is not required and lr < 0.0:
        raise ValueError("Invalid learning rate: {}".format(lr))
    print('I am node rank:%d' % dist.get_rank())
    defaults = dict(lr=lr)

    self.accumulated_gradients = torch.zeros(ravel_model_params(model).size())
    self.model = model
    # this would set the initial model parameters (currently disabled)
    # send_message(MessageCode.ParameterUpdate, ravel_model_params(self.model))
    self.idx = 0
    self.version = 0
    self.queue = Queue(maxsize=1)
    self.gradient_warehouse = WorkerGradientWarehouse()

    # background threads: one listens for server messages, one sends queued gradients
    self.listener = GradientListener(self.model, self.queue, self.gradient_warehouse)
    self.listener.start()
    self.sender = GradientMessageSender(self.queue)
    self.sender.start()

    super(GradientSGD, self).__init__(params, defaults)

def __init__(self, model): _LOGGER.info("Creating ParameterServer") print("Creating ParameterServer") self.parameter_shard = torch.rand(ravel_model_params(model).numel()) self.model = model # init superclass super(ParameterServer, self).__init__(model)
def __init__(self, model): """__init__ :param model: nn.Module to be defined by the user """ _LOGGER.info("Setting m_parameter") self.m_parameter = torch.zeros(ravel_model_params(model).numel() + 5) print('Tensor size : %d' % self.m_parameter.numel()) super(GradientMessageListener, self).__init__()
def __init__(self, model): """__init__ :param model: nn.Module to be defined by the user """ self.model = model _LOGGER.info("Setting m_parameter") self.m_parameter = torch.zeros(ravel_model_params(model).numel() + 2) super(MessageListener, self).__init__()
def __init__(self, model, gradient_warehouse, storage_num=10, rank=0):
    # note: storage_num is not used in this constructor
    _LOGGER.info("Creating GradientServer")
    print("Creating GradientServer")
    # the server only keeps a flat copy of the parameters, not the nn.Module itself
    self.model = torch.zeros(ravel_model_params(model).numel())
    self.gradient_warehouse = gradient_warehouse
    self.rank = rank
    super(GradientServer, self).__init__(model)
    self.gradient_warehouse.model = self.model

def step(self, closure=None):
    """Performs a single optimization step.

    Arguments:
        closure (callable, optional): A closure that reevaluates the model
            and returns the loss.
    """
    loss = None
    if closure is not None:
        loss = closure()

    if dist.get_rank() == 1:
        # artificially slow down rank 1 (e.g. to emulate a straggler)
        time.sleep(0.04)

    # propagate the current lr to the listener
    lr = self.param_groups[0]['lr']
    self.listener.lr = lr

    # flatten the current gradients into a single tensor
    gradients = ravel_model_params(self.model, grads=True)

    # bump the local gradient version and busy-wait while this worker is too far
    # ahead; worker_ahead_count is reset by the listener thread
    self.listener.worker_ahead_count += 1
    self.listener.version += 1
    current_version = self.listener.version
    while self.listener.worker_ahead_count >= self.listener.waiting_bound:
        pass

    # stash the gradients locally and hand them to the sender thread
    self.gradient_warehouse.push(gradients, current_version)
    self.queue.put((GSMessageCode.GradientUpdate, gradients, 0, current_version, 0, 0))

    # block until the listener has processed a GradientUpdate for this push
    lock_queue.get()

    # periodically drop stale entries from the warehouse
    if self.idx % 100 == 1:
        self.gradient_warehouse.clean_redundant()
    self.idx += 1

    if lock.locked():
        # a model sync with the server (ModelRequest/ModelUpdate) is in flight;
        # wait for it to finish and skip the rest of this iteration
        lock.acquire()
        lock.release()
        return loss
    return loss

def step(self, closure=None):
    """Performs a single optimization step.

    Arguments:
        closure (callable, optional): A closure that reevaluates the model
            and returns the loss.
    """
    loss = None
    if closure is not None:
        loss = closure()

    # send a parameter request every n_pull iterations
    if self.idx % self.n_pull == 0:
        send_message(MessageCode.ParameterRequest, self.accumulated_gradients)  # payload is a dummy value

    lr = self.param_groups[0]['lr']

    # keep track of accumulated gradients so that we can send them later
    gradients = ravel_model_params(self.model, grads=True)
    self.accumulated_gradients.add_(gradients, alpha=-lr)

    # send a gradient update every n_push iterations
    if self.idx % self.n_push == 0:
        send_message(MessageCode.GradientUpdate, self.accumulated_gradients)  # send gradients to the server
        self.accumulated_gradients.zero_()

    # internal SGD update
    for group in self.param_groups:
        for p in group['params']:
            if p.grad is None:
                continue
            d_p = p.grad.data
            p.data.add_(d_p, alpha=-group['lr'])

    self.idx += 1
    return loss

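# DownpourSGD.__init__ and step() above are meant to be used like an ordinary
# optimizer inside a training loop: parameters are requested from the server
# every n_pull steps and accumulated gradients are pushed every n_push steps.
# A minimal, hypothetical worker-side loop (the model, the data loader and the
# distributed setup are placeholders, not part of the original code):
import torch.nn as nn
import torch.nn.functional as F

model = nn.Linear(10, 2)  # placeholder model
optimizer = DownpourSGD(model.parameters(), lr=0.01,
                        n_push=5, n_pull=5, model=model)

for inputs, targets in train_loader:  # train_loader is assumed to exist
    optimizer.zero_grad()
    loss = F.cross_entropy(model(inputs), targets)
    loss.backward()
    optimizer.step()  # local SGD update plus push/pull every N iterations
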
def receive(self, sender, message_code, gradient_version, trigger, fast_flag, parameter):
    """Receive parameter updates from the server and reflect them into the client's model."""
    _LOGGER.info("Processing message: {}, version: {}, lr: {}".format(
        message_code.name, gradient_version, self.lr))

    if message_code == GSMessageCode.GradientUpdate:
        if not fast_flag:
            # this worker is not a fast node, so this gradient version should not be kept locally
            self.gradient_warehouse.remove(self.version)
        self.version = max(self.version, gradient_version)

        if trigger == 0:
            update_model_params(self.model, parameter, self.lr)
        elif trigger in self.gradient_warehouse.gradient_storage.keys():
            # received a lower node's gradient
            update_model_params(self.model, parameter, self.lr)
        lock_queue.put(gradient_version)

    elif message_code == GSMessageCode.ModelRequest:
        lock.acquire()
        model = ravel_model_params(self.model, grads=False)
        # send the current model back to the server
        self.queue.put((GSMessageCode.ModelUpdate, model, 0, 0, 0, 0))
        print('send model to server')

    elif message_code == GSMessageCode.ModelUpdate:
        unravel_model_params(self.model, parameter)
        self.version = max(self.version, gradient_version)
        print('unravel_model_params')
        lock.release()
        self.worker_ahead_count = 0

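# receive() above is invoked by the listener thread once a complete message has
# arrived. GradientMessageListener allocates numel() + 5 slots, which matches the
# five header fields (sender, message_code, gradient_version, trigger, fast_flag)
# that precede the payload. A plausible, hypothetical run() loop is sketched
# below; the real decoding in this codebase may differ.
import torch.distributed as dist


def run(self):
    while True:
        dist.recv(tensor=self.m_parameter)  # block until a message arrives
        header, payload = self.m_parameter[:5], self.m_parameter[5:]
        self.receive(int(header[0]),                 # sender rank
                     GSMessageCode(int(header[1])),  # message code enum
                     int(header[2]),                 # gradient_version
                     int(header[3]),                 # trigger
                     int(header[4]),                 # fast_flag
                     payload)                        # flattened parameters/gradients
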