Example No. 1
    def get_aggr_grads(self, num_wait_ps=-1):
        """ ask servers to get their latest aggregated gradients
        Args
        num_wait_ps number of servers to wait for their response (useful in asynchronous deployments)
        """
        if num_wait_ps < 0:
            num_wait_ps = self.num_ps - self.byz_ps
        if num_wait_ps == self.num_ps:  #FAST PATH: synchronous
            futs = [
                _remote_method_async(ps_type.get_latest_aggr_grad, ps_rref)
                for ps_rref, ps_type in zip(self.ps_rref, self.ps_types)
            ]
            aggr_grads = [fut.wait()[1].to(self.device) for fut in futs]  #each reply is (id, aggregated gradient)
        else:  #Asynchronous path
            aggr_grads_ph = [
                None for i in range(self.num_ps)
            ]  #placeholders for the aggregated gradients that may be received

            def assign_grads_async(fut):
                id, aggr_grad = fut.wait()
                aggr_grads_ph[id] = aggr_grad.to(self.device)

            for id, (ps_rref,
                     ps_type) in enumerate(zip(self.ps_rref, self.ps_types)):
                fut = _remote_method_async(ps_type.get_latest_aggr_grad,
                                           ps_rref)
                #fut.then registers a callback that runs once the future completes
                #the callback receives the future itself; waiting on it yields (id, aggregated gradient) from that server
                fut.then(assign_grads_async)
            #busy-wait until at least num_wait_ps servers have replied
            while self.num_ps - aggr_grads_ph.count(None) < num_wait_ps:
                sleep(1)
            aggr_grads = [ag for ag in aggr_grads_ph if ag is not None]
        return aggr_grads
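
The asynchronous branch above hangs a callback on each future with fut.then and then busy-waits until num_wait_ps placeholders are filled. The following is a minimal, self-contained sketch of that pattern (the simulated servers, delays, and tensor shapes are illustrative assumptions; the real code obtains its futures from _remote_method_async over RPC):

import random
import threading
import time

import torch

NUM_PS, NUM_WAIT = 4, 3                  # accept the 3 fastest of 4 simulated servers
results = [None] * NUM_PS                # placeholder list, like aggr_grads_ph above

def assign_async(fut):
    # callback fired once a future completes; fut.wait() returns immediately here
    idx, grad = fut.wait()
    results[idx] = grad

futs = [torch.futures.Future() for _ in range(NUM_PS)]
for fut in futs:
    fut.then(assign_async)

def fake_server(idx, fut):
    # stand-in for a parameter server replying with (id, aggregated gradient) after some delay
    time.sleep(random.uniform(0.1, 2.0))
    fut.set_result((idx, torch.randn(10)))

for idx, fut in enumerate(futs):
    threading.Thread(target=fake_server, args=(idx, fut), daemon=True).start()

# busy-wait until enough servers have replied, exactly as in get_aggr_grads
while NUM_PS - results.count(None) < NUM_WAIT:
    time.sleep(0.05)

aggr_grads = [g for g in results if g is not None]
print("collected", len(aggr_grads), "of", NUM_PS, "replies")
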
Example No. 2
    def get_gradients(self, iter_num, num_wait_wrk=-1):
        """ ask workers to compute gradients and return them
        Args
        iter_num     the number of the current iteration, to be passed to workers
        num_wait_wrk number of workers to wait for their response (useful in asynchronous deployments)
        """
        if num_wait_wrk < 0:
            num_wait_wrk = self.num_workers - self.byz_wrk
        self.model.train()
        self.optimizer.zero_grad()
        #Fast path
        if num_wait_wrk == self.num_workers:
            def get_grad(fut):
                return fut.wait()[1].to(self.device)  #each reply is (id, gradient, loss); keep only the gradient
            pool_wrk = ThreadPool()
            futs = [_remote_method_async(wrk_type.compute_gradients, wrk_rref, iter_num, self.model) for wrk_rref, wrk_type in zip(self.workers_rref, self.workers_types)]
            build_th = threading.Thread(target=self.build_graph, args=(iter_num,))
            build_th.start()
            grads = pool_wrk.map(get_grad, futs)
            pool_wrk.terminate()
            pool_wrk.join()
            del pool_wrk
        else:       #The call should be done asynchronously, and we should only consider the fastest workers responding
            grads = [None for i in range(self.num_workers)]           #placeholders for the gradients that may be received
            def assign_grad_async(fut):
                """ callback function that is called when some gradient is received asynchronously
                Args
                fut     the future object on which the callback is called
                """
                id, gradient, loss = fut.wait()
                grads[id] = gradient.to(self.device)
            for id, (wrk_rref, wrk_type) in enumerate(zip(self.workers_rref,self.workers_types)):
                fut = _remote_method_async(wrk_type.compute_gradients, wrk_rref, iter_num, self.model)
                #fut.then registers a callback that runs once the future completes
                #the callback receives the future itself; waiting on it yields (id, gradient, loss) from that worker
                fut.then(assign_grad_async)
            #start building the computation graph in parallel with waiting for the replies
            build_th = threading.Thread(target=self.build_graph, args=(iter_num,))
            build_th.start()
            #busy-wait for the replies
            while self.num_workers - grads.count(None) < num_wait_wrk:
                sleep(1)
            #at this point, at least num_wait_wrk workers have replied
            #drop the placeholders of the workers that have not replied yet
            grads = [grad for grad in grads if grad is not None]
#            del grads				#empty it for the next iteration
        #make sure that the graph is built (regardless of synchrony or not)
        build_th.join()
        return grads
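
get_gradients only collects the replies; combining them is left to the caller. As a hedged illustration (coordinate-wise median is one common Byzantine-resilient rule, not necessarily the one this codebase applies), here is how the returned list could be aggregated, assuming every entry is a flat gradient tensor of the same shape:

import torch

def coordinate_wise_median(grads):
    # grads: list of same-shaped gradient tensors, one per responding worker
    stacked = torch.stack(grads)          # shape: (num_replies, *grad_shape)
    return stacked.median(dim=0).values   # elementwise median across the replies

# made-up data standing in for the output of get_gradients(...)
grads = [torch.randn(1000) for _ in range(7)]
aggr = coordinate_wise_median(grads)
print(aggr.shape)                         # torch.Size([1000])
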
Example No. 3
    def get_fake_models(self):
        """ Ask every parameter server for its model (via get_fake_model) and move it to the local device. """
        futs = [_remote_method_async(ps_type.get_fake_model, ps_rref) for ps_rref, ps_type in zip(self.ps_rref, self.ps_types)]
        models = [fut.wait().to(self.device) for fut in futs]
        return models
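
All three examples follow the same fan-out/fan-in pattern: fire one RPC per remote node, then wait on the resulting futures. When every reply is needed (as in get_fake_models), the waiting can also be expressed with torch.futures.collect_all; below is a small sketch with local futures standing in for the RPC ones:

import torch

# local futures standing in for the results of _remote_method_async(...)
futs = [torch.futures.Future() for _ in range(3)]
for i, fut in enumerate(futs):
    fut.set_result(torch.full((4,), float(i)))   # pretend each server already replied

# collect_all returns a single future that completes once every sub-future does
all_done = torch.futures.collect_all(futs)
models = [f.wait() for f in all_done.wait()]     # unwrap each completed future
print([m[0].item() for m in models])             # [0.0, 1.0, 2.0]
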