Exemplo n.º 1
0
def train_step(peer, t, PSS):
    """Run one or more gradient-exchange training rounds for ``peer``.

    Args:
        peer: The node being trained; its ``V`` mapping holds the updates
            received per round.
        t: Either a single round number or a ``range``/``tqdm`` of round
            numbers to iterate over.
        PSS: Not used by the active code; only the disabled network update
            below refers to it.
    """
    rounds = t if isinstance(t, (tqdm, range)) else [t]
    for t in rounds:
        # Local training pass.
        peer.train_one_epoch()

        # Send this round's gradients to the selected neighbors.
        recipients = active_peers(peer.neighbors, peer.params.frac)
        gradients = peer.get_gradients()
        peer.broadcast(protocol.train_step(t, gradients), recipients)

        # Wait for updates labeled with round t.
        # NOTE(review): 3 and 0.05 are presumably timeout/interval in
        # seconds -- confirm against wait_until.
        wait_until(enough_grads, 3, 0.05, peer, t, len(recipients))
        if t in peer.V:
            log(
                'log',
                f"{peer} -- T= {t} -- Got enough messages : {len(peer.V[t])}.")
        else:
            # Nothing arrived: continue with an empty update list.
            peer.V[t] = []
            log('error', f"{peer} received no messages in round {t}")

        # Combine the received updates, then apply them; evaluation runs
        # only on rounds divisible by 10.
        # TODO: review the update function.
        combined = collaborativeUpdateLight(peer, t)
        update_model(peer, combined, evaluate=(t % 10 == 0))

        # From here on the peer accepts gradients for the next round, and
        # this round's buffer can be dropped.
        peer.current_round = t + 1
        del peer.V[t]
        # The PSS-based network update (networkUpdate) is currently disabled.
Exemplo n.º 2
0
def train_step(peer, t):
    """Run one or more model-exchange training rounds for ``peer``.

    Args:
        peer: The node being trained; its ``V`` mapping holds the updates
            received per round.
        t: Either a single round number or a ``range``/``tqdm`` of round
            numbers to iterate over.
    """
    rounds = t if isinstance(t, (tqdm, range)) else [t]
    for t in rounds:
        # Local training pass (a single epoch).
        peer.train_one_epoch()

        # Send the current model parameters to the selected neighbors.
        # TODO: check exchanging gradients instead of model parameters.
        active = active_peers(peer.neighbors, peer.params.frac)
        model_params = peer.get_model_params()
        peer.broadcast(protocol.train_step(t, model_params), active)

        # Wait for updates labeled with round t.
        wait_until(enough_received, WAIT_TIMEOUT, WAIT_INTERVAL, peer, t,
                   len(active))
        if t in peer.V:
            log(
                'log',
                f"{peer} got {len(peer.V[t])}/{len(active)} messages in round {t}."
            )
        else:
            # Nothing arrived: continue with an empty update list.
            peer.V[t] = []
            log('error', f"{peer} received no messages in round {t}.")

        # Called every round; presumably estimate_sigma itself only acts in
        # the first one -- confirm against its definition.
        estimate_sigma(peer)

        # Combine the received updates, then apply them; evaluation runs
        # every EVAL_ROUND rounds.
        combined = collaborativeUpdate(peer, t)
        update_model(peer, combined, evaluate=(t % EVAL_ROUND == 0))

        # From here on the peer accepts updates for the next round, and
        # this round's buffer can be dropped.
        peer.current_round = t + 1
        del peer.V[t]
Exemplo n.º 3
0
def train_step(peer, t):
    """Run one or more model-exchange training rounds for ``peer``.

    When ``t`` is a ``tqdm`` progress bar, its postfix text is updated to
    announce rounds on which an evaluation runs.

    Args:
        peer: The node being trained; its ``V`` mapping holds the updates
            received per round.
        t: Either a single round number or a ``range``/``tqdm`` of round
            numbers to iterate over.
    """
    rounds = t if isinstance(t, (tqdm, range)) else [t]
    for t in rounds:
        # Local training pass (a single epoch).
        peer.train_one_epoch()

        # Send the current model parameters to the selected neighbors.
        recipients = active_peers(peer.neighbors, peer.params.frac)
        model_params = peer.get_model_params()
        peer.broadcast(protocol.train_step(t, model_params), recipients)

        # Wait for updates labeled with round t.
        wait_until(enough_received, WAIT_TIMEOUT, WAIT_INTERVAL, peer, t,
                   len(recipients))
        if t not in peer.V:
            # Nothing arrived: continue with an empty update list.
            peer.V[t] = []
            peer.log('error', f"{peer} received no messages in round {t}.")

        # Combine the received updates.
        combined = collaborativeUpdateLight(peer, t)

        # Evaluation runs every EVAL_ROUND rounds; reflect that in the
        # progress bar when there is one.
        evaluate_now = t % EVAL_ROUND == 0
        if isinstance(rounds, tqdm):
            if evaluate_now:
                status = f"{peer} running evaluation in round {t}..."
            else:
                status = ""
            rounds.set_postfix_str(status)

        # TODO: review the update function.
        update_model(peer, combined, evaluate=evaluate_now, t=t)

        # From here on the peer accepts updates for the next round, and
        # this round's buffer can be dropped.
        peer.current_round = t + 1
        del peer.V[t]
        # The PSS-based network update (networkUpdate) is currently disabled.
Exemplo n.º 4
0
def train_step(peer: Node, t, args):
    """Run one or more server/worker training rounds for ``peer``.

    The peer whose ``id`` equals ``args.server_id`` aggregates the updates
    it received and broadcasts model parameters; every other peer trains
    locally for ``args.epochs`` epochs and sends its parameters to its
    first neighbor.

    Args:
        peer: The node executing the round.
        t: Either a single round number or a ``range``/``tqdm`` of round
            numbers to iterate over.
        args: Run configuration; ``server_id`` and ``epochs`` are read.
    """
    rounds = t if isinstance(t, (tqdm, range)) else [t]
    long_timeout = WAIT_TIMEOUT * 100
    slow_interval = WAIT_INTERVAL * 10
    for t in rounds:
        if peer.id != args.server_id:
            # --- Worker ---
            if t > 0:
                # Adopt the model found in the first entry stored for the
                # previous round before training again.
                wait_until(server_received, long_timeout, slow_interval, peer, t)
                peer.set_model_params(peer.V[t - 1][0][1])
            train_for_x_epoch(peer, args.epochs)
            # Model parameters are exchanged, not gradients.
            msg = protocol.train_step(t, peer.get_model_params())
            peer.send(peer.neighbors[0], msg)
            continue

        # --- Server ---
        wait_until(enough_received, long_timeout, slow_interval, peer, t,
                   len(peer.neighbors))
        # NOTE(review): peer.V[t] is read without a presence check; if the
        # wait ends with nothing stored for round t this lookup presumably
        # fails -- confirm.
        aggregated = GAR(peer, [update for _, update in peer.V[t]])
        # NOTE(review): the broadcast carries the parameters held *before*
        # the aggregate is installed below -- confirm this is intended.
        msg = protocol.train_step(t, peer.get_model_params())
        peer.broadcast(msg)
        peer.set_model_params(aggregated)
        if t % EVAL_ROUND == 0:
            peer.params.logs.append(peer.evaluate(peer.inference, one_batch=True))
Exemplo n.º 5
0
 def populate(self, info):
     """Invoke the remote ``populate`` method with ``info`` and log the outcome.

     Waits on ``self.return_method`` (via ``wait_until`` with
     ``conf.FUNC_TIMEOUT``) and then logs success, failure or timeout.
     The ``populate`` callback entry is removed only on success.
     """
     request = protocol.call_method("populate", info)
     self.send(request)
     finished = wait_until(self.return_method, conf.FUNC_TIMEOUT, 1, "populate")
     if not finished:
         log('warning', f"Calling populate() timeout  after {conf.FUNC_TIMEOUT} seconds")
         return
     # The call completed; the 's' flag tells whether it succeeded.
     if self.callbacks['populate']['s']:
         del self.callbacks['populate']
         log('success', f"{self} populated successfully")
     else:
         log("error", f"Error populating {self}")
Exemplo n.º 6
0
 def fit(self, inference):
     """Invoke the remote ``fit`` method and return the reported history.

     Sends a ``fit`` call carrying ``inference``, then waits on
     ``self.return_method`` (via ``wait_until`` with ``conf.FUNC_TIMEOUT``).

     Returns:
         The ``'m'`` entry of the ``fit`` callback when the call completed
         with a truthy ``'s'`` flag (the callback entry is then removed);
         ``None`` when the call failed or the wait did not complete.
     """
     self.send(protocol.call_method("fit", inference))
     done = wait_until(self.return_method, conf.FUNC_TIMEOUT, 1, "fit")
     if not done:
         log('warning', f"Calling fit() timeout  after {conf.FUNC_TIMEOUT} seconds")
         return None
     if not self.callbacks['fit']['s']:
         # The call completed but reported failure: log it as an error
         # instead of mislabeling it as a timeout, in line with how
         # populate() and connect() distinguish the two cases.
         log("error", f"Error fitting {self}")
         return None
     history = self.callbacks['fit']['m']
     del self.callbacks['fit']
     return history
Exemplo n.º 7
0
 def connect(self, neighbor):
     """Invoke the remote ``connect`` method for ``neighbor``.

     Sends the neighbor's ``id``, ``host`` and ``port``, then waits on
     ``self.return_method`` (via ``wait_until`` with ``conf.FUNC_TIMEOUT``).

     Returns:
         ``True`` when the call completed with a truthy ``'s'`` flag, in
         which case ``neighbor.id`` is appended to ``self.neighbors`` and
         the callback entry is removed; ``False`` on failure or timeout.
     """
     request = protocol.call_method("connect", neighbor.id, neighbor.host, neighbor.port)
     self.send(request)
     finished = wait_until(self.return_method, conf.FUNC_TIMEOUT, 1, "connect")
     if not finished:
         log('warning', f"Calling connect() timeout  after {conf.FUNC_TIMEOUT} seconds")
         return False
     if not self.callbacks['connect']['s']:
         # Completed but unsuccessful: the 'm' entry is logged as the error.
         log("error", self.callbacks['connect']['m'])
         return False
     self.neighbors.append(neighbor.id)
     del self.callbacks["connect"]
     return True
Exemplo n.º 8
0
def edge_devices(args, count=1, rand_ids=False):
    """Start a ``Bridge`` launcher and wait for ``count`` edge devices to join.

    Args:
        args: Run configuration. ``args.mp`` must be non-zero (message
            passing enabled); the object is also forwarded to ``Bridge``.
        count: Number of edge devices expected to join.
        rand_ids: Forwarded to ``Bridge``.

    Returns:
        The started launcher once exactly ``count`` bridges are present,
        or ``None`` when ``count`` is less than 1.

    Raises:
        SystemExit: With status 1 when ``conf.ML_ENGINE`` is not
            ``"NumPy"``, when message passing is disabled, or when fewer
            than ``count`` devices joined after the wait.
    """
    import sys  # local import so the block does not rely on file-level imports

    if count < 1:
        return None
    if conf.ML_ENGINE != "NumPy":
        log('error', "Mobile devices currently only support NumPy based ML")
        sys.exit(1)
    if args.mp == 0:
        log('error', "You need to use message passing when edge devices are involved")
        sys.exit(1)

    launcher = Bridge(count, args, rand_ids=rand_ids)
    launcher.start()
    wait_until(launcher.bridged, conf.LAUNCHER_TIMEOUT, 1)

    # Read the count once so the branch taken and the logged number agree.
    joined = len(launcher.bridges)
    if joined == count:
        log('success', "All edge devices joined successfully")
        return launcher

    if joined == 0:
        log('error', f"No device joined in {conf.LAUNCHER_TIMEOUT} seconds")
    else:
        log('error', f"Only {joined} devices joined after waiting for {conf.LAUNCHER_TIMEOUT} seconds")
    # Stop the launcher on every failed join, not only when nobody joined,
    # and exit with a failure status.
    launcher.stop()
    sys.exit(1)
Exemplo n.º 9
0
 def wait_method(self, method):
     """Wait for the remote ``method`` call to return; warn if it does not.

     Waits on ``self.return_method`` (via ``wait_until`` with
     ``conf.FUNC_TIMEOUT``) and logs a warning when the wait ends without
     completion. Always returns ``None``.
     """
     if wait_until(self.return_method, conf.FUNC_TIMEOUT, 1, method):
         return
     log('warning', f"Calling execute({method}) timeout  after {conf.FUNC_TIMEOUT} seconds")