Exemplo n.º 1
0
def monitor_node_latencies():
    """Continuously emit a latency metric for every remote node by pinging it.

    The id of this node is read from the ``ID`` environment variable. If no
    other node with a hostname different from ``"localhost"`` is configured,
    the function logs that fact and returns. Otherwise it loops forever: on
    every pass the node list is re-read with ``get_nodes()`` and, for each
    remote node, the output of ``./metrics/ping.sh <hostname>`` is parsed as
    a float and stored in the ``host_latency`` metric. A failure for a single
    node is logged and does not stop the loop.
    """
    own_id = int(os.getenv("ID"))

    def remote_nodes(all_nodes):
        # Every node except this one that is not running on localhost.
        return {
            k: node
            for k, node in all_nodes.items()
            if k != own_id and node.hostname != "localhost"
        }

    if not remote_nodes(get_nodes()):
        logger.info("No use to ping when running locally, aborting")
        return

    while True:
        # Re-read the configuration on every pass so the set of pinged nodes
        # follows the current node list.
        nodes = get_nodes()
        for n_id, node in remote_nodes(nodes).items():
            try:
                # Build argv as a list so the hostname is always handed to
                # the script as exactly one argument, whatever it contains.
                cmd = ["sudo", "sh", "./metrics/ping.sh", node.hostname]
                res = subprocess.Popen(cmd,
                                       stdout=subprocess.PIPE).communicate()
                latency = float(res[0].decode().replace("\n", ""))
                host_latency.labels(own_id, nodes[own_id].hostname, n_id,
                                    node.hostname).set(latency)
            except Exception as e:
                logger.error(f"Got error {e} when pinging {node.hostname}")
Exemplo n.º 2
0
def setup_communication(resolver):
    """Wire up the message channels and run the asyncio event loop.

    Starts the receiver in its own thread, builds one sender per other node,
    injects senders and receiver into ``resolver``, schedules every sender on
    the event loop, sets the resolver's system status to READY and then
    blocks in ``loop.run_forever()``.

    NOTE(review): ``id`` is not defined in this block; presumably it is a
    module-level variable holding this node's id -- verify.
    """
    nodes = config.get_nodes()
    own_node = nodes[id]

    # receiver channel for messages coming from other nodes, served by its
    # own thread
    receiver = Receiver(id, own_node.ip, own_node.port, resolver,
                        resolver.on_message_sent)
    Thread(target=receiver.start).start()

    # one sender channel per other node, keyed by that node's id
    senders = {
        node.id: Sender(id, node, resolver.on_message_sent)
        for node in nodes.values()
        if id != node.id
    }
    logger.info("All senders connected")

    resolver.senders = senders
    resolver.receiver = receiver

    # schedule every sender on the event loop before it starts running
    loop = asyncio.get_event_loop()
    for sender in senders.values():
        loop.create_task(sender.start())

    resolver.system_status = SystemStatus.READY

    loop.run_forever()
    loop.close()
Exemplo n.º 3
0
def setup_fd_communication(resolver):
    """Set up the self-stabilizing channels used by the failure detectors.

    Starts one receiver listening on port ``7000 + id`` and one sender per
    other node targeting port ``7000 + node.id``, each in its own thread, and
    stores them on ``resolver`` as ``fd_receiver`` and ``fd_senders``.

    NOTE(review): ``id`` is not defined in this block; presumably it is a
    module-level variable holding this node's id -- verify.
    """
    nodes = config.get_nodes()

    # receiver channel for the failure detectors on other nodes
    receiver = FDReceiver(("0.0.0.0", 7000 + id),
                          on_message_recv=resolver.dispatch_msg)
    Thread(target=receiver.listen).start()

    # sender channels towards the failure detectors on other nodes
    senders = {}
    for node in nodes.values():
        if id == node.id:
            # no channel to ourselves
            continue
        fd_sender = FDSender(id, (node.hostname, 7000 + node.id),
                             check_ready=resolver.system_running,
                             on_message_sent=resolver.on_message_sent)
        senders[node.id] = fd_sender
        Thread(target=fd_sender.start).start()

    # inject to resolver
    resolver.fd_senders = senders
    resolver.fd_receiver = receiver
    logger.info("All self-stab UDP senders connected")
    def refresh(self, new_node):
        """Bring the resolver up to date after a node has joined the system.

        Called by the API when a new node has been added. Reloads the node
        list, pushes the new node count to the modules, extends the failure
        detector's ``beat`` by one entry and starts the sender threads for
        ``new_node``.
        """
        self.nodes = get_nodes()
        node_count = len(self.nodes)

        # propagate the new system size to the modules
        self.modules[Module.RECMA_MODULE].number_of_nodes = node_count
        self.modules[Module.RECSA_MODULE].number_of_nodes = node_count
        failure_detector = self.modules[Module.FAILURE_DETECTOR_MODULE]
        failure_detector.number_of_nodes = node_count
        failure_detector.beat += [0]
        self.modules[
            Module.JOINING_MECHANISM_MODULE].number_of_nodes = node_count

        # regular sender for the new node runs in its own thread
        sender_thread = Thread(target=self.run_sender_in_new_thread,
                               args=(new_node, ))
        sender_thread.start()

        # failure detector sender for the new node, also in its own thread
        fd_address = (new_node.hostname, 7000 + new_node.id)
        fd_sender = FDSender(id, fd_address,
                             check_ready=self.system_running,
                             on_message_sent=self.on_message_sent)
        self.fd_senders[new_node.id] = fd_sender
        Thread(target=fd_sender.start).start()

        logger.info(f"System refreshed, now {len(self.nodes)} nodes in system")
Exemplo n.º 5
0
    def run(self, testing=False):
        """Main loop of the module, meant to be run in a separate thread.

        Unless ``testing`` is set, first waits until the resolver reports the
        system as running. Each iteration then handles at most one queued
        message (or sleeps briefly when the queue is empty). With ``testing``
        set the method returns after that single iteration; otherwise it
        sends a message to every other node on the first iteration and calls
        ``throttle()`` before looping again.
        """
        if not testing:
            # block until system is ready
            while not self.resolver.system_running():
                time.sleep(0.1)

        while True:
            if not self.msg_queue.empty():
                incoming = self.msg_queue.get()
                processor_j = incoming["sender"]
                self.upon_token_from_pj(processor_j)
                self.send_msg(processor_j)
            else:
                time.sleep(0.1)

            if testing:
                return

            if self.first_run:
                # contact every other node once on the first pass
                for node_j in conf.get_nodes():
                    if node_j != self.id:
                        self.send_msg(node_j)
                self.first_run = False

            throttle()
def get_nodes_list():
    """Return all nodes in the system, keyed by node id.

    Each node is serialized with its ``to_dct()`` method and the resulting
    mapping is passed through ``jsonify``. Used by the joining script to have
    a new node join the system.
    """
    serialized = {
        n_id: node.to_dct()
        for n_id, node in conf.get_nodes().items()
    }
    return jsonify(serialized)
Exemplo n.º 7
0
def fetch_data_for_all_nodes(timeout=5):
    """Fetch data from all nodes through their /data endpoint.

    Args:
        timeout: Seconds to wait for each node before the request is
            abandoned. Without a timeout a single unresponsive node would
            block this call indefinitely.

    Returns:
        A list with one ``{"node": ..., "data": ...}`` dict per configured
        node, or ``None`` if any request or JSON decoding fails (the error
        is logged).
    """
    try:
        data = []
        for node in conf.get_nodes().values():
            # each node's /data endpoint is addressed on port 4000 + node id
            r = requests.get(f"http://{node.ip}:{4000+node.id}/data",
                             timeout=timeout)
            data.append({"node": node.to_dct(), "data": r.json()})
        return data
    except Exception as e:
        # all-or-nothing: one failing node discards the partial result
        logger.error(f"Error when fetching data for other nodes: {e}")
        return None
Exemplo n.º 8
0
 def test_config_can_parse_nodes_txt(self):
     """Check that config.get_nodes parses a two-line hosts file.

     Writes a temporary hosts file with two comma-separated entries, loads
     it through ``config.get_nodes`` and verifies the parsed fields.
     """
     s = "0,localhost,127.0.0.1,5000\n1,localhost,127.0.0.1,5001\n"
     path = "conf/tmp.txt"
     with open(path, "w") as f:
         f.write(s)
     try:
         hosts = config.get_nodes(hosts_path=path)
         self.assertEqual(len(hosts.values()), 2)
         self.assertEqual(hosts[0].id, 0)
         self.assertEqual(hosts[0].hostname, "localhost")
         self.assertEqual(hosts[0].ip, "127.0.0.1")
         self.assertEqual(hosts[0].port, 5000)
         self.assertEqual(hosts[1].id, 1)
     finally:
         # remove the temporary file even when an assertion above fails
         os.remove(path)
Exemplo n.º 9
0
    def __init__(self, testing=False):
        """Initializes the resolver.

        Args:
            testing: When True, the background thread that runs
                ``wait_for_other_nodes`` is not started.
        """
        self.modules = None
        self.senders = {}
        self.fd_senders = {}
        self.receiver = None
        self.fd_receiver = None
        self.nodes = get_nodes()

        # locks used to avoid race conditions with modules
        self.view_est_lock = Lock()
        self.replication_lock = Lock()
        self.prim_mon_lock = Lock()

        self.own_comm_ready = False
        self.other_comm_ready = False
        self.system_status = SystemStatus.BOOTING

        # inject resolver in rate limiter module
        rate_limiter.resolver = self

        # Support non-self-stabilizing mode: self-stabilization stays on
        # unless the NON_SELF_STAB environment variable is set (to any value)
        self.self_stab = os.getenv("NON_SELF_STAB") is None

        # metrics
        self.total_msgs_sent = 0
        self.view_est_msgs = 0
        self.view_est_bytes = 0
        self.rep_msgs = 0
        self.rep_bytes = 0
        self.prim_mon_msgs = 0
        self.prim_mon_bytes = 0
        self.fd_msgs = 0
        self.fd_bytes = 0
        self.total_bytes_sent = 0
        self.experiment_started = False

        # check other nodes for system ready before starting system; the
        # thread is started last so it can never observe a resolver whose
        # attributes are only partially initialized
        if not testing:
            t = Thread(target=self.wait_for_other_nodes)
            t.start()
Exemplo n.º 10
0
    def __init__(self, testing=False):
        """Initializes the resolver.

        Args:
            testing: When True, the background thread that runs
                ``wait_for_other_nodes`` is not started.
        """
        self.modules = None
        self.senders = {}
        self.fd_senders = {}
        self.receiver = None
        self.fd_receiver = None
        self.nodes = get_nodes()

        self.own_comm_ready = False
        self.other_comm_ready = False
        self.system_status = SystemStatus.BOOTING

        # inject resolver in rate limiter module
        rate_limiter.resolver = self

        # Support non-self-stabilizing mode: self-stabilization stays on
        # unless the NON_SELF_STAB environment variable is set (to any value)
        self.self_stab = os.getenv("NON_SELF_STAB") is None

        # check other nodes for system ready before starting system; the
        # thread is started last so it can never observe a resolver whose
        # attributes are only partially initialized
        if not testing:
            t = Thread(target=self.wait_for_other_nodes)
            t.start()
Exemplo n.º 11
0
    def send_msg(self):
        """Send this processor's view-establishment state to every other node.

        For the processor's own id the local ``echo`` entry is refreshed
        instead of sending anything. For every other node j a
        VIEW_ESTABLISHMENT_MESSAGE is sent through the resolver, carrying the
        phase, witnesses and predicate/action info of processor i
        (``own_data``) together with what processor i holds about processor j
        (``about_data``).

        Nothing is sent when the node is configured as Byzantine with the
        UNRESPONSIVE behaviour. With the DIFFERENT_VIEWS or FORCING_RESET
        behaviours, ``own_data`` is replaced by forged values.
        """
        # stay silent if node configured to be unresponsive
        if byz.is_byzantine() and byz.get_byz_behavior() == byz.UNRESPONSIVE:
            return

        for node_j in conf.get_nodes():
            if node_j == self.id:
                # update own echo instead of sending message
                own_info = self.pred_and_action.get_info(self.id)
                self.echo[self.id] = {
                    VIEWS: own_info[0],
                    PHASE: self.phs[self.id],
                    WITNESSES: self.witnesses[self.id],
                    VCHANGE: own_info[1]
                }
                continue

            # node_i's own data
            own_info = self.pred_and_action.get_info(self.id)
            own_data = [
                deepcopy(self.phs[self.id]),
                deepcopy(self.witnesses[self.id]),
                deepcopy(own_info[0]),
                deepcopy(own_info[1])
            ]

            # what node_i thinks about node_j
            about_info = self.pred_and_action.get_info(node_j)
            about_data = [
                deepcopy(self.phs[node_j]),
                deepcopy(self.witnesses[node_j]),
                deepcopy(about_info[0]),
                deepcopy(about_info[1])
            ]

            # When acting Byzantine, own_data is overwritten: either with
            # different views for different nodes (to trick them) or with
            # the reset pair.
            if byz.is_byzantine():
                if byz.get_byz_behavior() == byz.DIFFERENT_VIEWS:
                    # even node ids are told view 1, odd node ids view 2
                    forged_view = 1 if node_j % 2 == 0 else 2
                    own_data = [
                        0, True, {CURRENT: forged_view, NEXT: forged_view},
                        False
                    ]
                elif byz.get_byz_behavior() == byz.FORCING_RESET:
                    own_data = [0, True, self.pred_and_action.RST_PAIR, False]

            payload = {
                "own_data": deepcopy(own_data),
                "about_data": deepcopy(about_data)
            }
            msg = {
                "type": MessageType.VIEW_ESTABLISHMENT_MESSAGE,
                "sender": self.id,
                "data": payload
            }
            self.resolver.send_to_node(node_j, msg)