Exemplo n.º 1
0
class WatchdogProcess:
    """This process receives heartbeats from other containers
    and launches new instances when it detects that a process
    has failed.
    Since this particular process can be started while the
    cluster is already running, in case of a leader failure for
    example, it needs to know the a priori configuration
    of instances. It can be the case that there already are
    failed processes when this process starts."""
    def __init__(self, hostname):
        self.hostname = hostname
        self.basedirname = os.getenv("BASEDIRNAME", "error")
        self.to_start = []
        self.to_stop = []
        self.reassigning = False
        self.load_default_config()

    def run(self):
        logging.info("Starting Watchdog Logic")
        logging.info("Setting up queues")
        self.setup_queues()
        logging.info("Launching receiver thread")
        self.launch_receiver_thread()
        logging.info("Launching spawner threads")
        self.launch_spawner_threads()
        logging.info("Launching checker")
        self.launch_checker()

    def setup_queues(self):
        self.leader_queue = RabbitMQQueue(exchange=EXCHANGE,
                                          consumer=True,
                                          durable=False)
        self.storage_exchange = RabbitMQQueue(exchange='storage_slave',
                                              consumer=False)

    def load_default_config(self):
        self.default_config = load_yaml()

        self.last_timeout = {}
        self.storage_roles = {}
        for key in self.default_config.keys():
            if key == "client":
                continue
            structure_single_key = {
                "{}_{}".format(key, i): time.time() + STARTING_TOLERANCE
                for i in range(self.default_config[key])
            }
            self.last_timeout.update(structure_single_key)
            if key == "storage":  # TODO
                self.storage_roles.update(
                    {k: SLAVE_ROLE
                     for k in structure_single_key.keys()})

        self.storage_master_heartbeat = 0  # instant timeout
        # Special structure to indicate that a container is being restarted and
        # should not be checked for timeouts.
        self.respawning = {k: False for k in self.last_timeout.keys()}

    def process_heartbeat(self, ch, method, properties, body):
        logging.debug("Received heartbeat message is {}".format(str(body)))
        data = body.decode().split(',')
        recv_hostname = data[1]
        recv_metadata = data[2]

        self.last_timeout[recv_hostname] = time.time()

        if recv_hostname.split("_")[0] == "storage":
            self.storage_roles[recv_hostname] = recv_metadata
            if self.storage_roles[recv_hostname] == MASTER_ROLE:
                self.storage_master_heartbeat = self.last_timeout[
                    recv_hostname]

    def mkimgname(self, imgid):
        return "{}_{}_1".format(self.basedirname, imgid)

    def stop_container(self, imgid):
        imgid = self.mkimgname(imgid)
        failed = 0
        result = subprocess.run(['docker', 'stop', "-t", "1", imgid],
                                check=False,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE)
        logging.info(
            'Stop command executed. Result={}. Output={}. Error={}'.format(
                result.returncode, result.stdout, result.stderr))
        while result.returncode != 0 and failed < MAXFAILED:
            failed += 1
            logging.error(
                "Return code was not zero. Trying again.. ({}/{} attempts)".
                format(failed, MAXFAILED))
            result = subprocess.run(['docker', 'stop', "-t", "1", imgid],
                                    check=False,
                                    stdout=subprocess.PIPE,
                                    stderr=subprocess.PIPE)

        if failed >= MAXFAILED:
            raise Exception("Surpassed maximum retries for docker stop")

    def launch_container(self, imgid):
        imgid = self.mkimgname(imgid)
        cmd = ['docker', 'start', imgid]
        result = subprocess.run(cmd,
                                check=False,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE)
        logging.info(
            'Start command executed. CMD={} Result={}. Output={}. Error={}'.
            format(" ".join(cmd), result.returncode, result.stdout,
                   result.stderr))

    def container_is_running(self, imgid):
        result = subprocess.run(['docker', 'ps'],
                                check=False,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE)
        return imgid in str(result.stdout)

    def respawn_stop_loop(self):
        """
        This thread stops containers listed in self.to_pop
        and sends them onto the respawn_start_thread via the
        self.to_start list.
        """
        while True:
            if len(self.to_stop) == 0:
                time.sleep(POLL_INTERVAL)
                continue
            imgid = self.to_stop.pop()
            try:
                self.stop_container(imgid)
                self.to_start.append(imgid)
            except Exception as e:
                # If we find an error stopping the container then we unmark
                #it and hope next time there is no error.
                logging.error("Exception on respawn_stop_thread: " + str(e))
                self.respawning[imgid] = False

    def respawn_start_loop(self):
        """
        This thread starts containers listed in self.to_start
        after a START_SLEEP interval and removes the respawning flag.
        The first heartbeat timeout is set with a STARTING_TOLERANCE
        offset.
        """
        waiting = []  # internal list that we don't have race conditions on
        while True:
            current_time = time.time()
            if len(self.to_start) == 0 and len(waiting) == 0:
                time.sleep(POLL_INTERVAL)
                continue
            if len(self.to_start) > 0:
                imgid = self.to_start.pop()  #atomic
                waiting.append((imgid, current_time + START_SLEEP))
            # launch a container if we have passed the scheduled start time
            for imgid, scheduled_time in waiting:
                if scheduled_time <= current_time:
                    try:
                        self.launch_container(imgid)
                    except Exception as e:
                        logging.error("Exception on respawn_start_thread: " +
                                      str(e))
                    finally:
                        self.last_timeout[imgid] = time.time(
                        ) + STARTING_TOLERANCE
                        self.respawning[imgid] = False
            # filter launched container
            waiting = [(i, t) for i, t in waiting if current_time < t]

    def launch_spawner_threads(self):
        respawn_start = threading.Thread(target=self.respawn_stop_loop)
        respawn_start.start()

        respawn_stop = threading.Thread(target=self.respawn_start_loop)
        respawn_stop.start()

    def launch_receiver_thread(self):
        self.leader_queue.async_consume(self.process_heartbeat, auto_ack=True)

    def reassign_storage_master(self):
        logging.info("Watchdog Leader: reassigning master")
        #TODO: what about the first time?all responding but no master => por esto el flag adicional y eso.

        # setear el master como slave y agarrar un nodo vivo
        logging.info("Storage settings right now: {}".format(
            self.storage_roles))
        current_time = time.time()
        for storage_node in self.storage_roles.keys():
            self.storage_roles[storage_node] = SLAVE_ROLE
            if current_time - self.last_timeout[
                    storage_node] < HEARTBEAT_TIMEOUT:
                new_master = storage_node

        logging.info(
            "Watchdog Leader: new master should be {}, current role is {}".
            format(new_master, self.storage_roles[new_master]))

        self.storage_exchange.publish("{},{}".format(
            NEW_MASTER_MSG,
            new_master.split("_")[-1]  # PID
        ))

        # wait until node has processed the message and changed roles
        while self.storage_roles[new_master] == SLAVE_ROLE:
            logging.info(
                "Watchdog Leader: waiting for {} to change role".format(
                    new_master))
            time.sleep(POLL_INTERVAL)

        logging.info(
            "Watchdog Leader: {} has changed role to {}, moving away".format(
                new_master, self.storage_roles[new_master]))
        logging.info("Storage settings after reassigning: {}".format(
            self.storage_roles))
        self.reassigning = False

    def launch_checker(self):
        i = 0
        while True:
            i = (i + 1) % 5
            current_time = time.time()
            for hostname, last_heartbeat in self.last_timeout.items():
                if current_time - last_heartbeat > HEARTBEAT_TIMEOUT and not self.respawning[
                        hostname]:
                    logging.info(
                        "Detected timeout of hostname {}".format(hostname))
                    self.respawning[hostname] = True
                    self.to_stop.append(
                        hostname)  # another thread reads this queue

            if current_time - self.storage_master_heartbeat > HEARTBEAT_TIMEOUT and not self.reassigning:
                self.reassigning = True
                reassign_thread = threading.Thread(
                    target=self.reassign_storage_master)
                reassign_thread.start()
            if i == 1:
                logging.info("Storage settings right now: {}".format(
                    self.storage_roles))
            time.sleep(CHECK_INTERVAL)
Exemplo n.º 2
0
class ElectableProcess:
    """This class factorizes the leader election protocol.
    When this process is chosen as a leader this class
    runs callback on a separate thread."""
    def __init__(self, pid, pid_list, callback):
        """Starts the election protocol. Receives this
        process' pid number, a list of other pids
        and a callback function used when this process is
        elected."""
        self.pid = pid
        self.pid_list = pid_list
        self.onleader_callback = callback
        self.received_answer = False
        self.leader = None
        logging.basicConfig(
            format='%(asctime)s [PID {}] %(message)s'.format(self.pid))

    def run(self, _):
        self.setup_queues()
        logging.info(
            "Waiting {}s for starting everything.".format(INITIAL_SLEEP))
        time.sleep(INITIAL_SLEEP)
        logging.info("Starting off as listening to heartbeats..")
        self.follow_leader()

    def setup_queues(self):
        self.exchange = RabbitMQQueue(exchange=ELECTION_EXCHANGE,
                                      consumer=False,
                                      exchange_type="direct")
        self.process_queue = RabbitMQQueue(exchange=ELECTION_EXCHANGE,
                                           consumer=True,
                                           exchange_type="direct",
                                           routing_keys=[str(self.pid)])
        self.leader_ex = RabbitMQQueue(exchange=LEADER_EXCHANGE,
                                       consumer=False,
                                       durable=False)
        self.leader_queue = RabbitMQQueue(exchange=LEADER_EXCHANGE,
                                          consumer=True,
                                          durable=False)

    def process_message(self, ch, method, properties, body):
        data = body.decode().split(',')
        logging.info("Received {}".format(body.decode()))
        if data[0] == "election":
            logging.info("Sending answer,{} to {}".format(self.pid, data[1]))
            self.exchange.publish("answer,{}".format(self.pid), data[1])
        elif data[0] == "answer":
            self.received_answer = True  # atomic
        elif data[0] == "coordinator":
            self.leader = int(data[1])  # atomic

    def start_election(self):
        self.received_answer = False
        self.leader = None

        logging.info("Starting election -- sending ELECTION")
        for remote_pid in self.pid_list:
            if remote_pid > self.pid:
                logging.info("Sending election,{} to {}".format(
                    self.pid, remote_pid))
                self.exchange.publish("election,{}".format(self.pid),
                                      str(remote_pid))

        # If we do not receive an answer within the timeout,
        # we are now the coordinator.
        # This processes requests in another thread,

        logging.info("Consuming from queue")

        self.process_queue.async_consume(self.process_message, auto_ack=True)

        logging.info("Sleeping for {}s".format(TIMEOUT_ANSWER))
        time.sleep(TIMEOUT_ANSWER)
        logging.info("Woke up from timeout")

        if self.received_answer:
            logging.info("Answer received!")
            time.sleep(TIMEOUT_COORD)
            logging.info("Woke up from coord timeout")
            # same as before => factorize in a function?
            if self.leader is None:
                logging.info("NO leader still! Rebooting election")
                self.start_election()
            logging.info("NEW leader! id is: {}".format(self.leader))
        else:
            logging.info(
                "NO answer received! Therefore I am the leader. Sending coord."
            )
            # No answer received in timeout, therefore this process is leader
            for remote_pid in self.pid_list:
                if remote_pid < self.pid:
                    logging.info("->coordinator,{} to key {}".format(
                        self.pid, str(remote_pid)))
                    self.exchange.publish("coordinator,{}".format(self.pid),
                                          str(remote_pid))
            logging.info("Setting leader pid")
            self.leader = self.pid

        if self.leader == self.pid:
            logging.info("Starting leader logic in this process")
            self.start_leader()
        else:
            logging.info("Following leader...")
            self.follow_leader()

    def start_leader(self):
        self.logic_process = multiprocessing.Process(
            target=self.onleader_callback)
        self.logic_process.start()
        # set up leader heartbeat
        while True:
            self.leader_ex.publish("heartbeat")
            time.sleep(HEARTBEAT_INTERVAL)

    def process_heartbeat(self, ch, method, properties, body):
        self.last_heartbeat = time.time()

    def follow_leader(self):
        # in another process update the counter with the time
        # this one just wakes up every fraction of a second updating
        # the count
        self.last_heartbeat = time.time() - 0.01
        self.leader_queue.async_consume(self.process_heartbeat, auto_ack=True)
        diff = time.time() - self.last_heartbeat

        while diff < TIMEOUT_HEARTBEAT:
            #logging.info("Sleeping for {}s".format(TIMEOUT_HEARTBEAT - diff + 0.1))
            time.sleep(TIMEOUT_HEARTBEAT - diff + 0.1)  # So we don't over-wait
            diff = time.time() - self.last_heartbeat

        logging.info("Timed out leader. About to start a new election")
        self.start_election()