예제 #1
0
class TestPaastaLeaderElection(unittest.TestCase):
    """Tests for PaastaLeaderElection built on mocked client/control objects."""

    def setUp(self):
        # The mocks do not depend on the patch, so create them up front.
        self.mock_client = mock.Mock()
        self.mock_control = mock.Mock()
        # Election.__init__ is swapped for a mock only while the object
        # under test is being constructed.
        init_patch = mock.patch(
            "paasta_tools.deployd.leader.Election.__init__", autospec=False
        )
        with init_patch:
            self.election = PaastaLeaderElection(
                self.mock_client, control=self.mock_control
            )

    def test_init(self):
        # A freshly built election must not be waiting for a reconnect.
        waiting = self.election.waiting_for_reconnect
        assert not waiting

    def test_log(self):
        # Smoke test: the election exposes a usable logger.
        election_log = self.election.log
        election_log.info("THING")

    def test_run(self):
        # run() must forward its arguments to Election.run unchanged.
        callback, callback_arg = mock.Mock(), mock.Mock()
        run_patch = mock.patch(
            "paasta_tools.deployd.leader.Election.run", autospec=True
        )
        with run_patch as patched_run:
            self.election.run(callback, callback_arg)
            patched_run.assert_called_with(self.election, callback, callback_arg)

    def test_connection_listener(self):
        thread_patch = mock.patch(
            "paasta_tools.deployd.leader.PaastaThread", autospec=True
        )
        with thread_patch as thread_cls:
            listener = self.election.connection_listener
            listener(KazooState.CONNECTED)
            # SUSPENDED is expected to spawn a thread that runs
            # reconnection_listener and to flag the pending reconnect.
            listener(KazooState.SUSPENDED)
            thread_cls.assert_called_with(
                target=self.election.reconnection_listener
            )
            assert self.election.waiting_for_reconnect
            # LOST is expected to push ABORT onto the control queue.
            listener(KazooState.LOST)
            self.mock_control.put.assert_called_with("ABORT")

    def test_reconnection_listener(self):
        # Case 1: client reports CONNECTED -> flag cleared, nothing queued.
        self.mock_client.state = KazooState.CONNECTED
        self.election.reconnection_listener()
        assert not self.election.waiting_for_reconnect
        assert not self.mock_control.put.called

        # Case 2: client stays SUSPENDED -> flag kept and ABORT queued.
        # time.sleep is patched so the listener does not really wait.
        self.mock_client.state = KazooState.SUSPENDED
        self.election.waiting_for_reconnect = True
        sleep_patch = mock.patch("time.sleep", autospec=True)
        with sleep_patch:
            self.election.reconnection_listener()
        assert self.election.waiting_for_reconnect
        self.mock_control.put.assert_called_with("ABORT")
class DeployDaemon(PaastaThread):
    """Deployd master thread.

    ``run`` joins a ``PaastaLeaderElection``; once elected, ``startup`` starts
    the watcher threads, queues every marathon instance for a bounce, starts
    the ``PaastaDeployWorker`` threads and then supervises everything from
    ``main_loop`` until an ``"ABORT"`` control message is read.
    """

    def __init__(self) -> None:
        """Load the system config and create the objects used after election."""
        super().__init__()
        self.started = False
        self.daemon = True
        service_configuration_lib.disable_yaml_cache()
        self.config = load_system_paasta_config()
        self.setup_logging()
        self.metrics = get_metrics_interface("paasta.deployd")
        self.setup_instances_to_bounce()
        # main_loop polls this queue and leaves when it reads "ABORT".
        self.control = PaastaQueue("ControlQueue")
        self.marathon_clients = get_marathon_clients_from_config()

    def setup_instances_to_bounce(self) -> None:
        """Create ``self.instances_to_bounce``.

        When the config asks for the ZK queue, a ``ZKDelayDeadlineQueue`` on
        a freshly started ``KazooClient`` is used; otherwise a plain
        ``DelayDeadlineQueue``.
        """
        if not self.config.get_deployd_use_zk_queue():
            self.instances_to_bounce: DelayDeadlineQueueProtocol = DelayDeadlineQueue()
            return
        kazoo_client = KazooClient(hosts=self.config.get_zk_hosts())
        kazoo_client.start()
        self.instances_to_bounce = ZKDelayDeadlineQueue(client=kazoo_client)

    def setup_logging(self) -> None:
        """Attach a hostname-aware stream handler to the root logger."""
        root = logging.getLogger()
        configured_level = getattr(logging, self.config.get_deployd_log_level())
        root.setLevel(configured_level)

        stream_handler = logging.StreamHandler()
        # AddHostnameFilter is attached because the format string below
        # references %(hostname)s.
        stream_handler.addFilter(AddHostnameFilter())
        root.addHandler(stream_handler)

        # Only CRITICAL records from the "kazoo" logger are let through.
        kazoo_logger = logging.getLogger("kazoo")
        kazoo_logger.setLevel(logging.CRITICAL)

        record_format = logging.Formatter(
            "%(asctime)s:%(hostname)s:%(levelname)s:%(name)s:%(message)s"
        )
        stream_handler.setFormatter(record_format)

    def run(self) -> None:
        """Thread entry point: count the start-up, then run the election.

        ``self.startup`` is handed to the election as the callback to run
        once leadership is obtained.
        """
        self.log.info("paasta-deployd starting up...")
        cluster = self.config.get_cluster()
        process_counter = self.metrics.create_counter(
            "process_started", paasta_cluster=cluster
        )
        process_counter.count()
        with ZookeeperPool() as self.zk:
            identifier = socket.getfqdn()
            self.election = PaastaLeaderElection(
                self.zk, "/paasta-deployd-leader", identifier, control=self.control
            )
            self.is_leader = False
            self.log.info("Waiting to become leader")
            self.election.run(self.startup)
            self.log.info("Leadership given up, exiting...")

    @property
    def watcher_threads_enabled(self) -> List[Type[watchers.PaastaWatcher]]:
        """Return the watcher classes that are not disabled in the config.

        A candidate is any class in the ``watchers`` module whose first base
        class is ``watchers.PaastaWatcher``.
        """
        disabled_names = self.config.get_disabled_watchers()
        enabled: List[Type[watchers.PaastaWatcher]] = []
        for _, member in inspect.getmembers(watchers):
            if not inspect.isclass(member):
                continue
            is_direct_watcher = member.__bases__[0] == watchers.PaastaWatcher
            if not is_direct_watcher:
                continue
            if member.__name__ in disabled_names:
                continue
            enabled.append(member)
        return enabled

    def startup(self) -> None:
        """Leader callback: bring up watchers, queue work, start workers, loop."""
        self.is_leader = True
        fqdn = socket.getfqdn()
        self.log.info("This node is elected as leader {}".format(fqdn))
        election_counter = self.metrics.create_counter(
            "leader_elections", paasta_cluster=self.config.get_cluster()
        )
        election_counter.count()

        self.log.info("Starting all watcher threads")
        self.start_watchers()

        self.log.info(
            "All watchers started, now adding all services for initial bounce"
        )
        self.add_all_services()

        self.log.info("Prioritising services that we know need a bounce...")
        oracle_enabled = self.config.get_deployd_startup_oracle_enabled()
        if oracle_enabled:
            self.prioritise_bouncing_services()

        self.log.info("Starting worker threads")
        self.start_workers()

        metrics_reporter = QueueAndWorkerMetrics(
            queue=self.instances_to_bounce,
            workers=self.workers,
            cluster=self.config.get_cluster(),
            metrics_provider=self.metrics,
        )
        metrics_reporter.start()

        self.started = True
        self.log.info("Startup finished!")
        self.main_loop()

    def main_loop(self) -> None:
        """Supervise watchers and workers until ``"ABORT"`` is received.

        Calls ``sys.exit(1)`` when any watcher is not alive or when every
        worker is dead; otherwise tops up the worker pool and sleeps 0.1s.
        """
        while True:
            message = None
            try:
                message = self.control.get(block=False)
            except Empty:
                # Nothing queued this tick; carry on with the health checks.
                pass
            if message == "ABORT":
                self.log.info("Got ABORT message, main_loop exiting...")
                return
            if not self.all_watchers_running():
                self.log.error("One or more watcher died, committing suicide!")
                sys.exit(1)
            if self.all_workers_dead():
                self.log.error("All workers have died, committing suicide!")
                sys.exit(1)
            self.check_and_start_workers()
            time.sleep(0.1)

    def all_watchers_running(self) -> bool:
        """Return True when no watcher thread reports itself as not alive."""
        stopped = [not watcher.is_alive() for watcher in self.watcher_threads]
        return not any(stopped)

    def all_workers_dead(self) -> bool:
        """Return True when no worker thread reports itself as alive."""
        alive = [worker.is_alive() for worker in self.workers]
        return not any(alive)

    def check_and_start_workers(self) -> None:
        """Start one replacement worker for each worker missing from the pool.

        The shortfall is the configured worker count minus the number of
        workers currently alive. Replacements are appended to
        ``self.workers``; dead entries are left in the list.
        """
        alive_count = sum(1 for worker in self.workers if worker.is_alive())
        shortfall = self.config.get_deployd_number_workers() - alive_count
        for _ in range(shortfall):
            self.log.error(DEAD_DEPLOYD_WORKER_MESSAGE)
            replacement = PaastaDeployWorker(
                len(self.workers) + 1,
                self.instances_to_bounce,
                self.config,
                self.metrics,
            )
            replacement.start()
            self.workers.append(replacement)

    def stop(self) -> None:
        """Ask ``main_loop`` to exit by queueing an ``"ABORT"`` message."""
        self.control.put("ABORT")

    def start_workers(self) -> None:
        """Create and start the configured number of ``PaastaDeployWorker``s."""
        self.workers: List[PaastaDeployWorker] = []
        worker_total = self.config.get_deployd_number_workers()
        for worker_no in range(worker_total):
            new_worker = PaastaDeployWorker(
                worker_no, self.instances_to_bounce, self.config, self.metrics
            )
            new_worker.start()
            self.workers.append(new_worker)

    def add_all_services(self) -> None:
        """Queue every marathon instance of this cluster for a bounce.

        Each entry may be processed immediately (``wait_until`` is now) and
        has the configured startup bounce deadline added to ``bounce_by``.
        """
        marathon_instances = get_services_for_cluster(
            cluster=self.config.get_cluster(),
            instance_type="marathon",
            soa_dir=DEFAULT_SOA_DIR,
        )
        for service, instance in marathon_instances:
            queued = ServiceInstance(
                service=service,
                instance=instance,
                watcher="daemon_start",
                bounce_by=time.time()
                + self.config.get_deployd_startup_bounce_deadline(),
                wait_until=time.time(),
                failures=0,
                bounce_start_time=time.time(),
                enqueue_time=time.time(),
            )
            self.instances_to_bounce.put(queued)

    def prioritise_bouncing_services(self) -> None:
        """Queue instances reported as needing a bounce with an immediate deadline."""
        needing_bounce = get_service_instances_that_need_bouncing(
            self.marathon_clients, DEFAULT_SOA_DIR
        )

        # One timestamp shared by bounce_by/wait_until of every entry.
        now = time.time()

        for service_instance in needing_bounce:
            self.log.info(
                f"Prioritising {service_instance} to be bounced immediately")
            service, instance = service_instance.split(".")
            urgent = ServiceInstance(
                service=service,
                instance=instance,
                watcher=type(self).__name__,
                bounce_by=now,
                wait_until=now,
                failures=0,
                bounce_start_time=time.time(),
                enqueue_time=time.time(),
            )
            self.instances_to_bounce.put(urgent)

    def start_watchers(self) -> None:
        """Start every enabled watcher and block until all report ready.

        Polls once per second for at most 120 attempts; if the watchers are
        still not all ready after that, logs an error and calls
        ``sys.exit(1)``.
        """
        self.watcher_threads = [
            watcher_cls(
                instances_to_bounce=self.instances_to_bounce,
                cluster=self.config.get_cluster(),
                zookeeper_client=self.zk,
                config=self.config,
            )
            for watcher_cls in self.watcher_threads_enabled
        ]

        self.log.info(
            f"Starting the following watchers {self.watcher_threads}")
        for thread in self.watcher_threads:
            thread.start()
        self.log.info("Waiting for all watchers to start")
        for _ in range(120):
            everyone_ready = all(
                [thread.is_ready for thread in self.watcher_threads]
            )
            if everyone_ready:
                return
            self.log.info("Sleeping and waiting for watchers to all start")
            still_waiting = [
                thread.__class__.__name__
                for thread in self.watcher_threads
                if not thread.is_ready
            ]
            self.log.info("Waiting on: {}".format(still_waiting))
            time.sleep(1)
        self.log.error("Failed to start all the watchers, exiting...")
        sys.exit(1)
예제 #3
0
class DeployDaemon(PaastaThread):
    """Deployd master thread.

    ``run`` joins a ``PaastaLeaderElection``; once elected, ``startup`` starts
    the inbox, the watcher threads and the ``PaastaDeployWorker`` threads and
    then supervises them from ``main_loop`` until an ``"ABORT"`` control
    message is read.
    """

    def __init__(self):
        """Load the system config, set up logging and create the queues."""
        super(DeployDaemon, self).__init__()
        self.started = False
        self.daemon = True
        service_configuration_lib.disable_yaml_cache()
        self.config = load_system_paasta_config()
        self.setup_logging()
        self.bounce_q = DedupedPriorityQueue("BounceQueue")
        self.inbox_q = PaastaQueue("InboxQueue")
        # main_loop polls this queue and leaves when it reads "ABORT".
        self.control = PaastaQueue("ControlQueue")
        self.inbox = Inbox(self.inbox_q, self.bounce_q)
        self.marathon_clients = get_marathon_clients_from_config()

    def setup_logging(self):
        """Attach a hostname-aware stream handler to the root logger."""
        root_logger = logging.getLogger()
        root_logger.setLevel(
            getattr(logging, self.config.get_deployd_log_level()))
        handler = logging.StreamHandler()
        # The format string below references %(hostname)s, hence the filter.
        handler.addFilter(AddHostnameFilter())
        root_logger.addHandler(handler)
        # Only CRITICAL records from the "kazoo" logger are let through.
        logging.getLogger("kazoo").setLevel(logging.CRITICAL)
        handler.setFormatter(
            logging.Formatter(
                '%(asctime)s:%(hostname)s:%(levelname)s:%(name)s:%(message)s'))

    def run(self):
        """Thread entry point: run the leader election with ``startup`` as callback."""
        self.log.info("paasta-deployd starting up...")
        with ZookeeperPool() as self.zk:
            self.log.info("Waiting to become leader")
            self.election = PaastaLeaderElection(
                self.zk,
                "/paasta-deployd-leader",
                socket.getfqdn(),
                control=self.control,
            )
            self.is_leader = False
            self.election.run(self.startup)

    def bounce(self, service_instance):
        """Put ``service_instance`` on the inbox queue."""
        self.inbox_q.put(service_instance)

    @property
    def watcher_threads_enabled(self):
        """Return the watcher classes that are not disabled in the config.

        A candidate is any class in the ``watchers`` module whose first base
        class is ``watchers.PaastaWatcher``.
        """
        disabled_watchers = self.config.get_disabled_watchers()
        watcher_classes = [
            obj[1] for obj in inspect.getmembers(watchers)
            if inspect.isclass(obj[1])
            and obj[1].__bases__[0] == watchers.PaastaWatcher
        ]
        enabled_watchers = [
            x for x in watcher_classes if x.__name__ not in disabled_watchers
        ]
        return enabled_watchers

    def startup(self):
        """Leader callback: start metrics, inbox, watchers and workers, then loop."""
        self.is_leader = True
        self.log.info("This node is elected as leader {}".format(
            socket.getfqdn()))
        # self.metrics is created here (not in __init__) and is later handed
        # to every PaastaDeployWorker.
        self.metrics = get_metrics_interface('paasta.deployd')
        QueueMetrics(self.inbox, self.bounce_q, self.config.get_cluster(),
                     self.metrics).start()
        self.inbox.start()
        self.log.info("Starting all watcher threads")
        self.start_watchers()
        self.log.info(
            "All watchers started, now adding all services for initial bounce")
        self.add_all_services()
        self.log.info("Prioritising services that we know need a bounce...")
        if self.config.get_deployd_startup_oracle_enabled():
            self.prioritise_bouncing_services()
        self.log.info("Starting worker threads")
        self.start_workers()
        self.started = True
        self.log.info("Startup finished!")
        self.main_loop()

    def main_loop(self):
        """Supervise watchers and workers until ``"ABORT"`` is received.

        Calls ``sys.exit(1)`` when any watcher is not alive or when every
        worker is dead; otherwise tops up the worker pool and sleeps 0.1s.
        """
        while True:
            try:
                message = self.control.get(block=False)
            except Empty:
                # Nothing queued this tick; carry on with the health checks.
                message = None
            if message == "ABORT":
                break
            if not self.all_watchers_running():
                self.log.error("One or more watcher died, committing suicide!")
                sys.exit(1)
            if self.all_workers_dead():
                self.log.error("All workers have died, comitting suicide!")
                sys.exit(1)
            self.check_and_start_workers()
            time.sleep(0.1)

    def all_watchers_running(self):
        """Return True when every watcher thread is alive."""
        return all([watcher.is_alive() for watcher in self.watcher_threads])

    def all_workers_dead(self):
        """Return True when no worker thread is alive."""
        return all([not worker.is_alive() for worker in self.workers])

    def check_and_start_workers(self):
        """Start one replacement worker for each worker missing from the pool.

        The shortfall is the configured worker count minus the number of
        workers currently alive. Replacements are appended to
        ``self.workers``; dead entries are left in the list.
        """
        live_workers = len(
            [worker for worker in self.workers if worker.is_alive()])
        number_of_dead_workers = self.config.get_deployd_number_workers(
        ) - live_workers
        for i in range(number_of_dead_workers):
            self.log.error(
                "Detected a dead worker, starting a replacement thread")
            worker_no = len(self.workers) + 1
            worker = PaastaDeployWorker(worker_no, self.inbox_q, self.bounce_q,
                                        self.config, self.metrics)
            worker.start()
            self.workers.append(worker)

    def stop(self):
        """Ask ``main_loop`` to exit by queueing an ``"ABORT"`` message."""
        self.control.put("ABORT")

    def start_workers(self):
        """Create and start the configured number of ``PaastaDeployWorker``s."""
        self.workers = []
        for i in range(self.config.get_deployd_number_workers()):
            worker = PaastaDeployWorker(i, self.inbox_q, self.bounce_q,
                                        self.config, self.metrics)
            worker.start()
            self.workers.append(worker)

    def add_all_services(self):
        """Queue every marathon instance of this cluster via the inbox.

        The instances are passed through ``rate_limit_instances`` with the
        configured startup bounce rate before being put on ``inbox_q``.
        """
        instances = get_services_for_cluster(
            cluster=self.config.get_cluster(),
            instance_type='marathon',
            soa_dir=DEFAULT_SOA_DIR,
        )
        instances_to_add = rate_limit_instances(
            instances=instances,
            cluster=self.config.get_cluster(),
            number_per_minute=self.config.get_deployd_startup_bounce_rate(),
            watcher_name='daemon_start',
            priority=99,
        )
        for service_instance in instances_to_add:
            self.inbox_q.put(service_instance)

    def prioritise_bouncing_services(self):
        """Queue instances reported as needing a bounce with ``bounce_by`` set to now."""
        service_instances = get_service_instances_that_need_bouncing(
            self.marathon_clients,
            DEFAULT_SOA_DIR,
        )
        for service_instance in service_instances:
            self.log.info("Prioritising {} to be bounced immediately".format(
                service_instance))
            service, instance = service_instance.split('.')
            self.inbox_q.put(
                ServiceInstance(
                    service=service,
                    instance=instance,
                    cluster=self.config.get_cluster(),
                    watcher=type(self).__name__,
                    bounce_by=int(time.time()),
                    bounce_timers=None,
                    failures=0,
                ))

    def start_watchers(self):
        """Start every enabled watcher and block until all report ready.

        While waiting, the watcher threads are also checked for liveness: a
        thread that has already died can never set ``is_ready``, so instead
        of spinning forever the daemon logs an error and calls
        ``sys.exit(1)``, the same reaction ``main_loop`` has to a dead
        watcher.
        """
        self.watcher_threads = [
            watcher(
                inbox_q=self.inbox_q,
                cluster=self.config.get_cluster(),
                zookeeper_client=self.zk,
                config=self.config,
            ) for watcher in self.watcher_threads_enabled
        ]

        self.log.info("Starting the following watchers {}".format(
            self.watcher_threads))
        for watcher in self.watcher_threads:
            watcher.start()
        self.log.info("Waiting for all watchers to start")
        while not all([watcher.is_ready for watcher in self.watcher_threads]):
            # Fail fast: waiting on a dead thread would block startup forever.
            if not self.all_watchers_running():
                self.log.error(
                    "One or more watcher died while starting up, exiting...")
                sys.exit(1)
            self.log.debug("Sleeping and waiting for watchers to all start")
            time.sleep(1)
예제 #4
0
파일: master.py 프로젝트: ycaihua/paasta
class DeployDaemon(PaastaThread):
    """Deployd master thread.

    ``run`` joins a ``PaastaLeaderElection``; once elected, ``startup`` starts
    the inbox, the watcher threads and the ``PaastaDeployWorker`` threads and
    then supervises them from ``main_loop`` until an ``"ABORT"`` control
    message is read.
    """

    def __init__(self):
        """Load the system config, configure root logging and create the queues."""
        super(DeployDaemon, self).__init__()
        self.started = False
        self.daemon = True
        service_configuration_lib.disable_yaml_cache()
        self.config = load_system_paasta_config()
        root_logger = logging.getLogger()
        root_logger.setLevel(
            getattr(logging, self.config.get_deployd_log_level()))
        # Always log to a stream; additionally log to syslog when the
        # /dev/log path exists.
        log_handlers = [logging.StreamHandler()]
        if os.path.exists('/dev/log'):
            log_handlers.append(logging.handlers.SysLogHandler('/dev/log'))
        for handler in log_handlers:
            root_logger.addHandler(handler)
            handler.setFormatter(
                logging.Formatter('%(levelname)s:%(name)s:%(message)s'))
        self.bounce_q = PaastaQueue("BounceQueue")
        self.inbox_q = PaastaQueue("InboxQueue")
        # main_loop polls this queue and leaves when it reads "ABORT".
        self.control = PaastaQueue("ControlQueue")
        self.inbox = Inbox(self.inbox_q, self.bounce_q)

    def run(self):
        """Thread entry point: run the leader election with ``startup`` as callback."""
        self.log.info("paasta-deployd starting up...")
        with ZookeeperPool() as self.zk:
            self.log.info("Waiting to become leader")
            self.election = PaastaLeaderElection(self.zk,
                                                 "/paasta-deployd-leader",
                                                 socket.getfqdn(),
                                                 control=self.control)
            self.is_leader = False
            self.election.run(self.startup)

    def bounce(self, service_instance):
        """Put ``service_instance`` on the inbox queue."""
        self.inbox_q.put(service_instance)

    def startup(self):
        """Leader callback: start metrics, inbox, watchers and workers, then loop."""
        self.is_leader = True
        self.log.debug("This node is elected as leader {}".format(
            socket.getfqdn()))
        # self.metrics is created here (not in __init__) and is later handed
        # to every PaastaDeployWorker.
        self.metrics = get_metrics_interface(
            self.config.get_deployd_metrics_provider())
        QueueMetrics(self.inbox, self.bounce_q, self.config.get_cluster(),
                     self.metrics).start()
        self.inbox.start()
        self.log.info("Starting all watcher threads")
        self.start_watchers()
        self.log.info(
            "All watchers started, now adding all services for initial bounce")
        self.add_all_services()
        self.log.info("Starting worker threads")
        self.start_workers()
        self.started = True
        self.main_loop()

    def main_loop(self):
        """Supervise watchers and workers until ``"ABORT"`` is received.

        Calls ``sys.exit(1)`` when any watcher is not alive or when every
        worker is dead; otherwise tops up the worker pool and sleeps 0.1s.
        """
        while True:
            try:
                message = self.control.get(block=False)
            except Empty:
                # Nothing queued this tick; carry on with the health checks.
                message = None
            if message == "ABORT":
                break
            if not self.all_watchers_running():
                self.log.error("One or more watcher died, committing suicide!")
                sys.exit(1)
            if self.all_workers_dead():
                self.log.error("All workers have died, comitting suicide!")
                sys.exit(1)
            self.check_and_start_workers()
            time.sleep(0.1)

    def all_watchers_running(self):
        """Return True when every watcher thread is alive."""
        return all([watcher.is_alive() for watcher in self.watcher_threads])

    def all_workers_dead(self):
        """Return True when no worker thread is alive."""
        return all([not worker.is_alive() for worker in self.workers])

    def check_and_start_workers(self):
        """Start one replacement worker for each worker missing from the pool.

        The shortfall is the configured worker count minus the number of
        workers currently alive. Replacements are appended to
        ``self.workers``; dead entries are left in the list.
        """
        live_workers = len(
            [worker for worker in self.workers if worker.is_alive()])
        number_of_dead_workers = self.config.get_deployd_number_workers(
        ) - live_workers
        for i in range(number_of_dead_workers):
            worker_no = len(self.workers) + 1
            worker = PaastaDeployWorker(worker_no, self.inbox_q, self.bounce_q,
                                        self.config.get_cluster(),
                                        self.metrics)
            worker.start()
            self.workers.append(worker)

    def stop(self):
        """Ask ``main_loop`` to exit by queueing an ``"ABORT"`` message."""
        self.control.put("ABORT")

    def start_workers(self):
        """Create and start the configured number of ``PaastaDeployWorker``s."""
        self.workers = []
        for i in range(self.config.get_deployd_number_workers()):
            worker = PaastaDeployWorker(i, self.inbox_q, self.bounce_q,
                                        self.config.get_cluster(),
                                        self.metrics)
            worker.start()
            self.workers.append(worker)

    def add_all_services(self):
        """Queue every marathon instance of this cluster via the inbox.

        The instances are passed through ``rate_limit_instances`` with the
        configured startup bounce rate before being put on ``inbox_q``.
        """
        instances = get_services_for_cluster(cluster=self.config.get_cluster(),
                                             instance_type='marathon',
                                             soa_dir=DEFAULT_SOA_DIR)
        instances_to_add = rate_limit_instances(
            instances=instances,
            number_per_minute=self.config.get_deployd_startup_bounce_rate(),
            watcher_name='daemon_start')
        for service_instance in instances_to_add:
            self.inbox_q.put(service_instance)

    def start_watchers(self):
        """Start every watcher class and block until all report ready.

        A watcher class is any class in the ``watchers`` module whose first
        base class is ``watchers.PaastaWatcher``.

        While waiting, the watcher threads are also checked for liveness: a
        thread that has already died can never set ``is_ready``, so instead
        of spinning forever the daemon logs an error and calls
        ``sys.exit(1)``, the same reaction ``main_loop`` has to a dead
        watcher.
        """
        watcher_classes = [
            obj[1] for obj in inspect.getmembers(watchers)
            if inspect.isclass(obj[1])
            and obj[1].__bases__[0] == watchers.PaastaWatcher
        ]
        self.watcher_threads = [
            watcher(inbox_q=self.inbox_q,
                    cluster=self.config.get_cluster(),
                    zookeeper_client=self.zk) for watcher in watcher_classes
        ]
        self.log.info("Starting the following watchers {}".format(
            self.watcher_threads))
        for watcher in self.watcher_threads:
            watcher.start()
        self.log.info("Waiting for all watchers to start")
        while not all([watcher.is_ready for watcher in self.watcher_threads]):
            # Fail fast: waiting on a dead thread would block startup forever.
            if not self.all_watchers_running():
                self.log.error(
                    "One or more watcher died while starting up, exiting...")
                sys.exit(1)
            self.log.debug("Sleeping and waiting for watchers to all start")
            time.sleep(1)