class TestPaastaLeaderElection(unittest.TestCase):
    """Unit tests for PaastaLeaderElection built on mocked collaborators."""

    def setUp(self):
        """Create an election whose parent ``Election.__init__`` is patched out."""
        self.mock_client = mock.Mock()
        self.mock_control = mock.Mock()
        init_patch = mock.patch(
            "paasta_tools.deployd.leader.Election.__init__", autospec=False
        )
        with init_patch:
            self.election = PaastaLeaderElection(
                self.mock_client, control=self.mock_control
            )

    def test_init(self):
        """A new election is not waiting for a reconnect."""
        self.assertFalse(self.election.waiting_for_reconnect)

    def test_log(self):
        """The election exposes a logger that accepts an info message."""
        self.election.log.info("THING")

    def test_run(self):
        """``run`` forwards its arguments to the patched ``Election.run``."""
        leader_callback = mock.Mock()
        callback_arg = mock.Mock()
        run_patch = mock.patch(
            "paasta_tools.deployd.leader.Election.run", autospec=True
        )
        with run_patch as patched_run:
            self.election.run(leader_callback, callback_arg)
            patched_run.assert_called_with(
                self.election, leader_callback, callback_arg
            )

    def test_connection_listener(self):
        """SUSPENDED spawns the reconnection thread; LOST puts ABORT on control."""
        thread_patch = mock.patch(
            "paasta_tools.deployd.leader.PaastaThread", autospec=True
        )
        with thread_patch as patched_thread:
            for state in (KazooState.CONNECTED, KazooState.SUSPENDED):
                self.election.connection_listener(state)
            patched_thread.assert_called_with(
                target=self.election.reconnection_listener
            )
            self.assertTrue(self.election.waiting_for_reconnect)

            self.election.connection_listener(KazooState.LOST)
            self.mock_control.put.assert_called_with("ABORT")

    def test_reconnection_listener(self):
        """Check the listener with the client CONNECTED, then SUSPENDED."""
        # Client already connected: the flag is clear and nothing is sent.
        self.mock_client.state = KazooState.CONNECTED
        self.election.reconnection_listener()
        self.assertFalse(self.election.waiting_for_reconnect)
        self.assertFalse(self.mock_control.put.called)

        # Client still suspended: the flag stays set and ABORT is sent.
        self.mock_client.state = KazooState.SUSPENDED
        self.election.waiting_for_reconnect = True
        with mock.patch("time.sleep", autospec=True):
            self.election.reconnection_listener()
        self.assertTrue(self.election.waiting_for_reconnect)
        self.mock_control.put.assert_called_with("ABORT")
class DeployDaemon(PaastaThread):
    """Daemon thread that runs a leader election and, once elected, starts the
    watcher and worker threads that share the ``instances_to_bounce`` queue.
    """

    def __init__(self) -> None:
        """Load the system config and build logging, metrics and queues."""
        super().__init__()
        self.started = False
        self.daemon = True
        service_configuration_lib.disable_yaml_cache()
        self.config = load_system_paasta_config()
        # Both setup helpers below read self.config, so it is loaded first.
        self.setup_logging()
        self.metrics = get_metrics_interface("paasta.deployd")
        self.setup_instances_to_bounce()
        self.control = PaastaQueue("ControlQueue")
        self.marathon_clients = get_marathon_clients_from_config()

    def setup_instances_to_bounce(self) -> None:
        """Create ``self.instances_to_bounce`` as selected by the config."""
        queue: DelayDeadlineQueueProtocol
        if self.config.get_deployd_use_zk_queue():
            kazoo_client = KazooClient(hosts=self.config.get_zk_hosts())
            kazoo_client.start()
            queue = ZKDelayDeadlineQueue(client=kazoo_client)
        else:
            queue = DelayDeadlineQueue()
        self.instances_to_bounce = queue

    def setup_logging(self) -> None:
        """Set the root log level, attach a stream handler, quiet kazoo."""
        level_name = self.config.get_deployd_log_level()
        root = logging.getLogger()
        root.setLevel(getattr(logging, level_name))

        stream_handler = logging.StreamHandler()
        stream_handler.addFilter(AddHostnameFilter())
        root.addHandler(stream_handler)

        logging.getLogger("kazoo").setLevel(logging.CRITICAL)

        log_format = logging.Formatter(
            "%(asctime)s:%(hostname)s:%(levelname)s:%(name)s:%(message)s"
        )
        stream_handler.setFormatter(log_format)

    def run(self) -> None:
        """Count the process start, then run the election with ``startup``."""
        self.log.info("paasta-deployd starting up...")
        self.metrics.create_counter(
            "process_started", paasta_cluster=self.config.get_cluster()
        ).count()

        with ZookeeperPool() as self.zk:
            self.election = PaastaLeaderElection(
                self.zk,
                "/paasta-deployd-leader",
                socket.getfqdn(),
                control=self.control,
            )
            self.is_leader = False
            self.log.info("Waiting to become leader")
            self.election.run(self.startup)
            self.log.info("Leadership given up, exiting...")

    @property
    def watcher_threads_enabled(self) -> List[Type[watchers.PaastaWatcher]]:
        """Direct ``PaastaWatcher`` subclasses from ``watchers`` not disabled in config."""
        disabled = self.config.get_disabled_watchers()
        candidates = []
        for _name, member in inspect.getmembers(watchers):
            # Only classes whose first base is PaastaWatcher qualify.
            if inspect.isclass(member) and member.__bases__[0] == watchers.PaastaWatcher:
                candidates.append(member)
        return [klass for klass in candidates if klass.__name__ not in disabled]

    def startup(self) -> None:
        """Leader callback: start watchers, enqueue services, start workers,
        then enter ``main_loop``.
        """
        self.is_leader = True
        hostname = socket.getfqdn()
        self.log.info("This node is elected as leader {}".format(hostname))
        self.metrics.create_counter(
            "leader_elections", paasta_cluster=self.config.get_cluster()
        ).count()

        self.log.info("Starting all watcher threads")
        self.start_watchers()

        self.log.info(
            "All watchers started, now adding all services for initial bounce"
        )
        self.add_all_services()

        self.log.info("Prioritising services that we know need a bounce...")
        if self.config.get_deployd_startup_oracle_enabled():
            self.prioritise_bouncing_services()

        self.log.info("Starting worker threads")
        self.start_workers()

        queue_metrics = QueueAndWorkerMetrics(
            queue=self.instances_to_bounce,
            workers=self.workers,
            cluster=self.config.get_cluster(),
            metrics_provider=self.metrics,
        )
        queue_metrics.start()

        self.started = True
        self.log.info("Startup finished!")
        self.main_loop()

    def main_loop(self) -> None:
        """Poll the control queue and supervise threads until told to ABORT."""
        while True:
            try:
                message = self.control.get(block=False)
            except Empty:
                # Nothing queued this tick; fall through to supervision.
                pass
            else:
                if message == "ABORT":
                    self.log.info("Got ABORT message, main_loop exiting...")
                    return

            if not self.all_watchers_running():
                self.log.error("One or more watcher died, committing suicide!")
                sys.exit(1)
            if self.all_workers_dead():
                self.log.error("All workers have died, committing suicide!")
                sys.exit(1)

            self.check_and_start_workers()
            time.sleep(0.1)

    def all_watchers_running(self) -> bool:
        """Return True when every watcher thread reports ``is_alive()``."""
        alive_flags = [thread.is_alive() for thread in self.watcher_threads]
        return all(alive_flags)

    def all_workers_dead(self) -> bool:
        """Return True when no worker thread reports ``is_alive()``."""
        alive_flags = [thread.is_alive() for thread in self.workers]
        return not any(alive_flags)

    def check_and_start_workers(self) -> None:
        """Start one replacement worker for each worker below the configured count."""
        alive_count = sum(1 for thread in self.workers if thread.is_alive())
        missing = self.config.get_deployd_number_workers() - alive_count
        for _ in range(missing):
            self.log.error(DEAD_DEPLOYD_WORKER_MESSAGE)
            replacement = PaastaDeployWorker(
                len(self.workers) + 1,
                self.instances_to_bounce,
                self.config,
                self.metrics,
            )
            replacement.start()
            self.workers.append(replacement)

    def stop(self) -> None:
        """Ask ``main_loop`` to exit by queueing an ABORT message."""
        self.control.put("ABORT")

    def start_workers(self) -> None:
        """Create and start the configured number of deploy workers."""
        self.workers: List[PaastaDeployWorker] = []
        worker_count = self.config.get_deployd_number_workers()
        for worker_no in range(worker_count):
            new_worker = PaastaDeployWorker(
                worker_no, self.instances_to_bounce, self.config, self.metrics
            )
            new_worker.start()
            self.workers.append(new_worker)

    def add_all_services(self) -> None:
        """Enqueue every marathon instance of this cluster for a startup bounce."""
        cluster_instances = get_services_for_cluster(
            cluster=self.config.get_cluster(),
            instance_type="marathon",
            soa_dir=DEFAULT_SOA_DIR,
        )
        for service, instance in cluster_instances:
            # Timestamps are sampled per entry, in the order the fields appear.
            bounce_by = (
                time.time() + self.config.get_deployd_startup_bounce_deadline()
            )
            wait_until = time.time()
            bounce_start_time = time.time()
            enqueue_time = time.time()
            entry = ServiceInstance(
                service=service,
                instance=instance,
                watcher="daemon_start",
                bounce_by=bounce_by,
                wait_until=wait_until,
                failures=0,
                bounce_start_time=bounce_start_time,
                enqueue_time=enqueue_time,
            )
            self.instances_to_bounce.put(entry)

    def prioritise_bouncing_services(self) -> None:
        """Enqueue instances reported as needing a bounce with an immediate deadline."""
        now = None
        service_instances = get_service_instances_that_need_bouncing(
            self.marathon_clients, DEFAULT_SOA_DIR
        )
        now = time.time()
        watcher_name = type(self).__name__
        for service_instance in service_instances:
            self.log.info(
                f"Prioritising {service_instance} to be bounced immediately")
            service, instance = service_instance.split(".")
            entry = ServiceInstance(
                service=service,
                instance=instance,
                watcher=watcher_name,
                bounce_by=now,
                wait_until=now,
                failures=0,
                bounce_start_time=time.time(),
                enqueue_time=time.time(),
            )
            self.instances_to_bounce.put(entry)

    def start_watchers(self) -> None:
        """ should block until all threads happy"""
        self.watcher_threads = [
            watcher_class(
                instances_to_bounce=self.instances_to_bounce,
                cluster=self.config.get_cluster(),
                zookeeper_client=self.zk,
                config=self.config,
            )
            for watcher_class in self.watcher_threads_enabled
        ]
        self.log.info(
            f"Starting the following watchers {self.watcher_threads}")
        for thread in self.watcher_threads:
            thread.start()

        self.log.info("Waiting for all watchers to start")
        # Up to 120 readiness checks, sleeping one second between them.
        for _attempt in range(120):
            ready_flags = [thread.is_ready for thread in self.watcher_threads]
            if all(ready_flags):
                return
            self.log.info("Sleeping and waiting for watchers to all start")
            not_ready = [
                thread.__class__.__name__
                for thread in self.watcher_threads
                if not thread.is_ready
            ]
            self.log.info("Waiting on: {}".format(not_ready))
            time.sleep(1)

        self.log.error("Failed to start all the watchers, exiting...")
        sys.exit(1)
class DeployDaemon(PaastaThread):
    """Daemon thread that runs a leader election and, once elected, starts the
    inbox, watcher and worker threads around ``inbox_q`` and ``bounce_q``.
    """

    def __init__(self):
        """Load the system config and build logging and the queues."""
        super(DeployDaemon, self).__init__()
        self.started = False
        self.daemon = True
        service_configuration_lib.disable_yaml_cache()
        self.config = load_system_paasta_config()
        self.setup_logging()
        self.bounce_q = DedupedPriorityQueue("BounceQueue")
        self.inbox_q = PaastaQueue("InboxQueue")
        self.control = PaastaQueue("ControlQueue")
        self.inbox = Inbox(self.inbox_q, self.bounce_q)
        self.marathon_clients = get_marathon_clients_from_config()

    def setup_logging(self):
        """Set the root log level, attach a stream handler, quiet kazoo."""
        root_logger = logging.getLogger()
        root_logger.setLevel(
            getattr(logging, self.config.get_deployd_log_level()))
        handler = logging.StreamHandler()
        handler.addFilter(AddHostnameFilter())
        root_logger.addHandler(handler)
        logging.getLogger("kazoo").setLevel(logging.CRITICAL)
        handler.setFormatter(
            logging.Formatter(
                '%(asctime)s:%(hostname)s:%(levelname)s:%(name)s:%(message)s'))

    def run(self):
        """Thread entry point: run the election with ``startup`` as callback."""
        self.log.info("paasta-deployd starting up...")
        with ZookeeperPool() as self.zk:
            self.log.info("Waiting to become leader")
            self.election = PaastaLeaderElection(
                self.zk,
                "/paasta-deployd-leader",
                socket.getfqdn(),
                control=self.control,
            )
            self.is_leader = False
            self.election.run(self.startup)

    def bounce(self, service_instance):
        """Put ``service_instance`` on the inbox queue."""
        self.inbox_q.put(service_instance)

    @property
    def watcher_threads_enabled(self):
        """Direct ``PaastaWatcher`` subclasses from ``watchers`` not disabled in config."""
        disabled_watchers = self.config.get_disabled_watchers()
        watcher_classes = [
            obj[1] for obj in inspect.getmembers(watchers)
            if inspect.isclass(obj[1])
            and obj[1].__bases__[0] == watchers.PaastaWatcher
        ]
        return [
            x for x in watcher_classes if x.__name__ not in disabled_watchers
        ]

    def startup(self):
        """Leader callback: start metrics, inbox, watchers and workers, then
        enter ``main_loop``.
        """
        self.is_leader = True
        self.log.info("This node is elected as leader {}".format(
            socket.getfqdn()))
        self.metrics = get_metrics_interface('paasta.deployd')
        QueueMetrics(self.inbox, self.bounce_q, self.config.get_cluster(),
                     self.metrics).start()
        self.inbox.start()
        self.log.info("Starting all watcher threads")
        self.start_watchers()
        self.log.info(
            "All watchers started, now adding all services for initial bounce")
        self.add_all_services()
        self.log.info("Prioritising services that we know need a bounce...")
        if self.config.get_deployd_startup_oracle_enabled():
            self.prioritise_bouncing_services()
        self.log.info("Starting worker threads")
        self.start_workers()
        self.started = True
        self.log.info("Startup finished!")
        self.main_loop()

    def main_loop(self):
        """Poll the control queue and supervise threads until told to ABORT.

        Calls ``sys.exit(1)`` when any watcher has died or all workers are dead.
        """
        while True:
            try:
                message = self.control.get(block=False)
            except Empty:
                message = None
            if message == "ABORT":
                break
            if not self.all_watchers_running():
                self.log.error("One or more watcher died, committing suicide!")
                sys.exit(1)
            if self.all_workers_dead():
                self.log.error("All workers have died, committing suicide!")
                sys.exit(1)
            self.check_and_start_workers()
            time.sleep(0.1)

    def all_watchers_running(self):
        """Return True when every watcher thread reports ``is_alive()``."""
        return all(watcher.is_alive() for watcher in self.watcher_threads)

    def all_workers_dead(self):
        """Return True when no worker thread reports ``is_alive()``."""
        return all(not worker.is_alive() for worker in self.workers)

    def check_and_start_workers(self):
        """Start one replacement worker for each worker below the configured count."""
        live_workers = sum(1 for worker in self.workers if worker.is_alive())
        number_of_dead_workers = (
            self.config.get_deployd_number_workers() - live_workers
        )
        for _ in range(number_of_dead_workers):
            self.log.error(
                "Detected a dead worker, starting a replacement thread")
            worker_no = len(self.workers) + 1
            worker = PaastaDeployWorker(worker_no, self.inbox_q, self.bounce_q,
                                        self.config, self.metrics)
            worker.start()
            self.workers.append(worker)

    def stop(self):
        """Ask ``main_loop`` to exit by queueing an ABORT message."""
        self.control.put("ABORT")

    def start_workers(self):
        """Create and start the configured number of deploy workers."""
        self.workers = []
        for i in range(self.config.get_deployd_number_workers()):
            worker = PaastaDeployWorker(i, self.inbox_q, self.bounce_q,
                                        self.config, self.metrics)
            worker.start()
            self.workers.append(worker)

    def add_all_services(self):
        """Put every marathon instance of this cluster on the inbox queue,
        as returned by ``rate_limit_instances``.
        """
        instances = get_services_for_cluster(
            cluster=self.config.get_cluster(),
            instance_type='marathon',
            soa_dir=DEFAULT_SOA_DIR,
        )
        instances_to_add = rate_limit_instances(
            instances=instances,
            cluster=self.config.get_cluster(),
            number_per_minute=self.config.get_deployd_startup_bounce_rate(),
            watcher_name='daemon_start',
            priority=99,
        )
        for service_instance in instances_to_add:
            self.inbox_q.put(service_instance)

    def prioritise_bouncing_services(self):
        """Queue instances reported as needing a bounce with ``bounce_by`` set to now."""
        service_instances = get_service_instances_that_need_bouncing(
            self.marathon_clients,
            DEFAULT_SOA_DIR,
        )
        for service_instance in service_instances:
            self.log.info("Prioritising {} to be bounced immediately".format(
                service_instance))
            service, instance = service_instance.split('.')
            self.inbox_q.put(
                ServiceInstance(
                    service=service,
                    instance=instance,
                    cluster=self.config.get_cluster(),
                    watcher=type(self).__name__,
                    bounce_by=int(time.time()),
                    bounce_timers=None,
                    failures=0,
                ))

    def start_watchers(self):
        """Start every enabled watcher and block until all report ready.

        Calls ``sys.exit(1)`` if a watcher thread dies while we are waiting.
        """
        self.watcher_threads = [
            watcher(
                inbox_q=self.inbox_q,
                cluster=self.config.get_cluster(),
                zookeeper_client=self.zk,
                config=self.config,
            ) for watcher in self.watcher_threads_enabled
        ]
        self.log.info("Starting the following watchers {}".format(
            self.watcher_threads))
        for watcher in self.watcher_threads:
            watcher.start()
        self.log.info("Waiting for all watchers to start")
        while not all(watcher.is_ready for watcher in self.watcher_threads):
            self.log.debug("Sleeping and waiting for watchers to all start")
            time.sleep(1)
            # A watcher thread that has exited can never become ready. Without
            # this check the loop would spin forever and main_loop, which
            # holds the only other liveness check, would never be reached.
            if not self.all_watchers_running():
                self.log.error(
                    "One or more watcher died during startup, "
                    "committing suicide!")
                sys.exit(1)
class DeployDaemon(PaastaThread):
    """Daemon thread that runs a leader election and, once elected, starts the
    inbox, watcher and worker threads around ``inbox_q`` and ``bounce_q``.
    """

    def __init__(self):
        """Load the system config, configure root logging and build the queues."""
        super(DeployDaemon, self).__init__()
        self.started = False
        self.daemon = True
        service_configuration_lib.disable_yaml_cache()
        self.config = load_system_paasta_config()
        root_logger = logging.getLogger()
        root_logger.setLevel(
            getattr(logging, self.config.get_deployd_log_level()))
        log_handlers = [logging.StreamHandler()]
        # Also log to syslog when the local /dev/log path exists.
        if os.path.exists('/dev/log'):
            log_handlers.append(logging.handlers.SysLogHandler('/dev/log'))
        for handler in log_handlers:
            root_logger.addHandler(handler)
            handler.setFormatter(
                logging.Formatter('%(levelname)s:%(name)s:%(message)s'))
        self.bounce_q = PaastaQueue("BounceQueue")
        self.inbox_q = PaastaQueue("InboxQueue")
        self.control = PaastaQueue("ControlQueue")
        self.inbox = Inbox(self.inbox_q, self.bounce_q)

    def run(self):
        """Thread entry point: run the election with ``startup`` as callback."""
        self.log.info("paasta-deployd starting up...")
        with ZookeeperPool() as self.zk:
            self.log.info("Waiting to become leader")
            self.election = PaastaLeaderElection(self.zk,
                                                 "/paasta-deployd-leader",
                                                 socket.getfqdn(),
                                                 control=self.control)
            self.is_leader = False
            self.election.run(self.startup)

    def bounce(self, service_instance):
        """Put ``service_instance`` on the inbox queue."""
        self.inbox_q.put(service_instance)

    def startup(self):
        """Leader callback: start metrics, inbox, watchers and workers, then
        enter ``main_loop``.
        """
        self.is_leader = True
        self.log.debug("This node is elected as leader {}".format(
            socket.getfqdn()))
        self.metrics = get_metrics_interface(
            self.config.get_deployd_metrics_provider())
        QueueMetrics(self.inbox, self.bounce_q, self.config.get_cluster(),
                     self.metrics).start()
        self.inbox.start()
        self.log.info("Starting all watcher threads")
        self.start_watchers()
        self.log.info(
            "All watchers started, now adding all services for initial bounce")
        self.add_all_services()
        self.log.info("Starting worker threads")
        self.start_workers()
        self.started = True
        self.main_loop()

    def main_loop(self):
        """Poll the control queue and supervise threads until told to ABORT.

        Calls ``sys.exit(1)`` when any watcher has died or all workers are dead.
        """
        while True:
            try:
                message = self.control.get(block=False)
            except Empty:
                message = None
            if message == "ABORT":
                break
            if not self.all_watchers_running():
                self.log.error("One or more watcher died, committing suicide!")
                sys.exit(1)
            if self.all_workers_dead():
                self.log.error("All workers have died, committing suicide!")
                sys.exit(1)
            self.check_and_start_workers()
            time.sleep(0.1)

    def all_watchers_running(self):
        """Return True when every watcher thread reports ``is_alive()``."""
        return all(watcher.is_alive() for watcher in self.watcher_threads)

    def all_workers_dead(self):
        """Return True when no worker thread reports ``is_alive()``."""
        return all(not worker.is_alive() for worker in self.workers)

    def check_and_start_workers(self):
        """Start one replacement worker for each worker below the configured count."""
        live_workers = sum(1 for worker in self.workers if worker.is_alive())
        number_of_dead_workers = (
            self.config.get_deployd_number_workers() - live_workers
        )
        for _ in range(number_of_dead_workers):
            worker_no = len(self.workers) + 1
            worker = PaastaDeployWorker(worker_no, self.inbox_q, self.bounce_q,
                                        self.config.get_cluster(),
                                        self.metrics)
            worker.start()
            self.workers.append(worker)

    def stop(self):
        """Ask ``main_loop`` to exit by queueing an ABORT message."""
        self.control.put("ABORT")

    def start_workers(self):
        """Create and start the configured number of deploy workers."""
        self.workers = []
        for i in range(self.config.get_deployd_number_workers()):
            worker = PaastaDeployWorker(i, self.inbox_q, self.bounce_q,
                                        self.config.get_cluster(),
                                        self.metrics)
            worker.start()
            self.workers.append(worker)

    def add_all_services(self):
        """Put every marathon instance of this cluster on the inbox queue,
        as returned by ``rate_limit_instances``.
        """
        instances = get_services_for_cluster(cluster=self.config.get_cluster(),
                                             instance_type='marathon',
                                             soa_dir=DEFAULT_SOA_DIR)
        instances_to_add = rate_limit_instances(
            instances=instances,
            number_per_minute=self.config.get_deployd_startup_bounce_rate(),
            watcher_name='daemon_start')
        for service_instance in instances_to_add:
            self.inbox_q.put(service_instance)

    def start_watchers(self):
        """Start every direct ``PaastaWatcher`` subclass found in ``watchers``
        and block until all report ready.

        Calls ``sys.exit(1)`` if a watcher thread dies while we are waiting.
        """
        watcher_classes = [
            obj[1] for obj in inspect.getmembers(watchers)
            if inspect.isclass(obj[1])
            and obj[1].__bases__[0] == watchers.PaastaWatcher
        ]
        self.watcher_threads = [
            watcher(inbox_q=self.inbox_q,
                    cluster=self.config.get_cluster(),
                    zookeeper_client=self.zk) for watcher in watcher_classes
        ]
        self.log.info("Starting the following watchers {}".format(
            self.watcher_threads))
        for watcher in self.watcher_threads:
            watcher.start()
        self.log.info("Waiting for all watchers to start")
        while not all(watcher.is_ready for watcher in self.watcher_threads):
            self.log.debug("Sleeping and waiting for watchers to all start")
            time.sleep(1)
            # A watcher thread that has exited can never become ready. Without
            # this check the loop would spin forever and main_loop, which
            # holds the only other liveness check, would never be reached.
            if not self.all_watchers_running():
                self.log.error(
                    "One or more watcher died during startup, "
                    "committing suicide!")
                sys.exit(1)