def main_loop(self, options):
    """Poll the sockets and drive job scheduling until a signal arrives.

    Every iteration waits on ``self.poller`` and then, in this order:

    * leaves the loop when ``self.pipe_r`` is readable (a signal was
      received),
    * drains the controler socket,
    * drains the event socket and restarts the iteration,
    * reloads the slave certificates when the inotify descriptor is
      readable,
    * marks as offline the dispatchers that stayed silent for more than
      ``DISPATCHER_TIMEOUT`` (checked every ``PING_INTERVAL``),
    * schedules, starts and cancels jobs, either fully (every
      ``SCHEDULE_INTERVAL``) or partially for the pending events.

    A database ``OperationalError`` or ``InterfaceError`` raised during an
    iteration closes the Django connection; the loop resumes after a two
    second pause.

    :param options: command options; only passed to
        ``get_encryption_settings()`` in this method.
    """
    last_schedule = last_dispatcher_check = time.time()
    encryption_settings = get_encryption_settings(options)

    while True:
        try:
            try:
                # Compute the timeout: wake up for whichever periodic task
                # (scheduling or dispatcher check) is due first.
                now = time.time()
                timeout = min(
                    SCHEDULE_INTERVAL - (now - last_schedule),
                    PING_INTERVAL - (now - last_dispatcher_check),
                )
                # If some actions are remaining, decrease the timeout
                if any(self.events.values()):
                    timeout = min(timeout, 2)
                # Convert to milliseconds and wait at least for 1ms (the
                # computed value is negative when a task is overdue).
                timeout = max(timeout * 1000, 1)

                # Wait for data or a timeout
                sockets = dict(self.poller.poll(timeout))
            except zmq.error.ZMQError:
                continue

            if sockets.get(self.pipe_r) == zmq.POLLIN:
                self.logger.info("[POLL] Received a signal, leaving")
                break

            # Command socket
            if sockets.get(self.controler) == zmq.POLLIN:
                while self.controler_socket():  # Unqueue all pending messages
                    pass

            # Events socket
            if sockets.get(self.event_socket) == zmq.POLLIN:
                self.logger.info("[EVENT] handling events")
                while self.read_event_socket():  # Unqueue all pending messages
                    pass
                # Wait for the next iteration to handle the event.
                # In fact, the code that generated the event (lava-logs or
                # lava-server-gunicorn) needs some time to commit the
                # database transaction.
                # If we are too fast, the database object won't be
                # available (or in the right state) yet.
                continue

            # Inotify socket
            if sockets.get(self.inotify_fd) == zmq.POLLIN:
                # Consume the pending inotify data; its content is not used.
                os.read(self.inotify_fd, 4096)

                if self.auth is not None:
                    self.logger.info(
                        "[AUTH] Reloading certificates from %s",
                        encryption_settings["slaves_certs"],
                    )
                    self.auth.configure_curve(
                        domain="*", location=encryption_settings["slaves_certs"]
                    )
                else:
                    self.logger.error(
                        "[AUTH] New certificates in %s but encryption is disabled",
                        encryption_settings["slaves_certs"],
                    )

            # Check dispatchers status
            now = time.time()
            if now - last_dispatcher_check > PING_INTERVAL:
                for hostname, dispatcher in self.dispatchers.items():
                    if (
                        dispatcher.online
                        and now - dispatcher.last_msg > DISPATCHER_TIMEOUT
                    ):
                        if hostname == "lava-logs":
                            self.logger.error("[STATE] lava-logs goes OFFLINE")
                        else:
                            self.logger.error(
                                "[STATE] Dispatcher <%s> goes OFFLINE", hostname
                            )
                        dispatcher.go_offline()
                last_dispatcher_check = now

            # Limit accesses to the database. This will also limit the rate of
            # CANCEL and START messages
            if time.time() - last_schedule > SCHEDULE_INTERVAL:
                if self.dispatchers["lava-logs"].online:
                    schedule(self.logger)

                    # Dispatch scheduled jobs
                    with transaction.atomic():
                        self.start_jobs()
                else:
                    self.logger.warning("lava-logs is offline: can't schedule jobs")

                # Handle canceling jobs
                with transaction.atomic():
                    self.cancel_jobs()

                # Do not count the time taken to schedule jobs
                last_schedule = time.time()
            else:
                # Cancel the jobs and remove the jobs from the set
                if self.events["canceling"]:
                    with transaction.atomic():
                        self.cancel_jobs(partial=True)
                    self.events["canceling"] = set()
                # Schedule for available device-types
                if self.events["available_dt"]:
                    jobs = schedule(self.logger, self.events["available_dt"])
                    self.events["available_dt"] = set()
                    # Dispatch scheduled jobs
                    with transaction.atomic():
                        self.start_jobs(jobs)

        except (OperationalError, InterfaceError):
            self.logger.info("[RESET] database connection reset.")
            # Closing the database connection will force Django to reopen
            # the connection
            connection.close()
            time.sleep(2)
def handle(self, *args, **options):
    """Entry point of the lava-logs command.

    Sets up logging, drops privileges, dumps the options to
    ``lava-logs-config.yaml`` under ``settings.MEDIA_ROOT``, creates the
    PULL log socket and the ROUTER controler socket (optionally protected
    by CURVE encryption), then runs ``self.main_loop()``.

    When the main loop ends, the controler socket is dropped while the log
    socket is unbound and drained so that already queued messages are
    still processed before everything is closed.

    :param args: unused positional arguments.
    :param options: command options. Keys read here: ``level``,
        ``log_file``, ``user``, ``group``, ``ipv6``, ``socket`` and
        ``master_socket``; the whole mapping is also given to
        ``get_encryption_settings()`` and dumped to the config file.
    """
    # Initialize logging.
    self.setup_logging("lava-logs", options["level"], options["log_file"], FORMAT)

    self.logger.info("[INIT] Starting lava-logs")
    self.logger.info("[INIT] Version %s", __version__)

    self.logger.info("[INIT] Dropping privileges")
    if not self.drop_privileges(options["user"], options["group"]):
        self.logger.error("[INIT] Unable to drop privileges")
        return

    filename = os.path.join(settings.MEDIA_ROOT, "lava-logs-config.yaml")
    self.logger.debug("[INIT] Dumping config to %s", filename)
    with open(filename, "w") as output:
        yaml_dump(options, output)

    # Create the sockets
    context = zmq.Context()
    self.log_socket = context.socket(zmq.PULL)
    self.controler = context.socket(zmq.ROUTER)
    self.controler.setsockopt(zmq.IDENTITY, b"lava-logs")
    # Limit the number of messages in the queue
    self.controler.setsockopt(zmq.SNDHWM, 2)
    # From http://api.zeromq.org/4-2:zmq-setsockopt#toc5
    # "Immediately readies that connection for data transfer with the master"
    self.controler.setsockopt(zmq.CONNECT_RID, b"master")

    if options["ipv6"]:
        self.logger.info("[INIT] Enabling IPv6")
        self.log_socket.setsockopt(zmq.IPV6, 1)
        self.controler.setsockopt(zmq.IPV6, 1)

    encryption_settings = get_encryption_settings(options)
    if encryption_settings["encrypt"]:
        self.logger.info("[INIT] Starting encryption")
        try:
            self.auth = ThreadAuthenticator(context)
            self.auth.start()
            self.logger.debug(
                "[INIT] Opening master certificate: %s",
                encryption_settings["master_cert"],
            )
            master_public, master_secret = zmq.auth.load_certificate(
                encryption_settings["master_cert"]
            )
            self.logger.debug(
                "[INIT] Using slaves certificates from: %s",
                encryption_settings["slaves_certs"],
            )
            self.auth.configure_curve(
                domain="*", location=encryption_settings["slaves_certs"]
            )
        except OSError as err:
            self.logger.error("[INIT] %s", err)
            self.auth.stop()
            # Nothing is bound or connected yet: release the sockets and
            # the context instead of leaking them on this early exit.
            self.log_socket.close(linger=0)
            self.controler.close(linger=0)
            context.term()
            return
        # The log socket is bound below and acts as the CURVE server.
        self.log_socket.curve_publickey = master_public
        self.log_socket.curve_secretkey = master_secret
        self.log_socket.curve_server = True
        # The controler connects to the master: the master public key is
        # used as the server key.
        self.controler.curve_publickey = master_public
        self.controler.curve_secretkey = master_secret
        self.controler.curve_serverkey = master_public

        self.logger.debug("[INIT] Watching %s", encryption_settings["slaves_certs"])
        self.cert_dir_path = encryption_settings["slaves_certs"]
        self.inotify_fd = watch_directory(encryption_settings["slaves_certs"])
        if self.inotify_fd is None:
            self.logger.error("[INIT] Unable to start inotify")

    self.log_socket.bind(options["socket"])
    self.controler.connect(options["master_socket"])

    # Poll on the sockets. This allows a nice timeout along with polling.
    self.poller = zmq.Poller()
    self.poller.register(self.log_socket, zmq.POLLIN)
    self.poller.register(self.controler, zmq.POLLIN)
    if self.inotify_fd is not None:
        self.poller.register(os.fdopen(self.inotify_fd), zmq.POLLIN)

    # Translate signals into zmq messages
    (self.pipe_r, _) = self.setup_zmq_signal_handler()
    self.poller.register(self.pipe_r, zmq.POLLIN)

    self.logger.info("[INIT] listening for logs")
    # PING right now: the master is waiting for this message to start
    # scheduling.
    self.controler.send_multipart([b"master", b"PING"])

    try:
        self.main_loop()
    except BaseException as exc:
        self.logger.error("[EXIT] Unknown exception raised, leaving!")
        self.logger.exception(exc)

    # Close the controler socket. It is removed from the poller first so
    # that a closed socket is never left registered.
    self.poller.unregister(self.controler)
    self.controler.close(linger=0)

    # Carefully close the logging socket as we don't want to lose messages
    self.logger.info("[EXIT] Disconnect logging socket and process messages")
    endpoint = u(self.log_socket.getsockopt(zmq.LAST_ENDPOINT))
    self.logger.debug("[EXIT] unbinding from '%s'", endpoint)
    self.log_socket.unbind(endpoint)

    # Empty the queue
    try:
        while self.wait_for_messages(True):
            # Flush test cases cache for every iteration because we might
            # get killed soon.
            self.flush_test_cases()
    except BaseException as exc:
        self.logger.error("[EXIT] Unknown exception raised, leaving!")
        self.logger.exception(exc)
    finally:
        # Last flush
        self.flush_test_cases()
        self.logger.info("[EXIT] Closing the logging socket: the queue is empty")
        self.log_socket.close()
        if encryption_settings["encrypt"]:
            self.auth.stop()
        context.term()
def handle(self, *args, **options):
    """Entry point of the lava-master command.

    Sets up logging, drops privileges, dumps the options to
    ``lava-master-config.yaml`` under ``settings.MEDIA_ROOT``, marks every
    worker as offline, creates the ROUTER controler socket and the SUB
    event socket (the controler optionally protected by CURVE encryption),
    sends the upgrade notifications and runs ``self.main_loop(options)``.

    Whatever ends the main loop, both sockets are closed without lingering,
    the authenticator is stopped when encryption is enabled and the zmq
    context is terminated.

    :param args: unused positional arguments.
    :param options: command options. Keys read here: ``level``,
        ``log_file``, ``user``, ``group``, ``ipv6``, ``master_socket`` and
        ``event_url``; the whole mapping is also given to
        ``get_encryption_settings()`` and ``main_loop()`` and dumped to the
        config file.
    """
    # Initialize logging.
    self.setup_logging("lava-master", options["level"], options["log_file"], FORMAT)

    self.logger.info("[INIT] Starting lava-master")
    self.logger.info("[INIT] Version %s", __version__)
    self.logger.info("[INIT] Using protocol version %d", PROTOCOL_VERSION)

    self.logger.info("[INIT] Dropping privileges")
    if not self.drop_privileges(options["user"], options["group"]):
        self.logger.error("[INIT] Unable to drop privileges")
        return

    filename = os.path.join(settings.MEDIA_ROOT, "lava-master-config.yaml")
    self.logger.debug("[INIT] Dumping config to %s", filename)
    with open(filename, "w") as output:
        yaml_dump(options, output)

    self.logger.info("[INIT] Marking all workers as offline")
    with transaction.atomic():
        for worker in Worker.objects.select_for_update().all():
            worker.go_state_offline()
            worker.save()

    # Create the sockets
    context = zmq.Context()
    self.controler = context.socket(zmq.ROUTER)
    self.event_socket = context.socket(zmq.SUB)

    if options["ipv6"]:
        self.logger.info("[INIT] Enabling IPv6")
        self.controler.setsockopt(zmq.IPV6, 1)
        self.event_socket.setsockopt(zmq.IPV6, 1)

    encryption_settings = get_encryption_settings(options)
    if encryption_settings["encrypt"]:
        self.logger.info("[INIT] Starting encryption")
        try:
            self.auth = ThreadAuthenticator(context)
            self.auth.start()
            self.logger.debug(
                "[INIT] Opening master certificate: %s",
                encryption_settings["master_cert"],
            )
            master_public, master_secret = zmq.auth.load_certificate(
                encryption_settings["master_cert"]
            )
            self.logger.debug(
                "[INIT] Using slaves certificates from: %s",
                encryption_settings["slaves_certs"],
            )
            self.auth.configure_curve(
                domain="*", location=encryption_settings["slaves_certs"]
            )
        except OSError as err:
            self.logger.error("[INIT] %s", err)
            self.auth.stop()
            # Nothing is bound or connected yet: release the sockets and
            # the context instead of leaking them on this early exit.
            self.controler.close(linger=0)
            self.event_socket.close(linger=0)
            context.term()
            return
        # The controler socket is bound below and acts as the CURVE server.
        self.controler.curve_publickey = master_public
        self.controler.curve_secretkey = master_secret
        self.controler.curve_server = True

        self.logger.debug("[INIT] Watching %s", encryption_settings["slaves_certs"])
        self.inotify_fd = watch_directory(encryption_settings["slaves_certs"])
        if self.inotify_fd is None:
            self.logger.error("[INIT] Unable to start inotify")

    self.controler.setsockopt(zmq.IDENTITY, b"master")
    # From http://api.zeromq.org/4-2:zmq-setsockopt#toc42
    # "If two clients use the same identity when connecting to a ROUTER
    # [...] the ROUTER socket shall hand-over the connection to the new
    # client and disconnect the existing one."
    self.controler.setsockopt(zmq.ROUTER_HANDOVER, 1)
    self.controler.bind(options["master_socket"])

    # Set the topic and connect
    self.event_socket.setsockopt(zmq.SUBSCRIBE, b(settings.EVENT_TOPIC))
    self.event_socket.connect(options["event_url"])

    # Poll on the sockets. This allows a nice timeout along with polling.
    self.poller = zmq.Poller()
    self.poller.register(self.controler, zmq.POLLIN)
    self.poller.register(self.event_socket, zmq.POLLIN)
    if self.inotify_fd is not None:
        self.poller.register(os.fdopen(self.inotify_fd), zmq.POLLIN)

    # Translate signals into zmq messages
    (self.pipe_r, _) = self.setup_zmq_signal_handler()
    self.poller.register(self.pipe_r, zmq.POLLIN)

    # Send master upgrade notifications.
    send_upgraded_master_notifications(__version__, self.logger)

    # Main loop
    self.logger.info("[INIT] Starting main loop")
    try:
        self.main_loop(options)
    except BaseException as exc:
        self.logger.error("[CLOSE] Unknown exception raised, leaving!")
        self.logger.exception(exc)
    finally:
        # Drop controler socket: the protocol does handle lost messages
        self.logger.info("[CLOSE] Closing the controler socket and dropping messages")
        self.controler.close(linger=0)
        self.event_socket.close(linger=0)
        if encryption_settings["encrypt"]:
            self.auth.stop()
        context.term()