def run_container_start_checks(self, started_at, timeout_at):
    """
    Run startup checks for the container until they pass or ``timeout_at`` is reached.

    Phase one runs the container-specific start checks; phase two waits until
    every port returned by ``get_check_ports()`` accepts connections.

    :param started_at: Time (seconds since epoch) the container was started at.
    :param timeout_at: Time (seconds since epoch) to give up at.
    :return: ``True`` when all checks passed, ``False`` on timeout.
    :raises FactoryNotStarted: if the container stops running while checking.
    """
    start_time = time.time()
    # Phase 1: container-specific start checks
    container_checks_passed = False
    while time.time() <= timeout_at:
        if not self.is_running():
            raise FactoryNotStarted("{} is no longer running".format(self))
        if self._container_start_checks():
            container_checks_passed = True
            break
    if not container_checks_passed:
        log.error(
            "Failed to run container start checks after %1.2f seconds",
            time.time() - start_time,
        )
        return False
    # Phase 2: wait for every check port to accept connections
    pending_ports = set(self.get_check_ports())
    if not pending_ports:
        return True
    all_ports_connectable = False
    while time.time() <= timeout_at:
        if not self.is_running():
            raise FactoryNotStarted("{} is no longer running".format(self))
        if not pending_ports:
            all_ports_connectable = True
            break
        # Drop any ports which have become connectable since the last poll
        pending_ports -= ports.get_connectable_ports(pending_ports)
        if pending_ports:
            time.sleep(0.5)
    if not all_ports_connectable:
        log.error(
            "Failed to check ports after %1.2f seconds",
            time.time() - start_time,
        )
        return False
    return True
def wait_for_events(self, patterns, timeout=30, after_time=None):
    """
    Wait until every ``(master_id, pattern)`` tuple in ``patterns`` matches a
    stored event, or until ``timeout`` seconds have elapsed.

    :param patterns: An iterable of ``(master_id, event_tag_pattern)`` tuples;
        the pattern is matched against the event tag with :py:func:`fnmatch.fnmatch`.
    :keyword int,float timeout: Maximum amount of time to wait, in seconds.
    :keyword float after_time: Only consider events received after this time
        (defaults to now).
    :return: ``True`` if all patterns matched within the timeout, ``False`` otherwise.
    """
    start_time = time.time()
    patterns = set(patterns)
    log.info("Waiting at most %.2f seconds for event patterns: %s", timeout, patterns)
    if after_time is None:
        after_time = time.time()
    expire = start_time + timeout
    while time.time() <= expire:
        if not patterns:
            break
        # BUGFIX: the second element of each store entry was previously
        # unpacked into a variable named ``expire``, clobbering the loop
        # deadline computed above and corrupting the timeout. It is now
        # unpacked into the unused ``_expire`` so the deadline stays intact.
        for received, _expire, master_id, tag, _data in copy.copy(self.store):
            if received < after_time:
                # Too old, carry on
                continue
            # Iterate a copy since we remove matched patterns while iterating
            for _master_id, _pattern in set(patterns):
                if _master_id != master_id:
                    continue
                if fnmatch.fnmatch(tag, _pattern):
                    patterns.remove((_master_id, _pattern))
        time.sleep(0.125)
    else:
        # The while loop finished without a break, ie, we timed out.
        # BUGFIX: corrected "Timmed out" typo in the log message below.
        log.info(
            "Timed out after %.2f seconds waiting for event patterns: %s",
            time.time() - start_time,
            patterns,
        )
        return False
    return True
def run_start_checks(self, started_at, timeout_at):
    """
    Run the parent class start checks and additionally wait until every event
    returned by ``get_check_events()`` has been seen by the event listener.

    :param started_at: Time (seconds since epoch) the daemon was started at.
    :param timeout_at: Time (seconds since epoch) to give up at.
    :return: ``True`` when all checks passed, ``False`` on timeout.
    :raises FactoryNotStarted: if the daemon stops running while checking.
    """
    if not super().run_start_checks(started_at, timeout_at):
        return False
    if not self.event_listener:
        log.debug(
            "The 'event_listener' attribute is not set. Not checking events..."
        )
        return True
    check_events = set(self.get_check_events())
    if not check_events:
        log.debug("No events to listen to for %s", self)
        return True
    checks_start_time = time.time()
    while time.time() <= timeout_at:
        if not self.is_running():
            raise FactoryNotStarted("{} is no longer running".format(self))
        if not check_events:
            break
        # Discard any events which the listener has already seen
        check_events -= self.event_listener.get_events(
            check_events, after_time=started_at
        )
        if check_events:
            time.sleep(0.5)
    else:
        # The while loop finished without a break, ie, we timed out
        log.error(
            "Failed to check events after %1.2f seconds for %s",
            time.time() - checks_start_time,
            self,
        )
        return False
    # BUGFIX: corrected "Successfuly" typo in the log message below
    log.debug(
        "Successfully checked for all events(%s) for %s",
        set(self.get_check_events()),
        self,
    )
    return True
def wait_for_events(self, patterns, timeout=30, after_time=None):
    """
    Wait for a set of patterns to match or until timeout is reached.

    :param ~collections.abc.Sequence pattern:
        An iterable of tuples in the form of ``("<daemon-id>", "<event-tag-pattern>")``,
        ie, which daemon ID we're targeting and the event tag pattern which will be
        passed to :py:func:`~fnmatch.fnmatch` to assert a match.
    :keyword int,float timeout:
        The amount of time to wait for the events, in seconds.
    :keyword ~datetime.datetime,float after_time:
        After which time to start matching events.
    :return:
        An instance of :py:class:`~saltfactories.plugins.event_listener.MatchedEvents`.
    :rtype ~saltfactories.plugins.event_listener.MatchedEvents:
    """
    if after_time is None:
        after_time = datetime.utcnow()
    elif isinstance(after_time, float):
        after_time = datetime.utcfromtimestamp(after_time)
    after_time_iso = after_time.isoformat()
    log.debug(
        "%s is waiting for event patterns happening after %s: %s",
        self,
        after_time_iso,
        set(patterns),
    )
    found_events = set()
    patterns = set(patterns)
    timeout_at = time.time() + timeout
    while True:
        if not patterns:
            # BUGFIX: this used to ``return True``, breaking the documented
            # contract of always returning a MatchedEvents instance. Break
            # out instead so the return statement below is reached.
            break
        for event in copy.copy(self.store):
            if event.expired:
                # Too old, carry on
                continue
            if event.stamp < after_time:
                continue
            # Iterate a copy since we remove matched patterns while iterating
            for pattern in set(patterns):
                _daemon_id, _pattern = pattern
                if event.daemon_id != _daemon_id:
                    continue
                if fnmatch.fnmatch(event.tag, _pattern):
                    log.debug("%s Found matching pattern: %s", self, pattern)
                    found_events.add(event)
                    patterns.remove((event.daemon_id, _pattern))
            if not patterns:
                break
        if not patterns:
            # All patterns matched; don't waste another 0.5s sleeping
            break
        if time.time() > timeout_at:
            break
        time.sleep(0.5)
    return MatchedEvents(matches=found_events, missed=patterns)
def run(self, *args, _timeout=None, **kwargs):
    """
    Run the given command synchronously.

    :keyword int,float _timeout:
        Seconds to wait before forcefully terminating the command and raising
        ``FactoryTimeout``. Defaults to ``self.default_timeout``.
    :return: A ``ShellResult`` with the exit code, output and parsed JSON (if any).
    :raises FactoryTimeout: when the command does not finish within the timeout.
    """
    start_time = time.time()
    # Build the cmdline to pass to the terminal
    # We set the _terminal_timeout attribute while calling build_cmdline in case it needs
    # access to that information to build the command line
    self._terminal_timeout = _timeout or self.default_timeout
    self._terminal_timeout_set_explicitly = _timeout is not None
    timeout_expire = time.time() + self._terminal_timeout
    # BUGFIX: the return value of self._run() used to be bound to an unused
    # local ("running"); the dead binding has been removed.
    self._run(*args, **kwargs)
    # BUGFIX: renamed the misspelled local "timmed_out" to "timed_out"
    timed_out = False
    while True:
        if timeout_expire < time.time():
            timed_out = True
            break
        if self._terminal.poll() is not None:
            # The process has exited on its own
            break
        time.sleep(0.25)
    result = self.terminate()
    if timed_out:
        raise FactoryTimeout(
            "{} Failed to run: {}; Error: Timed out after {:.2f} seconds!".format(
                self, result.cmdline, time.time() - start_time
            ),
            stdout=result.stdout,
            stderr=result.stderr,
            cmdline=result.cmdline,
            exitcode=result.exitcode,
        )
    cmdline = result.cmdline
    exitcode = result.exitcode
    stdout, stderr, json_out = self.process_output(
        result.stdout, result.stderr, cmdline=cmdline
    )
    log.info(
        "%s completed %r in CWD: %s after %.2f seconds",
        self,
        cmdline,
        self.cwd,
        time.time() - start_time,
    )
    return ShellResult(exitcode, stdout, stderr, json=json_out, cmdline=cmdline)
def _cleanup(self):
    """
    Periodically purge expired events from the store while the listener runs.

    Sleeps in one-second slices, and every 30 seconds removes all events whose
    ``expired`` flag is set, logging each removal and the resulting store size.
    Exits when ``self.running_event`` is cleared.
    """
    next_cleanup = time.time() + 30
    while self.running_event.is_set():
        if time.time() < next_cleanup:
            time.sleep(1)
            continue
        # Reset cleanup time
        next_cleanup = time.time() + 30
        # Cleanup expired events
        expired_events = [event for event in self.store if event.expired]
        for event in expired_events:
            log.debug("%s Removing from event store: %s", self, event)
            self.store.remove(event)
        log.debug("%s store size after cleanup: %s", self, len(self.store))
def wait_for_events(self, patterns, timeout=30, after_time=None):
    """
    Wait until every ``(master_id, pattern)`` tuple matches a stored event or
    until ``timeout`` seconds have elapsed.

    :param patterns: An iterable of ``(master_id, event_tag_pattern)`` tuples;
        the pattern is matched against the event tag with :py:func:`fnmatch.fnmatch`.
    :keyword int,float timeout: Maximum time to wait, in seconds.
    :keyword ~datetime.datetime,float after_time: Only match events stamped
        after this time (defaults to ``datetime.utcnow()``).
    :return: ``True`` if every pattern matched, ``False`` otherwise.
    """
    if after_time is None:
        after_time = datetime.utcnow()
    elif isinstance(after_time, float):
        after_time = datetime.utcfromtimestamp(after_time)
    after_time_iso = after_time.isoformat()
    log.debug(
        "%s is waiting for event patterns happening after %s: %s",
        self,
        after_time_iso,
        set(patterns),
    )
    # BUGFIX: removed the unused "found_events" local; it was never populated
    # nor returned.
    patterns = set(patterns)
    timeout_at = time.time() + timeout
    while True:
        if not patterns:
            return True
        for event in copy.copy(self.store):
            if event.expired:
                # Too old, carry on
                continue
            if event.stamp < after_time:
                continue
            # Iterate a copy since we remove matched patterns while iterating
            for pattern in set(patterns):
                _master_id, _pattern = pattern
                if event.master_id != _master_id:
                    continue
                if fnmatch.fnmatch(event.tag, _pattern):
                    log.debug("%s Found matching pattern: %s", self, pattern)
                    patterns.remove((event.master_id, _pattern))
            if not patterns:
                break
        if not patterns:
            # All patterns matched; don't waste another 0.5s sleeping
            break
        if time.time() > timeout_at:
            break
        time.sleep(0.5)
    return not patterns
def init_terminal(self, cmdline, **kwargs):
    """
    Instantiate a terminal with the passed cmdline and kwargs and return it.

    Additionally, it sets a reference to it in self._terminal and also collects
    an initial listing of child processes which will be used when terminating
    the terminal
    """
    terminal = Popen(cmdline, **kwargs)
    self._terminal = terminal
    # Clear any result left over from a previous run
    self._terminal_result = None
    # Give the subprocess a moment to come up
    time.sleep(0.125)
    try:
        children = psutil.Process(terminal.pid).children(recursive=True)
    except psutil.NoSuchProcess:
        # The terminal process is gone already
        children = []
    for child in children:
        if child not in self._children:
            self._children.append(child)
    # Make sure the terminal is cleaned up at interpreter exit
    atexit.register(self.terminate)
    return terminal
def run_start_checks(self, started_at, timeout_at):
    """
    Wait until every port returned by ``get_check_ports()`` accepts connections
    or until ``timeout_at`` is reached.

    :param started_at: Time (seconds since epoch) the process was started at.
    :param timeout_at: Time (seconds since epoch) to give up at.
    :return: ``True`` when all ports were connectable, ``False`` on timeout.
    :raises FactoryNotStarted: if the process stops running while checking.
    """
    check_ports = set(self.get_check_ports())
    if not check_ports:
        log.debug("No ports to check connection to for %s", self)
        return True
    checks_start_time = time.time()
    while time.time() <= timeout_at:
        if not self.is_running():
            raise FactoryNotStarted("{} is no longer running".format(self))
        if not check_ports:
            break
        # Drop any ports which have become connectable since the last poll
        check_ports -= ports.get_connectable_ports(check_ports)
        if check_ports:
            time.sleep(0.5)
    else:
        # The while loop finished without a break, ie, we timed out
        log.error(
            "Failed to check ports after %1.2f seconds for %s",
            time.time() - checks_start_time,
            self,
        )
        return False
    # BUGFIX: corrected "Successfuly" typo in the log message below
    log.debug(
        "Successfully connected to all ports(%s) for %s",
        set(self.get_check_ports()),
        self,
    )
    return True
def start(self, *command, max_start_attempts=None, start_timeout=None):
    """
    Start the container, retrying up to ``max_start_attempts`` times.

    :param command: Optional command to run in the container (passed to
        ``docker_client.containers.run``).
    :keyword int max_start_attempts: Number of start attempts; defaults to
        ``self.max_start_attempts``.
    :keyword int,float start_timeout: Seconds to wait, per attempt, for the
        container to report running status; defaults to ``self.start_timeout``.
    :return: ``True`` when the container started and its start checks passed.
    :raises RuntimeError: if the docker client is not connectable.
    :raises FactoryNotStarted: if all attempts fail.
    """
    if self.is_running():
        log.warning("%s is already running.", self)
        return True
    # Bail out early if we cannot even talk to the docker daemon
    connectable = ContainerFactory.client_connectable(self.docker_client)
    if connectable is not True:
        self.terminate()
        raise RuntimeError(connectable)
    self._terminate_result = None
    # Ensure the container is cleaned up at interpreter exit
    atexit.register(self.terminate)
    factory_started = False
    # Run (best-effort) any registered before-start callbacks
    for callback, args, kwargs in self.before_start_callbacks:
        try:
            callback(*args, **kwargs)
        except Exception as exc:  # pylint: disable=broad-except
            log.info(
                "Exception raised when running %s: %s",
                self._format_callback(callback, args, kwargs),
                exc,
                exc_info=True,
            )
    start_time = time.time()
    start_attempts = max_start_attempts or self.max_start_attempts
    current_attempt = 0
    while current_attempt <= start_attempts:
        current_attempt += 1
        if factory_started:
            break
        log.info("Starting %s. Attempt: %d of %d", self, current_attempt, start_attempts)
        current_start_time = time.time()
        start_running_timeout = current_start_time + (start_timeout or self.start_timeout)
        # Start the container
        self.container = self.docker_client.containers.run(
            self.image,
            name=self.name,
            detach=True,
            stdin_open=True,
            command=list(command) or None,
            **self.container_run_kwargs
        )
        # Poll until the container reports "running" or the attempt times out
        while time.time() <= start_running_timeout:
            # Don't know why, but if self.container wasn't previously in a running
            # state, and now it is, we have to re-set the self.container attribute
            # so that it gives valid status information
            self.container = self.docker_client.containers.get(self.name)
            if self.container.status != "running":
                time.sleep(0.25)
                continue
            self.container = self.docker_client.containers.get(self.name)
            logs = self.container.logs(stdout=True, stderr=True, stream=False)
            # logs() may return combined bytes or a (stdout, stderr) pair —
            # presumably depending on the docker client version; TODO confirm
            if isinstance(logs, bytes):
                stdout = logs.decode()
                stderr = None
            else:
                stdout = logs[0].decode()
                stderr = logs[1].decode()
            if stdout and stderr:
                log.info("Running Container Logs:\n%s\n%s", stdout, stderr)
            elif stdout:
                log.info("Running Container Logs:\n%s", stdout)
            # If we reached this far it means that we got the running status above, and
            # now that the container has started, run start checks
            try:
                if (
                    self.run_container_start_checks(
                        current_start_time, start_running_timeout
                    )
                    is False
                ):
                    # Start checks failed; give the container a beat and re-poll
                    time.sleep(0.5)
                    continue
            except FactoryNotStarted:
                self.terminate()
                break
            log.info(
                "The %s factory is running after %d attempts. Took %1.2f seconds",
                self,
                current_attempt,
                time.time() - start_time,
            )
            factory_started = True
            break
        else:
            # We reached start_running_timeout, re-try
            try:
                self.container.remove(force=True)
                self.container.wait()
            except docker.errors.NotFound:
                pass
            self.container = None
    else:
        # The factory failed to confirm it's running status
        self.terminate()
    if factory_started:
        # Run (best-effort) any registered after-start callbacks
        for callback, args, kwargs in self.after_start_callbacks:
            try:
                callback(*args, **kwargs)
            except Exception as exc:  # pylint: disable=broad-except
                log.info(
                    "Exception raised when running %s: %s",
                    self._format_callback(callback, args, kwargs),
                    exc,
                    exc_info=True,
                )
        # TODO: Add containers to the processes stats?!
        # if self.factories_manager and self.factories_manager.stats_processes is not None:
        #     self.factories_manager.stats_processes[self.get_display_name()] = psutil.Process(
        #         self.pid
        #     )
        return factory_started
    # Every attempt failed — surface the last terminate() result in the error
    result = self.terminate()
    raise FactoryNotStarted(
        "The {} factory has failed to confirm running status after {} attempts, which "
        "took {:.2f} seconds({:.2f} seconds each)".format(
            self,
            current_attempt - 1,
            time.time() - start_time,
            start_timeout or self.start_timeout,
        ),
        stdout=result.stdout,
        stderr=result.stderr,
        exitcode=result.exitcode,
    )
def process_logs(self):
    """
    Bind a ZMQ PULL socket and forward every received msgpack-encoded log
    record to the local :mod:`logging` machinery until shut down.

    Shutdown happens when ``self.running_event`` is cleared (with up to a
    5 second grace period to drain pending messages) or when a ``None``
    sentinel record is received.
    """
    address = "tcp://{}:{}".format(self.log_host, self.log_port)
    context = zmq.Context()
    puller = context.socket(zmq.PULL)
    exit_timeout_seconds = 5
    exit_timeout = None
    try:
        puller.bind(address)
    except zmq.ZMQError:
        # BUGFIX: dropped the unused "as exc" binding; log.exception() already
        # records the active exception.
        log.exception("Unable to bind to puller at %s", address)
        return
    try:
        self.running_event.set()
        while True:
            if not self.running_event.is_set():
                if exit_timeout is None:
                    # BUGFIX: corrected "remaning" typo in the log message below
                    log.debug(
                        "Waiting %d seconds to process any remaining log messages "
                        "before exiting...",
                        exit_timeout_seconds,
                    )
                    exit_timeout = time.time() + exit_timeout_seconds
                if time.time() >= exit_timeout:
                    log.debug(
                        "Unable to process remaining log messages in time. "
                        "Exiting anyway.")
                    break
            try:
                try:
                    msg = puller.recv(flags=zmq.NOBLOCK)
                except zmq.ZMQError as exc:
                    if exc.errno != zmq.EAGAIN:
                        raise
                    # Nothing queued yet; poll again shortly
                    time.sleep(0.25)
                    continue
                # msgpack changed the default string handling in 0.5.2
                if msgpack.version >= (0, 5, 2):
                    record_dict = msgpack.loads(msg, raw=False)
                else:
                    record_dict = msgpack.loads(msg, encoding="utf-8")
                if record_dict is None:
                    # A sentinel to stop processing the queue
                    log.info(
                        "Received the sentinel to shutdown the log server")
                    break
                try:
                    record_dict["message"]
                except KeyError:
                    # This log record was msgpack dumped from Py2
                    for key, value in record_dict.copy().items():
                        skip_update = True
                        if isinstance(value, bytes):
                            value = value.decode("utf-8")
                            skip_update = False
                        if isinstance(key, bytes):
                            key = key.decode("utf-8")
                            skip_update = False
                        if skip_update is False:
                            record_dict[key] = value
                # Just log everything, filtering will happen on the main process
                # logging handlers
                record = logging.makeLogRecord(record_dict)
                logger = logging.getLogger(record.name)
                logger.handle(record)
            except (EOFError, KeyboardInterrupt, SystemExit):
                # BUGFIX: dropped the unused "as exc" binding here as well
                break
            except Exception as exc:  # pylint: disable=broad-except
                log.warning(
                    "An exception occurred in the log server processing queue thread: %s",
                    exc,
                    exc_info=True,
                )
    finally:
        puller.close(1)
        context.term()
def start(self, *extra_cli_arguments, max_start_attempts=None, start_timeout=None):
    """
    Start the daemon

    :param extra_cli_arguments: Extra CLI arguments appended to the daemon's
        command line.
    :keyword int max_start_attempts: Number of start attempts; defaults to
        ``self.max_start_attempts``.
    :keyword int,float start_timeout: Seconds to wait, per attempt, for the
        daemon's start checks to pass; defaults to ``self.start_timeout``.
    :return: ``True`` when the daemon started and its start checks passed.
    :raises FactoryNotStarted: if all attempts fail.
    """
    if self.is_running():
        log.warning("%s is already running.", self)
        return True
    process_running = False
    start_time = time.time()
    start_attempts = max_start_attempts or self.max_start_attempts
    current_attempt = 0
    run_arguments = list(extra_cli_arguments)
    while True:
        if process_running:
            break
        current_attempt += 1
        if current_attempt > start_attempts:
            break
        log.info("Starting %s. Attempt: %d of %d", self, current_attempt, start_attempts)
        # Run (best-effort) any registered before-start callbacks
        for callback, args, kwargs in self.before_start_callbacks:
            try:
                callback(*args, **kwargs)
            except Exception as exc:  # pylint: disable=broad-except
                log.info(
                    "Exception raised when running %s: %s",
                    self._format_callback(callback, args, kwargs),
                    exc,
                    exc_info=True,
                )
        current_start_time = time.time()
        start_running_timeout = current_start_time + (start_timeout or self.start_timeout)
        # On retries, optionally append extra CLI flags meant for post-failure runs
        if current_attempt > 1 and self.extra_cli_arguments_after_first_start_failure:
            run_arguments = list(extra_cli_arguments) + list(
                self.extra_cli_arguments_after_first_start_failure)
        self._run(*run_arguments)
        if not self.is_running():
            # A little breathe time to allow the process to start if not started already
            time.sleep(0.5)
        # Poll until the start checks pass or the attempt times out
        while time.time() <= start_running_timeout:
            if not self.is_running():
                log.warning("%s is no longer running", self)
                self.terminate()
                break
            try:
                if self.run_start_checks(current_start_time, start_running_timeout) is False:
                    # Start checks failed; wait a second and re-poll
                    time.sleep(1)
                    continue
            except FactoryNotStarted:
                self.terminate()
                break
            log.info(
                "The %s factory is running after %d attempts. Took %1.2f seconds",
                self,
                current_attempt,
                time.time() - start_time,
            )
            process_running = True
            break
        else:
            # The factory failed to confirm it's running status
            self.terminate()
    if process_running:
        # Run (best-effort) any registered after-start callbacks
        for callback, args, kwargs in self.after_start_callbacks:
            try:
                callback(*args, **kwargs)
            except Exception as exc:  # pylint: disable=broad-except
                log.info(
                    "Exception raised when running %s: %s",
                    self._format_callback(callback, args, kwargs),
                    exc,
                    exc_info=True,
                )
        return process_running
    # Every attempt failed — surface the last terminate() result in the error
    result = self.terminate()
    raise FactoryNotStarted(
        "The {} factory has failed to confirm running status after {} attempts, which "
        "took {:.2f} seconds".format(
            self,
            current_attempt - 1,
            time.time() - start_time,
        ),
        stdout=result.stdout,
        stderr=result.stderr,
        exitcode=result.exitcode,
    )