Пример #1
0
def force_shutdown(delay: int = 10) -> None:
    """Force the process to exit after a grace period.

    Sleeps for ``delay`` seconds, logs the current stats and an error
    message, then ends the process with ``os._exit(0)``.

    Args:
        delay: Number of seconds to wait before forcing the exit.
    """
    time.sleep(delay)
    try:
        log_stats()
        log.error(
            "Some child process or thread timed out during shutdown - forcing shutdown completion"
        )
    finally:
        # The exit has to happen even if gathering stats or logging raises;
        # otherwise a failure there would prevent the forced exit.
        # os._exit() ends the process immediately without running the normal
        # interpreter cleanup.
        os._exit(0)
Пример #2
0
def force_shutdown(delay: int = 10) -> None:
    """Kill the whole process group after a grace period.

    Sleeps for ``delay`` seconds, logs the current stats and an error
    message, then sends SIGKILL to the process group of the calling process.
    ``os._exit(0)`` is kept as a last resort in case that did not end this
    process.

    Args:
        delay: Number of seconds to wait before killing the process group.
    """
    time.sleep(delay)
    try:
        log_stats()
        log.error(
            'Some child process or thread timed out during shutdown - killing process group'
        )
        # os.getpgid(0) resolves the process group of the calling process
        os.killpg(os.getpgid(0), SIGKILL)
    finally:
        # Reached when logging or signalling the process group raised, or if
        # this process is still running after the SIGKILL was sent. The exit
        # must not be skipped because of an earlier failure.
        os._exit(0)
Пример #3
0
    def run(self) -> None:
        """Execute processor runs in a loop until the run flag is cleared.

        Every iteration dispatches PROCESS_BEGIN, collects, performs cleanup
        (only when the run flag is still set) and dispatches a blocking
        PROCESS_FINISH event. If the run took less time than the configured
        interval, the remainder is waited on the run event. When the one shot
        argument is set, a blocking SHUTDOWN event is dispatched after the
        first run (and after the interval wait).
        """
        run_no = 0
        while self.__run:
            self.__run_event.clear()

            run_no += 1
            started_at = time.time()

            log.info(f"Starting processor run {run_no}")
            log_stats()
            begin_event = Event(EventType.PROCESS_BEGIN, self.gc.graph)
            dispatch_event(begin_event)
            self.collect()
            log_stats(garbage_collector_stats=True)
            # Cleanup is skipped when the run flag is no longer set after
            # collecting
            if self.__run:
                self.cleanup()
            finish_event = Event(EventType.PROCESS_FINISH, self.gc.graph)
            dispatch_event(finish_event, blocking=True)

            elapsed = int(time.time() - started_at)
            node_count = len(self.gc.graph.nodes)
            log.info(
                f"Done run {run_no} with {node_count} nodes in"
                f" {elapsed} seconds"
            )
            log_stats(garbage_collector_stats=True)

            # Only wait if the run finished faster than the interval
            remaining = self.__interval - elapsed
            if remaining > 0:
                log.debug(f"Waiting {remaining} seconds before next run")
                self.__run_event.wait(remaining)

            # A one shot run still waits for the interval above.
            # If no wait is desired --interval 0 can be specified.
            if not ArgumentParser.args.one_shot:
                continue

            reason = "One shot run specified"
            log.debug(f"Requesting shutdown: {reason}")
            shutdown_request = Event(
                EventType.SHUTDOWN,
                {"reason": reason, "emergency": False},
            )
            dispatch_event(shutdown_request, blocking=True)
        log.debug("Processor thread shut down")
Пример #4
0
def main() -> None:
    """Cloudkeeper entry point.

    Collects and parses the command line arguments of all components, sets
    up signal handling, then starts the scheduler, CLI, web server,
    persistent plugins and the processor as daemon threads. Afterwards it
    blocks until the shutdown event is set, logging stats every 15 minutes,
    and finally terminates child processes and exits.
    """
    # Imported locally so the function does not depend on a module-level
    # import that may not exist in this file.
    import sys

    log.info("Cloudkeeper initializing")
    # Try to run in a new process group and
    # ignore if not possible for whatever reason
    try:
        os.setpgid(0, 0)
    except (AttributeError, OSError) as e:
        # OSError: the call was rejected by the OS.
        # AttributeError: os.setpgid is not available on this platform.
        # Deliberately best-effort, so only leave a trace in the debug log.
        log.debug(f"Not running in a new process group: {e}")

    cloudkeeper.signal.parent_pid = os.getpid()

    # Add cli args
    arg_parser = get_arg_parser()

    logging.add_args(arg_parser)
    Cli.add_args(arg_parser)
    WebServer.add_args(arg_parser)
    Scheduler.add_args(arg_parser)
    Processor.add_args(arg_parser)
    Cleaner.add_args(arg_parser)
    PluginLoader.add_args(arg_parser)
    GraphContainer.add_args(arg_parser)
    event_add_args(arg_parser)

    # Find cloudkeeper Plugins in the cloudkeeper.plugins module
    plugin_loader = PluginLoader()
    plugin_loader.add_plugin_args(arg_parser)

    # At this point the CLI, all Plugins as well as the WebServer have
    # added their args to the arg parser
    arg_parser.parse_args()

    # Handle Ctrl+c and other means of termination/shutdown
    cloudkeeper.signal.initializer()
    add_event_listener(EventType.SHUTDOWN, shutdown, blocking=False)

    # Try to increase nofile and nproc limits
    increase_limits()

    # We're using a GraphContainer() to contain the graph which gets replaced
    # at runtime. This way we're not losing the context in other places like
    # the webserver when the graph gets reassigned.
    graph_container = GraphContainer()

    # GraphCollector() is a custom Prometheus Collector that
    # takes a graph and yields its metrics
    graph_collector = GraphCollector(graph_container)
    REGISTRY.register(graph_collector)

    # Scheduler() starts an APScheduler instance
    scheduler = Scheduler(graph_container)
    scheduler.daemon = True
    scheduler.start()

    # Cli() is the CLI Thread
    cli = Cli(graph_container, scheduler)
    cli.daemon = True
    cli.start()

    # WebServer is handed the graph container context so it can e.g. produce graphml
    # from it. The webserver serves Prometheus Metrics as well as different graph
    # endpoints.
    web_server = WebServer(graph_container)
    web_server.daemon = True
    web_server.start()

    # A failing persistent plugin is logged and must not prevent the
    # remaining plugins from starting
    for Plugin in plugin_loader.plugins(PluginType.PERSISTENT):
        try:
            log.debug(f"Starting persistent Plugin {Plugin}")
            plugin = Plugin()
            plugin.daemon = True
            plugin.start()
        except Exception as e:
            log.exception(f"Caught unhandled persistent Plugin exception {e}")

    processor = Processor(graph_container,
                          plugin_loader.plugins(PluginType.COLLECTOR))
    processor.daemon = True
    processor.start()

    # Dispatch the STARTUP event
    dispatch_event(Event(EventType.STARTUP))

    # We wait for the shutdown Event to be set() and then end the program
    # While doing so we print the list of active threads once per 15 minutes
    while not shutdown_event.is_set():
        log_stats()
        shutdown_event.wait(900)
    time.sleep(5)
    cloudkeeper.signal.kill_children(cloudkeeper.signal.SIGTERM,
                                     ensure_death=True)
    log.info("Shutdown complete")
    # sys.exit() instead of quit(): quit() is injected by the site module
    # for interactive use and is not guaranteed to exist in every runtime.
    sys.exit()
Пример #5
0
def main() -> None:
    """Cloudkeeper entry point.

    Collects and parses the command line arguments of all components,
    optionally adds a log file handler, installs the signal handlers, then
    starts the scheduler, CLI, web server, persistent plugins and the
    processor as daemon threads. Afterwards it blocks until the shutdown
    event is set, logging stats every 15 minutes, and finally exits.
    """
    # Imported locally so the function does not depend on a module-level
    # import that may not exist in this file.
    import sys

    # Add cli args
    arg_parser = get_arg_parser()

    Cli.add_args(arg_parser)
    WebServer.add_args(arg_parser)
    Scheduler.add_args(arg_parser)
    Processor.add_args(arg_parser)
    Cleaner.add_args(arg_parser)
    PluginLoader.add_args(arg_parser)
    GraphContainer.add_args(arg_parser)
    event_add_args(arg_parser)

    # Find cloudkeeper Plugins in the cloudkeeper.plugins module
    plugin_loader = PluginLoader()
    plugin_loader.add_plugin_args(arg_parser)

    # At this point the CLI, all Plugins as well as the WebServer have added their args to the arg parser
    arg_parser.parse_args()

    # Write log to a file in addition to stdout
    if ArgumentParser.args.logfile:
        log_formatter = logging.Formatter(log_format)
        fh = logging.FileHandler(ArgumentParser.args.logfile)
        fh.setFormatter(log_formatter)
        logging.getLogger().addHandler(fh)

    # Handle Ctrl+c and other means of termination/shutdown
    signal_on_parent_exit()
    add_event_listener(EventType.SHUTDOWN, shutdown, blocking=False)
    signal(SIGINT, signal_handler)
    signal(SIGTERM, signal_handler)
    signal(SIGUSR1, signal_handler)

    # We're using a GraphContainer() to contain the graph which gets replaced at runtime.
    # This way we're not losing the context in other places like the webserver when the
    # graph gets reassigned.
    graph_container = GraphContainer()

    # GraphCollector() is a custom Prometheus Collector that
    # takes a graph and yields its metrics
    graph_collector = GraphCollector(graph_container)
    REGISTRY.register(graph_collector)

    # Scheduler() starts an APScheduler instance
    scheduler = Scheduler(graph_container)
    scheduler.daemon = True
    scheduler.start()

    # Cli() is the CLI Thread
    cli = Cli(graph_container, scheduler)
    cli.daemon = True
    cli.start()

    # WebServer is handed the graph container context so it can e.g. produce graphml from it
    # The webserver serves Prometheus Metrics as well as different graph endpoints
    web_server = WebServer(graph_container)
    web_server.daemon = True
    web_server.start()

    # A failing persistent plugin is logged and must not prevent the
    # remaining plugins from starting
    for Plugin in plugin_loader.plugins(PluginType.PERSISTENT):
        try:
            log.debug(f'Starting persistent Plugin {Plugin}')
            plugin = Plugin()
            plugin.daemon = True
            plugin.start()
        except Exception as e:
            log.exception(f'Caught unhandled persistent Plugin exception {e}')

    # The Processor thread is handed the collector plugins
    processor = Processor(graph_container,
                          plugin_loader.plugins(PluginType.COLLECTOR))
    processor.daemon = True
    processor.start()

    # Dispatch the STARTUP event
    dispatch_event(Event(EventType.STARTUP))

    # We wait for the shutdown Event to be set() and then end the program
    # While doing so we print the list of active threads once per 15 minutes
    while not shutdown_event.is_set():
        log_stats()
        shutdown_event.wait(900)
    time.sleep(5)
    log.info('Shutdown complete')
    # sys.exit() instead of quit(): quit() is injected by the site module
    # for interactive use and is not guaranteed to exist in every runtime.
    sys.exit()