Example No. 1
def handler(sig, frame) -> None:
    """Handles Ctrl+c by letting the Collector() know to shut down"""
    current_pid = os.getpid()
    if current_pid == parent_pid:
        reason = f"Received shutdown signal {sig}"
        log.debug(f"Parent caught signal {sig} - dispatching shutdown event")
        # Dispatch shutdown event in parent process which also causes SIGUSR1 to be sent to
        # the process group and in turn causes the shutdown event in all child processes.
        dispatch_event(
            Event(EventType.SHUTDOWN, {
                "reason": reason,
                "emergency": False
            }))
    else:
        reason = f"Received shutdown signal {sig} from parent process"
        log.debug(
            f"Child with PID {current_pid} shutting down - you might see exceptions from interrupted worker threads"
        )
        # Child's threads have 3s to shut down before the following thread will shut them down hard.
        kt = threading.Thread(target=delayed_exit, name="shutdown")
        kt.start()
        # Dispatch shutdown event in child process
        dispatch_event(
            Event(EventType.SHUTDOWN, {
                "reason": reason,
                "emergency": False
            }),
            blocking=False,
        )
        sys.exit(0)
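The delayed_exit() target used above is not shown in this example. Going by the comment (a 3-second grace period, then a hard shutdown), a minimal sketch might look like the following; the exact timeout and the use of os._exit() are assumptions:

import os
import time

def delayed_exit() -> None:
    # Grace period for the child's worker threads (3 seconds per the comment
    # above; an assumption - the real value may differ).
    time.sleep(3)
    # os._exit() terminates the process immediately without running cleanup
    # handlers - the "hard" shutdown the comment refers to.
    os._exit(1)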
Example No. 2
    def collect(self) -> None:
        """collect() is run every loop, collecting Plugin resources.

        Every time collect() is run it creates a new working Graph. It then creates an instance of each Plugin
        and starts its thread, which in turn runs the Plugin's collect() method. Once all Plugins have finished
        collecting cloud resources, it retrieves each Plugin's Graph and appends it to its own working Graph.

        At the end the live Graph is swapped with the working Graph.
        """
        # Create a new graph container to hold the Graph() which we'll swap out at the end
        gc = GraphContainer(cache_graph=False)
        # Let interested parties know that we're about to start our collect run
        dispatch_event(Event(EventType.COLLECT_BEGIN, gc.graph))
        # Create instances of each Plugin()
        plugins = [Plugin() for Plugin in self.plugins]
        start_time = time.time()

        # First we run each Collector Plugin.
        # Each Plugin is a threading.Thread, so calling start() runs its collect()
        # method, which in turn generates a Graph().
        for plugin in plugins:
            plugin.start()

        # Now we wait for each Plugin to complete its work or time out
        # Because we always swap out the completed graph at the end of our collect run
        # it doesn't matter in which order we wait for (join) Plugins. I.e. there's no speed
        # advantage in checking for already completed Plugins and collecting slow ones last.
        for plugin in plugins:
            timeout = start_time + ArgumentParser.args.timeout - time.time()
            if timeout < 1:
                timeout = 1
            log.info(
                f'Waiting for collector thread of plugin {plugin.cloud} to finish'
            )
            plugin.join(timeout)
            if not plugin.is_alive():  # The plugin has finished its work
                if not is_directed_acyclic_graph(plugin.graph):
                    log.error(
                        f'Graph of plugin {plugin.cloud} is not acyclic - ignoring plugin results'
                    )
                    continue
                log.info(
                    f'Merging graph of plugin {plugin.cloud} with global graph'
                )
                gc.add(plugin.graph)
                # Connect the root of our graph with the plugin's
                gc.graph.add_edge(gc.GRAPH_ROOT, plugin.root)
            else:
                log.error(
                    f'Plugin {plugin.cloud} timed out - discarding Plugin graph'
                )
        sanitize(gc.graph, gc.GRAPH_ROOT)
        dispatch_event(Event(EventType.GENERATE_METRICS, gc.graph),
                       blocking=True)
        dispatch_event(Event(EventType.COLLECT_FINISH, gc.graph),
                       blocking=True)
        self.gc.graph = gc.graph  # Swap the live graph with the newly created one from our current run
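collect() only relies on a small plugin surface: each Plugin is a threading.Thread exposing cloud, root and graph attributes, and its thread builds an acyclic graph. A hypothetical minimal plugin satisfying that contract - all names and the use of a plain networkx.DiGraph are illustrative assumptions:

import threading
import networkx

class ExamplePlugin(threading.Thread):
    """Hypothetical collector plugin - just enough to satisfy collect() above."""

    cloud = "example"  # used in the log messages above

    def __init__(self) -> None:
        super().__init__()
        self.root = "example-root"       # collect() connects this to GRAPH_ROOT
        self.graph = networkx.DiGraph()  # must stay acyclic, see the DAG check

    def run(self) -> None:
        # A real plugin would enumerate cloud resources here.
        self.graph.add_node(self.root)
        self.graph.add_edge(self.root, "some-resource")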
Example No. 3
    def cleanup(self) -> None:
        if not ArgumentParser.args.cleanup:
            log.error('Cleanup called but --cleanup flag not provided at startup - ignoring call')
            return

        log.info('Notifying plugins to plan cleanup')
        dispatch_event(Event(EventType.CLEANUP_PLAN, self.graph), blocking=True)
        log.info('Running cleanup')
        dispatch_event(Event(EventType.CLEANUP_BEGIN, self.graph), blocking=True)
        with self.graph.lock.read_access:
            cleanup_nodes = [node for node in self.graph.nodes() if node.clean]
            cleanup_plan = defaultlist(lambda: [])

            for node in cleanup_nodes:
                log.debug(f'Adding {node.resource_type} {node.dname} to cleanup plan with priority {node.max_graph_depth}')
                cleanup_plan[node.max_graph_depth].append(node)

            with ThreadPoolExecutor(max_workers=ArgumentParser.args.cleanup_pool_size, thread_name_prefix='pre_cleaner') as executor:
                executor.map(self.pre_clean, cleanup_nodes)

            for nodes in reversed(cleanup_plan):
                with ThreadPoolExecutor(max_workers=ArgumentParser.args.cleanup_pool_size, thread_name_prefix='cleaner') as executor:
                    executor.map(self.clean, nodes)

        dispatch_event(Event(EventType.CLEANUP_FINISH, self.graph))
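cleanup() only deletes nodes whose clean flag is set; presumably the CLEANUP_PLAN listeners are what set it. A hypothetical listener sketch follows, assuming the payload passed to Event() is exposed as event.data (the single-Event-argument listener signature matches update_cache() in Example No. 8):

def cleanup_plan_listener(event: Event) -> None:
    # Hypothetical policy: flag resources older than 30 days for cleanup.
    graph = event.data  # assumption: Event() exposes its payload as .data
    for node in graph.nodes():
        if getattr(node, "age_in_days", 0) > 30:
            node.clean = True  # the flag cleanup() above filters on

add_event_listener(EventType.CLEANUP_PLAN, cleanup_plan_listener, blocking=True)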
Example No. 4
def signal_handler(sig, frame) -> None:
    """Handles Ctrl+c by letting the Collector() know to shut down"""
    signal(SIGINT, original_sigint_handler)
    signal(SIGTERM, original_sigterm_handler)

    current_pid = os.getpid()
    if current_pid == parent_pid:
        if sig != SIGUSR1:
            reason = f'Received shutdown signal {sig}'
            log.debug(
                f'Parent caught signal {sig} - dispatching shutdown event')
            # Dispatch shutdown event in parent process which also causes SIGUSR1 to be sent to
            # the process group and in turn causes the shutdown event in all child processes.
            dispatch_event(
                Event(EventType.SHUTDOWN, {
                    'reason': reason,
                    'emergency': False
                }))
        else:
            log.debug('Parent received SIGUSR1 and ignoring it')
    else:
        log.debug(
            f"Shutting down child process {current_pid} - you might see exceptions from interrupted worker threads"
        )
        reason = f'Received shutdown signal {sig} from parent process'
        # Child's threads have 3s to shut down before the following thread will shut them down hard.
        kt = threading.Thread(target=delayed_exit, name='shutdown')
        kt.start()
        # Dispatch shutdown event in child process
        dispatch_event(
            Event(EventType.SHUTDOWN, {
                'reason': reason,
                'emergency': False
            }),
            blocking=False,
        )
        sys.exit(0)
Example No. 5
    def cmd_collect(self, items: Iterable, args: str) -> Iterable:
        '''Usage: collect

        Perform a collect run.
        '''
        dispatch_event(Event(EventType.START_COLLECT))
        return ()
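cmd_collect() only dispatches START_COLLECT; the consumer is not shown in these examples. Presumably the Processor listens for it and wakes the run loop of Example No. 7 early. A self-contained sketch of that pattern, using a plain threading.Event as a stand-in for the Processor's internal run event:

import threading

run_event = threading.Event()  # stand-in for the Processor's internal run event

def start_collect_listener(event: Event) -> None:
    # Wake the processor loop (see Example No. 7) so a collect run starts
    # right away instead of after the remaining --interval wait.
    run_event.set()

add_event_listener(EventType.START_COLLECT, start_collect_listener)
dispatch_event(Event(EventType.START_COLLECT))  # what cmd_collect() does above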
Example No. 6
    def read(self) -> bool:
        try:
            if not self.config_file:
                raise ValueError(
                    "Attribute config_file is not set on ProtectSnowflakesConfig() instance"
                )

            with open(self.config_file) as config_file:
                config = yaml.load(config_file, Loader=yaml.FullLoader)
            if self.validate(config):
                self.update(config)
        except Exception:
            log.exception(f"Error while reading {self.config_file}")
            reason = (
                "Snowflake Protection failed to validate config "
                "- Resource Protection can't be guaranteed - configuration fix required!"
            )
            dispatch_event(
                Event(EventType.SHUTDOWN, {
                    "reason": reason,
                    "emergency": True
                }),
                blocking=True,
            )
        else:
            return True
        return False
Example No. 7
    def run(self) -> None:
        num_run = 0
        while self.__run:
            self.__run_event.clear()

            num_run += 1
            time_run_start = time.time()

            log.info(f"Starting processor run {num_run}")
            log_stats()
            dispatch_event(Event(EventType.PROCESS_BEGIN, self.gc.graph))
            self.collect()
            log_stats(garbage_collector_stats=True)
            if self.__run:
                self.cleanup()
            dispatch_event(Event(EventType.PROCESS_FINISH, self.gc.graph),
                           blocking=True)

            elapsed = int(time.time() - time_run_start)
            log.info(
                (f"Done run {num_run} with {len(self.gc.graph.nodes)} nodes in"
                 f" {elapsed} seconds"))
            log_stats(garbage_collector_stats=True)

            if self.__interval > elapsed:
                wait_time = self.__interval - elapsed
                log.debug(f"Waiting {wait_time} seconds before next run")
                self.__run_event.wait(wait_time)

            # If we're only supposed to run once, we still wait out the interval above.
            # Specify --interval 0 if no wait is desired.
            if ArgumentParser.args.one_shot:
                reason = "One shot run specified"
                log.debug(f"Requesting shutdown: {reason}")
                dispatch_event(
                    Event(EventType.SHUTDOWN, {
                        "reason": reason,
                        "emergency": False
                    }),
                    blocking=True,
                )
        log.debug("Processor thread shut down")
Example No. 8
    def __init__(self, cache_graph=True) -> None:
        self._graph = None
        self._observers = []
        self.__lock = threading.Lock()
        self.graph = Graph()
        resource_attr = get_resource_attributes(self.GRAPH_ROOT)
        self.graph.add_node(self.GRAPH_ROOT, label=self.GRAPH_ROOT.id, **resource_attr)
        if cache_graph:
            self.cache = GraphCache()
            self.cache.update_cache(Event(EventType.STARTUP, self.graph))
            add_event_listener(EventType.COLLECT_FINISH, self.cache.update_cache)
            add_event_listener(EventType.CLEANUP_FINISH, self.cache.update_cache)
        else:
            self.cache = None
Example No. 9
def test_processor():
    arg_parser = get_arg_parser()
    Processor.add_args(arg_parser)
    GraphContainer.add_args(arg_parser)
    event_add_args(arg_parser)
    arg_parser.parse_args()

    graph_container = GraphContainer(cache_graph=False)
    plugins = [SomeTestPlugin]

    processor = Processor(graph_container, plugins)
    processor.daemon = True
    processor.start()
    time.sleep(1)
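    # The +2 below presumably accounts for the graph root plus the plugin's own
    # root node (see how collect() wires plugin.root to GRAPH_ROOT in Example No. 2).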
    assert len(processor.gc.graph.nodes) == num_resources + 2
    processor.shutdown(Event(EventType.SHUTDOWN))
Example No. 10
def quit(reason=None):
    dispatch_event(
        Event(EventType.SHUTDOWN, {
            'reason': reason,
            'emergency': False
        }))
Example No. 11
def quit(reason=None):
    dispatch_event(
        Event(EventType.SHUTDOWN, {"reason": reason, "emergency": False})
    )
Example No. 12
    def callback(self):
        jwt_data = get_jwt_data(cherrypy.request)
        if jwt_data.get("event") in ("COLLECT_FINISH", "PROCESS_FINISH"):
            dispatch_event(Event(EventType.START_COLLECT))
            return {"status": "ok"}
        return {"status": "unknown event"}
Example No. 13
    def on_post(self, req, resp) -> None:
        dispatch_event(Event(EventType.START_COLLECT))
        resp.content_type = "application/json"
        j = json.dumps({"status": "ok"})
        resp.body = f"{j}\r\n"
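The on_post() signature and the resp attributes match a Falcon responder. A sketch of how such a resource might be mounted, assuming Falcon 2.x (where resp.body is still the canonical attribute; Falcon 3.x renames it to resp.text and falcon.API to falcon.App):

import json
import falcon

class CollectTrigger:
    def on_post(self, req, resp) -> None:
        dispatch_event(Event(EventType.START_COLLECT))
        resp.content_type = "application/json"
        resp.body = json.dumps({"status": "ok"}) + "\r\n"

api = falcon.API()
api.add_route("/collect", CollectTrigger())
# Any WSGI server can now serve api; POST /collect kicks off a collect run.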
Example No. 14
def main() -> None:
    log.info("Cloudkeeper initializing")
    # Try to run in a new process group and
    # ignore if not possible for whatever reason
    try:
        os.setpgid(0, 0)
    except Exception:
        pass

    cloudkeeper.signal.parent_pid = os.getpid()

    # Add cli args
    arg_parser = get_arg_parser()

    logging.add_args(arg_parser)
    Cli.add_args(arg_parser)
    WebServer.add_args(arg_parser)
    Scheduler.add_args(arg_parser)
    Processor.add_args(arg_parser)
    Cleaner.add_args(arg_parser)
    PluginLoader.add_args(arg_parser)
    GraphContainer.add_args(arg_parser)
    event_add_args(arg_parser)

    # Find cloudkeeper Plugins in the cloudkeeper.plugins module
    plugin_loader = PluginLoader()
    plugin_loader.add_plugin_args(arg_parser)

    # At this point the CLI, all Plugins as well as the WebServer have
    # added their args to the arg parser
    arg_parser.parse_args()

    # Handle Ctrl+c and other means of termination/shutdown
    cloudkeeper.signal.initializer()
    add_event_listener(EventType.SHUTDOWN, shutdown, blocking=False)

    # Try to increase nofile and nproc limits
    increase_limits()

    # We're using a GraphContainer() to contain the graph which gets replaced
    # at runtime. This way we're not losing the context in other places like
    # the webserver when the graph gets reassigned.
    graph_container = GraphContainer()

    # GraphCollector() is a custom Prometheus Collector that
    # takes a graph and yields its metrics
    graph_collector = GraphCollector(graph_container)
    REGISTRY.register(graph_collector)

    # Scheduler() starts an APScheduler instance
    scheduler = Scheduler(graph_container)
    scheduler.daemon = True
    scheduler.start()

    # Cli() is the CLI Thread
    cli = Cli(graph_container, scheduler)
    cli.daemon = True
    cli.start()

    # WebServer is handed the graph container context so it can e.g. produce graphml
    # from it. The webserver serves Prometheus Metrics as well as different graph
    # endpoints.
    web_server = WebServer(graph_container)
    web_server.daemon = True
    web_server.start()

    for Plugin in plugin_loader.plugins(PluginType.PERSISTENT):
        try:
            log.debug(f"Starting persistent Plugin {Plugin}")
            plugin = Plugin()
            plugin.daemon = True
            plugin.start()
        except Exception as e:
            log.exception(f"Caught unhandled persistent Plugin exception {e}")

    processor = Processor(graph_container,
                          plugin_loader.plugins(PluginType.COLLECTOR))
    processor.daemon = True
    processor.start()

    # Dispatch the STARTUP event
    dispatch_event(Event(EventType.STARTUP))

    # We wait for the shutdown Event to be set() and then end the program
    # While doing so we print the list of active threads once per 15 minutes
    while not shutdown_event.is_set():
        log_stats()
        shutdown_event.wait(900)
    time.sleep(5)
    cloudkeeper.signal.kill_children(cloudkeeper.signal.SIGTERM,
                                     ensure_death=True)
    log.info("Shutdown complete")
    quit()
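Neither the shutdown listener registered near the top of main() nor the shutdown_event the wait loop blocks on is shown. Given how the loop ends, the listener most likely just sets the event; a minimal sketch (the exact body is an assumption):

import threading

shutdown_event = threading.Event()  # the event main() waits on above

def shutdown(event: Event) -> None:
    # Unblock the wait loop at the bottom of main(); the remaining teardown
    # (killing child processes, the final log line) then runs there.
    shutdown_event.set()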
Example No. 15
    def on_post(self, req, resp) -> None:
        dispatch_event(Event(EventType.START_COLLECT))
        resp.content_type = 'application/json'
        j = json.dumps({'status': 'ok'})
        resp.body = f'{j}\r\n'
Example No. 16
def main() -> None:
    # Add cli args
    arg_parser = get_arg_parser()

    Cli.add_args(arg_parser)
    WebServer.add_args(arg_parser)
    Scheduler.add_args(arg_parser)
    Processor.add_args(arg_parser)
    Cleaner.add_args(arg_parser)
    PluginLoader.add_args(arg_parser)
    GraphContainer.add_args(arg_parser)
    event_add_args(arg_parser)

    # Find cloudkeeper Plugins in the cloudkeeper.plugins module
    plugin_loader = PluginLoader()
    plugin_loader.add_plugin_args(arg_parser)

    # At this point the CLI, all Plugins as well as the WebServer have added their args to the arg parser
    arg_parser.parse_args()

    # Write log to a file in addition to stdout
    if ArgumentParser.args.logfile:
        log_formatter = logging.Formatter(log_format)
        fh = logging.FileHandler(ArgumentParser.args.logfile)
        fh.setFormatter(log_formatter)
        logging.getLogger().addHandler(fh)

    # Handle Ctrl+c and other means of termination/shutdown
    signal_on_parent_exit()
    add_event_listener(EventType.SHUTDOWN, shutdown, blocking=False)
    signal(SIGINT, signal_handler)
    signal(SIGTERM, signal_handler)
    signal(SIGUSR1, signal_handler)

    # We're using a GraphContainer() to contain the graph which gets replaced at runtime.
    # This way we're not losing the context in other places like the webserver when the
    # graph gets reassigned.
    graph_container = GraphContainer()

    # GraphCollector() is a custom Prometheus Collector that
    # takes a graph and yields its metrics
    graph_collector = GraphCollector(graph_container)
    REGISTRY.register(graph_collector)

    # Scheduler() starts an APScheduler instance
    scheduler = Scheduler(graph_container)
    scheduler.daemon = True
    scheduler.start()

    # Cli() is the CLI Thread
    cli = Cli(graph_container, scheduler)
    cli.daemon = True
    cli.start()

    # WebServer is handed the graph container context so it can e.g. produce graphml from it
    # The webserver serves Prometheus Metrics as well as different graph endpoints
    web_server = WebServer(graph_container)
    web_server.daemon = True
    web_server.start()

    for Plugin in plugin_loader.plugins(PluginType.PERSISTENT):
        try:
            log.debug(f'Starting persistent Plugin {Plugin}')
            plugin = Plugin()
            plugin.daemon = True
            plugin.start()
        except Exception as e:
            log.exception(f'Caught unhandled persistent Plugin exception {e}')

    collector = Processor(graph_container,
                          plugin_loader.plugins(PluginType.COLLECTOR))
    collector.daemon = True
    collector.start()

    # Dispatch the STARTUP event
    dispatch_event(Event(EventType.STARTUP))

    # We wait for the shutdown Event to be set() and then end the program
    # While doing so we print the list of active threads once per 15 minutes
    while not shutdown_event.is_set():
        log_stats()
        shutdown_event.wait(900)
    time.sleep(5)
    log.info('Shutdown complete')
    quit()