def send_to_resotocore(self, graph: Graph, task_id: str):
    """Export a collected graph and hand it over to resotocore.

    Returns immediately when no resotocore URI is configured. Otherwise the
    target graph is created, the model is updated from ``graph``, the graph
    is exported through a ``GraphExportIterator`` and the export is sent.

    Args:
        graph: The collected graph to export.
        task_id: Passed through unchanged to ``send_graph``.
    """
    if not ArgumentParser.args.resotocore_uri:
        return

    log.info("resotocore Event Handler called")

    core_uri = resotocore.http_uri
    graph_name = self._config.resotoworker.graph
    keep_json_dump = self._config.resotoworker.debug_dump_json
    scratch_dir = self._config.resotoworker.tempdir
    merge_kind = self._config.resotoworker.graph_merge_kind

    self.create_graph(core_uri, graph_name)
    self.update_model(graph, core_uri, dump_json=keep_json_dump, tempdir=scratch_dir)

    exporter = GraphExportIterator(
        graph,
        # Temp files are only kept around when a JSON debug dump was requested.
        delete_tempfile=not keep_json_dump,
        tempdir=scratch_dir,
        graph_merge_kind=merge_kind,
    )
    # The exporter has been handed the graph; drop the local reference here
    # since this function does not need it any longer.
    del graph
    exporter.export_graph()
    self.send_graph(exporter, core_uri, graph_name, task_id)
def do_action(self, data: Dict) -> None:
    """Protect every resource listed in the protector plugin config.

    The config is validated, copied to ``self.config`` and walked as
    cloud -> account -> region -> kind -> resource ids. One search term is
    built per resource, all terms are OR-ed together and piped into
    ``protect``.

    Args:
        data: Not used by this method.
    """
    log.info("Protector called")
    Config.plugin_protector.validate(Config.plugin_protector)
    self.config = deepcopy(Config.plugin_protector.config)

    cg = CoreGraph(tls_data=self.tls_data)
    resource_parts = []
    for cloud_id, accounts in self.config.items():
        for account_id, regions in accounts.items():
            for region_id, kinds in regions.items():
                for kind, resources in kinds.items():
                    for resource_id in resources:
                        log.debug(
                            f"Protecting {resource_id} of kind {kind} in"
                            f" region {region_id} account {account_id}"
                            f" cloud {cloud_id}"
                        )
                        # NOTE(review): the account id is logged but is not
                        # part of the search filter - confirm this is intended.
                        resource_parts.append(
                            f'(/reported.id == "{resource_id}"'
                            f' and /reported.kind == "{kind}"'
                            f' and /ancestors.region.reported.id == "{region_id}"'
                            f' and /ancestors.cloud.reported.id == "{cloud_id}")'
                        )

    if not resource_parts:
        # Without any term the command would degrade to "search  | protect",
        # i.e. a search without a filter. There is nothing to protect.
        log.debug("No resources configured for protection - nothing to do")
        return

    resource_part = " or ".join(resource_parts)
    command = f"search {resource_part} | protect"
    for node_data in cg.execute(command):
        node = node_from_dict(node_data)
        log.debug(f"Protected {node.rtdname}")
def __delitem__(self, key):
    """Delete a tag, removing it in the cloud first if a parent resource exists.

    Without a ``BaseResource`` parent this is a plain mapping delete. With a
    parent, ``delete_tag`` is called on it and the local entry is only
    removed when that call reports success. Exceptions are recorded on the
    parent and re-raised only if ``_raise_tags_exceptions`` is set.
    """
    parent = self.parent_resource
    if not (parent and isinstance(parent, BaseResource)):
        return super().__delitem__(key)

    log.debug(f"Calling parent resource to delete tag {key} in cloud")
    try:
        if parent.delete_tag(key):
            log_msg = f"Successfully deleted tag {key} in cloud"
            parent._changes.add("tags")
            parent.log(log_msg)
            log.info(f"{log_msg} for {parent.kind} {parent.id}")
            # Only now drop the local copy of the tag.
            return super().__delitem__(key)

        log_msg = f"Error deleting tag {key} in cloud"
        parent.log(log_msg)
        log.error(f"{log_msg} for {parent.kind} {parent.id}")
    except Exception as e:
        log_msg = (
            f"Unhandled exception while trying to delete tag {key} in cloud:"
            f" {type(e)} {e}"
        )
        parent.log(log_msg, exception=e)
        if parent._raise_tags_exceptions:
            raise
        log.exception(log_msg)
async def clean_ws_handler(ws_id: str, websocket_handler: WSHandler) -> None:
    """Unregister the handler stored under ``ws_id`` and close its websocket.

    The registered entry is unpacked as ``(future, websocket)``: the future
    is cancelled and the websocket is closed unless it is closed already.
    Any exception raised along the way is suppressed.
    """
    with suppress(Exception):
        entry = websocket_handler.get(ws_id)
        if not entry:
            return
        websocket_handler.pop(ws_id, None)
        pending, socket = entry
        pending.cancel()
        log.info(f"Cleanup ws handler: {ws_id} ({len(websocket_handler)} active)")
        if not socket.closed:
            await socket.close()
async def send(ctx: Callable[[], AsyncContextManager[Queue[T]]]) -> None:
    """Forward every event from the queue provided by ``ctx`` to the websocket.

    Each event is rendered with ``outgoing_fn`` and sent as one line. The
    loop only ends through an exception, which is logged; the websocket
    handler is cleaned up in every case.
    """
    try:
        async with ctx() as outbox:
            while True:
                item = await outbox.get()
                line = outgoing_fn(item) + "\n"
                await ws.send_str(line)
    except Exception as ex:
        # Swallow everything on purpose: an escaping exception would tear
        # down the task before the cleanup below had a chance to run.
        log.info(f"Send: message listener {wsid}: {ex}. Hang up.")
    finally:
        await clean_ws_handler(wsid, websocket_handler)
def pre_cleanup(self, graph=None) -> bool:
    """Run the resource's ``pre_delete`` hook, if it defines one.

    Args:
        graph: Graph used to resolve account, region and cloud. Falls back
            to ``self._graph`` when ``None``.

    Returns:
        ``True`` when there is no ``pre_delete`` hook, when the resource is
        already cleaned, or when the hook succeeded. ``False`` when account
        or region cannot be determined, the hook returned a falsy value, or
        the hook raised.

    Raises:
        RuntimeError: If the resource is a phantom resource.
    """
    if not hasattr(self, "pre_delete"):
        return True
    if graph is None:
        graph = self._graph
    if self.phantom:
        raise RuntimeError(f"Can't cleanup phantom resource {self.rtdname}")
    if self.cleaned:
        log.debug(f"Resource {self.rtdname} has already been cleaned up")
        return True

    account = self.account(graph)
    region = self.region(graph)
    if not (isinstance(account, BaseAccount) and isinstance(region, BaseRegion)):
        log.error(
            f"Could not determine account or region for pre cleanup of {self.rtdname}"
        )
        return False

    location = f" in account {account.dname} region {region.name}"
    self.log("Trying to run pre clean up")
    log.debug(f"Trying to run pre clean up {self.rtdname}{location}")
    try:
        if self.pre_delete(graph):
            self.log("Successfully ran pre clean up")
            log.info(f"Successfully ran pre clean up {self.rtdname}{location}")
        else:
            self.log("Failed to run pre clean up")
            log.error(f"Failed to run pre clean up {self.rtdname}{location}")
            return False
    except Exception as e:
        self.log("An error occurred during pre clean up", exception=e)
        log.exception(
            f"An error occurred during pre clean up {self.rtdname}{location}"
        )
        # Count the failure per cloud/account/region/kind.
        cloud = self.cloud(graph)
        metrics_resource_pre_cleanup_exceptions.labels(
            cloud=cloud.name,
            account=account.dname,
            region=region.name,
            kind=self.kind,
        ).inc()
        return False
    return True
def restart_required(new_config: Dict) -> bool:
    """Tell whether ``new_config`` changes a field flagged as restart-required.

    Only config sections that also exist in the running config are compared.
    A field counts when its metadata has a truthy ``restart_required`` entry
    and its value differs between the running and the new config.

    Returns:
        ``True`` on the first such difference, otherwise ``False``.
    """
    for config_id, candidate in new_config.items():
        if config_id not in Config.running_config.data:
            continue
        for field in fields(candidate):
            if not field.metadata.get("restart_required", False):
                continue
            current_value = getattr(
                Config.running_config.data[config_id], field.name, None
            )
            candidate_value = getattr(candidate, field.name, None)
            if candidate_value != current_value:
                log.info(
                    f"Changed config {config_id}.{field.name} requires restart"
                )
                return True
    return False
def main() -> None:
    """
    Application entrypoint: call ``run`` with the command line arguments.

    Exits with status 0 on ``KeyboardInterrupt``/``SystemExit`` and with
    status 1 on any other exception. The traceback is only printed when
    ``--debug`` is among the arguments.
    """
    try:
        run(sys.argv[1:])
        log.info("Process finished.")
    except (KeyboardInterrupt, SystemExit):
        # NOTE(review): a SystemExit carrying a non-zero code is turned into
        # exit status 0 here - confirm that this is intended.
        log.debug("Stopping resotoeventlog.")
        sys.exit(0)
    except Exception as ex:
        debug_requested = "--debug" in sys.argv
        if debug_requested:
            print(traceback.format_exc())
        print(f"resotoeventlog stopped. Reason: {ex}", file=sys.stderr)
        sys.exit(1)
def shutdown(event: Event) -> None:
    """React to a shutdown event by setting ``shutdown_event``.

    When the event data flags an emergency,
    ``resotolib.proc.emergency_shutdown`` is called with the reason first.
    Processes other than the recorded parent process return without setting
    the shutdown event.
    """
    reason = event.data.get("reason")
    if event.data.get("emergency"):
        resotolib.proc.emergency_shutdown(reason)

    # Only the parent process drives the regular shutdown.
    if os.getpid() != resotolib.proc.parent_pid:
        return

    if reason is None:
        reason = "unknown reason"
    log.info(
        f"Received shut down event {event.event_type}:"
        f" {reason} - killing all threads and child processes"
    )
    # Setting the event signals the waiting code that the program should end.
    shutdown_event.set()
def clean(self, node: BaseResource) -> None:
    """Call ``node.cleanup`` unless the dry run flag is configured.

    Exceptions raised by the cleanup call are logged and not propagated.
    """
    marked = f"Resource {node.rtdname} is marked for removal"
    if Config.resotoworker.cleanup_dry_run:
        log.info(f"{marked}, not calling cleanup method because of dry run flag")
        return

    log.info(f"{marked}, calling cleanup method")
    try:
        node.cleanup(self.graph)
    except Exception:
        log.exception(
            f"An exception occurred when running resource cleanup on {node.rtdname}"
        )
async def receive() -> None:
    """Read messages from the websocket and pass text payloads on.

    Stops at the first ERROR, CLOSE or CLOSED message. Non-blank TEXT
    messages are handed to ``handle_incoming``; everything else is ignored.
    Exceptions are logged and the websocket handler is always cleaned up.
    """
    try:
        terminal_types = (WSMsgType.ERROR, WSMsgType.CLOSE, WSMsgType.CLOSED)
        async for msg in ws:
            if not isinstance(msg, WSMessage):
                continue
            if msg.type in terminal_types:
                break
            if msg.type == WSMsgType.TEXT and len(msg.data.strip()) > 0:
                log.debug(f"Incoming message: type={msg.type} data={msg.data} extra={msg.extra}")
                await handle_incoming(msg.data)
    except Exception as ex:
        # Swallow everything on purpose: an escaping exception would tear
        # down the task before the cleanup below had a chance to run.
        log.info(f"Receive: message listener {wsid}: {ex}. Hang up.")
    finally:
        await clean_ws_handler(wsid, websocket_handler)
def load_config(self, reload: bool = False) -> None:
    """Fetch the config, apply it to ``Config.running_config`` and start the listener.

    Unknown config sections are skipped with a warning. An empty config is
    tolerated on the initial load (defaults are used) and rejected later.

    Args:
        reload: When true and the new config changes a restart-required
            field, ``restart()`` is called before the config is applied.

    Raises:
        RuntimeError: If no config classes have been added.
        ValueError: If an empty config is returned after the initial load.
    """
    if len(Config.running_config.classes) == 0:
        raise RuntimeError("No config added")

    with self._config_lock:
        try:
            config, new_config_revision = get_config(
                self.config_name, self.resotocore_uri, verify=self.verify
            )
            if len(config) == 0:
                if self._initial_load:
                    raise ConfigNotFoundError(
                        "Empty config returned - loading defaults"
                    )
                raise ValueError("Empty config returned")
        except ConfigNotFoundError:
            # Nothing stored yet: continue with the defaults set up below.
            pass
        else:
            log.info(
                f"Loaded config {self.config_name} revision {new_config_revision}"
            )
            known_classes = Config.running_config.classes
            new_config = {}
            for config_id, config_data in config.items():
                if config_id not in known_classes:
                    log.warning(f"Unknown config section {config_id}")
                    continue
                log.debug(f"Loading config section {config_id}")
                new_config[config_id] = jsons.load(
                    config_data, known_classes[config_id]
                )
            if reload and self.restart_required(new_config):
                restart()
            Config.running_config.data = new_config
            Config.running_config.revision = new_config_revision

        self.init_default_config()
        if self._initial_load:
            # Try to store the generated config. Handle failure gracefully.
            try:
                self.save_config()
            except RuntimeError as e:
                log.error(f"Failed to save config: {e}")
        self.override_config(Config.running_config)
        self._initial_load = False
        if not self._ce.is_alive():
            log.debug("Starting config event listener")
            self._ce.start()
def pre_clean(self, node: BaseResource) -> None:
    """Call ``node.pre_cleanup`` for nodes that define ``pre_delete``.

    Nothing happens for nodes without a ``pre_delete`` attribute or when the
    dry run flag is configured. Exceptions raised by the pre cleanup call
    are logged and not propagated.
    """
    if not hasattr(node, "pre_delete"):
        return

    marked = f"Resource {node.rtdname} is marked for removal"
    if Config.resotoworker.cleanup_dry_run:
        log.info(f"{marked}, not calling pre cleanup method because of dry run flag")
        return

    log.info(f"{marked}, calling pre cleanup method")
    try:
        node.pre_cleanup(self.graph)
    except Exception:
        log.exception(
            f"An exception occurred when running resource pre cleanup on {node.rtdname}"
        )
def collect(self) -> None:
    """This method is being called by resoto whenever the collector runs

    It is responsible for querying the cloud APIs for remote resources and adding
    them to the plugin graph.
    The graph root (self.graph.root) must always be followed by one or more
    accounts. An account must always be followed by a region.
    A region can contain arbitrary resources.

    Every configured API token is paired with the Spaces access key at the
    same position. Tokens without a usable key are collected with
    ``(None, None)`` as the Spaces credentials.
    """
    tokens = Config.digitalocean.api_tokens
    spaces_access_keys: List[str] = Config.digitalocean.spaces_access_keys
    spaces_keys: List[Tuple[Optional[str], Optional[str]]] = []

    def spaces_keys_valid(keys: List[str]) -> bool:
        return all(len(key.split(":")) == 2 for key in keys)

    def key_to_tuple(key: str) -> Tuple[str, str]:
        access_key, secret_key = key.split(":")
        return access_key, secret_key

    if not spaces_keys_valid(spaces_access_keys):
        log.warning(
            "DigitalOcean Spaces access keys must be provided in pairs of access_key:secret_key"
        )
    else:
        spaces_keys = [key_to_tuple(key) for key in spaces_access_keys]

    if len(tokens) != len(spaces_access_keys):
        log.warning(
            "The number of DigitalOcean API tokens and DigitalOcean Spaces access keys must be equal."
            " Missing or extra spaces access keys will be ignored."
        )

    # Always bring the key list to the length of the token list. Without
    # this, malformed keys (empty list) combined with matching counts would
    # make zip() below yield nothing and silently skip every team.
    spaces_keys = spaces_keys[: len(tokens)]
    spaces_keys.extend([(None, None)] * (len(tokens) - len(spaces_keys)))

    log.info(
        f"plugin: collecting DigitalOcean resources for {len(tokens)} teams"
    )
    for token, space_key_tuple in zip(tokens, spaces_keys):
        client = StreamingWrapper(token, space_key_tuple[0], space_key_tuple[1])
        team_graph = self.collect_team(client)
        self.graph.merge(team_graph)
def cleanup(self) -> None:
    """Run pre cleanup and cleanup for every node marked with ``clean``.

    Does nothing unless ``resotoworker.cleanup`` is configured. All marked
    nodes are pre-cleaned in one thread pool. The cleanup itself runs in
    passes: each batch yielded by ``dependent_node_iterator`` is processed
    in its own thread pool.
    """
    if not Config.resotoworker.cleanup:
        log.debug(
            "Cleanup called but resotoworker.cleanup not configured - ignoring call"
        )
        return

    log.info("Running cleanup")
    pool_size = Config.resotoworker.cleanup_pool_size

    # Start from the subgraph spanned by delete edges ...
    delete_graph = DiGraph(self.graph.edge_type_subgraph(EdgeType.delete))
    # ... keep only the nodes that are marked for cleanup ...
    for candidate in list(delete_graph.nodes):
        if not candidate.clean:
            delete_graph.remove_node(candidate)
    # ... and add marked nodes that have no delete edge and therefore were
    # not part of the subgraph.
    for candidate in self.graph.nodes:
        if candidate.clean and candidate not in delete_graph:
            delete_graph.add_node(candidate)

    cleanup_nodes = list(delete_graph.nodes)
    for planned in cleanup_nodes:
        log.debug(f"Adding {planned.rtdname} to cleanup plan")

    log.debug(f"Sending {len(cleanup_nodes)} nodes to pre-cleanup pool")
    with ThreadPoolExecutor(
        max_workers=pool_size,
        thread_name_prefix="pre_cleaner",
    ) as executor:
        executor.map(self.pre_clean, cleanup_nodes)

    log.debug(f"Running parallel cleanup on {len(cleanup_nodes)} nodes")
    batches = dependent_node_iterator(delete_graph)
    for parallel_pass_num, nodes in enumerate(batches, start=1):
        log.debug(
            f"Cleaning {len(nodes)} nodes in {ordinal(parallel_pass_num)} pass"
        )
        with ThreadPoolExecutor(
            max_workers=pool_size,
            thread_name_prefix="cleaner",
        ) as executor:
            executor.map(self.clean, nodes)
async def subscribe(
    self,
    subscriber_id: str,
    channels: Optional[List[str]] = None,
    queue_size: int = 0,
    show_last: int = 100,
) -> AsyncGenerator[Queue[Event], None]:
    """Register a queue as listener and yield it to the caller.

    The queue is pre-filled with up to ``show_last`` of the most recent
    entries of ``self.events``. It is registered for every requested
    channel and unregistered again when the generator is finalized.

    Args:
        subscriber_id: Key under which the subscription is tracked in
            ``self.active_listener``.
        channels: Channel names to listen on. ``None`` or an empty list
            subscribes to ``"*"``.
        queue_size: Maximum size passed to the queue constructor.
        show_last: Number of past events used to pre-fill the queue.

    Yields:
        The queue that receives the events.
    """
    queue: Queue[Event] = Queue(queue_size)

    # Initially fill the queue with the last x entries.
    try:
        el = len(self.events)
        for element in islice(self.events, max(0, el - show_last), el):
            queue.put_nowait(element)
    except QueueFull:
        # A bounded queue can not hold the complete backlog: keep what fits.
        pass

    def add_listener(name: str) -> None:
        self.listeners.setdefault(name, []).append(queue)

    def remove_listener(name: str) -> None:
        self.listeners[name].remove(queue)
        if len(self.listeners[name]) == 0:
            del self.listeners[name]

    # An empty or missing channel list falls back to the wildcard channel,
    # so ch_list always holds at least one entry.
    ch_list = channels if channels else ["*"]

    try:
        self.active_listener[subscriber_id] = ch_list
        for channel in ch_list:
            add_listener(channel)
        log.info(
            f"Event listener {subscriber_id} added to following queues: {ch_list}"
        )
        yield queue
    finally:
        log.info(f"Remove listener: {subscriber_id}")
        for channel in ch_list:
            remove_listener(channel)
        self.active_listener.pop(subscriber_id, None)
def alarm_cleanup(self, graph: Graph):
    """Mark CloudWatch alarms for cleanup when their EC2 instance is gone.

    Protected nodes and nodes that are not ``AWSCloudwatchAlarm`` are
    skipped. With a non-empty ``self.config`` only alarms of configured
    cloud/account combinations are considered. An alarm is marked when it
    has at least one ``InstanceId`` dimension and none of the referenced
    instances is found in the graph in a state other than ``terminated``.

    Args:
        graph: The graph to inspect; matching alarm nodes get ``clean = True``.
    """
    log.info("AWS Cloudwatch Alarms cleanup called")
    for node in graph.nodes:
        if node.protected or not isinstance(node, AWSCloudwatchAlarm):
            continue

        cloud = node.cloud(graph)
        account = node.account(graph)
        region = node.region(graph)
        log_prefix = (
            f"Found {node.rtdname} in cloud {cloud.name} account {account.dname} "
            f"region {region.name}."
        )

        if len(self.config) > 0:
            if cloud.id not in self.config or account.id not in self.config[cloud.id]:
                log.debug(f"{log_prefix} Account not found in config - ignoring.")
                continue

        should_clean = False
        log_msg = log_prefix
        for dimension in node.dimensions:
            if dimension.get("Name") == "InstanceId":
                instance_id = dimension.get("Value")
                i = graph.search_first_all(
                    {"kind": "aws_ec2_instance", "id": instance_id}
                )
                # Compare for equality. The previous `not in ("terminated")`
                # was a substring test against a plain string and raised a
                # TypeError for a status of None.
                if isinstance(i, AWSEC2Instance) and i.instance_status != "terminated":
                    # One live instance is enough to keep the alarm.
                    should_clean = False
                    break
                else:
                    should_clean = True
                    log_msg += f" Referenced EC2 instance {instance_id} not found."

        if not should_clean:
            continue
        log.debug(f"{log_msg} - cleaning alarm")
        node.clean = True
def cleanup(tls_data: Optional[TLSData] = None):
    """Run resource cleanup

    Searches the core graph for nodes that are desired to be cleaned, not
    cleaned yet and not protected, runs the ``Cleaner`` on the result and
    patches the nodes back. When collectors are configured the search is
    restricted to their cloud ids.
    """
    log.info("Running cleanup")
    cg = CoreGraph(tls_data=tls_data)

    cloud_filter = ""
    collectors = Config.resotoworker.collector
    if collectors and len(collectors) > 0:
        quoted = '", "'.join(collectors)
        clouds = f'["{quoted}"]'
        cloud_filter = f"and /ancestors.cloud.reported.id in {clouds} "

    search = (
        "/desired.clean == true and /metadata.cleaned != true"
        f" and /metadata.protected!=true {cloud_filter}<-default,delete[0:]->"
    )

    graph = cg.graph(search)
    Cleaner(graph).cleanup()
    cg.patch_nodes(graph)
def get_org_accounts(filter_current_account=False):
    """Return the ids of all accounts listed by the AWS Organizations API.

    All result pages are followed via ``NextToken``. An
    ``AccessDeniedException`` is logged and leads to whatever was gathered
    so far being used; any other client error is re-raised.

    Args:
        filter_current_account: When true, the id returned by
            ``current_account_id()`` is left out of the result.

    Returns:
        A list of account ids.
    """
    client = aws_session().client("organizations")
    accounts = []
    try:
        page = client.list_accounts()
        accounts = page.get("Accounts", [])
        while page.get("NextToken") is not None:
            page = client.list_accounts(NextToken=page["NextToken"])
            accounts.extend(page.get("Accounts", []))
    except botocore.exceptions.ClientError as e:
        if e.response["Error"]["Code"] != "AccessDeniedException":
            raise
        log.error("AWS error - missing permissions to list organization accounts")

    # -1 never equals an account id, so nothing is filtered in that case.
    excluded_id = current_account_id() if filter_current_account else -1
    accounts = [entry["Id"] for entry in accounts if entry["Id"] != excluded_id]

    for account_id in accounts:
        log.debug(f"AWS found org account {account_id}")
    log.info(f"AWS found a total of {len(accounts)} org accounts")
    return accounts
def update_model(
    self,
    graph: Graph,
    resotocore_base_uri: str,
    dump_json: bool = False,
    tempdir: Optional[str] = None,
) -> None:
    """Send the model exported from ``graph`` to resotocore via PATCH.

    Args:
        graph: Graph whose ``export_model()`` result is sent.
        resotocore_base_uri: Base URI; the model endpoint is ``<base>/model``.
        dump_json: When true, the model JSON is also written to a file that
            is kept after it is closed.
        tempdir: Directory for that file.

    Raises:
        RuntimeError: If the response status code is not 200.
    """
    model_uri = f"{resotocore_base_uri}/model"
    log.debug(f"Updating model via {model_uri}")

    model_json = json.dumps(graph.export_model(), indent=4)

    if dump_json:
        timestamp = datetime.now().strftime("%Y-%m-%d-%H-%M")
        with tempfile.NamedTemporaryFile(
            prefix=f"resoto-model-{timestamp}-",
            suffix=".json",
            # The dump is meant to be inspected afterwards, so keep the file.
            delete=False,
            dir=tempdir,
        ) as dump_file:
            log.info(f"Writing model json to file {dump_file.name}")
            dump_file.write(model_json.encode())

    headers = {
        "Content-Type": "application/json",
    }
    psk = getattr(ArgumentParser.args, "psk", None)
    if psk:
        encode_jwt_to_headers(headers, {}, psk)

    response = self._send_request(
        requests.Request(
            method="PATCH", url=model_uri, data=model_json, headers=headers
        )
    )
    if response.status_code != 200:
        log.error(response.content)
        raise RuntimeError(f"Failed to create model: {response.content}")
def collect_plugin_graph(
    collector_plugin: BaseCollectorPlugin,
    args: Namespace = None,
    running_config: RunningConfig = None,
) -> Optional[Graph]:
    """Run one collector plugin and return its graph.

    The plugin is instantiated, started and joined with
    ``Config.resotoworker.timeout`` as the join timeout.

    Args:
        collector_plugin: Callable that creates the collector instance.
        args: If given, installed as ``ArgumentParser.args`` before logging
            is set up.
        running_config: If given, applied to ``Config.running_config``.

    Returns:
        The collector's graph, or ``None`` when the collector is still
        running after the timeout, did not finish, or produced a graph that
        is not acyclic per edge type.
    """
    collector: BaseCollectorPlugin = collector_plugin()
    resotolib.proc.set_thread_name(f"collector_{collector.cloud}")

    if args is not None:
        ArgumentParser.args = args
        setup_logger("resotoworker")
    if running_config is not None:
        Config.running_config.apply(running_config)

    log.debug(f"Starting new collect process for {collector.cloud}")
    started_at = time()
    collector.start()
    collector.join(Config.resotoworker.timeout)
    elapsed = time() - started_at

    if collector.is_alive():
        # Still running after the join timeout.
        log.error(
            f"Plugin {collector.cloud} timed out - discarding Plugin graph")
        return None

    # The plugin has finished its work
    if not collector.finished:
        log.error(
            f"Plugin {collector.cloud} did not finish collection - ignoring plugin results"
        )
        return None
    if not collector.graph.is_dag_per_edge_type():
        log.error(
            f"Graph of plugin {collector.cloud} is not acyclic - ignoring plugin results"
        )
        return None

    log.info(
        f"Collector of plugin {collector.cloud} finished in {elapsed:.4f}s"
    )
    return collector.graph
def core_actions_processor(plugin_loader: PluginLoader, tls_data: TLSData, collector: Collector, message: Dict) -> None:
    """Handle a ``collect`` or ``cleanup`` action message.

    Args:
        plugin_loader: Source of the collector plugins used for ``collect``.
        tls_data: Passed to ``cleanup``.
        collector: Its ``collect_and_send`` performs the collect run.
        message: Message with the keys ``kind``, ``message_type`` and
            ``data``; ``data["task"]`` is used as the task id.

    Returns:
        For messages of kind ``action`` a reply dict with ``kind`` set to
        ``action_done`` or ``action_error`` plus the original
        ``message_type`` and ``data``. ``None`` for invalid messages and
        messages of any other kind.
        NOTE(review): the ``-> None`` annotation does not reflect the reply
        dict; it is left unchanged here.
    """
    if not isinstance(message, dict):
        log.error(f"Invalid message: {message}")
        return None

    collectors: List[BaseCollectorPlugin] = plugin_loader.plugins(PluginType.COLLECTOR)
    kind = message.get("kind")
    message_type = message.get("message_type")
    data = message.get("data")
    # A message without a data dict must not abort processing.
    task_id = data.get("task") if isinstance(data, dict) else None
    log.debug(f"Received message of kind {kind}, type {message_type}, data: {data}")

    if kind != "action":
        # Only action messages are answered.
        return None

    try:
        if message_type == "collect":
            start_time = time.time()
            collector.collect_and_send(collectors, task_id=task_id)
            run_time = int(time.time() - start_time)
            log.info(f"Collect ran for {run_time} seconds")
        elif message_type == "cleanup":
            if not Config.resotoworker.cleanup:
                log.info(
                    "Cleanup called but disabled in config"
                    " (resotoworker.cleanup) - skipping"
                )
            else:
                if Config.resotoworker.cleanup_dry_run:
                    log.info(
                        "Cleanup called with dry run configured"
                        " (resotoworker.cleanup_dry_run)"
                    )
                start_time = time.time()
                cleanup(tls_data=tls_data)
                run_time = int(time.time() - start_time)
                log.info(f"Cleanup ran for {run_time} seconds")
        else:
            raise ValueError(f"Unknown message type {message_type}")
    except Exception as e:
        # Failures are reported back to the sender instead of propagating.
        log.exception(f"Failed to {message_type}: {e}")
        reply_kind = "action_error"
    else:
        reply_kind = "action_done"

    reply_message = {
        "kind": reply_kind,
        "message_type": message_type,
        "data": data,
    }
    return reply_message
def main() -> None:
    """Entry point of the resoto metrics exporter.

    Parses the arguments, waits for resotocore, loads the config, registers
    the graph collector, starts the web server and the core actions
    listener, then blocks until ``shutdown_event`` is set and exits with
    status 0. Exits with status 1 when resotocore can not be reached.
    """
    setup_logger("resotometrics")
    resotolib.proc.parent_pid = os.getpid()

    add_event_listener(EventType.SHUTDOWN, shutdown)

    # Argument handling
    arg_parser = ArgumentParser(
        description="resoto metrics exporter",
        env_args_prefix="RESOTOMETRICS_",
    )
    add_args(arg_parser)
    Config.add_args(arg_parser)
    resotocore_add_args(arg_parser)
    logging_add_args(arg_parser)
    jwt_add_args(arg_parser)
    TLSData.add_args(arg_parser)
    arg_parser.parse_args()

    try:
        wait_for_resotocore(resotocore.http_uri)
    except TimeoutError as e:
        log.fatal(f"Failed to connect to resotocore: {e}")
        sys.exit(1)

    # TLS data is only set up when resotocore is reached securely.
    tls_data = None
    if resotocore.is_secure:
        tls_data = TLSData(
            common_name=ArgumentParser.args.subscriber_id,
            resotocore_uri=resotocore.http_uri,
        )
        tls_data.start()

    # Configuration
    config = Config(
        ArgumentParser.args.subscriber_id,
        resotocore_uri=resotocore.http_uri,
        tls_data=tls_data,
    )
    config.add_config(ResotoMetricsConfig)
    config.load_config()

    resotolib.proc.initializer()

    # Metrics collection
    metrics = Metrics()
    REGISTRY.register(GraphCollector(metrics))

    graph_name = Config.resotometrics.graph
    graph_uri = f"{resotocore.http_uri}/graph/{graph_name}"
    search_uri = f"{graph_uri}/search/aggregate?section=reported"

    core_actions = CoreActions(
        identifier=ArgumentParser.args.subscriber_id,
        resotocore_uri=resotocore.http_uri,
        resotocore_ws_uri=resotocore.ws_uri,
        actions={
            "generate_metrics": {
                "timeout": Config.resotometrics.timeout,
                "wait_for_completion": True,
            },
        },
        message_processor=partial(
            core_actions_processor, metrics, search_uri, tls_data
        ),
        tls_data=tls_data,
    )

    # Web server, with certificate and key when TLS data is available
    web_server_args = {}
    if tls_data:
        web_server_args = {
            "ssl_cert": tls_data.cert_path,
            "ssl_key": tls_data.key_path,
        }
    web_server = WebServer(
        WebApp(mountpoint=Config.resotometrics.web_path),
        web_host=Config.resotometrics.web_host,
        web_port=Config.resotometrics.web_port,
        **web_server_args,
    )
    web_server.daemon = True
    web_server.start()
    core_actions.start()

    # Block until a shutdown is requested, then tear everything down.
    shutdown_event.wait()
    web_server.shutdown()
    core_actions.shutdown()
    resotolib.proc.kill_children(resotolib.proc.SIGTERM, ensure_death=True)
    log.info("Shutdown complete")
    sys.exit(0)
def do_action(self, data: Dict) -> None:
    """Cap the ``expiration`` tag of resources at the configured maximum.

    Queries resources of the configured kinds and accounts that carry an
    ``expiration`` tag. A tag update command is queued for every node whose
    tag value can not be parsed or is larger than the maximum configured
    for its account. The commands are executed unless dry run is set.

    Args:
        data: Not used by this method.
    """
    log.info("Tag Validator called")
    Config.plugin_tagvalidator.validate(Config.plugin_tagvalidator)
    self.config = deepcopy(Config.plugin_tagvalidator.config)

    cg = CoreGraph(tls_data=self.tls_data)

    # Assemble the query from its parts.
    query_tag = "tagvalidate"
    exclusion_part = "metadata.protected == false and metadata.phantom == false and metadata.cleaned == false"
    tags_part = "has_key(reported.tags, expiration)"
    quoted_kinds = '", "'.join(self.config["kinds"])
    kinds_part = f'reported.kind in ["{quoted_kinds}"]'
    account_parts = [
        f'(metadata.ancestors.cloud.id == "{cloud_id}" and '
        f'metadata.ancestors.account.id == "{account_id}")'
        for cloud_id, account in self.config["accounts"].items()
        for account_id in account.keys()
    ]
    accounts_part = "(" + " or ".join(account_parts) + ")"
    query = f"{exclusion_part} and {kinds_part} and {tags_part} and {accounts_part} #{query_tag} <-[0:]-"

    graph = cg.graph(query)
    commands = []
    for node in graph.nodes:
        cloud = node.cloud(graph)
        account = node.account(graph)
        region = node.region(graph)
        # Only nodes carrying the query tag are validated.
        if node.protected or node._resotocore_query_tag != query_tag:
            continue

        max_expiration = (
            self.config["accounts"]
            .get(cloud.id, {})
            .get(account.id, {})
            .get("expiration")
        )
        max_expiration_str = delta_to_str(max_expiration)
        node_expiration_str = node.tags.get("expiration")

        update_node_tag = False
        try:
            node_expiration = parse_delta(node_expiration_str)
        except (AssertionError, ValueError):
            log_msg = (
                f"Invalid expiration tag value {node_expiration_str}"
                f" - updating tag to {max_expiration_str}"
            )
            node.log(log_msg)
            log.error(
                f"{log_msg} on {node.rtdname} in {cloud.rtdname}"
                f" {account.rtdname} {region.rtdname}"
            )
            update_node_tag = True
        else:
            if max_expiration < node_expiration:
                log_msg = (
                    f"Current expiration tag value {node_expiration_str} is larger"
                    f" than {max_expiration_str} - updating tag"
                )
                node.log(log_msg)
                log.error(f"{log_msg} on {node.rtdname}")
                update_node_tag = True

        if update_node_tag:
            commands.append(
                f"query id({node._resotocore_id}) | tag update --nowait expiration {max_expiration_str}"
            )

    cg.patch_nodes(graph)

    for command in commands:
        if Config.plugin_tagvalidator.dry_run:
            log.debug(f"Tag validator dry run - not executing: {command}")
            continue
        for response in cg.execute(command):
            log.debug(f"Response: {response}")
def main() -> None:
    """Entry point of the resoto worker.

    Parses the arguments, waits for resotocore, loads the config, starts
    the web server, the core actions and core tasks listeners and all
    action plugins, then blocks until ``shutdown_event`` is set. Exits with
    status 1 when resotocore can not be reached.
    """
    setup_logger("resotoworker")
    # Try to run in a new process group and
    # ignore if not possible for whatever reason
    try:
        os.setpgid(0, 0)
    except Exception:
        pass

    resotolib.proc.parent_pid = os.getpid()

    arg_parser = ArgumentParser(
        description="resoto worker",
        env_args_prefix="RESOTOWORKER_",
    )
    add_args(arg_parser)
    jwt_add_args(arg_parser)
    logging_add_args(arg_parser)
    core_add_args(arg_parser)
    Config.add_args(arg_parser)
    TLSData.add_args(arg_parser)

    # Find resoto Plugins in the resoto.plugins module
    plugin_loader = PluginLoader()
    plugin_loader.add_plugin_args(arg_parser)

    # At this point the CLI, all Plugins as well as the WebServer have
    # added their args to the arg parser
    arg_parser.parse_args()

    try:
        wait_for_resotocore(resotocore.http_uri)
    except TimeoutError as e:
        log.fatal(f"Failed to connect to resotocore: {e}")
        sys.exit(1)

    tls_data = None
    if resotocore.is_secure:
        tls_data = TLSData(
            common_name=ArgumentParser.args.subscriber_id,
            resotocore_uri=resotocore.http_uri,
        )
        tls_data.start()
    config = Config(
        ArgumentParser.args.subscriber_id,
        resotocore_uri=resotocore.http_uri,
        tls_data=tls_data,
    )
    add_config(config)
    plugin_loader.add_plugin_config(config)
    config.load_config()

    def send_request(request: requests.Request) -> requests.Response:
        """Prepare and send ``request`` using a one-shot session."""
        prepared = request.prepare()
        verify = None
        if tls_data:
            verify = tls_data.verify
        # Use the session as a context manager so that it is closed after
        # every request instead of leaking one open session per call.
        with requests.Session() as s:
            return s.send(request=prepared, verify=verify)

    core = Resotocore(send_request, config)

    collector = Collector(core.send_to_resotocore, config)

    # Handle Ctrl+c and other means of termination/shutdown
    resotolib.proc.initializer()
    add_event_listener(EventType.SHUTDOWN, shutdown, blocking=False)

    # Try to increase nofile and nproc limits
    increase_limits()

    web_server_args = {}
    if tls_data:
        web_server_args = {
            "ssl_cert": tls_data.cert_path,
            "ssl_key": tls_data.key_path,
        }
    web_server = WebServer(
        WebApp(mountpoint=Config.resotoworker.web_path),
        web_host=Config.resotoworker.web_host,
        web_port=Config.resotoworker.web_port,
        **web_server_args,
    )
    web_server.daemon = True
    web_server.start()

    core_actions = CoreActions(
        identifier=f"{ArgumentParser.args.subscriber_id}-collector",
        resotocore_uri=resotocore.http_uri,
        resotocore_ws_uri=resotocore.ws_uri,
        actions={
            "collect": {
                "timeout": Config.resotoworker.timeout,
                "wait_for_completion": True,
            },
            "cleanup": {
                "timeout": Config.resotoworker.timeout,
                "wait_for_completion": True,
            },
        },
        message_processor=partial(core_actions_processor, plugin_loader, tls_data, collector),
        tls_data=tls_data,
    )

    # Tag tasks are only taken for the configured collectors, if any.
    task_queue_filter = {}
    if len(Config.resotoworker.collector) > 0:
        task_queue_filter = {"cloud": list(Config.resotoworker.collector)}
    core_tasks = CoreTasks(
        identifier=f"{ArgumentParser.args.subscriber_id}-tagger",
        resotocore_ws_uri=resotocore.ws_uri,
        tasks=["tag"],
        task_queue_filter=task_queue_filter,
        message_processor=core_tag_tasks_processor,
        tls_data=tls_data,
    )
    core_actions.start()
    core_tasks.start()

    for Plugin in plugin_loader.plugins(PluginType.ACTION):
        try:
            log.debug(f"Starting action plugin {Plugin}")
            plugin = Plugin(tls_data=tls_data)
            plugin.start()
        except Exception as e:
            log.exception(f"Caught unhandled persistent Plugin exception {e}")

    # We wait for the shutdown Event to be set() and then end the program
    shutdown_event.wait()
    web_server.shutdown()
    time.sleep(1)  # everything gets 1000ms to shutdown gracefully before we force it
    resotolib.proc.kill_children(resotolib.proc.SIGTERM, ensure_death=True)
    log.info("Shutdown complete")
    os._exit(0)
def vpc_cleanup(self, graph: Graph):
    """Extend the cleanup of marked VPCs to their dependent resources.

    Only unprotected ``AWSVPC`` nodes that are marked for cleanup are
    handled. With a non-empty ``self.config`` the VPC's cloud and account
    must be configured. A VPC that still contains active, unmarked EC2
    instances gets its cleanup mark removed. Otherwise every descendant of
    one of the targeted kinds is marked for cleanup; other descendants are
    only reported.

    Args:
        graph: The graph to inspect and update.
    """
    # Descendant kinds that are cleaned up together with their VPC.
    targeted_kinds = (
        AWSVPCPeeringConnection,
        AWSEC2NetworkAcl,
        AWSEC2NetworkInterface,
        AWSELB,
        AWSALB,
        AWSALBTargetGroup,
        AWSEC2Subnet,
        AWSEC2SecurityGroup,
        AWSEC2InternetGateway,
        AWSEC2NATGateway,
        AWSEC2RouteTable,
        AWSVPCEndpoint,
        AWSEC2ElasticIP,
    )

    log.info("AWS VPC cleanup called")
    for node in graph.nodes:
        if node.protected or not node.clean or not isinstance(node, AWSVPC):
            continue

        cloud = node.cloud(graph)
        account = node.account(graph)
        region = node.region(graph)
        log_prefix = (
            f"Found AWS VPC {node.dname} in cloud {cloud.name} account {account.dname} "
            f"region {region.name} marked for cleanup."
        )

        if (
            self.config
            and len(self.config) > 0
            and (cloud.id not in self.config or account.id not in self.config[cloud.id])
        ):
            log.debug(
                f"{log_prefix} Account not found in config - ignoring dependent resources."
            )
            continue

        active_instances = [
            inst
            for inst in node.descendants(graph, edge_type=EdgeType.delete)
            if isinstance(inst, AWSEC2Instance)
            and inst.instance_status not in ("shutting-down", "terminated")
            and not inst.clean
        ]
        if len(active_instances) > 0:
            log_msg = "VPC contains active EC2 instances - not cleaning VPC."
            log.debug(f"{log_prefix} {log_msg}")
            node.log(log_msg)
            node.clean = False
            continue

        log.debug(
            f"{log_prefix} Marking dependent resources for cleanup as well."
        )
        for descendant in node.descendants(graph, edge_type=EdgeType.delete):
            log.debug(
                f"Found descendant {descendant.rtdname} of VPC {node.dname}"
            )
            if isinstance(descendant, targeted_kinds):
                descendant.log(
                    f"Marking for cleanup because resource is a descendant of VPC {node.dname} "
                    f"which is set to be cleaned"
                )
                node.log(
                    f"Marking {descendant.rtdname} for cleanup because resource is a descendant"
                )
                descendant.clean = True
            elif descendant.clean:
                log.debug(
                    f"Descendant {descendant.rtdname} of VPC {node.dname} is not targeted but "
                    f"already marked for cleaning"
                )
            else:
                log.error(
                    f"Descendant {descendant.rtdname} of VPC {node.dname} is not targeted and "
                    f"not marked for cleaning - VPC cleanup will likely fail"
                )
                node.log(
                    f"Descendant {descendant.rtdname} is not targeted and not marked for cleaning "
                    f"- cleanup will likely fail"
                )
def shutdown(event: ResotoEvent) -> None:
    """Log the shutdown and set ``shutdown_event``.

    Args:
        event: The triggering event; its content is not inspected.
    """
    log.info("Shutting down")
    shutdown_event.set()
def loadbalancer_cleanup(self, graph: Graph):
    """Mark orphaned ELBs, ALBs and ALB target groups for cleanup.

    Nodes younger than ``self.age`` and nodes tagged ``expiration: never``
    are skipped. A node is marked when it is
      * an ELB without non-terminated EC2 instance predecessors and
        without backends,
      * an ALB without target group predecessors and without backends,
      * a target group without successors, or
      * an ALB without backends whose target groups are all of target type
        ``instance``, at least ``self.age`` old and without non-terminated
        EC2 instance predecessors. Those target groups are marked as well.

    Args:
        graph: The graph to inspect and update.
    """

    def live_instances(resource):
        # EC2 instances in front of the resource that are not terminated.
        return [
            inst
            for inst in resource.predecessors(graph, edge_type=EdgeType.delete)
            if isinstance(inst, AWSEC2Instance)
            and inst.instance_status != "terminated"
        ]

    def attached_target_groups(alb):
        return [
            group
            for group in alb.predecessors(graph, edge_type=EdgeType.delete)
            if isinstance(group, AWSALBTargetGroup)
        ]

    log.info("AWS Loadbalancers Cleanup called")
    for node in graph.nodes:
        if not isinstance(node, (AWSELB, AWSALB, AWSALBTargetGroup)):
            continue
        if node.age < self.age:
            continue
        if node.tags.get("expiration") == "never":
            continue

        cloud = node.cloud(graph)
        account = node.account(graph)
        region = node.region(graph)

        if (
            isinstance(node, AWSELB)
            and len(live_instances(node)) == 0
            and len(node.backends) == 0
        ):
            log.debug(
                f"Found orphaned AWS ELB {node.dname} in cloud {cloud.name} account {account.dname} "
                f"region {region.name} with age {node.age} and no EC2 instances attached to it."
            )
            node.clean = True
        elif (
            isinstance(node, AWSALB)
            and len(attached_target_groups(node)) == 0
            and len(node.backends) == 0
        ):
            log.debug(
                f"Found orphaned AWS ALB {node.dname} in cloud {cloud.name} account {account.dname} "
                f"region {region.name} with age {node.age} and no Target Groups attached to it."
            )
            node.clean = True
        elif (
            isinstance(node, AWSALBTargetGroup)
            and len(list(node.successors(graph, edge_type=EdgeType.delete))) == 0
        ):
            log.debug(
                f"Found orphaned AWS ALB Target Group {node.dname} in cloud {cloud.name} "
                f"account {account.dname} region {region.name} with age {node.age}"
            )
            node.clean = True
        elif isinstance(node, AWSALB):
            cleanup_alb = True
            target_groups = attached_target_groups(node)

            if len(node.backends) > 0:
                cleanup_alb = False

            # A single target group that is still in use keeps the ALB alive.
            for tg in target_groups:
                if (
                    tg.target_type != "instance"
                    or tg.age < self.age
                    or len(live_instances(tg)) > 0
                ):
                    cleanup_alb = False

            if cleanup_alb:
                log.debug(
                    f"Found AWS ALB {node.dname} in cloud {cloud.name} account {account.dname} "
                    f"region {region.name} with age {node.age} and no EC2 instances attached "
                    f"to its {len(target_groups)} target groups."
                )
                for tg in target_groups:
                    tg.clean = True
                node.clean = True
async def on_start_stop(_: Application) -> AsyncIterator[None]:
    """Start the API before the single yield and stop it afterwards.

    The application argument is accepted but not used.
    """
    # Startup phase
    await api.start()
    log.info("Initialization done. Starting API.")

    yield

    # Shutdown phase
    log.info("Shutdown initiated. Stop all tasks.")
    await api.stop()