Example #1
    def send_to_resotocore(self, graph: Graph, task_id: str):
        if not ArgumentParser.args.resotocore_uri:
            return

        log.info("resotocore Event Handler called")

        base_uri = resotocore.http_uri
        resotocore_graph = self._config.resotoworker.graph
        dump_json = self._config.resotoworker.debug_dump_json
        tempdir = self._config.resotoworker.tempdir
        graph_merge_kind = self._config.resotoworker.graph_merge_kind

        self.create_graph(base_uri, resotocore_graph)
        self.update_model(graph,
                          base_uri,
                          dump_json=dump_json,
                          tempdir=tempdir)

        graph_export_iterator = GraphExportIterator(
            graph,
            delete_tempfile=not dump_json,
            tempdir=tempdir,
            graph_merge_kind=graph_merge_kind,
        )
        # The graph is no longer required and can be released.
        del graph
        graph_export_iterator.export_graph()
        self.send_graph(graph_export_iterator, base_uri, resotocore_graph,
                        task_id)
Example #2
    def do_action(self, data: Dict) -> None:
        log.info("Protector called")
        Config.plugin_protector.validate(Config.plugin_protector)
        self.config = deepcopy(Config.plugin_protector.config)

        cg = CoreGraph(tls_data=self.tls_data)
        resource_parts = []
        for cloud_id, accounts in self.config.items():
            for account_id, regions in accounts.items():
                for region_id, kinds in regions.items():
                    for kind, resources in kinds.items():
                        for resource_id in resources:
                            log.debug(
                                f"Protecting {resource_id} of kind {kind} in"
                                f" region {region_id} account {account_id}"
                                f" cloud {cloud_id}")
                            resource_parts.append(
                                f'(/reported.id == "{resource_id}"'
                                f' and /reported.kind == "{kind}"'
                                f' and /ancestors.region.reported.id == "{region_id}"'
                                f' and /ancestors.cloud.reported.id == "{cloud_id}")'
                            )
        resource_part = " or ".join(resource_parts)
        command = f"search {resource_part} | protect"
        for node_data in cg.execute(command):
            node = node_from_dict(node_data)
            log.debug(f"Protected {node.rtdname}")
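A minimal sketch of the command the loops above assemble, using a hypothetical one-entry config (all identifiers are illustrative):

    config = {"aws": {"123456789012": {"us-east-1": {"aws_ec2_instance": ["i-0abc"]}}}}
    parts = [
        f'(/reported.id == "{rid}" and /reported.kind == "{kind}"'
        f' and /ancestors.region.reported.id == "{region_id}"'
        f' and /ancestors.cloud.reported.id == "{cloud_id}")'
        for cloud_id, accounts in config.items()
        for account_id, regions in accounts.items()
        for region_id, kinds in regions.items()
        for kind, rids in kinds.items()
        for rid in rids
    ]
    print(f"search {' or '.join(parts)} | protect")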
Example #3
 def __delitem__(self, key):
     if self.parent_resource and isinstance(self.parent_resource,
                                            BaseResource):
         log.debug(f"Calling parent resource to delete tag {key} in cloud")
         try:
             if self.parent_resource.delete_tag(key):
                 log_msg = f"Successfully deleted tag {key} in cloud"
                 self.parent_resource._changes.add("tags")
                 self.parent_resource.log(log_msg)
                 log.info((f"{log_msg} for {self.parent_resource.kind}"
                           f" {self.parent_resource.id}"))
                 return super().__delitem__(key)
             else:
                 log_msg = f"Error deleting tag {key} in cloud"
                 self.parent_resource.log(log_msg)
                 log.error((f"{log_msg} for {self.parent_resource.kind}"
                            f" {self.parent_resource.id}"))
         except Exception as e:
             log_msg = f"Unhandled exception while trying to delete tag {key} in cloud: {type(e)} {e}"
             self.parent_resource.log(log_msg, exception=e)
             if self.parent_resource._raise_tags_exceptions:
                 raise
             else:
                 log.exception(log_msg)
     else:
         return super().__delitem__(key)
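Usage sketch: assuming tags is an instance of this mapping attached to a resource, a plain del triggers the cloud-side deletion first and only removes the local entry if delete_tag() reported success:

    # instance is a hypothetical BaseResource; its delete_tag("expiration")
    # runs before the key is removed from the local dict.
    del instance.tags["expiration"]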
Example #4
async def clean_ws_handler(ws_id: str, websocket_handler: WSHandler) -> None:
    with suppress(Exception):
        handler = websocket_handler.get(ws_id)
        if handler:
            websocket_handler.pop(ws_id, None)
            future, ws = handler
            future.cancel()
            log.info(f"Cleanup ws handler: {ws_id} ({len(websocket_handler)} active)")
            if not ws.closed:
                await ws.close()
Example #5
 async def send(ctx: Callable[[], AsyncContextManager[Queue[T]]]) -> None:
     try:
         async with ctx() as events:
             while True:
                 event = await events.get()
                 await ws.send_str(outgoing_fn(event) + "\n")
     except Exception as ex:
         # do not allow any exception - it will destroy the async fiber and cleanup
         log.info(f"Send: message listener {wsid}: {ex}. Hang up.")
     finally:
         await clean_ws_handler(wsid, websocket_handler)
Example #6
    def pre_cleanup(self, graph=None) -> bool:
        if not hasattr(self, "pre_delete"):
            return True

        if graph is None:
            graph = self._graph

        if self.phantom:
            raise RuntimeError(
                f"Can't cleanup phantom resource {self.rtdname}")

        if self.cleaned:
            log.debug(f"Resource {self.rtdname} has already been cleaned up")
            return True

        account = self.account(graph)
        region = self.region(graph)
        if not isinstance(account, BaseAccount) or not isinstance(
                region, BaseRegion):
            log.error(
                ("Could not determine account or region for pre cleanup of"
                 f" {self.rtdname}"))
            return False

        log_suffix = f" in account {account.dname} region {region.name}"
        self.log("Trying to run pre clean up")
        log.debug(f"Trying to run pre clean up {self.rtdname}{log_suffix}")
        try:
            if not getattr(self, "pre_delete")(graph):
                self.log("Failed to run pre clean up")
                log.error(
                    f"Failed to run pre clean up {self.rtdname}{log_suffix}")
                return False
            self.log("Successfully ran pre clean up")
            log.info(
                f"Successfully ran pre clean up {self.rtdname}{log_suffix}")
        except Exception as e:
            self.log("An error occurred during pre clean up", exception=e)
            log.exception(
                f"An error occurred during pre clean up {self.rtdname}{log_suffix}"
            )
            cloud = self.cloud(graph)
            metrics_resource_pre_cleanup_exceptions.labels(
                cloud=cloud.name,
                account=account.dname,
                region=region.name,
                kind=self.kind,
            ).inc()
            return False
        return True
Example #7
 def restart_required(new_config: Dict) -> bool:
     for config_id, config_data in new_config.items():
         if config_id in Config.running_config.data:
             for field in fields(config_data):
                 if field.metadata.get("restart_required", False):
                     old_value = getattr(
                         Config.running_config.data[config_id], field.name,
                         None)
                     new_value = getattr(config_data, field.name, None)
                     if new_value != old_value:
                         log.info(
                             f"Changed config {config_id}.{field.name} requires restart"
                         )
                         return True
     return False
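A sketch of how a config section would opt a field into this check; the use of fields() above suggests dataclass sections, and the names below are illustrative:

    from dataclasses import dataclass, field

    @dataclass
    class WebConfig:
        # Changing web_port in a new config makes restart_required() return True.
        web_port: int = field(default=9955, metadata={"restart_required": True})
        # log_level carries no restart metadata and can be applied in place.
        log_level: str = "info"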
Example #8
def main() -> None:
    """
    Application entrypoint - no arguments are allowed.
    """
    try:
        run(sys.argv[1:])
        log.info("Process finished.")
    except (KeyboardInterrupt, SystemExit):
        log.debug("Stopping resotoeventlog.")
        sys.exit(0)
    except Exception as ex:
        if "--debug" in sys.argv:
            print(traceback.format_exc())
        print(f"resotoeventlog stopped. Reason: {ex}", file=sys.stderr)
        sys.exit(1)
Example #9
def shutdown(event: Event) -> None:
    reason = event.data.get("reason")
    emergency = event.data.get("emergency")

    if emergency:
        resotolib.proc.emergency_shutdown(reason)

    current_pid = os.getpid()
    if current_pid != resotolib.proc.parent_pid:
        return

    if reason is None:
        reason = "unknown reason"
    log.info(f"Received shut down event {event.event_type}: {reason} - killing all threads and child processes")
    shutdown_event.set()  # and then end the program
Example #10
    def clean(self, node: BaseResource) -> None:
        log_prefix = f"Resource {node.rtdname} is marked for removal"
        if Config.resotoworker.cleanup_dry_run:
            log.info(
                f"{log_prefix}, not calling cleanup method because of dry run flag"
            )
            return

        log.info(f"{log_prefix}, calling cleanup method")
        try:
            node.cleanup(self.graph)
        except Exception:
            log.exception(
                f"An exception occurred when running resource cleanup on {node.rtdname}"
            )
Example #11
 async def receive() -> None:
     try:
         async for msg in ws:
             if isinstance(msg, WSMessage) and msg.type in (
                 WSMsgType.ERROR,
                 WSMsgType.CLOSE,
                 WSMsgType.CLOSED,
             ):
                 break
             elif isinstance(msg, WSMessage) and msg.type == WSMsgType.TEXT and len(msg.data.strip()) > 0:
                 log.debug(f"Incoming message: type={msg.type} data={msg.data} extra={msg.extra}")
                 await handle_incoming(msg.data)
     except Exception as ex:
         # do not allow any exception - it will destroy the async fiber and cleanup
         log.info(f"Receive: message listener {wsid}: {ex}. Hang up.")
     finally:
         await clean_ws_handler(wsid, websocket_handler)
Example #12
 def load_config(self, reload: bool = False) -> None:
     if len(Config.running_config.classes) == 0:
         raise RuntimeError("No config added")
     with self._config_lock:
         try:
             config, new_config_revision = get_config(self.config_name,
                                                      self.resotocore_uri,
                                                      verify=self.verify)
             if len(config) == 0:
                 if self._initial_load:
                     raise ConfigNotFoundError(
                         "Empty config returned - loading defaults")
                 else:
                     raise ValueError("Empty config returned")
         except ConfigNotFoundError:
             pass
         else:
             log.info(
                 f"Loaded config {self.config_name} revision {new_config_revision}"
             )
             new_config = {}
             for config_id, config_data in config.items():
                 if config_id in Config.running_config.classes:
                     log.debug(f"Loading config section {config_id}")
                     new_config[config_id] = jsons.load(
                         config_data,
                         Config.running_config.classes[config_id])
                 else:
                     log.warning(f"Unknown config section {config_id}")
             if reload and self.restart_required(new_config):
                 restart()
             Config.running_config.data = new_config
             Config.running_config.revision = new_config_revision
         self.init_default_config()
         if self._initial_load:
             # Try to store the generated config. Handle failure gracefully.
             try:
                 self.save_config()
             except RuntimeError as e:
                 log.error(f"Failed to save config: {e}")
         self.override_config(Config.running_config)
         self._initial_load = False
         if not self._ce.is_alive():
             log.debug("Starting config event listener")
             self._ce.start()
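A sketch of the jsons deserialization used above, assuming the jsons package's default dataclass support and a hypothetical section class:

    import jsons
    from dataclasses import dataclass

    @dataclass
    class WorkerSection:
        pool_size: int = 5

    section = jsons.load({"pool_size": 10}, WorkerSection)
    assert section.pool_size == 10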
Example #13
    def pre_clean(self, node: BaseResource) -> None:
        if not hasattr(node, "pre_delete"):
            return

        log_prefix = f"Resource {node.rtdname} is marked for removal"
        if Config.resotoworker.cleanup_dry_run:
            log.info(
                f"{log_prefix}, not calling pre cleanup method because of dry run flag"
            )
            return

        log.info(f"{log_prefix}, calling pre cleanup method")
        try:
            node.pre_cleanup(self.graph)
        except Exception:
            log.exception(
                ("An exception occurred when running resource pre cleanup on"
                 f" {node.rtdname}"))
Example #14
    def collect(self) -> None:
        """This method is being called by resoto whenever the collector runs

        It is responsible for querying the cloud APIs for remote resources and adding
        them to the plugin graph.
        The graph root (self.graph.root) must always be followed by one or more
        accounts. An account must always be followed by a region.
        A region can contain arbitrary resources.
        """
        tokens = Config.digitalocean.api_tokens
        spaces_access_keys: List[str] = Config.digitalocean.spaces_access_keys
        spaces_keys: List[Tuple[Optional[str], Optional[str]]] = []

        def spaces_keys_valid(keys: List[str]) -> bool:
            return all(len(key.split(":")) == 2 for key in keys)

        if not spaces_keys_valid(spaces_access_keys):
            log.warning(
                "DigitalOcean Spaces access keys must be provided in pairs of access_key:secret_key"
            )
        else:

            def key_to_tuple(key: str) -> Tuple[str, str]:
                splitted = key.split(":")
                return splitted[0], splitted[1]

            spaces_keys = [key_to_tuple(key) for key in spaces_access_keys]

        if len(tokens) != len(spaces_access_keys):
            log.warning(
                "The number of DigitalOcean API tokens and DigitalOcean Spaces access keys must be equal."
                " Missing or extra spaces access keys will be ignored.")
            spaces_keys = spaces_keys[:len(tokens)]
            spaces_keys.extend([(None, None)] *
                               (len(tokens) - len(spaces_keys)))

        log.info(
            f"plugin: collecting DigitalOcean resources for {len(tokens)} teams"
        )
        for token, space_key_tuple in zip(tokens, spaces_keys):
            client = StreamingWrapper(token, space_key_tuple[0],
                                      space_key_tuple[1])
            team_graph = self.collect_team(client)
            self.graph.merge(team_graph)
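How the validation and splitting above behave, sketched with illustrative keys; note that a secret containing a colon would also fail the pair check, since split(":") would yield three parts:

    keys = ["AKIA123:s3cr3t", "malformed"]
    print(all(len(k.split(":")) == 2 for k in keys))  # False - no colon in "malformed"
    print([tuple(k.split(":")) for k in ["AKIA123:s3cr3t"]])  # [('AKIA123', 's3cr3t')]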
Example #15
    def cleanup(self) -> None:
        if not Config.resotoworker.cleanup:
            log.debug(("Cleanup called but resotoworker.cleanup not configured"
                       " - ignoring call"))
            return

        log.info("Running cleanup")
        # create a subgraph of all the nodes that have a delete edge
        delete_graph = DiGraph(self.graph.edge_type_subgraph(EdgeType.delete))
        # from that graph delete all the nodes not marked for cleanup
        for node in list(delete_graph.nodes):
            if not node.clean:
                delete_graph.remove_node(node)
        # add all the nodes that are supposed to be cleaned
        # but do not have a delete edge so weren't part of the
        # subgraph
        for node in self.graph.nodes:
            if node.clean and node not in delete_graph:
                delete_graph.add_node(node)
        cleanup_nodes = list(delete_graph.nodes)

        for node in cleanup_nodes:
            log.debug(f"Adding {node.rtdname} to cleanup plan")

        log.debug(f"Sending {len(cleanup_nodes)} nodes to pre-cleanup pool")
        with ThreadPoolExecutor(
                max_workers=Config.resotoworker.cleanup_pool_size,
                thread_name_prefix="pre_cleaner",
        ) as executor:
            executor.map(self.pre_clean, cleanup_nodes)

        log.debug(f"Running parallel cleanup on {len(cleanup_nodes)} nodes")
        parallel_pass_num = 1
        for nodes in dependent_node_iterator(delete_graph):
            log.debug(
                f"Cleaning {len(nodes)} nodes in {ordinal(parallel_pass_num)} pass"
            )
            with ThreadPoolExecutor(
                    max_workers=Config.resotoworker.cleanup_pool_size,
                    thread_name_prefix="cleaner",
            ) as executor:
                executor.map(self.clean, nodes)
            parallel_pass_num += 1
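dependent_node_iterator presumably yields batches whose delete-dependencies were already handled in earlier passes; conceptually this resembles networkx's topological generations. A sketch under that assumption, with illustrative resource names:

    import networkx as nx

    # Edges point from a resource to the resource that depends on its removal.
    g = nx.DiGraph([("subnet", "vpc"), ("igw", "vpc")])
    for batch in nx.topological_generations(g):
        # ['igw', 'subnet'] in the first pass, ['vpc'] in the second;
        # each batch could be handed to a thread pool in parallel.
        print(sorted(batch))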
Example #16
    async def subscribe(
            self,
            subscriber_id: str,
            channels: Optional[List[str]] = None,
            queue_size: int = 0,
            show_last: int = 100) -> AsyncGenerator[Queue[Event], None]:
        queue: Queue[Event] = Queue(queue_size)

        # initially fill the list with the last x entries
        try:
            el = len(self.events)
            for element in islice(self.events, max(0, el - show_last), el):
                queue.put_nowait(element)
        except QueueFull:
            pass

        def add_listener(name: str) -> None:
            if name not in self.listeners:
                self.listeners[name] = [queue]
            else:
                self.listeners[name].append(queue)

        def remove_listener(name: str) -> None:
            self.listeners[name].remove(queue)
            if len(self.listeners[name]) == 0:
                del self.listeners[name]

        ch_list = channels if channels else ["*"]
        if len(ch_list) == 0:
            raise AttributeError("Need at least one channel to subscribe to!")
        try:
            self.active_listener[subscriber_id] = ch_list
            for channel in ch_list:
                add_listener(channel)
            log.info(
                f"Event listener {subscriber_id} added to following queues: {ch_list}"
            )
            yield queue
        finally:
            log.info(f"Remove listener: {subscriber_id}")
            for channel in ch_list:
                remove_listener(channel)
            self.active_listener.pop(subscriber_id, None)
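Consumption sketch: the yield inside try/finally suggests the generator is wrapped as an async context manager (e.g. with contextlib.asynccontextmanager), in which case a subscriber could look like this (bus is a hypothetical owner of subscribe()):

    async def print_task_events(bus) -> None:
        async with bus.subscribe("printer", channels=["task"]) as queue:
            while True:
                event = await queue.get()
                print(event)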
Example #17
    def alarm_cleanup(self, graph: Graph):
        log.info("AWS Cloudwatch Alarms cleanup called")
        for node in graph.nodes:
            if node.protected or not isinstance(node, AWSCloudwatchAlarm):
                continue

            cloud = node.cloud(graph)
            account = node.account(graph)
            region = node.region(graph)
            log_prefix = (
                f"Found {node.rtdname} in cloud {cloud.name}"
                f" account {account.dname} region {region.name}.")

            if len(self.config) > 0:
                if cloud.id not in self.config or account.id not in self.config[
                        cloud.id]:
                    log.debug((
                        f"{log_prefix} Account not found in config - ignoring."
                    ))
                    continue

            should_clean = False
            i = None
            log_msg = log_prefix
            for dimension in node.dimensions:
                if dimension.get("Name") == "InstanceId":
                    instance_id = dimension.get("Value")
                    i = graph.search_first_all({
                        "kind": "aws_ec2_instance",
                        "id": instance_id
                    })
                    if (isinstance(i, AWSEC2Instance)
                            and i.instance_status not in ("terminated",)):
                        should_clean = False
                        break
                    else:
                        should_clean = True
                        log_msg += f" Referenced EC2 instance {instance_id} not found."

            if not should_clean:
                continue
            log.debug(f"{log_msg} - cleaning alarm")
            node.clean = True
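Worth noting when extending the status check above: the trailing comma in ("terminated",) matters, because without it the parentheses are just grouping and the membership test silently degrades to substring matching on a str:

    print("term" in ("terminated"))   # True  - ("terminated") is a plain str
    print("term" in ("terminated",))  # False - one-element tuple membership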
Example #18
def cleanup(tls_data: Optional[TLSData] = None):
    """Run resource cleanup"""

    log.info("Running cleanup")

    cg = CoreGraph(tls_data=tls_data)

    search_filter = ""
    if Config.resotoworker.collector and len(
            Config.resotoworker.collector) > 0:
        clouds = '["' + '", "'.join(Config.resotoworker.collector) + '"]'
        search_filter = f"and /ancestors.cloud.reported.id in {clouds} "
    search = (
        f"/desired.clean == true and /metadata.cleaned != true"
        f" and /metadata.protected!=true {search_filter}<-default,delete[0:]->"
    )

    graph = cg.graph(search)
    cleaner = Cleaner(graph)
    cleaner.cleanup()
    cg.patch_nodes(graph)
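The search string this assembles, sketched for a worker configured with two collectors:

    collectors = ["aws", "gcp"]  # stands in for Config.resotoworker.collector
    clouds = '["' + '", "'.join(collectors) + '"]'
    search_filter = f"and /ancestors.cloud.reported.id in {clouds} "
    print(f"/desired.clean == true and /metadata.cleaned != true"
          f" and /metadata.protected!=true {search_filter}<-default,delete[0:]->")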
Example #19
def get_org_accounts(filter_current_account=False):
    session = aws_session()
    client = session.client("organizations")
    accounts = []
    try:
        response = client.list_accounts()
        accounts = response.get("Accounts", [])
        while response.get("NextToken") is not None:
            response = client.list_accounts(NextToken=response["NextToken"])
            accounts.extend(response.get("Accounts", []))
    except botocore.exceptions.ClientError as e:
        if e.response["Error"]["Code"] == "AccessDeniedException":
            log.error("AWS error - missing permissions to list organization accounts")
        else:
            raise
    filter_account_id = current_account_id() if filter_current_account else -1
    accounts = [aws_account["Id"] for aws_account in accounts if aws_account["Id"] != filter_account_id]
    for account in accounts:
        log.debug(f"AWS found org account {account}")
    log.info(f"AWS found a total of {len(accounts)} org accounts")
    return accounts
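The same pagination can be sketched with boto3's built-in paginator for list_accounts, which hides the NextToken bookkeeping (an alternative, not what the code above does):

    session = aws_session()
    client = session.client("organizations")
    paginator = client.get_paginator("list_accounts")
    accounts = [account
                for page in paginator.paginate()
                for account in page.get("Accounts", [])]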
Example #20
    def update_model(
        self,
        graph: Graph,
        resotocore_base_uri: str,
        dump_json: bool = False,
        tempdir: Optional[str] = None,
    ) -> None:
        model_uri = f"{resotocore_base_uri}/model"

        log.debug(f"Updating model via {model_uri}")

        model_json = json.dumps(graph.export_model(), indent=4)

        if dump_json:
            ts = datetime.now().strftime("%Y-%m-%d-%H-%M")
            with tempfile.NamedTemporaryFile(
                    prefix=f"resoto-model-{ts}-",
                    suffix=".json",
                    delete=not dump_json,
                    dir=tempdir,
            ) as model_outfile:
                log.info(f"Writing model json to file {model_outfile.name}")
                model_outfile.write(model_json.encode())

        headers = {
            "Content-Type": "application/json",
        }
        if getattr(ArgumentParser.args, "psk", None):
            encode_jwt_to_headers(headers, {}, ArgumentParser.args.psk)

        request = requests.Request(method="PATCH",
                                   url=model_uri,
                                   data=model_json,
                                   headers=headers)
        r = self._send_request(request)
        if r.status_code != 200:
            log.error(r.content)
            raise RuntimeError(f"Failed to update model: {r.content}")
Example #21
def collect_plugin_graph(
    collector_plugin: BaseCollectorPlugin,
    args: Optional[Namespace] = None,
    running_config: Optional[RunningConfig] = None,
) -> Optional[Graph]:
    collector: BaseCollectorPlugin = collector_plugin()
    collector_name = f"collector_{collector.cloud}"
    resotolib.proc.set_thread_name(collector_name)

    if args is not None:
        ArgumentParser.args = args
        setup_logger("resotoworker")
    if running_config is not None:
        Config.running_config.apply(running_config)

    log.debug(f"Starting new collect process for {collector.cloud}")
    start_time = time()
    collector.start()
    collector.join(Config.resotoworker.timeout)
    elapsed = time() - start_time
    if not collector.is_alive():  # The plugin has finished its work
        if not collector.finished:
            log.error(f"Plugin {collector.cloud} did not finish collection"
                      " - ignoring plugin results")
            return None
        if not collector.graph.is_dag_per_edge_type():
            log.error(f"Graph of plugin {collector.cloud} is not acyclic"
                      " - ignoring plugin results")
            return None
        log.info(
            f"Collector of plugin {collector.cloud} finished in {elapsed:.4f}s"
        )
        return collector.graph
    else:
        log.error(
            f"Plugin {collector.cloud} timed out - discarding Plugin graph")
        return None
Example #22
def core_actions_processor(plugin_loader: PluginLoader, tls_data: TLSData, collector: Collector, message: Dict) -> Optional[Dict]:
    collectors: List[BaseCollectorPlugin] = plugin_loader.plugins(PluginType.COLLECTOR)
    if not isinstance(message, dict):
        log.error(f"Invalid message: {message}")
        return
    kind = message.get("kind")
    message_type = message.get("message_type")
    data = message.get("data") or {}
    task_id = data.get("task")
    log.debug(f"Received message of kind {kind}, type {message_type}, data: {data}")
    if kind == "action":
        try:
            if message_type == "collect":
                start_time = time.time()
                collector.collect_and_send(collectors, task_id=task_id)
                run_time = int(time.time() - start_time)
                log.info(f"Collect ran for {run_time} seconds")
            elif message_type == "cleanup":
                if not Config.resotoworker.cleanup:
                    log.info("Cleanup called but disabled in config (resotoworker.cleanup) - skipping")
                else:
                    if Config.resotoworker.cleanup_dry_run:
                        log.info("Cleanup called with dry run configured (resotoworker.cleanup_dry_run)")
                    start_time = time.time()
                    cleanup(tls_data=tls_data)
                    run_time = int(time.time() - start_time)
                    log.info(f"Cleanup ran for {run_time} seconds")
            else:
                raise ValueError(f"Unknown message type {message_type}")
        except Exception as e:
            log.exception(f"Failed to {message_type}: {e}")
            reply_kind = "action_error"
        else:
            reply_kind = "action_done"

        reply_message = {
            "kind": reply_kind,
            "message_type": message_type,
            "data": data,
        }
        return reply_message
Example #23
def main() -> None:
    setup_logger("resotometrics")
    resotolib.proc.parent_pid = os.getpid()

    add_event_listener(EventType.SHUTDOWN, shutdown)
    arg_parser = ArgumentParser(description="resoto metrics exporter",
                                env_args_prefix="RESOTOMETRICS_")
    add_args(arg_parser)
    Config.add_args(arg_parser)
    resotocore_add_args(arg_parser)
    logging_add_args(arg_parser)
    jwt_add_args(arg_parser)
    TLSData.add_args(arg_parser)
    arg_parser.parse_args()

    try:
        wait_for_resotocore(resotocore.http_uri)
    except TimeoutError as e:
        log.fatal(f"Failed to connect to resotocore: {e}")
        sys.exit(1)

    tls_data = None
    if resotocore.is_secure:
        tls_data = TLSData(
            common_name=ArgumentParser.args.subscriber_id,
            resotocore_uri=resotocore.http_uri,
        )
        tls_data.start()
    config = Config(
        ArgumentParser.args.subscriber_id,
        resotocore_uri=resotocore.http_uri,
        tls_data=tls_data,
    )
    config.add_config(ResotoMetricsConfig)
    config.load_config()

    resotolib.proc.initializer()

    metrics = Metrics()
    graph_collector = GraphCollector(metrics)
    REGISTRY.register(graph_collector)

    resotocore_graph = Config.resotometrics.graph
    graph_uri = f"{resotocore.http_uri}/graph/{resotocore_graph}"
    search_uri = f"{graph_uri}/search/aggregate?section=reported"

    message_processor = partial(core_actions_processor, metrics, search_uri,
                                tls_data)
    core_actions = CoreActions(
        identifier=ArgumentParser.args.subscriber_id,
        resotocore_uri=resotocore.http_uri,
        resotocore_ws_uri=resotocore.ws_uri,
        actions={
            "generate_metrics": {
                "timeout": Config.resotometrics.timeout,
                "wait_for_completion": True,
            },
        },
        message_processor=message_processor,
        tls_data=tls_data,
    )
    web_server_args = {}
    if tls_data:
        web_server_args = {
            "ssl_cert": tls_data.cert_path,
            "ssl_key": tls_data.key_path,
        }
    web_server = WebServer(
        WebApp(mountpoint=Config.resotometrics.web_path),
        web_host=Config.resotometrics.web_host,
        web_port=Config.resotometrics.web_port,
        **web_server_args,
    )
    web_server.daemon = True
    web_server.start()
    core_actions.start()
    shutdown_event.wait()
    web_server.shutdown()
    core_actions.shutdown()
    resotolib.proc.kill_children(resotolib.proc.SIGTERM, ensure_death=True)
    log.info("Shutdown complete")
    sys.exit(0)
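GraphCollector follows prometheus_client's custom-collector protocol: any object with a collect() method yielding metric families can be registered. A minimal sketch with a hypothetical metric:

    from prometheus_client.core import REGISTRY, GaugeMetricFamily

    class DemoCollector:
        def collect(self):
            # Called by the registry on every scrape.
            yield GaugeMetricFamily("demo_resources", "Hypothetical resource count", value=42)

    REGISTRY.register(DemoCollector())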
Example #24
    def do_action(self, data: Dict) -> None:
        log.info("Tag Validator called")
        Config.plugin_tagvalidator.validate(Config.plugin_tagvalidator)
        self.config = deepcopy(Config.plugin_tagvalidator.config)

        cg = CoreGraph(tls_data=self.tls_data)

        query_tag = "tagvalidate"
        exclusion_part = "metadata.protected == false and metadata.phantom == false and metadata.cleaned == false"
        tags_part = "has_key(reported.tags, expiration)"
        kinds_part = 'reported.kind in ["' + '", "'.join(
            self.config["kinds"]) + '"]'
        account_parts = []
        for cloud_id, account in self.config["accounts"].items():
            for account_id in account.keys():
                account_part = (
                    f'(metadata.ancestors.cloud.id == "{cloud_id}" and '
                    f'metadata.ancestors.account.id == "{account_id}")')
                account_parts.append(account_part)
        accounts_part = "(" + " or ".join(account_parts) + ")"
        query = f"{exclusion_part} and {kinds_part} and {tags_part} and {accounts_part} #{query_tag} <-[0:]-"

        graph = cg.graph(query)
        commands = []
        for node in graph.nodes:
            cloud = node.cloud(graph)
            account = node.account(graph)
            region = node.region(graph)
            if node.protected or node._resotocore_query_tag != query_tag:
                continue
            update_node_tag = False
            max_expiration = self.config["accounts"].get(cloud.id, {}).get(
                account.id, {}).get("expiration")
            max_expiration_str = delta_to_str(max_expiration)
            node_expiration_str = node.tags.get("expiration")
            try:
                node_expiration = parse_delta(node_expiration_str)
            except (AssertionError, ValueError):
                log_msg = (
                    f"Invalid expiration tag value {node_expiration_str}"
                    f" - updating tag to {max_expiration_str}")
                node.log(log_msg)
                log.error(f"{log_msg} on {node.rtdname} in {cloud.rtdname}"
                          f" {account.rtdname} {region.rtdname}")
                update_node_tag = True
            else:
                if max_expiration < node_expiration:
                    log_msg = (
                        f"Current expiration tag value {node_expiration_str} is larger"
                        f" than {max_expiration_str} - updating tag")
                    node.log(log_msg)
                    log.error(f"{log_msg} on {node.rtdname}")
                    update_node_tag = True
            if update_node_tag:
                commands.append(
                    f"query id({node._resotocore_id}) | tag update --nowait expiration {max_expiration_str}"
                )
        cg.patch_nodes(graph)
        for command in commands:
            if Config.plugin_tagvalidator.dry_run:
                log.debug(f"Tag validator dry run - not executing: {command}")
                continue
            for response in cg.execute(command):
                log.debug(f"Response: {response}")
Example #25
def main() -> None:
    setup_logger("resotoworker")
    # Try to run in a new process group and
    # ignore if not possible for whatever reason
    try:
        os.setpgid(0, 0)
    except Exception:
        pass

    resotolib.proc.parent_pid = os.getpid()

    arg_parser = ArgumentParser(
        description="resoto worker",
        env_args_prefix="RESOTOWORKER_",
    )
    add_args(arg_parser)
    jwt_add_args(arg_parser)
    logging_add_args(arg_parser)
    core_add_args(arg_parser)
    Config.add_args(arg_parser)
    TLSData.add_args(arg_parser)

    # Find resoto Plugins in the resoto.plugins module
    plugin_loader = PluginLoader()
    plugin_loader.add_plugin_args(arg_parser)

    # At this point the CLI, all Plugins as well as the WebServer have
    # added their args to the arg parser
    arg_parser.parse_args()

    try:
        wait_for_resotocore(resotocore.http_uri)
    except TimeoutError as e:
        log.fatal(f"Failed to connect to resotocore: {e}")
        sys.exit(1)

    tls_data = None
    if resotocore.is_secure:
        tls_data = TLSData(
            common_name=ArgumentParser.args.subscriber_id,
            resotocore_uri=resotocore.http_uri,
        )
        tls_data.start()
    config = Config(
        ArgumentParser.args.subscriber_id,
        resotocore_uri=resotocore.http_uri,
        tls_data=tls_data,
    )
    add_config(config)
    plugin_loader.add_plugin_config(config)
    config.load_config()

    def send_request(request: requests.Request) -> requests.Response:
        prepared = request.prepare()
        s = requests.Session()
        verify = None
        if tls_data:
            verify = tls_data.verify
        return s.send(request=prepared, verify=verify)

    core = Resotocore(send_request, config)

    collector = Collector(core.send_to_resotocore, config)

    # Handle Ctrl+c and other means of termination/shutdown
    resotolib.proc.initializer()
    add_event_listener(EventType.SHUTDOWN, shutdown, blocking=False)

    # Try to increase nofile and nproc limits
    increase_limits()

    web_server_args = {}
    if tls_data:
        web_server_args = {
            "ssl_cert": tls_data.cert_path,
            "ssl_key": tls_data.key_path,
        }
    web_server = WebServer(
        WebApp(mountpoint=Config.resotoworker.web_path),
        web_host=Config.resotoworker.web_host,
        web_port=Config.resotoworker.web_port,
        **web_server_args,
    )
    web_server.daemon = True
    web_server.start()

    core_actions = CoreActions(
        identifier=f"{ArgumentParser.args.subscriber_id}-collector",
        resotocore_uri=resotocore.http_uri,
        resotocore_ws_uri=resotocore.ws_uri,
        actions={
            "collect": {
                "timeout": Config.resotoworker.timeout,
                "wait_for_completion": True,
            },
            "cleanup": {
                "timeout": Config.resotoworker.timeout,
                "wait_for_completion": True,
            },
        },
        message_processor=partial(core_actions_processor, plugin_loader, tls_data, collector),
        tls_data=tls_data,
    )

    task_queue_filter = {}
    if len(Config.resotoworker.collector) > 0:
        task_queue_filter = {"cloud": list(Config.resotoworker.collector)}
    core_tasks = CoreTasks(
        identifier=f"{ArgumentParser.args.subscriber_id}-tagger",
        resotocore_ws_uri=resotocore.ws_uri,
        tasks=["tag"],
        task_queue_filter=task_queue_filter,
        message_processor=core_tag_tasks_processor,
        tls_data=tls_data,
    )
    core_actions.start()
    core_tasks.start()

    for Plugin in plugin_loader.plugins(PluginType.ACTION):
        try:
            log.debug(f"Starting action plugin {Plugin}")
            plugin = Plugin(tls_data=tls_data)
            plugin.start()
        except Exception as e:
            log.exception(f"Caught unhandled persistent Plugin exception {e}")

    # We wait for the shutdown Event to be set() and then end the program
    shutdown_event.wait()
    web_server.shutdown()
    time.sleep(1)  # everything gets 1000ms to shutdown gracefully before we force it
    resotolib.proc.kill_children(resotolib.proc.SIGTERM, ensure_death=True)
    log.info("Shutdown complete")
    os._exit(0)
Example #26
    def vpc_cleanup(self, graph: Graph):
        log.info("AWS VPC cleanup called")
        for node in graph.nodes:
            if node.protected or not node.clean or not isinstance(
                    node, AWSVPC):
                continue

            cloud = node.cloud(graph)
            account = node.account(graph)
            region = node.region(graph)
            log_prefix = (
                f"Found AWS VPC {node.dname} in cloud {cloud.name} account {account.dname} "
                f"region {region.name} marked for cleanup.")

            if self.config and len(self.config) > 0:
                if cloud.id not in self.config or account.id not in self.config[
                        cloud.id]:
                    log.debug((
                        f"{log_prefix} Account not found in config - ignoring dependent resources."
                    ))
                    continue

            vpc_instances = [
                i for i in node.descendants(graph, edge_type=EdgeType.delete)
                if isinstance(i, AWSEC2Instance) and i.instance_status not in (
                    "shutting-down", "terminated") and not i.clean
            ]
            if len(vpc_instances) > 0:
                log_msg = "VPC contains active EC2 instances - not cleaning VPC."
                log.debug(f"{log_prefix} {log_msg}")
                node.log(log_msg)
                node.clean = False
                continue

            log.debug(
                f"{log_prefix} Marking dependent resources for cleanup as well."
            )

            for descendant in node.descendants(graph,
                                               edge_type=EdgeType.delete):
                log.debug(
                    f"Found descendant {descendant.rtdname} of VPC {node.dname}"
                )
                if isinstance(
                        descendant,
                    (
                        AWSVPCPeeringConnection,
                        AWSEC2NetworkAcl,
                        AWSEC2NetworkInterface,
                        AWSELB,
                        AWSALB,
                        AWSALBTargetGroup,
                        AWSEC2Subnet,
                        AWSEC2SecurityGroup,
                        AWSEC2InternetGateway,
                        AWSEC2NATGateway,
                        AWSEC2RouteTable,
                        AWSVPCEndpoint,
                        AWSEC2ElasticIP,
                    ),
                ):
                    descendant.log((
                        f"Marking for cleanup because resource is a descendant of VPC {node.dname} "
                        f"which is set to be cleaned"))
                    node.log(
                        f"Marking {descendant.rtdname} for cleanup because resource is a descendant"
                    )
                    descendant.clean = True
                else:
                    if descendant.clean:
                        log.debug((
                            f"Descendant {descendant.rtdname} of VPC {node.dname} is not targeted but "
                            f"already marked for cleaning"))
                    else:
                        log.error((
                            f"Descendant {descendant.rtdname} of VPC {node.dname} is not targeted and "
                            f"not marked for cleaning - VPC cleanup will likely fail"
                        ))
                        node.log((
                            f"Descendant {descendant.rtdname} is not targeted and not marked for cleaning "
                            f"- cleanup will likely fail"))
Example #27
def shutdown(event: ResotoEvent) -> None:
    log.info("Shutting down")
    shutdown_event.set()
Example #28
    def loadbalancer_cleanup(self, graph: Graph):
        log.info("AWS Loadbalancers Cleanup called")
        for node in graph.nodes:
            if (not isinstance(node, AWSELB) and not isinstance(node, AWSALB)
                    and not isinstance(node, AWSALBTargetGroup)):
                continue

            if node.age < self.age:
                continue

            if node.tags.get("expiration") == "never":
                continue

            cloud = node.cloud(graph)
            account = node.account(graph)
            region = node.region(graph)

            if (isinstance(node, AWSELB) and len([
                    i for i in node.predecessors(graph,
                                                 edge_type=EdgeType.delete)
                    if isinstance(i, AWSEC2Instance)
                    and i.instance_status != "terminated"
            ]) == 0 and len(node.backends) == 0):
                log.debug((
                    f"Found orphaned AWS ELB {node.dname} in cloud {cloud.name} account {account.dname} "
                    f"region {region.name} with age {node.age} and no EC2 instances attached to it."
                ))
                node.clean = True
            elif (isinstance(node, AWSALB) and len([
                    n for n in node.predecessors(graph,
                                                 edge_type=EdgeType.delete)
                    if isinstance(n, AWSALBTargetGroup)
            ]) == 0 and len(node.backends) == 0):
                log.debug((
                    f"Found orphaned AWS ALB {node.dname} in cloud {cloud.name} account {account.dname} "
                    f"region {region.name} with age {node.age} and no Target Groups attached to it."
                ))
                node.clean = True
            elif (isinstance(node, AWSALBTargetGroup) and len(
                    list(node.successors(graph, edge_type=EdgeType.delete)))
                  == 0):
                log.debug((
                    f"Found orphaned AWS ALB Target Group {node.dname} in cloud {cloud.name} "
                    f"account {account.dname} region {region.name} with age {node.age}"
                ))
                node.clean = True
            elif isinstance(node, AWSALB):
                cleanup_alb = True
                target_groups = [
                    n for n in node.predecessors(graph,
                                                 edge_type=EdgeType.delete)
                    if isinstance(n, AWSALBTargetGroup)
                ]

                if len(node.backends) > 0:
                    cleanup_alb = False

                for tg in target_groups:
                    if (tg.target_type != "instance" or tg.age < self.age
                            or len([
                                i for i in tg.predecessors(
                                    graph, edge_type=EdgeType.delete)
                                if isinstance(i, AWSEC2Instance)
                                and i.instance_status != "terminated"
                            ]) > 0):
                        cleanup_alb = False

                if cleanup_alb:
                    log.debug((
                        f"Found AWS ALB {node.dname} in cloud {cloud.name} account {account.dname} "
                        f"region {region.name} with age {node.age} and no EC2 instances attached "
                        f"to its {len(target_groups)} target groups."))
                    for tg in target_groups:
                        tg.clean = True
                    node.clean = True
Example #29
 async def on_start_stop(_: Application) -> AsyncIterator[None]:
     await api.start()
     log.info("Initialization done. Starting API.")
     yield
     log.info("Shutdown initiated. Stop all tasks.")
     await api.stop()
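Registration sketch: aiohttp runs such an async generator as a cleanup context, executing the code before the yield on startup and the code after it on shutdown:

    from aiohttp.web import Application

    app = Application()
    app.cleanup_ctx.append(on_start_stop)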