Example #1
def core_actions_processor(metrics: Metrics, query_uri: str,
                           message: dict) -> Optional[dict]:
    if not isinstance(message, dict):
        log.error(f"Invalid message: {message}")
        return
    kind = message.get("kind")
    message_type = message.get("message_type")
    data = message.get("data")
    log.debug(
        f"Received message of kind {kind}, type {message_type}, data: {data}")
    if kind == "action":
        try:
            if message_type == "generate_metrics":
                start_time = time.time()
                update_metrics(metrics, query_uri)
                run_time = time.time() - start_time
                log.debug(f"Updated metrics for {run_time:.2f} seconds")
            else:
                raise ValueError(f"Unknown message type {message_type}")
        except Exception as e:
            log.exception(f"Failed to {message_type}: {e}")
            reply_kind = "action_error"
        else:
            reply_kind = "action_done"

        reply_message = {
            "kind": reply_kind,
            "message_type": message_type,
            "data": data,
        }
        return reply_message
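
A hedged usage sketch for the handler above; the Metrics() construction and the query URI are placeholders, not values taken from the source:

sample_message = {"kind": "action", "message_type": "generate_metrics", "data": {}}
reply = core_actions_processor(Metrics(), "http://localhost:8900/metrics", sample_message)
# On success: {"kind": "action_done", "message_type": "generate_metrics", "data": {}}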
Example #2
    def __delitem__(self, key):
        if self.parent_resource and isinstance(self.parent_resource,
                                               BaseResource):
            log.debug(f"Calling parent resource to delete tag {key} in cloud")
            try:
                if self.parent_resource.delete_tag(key):
                    log_msg = f"Successfully deleted tag {key} in cloud"
                    self.parent_resource._changes.add("tags")
                    self.parent_resource.log(log_msg)
                    log.info((f"{log_msg} for {self.parent_resource.kind}"
                              f" {self.parent_resource.id}"))
                    return super().__delitem__(key)
                else:
                    log_msg = f"Error deleting tag {key} in cloud"
                    self.parent_resource.log(log_msg)
                    log.error((f"{log_msg} for {self.parent_resource.kind}"
                               f" {self.parent_resource.id}"))
            except Exception as e:
                log_msg = (
                    f"Unhandled exception while trying to delete tag {key} in cloud:"
                    f" {type(e)} {e}")
                self.parent_resource.log(log_msg, exception=e)
                if self.parent_resource._raise_tags_exceptions:
                    raise
                else:
                    log.exception(log_msg)
        else:
            return super().__delitem__(key)
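
A usage sketch, assuming `resource.tags` is an instance of the mapping class above with `parent_resource` set (both names are placeholders):

del resource.tags["owner"]  # deletes the tag in the cloud first, then locally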
Example #3
    def collect_project(project_id: str,
                        args=None,
                        credentials=None) -> Optional[Dict]:
        """Collects an individual project.

        Is being called in collect() and either run within a thread or a spawned
        process. Depending on whether `--gcp-fork` was specified or not.

        Because the spawned process does not inherit any of our memory or file
        descriptors we are passing the already parsed `args` Namespace() to this
        method.
        """
        project = GCPProject(project_id, {})
        collector_name = f"gcp_{project.id}"
        resotolib.signal.set_thread_name(collector_name)

        if args is not None:
            ArgumentParser.args = args
            setup_logger("resotoworker-gcp")

        if credentials is not None:
            Credentials._credentials = credentials
            Credentials._initialized = True

        log.debug(f"Starting new collect process for project {project.dname}")

        try:
            gpc = GCPProjectCollector(project)
            gpc.collect()
        except Exception:
            log.exception(
                f"An unhandled error occurred while collecting {project.rtdname}"
            )
        else:
            return gpc.graph
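
A hedged sketch of the dispatch the docstring describes: collect() presumably picks a thread or a spawned process based on `--gcp-fork`. The attribute name `gcp_fork` and the result plumbing are assumptions; the real collect() likely gathers the returned graphs through a queue rather than the return value.

from multiprocessing import Process
from threading import Thread

# `credentials` stands in for whatever collect() obtained earlier (placeholder).
if ArgumentParser.args.gcp_fork:
    worker = Process(target=collect_project,
                     args=(project_id, ArgumentParser.args, credentials))
else:
    worker = Thread(target=collect_project, args=(project_id,))
worker.start()
worker.join()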
Example #4
    def on_message(self, ws, message):
        try:
            message: Dict = json.loads(message)
        except json.JSONDecodeError:
            log.exception(f"Unable to decode received message {message}")
            return
        self.queue.put(message)
Example #5
def collect_account(account: AWSAccount, regions: List, args=None):
    collector_name = f"aws_{account.id}"
    resotolib.signal.set_thread_name(collector_name)

    if args is not None:
        ArgumentParser.args = args
        setup_logger("resotoworker-aws")

    log.debug(f"Starting new collect process for account {account.dname}")

    aac = AWSAccountCollector(regions, account)
    try:
        aac.collect()
    except botocore.exceptions.ClientError as e:
        log.exception(
            f"An AWS {e.response['Error']['Code']} error occurred while collecting account {account.dname}"
        )
        metrics_unhandled_account_exceptions.labels(
            account=account.dname).inc()
    except Exception:
        log.exception(
            f"An unhandled error occurred while collecting AWS account {account.dname}"
        )
        metrics_unhandled_account_exceptions.labels(
            account=account.dname).inc()

    return aac.graph
Example #6
def dispatch_event(event: Event, blocking: bool = False) -> None:
    """Dispatch an Event"""
    waiting_str = "" if blocking else "not "
    log.debug(
        f"Dispatching event {event.event_type.name} and {waiting_str}waiting for"
        " listeners to return")

    if event.event_type not in _events:
        return

    with _events_lock.read_access:
        # Event listeners might unregister themselves during event dispatch
        # so we will work on a shallow copy while processing the current event.
        listeners = dict(_events[event.event_type])

    threads = {}
    for listener, listener_data in listeners.items():
        if listener_data["pid"] != os.getpid():
            continue

        one_shot_acquired = False
        try:
            if listener_data["one-shot"]:
                if not listener_data["lock"].acquire(blocking=False):
                    log.error(f"Not calling one-shot listener {listener} of"
                              f" type {type(listener)} - can't acquire lock")
                    continue
                one_shot_acquired = True

            log.debug(f"Calling listener {listener} of type {type(listener)}"
                      f" (blocking: {listener_data['blocking']})")
            thread_name = (f"{event.event_type.name.lower()}_event"
                           f"-{getattr(listener, '__name__', 'anonymous')}")
            t = Thread(target=listener, args=[event], name=thread_name)
            if blocking or listener_data["blocking"]:
                threads[t] = listener
            t.start()
        except Exception:
            log.exception("Caught unhandled event callback exception")
        finally:
            # Only clean up a one-shot listener whose lock this dispatcher
            # actually holds; otherwise a skipped or failed acquire would
            # remove the listener and release a lock held elsewhere.
            if one_shot_acquired:
                log.debug(
                    f"One-shot specified for event {event.event_type.name} "
                    f"listener {listener} - removing event listener")
                remove_event_listener(event.event_type, listener)
                listener_data["lock"].release()

    start_time = time.time()
    for thread, listener in threads.items():
        timeout = start_time + listeners[listener]["timeout"] - time.time()
        if timeout < 1:
            timeout = 1
        log.debug(
            f"Waiting up to {timeout:.2f}s for event listener {thread.name} to finish"
        )
        thread.join(timeout)
        log.debug(
            f"Event listener {thread.name} finished (timed out: {thread.is_alive()})"
        )
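
A registration-and-dispatch sketch; add_event_listener is used the same way in Example #22, while the single-argument Event constructor is an assumption:

def on_shutdown(event: Event) -> None:
    log.info(f"Received {event.event_type.name}")

add_event_listener(EventType.SHUTDOWN, on_shutdown, blocking=False)
dispatch_event(Event(EventType.SHUTDOWN), blocking=True)  # Event signature assumed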
Example #7
def validate_dataclass(node: BaseResource):
    for field in fields(node):
        value = getattr(node, field.name)
        try:
            check_type(str(value), value, field.type)
        except TypeError:
            log.exception(
                f"In {node.rtdname} expected {field.name}"
                f" type {field.type} ({type(field.type)})"
                f" for value {value} ({type(value)})"
            )
Example #8
    def worker(self) -> None:
        while not self.shutdown_event.is_set():
            message = self.queue.get()
            log.debug(f"{self.identifier} received: {message}")
            if self.message_processor is not None and callable(self.message_processor):
                try:
                    result = self.message_processor(message)
                    log.debug(f"Sending reply {result}")
                    self.ws.send(json.dumps(result))
                except Exception:
                    log.exception(f"Something went wrong while processing {message}")
            self.queue.task_done()
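
Note that queue.get() blocks indefinitely, so a set shutdown_event alone will not end the loop until one more message arrives. A common remedy is to enqueue a sentinel at shutdown; this helper is a sketch, not part of the original class:

def stop(self) -> None:  # hypothetical helper
    self.shutdown_event.set()
    self.queue.put(None)  # sentinel wakes the blocking get(); the worker should ignore None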
Example #9
    def bootstrap(self) -> bool:
        if ArgumentParser.args.cleanup_volumes:
            try:
                self.age = parse_delta(ArgumentParser.args.cleanup_volumes_age)
                log.debug(f"Volume Cleanup Plugin Age {self.age}")
            except ValueError:
                log.exception(
                    f"Error while parsing Volume Cleanup Age {ArgumentParser.args.cleanup_volumes_age}"
                )
            else:
                return True
        return False
Example #10
    def pre_cleanup(self, graph=None) -> bool:
        if not hasattr(self, "pre_delete"):
            return True

        if graph is None:
            graph = self._graph

        if self.phantom:
            raise RuntimeError(
                f"Can't cleanup phantom resource {self.rtdname}")

        if self.cleaned:
            log.debug(f"Resource {self.rtdname} has already been cleaned up")
            return True

        account = self.account(graph)
        region = self.region(graph)
        if not isinstance(account, BaseAccount) or not isinstance(
                region, BaseRegion):
            log.error(
                ("Could not determine account or region for pre cleanup of"
                 f" {self.rtdname}"))
            return False

        log_suffix = f" in account {account.dname} region {region.name}"
        self.log("Trying to run pre clean up")
        log.debug(f"Trying to run pre clean up {self.rtdname}{log_suffix}")
        try:
            if not getattr(self, "pre_delete")(graph):
                self.log("Failed to run pre clean up")
                log.error(
                    f"Failed to run pre clean up {self.rtdname}{log_suffix}")
                return False
            self.log("Successfully ran pre clean up")
            log.info(
                f"Successfully ran pre clean up {self.rtdname}{log_suffix}")
        except Exception as e:
            self.log("An error occurred during pre clean up", exception=e)
            log.exception(
                f"An error occurred during pre clean up {self.rtdname}{log_suffix}"
            )
            cloud = self.cloud(graph)
            metrics_resource_pre_cleanup_exceptions.labels(
                cloud=cloud.name,
                account=account.dname,
                region=region.name,
                kind=self.kind,
            ).inc()
            return False
        return True
Example #11
    def on_message(self, ws, message):
        try:
            message: Dict = json.loads(message)
        except json.JSONDecodeError:
            log.exception(f"Unable to decode received message {message}")
            return
        log.debug(f"{self.identifier} received: {message}")
        if self.message_processor is not None and callable(self.message_processor):
            try:
                result = self.message_processor(message)
                log.debug(f"Sending reply {result}")
                ws.send(json.dumps(result))
            except Exception:
                log.exception(f"Something went wrong while processing {message}")
Example #12
    def bootstrap(self) -> bool:
        if ArgumentParser.args.cleanup_aws_loadbalancers:
            try:
                self.age = parse_delta(
                    ArgumentParser.args.cleanup_aws_loadbalancers_age)
                log.debug(f"AWS Loadbalancer Cleanup Plugin Age {self.age}")
            except ValueError:
                log.exception(
                    f"Error while parsing AWS Loadbalancer "
                    f"Cleanup Age {ArgumentParser.args.cleanup_aws_loadbalancers_age}"
                )
            else:
                return True
        return False
Example #13
    def catch_and_log(*args, **kwargs):
        try:
            return f(*args, **kwargs)
        except do_raise:
            raise
        except Exception:
            args_str = ", ".join([repr(arg) for arg in args])
            kwargs_str = ", ".join(
                [f"{k}={repr(v)}" for k, v in kwargs.items()])
            if len(args) > 0 and len(kwargs) > 0:
                args_str += ", "
            log.exception(
                f"Caught exception in {f.__name__}({args_str}{kwargs_str})"
            )
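
This wrapper closes over `f` and `do_raise`, so it is presumably the inner function of a decorator factory. A minimal sketch of that enclosing factory; the name `except_log_and_pass` and its signature are assumptions, not a confirmed API:

from functools import wraps

def except_log_and_pass(do_raise: tuple = ()):  # assumed name and signature
    def decorator(f):
        @wraps(f)
        def catch_and_log(*args, **kwargs):
            ...  # body as shown above
        return catch_and_log
    return decorator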
Example #14
    def clean(self, node: BaseResource) -> None:
        log_prefix = f"Resource {node.rtdname} is marked for removal"
        if ArgumentParser.args.cleanup_dry_run:
            log.debug(
                f"{log_prefix}, not calling cleanup method because of dry run flag"
            )
            return

        log.debug(f"{log_prefix}, calling cleanup method")
        try:
            node.cleanup(self.graph)
        except Exception:
            log.exception(
                f"An exception occurred when running resource cleanup on {node.rtdname}"
            )
Example #15
    def collect_team(self, client: StreamingWrapper) -> Optional[Dict]:
        """Collects an individual team."""
        projects = client.list_projects()
        team_id = str(projects[0]["owner_id"])
        team = DigitalOceanTeam(id=team_id, tags={}, urn=f"do:team:{team_id}")

        try:
            dopc = DigitalOceanTeamCollector(team, client)
            dopc.collect()
        except Exception:
            log.exception(
                f"An unhandled error occurred while collecting team {team_id}"
            )
        else:
            return dopc.graph
Example #16
    def add_edge(
        self,
        src: BaseResource,
        dst: BaseResource,
        key: Optional[EdgeKey] = None,
        edge_type: Optional[EdgeType] = None,
        **attr,
    ):
        if src is None or dst is None:
            log.error(f"Not creating edge from or to NoneType: {src} to {dst}")
            return

        if edge_type is None:
            edge_type = EdgeType.default
        if key is None:
            key = EdgeKey(src=src, dst=dst, edge_type=edge_type)

        if self.has_edge(src, dst, key=key):
            log.error(f"Edge from {src} to {dst} already exists in graph")
            return
        return_key = super().add_edge(src, dst, key=key, **attr)
        if (
            self._log_edge_creation
            and isinstance(src, BaseResource)
            and isinstance(dst, BaseResource)
        ):
            log.debug(
                f"Added edge from {src.rtdname} to {dst.rtdname} (type: {edge_type.value})"
            )
            try:
                src.successor_added(dst, self)
            except Exception:
                log.exception(
                    (
                        f"Unhandled exception while telling {src.rtdname}"
                        f" that {dst.rtdname} was added as a successor"
                    )
                )
            try:
                dst.predecessor_added(src, self)
            except Exception:
                log.exception(
                    (
                        f"Unhandled exception while telling {dst.rtdname}"
                        f" that {src.rtdname} was added as a predecessor"
                    )
                )
        return return_key
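
A hypothetical call, assuming `graph` is an instance of the graph class this method belongs to and that both resources are already nodes in it:

graph.add_edge(instance, volume, edge_type=EdgeType.default)  # resource names are placeholders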
Example #17
def get_stats(graph=None) -> Dict:
    try:
        stats = {
            "active_threads": threading.active_count(),
            "thread_names": [thread.name for thread in threading.enumerate()],
            "garbage_collector": garbage_collector.get_stats(),
            "process": get_all_process_info(),
        }
        if sys.platform == "linux":
            stats.update({
                "maxrss_parent_bytes":
                    resource.getrusage(resource.RUSAGE_SELF).ru_maxrss * 1024,
                "maxrss_children_bytes":
                    resource.getrusage(resource.RUSAGE_CHILDREN).ru_maxrss * 1024,
            })
        else:
            stats.update({"maxrss_parent_bytes": 0, "maxrss_children_bytes": 0})
        stats["maxrss_total_bytes"] = (stats["maxrss_parent_bytes"] +
                                       stats["maxrss_children_bytes"])
        num_fds_parent = (
            stats["process"].get("parent", {}).get("num_file_descriptors", 0))
        num_fds_children = sum(
            v["num_file_descriptors"]
            for v in stats["process"].get("children", {}).values())
        stats.update({
            "maxrss_parent_human_readable":
                iec_size_format(stats["maxrss_parent_bytes"]),
            "maxrss_children_human_readable":
                iec_size_format(stats["maxrss_children_bytes"]),
            "maxrss_total_human_readable":
                iec_size_format(stats["maxrss_total_bytes"]),
            "num_fds_parent": num_fds_parent,
            "num_fds_children": num_fds_children,
            "num_fds_total": num_fds_parent + num_fds_children,
        })
    except Exception:
        log.exception("Error while trying to get stats")
        return {}
    else:
        return stats
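
For quick inspection the returned mapping serializes cleanly (a usage sketch; the json import is added here, not taken from the source):

import json

print(json.dumps(get_stats(), indent=2, default=str))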
Example #18
    def pre_clean(self, node: BaseResource) -> None:
        if not hasattr(node, "pre_delete"):
            return

        log_prefix = f"Resource {node.rtdname} is marked for removal"
        if ArgumentParser.args.cleanup_dry_run:
            log.debug(
                f"{log_prefix}, not calling pre cleanup method because of dry run flag"
            )
            return

        log.debug(f"{log_prefix}, calling pre cleanup method")
        try:
            node.pre_cleanup(self.graph)
        except Exception:
            log.exception(
                ("An exception occurred when running resource pre cleanup on"
                 f" {node.rtdname}"))
Example #19
    def search_first_parent_class(self, node, cls):
        """Return the first parent node matching a certain class

        This is being used to search up the graph and e.g. find the account that the
        graph node is a member of.
        """
        ret = None
        try:
            for predecessor_node in list(self.predecessors(node)):
                if isinstance(predecessor_node, cls):
                    ret = predecessor_node
                else:
                    ret = self.search_first_parent_class(predecessor_node, cls)
                if ret:
                    break
        except RecursionError:
            log.exception(
                f"Recursive search error triggered for node {node}'s parent class {cls}"
            )
        return ret
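
For instance, a node's enclosing account could be looked up like this (a hypothetical call, assuming `graph` is the containing graph object and BaseAccount is imported):

account = graph.search_first_parent_class(node, BaseAccount)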
Example #20
def log_stats(graph=None, garbage_collector_stats: bool = False) -> None:
    stats = get_stats(graph)
    try:
        log.debug(
            f"Stats: max rss parent: {stats['maxrss_parent_human_readable']},"
            f" children: {stats['maxrss_children_human_readable']},"
            f" fds: {stats['num_fds_total']}/"
            f"{stats['process'].get('parent', {}).get('rlimit_nofile', [0])[0]}"
            f" active threads {stats['active_threads']}:"
            f" {', '.join([thread for thread in stats['thread_names']])}")
        if graph:
            log.debug(f"Graph Stats: {stats['graph_size_human_readable']}")
        if garbage_collector_stats:
            gc_stats = " | ".join([
                (f"Gen {i}: collections {data.get('collections')}, "
                 f"collected {data.get('collected')}, "
                 f"uncollectable {data.get('uncollectable')}")
                for i, data in enumerate(stats["garbage_collector"])
            ])
            log.debug(f"Garbage Collector Stats: {gc_stats}")
    except Exception:
        log.exception("Error while trying to log stats")
Example #21
def core_actions_processor(
    collectors: List[BaseCollectorPlugin], message: Dict
) -> Optional[Dict]:
    if not isinstance(message, dict):
        log.error(f"Invalid message: {message}")
        return
    kind = message.get("kind")
    message_type = message.get("message_type")
    data = message.get("data")
    log.debug(f"Received message of kind {kind}, type {message_type}, data: {data}")
    if kind == "action":
        try:
            if message_type == "collect":
                start_time = time.time()
                collect_and_send(collectors)
                run_time = int(time.time() - start_time)
                log.info(f"Collect ran for {run_time} seconds")
            elif message_type == "cleanup":
                start_time = time.time()
                cleanup()
                run_time = int(time.time() - start_time)
                log.info(f"Cleanup ran for {run_time} seconds")
            else:
                raise ValueError(f"Unknown message type {message_type}")
        except Exception as e:
            log.exception(f"Failed to {message_type}: {e}")
            reply_kind = "action_error"
        else:
            reply_kind = "action_done"

        reply_message = {
            "kind": reply_kind,
            "message_type": message_type,
            "data": data,
        }
        return reply_message
Example #22
def main() -> None:
    setup_logger("resotoworker")
    # Try to run in a new process group and
    # ignore if not possible for whatever reason
    try:
        os.setpgid(0, 0)
    except Exception:
        pass

    resotolib.signal.parent_pid = os.getpid()

    # Add cli args
    # The following double parsing of cli args is done so that when
    # a user specifies e.g. `--collector aws --help` they are no longer
    # shown cli args for other collectors like gcp.
    collector_arg_parser = ArgumentParser(
        description="resoto worker",
        env_args_prefix="RESOTOWORKER_",
        add_help=False,
        add_machine_help=False,
    )
    PluginLoader.add_args(collector_arg_parser)
    (args, _) = collector_arg_parser.parse_known_args()
    ArgumentParser.args = args

    arg_parser = ArgumentParser(
        description="resoto worker",
        env_args_prefix="RESOTOWORKER_",
    )
    jwt_add_args(arg_parser)
    logging_add_args(arg_parser)
    graph_add_args(arg_parser)
    collect_add_args(arg_parser)
    cleanup_add_args(arg_parser)
    core_add_args(arg_parser)
    resotocore_add_args(arg_parser)
    CoreActions.add_args(arg_parser)
    WebApp.add_args(arg_parser)
    PluginLoader.add_args(arg_parser)
    event_add_args(arg_parser)
    add_args(arg_parser)

    # Find resoto Plugins in the resoto.plugins module
    plugin_loader = PluginLoader()
    plugin_loader.add_plugin_args(arg_parser)

    # At this point the CLI, all Plugins as well as the WebServer have
    # added their args to the arg parser
    arg_parser.parse_args()

    # Handle Ctrl+c and other means of termination/shutdown
    resotolib.signal.initializer()
    add_event_listener(EventType.SHUTDOWN, shutdown, blocking=False)

    # Try to increase nofile and nproc limits
    increase_limits()

    web_server = WebServer(WebApp())
    web_server.daemon = True
    web_server.start()

    core_actions = CoreActions(
        identifier=f"{ArgumentParser.args.resotocore_subscriber_id}-collect_cleanup",
        resotocore_uri=ArgumentParser.args.resotocore_uri,
        resotocore_ws_uri=ArgumentParser.args.resotocore_ws_uri,
        actions={
            "collect": {
                "timeout": ArgumentParser.args.timeout,
                "wait_for_completion": True,
            },
            "cleanup": {
                "timeout": ArgumentParser.args.timeout,
                "wait_for_completion": True,
            },
        },
        message_processor=partial(
            core_actions_processor, plugin_loader.plugins(PluginType.COLLECTOR)
        ),
    )

    task_queue_filter = {}
    if ArgumentParser.args.collector and len(ArgumentParser.args.collector) > 0:
        task_queue_filter = {"cloud": list(ArgumentParser.args.collector)}
    core_tasks = CoreTasks(
        identifier="workerd-tasks",
        resotocore_ws_uri=ArgumentParser.args.resotocore_ws_uri,
        tasks=["tag"],
        task_queue_filter=task_queue_filter,
        message_processor=core_tag_tasks_processor,
    )
    core_actions.start()
    core_tasks.start()

    for Plugin in plugin_loader.plugins(PluginType.ACTION):
        try:
            log.debug(f"Starting action plugin {Plugin}")
            plugin = Plugin()
            plugin.start()
        except Exception as e:
            log.exception(f"Caught unhandled persistent Plugin exception {e}")

    # We wait for the shutdown Event to be set() and then end the program
    # While doing so we print the list of active threads once per 15 minutes
    shutdown_event.wait()
    web_server.shutdown()
    time.sleep(1)  # everything gets 1000ms to shutdown gracefully before we force it
    resotolib.signal.kill_children(resotolib.signal.SIGTERM, ensure_death=True)
    log.info("Shutdown complete")
    os._exit(0)
Example #23
def main() -> None:
    setup_logger("resotoshell")
    shutdown_event = Event()
    arg_parser = ArgumentParser(description="resoto shell",
                                env_args_prefix="RESOTOSHELL_")
    add_args(arg_parser)
    logging_add_args(arg_parser)
    jwt_add_args(arg_parser)
    arg_parser.parse_args()

    headers = {"Accept": "text/plain"}
    execute_endpoint = f"{ArgumentParser.args.resotocore_uri}/cli/execute"
    execute_endpoint += f"?resoto_session_id={rnd_str()}"
    if ArgumentParser.args.resotocore_graph:
        query_string = urlencode(
            {"graph": ArgumentParser.args.resotocore_graph})
        execute_endpoint += f"&{query_string}"
    if ArgumentParser.args.resotocore_section:
        query_string = urlencode(
            {"section": ArgumentParser.args.resotocore_section})
        execute_endpoint += f"&{query_string}"

    if ArgumentParser.args.stdin:
        shell = Shell(execute_endpoint, False, "monochrome")
        log.debug("Reading commands from STDIN")
        try:
            for command in sys.stdin.readlines():
                command = command.rstrip()
                shell.handle_command(command, headers)
        except KeyboardInterrupt:
            pass
        except (RuntimeError, ValueError) as e:
            log.error(e)
        except Exception:
            log.exception(
                "Caught unhandled exception while processing CLI command")
        finally:
            shutdown_event.set()
    else:
        shell = Shell(execute_endpoint, True, detect_color_system())
        completer = None
        history_file = str(pathlib.Path.home() / ".resotoshell_history")
        history = FileHistory(history_file)
        session = PromptSession(history=history)
        log.debug("Starting interactive session")

        while not shutdown_event.is_set():
            try:
                command = session.prompt("> ", completer=completer)
                if command == "":
                    continue
                if command == "quit":
                    shutdown_event.set()
                    continue

                shell.handle_command(command, headers)

            except KeyboardInterrupt:
                pass
            except EOFError:
                shutdown_event.set()
            except (RuntimeError, ValueError) as e:
                log.error(e)
            except Exception:
                log.exception(
                    "Caught unhandled exception while processing CLI command")

    sys.exit(0)