Example #1
def __init__(self, cli_args, logger=None):
    self.args = cli_args
    self.logger = logger or logging.getLogger(__name__)
    self.mesos = MesosMaster(cli_args.mesos_uri)
    self.marathon = Marathon(
        cli_args.marathon_uri,
        (cli_args.marathon_user, cli_args.marathon_pass)
        if cli_args.marathon_user and cli_args.marathon_pass else None)
    self.auto_scaler = AutoScaler(self.marathon, cli_args=cli_args)
    self.cpu_fan_out = cli_args.cpu_fan_out
    self.datadog_client = DatadogClient(cli_args)
    self.agent_port = cli_args.agent_port
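
The `__init__` shown here (and in the examples that follow) consumes an argparse-style `cli_args` namespace. Below is a minimal sketch of how such a namespace might be assembled; only the attribute names come from the code above, while the flag spellings and defaults are illustrative assumptions:

import argparse

parser = argparse.ArgumentParser(description="Marathon autoscaler poller")
# Attribute names mirror those read by __init__; flags and defaults are hypothetical.
parser.add_argument("--mesos-uri", dest="mesos_uri", required=True)
parser.add_argument("--marathon-uri", dest="marathon_uri", required=True)
parser.add_argument("--marathon-user", dest="marathon_user", default=None)
parser.add_argument("--marathon-pass", dest="marathon_pass", default=None)
parser.add_argument("--cpu-fan-out", dest="cpu_fan_out", type=int, default=None)
parser.add_argument("--agent-port", dest="agent_port", type=int, default=5051)
parser.add_argument("--sleep-interval", dest="sleep_interval", type=int, default=5)
cli_args = parser.parse_args()
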
Example #2
def __init__(self, cli_args, logger=None):
    self.args = cli_args
    self.logger = logger or logging.getLogger(__name__)
    self.mesos = MesosMaster(cli_args.mesos_uri)
    self.marathon = Marathon(cli_args.marathon_uri,
                             (cli_args.marathon_user, cli_args.marathon_pass))
    self.auto_scaler = AutoScaler(self.marathon, cli_args=cli_args)
    self.cpu_fan_out = cli_args.cpu_fan_out
    self.datadog_client = DatadogClient(cli_args)
Example #3
class Poller:
    """
    A polling mechanism for gathering metrics data for the Autoscaler's decision engine.
    """

    def __init__(self, cli_args, logger=None):
        self.args = cli_args
        self.logger = logger or logging.getLogger(__name__)
        self.mesos = MesosMaster(cli_args.mesos_uri)
        self.marathon = Marathon(cli_args.marathon_uri,
            (cli_args.marathon_user, cli_args.marathon_pass)
            if cli_args.marathon_user and cli_args.marathon_pass
            else None
        )
        self.auto_scaler = AutoScaler(self.marathon, cli_args=cli_args)
        self.cpu_fan_out = cli_args.cpu_fan_out
        self.datadog_client = DatadogClient(cli_args)
        self.agent_port = cli_args.agent_port

    def poll(self, mesos_master, marathon_client, poll_time_span=3, cpu_fan_out=None):
        """ The main method for forming the applications metrics data object. A call to Marathon retrieves
        all applications, a call to Mesos Master retrieves all slaves, each slave node is queried for its
        statistics (metrics) twice (to acquire a differential), differentials are calculated, additional
        metrics are added (max_cpu, max_mem). All of this data is collected in a single dictionary object
        and returned to caller.
        :param mesos_master: A Mesos Master Client
        :param marathon_client: A Marathon Client
        :param poll_time_span: Time (in seconds) between polling events
        :param cpu_fan_out: Max number of processes in MP Pools
        :return: A dictionary object containing all Application metric data for 2 consecutive polls
        """
        try:
            marathon_apps = marathon_client.get_all_apps().get("apps")
            slaves = mesos_master.get_slaves()
            agent_hosts = [slave.get("hostname") for slave in slaves.get("slaves")]
        except Exception as ex:
            self.logger.error(ex)
            self.logger.fatal("Marathon data could not be retrieved!")
            return

        executor_stats = defaultdict(list)

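        # Sample every agent twice, poll_time_span seconds apart, so per-executor deltas can be computed.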
        for _ in range(2):
            agent_hosts_stats = {}
            fan_out_procs = cpu_fan_out or len(agent_hosts)
            maxtasks = max(len(agent_hosts) // fan_out_procs, 1)
            pool = Pool(processes=fan_out_procs, maxtasksperchild=maxtasks)
            for agent_host, stats in pool.imap_unordered(get_mesos_agent_statistics, agent_hosts):
                self.logger.debug((agent_host, stats))
                agent_hosts_stats[agent_host] = stats
            pool.close()
            pool.join()

            for host, host_stats in agent_hosts_stats.items():
                if host_stats is None:
                    continue
                for executor in host_stats:
                    data = {"host": host, "stats": executor["statistics"]}
                    executor_stats[executor["executor_id"]].append(data)
            sleep(poll_time_span)

        self.logger.info("Stats differentials collected.")
        all_diffs = {}

        for key, stat in executor_stats.items():

            if len(stat) != 2:
                continue

            host = stat[0].get("host")
            first_stats = stat[0].get("stats")
            second_stats = stat[1].get("stats")

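            # CPU usage = CPU-seconds consumed between the two samples per wall-clock
            # second, as a percentage; memory usage = RSS over the memory limit.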
            if "timestamp" in second_stats and "timestamp" in first_stats:
                sys_cpu_delta = (second_stats["cpus_system_time_secs"] -
                                 first_stats["cpus_system_time_secs"])
                user_cpu_delta = (second_stats["cpus_user_time_secs"] -
                                  first_stats["cpus_user_time_secs"])
                timestamp_delta = second_stats["timestamp"] - first_stats["timestamp"]
                mem_total = first_stats["mem_limit_bytes"]
                mem_used = second_stats["mem_rss_bytes"]
                cpu_total_usage = ((sys_cpu_delta + user_cpu_delta) / timestamp_delta) * 100
                memory_total_usage = (float(mem_used) / mem_total) * 100

                diffs = dict(timestamp=timestamp_delta,
                             cpus_system_time_secs=sys_cpu_delta,
                             cpus_user_time_secs=user_cpu_delta,
                             cpu_total_usage=cpu_total_usage,
                             memory_total_usage=memory_total_usage,
                             host=host,
                             executor_id=key)
                all_diffs[key] = diffs
            else:
                self.logger.error("Timestamps were not found in stats from host: {0}".format(host))

        app_metric_map = defaultdict(list)

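        # Executor ids are typically "<app id>.<instance uuid>", so the prefix before the
        # first dot groups executors back to their Marathon application.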
        for key, diff in all_diffs.items():
            app_metric_map[key.split(".")[0]].append(diff)

        app_metric_summation = {}

        for app, metrics in app_metric_map.items():

            metric_sums = {}

            cpu_values = [metric["cpu_total_usage"] for metric in metrics]
            metric_sums["cpu_avg_usage"] = sum(cpu_values) / len(cpu_values)

            metric_sums["max_cpu"] = max([(metric["cpu_total_usage"],
                                           metric["executor_id"],
                                           metric["host"]) for metric in metrics])

            memory_values = [metric["memory_total_usage"] for metric in metrics]
            metric_sums["memory_avg_usage"] = sum(memory_values) / len(memory_values)

            metric_sums["max_memory"] = max([(metric["memory_total_usage"],
                                              metric["executor_id"],
                                              metric["host"]) for metric in metrics])

            metric_sums["executor_metrics"] = metrics
            metric_sums["application_definition"] = next((appdef for appdef in marathon_apps
                                                          if app.replace("_", "/") == appdef.get("id")), {})
            app_metric_summation[app] = metric_sums

        return app_metric_summation

    def start(self):
        """
        This is the entry method for the Poller class.
        :return: None
        """
        self.logger.info("Mesos and Marathon Connections Established.")
        while True:
            polled_stats = self.poll(self.mesos, self.marathon, cpu_fan_out=self.cpu_fan_out)

            if polled_stats is not None:
                self.datadog_client.send_datadog_metrics(polled_stats)
                self.auto_scaler.decide(polled_stats)
                self.update_autoscaler_metrics(polled_stats)
                self.logger.info("Decisions are completed.")
            else:
                self.logger.fatal("Poller unable to reach Marathon/Mesos!")

            sleep(self.args.sleep_interval)

    def update_autoscaler_metrics(self, stats):
        """
        * Number of participating applications
        * Total min instances
        * Total max instances
        * Total number of currently running instances (only participating applications)
        * Number of currently running instances per application
        * Number of scale up events
        * Number of scale down events
        * Number of flap detection events
        :param stats: dict object containing all application metric information specific to this polling event
        :return:
        """
        participating_applications = [{"app": app,
                                       "current_instances": int(items["application_definition"]["instances"]),
                                       "min_instances": int(items["application_definition"]["labels"]["min_instances"]),
                                       "max_instances": int(items["application_definition"]["labels"]["max_instances"])}
                                      for app, items in stats.items()
                                      if ApplicationDefinition(items["application_definition"]).is_app_participating]

        # total_participating_applications = len(participating_applications)
        # total_min_instances = sum([app["min_instances"] for app in participating_applications])
        # total_max_instances = sum([app["max_instances"] for app in participating_applications])
        # total_current_instances = sum([app["current_instances"] for app in participating_applications])

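        # Emit one counter per participating application for min, max, and current instances.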
        for app in participating_applications:
            self.datadog_client.send_counter_event(
                app["app"],
                "marathon_autoscaler.counters.min_instances",
                points=app["min_instances"])
        for app in participating_applications:
            self.datadog_client.send_counter_event(
                app["app"],
                "marathon_autoscaler.counters.max_instances",
                points=app["max_instances"])
        for app in participating_applications:
            self.datadog_client.send_counter_event(
                app["app"],
                "marathon_autoscaler.counters.current_instances",
                points=app["current_instances"])
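
The core of poll() is the two-sample differential. Below is a self-contained sketch of the same arithmetic for a single executor, with made-up sample values; the helper name is hypothetical and not part of the project:

def executor_usage(first_stats, second_stats):
    """Reproduce poll()'s per-executor differential math."""
    timestamp_delta = second_stats["timestamp"] - first_stats["timestamp"]
    cpu_delta = ((second_stats["cpus_system_time_secs"] - first_stats["cpus_system_time_secs"]) +
                 (second_stats["cpus_user_time_secs"] - first_stats["cpus_user_time_secs"]))
    cpu_total_usage = (cpu_delta / timestamp_delta) * 100
    memory_total_usage = (float(second_stats["mem_rss_bytes"]) / first_stats["mem_limit_bytes"]) * 100
    return cpu_total_usage, memory_total_usage

first = {"timestamp": 100.0, "cpus_system_time_secs": 1.0,
         "cpus_user_time_secs": 4.0, "mem_limit_bytes": 512 * 1024 ** 2}
second = {"timestamp": 103.0, "cpus_system_time_secs": 1.3,
          "cpus_user_time_secs": 5.2, "mem_rss_bytes": 128 * 1024 ** 2}
print(executor_usage(first, second))  # (50.0, 25.0)

Each executor thus reports the share of one CPU it consumed over the sampling window and its resident memory as a share of its limit; these are the figures the decision engine receives.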