def mesos_cpu_metrics_provider(marathon_service_config, marathon_tasks, mesos_tasks, **kwargs):
    """Get the average cpu utilization of a service across all of its tasks.

    Utilization is computed as the delta of cumulative cpu-seconds (read from
    Mesos task stats) since the last run, normalized by wall-clock time. The
    previous run's data is persisted in Zookeeper.

    :param marathon_service_config: the MarathonServiceConfig to get data from
    :param marathon_tasks: Marathon tasks to get data from
    :param mesos_tasks: Mesos tasks to get data from
    :raises MetricsProviderNoDataError: if Mesos has no cpu data, or if there
        is no historical data yet (expected on the first run)
    :returns: the service's average utilization, from 0 to 1
    """
    autoscaling_root = compose_autoscaling_zookeeper_root(
        service=marathon_service_config.service,
        instance=marathon_service_config.instance,
    )
    zk_last_time_path = '%s/cpu_last_time' % autoscaling_root
    zk_last_cpu_data = '%s/cpu_data' % autoscaling_root

    with ZookeeperPool() as zk:
        try:
            last_time, _ = zk.get(zk_last_time_path)
            last_cpu_data, _ = zk.get(zk_last_cpu_data)
            last_time = float(last_time)
            last_cpu_data = (datum for datum in last_cpu_data.split(',') if datum)
        except NoNodeError:
            # First run for this service: no historical data in Zookeeper yet.
            last_time = 0.0
            last_cpu_data = []

    mesos_tasks = {task['id']: task.stats for task in mesos_tasks}
    current_time = int(datetime.now().strftime('%s'))
    time_delta = current_time - last_time

    # Cumulative cpu-seconds per task normalized by its cpu quota. The .1
    # subtracted from cpus_limit is presumably the executor's cpu overhead —
    # TODO confirm. Skip tasks whose limit is missing or <= .1: the original
    # `stats.get('cpus_limit', 0) - .1` divided by a non-positive number and
    # produced garbage utilization values.
    mesos_cpu_data = {}
    for task_id, stats in mesos_tasks.items():
        cpu_limit = stats.get('cpus_limit', 0)
        if cpu_limit <= .1:
            continue
        cpu_seconds = stats.get('cpus_system_time_secs', 0.0) + stats.get('cpus_user_time_secs', 0.0)
        mesos_cpu_data[task_id] = float(cpu_seconds) / (cpu_limit - .1)

    if not mesos_cpu_data:
        raise MetricsProviderNoDataError("Couldn't get any cpu data from Mesos")

    cpu_data_csv = ','.join('%s:%s' % (cpu_seconds, task_id) for task_id, cpu_seconds in mesos_cpu_data.items())

    # Persist this run's snapshot so the next run can compute a delta.
    with ZookeeperPool() as zk:
        zk.ensure_path(zk_last_cpu_data)
        zk.ensure_path(zk_last_time_path)
        zk.set(zk_last_cpu_data, str(cpu_data_csv))
        zk.set(zk_last_time_path, str(current_time))

    # Only tasks present in both the previous and current snapshot contribute.
    utilization = {}
    for datum in last_cpu_data:
        last_cpu_seconds, task_id = datum.split(':')
        if task_id in mesos_cpu_data:
            utilization[task_id] = (mesos_cpu_data[task_id] - float(last_cpu_seconds)) / time_delta

    if not utilization:
        raise MetricsProviderNoDataError("""The mesos_cpu metrics provider doesn't have Zookeeper data for 
this service. This is expected for its first run.""")

    task_utilization = utilization.values()
    average_utilization = sum(task_utilization) / len(task_utilization)
    return average_utilization
def get_new_instance_count(
    utilization,
    error,
    autoscaling_params,
    current_instances,
    marathon_service_config,
    num_healthy_instances,
    persist_data: bool,
):
    """Compute the target instance count for a service.

    Runs the configured decision policy, then clamps the result: downscaling
    is limited to 30% of the current count per pass, and the final number is
    bounded by the service's configured instance limits.

    :returns: the new instance count to scale to
    """
    decision_policy = get_decision_policy(autoscaling_params[DECISION_POLICY_KEY])
    zk_path = compose_autoscaling_zookeeper_root(
        service=marathon_service_config.service,
        instance=marathon_service_config.instance,
    )
    scaling_delta = decision_policy(
        utilization=utilization,
        error=error,
        current_instances=current_instances,
        zookeeper_path=zk_path,
        num_healthy_instances=num_healthy_instances,
        persist_data=persist_data,
        **autoscaling_params,
    )

    # Limit downscaling by 30% of current_instances until we find out what is
    # going on in such situations
    downscale_floor = int(current_instances * 0.7)
    proposed = max(current_instances + scaling_delta, downscale_floor)
    return marathon_service_config.limit_instance_count(proposed)
def mesos_cpu_metrics_provider(marathon_service_config, marathon_tasks, mesos_tasks, **kwargs):
    """Get the mean cpu utilization of a service across all of its tasks.

    Utilization is the delta of cumulative cpu-seconds (from Mesos task stats)
    since the previous run, normalized by wall-clock time; the previous run's
    snapshot is stored in Zookeeper.

    :param marathon_service_config: the MarathonServiceConfig to get data from
    :param marathon_tasks: Marathon tasks to get data from
    :param mesos_tasks: Mesos tasks to get data from
    :raises MetricsProviderNoDataError: if Mesos has no cpu data, or if there
        is no historical data yet (expected on the first run)
    :returns: the service's mean utilization, from 0 to 1
    """
    autoscaling_root = compose_autoscaling_zookeeper_root(
        service=marathon_service_config.service,
        instance=marathon_service_config.instance,
    )
    zk_last_time_path = '%s/cpu_last_time' % autoscaling_root
    zk_last_cpu_data = '%s/cpu_data' % autoscaling_root

    with ZookeeperPool() as zk:
        try:
            last_time, _ = zk.get(zk_last_time_path)
            last_cpu_data, _ = zk.get(zk_last_cpu_data)
            last_time = float(last_time)
            last_cpu_data = (datum for datum in last_cpu_data.split(',') if datum)
        except NoNodeError:
            # First run for this service: no historical data in Zookeeper yet.
            last_time = 0.0
            last_cpu_data = []

    mesos_tasks = {task['id']: task.stats for task in mesos_tasks}
    current_time = int(datetime.now().strftime('%s'))
    time_delta = current_time - last_time

    # Cumulative cpu-seconds per task normalized by its cpu quota. The .1
    # subtracted from cpus_limit is presumably the executor's cpu overhead —
    # TODO confirm. Skip tasks whose limit is missing or <= .1: the original
    # `stats.get('cpus_limit', 0) - .1` divided by a non-positive number and
    # produced garbage utilization values.
    mesos_cpu_data = {}
    for task_id, stats in mesos_tasks.items():
        cpu_limit = stats.get('cpus_limit', 0)
        if cpu_limit <= .1:
            continue
        cpu_seconds = stats.get('cpus_system_time_secs', 0.0) + stats.get('cpus_user_time_secs', 0.0)
        mesos_cpu_data[task_id] = float(cpu_seconds) / (cpu_limit - .1)

    if not mesos_cpu_data:
        raise MetricsProviderNoDataError("Couldn't get any cpu data from Mesos")

    cpu_data_csv = ','.join('%s:%s' % (cpu_seconds, task_id) for task_id, cpu_seconds in mesos_cpu_data.items())

    # Persist this run's snapshot so the next run can compute a delta.
    with ZookeeperPool() as zk:
        zk.ensure_path(zk_last_cpu_data)
        zk.ensure_path(zk_last_time_path)
        zk.set(zk_last_cpu_data, str(cpu_data_csv))
        zk.set(zk_last_time_path, str(current_time))

    # Only tasks present in both the previous and current snapshot contribute.
    utilization = {}
    for datum in last_cpu_data:
        last_cpu_seconds, task_id = datum.split(':')
        if task_id in mesos_cpu_data:
            utilization[task_id] = (mesos_cpu_data[task_id] - float(last_cpu_seconds)) / time_delta

    if not utilization:
        raise MetricsProviderNoDataError("""The mesos_cpu metrics provider doesn't have Zookeeper data for this 
service. This is expected for its first run.""")

    task_utilization = utilization.values()
    mean_utilization = mean(task_utilization)
    return mean_utilization
def threshold_decision_policy(marathon_service_config, metrics_provider_method, marathon_tasks, mesos_tasks, delay=600, setpoint=0.8, threshold=0.1, **kwargs):
    """Decide whether to scale a service up or down by one instance.

    Returns +1 when utilization exceeds the setpoint by more than `threshold`,
    -1 when it falls short by more than `threshold`, and 0 otherwise or while
    still inside the `delay` cool-down window since the last decision.
    """
    zk_last_time_path = '%s/threshold_last_time' % compose_autoscaling_zookeeper_root(
        service=marathon_service_config.service,
        instance=marathon_service_config.instance,
    )

    # When the last decision was made, persisted in Zookeeper (0.0 if never).
    with ZookeeperPool() as zk:
        try:
            stored_time, _ = zk.get(zk_last_time_path)
            last_time = float(stored_time)
        except NoNodeError:
            last_time = 0.0

    current_time = int(datetime.now().strftime('%s'))
    if current_time - last_time < delay:
        # Still cooling down from the previous decision.
        return 0

    error = metrics_provider_method(marathon_service_config, marathon_tasks, mesos_tasks, **kwargs) - setpoint

    # Record that a decision was made now, starting a new cool-down window.
    with ZookeeperPool() as zk:
        zk.ensure_path(zk_last_time_path)
        zk.set(zk_last_time_path, str(current_time))

    if error > threshold:
        return 1
    if error < -threshold:
        return -1
    return 0
def pid_decision_policy(marathon_service_config, metrics_provider_method, marathon_tasks, mesos_tasks, delay=600, setpoint=0.8, **kwargs):
    """Use a PID controller to decide how much to autoscale a service.

    See https://en.wikipedia.org/wiki/PID_controller for more information on
    PIDs. Kp, Ki and Kd are the canonical PID constants, where the output of
    the PID is: Kp * error + Ki * integral(error * dt) + Kd * (d(error) / dt)

    :param marathon_service_config: the MarathonServiceConfig to scale
    :param metrics_provider_method: callable returning the service's utilization
    :param delay: minimum number of seconds between decisions (also scales Ki/Kd)
    :param setpoint: target utilization, from 0 to 1
    :returns: a rounded, clamped signed instance-count delta (0 while cooling down)
    """
    # PID gains; the integral and derivative gains are scaled by the
    # sampling period so the controller's response is delay-independent.
    Kp = 0.2
    Ki = 0.2 / delay
    Kd = 0.05 * delay
    autoscaling_root = compose_autoscaling_zookeeper_root(
        service=marathon_service_config.service,
        instance=marathon_service_config.instance,
    )
    # PID state (integral term, previous error, previous timestamp) is
    # persisted in Zookeeper between runs.
    zk_iterm_path = '%s/pid_iterm' % autoscaling_root
    zk_last_error_path = '%s/pid_last_error' % autoscaling_root
    zk_last_time_path = '%s/pid_last_time' % autoscaling_root
    with ZookeeperPool() as zk:
        try:
            iterm, _ = zk.get(zk_iterm_path)
            last_error, _ = zk.get(zk_last_error_path)
            last_time, _ = zk.get(zk_last_time_path)
            iterm = float(iterm)
            last_error = float(last_error)
            last_time = float(last_time)
        except NoNodeError:
            # No persisted state yet: start the controller from zero.
            iterm = 0.0
            last_error = 0.0
            last_time = 0.0
    # Cool-down: make at most one decision per `delay` seconds.
    if int(datetime.now().strftime('%s')) - last_time < delay:
        return 0
    utilization = metrics_provider_method(marathon_service_config, marathon_tasks, mesos_tasks, **kwargs)
    error = utilization - setpoint
    # NOTE(review): this first write persists the *stale* iterm together with
    # the *new* error, and is immediately overwritten below — looks redundant;
    # confirm whether it guards against a crash between the metrics call and
    # the final write.
    with ZookeeperPool() as zk:
        zk.ensure_path(zk_iterm_path)
        zk.ensure_path(zk_last_error_path)
        zk.set(zk_iterm_path, str(iterm))
        zk.set(zk_last_error_path, str(error))
    # NOTE(review): datetime.now() is sampled again here, so time_delta may
    # differ slightly from the value used in the cool-down check above.
    current_time = int(datetime.now().strftime('%s'))
    time_delta = current_time - last_time
    # Accumulate the integral term, clamped to keep it from winding up.
    iterm = clamp_value(iterm + (Ki * error) * time_delta)
    with ZookeeperPool() as zk:
        zk.ensure_path(zk_iterm_path)
        zk.ensure_path(zk_last_error_path)
        zk.ensure_path(zk_last_time_path)
        zk.set(zk_iterm_path, str(iterm))
        zk.set(zk_last_error_path, str(error))
        zk.set(zk_last_time_path, str(current_time))
    # P + I + D, clamped and rounded to a whole instance delta.
    return int(round(clamp_value(Kp * error + iterm + Kd * (error - last_error) / time_delta)))
def autoscale_marathon_instance(marathon_service_config, marathon_tasks, mesos_tasks):
    """Run one autoscaling pass for a single service instance.

    Measures utilization via the configured metrics provider, turns it into an
    error relative to the setpoint, asks the configured decision policy for a
    scaling delta, and applies the (limit-clamped) new instance count. Skips
    the pass entirely when marathon's task count doesn't match the configured
    instance count (deploy in progress or resource starvation).
    """
    current_instances = marathon_service_config.get_instances()

    # Bail out while marathon hasn't converged on the configured count.
    if len(marathon_tasks) != current_instances:
        write_to_log(
            config=marathon_service_config,
            line='Delaying scaling as marathon is either waiting for resources or is delayed',
        )
        return

    autoscaling_params = marathon_service_config.get_autoscaling_params()
    metrics_provider = get_service_metrics_provider(autoscaling_params.pop(SERVICE_METRICS_PROVIDER_KEY))
    decision_policy = get_decision_policy(autoscaling_params.pop(DECISION_POLICY_KEY))

    utilization = metrics_provider(marathon_service_config, marathon_tasks, mesos_tasks, **autoscaling_params)
    error = get_error_from_utilization(
        utilization=utilization,
        setpoint=autoscaling_params.pop('setpoint'),
        current_instances=current_instances,
    )
    zookeeper_path = compose_autoscaling_zookeeper_root(
        service=marathon_service_config.service,
        instance=marathon_service_config.instance,
    )
    scaling_delta = decision_policy(
        error=error,
        min_instances=marathon_service_config.get_min_instances(),
        max_instances=marathon_service_config.get_max_instances(),
        current_instances=current_instances,
        zookeeper_path=zookeeper_path,
        **autoscaling_params,
    )
    new_instance_count = marathon_service_config.limit_instance_count(current_instances + scaling_delta)

    if new_instance_count == current_instances:
        write_to_log(
            config=marathon_service_config,
            line='Staying at %d instances (%s)' % (current_instances, humanize_error(error)),
            level='debug',
        )
        return

    write_to_log(
        config=marathon_service_config,
        line='Scaling from %d to %d instances (%s)' % (current_instances, new_instance_count, humanize_error(error)),
    )
    set_instances_for_marathon_service(
        service=marathon_service_config.service,
        instance=marathon_service_config.instance,
        instance_count=new_instance_count,
    )
def pid_decision_policy(marathon_service_config, error, **kwargs):
    """Use a PID controller to decide how much to autoscale a service.

    See https://en.wikipedia.org/wiki/PID_controller for more information on
    PIDs. Kp, Ki and Kd are the canonical PID constants, where the output of
    the PID is: Kp * error + Ki * integral(error * dt) + Kd * (d(error) / dt)

    :param marathon_service_config: the MarathonServiceConfig to scale
    :param error: the current deviation of utilization from the setpoint,
        computed by the caller
    :returns: a rounded, clamped signed instance-count delta
    """
    # PID gains; the integral and derivative gains are scaled by the global
    # autoscaling period so the controller's response is period-independent.
    Kp = 0.2
    Ki = 0.2 / AUTOSCALING_DELAY
    Kd = 0.05 * AUTOSCALING_DELAY
    autoscaling_root = compose_autoscaling_zookeeper_root(
        service=marathon_service_config.service,
        instance=marathon_service_config.instance,
    )
    # PID state (integral term, previous error, previous timestamp) is
    # persisted in Zookeeper between runs.
    zk_iterm_path = '%s/pid_iterm' % autoscaling_root
    zk_last_error_path = '%s/pid_last_error' % autoscaling_root
    zk_last_time_path = '%s/pid_last_time' % autoscaling_root
    with ZookeeperPool() as zk:
        try:
            iterm, _ = zk.get(zk_iterm_path)
            last_error, _ = zk.get(zk_last_error_path)
            last_time, _ = zk.get(zk_last_time_path)
            iterm = float(iterm)
            last_error = float(last_error)
            last_time = float(last_time)
        except NoNodeError:
            # No persisted state yet: start the controller from zero.
            iterm = 0.0
            last_error = 0.0
            last_time = 0.0
    # NOTE(review): this first write persists the *stale* iterm together with
    # the *new* error, and is immediately overwritten below — looks redundant;
    # confirm whether it guards against a crash before the final write.
    with ZookeeperPool() as zk:
        zk.ensure_path(zk_iterm_path)
        zk.ensure_path(zk_last_error_path)
        zk.set(zk_iterm_path, str(iterm))
        zk.set(zk_last_error_path, str(error))
    current_time = int(datetime.now().strftime('%s'))
    time_delta = current_time - last_time
    # Accumulate the integral term, clamped to keep it from winding up.
    iterm = clamp_value(iterm + (Ki * error) * time_delta)
    with ZookeeperPool() as zk:
        zk.ensure_path(zk_iterm_path)
        zk.ensure_path(zk_last_error_path)
        zk.ensure_path(zk_last_time_path)
        zk.set(zk_iterm_path, str(iterm))
        zk.set(zk_last_error_path, str(error))
        zk.set(zk_last_time_path, str(current_time))
    # P + I + D, clamped and rounded to a whole instance delta.
    return int(
        round(
            clamp_value(Kp * error + iterm + Kd * (error - last_error) / time_delta)))
def autoscale_marathon_instance(marathon_service_config, marathon_tasks, mesos_tasks):
    """Run one autoscaling pass for a single service instance.

    Measures utilization via the configured metrics provider, converts it to
    an error relative to the setpoint, asks the configured decision policy for
    a scaling delta, and applies the (limit-clamped) new instance count.

    :param marathon_service_config: the MarathonServiceConfig to scale
    :param marathon_tasks: the service's current Marathon tasks
    :param mesos_tasks: the service's current Mesos tasks
    """
    current_instances = marathon_service_config.get_instances()
    # Skip the pass while marathon hasn't converged on the configured count
    # (deploy in progress, or waiting on cluster resources).
    if len(marathon_tasks) != current_instances:
        write_to_log(config=marathon_service_config, line='Delaying scaling as marathon is either waiting for resources or is delayed')
        return
    autoscaling_params = marathon_service_config.get_autoscaling_params()
    # pop() the provider/policy keys so only policy-specific params remain
    # for the **autoscaling_params pass-throughs below.
    autoscaling_metrics_provider = get_service_metrics_provider(autoscaling_params.pop(SERVICE_METRICS_PROVIDER_KEY))
    autoscaling_decision_policy = get_decision_policy(autoscaling_params.pop(DECISION_POLICY_KEY))
    utilization = autoscaling_metrics_provider(marathon_service_config, marathon_tasks, mesos_tasks, **autoscaling_params)
    error = get_error_from_utilization(
        utilization=utilization,
        setpoint=autoscaling_params.pop('setpoint'),
        current_instances=current_instances,
    )
    zookeeper_path = compose_autoscaling_zookeeper_root(
        service=marathon_service_config.service,
        instance=marathon_service_config.instance,
    )
    autoscaling_amount = autoscaling_decision_policy(
        error=error,
        min_instances=marathon_service_config.get_min_instances(),
        max_instances=marathon_service_config.get_max_instances(),
        current_instances=current_instances,
        zookeeper_path=zookeeper_path,
        **autoscaling_params
    )
    # Clamp the proposed count to the service's configured instance limits.
    new_instance_count = marathon_service_config.limit_instance_count(current_instances + autoscaling_amount)
    if new_instance_count != current_instances:
        write_to_log(
            config=marathon_service_config,
            line='Scaling from %d to %d instances (%s)' % (
                current_instances, new_instance_count, humanize_error(error)),
        )
        set_instances_for_marathon_service(
            service=marathon_service_config.service,
            instance=marathon_service_config.instance,
            instance_count=new_instance_count,
        )
    else:
        write_to_log(
            config=marathon_service_config,
            line='Staying at %d instances (%s)' % (current_instances, humanize_error(error)),
            level='debug',
        )
def mesos_cpu_metrics_provider(
    marathon_service_config,
    system_paasta_config,
    marathon_tasks,
    mesos_tasks,
    log_utilization_data=None,
    noop=False,
    **kwargs,
):
    """Get the mean cpu utilization of a service across all of its tasks.

    :param marathon_service_config: the MarathonServiceConfig to get data from
    :param system_paasta_config: accepted for provider-interface parity; not
        used in this provider
    :param marathon_tasks: Marathon tasks to get data from
    :param mesos_tasks: Mesos tasks to get data from
    :param log_utilization_data: A dict used to transfer utilization data to
        autoscale_marathon_instance()
    :param noop: if True, skip writing the new snapshot back to Zookeeper
    :raises MetricsProviderNoDataError: if Mesos has no cpu data, or if there
        is no historical data yet (expected on the first run)
    :returns: the service's mean utilization, from 0 to 1
    """
    if log_utilization_data is None:
        # Fix a mutable-default-argument bug: a shared `{}` default would
        # accumulate entries across every call made without this argument.
        log_utilization_data = {}

    autoscaling_root = compose_autoscaling_zookeeper_root(
        service=marathon_service_config.service,
        instance=marathon_service_config.instance,
    )
    zk_last_time_path = "%s/cpu_last_time" % autoscaling_root
    zk_last_cpu_data = "%s/cpu_data" % autoscaling_root

    with ZookeeperPool() as zk:
        try:
            last_time = zk.get(zk_last_time_path)[0].decode("utf8")
            last_cpu_data = zk.get(zk_last_cpu_data)[0].decode("utf8")
            log_utilization_data[last_time] = last_cpu_data
            last_time = float(last_time)
            last_cpu_data = (datum for datum in last_cpu_data.split(",") if datum)
        except NoNodeError:
            # First run for this service: no historical data in Zookeeper yet.
            last_time = 0.0
            last_cpu_data = []

    # Fetch all task stats concurrently, bounded to 60s overall.
    futures = [asyncio.ensure_future(task.stats()) for task in mesos_tasks]
    if futures:
        a_sync.block(asyncio.wait, futures, timeout=60)

    def results_or_None(fut):
        # A failed stats fetch becomes None so one bad task doesn't abort all.
        if fut.exception():
            return None
        else:
            return fut.result()

    mesos_tasks_stats = dict(
        zip(
            [task["id"] for task in mesos_tasks],
            [results_or_None(fut) for fut in futures],
        ))

    current_time = int(datetime.now().strftime("%s"))
    time_delta = current_time - last_time

    # Cumulative cpu-seconds per task, normalized by its cpu quota minus the
    # 0.1 cpu reserved for the executor; tasks with incomplete stats are
    # skipped rather than poisoning the aggregate.
    mesos_cpu_data = {}
    for task_id, stats in mesos_tasks_stats.items():
        if stats is not None:
            try:
                utime = float(stats["cpus_user_time_secs"])
                stime = float(stats["cpus_system_time_secs"])
                limit = float(stats["cpus_limit"]) - 0.1
                mesos_cpu_data[task_id] = (stime + utime) / limit
            except KeyError:
                pass

    if not mesos_cpu_data:
        raise MetricsProviderNoDataError(
            "Couldn't get any cpu data from Mesos")

    cpu_data_csv = ",".join(f"{cpu_seconds}:{task_id}" for task_id, cpu_seconds in mesos_cpu_data.items())
    log_utilization_data[str(current_time)] = cpu_data_csv

    # Persist this run's snapshot so the next run can compute a delta.
    if not noop:
        with ZookeeperPool() as zk:
            zk.ensure_path(zk_last_cpu_data)
            zk.ensure_path(zk_last_time_path)
            zk.set(zk_last_cpu_data, str(cpu_data_csv).encode("utf8"))
            zk.set(zk_last_time_path, str(current_time).encode("utf8"))

    # Only tasks present in both the previous and current snapshot contribute.
    utilization = {}
    for datum in last_cpu_data:
        last_cpu_seconds, task_id = datum.split(":")
        if task_id in mesos_cpu_data:
            cputime_delta = mesos_cpu_data[task_id] - float(last_cpu_seconds)
            utilization[task_id] = cputime_delta / time_delta

    if not utilization:
        raise MetricsProviderNoDataError(
            """The mesos_cpu metrics provider doesn't have Zookeeper data for this service. This is expected for its first run."""
        )

    task_utilization = utilization.values()
    mean_utilization = mean(task_utilization)
    return mean_utilization
def mesos_cpu_ram_metrics_provider(marathon_service_config, marathon_tasks, mesos_tasks, **kwargs):
    """Get the average utilization of a service across all of its tasks.

    The utilization of a task is the maximum of its cpu and ram utilization.
    Cpu utilization is the delta of cumulative cpu-seconds since the previous
    run (stored in Zookeeper) normalized by wall-clock time; ram utilization
    is mem_rss_bytes / mem_limit_bytes.

    :param marathon_service_config: the MarathonServiceConfig to get data from
    :param marathon_tasks: Marathon tasks to get data from
    :param mesos_tasks: Mesos tasks to get data from
    :raises MetricsProviderNoDataError: if neither cpu nor ram data is available
    :returns: the service's average utilization, from 0 to 1
    """
    autoscaling_root = compose_autoscaling_zookeeper_root(
        service=marathon_service_config.service,
        instance=marathon_service_config.instance,
    )
    zk_last_time_path = '%s/cpu_last_time' % autoscaling_root
    zk_last_cpu_data = '%s/cpu_data' % autoscaling_root

    with ZookeeperPool() as zk:
        try:
            last_time, _ = zk.get(zk_last_time_path)
            last_cpu_data, _ = zk.get(zk_last_cpu_data)
            last_time = float(last_time)
            last_cpu_data = (datum for datum in last_cpu_data.split(',') if datum)
        except NoNodeError:
            # First run for this service: no historical cpu data yet.
            last_time = 0.0
            last_cpu_data = []

    mesos_tasks = {task['id']: task.stats for task in mesos_tasks}
    current_time = int(datetime.now().strftime('%s'))
    time_delta = current_time - last_time

    # Cumulative cpu-seconds per task normalized by its cpu quota. The .1
    # subtracted from cpus_limit is presumably the executor's cpu overhead —
    # TODO confirm. Skip tasks whose limit is missing or <= .1: the original
    # `stats.get('cpus_limit', 0) - .1` divided by a non-positive number and
    # produced garbage utilization values.
    mesos_cpu_data = {}
    for task_id, stats in mesos_tasks.items():
        cpu_limit = stats.get('cpus_limit', 0)
        if cpu_limit <= .1:
            continue
        cpu_seconds = stats.get('cpus_system_time_secs', 0.0) + stats.get('cpus_user_time_secs', 0.0)
        mesos_cpu_data[task_id] = float(cpu_seconds) / (cpu_limit - .1)

    # Cpu utilization: only tasks present in both snapshots contribute.
    utilization = {}
    for datum in last_cpu_data:
        last_cpu_seconds, task_id = datum.split(':')
        if task_id in mesos_cpu_data:
            utilization[task_id] = (mesos_cpu_data[task_id] - float(last_cpu_seconds)) / time_delta

    # Ram utilization: take the max of cpu and ram per task.
    for task_id, stats in mesos_tasks.items():
        if stats.get('mem_limit_bytes', 0) != 0:
            utilization[task_id] = max(
                utilization.get(task_id, 0),
                float(stats.get('mem_rss_bytes', 0)) / stats.get('mem_limit_bytes', 0),
            )

    if not utilization:
        raise MetricsProviderNoDataError("Couldn't get any cpu or ram data from Mesos")

    task_utilization = utilization.values()
    average_utilization = sum(task_utilization) / len(task_utilization)

    # Persist this run's cpu snapshot so the next run can compute a delta.
    cpu_data_csv = ','.join('%s:%s' % (cpu_seconds, task_id) for task_id, cpu_seconds in mesos_cpu_data.items())
    with ZookeeperPool() as zk:
        zk.ensure_path(zk_last_cpu_data)
        zk.ensure_path(zk_last_time_path)
        zk.set(zk_last_cpu_data, str(cpu_data_csv))
        zk.set(zk_last_time_path, str(current_time))

    return average_utilization