def get_instances_from_zookeeper(service, instance):
    """Return the stored instance count for a service/instance pair.

    Reads the ``.../instances`` node under the autoscaling Zookeeper root
    and converts its payload to an int.
    """
    node_path = '%s/instances' % compose_autoscaling_zookeeper_root(service, instance)
    with ZookeeperPool() as zookeeper_client:
        raw_value, _stat = zookeeper_client.get(node_path)
        return int(raw_value)
def set_boost_factor(
    zk_boost_path: str,
    region: str='',
    pool: str='',
    send_clusterman_metrics: bool=False,
    factor: float=DEFAULT_BOOST_FACTOR,
    duration_minutes: int=DEFAULT_BOOST_DURATION,
    override: bool=False,
) -> bool:
    """Record a boost factor under ``zk_boost_path`` in Zookeeper.

    Works for both the cluster and the service autoscaler. For cluster
    boosts, pass ``region``, ``pool`` and ``send_clusterman_metrics=True``
    so that clusterman metrics are updated; for service boosts the
    ``zk_boost_path`` alone is sufficient.

    :returns: True when the boost was written and read back successfully,
        False when the factor is invalid or a boost is already active.
    """
    if factor < MIN_BOOST_FACTOR:
        log.error(f'Cannot set a boost factor smaller than {MIN_BOOST_FACTOR}')
        return False

    # Out-of-range values are clamped (with a warning) rather than rejected.
    if factor > MAX_BOOST_FACTOR:
        log.warning(f'Boost factor {factor} does not sound reasonable. Defaulting to {MAX_BOOST_FACTOR}')
        factor = MAX_BOOST_FACTOR

    if duration_minutes > MAX_BOOST_DURATION:
        log.warning(f'Boost duration of {duration_minutes} minutes is too much. Falling back to {MAX_BOOST_DURATION}.')
        duration_minutes = MAX_BOOST_DURATION

    now = get_time()
    expires_at = now + 60 * duration_minutes

    if clusterman_metrics and send_clusterman_metrics:
        cluster = load_system_paasta_config().get_cluster()
        metrics_client = clusterman_metrics.ClustermanMetricsBotoClient(
            region_name=region, app_identifier='default',
        )
        with metrics_client.get_writer(clusterman_metrics.APP_METRICS) as writer:
            metrics_key = clusterman_metrics.generate_key_with_dimensions(
                'boost_factor', {'cluster': cluster, 'pool': pool},
            )
            writer.send((metrics_key, now, factor))
            # Also record when the boost is expected to fall back to 1.0.
            if duration_minutes > 0:
                writer.send((metrics_key, expires_at, 1.0))

    end_time_node = zk_boost_path + '/end_time'
    factor_node = zk_boost_path + '/factor'
    expected_load_node = zk_boost_path + '/expected_load'

    with ZookeeperPool() as zk:
        if not override:
            # Never silently replace a boost that is still running.
            if now < get_boost_values(zk_boost_path, zk).end_time:
                log.error('Boost already active. Not overriding.')
                return False

        try:
            for node in (end_time_node, factor_node, expected_load_node):
                zk.ensure_path(node)
            zk.set(end_time_node, str(expires_at).encode('utf-8'))
            zk.set(factor_node, str(factor).encode('utf-8'))
            zk.set(expected_load_node, b'0')
        except Exception:
            log.error('Error setting the boost in Zookeeper')
            raise

        log.info('Load boost: Set capacity boost factor {} at path {} until {}'.format(
            factor,
            zk_boost_path,
            datetime.fromtimestamp(expires_at).strftime('%c'),
        ))

        # Confirm the write round-trips before reporting success.
        return get_boost_values(zk_boost_path, zk) == BoostValues(
            end_time=expires_at,
            boost_factor=factor,
            expected_load=0,
        )
def mesos_cpu_metrics_provider(marathon_service_config, marathon_tasks, mesos_tasks, **kwargs):
    """ Gets the average utilization of a service across all of its tasks,
    where the utilization of a task is the maximum value between its cpu
    and ram utilization.

    :param marathon_service_config: the MarathonServiceConfig to get data from
    :param marathon_tasks: Marathon tasks to get data from
    :param mesos_tasks: Mesos tasks to get data from

    :returns: the service's average utilization, from 0 to 1

    :raises MetricsProviderNoDataError: when Mesos reports no cpu data, or
        when there is no previous sample in Zookeeper yet (first run).
    """
    autoscaling_root = compose_autoscaling_zookeeper_root(
        service=marathon_service_config.service,
        instance=marathon_service_config.instance,
    )
    zk_last_time_path = '%s/cpu_last_time' % autoscaling_root
    zk_last_cpu_data = '%s/cpu_data' % autoscaling_root

    with ZookeeperPool() as zk:
        try:
            last_time, _ = zk.get(zk_last_time_path)
            last_cpu_data, _ = zk.get(zk_last_cpu_data)
            last_time = float(last_time)
            # kazoo returns bytes; decode before splitting on a str separator
            # (bytes.split(',') raises TypeError on Python 3).
            last_cpu_data = (datum for datum in last_cpu_data.decode('utf-8').split(',') if datum)
        except NoNodeError:
            # No previous sample recorded yet.
            last_time = 0.0
            last_cpu_data = []

    mesos_tasks = {task['id']: task.stats for task in mesos_tasks}
    current_time = int(datetime.now().strftime('%s'))
    time_delta = current_time - last_time

    mesos_cpu_data = {}
    for task_id, stats in mesos_tasks.items():
        # Skip tasks that report no cpu limit: the previous default of 0
        # produced a negative divisor (0 - .1) and negative utilization.
        limit = stats.get('cpus_limit')
        if not limit:
            continue
        cpu_seconds = float(
            stats.get('cpus_system_time_secs', 0.0) + stats.get('cpus_user_time_secs', 0.0))
        # The .1 offset accounts for the executor's share of the cpu limit
        # — presumably; TODO confirm against the Mesos executor config.
        mesos_cpu_data[task_id] = cpu_seconds / (limit - .1)

    if not mesos_cpu_data:
        raise MetricsProviderNoDataError(
            "Couldn't get any cpu or ram data from Mesos")

    cpu_data_csv = ','.join('%s:%s' % (cpu_seconds, task_id)
                            for task_id, cpu_seconds in mesos_cpu_data.items())

    with ZookeeperPool() as zk:
        zk.ensure_path(zk_last_cpu_data)
        zk.ensure_path(zk_last_time_path)
        # kazoo requires bytes payloads on Python 3.
        zk.set(zk_last_cpu_data, str(cpu_data_csv).encode('utf-8'))
        zk.set(zk_last_time_path, str(current_time).encode('utf-8'))

    # Utilization is the cpu-seconds delta per wall-clock second since the
    # previous sample, computed only for tasks present in both samples.
    utilization = {}
    for datum in last_cpu_data:
        last_cpu_seconds, task_id = datum.split(':')
        if task_id in mesos_cpu_data:
            utilization[task_id] = (mesos_cpu_data[task_id] - float(last_cpu_seconds)) / time_delta

    if not utilization:
        raise MetricsProviderNoDataError(
            """The mesos_cpu metrics provider doesn't have Zookeeper data for this service. This is expected for its first run."""
        )

    task_utilization = utilization.values()
    average_utilization = sum(task_utilization) / len(task_utilization)
    return average_utilization
def mesos_cpu_metrics_provider(
    marathon_service_config,
    system_paasta_config,
    marathon_tasks,
    mesos_tasks,
    log_utilization_data=None,
    noop=False,
    **kwargs,
):
    """ Gets the mean cpu utilization of a service across all of its tasks.

    :param marathon_service_config: the MarathonServiceConfig to get data from
    :param marathon_tasks: Marathon tasks to get data from
    :param mesos_tasks: Mesos tasks to get data from
    :param log_utilization_data: A dict used to transfer utilization data to
        autoscale_marathon_instance()
    :param noop: when True, skip writing the new sample back to Zookeeper

    :returns: the service's mean utilization, from 0 to 1

    :raises MetricsProviderNoDataError: when Mesos reports no cpu data, or
        when there is no previous sample in Zookeeper yet (first run).
    """
    # A mutable `{}` default would be shared (and mutated) across calls;
    # use a None sentinel and build a fresh dict per call instead.
    if log_utilization_data is None:
        log_utilization_data = {}

    autoscaling_root = compose_autoscaling_zookeeper_root(
        service=marathon_service_config.service,
        instance=marathon_service_config.instance,
    )
    zk_last_time_path = "%s/cpu_last_time" % autoscaling_root
    zk_last_cpu_data = "%s/cpu_data" % autoscaling_root

    with ZookeeperPool() as zk:
        try:
            last_time = zk.get(zk_last_time_path)[0].decode("utf8")
            last_cpu_data = zk.get(zk_last_cpu_data)[0].decode("utf8")
            log_utilization_data[last_time] = last_cpu_data
            last_time = float(last_time)
            last_cpu_data = (datum for datum in last_cpu_data.split(",") if datum)
        except NoNodeError:
            # No previous sample recorded yet.
            last_time = 0.0
            last_cpu_data = []

    # task.stats() is a coroutine; gather all stats concurrently with a
    # 60s overall timeout.
    futures = [asyncio.ensure_future(task.stats()) for task in mesos_tasks]
    if futures:
        a_sync.block(asyncio.wait, futures, timeout=60)

    def results_or_None(fut):
        # Treat any per-task failure as "no data" for that task.
        if fut.exception():
            return None
        else:
            return fut.result()

    mesos_tasks_stats = dict(
        zip(
            [task["id"] for task in mesos_tasks],
            [results_or_None(fut) for fut in futures],
        ))

    current_time = int(datetime.now().strftime("%s"))
    time_delta = current_time - last_time

    mesos_cpu_data = {}
    for task_id, stats in mesos_tasks_stats.items():
        if stats is not None:
            try:
                utime = float(stats["cpus_user_time_secs"])
                stime = float(stats["cpus_system_time_secs"])
                # The .1 offset accounts for some reserved share of the
                # limit — presumably the executor's; TODO confirm.
                limit = float(stats["cpus_limit"]) - 0.1
                mesos_cpu_data[task_id] = (stime + utime) / limit
            except KeyError:
                pass

    if not mesos_cpu_data:
        raise MetricsProviderNoDataError(
            "Couldn't get any cpu data from Mesos")

    cpu_data_csv = ",".join(f"{cpu_seconds}:{task_id}"
                            for task_id, cpu_seconds in mesos_cpu_data.items())
    log_utilization_data[str(current_time)] = cpu_data_csv

    if not noop:
        with ZookeeperPool() as zk:
            zk.ensure_path(zk_last_cpu_data)
            zk.ensure_path(zk_last_time_path)
            zk.set(zk_last_cpu_data, str(cpu_data_csv).encode("utf8"))
            zk.set(zk_last_time_path, str(current_time).encode("utf8"))

    # Utilization is the cpu-seconds delta per wall-clock second since the
    # previous sample, computed only for tasks present in both samples.
    utilization = {}
    for datum in last_cpu_data:
        last_cpu_seconds, task_id = datum.split(":")
        if task_id in mesos_cpu_data:
            cputime_delta = mesos_cpu_data[task_id] - float(last_cpu_seconds)
            utilization[task_id] = cputime_delta / time_delta

    if not utilization:
        raise MetricsProviderNoDataError(
            """The mesos_cpu metrics provider doesn't have Zookeeper data for this service. This is expected for its first run."""
        )

    task_utilization = utilization.values()
    mean_utilization = mean(task_utilization)
    return mean_utilization
def save_historical_load(historical_load, zk_path_prefix):
    """Serialize ``historical_load`` and store it under the zk prefix."""
    payload = serialize_historical_load(historical_load)
    with ZookeeperPool() as zk:
        node = zk_historical_load_path(zk_path_prefix)
        zk.ensure_path(node)
        zk.set(node, payload)
def set_boost_factor(
    region: str,
    pool: str,
    factor=DEFAULT_BOOST_FACTOR,
    duration_minutes=DEFAULT_BOOST_DURATION,
    override=False,
) -> bool:
    """Write a capacity boost factor for a region/pool into Zookeeper.

    :returns: True when the boost was written and read back successfully,
        False when the factor is invalid or a boost is already active
        (and ``override`` is not set).
    """
    if factor < MIN_BOOST_FACTOR:
        log.error(f'Cannot set a boost factor smaller than {MIN_BOOST_FACTOR}')
        return False

    # Out-of-range values are clamped (with a warning) rather than rejected.
    if factor > MAX_BOOST_FACTOR:
        log.warning(f'Boost factor {factor} does not sound reasonable. Defaulting to {MAX_BOOST_FACTOR}')
        factor = MAX_BOOST_FACTOR

    if duration_minutes > MAX_BOOST_DURATION:
        log.warning(f'Boost duration of {duration_minutes} minutes is too much. Falling back to {MAX_BOOST_DURATION}.')
        duration_minutes = MAX_BOOST_DURATION

    boost_root = get_zk_boost_path(region, pool)
    now = get_time()
    expires_at = now + 60 * duration_minutes

    end_time_node = boost_root + '/end_time'
    factor_node = boost_root + '/factor'
    expected_load_node = boost_root + '/expected_load'

    with ZookeeperPool() as zk:
        if not override:
            # Never silently replace a boost that is still running.
            if now < get_boost_values(region, pool, zk).end_time:
                log.error('Boost already active. Not overriding.')
                return False

        try:
            for node in (end_time_node, factor_node, expected_load_node):
                zk.ensure_path(node)
            zk.set(end_time_node, str(expires_at).encode('utf-8'))
            zk.set(factor_node, str(factor).encode('utf-8'))
            zk.set(expected_load_node, b'0')
        except Exception:
            log.error('Error setting the boost in Zookeeper')
            raise

        log.info(
            'Cluster boost: Set capacity boost factor {} for pool {} in region {} until {}'.format(
                factor,
                pool,
                region,
                datetime.fromtimestamp(expires_at).strftime('%c'),
            ))

        # Confirm the write round-trips before reporting success.
        return get_boost_values(region, pool, zk) == BoostValues(
            end_time=expires_at,
            boost_factor=factor,
            expected_load=0,
        )