Example #1
def get_instances_from_zookeeper(service, instance):
    """Read the instance count stored for this service instance in ZooKeeper."""
    with ZookeeperPool() as zookeeper_client:
        (instances, _) = zookeeper_client.get(
            '%s/instances' %
            compose_autoscaling_zookeeper_root(service, instance))
        return int(instances)
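
A minimal call-site sketch for the helper above; the service and instance names are placeholders, and it assumes the autoscaling znode already holds an integer:

# Hypothetical usage; 'example_service' / 'main' are placeholder names.
desired = get_instances_from_zookeeper('example_service', 'main')
print('autoscaler wants %d instances' % desired)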
Example #2
def set_boost_factor(
    zk_boost_path: str,
    region: str = '',
    pool: str = '',
    send_clusterman_metrics: bool = False,
    factor: float = DEFAULT_BOOST_FACTOR,
    duration_minutes: int = DEFAULT_BOOST_DURATION,
    override: bool = False,
) -> bool:
    """
    Set a boost factor for a path in zk

    Can be used to boost either cluster or service autoscalers.
    If using for cluster you must specify region, pool and set
    send_clusterman_metrics=True so that clusterman metrics are updated

    otherwise just zk_boost_path is enough.
    """
    if factor < MIN_BOOST_FACTOR:
        log.error(f'Cannot set a boost factor smaller than {MIN_BOOST_FACTOR}')
        return False

    if factor > MAX_BOOST_FACTOR:
        log.warning('Boost factor {} does not sound reasonable. Defaulting to {}'.format(
            factor,
            MAX_BOOST_FACTOR,
        ))
        factor = MAX_BOOST_FACTOR

    if duration_minutes > MAX_BOOST_DURATION:
        log.warning('Boost duration of {} minutes is too much. Falling back to {}.'.format(
            duration_minutes,
            MAX_BOOST_DURATION,
        ))
        duration_minutes = MAX_BOOST_DURATION

    current_time = get_time()
    end_time = current_time + 60 * duration_minutes

    if clusterman_metrics and send_clusterman_metrics:
        cluster = load_system_paasta_config().get_cluster()
        metrics_client = clusterman_metrics.ClustermanMetricsBotoClient(region_name=region, app_identifier='default')
        with metrics_client.get_writer(clusterman_metrics.APP_METRICS) as writer:
            metrics_key = clusterman_metrics.generate_key_with_dimensions(
                'boost_factor',
                {'cluster': cluster, 'pool': pool},
            )
            writer.send((metrics_key, current_time, factor))
            if duration_minutes > 0:
                writer.send((metrics_key, end_time, 1.0))

    zk_end_time_path = zk_boost_path + '/end_time'
    zk_factor_path = zk_boost_path + '/factor'
    zk_expected_load_path = zk_boost_path + '/expected_load'

    with ZookeeperPool() as zk:
        if (
            not override and
            current_time < get_boost_values(zk_boost_path, zk).end_time
        ):
            log.error('Boost already active. Not overriding.')
            return False

        try:
            zk.ensure_path(zk_end_time_path)
            zk.ensure_path(zk_factor_path)
            zk.ensure_path(zk_expected_load_path)
            zk.set(zk_end_time_path, str(end_time).encode('utf-8'))
            zk.set(zk_factor_path, str(factor).encode('utf-8'))
            zk.set(zk_expected_load_path, '0'.encode('utf-8'))
        except Exception:
            log.error('Error setting the boost in Zookeeper')
            raise

        log.info('Load boost: Set capacity boost factor {} at path {} until {}'.format(
            factor,
            zk_boost_path,
            datetime.fromtimestamp(end_time).strftime('%c'),
        ))

        # Let's check that this factor has been properly written to zk
        return get_boost_values(zk_boost_path, zk) == BoostValues(
            end_time=end_time,
            boost_factor=factor,
            expected_load=0,
        )
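
A hedged usage sketch following the docstring above: a cluster boost needs region, pool, and send_clusterman_metrics=True, while a service boost only needs the path. Every value below is a placeholder, not from the source:

# Cluster boost (placeholder path, region, and pool):
set_boost_factor(
    zk_boost_path='/autoscaling/boost/example',
    region='us-west-2',
    pool='default',
    send_clusterman_metrics=True,
    factor=1.5,
    duration_minutes=20,
)

# Service boost: per the docstring, the path alone is enough.
set_boost_factor(zk_boost_path='/autoscaling/boost/example', factor=1.5)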
Example #3
def mesos_cpu_metrics_provider(marathon_service_config, marathon_tasks,
                               mesos_tasks, **kwargs):
    """
    Gets the average utilization of a service across all of its tasks, where the utilization of
    a task is the maximum value between its cpu and ram utilization.

    :param marathon_service_config: the MarathonServiceConfig to get data from
    :param marathon_tasks: Marathon tasks to get data from
    :param mesos_tasks: Mesos tasks to get data from

    :returns: the service's average utilization, from 0 to 1
    """

    autoscaling_root = compose_autoscaling_zookeeper_root(
        service=marathon_service_config.service,
        instance=marathon_service_config.instance,
    )
    zk_last_time_path = '%s/cpu_last_time' % autoscaling_root
    zk_last_cpu_data = '%s/cpu_data' % autoscaling_root

    with ZookeeperPool() as zk:
        try:
            last_time, _ = zk.get(zk_last_time_path)
            last_cpu_data, _ = zk.get(zk_last_cpu_data)
            last_time = float(last_time)
            # kazoo returns bytes; decode before splitting the CSV payload
            last_cpu_data = (datum for datum in last_cpu_data.decode('utf8').split(',')
                             if datum)
        except NoNodeError:
            last_time = 0.0
            last_cpu_data = []

    mesos_tasks = {task['id']: task.stats for task in mesos_tasks}
    current_time = int(datetime.now().strftime('%s'))
    time_delta = current_time - last_time

    mesos_cpu_data = {
        task_id: float(
            stats.get('cpus_system_time_secs', 0.0) +
            stats.get('cpus_user_time_secs', 0.0)) /
        (stats.get('cpus_limit', 0) - .1)
        for task_id, stats in mesos_tasks.items()
    }

    if not mesos_cpu_data:
        raise MetricsProviderNoDataError(
            "Couldn't get any cpu data from Mesos")

    cpu_data_csv = ','.join('%s:%s' % (cpu_seconds, task_id)
                            for task_id, cpu_seconds in mesos_cpu_data.items())

    with ZookeeperPool() as zk:
        zk.ensure_path(zk_last_cpu_data)
        zk.ensure_path(zk_last_time_path)
        # kazoo requires bytes for znode payloads
        zk.set(zk_last_cpu_data, cpu_data_csv.encode('utf8'))
        zk.set(zk_last_time_path, str(current_time).encode('utf8'))

    utilization = {}
    for datum in last_cpu_data:
        last_cpu_seconds, task_id = datum.split(':')
        if task_id in mesos_cpu_data:
            utilization[task_id] = (mesos_cpu_data[task_id] -
                                    float(last_cpu_seconds)) / time_delta

    if not utilization:
        raise MetricsProviderNoDataError(
            "The mesos_cpu metrics provider doesn't have Zookeeper data for this service. "
            "This is expected for its first run."
        )

    task_utilization = utilization.values()
    average_utilization = sum(task_utilization) / len(task_utilization)

    return average_utilization
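
To make the utilization arithmetic above concrete: each sample is cumulative cpu-seconds divided by (cpus_limit - 0.1), and utilization is the change between two samples divided by the wall-clock delta. A self-contained sketch with made-up numbers:

# Made-up sample values, not from the source.
cpus_limit = 1.0
last_sample = 120.0 / (cpus_limit - 0.1)  # normalized cpu-seconds at t0
curr_sample = 150.0 / (cpus_limit - 0.1)  # normalized cpu-seconds at t1
time_delta = 60                           # seconds between samples

utilization = (curr_sample - last_sample) / time_delta
# ~0.56: the task used ~56% of its effective cpu allocation over that minute.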
Example #4
def mesos_cpu_metrics_provider(
    marathon_service_config,
    system_paasta_config,
    marathon_tasks,
    mesos_tasks,
    log_utilization_data={},
    noop=False,
    **kwargs
):
    """
    Gets the mean cpu utilization of a service across all of its tasks.

    :param marathon_service_config: the MarathonServiceConfig to get data from
    :param system_paasta_config: the SystemPaastaConfig for the cluster
    :param marathon_tasks: Marathon tasks to get data from
    :param mesos_tasks: Mesos tasks to get data from
    :param log_utilization_data: A dict used to transfer utilization data to autoscale_marathon_instance()
    :param noop: if True, skip writing the new data points back to ZooKeeper

    :returns: the service's mean utilization, from 0 to 1
    """

    autoscaling_root = compose_autoscaling_zookeeper_root(
        service=marathon_service_config.service,
        instance=marathon_service_config.instance,
    )
    zk_last_time_path = "%s/cpu_last_time" % autoscaling_root
    zk_last_cpu_data = "%s/cpu_data" % autoscaling_root

    with ZookeeperPool() as zk:
        try:
            last_time = zk.get(zk_last_time_path)[0].decode("utf8")
            last_cpu_data = zk.get(zk_last_cpu_data)[0].decode("utf8")
            log_utilization_data[last_time] = last_cpu_data
            last_time = float(last_time)
            last_cpu_data = (datum for datum in last_cpu_data.split(",")
                             if datum)
        except NoNodeError:
            last_time = 0.0
            last_cpu_data = []

    futures = [asyncio.ensure_future(task.stats()) for task in mesos_tasks]
    if futures:
        a_sync.block(asyncio.wait, futures, timeout=60)

    def results_or_None(fut):
        if fut.exception():
            return None
        else:
            return fut.result()

    mesos_tasks_stats = dict(
        zip(
            [task["id"] for task in mesos_tasks],
            [results_or_None(fut) for fut in futures],
        ))

    current_time = int(datetime.now().strftime("%s"))
    time_delta = current_time - last_time

    mesos_cpu_data = {}
    for task_id, stats in mesos_tasks_stats.items():
        if stats is not None:
            try:
                utime = float(stats["cpus_user_time_secs"])
                stime = float(stats["cpus_system_time_secs"])
                limit = float(stats["cpus_limit"]) - 0.1
                mesos_cpu_data[task_id] = (stime + utime) / limit
            except KeyError:
                pass

    if not mesos_cpu_data:
        raise MetricsProviderNoDataError(
            "Couldn't get any cpu data from Mesos")

    cpu_data_csv = ",".join(f"{cpu_seconds}:{task_id}"
                            for task_id, cpu_seconds in mesos_cpu_data.items())
    log_utilization_data[str(current_time)] = cpu_data_csv

    if not noop:
        with ZookeeperPool() as zk:
            zk.ensure_path(zk_last_cpu_data)
            zk.ensure_path(zk_last_time_path)
            zk.set(zk_last_cpu_data, str(cpu_data_csv).encode("utf8"))
            zk.set(zk_last_time_path, str(current_time).encode("utf8"))

    utilization = {}
    for datum in last_cpu_data:
        last_cpu_seconds, task_id = datum.split(":")
        if task_id in mesos_cpu_data:
            cputime_delta = mesos_cpu_data[task_id] - float(last_cpu_seconds)
            utilization[task_id] = cputime_delta / time_delta

    if not utilization:
        raise MetricsProviderNoDataError(
            "The mesos_cpu metrics provider doesn't have Zookeeper data for this service. "
            "This is expected for its first run."
        )

    task_utilization = utilization.values()
    mean_utilization = mean(task_utilization)
    return mean_utilization
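
The stats-gathering step above fans out one coroutine per Mesos task and blocks on the batch with a 60-second timeout via the a_sync helper. A rough asyncio-only sketch of the same fan-out pattern (an assumption, not the source's implementation; note that wait_for cancels pending work on timeout, whereas asyncio.wait leaves it pending):

import asyncio

async def gather_task_stats(tasks, timeout=60):
    # One coroutine per task; return_exceptions=True captures failures
    # instead of raising, mirroring results_or_None above.
    results = await asyncio.wait_for(
        asyncio.gather(*(task.stats() for task in tasks), return_exceptions=True),
        timeout=timeout,
    )
    return {
        task["id"]: (None if isinstance(result, BaseException) else result)
        for task, result in zip(tasks, results)
    }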
Example #5
def save_historical_load(historical_load, zk_path_prefix):
    """Serialize historical load data and persist it under the ZooKeeper path prefix."""
    with ZookeeperPool() as zk:
        historical_load_bytes = serialize_historical_load(historical_load)
        zk.ensure_path(zk_historical_load_path(zk_path_prefix))
        zk.set(zk_historical_load_path(zk_path_prefix), historical_load_bytes)
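
A hedged call-site sketch for the helper above; the shape of historical_load is an assumption (whatever serialize_historical_load accepts, sketched here as (timestamp, load) pairs), and the path prefix is a placeholder:

# Hypothetical data; the (timestamp, load) pair shape is an assumption.
samples = [(1500000000.0, 12.5), (1500000060.0, 14.0)]
save_historical_load(samples, zk_path_prefix='/autoscaling/example')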
Example #6
def set_boost_factor(
    region: str,
    pool: str,
    factor=DEFAULT_BOOST_FACTOR,
    duration_minutes=DEFAULT_BOOST_DURATION,
    override=False,
) -> bool:
    if factor < MIN_BOOST_FACTOR:
        log.error(f'Cannot set a boost factor smaller than {MIN_BOOST_FACTOR}')
        return False

    if factor > MAX_BOOST_FACTOR:
        log.warning('Boost factor {} does not sound reasonable. Defaulting to {}'.format(
            factor,
            MAX_BOOST_FACTOR,
        ))
        factor = MAX_BOOST_FACTOR

    if duration_minutes > MAX_BOOST_DURATION:
        log.warning('Boost duration of {} minutes is too much. Falling back to {}.'.format(
            duration_minutes,
            MAX_BOOST_DURATION,
        ))
        duration_minutes = MAX_BOOST_DURATION

    zk_boost_path = get_zk_boost_path(region, pool)
    current_time = get_time()
    end_time = current_time + 60 * duration_minutes

    zk_end_time_path = zk_boost_path + '/end_time'
    zk_factor_path = zk_boost_path + '/factor'
    zk_expected_load_path = zk_boost_path + '/expected_load'

    with ZookeeperPool() as zk:
        if (not override and
                current_time < get_boost_values(region, pool, zk).end_time):
            log.error('Boost already active. Not overriding.')
            return False

        try:
            zk.ensure_path(zk_end_time_path)
            zk.ensure_path(zk_factor_path)
            zk.ensure_path(zk_expected_load_path)
            zk.set(zk_end_time_path, str(end_time).encode('utf-8'))
            zk.set(zk_factor_path, str(factor).encode('utf-8'))
            zk.set(zk_expected_load_path, '0'.encode('utf-8'))
        except Exception:
            log.error('Error setting the boost in Zookeeper')
            raise

        log.info('Cluster boost: Set capacity boost factor {} for pool {} in region {} until {}'.format(
            factor,
            pool,
            region,
            datetime.fromtimestamp(end_time).strftime('%c'),
        ))

        # Let's check that this factor has been properly written to zk
        return get_boost_values(region, pool, zk) == BoostValues(
            end_time=end_time,
            boost_factor=factor,
            expected_load=0,
        )
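
For context, a hedged sketch of what the get_boost_values read side plausibly looks like, given the three znodes set_boost_factor writes. This is an assumption, not the source's implementation; the fallback factor of 1.0 is also assumed:

def get_boost_values_sketch(zk_boost_path, zk):
    # Read back the three znodes written by set_boost_factor.
    try:
        end_time = float(zk.get(zk_boost_path + '/end_time')[0].decode('utf-8'))
        factor = float(zk.get(zk_boost_path + '/factor')[0].decode('utf-8'))
        expected_load = float(zk.get(zk_boost_path + '/expected_load')[0].decode('utf-8'))
    except NoNodeError:
        # No boost has ever been set for this path.
        return BoostValues(end_time=0, boost_factor=1.0, expected_load=0)
    return BoostValues(end_time=end_time, boost_factor=factor, expected_load=expected_load)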