Exemplo n.º 1
0
def get_new_instance_count(
    utilization,
    error,
    autoscaling_params,
    current_instances,
    marathon_service_config,
    num_healthy_instances,
):
    """
    Computes the instance count a service should be scaled to.

    The decision policy named in autoscaling_params is asked for a delta
    relative to current_instances. The proposed count is never allowed to fall
    below 70% of current_instances, and is finally passed through the service
    config's limit_instance_count().

    :param utilization: utilization value forwarded to the decision policy
    :param error: error value forwarded to the decision policy
    :param autoscaling_params: dict of autoscaling settings; must contain
        DECISION_POLICY_KEY and is forwarded in full to the decision policy
    :param current_instances: the instance count the delta is applied to
    :param marathon_service_config: the MarathonServiceConfig of the service
    :param num_healthy_instances: forwarded to the decision policy

    :returns: the result of marathon_service_config.limit_instance_count()
    """
    decide = get_decision_policy(autoscaling_params[DECISION_POLICY_KEY])
    zk_root = compose_autoscaling_zookeeper_root(
        service=marathon_service_config.service,
        instance=marathon_service_config.instance,
    )
    lower_bound = marathon_service_config.get_min_instances()
    upper_bound = marathon_service_config.get_max_instances()

    delta = decide(
        utilization=utilization,
        error=error,
        min_instances=lower_bound,
        max_instances=upper_bound,
        current_instances=current_instances,
        zookeeper_path=zk_root,
        num_healthy_instances=num_healthy_instances,
        **autoscaling_params,
    )

    # Never shed more than 30% of the current instances in a single step
    # until we find out what is going on in such situations.
    downscale_floor = int(current_instances * 0.7)
    proposed = current_instances + delta
    if proposed < downscale_floor:
        proposed = downscale_floor

    return marathon_service_config.limit_instance_count(proposed)
Exemplo n.º 2
0
def mesos_cpu_metrics_provider(marathon_service_config, marathon_tasks, mesos_tasks, **kwargs):
    """
    Gets the mean cpu utilization of a service across all of its tasks.

    For every Mesos task the cumulative cpu seconds (system + user, divided by
    the task's cpu limit minus 0.1) are computed and stored in Zookeeper. The
    utilization of a task is the difference between this sample and the one
    stored by the previous run, divided by the seconds elapsed between them.

    :param marathon_service_config: the MarathonServiceConfig to get data from
    :param marathon_tasks: Marathon tasks to get data from (not read by this provider)
    :param mesos_tasks: Mesos tasks to get data from
    :param kwargs: extra autoscaling parameters; ignored

    :returns: the service's mean utilization, from 0 to 1
    :raises MetricsProviderNoDataError: if no task has usable stats, if no time
        has elapsed since the previous sample, or if Zookeeper holds no previous
        sample for any current task
    """

    autoscaling_root = compose_autoscaling_zookeeper_root(
        service=marathon_service_config.service,
        instance=marathon_service_config.instance,
    )
    zk_last_time_path = '%s/cpu_last_time' % autoscaling_root
    zk_last_cpu_data = '%s/cpu_data' % autoscaling_root

    with ZookeeperPool() as zk:
        try:
            last_time, _ = zk.get(zk_last_time_path)
            last_cpu_data, _ = zk.get(zk_last_cpu_data)
            last_time = float(last_time)
            last_cpu_data = (datum for datum in last_cpu_data.split(',') if datum)
        except NoNodeError:
            # First run for this service: there is no previous sample.
            last_time = 0.0
            last_cpu_data = []

    mesos_tasks = {task['id']: task.stats for task in mesos_tasks}
    current_time = int(datetime.now().strftime('%s'))
    time_delta = current_time - last_time

    mesos_cpu_data = {}
    for task_id, stats in mesos_tasks.items():
        try:
            cpu_seconds = float(stats['cpus_system_time_secs'] + stats['cpus_user_time_secs'])
            cpu_limit = stats['cpus_limit'] - .1
        except KeyError:
            # Defaulting a missing cpus_limit to 0 would divide by -0.1 and
            # record a bogus negative sample, so incomplete stats are skipped.
            continue
        if cpu_limit <= 0:
            # A zero or negative divisor cannot produce a meaningful sample.
            continue
        mesos_cpu_data[task_id] = cpu_seconds / cpu_limit

    if not mesos_cpu_data:
        raise MetricsProviderNoDataError("Couldn't get any cpu data from Mesos")

    cpu_data_csv = ','.join('%s:%s' % (cpu_seconds, task_id) for task_id, cpu_seconds in mesos_cpu_data.items())

    with ZookeeperPool() as zk:
        zk.ensure_path(zk_last_cpu_data)
        zk.ensure_path(zk_last_time_path)
        zk.set(zk_last_cpu_data, str(cpu_data_csv))
        zk.set(zk_last_time_path, str(current_time))

    if time_delta <= 0:
        # Timestamps have one-second resolution; two runs within the same second
        # (or a clock that moved backwards) would otherwise divide by zero or
        # yield negative utilization.
        raise MetricsProviderNoDataError(
            "No time has elapsed since the last mesos_cpu sample was stored")

    utilization = {}
    for datum in last_cpu_data:
        last_cpu_seconds, task_id = datum.split(':')
        if task_id in mesos_cpu_data:
            utilization[task_id] = (mesos_cpu_data[task_id] - float(last_cpu_seconds)) / time_delta

    if not utilization:
        raise MetricsProviderNoDataError("""The mesos_cpu metrics provider doesn't have Zookeeper data for this service.
                                         This is expected for its first run.""")

    task_utilization = utilization.values()
    mean_utilization = mean(task_utilization)

    return mean_utilization
def autoscale_marathon_instance(marathon_service_config, marathon_tasks,
                                mesos_tasks):
    """
    Runs one autoscaling pass for a single Marathon service instance.

    Nothing is done (apart from logging) while the number of Marathon tasks
    differs from the configured instance count. Otherwise the configured metrics
    provider and decision policy are evaluated, the resulting count is passed
    through limit_instance_count(), and the new count is stored if it changed.

    :param marathon_service_config: the MarathonServiceConfig of the service
    :param marathon_tasks: Marathon tasks, forwarded to the metrics provider
    :param mesos_tasks: Mesos tasks, forwarded to the metrics provider
    """
    configured = marathon_service_config.get_instances()
    if len(marathon_tasks) != configured:
        write_to_log(
            config=marathon_service_config,
            line='Delaying scaling as marathon is either waiting for resources or is delayed',
        )
        return

    params = marathon_service_config.get_autoscaling_params()
    provider_name = params.pop(SERVICE_METRICS_PROVIDER_KEY)
    metrics_provider = get_service_metrics_provider(provider_name)
    policy_name = params.pop(DECISION_POLICY_KEY)
    decision_policy = get_decision_policy(policy_name)

    utilization = metrics_provider(
        marathon_service_config=marathon_service_config,
        marathon_tasks=marathon_tasks,
        mesos_tasks=mesos_tasks,
        **params
    )

    # The provider above still receives 'setpoint'; it is removed here so the
    # decision policy below does not.
    setpoint = params.pop('setpoint')
    error = get_error_from_utilization(
        utilization=utilization,
        setpoint=setpoint,
        current_instances=configured,
    )

    zk_root = compose_autoscaling_zookeeper_root(
        service=marathon_service_config.service,
        instance=marathon_service_config.instance,
    )
    min_count = marathon_service_config.get_min_instances()
    max_count = marathon_service_config.get_max_instances()
    delta = decision_policy(
        error=error,
        min_instances=min_count,
        max_instances=max_count,
        current_instances=configured,
        zookeeper_path=zk_root,
        **params
    )

    target = marathon_service_config.limit_instance_count(configured + delta)
    error_text = humanize_error(error)

    if target == configured:
        write_to_log(
            config=marathon_service_config,
            line='Staying at %d instances (%s)' % (configured, error_text),
            level='debug',
        )
        return

    write_to_log(
        config=marathon_service_config,
        line='Scaling from %d to %d instances (%s)' % (configured, target, error_text),
    )
    set_instances_for_marathon_service(
        service=marathon_service_config.service,
        instance=marathon_service_config.instance,
        instance_count=target,
    )
Exemplo n.º 4
0
def autoscale_marathon_instance(marathon_service_config, marathon_tasks, mesos_tasks):
    """
    Evaluates autoscaling for one Marathon service instance and applies the result.

    Scaling is postponed while the number of Marathon tasks does not match the
    configured instance count. Otherwise the metrics provider yields a
    utilization, which is turned into an error against the configured setpoint;
    the decision policy turns that error into an instance delta.

    :param marathon_service_config: the MarathonServiceConfig of the service
    :param marathon_tasks: Marathon tasks, passed positionally to the metrics provider
    :param mesos_tasks: Mesos tasks, passed positionally to the metrics provider
    """
    instances_now = marathon_service_config.get_instances()
    task_count_matches = len(marathon_tasks) == instances_now
    if not task_count_matches:
        write_to_log(
            config=marathon_service_config,
            line='Delaying scaling as marathon is either waiting for resources or is delayed',
        )
        return

    settings = marathon_service_config.get_autoscaling_params()
    get_utilization = get_service_metrics_provider(settings.pop(SERVICE_METRICS_PROVIDER_KEY))
    get_delta = get_decision_policy(settings.pop(DECISION_POLICY_KEY))

    utilization = get_utilization(
        marathon_service_config,
        marathon_tasks,
        mesos_tasks,
        **settings
    )
    # 'setpoint' is consumed here, after the provider call and before the policy call.
    target_setpoint = settings.pop('setpoint')
    error = get_error_from_utilization(
        utilization=utilization,
        setpoint=target_setpoint,
        current_instances=instances_now,
    )

    policy_arguments = dict(
        error=error,
        min_instances=None,
        max_instances=None,
        current_instances=instances_now,
        zookeeper_path=compose_autoscaling_zookeeper_root(
            service=marathon_service_config.service,
            instance=marathon_service_config.instance,
        ),
    )
    policy_arguments['min_instances'] = marathon_service_config.get_min_instances()
    policy_arguments['max_instances'] = marathon_service_config.get_max_instances()
    # Passed as two separate mappings so a clashing key in settings still
    # raises TypeError, exactly like explicit keyword arguments would.
    instance_delta = get_delta(**policy_arguments, **settings)

    instances_next = marathon_service_config.limit_instance_count(instances_now + instance_delta)
    if instances_next != instances_now:
        scaling_line = 'Scaling from %d to %d instances (%s)' % (
            instances_now, instances_next, humanize_error(error))
        write_to_log(config=marathon_service_config, line=scaling_line)
        set_instances_for_marathon_service(
            service=marathon_service_config.service,
            instance=marathon_service_config.instance,
            instance_count=instances_next,
        )
    else:
        staying_line = 'Staying at %d instances (%s)' % (instances_now, humanize_error(error))
        write_to_log(config=marathon_service_config, line=staying_line, level='debug')
Exemplo n.º 5
0
def mesos_cpu_metrics_provider(
    marathon_service_config,
    system_paasta_config,
    marathon_tasks,
    mesos_tasks,
    log_utilization_data=None,
    noop=False,
    **kwargs,
):
    """
    Gets the mean cpu utilization of a service across all of its tasks.

    For every Mesos task the cumulative cpu seconds (system + user, divided by
    the task's cpu limit minus 0.1) are computed and, unless noop is set, stored
    in Zookeeper. The utilization of a task is the difference between this
    sample and the one stored by the previous run, divided by the seconds
    elapsed between them.

    :param marathon_service_config: the MarathonServiceConfig to get data from
    :param system_paasta_config: consulted to decide whether implausibly large cputime deltas are ignored
    :param marathon_tasks: Marathon tasks to get data from (not read by this provider)
    :param mesos_tasks: Mesos tasks to get data from
    :param log_utilization_data: A dict used to transfer utilization data to autoscale_marathon_instance().
        It is filled in place; when omitted a fresh dict is used for this call only.
    :param noop: when true, the new sample is not written to Zookeeper
    :param kwargs: extra autoscaling parameters; ignored

    :returns: the service's mean utilization, from 0 to 1
    :raises MetricsProviderNoDataError: if no task has usable stats, if no time
        has elapsed since the previous sample, or if no task yields a usable
        utilization value
    """
    if log_utilization_data is None:
        # A mutable default would be shared by every call and grow by one or
        # two entries per run for the lifetime of the process.
        log_utilization_data = {}

    autoscaling_root = compose_autoscaling_zookeeper_root(
        service=marathon_service_config.service,
        instance=marathon_service_config.instance,
    )
    zk_last_time_path = '%s/cpu_last_time' % autoscaling_root
    zk_last_cpu_data = '%s/cpu_data' % autoscaling_root

    with ZookeeperPool() as zk:
        try:
            last_time = zk.get(zk_last_time_path)[0].decode('utf8')
            last_cpu_data = zk.get(zk_last_cpu_data)[0].decode('utf8')
            log_utilization_data[last_time] = last_cpu_data
            last_time = float(last_time)
            last_cpu_data = (datum for datum in last_cpu_data.split(',')
                             if datum)
        except NoNodeError:
            # First run for this service: there is no previous sample.
            last_time = 0.0
            last_cpu_data = []

    # Fetch the stats of all tasks concurrently, waiting at most 60 seconds.
    monkey.patch_socket()
    jobs = [gevent.spawn(task.stats_callable) for task in mesos_tasks]
    gevent.joinall(jobs, timeout=60)
    mesos_tasks = dict(
        zip([task['id'] for task in mesos_tasks], [job.value for job in jobs]))

    current_time = int(datetime.now().strftime('%s'))
    time_delta = current_time - last_time

    mesos_cpu_data = {}
    for task_id, stats in mesos_tasks.items():
        if stats is None:
            continue
        try:
            utime = float(stats['cpus_user_time_secs'])
            stime = float(stats['cpus_system_time_secs'])
            limit = float(stats['cpus_limit']) - .1
        except KeyError:
            # Deliberate best effort: tasks with incomplete stats are skipped.
            continue
        if limit <= 0:
            # A cpus_limit of 0.1 would divide by zero; smaller values would
            # flip the sign of the sample.
            continue
        mesos_cpu_data[task_id] = (stime + utime) / limit

    if not mesos_cpu_data:
        raise MetricsProviderNoDataError(
            "Couldn't get any cpu data from Mesos")

    cpu_data_csv = ','.join('%s:%s' % (cpu_seconds, task_id)
                            for task_id, cpu_seconds in mesos_cpu_data.items())
    log_utilization_data[str(current_time)] = cpu_data_csv

    if not noop:
        with ZookeeperPool() as zk:
            zk.ensure_path(zk_last_cpu_data)
            zk.ensure_path(zk_last_time_path)
            zk.set(zk_last_cpu_data, str(cpu_data_csv).encode('utf8'))
            zk.set(zk_last_time_path, str(current_time).encode('utf8'))

    if time_delta <= 0:
        # Timestamps have one-second resolution; two runs within the same second
        # (or a clock that moved backwards) would otherwise divide by zero or
        # yield negative utilization.
        raise MetricsProviderNoDataError(
            "No time has elapsed since the last mesos_cpu sample was stored")

    utilization = {}
    for datum in last_cpu_data:
        last_cpu_seconds, task_id = datum.split(':')
        if task_id in mesos_cpu_data:
            cputime_delta = mesos_cpu_data[task_id] - float(last_cpu_seconds)

            if system_paasta_config.get_filter_bogus_mesos_cputime_enabled():
                # It is unlikely that the cputime consumed by a task is greater than the CPU limits
                # that we enforce (plus 10% of margin). This is a bug in Mesos (tracked in PAASTA-13510)
                cpu_burst_allowance = (
                    1.10 * marathon_service_config.get_cpu_quota() /
                    marathon_service_config.get_cpu_period())
                if cputime_delta > time_delta * cpu_burst_allowance:
                    log.warning(
                        'Ignoring potentially bogus cputime values for task {}'
                        .format(str(task_id)))
                    log.debug(
                        'Elapsed time: {}, Enforced CPU limit: {}, CPU time consumed: {}'
                        .format(
                            time_delta,
                            cpu_burst_allowance,
                            cputime_delta,
                        ), )
                    continue

            utilization[task_id] = cputime_delta / time_delta

    if not utilization:
        raise MetricsProviderNoDataError(
            """The mesos_cpu metrics provider doesn't have Zookeeper data for this service.
                                         This is expected for its first run."""
        )

    task_utilization = utilization.values()
    mean_utilization = mean(task_utilization)
    return mean_utilization
Exemplo n.º 6
0
def mesos_cpu_metrics_provider(
    marathon_service_config,
    marathon_tasks,
    mesos_tasks,
    log_utilization_data=None,
    noop=False,
    **kwargs,
):
    """
    Gets the mean cpu utilization of a service across all of its tasks.

    For every Mesos task the cumulative cpu seconds (system + user, divided by
    the task's cpu limit minus 0.1) are computed and, unless noop is set, stored
    in Zookeeper. The utilization of a task is the difference between this
    sample and the one stored by the previous run, divided by the seconds
    elapsed between them.

    :param marathon_service_config: the MarathonServiceConfig to get data from
    :param marathon_tasks: Marathon tasks to get data from (not read by this provider)
    :param mesos_tasks: Mesos tasks to get data from
    :param log_utilization_data: A dict used to transfer utilization data to autoscale_marathon_instance().
        It is filled in place; when omitted a fresh dict is used for this call only.
    :param noop: when true, the new sample is not written to Zookeeper
    :param kwargs: extra autoscaling parameters; ignored

    :returns: the service's mean utilization, from 0 to 1
    :raises MetricsProviderNoDataError: if no task has usable stats, if no time
        has elapsed since the previous sample, or if Zookeeper holds no previous
        sample for any current task
    """
    if log_utilization_data is None:
        # A mutable default would be shared by every call and grow by one or
        # two entries per run for the lifetime of the process.
        log_utilization_data = {}

    autoscaling_root = compose_autoscaling_zookeeper_root(
        service=marathon_service_config.service,
        instance=marathon_service_config.instance,
    )
    zk_last_time_path = '%s/cpu_last_time' % autoscaling_root
    zk_last_cpu_data = '%s/cpu_data' % autoscaling_root

    with ZookeeperPool() as zk:
        try:
            last_time = zk.get(zk_last_time_path)[0].decode('utf8')
            last_cpu_data = zk.get(zk_last_cpu_data)[0].decode('utf8')
            log_utilization_data[last_time] = last_cpu_data
            last_time = float(last_time)
            last_cpu_data = (datum for datum in last_cpu_data.split(',')
                             if datum)
        except NoNodeError:
            # First run for this service: there is no previous sample.
            last_time = 0.0
            last_cpu_data = []

    # Fetch the stats of all tasks concurrently, waiting at most 60 seconds.
    monkey.patch_socket()
    jobs = [gevent.spawn(task.stats_callable) for task in mesos_tasks]
    gevent.joinall(jobs, timeout=60)
    mesos_tasks = dict(
        zip([task['id'] for task in mesos_tasks], [job.value for job in jobs]))

    current_time = int(datetime.now().strftime('%s'))
    time_delta = current_time - last_time

    mesos_cpu_data = {}
    for task_id, stats in mesos_tasks.items():
        if stats is None:
            continue
        try:
            utime = float(stats['cpus_user_time_secs'])
            stime = float(stats['cpus_system_time_secs'])
            limit = float(stats['cpus_limit']) - .1
        except KeyError:
            # Deliberate best effort: tasks with incomplete stats are skipped.
            continue
        if limit <= 0:
            # A cpus_limit of 0.1 would divide by zero; smaller values would
            # flip the sign of the sample.
            continue
        mesos_cpu_data[task_id] = (stime + utime) / limit

    if not mesos_cpu_data:
        raise MetricsProviderNoDataError(
            "Couldn't get any cpu data from Mesos")

    cpu_data_csv = ','.join('%s:%s' % (cpu_seconds, task_id)
                            for task_id, cpu_seconds in mesos_cpu_data.items())
    log_utilization_data[str(current_time)] = cpu_data_csv

    if not noop:
        with ZookeeperPool() as zk:
            zk.ensure_path(zk_last_cpu_data)
            zk.ensure_path(zk_last_time_path)
            zk.set(zk_last_cpu_data, str(cpu_data_csv).encode('utf8'))
            zk.set(zk_last_time_path, str(current_time).encode('utf8'))

    if time_delta <= 0:
        # Timestamps have one-second resolution; two runs within the same second
        # (or a clock that moved backwards) would otherwise divide by zero or
        # yield negative utilization.
        raise MetricsProviderNoDataError(
            "No time has elapsed since the last mesos_cpu sample was stored")

    utilization = {}
    for datum in last_cpu_data:
        last_cpu_seconds, task_id = datum.split(':')
        if task_id in mesos_cpu_data:
            utilization[task_id] = (mesos_cpu_data[task_id] -
                                    float(last_cpu_seconds)) / time_delta

    if not utilization:
        raise MetricsProviderNoDataError(
            """The mesos_cpu metrics provider doesn't have Zookeeper data for this service.
                                         This is expected for its first run."""
        )

    task_utilization = utilization.values()
    mean_utilization = mean(task_utilization)

    return mean_utilization
def mesos_cpu_metrics_provider(marathon_service_config, marathon_tasks,
                               mesos_tasks, **kwargs):
    """
    Gets the mean cpu utilization of a service across all of its tasks.

    For every Mesos task the cumulative cpu seconds (system + user, divided by
    the task's cpu limit minus 0.1) are computed and stored in Zookeeper. The
    utilization of a task is the difference between this sample and the one
    stored by the previous run, divided by the seconds elapsed between them.

    :param marathon_service_config: the MarathonServiceConfig to get data from
    :param marathon_tasks: Marathon tasks to get data from (not read by this provider)
    :param mesos_tasks: Mesos tasks to get data from
    :param kwargs: extra autoscaling parameters; ignored

    :returns: the service's mean utilization, from 0 to 1
    :raises MetricsProviderNoDataError: if no task has usable stats, if no time
        has elapsed since the previous sample, or if Zookeeper holds no previous
        sample for any current task
    """

    autoscaling_root = compose_autoscaling_zookeeper_root(
        service=marathon_service_config.service,
        instance=marathon_service_config.instance,
    )
    zk_last_time_path = '%s/cpu_last_time' % autoscaling_root
    zk_last_cpu_data = '%s/cpu_data' % autoscaling_root

    with ZookeeperPool() as zk:
        try:
            last_time, _ = zk.get(zk_last_time_path)
            last_cpu_data, _ = zk.get(zk_last_cpu_data)
            last_time = float(last_time)
            last_cpu_data = (datum for datum in last_cpu_data.split(',')
                             if datum)
        except NoNodeError:
            # First run for this service: there is no previous sample.
            last_time = 0.0
            last_cpu_data = []

    mesos_tasks = {task['id']: task.stats for task in mesos_tasks}
    current_time = int(datetime.now().strftime('%s'))
    time_delta = current_time - last_time

    mesos_cpu_data = {}
    for task_id, stats in mesos_tasks.items():
        try:
            cpu_seconds = float(
                stats['cpus_system_time_secs'] +
                stats['cpus_user_time_secs'])
            cpu_limit = stats['cpus_limit'] - .1
        except KeyError:
            # Defaulting a missing cpus_limit to 0 would divide by -0.1 and
            # record a bogus negative sample, so incomplete stats are skipped.
            continue
        if cpu_limit <= 0:
            # A zero or negative divisor cannot produce a meaningful sample.
            continue
        mesos_cpu_data[task_id] = cpu_seconds / cpu_limit

    if not mesos_cpu_data:
        raise MetricsProviderNoDataError(
            "Couldn't get any cpu data from Mesos")

    cpu_data_csv = ','.join('%s:%s' % (cpu_seconds, task_id)
                            for task_id, cpu_seconds in mesos_cpu_data.items())

    with ZookeeperPool() as zk:
        zk.ensure_path(zk_last_cpu_data)
        zk.ensure_path(zk_last_time_path)
        zk.set(zk_last_cpu_data, str(cpu_data_csv))
        zk.set(zk_last_time_path, str(current_time))

    if time_delta <= 0:
        # Timestamps have one-second resolution; two runs within the same second
        # (or a clock that moved backwards) would otherwise divide by zero or
        # yield negative utilization.
        raise MetricsProviderNoDataError(
            "No time has elapsed since the last mesos_cpu sample was stored")

    utilization = {}
    for datum in last_cpu_data:
        last_cpu_seconds, task_id = datum.split(':')
        if task_id in mesos_cpu_data:
            utilization[task_id] = (mesos_cpu_data[task_id] -
                                    float(last_cpu_seconds)) / time_delta

    if not utilization:
        raise MetricsProviderNoDataError(
            """The mesos_cpu metrics provider doesn't have Zookeeper data for this service.
                                         This is expected for its first run."""
        )

    task_utilization = utilization.values()
    mean_utilization = mean(task_utilization)

    return mean_utilization
Exemplo n.º 8
0
def mesos_cpu_metrics_provider(
    marathon_service_config,
    system_paasta_config,
    marathon_tasks,
    mesos_tasks,
    log_utilization_data=None,
    noop=False,
    **kwargs,
):
    """
    Gets the mean cpu utilization of a service across all of its tasks.

    For every Mesos task the cumulative cpu seconds (system + user, divided by
    the task's cpu limit minus 0.1) are computed and, unless noop is set, stored
    in Zookeeper. The utilization of a task is the difference between this
    sample and the one stored by the previous run, divided by the seconds
    elapsed between them.

    :param marathon_service_config: the MarathonServiceConfig to get data from
    :param system_paasta_config: consulted to decide whether implausibly large cpu usage values are ignored
    :param marathon_tasks: Marathon tasks to get data from (not read by this provider)
    :param mesos_tasks: Mesos tasks to get data from
    :param log_utilization_data: A dict used to transfer utilization data to autoscale_marathon_instance().
        It is filled in place; when omitted a fresh dict is used for this call only.
    :param noop: when true, the new sample is not written to Zookeeper
    :param kwargs: extra autoscaling parameters; ignored

    :returns: the service's mean utilization, from 0 to 1
    :raises MetricsProviderNoDataError: if no task has usable stats, if no time
        has elapsed since the previous sample, or if no task yields a usable
        utilization value
    """
    if log_utilization_data is None:
        # A mutable default would be shared by every call and grow by one or
        # two entries per run for the lifetime of the process.
        log_utilization_data = {}

    autoscaling_root = compose_autoscaling_zookeeper_root(
        service=marathon_service_config.service,
        instance=marathon_service_config.instance,
    )
    zk_last_time_path = '%s/cpu_last_time' % autoscaling_root
    zk_last_cpu_data = '%s/cpu_data' % autoscaling_root

    with ZookeeperPool() as zk:
        try:
            last_time = zk.get(zk_last_time_path)[0].decode('utf8')
            last_cpu_data = zk.get(zk_last_cpu_data)[0].decode('utf8')
            log_utilization_data[last_time] = last_cpu_data
            last_time = float(last_time)
            last_cpu_data = (datum for datum in last_cpu_data.split(',')
                             if datum)
        except NoNodeError:
            # First run for this service: there is no previous sample.
            last_time = 0.0
            last_cpu_data = []

    # Fetch the stats of all tasks concurrently, waiting at most 60 seconds.
    futures = [asyncio.ensure_future(task.stats()) for task in mesos_tasks]
    if futures:
        a_sync.block(asyncio.wait, futures, timeout=60)

    def results_or_None(fut):
        # Future.exception() itself raises when the future is still pending
        # (the wait above timed out) or was cancelled, so rule those out first.
        if not fut.done() or fut.cancelled() or fut.exception():
            return None
        return fut.result()

    mesos_tasks_stats = dict(
        zip([task['id'] for task in mesos_tasks],
            [results_or_None(fut) for fut in futures]))

    current_time = int(datetime.now().strftime('%s'))
    time_delta = current_time - last_time

    mesos_cpu_data = {}
    for task_id, stats in mesos_tasks_stats.items():
        if stats is None:
            continue
        try:
            utime = float(stats['cpus_user_time_secs'])
            stime = float(stats['cpus_system_time_secs'])
            limit = float(stats['cpus_limit']) - .1
        except KeyError:
            # Deliberate best effort: tasks with incomplete stats are skipped.
            continue
        if limit <= 0:
            # A cpus_limit of 0.1 would divide by zero; smaller values would
            # flip the sign of the sample.
            continue
        mesos_cpu_data[task_id] = (stime + utime) / limit

    if not mesos_cpu_data:
        raise MetricsProviderNoDataError(
            "Couldn't get any cpu data from Mesos")

    cpu_data_csv = ','.join(f'{cpu_seconds}:{task_id}'
                            for task_id, cpu_seconds in mesos_cpu_data.items())
    log_utilization_data[str(current_time)] = cpu_data_csv

    if not noop:
        with ZookeeperPool() as zk:
            zk.ensure_path(zk_last_cpu_data)
            zk.ensure_path(zk_last_time_path)
            zk.set(zk_last_cpu_data, str(cpu_data_csv).encode('utf8'))
            zk.set(zk_last_time_path, str(current_time).encode('utf8'))

    if time_delta <= 0:
        # Timestamps have one-second resolution; two runs within the same second
        # (or a clock that moved backwards) would otherwise divide by zero or
        # yield negative utilization.
        raise MetricsProviderNoDataError(
            "No time has elapsed since the last mesos_cpu sample was stored")

    utilization = {}
    for datum in last_cpu_data:
        last_cpu_seconds, task_id = datum.split(':')
        if task_id in mesos_cpu_data:
            cputime_delta = mesos_cpu_data[task_id] - float(last_cpu_seconds)

            if system_paasta_config.get_filter_bogus_mesos_cputime_enabled():
                # It is unlikely that the cputime consumed by a task is greater than the CPU limits
                # that we enforce. This is a bug in Mesos (tracked in PAASTA-13510)
                max_cpu_allowed = 1 + marathon_service_config.get_cpu_burst_add(
                ) / marathon_service_config.get_cpus()
                task_cpu_usage = cputime_delta / time_delta

                # Allow a further 10% of margin on top of the burst allowance.
                if task_cpu_usage > (max_cpu_allowed * 1.1):
                    log.warning(
                        'Ignoring potentially bogus cpu usage {} for task {}'.
                        format(
                            task_cpu_usage,
                            str(task_id),
                        ), )
                    continue

            utilization[task_id] = cputime_delta / time_delta

    if not utilization:
        raise MetricsProviderNoDataError(
            """The mesos_cpu metrics provider doesn't have Zookeeper data for this service.
                                         This is expected for its first run."""
        )

    task_utilization = utilization.values()
    mean_utilization = mean(task_utilization)
    return mean_utilization
Exemplo n.º 9
0
def autoscale_marathon_instance(marathon_service_config, marathon_tasks,
                                mesos_tasks):
    """
    Runs one autoscaling pass for a single Marathon service instance.

    When the number of Marathon tasks is outside the MAX_TASK_DELTA tolerance
    around the configured instance count, no metrics are evaluated: the service
    is either raised to min_instances (if it is configured below it) or scaling
    is postponed. Otherwise the metrics provider and decision policy are
    evaluated, a single step is prevented from going below 70% of the current
    count, and the new count is stored if it changed.

    :param marathon_service_config: the MarathonServiceConfig of the service
    :param marathon_tasks: Marathon tasks, forwarded to the metrics provider
    :param mesos_tasks: Mesos tasks, forwarded to the metrics provider
    """
    configured = marathon_service_config.get_instances()
    most_tasks_tolerated = int((1 + MAX_TASK_DELTA) * configured)
    fewest_tasks_tolerated = int((1 - MAX_TASK_DELTA) * configured)
    running = len(marathon_tasks)

    if running > most_tasks_tolerated or running < fewest_tasks_tolerated:
        if configured >= marathon_service_config.get_min_instances():
            write_to_log(
                config=marathon_service_config,
                line='Delaying scaling as we found too many or too few tasks running in marathon. '
                     'This can happen because tasks are delayed/waiting/unhealthy or because we are '
                     'waiting for tasks to be killed.',
            )
            return

        below_min_line = 'Scaling from %d to %d instances because we are below min_instances' % (
            configured, marathon_service_config.get_min_instances())
        write_to_log(config=marathon_service_config, line=below_min_line)
        set_instances_for_marathon_service(
            service=marathon_service_config.service,
            instance=marathon_service_config.instance,
            instance_count=marathon_service_config.get_min_instances(),
        )
        return

    params = marathon_service_config.get_autoscaling_params()
    provider_name = params.pop(SERVICE_METRICS_PROVIDER_KEY)
    metrics_provider = get_service_metrics_provider(provider_name)
    policy_name = params.pop(DECISION_POLICY_KEY)
    decision_policy = get_decision_policy(policy_name)

    # Filled in by the metrics provider; only reported when downscaling is clamped.
    provider_samples = {}

    utilization = metrics_provider(
        marathon_service_config=marathon_service_config,
        marathon_tasks=marathon_tasks,
        mesos_tasks=mesos_tasks,
        log_utilization_data=provider_samples,
        **params
    )
    # The provider above still receives 'setpoint'; the decision policy does not.
    setpoint = params.pop('setpoint')
    error = get_error_from_utilization(
        utilization=utilization,
        setpoint=setpoint,
        current_instances=configured,
    )

    zk_root = compose_autoscaling_zookeeper_root(
        service=marathon_service_config.service,
        instance=marathon_service_config.instance,
    )
    min_count = marathon_service_config.get_min_instances()
    max_count = marathon_service_config.get_max_instances()
    delta = decision_policy(
        error=error,
        min_instances=min_count,
        max_instances=max_count,
        current_instances=configured,
        zookeeper_path=zk_root,
        **params
    )

    # Limit downscaling by 30% of the current instances until we find out what
    # is going on in such situations.
    downscale_floor = int(configured * 0.7)
    proposed = configured + delta
    if proposed < downscale_floor:
        proposed = downscale_floor
    target = marathon_service_config.limit_instance_count(proposed)

    if target == configured:
        write_to_log(
            config=marathon_service_config,
            line='Staying at %d instances (%s)' % (configured, humanize_error(error)),
            level='debug',
        )
        return

    if target == downscale_floor:
        write_to_log(
            config=marathon_service_config,
            line='Autoscaler clamped: %s' % str(provider_samples),
            level='debug',
        )

    write_to_log(
        config=marathon_service_config,
        line='Scaling from %d to %d instances (%s)' % (
            configured, target, humanize_error(error)),
    )
    set_instances_for_marathon_service(
        service=marathon_service_config.service,
        instance=marathon_service_config.instance,
        instance_count=target,
    )
Exemplo n.º 10
0
def mesos_cpu_metrics_provider(
    marathon_service_config,
    system_paasta_config,
    marathon_tasks,
    mesos_tasks,
    log_utilization_data=None,
    noop=False,
    **kwargs,
):
    """
    Gets the mean cpu utilization of a service across all of its tasks.

    For every Mesos task the cumulative cpu seconds (system + user, divided by
    the task's cpu limit minus 0.1) are computed and, unless noop is set, stored
    in Zookeeper. The utilization of a task is the difference between this
    sample and the one stored by the previous run, divided by the seconds
    elapsed between them.

    :param marathon_service_config: the MarathonServiceConfig to get data from
    :param system_paasta_config: accepted for interface compatibility; not read by this provider
    :param marathon_tasks: Marathon tasks to get data from (not read by this provider)
    :param mesos_tasks: Mesos tasks to get data from
    :param log_utilization_data: A dict used to transfer utilization data to autoscale_marathon_instance().
        It is filled in place; when omitted a fresh dict is used for this call only.
    :param noop: when true, the new sample is not written to Zookeeper
    :param kwargs: extra autoscaling parameters; ignored

    :returns: the service's mean utilization, from 0 to 1
    :raises MetricsProviderNoDataError: if no task has usable stats, if no time
        has elapsed since the previous sample, or if Zookeeper holds no previous
        sample for any current task
    """
    if log_utilization_data is None:
        # A mutable default would be shared by every call and grow by one or
        # two entries per run for the lifetime of the process.
        log_utilization_data = {}

    autoscaling_root = compose_autoscaling_zookeeper_root(
        service=marathon_service_config.service,
        instance=marathon_service_config.instance,
    )
    zk_last_time_path = "%s/cpu_last_time" % autoscaling_root
    zk_last_cpu_data = "%s/cpu_data" % autoscaling_root

    with ZookeeperPool() as zk:
        try:
            last_time = zk.get(zk_last_time_path)[0].decode("utf8")
            last_cpu_data = zk.get(zk_last_cpu_data)[0].decode("utf8")
            log_utilization_data[last_time] = last_cpu_data
            last_time = float(last_time)
            last_cpu_data = (datum for datum in last_cpu_data.split(",")
                             if datum)
        except NoNodeError:
            # First run for this service: there is no previous sample.
            last_time = 0.0
            last_cpu_data = []

    # Fetch the stats of all tasks concurrently, waiting at most 60 seconds.
    futures = [asyncio.ensure_future(task.stats()) for task in mesos_tasks]
    if futures:
        a_sync.block(asyncio.wait, futures, timeout=60)

    def results_or_None(fut):
        # Future.exception() itself raises when the future is still pending
        # (the wait above timed out) or was cancelled, so rule those out first.
        if not fut.done() or fut.cancelled() or fut.exception():
            return None
        return fut.result()

    mesos_tasks_stats = dict(
        zip(
            [task["id"] for task in mesos_tasks],
            [results_or_None(fut) for fut in futures],
        ))

    current_time = int(datetime.now().strftime("%s"))
    time_delta = current_time - last_time

    mesos_cpu_data = {}
    for task_id, stats in mesos_tasks_stats.items():
        if stats is None:
            continue
        try:
            utime = float(stats["cpus_user_time_secs"])
            stime = float(stats["cpus_system_time_secs"])
            limit = float(stats["cpus_limit"]) - 0.1
        except KeyError:
            # Deliberate best effort: tasks with incomplete stats are skipped.
            continue
        if limit <= 0:
            # A cpus_limit of 0.1 would divide by zero; smaller values would
            # flip the sign of the sample.
            continue
        mesos_cpu_data[task_id] = (stime + utime) / limit

    if not mesos_cpu_data:
        raise MetricsProviderNoDataError(
            "Couldn't get any cpu data from Mesos")

    cpu_data_csv = ",".join(f"{cpu_seconds}:{task_id}"
                            for task_id, cpu_seconds in mesos_cpu_data.items())
    log_utilization_data[str(current_time)] = cpu_data_csv

    if not noop:
        with ZookeeperPool() as zk:
            zk.ensure_path(zk_last_cpu_data)
            zk.ensure_path(zk_last_time_path)
            zk.set(zk_last_cpu_data, str(cpu_data_csv).encode("utf8"))
            zk.set(zk_last_time_path, str(current_time).encode("utf8"))

    if time_delta <= 0:
        # Timestamps have one-second resolution; two runs within the same second
        # (or a clock that moved backwards) would otherwise divide by zero or
        # yield negative utilization.
        raise MetricsProviderNoDataError(
            "No time has elapsed since the last mesos_cpu sample was stored")

    utilization = {}
    for datum in last_cpu_data:
        last_cpu_seconds, task_id = datum.split(":")
        if task_id in mesos_cpu_data:
            cputime_delta = mesos_cpu_data[task_id] - float(last_cpu_seconds)
            utilization[task_id] = cputime_delta / time_delta

    if not utilization:
        raise MetricsProviderNoDataError(
            """The mesos_cpu metrics provider doesn't have Zookeeper data for this service.
                                         This is expected for its first run."""
        )

    task_utilization = utilization.values()
    mean_utilization = mean(task_utilization)
    return mean_utilization