def _record_autoscaling_decision(
    marathon_service_config: MarathonServiceConfig,
    autoscaling_params: AutoscalingParamsDict,
    utilization: float,
    log_utilization_data: Mapping[str, str],
    error: float,
    current_instances: int,
    num_healthy_instances: int,
    new_instance_count: int,
    safe_downscaling_threshold: int,
    task_data_insufficient: bool,
) -> None:
    """
    Emit the observability side effects for one autoscaling calculation.

    Writes a single debug-level JSON log line describing the decision and,
    when ``yelp_meteorite`` is truthy, sets gauges for the chosen, maximum
    and minimum instance counts.

    :param marathon_service_config: config of the service instance; supplies
        the cluster/service/instance/pool identifiers and the min/max
        instance bounds reported in the gauges
    :param autoscaling_params: logged as-is; its ``DECISION_POLICY_KEY``
        entry is also used as a gauge dimension
    :param utilization: included in the log line
    :param log_utilization_data: accepted for interface compatibility; this
        function does not read it
    :param error: included in the log line
    :param current_instances: included in the log line
    :param num_healthy_instances: included in the log line
    :param new_instance_count: included in the log line and reported through
        the ``paasta.service.instances`` gauge
    :param safe_downscaling_threshold: included in the log line
    :param task_data_insufficient: included in the log line
    """
    # Key order is kept stable because it determines the serialized JSON.
    decision_record = {
        "timestamp": time.time(),
        "paasta_cluster": marathon_service_config.get_cluster(),
        "paasta_service": marathon_service_config.get_service(),
        "paasta_instance": marathon_service_config.get_instance(),
        "autoscaling_params": autoscaling_params,
        "utilization": utilization,
        "error": error,
        "current_instances": current_instances,
        "num_healthy_instances": num_healthy_instances,
        "new_instance_count": new_instance_count,
        "safe_downscaling_threshold": safe_downscaling_threshold,
        "task_data_insufficient": task_data_insufficient,
    }
    write_to_log(
        config=marathon_service_config,
        line=json.dumps(decision_record),
        level="debug",
    )

    # Built before the yelp_meteorite check, so the decision-policy lookup
    # runs whether or not any gauge is emitted.
    gauge_dimensions = dict(
        paasta_service=marathon_service_config.service,
        paasta_cluster=marathon_service_config.cluster,
        paasta_instance=marathon_service_config.instance,
        paasta_pool=marathon_service_config.get_pool(),
        decision_policy=autoscaling_params[DECISION_POLICY_KEY],  # type: ignore
    )
    if not yelp_meteorite:
        return

    yelp_meteorite.create_gauge(
        "paasta.service.instances", gauge_dimensions
    ).set(new_instance_count)
    yelp_meteorite.create_gauge(
        "paasta.service.max_instances", gauge_dimensions
    ).set(marathon_service_config.get_max_instances())
    yelp_meteorite.create_gauge(
        "paasta.service.min_instances", gauge_dimensions
    ).set(marathon_service_config.get_min_instances())
# Example no. 2
# 0
def autoscale_marathon_instance(
    marathon_service_config: MarathonServiceConfig,
    system_paasta_config: SystemPaastaConfig,
    marathon_tasks: Sequence[MarathonTask],
    mesos_tasks: Sequence[Task],
) -> None:
    """
    Run one autoscaling pass for a single Marathon service instance.

    While holding the autoscaling lock for the service/instance, this computes
    the utilization, the error against the configured ``setpoint`` and the
    resulting instance count. If the count changes it is applied with
    ``set_instances_for_marathon_service``; otherwise a debug line is logged.
    Instance-count gauges are then set when ``yelp_meteorite`` is truthy.

    A scale-down is postponed when ``is_task_data_insufficient`` reports
    insufficient task data; in that case the function returns early and no
    gauges are set.

    If the lock is already held (``LockHeldException``), the run is skipped
    and a warning is logged.

    :param marathon_service_config: config of the service instance to scale
    :param system_paasta_config: passed through to ``get_utilization``
    :param marathon_tasks: Marathon tasks of the instance; their number is
        passed on as the healthy-instance count
    :param mesos_tasks: passed through to ``get_utilization``
    """
    try:
        with create_autoscaling_lock(
            marathon_service_config.service, marathon_service_config.instance
        ):
            running_count = marathon_service_config.get_instances()
            insufficient_task_data = is_task_data_insufficient(
                marathon_service_config=marathon_service_config,
                marathon_tasks=marathon_tasks,
                current_instances=running_count,
            )
            params = marathon_service_config.get_autoscaling_params()
            # Handed to get_utilization; reported below in the "clamped" line.
            utilization_details: Mapping = {}
            observed_utilization = get_utilization(
                marathon_service_config=marathon_service_config,
                system_paasta_config=system_paasta_config,
                autoscaling_params=params,
                log_utilization_data=utilization_details,
                marathon_tasks=marathon_tasks,
                mesos_tasks=mesos_tasks,
            )
            scaling_error = get_error_from_utilization(
                utilization=observed_utilization,
                setpoint=params["setpoint"],
                current_instances=running_count,
            )
            target_count = get_new_instance_count(
                utilization=observed_utilization,
                error=scaling_error,
                autoscaling_params=params,
                current_instances=running_count,
                marathon_service_config=marathon_service_config,
                num_healthy_instances=len(marathon_tasks),
            )

            # 70% of the current instance count, truncated to an integer.
            downscale_floor = int(running_count * 0.7)

            if target_count == running_count:
                staying_message = "Staying at %d instances (%s)" % (
                    running_count,
                    humanize_error(scaling_error),
                )
                write_to_log(
                    config=marathon_service_config,
                    line=staying_message,
                    level="debug",
                )
            else:
                if target_count < running_count and insufficient_task_data:
                    delay_message = (
                        "Delaying scaling *down* as we found too few healthy tasks running in marathon. "
                        "This can happen because tasks are delayed/waiting/unhealthy or because we are "
                        "waiting for tasks to be killed. Will wait for sufficient healthy tasks before "
                        "we make a decision to scale down."
                    )
                    write_to_log(config=marathon_service_config, line=delay_message)
                    # Leaves the function entirely: nothing is applied and no
                    # gauges are set on this path.
                    return

                if target_count == downscale_floor:
                    clamped_message = "Autoscaler clamped: %s" % str(utilization_details)
                    write_to_log(
                        config=marathon_service_config,
                        line=clamped_message,
                        level="debug",
                    )

                scaling_message = "Scaling from %d to %d instances (%s)" % (
                    running_count,
                    target_count,
                    humanize_error(scaling_error),
                )
                write_to_log(config=marathon_service_config, line=scaling_message)
                set_instances_for_marathon_service(
                    service=marathon_service_config.service,
                    instance=marathon_service_config.instance,
                    instance_count=target_count,
                )

            gauge_dimensions = dict(
                service_name=marathon_service_config.service,
                decision_policy=params[DECISION_POLICY_KEY],  # type: ignore
                paasta_cluster=marathon_service_config.cluster,
                instance_name=marathon_service_config.instance,
            )
            if yelp_meteorite:
                yelp_meteorite.create_gauge(
                    "paasta.service.instances", gauge_dimensions
                ).set(target_count)
                yelp_meteorite.create_gauge(
                    "paasta.service.max_instances", gauge_dimensions
                ).set(marathon_service_config.get_max_instances())
                yelp_meteorite.create_gauge(
                    "paasta.service.min_instances", gauge_dimensions
                ).set(marathon_service_config.get_min_instances())
    except LockHeldException:
        log.warning(
            f"Skipping autoscaling run for {marathon_service_config.service}"
            f".{marathon_service_config.instance} because the lock is held"
        )