Example no. 1
0
def autoscale_marathon_instance(
    marathon_service_config: MarathonServiceConfig,
    system_paasta_config: SystemPaastaConfig,
    marathon_tasks: Sequence[MarathonTask],
    mesos_tasks: Sequence[Task],
) -> None:
    """Run a single autoscaling pass for one marathon service instance.

    Under the per-instance autoscaling lock: measure utilization, derive a
    target instance count, record the decision, then either apply the new
    count or log why the instance count is unchanged. Scaling *down* is
    deferred while the observed task data looks insufficient.
    """
    service = marathon_service_config.service
    instance = marathon_service_config.instance
    try:
        with create_autoscaling_lock(service, instance):
            instances_now = marathon_service_config.get_instances()
            insufficient_data = is_task_data_insufficient(
                marathon_service_config=marathon_service_config,
                marathon_tasks=marathon_tasks,
                current_instances=instances_now,
            )
            params = marathon_service_config.get_autoscaling_params()
            utilization_log: Mapping = {}
            measured_utilization = get_utilization(
                marathon_service_config=marathon_service_config,
                system_paasta_config=system_paasta_config,
                autoscaling_params=params,
                log_utilization_data=utilization_log,
                marathon_tasks=marathon_tasks,
                mesos_tasks=mesos_tasks,
            )
            setpoint_error = get_error_from_utilization(
                utilization=measured_utilization,
                setpoint=params["setpoint"],
                current_instances=instances_now,
            )
            healthy_count = len(marathon_tasks)
            target_count = get_new_instance_count(
                utilization=measured_utilization,
                error=setpoint_error,
                autoscaling_params=params,
                current_instances=instances_now,
                marathon_service_config=marathon_service_config,
                num_healthy_instances=healthy_count,
                # Don't persist decision data derived from incomplete task info.
                persist_data=(not insufficient_data),
            )
            downscale_floor = int(instances_now * 0.7)
            _record_autoscaling_decision(
                marathon_service_config=marathon_service_config,
                autoscaling_params=params,
                utilization=measured_utilization,
                log_utilization_data=utilization_log,
                error=setpoint_error,
                current_instances=instances_now,
                num_healthy_instances=healthy_count,
                new_instance_count=target_count,
                safe_downscaling_threshold=downscale_floor,
                task_data_insufficient=insufficient_data,
            )
            if target_count == instances_now:
                write_to_log(
                    config=marathon_service_config,
                    line="Staying at %d instances (%s)"
                    % (instances_now, humanize_error(setpoint_error)),
                    level="debug",
                )
                return
            if target_count < instances_now and insufficient_data:
                # Refuse to shrink based on a view with too few healthy tasks.
                write_to_log(
                    config=marathon_service_config,
                    line=
                    "Delaying scaling *down* as we found too few healthy tasks running in marathon. "
                    "This can happen because tasks are delayed/waiting/unhealthy or because we are "
                    "waiting for tasks to be killed. Will wait for sufficient healthy tasks before "
                    "we make a decision to scale down.",
                    level="debug",
                )
                return
            set_instances_for_marathon_service(
                service=service,
                instance=instance,
                instance_count=target_count,
            )
            write_to_log(
                config=marathon_service_config,
                line="Scaling from %d to %d instances (%s)" % (
                    instances_now,
                    target_count,
                    humanize_error(setpoint_error),
                ),
                level="event",
            )
    except LockHeldException:
        # Another autoscaler run holds the lock; skip this cycle quietly.
        log.warning(
            "Skipping autoscaling run for {service}.{instance} because the lock is held"
            .format(
                service=service,
                instance=instance,
            ))
Example no. 2
0
def autoscale_marathon_instance(
    marathon_service_config: MarathonServiceConfig,
    system_paasta_config: SystemPaastaConfig,
    marathon_tasks: Sequence[MarathonTask],
    mesos_tasks: Sequence[Task],
) -> None:
    """Execute one autoscaling decision cycle for a marathon service instance.

    Under the per-instance autoscaling lock: measure utilization, compute a
    target instance count, apply or defer the change (down-scaling waits for
    sufficient healthy-task data), and emit meteorite gauges when the
    meteorite client is available.
    """
    try:
        with create_autoscaling_lock(marathon_service_config.service, marathon_service_config.instance):
            instances_before = marathon_service_config.get_instances()
            data_is_insufficient = is_task_data_insufficient(
                marathon_service_config=marathon_service_config,
                marathon_tasks=marathon_tasks,
                current_instances=instances_before,
            )
            params = marathon_service_config.get_autoscaling_params()
            utilization_log: Mapping = {}
            current_utilization = get_utilization(
                marathon_service_config=marathon_service_config,
                system_paasta_config=system_paasta_config,
                autoscaling_params=params,
                log_utilization_data=utilization_log,
                marathon_tasks=marathon_tasks,
                mesos_tasks=mesos_tasks,
            )
            setpoint_error = get_error_from_utilization(
                utilization=current_utilization,
                setpoint=params['setpoint'],
                current_instances=instances_before,
            )
            target_count = get_new_instance_count(
                utilization=current_utilization,
                error=setpoint_error,
                autoscaling_params=params,
                current_instances=instances_before,
                marathon_service_config=marathon_service_config,
                num_healthy_instances=len(marathon_tasks),
            )

            clamp_threshold = int(instances_before * 0.7)
            if target_count == instances_before:
                write_to_log(
                    config=marathon_service_config,
                    line='Staying at %d instances (%s)' % (instances_before, humanize_error(setpoint_error)),
                    level='debug',
                )
            else:
                if target_count < instances_before and data_is_insufficient:
                    # Too few healthy tasks observed; don't shrink on bad data.
                    write_to_log(
                        config=marathon_service_config,
                        line='Delaying scaling *down* as we found too few healthy tasks running in marathon. '
                             'This can happen because tasks are delayed/waiting/unhealthy or because we are '
                             'waiting for tasks to be killed. Will wait for sufficient healthy tasks before '
                             'we make a decision to scale down.',
                    )
                    return
                if target_count == clamp_threshold:
                    # Decision policy clamped the downscale; surface the raw utilization data.
                    write_to_log(
                        config=marathon_service_config,
                        line='Autoscaler clamped: %s' % str(utilization_log),
                        level='debug',
                    )
                write_to_log(
                    config=marathon_service_config,
                    line='Scaling from %d to %d instances (%s)' % (
                        instances_before, target_count, humanize_error(setpoint_error),
                    ),
                )
                set_instances_for_marathon_service(
                    service=marathon_service_config.service,
                    instance=marathon_service_config.instance,
                    instance_count=target_count,
                )
            # Built unconditionally (as the original did) so any KeyError
            # surfaces regardless of whether meteorite is available.
            meteorite_dims = {
                'service_name': marathon_service_config.service,
                'decision_policy': params[DECISION_POLICY_KEY],  # type: ignore
                'paasta_cluster': marathon_service_config.cluster,
                'instance_name': marathon_service_config.instance,
            }
            if yelp_meteorite:
                for metric_name, metric_value in (
                    ('paasta.service.instances', target_count),
                    ('paasta.service.max_instances', marathon_service_config.get_max_instances()),
                    ('paasta.service.min_instances', marathon_service_config.get_min_instances()),
                ):
                    yelp_meteorite.create_gauge(metric_name, meteorite_dims).set(metric_value)
    except LockHeldException:
        # Another autoscaler run holds the lock; skip this cycle quietly.
        log.warning("Skipping autoscaling run for {service}.{instance} because the lock is held".format(
            service=marathon_service_config.service,
            instance=marathon_service_config.instance,
        ))