def _record_autoscaling_decision(
    marathon_service_config: MarathonServiceConfig,
    autoscaling_params: AutoscalingParamsDict,
    utilization: float,
    log_utilization_data: Mapping[str, str],
    error: float,
    current_instances: int,
    num_healthy_instances: int,
    new_instance_count: int,
    safe_downscaling_threshold: int,
    task_data_insufficient: bool,
) -> None:
    """
    Based on the calculations made, perform observability side effects.
    Log messages, generate time series, send any alerts, etc.

    Emits one structured debug log line with the full decision context, and
    (when yelp_meteorite is available) gauges for the current/min/max
    instance counts.

    NOTE(review): ``log_utilization_data`` is accepted but never referenced
    here — presumably kept for interface symmetry with callers; confirm
    before removing.
    """
    # One JSON blob per decision so downstream tooling can parse the full
    # decision context from a single log line.
    decision_record = {
        "timestamp": time.time(),
        "paasta_cluster": marathon_service_config.get_cluster(),
        "paasta_service": marathon_service_config.get_service(),
        "paasta_instance": marathon_service_config.get_instance(),
        "autoscaling_params": autoscaling_params,
        "utilization": utilization,
        "error": error,
        "current_instances": current_instances,
        "num_healthy_instances": num_healthy_instances,
        "new_instance_count": new_instance_count,
        "safe_downscaling_threshold": safe_downscaling_threshold,
        "task_data_insufficient": task_data_insufficient,
    }
    write_to_log(
        config=marathon_service_config,
        line=json.dumps(decision_record),
        level="debug",
    )

    meteorite_dims = {
        "paasta_service": marathon_service_config.service,
        "paasta_cluster": marathon_service_config.cluster,
        "paasta_instance": marathon_service_config.instance,
        "paasta_pool": marathon_service_config.get_pool(),
        "decision_policy": autoscaling_params[DECISION_POLICY_KEY],  # type: ignore
    }
    if yelp_meteorite:
        # Emit one gauge per metric; all share the same dimensions.
        gauge_values = (
            ("paasta.service.instances", new_instance_count),
            ("paasta.service.max_instances", marathon_service_config.get_max_instances()),
            ("paasta.service.min_instances", marathon_service_config.get_min_instances()),
        )
        for metric_name, value in gauge_values:
            yelp_meteorite.create_gauge(metric_name, meteorite_dims).set(value)
def autoscale_marathon_instance(
    marathon_service_config: MarathonServiceConfig,
    system_paasta_config: SystemPaastaConfig,
    marathon_tasks: Sequence[MarathonTask],
    mesos_tasks: Sequence[Task],
) -> None:
    """Run one autoscaling pass for a single marathon service instance.

    Computes utilization/error from the running tasks, derives a new
    instance count from the configured decision policy, and (when the count
    changes) applies it via ``set_instances_for_marathon_service``.  All
    decisions and outcomes are logged, and gauge metrics are emitted when
    yelp_meteorite is available.

    :param marathon_service_config: config for the service instance to scale
    :param system_paasta_config: cluster-wide paasta configuration
    :param marathon_tasks: healthy marathon tasks for this instance
    :param mesos_tasks: mesos task data used for utilization metrics

    The whole run is guarded by a per-(service, instance) autoscaling lock;
    if another run holds it, this run is skipped with a warning.
    """
    try:
        with create_autoscaling_lock(marathon_service_config.service, marathon_service_config.instance):
            current_instances = marathon_service_config.get_instances()
            task_data_insufficient = is_task_data_insufficient(
                marathon_service_config=marathon_service_config,
                marathon_tasks=marathon_tasks,
                current_instances=current_instances,
            )
            autoscaling_params = marathon_service_config.get_autoscaling_params()
            # Filled in by get_utilization as a side effect, so a plain dict
            # is passed despite the read-only Mapping annotation.
            log_utilization_data: Mapping = {}
            utilization = get_utilization(
                marathon_service_config=marathon_service_config,
                system_paasta_config=system_paasta_config,
                autoscaling_params=autoscaling_params,
                log_utilization_data=log_utilization_data,
                marathon_tasks=marathon_tasks,
                mesos_tasks=mesos_tasks,
            )
            error = get_error_from_utilization(
                utilization=utilization,
                setpoint=autoscaling_params['setpoint'],
                current_instances=current_instances,
            )
            new_instance_count = get_new_instance_count(
                utilization=utilization,
                error=error,
                autoscaling_params=autoscaling_params,
                current_instances=current_instances,
                marathon_service_config=marathon_service_config,
                num_healthy_instances=len(marathon_tasks),
            )

            # Never scale below 70% of the current count in one step; the
            # decision policy clamps to this, and we log when it happens.
            safe_downscaling_threshold = int(current_instances * 0.7)
            if new_instance_count != current_instances:
                if new_instance_count < current_instances and task_data_insufficient:
                    # Fixed: the original source had a raw newline inside this
                    # single-quoted literal; restored as implicit concatenation.
                    write_to_log(
                        config=marathon_service_config,
                        line='Delaying scaling *down* as we found too few healthy tasks running in marathon. '
                             'This can happen because tasks are delayed/waiting/unhealthy or because we are '
                             'waiting for tasks to be killed. Will wait for sufficient healthy tasks before '
                             'we make a decision to scale down.',
                    )
                    return
                if new_instance_count == safe_downscaling_threshold:
                    write_to_log(
                        config=marathon_service_config,
                        line='Autoscaler clamped: %s' % str(log_utilization_data),
                        level='debug',
                    )

                write_to_log(
                    config=marathon_service_config,
                    line='Scaling from %d to %d instances (%s)' % (
                        current_instances, new_instance_count, humanize_error(error),
                    ),
                )
                set_instances_for_marathon_service(
                    service=marathon_service_config.service,
                    instance=marathon_service_config.instance,
                    instance_count=new_instance_count,
                )
            else:
                write_to_log(
                    config=marathon_service_config,
                    line='Staying at %d instances (%s)' % (current_instances, humanize_error(error)),
                    level='debug',
                )

            # NOTE(review): these dimension keys ('service_name',
            # 'instance_name') differ from the 'paasta_*' keys used by
            # _record_autoscaling_decision; presumably historical — confirm
            # before unifying, as dashboards may depend on either set.
            meteorite_dims = {
                'service_name': marathon_service_config.service,
                'decision_policy': autoscaling_params[DECISION_POLICY_KEY],  # type: ignore
                'paasta_cluster': marathon_service_config.cluster,
                'instance_name': marathon_service_config.instance,
            }
            if yelp_meteorite:
                gauge = yelp_meteorite.create_gauge('paasta.service.instances', meteorite_dims)
                gauge.set(new_instance_count)
                gauge = yelp_meteorite.create_gauge('paasta.service.max_instances', meteorite_dims)
                gauge.set(marathon_service_config.get_max_instances())
                gauge = yelp_meteorite.create_gauge('paasta.service.min_instances', meteorite_dims)
                gauge.set(marathon_service_config.get_min_instances())
    except LockHeldException:
        log.warning("Skipping autoscaling run for {service}.{instance} because the lock is held".format(
            service=marathon_service_config.service,
            instance=marathon_service_config.instance,
        ))