def autoscale_marathon_instance(
    marathon_service_config: MarathonServiceConfig,
    system_paasta_config: SystemPaastaConfig,
    marathon_tasks: Sequence[MarathonTask],
    mesos_tasks: Sequence[Task],
) -> None:
    """Run a single autoscaling pass for one marathon instance.

    Measures current utilization, turns it into an error relative to the
    configured setpoint, asks the decision policy for a new instance count,
    records the decision, and then applies it — except that scale-downs are
    deferred while healthy-task data is insufficient. If another autoscaler
    already holds the per-instance lock, the run is skipped with a warning.
    """
    try:
        with create_autoscaling_lock(marathon_service_config.service, marathon_service_config.instance):
            current_instances = marathon_service_config.get_instances()
            task_data_insufficient = is_task_data_insufficient(
                marathon_service_config=marathon_service_config,
                marathon_tasks=marathon_tasks,
                current_instances=current_instances,
            )
            autoscaling_params = marathon_service_config.get_autoscaling_params()
            # Filled in by get_utilization() as a side effect; used for logging.
            log_utilization_data: Mapping = {}
            utilization = get_utilization(
                marathon_service_config=marathon_service_config,
                system_paasta_config=system_paasta_config,
                autoscaling_params=autoscaling_params,
                log_utilization_data=log_utilization_data,
                marathon_tasks=marathon_tasks,
                mesos_tasks=mesos_tasks,
            )
            error = get_error_from_utilization(
                utilization=utilization,
                setpoint=autoscaling_params["setpoint"],
                current_instances=current_instances,
            )
            num_healthy_instances = len(marathon_tasks)
            # Don't persist decision data while task info looks incomplete —
            # it would poison future decisions with transient bad samples.
            new_instance_count = get_new_instance_count(
                utilization=utilization,
                error=error,
                autoscaling_params=autoscaling_params,
                current_instances=current_instances,
                marathon_service_config=marathon_service_config,
                num_healthy_instances=num_healthy_instances,
                persist_data=(not task_data_insufficient),
            )
            # Never report a clamp below 70% of the current count.
            safe_downscaling_threshold = int(current_instances * 0.7)
            _record_autoscaling_decision(
                marathon_service_config=marathon_service_config,
                autoscaling_params=autoscaling_params,
                utilization=utilization,
                log_utilization_data=log_utilization_data,
                error=error,
                current_instances=current_instances,
                num_healthy_instances=num_healthy_instances,
                new_instance_count=new_instance_count,
                safe_downscaling_threshold=safe_downscaling_threshold,
                task_data_insufficient=task_data_insufficient,
            )
            if new_instance_count == current_instances:
                write_to_log(
                    config=marathon_service_config,
                    line="Staying at %d instances (%s)" % (current_instances, humanize_error(error)),
                    level="debug",
                )
            elif new_instance_count < current_instances and task_data_insufficient:
                # Scale-downs wait for trustworthy healthy-task data.
                write_to_log(
                    config=marathon_service_config,
                    line="Delaying scaling *down* as we found too few healthy tasks running in marathon. "
                    "This can happen because tasks are delayed/waiting/unhealthy or because we are "
                    "waiting for tasks to be killed. Will wait for sufficient healthy tasks before "
                    "we make a decision to scale down.",
                    level="debug",
                )
                return
            else:
                set_instances_for_marathon_service(
                    service=marathon_service_config.service,
                    instance=marathon_service_config.instance,
                    instance_count=new_instance_count,
                )
                write_to_log(
                    config=marathon_service_config,
                    line="Scaling from %d to %d instances (%s)" % (
                        current_instances,
                        new_instance_count,
                        humanize_error(error),
                    ),
                    level="event",
                )
    except LockHeldException:
        log.warning(
            "Skipping autoscaling run for {service}.{instance} because the lock is held"
            .format(
                service=marathon_service_config.service,
                instance=marathon_service_config.instance,
            ))
def autoscale_marathon_instance(
    marathon_service_config: MarathonServiceConfig,
    system_paasta_config: SystemPaastaConfig,
    marathon_tasks: Sequence[MarathonTask],
    mesos_tasks: Sequence[Task],
) -> None:
    """Perform one autoscaling evaluation for a marathon instance.

    Computes a new instance count from current utilization and the configured
    decision policy, applies it (deferring scale-downs when healthy-task data
    is insufficient), and emits instance-count gauges to meteorite when that
    client is available. Skips the run entirely if the per-instance
    autoscaling lock is already held elsewhere.
    """
    try:
        with create_autoscaling_lock(marathon_service_config.service, marathon_service_config.instance):
            current_instances = marathon_service_config.get_instances()
            task_data_insufficient = is_task_data_insufficient(
                marathon_service_config=marathon_service_config,
                marathon_tasks=marathon_tasks,
                current_instances=current_instances,
            )
            autoscaling_params = marathon_service_config.get_autoscaling_params()
            # Populated by get_utilization() as a side effect; echoed in logs.
            log_utilization_data: Mapping = {}
            utilization = get_utilization(
                marathon_service_config=marathon_service_config,
                system_paasta_config=system_paasta_config,
                autoscaling_params=autoscaling_params,
                log_utilization_data=log_utilization_data,
                marathon_tasks=marathon_tasks,
                mesos_tasks=mesos_tasks,
            )
            error = get_error_from_utilization(
                utilization=utilization,
                setpoint=autoscaling_params['setpoint'],
                current_instances=current_instances,
            )
            new_instance_count = get_new_instance_count(
                utilization=utilization,
                error=error,
                autoscaling_params=autoscaling_params,
                current_instances=current_instances,
                marathon_service_config=marathon_service_config,
                num_healthy_instances=len(marathon_tasks),
            )
            # 70% of current count — matching this exactly means the decision
            # policy's output was clamped.
            safe_downscaling_threshold = int(current_instances * 0.7)
            if new_instance_count == current_instances:
                write_to_log(
                    config=marathon_service_config,
                    line='Staying at %d instances (%s)' % (current_instances, humanize_error(error)),
                    level='debug',
                )
            else:
                if new_instance_count < current_instances and task_data_insufficient:
                    # Don't scale down until healthy-task data is trustworthy.
                    write_to_log(
                        config=marathon_service_config,
                        line='Delaying scaling *down* as we found too few healthy tasks running in marathon. '
                        'This can happen because tasks are delayed/waiting/unhealthy or because we are '
                        'waiting for tasks to be killed. Will wait for sufficient healthy tasks before '
                        'we make a decision to scale down.',
                    )
                    return
                if new_instance_count == safe_downscaling_threshold:
                    write_to_log(
                        config=marathon_service_config,
                        line='Autoscaler clamped: %s' % str(log_utilization_data),
                        level='debug',
                    )
                write_to_log(
                    config=marathon_service_config,
                    line='Scaling from %d to %d instances (%s)' % (
                        current_instances,
                        new_instance_count,
                        humanize_error(error),
                    ),
                )
                set_instances_for_marathon_service(
                    service=marathon_service_config.service,
                    instance=marathon_service_config.instance,
                    instance_count=new_instance_count,
                )
            meteorite_dims = dict(
                service_name=marathon_service_config.service,
                decision_policy=autoscaling_params[DECISION_POLICY_KEY],  # type: ignore
                paasta_cluster=marathon_service_config.cluster,
                instance_name=marathon_service_config.instance,
            )
            # Best-effort metrics; yelp_meteorite is presumably None when the
            # client isn't installed — TODO confirm against the import site.
            if yelp_meteorite:
                yelp_meteorite.create_gauge('paasta.service.instances', meteorite_dims).set(
                    new_instance_count,
                )
                yelp_meteorite.create_gauge('paasta.service.max_instances', meteorite_dims).set(
                    marathon_service_config.get_max_instances(),
                )
                yelp_meteorite.create_gauge('paasta.service.min_instances', meteorite_dims).set(
                    marathon_service_config.get_min_instances(),
                )
    except LockHeldException:
        log.warning("Skipping autoscaling run for {service}.{instance} because the lock is held".format(
            service=marathon_service_config.service,
            instance=marathon_service_config.instance,
        ))