def autoscale_marathon_instance(marathon_service_config, marathon_tasks, mesos_tasks):
    """Run one autoscaling pass for a marathon service instance.

    Skips the pass entirely while marathon's running task count disagrees
    with the configured instance count (marathon is still converging).
    Otherwise: measure utilization via the configured metrics provider,
    turn it into an error relative to the setpoint, ask the configured
    decision policy for a scaling delta, clamp the result, and persist any
    change to zookeeper.
    """
    current_instances = marathon_service_config.get_instances()
    if len(marathon_tasks) != current_instances:
        # Scaling on stale data would be wrong: marathon is either waiting
        # for resources or otherwise delayed.
        write_to_log(
            config=marathon_service_config,
            line='Delaying scaling as marathon is either waiting for resources or is delayed',
        )
        return

    params = marathon_service_config.get_autoscaling_params()
    # pop() removes the routing keys so the remaining params can be splatted
    # into the provider/policy callables below.
    metrics_provider = get_service_metrics_provider(params.pop(SERVICE_METRICS_PROVIDER_KEY))
    decision_policy = get_decision_policy(params.pop(DECISION_POLICY_KEY))

    utilization = metrics_provider(
        marathon_service_config=marathon_service_config,
        marathon_tasks=marathon_tasks,
        mesos_tasks=mesos_tasks,
        **params,
    )
    error = get_error_from_utilization(
        utilization=utilization,
        setpoint=params.pop('setpoint'),
        current_instances=current_instances,
    )
    zookeeper_path = compose_autoscaling_zookeeper_root(
        service=marathon_service_config.service,
        instance=marathon_service_config.instance,
    )
    scaling_delta = decision_policy(
        error=error,
        min_instances=marathon_service_config.get_min_instances(),
        max_instances=marathon_service_config.get_max_instances(),
        current_instances=current_instances,
        zookeeper_path=zookeeper_path,
        **params,
    )
    new_instance_count = marathon_service_config.limit_instance_count(
        current_instances + scaling_delta,
    )

    if new_instance_count == current_instances:
        write_to_log(
            config=marathon_service_config,
            line='Staying at %d instances (%s)' % (current_instances, humanize_error(error)),
            level='debug',
        )
        return

    write_to_log(
        config=marathon_service_config,
        line='Scaling from %d to %d instances (%s)' % (
            current_instances, new_instance_count, humanize_error(error)),
    )
    set_instances_for_marathon_service(
        service=marathon_service_config.service,
        instance=marathon_service_config.instance,
        instance_count=new_instance_count,
    )
def autoscale_marathon_instance(marathon_service_config, marathon_tasks, mesos_tasks):
    """Autoscale a single marathon service instance.

    Bails out when the live marathon task count does not match the
    configured instance count (the cluster has not converged yet);
    otherwise computes utilization, derives an error against the setpoint,
    applies the decision policy, clamps the resulting count, and writes any
    change back via zookeeper.
    """
    current_instances = marathon_service_config.get_instances()

    # Guard clause: marathon is still converging, so any measurement taken
    # now would misrepresent the service.
    if len(marathon_tasks) != current_instances:
        write_to_log(
            config=marathon_service_config,
            line='Delaying scaling as marathon is either waiting for resources or is delayed',
        )
        return

    autoscaling_params = marathon_service_config.get_autoscaling_params()
    provider = get_service_metrics_provider(autoscaling_params.pop(SERVICE_METRICS_PROVIDER_KEY))
    policy = get_decision_policy(autoscaling_params.pop(DECISION_POLICY_KEY))

    # This variant passes the three core arguments positionally.
    utilization = provider(marathon_service_config, marathon_tasks, mesos_tasks,
                          **autoscaling_params)
    error = get_error_from_utilization(
        utilization=utilization,
        setpoint=autoscaling_params.pop('setpoint'),
        current_instances=current_instances,
    )
    zk_path = compose_autoscaling_zookeeper_root(
        service=marathon_service_config.service,
        instance=marathon_service_config.instance,
    )
    delta = policy(
        error=error,
        min_instances=marathon_service_config.get_min_instances(),
        max_instances=marathon_service_config.get_max_instances(),
        current_instances=current_instances,
        zookeeper_path=zk_path,
        **autoscaling_params,
    )
    new_instance_count = marathon_service_config.limit_instance_count(current_instances + delta)

    if new_instance_count != current_instances:
        write_to_log(
            config=marathon_service_config,
            line='Scaling from %d to %d instances (%s)' % (
                current_instances, new_instance_count, humanize_error(error)),
        )
        set_instances_for_marathon_service(
            service=marathon_service_config.service,
            instance=marathon_service_config.instance,
            instance_count=new_instance_count,
        )
    else:
        write_to_log(
            config=marathon_service_config,
            line='Staying at %d instances (%s)' % (current_instances, humanize_error(error)),
            level='debug',
        )
def update_autoscaler_count(request):
    """API handler: manually override the autoscaler's desired instance count.

    Validates that the body carries an integer ``desired_instances``, checks
    that autoscaling is enabled for the service instance (``max_instances``
    configured), writes the requested count to zookeeper as-is, and returns
    a 202 whose body warns when the request fell outside
    [min_instances, max_instances].
    """
    service = request.swagger_data.get("service")
    instance = request.swagger_data.get("instance")
    json_body = request.swagger_data.get("json_body")
    desired_instances = json_body["desired_instances"]

    if not isinstance(desired_instances, int):
        error_message = (
            'The provided body does not have an integer value for "desired_instances": '
            "{}".format(json_body)
        )
        raise ApiFailure(error_message, 500)

    try:
        service_config = load_marathon_service_config(
            service=service,
            instance=instance,
            cluster=settings.cluster,
            soa_dir=settings.soa_dir,
            load_deployments=False,
        )
    except Exception:
        error_message = f"Unable to load service config for {service}.{instance}"
        raise ApiFailure(error_message, 404)

    max_instances = service_config.get_max_instances()
    if max_instances is None:
        # max_instances unset means the service is not autoscaled at all.
        error_message = f"Autoscaling is not enabled for {service}.{instance}"
        raise ApiFailure(error_message, 404)
    min_instances = service_config.get_min_instances()

    # Dump whatever number from the client to zk. get_instances() will limit
    # readings from zk to [min_instances, max_instances].
    set_instances_for_marathon_service(
        service=service, instance=instance, instance_count=desired_instances,
    )

    status = "SUCCESS"
    if desired_instances > max_instances:
        desired_instances = max_instances
        status = "WARNING desired_instances is greater than max_instances %d" % max_instances
    elif desired_instances < min_instances:
        desired_instances = min_instances
        status = "WARNING desired_instances is less than min_instances %d" % min_instances

    return Response(
        json_body={"desired_instances": desired_instances, "status": status},
        status_code=202,
    )
def update_autoscaler_count(request):
    """API handler: manually override the autoscaler's desired instance count.

    Loads the service config, verifies autoscaling is enabled
    (``max_instances`` configured), writes the requested count to zookeeper
    as-is, and returns a 202 whose body warns when the request fell outside
    [min_instances, max_instances].

    Raises ApiFailure(500) for a non-integer body value and ApiFailure(404)
    for an unknown or non-autoscaled service instance.
    """
    service = request.swagger_data.get('service')
    instance = request.swagger_data.get('instance')
    json_body = request.swagger_data.get('json_body')
    desired_instances = json_body['desired_instances']

    # Bug fix: without this check a non-int (e.g. a string) would be dumped
    # straight into zookeeper and the >/< comparisons below would raise
    # TypeError. Matches the validating variant of this handler.
    if not isinstance(desired_instances, int):
        error_message = (
            'The provided body does not have an integer value for "desired_instances": '
            '{}'.format(json_body)
        )
        raise ApiFailure(error_message, 500)

    try:
        service_config = load_marathon_service_config(
            service=service,
            instance=instance,
            cluster=settings.cluster,
            soa_dir=settings.soa_dir,
            load_deployments=False,
        )
    except Exception:
        error_message = 'Unable to load service config for %s.%s' % (service, instance)
        raise ApiFailure(error_message, 404)

    max_instances = service_config.get_max_instances()
    if max_instances is None:
        # No max_instances configured => autoscaling is disabled.
        error_message = 'Autoscaling is not enabled for %s.%s' % (service, instance)
        raise ApiFailure(error_message, 404)
    min_instances = service_config.get_min_instances()

    # Dump whatever number from the client to zk. get_instances() will limit
    # readings from zk to [min_instances, max_instances].
    set_instances_for_marathon_service(
        service=service, instance=instance, instance_count=desired_instances)

    status = 'SUCCESS'
    if desired_instances > max_instances:
        desired_instances = max_instances
        status = 'WARNING desired_instances is greater than max_instances %d' % max_instances
    elif desired_instances < min_instances:
        desired_instances = min_instances
        status = 'WARNING desired_instances is less than min_instances %d' % min_instances

    response_body = {'desired_instances': desired_instances, 'status': status}
    return Response(json_body=response_body, status_code=202)
def update_autoscaler_count(request):
    """API handler to set the autoscaler's desired instance count by hand.

    Verifies the body value is an integer, verifies autoscaling is enabled
    for the service instance, persists the requested count to zookeeper
    unmodified, and returns a 202 response that warns when the requested
    count was clamped against [min_instances, max_instances].

    Raises ApiFailure(500) for a non-integer body value and ApiFailure(404)
    for an unknown or non-autoscaled service instance.
    """
    service = request.swagger_data.get('service')
    instance = request.swagger_data.get('instance')
    json_body = request.swagger_data.get('json_body')
    desired_instances = json_body['desired_instances']

    # Bug fix: reject non-integers up front. Previously a string/float from
    # the client went straight to zookeeper, and the range comparisons below
    # could raise TypeError. Matches the validating variant of this handler.
    if not isinstance(desired_instances, int):
        error_message = (
            'The provided body does not have an integer value for "desired_instances": '
            '{}'.format(json_body)
        )
        raise ApiFailure(error_message, 500)

    try:
        service_config = load_marathon_service_config(
            service=service,
            instance=instance,
            cluster=settings.cluster,
            soa_dir=settings.soa_dir,
            load_deployments=False,
        )
    except Exception:
        error_message = 'Unable to load service config for %s.%s' % (service, instance)
        raise ApiFailure(error_message, 404)

    max_instances = service_config.get_max_instances()
    if max_instances is None:
        # Services without max_instances are not autoscaled.
        error_message = 'Autoscaling is not enabled for %s.%s' % (service, instance)
        raise ApiFailure(error_message, 404)
    min_instances = service_config.get_min_instances()

    # Dump whatever number from the client to zk. get_instances() will limit
    # readings from zk to [min_instances, max_instances].
    set_instances_for_marathon_service(
        service=service, instance=instance, instance_count=desired_instances)

    status = 'SUCCESS'
    if desired_instances > max_instances:
        desired_instances = max_instances
        status = 'WARNING desired_instances is greater than max_instances %d' % max_instances
    elif desired_instances < min_instances:
        desired_instances = min_instances
        status = 'WARNING desired_instances is less than min_instances %d' % min_instances

    response_body = {'desired_instances': desired_instances, 'status': status}
    return Response(json_body=response_body, status_code=202)
def autoscale_marathon_instance(marathon_service_config, system_paasta_config,
                                marathon_tasks, mesos_tasks):
    """Autoscale one marathon service instance and report gauges.

    Computes utilization and an error against the setpoint, asks
    get_new_instance_count for the target count, and applies it — unless we
    would scale *down* while healthy-task data is insufficient, in which
    case the decision is deferred. Finally emits instance-count gauges via
    yelp_meteorite when that package is available.
    """
    current_instances = marathon_service_config.get_instances()
    task_data_insufficient = is_task_data_insufficient(
        marathon_service_config, marathon_tasks, current_instances)

    autoscaling_params = marathon_service_config.get_autoscaling_params()
    # Filled in by the metrics provider; only used for the "clamped" log line.
    log_utilization_data = {}
    utilization = get_utilization(
        marathon_service_config=marathon_service_config,
        system_paasta_config=system_paasta_config,
        autoscaling_params=autoscaling_params,
        log_utilization_data=log_utilization_data,
        marathon_tasks=marathon_tasks,
        mesos_tasks=mesos_tasks,
    )
    error = get_error_from_utilization(
        utilization=utilization,
        setpoint=autoscaling_params['setpoint'],
        current_instances=current_instances,
    )
    new_instance_count = get_new_instance_count(
        utilization=utilization,
        error=error,
        autoscaling_params=autoscaling_params,
        current_instances=current_instances,
        marathon_service_config=marathon_service_config,
        num_healthy_instances=len(marathon_tasks),
    )

    safe_downscaling_threshold = int(current_instances * 0.7)
    if new_instance_count != current_instances:
        if new_instance_count < current_instances and task_data_insufficient:
            # Scaling down on incomplete task data is dangerous; returning
            # here also skips the gauge emission below.
            write_to_log(
                config=marathon_service_config,
                line='Delaying scaling *down* as we found too few healthy tasks running in marathon. '
                     'This can happen because tasks are delayed/waiting/unhealthy or because we are '
                     'waiting for tasks to be killed. Will wait for sufficient healthy tasks before '
                     'we make a decision to scale down.',
            )
            return
        if new_instance_count == safe_downscaling_threshold:
            write_to_log(
                config=marathon_service_config,
                line='Autoscaler clamped: %s' % str(log_utilization_data),
                level='debug',
            )
        write_to_log(
            config=marathon_service_config,
            line='Scaling from %d to %d instances (%s)' % (
                current_instances,
                new_instance_count,
                humanize_error(error),
            ),
        )
        set_instances_for_marathon_service(
            service=marathon_service_config.service,
            instance=marathon_service_config.instance,
            instance_count=new_instance_count,
        )
    else:
        write_to_log(
            config=marathon_service_config,
            line='Staying at %d instances (%s)' % (current_instances, humanize_error(error)),
            level='debug',
        )

    meteorite_dims = {
        'service_name': marathon_service_config.service,
        'decision_policy': autoscaling_params[DECISION_POLICY_KEY],
        'paasta_cluster': marathon_service_config.cluster,
        'instance_name': marathon_service_config.instance,
    }
    # yelp_meteorite is an optional dependency; it is falsy when the import
    # failed elsewhere in this module.
    if yelp_meteorite:
        gauge = yelp_meteorite.create_gauge('paasta.service.instances', meteorite_dims)
        gauge.set(new_instance_count)
        gauge = yelp_meteorite.create_gauge('paasta.service.max_instances', meteorite_dims)
        gauge.set(marathon_service_config.get_max_instances())
        gauge = yelp_meteorite.create_gauge('paasta.service.min_instances', meteorite_dims)
        gauge.set(marathon_service_config.get_min_instances())
def autoscale_marathon_instance(marathon_service_config, marathon_tasks, mesos_tasks):
    """Autoscale one marathon service instance, tolerating small task drift.

    The running task count may deviate from the configured count by up to
    MAX_TASK_DELTA before we refuse to act. When the deviation is too large
    we either bump the service up to min_instances (if it is below it) or
    simply defer. Downscaling within a single pass is additionally limited
    to 30% of the current count.
    """
    current_instances = marathon_service_config.get_instances()
    running = len(marathon_tasks)
    too_many_instances_running = running > int((1 + MAX_TASK_DELTA) * current_instances)
    too_few_instances_running = running < int((1 - MAX_TASK_DELTA) * current_instances)

    if too_many_instances_running or too_few_instances_running:
        min_instances = marathon_service_config.get_min_instances()
        if current_instances < min_instances:
            # Below the floor: force the count up regardless of task drift.
            write_to_log(
                config=marathon_service_config,
                line='Scaling from %d to %d instances because we are below min_instances' % (
                    current_instances, min_instances),
            )
            set_instances_for_marathon_service(
                service=marathon_service_config.service,
                instance=marathon_service_config.instance,
                instance_count=min_instances,
            )
        else:
            write_to_log(
                config=marathon_service_config,
                line='Delaying scaling as we found too many or too few tasks running in marathon. '
                     'This can happen because tasks are delayed/waiting/unhealthy or because we are '
                     'waiting for tasks to be killed.',
            )
        return

    autoscaling_params = marathon_service_config.get_autoscaling_params()
    metrics_provider = get_service_metrics_provider(
        autoscaling_params.pop(SERVICE_METRICS_PROVIDER_KEY))
    decision_policy = get_decision_policy(autoscaling_params.pop(DECISION_POLICY_KEY))

    # Populated by the provider; surfaced only in the "clamped" debug line.
    log_utilization_data = {}
    utilization = metrics_provider(
        marathon_service_config=marathon_service_config,
        marathon_tasks=marathon_tasks,
        mesos_tasks=mesos_tasks,
        log_utilization_data=log_utilization_data,
        **autoscaling_params,
    )
    error = get_error_from_utilization(
        utilization=utilization,
        setpoint=autoscaling_params.pop('setpoint'),
        current_instances=current_instances,
    )
    zookeeper_path = compose_autoscaling_zookeeper_root(
        service=marathon_service_config.service,
        instance=marathon_service_config.instance,
    )
    delta = decision_policy(
        error=error,
        min_instances=marathon_service_config.get_min_instances(),
        max_instances=marathon_service_config.get_max_instances(),
        current_instances=current_instances,
        zookeeper_path=zookeeper_path,
        **autoscaling_params,
    )

    # Limit downscaling by 30% of current_instances until we find out what is
    # going on in such situations
    safe_downscaling_threshold = int(current_instances * 0.7)
    new_instance_count = max(current_instances + delta, safe_downscaling_threshold)
    new_instance_count = marathon_service_config.limit_instance_count(new_instance_count)

    if new_instance_count == current_instances:
        write_to_log(
            config=marathon_service_config,
            line='Staying at %d instances (%s)' % (current_instances, humanize_error(error)),
            level='debug',
        )
        return

    if new_instance_count == safe_downscaling_threshold:
        write_to_log(
            config=marathon_service_config,
            line='Autoscaler clamped: %s' % str(log_utilization_data),
            level='debug',
        )
    write_to_log(
        config=marathon_service_config,
        line='Scaling from %d to %d instances (%s)' % (
            current_instances, new_instance_count, humanize_error(error)),
    )
    set_instances_for_marathon_service(
        service=marathon_service_config.service,
        instance=marathon_service_config.instance,
        instance_count=new_instance_count,
    )
def autoscale_marathon_instance(marathon_service_config, marathon_tasks, mesos_tasks):
    """Autoscale a marathon service instance from measured utilization.

    Derives a new instance count via get_new_instance_count and applies it,
    except when the move is a scale *down* and healthy-task data is
    insufficient — then the decision is deferred to a later pass.
    """
    current_instances = marathon_service_config.get_instances()
    task_data_insufficient = is_task_data_insufficient(
        marathon_service_config, marathon_tasks, current_instances)

    autoscaling_params = marathon_service_config.get_autoscaling_params()
    # Filled by the metrics provider; only echoed in the "clamped" log line.
    log_utilization_data = {}
    utilization = get_utilization(
        marathon_service_config=marathon_service_config,
        autoscaling_params=autoscaling_params,
        log_utilization_data=log_utilization_data,
        marathon_tasks=marathon_tasks,
        mesos_tasks=mesos_tasks,
    )
    error = get_error_from_utilization(
        utilization=utilization,
        setpoint=autoscaling_params['setpoint'],
        current_instances=current_instances,
    )
    new_instance_count = get_new_instance_count(
        utilization=utilization,
        error=error,
        autoscaling_params=autoscaling_params,
        current_instances=current_instances,
        marathon_service_config=marathon_service_config,
        num_healthy_instances=len(marathon_tasks),
    )

    safe_downscaling_threshold = int(current_instances * 0.7)
    if new_instance_count == current_instances:
        write_to_log(
            config=marathon_service_config,
            line='Staying at %d instances (%s)' % (current_instances, humanize_error(error)),
            level='debug',
        )
        return

    if new_instance_count < current_instances and task_data_insufficient:
        # Never scale down while we can't see enough healthy tasks.
        write_to_log(
            config=marathon_service_config,
            line='Delaying scaling *down* as we found too few healthy tasks running in marathon. '
                 'This can happen because tasks are delayed/waiting/unhealthy or because we are '
                 'waiting for tasks to be killed. Will wait for sufficient healthy tasks before '
                 'we make a decision to scale down.',
        )
        return
    if new_instance_count == safe_downscaling_threshold:
        write_to_log(
            config=marathon_service_config,
            line='Autoscaler clamped: %s' % str(log_utilization_data),
            level='debug',
        )
    write_to_log(
        config=marathon_service_config,
        line='Scaling from %d to %d instances (%s)' % (
            current_instances, new_instance_count, humanize_error(error)),
    )
    set_instances_for_marathon_service(
        service=marathon_service_config.service,
        instance=marathon_service_config.instance,
        instance_count=new_instance_count,
    )
def autoscale_marathon_instance(
    marathon_service_config: MarathonServiceConfig,
    system_paasta_config: SystemPaastaConfig,
    marathon_tasks: Sequence[MarathonTask],
    mesos_tasks: Sequence[Task],
) -> None:
    """Run one locked autoscaling pass for a marathon service instance.

    Acquires the per-instance autoscaling lock, measures utilization,
    computes the target instance count, records the decision, and applies
    it — unless scaling *down* while healthy-task data is insufficient, in
    which case the decision is deferred. A pass whose lock is already held
    elsewhere is skipped with a warning.
    """
    try:
        with create_autoscaling_lock(marathon_service_config.service,
                                     marathon_service_config.instance):
            current_instances = marathon_service_config.get_instances()
            task_data_insufficient = is_task_data_insufficient(
                marathon_service_config=marathon_service_config,
                marathon_tasks=marathon_tasks,
                current_instances=current_instances,
            )
            autoscaling_params = marathon_service_config.get_autoscaling_params()
            # Populated by the metrics provider; consumed by the decision
            # recorder below.
            log_utilization_data: Mapping = {}
            utilization = get_utilization(
                marathon_service_config=marathon_service_config,
                system_paasta_config=system_paasta_config,
                autoscaling_params=autoscaling_params,
                log_utilization_data=log_utilization_data,
                marathon_tasks=marathon_tasks,
                mesos_tasks=mesos_tasks,
            )
            error = get_error_from_utilization(
                utilization=utilization,
                setpoint=autoscaling_params["setpoint"],
                current_instances=current_instances,
            )
            num_healthy_instances = len(marathon_tasks)
            new_instance_count = get_new_instance_count(
                utilization=utilization,
                error=error,
                autoscaling_params=autoscaling_params,
                current_instances=current_instances,
                marathon_service_config=marathon_service_config,
                num_healthy_instances=num_healthy_instances,
                # Don't persist decision state computed from incomplete data.
                persist_data=(not task_data_insufficient),
            )
            safe_downscaling_threshold = int(current_instances * 0.7)
            _record_autoscaling_decision(
                marathon_service_config=marathon_service_config,
                autoscaling_params=autoscaling_params,
                utilization=utilization,
                log_utilization_data=log_utilization_data,
                error=error,
                current_instances=current_instances,
                num_healthy_instances=num_healthy_instances,
                new_instance_count=new_instance_count,
                safe_downscaling_threshold=safe_downscaling_threshold,
                task_data_insufficient=task_data_insufficient,
            )
            if new_instance_count != current_instances:
                if new_instance_count < current_instances and task_data_insufficient:
                    # Scaling down on incomplete task data is unsafe; wait
                    # for a pass with enough healthy tasks.
                    write_to_log(
                        config=marathon_service_config,
                        line="Delaying scaling *down* as we found too few healthy tasks running in marathon. "
                        "This can happen because tasks are delayed/waiting/unhealthy or because we are "
                        "waiting for tasks to be killed. Will wait for sufficient healthy tasks before "
                        "we make a decision to scale down.",
                        level="debug",
                    )
                    return
                else:
                    set_instances_for_marathon_service(
                        service=marathon_service_config.service,
                        instance=marathon_service_config.instance,
                        instance_count=new_instance_count,
                    )
                    write_to_log(
                        config=marathon_service_config,
                        line="Scaling from %d to %d instances (%s)" % (
                            current_instances,
                            new_instance_count,
                            humanize_error(error),
                        ),
                        level="event",
                    )
    except LockHeldException:
        # Another autoscaler run owns the lock; this pass is a no-op.
        log.warning(
            "Skipping autoscaling run for {service}.{instance} because the lock is held".format(
                service=marathon_service_config.service,
                instance=marathon_service_config.instance,
            )
        )