def test_get_resource_utilization_by_grouping(
    mock_get_all_tasks_from_state,
    mock_calculate_resource_utilization_for_slaves,
    mock_group_slaves_by_key_func,
):
    """get_resource_utilization_by_grouping should return a free/total
    ResourceInfo pair for every group produced by the grouping function."""
    mock_group_slaves_by_key_func.return_value = {
        'somenametest-habitat': [{
            'id': 'abcd',
            'hostname': 'test.somewhere.www'
        }],
        'somenametest-habitat-2': [{
            'id': 'abcd',
            'hostname': 'test2.somewhere.www'
        }]
    }
    mock_calculate_resource_utilization_for_slaves.return_value = {
        'free': paasta_metastatus.ResourceInfo(cpus=10, mem=10, disk=10),
        'total': paasta_metastatus.ResourceInfo(cpus=20, mem=20, disk=20)
    }
    # BUG FIX: the original *called* the mock (a no-op) instead of
    # configuring its return value, so the code under test received a bare
    # Mock rather than the intended task list.
    mock_get_all_tasks_from_state.return_value = [Mock(), Mock()]
    state = {'frameworks': Mock(), 'slaves': [{}]}
    actual = paasta_metastatus.get_resource_utilization_by_grouping(
        grouping_func=lambda slave: slave['attributes']['habitat'],
        mesos_state=state,
    )
    assert sorted(actual.keys()) == sorted(
        ['somenametest-habitat', 'somenametest-habitat-2'])
    # Every group should carry the mocked free/total ResourceInfo pair.
    # (Removed a stray Python-2-only `print v` debug statement.)
    for k, v in actual.items():
        assert v['total'] == paasta_metastatus.ResourceInfo(
            cpus=20, disk=20, mem=20)
        assert v['free'] == paasta_metastatus.ResourceInfo(
            cpus=10, disk=10, mem=10)
def spotfleet_metrics_provider(spotfleet_request_id, resource, pool_settings):
    """Compute the current and target capacity for a spot fleet request.

    :param spotfleet_request_id: AWS spot fleet request id to scale
    :param resource: cluster autoscaling resource dict (uses 'region', 'pool')
    :param pool_settings: per-pool settings; may carry 'target_utilization'
    :returns: (current, target) capacity tuple; (0, 0) if the SFR is missing,
        inactive, or has no active instances yet
    :raises ClusterAutoscalingError: if too many slaves are missing from mesos
    """
    mesos_state = get_mesos_master().state
    sfr = get_sfr(spotfleet_request_id, region=resource['region'])
    if not sfr or not sfr['SpotFleetRequestState'] == 'active':
        log.error(
            "Ignoring SFR {0} that does not exist or is not active.".format(
                spotfleet_request_id))
        return 0, 0
    sfr['ActiveInstances'] = get_spot_fleet_instances(
        spotfleet_request_id, region=resource['region'])
    resource['sfr'] = sfr
    desired_instances = len(sfr['ActiveInstances'])
    # BUG FIX: guard against ZeroDivisionError when the SFR has no active
    # instances yet (e.g. right after the request was created).
    if desired_instances == 0:
        log.error("SFR {0} has no active instances; not scaling.".format(
            spotfleet_request_id))
        return 0, 0
    instance_ips = get_sfr_instance_ips(sfr, region=resource['region'])
    # Only count mesos slaves that belong to this SFR *and* to this pool.
    slaves = {
        slave['id']: slave for slave in mesos_state.get('slaves', [])
        if slave_pid_to_ip(slave['pid']) in instance_ips
        and slave['attributes'].get('pool', 'default') == resource['pool']
    }
    current_instances = len(slaves)
    log.info("Found %.2f%% slaves registered in mesos for this SFR (%d/%d)" % (
        float(current_instances) / float(desired_instances) * 100,
        current_instances, desired_instances))
    if float(current_instances) / desired_instances < (
            1.00 - MISSING_SLAVE_PANIC_THRESHOLD):
        error_message = (
            "We currently have %d instances active in mesos out of a desired %d.\n"
            "Refusing to scale because we either need to wait for the requests to be "
            "filled, or the new instances are not healthy for some reason.\n"
            "(cowardly refusing to go past %.2f%% missing instances)") % (
            current_instances, desired_instances, MISSING_SLAVE_PANIC_THRESHOLD)
        raise ClusterAutoscalingError(error_message)
    pool_utilization_dict = get_resource_utilization_by_grouping(
        lambda slave: slave['attributes']['pool'],
        mesos_state)[resource['pool']]
    log.debug(pool_utilization_dict)
    free_pool_resources = pool_utilization_dict['free']
    total_pool_resources = pool_utilization_dict['total']
    # Utilization is governed by the scarcest resource dimension.
    utilization = 1.0 - min([
        float(pair[0]) / float(pair[1])
        for pair in zip(free_pool_resources, total_pool_resources)
    ])
    target_utilization = pool_settings.get(
        'target_utilization', DEFAULT_TARGET_UTILIZATION)
    error = utilization - target_utilization
    current, target = get_spot_fleet_delta(resource, error)
    return current, target
def spotfleet_metrics_provider(spotfleet_request_id, mesos_state, pool):
    """Return the utilization of the given pool backed by a spot fleet.

    :param spotfleet_request_id: AWS spot fleet request id
    :param mesos_state: mesos master state dict
    :param pool: name of the slave pool to measure
    :returns: 1.0 minus the fraction of the scarcest free resource, i.e. the
        utilization of the most-used resource dimension
    :raises ClusterAutoscalingError: if too many slaves are missing from mesos
        or no instances are active
    """
    def slave_pid_to_ip(slave_pid):
        # pids look like 'slave(1)@10.0.0.1:5051'; extract the IP portion.
        regex = re.compile(r'.+?@([\d\.]+):\d+')
        return regex.match(slave_pid).group(1)

    ec2_client = boto3.client('ec2')
    spot_fleet_instances = ec2_client.describe_spot_fleet_instances(
        SpotFleetRequestId=spotfleet_request_id)['ActiveInstances']
    desired_instances = len(spot_fleet_instances)
    # BUG FIX: guard against ZeroDivisionError when no instances are active.
    if desired_instances == 0:
        raise ClusterAutoscalingError(
            "No instances are active, not scaling until the instances are launched")
    instance_ips = {
        instance['PrivateIpAddress']
        for reservation in ec2_client.describe_instances(InstanceIds=[
            instance['InstanceId'] for instance in spot_fleet_instances
        ])['Reservations']
        for instance in reservation['Instances']
    }
    # Only count mesos slaves that belong to this SFR *and* to this pool.
    slaves = {
        slave['id']: slave for slave in mesos_state.get('slaves', [])
        if slave_pid_to_ip(slave['pid']) in instance_ips
        and slave['attributes'].get('pool', 'default') == pool
    }
    current_instances = len(slaves)
    # BUG FIX: '%.2%%' is a broken printf conversion spec, and
    # current_instances / desired_instances was integer division (always
    # 0 or 100%%); use '%.2f%%' with an explicit float numerator.
    log.info("Found %.2f%% slaves registered in mesos for this SFR (%d/%d)" % (
        float(current_instances) / desired_instances * 100,
        current_instances, desired_instances))
    if float(current_instances) / desired_instances < (
            1.00 - MISSING_SLAVE_PANIC_THRESHOLD):
        error_message = (
            "We currently have %d instances active in mesos out of a desired %d.\n"
            "Refusing to scale because we either need to wait for the requests to be "
            "filled, or the new instances are not healthy for some reason.\n"
            "(cowardly refusing to go past %.2f%% missing instances)") % (
            current_instances, desired_instances, MISSING_SLAVE_PANIC_THRESHOLD)
        raise ClusterAutoscalingError(error_message)
    pool_utilization_dict = get_resource_utilization_by_grouping(
        lambda slave: slave['attributes']['pool'], mesos_state)[pool]
    free_pool_resources = pool_utilization_dict['free']
    total_pool_resources = pool_utilization_dict['total']
    # Utilization is governed by the scarcest resource dimension.
    utilization = 1.0 - min([
        float(free_pool_resources[resource]) / total_pool_resources[resource]
        for resource in free_pool_resources
    ])
    return utilization
def spotfleet_metrics_provider(spotfleet_request_id, mesos_state, pool):
    """Return the utilization of the given pool backed by a spot fleet.

    :param spotfleet_request_id: AWS spot fleet request id
    :param mesos_state: mesos master state dict
    :param pool: name of the slave pool to measure
    :returns: 1.0 minus the fraction of the scarcest free resource, i.e. the
        utilization of the most-used resource dimension
    :raises ClusterAutoscalingError: if too many slaves are missing from mesos
        or no instances are active
    """
    def slave_pid_to_ip(slave_pid):
        # pids look like 'slave(1)@10.0.0.1:5051'; extract the IP portion.
        regex = re.compile(r'.+?@([\d\.]+):\d+')
        return regex.match(slave_pid).group(1)

    ec2_client = boto3.client('ec2')
    spot_fleet_instances = ec2_client.describe_spot_fleet_instances(
        SpotFleetRequestId=spotfleet_request_id)['ActiveInstances']
    desired_instances = len(spot_fleet_instances)
    # BUG FIX: guard against ZeroDivisionError when no instances are active.
    if desired_instances == 0:
        raise ClusterAutoscalingError(
            "No instances are active, not scaling until the instances are launched")
    instance_ips = {
        instance['PrivateIpAddress']
        for reservation in ec2_client.describe_instances(
            InstanceIds=[instance['InstanceId']
                         for instance in spot_fleet_instances])['Reservations']
        for instance in reservation['Instances']
    }
    # Only count mesos slaves that belong to this SFR *and* to this pool.
    slaves = {
        slave['id']: slave for slave in mesos_state.get('slaves', [])
        if slave_pid_to_ip(slave['pid']) in instance_ips
        and slave['attributes'].get('pool', 'default') == pool
    }
    current_instances = len(slaves)
    # BUG FIX: '%.2%%' is a broken printf conversion spec, and
    # current_instances / desired_instances was integer division (always
    # 0 or 100%%); use '%.2f%%' with an explicit float numerator.
    log.info("Found %.2f%% slaves registered in mesos for this SFR (%d/%d)" % (
        float(current_instances) / desired_instances * 100,
        current_instances, desired_instances))
    if float(current_instances) / desired_instances < (
            1.00 - MISSING_SLAVE_PANIC_THRESHOLD):
        error_message = (
            "We currently have %d instances active in mesos out of a desired %d.\n"
            "Refusing to scale because we either need to wait for the requests to be "
            "filled, or the new instances are not healthy for some reason.\n"
            "(cowardly refusing to go past %.2f%% missing instances)") % (
            current_instances, desired_instances, MISSING_SLAVE_PANIC_THRESHOLD)
        raise ClusterAutoscalingError(error_message)
    pool_utilization_dict = get_resource_utilization_by_grouping(
        lambda slave: slave['attributes']['pool'], mesos_state
    )[pool]
    log.debug(pool_utilization_dict)
    free_pool_resources = pool_utilization_dict['free']
    total_pool_resources = pool_utilization_dict['total']
    # Utilization is governed by the scarcest resource dimension.
    utilization = 1.0 - min([
        float(pair[0]) / float(pair[1])
        for pair in zip(free_pool_resources, total_pool_resources)
    ])
    return utilization
def get_mesos_utilization_error(spotfleet_request_id, resource, pool_settings,
                                slaves, mesos_state, desired_instances=None):
    """Return how far the resource's pool is from its target utilization.

    A positive result means the pool is busier than its target (scale up);
    a negative result means it is idler than its target (scale down).

    :param spotfleet_request_id: AWS spot fleet request id (unused here)
    :param resource: cluster autoscaling resource dict (uses 'pool')
    :param pool_settings: per-pool settings; may carry 'target_utilization'
    :param slaves: mapping of mesos slaves known to belong to this resource
    :param mesos_state: mesos master state dict
    :param desired_instances: expected instance count, or None to skip the
        missing-slave sanity checks
    :returns: utilization minus target utilization
    :raises ClusterAutoscalingError: if no instances are active, or too many
        slaves are missing from mesos
    """
    num_registered = len(slaves)
    if desired_instances == 0:
        raise ClusterAutoscalingError(
            "No instances are active, not scaling until the instances are launched")
    if desired_instances:
        registered_fraction = float(num_registered) / float(desired_instances)
        log.info(
            "Found %.2f%% slaves registered in mesos for this resource (%d/%d)"
            % (registered_fraction * 100, num_registered, desired_instances))
        # Refuse to act on a view of the cluster that is missing too many
        # of the instances we expect to see.
        if registered_fraction < 1.00 - MISSING_SLAVE_PANIC_THRESHOLD:
            raise ClusterAutoscalingError(
                "We currently have %d instances active in mesos out of a desired %d.\n"
                "Refusing to scale because we either need to wait for the requests to be "
                "filled, or the new instances are not healthy for some reason.\n"
                "(cowardly refusing to go past %.2f%% missing instances)" % (
                    num_registered, desired_instances,
                    MISSING_SLAVE_PANIC_THRESHOLD))
    utilization_by_pool = get_resource_utilization_by_grouping(
        lambda slave: slave['attributes']['pool'], mesos_state)
    pool_utilization = utilization_by_pool[resource['pool']]
    log.debug(pool_utilization)
    # Utilization is governed by the scarcest resource dimension.
    scarcest_free_fraction = min(
        float(free) / float(total)
        for free, total in zip(pool_utilization['free'],
                               pool_utilization['total']))
    utilization = 1.0 - scarcest_free_fraction
    target = pool_settings.get('target_utilization',
                               DEFAULT_TARGET_UTILIZATION)
    return utilization - target
def test_get_resource_utilization_by_grouping(
    mock_get_all_tasks_from_state,
    mock_calculate_resource_utilization_for_slaves,
    mock_group_slaves_by_key_func,
):
    """get_resource_utilization_by_grouping should return a free/total
    ResourceInfo pair for every group produced by the grouping function."""
    mock_group_slaves_by_key_func.return_value = {
        'somenametest-habitat': [{
            'id': 'abcd',
            'hostname': 'test.somewhere.www'
        }],
        'somenametest-habitat-2': [{
            'id': 'abcd',
            'hostname': 'test2.somewhere.www'
        }]
    }
    mock_calculate_resource_utilization_for_slaves.return_value = {
        'free': paasta_metastatus.ResourceInfo(cpus=10, mem=10, disk=10),
        'total': paasta_metastatus.ResourceInfo(cpus=20, mem=20, disk=20)
    }
    # BUG FIX: the original *called* the mock (a no-op) instead of
    # configuring its return value, so the code under test received a bare
    # Mock rather than the intended task list.
    mock_get_all_tasks_from_state.return_value = [Mock(), Mock()]
    state = {
        'frameworks': Mock(),
        'slaves': [{}]
    }
    actual = paasta_metastatus.get_resource_utilization_by_grouping(
        grouping_func=lambda slave: slave['attributes']['habitat'],
        mesos_state=state,
    )
    assert sorted(actual.keys()) == sorted(['somenametest-habitat', 'somenametest-habitat-2'])
    # Every group should carry the mocked free/total ResourceInfo pair.
    # (Removed a stray Python-2-only `print v` debug statement.)
    for k, v in actual.items():
        assert v['total'] == paasta_metastatus.ResourceInfo(
            cpus=20,
            disk=20,
            mem=20
        )
        assert v['free'] == paasta_metastatus.ResourceInfo(
            cpus=10,
            disk=10,
            mem=10
        )
def spotfleet_metrics_provider(spotfleet_request_id, mesos_state, resource):
    """Compute the current and target capacity for a spot fleet request.

    :param spotfleet_request_id: AWS spot fleet request id to scale
    :param mesos_state: mesos master state dict
    :param resource: cluster autoscaling resource dict (uses 'pool')
    :returns: (current, target) capacity tuple; (0, 0) if the SFR is missing,
        inactive, or has no active instances yet
    :raises ClusterAutoscalingError: if too many slaves are missing from mesos
    """
    sfr = get_sfr(spotfleet_request_id)
    # BUG FIX: check the SFR exists and is active *before* subscripting or
    # mutating it; the original crashed with a TypeError when get_sfr
    # returned None for a nonexistent request.
    if not sfr or not sfr['SpotFleetRequestState'] == 'active':
        log.error("Ignoring SFR {0} that is not yet active or is cancelled etc.".format(spotfleet_request_id))
        return 0, 0
    sfr['ActiveInstances'] = get_spot_fleet_instances(spotfleet_request_id)
    resource['sfr'] = sfr
    desired_instances = len(sfr['ActiveInstances'])
    # BUG FIX: guard against ZeroDivisionError when the SFR has no active
    # instances yet (e.g. right after the request was created).
    if desired_instances == 0:
        log.error("SFR {0} has no active instances; not scaling.".format(
            spotfleet_request_id))
        return 0, 0
    instance_ips = get_sfr_instance_ips(sfr)
    # Only count mesos slaves that belong to this SFR *and* to this pool.
    slaves = {
        slave['id']: slave for slave in mesos_state.get('slaves', [])
        if slave_pid_to_ip(slave['pid']) in instance_ips
        and slave['attributes'].get('pool', 'default') == resource['pool']
    }
    current_instances = len(slaves)
    log.info("Found %.2f%% slaves registered in mesos for this SFR (%d/%d)" % (
        float(current_instances) / float(desired_instances) * 100,
        current_instances, desired_instances))
    if float(current_instances) / desired_instances < (1.00 - MISSING_SLAVE_PANIC_THRESHOLD):
        error_message = ("We currently have %d instances active in mesos out of a desired %d.\n"
                         "Refusing to scale because we either need to wait for the requests to be "
                         "filled, or the new instances are not healthy for some reason.\n"
                         "(cowardly refusing to go past %.2f%% missing instances)") % (
            current_instances, desired_instances, MISSING_SLAVE_PANIC_THRESHOLD)
        raise ClusterAutoscalingError(error_message)
    pool_utilization_dict = get_resource_utilization_by_grouping(
        lambda slave: slave['attributes']['pool'], mesos_state
    )[resource['pool']]
    log.debug(pool_utilization_dict)
    free_pool_resources = pool_utilization_dict['free']
    total_pool_resources = pool_utilization_dict['total']
    # Utilization is governed by the scarcest resource dimension.
    utilization = 1.0 - min([
        float(pair[0]) / float(pair[1])
        for pair in zip(free_pool_resources, total_pool_resources)
    ])
    error = utilization - CLUSTER_TARGET_UTILIZATION
    current, target = get_spot_fleet_delta(resource, error)
    return current, target