Example #1
0
def test_get_resource_utilization_by_grouping(
    mock_get_all_tasks_from_state,
    mock_calculate_resource_utilization_for_slaves,
    mock_group_slaves_by_key_func,
):
    mock_group_slaves_by_key_func.return_value = {
        'somenametest-habitat': [{
            'id': 'abcd',
            'hostname': 'test.somewhere.www'
        }],
        'somenametest-habitat-2': [{
            'id': 'abcd',
            'hostname': 'test2.somewhere.www'
        }]
    }
    mock_calculate_resource_utilization_for_slaves.return_value = {
        'free': paasta_metastatus.ResourceInfo(cpus=10, mem=10, disk=10),
        'total': paasta_metastatus.ResourceInfo(cpus=20, mem=20, disk=20)
    }
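    # Both habitat groupings reuse the same mocked free/total figures, so each
    # group in the result is expected to report identical ResourceInfo values.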
    mock_get_all_tasks_from_state.return_value = [Mock(), Mock()]
    state = {'frameworks': Mock(), 'slaves': [{}]}
    actual = paasta_metastatus.get_resource_utilization_by_grouping(
        grouping_func=lambda slave: slave['attributes']['habitat'],
        mesos_state=state,
    )
    assert sorted(actual.keys()) == sorted(
        ['somenametest-habitat', 'somenametest-habitat-2'])
    for k, v in actual.items():
        print(v)
        assert v['total'] == paasta_metastatus.ResourceInfo(cpus=20,
                                                            disk=20,
                                                            mem=20)
        assert v['free'] == paasta_metastatus.ResourceInfo(cpus=10,
                                                           disk=10,
                                                           mem=10)
Example #2
0
def spotfleet_metrics_provider(spotfleet_request_id, resource, pool_settings):
    mesos_state = get_mesos_master().state
    sfr = get_sfr(spotfleet_request_id, region=resource['region'])
    if not sfr or sfr['SpotFleetRequestState'] != 'active':
        log.error(
            "Ignoring SFR {0} that does not exist or is not active.".format(
                spotfleet_request_id))
        return 0, 0
    sfr['ActiveInstances'] = get_spot_fleet_instances(
        spotfleet_request_id, region=resource['region'])
    resource['sfr'] = sfr
    desired_instances = len(sfr['ActiveInstances'])
    instance_ips = get_sfr_instance_ips(sfr, region=resource['region'])
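    # Keep only Mesos slaves whose IP belongs to this SFR and whose pool
    # attribute matches the resource's pool.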
    slaves = {
        slave['id']: slave
        for slave in mesos_state.get('slaves', [])
        if slave_pid_to_ip(slave['pid']) in instance_ips
        and slave['attributes'].get('pool', 'default') == resource['pool']
    }
    current_instances = len(slaves)
    log.info("Found %.2f%% slaves registered in mesos for this SFR (%d/%d)" %
             (float(float(current_instances) / float(desired_instances)) * 100,
              current_instances, desired_instances))
    if float(current_instances) / desired_instances < (
            1.00 - MISSING_SLAVE_PANIC_THRESHOLD):
        error_message = (
            "We currently have %d instances active in mesos out of a desired %d.\n"
            "Refusing to scale because we either need to wait for the requests to be "
            "filled, or the new instances are not healthy for some reason.\n"
            "(cowardly refusing to go past %.2f%% missing instances)") % (
                current_instances, desired_instances,
                MISSING_SLAVE_PANIC_THRESHOLD)
        raise ClusterAutoscalingError(error_message)

    pool_utilization_dict = get_resource_utilization_by_grouping(
        lambda slave: slave['attributes']['pool'],
        mesos_state)[resource['pool']]

    log.debug(pool_utilization_dict)
    free_pool_resources = pool_utilization_dict['free']
    total_pool_resources = pool_utilization_dict['total']
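    # Utilization is driven by the most constrained resource:
    # 1 - min(free/total) across cpus, mem and disk.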
    utilization = 1.0 - min(
        float(free) / total
        for free, total in zip(free_pool_resources, total_pool_resources)
    )
    target_utilization = pool_settings.get('target_utilization',
                                           DEFAULT_TARGET_UTILIZATION)
    error = utilization - target_utilization
    current, target = get_spot_fleet_delta(resource, error)
    return current, target
Example #3
0
def spotfleet_metrics_provider(spotfleet_request_id, mesos_state, pool):
    def slave_pid_to_ip(slave_pid):
        regex = re.compile(r'.+?@([\d\.]+):\d+')
        return regex.match(slave_pid).group(1)

    ec2_client = boto3.client('ec2')
    spot_fleet_instances = ec2_client.describe_spot_fleet_instances(
        SpotFleetRequestId=spotfleet_request_id)['ActiveInstances']
    desired_instances = len(spot_fleet_instances)
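    # Resolve the fleet's instance IDs to private IPs so they can be matched
    # against the IPs embedded in Mesos slave PIDs.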
    instance_ips = {
        instance['PrivateIpAddress']
        for reservation in ec2_client.describe_instances(InstanceIds=[
            instance['InstanceId'] for instance in spot_fleet_instances
        ])['Reservations'] for instance in reservation['Instances']
    }
    slaves = {
        slave['id']: slave
        for slave in mesos_state.get('slaves', [])
        if slave_pid_to_ip(slave['pid']) in instance_ips
        and slave['attributes'].get('pool', 'default') == pool
    }
    current_instances = len(slaves)
    log.info("Found %.2%% slaves registered in mesos for this SFR (%d/%d)" %
             ((current_instances / desired_instances) * 100, current_instances,
              desired_instances))
    if float(current_instances) / desired_instances < (
            1.00 - MISSING_SLAVE_PANIC_THRESHOLD):
        error_message = (
            "We currently have %d instances active in mesos out of a desired %d.\n"
            "Refusing to scale because we either need to wait for the requests to be "
            "filled, or the new instances are not healthy for some reason.\n"
            "(cowardly refusing to go past %.2f%% missing instances)") % (
                current_instances, desired_instances,
                MISSING_SLAVE_PANIC_THRESHOLD)
        raise ClusterAutoscalingError(error_message)

    pool_utilization_dict = get_resource_utilization_by_grouping(
        lambda slave: slave['attributes']['pool'], mesos_state)[pool]

    free_pool_resources = pool_utilization_dict['free']
    total_pool_resources = pool_utilization_dict['total']
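    # The pool's utilization is taken from its most constrained resource type.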
    utilization = 1.0 - min([
        float(free_pool_resources[resource]) / total_pool_resources[resource]
        for resource in free_pool_resources
    ])
    return utilization
Example #4
0
def spotfleet_metrics_provider(spotfleet_request_id, mesos_state, pool):
    def slave_pid_to_ip(slave_pid):
        regex = re.compile(r'.+?@([\d\.]+):\d+')
        return regex.match(slave_pid).group(1)

    ec2_client = boto3.client('ec2')
    spot_fleet_instances = ec2_client.describe_spot_fleet_instances(
        SpotFleetRequestId=spotfleet_request_id)['ActiveInstances']
    desired_instances = len(spot_fleet_instances)
    instance_ips = {
        instance['PrivateIpAddress']
        for reservation in ec2_client.describe_instances(
            InstanceIds=[instance['InstanceId'] for instance in spot_fleet_instances])['Reservations']
        for instance in reservation['Instances']
    }
    slaves = {
        slave['id']: slave for slave in mesos_state.get('slaves', [])
        if slave_pid_to_ip(slave['pid']) in instance_ips and
        slave['attributes'].get('pool', 'default') == pool
    }
    current_instances = len(slaves)
    log.info("Found %.2%% slaves registered in mesos for this SFR (%d/%d)" % (
             (current_instances / desired_instances) * 100, current_instances, desired_instances))
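    # Refuse to scale while too many of the requested instances are still
    # missing from Mesos.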
    if float(current_instances) / desired_instances < (1.00 - MISSING_SLAVE_PANIC_THRESHOLD):
        error_message = ("We currently have %d instances active in mesos out of a desired %d.\n"
                         "Refusing to scale because we either need to wait for the requests to be "
                         "filled, or the new instances are not healthy for some reason.\n"
                         "(cowardly refusing to go past %.2f%% missing instances)") % (
            current_instances, desired_instances, MISSING_SLAVE_PANIC_THRESHOLD)
        raise ClusterAutoscalingError(error_message)

    pool_utilization_dict = get_resource_utilization_by_grouping(
        lambda slave: slave['attributes']['pool'],
        mesos_state
    )[pool]

    log.debug(pool_utilization_dict)
    free_pool_resources = pool_utilization_dict['free']
    total_pool_resources = pool_utilization_dict['total']
    utilization = 1.0 - min(
        float(free) / total
        for free, total in zip(free_pool_resources, total_pool_resources)
    )
    return utilization
Example #5
0
def get_mesos_utilization_error(spotfleet_request_id,
                                resource,
                                pool_settings,
                                slaves,
                                mesos_state,
                                desired_instances=None):
    current_instances = len(slaves)
    if desired_instances == 0:
        error_message = (
            "No instances are active, not scaling until the instances are launched"
        )
        raise ClusterAutoscalingError(error_message)
    if desired_instances:
        log.info(
            "Found %.2f%% slaves registered in mesos for this resource (%d/%d)"
            % (float(current_instances) / desired_instances * 100,
               current_instances, desired_instances))
        if float(current_instances) / desired_instances < (
                1.00 - MISSING_SLAVE_PANIC_THRESHOLD):
            error_message = (
                "We currently have %d instances active in mesos out of a desired %d.\n"
                "Refusing to scale because we either need to wait for the requests to be "
                "filled, or the new instances are not healthy for some reason.\n"
                "(cowardly refusing to go past %.2f%% missing instances)") % (
                    current_instances, desired_instances,
                    MISSING_SLAVE_PANIC_THRESHOLD)
            raise ClusterAutoscalingError(error_message)

    pool_utilization_dict = get_resource_utilization_by_grouping(
        lambda slave: slave['attributes']['pool'],
        mesos_state)[resource['pool']]

    log.debug(pool_utilization_dict)
    free_pool_resources = pool_utilization_dict['free']
    total_pool_resources = pool_utilization_dict['total']
    utilization = 1.0 - min(
        float(free) / total
        for free, total in zip(free_pool_resources, total_pool_resources)
    )
    target_utilization = pool_settings.get('target_utilization',
                                           DEFAULT_TARGET_UTILIZATION)
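    # Positive when the pool is running hotter than the target utilization,
    # negative when it has spare capacity.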
    return utilization - target_utilization
Example #6
0
def test_get_resource_utilization_by_grouping(
        mock_get_all_tasks_from_state,
        mock_calculate_resource_utilization_for_slaves,
        mock_group_slaves_by_key_func,
):
    mock_group_slaves_by_key_func.return_value = {
        'somenametest-habitat': [{
            'id': 'abcd',
            'hostname': 'test.somewhere.www'
        }],
        'somenametest-habitat-2': [{
            'id': 'abcd',
            'hostname': 'test2.somewhere.www'
        }]
    }
    mock_calculate_resource_utilization_for_slaves.return_value = {
        'free': paasta_metastatus.ResourceInfo(cpus=10, mem=10, disk=10),
        'total': paasta_metastatus.ResourceInfo(cpus=20, mem=20, disk=20)
    }
    mock_get_all_tasks_from_state.return_value = [Mock(), Mock()]
    state = {
        'frameworks': Mock(),
        'slaves': [{}]
    }
    actual = paasta_metastatus.get_resource_utilization_by_grouping(
        grouping_func=lambda slave: slave['attributes']['habitat'],
        mesos_state=state,
    )
    assert sorted(actual.keys()) == sorted(['somenametest-habitat', 'somenametest-habitat-2'])
    for k, v in actual.items():
        print(v)
        assert v['total'] == paasta_metastatus.ResourceInfo(
            cpus=20,
            disk=20,
            mem=20
        )
        assert v['free'] == paasta_metastatus.ResourceInfo(
            cpus=10,
            disk=10,
            mem=10
        )
Example #7
0
def spotfleet_metrics_provider(spotfleet_request_id, mesos_state, resource):
    sfr = get_sfr(spotfleet_request_id)
    sfr['ActiveInstances'] = get_spot_fleet_instances(spotfleet_request_id)
    resource['sfr'] = sfr
    if sfr['SpotFleetRequestState'] != 'active':
        log.error("Ignoring SFR {0} because it is not yet active or has been cancelled.".format(spotfleet_request_id))
        return 0, 0
    desired_instances = len(sfr['ActiveInstances'])
    instance_ips = get_sfr_instance_ips(sfr)
    slaves = {
        slave['id']: slave for slave in mesos_state.get('slaves', [])
        if slave_pid_to_ip(slave['pid']) in instance_ips and
        slave['attributes'].get('pool', 'default') == resource['pool']
    }
    current_instances = len(slaves)
    log.info("Found %.2f%% slaves registered in mesos for this SFR (%d/%d)" % (
             float(float(current_instances) / float(desired_instances)) * 100, current_instances, desired_instances))
    if float(current_instances) / desired_instances < (1.00 - MISSING_SLAVE_PANIC_THRESHOLD):
        error_message = ("We currently have %d instances active in mesos out of a desired %d.\n"
                         "Refusing to scale because we either need to wait for the requests to be "
                         "filled, or the new instances are not healthy for some reason.\n"
                         "(cowardly refusing to go past %.2f%% missing instances)") % (
            current_instances, desired_instances, MISSING_SLAVE_PANIC_THRESHOLD)
        raise ClusterAutoscalingError(error_message)

    pool_utilization_dict = get_resource_utilization_by_grouping(
        lambda slave: slave['attributes']['pool'],
        mesos_state
    )[resource['pool']]

    log.debug(pool_utilization_dict)
    free_pool_resources = pool_utilization_dict['free']
    total_pool_resources = pool_utilization_dict['total']
    utilization = 1.0 - min(
        float(free) / total
        for free, total in zip(free_pool_resources, total_pool_resources)
    )
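    # The scaling error against the cluster-wide target drives how far
    # get_spot_fleet_delta resizes the fleet.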
    error = utilization - CLUSTER_TARGET_UTILIZATION
    current, target = get_spot_fleet_delta(resource, error)
    return current, target