def test_get_resource_utilization_by_grouping(
    mock_get_all_tasks_from_state,
    mock_calculate_resource_utilization_for_slaves,
    mock_group_slaves_by_key_func,
):
    """Every group returned by the grouping mock is reported with the
    utilization returned by the calculation mock, and tasks are requested
    from the state with ``include_orphans=True``."""
    expected_groups = ["somenametest-habitat", "somenametest-habitat-2"]
    expected_free = metastatus_lib.ResourceInfo(cpus=10, disk=10, mem=10)
    expected_total = metastatus_lib.ResourceInfo(cpus=20, disk=20, mem=20)

    # Canned answers for the two collaborators handed in as arguments.
    mock_group_slaves_by_key_func.return_value = {
        "somenametest-habitat": [
            {"id": "abcd", "hostname": "test.somewhere.www"},
        ],
        "somenametest-habitat-2": [
            {"id": "abcd", "hostname": "test2.somewhere.www"},
        ],
    }
    mock_calculate_resource_utilization_for_slaves.return_value = {
        "free": metastatus_lib.ResourceInfo(cpus=10, mem=10, disk=10),
        "total": metastatus_lib.ResourceInfo(cpus=20, mem=20, disk=20),
    }

    mesos_state = {"frameworks": Mock(), "slaves": [{"id": "abcd"}]}
    utilization_by_group = metastatus_lib.get_resource_utilization_by_grouping(
        grouping_func=mock.sentinel.grouping_func,
        mesos_state=mesos_state,
    )

    mock_get_all_tasks_from_state.assert_called_with(
        mesos_state, include_orphans=True)
    assert sorted(utilization_by_group.keys()) == sorted(expected_groups)
    for group_utilization in utilization_by_group.values():
        assert group_utilization["total"] == expected_total
        assert group_utilization["free"] == expected_free
Exemplo n.º 2
0
    def get_mesos_utilization_error(
        self,
        slaves,
        mesos_state,
        expected_instances=None,
    ):
        """Return how far this resource's pool is from its target utilization.

        Utilization is ``1 - min(free / total)`` over the resources reported
        for the ``(pool, region)`` group of ``self.resource``; resources whose
        total is zero are ignored.

        :param slaves: slaves currently registered for this resource
        :param mesos_state: mesos state passed to
            ``get_resource_utilization_by_grouping``
        :param expected_instances: number of instances that should be
            registered; when falsy the registration sanity check is skipped
        :returns: ``utilization - target_utilization``, or 0 when no resource
            has a non-zero total (utilization cannot be computed)
        :raises ClusterAutoscalingError: when ``slaves`` is empty, or when the
            registered fraction is below ``1 - MISSING_SLAVE_PANIC_THRESHOLD``
        """
        current_instances = len(slaves)
        if current_instances == 0:
            error_message = (
                "No instances are active, not scaling until the instances are attached to mesos"
            )
            raise ClusterAutoscalingError(error_message)
        if expected_instances:
            registered_fraction = float(current_instances) / float(expected_instances)
            self.log.info(
                "Found %.2f%% slaves registered in mesos for this resource (%d/%d)"
                % (
                    registered_fraction * 100,
                    current_instances,
                    expected_instances,
                ))
            if registered_fraction < (1.00 - MISSING_SLAVE_PANIC_THRESHOLD):
                error_message = (
                    "We currently have %d instances active in mesos out of a desired %d.\n"
                    "Refusing to scale because we either need to wait for the requests to be "
                    "filled, or the new instances are not healthy for some reason.\n"
                    "(cowardly refusing to go past %.2f%% missing instances)"
                ) % (
                    current_instances,
                    expected_instances,
                    # The threshold is compared as a fraction of 1 above, so
                    # scale it to match the "%%" in the message.
                    MISSING_SLAVE_PANIC_THRESHOLD * 100,
                )
                raise ClusterAutoscalingError(error_message)

        region_pool_utilization_dict = get_resource_utilization_by_grouping(
            lambda slave: (
                slave['attributes']['pool'],
                slave['attributes']['datacenter'],
            ),
            mesos_state,
        )[(
            self.resource['pool'],
            self.resource['region'],
        )]

        self.log.debug(region_pool_utilization_dict)
        free_pool_resources = region_pool_utilization_dict['free']
        total_pool_resources = region_pool_utilization_dict['total']
        free_percs = [
            float(free) / float(total)
            for free, total in zip(free_pool_resources, total_pool_resources)
            if not math.isclose(total, 0)
        ]
        if not free_percs:
            # Every total was zero: min() would raise ValueError on the empty
            # list, so report "no error" instead of crashing.
            return 0

        utilization = 1.0 - min(free_percs)
        target_utilization = self.pool_settings.get(
            'target_utilization', DEFAULT_TARGET_UTILIZATION)
        return utilization - target_utilization
Exemplo n.º 3
0
def test_get_resource_utilization_by_grouping(
    mock_get_all_tasks_from_state,
    mock_calculate_resource_utilization_for_slaves,
    mock_group_slaves_by_key_func,
):
    """The result is keyed by the groups from the grouping mock, each mapped
    to the utilization from the calculation mock; tasks are requested from
    the state with ``include_orphans=True``."""
    slaves_by_group = {
        'somenametest-habitat': [
            {'id': 'abcd', 'hostname': 'test.somewhere.www'},
        ],
        'somenametest-habitat-2': [
            {'id': 'abcd', 'hostname': 'test2.somewhere.www'},
        ],
    }
    mock_group_slaves_by_key_func.return_value = slaves_by_group
    mock_calculate_resource_utilization_for_slaves.return_value = {
        'free': metastatus_lib.ResourceInfo(cpus=10, mem=10, disk=10),
        'total': metastatus_lib.ResourceInfo(cpus=20, mem=20, disk=20),
    }

    state = {'frameworks': Mock(), 'slaves': [{'id': 'abcd'}]}
    result = metastatus_lib.get_resource_utilization_by_grouping(
        grouping_func=mock.sentinel.grouping_func,
        mesos_state=state,
    )

    mock_get_all_tasks_from_state.assert_called_with(
        state,
        include_orphans=True,
    )

    wanted_keys = ['somenametest-habitat', 'somenametest-habitat-2']
    assert sorted(result.keys()) == sorted(wanted_keys)

    wanted_total = metastatus_lib.ResourceInfo(cpus=20, disk=20, mem=20)
    wanted_free = metastatus_lib.ResourceInfo(cpus=10, disk=10, mem=10)
    for _group, info in result.items():
        assert info['total'] == wanted_total
        assert info['free'] == wanted_free
Exemplo n.º 4
0
def test_get_resource_utilization_by_grouping(
    mock_get_all_tasks_from_state,
    mock_calculate_resource_utilization_for_slaves,
    mock_group_slaves_by_key_func,
):
    """The result is keyed by the groups from the grouping mock, each mapped
    to the utilization from the calculation mock."""

    def habitat_of(slave):
        return slave['attributes']['habitat']

    mock_group_slaves_by_key_func.return_value = {
        'somenametest-habitat': [
            {'id': 'abcd', 'hostname': 'test.somewhere.www'},
        ],
        'somenametest-habitat-2': [
            {'id': 'abcd', 'hostname': 'test2.somewhere.www'},
        ],
    }
    mock_calculate_resource_utilization_for_slaves.return_value = {
        'free': metastatus_lib.ResourceInfo(cpus=10, mem=10, disk=10),
        'total': metastatus_lib.ResourceInfo(cpus=20, mem=20, disk=20),
    }
    # NOTE(review): this calls the mock instead of assigning
    # ``return_value``, so it does not configure what the mock returns --
    # confirm whether that was the intent.
    mock_get_all_tasks_from_state([Mock(), Mock()])

    mesos_state = {'frameworks': Mock(), 'slaves': [{}]}
    utilization_by_group = metastatus_lib.get_resource_utilization_by_grouping(
        grouping_func=habitat_of,
        mesos_state=mesos_state,
    )

    expected_groups = ['somenametest-habitat', 'somenametest-habitat-2']
    assert sorted(utilization_by_group.keys()) == sorted(expected_groups)

    expected_total = metastatus_lib.ResourceInfo(cpus=20, disk=20, mem=20)
    expected_free = metastatus_lib.ResourceInfo(cpus=10, disk=10, mem=10)
    for group_utilization in utilization_by_group.values():
        paasta_print(group_utilization)
        assert group_utilization['total'] == expected_total
        assert group_utilization['free'] == expected_free
Exemplo n.º 5
0
def test_get_resource_utilization_by_grouping_correctly_multi_groups():
    """Grouping four slaves by the attributes ``one`` and ``two`` (each
    yes/no combination appears once) yields four two-part keys."""
    # (slave id, value of attribute "one", value of attribute "two")
    slave_specs = [
        ("foo1", "yes", "yes"),
        ("bar1", "yes", "no"),
        ("foo2", "no", "yes"),
        ("bar2", "no", "no"),
    ]
    fake_state = {
        "slaves": [
            {
                "id": slave_id,
                "resources": {"disk": 100, "cpus": 10, "mem": 50},
                "attributes": {"one": one, "two": two},
                "reserved_resources": {},
            }
            for slave_id, one, two in slave_specs
        ],
        "frameworks": [
            {
                "tasks": [
                    {
                        "state": "TASK_RUNNING",
                        "resources": {"cpus": 1, "mem": 10, "disk": 10},
                        "slave_id": slave_id,
                    }
                    for slave_id in ("foo1", "bar1")
                ]
            }
        ],
    }

    multi_key_func = metastatus_lib.key_func_for_attribute_multi(["one", "two"])
    resp = metastatus_lib.get_resource_utilization_by_grouping(
        mesos_state=fake_state,
        grouping_func=multi_key_func,
    )

    group_keys = list(resp.keys())
    # One key per combination of attribute values...
    assert len(group_keys) == 4
    # ...each key holding one item per grouping attribute...
    first_key = group_keys[0]
    assert len(first_key) == 2
    # ...and each item being an (original key, value) pair.
    assert len(list(first_key)[0]) == 2
Exemplo n.º 6
0
def test_get_resource_utilization_by_grouping_correctly_groups():
    """Grouping by slave id: slave ``foo`` has 10 CPUs and runs one 1-CPU
    task, so its group must report 9 free CPUs."""

    def slave(slave_id):
        return {
            'id': slave_id,
            'resources': {'disk': 100, 'cpus': 10, 'mem': 50},
            'reserved_resources': {},
        }

    def running_task_on(slave_id):
        return {
            'state': 'TASK_RUNNING',
            'resources': {'cpus': 1, 'mem': 10, 'disk': 10},
            'slave_id': slave_id,
        }

    slave_ids = ('foo', 'bar')
    fake_state = {
        'slaves': [slave(slave_id) for slave_id in slave_ids],
        'frameworks': [
            {'tasks': [running_task_on(slave_id) for slave_id in slave_ids]},
        ],
    }

    utilization_by_id = metastatus_lib.get_resource_utilization_by_grouping(
        mesos_state=fake_state,
        grouping_func=lambda x: x['id'],
    )
    assert utilization_by_id['foo']['free'].cpus == 9
def test_get_resource_utilization_by_grouping_correctly_groups():
    """With slaves grouped by their id, the ``foo`` group reports 9 free
    CPUs: 10 in total minus the single 1-CPU task running on it."""
    fake_slaves = []
    fake_tasks = []
    for slave_id in ("foo", "bar"):
        fake_slaves.append({
            "id": slave_id,
            "resources": {"disk": 100, "cpus": 10, "mem": 50},
            "reserved_resources": {},
        })
        fake_tasks.append({
            "state": "TASK_RUNNING",
            "resources": {"cpus": 1, "mem": 10, "disk": 10},
            "slave_id": slave_id,
        })
    fake_state = {
        "slaves": fake_slaves,
        "frameworks": [{"tasks": fake_tasks}],
    }

    def group_by_id(slave):
        return slave["id"]

    grouped = metastatus_lib.get_resource_utilization_by_grouping(
        mesos_state=fake_state,
        grouping_func=group_by_id,
    )
    foo_free = grouped["foo"]["free"]
    assert foo_free.cpus == 9
Exemplo n.º 8
0
def utilization_table_by_grouping_from_mesos_state(
    groupings: Sequence[str],
    threshold: float,
    humanize: bool,
    mesos_state: Dict,
) -> Tuple[
    List[List[str]],
    bool,
]:
    """Build a utilization table for ``mesos_state`` grouped by ``groupings``.

    :param groupings: attribute names to group by; their capitalized names
        become the leading column headers
    :param threshold: passed to the per-resource health check
    :param humanize: passed through to the row formatter
    :param mesos_state: mesos state to compute utilization from
    :returns: ``(rows, healthy)`` where ``rows`` is the header row followed
        by one row per group (sorted by the grouping columns) and ``healthy``
        is True only if every resource of every group passed its health
        check. With no groups, ``rows`` holds just the header and ``healthy``
        is True.
    """
    grouping_function = metastatus_lib.key_func_for_attribute_multi(groupings)
    resource_info_dict_grouped = metastatus_lib.get_resource_utilization_by_grouping(
        grouping_function,
        mesos_state,
    )

    static_headers = [
        'CPU (used/total)',
        'RAM (used/total)',
        'Disk (used/total)',
        'GPU (used/total)',
        'Agent count',
    ]

    all_rows = [
        [grouping.capitalize() for grouping in groupings] + static_headers,
    ]
    table_rows = []
    # Start healthy so an empty grouping result has a defined value, and
    # accumulate so one unhealthy group is not masked by a later healthy one.
    healthy_exit = True

    for grouping_values, resource_info_dict in resource_info_dict_grouped.items():
        resource_utilizations = metastatus_lib.resource_utillizations_from_resource_info(
            total=resource_info_dict['total'],
            free=resource_info_dict['free'],
        )
        healthcheck_utilization_pairs = [
            metastatus_lib.healthcheck_result_resource_utilization_pair_for_resource_utilization(
                utilization,
                threshold,
            )
            for utilization in resource_utilizations
        ]
        group_healthy = all(pair[0].healthy for pair in healthcheck_utilization_pairs)
        healthy_exit = healthy_exit and group_healthy
        table_rows.append(metastatus_lib.get_table_rows_for_resource_info_dict(
            [v for g, v in grouping_values],
            healthcheck_utilization_pairs,
            humanize,
        ) + [str(resource_info_dict['slave_count'])])

    all_rows.extend(sorted(table_rows, key=lambda x: x[0:len(groupings)]))

    return all_rows, healthy_exit
Exemplo n.º 9
0
def utilization_table_by_grouping_from_mesos_state(
    groupings: Sequence[str],
    threshold: float,
    mesos_state: MesosState,
    service_instance_stats: Optional[ServiceInstanceStats] = None,
) -> Tuple[Sequence[MutableSequence[str]], bool]:
    """Compute utilization from ``mesos_state`` grouped by ``groupings`` and
    hand it to ``utilization_table_by_grouping``.

    :param groupings: attribute names used to build the grouping key function
    :param threshold: forwarded to ``utilization_table_by_grouping``
    :param mesos_state: mesos state the grouped utilization is computed from
    :param service_instance_stats: forwarded to
        ``utilization_table_by_grouping``
    :returns: whatever ``utilization_table_by_grouping`` returns
    """
    key_func = metastatus_lib.key_func_for_attribute_multi(groupings)
    grouped_utilization = metastatus_lib.get_resource_utilization_by_grouping(
        key_func,
        mesos_state,
    )
    return utilization_table_by_grouping(
        groupings,
        key_func,
        grouped_utilization,
        threshold,
        service_instance_stats,
    )
Exemplo n.º 10
0
def test_get_resource_utilization_by_grouping(
        mock_get_all_tasks_from_state,
        mock_calculate_resource_utilization_for_slaves,
        mock_group_slaves_by_key_func,
):
    """The result is keyed by the groups from the grouping mock, each mapped
    to the utilization from the calculation mock."""
    mock_group_slaves_by_key_func.return_value = {
        'somenametest-habitat': [{
            'id': 'abcd',
            'hostname': 'test.somewhere.www'
        }],
        'somenametest-habitat-2': [{
            'id': 'abcd',
            'hostname': 'test2.somewhere.www'
        }]
    }
    mock_calculate_resource_utilization_for_slaves.return_value = {
        'free': metastatus_lib.ResourceInfo(cpus=10, mem=10, disk=10),
        'total': metastatus_lib.ResourceInfo(cpus=20, mem=20, disk=20)
    }
    # NOTE(review): this calls the mock instead of assigning
    # ``return_value``, so it does not configure what the mock returns --
    # confirm whether that was the intent.
    mock_get_all_tasks_from_state([Mock(), Mock()])
    state = {
        'frameworks': Mock(),
        'slaves': [{}]
    }
    actual = metastatus_lib.get_resource_utilization_by_grouping(
        grouping_func=lambda slave: slave['attributes']['habitat'],
        mesos_state=state,
    )
    assert sorted(actual.keys()) == sorted(['somenametest-habitat', 'somenametest-habitat-2'])
    for k, v in actual.items():
        # Call form: the ``print v`` statement is a SyntaxError on Python 3,
        # while a single parenthesised argument prints the same on Python 2.
        print(v)
        assert v['total'] == metastatus_lib.ResourceInfo(
            cpus=20,
            disk=20,
            mem=20
        )
        assert v['free'] == metastatus_lib.ResourceInfo(
            cpus=10,
            disk=10,
            mem=10
        )
Exemplo n.º 11
0
def resources_utilization(request):
    """Return grouped resource utilization for the current mesos state.

    Reads ``groupings`` (default ``['superregion']``) and ``filter`` from
    ``request.swagger_data``. The 200 response body is a JSON list with one
    entry per group: a ``groupings`` mapping plus, for every resource, a dict
    with ``total``, ``free`` and ``used`` (``total - free``).
    """
    master = get_mesos_master()
    mesos_state = block(master.state)

    groupings = request.swagger_data.get('groupings', ['superregion'])
    if groupings is None:
        # swagger actually makes the key None if it's not set
        groupings = ['superregion']
    grouping_function = metastatus_lib.key_func_for_attribute_multi(groupings)
    sorting_function = metastatus_lib.sort_func_for_attributes(groupings)

    parsed_filters = parse_filters(request.swagger_data.get('filter', []))
    filter_funcs = []
    for attr, vals in parsed_filters.items():
        filter_funcs.append(metastatus_lib.make_filter_slave_func(attr, vals))

    utilization_by_group = metastatus_lib.get_resource_utilization_by_grouping(
        grouping_func=grouping_function,
        mesos_state=mesos_state,
        filters=filter_funcs,
        sort_func=sorting_function,
    )

    response_body = []
    for group_key, resource_info in utilization_by_group.items():
        # Each group key is iterated as (grouping name, value) pairs.
        group = {
            'groupings': {grouping: value for grouping, value in group_key},
        }
        for resource, total_value in resource_info['total']._asdict().items():
            group[resource] = {'total': total_value}
        for resource, free_value in resource_info['free']._asdict().items():
            usage = group[resource]
            usage['free'] = free_value
            usage['used'] = usage['total'] - usage['free']
        response_body.append(group)

    return Response(json_body=response_body, status_code=200)
Exemplo n.º 12
0
def test_get_resource_utilization_by_grouping(
    mock_get_all_tasks_from_state, mock_calculate_resource_utilization_for_slaves, mock_group_slaves_by_key_func
):
    """The result is keyed by the groups from the grouping mock, each mapped
    to the utilization from the calculation mock."""
    group_names = ["somenametest-habitat", "somenametest-habitat-2"]
    hostnames = ["test.somewhere.www", "test2.somewhere.www"]

    mock_group_slaves_by_key_func.return_value = {
        name: [{"id": "abcd", "hostname": hostname}]
        for name, hostname in zip(group_names, hostnames)
    }
    mock_calculate_resource_utilization_for_slaves.return_value = {
        "free": metastatus_lib.ResourceInfo(cpus=10, mem=10, disk=10),
        "total": metastatus_lib.ResourceInfo(cpus=20, mem=20, disk=20),
    }
    # NOTE(review): this calls the mock instead of assigning
    # ``return_value``, so it does not configure what the mock returns --
    # confirm whether that was the intent.
    mock_get_all_tasks_from_state([Mock(), Mock()])

    def habitat_of(slave):
        return slave["attributes"]["habitat"]

    mesos_state = {"frameworks": Mock(), "slaves": [{}]}
    result = metastatus_lib.get_resource_utilization_by_grouping(
        grouping_func=habitat_of,
        mesos_state=mesos_state,
    )

    assert sorted(result.keys()) == sorted(group_names)
    total_wanted = metastatus_lib.ResourceInfo(cpus=20, disk=20, mem=20)
    free_wanted = metastatus_lib.ResourceInfo(cpus=10, disk=10, mem=10)
    for info in result.values():
        paasta_print(info)
        assert info["total"] == total_wanted
        assert info["free"] == free_wanted
Exemplo n.º 13
0
def get_mesos_utilization_error(
    mesos_state,
    region,
    pool,
    target_utilization,
):
    """Return how far the ``(pool, region)`` group is from its target
    utilization.

    Utilization is ``1 - min(free / total)`` over the resources reported for
    the group; resources whose total is (close to) zero are ignored.

    :param mesos_state: mesos state passed to
        ``get_resource_utilization_by_grouping``
    :param region: value matched against each slave's ``datacenter`` attribute
    :param pool: value matched against each slave's ``pool`` attribute
    :param target_utilization: utilization to compare against
    :returns: ``utilization - target_utilization``, or 0 when the group cannot
        be found (a KeyError is raised during the lookup) or no resource has a
        usable total
    """
    try:
        region_pool_utilization_dict = get_resource_utilization_by_grouping(
            lambda slave: (
                slave['attributes']['pool'],
                slave['attributes']['datacenter'],
            ),
            mesos_state,
        )[(
            pool,
            region,
        )]
    except KeyError:
        # The placeholders previously had no arguments, so the literal "%s"
        # was logged instead of the region and pool.
        log.info(
            "Failed to find utilization for region %s, pool %s, returning 0 error"
            % (region, pool)
        )
        return 0

    log.debug(region_pool_utilization_dict)
    free_pool_resources = region_pool_utilization_dict['free']
    total_pool_resources = region_pool_utilization_dict['total']
    free_percs = [
        float(free) / float(total)
        for free, total in zip(free_pool_resources, total_pool_resources)
        if not math.isclose(total, 0)
    ]

    if not free_percs:  # If all resource totals are close to 0 for some reason
        return 0

    utilization = 1.0 - min(free_percs)
    return utilization - target_utilization
Exemplo n.º 14
0
def get_mesos_utilization_error(spotfleet_request_id,
                                resource,
                                pool_settings,
                                slaves,
                                mesos_state,
                                desired_instances=None):
    """Return how far the resource's pool is from its target utilization.

    Utilization is ``1 - min(free / total)`` over the resources reported for
    the pool named by ``resource['pool']``; resources whose total is zero are
    ignored so they cannot cause a division by zero.

    :param spotfleet_request_id: not used by this function
    :param resource: dict whose ``pool`` entry selects the pool
    :param pool_settings: dict optionally holding ``target_utilization``
        (defaults to ``DEFAULT_TARGET_UTILIZATION``)
    :param slaves: slaves currently registered for this resource
    :param mesos_state: mesos state passed to
        ``get_resource_utilization_by_grouping``
    :param desired_instances: number of instances that should be registered;
        ``None`` skips the registration sanity checks
    :returns: ``utilization - target_utilization``, or 0 when no resource has
        a non-zero total
    :raises ClusterAutoscalingError: when ``desired_instances`` is 0, or when
        the registered fraction is below ``1 - MISSING_SLAVE_PANIC_THRESHOLD``
    """
    current_instances = len(slaves)
    # NOTE(review): the message talks about active instances but the check is
    # on desired_instances -- confirm which count is meant.
    if desired_instances == 0:
        error_message = ("No instances are active, not scaling until the instances are launched")
        raise ClusterAutoscalingError(error_message)
    if desired_instances:
        registered_fraction = float(current_instances) / float(desired_instances)
        log.info("Found %.2f%% slaves registered in mesos for this resource (%d/%d)" % (
                 registered_fraction * 100,
                 current_instances,
                 desired_instances))
        if registered_fraction < (1.00 - MISSING_SLAVE_PANIC_THRESHOLD):
            error_message = ("We currently have %d instances active in mesos out of a desired %d.\n"
                             "Refusing to scale because we either need to wait for the requests to be "
                             "filled, or the new instances are not healthy for some reason.\n"
                             "(cowardly refusing to go past %.2f%% missing instances)") % (
                # The threshold is compared as a fraction of 1 above, so scale
                # it to match the "%%" in the message.
                current_instances, desired_instances, MISSING_SLAVE_PANIC_THRESHOLD * 100)
            raise ClusterAutoscalingError(error_message)

    pool_utilization_dict = get_resource_utilization_by_grouping(
        lambda slave: slave['attributes']['pool'],
        mesos_state
    )[resource['pool']]

    log.debug(pool_utilization_dict)
    free_pool_resources = pool_utilization_dict['free']
    total_pool_resources = pool_utilization_dict['total']
    # Skip resources the pool has none of; dividing by a zero total would
    # raise ZeroDivisionError.
    free_percs = [
        float(free) / float(total)
        for free, total in zip(free_pool_resources, total_pool_resources)
        if float(total) != 0
    ]
    if not free_percs:
        return 0

    utilization = 1.0 - min(free_percs)
    target_utilization = pool_settings.get('target_utilization', DEFAULT_TARGET_UTILIZATION)
    return utilization - target_utilization
Exemplo n.º 15
0
def main(argv=None):
    """Run the Mesos, Marathon and Chronos health checks and print the results.

    Always terminates the process via ``sys.exit``: status 0 when every check
    is healthy, 2 otherwise (including when the Mesos master or Chronos
    cannot be contacted).

    With ``args.verbose > 1`` a utilization table is printed for each entry in
    ``args.groupings``; any unhealthy group also makes the run unhealthy.
    With ``args.verbose >= 3`` a per-slave table is printed as well, which
    never affects the exit status.

    :param argv: argument list handed to ``parse_args``
    """
    args = parse_args(argv)

    system_paasta_config = load_system_paasta_config()

    master_kwargs = {}
    # we don't want to be passing False to not override a possible True
    # value from system config
    if args.use_mesos_cache:
        master_kwargs['use_mesos_cache'] = True
    master = get_mesos_master(**master_kwargs)

    marathon_servers = get_marathon_servers(system_paasta_config)
    marathon_clients = all_marathon_clients(get_marathon_clients(marathon_servers))

    try:
        mesos_state = master.state
        all_mesos_results = _run_mesos_checks(
            mesos_master=master,
            mesos_state=mesos_state,
            marathon_clients=marathon_clients,
        )
    except MasterNotAvailableException as e:
        # if we can't connect to master at all,
        # then bomb out early
        paasta_print(PaastaColors.red("CRITICAL:  %s" % e.message))
        sys.exit(2)

    # Check to see if Chronos should be running here by checking for config
    chronos_config = load_chronos_config()

    if chronos_config:
        chronos_client = get_chronos_client(chronos_config, cached=True)
        try:
            chronos_results = metastatus_lib.get_chronos_status(chronos_client)
        except (chronos.ChronosAPIError) as e:
            paasta_print(PaastaColors.red("CRITICAL: Unable to contact Chronos! Error: %s" % e))
            sys.exit(2)
    else:
        chronos_results = [metastatus_lib.HealthCheckResult(
            message='Chronos is not configured to run here',
            healthy=True,
        )]

    marathon_results = _run_marathon_checks(marathon_clients)

    mesos_ok = all(metastatus_lib.status_for_results(all_mesos_results))
    marathon_ok = all(metastatus_lib.status_for_results(marathon_results))
    chronos_ok = all(metastatus_lib.status_for_results(chronos_results))

    mesos_summary = metastatus_lib.generate_summary_for_check("Mesos", mesos_ok)
    marathon_summary = metastatus_lib.generate_summary_for_check("Marathon", marathon_ok)
    chronos_summary = metastatus_lib.generate_summary_for_check("Chronos", chronos_ok)

    healthy_exit = all([mesos_ok, marathon_ok, chronos_ok])

    paasta_print("Master paasta_tools version: {}".format(__version__))
    metastatus_lib.print_results_for_healthchecks(mesos_summary, mesos_ok, all_mesos_results, args.verbose)
    if args.verbose > 1:
        for grouping in args.groupings:
            print_with_indent('Resources Grouped by %s' % grouping, 2)
            grouping_function = metastatus_lib.key_func_for_attribute(grouping)
            resource_info_by_group = metastatus_lib.get_resource_utilization_by_grouping(
                grouping_function,
                mesos_state,
            )
            all_rows = [[
                grouping.capitalize(), 'CPU (used/total)', 'RAM (used/total)', 'Disk (used/total)',
                'GPU (used/total)', 'Agent count',
            ]]
            table_rows = []
            for attribute_value, resource_info_dict in resource_info_by_group.items():
                resource_utilizations = metastatus_lib.resource_utillizations_from_resource_info(
                    total=resource_info_dict['total'],
                    free=resource_info_dict['free'],
                )
                healthcheck_utilization_pairs = [
                    metastatus_lib.healthcheck_result_resource_utilization_pair_for_resource_utilization(
                        utilization,
                        args.threshold,
                    )
                    for utilization in resource_utilizations
                ]
                # Accumulate rather than overwrite: a healthy group must not
                # hide an unhealthy Mesos/Marathon/Chronos check or an
                # unhealthy group seen earlier.
                group_healthy = all(pair[0].healthy for pair in healthcheck_utilization_pairs)
                healthy_exit = healthy_exit and group_healthy
                table_rows.append(metastatus_lib.get_table_rows_for_resource_info_dict(
                    attribute_value,
                    healthcheck_utilization_pairs,
                    args.humanize,
                ) + [str(resource_info_dict['slave_count'])])
            all_rows.extend(sorted(table_rows, key=lambda x: x[0]))
            for line in format_table(all_rows):
                print_with_indent(line, 4)

        if args.autoscaling_info:
            print_with_indent("Autoscaling resources:", 2)
            headers = [field.replace("_", " ").capitalize() for field in AutoscalingInfo._fields]
            table = [headers] + list(get_autoscaling_info_for_all_resources(mesos_state))

            for line in format_table(table):
                print_with_indent(line, 4)

        if args.verbose >= 3:
            print_with_indent('Per Slave Utilization', 2)
            slave_resource_dict = metastatus_lib.get_resource_utilization_by_grouping(
                lambda slave: slave['hostname'],
                mesos_state,
            )
            all_rows = [['Hostname', 'CPU (used/total)', 'RAM (used//total)', 'Disk (used//total)', 'GPU (used/total)']]

            # print info about slaves here. Note that we don't make modifications to
            # the healthy_exit variable here, because we don't care about a single slave
            # having high usage.
            slave_rows = []
            for attribute_value, resource_info_dict in slave_resource_dict.items():
                resource_utilizations = metastatus_lib.resource_utillizations_from_resource_info(
                    total=resource_info_dict['total'],
                    free=resource_info_dict['free'],
                )
                healthcheck_utilization_pairs = [
                    metastatus_lib.healthcheck_result_resource_utilization_pair_for_resource_utilization(
                        utilization,
                        args.threshold,
                    )
                    for utilization in resource_utilizations
                ]
                slave_rows.append(metastatus_lib.get_table_rows_for_resource_info_dict(
                    attribute_value,
                    healthcheck_utilization_pairs,
                    args.humanize,
                ))
            # Sort once over all slaves; sorting inside the loop only ever
            # saw a single row, so the table came out unsorted.
            all_rows.extend(sorted(slave_rows, key=lambda x: x[0]))
            for line in format_table(all_rows):
                print_with_indent(line, 4)
    metastatus_lib.print_results_for_healthchecks(marathon_summary, marathon_ok, marathon_results, args.verbose)
    metastatus_lib.print_results_for_healthchecks(chronos_summary, chronos_ok, chronos_results, args.verbose)

    sys.exit(0 if healthy_exit else 2)
Exemplo n.º 16
0
def main(argv=None):
    """Print a health report for Mesos, Marathon and Chronos, then exit.

    Collects health check results for the Mesos master (state, resource
    utilization and framework metrics), for Marathon and for Chronos, and
    prints a summary for each. With ``args.verbose > 1`` it additionally
    prints resource utilization tables per requested grouping (and the
    autoscaling table when ``args.autoscaling_info`` is set); with
    ``args.verbose >= 3`` it also prints a per-slave utilization table.

    :param argv: argument list handed straight to ``parse_args``.

    Always terminates via ``sys.exit``: status 2 when the Mesos master,
    Marathon or Chronos cannot be reached, or when any health check or any
    grouped utilization check is unhealthy; status 0 otherwise.
    """
    args = parse_args(argv)

    master = get_mesos_master()
    try:
        mesos_state = master.state
    except MasterNotAvailableException as e:
        # if we can't connect to master at all,
        # then bomb out early
        # NOTE(review): `.message` is a Python 2-era exception attribute;
        # confirm MasterNotAvailableException still provides it.
        paasta_print(PaastaColors.red("CRITICAL:  %s" % e.message))
        sys.exit(2)

    mesos_state_status = metastatus_lib.get_mesos_state_status(
        mesos_state=mesos_state,
    )

    metrics = master.metrics_snapshot()
    mesos_metrics_status = metastatus_lib.get_mesos_resource_utilization_health(
        mesos_metrics=metrics,
        mesos_state=mesos_state,
    )
    framework_metrics_healthchecks = metastatus_lib.get_framework_metrics_status(
        metrics=metrics,
    )

    all_mesos_results = mesos_state_status + mesos_metrics_status + framework_metrics_healthchecks

    # Check to see if Marathon should be running here by checking for config
    marathon_config = marathon_tools.load_marathon_config()

    # Check to see if Chronos should be running here by checking for config
    chronos_config = load_chronos_config()

    if marathon_config:
        marathon_client = metastatus_lib.get_marathon_client(marathon_config)
        try:
            marathon_results = metastatus_lib.get_marathon_status(marathon_client)
        except (MarathonError, InternalServerError, ValueError):
            # catch ValueError until marathon-python/pull/167 is merged and this is handled upstream
            paasta_print(
                PaastaColors.red(
                    ("CRITICAL: Unable to contact Marathon cluster at {}!"
                     "Is the cluster healthy?".format(marathon_config["url"]))
                )
            )
            sys.exit(2)
    else:
        marathon_results = [
            metastatus_lib.HealthCheckResult(
                message='Marathon is not configured to run here', healthy=True)
        ]

    if chronos_config:
        chronos_client = get_chronos_client(chronos_config)
        try:
            chronos_results = metastatus_lib.get_chronos_status(chronos_client)
        except (chronos.ChronosAPIError) as e:
            paasta_print(
                PaastaColors.red(
                    "CRITICAL: Unable to contact Chronos! Error: %s" % e))
            sys.exit(2)
    else:
        chronos_results = [
            metastatus_lib.HealthCheckResult(
                message='Chronos is not configured to run here', healthy=True)
        ]

    mesos_ok = all(metastatus_lib.status_for_results(all_mesos_results))
    marathon_ok = all(metastatus_lib.status_for_results(marathon_results))
    chronos_ok = all(metastatus_lib.status_for_results(chronos_results))

    mesos_summary = metastatus_lib.generate_summary_for_check("Mesos", mesos_ok)
    marathon_summary = metastatus_lib.generate_summary_for_check("Marathon", marathon_ok)
    chronos_summary = metastatus_lib.generate_summary_for_check("Chronos", chronos_ok)

    healthy_exit = all([mesos_ok, marathon_ok, chronos_ok])

    paasta_print("Master paasta_tools version: {}".format(__version__))
    metastatus_lib.print_results_for_healthchecks(
        mesos_summary, mesos_ok, all_mesos_results, args.verbose)
    if args.verbose > 1:
        for grouping in args.groupings:
            print_with_indent('Resources Grouped by %s' % grouping, 2)
            grouping_function = metastatus_lib.key_func_for_attribute(grouping)
            resource_info_by_group = metastatus_lib.get_resource_utilization_by_grouping(
                grouping_function, mesos_state)
            all_rows = [[
                grouping.capitalize(), 'CPU (used/total)', 'RAM (used/total)',
                'Disk (used/total)'
            ]]
            table_rows = []
            for attribute_value, resource_info in resource_info_by_group.items():
                resource_utilizations = metastatus_lib.resource_utillizations_from_resource_info(
                    total=resource_info['total'],
                    free=resource_info['free'],
                )
                healthcheck_utilization_pairs = [
                    metastatus_lib.healthcheck_result_resource_utilization_pair_for_resource_utilization(
                        utilization, args.threshold)
                    for utilization in resource_utilizations
                ]
                # Only ever flip healthy_exit to False: overwriting it here
                # would let the last group visited mask unhealthy results
                # from earlier groups or from the service checks above.
                if not all(pair[0].healthy for pair in healthcheck_utilization_pairs):
                    healthy_exit = False
                table_rows.append(
                    metastatus_lib.get_table_rows_for_resource_info_dict(
                        attribute_value, healthcheck_utilization_pairs,
                        args.humanize))
            all_rows.extend(sorted(table_rows, key=lambda x: x[0]))
            for line in format_table(all_rows):
                print_with_indent(line, 4)

        if args.autoscaling_info:
            print_with_indent("Autoscaling resources:", 2)
            headers = [
                field.replace("_", " ").capitalize()
                for field in AutoscalingInfo._fields
            ]
            # Header row followed by one row per autoscaling resource.
            table = [headers] + list(get_autoscaling_info_for_all_resources())

            for line in format_table(table):
                print_with_indent(line, 4)

        # ">= 3" so that extra -v flags do not hide the per-slave table.
        if args.verbose >= 3:
            print_with_indent('Per Slave Utilization', 2)
            slave_resource_dict = metastatus_lib.get_resource_utilization_by_grouping(
                lambda slave: slave['hostname'], mesos_state)
            all_rows = [[
                'Hostname', 'CPU (used/total)', 'RAM (used//total)',
                'Disk (used//total)'
            ]]

            # print info about slaves here. Note that we don't make modifications to
            # the healthy_exit variable here, because we don't care about a single slave
            # having high usage.
            slave_rows = []
            for hostname, resource_info in slave_resource_dict.items():
                resource_utilizations = metastatus_lib.resource_utillizations_from_resource_info(
                    total=resource_info['total'],
                    free=resource_info['free'],
                )
                healthcheck_utilization_pairs = [
                    metastatus_lib.healthcheck_result_resource_utilization_pair_for_resource_utilization(
                        utilization, args.threshold)
                    for utilization in resource_utilizations
                ]
                slave_rows.append(
                    metastatus_lib.get_table_rows_for_resource_info_dict(
                        hostname, healthcheck_utilization_pairs,
                        args.humanize))
            # Sort once over all slaves; sorting a one-row list per slave
            # left the table in arbitrary dict order.
            all_rows.extend(sorted(slave_rows, key=lambda x: x[0]))
            for line in format_table(all_rows):
                print_with_indent(line, 4)
    metastatus_lib.print_results_for_healthchecks(
        marathon_summary, marathon_ok, marathon_results, args.verbose)
    metastatus_lib.print_results_for_healthchecks(
        chronos_summary, chronos_ok, chronos_results, args.verbose)

    sys.exit(0 if healthy_exit else 2)
Exemplo n.º 17
0
def main():
    """Print a health report for Mesos, Marathon and Chronos, then exit.

    Collects health check results for the Mesos master (state, resource
    utilization and framework metrics), for Marathon and for Chronos, and
    prints a summary for each. With ``args.verbose > 1`` it additionally
    prints resource utilization tables per requested grouping; with
    ``args.verbose >= 3`` it also prints a per-slave utilization table.

    Always terminates via ``sys.exit``: status 2 when the Mesos master,
    Marathon or Chronos cannot be reached, or when any health check or any
    grouped utilization check is unhealthy; status 0 otherwise.
    """
    args = parse_args()

    master = get_mesos_master()
    try:
        mesos_state = master.state
    except MasterNotAvailableException as e:
        # if we can't connect to master at all,
        # then bomb out early
        # NOTE(review): `.message` is a Python 2-era exception attribute;
        # confirm MasterNotAvailableException still provides it.
        print(PaastaColors.red("CRITICAL:  %s" % e.message))
        sys.exit(2)

    mesos_state_status = metastatus_lib.get_mesos_state_status(
        mesos_state=mesos_state,
    )

    metrics = master.metrics_snapshot()
    mesos_metrics_status = metastatus_lib.get_mesos_resource_utilization_health(
        mesos_metrics=metrics,
        mesos_state=mesos_state,
    )
    framework_metrics_healthchecks = metastatus_lib.get_framework_metrics_status(metrics=metrics)

    all_mesos_results = mesos_state_status + mesos_metrics_status + framework_metrics_healthchecks

    # Check to see if Marathon should be running here by checking for config
    marathon_config = marathon_tools.load_marathon_config()

    # Check to see if Chronos should be running here by checking for config
    chronos_config = load_chronos_config()

    if marathon_config:
        marathon_client = metastatus_lib.get_marathon_client(marathon_config)
        try:
            marathon_results = metastatus_lib.get_marathon_status(marathon_client)
        except MarathonError as e:
            print(PaastaColors.red("CRITICAL: Unable to contact Marathon! Error: %s" % e))
            sys.exit(2)
    else:
        marathon_results = [
            metastatus_lib.HealthCheckResult(
                message='Marathon is not configured to run here',
                healthy=True,
            ),
        ]

    if chronos_config:
        chronos_client = get_chronos_client(chronos_config)
        try:
            chronos_results = metastatus_lib.get_chronos_status(chronos_client)
        except (chronos.ChronosAPIError) as e:
            print(PaastaColors.red("CRITICAL: Unable to contact Chronos! Error: %s" % e))
            sys.exit(2)
    else:
        chronos_results = [
            metastatus_lib.HealthCheckResult(
                message='Chronos is not configured to run here',
                healthy=True,
            ),
        ]

    mesos_ok = all(metastatus_lib.status_for_results(all_mesos_results))
    marathon_ok = all(metastatus_lib.status_for_results(marathon_results))
    chronos_ok = all(metastatus_lib.status_for_results(chronos_results))

    mesos_summary = metastatus_lib.generate_summary_for_check("Mesos", mesos_ok)
    marathon_summary = metastatus_lib.generate_summary_for_check("Marathon", marathon_ok)
    chronos_summary = metastatus_lib.generate_summary_for_check("Chronos", chronos_ok)

    healthy_exit = all([mesos_ok, marathon_ok, chronos_ok])

    # Single-argument call form: valid as both a Python 2 print statement
    # and a Python 3 print() call, matching the other prints in this function.
    print("Master paasta_tools version: {0}".format(__version__))
    metastatus_lib.print_results_for_healthchecks(mesos_summary, mesos_ok, all_mesos_results, args.verbose)
    if args.verbose > 1:
        for grouping in args.groupings:
            print_with_indent('Resources Grouped by %s' % grouping, 2)
            grouping_function = metastatus_lib.key_func_for_attribute(grouping)
            resource_info_by_group = metastatus_lib.get_resource_utilization_by_grouping(
                grouping_function,
                mesos_state,
            )
            all_rows = [[grouping.capitalize(), 'CPU (free/total)', 'RAM (free/total)', 'Disk (free/total)']]
            table_rows = []
            for attribute_value, resource_info in resource_info_by_group.items():
                resource_utilizations = metastatus_lib.resource_utillizations_from_resource_info(
                    total=resource_info['total'],
                    free=resource_info['free'],
                )
                healthcheck_utilization_pairs = [
                    metastatus_lib.healthcheck_result_resource_utilization_pair_for_resource_utilization(
                        utilization,
                        args.threshold,
                    )
                    for utilization in resource_utilizations
                ]
                # Only ever flip healthy_exit to False: overwriting it here
                # would let the last group visited mask unhealthy results
                # from earlier groups or from the service checks above.
                if not all(pair[0].healthy for pair in healthcheck_utilization_pairs):
                    healthy_exit = False
                table_rows.append(metastatus_lib.get_table_rows_for_resource_info_dict(
                    attribute_value,
                    healthcheck_utilization_pairs,
                    args.humanize
                ))
            all_rows.extend(sorted(table_rows, key=lambda x: x[0]))
            for line in format_table(all_rows):
                print_with_indent(line, 4)

        # ">= 3" so that extra -v flags do not hide the per-slave table.
        if args.verbose >= 3:
            print_with_indent('Per Slave Utilization', 2)
            slave_resource_dict = metastatus_lib.get_resource_utilization_by_grouping(
                lambda slave: slave['hostname'],
                mesos_state,
            )
            all_rows = [['Hostname', 'CPU (free/total)', 'RAM (free/total)', 'Disk (free/total)']]

            # print info about slaves here. Note that we don't make modifications to
            # the healthy_exit variable here, because we don't care about a single slave
            # having high usage.
            slave_rows = []
            for hostname, resource_info in slave_resource_dict.items():
                resource_utilizations = metastatus_lib.resource_utillizations_from_resource_info(
                    total=resource_info['total'],
                    free=resource_info['free'],
                )
                healthcheck_utilization_pairs = [
                    metastatus_lib.healthcheck_result_resource_utilization_pair_for_resource_utilization(
                        utilization,
                        args.threshold,
                    )
                    for utilization in resource_utilizations
                ]
                slave_rows.append(metastatus_lib.get_table_rows_for_resource_info_dict(
                    hostname,
                    healthcheck_utilization_pairs,
                    args.humanize
                ))
            # Sort once over all slaves; sorting a one-row list per slave
            # left the table in arbitrary dict order.
            all_rows.extend(sorted(slave_rows, key=lambda x: x[0]))
            for line in format_table(all_rows):
                print_with_indent(line, 4)
    metastatus_lib.print_results_for_healthchecks(marathon_summary, marathon_ok, marathon_results, args.verbose)
    metastatus_lib.print_results_for_healthchecks(chronos_summary, chronos_ok, chronos_results, args.verbose)

    sys.exit(0 if healthy_exit else 2)
Exemplo n.º 18
0
def test_get_resource_utilization_by_grouping_correctly_multi_groups():
    """Group four fake slaves by the attributes 'one' and 'two' together.

    The slaves cover every yes/no combination of the two attributes, so the
    response is expected to contain four groups whose keys each hold two
    (attribute, value) pairs.
    """
    # (slave id, value of attribute 'one', value of attribute 'two')
    slave_specs = [
        ('foo1', 'yes', 'yes'),
        ('bar1', 'yes', 'no'),
        ('foo2', 'no', 'yes'),
        ('bar2', 'no', 'no'),
    ]
    fake_slaves = [
        {
            'id': slave_id,
            'resources': {'disk': 100, 'cpus': 10, 'mem': 50},
            'attributes': {'one': one_value, 'two': two_value},
            'reserved_resources': {},
        }
        for slave_id, one_value, two_value in slave_specs
    ]
    # One running task on each of the two slaves that have one == 'yes'.
    fake_tasks = [
        {
            'state': 'TASK_RUNNING',
            'resources': {'cpus': 1, 'mem': 10, 'disk': 10},
            'slave_id': busy_slave_id,
        }
        for busy_slave_id in ('foo1', 'bar1')
    ]
    fake_state = {
        'slaves': fake_slaves,
        'frameworks': [{'tasks': fake_tasks}],
    }

    multi_key_func = metastatus_lib.key_func_for_attribute_multi(['one', 'two'])
    resp = metastatus_lib.get_resource_utilization_by_grouping(
        mesos_state=fake_state,
        grouping_func=multi_key_func,
    )

    group_keys = list(resp.keys())
    # One group per distinct (one, two) combination.
    assert len(group_keys) == 4
    # A group key holds one entry per grouped attribute...
    first_key = group_keys[0]
    assert len(first_key) == 2
    # ...and each entry is an (attribute name, value) pair.
    first_entry = list(first_key)[0]
    assert len(first_entry) == 2