def test_get_resource_utilization_by_grouping(
    mock_get_all_tasks_from_state,
    mock_calculate_resource_utilization_for_slaves,
    mock_group_slaves_by_key_func,
):
    """Each group in the result should carry the mocked free/total ResourceInfo."""
    grouped_slaves = {
        "somenametest-habitat": [
            {"id": "abcd", "hostname": "test.somewhere.www"},
        ],
        "somenametest-habitat-2": [
            {"id": "abcd", "hostname": "test2.somewhere.www"},
        ],
    }
    mock_group_slaves_by_key_func.return_value = grouped_slaves
    mock_calculate_resource_utilization_for_slaves.return_value = {
        "free": metastatus_lib.ResourceInfo(cpus=10, mem=10, disk=10),
        "total": metastatus_lib.ResourceInfo(cpus=20, mem=20, disk=20),
    }
    state = {"frameworks": Mock(), "slaves": [{"id": "abcd"}]}

    actual = metastatus_lib.get_resource_utilization_by_grouping(
        grouping_func=mock.sentinel.grouping_func, mesos_state=state)

    # Orphan tasks must be included when tasks are pulled from the state dump.
    mock_get_all_tasks_from_state.assert_called_with(state, include_orphans=True)
    assert sorted(actual.keys()) == sorted(grouped_slaves.keys())
    for utilization in actual.values():
        assert utilization["total"] == metastatus_lib.ResourceInfo(cpus=20, disk=20, mem=20)
        assert utilization["free"] == metastatus_lib.ResourceInfo(cpus=10, disk=10, mem=10)
def get_mesos_utilization_error(
    self,
    slaves,
    mesos_state,
    expected_instances=None,
):
    """Return how far this resource's pool is over (positive) or under
    (negative) its target utilization.

    :param slaves: the mesos slaves belonging to this resource
    :param mesos_state: the mesos state dump used to compute utilization
    :param expected_instances: if set, sanity-check that enough of the
        expected instances have registered in mesos before scaling
    :returns: utilization - target_utilization, or 0 when no resource in the
        pool has a non-zero total (no usable signal)
    :raises ClusterAutoscalingError: if no instances are active, or too many
        expected instances are missing from mesos
    """
    current_instances = len(slaves)
    if current_instances == 0:
        error_message = (
            "No instances are active, not scaling until the instances are attached to mesos"
        )
        raise ClusterAutoscalingError(error_message)
    if expected_instances:
        self.log.info(
            "Found %.2f%% slaves registered in mesos for this resource (%d/%d)" % (
                float(
                    float(current_instances) / float(expected_instances)) * 100,
                current_instances,
                expected_instances,
            ))
        if float(current_instances) / expected_instances < (
                1.00 - MISSING_SLAVE_PANIC_THRESHOLD):
            error_message = (
                "We currently have %d instances active in mesos out of a desired %d.\n"
                "Refusing to scale because we either need to wait for the requests to be "
                "filled, or the new instances are not healthy for some reason.\n"
                "(cowardly refusing to go past %.2f%% missing instances)"
            ) % (
                current_instances,
                expected_instances,
                MISSING_SLAVE_PANIC_THRESHOLD,
            )
            raise ClusterAutoscalingError(error_message)
    region_pool_utilization_dict = get_resource_utilization_by_grouping(
        lambda slave: (
            slave['attributes']['pool'],
            slave['attributes']['datacenter'],
        ),
        mesos_state,
    )[(
        self.resource['pool'],
        self.resource['region'],
    )]
    self.log.debug(region_pool_utilization_dict)
    free_pool_resources = region_pool_utilization_dict['free']
    total_pool_resources = region_pool_utilization_dict['total']
    free_percs = []
    for free, total in zip(free_pool_resources, total_pool_resources):
        # A resource with a ~zero total (e.g. no GPUs in the pool) carries no
        # utilization signal; skip it to avoid dividing by zero.
        if math.isclose(total, 0):
            continue
        free_percs.append(float(free) / float(total))
    # BUGFIX: if every resource total was ~zero, free_percs is empty and
    # min() would raise ValueError; report "no error" instead (matches the
    # behavior of the newer module-level get_mesos_utilization_error).
    if not free_percs:
        return 0
    utilization = 1.0 - min(free_percs)
    target_utilization = self.pool_settings.get(
        'target_utilization', DEFAULT_TARGET_UTILIZATION)
    return utilization - target_utilization
def test_get_resource_utilization_by_grouping(
    mock_get_all_tasks_from_state,
    mock_calculate_resource_utilization_for_slaves,
    mock_group_slaves_by_key_func,
):
    """The per-group result should echo the mocked free/total ResourceInfo."""
    mock_group_slaves_by_key_func.return_value = {
        'somenametest-habitat': [
            {'id': 'abcd', 'hostname': 'test.somewhere.www'},
        ],
        'somenametest-habitat-2': [
            {'id': 'abcd', 'hostname': 'test2.somewhere.www'},
        ],
    }
    mocked_free = metastatus_lib.ResourceInfo(cpus=10, mem=10, disk=10)
    mocked_total = metastatus_lib.ResourceInfo(cpus=20, mem=20, disk=20)
    mock_calculate_resource_utilization_for_slaves.return_value = {
        'free': mocked_free,
        'total': mocked_total,
    }
    state = {'frameworks': Mock(), 'slaves': [{'id': 'abcd'}]}

    actual = metastatus_lib.get_resource_utilization_by_grouping(
        grouping_func=mock.sentinel.grouping_func,
        mesos_state=state,
    )

    # Orphan tasks must be included when tasks are pulled from the state dump.
    mock_get_all_tasks_from_state.assert_called_with(state, include_orphans=True)
    expected_groups = ['somenametest-habitat', 'somenametest-habitat-2']
    assert sorted(actual.keys()) == sorted(expected_groups)
    for group_info in actual.values():
        assert group_info['total'] == metastatus_lib.ResourceInfo(cpus=20, disk=20, mem=20)
        assert group_info['free'] == metastatus_lib.ResourceInfo(cpus=10, disk=10, mem=10)
def test_get_resource_utilization_by_grouping(
    mock_get_all_tasks_from_state,
    mock_calculate_resource_utilization_for_slaves,
    mock_group_slaves_by_key_func,
):
    """Grouping by the habitat attribute should yield the mocked utilization per group."""
    mock_group_slaves_by_key_func.return_value = {
        'somenametest-habitat': [
            {'id': 'abcd', 'hostname': 'test.somewhere.www'},
        ],
        'somenametest-habitat-2': [
            {'id': 'abcd', 'hostname': 'test2.somewhere.www'},
        ],
    }
    mock_calculate_resource_utilization_for_slaves.return_value = {
        'free': metastatus_lib.ResourceInfo(cpus=10, mem=10, disk=10),
        'total': metastatus_lib.ResourceInfo(cpus=20, mem=20, disk=20),
    }
    mock_get_all_tasks_from_state([Mock(), Mock()])
    state = {'frameworks': Mock(), 'slaves': [{}]}

    actual = metastatus_lib.get_resource_utilization_by_grouping(
        grouping_func=lambda slave: slave['attributes']['habitat'],
        mesos_state=state,
    )

    assert sorted(actual.keys()) == sorted(
        ['somenametest-habitat', 'somenametest-habitat-2'])
    for group_name, group_info in actual.items():
        paasta_print(group_info)
        assert group_info['total'] == metastatus_lib.ResourceInfo(cpus=20, disk=20, mem=20)
        assert group_info['free'] == metastatus_lib.ResourceInfo(cpus=10, disk=10, mem=10)
def test_get_resource_utilization_by_grouping_correctly_multi_groups():
    """Grouping on two attributes should produce one bucket per attribute combination."""
    def fake_slave(slave_id, one, two):
        # All slaves look identical apart from their id and grouping attributes.
        return {
            "id": slave_id,
            "resources": {"disk": 100, "cpus": 10, "mem": 50},
            "attributes": {"one": one, "two": two},
            "reserved_resources": {},
        }

    def fake_task(slave_id):
        return {
            "state": "TASK_RUNNING",
            "resources": {"cpus": 1, "mem": 10, "disk": 10},
            "slave_id": slave_id,
        }

    fake_state = {
        "slaves": [
            fake_slave("foo1", "yes", "yes"),
            fake_slave("bar1", "yes", "no"),
            fake_slave("foo2", "no", "yes"),
            fake_slave("bar2", "no", "no"),
        ],
        "frameworks": [{"tasks": [fake_task("foo1"), fake_task("bar1")]}],
    }
    grouping_func = metastatus_lib.key_func_for_attribute_multi(["one", "two"])

    resp = metastatus_lib.get_resource_utilization_by_grouping(
        mesos_state=fake_state, grouping_func=grouping_func
    )

    # One bucket per (one, two) combination...
    assert len(resp.keys()) == 4
    first_key = list(resp.keys())[0]
    # ...keyed by a set of two items...
    assert len(first_key) == 2
    # ...where each item is an (attribute name, value) pair.
    assert len(list(first_key)[0]) == 2
def test_get_resource_utilization_by_grouping_correctly_groups():
    """Free CPU for a group should reflect the running tasks on that slave."""
    def running_task(slave_id):
        return {
            'state': 'TASK_RUNNING',
            'resources': {'cpus': 1, 'mem': 10, 'disk': 10},
            'slave_id': slave_id,
        }

    def plain_slave(slave_id):
        return {
            'id': slave_id,
            'resources': {'disk': 100, 'cpus': 10, 'mem': 50},
            'reserved_resources': {},
        }

    fake_state = {
        'slaves': [plain_slave('foo'), plain_slave('bar')],
        'frameworks': [
            {'tasks': [running_task('foo'), running_task('bar')]},
        ],
    }

    utilization = metastatus_lib.get_resource_utilization_by_grouping(
        mesos_state=fake_state,
        grouping_func=lambda x: x['id'],
    )
    # 10 CPUs on 'foo' minus the 1 CPU used by its running task.
    assert utilization['foo']['free'].cpus == 9
def test_get_resource_utilization_by_grouping_correctly_groups():
    """Grouping by slave id should subtract task usage from each slave's totals."""
    def make_slave(name):
        return {
            "id": name,
            "resources": {"disk": 100, "cpus": 10, "mem": 50},
            "reserved_resources": {},
        }

    fake_state = {
        "slaves": [make_slave("foo"), make_slave("bar")],
        "frameworks": [{
            "tasks": [
                {
                    "state": "TASK_RUNNING",
                    "resources": {"cpus": 1, "mem": 10, "disk": 10},
                    "slave_id": name,
                }
                for name in ("foo", "bar")
            ],
        }],
    }

    result = metastatus_lib.get_resource_utilization_by_grouping(
        mesos_state=fake_state,
        grouping_func=lambda x: x["id"],
    )
    # One of the ten CPUs on "foo" is consumed by its running task.
    free_cpus = result["foo"]["free"].cpus
    assert free_cpus == 9
def utilization_table_by_grouping_from_mesos_state(
    groupings: Sequence[str],
    threshold: float,
    humanize: bool,
    mesos_state: Dict,
) -> Tuple[
    List[List[str]],
    bool,
]:
    """Build a utilization table with one row per grouping-value combination.

    :param groupings: slave attribute names to group utilization by
    :param threshold: utilization percentage over which a group is unhealthy
    :param humanize: whether to render resource numbers human-readable
    :param mesos_state: the mesos state dump
    :returns: (all table rows including the header row, healthy flag)
    """
    grouping_function = metastatus_lib.key_func_for_attribute_multi(groupings)
    resource_info_dict_grouped = metastatus_lib.get_resource_utilization_by_grouping(
        grouping_function,
        mesos_state,
    )
    static_headers = [
        'CPU (used/total)',
        'RAM (used/total)',
        'Disk (used/total)',
        'GPU (used/total)',
        'Agent count',
    ]
    all_rows = [
        [grouping.capitalize() for grouping in groupings] + static_headers,
    ]
    table_rows = []
    # BUGFIX: initialize before the loop; when there are no groups at all the
    # name was previously unbound and the return raised UnboundLocalError.
    healthy_exit = True
    for grouping_values, resource_info_dict in resource_info_dict_grouped.items():
        resource_utilizations = metastatus_lib.resource_utillizations_from_resource_info(
            total=resource_info_dict['total'],
            free=resource_info_dict['free'],
        )
        healthcheck_utilization_pairs = [
            metastatus_lib.healthcheck_result_resource_utilization_pair_for_resource_utilization(
                utilization,
                threshold,
            ) for utilization in resource_utilizations
        ]
        # BUGFIX: accumulate across groups instead of overwriting, so a single
        # unhealthy group marks the whole table unhealthy (previously only the
        # last-iterated group decided the flag).
        healthy_exit = healthy_exit and all(
            pair[0].healthy for pair in healthcheck_utilization_pairs)
        table_rows.append(metastatus_lib.get_table_rows_for_resource_info_dict(
            [v for g, v in grouping_values],
            healthcheck_utilization_pairs,
            humanize,
        ) + [str(resource_info_dict['slave_count'])])
    # Sort rows by their grouping-value columns for stable output.
    table_rows = sorted(table_rows, key=lambda x: x[0:len(groupings)])
    all_rows.extend(table_rows)
    return all_rows, healthy_exit
def utilization_table_by_grouping_from_mesos_state(
    groupings: Sequence[str],
    threshold: float,
    mesos_state: MesosState,
    service_instance_stats: Optional[ServiceInstanceStats] = None,
) -> Tuple[Sequence[MutableSequence[str]], bool]:
    """Group mesos utilization by the given attributes and render it as a table.

    Thin wrapper: computes the grouped utilization dict from the mesos state,
    then delegates rendering to utilization_table_by_grouping().
    """
    key_func = metastatus_lib.key_func_for_attribute_multi(groupings)
    grouped_utilization = metastatus_lib.get_resource_utilization_by_grouping(
        key_func,
        mesos_state,
    )
    return utilization_table_by_grouping(
        groupings,
        key_func,
        grouped_utilization,
        threshold,
        service_instance_stats,
    )
def test_get_resource_utilization_by_grouping(
    mock_get_all_tasks_from_state,
    mock_calculate_resource_utilization_for_slaves,
    mock_group_slaves_by_key_func,
):
    """Grouped utilization should echo the mocked free/total ResourceInfo per group."""
    mock_group_slaves_by_key_func.return_value = {
        'somenametest-habitat': [{
            'id': 'abcd',
            'hostname': 'test.somewhere.www'
        }],
        'somenametest-habitat-2': [{
            'id': 'abcd',
            'hostname': 'test2.somewhere.www'
        }]
    }
    mock_calculate_resource_utilization_for_slaves.return_value = {
        'free': metastatus_lib.ResourceInfo(cpus=10, mem=10, disk=10),
        'total': metastatus_lib.ResourceInfo(cpus=20, mem=20, disk=20)
    }
    mock_get_all_tasks_from_state([Mock(), Mock()])
    state = {
        'frameworks': Mock(),
        'slaves': [{}]
    }
    actual = metastatus_lib.get_resource_utilization_by_grouping(
        grouping_func=lambda slave: slave['attributes']['habitat'],
        mesos_state=state,
    )
    assert sorted(actual.keys()) == sorted(['somenametest-habitat', 'somenametest-habitat-2'])
    for k, v in actual.items():
        # BUGFIX: `print v` is Python 2-only statement syntax (a SyntaxError
        # under Python 3); the call form behaves identically on both.
        print(v)
        assert v['total'] == metastatus_lib.ResourceInfo(
            cpus=20, disk=20, mem=20
        )
        assert v['free'] == metastatus_lib.ResourceInfo(
            cpus=10, disk=10, mem=10
        )
def resources_utilization(request):
    """API view: report free/used/total resources grouped by slave attributes.

    Reads the optional 'groupings' and 'filter' swagger parameters from the
    request and returns a JSON list with one entry per grouping-value
    combination.
    """
    master = get_mesos_master()
    mesos_state = block(master.state)

    groupings = request.swagger_data.get('groupings', ['superregion'])
    # swagger actually makes the key None if it's not set
    if groupings is None:
        groupings = ['superregion']
    grouping_function = metastatus_lib.key_func_for_attribute_multi(groupings)
    sorting_function = metastatus_lib.sort_func_for_attributes(groupings)

    filters = request.swagger_data.get('filter', [])
    filters = parse_filters(filters)
    filter_funcs = [
        metastatus_lib.make_filter_slave_func(attr, vals)
        for attr, vals in filters.items()
    ]

    resource_info_dict = metastatus_lib.get_resource_utilization_by_grouping(
        grouping_func=grouping_function,
        mesos_state=mesos_state,
        filters=filter_funcs,
        sort_func=sorting_function,
    )

    response_body = []
    for group_key, resource_info in resource_info_dict.items():
        # group_key is an iterable of (attribute, value) pairs.
        group = {'groupings': dict(group_key)}
        total_fields = resource_info['total']._asdict()
        free_fields = resource_info['free']._asdict()
        for resource, total_value in total_fields.items():
            free_value = free_fields[resource]
            group[resource] = {
                'total': total_value,
                'free': free_value,
                'used': total_value - free_value,
            }
        response_body.append(group)
    return Response(json_body=response_body, status_code=200)
def test_get_resource_utilization_by_grouping(
    mock_get_all_tasks_from_state,
    mock_calculate_resource_utilization_for_slaves,
    mock_group_slaves_by_key_func,
):
    """Each habitat group should report the mocked free/total ResourceInfo."""
    mock_group_slaves_by_key_func.return_value = {
        "somenametest-habitat": [
            {"id": "abcd", "hostname": "test.somewhere.www"},
        ],
        "somenametest-habitat-2": [
            {"id": "abcd", "hostname": "test2.somewhere.www"},
        ],
    }
    mock_calculate_resource_utilization_for_slaves.return_value = {
        "free": metastatus_lib.ResourceInfo(cpus=10, mem=10, disk=10),
        "total": metastatus_lib.ResourceInfo(cpus=20, mem=20, disk=20),
    }
    mock_get_all_tasks_from_state([Mock(), Mock()])
    state = {"frameworks": Mock(), "slaves": [{}]}

    actual = metastatus_lib.get_resource_utilization_by_grouping(
        grouping_func=lambda slave: slave["attributes"]["habitat"],
        mesos_state=state,
    )

    expected_keys = ["somenametest-habitat", "somenametest-habitat-2"]
    assert sorted(actual.keys()) == sorted(expected_keys)
    for group_info in actual.values():
        paasta_print(group_info)
        assert group_info["total"] == metastatus_lib.ResourceInfo(cpus=20, disk=20, mem=20)
        assert group_info["free"] == metastatus_lib.ResourceInfo(cpus=10, disk=10, mem=10)
def get_mesos_utilization_error(
    mesos_state,
    region,
    pool,
    target_utilization,
):
    """Return how far the given region/pool is over (positive) or under
    (negative) its target utilization.

    :param mesos_state: the mesos state dump
    :param region: datacenter/region attribute value to look up
    :param pool: pool attribute value to look up
    :param target_utilization: desired utilization fraction (0.0 - 1.0)
    :returns: utilization - target_utilization, or 0 when the region/pool is
        not found or carries no usable resource totals
    """
    try:
        region_pool_utilization_dict = get_resource_utilization_by_grouping(
            lambda slave: (
                slave['attributes']['pool'],
                slave['attributes']['datacenter'],
            ),
            mesos_state,
        )[(
            pool,
            region,
        )]
    except KeyError:
        # BUGFIX: the format string previously had no arguments, so the
        # literal "%s" placeholders were logged instead of the region/pool.
        log.info(
            "Failed to find utilization for region %s, pool %s, returning 0 error",
            region,
            pool,
        )
        return 0
    log.debug(region_pool_utilization_dict)
    free_pool_resources = region_pool_utilization_dict['free']
    total_pool_resources = region_pool_utilization_dict['total']
    free_percs = []
    for free, total in zip(free_pool_resources, total_pool_resources):
        # A resource with a ~zero total (e.g. no GPUs in the pool) carries no
        # signal; skip it to avoid dividing by zero.
        if math.isclose(total, 0):
            continue
        free_percs.append(float(free) / float(total))
    if len(free_percs) == 0:  # If all resource totals are close to 0 for some reason
        return 0
    utilization = 1.0 - min(free_percs)
    return utilization - target_utilization
def get_mesos_utilization_error(spotfleet_request_id,
                                resource,
                                pool_settings,
                                slaves,
                                mesos_state,
                                desired_instances=None):
    """Return how far the resource's pool is over (positive) or under
    (negative) its target utilization.

    :param spotfleet_request_id: id of the spotfleet request being scaled
    :param resource: the scaling resource dict; its 'pool' key selects the pool
    :param pool_settings: per-pool settings; may provide 'target_utilization'
    :param slaves: the mesos slaves belonging to this resource
    :param mesos_state: the mesos state dump used to compute utilization
    :param desired_instances: if set, sanity-check that enough of the desired
        instances have registered in mesos before scaling
    :returns: utilization - target_utilization, or 0 when no resource in the
        pool has a non-zero total (no usable signal)
    :raises ClusterAutoscalingError: if no instances are desired, or too many
        desired instances are missing from mesos
    """
    current_instances = len(slaves)
    if desired_instances == 0:
        error_message = ("No instances are active, not scaling until the instances are launched")
        raise ClusterAutoscalingError(error_message)
    if desired_instances:
        log.info("Found %.2f%% slaves registered in mesos for this resource (%d/%d)" % (
            float(float(current_instances) / float(desired_instances)) * 100,
            current_instances,
            desired_instances))
        if float(current_instances) / desired_instances < (1.00 - MISSING_SLAVE_PANIC_THRESHOLD):
            error_message = ("We currently have %d instances active in mesos out of a desired %d.\n"
                             "Refusing to scale because we either need to wait for the requests to be "
                             "filled, or the new instances are not healthy for some reason.\n"
                             "(cowardly refusing to go past %.2f%% missing instances)") % (
                current_instances, desired_instances, MISSING_SLAVE_PANIC_THRESHOLD)
            raise ClusterAutoscalingError(error_message)

    pool_utilization_dict = get_resource_utilization_by_grouping(
        lambda slave: slave['attributes']['pool'],
        mesos_state
    )[resource['pool']]

    log.debug(pool_utilization_dict)
    free_pool_resources = pool_utilization_dict['free']
    total_pool_resources = pool_utilization_dict['total']
    free_percs = []
    for free, total in zip(free_pool_resources, total_pool_resources):
        # BUGFIX: a resource with a zero total (e.g. no GPUs in the pool)
        # previously caused a ZeroDivisionError; it carries no utilization
        # signal, so skip it.
        if total == 0:
            continue
        free_percs.append(float(free) / float(total))
    # If every total was zero there is no usable signal: report "no error".
    if not free_percs:
        return 0
    utilization = 1.0 - min(free_percs)
    target_utilization = pool_settings.get('target_utilization', DEFAULT_TARGET_UTILIZATION)
    return utilization - target_utilization
def main(argv=None):
    """Entry point: print overall PaaSTA cluster status and exit.

    Exit code is 2 when any healthcheck (or, in verbose mode, any grouped
    resource-utilization threshold) fails, 0 otherwise.
    """
    args = parse_args(argv)
    system_paasta_config = load_system_paasta_config()

    master_kwargs = {}
    # we don't want to be passing False to not override a possible True
    # value from system config
    if args.use_mesos_cache:
        master_kwargs['use_mesos_cache'] = True
    master = get_mesos_master(**master_kwargs)

    marathon_servers = get_marathon_servers(system_paasta_config)
    marathon_clients = all_marathon_clients(get_marathon_clients(marathon_servers))

    try:
        mesos_state = master.state
        all_mesos_results = _run_mesos_checks(
            mesos_master=master,
            mesos_state=mesos_state,
            marathon_clients=marathon_clients,
        )
    except MasterNotAvailableException as e:
        # if we can't connect to master at all, then bomb out early
        paasta_print(PaastaColors.red("CRITICAL: %s" % e.message))
        sys.exit(2)

    # Check to see if Chronos should be running here by checking for config
    chronos_config = load_chronos_config()
    if chronos_config:
        chronos_client = get_chronos_client(chronos_config, cached=True)
        try:
            chronos_results = metastatus_lib.get_chronos_status(chronos_client)
        except (chronos.ChronosAPIError) as e:
            paasta_print(PaastaColors.red("CRITICAL: Unable to contact Chronos! \nError: %s" % e))
            sys.exit(2)
    else:
        chronos_results = [metastatus_lib.HealthCheckResult(
            message='Chronos is not configured to run here',
            healthy=True,
        )]

    marathon_results = _run_marathon_checks(marathon_clients)

    mesos_ok = all(metastatus_lib.status_for_results(all_mesos_results))
    marathon_ok = all(metastatus_lib.status_for_results(marathon_results))
    chronos_ok = all(metastatus_lib.status_for_results(chronos_results))

    mesos_summary = metastatus_lib.generate_summary_for_check("Mesos", mesos_ok)
    marathon_summary = metastatus_lib.generate_summary_for_check("Marathon", marathon_ok)
    chronos_summary = metastatus_lib.generate_summary_for_check("Chronos", chronos_ok)

    healthy_exit = True if all([mesos_ok, marathon_ok, chronos_ok]) else False

    paasta_print("Master paasta_tools version: {}".format(__version__))
    metastatus_lib.print_results_for_healthchecks(mesos_summary, mesos_ok, all_mesos_results, args.verbose)

    if args.verbose > 1:
        # One utilization table per requested grouping attribute.
        for grouping in args.groupings:
            print_with_indent('Resources Grouped by %s' % grouping, 2)
            grouping_function = metastatus_lib.key_func_for_attribute(grouping)
            grouped_info = metastatus_lib.get_resource_utilization_by_grouping(
                grouping_function,
                mesos_state,
            )
            all_rows = [[
                grouping.capitalize(),
                'CPU (used/total)',
                'RAM (used/total)',
                'Disk (used/total)',
                'GPU (used/total)',
                'Agent count',
            ]]
            table_rows = []
            for attribute_value, group_info in grouped_info.items():
                resource_utilizations = metastatus_lib.resource_utillizations_from_resource_info(
                    total=group_info['total'],
                    free=group_info['free'],
                )
                healthcheck_utilization_pairs = [
                    metastatus_lib.healthcheck_result_resource_utilization_pair_for_resource_utilization(
                        utilization,
                        args.threshold,
                    ) for utilization in resource_utilizations
                ]
                healthy_exit = all(pair[0].healthy for pair in healthcheck_utilization_pairs)
                table_rows.append(metastatus_lib.get_table_rows_for_resource_info_dict(
                    attribute_value,
                    healthcheck_utilization_pairs,
                    args.humanize,
                ) + [str(group_info['slave_count'])])
            table_rows = sorted(table_rows, key=lambda x: x[0])
            all_rows.extend(table_rows)
            for line in format_table(all_rows):
                print_with_indent(line, 4)

    if args.autoscaling_info:
        print_with_indent("Autoscaling resources:", 2)
        headers = [field.replace("_", " ").capitalize() for field in AutoscalingInfo._fields]
        table = functools.reduce(
            lambda x, y: x + [(y)],
            get_autoscaling_info_for_all_resources(mesos_state),
            [headers],
        )
        for line in format_table(table):
            print_with_indent(line, 4)

    if args.verbose >= 3:
        print_with_indent('Per Slave Utilization', 2)
        slave_resource_dict = metastatus_lib.get_resource_utilization_by_grouping(
            lambda slave: slave['hostname'],
            mesos_state,
        )
        all_rows = [['Hostname', 'CPU (used/total)', 'RAM (used//total)', 'Disk (used//total)', 'GPU (used/total)']]
        # print info about slaves here. Note that we don't make modifications to
        # the healthy_exit variable here, because we don't care about a single slave
        # having high usage.
        for attribute_value, slave_info in slave_resource_dict.items():
            table_rows = []
            resource_utilizations = metastatus_lib.resource_utillizations_from_resource_info(
                total=slave_info['total'],
                free=slave_info['free'],
            )
            healthcheck_utilization_pairs = [
                metastatus_lib.healthcheck_result_resource_utilization_pair_for_resource_utilization(
                    utilization,
                    args.threshold,
                ) for utilization in resource_utilizations
            ]
            table_rows.append(metastatus_lib.get_table_rows_for_resource_info_dict(
                attribute_value,
                healthcheck_utilization_pairs,
                args.humanize,
            ))
            table_rows = sorted(table_rows, key=lambda x: x[0])
            all_rows.extend(table_rows)
        for line in format_table(all_rows):
            print_with_indent(line, 4)

    metastatus_lib.print_results_for_healthchecks(marathon_summary, marathon_ok, marathon_results, args.verbose)
    metastatus_lib.print_results_for_healthchecks(chronos_summary, chronos_ok, chronos_results, args.verbose)

    sys.exit(2 if not healthy_exit else 0)
def main(argv=None):
    """Entry point: check mesos/marathon/chronos health and print a status report.

    Exits 2 on any critical failure or failed healthcheck, 0 otherwise.
    """
    marathon_config = None
    chronos_config = None
    args = parse_args(argv)
    master = get_mesos_master()
    try:
        mesos_state = master.state
    except MasterNotAvailableException as e:
        # if we can't connect to master at all,
        # then bomb out early
        paasta_print(PaastaColors.red("CRITICAL: %s" % e.message))
        sys.exit(2)

    mesos_state_status = metastatus_lib.get_mesos_state_status(
        mesos_state=mesos_state,
    )
    metrics = master.metrics_snapshot()
    mesos_metrics_status = metastatus_lib.get_mesos_resource_utilization_health(
        mesos_metrics=metrics, mesos_state=mesos_state)
    framework_metrics_healthchecks = metastatus_lib.get_framework_metrics_status(
        metrics=metrics)

    all_mesos_results = mesos_state_status + mesos_metrics_status + framework_metrics_healthchecks

    # Check to see if Marathon should be running here by checking for config
    marathon_config = marathon_tools.load_marathon_config()
    # Check to see if Chronos should be running here by checking for config
    chronos_config = load_chronos_config()

    if marathon_config:
        marathon_client = metastatus_lib.get_marathon_client(marathon_config)
        try:
            marathon_results = metastatus_lib.get_marathon_status(
                marathon_client)
        except (MarathonError, InternalServerError, ValueError) as e:
            # catch ValueError until marathon-python/pull/167 is merged and this is handled upstream
            # BUGFIX: .format() previously bound only to the second adjacent
            # string literal, so the cluster URL was never substituted and a
            # literal "{}" was printed; format the concatenated message instead.
            paasta_print(
                PaastaColors.red(
                    ("CRITICAL: Unable to contact Marathon cluster at {}!"
                     "Is the cluster healthy?").format(
                        marathon_config["url"])))
            sys.exit(2)
    else:
        marathon_results = [
            metastatus_lib.HealthCheckResult(
                message='Marathon is not configured to run here',
                healthy=True)
        ]
    if chronos_config:
        chronos_client = get_chronos_client(chronos_config)
        try:
            chronos_results = metastatus_lib.get_chronos_status(chronos_client)
        except (chronos.ChronosAPIError) as e:
            paasta_print(
                PaastaColors.red(
                    "CRITICAL: Unable to contact Chronos! \nError: %s" % e))
            sys.exit(2)
    else:
        chronos_results = [
            metastatus_lib.HealthCheckResult(
                message='Chronos is not configured to run here',
                healthy=True)
        ]

    mesos_ok = all(metastatus_lib.status_for_results(all_mesos_results))
    marathon_ok = all(metastatus_lib.status_for_results(marathon_results))
    chronos_ok = all(metastatus_lib.status_for_results(chronos_results))

    mesos_summary = metastatus_lib.generate_summary_for_check(
        "Mesos", mesos_ok)
    marathon_summary = metastatus_lib.generate_summary_for_check(
        "Marathon", marathon_ok)
    chronos_summary = metastatus_lib.generate_summary_for_check(
        "Chronos", chronos_ok)

    healthy_exit = True if all([mesos_ok, marathon_ok, chronos_ok]) else False

    paasta_print("Master paasta_tools version: {}".format(__version__))
    metastatus_lib.print_results_for_healthchecks(mesos_summary, mesos_ok,
                                                  all_mesos_results,
                                                  args.verbose)

    if args.verbose > 1:
        # One utilization table per requested grouping attribute.
        for grouping in args.groupings:
            print_with_indent('Resources Grouped by %s' % grouping, 2)
            grouping_function = metastatus_lib.key_func_for_attribute(grouping)
            resource_info_by_group = metastatus_lib.get_resource_utilization_by_grouping(
                grouping_function, mesos_state)
            all_rows = [[
                grouping.capitalize(), 'CPU (used/total)', 'RAM (used/total)',
                'Disk (used/total)'
            ]]
            table_rows = []
            for attribute_value, resource_info_dict in resource_info_by_group.items():
                resource_utilizations = metastatus_lib.resource_utillizations_from_resource_info(
                    total=resource_info_dict['total'],
                    free=resource_info_dict['free'],
                )
                healthcheck_utilization_pairs = [
                    metastatus_lib.
                    healthcheck_result_resource_utilization_pair_for_resource_utilization(
                        utilization, args.threshold)
                    for utilization in resource_utilizations
                ]
                healthy_exit = all(pair[0].healthy
                                   for pair in healthcheck_utilization_pairs)
                table_rows.append(
                    metastatus_lib.get_table_rows_for_resource_info_dict(
                        attribute_value, healthcheck_utilization_pairs,
                        args.humanize))
            table_rows = sorted(table_rows, key=lambda x: x[0])
            all_rows.extend(table_rows)
            for line in format_table(all_rows):
                print_with_indent(line, 4)

    if args.autoscaling_info:
        print_with_indent("Autoscaling resources:", 2)
        headers = [
            field.replace("_", " ").capitalize()
            for field in AutoscalingInfo._fields
        ]
        table = reduce(lambda x, y: x + [(y)],
                       get_autoscaling_info_for_all_resources(), [headers])
        for line in format_table(table):
            print_with_indent(line, 4)

    if args.verbose == 3:
        print_with_indent('Per Slave Utilization', 2)
        slave_resource_dict = metastatus_lib.get_resource_utilization_by_grouping(
            lambda slave: slave['hostname'], mesos_state)
        all_rows = [[
            'Hostname', 'CPU (used/total)', 'RAM (used//total)',
            'Disk (used//total)'
        ]]
        # print info about slaves here. Note that we don't make modifications to
        # the healthy_exit variable here, because we don't care about a single slave
        # having high usage.
        for attribute_value, resource_info_dict in slave_resource_dict.items():
            table_rows = []
            resource_utilizations = metastatus_lib.resource_utillizations_from_resource_info(
                total=resource_info_dict['total'],
                free=resource_info_dict['free'],
            )
            healthcheck_utilization_pairs = [
                metastatus_lib.
                healthcheck_result_resource_utilization_pair_for_resource_utilization(
                    utilization, args.threshold)
                for utilization in resource_utilizations
            ]
            table_rows.append(
                metastatus_lib.get_table_rows_for_resource_info_dict(
                    attribute_value, healthcheck_utilization_pairs,
                    args.humanize))
            table_rows = sorted(table_rows, key=lambda x: x[0])
            all_rows.extend(table_rows)
        for line in format_table(all_rows):
            print_with_indent(line, 4)

    metastatus_lib.print_results_for_healthchecks(marathon_summary,
                                                  marathon_ok,
                                                  marathon_results,
                                                  args.verbose)
    metastatus_lib.print_results_for_healthchecks(chronos_summary, chronos_ok,
                                                  chronos_results,
                                                  args.verbose)
    if not healthy_exit:
        sys.exit(2)
    else:
        sys.exit(0)
def main():
    """Entry point: check mesos/marathon/chronos health and print a status report.

    Exits 2 on any critical failure or failed healthcheck, 0 otherwise.
    """
    marathon_config = None
    chronos_config = None
    args = parse_args()
    master = get_mesos_master()
    try:
        mesos_state = master.state
    except MasterNotAvailableException as e:
        # if we can't connect to master at all,
        # then bomb out early
        print(PaastaColors.red("CRITICAL: %s" % e.message))
        sys.exit(2)
    mesos_state_status = metastatus_lib.get_mesos_state_status(
        mesos_state=mesos_state,
    )
    metrics = master.metrics_snapshot()
    mesos_metrics_status = metastatus_lib.get_mesos_resource_utilization_health(mesos_metrics=metrics,
                                                                                mesos_state=mesos_state)
    framework_metrics_healthchecks = metastatus_lib.get_framework_metrics_status(metrics=metrics)

    all_mesos_results = mesos_state_status + mesos_metrics_status + framework_metrics_healthchecks

    # Check to see if Marathon should be running here by checking for config
    marathon_config = marathon_tools.load_marathon_config()

    # Check to see if Chronos should be running here by checking for config
    chronos_config = load_chronos_config()

    if marathon_config:
        marathon_client = metastatus_lib.get_marathon_client(marathon_config)
        try:
            marathon_results = metastatus_lib.get_marathon_status(marathon_client)
        except MarathonError as e:
            print(PaastaColors.red("CRITICAL: Unable to contact Marathon! Error: %s" % e))
            sys.exit(2)
    else:
        marathon_results = [metastatus_lib.HealthCheckResult(message='Marathon is not configured to run here',
                                                             healthy=True)]
    if chronos_config:
        chronos_client = get_chronos_client(chronos_config)
        try:
            chronos_results = metastatus_lib.get_chronos_status(chronos_client)
        except (chronos.ChronosAPIError) as e:
            print(PaastaColors.red("CRITICAL: Unable to contact Chronos! \nError: %s" % e))
            sys.exit(2)
    else:
        chronos_results = [metastatus_lib.HealthCheckResult(message='Chronos is not configured to run here',
                                                            healthy=True)]

    mesos_ok = all(metastatus_lib.status_for_results(all_mesos_results))
    marathon_ok = all(metastatus_lib.status_for_results(marathon_results))
    chronos_ok = all(metastatus_lib.status_for_results(chronos_results))

    mesos_summary = metastatus_lib.generate_summary_for_check("Mesos", mesos_ok)
    marathon_summary = metastatus_lib.generate_summary_for_check("Marathon", marathon_ok)
    chronos_summary = metastatus_lib.generate_summary_for_check("Chronos", chronos_ok)

    healthy_exit = True if all([mesos_ok, marathon_ok, chronos_ok]) else False

    # BUGFIX: this was a Python 2-only `print` statement (a SyntaxError under
    # Python 3); the call form behaves identically on both, matching the
    # print(...) calls already used elsewhere in this function.
    print("Master paasta_tools version: {0}".format(__version__))
    metastatus_lib.print_results_for_healthchecks(mesos_summary, mesos_ok, all_mesos_results, args.verbose)
    if args.verbose > 1:
        # One utilization table per requested grouping attribute.
        for grouping in args.groupings:
            print_with_indent('Resources Grouped by %s' % grouping, 2)
            grouping_function = metastatus_lib.key_func_for_attribute(grouping)
            resource_info_by_group = metastatus_lib.get_resource_utilization_by_grouping(grouping_function,
                                                                                         mesos_state)
            all_rows = [[grouping.capitalize(), 'CPU (free/total)', 'RAM (free/total)', 'Disk (free/total)']]
            table_rows = []
            for attribute_value, resource_info_dict in resource_info_by_group.items():
                resource_utilizations = metastatus_lib.resource_utillizations_from_resource_info(
                    total=resource_info_dict['total'],
                    free=resource_info_dict['free'],
                )
                healthcheck_utilization_pairs = [
                    metastatus_lib.healthcheck_result_resource_utilization_pair_for_resource_utilization(
                        utilization, args.threshold)
                    for utilization in resource_utilizations
                ]
                healthy_exit = all(pair[0].healthy for pair in healthcheck_utilization_pairs)
                table_rows.append(metastatus_lib.get_table_rows_for_resource_info_dict(
                    attribute_value,
                    healthcheck_utilization_pairs,
                    args.humanize
                ))
            table_rows = sorted(table_rows, key=lambda x: x[0])
            all_rows.extend(table_rows)
            for line in format_table(all_rows):
                print_with_indent(line, 4)
    if args.verbose == 3:
        print_with_indent('Per Slave Utilization', 2)
        slave_resource_dict = metastatus_lib.get_resource_utilization_by_grouping(lambda slave: slave['hostname'],
                                                                                  mesos_state)
        all_rows = [['Hostname', 'CPU (free/total)', 'RAM (free/total)', 'Disk (free/total)']]
        # print info about slaves here. Note that we don't make modifications to
        # the healthy_exit variable here, because we don't care about a single slave
        # having high usage.
        for attribute_value, resource_info_dict in slave_resource_dict.items():
            table_rows = []
            resource_utilizations = metastatus_lib.resource_utillizations_from_resource_info(
                total=resource_info_dict['total'],
                free=resource_info_dict['free'],
            )
            healthcheck_utilization_pairs = [
                metastatus_lib.healthcheck_result_resource_utilization_pair_for_resource_utilization(
                    utilization, args.threshold)
                for utilization in resource_utilizations
            ]
            table_rows.append(metastatus_lib.get_table_rows_for_resource_info_dict(
                attribute_value,
                healthcheck_utilization_pairs,
                args.humanize
            ))
            table_rows = sorted(table_rows, key=lambda x: x[0])
            all_rows.extend(table_rows)
        for line in format_table(all_rows):
            print_with_indent(line, 4)
    metastatus_lib.print_results_for_healthchecks(marathon_summary, marathon_ok, marathon_results, args.verbose)
    metastatus_lib.print_results_for_healthchecks(chronos_summary, chronos_ok, chronos_results, args.verbose)
    if not healthy_exit:
        sys.exit(2)
    else:
        sys.exit(0)
def test_get_resource_utilization_by_grouping_correctly_multi_groups():
    """Grouping on ('one', 'two') yields one bucket per attribute combination."""
    def build_slave(slave_id, one, two):
        # Slaves differ only in their id and grouping attributes.
        return {
            'id': slave_id,
            'resources': {'disk': 100, 'cpus': 10, 'mem': 50},
            'attributes': {'one': one, 'two': two},
            'reserved_resources': {},
        }

    def build_task(slave_id):
        return {
            'state': 'TASK_RUNNING',
            'resources': {'cpus': 1, 'mem': 10, 'disk': 10},
            'slave_id': slave_id,
        }

    fake_state = {
        'slaves': [
            build_slave('foo1', 'yes', 'yes'),
            build_slave('bar1', 'yes', 'no'),
            build_slave('foo2', 'no', 'yes'),
            build_slave('bar2', 'no', 'no'),
        ],
        'frameworks': [
            {'tasks': [build_task('foo1'), build_task('bar1')]},
        ],
    }
    grouping_func = metastatus_lib.key_func_for_attribute_multi(['one', 'two'])

    resp = metastatus_lib.get_resource_utilization_by_grouping(
        mesos_state=fake_state,
        grouping_func=grouping_func,
    )

    # resp should have 4 keys...
    assert len(resp.keys()) == 4
    some_key = list(resp.keys())[0]
    # Each key should be a set with 2 items...
    assert len(some_key) == 2
    # Each item in the set should have 2 values (original key, value)
    assert len(list(some_key)[0]) == 2