def test_status_for_results():
    """A healthy and an unhealthy result should map to ``[True, False]``."""
    results = [
        metastatus_lib.HealthCheckResult(message="message", healthy=True),
        metastatus_lib.HealthCheckResult(message="message", healthy=False),
    ]

    statuses = metastatus_lib.status_for_results(results)

    assert statuses == [True, False]
def test_critical_events_in_outputs():
    """Only the unhealthy result is expected among the critical events."""
    healthy_result = metastatus_lib.HealthCheckResult("myservice", True)
    unhealthy_result = metastatus_lib.HealthCheckResult("myservice_false", False)

    critical_events = metastatus_lib.critical_events_in_outputs(
        [healthy_result, unhealthy_result]
    )

    assert critical_events == [("myservice_false", False)]
def test_get_chronos_status(mock_queued_jobs, mock_scheduled_jobs):
    """get_chronos_status should return the queued result, then the scheduled one.

    NOTE(review): both arguments are presumably injected by patch decorators
    that are not visible in this view -- confirm against the full file.
    """
    scheduled_result = metastatus_lib.HealthCheckResult(
        message='Enabled chronos jobs: 1',
        healthy=True,
    )
    queued_result = metastatus_lib.HealthCheckResult(
        message="Jobs Queued: 0 (0%)",
        healthy=True,
    )
    mock_queued_jobs.return_value = queued_result
    mock_scheduled_jobs.return_value = scheduled_result

    chronos_status = metastatus_lib.get_chronos_status(Mock())

    assert chronos_status == [queued_result, scheduled_result]
def test_assert_chronos_queued_jobs_queued():
    """One queued job out of two enabled jobs is reported as 50.0% and healthy."""
    gauges = {
        metastatus_lib.HIGH_QUEUE_GAUGE: {'value': 1},
        metastatus_lib.QUEUE_GAUGE: {'value': 0},
    }
    enabled_jobs = [
        {'name': 'myjob', 'disabled': False},
        {'name': 'myjob', 'disabled': False},
    ]
    mock_client = Mock()
    mock_client.metrics.return_value = {'gauges': gauges}
    mock_client.list.return_value = enabled_jobs

    expected = metastatus_lib.HealthCheckResult(
        message="Jobs Queued: 1 (50.0%)",
        healthy=True,
    )

    assert metastatus_lib.assert_chronos_queued_jobs(mock_client) == expected
def test_healthcheck_result_for_resource_utilization_unhealthy():
    """5 of 10 cpus used against a threshold of 10 is expected to be unhealthy."""
    utilization = metastatus_lib.ResourceUtilization(
        metric='cpus',
        total=10,
        free=5,
    )
    expected = metastatus_lib.HealthCheckResult(
        message='cpus: 5.00/10.00(50.00%) used. Threshold (10.00%)',
        healthy=False,
    )

    actual = metastatus_lib.healthcheck_result_for_resource_utilization(
        resource_utilization=utilization,
        threshold=10,
    )

    assert actual == expected
def test_healthcheck_result_for_resource_utilization_zero():
    """A resource with total=0 and free=0 is expected to be reported healthy at 0.00%."""
    utilization = metastatus_lib.ResourceUtilization(
        metric="cpus",
        total=0,
        free=0,
    )
    expected = metastatus_lib.HealthCheckResult(
        message="cpus: 0.00/0.00(0.00%) used. Threshold (10.00%)",
        healthy=True,
    )

    actual = metastatus_lib.healthcheck_result_for_resource_utilization(
        resource_utilization=utilization,
        threshold=10,
    )

    assert actual == expected
def test_assert_chronos_queued_jobs_no_queued():
    """With both queue gauges at 0 the result is "0 (0.0%)" and healthy."""
    gauges = {
        metastatus_lib.HIGH_QUEUE_GAUGE: {"value": 0},
        metastatus_lib.QUEUE_GAUGE: {"value": 0},
    }
    # One enabled and one disabled job.
    jobs = [
        {"name": "myjob", "disabled": False},
        {"name": "myjob", "disabled": True},
    ]
    mock_client = Mock()
    mock_client.metrics.return_value = {"gauges": gauges}
    mock_client.list.return_value = jobs

    expected = metastatus_lib.HealthCheckResult(
        message="Jobs Queued: 0 (0.0%)",
        healthy=True,
    )

    assert metastatus_lib.assert_chronos_queued_jobs(mock_client) == expected
def print_output(argv: Optional[Sequence[str]] = None) -> None:
    """Run the Mesos, Marathon and Kubernetes health checks and print a report.

    Mesos and Marathon checks only run when ``is_mesos_available()`` is true,
    and Kubernetes checks only when ``is_kubernetes_available()`` is true;
    otherwise a healthy placeholder result is reported for that backend.
    ``args.verbose`` above 1 adds utilization tables grouped by
    ``args.groupings``, and 3 or more adds a per-host table.

    :param argv: command-line arguments handed to ``parse_args``
    :raises FatalError: with code 2 if the Mesos master cannot be reached, or
        if any health check or grouped utilization table is unhealthy
    """
    mesos_available = is_mesos_available()
    kube_available = is_kubernetes_available()

    args = parse_args(argv)

    system_paasta_config = load_system_paasta_config()

    if mesos_available:
        master_kwargs = {}
        # we don't want to be passing False to not override a possible True
        # value from system config
        if args.use_mesos_cache:
            master_kwargs["use_mesos_cache"] = True

        master = get_mesos_master(**master_kwargs)

        marathon_servers = get_marathon_servers(system_paasta_config)
        marathon_clients = all_marathon_clients(
            get_marathon_clients(marathon_servers))

        try:
            mesos_state = a_sync.block(master.state)
            all_mesos_results = _run_mesos_checks(
                mesos_master=master,
                mesos_state=mesos_state,
            )
        except MasterNotAvailableException as e:
            # if we can't connect to master at all,
            # then bomb out early
            paasta_print(PaastaColors.red("CRITICAL: %s" % "\n".join(e.args)))
            raise FatalError(2)

        marathon_results = _run_marathon_checks(marathon_clients)
    else:
        marathon_results = [
            metastatus_lib.HealthCheckResult(
                message="Marathon is not configured to run here",
                healthy=True,
            )
        ]
        all_mesos_results = [
            metastatus_lib.HealthCheckResult(
                message="Mesos is not configured to run here",
                healthy=True,
            )
        ]

    if kube_available:
        kube_client = KubeClient()
        kube_results = _run_kube_checks(kube_client)
    else:
        kube_results = [
            metastatus_lib.HealthCheckResult(
                message="Kubernetes is not configured to run here",
                healthy=True,
            )
        ]

    mesos_ok = all(metastatus_lib.status_for_results(all_mesos_results))
    marathon_ok = all(metastatus_lib.status_for_results(marathon_results))
    kube_ok = all(metastatus_lib.status_for_results(kube_results))

    mesos_summary = metastatus_lib.generate_summary_for_check(
        "Mesos", mesos_ok)
    marathon_summary = metastatus_lib.generate_summary_for_check(
        "Marathon", marathon_ok)
    kube_summary = metastatus_lib.generate_summary_for_check(
        "Kubernetes", kube_ok)

    # Every backend whose summary is printed also counts towards the exit
    # status; a failing Kubernetes check must not be reported as success.
    healthy_exit = all([mesos_ok, marathon_ok, kube_ok])

    paasta_print(f"Master paasta_tools version: {__version__}")
    paasta_print("Mesos leader: %s" % get_mesos_leader())
    metastatus_lib.print_results_for_healthchecks(
        mesos_summary, mesos_ok, all_mesos_results, args.verbose)

    if args.verbose > 1 and mesos_available:
        print_with_indent(
            "Resources Grouped by %s" % ", ".join(args.groupings), 2)
        all_rows, mesos_utilization_ok = utilization_table_by_grouping_from_mesos_state(
            groupings=args.groupings,
            threshold=args.threshold,
            mesos_state=mesos_state,
        )
        # Only ever lower healthy_exit: the utilization verdict must not hide
        # a health check that already failed above.
        if not mesos_utilization_ok:
            healthy_exit = False
        for line in format_table(all_rows):
            print_with_indent(line, 4)

        if args.autoscaling_info:
            print_with_indent("Autoscaling resources:", 2)
            headers = [
                field.replace("_", " ").capitalize()
                for field in AutoscalingInfo._fields
            ]
            table = [headers] + [
                [str(x) for x in asi]
                for asi in get_autoscaling_info_for_all_resources(mesos_state)
            ]
            for line in format_table(table):
                print_with_indent(line, 4)

        if args.verbose >= 3:
            print_with_indent("Per Slave Utilization", 2)
            cluster = system_paasta_config.get_cluster()
            service_instance_stats = get_service_instance_stats(
                args.service, args.instance, cluster)
            if service_instance_stats:
                print_with_indent(
                    "Service-Instance stats:" + str(service_instance_stats), 2)
            # print info about slaves here. Note that we don't make modifications to
            # the healthy_exit variable here, because we don't care about a single slave
            # having high usage.
            all_rows, _ = utilization_table_by_grouping_from_mesos_state(
                groupings=args.groupings + ["hostname"],
                threshold=args.threshold,
                mesos_state=mesos_state,
                service_instance_stats=service_instance_stats,
            )
            # The last column from utilization_table_by_grouping_from_mesos_state is "Agent count", which will always be
            # 1 for per-slave resources, so delete it.
            for row in all_rows:
                row.pop()

            for line in format_table(all_rows):
                print_with_indent(line, 4)

    metastatus_lib.print_results_for_healthchecks(
        marathon_summary, marathon_ok, marathon_results, args.verbose)
    metastatus_lib.print_results_for_healthchecks(
        kube_summary, kube_ok, kube_results, args.verbose)

    if args.verbose > 1 and kube_available:
        print_with_indent(
            "Resources Grouped by %s" % ", ".join(args.groupings), 2)
        all_rows, kube_utilization_ok = utilization_table_by_grouping_from_kube(
            groupings=args.groupings,
            threshold=args.threshold,
            kube_client=kube_client,
        )
        # Same rule as for Mesos: accumulate, never overwrite.
        if not kube_utilization_ok:
            healthy_exit = False
        for line in format_table(all_rows):
            print_with_indent(line, 4)

        if args.autoscaling_info:
            print_with_indent("No autoscaling resources for Kubernetes", 2)

        if args.verbose >= 3:
            print_with_indent("Per Node Utilization", 2)
            cluster = system_paasta_config.get_cluster()
            service_instance_stats = get_service_instance_stats(
                args.service, args.instance, cluster)
            if service_instance_stats:
                print_with_indent(
                    "Service-Instance stats:" + str(service_instance_stats), 2)
            # print info about nodes here. Note that we don't make
            # modifications to the healthy_exit variable here, because we don't
            # care about a single node having high usage.
            all_rows, _ = utilization_table_by_grouping_from_kube(
                groupings=args.groupings + ["hostname"],
                threshold=args.threshold,
                kube_client=kube_client,
                service_instance_stats=service_instance_stats,
            )
            # The last column from utilization_table_by_grouping_from_kube is "Agent count", which will always be
            # 1 for per-node resources, so delete it.
            for row in all_rows:
                row.pop()

            for line in format_table(all_rows):
                print_with_indent(line, 4)

    if not healthy_exit:
        raise FatalError(2)
def main(argv=None):
    """Run the Mesos, Marathon and Chronos health checks and print a report.

    Exits the process with status 0 when every health check and every grouped
    utilization threshold is healthy, and with status 2 otherwise (including
    when the Mesos master or Chronos cannot be contacted).

    :param argv: command-line arguments handed to ``parse_args``
    """
    args = parse_args(argv)

    system_paasta_config = load_system_paasta_config()

    master_kwargs = {}
    # we don't want to be passing False to not override a possible True
    # value from system config
    if args.use_mesos_cache:
        master_kwargs['use_mesos_cache'] = True
    master = get_mesos_master(**master_kwargs)

    marathon_servers = get_marathon_servers(system_paasta_config)
    marathon_clients = all_marathon_clients(get_marathon_clients(marathon_servers))

    try:
        mesos_state = master.state
        all_mesos_results = _run_mesos_checks(
            mesos_master=master,
            mesos_state=mesos_state,
            marathon_clients=marathon_clients,
        )
    except MasterNotAvailableException as e:
        # if we can't connect to master at all,
        # then bomb out early
        # NOTE(review): relies on the exception exposing `.message`, which
        # plain Python 3 exceptions do not -- confirm.
        paasta_print(PaastaColors.red("CRITICAL: %s" % e.message))
        sys.exit(2)

    # Check to see if Chronos should be running here by checking for config
    chronos_config = load_chronos_config()

    if chronos_config:
        chronos_client = get_chronos_client(chronos_config, cached=True)
        try:
            chronos_results = metastatus_lib.get_chronos_status(chronos_client)
        except (chronos.ChronosAPIError) as e:
            paasta_print(PaastaColors.red("CRITICAL: Unable to contact Chronos! Error: %s" % e))
            sys.exit(2)
    else:
        chronos_results = [metastatus_lib.HealthCheckResult(
            message='Chronos is not configured to run here',
            healthy=True,
        )]

    marathon_results = _run_marathon_checks(marathon_clients)

    mesos_ok = all(metastatus_lib.status_for_results(all_mesos_results))
    marathon_ok = all(metastatus_lib.status_for_results(marathon_results))
    chronos_ok = all(metastatus_lib.status_for_results(chronos_results))

    mesos_summary = metastatus_lib.generate_summary_for_check("Mesos", mesos_ok)
    marathon_summary = metastatus_lib.generate_summary_for_check("Marathon", marathon_ok)
    chronos_summary = metastatus_lib.generate_summary_for_check("Chronos", chronos_ok)

    healthy_exit = all([mesos_ok, marathon_ok, chronos_ok])

    paasta_print("Master paasta_tools version: {}".format(__version__))
    metastatus_lib.print_results_for_healthchecks(mesos_summary, mesos_ok, all_mesos_results, args.verbose)
    if args.verbose > 1:
        for grouping in args.groupings:
            print_with_indent('Resources Grouped by %s' % grouping, 2)
            grouping_function = metastatus_lib.key_func_for_attribute(grouping)
            resource_info_by_value = metastatus_lib.get_resource_utilization_by_grouping(
                grouping_function,
                mesos_state,
            )
            all_rows = [[
                grouping.capitalize(), 'CPU (used/total)', 'RAM (used/total)', 'Disk (used/total)',
                'GPU (used/total)', 'Agent count',
            ]]
            table_rows = []
            for attribute_value, resource_info in resource_info_by_value.items():
                resource_utilizations = metastatus_lib.resource_utillizations_from_resource_info(
                    total=resource_info['total'],
                    free=resource_info['free'],
                )
                healthcheck_utilization_pairs = [
                    metastatus_lib.healthcheck_result_resource_utilization_pair_for_resource_utilization(
                        utilization,
                        args.threshold,
                    )
                    for utilization in resource_utilizations
                ]
                # Only ever lower healthy_exit: one unhealthy attribute value
                # must not be forgotten when a later one is healthy, and an
                # already-failed service check must stay failed.
                if not all(pair[0].healthy for pair in healthcheck_utilization_pairs):
                    healthy_exit = False
                table_rows.append(
                    metastatus_lib.get_table_rows_for_resource_info_dict(
                        attribute_value,
                        healthcheck_utilization_pairs,
                        args.humanize,
                    ) + [str(resource_info['slave_count'])]
                )
            all_rows.extend(sorted(table_rows, key=lambda x: x[0]))
            for line in format_table(all_rows):
                print_with_indent(line, 4)

        if args.autoscaling_info:
            print_with_indent("Autoscaling resources:", 2)
            headers = [field.replace("_", " ").capitalize() for field in AutoscalingInfo._fields]
            table = [headers] + list(get_autoscaling_info_for_all_resources(mesos_state))

            for line in format_table(table):
                print_with_indent(line, 4)

        if args.verbose >= 3:
            print_with_indent('Per Slave Utilization', 2)
            slave_resource_dict = metastatus_lib.get_resource_utilization_by_grouping(
                lambda slave: slave['hostname'],
                mesos_state,
            )
            all_rows = [['Hostname', 'CPU (used/total)', 'RAM (used//total)', 'Disk (used//total)',
                         'GPU (used/total)']]

            # print info about slaves here. Note that we don't make modifications to
            # the healthy_exit variable here, because we don't care about a single slave
            # having high usage.
            slave_rows = []
            for hostname, resource_info in slave_resource_dict.items():
                resource_utilizations = metastatus_lib.resource_utillizations_from_resource_info(
                    total=resource_info['total'],
                    free=resource_info['free'],
                )
                healthcheck_utilization_pairs = [
                    metastatus_lib.healthcheck_result_resource_utilization_pair_for_resource_utilization(
                        utilization,
                        args.threshold,
                    )
                    for utilization in resource_utilizations
                ]
                slave_rows.append(metastatus_lib.get_table_rows_for_resource_info_dict(
                    hostname,
                    healthcheck_utilization_pairs,
                    args.humanize,
                ))
            # Collect every slave first so the sort orders the whole table.
            all_rows.extend(sorted(slave_rows, key=lambda x: x[0]))
            for line in format_table(all_rows):
                print_with_indent(line, 4)

    metastatus_lib.print_results_for_healthchecks(marathon_summary, marathon_ok, marathon_results, args.verbose)
    metastatus_lib.print_results_for_healthchecks(chronos_summary, chronos_ok, chronos_results, args.verbose)

    sys.exit(0 if healthy_exit else 2)
def main(argv=None):
    """Run the Mesos, Marathon and Chronos health checks and print a report.

    Marathon and Chronos are only checked when their configuration loads to a
    truthy value; otherwise a healthy placeholder result is reported. Exits
    the process with status 0 when every health check and every grouped
    utilization threshold is healthy, and with status 2 otherwise (including
    when the Mesos master, Marathon or Chronos cannot be contacted).

    :param argv: command-line arguments handed to ``parse_args``
    """
    args = parse_args(argv)

    master = get_mesos_master()
    try:
        mesos_state = master.state
    except MasterNotAvailableException as e:
        # if we can't connect to master at all,
        # then bomb out early
        # NOTE(review): relies on the exception exposing `.message`, which
        # plain Python 3 exceptions do not -- confirm.
        paasta_print(PaastaColors.red("CRITICAL: %s" % e.message))
        sys.exit(2)

    mesos_state_status = metastatus_lib.get_mesos_state_status(
        mesos_state=mesos_state,
    )

    metrics = master.metrics_snapshot()
    mesos_metrics_status = metastatus_lib.get_mesos_resource_utilization_health(
        mesos_metrics=metrics,
        mesos_state=mesos_state,
    )
    framework_metrics_healthchecks = metastatus_lib.get_framework_metrics_status(
        metrics=metrics)

    all_mesos_results = mesos_state_status + mesos_metrics_status + framework_metrics_healthchecks

    # Check to see if Marathon should be running here by checking for config
    marathon_config = marathon_tools.load_marathon_config()

    # Check to see if Chronos should be running here by checking for config
    chronos_config = load_chronos_config()

    if marathon_config:
        marathon_client = metastatus_lib.get_marathon_client(marathon_config)
        try:
            marathon_results = metastatus_lib.get_marathon_status(
                marathon_client)
        except (MarathonError, InternalServerError, ValueError):
            # catch ValueError until marathon-python/pull/167 is merged and this is handled upstream
            # The two sentences used to be glued together by implicit string
            # concatenation ("...at <url>!Is the cluster healthy?").
            paasta_print(
                PaastaColors.red(
                    "CRITICAL: Unable to contact Marathon cluster at {}! "
                    "Is the cluster healthy?".format(marathon_config["url"])))
            sys.exit(2)
    else:
        marathon_results = [
            metastatus_lib.HealthCheckResult(
                message='Marathon is not configured to run here',
                healthy=True)
        ]

    if chronos_config:
        chronos_client = get_chronos_client(chronos_config)
        try:
            chronos_results = metastatus_lib.get_chronos_status(chronos_client)
        except (chronos.ChronosAPIError) as e:
            paasta_print(
                PaastaColors.red(
                    "CRITICAL: Unable to contact Chronos! Error: %s" % e))
            sys.exit(2)
    else:
        chronos_results = [
            metastatus_lib.HealthCheckResult(
                message='Chronos is not configured to run here',
                healthy=True)
        ]

    mesos_ok = all(metastatus_lib.status_for_results(all_mesos_results))
    marathon_ok = all(metastatus_lib.status_for_results(marathon_results))
    chronos_ok = all(metastatus_lib.status_for_results(chronos_results))

    mesos_summary = metastatus_lib.generate_summary_for_check(
        "Mesos", mesos_ok)
    marathon_summary = metastatus_lib.generate_summary_for_check(
        "Marathon", marathon_ok)
    chronos_summary = metastatus_lib.generate_summary_for_check(
        "Chronos", chronos_ok)

    healthy_exit = all([mesos_ok, marathon_ok, chronos_ok])

    paasta_print("Master paasta_tools version: {}".format(__version__))
    metastatus_lib.print_results_for_healthchecks(
        mesos_summary, mesos_ok, all_mesos_results, args.verbose)
    if args.verbose > 1:
        for grouping in args.groupings:
            print_with_indent('Resources Grouped by %s' % grouping, 2)
            grouping_function = metastatus_lib.key_func_for_attribute(grouping)
            resource_info_by_value = metastatus_lib.get_resource_utilization_by_grouping(
                grouping_function, mesos_state)
            all_rows = [[
                grouping.capitalize(), 'CPU (used/total)',
                'RAM (used/total)', 'Disk (used/total)'
            ]]
            table_rows = []
            for attribute_value, resource_info in resource_info_by_value.items():
                resource_utilizations = metastatus_lib.resource_utillizations_from_resource_info(
                    total=resource_info['total'],
                    free=resource_info['free'],
                )
                healthcheck_utilization_pairs = [
                    metastatus_lib.healthcheck_result_resource_utilization_pair_for_resource_utilization(
                        utilization, args.threshold)
                    for utilization in resource_utilizations
                ]
                # Only ever lower healthy_exit: one unhealthy attribute value
                # must not be forgotten when a later one is healthy, and an
                # already-failed service check must stay failed.
                if not all(pair[0].healthy for pair in healthcheck_utilization_pairs):
                    healthy_exit = False
                table_rows.append(
                    metastatus_lib.get_table_rows_for_resource_info_dict(
                        attribute_value, healthcheck_utilization_pairs,
                        args.humanize))
            all_rows.extend(sorted(table_rows, key=lambda x: x[0]))
            for line in format_table(all_rows):
                print_with_indent(line, 4)

        if args.autoscaling_info:
            print_with_indent("Autoscaling resources:", 2)
            headers = [
                field.replace("_", " ").capitalize()
                for field in AutoscalingInfo._fields
            ]
            table = [headers] + list(get_autoscaling_info_for_all_resources())

            for line in format_table(table):
                print_with_indent(line, 4)

        # ">= 3" so that asking for even more verbosity does not hide the table.
        if args.verbose >= 3:
            print_with_indent('Per Slave Utilization', 2)
            slave_resource_dict = metastatus_lib.get_resource_utilization_by_grouping(
                lambda slave: slave['hostname'], mesos_state)
            all_rows = [[
                'Hostname', 'CPU (used/total)', 'RAM (used//total)',
                'Disk (used//total)'
            ]]

            # print info about slaves here. Note that we don't make modifications to
            # the healthy_exit variable here, because we don't care about a single slave
            # having high usage.
            slave_rows = []
            for hostname, resource_info in slave_resource_dict.items():
                resource_utilizations = metastatus_lib.resource_utillizations_from_resource_info(
                    total=resource_info['total'],
                    free=resource_info['free'],
                )
                healthcheck_utilization_pairs = [
                    metastatus_lib.healthcheck_result_resource_utilization_pair_for_resource_utilization(
                        utilization, args.threshold)
                    for utilization in resource_utilizations
                ]
                slave_rows.append(
                    metastatus_lib.get_table_rows_for_resource_info_dict(
                        hostname, healthcheck_utilization_pairs,
                        args.humanize))
            # Collect every slave first so the sort orders the whole table.
            all_rows.extend(sorted(slave_rows, key=lambda x: x[0]))
            for line in format_table(all_rows):
                print_with_indent(line, 4)

    metastatus_lib.print_results_for_healthchecks(
        marathon_summary, marathon_ok, marathon_results, args.verbose)
    metastatus_lib.print_results_for_healthchecks(
        chronos_summary, chronos_ok, chronos_results, args.verbose)

    sys.exit(0 if healthy_exit else 2)
def main(argv: Optional[List[str]] = None) -> None:
    """Run the Mesos, Marathon and Chronos health checks and print a report.

    Chronos is only checked when ``load_chronos_config()`` returns a truthy
    value; otherwise a healthy placeholder result is reported. Exits the
    process with status 0 when every health check and the grouped utilization
    table are healthy, and with status 2 otherwise (including when the Mesos
    master or Chronos cannot be contacted).

    :param argv: command-line arguments handed to ``parse_args``
    """
    args = parse_args(argv)

    system_paasta_config = load_system_paasta_config()

    master_kwargs = {}
    # we don't want to be passing False to not override a possible True
    # value from system config
    if args.use_mesos_cache:
        master_kwargs['use_mesos_cache'] = True
    master = get_mesos_master(**master_kwargs)

    marathon_servers = get_marathon_servers(system_paasta_config)
    marathon_clients = all_marathon_clients(
        get_marathon_clients(marathon_servers))

    try:
        mesos_state = a_sync.block(master.state)
        all_mesos_results = _run_mesos_checks(
            mesos_master=master,
            mesos_state=mesos_state,
        )
    except MasterNotAvailableException as e:
        # if we can't connect to master at all,
        # then bomb out early
        paasta_print(PaastaColors.red("CRITICAL: %s" % '\n'.join(e.args)))
        sys.exit(2)

    # Check to see if Chronos should be running here by checking for config
    chronos_config = load_chronos_config()

    if chronos_config:
        chronos_client = get_chronos_client(chronos_config, cached=True)
        try:
            chronos_results = metastatus_lib.get_chronos_status(chronos_client)
        except (chronos.ChronosAPIError) as e:
            paasta_print(
                PaastaColors.red(
                    "CRITICAL: Unable to contact Chronos! Error: %s" % e))
            sys.exit(2)
    else:
        chronos_results = [
            metastatus_lib.HealthCheckResult(
                message='Chronos is not configured to run here',
                healthy=True,
            )
        ]

    marathon_results = _run_marathon_checks(marathon_clients)

    mesos_ok = all(metastatus_lib.status_for_results(all_mesos_results))
    marathon_ok = all(metastatus_lib.status_for_results(marathon_results))
    chronos_ok = all(metastatus_lib.status_for_results(chronos_results))

    mesos_summary = metastatus_lib.generate_summary_for_check(
        "Mesos", mesos_ok)
    marathon_summary = metastatus_lib.generate_summary_for_check(
        "Marathon", marathon_ok)
    chronos_summary = metastatus_lib.generate_summary_for_check(
        "Chronos", chronos_ok)

    healthy_exit = all([mesos_ok, marathon_ok, chronos_ok])

    paasta_print(f"Master paasta_tools version: {__version__}")
    metastatus_lib.print_results_for_healthchecks(
        mesos_summary, mesos_ok, all_mesos_results, args.verbose)
    if args.verbose > 1:
        print_with_indent(
            'Resources Grouped by %s' % ", ".join(args.groupings), 2)
        all_rows, utilization_ok = utilization_table_by_grouping_from_mesos_state(
            groupings=args.groupings,
            threshold=args.threshold,
            mesos_state=mesos_state,
        )
        # Only ever lower healthy_exit: the utilization verdict must not hide
        # a health check that already failed above.
        if not utilization_ok:
            healthy_exit = False
        for line in format_table(all_rows):
            print_with_indent(line, 4)

        if args.autoscaling_info:
            print_with_indent("Autoscaling resources:", 2)
            headers = [
                field.replace("_", " ").capitalize()
                for field in AutoscalingInfo._fields
            ]
            table = [headers] + [
                [str(x) for x in asi]
                for asi in get_autoscaling_info_for_all_resources(mesos_state)
            ]

            for line in format_table(table):
                print_with_indent(line, 4)

        if args.verbose >= 3:
            print_with_indent('Per Slave Utilization', 2)
            # print info about slaves here. Note that we don't make modifications to
            # the healthy_exit variable here, because we don't care about a single slave
            # having high usage.
            all_rows, _ = utilization_table_by_grouping_from_mesos_state(
                groupings=args.groupings + ["hostname"],
                threshold=args.threshold,
                mesos_state=mesos_state,
            )
            # The last column from utilization_table_by_grouping_from_mesos_state is "Agent count", which will always be
            # 1 for per-slave resources, so delete it.
            for row in all_rows:
                row.pop()

            for line in format_table(all_rows):
                print_with_indent(line, 4)

    metastatus_lib.print_results_for_healthchecks(
        marathon_summary, marathon_ok, marathon_results, args.verbose)
    metastatus_lib.print_results_for_healthchecks(
        chronos_summary, chronos_ok, chronos_results, args.verbose)

    sys.exit(0 if healthy_exit else 2)