예제 #1
0
def main():
    marathon_config = None
    chronos_config = None
    args = parse_args()

    master = get_mesos_master()
    try:
        mesos_state = master.state
    except MasterNotAvailableException as e:
        # if we can't connect to master at all,
        # then bomb out early
        print(PaastaColors.red("CRITICAL:  %s" % e.message))
        sys.exit(2)

    mesos_state_status = metastatus_lib.get_mesos_state_status(
        mesos_state=mesos_state,
    )

    metrics = master.metrics_snapshot()
    mesos_metrics_status = metastatus_lib.get_mesos_resource_utilization_health(mesos_metrics=metrics,
                                                                                mesos_state=mesos_state)
    framework_metrics_healthchecks = metastatus_lib.get_framework_metrics_status(metrics=metrics)

    all_mesos_results = mesos_state_status + mesos_metrics_status + framework_metrics_healthchecks

    # Check to see if Marathon should be running here by checking for config
    marathon_config = marathon_tools.load_marathon_config()

    # Check to see if Chronos should be running here by checking for config
    chronos_config = load_chronos_config()

    if marathon_config:
        marathon_client = metastatus_lib.get_marathon_client(marathon_config)
        try:
            marathon_results = metastatus_lib.get_marathon_status(marathon_client)
        except MarathonError as e:
            print(PaastaColors.red("CRITICAL: Unable to contact Marathon! Error: %s" % e))
            sys.exit(2)
    else:
        marathon_results = [metastatus_lib.HealthCheckResult(message='Marathon is not configured to run here',
                                                             healthy=True)]

    if chronos_config:
        chronos_client = get_chronos_client(chronos_config)
        try:
            chronos_results = metastatus_lib.get_chronos_status(chronos_client)
        except (chronos.ChronosAPIError) as e:
            print(PaastaColors.red("CRITICAL: Unable to contact Chronos! Error: %s" % e))
            sys.exit(2)
    else:
        chronos_results = [metastatus_lib.HealthCheckResult(message='Chronos is not configured to run here',
                                                            healthy=True)]

    mesos_ok = all(metastatus_lib.status_for_results(all_mesos_results))
    marathon_ok = all(metastatus_lib.status_for_results(marathon_results))
    chronos_ok = all(metastatus_lib.status_for_results(chronos_results))

    mesos_summary = metastatus_lib.generate_summary_for_check("Mesos", mesos_ok)
    marathon_summary = metastatus_lib.generate_summary_for_check("Marathon", marathon_ok)
    chronos_summary = metastatus_lib.generate_summary_for_check("Chronos", chronos_ok)

    healthy_exit = True if all([mesos_ok, marathon_ok, chronos_ok]) else False

    print "Master paasta_tools version: {0}".format(__version__)
    metastatus_lib.print_results_for_healthchecks(mesos_summary, mesos_ok, all_mesos_results, args.verbose)
    if args.verbose > 1:
        for grouping in args.groupings:
            print_with_indent('Resources Grouped by %s' % grouping, 2)
            grouping_function = metastatus_lib.key_func_for_attribute(grouping)
            resource_info_dict = metastatus_lib.get_resource_utilization_by_grouping(grouping_function,
                                                                                     mesos_state)
            all_rows = [[grouping.capitalize(), 'CPU (free/total)', 'RAM (free/total)', 'Disk (free/total)']]
            table_rows = []
            for attribute_value, resource_info_dict in resource_info_dict.items():
                resource_utilizations = metastatus_lib.resource_utillizations_from_resource_info(
                    total=resource_info_dict['total'],
                    free=resource_info_dict['free'],
                )
                healthcheck_utilization_pairs = [
                    metastatus_lib.healthcheck_result_resource_utilization_pair_for_resource_utilization(utilization,
                                                                                                         args.threshold)
                    for utilization in resource_utilizations
                ]
                healthy_exit = all(pair[0].healthy for pair in healthcheck_utilization_pairs)
                table_rows.append(metastatus_lib.get_table_rows_for_resource_info_dict(
                    attribute_value,
                    healthcheck_utilization_pairs,
                    args.humanize
                ))
            table_rows = sorted(table_rows, key=lambda x: x[0])
            all_rows.extend(table_rows)
            for line in format_table(all_rows):
                print_with_indent(line, 4)

        if args.verbose == 3:
            print_with_indent('Per Slave Utilization', 2)
            slave_resource_dict = metastatus_lib.get_resource_utilization_by_grouping(lambda slave: slave['hostname'],
                                                                                      mesos_state)
            all_rows = [['Hostname', 'CPU (free/total)', 'RAM (free/total)', 'Disk (free/total)']]

            # print info about slaves here. Note that we don't make modifications to
            # the healthy_exit variable here, because we don't care about a single slave
            # having high usage.
            for attribute_value, resource_info_dict in slave_resource_dict.items():
                table_rows = []
                resource_utilizations = metastatus_lib.resource_utillizations_from_resource_info(
                    total=resource_info_dict['total'],
                    free=resource_info_dict['free'],
                )
                healthcheck_utilization_pairs = [
                    metastatus_lib.healthcheck_result_resource_utilization_pair_for_resource_utilization(utilization,
                                                                                                         args.threshold)
                    for utilization in resource_utilizations
                ]
                table_rows.append(metastatus_lib.get_table_rows_for_resource_info_dict(
                    attribute_value,
                    healthcheck_utilization_pairs,
                    args.humanize
                ))
                table_rows = sorted(table_rows, key=lambda x: x[0])
                all_rows.extend(table_rows)
            for line in format_table(all_rows):
                print_with_indent(line, 4)
    metastatus_lib.print_results_for_healthchecks(marathon_summary, marathon_ok, marathon_results, args.verbose)
    metastatus_lib.print_results_for_healthchecks(chronos_summary, chronos_ok, chronos_results, args.verbose)

    if not healthy_exit:
        sys.exit(2)
    else:
        sys.exit(0)
예제 #2
0
def test_key_func_for_attribute():
    assert inspect.isfunction(metastatus_lib.key_func_for_attribute('habitat'))
예제 #3
0
def main(argv=None):
    chronos_config = None
    args = parse_args(argv)

    system_paasta_config = load_system_paasta_config()

    master_kwargs = {}
    # we don't want to be passing False to not override a possible True
    # value from system config
    if args.use_mesos_cache:
        master_kwargs['use_mesos_cache'] = True
    master = get_mesos_master(**master_kwargs)

    marathon_servers = get_marathon_servers(system_paasta_config)
    marathon_clients = all_marathon_clients(get_marathon_clients(marathon_servers))

    try:
        mesos_state = master.state
        all_mesos_results = _run_mesos_checks(
            mesos_master=master,
            mesos_state=mesos_state,
            marathon_clients=marathon_clients,
        )
    except MasterNotAvailableException as e:
        # if we can't connect to master at all,
        # then bomb out early
        paasta_print(PaastaColors.red("CRITICAL:  %s" % e.message))
        sys.exit(2)

    # Check to see if Chronos should be running here by checking for config
    chronos_config = load_chronos_config()

    if chronos_config:
        chronos_client = get_chronos_client(chronos_config, cached=True)
        try:
            chronos_results = metastatus_lib.get_chronos_status(chronos_client)
        except (chronos.ChronosAPIError) as e:
            paasta_print(PaastaColors.red("CRITICAL: Unable to contact Chronos! Error: %s" % e))
            sys.exit(2)
    else:
        chronos_results = [metastatus_lib.HealthCheckResult(
            message='Chronos is not configured to run here',
            healthy=True,
        )]

    marathon_results = _run_marathon_checks(marathon_clients)

    mesos_ok = all(metastatus_lib.status_for_results(all_mesos_results))
    marathon_ok = all(metastatus_lib.status_for_results(marathon_results))
    chronos_ok = all(metastatus_lib.status_for_results(chronos_results))

    mesos_summary = metastatus_lib.generate_summary_for_check("Mesos", mesos_ok)
    marathon_summary = metastatus_lib.generate_summary_for_check("Marathon", marathon_ok)
    chronos_summary = metastatus_lib.generate_summary_for_check("Chronos", chronos_ok)

    healthy_exit = True if all([mesos_ok, marathon_ok, chronos_ok]) else False

    paasta_print("Master paasta_tools version: {}".format(__version__))
    metastatus_lib.print_results_for_healthchecks(mesos_summary, mesos_ok, all_mesos_results, args.verbose)
    if args.verbose > 1:
        for grouping in args.groupings:
            print_with_indent('Resources Grouped by %s' % grouping, 2)
            grouping_function = metastatus_lib.key_func_for_attribute(grouping)
            resource_info_dict = metastatus_lib.get_resource_utilization_by_grouping(
                grouping_function,
                mesos_state,
            )
            all_rows = [[
                grouping.capitalize(), 'CPU (used/total)', 'RAM (used/total)', 'Disk (used/total)',
                'GPU (used/total)', 'Agent count',
            ]]
            table_rows = []
            for attribute_value, resource_info_dict in resource_info_dict.items():
                resource_utilizations = metastatus_lib.resource_utillizations_from_resource_info(
                    total=resource_info_dict['total'],
                    free=resource_info_dict['free'],
                )
                healthcheck_utilization_pairs = [
                    metastatus_lib.healthcheck_result_resource_utilization_pair_for_resource_utilization(
                        utilization,
                        args.threshold,
                    )
                    for utilization in resource_utilizations
                ]
                healthy_exit = all(pair[0].healthy for pair in healthcheck_utilization_pairs)
                table_rows.append(metastatus_lib.get_table_rows_for_resource_info_dict(
                    attribute_value,
                    healthcheck_utilization_pairs,
                    args.humanize,
                ) + [str(resource_info_dict['slave_count'])])
            table_rows = sorted(table_rows, key=lambda x: x[0])
            all_rows.extend(table_rows)
            for line in format_table(all_rows):
                print_with_indent(line, 4)

        if args.autoscaling_info:
            print_with_indent("Autoscaling resources:", 2)
            headers = [field.replace("_", " ").capitalize() for field in AutoscalingInfo._fields]
            table = functools.reduce(
                lambda x, y: x + [(y)],
                get_autoscaling_info_for_all_resources(mesos_state),
                [headers],
            )

            for line in format_table(table):
                print_with_indent(line, 4)

        if args.verbose >= 3:
            print_with_indent('Per Slave Utilization', 2)
            slave_resource_dict = metastatus_lib.get_resource_utilization_by_grouping(
                lambda slave: slave['hostname'],
                mesos_state,
            )
            all_rows = [['Hostname', 'CPU (used/total)', 'RAM (used//total)', 'Disk (used//total)', 'GPU (used/total)']]

            # print info about slaves here. Note that we don't make modifications to
            # the healthy_exit variable here, because we don't care about a single slave
            # having high usage.
            for attribute_value, resource_info_dict in slave_resource_dict.items():
                table_rows = []
                resource_utilizations = metastatus_lib.resource_utillizations_from_resource_info(
                    total=resource_info_dict['total'],
                    free=resource_info_dict['free'],
                )
                healthcheck_utilization_pairs = [
                    metastatus_lib.healthcheck_result_resource_utilization_pair_for_resource_utilization(
                        utilization,
                        args.threshold,
                    )
                    for utilization in resource_utilizations
                ]
                table_rows.append(metastatus_lib.get_table_rows_for_resource_info_dict(
                    attribute_value,
                    healthcheck_utilization_pairs,
                    args.humanize,
                ))
                table_rows = sorted(table_rows, key=lambda x: x[0])
                all_rows.extend(table_rows)
            for line in format_table(all_rows):
                print_with_indent(line, 4)
    metastatus_lib.print_results_for_healthchecks(marathon_summary, marathon_ok, marathon_results, args.verbose)
    metastatus_lib.print_results_for_healthchecks(chronos_summary, chronos_ok, chronos_results, args.verbose)

    if not healthy_exit:
        sys.exit(2)
    else:
        sys.exit(0)
예제 #4
0
def test_key_func_for_attribute():
    assert inspect.isfunction(metastatus_lib.key_func_for_attribute("habitat"))
예제 #5
0
def main(argv=None):
    marathon_config = None
    chronos_config = None
    args = parse_args(argv)

    master = get_mesos_master()
    try:
        mesos_state = master.state
    except MasterNotAvailableException as e:
        # if we can't connect to master at all,
        # then bomb out early
        paasta_print(PaastaColors.red("CRITICAL:  %s" % e.message))
        sys.exit(2)

    mesos_state_status = metastatus_lib.get_mesos_state_status(
        mesos_state=mesos_state, )

    metrics = master.metrics_snapshot()
    mesos_metrics_status = metastatus_lib.get_mesos_resource_utilization_health(
        mesos_metrics=metrics, mesos_state=mesos_state)
    framework_metrics_healthchecks = metastatus_lib.get_framework_metrics_status(
        metrics=metrics)

    all_mesos_results = mesos_state_status + mesos_metrics_status + framework_metrics_healthchecks

    # Check to see if Marathon should be running here by checking for config
    marathon_config = marathon_tools.load_marathon_config()

    # Check to see if Chronos should be running here by checking for config
    chronos_config = load_chronos_config()

    if marathon_config:
        marathon_client = metastatus_lib.get_marathon_client(marathon_config)
        try:
            marathon_results = metastatus_lib.get_marathon_status(
                marathon_client)
        except (MarathonError, InternalServerError, ValueError) as e:
            # catch ValueError until marathon-python/pull/167 is merged and this is handled upstream
            paasta_print(
                PaastaColors.red(
                    ("CRITICAL: Unable to contact Marathon cluster at {}!"
                     "Is the cluster healthy?".format(
                         marathon_config["url"]))))
            sys.exit(2)
    else:
        marathon_results = [
            metastatus_lib.HealthCheckResult(
                message='Marathon is not configured to run here', healthy=True)
        ]

    if chronos_config:
        chronos_client = get_chronos_client(chronos_config)
        try:
            chronos_results = metastatus_lib.get_chronos_status(chronos_client)
        except (chronos.ChronosAPIError) as e:
            paasta_print(
                PaastaColors.red(
                    "CRITICAL: Unable to contact Chronos! Error: %s" % e))
            sys.exit(2)
    else:
        chronos_results = [
            metastatus_lib.HealthCheckResult(
                message='Chronos is not configured to run here', healthy=True)
        ]

    mesos_ok = all(metastatus_lib.status_for_results(all_mesos_results))
    marathon_ok = all(metastatus_lib.status_for_results(marathon_results))
    chronos_ok = all(metastatus_lib.status_for_results(chronos_results))

    mesos_summary = metastatus_lib.generate_summary_for_check(
        "Mesos", mesos_ok)
    marathon_summary = metastatus_lib.generate_summary_for_check(
        "Marathon", marathon_ok)
    chronos_summary = metastatus_lib.generate_summary_for_check(
        "Chronos", chronos_ok)

    healthy_exit = True if all([mesos_ok, marathon_ok, chronos_ok]) else False

    paasta_print("Master paasta_tools version: {}".format(__version__))
    metastatus_lib.print_results_for_healthchecks(mesos_summary, mesos_ok,
                                                  all_mesos_results,
                                                  args.verbose)
    if args.verbose > 1:
        for grouping in args.groupings:
            print_with_indent('Resources Grouped by %s' % grouping, 2)
            grouping_function = metastatus_lib.key_func_for_attribute(grouping)
            resource_info_dict = metastatus_lib.get_resource_utilization_by_grouping(
                grouping_function, mesos_state)
            all_rows = [[
                grouping.capitalize(), 'CPU (used/total)', 'RAM (used/total)',
                'Disk (used/total)'
            ]]
            table_rows = []
            for attribute_value, resource_info_dict in resource_info_dict.items(
            ):
                resource_utilizations = metastatus_lib.resource_utillizations_from_resource_info(
                    total=resource_info_dict['total'],
                    free=resource_info_dict['free'],
                )
                healthcheck_utilization_pairs = [
                    metastatus_lib.
                    healthcheck_result_resource_utilization_pair_for_resource_utilization(
                        utilization, args.threshold)
                    for utilization in resource_utilizations
                ]
                healthy_exit = all(pair[0].healthy
                                   for pair in healthcheck_utilization_pairs)
                table_rows.append(
                    metastatus_lib.get_table_rows_for_resource_info_dict(
                        attribute_value, healthcheck_utilization_pairs,
                        args.humanize))
            table_rows = sorted(table_rows, key=lambda x: x[0])
            all_rows.extend(table_rows)
            for line in format_table(all_rows):
                print_with_indent(line, 4)

        if args.autoscaling_info:
            print_with_indent("Autoscaling resources:", 2)
            headers = [
                field.replace("_", " ").capitalize()
                for field in AutoscalingInfo._fields
            ]
            table = reduce(lambda x, y: x + [(y)],
                           get_autoscaling_info_for_all_resources(), [headers])

            for line in format_table(table):
                print_with_indent(line, 4)

        if args.verbose == 3:
            print_with_indent('Per Slave Utilization', 2)
            slave_resource_dict = metastatus_lib.get_resource_utilization_by_grouping(
                lambda slave: slave['hostname'], mesos_state)
            all_rows = [[
                'Hostname', 'CPU (used/total)', 'RAM (used//total)',
                'Disk (used//total)'
            ]]

            # print info about slaves here. Note that we don't make modifications to
            # the healthy_exit variable here, because we don't care about a single slave
            # having high usage.
            for attribute_value, resource_info_dict in slave_resource_dict.items(
            ):
                table_rows = []
                resource_utilizations = metastatus_lib.resource_utillizations_from_resource_info(
                    total=resource_info_dict['total'],
                    free=resource_info_dict['free'],
                )
                healthcheck_utilization_pairs = [
                    metastatus_lib.
                    healthcheck_result_resource_utilization_pair_for_resource_utilization(
                        utilization, args.threshold)
                    for utilization in resource_utilizations
                ]
                table_rows.append(
                    metastatus_lib.get_table_rows_for_resource_info_dict(
                        attribute_value, healthcheck_utilization_pairs,
                        args.humanize))
                table_rows = sorted(table_rows, key=lambda x: x[0])
                all_rows.extend(table_rows)
            for line in format_table(all_rows):
                print_with_indent(line, 4)
    metastatus_lib.print_results_for_healthchecks(marathon_summary,
                                                  marathon_ok,
                                                  marathon_results,
                                                  args.verbose)
    metastatus_lib.print_results_for_healthchecks(chronos_summary, chronos_ok,
                                                  chronos_results,
                                                  args.verbose)

    if not healthy_exit:
        sys.exit(2)
    else:
        sys.exit(0)