Example #1
def test_monitoring_shows_osd_down(measure_stop_ceph_osd):
    """
    Make sure simple problems with OSD daemons are reported via OCP Prometheus.
    """
    prometheus = PrometheusAPI()
    # time (in seconds) for monitoring to notice the change
    expected_delay = 60

    affected_osd = measure_stop_ceph_osd['result']
    # translate this into ceph daemon name
    ceph_daemon = "osd.{}".format(int(affected_osd[len('rook-ceph-osd-'):]))
    logger.info(
        f"affected osd was {affected_osd}, aka {ceph_daemon} ceph daemon")

    logger.info("let's check that ceph health was affected")
    health_result = prometheus.query_range(
        query='ceph_health_status',
        start=measure_stop_ceph_osd['start'],
        end=measure_stop_ceph_osd['stop'],
        step=15)
    health_validation = check_query_range_result(result=health_result,
                                                 good_values=[1],
                                                 bad_values=[0],
                                                 exp_metric_num=1,
                                                 exp_delay=expected_delay)
    health_msg = "health status should be affected by missing osd"

    logger.info("let's check that osd up value was affected")
    osd_up_result = prometheus.query_range(
        query='ceph_osd_up{ceph_daemon="%s"}' % ceph_daemon,
        start=measure_stop_ceph_osd['start'],
        end=measure_stop_ceph_osd['stop'],
        step=15)
    osd_up_validation = check_query_range_result(result=osd_up_result,
                                                 good_values=[0],
                                                 bad_values=[1],
                                                 exp_metric_num=1,
                                                 exp_delay=expected_delay)
    osd_up_msg = "ceph_osd_up value should be affected by missing osd"

    logger.info("let's check that osd in value was not affected")
    # osd in value is not affected because we just stopped the osd, we
    # haven't removed it from the cluster
    osd_in_result = prometheus.query_range(
        query='ceph_osd_in{ceph_daemon="%s"}' % ceph_daemon,
        start=measure_stop_ceph_osd['start'],
        end=measure_stop_ceph_osd['stop'],
        step=15)
    osd_in_validation = check_query_range_result(result=osd_in_result,
                                                 good_values=[1],
                                                 bad_values=[0],
                                                 exp_metric_num=1)
    osd_in_msg = "ceph_osd_in value should not be affected by missing osd"

    # checking validation results when all queries are performed makes sure
    # that there is evidence for all queries in the test case logs in case of
    # an assert failure
    assert health_validation, health_msg
    assert osd_up_validation, osd_up_msg
    assert osd_in_validation, osd_in_msg
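
The validation helper used above is not shown on this page. Below is a minimal, hypothetical sketch of what a check_query_range_result-style helper could look like, inferred only from how it is called in this example (the real ocs-ci implementation may differ): each metric entry is assumed to carry (timestamp, value) samples, and bad values are tolerated only during the first exp_delay seconds of each time series.

# Hypothetical sketch of a check_query_range_result-style validator, written
# only from the calls in the example above; not the actual ocs-ci helper.
def check_query_range_result(
        result, good_values, bad_values, exp_metric_num=None, exp_delay=0):
    """
    Return True when every sample is in good_values, tolerating bad_values
    only during the first exp_delay seconds of each time series.
    """
    if exp_metric_num is not None and len(result) != exp_metric_num:
        return False
    ok = True
    for metric in result:
        samples = metric.get("values", [])
        if not samples:
            ok = False
            continue
        first_ts = samples[0][0]
        for ts, value in samples:
            value = int(float(value))
            if value in good_values:
                continue
            if value in bad_values and ts - first_ts <= exp_delay:
                # monitoring is allowed some time to notice the change
                continue
            ok = False
    return ok
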
Example #2
def test_monitoring_shows_mon_down(measure_stop_ceph_mon):
    """
    Make sure simple problems with MON daemons are reported via OCP Prometheus.
    """
    prometheus = PrometheusAPI()
    # time (in seconds) for monitoring to notice the change
    expected_delay = 60

    affected_mons = measure_stop_ceph_mon['result']
    # we asked to stop just a single mon ... make this assumption explicit
    assert len(affected_mons) == 1
    affected_mon = affected_mons[0]
    # translate this into ceph daemon name
    ceph_daemon = "mon.{}".format(affected_mon[len('rook-ceph-mon-'):])
    logger.info(
        f"affected mon was {affected_mon}, aka {ceph_daemon} ceph daemon")

    logger.info("let's check that ceph health was affected")
    health_result = prometheus.query_range(
        query='ceph_health_status',
        start=measure_stop_ceph_mon['start'],
        end=measure_stop_ceph_mon['stop'],
        step=15)
    health_validation = prometheus.check_query_range_result(
        result=health_result,
        good_values=[1],
        bad_values=[0],
        exp_metric_num=1,
        exp_delay=expected_delay)
    health_msg = "health status should be affected by missing mon"
    assert health_validation, health_msg

    logger.info("let's check that mon quorum status value was affected")
    mon_result = prometheus.query_range(
        query='ceph_mon_quorum_status{ceph_daemon="%s"}' % ceph_daemon,
        start=measure_stop_ceph_mon['start'],
        end=measure_stop_ceph_mon['stop'],
        step=15)
    mon_validation = prometheus.check_query_range_result(
        result=mon_result,
        good_values=[0],
        bad_values=[1],
        exp_metric_num=1,
        exp_delay=expected_delay)
    mon_msg = "ceph_osd_up value should be affected by missing osd"
    assert mon_validation, mon_msg
Example #3
def test_mcg_cpu_usage(workload_idle):
    """
    Without any IO workload, CPU utilization of MCG pods should be minimal.
    No pod should utilize more than 0.25 cpu units.
    """
    prometheus = PrometheusAPI()
    cpu_result = prometheus.query_range(
        query=CPU_USAGE_POD + '{namespace="openshift-storage",pod=~"^noobaa.*"}',
        start=workload_idle["start"],
        end=workload_idle["stop"],
        step=15,
    )
    validation = check_query_range_result_limits(
        result=cpu_result,
        good_min=0.0,
        good_max=0.25,
    )
    msg = "No NooBaa pod should utilize over 0.1 cpu units while idle."
    assert validation, msg
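
Similarly, a minimal sketch of a check_query_range_result_limits-style helper, inferred only from the single call above; the real helper may accept more options, such as an expected metric count.

# Hypothetical sketch of a check_query_range_result_limits-style validator,
# inferred only from the call in the example above; not the actual helper.
def check_query_range_result_limits(result, good_min, good_max):
    """Return True when every sampled value lies within [good_min, good_max]."""
    ok = True
    for metric in result:
        for ts, value in metric["values"]:
            if not good_min <= float(value) <= good_max:
                ok = False
    return ok
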
Example #4
def test_workload_rbd(workload_storageutilization_50p_rbd):
    """
    Purpose of this test is to make the workload fixture executed, and
    show how to query prometheus.

    Note that this test is valid only on 3 osd cluster with all pools using
    3 way replication.
    """
    prometheus = PrometheusAPI()
    # Asking for values of `ceph_osd_stat_bytes_used` for every 15 s of the
    # time window when the workload fixture was utilizing 50% of the OCS
    # storage.
    result_used = prometheus.query_range(
        query="ceph_osd_stat_bytes_used",
        start=workload_storageutilization_50p_rbd["start"],
        end=workload_storageutilization_50p_rbd["stop"],
        step=15,
    )
    # This time, we are asking for total OCS capacity, in the same format
    # as in previous case (for each OSD).
    result_total = prometheus.query_range(
        query="ceph_osd_stat_bytes",
        start=workload_storageutilization_50p_rbd["start"],
        end=workload_storageutilization_50p_rbd["stop"],
        step=15,
    )
    # Check test assumption that ceph_osd_stat_bytes hasn't changed for each
    # OSD, and that each OSD has the same size.
    osd_stat_bytes = []
    for metric in result_total:
        values = []
        for ts, value in metric["values"]:
            values.append(value)
        assert all(value == values[0] for value in values)
        osd_stat_bytes.append(values[0])
    assert all(value == osd_stat_bytes[0] for value in osd_stat_bytes)
    # Compute the expected value of `ceph_osd_stat_bytes_used`, based on the
    # percentage utilized by the fixture.
    percentage = workload_storageutilization_50p_rbd["result"]["target_p"]
    expected_value = int(osd_stat_bytes[0]) * percentage
    # Now we can check the actual usage values from Prometheus.
    at_least_one_value_out_of_range = False
    for metric in result_used:
        name = metric["metric"]["__name__"]
        daemon = metric["metric"]["ceph_daemon"]
        logger.info(f"metric {name} from {daemon}")
        # We are skipping the first 10% of the values, as it could take some
        # additional time for all the data to be written everywhere, and
        # during this time the utilization value is still growing.
        start_index = int(len(metric["values"]) * 0.1)
        logger.info(f"ignoring first {start_index} values")
        for ts, value in metric["values"][:start_index]:
            value = int(value)
            dt = datetime.utcfromtimestamp(ts)
            logger.info(f"ignoring value {value} B at {dt}")
        for ts, value in metric["values"][start_index:]:
            value = int(value)
            dt = datetime.utcfromtimestamp(ts)
            # checking the value, with 10% error margin in each direction
            if expected_value * 0.90 <= value <= expected_value * 1.10:
                logger.info(
                    f"value {value} B at {dt} is withing expected range")
            else:
                logger.error(
                    (f"value {value} B at {dt} is outside of expected range"
                     f" {expected_value} B +- 10%"))
                at_least_one_value_out_of_range = True
    assert not at_least_one_value_out_of_range
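
The loops above rely on the shape of a Prometheus range query result. Assuming query_range() returns the data.result list of the Prometheus HTTP API response (resultType "matrix"), each entry looks roughly like the literal below, with Unix timestamps and sample values delivered as strings; the numbers shown are illustrative only. This is also why the test converts each value with int(value) before comparing it against the expected byte count.

# Assumed shape of one entry in the list returned by prometheus.query_range(),
# matching the Prometheus HTTP API "matrix" result type; the values are
# illustrative, not real measurements.
example_entry = {
    "metric": {
        "__name__": "ceph_osd_stat_bytes_used",
        "ceph_daemon": "osd.0",
    },
    "values": [
        [1600000000, "53687091200"],  # (unix timestamp, value as string)
        [1600000015, "53687091200"],
    ],
}
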
Example #5
def test_monitoring_reporting_ok_when_idle(workload_idle):
    """
    When nothing is happening, OCP Prometheus reports OCS status as OK.

    If this test case fails, the status is either reported wrong or the
    cluster is in a broken state. Either way, a failure here is not good.
    """
    prometheus = PrometheusAPI()

    health_result = prometheus.query_range(
        query='ceph_health_status',
        start=workload_idle['start'],
        end=workload_idle['stop'],
        step=15)
    health_validation = check_query_range_result(
        result=health_result,
        good_values=[0],
        bad_values=[1],
        exp_metric_num=1)
    health_msg = "ceph_health_status {} report 0 (health ok) as expected"
    if health_validation:
        health_msg = health_msg.format('does')
        logger.info(health_msg)
    else:
        health_msg = health_msg.format('should')
        logger.error(health_msg)

    mon_result = prometheus.query_range(
        query='ceph_mon_quorum_status',
        start=workload_idle['start'],
        end=workload_idle['stop'],
        step=15)
    mon_validation = check_query_range_result(
        result=mon_result,
        good_values=[1],
        bad_values=[0],
        exp_metric_num=workload_idle['result']['mon_num'])
    mon_msg = "ceph_mon_quorum_status {} indicate no problems with quorum"
    if mon_validation:
        mon_msg = mon_msg.format('does')
        logger.info(mon_msg)
    else:
        mon_msg = mon_msg.format('should')
        logger.error(mon_msg)

    osd_validations = []
    for metric in ("ceph_osd_up", "ceph_osd_in"):
        osd_result = prometheus.query_range(
            query=metric,
            start=workload_idle['start'],
            end=workload_idle['stop'],
            step=15)
        osd_validation = check_query_range_result(
            result=osd_result,
            good_values=[1],
            bad_values=[0],
            exp_metric_num=workload_idle['result']['osd_num'])
        osd_validations.append(osd_validation)
        osd_msg = "{} metric {} indicate no problems with OSDs"
        if osd_validation:
            osd_msg = osd_msg.format(metric, 'does')
            logger.info(osd_msg)
        else:
            osd_msg = osd_msg.format(metric, 'should')
            logger.error(osd_msg)

    # after logging everything properly, make the test fail if necessary
    # see ERRORs reported in the test log for details
    assert health_validation, health_msg
    assert mon_validation, mon_msg
    osds_msg = "ceph_osd_{up,in} metrics should indicate no OSD issues"
    assert all(osd_validations), osds_msg
Example #6
def test_monitoring_shows_mon_down(measure_stop_ceph_mon):
    """
    Make sure simple problems with MON daemons are reported via OCP Prometheus.
    """
    prometheus = PrometheusAPI()
    # time (in seconds) for monitoring to notice the change
    expected_delay = 60
    # query resolution step used in this test case (number of seconds)
    query_step = 15

    affected_mons = measure_stop_ceph_mon["result"]
    # we asked to stop just a single mon ... make this assumption explicit
    assert len(affected_mons) == 1
    affected_mon = affected_mons[0]
    # translate this into ceph daemon name
    ceph_daemon = "mon.{}".format(affected_mon[len("rook-ceph-mon-"):])
    logger.info(
        f"affected mon was {affected_mon}, aka {ceph_daemon} ceph daemon")

    logger.info("let's check that ceph health was affected")
    health_result = prometheus.query_range(
        query="ceph_health_status",
        start=measure_stop_ceph_mon["start"],
        end=measure_stop_ceph_mon["stop"],
        step=query_step,
    )
    health_validation = check_query_range_result_enum(
        result=health_result,
        good_values=[1],
        bad_values=[0],
        exp_metric_num=1,
        exp_good_time=measure_stop_ceph_mon["min_downtime"],
        exp_delay=expected_delay,
    )
    health_msg = "health status should be affected by missing mon"

    logger.info("let's check that mon quorum status value was affected")
    mon_result = prometheus.query_range(
        query='ceph_mon_quorum_status{ceph_daemon="%s"}' % ceph_daemon,
        start=measure_stop_ceph_mon["start"],
        end=measure_stop_ceph_mon["stop"],
        step=query_step,
        validate=False,
    )
    mon_validation = check_query_range_result_enum(
        result=mon_result,
        good_values=[0],
        bad_values=[1],
        exp_metric_num=1,
        exp_good_time=measure_stop_ceph_mon["min_downtime"],
        exp_delay=expected_delay,
    )
    mon_msg = "ceph_mon_quorum_status value should be affected by missing mon"

    # checking validation results when both queries are performed makes sure
    # that there is evidence for both mon and health queries in the test case
    # logs in case of an assert failure
    assert health_validation, health_msg
    assert mon_validation, mon_msg

    # since we don't do strict result validation in the previous query, we
    # are going to check the min. expected size of the reply explicitly, taking
    # into account the min. expected downtime of the affected ceph mon
    assert len(mon_result) == 1, "there should be one metric for one mon"
    min_mon_samples = measure_stop_ceph_mon["min_downtime"] / query_step
    mon_sample_size = len(mon_result[0]["values"])
    assert mon_sample_size >= min_mon_samples
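
Because the second query is run with validate=False, the test double-checks the sample count itself. A worked illustration of that final check, with a purely assumed min_downtime value (the real value comes from the measure_stop_ceph_mon fixture):

# Illustration of the sample-size sanity check above; min_downtime here is an
# assumed value, the real one is provided by the measure_stop_ceph_mon fixture.
min_downtime = 360                              # seconds (assumed)
query_step = 15                                 # seconds, as in the test
min_mon_samples = min_downtime / query_step     # 24.0
mon_sample_size = 25                            # e.g. 25 samples returned
assert mon_sample_size >= min_mon_samples       # passes: 25 >= 24.0
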