Example #1
def test_ceph_rgw_metrics_after_metrics_exporter_respin(rgw_deployments):
    """
    RGW metrics should be provided via OCP Prometheus even after
    ocs-metrics-exporter pod is respinned.

    """
    logger.info("Respin ocs-metrics-exporter pod")
    pod_obj = ocp.OCP(kind=constants.POD,
                      namespace=defaults.ROOK_CLUSTER_NAMESPACE)
    metrics_pods = pod_obj.get(
        selector="app.kubernetes.io/name=ocs-metrics-exporter")["items"]
    assert len(metrics_pods) == 1
    metrics_pod_data = metrics_pods[0]
    metrics_pod = OCS(**metrics_pod_data)
    metrics_pod.delete(force=True)

    logger.info("Wait for ocs-metrics-exporter pod to come up")
    assert pod_obj.wait_for_resource(
        condition="Running",
        selector="app.kubernetes.io/name=ocs-metrics-exporter",
        resource_count=1,
        timeout=600,
    )

    logger.info("Collect RGW metrics")
    prometheus = PrometheusAPI()
    list_of_metrics_without_results = metrics.get_missing_metrics(
        prometheus, metrics.ceph_rgw_metrics)
    msg = (
        "OCS Monitoring should provide some value(s) for tested rgw metrics, "
        "so that the list of metrics without results is empty.")
    assert list_of_metrics_without_results == [], msg
Example #2
def test_ceph_metrics_presence_when_osd_down(measure_stop_ceph_osd):
    """
    Since ODF 4.9 ceph metrics covering disruptions will be available only
    when there are some disruptions to report, as noted in BZ 2028649.

    This test case covers this behaviour for one stopped/disabled OSD.
    """
    prometheus = PrometheusAPI()
    metrics_expected = list(metrics.ceph_metrics_healthy)
    # metrics which should be present with one OSD down
    for mtr in ("ceph_pg_degraded", "ceph_pg_undersized"):
        assert mtr in metrics.ceph_metrics, "test code needs to be updated"
        # make sure the test code is consistent with metrics module
        metrics_expected.append(mtr)
    # metrics which should not be present with one OSD down
    for mtr in ["ceph_pg_clean"]:
        assert mtr in metrics.ceph_metrics, "test code needs to be updated"
        metrics_expected.remove(mtr)
    metrics_without_results = metrics.get_missing_metrics(
        prometheus,
        metrics_expected,
        current_platform=config.ENV_DATA["platform"].lower(),
        start=measure_stop_ceph_osd["start"],
        stop=measure_stop_ceph_osd["stop"],
    )
    msg = ("Prometheus should provide some value(s) for all tested metrics, "
           "so that the list of metrics without results is empty.")
    assert metrics_without_results == [], msg
Example #3
def collect_prometheus_metrics(
    metrics,
    dir_name,
    start,
    stop,
    step=1.0,
):
    """
    Collects metrics from Prometheus and saves them to files in JSON format.
    Metrics can be found in OCP Console in Monitoring -> Metrics.

    Args:
        metrics (list): list of metrics to get from Prometheus
            (E.g. ceph_cluster_total_used_bytes, cluster:cpu_usage_cores:sum,
            cluster:memory_usage_bytes:sum)
        dir_name (str): directory name to store metrics. Metrics will be
            stored in a directory named dir_name suffixed with _ocs_metrics.
        start (str): start timestamp of required datapoints
        stop (str): stop timestamp of required datapoints
        step (float): step of required datapoints
    """
    api = PrometheusAPI()
    log_dir_path = os.path.join(
        os.path.expanduser(ocsci_config.RUN['log_dir']),
        f"failed_testcase_ocs_logs_{ocsci_config.RUN['run_id']}",
        f"{dir_name}_ocs_metrics"
    )
    if not os.path.exists(log_dir_path):
        log.info(f'Creating directory {log_dir_path}')
        os.makedirs(log_dir_path)

    for metric in metrics:
        datapoints = api.get(
            'query_range',
            {
                'query': metric,
                'start': start,
                'end': stop,
                'step': step
            }
        )
        file_name = os.path.join(log_dir_path, f'{metric}.json')
        log.info(f'Saving {metric} data into {file_name}')
        with open(file_name, 'w') as outfile:
            json.dump(datapoints.json(), outfile)
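
# Illustrative call sketch (added for clarity; the metric names, directory name
# and unix-epoch timestamps below are placeholders only) showing how the helper
# above is meant to be invoked:
collect_prometheus_metrics(
    metrics=[
        "ceph_cluster_total_used_bytes",
        "cluster:memory_usage_bytes:sum",
    ],
    dir_name="test_workload_example",  # stored in <dir_name>_ocs_metrics
    start="1610000000",
    stop="1610000900",
    step=15.0,
)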
Example #4
def test_monitoring_enabled():
    """
    Check that OCS Monitoring is enabled after OCS installation (which is why
    this test has a post deployment marker) by asking for values of one ceph
    and one noobaa related metric.
    """
    prometheus = PrometheusAPI()

    if (
        storagecluster_independent_check()
        and float(config.ENV_DATA["ocs_version"]) < 4.6
    ):
        logger.info(
            f"Skipping ceph metrics because it is not enabled for external "
            f"mode for OCS {float(config.ENV_DATA['ocs_version'])}"
        )

    else:
        # ask for values of ceph_pool_stored metric
        logger.info("Checking that ceph data are provided in OCS monitoring")
        result = prometheus.query("ceph_pool_stored")
        msg = "check that we actually received some values for a ceph query"
        assert len(result) > 0, msg
        for metric in result:
            _, value = metric["value"]
            assert_msg = "number of bytes in a pool isn't a positive integer or zero"
            assert int(value) >= 0, assert_msg
        # additional check that values make at least some sense
        logger.info(
            "Checking that size of ceph_pool_stored result matches number of pools"
        )
        ct_pod = pod.get_ceph_tools_pod()
        ceph_pools = ct_pod.exec_ceph_cmd("ceph osd pool ls")
        assert len(result) == len(ceph_pools)

    # again for a noobaa metric
    logger.info("Checking that MCG/NooBaa data are provided in OCS monitoring")
    result = prometheus.query("NooBaa_bucket_status")
    msg = "check that we actually received some values for a MCG/NooBaa query"
    assert len(result) > 0, msg
    for metric in result:
        _, value = metric["value"]
        assert int(value) >= 0, "bucket status isn't a positive integer or zero"
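
# For reference, an illustrative (not captured from a real cluster) shape of a
# single item returned by prometheus.query() for an instant query; the labels
# shown are placeholders. The value is a (timestamp, value-as-string) pair,
# which is why the loops above unpack it and cast the second element to int.
example_sample = {
    "metric": {"__name__": "ceph_pool_stored", "pool_id": "1"},
    "value": [1610000000.123, "4096"],
}
_, example_value = example_sample["value"]
assert int(example_value) >= 0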
Example #5
def test_mcg_cpu_usage(workload_idle):
    """
    Without any IO workload, cpu utilization of MCG pods should be minimal.
    No pod should utilize more than 0.1 cpu units.
    """
    prometheus = PrometheusAPI()
    cpu_result = prometheus.query_range(
        query=CPU_USAGE_POD + '{namespace="openshift-storage",pod=~"^noobaa.*"}',
        start=workload_idle["start"],
        end=workload_idle["stop"],
        step=15,
    )
    validation = check_query_range_result_limits(
        result=cpu_result,
        good_min=0.0,
        good_max=0.25,
    )
    msg = "No NooBaa pod should utilize over 0.1 cpu units while idle."
    assert validation, msg
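
# A minimal sketch of what a limits validation like
# check_query_range_result_limits conceptually does (assumed here, not the
# actual ocs-ci implementation): every sample of every metric series returned
# by the range query has to fall within the [good_min, good_max] interval.
def check_query_range_result_limits_sketch(result, good_min, good_max):
    for metric in result:
        for _timestamp, value in metric["values"]:
            if not good_min <= float(value) <= good_max:
                return False
    return True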
Example #6
    def __init__(
        self,
        project_factory=None,
        pvc_factory=None,
        sa_factory=None,
        pod_factory=None,
        target_percentage=None,
    ):
        """
        Initializer for ClusterLoad

        Args:
            project_factory (function): A call to project_factory function
            pvc_factory (function): A call to pvc_factory function
            sa_factory (function): A call to service_account_factory function
            pod_factory (function): A call to pod_factory function
            target_percentage (float): The percentage of cluster load that is
                required. The value should be greater than 0.1 and smaller than 0.95

        """
        self.prometheus_api = PrometheusAPI()
        self.pvc_factory = pvc_factory
        self.sa_factory = sa_factory
        self.pod_factory = pod_factory
        self.target_percentage = target_percentage
        self.cluster_limit = None
        self.dc_objs = list()
        self.pvc_objs = list()
        self.previous_iops = None
        self.current_iops = None
        self.rate = None
        self.pvc_size = None
        if not config.DEPLOYMENT["external_mode"]:
            self.pvc_size = int(get_osd_pods_memory_sum() * 0.5)
        else:
            self.pvc_size = 10
        self.sleep_time = 45
        self.target_pods_number = None
        if project_factory:
            project_name = f"{defaults.BG_LOAD_NAMESPACE}-{uuid4().hex[:5]}"
            self.project = project_factory(project_name=project_name)
Example #7
def test_monitoring_enabled():
    """
    Check that OCS Monitoring is enabled after OCS installation (which is why
    this test has a post deployment marker) by asking for values of one ceph
    and one noobaa related metric.
    """
    prometheus = PrometheusAPI()

    # ask for values of ceph_pool_stored metric
    logger.info("Checking that ceph data are provided in OCS monitoring")
    result = prometheus.query('ceph_pool_stored')
    msg = "check that we actually received some values for a ceph query"
    assert len(result) > 0, msg
    for metric in result:
        _, value = metric['value']
        assert_msg = "number of bytes in a pool isn't a positive integer or zero"
        assert int(value) >= 0, assert_msg
    # additional check that values make at least some sense
    logger.info(
        "Checking that size of ceph_pool_stored result matches number of pools"
    )
    ct_pod = pod.get_ceph_tools_pod()
    ceph_pools = ct_pod.exec_ceph_cmd("ceph osd pool ls")
    assert len(result) == len(ceph_pools)

    # TODO: remove BZ 1790558 workaround (noobaa is not immediately ready just
    # after installation)
    hack_sleep = 600  # in seconds
    logger.info(
        f"BZ 1790558 workaround: going to sleep for {hack_sleep} seconds")
    time.sleep(hack_sleep)

    # again for a noobaa metric
    logger.info("Checking that MCG/NooBaa data are provided in OCS monitoring")
    result = prometheus.query('NooBaa_bucket_status')
    msg = "check that we actually received some values for a MCG/NooBaa query"
    assert len(result) > 0, msg
    for metric in result:
        _, value = metric['value']
        assert int(value) >= 0, "bucket status isn't a positive integer or zero"
Example #8
def test_ceph_rbd_metrics_available():
    """
    Ceph RBD metrics should be provided via OCP Prometheus as well.
    See also: https://ceph.com/rbd/new-in-nautilus-rbd-performance-monitoring/
    """
    prometheus = PrometheusAPI()
    list_of_metrics_without_results = metrics.get_missing_metrics(
        prometheus, metrics.ceph_rbd_metrics)
    msg = (
        "OCS Monitoring should provide some value(s) for tested rbd metrics, "
        "so that the list of metrics without results is empty.")
    assert list_of_metrics_without_results == [], msg
Example #9
    def prometheus_log(info, prometheus_alert_list):
        """
        Log all alerts from Prometheus API every 3 seconds.

        Args:
            info (dict): Contains run key attribute that controls thread.
                If `info['run'] == False` then the thread will stop
            prometheus_alert_list (list): List to be populated with alerts

        """
        prometheus = PrometheusAPI()
        logger.info("Logging of all prometheus alerts started")
        while info.get("run"):
            alerts_response = prometheus.get(
                "alerts", payload={"silenced": False, "inhibited": False}
            )
            msg = f"Request {alerts_response.request.url} failed"
            assert alerts_response.ok, msg
            for alert in alerts_response.json().get("data").get("alerts"):
                if alert not in prometheus_alert_list:
                    logger.info(f"Adding {alert} to alert list")
                    prometheus_alert_list.append(alert)
            time.sleep(3)
        logger.info("Logging of all prometheus alerts stopped")
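
    # A usage sketch (assumed, based on the docstring above): prometheus_log is
    # meant to run as a background thread in the enclosing fixture, and its
    # loop is stopped by flipping the shared "run" flag once the measured
    # operation is done.
    import threading

    info = {"run": True}
    prometheus_alert_list = []
    alert_logger = threading.Thread(
        target=prometheus_log, args=(info, prometheus_alert_list)
    )
    alert_logger.start()
    try:
        pass  # ... run the measured operation here ...
    finally:
        info["run"] = False  # makes the while loop in prometheus_log exit
        alert_logger.join()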
Example #10
def test_ceph_metrics_available():
    """
    Ceph metrics as listed in KNIP-634 should be provided via OCP Prometheus.

    Ceph Object Gateway https://docs.ceph.com/docs/master/radosgw/ is
    deployed on on-prem platforms only (such as VMware - see BZ 1763150),
    so this test case ignores failures for ceph_rgw_* and ceph_objecter_*
    metrics when running on cloud platforms (such as AWS).
    """
    prometheus = PrometheusAPI()
    list_of_metrics_without_results = metrics.get_missing_metrics(
        prometheus,
        metrics.ceph_metrics,
        current_platform=config.ENV_DATA['platform'].lower())
    msg = (
        "OCS Monitoring should provide some value(s) for all tested metrics, "
        "so that the list of metrics without results is empty.")
    assert list_of_metrics_without_results == [], msg
Example #11
def test_monitoring_shows_mon_down(measure_stop_ceph_mon):
    """
    Make sure simple problems with MON daemons are reported via OCP Prometheus.
    """
    prometheus = PrometheusAPI()
    # time (in seconds) for monitoring to notice the change
    expected_delay = 60

    affected_mons = measure_stop_ceph_mon['result']
    # we asked to stop just a single mon ... make this assumption explicit
    assert len(affected_mons) == 1
    affected_mon = affected_mons[0]
    # translate this into ceph daemon name
    ceph_daemon = "mon.{}".format(affected_mon[len('rook-ceph-mon-'):])
    logger.info(
        f"affected mon was {affected_mon}, aka {ceph_daemon} ceph daemon")

    logger.info("let's check that ceph health was affected")
    health_result = prometheus.query_range(
        query='ceph_health_status',
        start=measure_stop_ceph_mon['start'],
        end=measure_stop_ceph_mon['stop'],
        step=15)
    health_validation = prometheus.check_query_range_result(
        result=health_result,
        good_values=[1],
        bad_values=[0],
        exp_metric_num=1,
        exp_delay=expected_delay)
    health_msg = "health status should be affected by missing mon"
    assert health_validation, health_msg

    logger.info("let's check that mon quorum status value was affected")
    mon_result = prometheus.query_range(
        query='ceph_mon_quorum_status{ceph_daemon="%s"}' % ceph_daemon,
        start=measure_stop_ceph_mon['start'],
        end=measure_stop_ceph_mon['stop'],
        step=15)
    mon_validation = prometheus.check_query_range_result(
        result=mon_result,
        good_values=[0],
        bad_values=[1],
        exp_metric_num=1,
        exp_delay=expected_delay)
    mon_msg = "ceph_mon_quorum_status value should be affected by missing mon"
    assert mon_validation, mon_msg
Example #12
def test_workload_rbd(workload_storageutilization_50p_rbd):
    """
    Purpose of this test is to make the workload fixture executed, and
    show how to query prometheus.

    Note that this test is valid only on 3 osd cluster with all pools using
    3 way replication.
    """
    prometheus = PrometheusAPI()
    # Asking for values of `ceph_osd_stat_bytes_used` every 15s during the time
    # when the workload fixture was utilizing 50% of the OCS storage.
    result_used = prometheus.query_range(
        query="ceph_osd_stat_bytes_used",
        start=workload_storageutilization_50p_rbd["start"],
        end=workload_storageutilization_50p_rbd["stop"],
        step=15,
    )
    # This time, we are asking for total OCS capacity, in the same format
    # as in previous case (for each OSD).
    result_total = prometheus.query_range(
        query="ceph_osd_stat_bytes",
        start=workload_storageutilization_50p_rbd["start"],
        end=workload_storageutilization_50p_rbd["stop"],
        step=15,
    )
    # Check test assumption that ceph_osd_stat_bytes hasn't changed for each
    # OSD, and that each OSD has the same size.
    osd_stat_bytes = []
    for metric in result_total:
        values = []
        for ts, value in metric["values"]:
            values.append(value)
        assert all(value == values[0] for value in values)
        osd_stat_bytes.append(values[0])
    assert all(value == osd_stat_bytes[0] for value in osd_stat_bytes)
    # Compute expected value of ceph_osd_stat_bytes_used, based on the
    # percentage utilized by the fixture.
    percentage = workload_storageutilization_50p_rbd["result"]["target_p"]
    expected_value = int(osd_stat_bytes[0]) * percentage
    # Now we can check the actual usage values from Prometheus.
    at_least_one_value_out_of_range = False
    for metric in result_used:
        name = metric["metric"]["__name__"]
        daemon = metric["metric"]["ceph_daemon"]
        logger.info(f"metric {name} from {daemon}")
        # We are skipping the 1st 10% of the values, as it could take some
        # additional time for all the data to be written everywhere, and
        # during this time utilization value still grows.
        start_index = int(len(metric["values"]) * 0.1)
        logger.info(f"ignoring first {start_index} values")
        for ts, value in metric["values"][:start_index]:
            value = int(value)
            dt = datetime.utcfromtimestamp(ts)
            logger.info(f"ignoring value {value} B at {dt}")
        for ts, value in metric["values"][start_index:]:
            value = int(value)
            dt = datetime.utcfromtimestamp(ts)
            # checking the value, with 10% error margin in each direction
            if expected_value * 0.90 <= value <= expected_value * 1.10:
                logger.info(
                    f"value {value} B at {dt} is within expected range")
            else:
                logger.error(
                    (f"value {value} B at {dt} is outside of expected range"
                     f" {expected_value} B +- 10%"))
                at_least_one_value_out_of_range = True
    assert not at_least_one_value_out_of_range
Example #13
def test_monitoring_reporting_ok_when_idle(workload_idle):
    """
    When nothing is happening, OCP Prometheus reports OCS status as OK.

    If this test case fails, the status is either reported wrong or the
    cluster is in a broken state. Either way, a failure here is not good.
    """
    prometheus = PrometheusAPI()

    health_result = prometheus.query_range(
        query='ceph_health_status',
        start=workload_idle['start'],
        end=workload_idle['stop'],
        step=15)
    health_validation = check_query_range_result(
        result=health_result,
        good_values=[0],
        bad_values=[1],
        exp_metric_num=1)
    health_msg = "ceph_health_status {} report 0 (health ok) as expected"
    if health_validation:
        health_msg = health_msg.format('does')
        logger.info(health_msg)
    else:
        health_msg = health_msg.format('should')
        logger.error(health_msg)

    mon_result = prometheus.query_range(
        query='ceph_mon_quorum_status',
        start=workload_idle['start'],
        end=workload_idle['stop'],
        step=15)
    mon_validation = check_query_range_result(
        result=mon_result,
        good_values=[1],
        bad_values=[0],
        exp_metric_num=workload_idle['result']['mon_num'])
    mon_msg = "ceph_mon_quorum_status {} indicate no problems with quorum"
    if mon_validation:
        mon_msg = mon_msg.format('does')
        logger.info(mon_msg)
    else:
        mon_msg = mon_msg.format('should')
        logger.error(mon_msg)

    osd_validations = []
    for metric in ("ceph_osd_up", "ceph_osd_in"):
        osd_result = prometheus.query_range(
            query=metric,
            start=workload_idle['start'],
            end=workload_idle['stop'],
            step=15)
        osd_validation = check_query_range_result(
            result=osd_result,
            good_values=[1],
            bad_values=[0],
            exp_metric_num=workload_idle['result']['osd_num'])
        osd_validations.append(osd_validation)
        osd_msg = "{} metric {} indicate no problems with OSDs"
        if osd_validation:
            osd_msg = osd_msg.format(metric, 'does')
            logger.info(osd_msg)
        else:
            osd_msg = osd_msg.format(metric, 'should')
            logger.error(osd_msg)

    # after logging everything properly, make the test fail if necessary
    # see ERRORs reported in the test log for details
    assert health_validation, health_msg
    assert mon_validation, mon_msg
    osds_msg = "ceph_osd_{up,in} metrics should indicate no OSD issues"
    assert all(osd_validations), osds_msg
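
# A conceptual sketch (assumed here, not the actual ocs-ci helper, which also
# accepts parameters such as exp_delay) of the enumerated-value validation used
# above: every sample of every metric series must be one of good_values and
# none of bad_values, and the number of series has to match exp_metric_num
# when it is given.
def check_query_range_result_sketch(
    result, good_values, bad_values, exp_metric_num=None
):
    if exp_metric_num is not None and len(result) != exp_metric_num:
        return False
    for metric in result:
        for _timestamp, value in metric["values"]:
            if float(value) in bad_values or float(value) not in good_values:
                return False
    return True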
Example #14
def test_ceph_metrics_available():
    """
    Ceph metrics as listed in KNIP-634 should be provided via OCP Prometheus.

    Ceph Object Gateway https://docs.ceph.com/docs/master/radosgw/ is
    deployed on on-prem platforms only (such as VMware - see BZ 1763150),
    so this test case ignores failures for ceph_rgw_* and ceph_objecter_*
    metrics when running on cloud platforms (such as AWS).
    """
    # this list is taken from the spreadsheet attached to KNIP-634
    list_of_metrics = [
        "ceph_bluestore_state_aio_wait_lat_sum",
        "ceph_paxos_store_state_latency_sum",
        "ceph_osd_op_out_bytes",
        "ceph_pg_incomplete",
        "ceph_bluestore_submit_lat_sum",
        "ceph_paxos_commit",
        "ceph_paxos_new_pn_latency_count",
        "ceph_osd_op_r_process_latency_count",
        "ceph_osd_flag_norebalance",
        "ceph_bluestore_submit_lat_count",
        "ceph_osd_in",
        "ceph_bluestore_kv_final_lat_sum",
        "ceph_paxos_collect_keys_sum",
        "ceph_paxos_accept_timeout",
        "ceph_paxos_begin_latency_count",
        "ceph_bluefs_wal_total_bytes",
        "ceph_osd_flag_nobackfill",
        "ceph_paxos_refresh",
        "ceph_bluestore_read_lat_count",
        "ceph_pg_degraded",
        "ceph_mon_num_sessions",
        "ceph_objecter_op_rmw",
        "ceph_bluefs_bytes_written_wal",
        "ceph_mon_num_elections",
        "ceph_rocksdb_compact",
        "ceph_bluestore_kv_sync_lat_sum",
        "ceph_osd_op_process_latency_count",
        "ceph_osd_op_w_prepare_latency_count",
        "ceph_pool_stored",
        "ceph_objecter_op_active",
        "ceph_pg_backfill_unfound",
        "ceph_num_objects_degraded",
        "ceph_osd_flag_nodeep_scrub",
        "ceph_osd_apply_latency_ms",
        "ceph_paxos_begin_latency_sum",
        "ceph_osd_flag_noin",
        "ceph_osd_op_r",
        "ceph_osd_op_rw_prepare_latency_sum",
        "ceph_paxos_new_pn",
        "ceph_rgw_qlen",
        "ceph_rgw_req",
        "ceph_rocksdb_get_latency_count",
        "ceph_pool_max_avail",
        "ceph_pool_rd",
        "ceph_rgw_cache_miss",
        "ceph_paxos_commit_latency_count",
        "ceph_bluestore_throttle_lat_count",
        "ceph_paxos_lease_ack_timeout",
        "ceph_bluestore_commit_lat_sum",
        "ceph_paxos_collect_bytes_sum",
        "ceph_cluster_total_used_raw_bytes",
        "ceph_pg_stale",
        "ceph_health_status",
        "ceph_pool_wr_bytes",
        "ceph_osd_op_rw_latency_count",
        "ceph_paxos_collect_uncommitted",
        "ceph_osd_op_rw_latency_sum",
        "ceph_paxos_share_state",
        "ceph_pool_stored_raw",
        "ceph_osd_op_r_prepare_latency_sum",
        "ceph_bluestore_kv_flush_lat_sum",
        "ceph_osd_op_rw_process_latency_sum",
        "ceph_osd_metadata",
        "ceph_rocksdb_rocksdb_write_memtable_time_count",
        "ceph_paxos_collect_latency_count",
        "ceph_pg_undersized",
        "ceph_osd_op_rw_prepare_latency_count",
        "ceph_paxos_collect_latency_sum",
        "ceph_rocksdb_rocksdb_write_delay_time_count",
        "ceph_objecter_op_rmw",
        "ceph_paxos_begin_bytes_sum",
        "ceph_pg_recovering",
        "ceph_pg_peering",
        "ceph_osd_numpg",
        "ceph_osd_flag_noout",
        "ceph_pg_inconsistent",
        "ceph_osd_stat_bytes",
        "ceph_rocksdb_submit_sync_latency_sum",
        "ceph_rocksdb_compact_queue_merge",
        "ceph_paxos_collect_bytes_count",
        "ceph_osd_op",
        "ceph_paxos_commit_keys_sum",
        "ceph_osd_op_rw_in_bytes",
        "ceph_osd_op_rw_out_bytes",
        "ceph_bluefs_bytes_written_sst",
        "ceph_rgw_put",
        "ceph_osd_op_rw_process_latency_count",
        "ceph_rocksdb_compact_queue_len",
        "ceph_pool_wr",
        "ceph_bluestore_throttle_lat_sum",
        "ceph_bluefs_slow_used_bytes",
        "ceph_osd_op_r_latency_sum",
        "ceph_bluestore_kv_flush_lat_count",
        "ceph_rocksdb_compact_range",
        "ceph_osd_op_latency_sum",
        "ceph_mon_session_add",
        "ceph_paxos_share_state_keys_count",
        "ceph_num_objects_misplaced",
        "ceph_paxos_collect",
        "ceph_osd_op_w_in_bytes",
        "ceph_osd_op_r_process_latency_sum",
        "ceph_paxos_start_peon",
        "ceph_cluster_total_bytes",
        "ceph_mon_session_trim",
        "ceph_pg_recovery_wait",
        "ceph_rocksdb_get_latency_sum",
        "ceph_rocksdb_submit_transaction_sync",
        "ceph_osd_op_rw",
        "ceph_paxos_store_state_keys_count",
        "ceph_rocksdb_rocksdb_write_delay_time_sum",
        "ceph_pool_objects",
        "ceph_pg_backfill_wait",
        "ceph_objecter_op_r",
        "ceph_objecter_op_active",
        "ceph_objecter_op_w",
        "ceph_osd_recovery_ops",
        "ceph_bluefs_logged_bytes",
        "ceph_rocksdb_get",
        "ceph_pool_metadata",
        "ceph_bluefs_db_total_bytes",
        "ceph_rgw_put_initial_lat_sum",
        "ceph_pg_recovery_toofull",
        "ceph_osd_op_w_latency_count",
        "ceph_rgw_put_initial_lat_count",
        "ceph_mon_metadata",
        "ceph_bluestore_commit_lat_count",
        "ceph_bluestore_state_aio_wait_lat_count",
        "ceph_pg_unknown",
        "ceph_paxos_begin_bytes_count",
        "ceph_pg_recovery_unfound",
        "ceph_pool_quota_bytes",
        "ceph_pg_snaptrim_wait",
        "ceph_paxos_start_leader",
        "ceph_pg_creating",
        "ceph_mon_election_call",
        "ceph_rocksdb_rocksdb_write_pre_and_post_time_count",
        "ceph_mon_session_rm",
        "ceph_cluster_total_used_bytes",
        "ceph_pg_active",
        "ceph_paxos_store_state",
        "ceph_pg_activating",
        "ceph_paxos_store_state_bytes_count",
        "ceph_osd_op_w_latency_sum",
        "ceph_rgw_keystone_token_cache_hit",
        "ceph_rocksdb_submit_latency_count",
        "ceph_pool_dirty",
        "ceph_paxos_commit_latency_sum",
        "ceph_rocksdb_rocksdb_write_memtable_time_sum",
        "ceph_rgw_metadata",
        "ceph_paxos_share_state_bytes_sum",
        "ceph_osd_op_process_latency_sum",
        "ceph_paxos_begin_keys_sum",
        "ceph_pg_snaptrim_error",
        "ceph_rgw_qactive",
        "ceph_pg_backfilling",
        "ceph_rocksdb_rocksdb_write_pre_and_post_time_sum",
        "ceph_bluefs_wal_used_bytes",
        "ceph_pool_rd_bytes",
        "ceph_pg_deep",
        "ceph_rocksdb_rocksdb_write_wal_time_sum",
        "ceph_osd_op_wip",
        "ceph_pg_backfill_toofull",
        "ceph_osd_flag_noup",
        "ceph_rgw_get_initial_lat_sum",
        "ceph_pg_scrubbing",
        "ceph_num_objects_unfound",
        "ceph_mon_quorum_status",
        "ceph_paxos_lease_timeout",
        "ceph_osd_op_r_out_bytes",
        "ceph_paxos_begin_keys_count",
        "ceph_bluestore_kv_sync_lat_count",
        "ceph_osd_op_prepare_latency_count",
        "ceph_bluefs_bytes_written_slow",
        "ceph_rocksdb_submit_latency_sum",
        "ceph_pg_repair",
        "ceph_osd_op_r_latency_count",
        "ceph_paxos_share_state_keys_sum",
        "ceph_paxos_store_state_bytes_sum",
        "ceph_osd_op_latency_count",
        "ceph_paxos_commit_bytes_count",
        "ceph_paxos_restart",
        "ceph_rgw_get_initial_lat_count",
        "ceph_pg_down",
        "ceph_bluefs_slow_total_bytes",
        "ceph_paxos_collect_timeout",
        "ceph_pg_peered",
        "ceph_osd_commit_latency_ms",
        "ceph_osd_op_w_process_latency_sum",
        "ceph_osd_weight",
        "ceph_paxos_collect_keys_count",
        "ceph_paxos_share_state_bytes_count",
        "ceph_osd_op_w_prepare_latency_sum",
        "ceph_bluestore_read_lat_sum",
        "ceph_osd_flag_noscrub",
        "ceph_osd_stat_bytes_used",
        "ceph_osd_flag_norecover",
        "ceph_pg_clean",
        "ceph_paxos_begin",
        "ceph_mon_election_win",
        "ceph_osd_op_w_process_latency_count",
        "ceph_rgw_get_b",
        "ceph_rgw_failed_req",
        "ceph_rocksdb_rocksdb_write_wal_time_count",
        "ceph_rgw_keystone_token_cache_miss",
        "ceph_disk_occupation",
        "ceph_pg_snaptrim",
        "ceph_paxos_store_state_keys_sum",
        "ceph_osd_numpg_removing",
        "ceph_pg_remapped",
        "ceph_paxos_commit_keys_count",
        "ceph_pg_forced_backfill",
        "ceph_paxos_new_pn_latency_sum",
        "ceph_osd_op_in_bytes",
        "ceph_paxos_store_state_latency_count",
        "ceph_paxos_refresh_latency_count",
        "ceph_rgw_get",
        "ceph_pg_total",
        "ceph_osd_op_r_prepare_latency_count",
        "ceph_rgw_cache_hit",
        "ceph_objecter_op_w",
        "ceph_rocksdb_submit_transaction",
        "ceph_objecter_op_r",
        "ceph_bluefs_num_files",
        "ceph_osd_up",
        "ceph_rgw_put_b",
        "ceph_mon_election_lose",
        "ceph_osd_op_prepare_latency_sum",
        "ceph_bluefs_db_used_bytes",
        "ceph_bluestore_kv_final_lat_count",
        "ceph_pool_quota_objects",
        "ceph_osd_flag_nodown",
        "ceph_pg_forced_recovery",
        "ceph_paxos_refresh_latency_sum",
        "ceph_osd_recovery_bytes",
        "ceph_osd_op_w",
        "ceph_paxos_commit_bytes_sum",
        "ceph_bluefs_log_bytes",
        "ceph_rocksdb_submit_sync_latency_count",
        "ceph_pool_num_bytes_recovered",
        "ceph_pool_num_objects_recovered",
        "ceph_pool_recovering_bytes_per_sec",
        "ceph_pool_recovering_keys_per_sec",
        "ceph_pool_recovering_objects_per_sec"]

    current_platform = config.ENV_DATA['platform'].lower()

    prometheus = PrometheusAPI()

    list_of_metrics_without_results = []
    for metric in list_of_metrics:
        result = prometheus.query(metric)
        # check that we actually received some values
        if len(result) == 0:
            # Ceph Object Gateway https://docs.ceph.com/docs/master/radosgw/ is
            # deployed on on-prem platforms only, so we are going to ignore
            # missing metrics from these components on such platforms.
            is_rgw_metric = (
                metric.startswith("ceph_rgw")
                or metric.startswith("ceph_objecter"))
            if current_platform in constants.CLOUD_PLATFORMS and is_rgw_metric:
                msg = (
                    f"failed to get results for {metric}, "
                    f"but it is expected on {current_platform}")
                logger.info(msg)
            else:
                logger.error(f"failed to get results for {metric}")
                list_of_metrics_without_results.append(metric)
    msg = (
        "OCS Monitoring should provide some value(s) for all tested metrics, "
        "so that the list of metrics without results is empty.")
    assert list_of_metrics_without_results == [], msg
Example #15
def test_monitoring_shows_osd_down(measure_stop_ceph_osd):
    """
    Make sure simple problems with OSD daemons are reported via OCP Prometheus.
    """
    prometheus = PrometheusAPI()
    # time (in seconds) for monitoring to notice the change
    expected_delay = 60

    affected_osd = measure_stop_ceph_osd["result"]
    # translate this into ceph daemon name
    ceph_daemon = "osd.{}".format(int(affected_osd[len("rook-ceph-osd-"):]))
    logger.info(
        f"affected osd was {affected_osd}, aka {ceph_daemon} ceph daemon")

    logger.info("let's check that ceph health was affected")
    health_result = prometheus.query_range(
        query="ceph_health_status",
        start=measure_stop_ceph_osd["start"],
        end=measure_stop_ceph_osd["stop"],
        step=15,
    )
    health_validation = check_query_range_result_enum(
        result=health_result,
        good_values=[1],
        bad_values=[0],
        exp_metric_num=1,
        exp_delay=expected_delay,
    )
    health_msg = "health status should be affected by missing osd"

    logger.info("let's check that osd up value was affected")
    osd_up_result = prometheus.query_range(
        query='ceph_osd_up{ceph_daemon="%s"}' % ceph_daemon,
        start=measure_stop_ceph_osd["start"],
        end=measure_stop_ceph_osd["stop"],
        step=15,
    )
    osd_up_validation = check_query_range_result_enum(
        result=osd_up_result,
        good_values=[0],
        bad_values=[1],
        exp_metric_num=1,
        exp_delay=expected_delay,
    )
    osd_up_msg = "ceph_osd_up value should be affected by missing osd"

    logger.info("let's check that osd in value was not affected")
    # osd in value is not affected because we just stopped the osd, we
    # haven't removed it from the cluster
    osd_in_result = prometheus.query_range(
        query='ceph_osd_in{ceph_daemon="%s"}' % ceph_daemon,
        start=measure_stop_ceph_osd["start"],
        end=measure_stop_ceph_osd["stop"],
        step=15,
    )
    osd_in_validation = check_query_range_result_enum(result=osd_in_result,
                                                      good_values=[1],
                                                      bad_values=[0],
                                                      exp_metric_num=1)
    osd_in_msg = "ceph_osd_in value should not be affected by missing osd"

    # checking validation results when all queries are performed makes sure
    # that there is evidence for all queries in the test case logs in case of
    # an assert failure
    assert health_validation, health_msg
    assert osd_up_validation, osd_up_msg
    assert osd_in_validation, osd_in_msg
Example #16
def test_monitoring_shows_mon_down(measure_stop_ceph_mon):
    """
    Make sure simple problems with MON daemons are reported via OCP Prometheus.
    """
    prometheus = PrometheusAPI()
    # time (in seconds) for monitoring to notice the change
    expected_delay = 60
    # query resolution step used in this test case (number of seconds)
    query_step = 15

    affected_mons = measure_stop_ceph_mon["result"]
    # we asked to stop just a single mon ... make this assumption explicit
    assert len(affected_mons) == 1
    affected_mon = affected_mons[0]
    # translate this into ceph daemon name
    ceph_daemon = "mon.{}".format(affected_mon[len("rook-ceph-mon-"):])
    logger.info(
        f"affected mon was {affected_mon}, aka {ceph_daemon} ceph daemon")

    logger.info("let's check that ceph health was affected")
    health_result = prometheus.query_range(
        query="ceph_health_status",
        start=measure_stop_ceph_mon["start"],
        end=measure_stop_ceph_mon["stop"],
        step=query_step,
    )
    health_validation = check_query_range_result_enum(
        result=health_result,
        good_values=[1],
        bad_values=[0],
        exp_metric_num=1,
        exp_good_time=measure_stop_ceph_mon["min_downtime"],
        exp_delay=expected_delay,
    )
    health_msg = "health status should be affected by missing mon"

    logger.info("let's check that mon quorum status value was affected")
    mon_result = prometheus.query_range(
        query='ceph_mon_quorum_status{ceph_daemon="%s"}' % ceph_daemon,
        start=measure_stop_ceph_mon["start"],
        end=measure_stop_ceph_mon["stop"],
        step=query_step,
        validate=False,
    )
    mon_validation = check_query_range_result_enum(
        result=mon_result,
        good_values=[0],
        bad_values=[1],
        exp_metric_num=1,
        exp_good_time=measure_stop_ceph_mon["min_downtime"],
        exp_delay=expected_delay,
    )
    mon_msg = "ceph_mon_quorum_status value should be affected by missing mon"

    # checking validation results when both queries are performed makes sure
    # that there is evidence for both mon and health queries in the test case
    # logs in case of an assert failure
    assert health_validation, health_msg
    assert mon_validation, mon_msg

    # since we don't do strict result validation in the previous query, we
    # are going to check the min. expected size of the reply explicitly, taking
    # into account the min. expected downtime of the affected ceph mon
    assert len(mon_result) == 1, "there should be one metric for one mon"
    min_mon_samples = measure_stop_ceph_mon["min_downtime"] / query_step
    mon_sample_size = len(mon_result[0]["values"])
    assert mon_sample_size >= min_mon_samples
Example #17
    def test_pvc_expansion_when_full(self):
        """
        Verify PVC expansion when the PVC is 100% utilized.
        Verify utilization alert will stop firing after volume expansion.

        """
        pvc_size_expanded = 10

        # Run IO to utilise 100% of volume
        log.info("Run IO on all pods to utilise 100% of PVCs")
        for pod_obj in self.pods:
            pod_obj.run_io(
                "fs",
                size=f"{self.pvc_size}G",
                io_direction="write",
                runtime=30,
                rate="100M",
                fio_filename=f"{pod_obj.name}_f1",
            )
        log.info("Started IO on all pods to utilise 100% of PVCs")
        # Wait for IO to finish
        log.info("Wait for IO to finish on pods")
        for pod_obj in self.pods:
            try:
                pod_obj.get_fio_results()
            except CommandFailed as cfe:
                if "No space left on device" not in str(cfe):
                    raise
            log.info(f"IO finished on pod {pod_obj.name}")
            # Verify used space on pod is 100%
            used_space = get_used_space_on_mount_point(pod_obj)
            assert used_space == "100%", (
                f"The used space on pod {pod_obj.name} is not 100% "
                f"but {used_space}")
            log.info(f"Verified: Used space on pod {pod_obj.name} is 100%")

        prometheus_api = PrometheusAPI()

        # Wait till utilization alerts starts
        for response in TimeoutSampler(140, 5, prometheus_api.get, "alerts"):
            alerts = response.json()["data"]["alerts"]
            for pvc_obj in self.pvcs:
                alerts_pvc = [
                    alert for alert in alerts if alert.get("labels", {}).get(
                        "persistentvolumeclaim") == pvc_obj.name
                ]
                # At least 2 alerts should be present
                if len(alerts_pvc) < 2:
                    break

                # Verify 'PersistentVolumeUsageNearFull' alert is firing
                if not getattr(pvc_obj, "near_full_alert", False):
                    try:
                        log.info(
                            f"Checking 'PersistentVolumeUsageNearFull' alert "
                            f"for PVC {pvc_obj.name}")
                        near_full_msg = (
                            f"PVC {pvc_obj.name} is nearing full. Data "
                            f"deletion or PVC expansion is required.")
                        check_alert_list(
                            label="PersistentVolumeUsageNearFull",
                            msg=near_full_msg,
                            alerts=alerts_pvc,
                            states=["firing"],
                            severity="warning",
                        )
                        pvc_obj.near_full_alert = True
                    except AssertionError:
                        log.info(f"'PersistentVolumeUsageNearFull' alert not "
                                 f"started firing for PVC {pvc_obj.name}")

                # Verify 'PersistentVolumeUsageCritical' alert is firing
                if not getattr(pvc_obj, "critical_alert", False):
                    try:
                        log.info(
                            f"Checking 'PersistentVolumeUsageCritical' alert "
                            f"for PVC {pvc_obj.name}")
                        critical_msg = (
                            f"PVC {pvc_obj.name} is critically full. Data "
                            f"deletion or PVC expansion is required.")
                        check_alert_list(
                            label="PersistentVolumeUsageCritical",
                            msg=critical_msg,
                            alerts=alerts_pvc,
                            states=["firing"],
                            severity="error",
                        )
                        pvc_obj.critical_alert = True
                    except AssertionError:
                        log.info(f"'PersistentVolumeUsageCritical' alert not "
                                 f"started firing for PVC {pvc_obj.name}")

            # Collect list of PVCs for which alerts are not firing
            not_near_full_pvc = [
                pvc_ob.name for pvc_ob in self.pvcs
                if not getattr(pvc_ob, "near_full_alert", False)
            ]
            not_critical_pvc = [
                pvc_ob.name for pvc_ob in self.pvcs
                if not getattr(pvc_ob, "critical_alert", False)
            ]

            if (not not_near_full_pvc) and (not not_critical_pvc):
                log.info("'PersistentVolumeUsageNearFull' and "
                         "'PersistentVolumeUsageCritical' alerts are firing "
                         "for all PVCs.")
                break

        log.info("Expanding PVCs.")
        for pvc_obj in self.pvcs:
            log.info(f"Expanding size of PVC {pvc_obj.name} to "
                     f"{pvc_size_expanded}Gi")
            pvc_obj.resize_pvc(pvc_size_expanded, True)
        log.info(f"All PVCs are expanded to {pvc_size_expanded}Gi")

        # Verify utilization alerts are stopped
        for response in TimeoutSampler(140, 5, prometheus_api.get, "alerts"):
            alerts = response.json()["data"]["alerts"]
            for pvc_obj in self.pvcs:
                alerts_pvc = [
                    alert for alert in alerts if alert.get("labels", {}).get(
                        "persistentvolumeclaim") == pvc_obj.name
                ]
                if not alerts_pvc:
                    pvc_obj.near_full_alert = False
                    pvc_obj.critical_alert = False
                    continue

                # Verify 'PersistentVolumeUsageNearFull' alert stopped firing
                if getattr(pvc_obj, "near_full_alert"):
                    try:
                        log.info(
                            f"Checking 'PersistentVolumeUsageNearFull' alert "
                            f"is cleared for PVC {pvc_obj.name}")
                        near_full_msg = (
                            f"PVC {pvc_obj.name} is nearing full. Data "
                            f"deletion or PVC expansion is required.")
                        check_alert_list(
                            label="PersistentVolumeUsageNearFull",
                            msg=near_full_msg,
                            alerts=alerts_pvc,
                            states=["firing"],
                            severity="warning",
                        )
                        log.info(
                            f"'PersistentVolumeUsageNearFull' alert is not "
                            f"stopped for PVC {pvc_obj.name}")
                    except AssertionError:
                        pvc_obj.near_full_alert = False
                        log.info(
                            f"'PersistentVolumeUsageNearFull' alert stopped "
                            f"firing for PVC {pvc_obj.name}")

                # Verify 'PersistentVolumeUsageCritical' alert stopped firing
                if getattr(pvc_obj, "critical_alert"):
                    try:
                        log.info(
                            f"Checking 'PersistentVolumeUsageCritical' alert "
                            f"is cleared for PVC {pvc_obj.name}")
                        critical_msg = (
                            f"PVC {pvc_obj.name} is critically full. Data "
                            f"deletion or PVC expansion is required.")
                        check_alert_list(
                            label="PersistentVolumeUsageCritical",
                            msg=critical_msg,
                            alerts=alerts_pvc,
                            states=["firing"],
                            severity="error",
                        )
                        log.info(
                            f"'PersistentVolumeUsageCritical' alert is not "
                            f"stopped for PVC {pvc_obj.name}")
                    except AssertionError:
                        pvc_obj.critical_alert = False
                        log.info(
                            f"'PersistentVolumeUsageCritical' alert stopped "
                            f"firing for PVC {pvc_obj.name}")

            # Collect list of PVCs for which alerts are still firing
            near_full_pvcs = [
                pvc_ob.name for pvc_ob in self.pvcs
                if getattr(pvc_ob, "near_full_alert")
            ]
            critical_pvcs = [
                pvc_ob.name for pvc_ob in self.pvcs
                if getattr(pvc_ob, "critical_alert")
            ]

            if (not near_full_pvcs) and (not critical_pvcs):
                log.info(
                    "'PersistentVolumeUsageNearFull' and "
                    "'PersistentVolumeUsageCritical' alerts are cleared for "
                    "all PVCs.")
                break

        # Run IO to verify the expanded capacity can be utilized
        log.info("Run IO after PVC expansion.")
        for pod_obj in self.pods:
            pod_obj.run_io(
                "fs",
                size="3G",
                io_direction="write",
                runtime=60,
                fio_filename=f"{pod_obj.name}_f2",
            )

        # Wait for IO to complete
        log.info("Waiting for IO to complete on pods.")
        for pod_obj in self.pods:
            fio_result = pod_obj.get_fio_results()
            err_count = fio_result.get("jobs")[0].get("error")
            assert (
                err_count == 0
            ), f"IO error on pod {pod_obj.name}. FIO result: {fio_result}"
            log.info(f"Verified IO on pod {pod_obj.name} after expanding PVC.")
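
# A simplified sketch (assumed, not the real ocs-ci utility) of the polling
# pattern behind TimeoutSampler(140, 5, prometheus_api.get, "alerts") used
# above: call the given function every `sleep` seconds and yield each response
# until `timeout` seconds have elapsed.
import time

def timeout_sampler_sketch(timeout, sleep, func, *args, **kwargs):
    deadline = time.time() + timeout
    while time.time() < deadline:
        yield func(*args, **kwargs)
        time.sleep(sleep)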
Example #18
    def test_clone_when_full(self, pvc_clone_factory, pod_factory):
        """
        Create a clone from an existing PVC when the PVC is 100% utilized.
        Verify data integrity.
        Verify utilization alert in cloned PVC.
        Expand cloned PVC and ensure utilization alerts are stopped.

        """
        pvc_size_expanded = 6
        file_name = "fio_full"
        prometheus_api = PrometheusAPI()

        # Run IO to utilize 100% of volume
        log.info("Run IO on all pods to utilise 100% of PVCs")
        for pod_obj in self.pods:
            # Get available free space in M
            df_avail_size = pod_obj.exec_cmd_on_pod(
                command=f"df {pod_obj.get_storage_path()} -B M --output=avail")
            # Get the numeric value of available space, e.g. 3070 from '3070M'
            available_size = int(df_avail_size.strip().split()[1][0:-1])
            pod_obj.run_io(
                "fs",
                size=f"{available_size-2}M",
                runtime=20,
                rate="100M",
                fio_filename=file_name,
                end_fsync=1,
            )
        log.info("Started IO on all pods to utilise 100% of PVCs")

        # Wait for IO to finish
        log.info("Wait for IO to finish on pods")
        for pod_obj in self.pods:
            pod_obj.get_fio_results()
            log.info(f"IO finished on pod {pod_obj.name}")

            # Verify used space on pod is 100%
            used_space = pod.get_used_space_on_mount_point(pod_obj)
            assert used_space == "100%", (
                f"The used space on pod {pod_obj.name} is not 100% "
                f"but {used_space}")
            log.info(f"Verified: Used space on pod {pod_obj.name} is 100%")
            # Calculate md5sum of the file
            pod_obj.pvc.md5sum = pod.cal_md5sum(pod_obj, file_name)

        log.info("Creating clone of the PVCs")
        cloned_pvcs = [pvc_clone_factory(pvc_obj) for pvc_obj in self.pvcs]
        log.info("Created clone of the PVCs. Cloned PVCs are Bound")

        # Attach the cloned PVCs to pods
        log.info("Attach the cloned PVCs to pods")
        clone_pod_objs = []
        for clone_pvc_obj in cloned_pvcs:
            interface = (constants.CEPHFILESYSTEM if
                         (constants.CEPHFS_INTERFACE
                          in clone_pvc_obj.backed_sc) else
                         constants.CEPHBLOCKPOOL)
            clone_pod_obj = pod_factory(interface=interface,
                                        pvc=clone_pvc_obj,
                                        status="")
            log.info(f"Attached the PVC {clone_pvc_obj.name} to pod "
                     f"{clone_pod_obj.name}")
            clone_pod_objs.append(clone_pod_obj)

        # Verify the new pods are running
        log.info("Verify the new pods are running")
        for pod_obj in clone_pod_objs:
            wait_for_resource_state(pod_obj, constants.STATUS_RUNNING)
        log.info("Verified: New pods are running")

        # Verify that the md5sum matches
        for pod_obj in clone_pod_objs:
            log.info(f"Verifying md5sum of {file_name} "
                     f"on pod {pod_obj.name}")
            pod.verify_data_integrity(pod_obj, file_name,
                                      pod_obj.pvc.parent.md5sum)
            log.info(f"Verified: md5sum of {file_name} on pod {pod_obj.name} "
                     f"matches with the original md5sum")

        # Wait till utilization alerts starts
        for response in TimeoutSampler(180, 5, prometheus_api.get, "alerts"):
            alerts = response.json()["data"]["alerts"]
            for pvc_obj in cloned_pvcs:
                alerts_pvc = [
                    alert for alert in alerts if alert.get("labels", {}).get(
                        "persistentvolumeclaim") == pvc_obj.name
                ]
                # At least 2 alerts should be present
                if len(alerts_pvc) < 2:
                    break

                # Verify 'PersistentVolumeUsageNearFull' alert is firing
                if not getattr(pvc_obj, "near_full_alert", False):
                    try:
                        log.info(
                            f"Checking 'PersistentVolumeUsageNearFull' alert "
                            f"for PVC {pvc_obj.name}")
                        near_full_msg = (
                            f"PVC {pvc_obj.name} is nearing full. Data "
                            f"deletion or PVC expansion is required.")
                        check_alert_list(
                            label="PersistentVolumeUsageNearFull",
                            msg=near_full_msg,
                            alerts=alerts_pvc,
                            states=["firing"],
                            severity="warning",
                        )
                        pvc_obj.near_full_alert = True
                    except AssertionError:
                        log.info(f"'PersistentVolumeUsageNearFull' alert not "
                                 f"started firing for PVC {pvc_obj.name}")

                # Verify 'PersistentVolumeUsageCritical' alert is firing
                if not getattr(pvc_obj, "critical_alert", False):
                    try:
                        log.info(
                            f"Checking 'PersistentVolumeUsageCritical' alert "
                            f"for PVC {pvc_obj.name}")
                        critical_msg = (
                            f"PVC {pvc_obj.name} is critically full. Data "
                            f"deletion or PVC expansion is required.")
                        check_alert_list(
                            label="PersistentVolumeUsageCritical",
                            msg=critical_msg,
                            alerts=alerts_pvc,
                            states=["firing"],
                            severity="error",
                        )
                        pvc_obj.critical_alert = True
                    except AssertionError:
                        log.info(f"'PersistentVolumeUsageCritical' alert not "
                                 f"started firing for PVC {pvc_obj.name}")

            # Collect list of PVCs for which alerts are not firing
            not_near_full_pvc = [
                pvc_ob.name for pvc_ob in cloned_pvcs
                if not getattr(pvc_ob, "near_full_alert", False)
            ]
            not_critical_pvc = [
                pvc_ob.name for pvc_ob in cloned_pvcs
                if not getattr(pvc_ob, "critical_alert", False)
            ]

            if (not not_near_full_pvc) and (not not_critical_pvc):
                log.info("'PersistentVolumeUsageNearFull' and "
                         "'PersistentVolumeUsageCritical' alerts are firing "
                         "for all cloned PVCs.")
                break
        log.info("Verified: Utilization alerts are firing")

        log.info("Expanding cloned PVCs.")
        for pvc_obj in cloned_pvcs:
            log.info(f"Expanding size of PVC {pvc_obj.name} to "
                     f"{pvc_size_expanded}Gi")
            # Expand PVC
            pvc_obj.resize_pvc(pvc_size_expanded, True)

        # Verify utilization alerts are stopped
        for response in TimeoutSampler(180, 5, prometheus_api.get, "alerts"):
            alerts = response.json()["data"]["alerts"]
            for pvc_obj in cloned_pvcs:
                alerts_pvc = [
                    alert for alert in alerts if alert.get("labels", {}).get(
                        "persistentvolumeclaim") == pvc_obj.name
                ]
                if not alerts_pvc:
                    pvc_obj.near_full_alert = False
                    pvc_obj.critical_alert = False
                    continue

                # Verify 'PersistentVolumeUsageNearFull' alert stopped firing
                if getattr(pvc_obj, "near_full_alert"):
                    try:
                        log.info(
                            f"Checking 'PersistentVolumeUsageNearFull' alert "
                            f"is cleared for PVC {pvc_obj.name}")
                        near_full_msg = (
                            f"PVC {pvc_obj.name} is nearing full. Data "
                            f"deletion or PVC expansion is required.")
                        check_alert_list(
                            label="PersistentVolumeUsageNearFull",
                            msg=near_full_msg,
                            alerts=alerts_pvc,
                            states=["firing"],
                            severity="warning",
                        )
                        log.info(
                            f"'PersistentVolumeUsageNearFull' alert is not "
                            f"stopped for PVC {pvc_obj.name}")
                    except AssertionError:
                        pvc_obj.near_full_alert = False
                        log.info(
                            f"'PersistentVolumeUsageNearFull' alert stopped "
                            f"firing for PVC {pvc_obj.name}")

                # Verify 'PersistentVolumeUsageCritical' alert stopped firing
                if getattr(pvc_obj, "critical_alert"):
                    try:
                        log.info(
                            f"Checking 'PersistentVolumeUsageCritical' alert "
                            f"is cleared for PVC {pvc_obj.name}")
                        critical_msg = (
                            f"PVC {pvc_obj.name} is critically full. Data "
                            f"deletion or PVC expansion is required.")
                        check_alert_list(
                            label="PersistentVolumeUsageCritical",
                            msg=critical_msg,
                            alerts=alerts_pvc,
                            states=["firing"],
                            severity="error",
                        )
                        log.info(
                            f"'PersistentVolumeUsageCritical' alert is not "
                            f"stopped for PVC {pvc_obj.name}")
                    except AssertionError:
                        pvc_obj.critical_alert = False
                        log.info(
                            f"'PersistentVolumeUsageCritical' alert stopped "
                            f"firing for PVC {pvc_obj.name}")

            # Collect list of PVCs for which alerts are still firing
            near_full_pvcs = [
                pvc_ob.name for pvc_ob in cloned_pvcs
                if getattr(pvc_ob, "near_full_alert")
            ]
            critical_pvcs = [
                pvc_ob.name for pvc_ob in cloned_pvcs
                if getattr(pvc_ob, "critical_alert")
            ]

            if (not near_full_pvcs) and (not critical_pvcs):
                log.info(
                    "'PersistentVolumeUsageNearFull' and "
                    "'PersistentVolumeUsageCritical' alerts are cleared for "
                    "all cloned PVCs.")
                break

        log.info("Verified: Utilization alerts stopped firing")
Example #19
def test_ceph_manager_stopped(workload_stop_ceph_mgr):
    """
    Test that there is appropriate alert when ceph manager
    is unavailable and that this alert is cleared when the manager
    is back online.
    """
    prometheus = PrometheusAPI()

    # get alerts from time when manager deployment was scaled down
    alerts = workload_stop_ceph_mgr.get('prometheus_alerts')
    target_label = 'CephMgrIsAbsent'
    target_msg = 'Storage metrics collector service not available anymore.'
    target_alerts = [
        alert
        for alert
        in alerts
        if alert.get('labels').get('alertname') == target_label
    ]
    log.info(f"Checking properties of found {target_label} alerts")
    msg = f"Incorrect number of {target_label} alerts"
    assert len(target_alerts) == 2, msg

    msg = 'Alert message is not correct'
    assert target_alerts[0]['annotations']['message'] == target_msg, msg

    msg = 'First alert doesn\'t have warning severity'
    assert target_alerts[0]['annotations']['severity_level'] == 'warning', msg

    msg = 'First alert is not in pending state'
    assert target_alerts[0]['state'] == 'pending', msg

    msg = 'Alert message is not correct'
    assert target_alerts[1]['annotations']['message'] == target_msg, msg

    msg = 'Second alert doesn\'t have warning severity'
    assert target_alerts[1]['annotations']['severity_level'] == 'warning', msg

    msg = 'Second alert is not in firing state'
    assert target_alerts[1]['state'] == 'firing', msg

    log.info(f"Alerts were triggered correctly during utilization")

    # seconds to wait before alert is cleared after measurement is finished
    time_min = 30

    time_actual = time.time()
    time_wait = int(
        (workload_stop_ceph_mgr.get('stop') + time_min) - time_actual
    )
    if time_wait > 0:
        log.info(f"Waiting for approximately {time_wait} seconds for alerts "
                 f"to be cleared ({time_min} seconds since measurement end)")
    else:
        time_wait = 1
    cleared_alerts = prometheus.wait_for_alert(
        name=target_label,
        state=None,
        timeout=time_wait
    )
    log.info(f"Cleared alerts: {cleared_alerts}")
    assert len(cleared_alerts) == 0, f"{target_label} alerts were not cleared"
    log.info(f"{target_label} alerts were cleared")
Example #20
class ClusterLoad:
    """
    A class for cluster load functionalities

    """

    def __init__(
        self, project_factory=None, pvc_factory=None, sa_factory=None,
        pod_factory=None, target_percentage=None
    ):
        """
        Initializer for ClusterLoad

        Args:
            project_factory (function): A call to project_factory function
            pvc_factory (function): A call to pvc_factory function
            sa_factory (function): A call to service_account_factory function
            pod_factory (function): A call to pod_factory function
            target_percentage (float): The percentage of cluster load that is
                required. The value should be greater than 0.1 and smaller than 0.95

        """
        self.prometheus_api = PrometheusAPI()
        self.pvc_factory = pvc_factory
        self.sa_factory = sa_factory
        self.pod_factory = pod_factory
        self.target_percentage = target_percentage
        self.cluster_limit = None
        self.dc_objs = list()
        self.pvc_objs = list()
        self.previous_iops = None
        self.current_iops = None
        self.rate = None
        self.pvc_size = int(get_osd_pods_memory_sum() * 0.5)
        self.sleep_time = 45
        self.target_pods_number = None
        if project_factory:
            project_name = f"{defaults.BG_LOAD_NAMESPACE}-{uuid4().hex[:5]}"
            self.project = project_factory(project_name=project_name)

    def increase_load(self, rate, wait=True):
        """
        Create a PVC, a service account and a DeploymentConfig of FIO pod

        Args:
            rate (str): FIO 'rate' value (e.g. '20M')
            wait (bool): True for waiting for IO to kick in on the
                newly created pod, False otherwise

        """
        pvc_obj = self.pvc_factory(
            interface=constants.CEPHBLOCKPOOL, project=self.project,
            size=self.pvc_size, volume_mode=constants.VOLUME_MODE_BLOCK,
        )
        self.pvc_objs.append(pvc_obj)
        service_account = self.sa_factory(pvc_obj.project)

        # Set new arguments with the updated file size to be used for
        # DeploymentConfig of FIO pod creation
        fio_dc_data = templating.load_yaml(constants.FIO_DC_YAML)
        args = fio_dc_data.get('spec').get('template').get(
            'spec'
        ).get('containers')[0].get('args')
        new_args = [
            x for x in args if not x.startswith('--filesize=') and not x.startswith('--rate=')
        ]
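        # Size the FIO file slightly below the PVC capacity, leaving roughly
        # 200 MB of headroom on the raw block device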
        io_file_size = f"{self.pvc_size * 1000 - 200}M"
        new_args.append(f"--filesize={io_file_size}")
        new_args.append(f"--rate={rate}")
        dc_obj = self.pod_factory(
            pvc=pvc_obj, pod_dict_path=constants.FIO_DC_YAML,
            raw_block_pv=True, deployment_config=True,
            service_account=service_account, command_args=new_args
        )
        self.dc_objs.append(dc_obj)
        if wait:
            logger.info(
                f"Waiting {self.sleep_time} seconds for IO to kick-in on the newly "
                f"created FIO pod {dc_obj.name}"
            )
            time.sleep(self.sleep_time)

    def decrease_load(self, wait=True):
        """
        Delete DeploymentConfig with its pods and the PVC. Then, wait for the
        IO to be stopped

        Args:
            wait (bool): True for waiting for IO to drop after the deletion
                of the FIO pod, False otherwise

        """
        dc_name = self.dc_objs[-1].name
        self.dc_objs[-1].delete()
        self.dc_objs[-1].ocp.wait_for_delete(dc_name)
        self.dc_objs.remove(self.dc_objs[-1])
        self.pvc_objs[-1].delete()
        self.pvc_objs[-1].ocp.wait_for_delete(self.pvc_objs[-1].name)
        self.pvc_objs.remove(self.pvc_objs[-1])
        if wait:
            logger.info(
                f"Waiting {self.sleep_time} seconds for IO to drop after "
                f"the deletion of {dc_name}"
            )
            time.sleep(self.sleep_time)

    def increase_load_and_print_data(self, rate, wait=True):
        """
        Increase load and print data

        Args:
            rate (str): FIO 'rate' value (e.g. '20M')
            wait (bool): True for waiting for IO to kick in on the
                newly created pod, False otherwise

        """
        self.increase_load(rate=rate, wait=wait)
        self.previous_iops = self.current_iops
        self.current_iops = self.calc_trim_metric_mean(metric=constants.IOPS_QUERY)
        msg = f"Current: {self.current_iops:.2f} || Previous: {self.previous_iops:.2f}"
        logger.info(f"IOPS:{wrap_msg(msg)}")
        self.print_metrics()

    def reach_cluster_load_percentage(self):
        """
        Reach the cluster limit and then drop to the given target percentage.
        The number of pods needed for the desired target percentage is determined by
        creating pods one by one, while examining the cluster latency. Once the latency
        is greater than 250 ms and it is growing exponentially, it means that
        the cluster limit has been reached.
        Then, drop to the target percentage by deleting all pods and re-creating
        them with a smaller value of the FIO 'rate' param.
        This leaves just enough pods running IO to keep the cluster load
        around the desired percentage.

        """
        if not self.target_percentage:
            logger.warning("The target percentage was not provided. Breaking")
            return
        if not 0.1 < self.target_percentage < 0.95:
            logger.warning(
                f"The target percentage is {self.target_percentage * 100}% which is "
                "not within the accepted range. Therefore, IO will not be started"
            )
            return
        low_diff_counter = 0
        cluster_limit = None
        latency_vals = list()
        time_to_wait = 60 * 30
        time_before = time.time()

        self.current_iops = self.get_query(query=constants.IOPS_QUERY)

        # Creating FIO DeploymentConfig pods one by one, with a large value of the
        # FIO 'rate' arg, in order to determine the cluster limit faster.
        # Once determined, these pods will be deleted. Then, new FIO DC pods will be
        # created with a smaller 'rate' value, in order to reach the target
        # percentage more accurately
        while True:
            wait = False if len(self.dc_objs) <= 1 else True
            self.increase_load_and_print_data(rate='250M', wait=wait)
            if self.current_iops > self.previous_iops:
                cluster_limit = self.current_iops

            latency = self.calc_trim_metric_mean(metric=constants.LATENCY_QUERY) * 1000
            latency_vals.append(latency)
            logger.info(f"Latency values: {latency_vals}")

            iops_diff = (self.current_iops / self.previous_iops * 100) - 100
            low_diff_counter += 1 if -15 < iops_diff < 10 else 0

            cluster_used_space = get_percent_used_capacity()

            if len(latency_vals) > 1 and latency > 250:
                # Checking for exponential growth. If the latest latency sample
                # is more than 128 times the first latency sample, we can conclude
                # that the cluster limit, in terms of IOPS, has been reached.
                # See https://blog.docbert.org/vdbench-curve/ for more details.
                # When the first latency sample is already between 3 and 50 ms,
                # collecting more than 5 samples is considered sufficient instead,
                # in order to determine the cluster limit faster.
                if latency > latency_vals[0] * 2 ** 7 or (
                    3 < latency_vals[0] < 50 and len(latency_vals) > 5
                ):
                    logger.info(
                        wrap_msg("The cluster limit was determined by latency growth")
                    )
                    break

            # In case the latency is greater than 2 seconds,
            # chances are the limit has been reached
            elif latency > 2000:
                logger.info(
                    wrap_msg(f"The limit was determined by the high latency - {latency} ms")
                )
                break

            # For clusters whose nodes do not meet the minimum resource
            # requirements, the cluster limit is reached while the latency
            # remains low. In that case, the cluster limit needs to be
            # determined by the following condition on the IOPS diff
            # between FIO pod creation iterations
            elif low_diff_counter > 3:
                logger.warning(
                    wrap_msg(
                        "Limit was determined by low IOPS diff between "
                        f"iterations - {iops_diff:.2f}%"
                    )
                )
                break

            elif time.time() > time_before + time_to_wait:
                logger.warning(
                    wrap_msg(
                        "Could not determine the cluster IOPS limit within"
                        f"the given {time_to_wait} seconds timeout. Breaking"
                    )
                )
                break

            elif cluster_used_space > 60:
                logger.warning(
                    wrap_msg(
                        f"Cluster used space is {cluster_used_space}%. Could "
                        "not reach the cluster IOPS limit before the "
                        "used spaced reached 60%. Breaking"
                    )
                )
                break

        self.cluster_limit = cluster_limit
        logger.info(wrap_msg(f"The cluster IOPS limit is {self.cluster_limit:.2f}"))
        logger.info("Deleting all DC FIO pods that have large FIO rate")
        while self.dc_objs:
            self.decrease_load(wait=False)

        target_iops = self.cluster_limit * self.target_percentage

        range_map = RangeKeyDict(
            {
                (0, 500): (6, 0.82, 0.4),
                (500, 1000): (8, 0.84, 0.45),
                (1000, 1500): (10, 0.86, 0.5),
                (1500, 2000): (12, 0.88, 0.55),
                (2000, 2500): (14, 0.90, 0.6),
                (2500, 3000): (16, 0.92, 0.65),
                (3000, 3500): (18, 0.94, 0.7),
                (3500, math.inf): (20, 0.96, 0.75),
            }
        )
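        # Each bucket maps a target IOPS range to a tuple of:
        #   (FIO 'rate' value in MB/s, fraction of the target IOPS at which
        #   pod creation stops, fraction of the target IOPS below which new
        #   pods are created without waiting for IO to kick in)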
        self.rate = f'{range_map[target_iops][0]}M'
        # Creating the first pod with a small FIO 'rate' param, to speed up the process.
        # In the meantime, the load will drop, following the deletion of the
        # FIO pods with a large FIO 'rate' param
        logger.info("Creating FIO pods, one by one, until the target percentage is reached")
        self.increase_load_and_print_data(rate=self.rate)
        msg = (
            f"The target load, in IOPS, is: {target_iops}, which is "
            f"{self.target_percentage*100}% of the {self.cluster_limit} cluster limit"
        )
        logger.info(wrap_msg(msg))

        while self.current_iops < target_iops * range_map[target_iops][1]:
            wait = False if self.current_iops < target_iops * range_map[target_iops][2] else True
            self.increase_load_and_print_data(rate=self.rate, wait=wait)

        msg = f"The target load, of {self.target_percentage * 100}%, has been reached"
        logger.info(wrap_msg(msg))
        self.target_pods_number = len(self.dc_objs)

    def get_query(self, query, mute_logs=False):
        """
        Get query from Prometheus and parse it

        Args:
            query (str): Query to be done
            mute_logs (bool): True for muting the logs, False otherwise

        Returns:
            float: the query result

        """
        now = datetime.now
        timestamp = datetime.timestamp
        return float(
            self.prometheus_api.query(
                query, str(timestamp(now())), mute_logs=mute_logs
            )[0]['value'][1]
        )

    def calc_trim_metric_mean(self, metric, samples=5, mute_logs=False):
        """
        Get the trimmed mean of a given metric

        Args:
            metric (str): The metric to calculate the average result for
            samples (int): The number of samples to take
            mute_logs (bool): True for muting the logs, False otherwise

        Returns:
            float: The average result for the metric

        """
        vals = list()
        for i in range(samples):
            vals.append(round(self.get_query(metric, mute_logs), 5))
            if i == samples - 1:
                break
            time.sleep(5)
        return round(get_trim_mean(vals), 5)

    def print_metrics(self, mute_logs=False):
        """
        Print metrics

        Args:
            mute_logs (bool): True for muting the Prometheus logs, False otherwise

        """
        high_latency = 200
        metrics = {
            "throughput": self.get_query(constants.THROUGHPUT_QUERY, mute_logs=mute_logs) * (
                constants.TP_CONVERSION.get(' B/s')
            ),
            "latency": self.get_query(constants.LATENCY_QUERY, mute_logs=mute_logs) * 1000,
            "iops": self.get_query(constants.IOPS_QUERY, mute_logs=mute_logs),
            "used_space": self.get_query(constants.USED_SPACE_QUERY, mute_logs=mute_logs) / 1e+9
        }
        limit_msg = (
            f" ({metrics.get('iops') / self.cluster_limit * 100:.2f}% of the "
            f"{self.cluster_limit:.2f} limit)"
        ) if self.cluster_limit else ""
        pods_msg = f" || Number of FIO pods: {len(self.dc_objs)}" if self.dc_objs else ""
        msg = (
            f"Throughput: {metrics.get('throughput'):.2f} MB/s || "
            f"Latency: {metrics.get('latency'):.2f} ms || "
            f"IOPS: {metrics.get('iops'):.2f}{limit_msg} || "
            f"Used Space: {metrics.get('used_space'):.2f} GB{pods_msg}"
        )
        logger.info(f"Cluster utilization:{wrap_msg(msg)}")
        if metrics.get('latency') > high_latency:
            logger.warning(f"Cluster latency is higher than {high_latency} ms!")

    def adjust_load_if_needed(self):
        """
        Dynamically adjust the IO load based on the cluster latency.
        In case the latency goes beyond 250 ms, start deleting FIO pods.
        Once latency drops back below 100 ms, re-create the FIO pods
        to make sure that cluster load is around the target percentage

        """
        latency = self.calc_trim_metric_mean(
            constants.LATENCY_QUERY, mute_logs=True
        )
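        # calc_trim_metric_mean() returns the latency in seconds here, so the
        # thresholds below (0.25 and 0.1) correspond to 250 ms and 100 ms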
        if latency > 0.25 and len(self.dc_objs) > 0:
            msg = (
                f"Latency is too high - {latency * 1000:.2f} ms."
                " Dropping the background load. Once the latency drops back to "
                "normal, the background load will be increased back"
            )
            logger.warning(wrap_msg(msg))
            self.decrease_load(wait=False)
        if latency < 0.1 and self.target_pods_number > len(self.dc_objs):
            msg = (
                f"Latency is back to normal - {latency * 1000:.2f} ms. "
                f"Increasing back the load"
            )
            logger.info(wrap_msg(msg))
            self.increase_load(rate=self.rate, wait=False)

    def pause_load(self):
        """
        Pause the cluster load

        """
        logger.info(wrap_msg("Pausing the cluster load"))
        while self.dc_objs:
            self.decrease_load(wait=False)

    def resume_load(self):
        """
        Resume the cluster load

        """
        logger.info(wrap_msg("Resuming the cluster load"))
        while len(self.dc_objs) < self.target_pods_number:
            self.increase_load(rate=self.rate, wait=False)
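
A hypothetical driver for the ClusterLoad class above, showing how it is meant to
be wired together. The factory arguments are assumed to be the pytest fixtures
referenced in the initializer's docstring; the function name and the 0.3 target
are illustrative only:

def run_background_load(project_factory, pvc_factory,
                        service_account_factory, pod_factory):
    """
    Ramp the cluster up to its IOPS limit, then settle at roughly 30% of it.
    Returns the ClusterLoad instance so the caller can adjust, pause or
    resume the load around sensitive test steps.
    """
    cluster_load = ClusterLoad(
        project_factory=project_factory,
        pvc_factory=pvc_factory,
        sa_factory=service_account_factory,
        pod_factory=pod_factory,
        target_percentage=0.3,
    )
    cluster_load.reach_cluster_load_percentage()
    return cluster_load

Once the target is reached, a test harness could periodically call
adjust_load_if_needed(), or bracket disruptive steps with pause_load() and
resume_load().
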
Example #21
class ClusterLoad:
    """
    A class for cluster load functionalities

    """
    def __init__(self,
                 project_factory=None,
                 pvc_factory=None,
                 sa_factory=None,
                 pod_factory=None,
                 target_percentage=None):
        """
        Initializer for ClusterLoad

        Args:
            project_factory (function): A call to project_factory function
            pvc_factory (function): A call to pvc_factory function
            sa_factory (function): A call to service_account_factory function
            pod_factory (function): A call to pod_factory function
            target_percentage (float): The percentage of cluster load that is
                required. The value should be greater than 0.1 and smaller than 0.95

        """
        self.prometheus_api = PrometheusAPI()
        self.pvc_factory = pvc_factory
        self.sa_factory = sa_factory
        self.pod_factory = pod_factory
        self.target_percentage = target_percentage
        self.cluster_limit = None
        self.dc_objs = list()
        self.pvc_objs = list()
        self.name_suffix = 1
        self.pvc_size = int(get_osd_pods_memory_sum() * 0.5)
        self.io_file_size = f"{self.pvc_size * 1000 - 200}M"
        self.sleep_time = 35
        if project_factory:
            project_name = f"{defaults.BG_LOAD_NAMESPACE}-{uuid4().hex[:5]}"
            self.project = project_factory(project_name=project_name)

    def increase_load(self, rate=None, wait=True):
        """
        Create a PVC, a service account and a DeploymentConfig of FIO pod

        Args:
            rate (str): FIO 'rate' value (e.g. '20M')
            wait (bool): True for waiting for IO to kick in on the
                newly created pod, False otherwise

        """
        pvc_obj = self.pvc_factory(
            interface=constants.CEPHBLOCKPOOL,
            project=self.project,
            size=self.pvc_size,
            volume_mode=constants.VOLUME_MODE_BLOCK,
        )
        self.pvc_objs.append(pvc_obj)
        service_account = self.sa_factory(pvc_obj.project)

        # Set new arguments with the updated file size to be used for
        # DeploymentConfig of FIO pod creation
        fio_dc_data = templating.load_yaml(constants.FIO_DC_YAML)
        args = fio_dc_data.get('spec').get('template').get('spec').get(
            'containers')[0].get('args')
        new_args = [
            x for x in args
            if not x.startswith('--filesize=') and not x.startswith('--rate=')
        ]
        new_args.append(f"--filesize={self.io_file_size}")
        new_args.append(f"--rate={rate}")
        self.name_suffix += 1
        dc_obj = self.pod_factory(pvc=pvc_obj,
                                  pod_dict_path=constants.FIO_DC_YAML,
                                  raw_block_pv=True,
                                  deployment_config=True,
                                  service_account=service_account,
                                  command_args=new_args)
        self.dc_objs.append(dc_obj)
        if wait:
            logger.info(
                f"Waiting {self.sleep_time} seconds for IO to kick-in on the newly "
                f"created FIO pod {dc_obj.name}")
            time.sleep(self.sleep_time)

    def decrease_load(self, wait=True):
        """
        Delete DeploymentConfig with its pods and the PVC. Then, wait for the
        IO to be stopped

        Args:
            wait (bool): True for waiting for IO to drop after the deletion
                of the FIO pod, False otherwise

        """
        dc_name = self.dc_objs[-1].name
        self.dc_objs[-1].delete()
        self.dc_objs[-1].ocp.wait_for_delete(dc_name)
        self.dc_objs.remove(self.dc_objs[-1])
        self.pvc_objs[-1].delete()
        self.pvc_objs[-1].ocp.wait_for_delete(self.pvc_objs[-1].name)
        self.pvc_objs.remove(self.pvc_objs[-1])
        if wait:
            logger.info(
                f"Waiting {self.sleep_time} seconds for IO to drop after the deletion of {dc_name}"
            )
            time.sleep(self.sleep_time)

    def reach_cluster_load_percentage(self):
        """
        Reach the cluster limit and then drop to the given target percentage.
        The number of pods needed for the desired target percentage is determined by
        creating pods one by one, while examining the cluster latency. Once the latency
        is greater than 250 ms and it is growing exponentially, it means that
        the cluster limit has been reached.
        Then, drop to the target percentage by deleting all pods and re-creating
        them with a smaller value of the FIO 'rate' param.
        This leaves just enough pods running IO to keep the cluster load
        around the desired percentage.

        """
        if not 0.1 < self.target_percentage < 0.95:
            logger.warning(
                f"The target percentage is {self.target_percentage * 100}% which is "
                f"not within the accepted range. Therefore, IO will not be started"
            )
            return
        low_diff_counter = 0
        limit_reached = False
        cluster_limit = None
        latency_vals = list()
        time_to_wait = 60 * 30
        time_before = time.time()

        current_iops = self.get_query(query=constants.IOPS_QUERY)

        msg = ("\n======================\nCurrent IOPS: {:.2f}"
               "\nPrevious IOPS: {:.2f}\n======================")

        # Creating FIO DeploymentConfig pods one by one, with a large value of the
        # FIO 'rate' arg, in order to determine the cluster limit faster.
        # Once determined, these pods will be deleted. Then, new FIO DC pods will be
        # created with a smaller 'rate' value, in order to reach the target
        # percentage more accurately
        rate = '250M'
        while not limit_reached:
            self.increase_load(rate=rate)
            previous_iops = current_iops
            current_iops = self.get_query(query=constants.IOPS_QUERY)
            if current_iops > previous_iops:
                cluster_limit = current_iops

            logger.info(
                msg.format(current_iops, previous_iops, len(self.dc_objs)))
            self.print_metrics()

            latency = self.calc_trim_metric_mean(
                metric=constants.LATENCY_QUERY) * 1000
            latency_vals.append(latency)
            logger.info(f"Latency values: {latency_vals}")

            if len(latency_vals) > 1 and latency > 250:
                # Checking for an exponential growth
                if latency > latency_vals[0] * 2**7:
                    logger.info("Latency exponential growth was detected")
                    limit_reached = True

            # In case the latency is greater than 3 seconds,
            # chances are the limit has been reached
            if latency > 3000:
                logger.info(f"Limit was determined by latency, which is "
                            f"higher than 3 seconds - {latency} ms")
                limit_reached = True

            # For clusters whose nodes do not meet the minimum resource
            # requirements, the cluster limit is reached while the latency
            # remains low. In that case, the cluster limit needs to be
            # determined by the following condition on the IOPS diff
            # between FIO pod creation iterations
            iops_diff = (current_iops / previous_iops * 100) - 100
            low_diff_counter += 1 if -15 < iops_diff < 10 else 0
            if low_diff_counter > 3:
                logger.warning(
                    f"Limit was determined by low IOPS diff between "
                    f"iterations - {iops_diff:.2f}%")
                limit_reached = True

            if time.time() > time_before + time_to_wait:
                logger.warning(
                    f"Could not determine the cluster IOPS limit within"
                    f"\nthe given {time_to_wait} seconds timeout. Breaking")
                limit_reached = True

            cluster_used_space = get_percent_used_capacity()
            if cluster_used_space > 60:
                logger.warning(
                    f"Cluster used space is {cluster_used_space}%. Could "
                    f"not reach the cluster IOPS limit before the "
                    f"used spaced reached 60%. Breaking")
                limit_reached = True

        self.cluster_limit = cluster_limit
        logger.info(
            f"\n===================================\nThe cluster IOPS limit "
            f"is {self.cluster_limit:.2f}\n==================================="
        )
        logger.info(
            f"Deleting all DC FIO pods that have FIO rate parameter of {rate}")
        while self.dc_objs:
            self.decrease_load(wait=False)

        # Creating the first pod with a small FIO 'rate' param, to speed up the process.
        # In the meantime, the load will drop, following the deletion of the
        # FIO pods with a large FIO 'rate' param
        rate = '15M'
        logger.info(
            f"Creating FIO pods with a rate parameter of {rate}, one by "
            f"one, until the target percentage is reached")
        self.increase_load(rate=rate)
        target_iops = self.cluster_limit * self.target_percentage
        current_iops = self.get_query(query=constants.IOPS_QUERY)
        logger.info(f"Target IOPS: {target_iops}")
        logger.info(f"Current IOPS: {current_iops}")

        while current_iops < target_iops * 0.95:
            wait = False if current_iops < target_iops / 2 else True
            self.increase_load(rate=rate, wait=wait)
            previous_iops = current_iops
            current_iops = self.get_query(query=constants.IOPS_QUERY)
            logger.info(
                msg.format(current_iops, previous_iops, len(self.dc_objs)))
            self.print_metrics()

        logger.info(
            f"\n========================================\n"
            f"The target load, of {self.target_percentage * 100}%, has been reached"
            f"\n==========================================")

    def get_query(self, query):
        """
        Get query from Prometheus and parse it

        Args:
            query (str): Query to be done

        Returns:
            float: the query result

        """
        now = datetime.now
        timestamp = datetime.timestamp
        return float(
            self.prometheus_api.query(query,
                                      str(timestamp(now())))[0]['value'][1])

    def calc_trim_metric_mean(self, metric=constants.LATENCY_QUERY, samples=5):
        """
        Get the trimmed mean of a given metric

        Args:
            metric (str): The metric to calculate the average result for
            samples (int): The number of samples to take

        Returns:
            float: The average result for the metric

        """
        vals = list()
        for i in range(samples):
            vals.append(round(self.get_query(metric), 5))
            if i == samples - 1:
                break
            time.sleep(5)
        return round(get_trim_mean(vals), 5)

    def get_metrics(self):
        """
        Get different cluster load and utilization metrics
        """
        return {
            "throughput":
            self.get_query(constants.THROUGHPUT_QUERY) *
            (constants.TP_CONVERSION.get(' B/s')),
            "latency":
            self.get_query(constants.LATENCY_QUERY) * 1000,
            "iops":
            self.get_query(constants.IOPS_QUERY),
            "used_space":
            self.get_query(constants.USED_SPACE_QUERY) / 1e+9
        }

    def print_metrics(self):
        """
        Print metrics

        """
        high_latency = 500
        metrics = self.get_metrics()
        limit_msg = ""
        pods_msg = ""
        if self.cluster_limit:
            limit_msg = (
                f"({metrics.get('iops') / self.cluster_limit * 100:.2f}% of the "
                f"{self.cluster_limit:.1f} limit)\n")
        if self.dc_objs:
            pods_msg = (f"\nNumber of pods running FIO: {len(self.dc_objs)}")
        logger.info(
            f"\n===============================\n"
            f"Cluster throughput: {metrics.get('throughput'):.2f} MB/s\n"
            f"Cluster latency: {metrics.get('latency'):.2f} ms\n"
            f"Cluster IOPS: {metrics.get('iops'):.2f}\n{limit_msg}"
            f"Cluster used space: {metrics.get('used_space'):.2f} GB{pods_msg}"
            f"\n===============================")
        if metrics.get('latency') > high_latency:
            logger.warning(
                f"Cluster latency is higher than {high_latency} ms!")