Пример #1
0
def test_rgw_unavailable(measure_stop_rgw):
    """
    Test that there is appropriate alert when RGW is unavailable and that
    this alert is cleared when the RGW interface is back online.

    Args:
        measure_stop_rgw: measurement result; its ``prometheus_alerts`` and
            ``stop`` entries are read via ``.get``.

    """
    api = prometheus.PrometheusAPI()

    # alerts recorded by the measurement
    recorded_alerts = measure_stop_rgw.get("prometheus_alerts")
    alert_label = constants.ALERT_CLUSTEROBJECTSTORESTATE
    expected_msg = (
        "Cluster Object Store is in unhealthy state for more than 15s. "
        "Please check Ceph cluster health or RGW connection."
    )
    expected_states = ["pending", "firing"]

    # the alert has to be present in the recorded data ...
    prometheus.check_alert_list(
        label=alert_label,
        msg=expected_msg,
        alerts=recorded_alerts,
        states=expected_states,
        severity="error",
    )
    # ... and has to be cleared relative to the end of the measurement
    measurement_end = measure_stop_rgw.get("stop")
    api.check_alert_cleared(label=alert_label, measure_end_time=measurement_end)
Пример #2
0
def test_ceph_monitor_stopped(measure_stop_ceph_mon):
    """
    Test that there is appropriate alert related to ceph monitor quorum
    when there is even number of ceph monitors and that this alert
    is cleared when monitors are back online.

    Args:
        measure_stop_ceph_mon: measurement result; its ``prometheus_alerts``
            and ``stop`` entries are read via ``.get``.

    """
    api = prometheus.PrometheusAPI()

    # alerts recorded by the measurement
    recorded_alerts = measure_stop_ceph_mon.get("prometheus_alerts")

    # (alert label, alert message, expected states, expected severity)
    expected_alerts = [
        (
            constants.ALERT_MONQUORUMATRISK,
            "Storage quorum at risk",
            ["pending"],
            "error",
        ),
        (
            constants.ALERT_CLUSTERWARNINGSTATE,
            "Storage cluster is in degraded state",
            ["pending"],
            "warning",
        ),
    ]

    for label, message, states, severity in expected_alerts:
        prometheus.check_alert_list(
            label=label,
            msg=message,
            alerts=recorded_alerts,
            states=states,
            severity=severity,
        )
        api.check_alert_cleared(
            label=label,
            measure_end_time=measure_stop_ceph_mon.get("stop"),
        )
Пример #3
0
def test_noobaa_bucket_quota(measure_noobaa_exceed_bucket_quota):
    """
    Test that there are appropriate alerts when NooBaa Bucket Quota is reached.

    Args:
        measure_noobaa_exceed_bucket_quota: measurement result; its
            ``prometheus_alerts`` and ``stop`` entries are read via ``.get``.

    """
    api = prometheus.PrometheusAPI()

    # the time to wait is increased because it takes more time for OCS
    # cluster to resolve its issues
    clear_wait = 480

    recorded_alerts = measure_noobaa_exceed_bucket_quota.get('prometheus_alerts')

    # (alert label, alert message, expected states, expected severity)
    expected_alerts = [
        (
            constants.ALERT_BUCKETREACHINGQUOTASTATE,
            'A NooBaa Bucket Is In Reaching Quota State',
            ['firing'],
            'warning',
        ),
        (
            constants.ALERT_BUCKETERRORSTATE,
            'A NooBaa Bucket Is In Error State',
            ['pending', 'firing'],
            'warning',
        ),
        (
            constants.ALERT_BUCKETEXCEEDINGQUOTASTATE,
            'A NooBaa Bucket Is In Exceeding Quota State',
            ['firing'],
            'warning',
        ),
    ]

    for label, message, states, severity in expected_alerts:
        prometheus.check_alert_list(
            label=label,
            msg=message,
            alerts=recorded_alerts,
            states=states,
            severity=severity,
        )
        api.check_alert_cleared(
            label=label,
            measure_end_time=measure_noobaa_exceed_bucket_quota.get('stop'),
            time_min=clear_wait,
        )
def test_ceph_monitor_stopped(workload_stop_ceph_mon):
    """
    Test that there is appropriate alert related to ceph monitor quorum
    when there is even number of ceph monitors and that this alert
    is cleared when monitors are back online.

    Args:
        workload_stop_ceph_mon: measurement result; its ``prometheus_alerts``
            and ``stop`` entries are read via ``.get``.

    """
    api = prometheus.PrometheusAPI()

    # alerts recorded by the measurement
    recorded_alerts = workload_stop_ceph_mon.get('prometheus_alerts')

    # (alert label, alert message, expected states, expected severity)
    expected_alerts = [
        (
            constants.ALERT_MONQUORUMATRISK,
            'Storage quorum at risk',
            ['pending'],
            'error',
        ),
        (
            constants.ALERT_CLUSTERWARNINGSTATE,
            'Storage cluster is in degraded state',
            ['pending', 'firing'],
            'warning',
        ),
    ]

    for label, message, states, severity in expected_alerts:
        prometheus.check_alert_list(
            label=label,
            msg=message,
            alerts=recorded_alerts,
            states=states,
            severity=severity,
        )
        api.check_alert_cleared(
            label=label,
            measure_end_time=workload_stop_ceph_mon.get('stop'),
        )
Пример #5
0
def test_corrupt_pg_alerts(measure_corrupt_pg):
    """
    Test that there are appropriate alerts when a placement group on one OSD
    is corrupted and that these alerts are checked for being cleared
    afterwards.

    Args:
        measure_corrupt_pg: measurement result; its ``prometheus_alerts``
            and ``stop`` entries are read via ``.get``.

    """
    api = prometheus.PrometheusAPI()

    # the time to wait is increased because it takes more time for Ceph
    # cluster to resolve its issues
    clear_wait = 360

    recorded_alerts = measure_corrupt_pg.get('prometheus_alerts')

    # (alert label, alert message, expected states, expected severity)
    expected_alerts = [
        (
            constants.ALERT_PGREPAIRTAKINGTOOLONG,
            'Self heal problems detected',
            ['pending'],
            'warning',
        ),
        (
            constants.ALERT_CLUSTERERRORSTATE,
            'Storage cluster is in error state',
            ['pending', 'firing'],
            'error',
        ),
    ]

    for label, message, states, severity in expected_alerts:
        prometheus.check_alert_list(
            label=label,
            msg=message,
            alerts=recorded_alerts,
            states=states,
            severity=severity,
        )
        api.check_alert_cleared(
            label=label,
            measure_end_time=measure_corrupt_pg.get('stop'),
            time_min=clear_wait,
        )
Пример #6
0
def test_ceph_osd_stopped(measure_stop_ceph_osd):
    """
    Test that there is appropriate alert related to situation when ceph osd
    is down. Alert is cleared when osd disk is back online.

    Args:
        measure_stop_ceph_osd: measurement result; its ``prometheus_alerts``
            and ``stop`` entries are read via ``.get``.

    """
    api = prometheus.PrometheusAPI()

    # the time to wait is increased because it takes more time for osd pod
    # to be ready than for other pods
    osd_ready_wait = 360

    # alerts recorded by the measurement
    recorded_alerts = measure_stop_ceph_osd.get('prometheus_alerts')

    # (alert label, alert message, expected states, expected severity,
    #  value passed as ``ignore_more_occurences``)
    expected_alerts = [
        (
            constants.ALERT_OSDDISKNOTRESPONDING,
            'Disk not responding',
            ['pending', 'firing'],
            'error',
            False,
        ),
        (
            constants.ALERT_DATARECOVERYTAKINGTOOLONG,
            'Data recovery is slow',
            ['pending'],
            'warning',
            True,
        ),
        (
            constants.ALERT_CLUSTERWARNINGSTATE,
            'Storage cluster is in degraded state',
            ['pending', 'firing'],
            'warning',
            False,
        ),
    ]

    for label, message, states, severity, ignore_extra in expected_alerts:
        prometheus.check_alert_list(
            label=label,
            msg=message,
            alerts=recorded_alerts,
            states=states,
            severity=severity,
            ignore_more_occurences=ignore_extra,
        )
        api.check_alert_cleared(
            label=label,
            measure_end_time=measure_stop_ceph_osd.get('stop'),
            time_min=osd_ready_wait,
        )
Пример #7
0
def test_rgw_unavailable(measure_stop_rgw):
    """
    Test that there is appropriate alert when RGW is unavailable and that
    this alert is cleared when the RGW interface is back online.

    The expected alert message depends on the configured OCS version.

    Args:
        measure_stop_rgw: measurement result; its ``prometheus_alerts`` and
            ``stop`` entries are read via ``.get``.

    """
    api = prometheus.PrometheusAPI()

    # alerts recorded by the measurement
    recorded_alerts = measure_stop_rgw.get("prometheus_alerts")
    alert_label = constants.ALERT_CLUSTEROBJECTSTORESTATE

    # The alert message is changed since OCS 4.7
    configured_version = config.ENV_DATA["ocs_version"]
    older_than_4_7 = Version.coerce(configured_version) < Version.coerce("4.7")
    if older_than_4_7:
        expected_msg = (
            "Cluster Object Store is in unhealthy state for more than 15s. "
            "Please check Ceph cluster health or RGW connection."
        )
    else:
        expected_msg = (
            "Cluster Object Store is in unhealthy state. "
            "Please check Ceph cluster health."
        )
    expected_states = ["pending", "firing"]

    prometheus.check_alert_list(
        label=alert_label,
        msg=expected_msg,
        alerts=recorded_alerts,
        states=expected_states,
        severity="error",
    )
    measurement_end = measure_stop_rgw.get("stop")
    api.check_alert_cleared(
        label=alert_label,
        measure_end_time=measurement_end,
        time_min=300,
    )
Пример #8
0
def test_rbd_capacity_workload_alerts(workload_storageutilization_95p_rbd):
    """
    Test that there are appropriate alerts when ceph cluster is utilized
    via RBD interface.

    Args:
        workload_storageutilization_95p_rbd: measurement result; its
            ``prometheus_alerts`` and ``stop`` entries are read via ``.get``.

    """
    api = prometheus.PrometheusAPI()
    measure_end_time = workload_storageutilization_95p_rbd.get("stop")

    # Check utilization on 95%
    recorded_alerts = workload_storageutilization_95p_rbd.get("prometheus_alerts")

    # the alert wording differs between OCS 4.2 and later versions
    is_ocs_4_2 = config.ENV_DATA.get("ocs_version") == "4.2"
    if is_ocs_4_2:
        near_full_msg = "Storage cluster is nearing full. Expansion is required."
        critically_full_msg = (
            "Storage cluster is critically full and needs immediate expansion"
        )
    else:
        # since OCS 4.3
        near_full_msg = (
            "Storage cluster is nearing full. Data deletion or cluster "
            "expansion is required."
        )
        critically_full_msg = (
            "Storage cluster is critically full and needs immediate data "
            "deletion or cluster expansion."
        )

    # the time to wait is increased because it takes more time for Ceph
    # cluster to delete all data
    clear_wait = 300

    # (alert label, alert message, expected states, expected severity)
    expected_alerts = [
        (
            constants.ALERT_CLUSTERNEARFULL,
            near_full_msg,
            ["pending", "firing"],
            "warning",
        ),
        (
            constants.ALERT_CLUSTERCRITICALLYFULL,
            critically_full_msg,
            ["pending", "firing"],
            "error",
        ),
    ]

    for label, message, states, severity in expected_alerts:
        prometheus.check_alert_list(
            label=label,
            msg=message,
            alerts=recorded_alerts,
            states=states,
            severity=severity,
            ignore_more_occurences=True,
        )
        api.check_alert_cleared(
            label=label,
            measure_end_time=measure_end_time,
            time_min=clear_wait,
        )
Пример #9
0
def test_capacity_workload_alerts(workload_storageutilization_95p_rbd,
                                  workload_storageutilization_95p_cephfs,
                                  interface):
    """
    Test that there are appropriate alerts when ceph cluster is utilized.

    Args:
        workload_storageutilization_95p_rbd: measurement result of the RBD
            workload; ``prometheus_alerts`` and ``stop`` are read via ``.get``.
        workload_storageutilization_95p_cephfs: measurement result of the
            CephFS workload; same entries are read.
        interface (str): selects which measurement provides the alerts,
            either ``'rbd'`` or ``'cephfs'``.

    Raises:
        ValueError: when ``interface`` is neither ``'rbd'`` nor ``'cephfs'``.

    """
    workloads_by_interface = {
        'rbd': workload_storageutilization_95p_rbd,
        'cephfs': workload_storageutilization_95p_cephfs,
    }
    # Fail fast with a clear message instead of hitting an unbound local
    # variable further down when an unexpected interface is passed in.
    if interface not in workloads_by_interface:
        raise ValueError(
            f"Unsupported interface {interface!r}, "
            f"expected one of {sorted(workloads_by_interface)}"
        )
    workload_storageutilization_95p = workloads_by_interface[interface]

    api = prometheus.PrometheusAPI()
    # the later of the two workloads' end times is used as the reference
    # point for the "alert cleared" check
    measure_end_time = max(
        workload_storageutilization_95p_rbd.get('stop'),
        workload_storageutilization_95p_cephfs.get('stop'),
    )

    # Check utilization on 95%
    alerts = workload_storageutilization_95p.get('prometheus_alerts')
    # TODO(fbalak): it seems that CephFS utilization triggers only firing
    # alerts. This needs to be more investigated.

    if config.ENV_DATA.get('ocs_version') == '4.2':
        nearfull_message = (
            'Storage cluster is nearing full. Expansion is required.')
        criticallyfull_message = (
            'Storage cluster is critically full and needs immediate expansion')
    else:
        # since OCS 4.3
        nearfull_message = (
            'Storage cluster is nearing full. Data deletion or cluster '
            'expansion is required.')
        criticallyfull_message = (
            'Storage cluster is critically full and needs immediate data '
            'deletion or cluster expansion.')

    # the time to wait is increased because it takes more time for Ceph
    # cluster to delete all data
    pg_wait = 300

    # (alert label, alert message, expected states, expected severity)
    expected_alerts = [
        (constants.ALERT_CLUSTERNEARFULL, nearfull_message,
         ['pending', 'firing'], 'warning'),
        (constants.ALERT_CLUSTERCRITICALLYFULL, criticallyfull_message,
         ['pending', 'firing'], 'error'),
    ]
    for target_label, target_msg, target_states, target_severity in (
        expected_alerts
    ):
        prometheus.check_alert_list(label=target_label,
                                    msg=target_msg,
                                    alerts=alerts,
                                    states=target_states,
                                    severity=target_severity,
                                    ignore_more_occurences=True)
        api.check_alert_cleared(label=target_label,
                                measure_end_time=measure_end_time,
                                time_min=pg_wait)
Пример #10
0
def test_ceph_health(measure_stop_ceph_mon, measure_corrupt_pg):
    """
    Test that there are appropriate alerts for Ceph health triggered.

    The Ceph Warning state alert is checked against the
    ``measure_stop_ceph_mon`` measurement and the Ceph Error state alert
    against the ``measure_corrupt_pg`` measurement.

    Args:
        measure_stop_ceph_mon: measurement result; its ``prometheus_alerts``
            and ``stop`` entries are read via ``.get``.
        measure_corrupt_pg: measurement result; same entries are read.

    """
    api = prometheus.PrometheusAPI()

    # (measurement, alert label, alert message, expected severity,
    #  extra keyword arguments for the "alert cleared" check)
    health_checks = [
        (
            measure_stop_ceph_mon,
            constants.ALERT_CLUSTERWARNINGSTATE,
            "Storage cluster is in degraded state",
            "warning",
            {},
        ),
        (
            measure_corrupt_pg,
            constants.ALERT_CLUSTERERRORSTATE,
            "Storage cluster is in error state",
            "error",
            # the time to wait is increased because it takes more time for
            # Ceph cluster to resolve its issues
            {"time_min": 360},
        ),
    ]

    for measurement, label, message, severity, clear_options in health_checks:
        prometheus.check_alert_list(
            label=label,
            msg=message,
            alerts=measurement.get("prometheus_alerts"),
            states=["pending", "firing"],
            severity=severity,
        )
        api.check_alert_cleared(
            label=label,
            measure_end_time=measurement.get("stop"),
            **clear_options,
        )
Пример #11
0
def test_noobaa_ns_bucket(measure_noobaa_ns_target_bucket_deleted):
    """
    Test that there are appropriate alerts when the target bucket of a
    namespace store used in a namespace bucket is deleted.

    Args:
        measure_noobaa_ns_target_bucket_deleted: measurement result; its
            ``prometheus_alerts`` and ``stop`` entries are read via ``.get``.

    """
    api = prometheus.PrometheusAPI()

    # the time to wait is increased because it takes more time for NooBaa
    # to clear the alert
    clear_wait = 600

    recorded_alerts = measure_noobaa_ns_target_bucket_deleted.get(
        "prometheus_alerts"
    )

    # (alert label, alert message) - both alerts share states and severity
    namespace_alerts = [
        (
            constants.ALERT_NAMESPACEBUCKETERRORSTATE,
            "A NooBaa Namespace Bucket Is In Error State",
        ),
        (
            constants.ALERT_NAMESPACERESOURCEERRORSTATE,
            "A NooBaa Namespace Resource Is In Error State",
        ),
    ]

    for label, message in namespace_alerts:
        prometheus.check_alert_list(
            label=label,
            msg=message,
            alerts=recorded_alerts,
            states=["pending", "firing"],
            severity="warning",
        )
        measurement_end = measure_noobaa_ns_target_bucket_deleted.get("stop")
        api.check_alert_cleared(
            label=label,
            measure_end_time=measurement_end,
            time_min=clear_wait,
        )
Пример #12
0
def test_ceph_manager_stopped(measure_stop_ceph_mgr):
    """
    Test that there is appropriate alert when ceph manager
    is unavailable and that this alert is cleared when the manager
    is back online.

    Args:
        measure_stop_ceph_mgr: measurement result; its ``prometheus_alerts``
            and ``stop`` entries are read via ``.get``.

    """
    api = prometheus.PrometheusAPI()

    # get alerts from time when manager deployment was scaled down
    recorded_alerts = measure_stop_ceph_mgr.get('prometheus_alerts')
    alert_label = constants.ALERT_MGRISABSENT
    expected_msg = 'Storage metrics collector service not available anymore.'
    expected_states = ['pending', 'firing']

    prometheus.check_alert_list(
        label=alert_label,
        msg=expected_msg,
        alerts=recorded_alerts,
        states=expected_states,
        severity='critical',
    )
    measurement_end = measure_stop_ceph_mgr.get('stop')
    api.check_alert_cleared(
        label=alert_label,
        measure_end_time=measurement_end,
    )
Пример #13
0
def test_ceph_mons_quorum_lost(measure_stop_ceph_mon):
    """
    Test to verify that CephMonQuorumLost alert is seen and
    that this alert is cleared when monitors are back online.

    Args:
        measure_stop_ceph_mon: measurement result; its ``prometheus_alerts``
            and ``stop`` entries are read via ``.get``.

    """
    api = prometheus.PrometheusAPI()

    # alerts recorded by the measurement
    recorded_alerts = measure_stop_ceph_mon.get("prometheus_alerts")
    alert_label = constants.ALERT_MONQUORUMLOST
    expected_msg = "Storage quorum is lost"
    expected_states = ["pending", "firing"]

    prometheus.check_alert_list(
        label=alert_label,
        msg=expected_msg,
        alerts=recorded_alerts,
        states=expected_states,
        severity="critical",
    )
    measurement_end = measure_stop_ceph_mon.get("stop")
    api.check_alert_cleared(
        label=alert_label,
        measure_end_time=measurement_end,
    )
    def test_pvc_expansion_when_full(self):
        """
        Verify PVC expansion when the PVC is 100% utilized.
        Verify utilization alert will stop firing after volume expansion.

        Flow:
            1. Run fio on every pod to fill its PVC and assert that the
               mount point reports 100% used space.
            2. Sample Prometheus alerts until both utilization alerts are
               firing for every PVC.
            3. Expand every PVC and sample the alerts again until neither
               utilization alert is firing for any PVC.
            4. Run fio once more and assert that it reports no errors.

        The per-PVC progress is tracked in the ``near_full_alert`` and
        ``critical_alert`` attributes set on the PVC objects.

        """
        pvc_size_expanded = 10

        # One entry per utilization alert:
        # (flag attribute kept on the PVC object, alert label, severity,
        #  alert message template)
        utilization_alerts = (
            (
                "near_full_alert",
                "PersistentVolumeUsageNearFull",
                "warning",
                "PVC {name} is nearing full. Data deletion or PVC expansion "
                "is required.",
            ),
            (
                "critical_alert",
                "PersistentVolumeUsageCritical",
                "error",
                "PVC {name} is critically full. Data deletion or PVC "
                "expansion is required.",
            ),
        )

        def alerts_of_pvc(alerts, pvc_obj):
            """Return the alerts whose PVC label matches the PVC name."""
            return [
                alert for alert in alerts
                if alert.get("labels", {}).get("persistentvolumeclaim")
                == pvc_obj.name
            ]

        def is_firing(alerts_pvc, label, msg, severity):
            """Return True when check_alert_list accepts the firing alert."""
            try:
                check_alert_list(
                    label=label,
                    msg=msg,
                    alerts=alerts_pvc,
                    states=["firing"],
                    severity=severity,
                )
            except AssertionError:
                return False
            return True

        # Run IO to utilise 100% of volume
        log.info("Run IO on all to utilise 100% of PVCs")
        for pod_obj in self.pods:
            pod_obj.run_io(
                "fs",
                size=f"{self.pvc_size}G",
                io_direction="write",
                runtime=30,
                rate="100M",
                fio_filename=f"{pod_obj.name}_f1",
            )
        log.info("Started IO on all to utilise 100% of PVCs")

        # Wait for IO to finish
        log.info("Wait for IO to finish on pods")
        for pod_obj in self.pods:
            try:
                pod_obj.get_fio_results()
            except CommandFailed as cfe:
                # Running out of space is the expected outcome here; any
                # other failure is a real error.
                if "No space left on device" not in str(cfe):
                    raise
            log.info(f"IO finished on pod {pod_obj.name}")
            # Verify used space on pod is 100%
            used_space = get_used_space_on_mount_point(pod_obj)
            assert used_space == "100%", (
                f"The used space on pod {pod_obj.name} is not 100% "
                f"but {used_space}")
            log.info(f"Verified: Used space on pod {pod_obj.name} is 100%")

        prometheus_api = PrometheusAPI()

        # Wait till utilization alerts starts
        # NOTE(review): the loop only ends via ``break``; presumably
        # TimeoutSampler raises once its timeout expires - confirm.
        for response in TimeoutSampler(140, 5, prometheus_api.get, "alerts"):
            alerts = response.json()["data"]["alerts"]
            for pvc_obj in self.pvcs:
                alerts_pvc = alerts_of_pvc(alerts, pvc_obj)
                # At least 2 alerts should be present; otherwise stop
                # scanning this sample and retry with the next one
                if len(alerts_pvc) < 2:
                    break

                for flag, label, severity, msg_template in utilization_alerts:
                    # this alert was already seen firing for this PVC
                    if getattr(pvc_obj, flag, False):
                        continue
                    log.info(
                        f"Checking '{label}' alert "
                        f"for PVC {pvc_obj.name}")
                    expected_msg = msg_template.format(name=pvc_obj.name)
                    if is_firing(alerts_pvc, label, expected_msg, severity):
                        setattr(pvc_obj, flag, True)
                    else:
                        log.info(
                            f"'{label}' alert not "
                            f"started firing for PVC {pvc_obj.name}")

            # Done once every alert is firing for every PVC
            all_firing = all(
                getattr(pvc_ob, flag, False)
                for pvc_ob in self.pvcs
                for flag, _, _, _ in utilization_alerts
            )
            if all_firing:
                log.info("'PersistentVolumeUsageNearFull' and "
                         "'PersistentVolumeUsageCritical' alerts are firing "
                         "for all PVCs.")
                break

        log.info("Expanding PVCs.")
        for pvc_obj in self.pvcs:
            log.info(f"Expanding size of PVC {pvc_obj.name} to "
                     f"{pvc_size_expanded}Gi")
            pvc_obj.resize_pvc(pvc_size_expanded, True)
        log.info(f"All PVCs are expanded to {pvc_size_expanded}Gi")

        # Verify utilization alerts are stopped
        for response in TimeoutSampler(140, 5, prometheus_api.get, "alerts"):
            alerts = response.json()["data"]["alerts"]
            for pvc_obj in self.pvcs:
                alerts_pvc = alerts_of_pvc(alerts, pvc_obj)
                if not alerts_pvc:
                    # no alert at all for this PVC: everything is cleared
                    for flag, _, _, _ in utilization_alerts:
                        setattr(pvc_obj, flag, False)
                    continue

                for flag, label, severity, msg_template in utilization_alerts:
                    # this alert is already known to be cleared for this PVC
                    if not getattr(pvc_obj, flag, False):
                        continue
                    log.info(
                        f"Checking '{label}' alert "
                        f"is cleared for PVC {pvc_obj.name}")
                    expected_msg = msg_template.format(name=pvc_obj.name)
                    if is_firing(alerts_pvc, label, expected_msg, severity):
                        log.info(
                            f"'{label}' alert is not "
                            f"stopped for PVC {pvc_obj.name}")
                    else:
                        setattr(pvc_obj, flag, False)
                        log.info(
                            f"'{label}' alert stopped "
                            f"firing for PVC {pvc_obj.name}")

            # Done once no alert is firing for any PVC
            any_firing = any(
                getattr(pvc_ob, flag, False)
                for pvc_ob in self.pvcs
                for flag, _, _, _ in utilization_alerts
            )
            if not any_firing:
                log.info(
                    "'PersistentVolumeUsageNearFull' and "
                    "'PersistentVolumeUsageCritical' alerts are cleared for "
                    "all PVCs.")
                break

        # Run IO to verify the expanded capacity can be utilized
        log.info("Run IO after PVC expansion.")
        for pod_obj in self.pods:
            pod_obj.run_io(
                "fs",
                size="3G",
                io_direction="write",
                runtime=60,
                fio_filename=f"{pod_obj.name}_f2",
            )

        # Wait for IO to complete
        log.info("Waiting for IO to complete on pods.")
        for pod_obj in self.pods:
            fio_result = pod_obj.get_fio_results()
            err_count = fio_result.get("jobs")[0].get("error")
            assert (
                err_count == 0
            ), f"IO error on pod {pod_obj.name}. FIO result: {fio_result}"
            log.info(f"Verified IO on pod {pod_obj.name} after expanding PVC.")
Пример #15
0
def test_noobaa_bucket_quota(measure_noobaa_exceed_bucket_quota):
    """
    Check that the expected NooBaa bucket quota alerts are present among the
    alerts collected by the measurement fixture and that each of them is
    cleared afterwards.
    """
    api = prometheus.PrometheusAPI()

    alerts = measure_noobaa_exceed_bucket_quota.get("prometheus_alerts")

    # since version 4.5 all NooBaa alerts have defined Pending state; before
    # that the two quota alerts are expected only in the firing state
    if version.get_semantic_ocs_version_from_config() < version.VERSION_4_5:
        quota_states = ("firing",)
    else:
        quota_states = ("pending", "firing")

    # every entry carries the keyword arguments for check_alert_list; each
    # entry gets its own list of states
    expected_alerts = [
        {
            "label": constants.ALERT_BUCKETREACHINGQUOTASTATE,
            "msg": "A NooBaa Bucket Is In Reaching Quota State",
            "states": list(quota_states),
            "severity": "warning",
        },
        {
            "label": constants.ALERT_BUCKETERRORSTATE,
            "msg": "A NooBaa Bucket Is In Error State",
            "states": ["pending", "firing"],
            "severity": "warning",
        },
        {
            "label": constants.ALERT_BUCKETEXCEEDINGQUOTASTATE,
            "msg": "A NooBaa Bucket Is In Exceeding Quota State",
            "states": list(quota_states),
            "severity": "warning",
        },
    ]

    # the time to wait is increased because it takes more time for OCS
    # cluster to resolve its issues
    clear_wait = 480

    for expected in expected_alerts:
        prometheus.check_alert_list(alerts=alerts, **expected)
        api.check_alert_cleared(
            label=expected["label"],
            measure_end_time=measure_noobaa_exceed_bucket_quota.get("stop"),
            time_min=clear_wait,
        )
Пример #16
0
    def test_clone_when_full(self, pvc_clone_factory, pod_factory):
        """
        Create a clone from an existing PVC when the PVC is 100% utilized.
        Verify data integrity.
        Verify utilization alert in cloned PVC.
        Expand cloned PVC and ensure utilization alerts are stopped.

        Args:
            pvc_clone_factory: callable invoked with each PVC object from
                self.pvcs; returns the cloned PVC object
            pod_factory: callable invoked with 'interface', 'pvc' and
                'status' keyword arguments; returns the pod object created
                for a cloned PVC

        Raises:
            AssertionError: if the used space on a pod is not 100% after
                the initial IO

        """
        pvc_size_expanded = 6
        file_name = "fio_full"
        prometheus_api = PrometheusAPI()

        # Utilization alerts in the order in which they are checked:
        # (flag attribute kept on the PVC object, alert label,
        #  alert message template taking the PVC name, alert severity)
        utilization_alerts = (
            (
                "near_full_alert",
                "PersistentVolumeUsageNearFull",
                "PVC {} is nearing full. Data "
                "deletion or PVC expansion is required.",
                "warning",
            ),
            (
                "critical_alert",
                "PersistentVolumeUsageCritical",
                "PVC {} is critically full. Data "
                "deletion or PVC expansion is required.",
                "error",
            ),
        )

        def alerts_of_pvc(alerts, pvc_obj):
            # Select the alerts labelled with the name of the given PVC
            return [
                alert for alert in alerts
                if alert.get("labels", {}).get("persistentvolumeclaim")
                == pvc_obj.name
            ]

        def is_firing(alerts_pvc, label, msg, severity):
            # Report whether check_alert_list accepts the alert as firing;
            # its AssertionError is interpreted as "not firing"
            try:
                check_alert_list(
                    label=label,
                    msg=msg,
                    alerts=alerts_pvc,
                    states=["firing"],
                    severity=severity,
                )
            except AssertionError:
                return False
            return True

        # Run IO to utilize 100% of volume
        log.info("Run IO on all pods to utilise 100% of PVCs")
        for pod_obj in self.pods:
            # Get available free space in M
            df_avail_size = pod_obj.exec_cmd_on_pod(
                command=f"df {pod_obj.get_storage_path()} -B M --output=avail"
            )
            # Get the numeral value of available space. eg: 3070 from '3070M'
            # (the second whitespace separated token, without its last char)
            available_size = int(df_avail_size.strip().split()[1][0:-1])
            pod_obj.run_io(
                "fs",
                size=f"{available_size - 2}M",
                runtime=20,
                rate="100M",
                fio_filename=file_name,
                end_fsync=1,
            )
        log.info("Started IO on all pods to utilise 100% of PVCs")

        # Wait for IO to finish
        log.info("Wait for IO to finish on pods")
        for pod_obj in self.pods:
            pod_obj.get_fio_results()
            log.info(f"IO finished on pod {pod_obj.name}")

            # Verify used space on pod is 100%
            used_space = pod.get_used_space_on_mount_point(pod_obj)
            assert used_space == "100%", (
                f"The used space on pod {pod_obj.name} is not 100% "
                f"but {used_space}"
            )
            log.info(f"Verified: Used space on pod {pod_obj.name} is 100%")
            # Calculate md5sum of the file; it is stored on the parent PVC
            # object so that it can be compared with the clone later
            pod_obj.pvc.md5sum = pod.cal_md5sum(pod_obj, file_name)

        log.info("Creating clone of the PVCs")
        cloned_pvcs = [pvc_clone_factory(pvc_obj) for pvc_obj in self.pvcs]
        log.info("Created clone of the PVCs. Cloned PVCs are Bound")

        # Attach the cloned PVCs to pods
        log.info("Attach the cloned PVCs to pods")
        clone_pod_objs = []
        for clone_pvc_obj in cloned_pvcs:
            # The interface is derived from the storage class of the clone
            if constants.CEPHFS_INTERFACE in clone_pvc_obj.backed_sc:
                interface = constants.CEPHFILESYSTEM
            else:
                interface = constants.CEPHBLOCKPOOL
            clone_pod_obj = pod_factory(
                interface=interface, pvc=clone_pvc_obj, status=""
            )
            log.info(
                f"Attached the PVC {clone_pvc_obj.name} to pod "
                f"{clone_pod_obj.name}"
            )
            clone_pod_objs.append(clone_pod_obj)

        # Verify the new pods are running
        log.info("Verify the new pods are running")
        for pod_obj in clone_pod_objs:
            wait_for_resource_state(pod_obj, constants.STATUS_RUNNING)
        log.info("Verified: New pods are running")

        # Verify that the md5sum matches
        for pod_obj in clone_pod_objs:
            log.info(
                f"Verifying md5sum of {file_name} on pod {pod_obj.name}"
            )
            pod.verify_data_integrity(
                pod_obj, file_name, pod_obj.pvc.parent.md5sum
            )
            log.info(
                f"Verified: md5sum of {file_name} on pod {pod_obj.name} "
                f"matches with the original md5sum"
            )

        # Wait till utilization alerts starts
        # NOTE(review): this relies on TimeoutSampler raising once its
        # timeout expires; otherwise the "Verified" message below would be
        # logged even if the alerts never fired - confirm
        for response in TimeoutSampler(180, 5, prometheus_api.get, "alerts"):
            alerts = response.json()["data"]["alerts"]
            for pvc_obj in cloned_pvcs:
                alerts_pvc = alerts_of_pvc(alerts, pvc_obj)
                # At least 2 alerts should be present. The remaining PVCs
                # are skipped for this sample and checked with the next one
                if len(alerts_pvc) < 2:
                    break

                # Verify the alerts are firing. A flag which was already
                # set by a previous sample is not checked again
                for flag, label, msg_template, severity in utilization_alerts:
                    if getattr(pvc_obj, flag, False):
                        continue
                    log.info(
                        f"Checking '{label}' alert for PVC {pvc_obj.name}"
                    )
                    alert_msg = msg_template.format(pvc_obj.name)
                    if is_firing(alerts_pvc, label, alert_msg, severity):
                        setattr(pvc_obj, flag, True)
                    else:
                        log.info(
                            f"'{label}' alert not "
                            f"started firing for PVC {pvc_obj.name}"
                        )

            # Stop sampling when every alert was seen for every cloned PVC
            if all(
                getattr(pvc_ob, flag, False)
                for pvc_ob in cloned_pvcs
                for flag, _, _, _ in utilization_alerts
            ):
                log.info(
                    "'PersistentVolumeUsageNearFull' and "
                    "'PersistentVolumeUsageCritical' alerts are firing "
                    "for all cloned PVCs."
                )
                break
        log.info("Verified: Utilization alerts are firing")

        log.info("Expanding cloned PVCs.")
        for pvc_obj in cloned_pvcs:
            log.info(
                f"Expanding size of PVC {pvc_obj.name} to "
                f"{pvc_size_expanded}Gi"
            )
            # Expand PVC
            pvc_obj.resize_pvc(pvc_size_expanded, True)

        # Verify utilization alerts are stopped
        for response in TimeoutSampler(180, 5, prometheus_api.get, "alerts"):
            alerts = response.json()["data"]["alerts"]
            for pvc_obj in cloned_pvcs:
                alerts_pvc = alerts_of_pvc(alerts, pvc_obj)
                if not alerts_pvc:
                    # No alert at all is reported for this PVC any more
                    for flag, _, _, _ in utilization_alerts:
                        setattr(pvc_obj, flag, False)
                    continue

                # Verify the alerts which were firing stopped firing
                for flag, label, msg_template, severity in utilization_alerts:
                    if not getattr(pvc_obj, flag):
                        continue
                    log.info(
                        f"Checking '{label}' alert "
                        f"is cleared for PVC {pvc_obj.name}"
                    )
                    alert_msg = msg_template.format(pvc_obj.name)
                    if is_firing(alerts_pvc, label, alert_msg, severity):
                        log.info(
                            f"'{label}' alert is not "
                            f"stopped for PVC {pvc_obj.name}"
                        )
                    else:
                        setattr(pvc_obj, flag, False)
                        log.info(
                            f"'{label}' alert stopped "
                            f"firing for PVC {pvc_obj.name}"
                        )

            # Stop sampling when no alert is firing for any cloned PVC
            if not any(
                getattr(pvc_ob, flag)
                for pvc_ob in cloned_pvcs
                for flag, _, _, _ in utilization_alerts
            ):
                log.info(
                    "'PersistentVolumeUsageNearFull' and "
                    "'PersistentVolumeUsageCritical' alerts are cleared for "
                    "all cloned PVCs."
                )
                break

        log.info("Verified: Utilization alerts stopped firing")