def test_rgw_unavailable(measure_stop_rgw):
    """
    Test that there is appropriate alert when RGW is unavailable and
    that this alert is cleared when the RGW interface is back online.
    """
    api = prometheus.PrometheusAPI()

    # get alerts from time when RGW deployment was scaled down
    alerts = measure_stop_rgw.get("prometheus_alerts")
    target_label = constants.ALERT_CLUSTEROBJECTSTORESTATE
    target_msg = (
        "Cluster Object Store is in unhealthy state for more than 15s. "
        "Please check Ceph cluster health or RGW connection."
    )
    states = ["pending", "firing"]
    prometheus.check_alert_list(
        label=target_label,
        msg=target_msg,
        alerts=alerts,
        states=states,
        severity="error",
    )
    api.check_alert_cleared(
        label=target_label, measure_end_time=measure_stop_rgw.get("stop")
    )
def test_ceph_monitor_stopped(measure_stop_ceph_mon):
    """
    Test that there is appropriate alert related to ceph monitor quorum
    when there is even number of ceph monitors and that this alert
    is cleared when monitors are back online.
    """
    api = prometheus.PrometheusAPI()

    # get alerts from time when monitor deployment was scaled down
    alerts = measure_stop_ceph_mon.get("prometheus_alerts")
    for target_label, target_msg, target_states, target_severity in [
        (
            constants.ALERT_MONQUORUMATRISK,
            "Storage quorum at risk",
            ["pending"],
            "error",
        ),
        (
            constants.ALERT_CLUSTERWARNINGSTATE,
            "Storage cluster is in degraded state",
            ["pending"],
            "warning",
        ),
    ]:
        prometheus.check_alert_list(
            label=target_label,
            msg=target_msg,
            alerts=alerts,
            states=target_states,
            severity=target_severity,
        )
        api.check_alert_cleared(
            label=target_label, measure_end_time=measure_stop_ceph_mon.get("stop")
        )
def test_noobaa_bucket_quota(measure_noobaa_exceed_bucket_quota):
    """
    Test that there are appropriate alerts when NooBaa Bucket Quota is reached.
    """
    api = prometheus.PrometheusAPI()
    alerts = measure_noobaa_exceed_bucket_quota.get('prometheus_alerts')
    for target_label, target_msg, target_states, target_severity in [
        (
            constants.ALERT_BUCKETREACHINGQUOTASTATE,
            'A NooBaa Bucket Is In Reaching Quota State',
            ['firing'],
            'warning'
        ),
        (
            constants.ALERT_BUCKETERRORSTATE,
            'A NooBaa Bucket Is In Error State',
            ['pending', 'firing'],
            'warning'
        ),
        (
            constants.ALERT_BUCKETEXCEEDINGQUOTASTATE,
            'A NooBaa Bucket Is In Exceeding Quota State',
            ['firing'],
            'warning'
        )
    ]:
        prometheus.check_alert_list(
            label=target_label,
            msg=target_msg,
            alerts=alerts,
            states=target_states,
            severity=target_severity
        )
        # the time to wait is increased because it takes more time for OCS
        # cluster to resolve its issues
        pg_wait = 480
        api.check_alert_cleared(
            label=target_label,
            measure_end_time=measure_noobaa_exceed_bucket_quota.get('stop'),
            time_min=pg_wait
        )
def test_ceph_monitor_stopped(workload_stop_ceph_mon):
    """
    Test that there is appropriate alert related to ceph monitor quorum
    when there is even number of ceph monitors and that this alert
    is cleared when monitors are back online.
    """
    api = prometheus.PrometheusAPI()

    # get alerts from time when monitor deployment was scaled down
    alerts = workload_stop_ceph_mon.get('prometheus_alerts')
    for target_label, target_msg, target_states, target_severity in [
        (
            constants.ALERT_MONQUORUMATRISK,
            'Storage quorum at risk',
            ['pending'],
            'error'
        ),
        (
            constants.ALERT_CLUSTERWARNINGSTATE,
            'Storage cluster is in degraded state',
            ['pending', 'firing'],
            'warning'
        )
    ]:
        prometheus.check_alert_list(
            label=target_label,
            msg=target_msg,
            alerts=alerts,
            states=target_states,
            severity=target_severity
        )
        api.check_alert_cleared(
            label=target_label,
            measure_end_time=workload_stop_ceph_mon.get('stop')
        )
def test_corrupt_pg_alerts(measure_corrupt_pg):
    """
    Test that there are appropriate alerts when a placement group on one OSD
    is corrupted and that these alerts are cleared when the issue is resolved.
    """
    api = prometheus.PrometheusAPI()
    alerts = measure_corrupt_pg.get('prometheus_alerts')
    for target_label, target_msg, target_states, target_severity in [
        (
            constants.ALERT_PGREPAIRTAKINGTOOLONG,
            'Self heal problems detected',
            ['pending'],
            'warning'
        ),
        (
            constants.ALERT_CLUSTERERRORSTATE,
            'Storage cluster is in error state',
            ['pending', 'firing'],
            'error'
        )
    ]:
        prometheus.check_alert_list(
            label=target_label,
            msg=target_msg,
            alerts=alerts,
            states=target_states,
            severity=target_severity
        )
        # the time to wait is increased because it takes more time for Ceph
        # cluster to resolve its issues
        pg_wait = 360
        api.check_alert_cleared(
            label=target_label,
            measure_end_time=measure_corrupt_pg.get('stop'),
            time_min=pg_wait
        )
def test_ceph_osd_stopped(measure_stop_ceph_osd):
    """
    Test that there is appropriate alert related to situation when ceph osd
    is down. Alert is cleared when osd disk is back online.
    """
    api = prometheus.PrometheusAPI()

    # get alerts from time when OSD deployment was scaled down
    alerts = measure_stop_ceph_osd.get('prometheus_alerts')
    for target_label, target_msg, target_states, target_severity, ignore in [
        (
            constants.ALERT_OSDDISKNOTRESPONDING,
            'Disk not responding',
            ['pending', 'firing'],
            'error',
            False
        ),
        (
            constants.ALERT_DATARECOVERYTAKINGTOOLONG,
            'Data recovery is slow',
            ['pending'],
            'warning',
            True
        ),
        (
            constants.ALERT_CLUSTERWARNINGSTATE,
            'Storage cluster is in degraded state',
            ['pending', 'firing'],
            'warning',
            False
        )
    ]:
        prometheus.check_alert_list(
            label=target_label,
            msg=target_msg,
            alerts=alerts,
            states=target_states,
            severity=target_severity,
            ignore_more_occurences=ignore
        )
        # the time to wait is increased because it takes more time for osd pod
        # to be ready than for other pods
        osd_up_wait = 360
        api.check_alert_cleared(
            label=target_label,
            measure_end_time=measure_stop_ceph_osd.get('stop'),
            time_min=osd_up_wait
        )
def test_rgw_unavailable(measure_stop_rgw):
    """
    Test that there is appropriate alert when RGW is unavailable and
    that this alert is cleared when the RGW interface is back online.
    """
    api = prometheus.PrometheusAPI()

    # get alerts from time when RGW deployment was scaled down
    alerts = measure_stop_rgw.get("prometheus_alerts")
    target_label = constants.ALERT_CLUSTEROBJECTSTORESTATE
    # The alert message was changed in OCS 4.7
    ocs_version = config.ENV_DATA["ocs_version"]
    if Version.coerce(ocs_version) < Version.coerce("4.7"):
        target_msg = (
            "Cluster Object Store is in unhealthy state for more than 15s. "
            "Please check Ceph cluster health or RGW connection."
        )
    else:
        target_msg = (
            "Cluster Object Store is in unhealthy state. "
            "Please check Ceph cluster health."
        )
    states = ["pending", "firing"]
    prometheus.check_alert_list(
        label=target_label,
        msg=target_msg,
        alerts=alerts,
        states=states,
        severity="error",
    )
    api.check_alert_cleared(
        label=target_label,
        measure_end_time=measure_stop_rgw.get("stop"),
        time_min=300,
    )
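# Note on the version comparison above: a plain string comparison would order
# "4.10" before "4.7", so the test compares semantic version objects instead.
# The minimal, self-contained sketch below illustrates this, assuming `Version`
# is semantic_version.Version (which provides `coerce()` for partial version
# strings such as "4.7"); it is illustrative only and not part of the test suite.
from semantic_version import Version

# String comparison gets the ordering wrong for double-digit minor versions...
assert "4.10" < "4.7"
# ...while coerced semantic versions compare numerically, as the test expects.
assert Version.coerce("4.10") > Version.coerce("4.7")
assert Version.coerce("4.6") < Version.coerce("4.7")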
def test_rbd_capacity_workload_alerts(workload_storageutilization_95p_rbd):
    """
    Test that there are appropriate alerts when ceph cluster is utilized
    via RBD interface.
    """
    api = prometheus.PrometheusAPI()
    measure_end_time = workload_storageutilization_95p_rbd.get("stop")

    # Check utilization on 95%
    alerts = workload_storageutilization_95p_rbd.get("prometheus_alerts")
    if config.ENV_DATA.get("ocs_version") == "4.2":
        nearfull_message = "Storage cluster is nearing full. Expansion is required."
        criticallyfull_message = (
            "Storage cluster is critically full and needs immediate expansion"
        )
    else:
        # since OCS 4.3
        nearfull_message = (
            "Storage cluster is nearing full. Data deletion or cluster "
            "expansion is required."
        )
        criticallyfull_message = (
            "Storage cluster is critically full and needs immediate data "
            "deletion or cluster expansion."
        )
    for target_label, target_msg, target_states, target_severity in [
        (
            constants.ALERT_CLUSTERNEARFULL,
            nearfull_message,
            ["pending", "firing"],
            "warning",
        ),
        (
            constants.ALERT_CLUSTERCRITICALLYFULL,
            criticallyfull_message,
            ["pending", "firing"],
            "error",
        ),
    ]:
        prometheus.check_alert_list(
            label=target_label,
            msg=target_msg,
            alerts=alerts,
            states=target_states,
            severity=target_severity,
            ignore_more_occurences=True,
        )
        # the time to wait is increased because it takes more time for Ceph
        # cluster to delete all data
        pg_wait = 300
        api.check_alert_cleared(
            label=target_label, measure_end_time=measure_end_time, time_min=pg_wait
        )
def test_capacity_workload_alerts(
    workload_storageutilization_95p_rbd,
    workload_storageutilization_95p_cephfs,
    interface
):
    """
    Test that there are appropriate alerts when ceph cluster is utilized.
    """
    api = prometheus.PrometheusAPI()
    measure_end_time = max([
        workload_storageutilization_95p_rbd.get('stop'),
        workload_storageutilization_95p_cephfs.get('stop'),
    ])
    if interface == 'rbd':
        workload_storageutilization_95p = workload_storageutilization_95p_rbd
    elif interface == 'cephfs':
        workload_storageutilization_95p = workload_storageutilization_95p_cephfs

    # Check utilization on 95%
    alerts = workload_storageutilization_95p.get('prometheus_alerts')

    # TODO(fbalak): it seems that CephFS utilization triggers only firing
    # alerts. This needs to be more investigated.
    if config.ENV_DATA.get('ocs_version') == '4.2':
        nearfull_message = (
            'Storage cluster is nearing full. Expansion is required.')
        criticallyfull_message = (
            'Storage cluster is critically full and needs immediate expansion')
    else:
        # since OCS 4.3
        nearfull_message = (
            'Storage cluster is nearing full. Data deletion or cluster '
            'expansion is required.')
        criticallyfull_message = (
            'Storage cluster is critically full and needs immediate data '
            'deletion or cluster expansion.')

    for target_label, target_msg, target_states, target_severity in [
        (
            constants.ALERT_CLUSTERNEARFULL,
            nearfull_message,
            ['pending', 'firing'],
            'warning'
        ),
        (
            constants.ALERT_CLUSTERCRITICALLYFULL,
            criticallyfull_message,
            ['pending', 'firing'],
            'error'
        ),
    ]:
        prometheus.check_alert_list(
            label=target_label,
            msg=target_msg,
            alerts=alerts,
            states=target_states,
            severity=target_severity,
            ignore_more_occurences=True
        )
        # the time to wait is increased because it takes more time for Ceph
        # cluster to delete all data
        pg_wait = 300
        api.check_alert_cleared(
            label=target_label,
            measure_end_time=measure_end_time,
            time_min=pg_wait
        )
def test_ceph_health(measure_stop_ceph_mon, measure_corrupt_pg):
    """
    Test that there are appropriate alerts for Ceph health triggered.
    Ceph Warning state is checked via the measure_stop_ceph_mon fixture
    and Ceph Error state via the measure_corrupt_pg fixture.
    """
    api = prometheus.PrometheusAPI()

    alerts = measure_stop_ceph_mon.get("prometheus_alerts")
    target_label = constants.ALERT_CLUSTERWARNINGSTATE
    target_msg = "Storage cluster is in degraded state"
    target_states = ["pending", "firing"]
    target_severity = "warning"
    prometheus.check_alert_list(
        label=target_label,
        msg=target_msg,
        alerts=alerts,
        states=target_states,
        severity=target_severity,
    )
    api.check_alert_cleared(
        label=target_label,
        measure_end_time=measure_stop_ceph_mon.get("stop"),
    )

    alerts = measure_corrupt_pg.get("prometheus_alerts")
    target_label = constants.ALERT_CLUSTERERRORSTATE
    target_msg = "Storage cluster is in error state"
    target_states = ["pending", "firing"]
    target_severity = "error"
    prometheus.check_alert_list(
        label=target_label,
        msg=target_msg,
        alerts=alerts,
        states=target_states,
        severity=target_severity,
    )
    # the time to wait is increased because it takes more time for Ceph
    # cluster to resolve its issues
    pg_wait = 360
    api.check_alert_cleared(
        label=target_label,
        measure_end_time=measure_corrupt_pg.get("stop"),
        time_min=pg_wait,
    )
def test_noobaa_ns_bucket(measure_noobaa_ns_target_bucket_deleted):
    """
    Test that there are appropriate alerts when the target bucket of a
    namespace store used in a namespace bucket is deleted.
    """
    api = prometheus.PrometheusAPI()
    alerts = measure_noobaa_ns_target_bucket_deleted.get("prometheus_alerts")
    expected_alerts = [
        (
            constants.ALERT_NAMESPACEBUCKETERRORSTATE,
            "A NooBaa Namespace Bucket Is In Error State",
            ["pending", "firing"],
            "warning",
        ),
        (
            constants.ALERT_NAMESPACERESOURCEERRORSTATE,
            "A NooBaa Namespace Resource Is In Error State",
            ["pending", "firing"],
            "warning",
        ),
    ]
    for target_label, target_msg, target_states, target_severity in expected_alerts:
        prometheus.check_alert_list(
            label=target_label,
            msg=target_msg,
            alerts=alerts,
            states=target_states,
            severity=target_severity,
        )
        # the time to wait is increased because it takes more time for NooBaa
        # to clear the alert
        pg_wait = 600
        api.check_alert_cleared(
            label=target_label,
            measure_end_time=measure_noobaa_ns_target_bucket_deleted.get("stop"),
            time_min=pg_wait,
        )
def test_ceph_manager_stopped(measure_stop_ceph_mgr):
    """
    Test that there is appropriate alert when ceph manager
    is unavailable and that this alert is cleared when the manager
    is back online.
    """
    api = prometheus.PrometheusAPI()

    # get alerts from time when manager deployment was scaled down
    alerts = measure_stop_ceph_mgr.get('prometheus_alerts')
    target_label = constants.ALERT_MGRISABSENT
    target_msg = 'Storage metrics collector service not available anymore.'
    states = ['pending', 'firing']
    prometheus.check_alert_list(
        label=target_label,
        msg=target_msg,
        alerts=alerts,
        states=states,
        severity='critical'
    )
    api.check_alert_cleared(
        label=target_label,
        measure_end_time=measure_stop_ceph_mgr.get('stop')
    )
def test_ceph_mons_quorum_lost(measure_stop_ceph_mon):
    """
    Test to verify that CephMonQuorumLost alert is seen and
    that this alert is cleared when monitors are back online.
    """
    api = prometheus.PrometheusAPI()

    # get alerts from time when monitor deployment was scaled down
    alerts = measure_stop_ceph_mon.get("prometheus_alerts")
    target_label = constants.ALERT_MONQUORUMLOST
    target_msg = "Storage quorum is lost"
    target_states = ["pending", "firing"]
    prometheus.check_alert_list(
        label=target_label,
        msg=target_msg,
        alerts=alerts,
        states=target_states,
        severity="critical",
    )
    api.check_alert_cleared(
        label=target_label, measure_end_time=measure_stop_ceph_mon.get("stop")
    )
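# The `alerts` lists passed to prometheus.check_alert_list() in the tests above
# follow the Prometheus /api/v1/alerts response schema: each alert is a dict
# with "labels" (including "alertname" and "severity"), "annotations" and
# "state" keys. The same payload is returned by PrometheusAPI().get("alerts")
# in the PVC utilization tests below. The helper sketched here is hypothetical
# and only documents that data shape; it is not part of ocs-ci.
def get_alerts_by_name(api, alert_name, states=("pending", "firing")):
    """Return alerts from the Prometheus API whose alertname matches."""
    response = api.get("alerts")
    alerts = response.json()["data"]["alerts"]
    return [
        alert
        for alert in alerts
        if alert.get("labels", {}).get("alertname") == alert_name
        and alert.get("state") in states
    ]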
def test_pvc_expansion_when_full(self):
    """
    Verify PVC expansion when the PVC is 100% utilized.
    Verify utilization alert will stop firing after volume expansion.
    """
    pvc_size_expanded = 10

    # Run IO to utilise 100% of volume
    log.info("Run IO on all pods to utilise 100% of PVCs")
    for pod_obj in self.pods:
        pod_obj.run_io(
            "fs",
            size=f"{self.pvc_size}G",
            io_direction="write",
            runtime=30,
            rate="100M",
            fio_filename=f"{pod_obj.name}_f1",
        )
    log.info("Started IO on all pods to utilise 100% of PVCs")

    # Wait for IO to finish
    log.info("Wait for IO to finish on pods")
    for pod_obj in self.pods:
        try:
            pod_obj.get_fio_results()
        except CommandFailed as cfe:
            if "No space left on device" not in str(cfe):
                raise
        log.info(f"IO finished on pod {pod_obj.name}")

        # Verify used space on pod is 100%
        used_space = get_used_space_on_mount_point(pod_obj)
        assert used_space == "100%", (
            f"The used space on pod {pod_obj.name} is not 100% "
            f"but {used_space}"
        )
        log.info(f"Verified: Used space on pod {pod_obj.name} is 100%")

    prometheus_api = PrometheusAPI()

    # Wait till utilization alerts starts
    for response in TimeoutSampler(140, 5, prometheus_api.get, "alerts"):
        alerts = response.json()["data"]["alerts"]
        for pvc_obj in self.pvcs:
            alerts_pvc = [
                alert
                for alert in alerts
                if alert.get("labels", {}).get("persistentvolumeclaim")
                == pvc_obj.name
            ]
            # At least 2 alerts should be present
            if len(alerts_pvc) < 2:
                break

            # Verify 'PersistentVolumeUsageNearFull' alert is firing
            if not getattr(pvc_obj, "near_full_alert", False):
                try:
                    log.info(
                        f"Checking 'PersistentVolumeUsageNearFull' alert "
                        f"for PVC {pvc_obj.name}"
                    )
                    near_full_msg = (
                        f"PVC {pvc_obj.name} is nearing full. Data "
                        f"deletion or PVC expansion is required."
                    )
                    check_alert_list(
                        label="PersistentVolumeUsageNearFull",
                        msg=near_full_msg,
                        alerts=alerts_pvc,
                        states=["firing"],
                        severity="warning",
                    )
                    pvc_obj.near_full_alert = True
                except AssertionError:
                    log.info(
                        f"'PersistentVolumeUsageNearFull' alert not "
                        f"started firing for PVC {pvc_obj.name}"
                    )

            # Verify 'PersistentVolumeUsageCritical' alert is firing
            if not getattr(pvc_obj, "critical_alert", False):
                try:
                    log.info(
                        f"Checking 'PersistentVolumeUsageCritical' alert "
                        f"for PVC {pvc_obj.name}"
                    )
                    critical_msg = (
                        f"PVC {pvc_obj.name} is critically full. Data "
                        f"deletion or PVC expansion is required."
                    )
                    check_alert_list(
                        label="PersistentVolumeUsageCritical",
                        msg=critical_msg,
                        alerts=alerts_pvc,
                        states=["firing"],
                        severity="error",
                    )
                    pvc_obj.critical_alert = True
                except AssertionError:
                    log.info(
                        f"'PersistentVolumeUsageCritical' alert not "
                        f"started firing for PVC {pvc_obj.name}"
                    )

        # Collect list of PVCs for which alerts are not firing
        not_near_full_pvc = [
            pvc_ob.name
            for pvc_ob in self.pvcs
            if not getattr(pvc_ob, "near_full_alert", False)
        ]
        not_critical_pvc = [
            pvc_ob.name
            for pvc_ob in self.pvcs
            if not getattr(pvc_ob, "critical_alert", False)
        ]
        if (not not_near_full_pvc) and (not not_critical_pvc):
            log.info(
                "'PersistentVolumeUsageNearFull' and "
                "'PersistentVolumeUsageCritical' alerts are firing "
                "for all PVCs."
            )
            break

    log.info("Expanding PVCs.")
    for pvc_obj in self.pvcs:
        log.info(
            f"Expanding size of PVC {pvc_obj.name} to {pvc_size_expanded}Gi"
        )
        pvc_obj.resize_pvc(pvc_size_expanded, True)
    log.info(f"All PVCs are expanded to {pvc_size_expanded}Gi")

    # Verify utilization alerts are stopped
    for response in TimeoutSampler(140, 5, prometheus_api.get, "alerts"):
        alerts = response.json()["data"]["alerts"]
        for pvc_obj in self.pvcs:
            alerts_pvc = [
                alert
                for alert in alerts
                if alert.get("labels", {}).get("persistentvolumeclaim")
                == pvc_obj.name
            ]
            if not alerts_pvc:
                pvc_obj.near_full_alert = False
                pvc_obj.critical_alert = False
                continue

            # Verify 'PersistentVolumeUsageNearFull' alert stopped firing
            if getattr(pvc_obj, "near_full_alert"):
                try:
                    log.info(
                        f"Checking 'PersistentVolumeUsageNearFull' alert "
                        f"is cleared for PVC {pvc_obj.name}"
                    )
                    near_full_msg = (
                        f"PVC {pvc_obj.name} is nearing full. Data "
                        f"deletion or PVC expansion is required."
                    )
                    check_alert_list(
                        label="PersistentVolumeUsageNearFull",
                        msg=near_full_msg,
                        alerts=alerts_pvc,
                        states=["firing"],
                        severity="warning",
                    )
                    log.info(
                        f"'PersistentVolumeUsageNearFull' alert is not "
                        f"stopped for PVC {pvc_obj.name}"
                    )
                except AssertionError:
                    pvc_obj.near_full_alert = False
                    log.info(
                        f"'PersistentVolumeUsageNearFull' alert stopped "
                        f"firing for PVC {pvc_obj.name}"
                    )

            # Verify 'PersistentVolumeUsageCritical' alert stopped firing
            if getattr(pvc_obj, "critical_alert"):
                try:
                    log.info(
                        f"Checking 'PersistentVolumeUsageCritical' alert "
                        f"is cleared for PVC {pvc_obj.name}"
                    )
                    critical_msg = (
                        f"PVC {pvc_obj.name} is critically full. Data "
                        f"deletion or PVC expansion is required."
                    )
                    check_alert_list(
                        label="PersistentVolumeUsageCritical",
                        msg=critical_msg,
                        alerts=alerts_pvc,
                        states=["firing"],
                        severity="error",
                    )
                    log.info(
                        f"'PersistentVolumeUsageCritical' alert is not "
                        f"stopped for PVC {pvc_obj.name}"
                    )
                except AssertionError:
                    pvc_obj.critical_alert = False
                    log.info(
                        f"'PersistentVolumeUsageCritical' alert stopped "
                        f"firing for PVC {pvc_obj.name}"
                    )

        # Collect list of PVCs for which alerts are still firing
        near_full_pvcs = [
            pvc_ob.name
            for pvc_ob in self.pvcs
            if getattr(pvc_ob, "near_full_alert")
        ]
        critical_pvcs = [
            pvc_ob.name
            for pvc_ob in self.pvcs
            if getattr(pvc_ob, "critical_alert")
        ]
        if (not near_full_pvcs) and (not critical_pvcs):
            log.info(
                "'PersistentVolumeUsageNearFull' and "
                "'PersistentVolumeUsageCritical' alerts are cleared for "
                "all PVCs."
            )
            break

    # Run IO to verify the expanded capacity can be utilized
    log.info("Run IO after PVC expansion.")
    for pod_obj in self.pods:
        pod_obj.run_io(
            "fs",
            size="3G",
            io_direction="write",
            runtime=60,
            fio_filename=f"{pod_obj.name}_f2",
        )

    # Wait for IO to complete
    log.info("Waiting for IO to complete on pods.")
    for pod_obj in self.pods:
        fio_result = pod_obj.get_fio_results()
        err_count = fio_result.get("jobs")[0].get("error")
        assert err_count == 0, (
            f"IO error on pod {pod_obj.name}. FIO result: {fio_result}"
        )
        log.info(f"Verified IO on pod {pod_obj.name} after expanding PVC.")
def test_noobaa_bucket_quota(measure_noobaa_exceed_bucket_quota):
    """
    Test that there are appropriate alerts when NooBaa Bucket Quota is reached.
    """
    api = prometheus.PrometheusAPI()
    alerts = measure_noobaa_exceed_bucket_quota.get("prometheus_alerts")

    # since version 4.5 all NooBaa alerts have defined Pending state
    if version.get_semantic_ocs_version_from_config() < version.VERSION_4_5:
        expected_alerts = [
            (
                constants.ALERT_BUCKETREACHINGQUOTASTATE,
                "A NooBaa Bucket Is In Reaching Quota State",
                ["firing"],
                "warning",
            ),
            (
                constants.ALERT_BUCKETERRORSTATE,
                "A NooBaa Bucket Is In Error State",
                ["pending", "firing"],
                "warning",
            ),
            (
                constants.ALERT_BUCKETEXCEEDINGQUOTASTATE,
                "A NooBaa Bucket Is In Exceeding Quota State",
                ["firing"],
                "warning",
            ),
        ]
    else:
        expected_alerts = [
            (
                constants.ALERT_BUCKETREACHINGQUOTASTATE,
                "A NooBaa Bucket Is In Reaching Quota State",
                ["pending", "firing"],
                "warning",
            ),
            (
                constants.ALERT_BUCKETERRORSTATE,
                "A NooBaa Bucket Is In Error State",
                ["pending", "firing"],
                "warning",
            ),
            (
                constants.ALERT_BUCKETEXCEEDINGQUOTASTATE,
                "A NooBaa Bucket Is In Exceeding Quota State",
                ["pending", "firing"],
                "warning",
            ),
        ]
    for target_label, target_msg, target_states, target_severity in expected_alerts:
        prometheus.check_alert_list(
            label=target_label,
            msg=target_msg,
            alerts=alerts,
            states=target_states,
            severity=target_severity,
        )
        # the time to wait is increased because it takes more time for OCS
        # cluster to resolve its issues
        pg_wait = 480
        api.check_alert_cleared(
            label=target_label,
            measure_end_time=measure_noobaa_exceed_bucket_quota.get("stop"),
            time_min=pg_wait,
        )
def test_clone_when_full(self, pvc_clone_factory, pod_factory):
    """
    Create a clone from an existing PVC when the PVC is 100% utilized.
    Verify data integrity.
    Verify utilization alert in cloned PVC.
    Expand cloned PVC and ensure utilization alerts are stopped.
    """
    pvc_size_expanded = 6
    file_name = "fio_full"
    prometheus_api = PrometheusAPI()

    # Run IO to utilize 100% of volume
    log.info("Run IO on all pods to utilise 100% of PVCs")
    for pod_obj in self.pods:
        # Get available free space in M
        df_avail_size = pod_obj.exec_cmd_on_pod(
            command=f"df {pod_obj.get_storage_path()} -B M --output=avail"
        )
        # Get the numeral value of available space. eg: 3070 from '3070M'
        available_size = int(df_avail_size.strip().split()[1][0:-1])
        pod_obj.run_io(
            "fs",
            size=f"{available_size-2}M",
            runtime=20,
            rate="100M",
            fio_filename=file_name,
            end_fsync=1,
        )
    log.info("Started IO on all pods to utilise 100% of PVCs")

    # Wait for IO to finish
    log.info("Wait for IO to finish on pods")
    for pod_obj in self.pods:
        pod_obj.get_fio_results()
        log.info(f"IO finished on pod {pod_obj.name}")

        # Verify used space on pod is 100%
        used_space = pod.get_used_space_on_mount_point(pod_obj)
        assert used_space == "100%", (
            f"The used space on pod {pod_obj.name} is not 100% "
            f"but {used_space}"
        )
        log.info(f"Verified: Used space on pod {pod_obj.name} is 100%")

        # Calculate md5sum of the file
        pod_obj.pvc.md5sum = pod.cal_md5sum(pod_obj, file_name)

    log.info("Creating clone of the PVCs")
    cloned_pvcs = [pvc_clone_factory(pvc_obj) for pvc_obj in self.pvcs]
    log.info("Created clone of the PVCs. Cloned PVCs are Bound")

    # Attach the cloned PVCs to pods
    log.info("Attach the cloned PVCs to pods")
    clone_pod_objs = []
    for clone_pvc_obj in cloned_pvcs:
        interface = (
            constants.CEPHFILESYSTEM
            if (constants.CEPHFS_INTERFACE in clone_pvc_obj.backed_sc)
            else constants.CEPHBLOCKPOOL
        )
        clone_pod_obj = pod_factory(
            interface=interface, pvc=clone_pvc_obj, status=""
        )
        log.info(
            f"Attached the PVC {clone_pvc_obj.name} to pod "
            f"{clone_pod_obj.name}"
        )
        clone_pod_objs.append(clone_pod_obj)

    # Verify the new pods are running
    log.info("Verify the new pods are running")
    for pod_obj in clone_pod_objs:
        wait_for_resource_state(pod_obj, constants.STATUS_RUNNING)
    log.info("Verified: New pods are running")

    # Verify that the md5sum matches
    for pod_obj in clone_pod_objs:
        log.info(f"Verifying md5sum of {file_name} on pod {pod_obj.name}")
        pod.verify_data_integrity(pod_obj, file_name, pod_obj.pvc.parent.md5sum)
        log.info(
            f"Verified: md5sum of {file_name} on pod {pod_obj.name} "
            f"matches with the original md5sum"
        )

    # Wait till utilization alerts starts
    for response in TimeoutSampler(180, 5, prometheus_api.get, "alerts"):
        alerts = response.json()["data"]["alerts"]
        for pvc_obj in cloned_pvcs:
            alerts_pvc = [
                alert
                for alert in alerts
                if alert.get("labels", {}).get("persistentvolumeclaim")
                == pvc_obj.name
            ]
            # At least 2 alerts should be present
            if len(alerts_pvc) < 2:
                break

            # Verify 'PersistentVolumeUsageNearFull' alert is firing
            if not getattr(pvc_obj, "near_full_alert", False):
                try:
                    log.info(
                        f"Checking 'PersistentVolumeUsageNearFull' alert "
                        f"for PVC {pvc_obj.name}"
                    )
                    near_full_msg = (
                        f"PVC {pvc_obj.name} is nearing full. Data "
                        f"deletion or PVC expansion is required."
                    )
                    check_alert_list(
                        label="PersistentVolumeUsageNearFull",
                        msg=near_full_msg,
                        alerts=alerts_pvc,
                        states=["firing"],
                        severity="warning",
                    )
                    pvc_obj.near_full_alert = True
                except AssertionError:
                    log.info(
                        f"'PersistentVolumeUsageNearFull' alert not "
                        f"started firing for PVC {pvc_obj.name}"
                    )

            # Verify 'PersistentVolumeUsageCritical' alert is firing
            if not getattr(pvc_obj, "critical_alert", False):
                try:
                    log.info(
                        f"Checking 'PersistentVolumeUsageCritical' alert "
                        f"for PVC {pvc_obj.name}"
                    )
                    critical_msg = (
                        f"PVC {pvc_obj.name} is critically full. Data "
                        f"deletion or PVC expansion is required."
                    )
                    check_alert_list(
                        label="PersistentVolumeUsageCritical",
                        msg=critical_msg,
                        alerts=alerts_pvc,
                        states=["firing"],
                        severity="error",
                    )
                    pvc_obj.critical_alert = True
                except AssertionError:
                    log.info(
                        f"'PersistentVolumeUsageCritical' alert not "
                        f"started firing for PVC {pvc_obj.name}"
                    )

        # Collect list of PVCs for which alerts are not firing
        not_near_full_pvc = [
            pvc_ob.name
            for pvc_ob in cloned_pvcs
            if not getattr(pvc_ob, "near_full_alert", False)
        ]
        not_critical_pvc = [
            pvc_ob.name
            for pvc_ob in cloned_pvcs
            if not getattr(pvc_ob, "critical_alert", False)
        ]
        if (not not_near_full_pvc) and (not not_critical_pvc):
            log.info(
                "'PersistentVolumeUsageNearFull' and "
                "'PersistentVolumeUsageCritical' alerts are firing "
                "for all cloned PVCs."
            )
            break
    log.info("Verified: Utilization alerts are firing")

    log.info("Expanding cloned PVCs.")
    for pvc_obj in cloned_pvcs:
        log.info(
            f"Expanding size of PVC {pvc_obj.name} to {pvc_size_expanded}Gi"
        )
        # Expand PVC
        pvc_obj.resize_pvc(pvc_size_expanded, True)

    # Verify utilization alerts are stopped
    for response in TimeoutSampler(180, 5, prometheus_api.get, "alerts"):
        alerts = response.json()["data"]["alerts"]
        for pvc_obj in cloned_pvcs:
            alerts_pvc = [
                alert
                for alert in alerts
                if alert.get("labels", {}).get("persistentvolumeclaim")
                == pvc_obj.name
            ]
            if not alerts_pvc:
                pvc_obj.near_full_alert = False
                pvc_obj.critical_alert = False
                continue

            # Verify 'PersistentVolumeUsageNearFull' alert stopped firing
            if getattr(pvc_obj, "near_full_alert"):
                try:
                    log.info(
                        f"Checking 'PersistentVolumeUsageNearFull' alert "
                        f"is cleared for PVC {pvc_obj.name}"
                    )
                    near_full_msg = (
                        f"PVC {pvc_obj.name} is nearing full. Data "
                        f"deletion or PVC expansion is required."
                    )
                    check_alert_list(
                        label="PersistentVolumeUsageNearFull",
                        msg=near_full_msg,
                        alerts=alerts_pvc,
                        states=["firing"],
                        severity="warning",
                    )
                    log.info(
                        f"'PersistentVolumeUsageNearFull' alert is not "
                        f"stopped for PVC {pvc_obj.name}"
                    )
                except AssertionError:
                    pvc_obj.near_full_alert = False
                    log.info(
                        f"'PersistentVolumeUsageNearFull' alert stopped "
                        f"firing for PVC {pvc_obj.name}"
                    )

            # Verify 'PersistentVolumeUsageCritical' alert stopped firing
            if getattr(pvc_obj, "critical_alert"):
                try:
                    log.info(
                        f"Checking 'PersistentVolumeUsageCritical' alert "
                        f"is cleared for PVC {pvc_obj.name}"
                    )
                    critical_msg = (
                        f"PVC {pvc_obj.name} is critically full. Data "
                        f"deletion or PVC expansion is required."
                    )
                    check_alert_list(
                        label="PersistentVolumeUsageCritical",
                        msg=critical_msg,
                        alerts=alerts_pvc,
                        states=["firing"],
                        severity="error",
                    )
                    log.info(
                        f"'PersistentVolumeUsageCritical' alert is not "
                        f"stopped for PVC {pvc_obj.name}"
                    )
                except AssertionError:
                    pvc_obj.critical_alert = False
                    log.info(
                        f"'PersistentVolumeUsageCritical' alert stopped "
                        f"firing for PVC {pvc_obj.name}"
                    )

        # Collect list of PVCs for which alerts are still firing
        near_full_pvcs = [
            pvc_ob.name
            for pvc_ob in cloned_pvcs
            if getattr(pvc_ob, "near_full_alert")
        ]
        critical_pvcs = [
            pvc_ob.name
            for pvc_ob in cloned_pvcs
            if getattr(pvc_ob, "critical_alert")
        ]
        if (not near_full_pvcs) and (not critical_pvcs):
            log.info(
                "'PersistentVolumeUsageNearFull' and "
                "'PersistentVolumeUsageCritical' alerts are cleared for "
                "all cloned PVCs."
            )
            break
    log.info("Verified: Utilization alerts stopped firing")