def test_pvc_no_space_left(self, base_setup):
    """
    Write data to the PVC through the first pod until it is full, then
    delete the first pod and check the usage seen by the second pod.

    Args:
        base_setup: Not referenced in the body. Presumably the fixture that
            prepares ``self.pod_obj1``, ``self.pod_obj2`` and
            ``self.pvc_size_gb`` -- confirm against the fixture definition.

    Raises:
        ex.CommandFailed: If FIO fails with anything other than
            "No space left on device".
        AssertionError: If a checked mount point is not reported as 100%
            used.
    """
    log.info(f"Running FIO to fill PVC size: {self.pvc_size_gb}")
    self.pod_obj1.run_io(
        'fs', size=self.pvc_size_gb, io_direction='write', runtime=60
    )
    log.info("Waiting for IO results")
    try:
        self.pod_obj1.get_fio_results()
    except ex.CommandFailed as cf:
        # Running out of space is the outcome this test wants; any other
        # FIO failure is a real error and is propagated unchanged.
        if "No space left on device" not in str(cf):
            raise
        # The usage is checked from both pods while both still exist.
        for pod_obj in (self.pod_obj1, self.pod_obj2):
            used_space = get_used_space_on_mount_point(pod_obj)
            assert used_space == '100%', (
                f"The used space is not 100% but {used_space} which means "
                f"the device is not full")
        log.info("FIO succeeded to fill the PVC with data")

    log.info("Deleting the first pod and checking used size from the 2nd pod")
    self.pod_obj1.delete()
    used_space = get_used_space_on_mount_point(self.pod_obj2)
    assert used_space == '100%', (
        f"The used space is not 100% but {used_space} from 2nd pod")
def test_pvc_no_space_left(self, base_setup, pod_factory):
    """
    Write data to the PVC until it is full, delete the pod, attach the same
    PVC to a new pod and check that the new pod still sees it as full.

    Args:
        base_setup: Not referenced in the body. Presumably the fixture that
            prepares ``self.pod_obj``, ``self.pvc_obj``, ``self.interface``
            and ``self.pvc_size_gb`` -- confirm against the fixture
            definition.
        pod_factory: Callable invoked as
            ``pod_factory(interface=..., pvc=...)`` to create the new pod.

    Raises:
        ex.CommandFailed: If FIO fails with anything other than
            "No space left on device".
        AssertionError: If a checked mount point is not reported as 100%
            used.
    """
    log.info(f"Running FIO to fill PVC size: {self.pvc_size_gb}")
    self.pod_obj.run_io(
        'fs', size=self.pvc_size_gb, io_direction='write', runtime=60
    )
    log.info("Waiting for IO results")
    try:
        self.pod_obj.get_fio_results()
    except ex.CommandFailed as cf:
        # Running out of space is the outcome this test wants; any other
        # FIO failure is a real error and is propagated unchanged.
        if "No space left on device" not in str(cf):
            raise
        used_space = get_used_space_on_mount_point(self.pod_obj)
        assert used_space == '100%', (
            f"The used space is not 100% but {used_space} which means "
            f"the device is not full")
        log.info("FIO succeeded to fill the PVC with data")

    log.info("Deleting the pod and attaching the full PVC to a new pod")
    self.pod_obj.delete()
    # Make sure the old pod is gone before the PVC is handed to a new pod,
    # so the new pod never races a pod that is still terminating.
    self.pod_obj.ocp.wait_for_delete(resource_name=self.pod_obj.name)

    log.info("Creating a new Pod with the existing full PVC")
    self.pod_obj = pod_factory(interface=self.interface, pvc=self.pvc_obj)
    used_space = get_used_space_on_mount_point(self.pod_obj)
    assert used_space == '100%', (
        f"The used space is not 100% but {used_space} from the new pod")
def test_snapshot_at_different_usage_level(
    self, snapshot_factory, snapshot_restore_factory, pod_factory
):
    """
    Test to take multiple snapshots of same PVC when the PVC usage is at
    0%, 20%, 40%, 60%, and 80%, then delete the parent PVC and restore the
    snapshots to create new PVCs. Delete snapshots and attach the restored
    PVCs to pods to verify the data.

    Args:
        snapshot_factory: Callable invoked as
            ``snapshot_factory(pvc_obj, wait=False)`` to snapshot a PVC.
        snapshot_restore_factory: Callable that creates a PVC from a
            snapshot object.
        pod_factory: Callable that creates a pod attached to a given PVC.

    Raises:
        AssertionError: If a file recorded for a snapshot does not exist on
            the restored pod, or the usage on the restored mount point
            differs from the usage recorded when the snapshot was taken.
    """
    snapshots = []
    usage_percent = [0, 20, 40, 60, 80]
    # Every IO round writes the same share of the PVC size.
    io_size = f"{int(self.pvc_size/len(usage_percent))}G"

    for usage in usage_percent:
        # The first snapshot is taken before any IO has been run.
        if usage != 0:
            for pod_obj in self.pods:
                log.info(f"Running IO on pod {pod_obj.name} to utilize {usage}%")
                pod_obj.pvc.filename = f"{pod_obj.name}_{usage}"
                pod_obj.run_io(
                    storage_type="fs",
                    size=io_size,
                    runtime=20,
                    fio_filename=pod_obj.pvc.filename,
                )
            log.info(f"IO started on all pods to utilize {usage}%")

            for pod_obj in self.pods:
                # Wait for fio to finish
                pod_obj.get_fio_results()
                log.info(
                    f"IO to utilize {usage}% finished on pod "
                    f"{pod_obj.name}"
                )
                # Calculate md5sum and remember it per file name on the PVC
                md5_sum = pod.cal_md5sum(pod_obj, pod_obj.pvc.filename)
                if not getattr(pod_obj.pvc, "md5_sum", None):
                    pod_obj.pvc.md5_sum = {}
                pod_obj.pvc.md5_sum[pod_obj.pvc.filename] = md5_sum

        # Take snapshot of all PVCs
        log.info(f"Creating snapshot of all PVCs at {usage}%")
        for pvc_obj in self.pvcs:
            log.info(f"Creating snapshot of PVC {pvc_obj.name} at {usage}%")
            snap_obj = snapshot_factory(pvc_obj, wait=False)
            # Set a dict containing filename:md5sum for later verification.
            # A copy is stored so that files written in later rounds do not
            # show up in this snapshot's record.
            snap_obj.md5_sum = deepcopy(getattr(pvc_obj, "md5_sum", {}))
            snap_obj.usage_on_mount = get_used_space_on_mount_point(
                pvc_obj.get_attached_pods()[0]
            )
            snapshots.append(snap_obj)
            log.info(f"Created snapshot of PVC {pvc_obj.name} at {usage}%")
        log.info(f"Created snapshot of all PVCs at {usage}%")
    log.info("Snapshots creation completed.")

    # Verify snapshots are ready
    log.info("Verify snapshots are ready")
    for snapshot in snapshots:
        snapshot.ocp.wait_for_resource(
            condition="true",
            resource_name=snapshot.name,
            column=constants.STATUS_READYTOUSE,
            timeout=90,
        )

    # Delete pods
    log.info("Deleting the pods")
    for pod_obj in self.pods:
        pod_obj.delete()
        pod_obj.ocp.wait_for_delete(resource_name=pod_obj.name)
    log.info("Deleted all the pods")

    # Delete parent PVCs
    log.info("Deleting parent PVCs")
    for pvc_obj in self.pvcs:
        # Fetch the PV before the PVC is deleted so its removal can be
        # verified afterwards.
        pv_obj = pvc_obj.backed_pv_obj
        pvc_obj.delete()
        pvc_obj.ocp.wait_for_delete(resource_name=pvc_obj.name)
        log.info(
            f"Deleted PVC {pvc_obj.name}. Verifying whether PV "
            f"{pv_obj.name} is deleted."
        )
        pv_obj.ocp.wait_for_delete(resource_name=pv_obj.name)
    log.info(
        "Deleted parent PVCs before restoring snapshot. "
        "PVs are also deleted."
    )

    restore_pvc_objs = []

    # Create PVCs out of the snapshots
    log.info("Creating new PVCs from snapshots")
    for snapshot in snapshots:
        log.info(f"Creating a PVC from snapshot {snapshot.name}")
        restore_pvc_obj = snapshot_restore_factory(
            snapshot_obj=snapshot,
            size=f"{self.pvc_size}Gi",
            volume_mode=snapshot.parent_volume_mode,
            access_mode=snapshot.parent_access_mode,
            status="",
        )
        log.info(
            f"Created PVC {restore_pvc_obj.name} from snapshot "
            f"{snapshot.name}"
        )
        restore_pvc_objs.append(restore_pvc_obj)
    log.info("Created new PVCs from all the snapshots")

    # Confirm that the restored PVCs are Bound
    # Increased wait time to 600 seconds as a workaround for BZ 1899968
    # TODO: Revert wait time to 200 seconds once BZ 1899968 is fixed
    log.info("Verify the restored PVCs are Bound")
    for pvc_obj in restore_pvc_objs:
        wait_for_resource_state(
            resource=pvc_obj, state=constants.STATUS_BOUND, timeout=600
        )
        pvc_obj.reload()
    log.info("Verified: Restored PVCs are Bound.")

    snapcontent_objs = []
    # Get VolumeSnapshotContent from VolumeSnapshots and delete
    # VolumeSnapshots
    log.info("Deleting snapshots")
    for snapshot in snapshots:
        snapcontent_objs.append(get_snapshot_content_obj(snap_obj=snapshot))
        snapshot.delete()

    # Verify volume snapshots are deleted
    log.info("Verify snapshots are deleted")
    for snapshot in snapshots:
        snapshot.ocp.wait_for_delete(resource_name=snapshot.name)
    log.info("Verified: Snapshots are deleted")

    # Verify VolumeSnapshotContents are deleted
    for snapcontent_obj in snapcontent_objs:
        snapcontent_obj.ocp.wait_for_delete(
            resource_name=snapcontent_obj.name, timeout=180
        )

    # Attach the restored PVCs to pods
    log.info("Attach the restored PVCs to pods")
    restore_pod_objs = []
    for restore_pvc_obj in restore_pvc_objs:
        interface = (
            constants.CEPHFILESYSTEM
            if (constants.CEPHFS_INTERFACE in restore_pvc_obj.snapshot.parent_sc)
            else constants.CEPHBLOCKPOOL
        )
        restore_pod_obj = pod_factory(
            interface=interface, pvc=restore_pvc_obj, status=""
        )
        log.info(
            f"Attached the PVC {restore_pvc_obj.name} to pod "
            f"{restore_pod_obj.name}"
        )
        restore_pod_objs.append(restore_pod_obj)

    # Verify the new pods are running. The timeout depends only on the
    # platform, so it is computed once for all pods.
    log.info("Verify the new pods are running")
    pod_running_timeout = (
        300
        if config.ENV_DATA["platform"] == constants.IBMCLOUD_PLATFORM
        else 60
    )
    for pod_obj in restore_pod_objs:
        wait_for_resource_state(
            pod_obj, constants.STATUS_RUNNING, pod_running_timeout
        )
    log.info("Verified: New pods are running")

    # Verify md5sum of files
    log.info("Verifying md5sum of files on all the pods")
    for restore_pod_obj in restore_pod_objs:
        log.info(
            f"Verifying md5sum of these files on pod "
            f"{restore_pod_obj.name}:"
            f"{restore_pod_obj.pvc.snapshot.md5_sum}"
        )
        # The snapshot carries the checksums recorded before it was taken.
        for (
            file_name,
            expected_md5_sum,
        ) in restore_pod_obj.pvc.snapshot.md5_sum.items():
            file_path = pod.get_file_path(restore_pod_obj, file_name)
            log.info(
                f"Checking the existence of file {file_name} on pod "
                f"{restore_pod_obj.name}"
            )
            assert pod.check_file_existence(restore_pod_obj, file_path), (
                f"File {file_name} does not exist on pod "
                f"{restore_pod_obj.name}"
            )
            log.info(f"File {file_name} exists on pod {restore_pod_obj.name}")

            # Verify that the md5sum matches
            log.info(
                f"Verifying md5sum of file {file_name} on pod "
                f"{restore_pod_obj.name}"
            )
            pod.verify_data_integrity(
                restore_pod_obj, file_name, expected_md5_sum
            )
            log.info(
                f"Verified md5sum of file {file_name} on pod "
                f"{restore_pod_obj.name}"
            )
        log.info(
            f"Verified md5sum of these files on pod "
            f"{restore_pod_obj.name}:"
            f"{restore_pod_obj.pvc.snapshot.md5_sum}"
        )
    log.info("md5sum verified")

    # Verify usage on mount point
    log.info("Verify usage on new pods")
    for pod_obj in restore_pod_objs:
        usage_on_pod = get_used_space_on_mount_point(pod_obj)
        assert usage_on_pod == pod_obj.pvc.snapshot.usage_on_mount, (
            f"Usage on mount point is not the expected value on pod "
            f"{pod_obj.name}. Usage in percentage {usage_on_pod}. "
            f"Expected usage in percentage "
            f"{pod_obj.pvc.snapshot.usage_on_mount}"
        )
        log.info(
            f"Verified usage on new pod {pod_obj.name}. Usage in "
            f"percentage {usage_on_pod}. Expected usage in percentage "
            f"{pod_obj.pvc.snapshot.usage_on_mount}"
        )
    log.info("Verified usage on new pods")
def test_pvc_expansion_when_full(self):
    """
    Verify PVC expansion when the PVC is 100% utilized.
    Verify utilization alert will stop firing after volume expansion.

    Steps:
        1. Fill every PVC in ``self.pvcs`` through its pod and check the
           mount point reports 100% usage.
        2. Poll the Prometheus alerts until both utilization alerts are
           firing for every PVC.
        3. Expand every PVC and poll until both alerts are cleared.
        4. Run IO again and check FIO reports no error.

    Raises:
        CommandFailed: If the fill IO fails with anything other than
            "No space left on device".
        AssertionError: If a pod is not 100% used after the fill, or FIO
            reports errors after the expansion.
    """
    pvc_size_expanded = 10
    # One entry per utilization alert, checked in this order for each PVC:
    # (alert label, flag attribute kept on the PVC object, wording used in
    # the alert message, alert severity).
    utilization_alerts = (
        (
            "PersistentVolumeUsageNearFull",
            "near_full_alert",
            "nearing full",
            "warning",
        ),
        (
            "PersistentVolumeUsageCritical",
            "critical_alert",
            "critically full",
            "error",
        ),
    )

    # Run IO to utilise 100% of volume
    log.info("Run IO on all to utilise 100% of PVCs")
    for pod_obj in self.pods:
        pod_obj.run_io(
            "fs",
            size=f"{self.pvc_size}G",
            io_direction="write",
            runtime=30,
            rate="100M",
            fio_filename=f"{pod_obj.name}_f1",
        )
    log.info("Started IO on all to utilise 100% of PVCs")

    # Wait for IO to finish
    log.info("Wait for IO to finish on pods")
    for pod_obj in self.pods:
        try:
            pod_obj.get_fio_results()
        except CommandFailed as cfe:
            # Running out of space is expected when filling the volume.
            if "No space left on device" not in str(cfe):
                raise
        log.info(f"IO finished on pod {pod_obj.name}")
        # Verify used space on pod is 100%
        used_space = get_used_space_on_mount_point(pod_obj)
        assert used_space == "100%", (
            f"The used space on pod {pod_obj.name} is not 100% "
            f"but {used_space}")
        log.info(f"Verified: Used space on pod {pod_obj.name} is 100%")

    prometheus_api = PrometheusAPI()

    # Wait till utilization alerts starts
    for response in TimeoutSampler(140, 5, prometheus_api.get, "alerts"):
        alerts = response.json()["data"]["alerts"]
        for pvc_obj in self.pvcs:
            alerts_pvc = [
                alert for alert in alerts
                if alert.get("labels", {}).get(
                    "persistentvolumeclaim") == pvc_obj.name
            ]
            # At least 2 alerts should be present. A PVC that is not there
            # yet must not prevent the other PVCs from being checked in
            # this sample.
            if len(alerts_pvc) < 2:
                continue

            for label, flag, condition, severity in utilization_alerts:
                # Flags are sticky: an alert seen firing once is not
                # checked again.
                if getattr(pvc_obj, flag, False):
                    continue
                log.info(f"Checking '{label}' alert for PVC {pvc_obj.name}")
                try:
                    check_alert_list(
                        label=label,
                        msg=(
                            f"PVC {pvc_obj.name} is {condition}. Data "
                            f"deletion or PVC expansion is required."),
                        alerts=alerts_pvc,
                        states=["firing"],
                        severity=severity,
                    )
                except AssertionError:
                    log.info(
                        f"'{label}' alert not started firing for PVC "
                        f"{pvc_obj.name}")
                else:
                    setattr(pvc_obj, flag, True)

        # Stop polling once every alert is firing for every PVC
        if all(
            getattr(pvc_ob, flag, False)
            for pvc_ob in self.pvcs
            for _, flag, _, _ in utilization_alerts
        ):
            log.info("'PersistentVolumeUsageNearFull' and "
                     "'PersistentVolumeUsageCritical' alerts are firing "
                     "for all PVCs.")
            break

    log.info("Expanding PVCs.")
    for pvc_obj in self.pvcs:
        log.info(f"Expanding size of PVC {pvc_obj.name} to "
                 f"{pvc_size_expanded}Gi")
        pvc_obj.resize_pvc(pvc_size_expanded, True)
    log.info(f"All PVCs are expanded to {pvc_size_expanded}Gi")

    # Verify utilization alerts are stopped
    for response in TimeoutSampler(140, 5, prometheus_api.get, "alerts"):
        alerts = response.json()["data"]["alerts"]
        for pvc_obj in self.pvcs:
            alerts_pvc = [
                alert for alert in alerts
                if alert.get("labels", {}).get(
                    "persistentvolumeclaim") == pvc_obj.name
            ]
            # No alert at all for this PVC means both are cleared
            if not alerts_pvc:
                for _, flag, _, _ in utilization_alerts:
                    setattr(pvc_obj, flag, False)
                continue

            for label, flag, condition, severity in utilization_alerts:
                if not getattr(pvc_obj, flag, False):
                    continue
                log.info(
                    f"Checking '{label}' alert is cleared for PVC "
                    f"{pvc_obj.name}")
                try:
                    check_alert_list(
                        label=label,
                        msg=(
                            f"PVC {pvc_obj.name} is {condition}. Data "
                            f"deletion or PVC expansion is required."),
                        alerts=alerts_pvc,
                        states=["firing"],
                        severity=severity,
                    )
                except AssertionError:
                    # The firing alert is no longer in the list
                    setattr(pvc_obj, flag, False)
                    log.info(
                        f"'{label}' alert stopped firing for PVC "
                        f"{pvc_obj.name}")
                else:
                    log.info(
                        f"'{label}' alert is not stopped for PVC "
                        f"{pvc_obj.name}")

        # Stop polling once no alert is firing for any PVC
        if not any(
            getattr(pvc_ob, flag, False)
            for pvc_ob in self.pvcs
            for _, flag, _, _ in utilization_alerts
        ):
            log.info(
                "'PersistentVolumeUsageNearFull' and "
                "'PersistentVolumeUsageCritical' alerts are cleared for "
                "all PVCs.")
            break

    # Run IO to verify the expanded capacity can be utilized
    log.info("Run IO after PVC expansion.")
    for pod_obj in self.pods:
        pod_obj.run_io(
            "fs",
            size="3G",
            io_direction="write",
            runtime=60,
            fio_filename=f"{pod_obj.name}_f2",
        )

    # Wait for IO to complete
    log.info("Waiting for IO to complete on pods.")
    for pod_obj in self.pods:
        fio_result = pod_obj.get_fio_results()
        err_count = fio_result.get("jobs")[0].get("error")
        assert (
            err_count == 0
        ), f"IO error on pod {pod_obj.name}. FIO result: {fio_result}"
        log.info(f"Verified IO on pod {pod_obj.name} after expanding PVC.")
def test_clone_when_full(self, pvc_clone_factory, pod_factory):
    """
    Create a clone from an existing PVC when the PVC is 100% utilized.
    Verify data integrity.
    Verify utilization alert in cloned PVC.
    Expand cloned PVC and ensure utilization alerts are stopped.

    Args:
        pvc_clone_factory: Callable invoked as ``pvc_clone_factory(pvc_obj)``
            to clone a PVC.
        pod_factory: Callable that creates a pod attached to a given PVC.

    Raises:
        AssertionError: If a pod is not 100% used after the fill.
    """
    pvc_size_expanded = 6
    file_name = "fio_full"
    prometheus_api = PrometheusAPI()
    # One entry per utilization alert, checked in this order for each PVC:
    # (alert label, flag attribute kept on the PVC object, wording used in
    # the alert message, alert severity).
    utilization_alerts = (
        (
            "PersistentVolumeUsageNearFull",
            "near_full_alert",
            "nearing full",
            "warning",
        ),
        (
            "PersistentVolumeUsageCritical",
            "critical_alert",
            "critically full",
            "error",
        ),
    )

    # Run IO to utilize 100% of volume
    log.info("Run IO on all pods to utilise 100% of PVCs")
    for pod_obj in self.pods:
        # Get available free space in M
        df_avail_size = pod_obj.exec_cmd_on_pod(
            command=f"df {pod_obj.get_storage_path()} -B M --output=avail")
        # Get the numeral value of available space. eg: 3070 from '3070M'
        available_size = int(df_avail_size.strip().split()[1][0:-1])
        # Write 2M less than the reported free space
        pod_obj.run_io(
            "fs",
            size=f"{available_size-2}M",
            runtime=20,
            rate="100M",
            fio_filename=file_name,
            end_fsync=1,
        )
    log.info("Started IO on all pods to utilise 100% of PVCs")

    # Wait for IO to finish
    log.info("Wait for IO to finish on pods")
    for pod_obj in self.pods:
        pod_obj.get_fio_results()
        log.info(f"IO finished on pod {pod_obj.name}")

        # Verify used space on pod is 100%
        used_space = pod.get_used_space_on_mount_point(pod_obj)
        assert used_space == "100%", (
            f"The used space on pod {pod_obj.name} is not 100% "
            f"but {used_space}")
        log.info(f"Verified: Used space on pod {pod_obj.name} is 100%")
        # Calculate md5sum of the file
        pod_obj.pvc.md5sum = pod.cal_md5sum(pod_obj, file_name)

    log.info("Creating clone of the PVCs")
    cloned_pvcs = [pvc_clone_factory(pvc_obj) for pvc_obj in self.pvcs]
    log.info("Created clone of the PVCs. Cloned PVCs are Bound")

    # Attach the cloned PVCs to pods
    log.info("Attach the cloned PVCs to pods")
    clone_pod_objs = []
    for clone_pvc_obj in cloned_pvcs:
        interface = (
            constants.CEPHFILESYSTEM
            if (constants.CEPHFS_INTERFACE in clone_pvc_obj.backed_sc)
            else constants.CEPHBLOCKPOOL
        )
        clone_pod_obj = pod_factory(
            interface=interface, pvc=clone_pvc_obj, status=""
        )
        log.info(f"Attached the PVC {clone_pvc_obj.name} to pod "
                 f"{clone_pod_obj.name}")
        clone_pod_objs.append(clone_pod_obj)

    # Verify the new pods are running
    log.info("Verify the new pods are running")
    for pod_obj in clone_pod_objs:
        wait_for_resource_state(pod_obj, constants.STATUS_RUNNING)
    log.info("Verified: New pods are running")

    # Verify that the md5sum matches
    for pod_obj in clone_pod_objs:
        log.info(f"Verifying md5sum of {file_name} "
                 f"on pod {pod_obj.name}")
        pod.verify_data_integrity(
            pod_obj, file_name, pod_obj.pvc.parent.md5sum
        )
        log.info(f"Verified: md5sum of {file_name} on pod {pod_obj.name} "
                 f"matches with the original md5sum")

    # Wait till utilization alerts starts
    for response in TimeoutSampler(180, 5, prometheus_api.get, "alerts"):
        alerts = response.json()["data"]["alerts"]
        for pvc_obj in cloned_pvcs:
            alerts_pvc = [
                alert for alert in alerts
                if alert.get("labels", {}).get(
                    "persistentvolumeclaim") == pvc_obj.name
            ]
            # At least 2 alerts should be present. A PVC that is not there
            # yet must not prevent the other PVCs from being checked in
            # this sample.
            if len(alerts_pvc) < 2:
                continue

            for label, flag, condition, severity in utilization_alerts:
                # Flags are sticky: an alert seen firing once is not
                # checked again.
                if getattr(pvc_obj, flag, False):
                    continue
                log.info(f"Checking '{label}' alert for PVC {pvc_obj.name}")
                try:
                    check_alert_list(
                        label=label,
                        msg=(
                            f"PVC {pvc_obj.name} is {condition}. Data "
                            f"deletion or PVC expansion is required."),
                        alerts=alerts_pvc,
                        states=["firing"],
                        severity=severity,
                    )
                except AssertionError:
                    log.info(
                        f"'{label}' alert not started firing for PVC "
                        f"{pvc_obj.name}")
                else:
                    setattr(pvc_obj, flag, True)

        # Stop polling once every alert is firing for every cloned PVC
        if all(
            getattr(pvc_ob, flag, False)
            for pvc_ob in cloned_pvcs
            for _, flag, _, _ in utilization_alerts
        ):
            log.info("'PersistentVolumeUsageNearFull' and "
                     "'PersistentVolumeUsageCritical' alerts are firing "
                     "for all cloned PVCs.")
            break
    log.info("Verified: Utilization alerts are firing")

    log.info("Expanding cloned PVCs.")
    for pvc_obj in cloned_pvcs:
        log.info(f"Expanding size of PVC {pvc_obj.name} to "
                 f"{pvc_size_expanded}Gi")
        # Expand PVC
        pvc_obj.resize_pvc(pvc_size_expanded, True)

    # Verify utilization alerts are stopped
    for response in TimeoutSampler(180, 5, prometheus_api.get, "alerts"):
        alerts = response.json()["data"]["alerts"]
        for pvc_obj in cloned_pvcs:
            alerts_pvc = [
                alert for alert in alerts
                if alert.get("labels", {}).get(
                    "persistentvolumeclaim") == pvc_obj.name
            ]
            # No alert at all for this PVC means both are cleared
            if not alerts_pvc:
                for _, flag, _, _ in utilization_alerts:
                    setattr(pvc_obj, flag, False)
                continue

            for label, flag, condition, severity in utilization_alerts:
                if not getattr(pvc_obj, flag, False):
                    continue
                log.info(
                    f"Checking '{label}' alert is cleared for PVC "
                    f"{pvc_obj.name}")
                try:
                    check_alert_list(
                        label=label,
                        msg=(
                            f"PVC {pvc_obj.name} is {condition}. Data "
                            f"deletion or PVC expansion is required."),
                        alerts=alerts_pvc,
                        states=["firing"],
                        severity=severity,
                    )
                except AssertionError:
                    # The firing alert is no longer in the list
                    setattr(pvc_obj, flag, False)
                    log.info(
                        f"'{label}' alert stopped firing for PVC "
                        f"{pvc_obj.name}")
                else:
                    log.info(
                        f"'{label}' alert is not stopped for PVC "
                        f"{pvc_obj.name}")

        # Stop polling once no alert is firing for any cloned PVC
        if not any(
            getattr(pvc_ob, flag, False)
            for pvc_ob in cloned_pvcs
            for _, flag, _, _ in utilization_alerts
        ):
            log.info(
                "'PersistentVolumeUsageNearFull' and "
                "'PersistentVolumeUsageCritical' alerts are cleared for "
                "all cloned PVCs.")
            break

    log.info("Verified: Utilization alerts stopped firing")