def test_rwo_dynamic_pvc(self, setup_base):
    """
    RWO Dynamic PVC creation tests with Reclaim policy set to Delete/Retain

    Flow (RWO semantics): a second pod on a different node must NOT be able
    to attach the PVC while the first pod holds it. The second pod is
    expected to stay in ContainerCreating until the first pod is deleted,
    after which it attaches, and the data written by the first pod must be
    intact.
    """
    logger.info(
        f"Creating second pod on node: {self.worker_nodes_list[1]}")
    # do_reload=False: the pod is not expected to reach Running yet
    pod_obj2 = helpers.create_pod(interface_type=self.interface_type,
                                  pvc_name=self.pvc_obj.name,
                                  do_reload=False,
                                  namespace=self.namespace,
                                  node_name=self.worker_nodes_list[1],
                                  pod_dict_path=constants.NGINX_POD_YAML)
    # Both pods must land on different nodes for the RWO conflict to apply
    node_pod1 = self.pod_obj1.get().get('spec').get('nodeName')
    node_pod2 = pod_obj2.get().get('spec').get('nodeName')
    assert node_pod1 != node_pod2, 'Both pods are on the same node'

    logger.info(f"Running IO on pod {self.pod_obj1.name}")
    file_name = self.pod_obj1.name
    self.pod_obj1.run_io(storage_type=self.storage_type,
                         size=self.io_size, runtime=30,
                         fio_filename=file_name)
    pod.get_fio_rw_iops(self.pod_obj1)
    # Record md5sum of the written file for later integrity verification
    md5sum_pod1_data = pod.cal_md5sum(pod_obj=self.pod_obj1,
                                      file_name=file_name)

    # Verify that second pod is still in ContainerCreating state and not able to
    # attain Running state due to expected failure
    helpers.wait_for_resource_state(
        resource=pod_obj2, state=constants.STATUS_CONTAINER_CREATING)
    self.verify_expected_failure_event(
        ocs_obj=pod_obj2, failure_str=self.expected_pod_failure)

    logger.info(f"Deleting first pod so that second pod can attach"
                f" {self.pvc_obj.name}")
    self.pod_obj1.delete()
    self.pod_obj1.ocp.wait_for_delete(resource_name=self.pod_obj1.name)

    # Wait for second pod to be in Running state
    helpers.wait_for_resource_state(
        resource=pod_obj2, state=constants.STATUS_RUNNING, timeout=240)

    # Data written by the (now deleted) first pod must be readable intact
    assert pod.verify_data_integrity(pod_obj=pod_obj2,
                                     file_name=file_name,
                                     original_md5sum=md5sum_pod1_data)

    pod_obj2.run_io(storage_type=self.storage_type,
                    size=self.io_size, runtime=30,
                    fio_filename=pod_obj2.name)
    pod.get_fio_rw_iops(pod_obj2)

    # Again verify data integrity — pod2's own IO must not corrupt pod1's file
    assert pod.verify_data_integrity(pod_obj=pod_obj2,
                                     file_name=file_name,
                                     original_md5sum=md5sum_pod1_data)
def test_snapshot_at_different_usage_level(
    self, snapshot_factory, snapshot_restore_factory, pod_factory
):
    """
    Test to take multiple snapshots of same PVC when the PVC usage is at
    0%, 20%, 40%, 60%, and 80%, then delete the parent PVC and restore the
    snapshots to create new PVCs. Delete snapshots and attach the restored
    PVCs to pods to verify the data.
    """
    snapshots = []
    # Target utilization levels; each iteration adds one more fio file so
    # the cumulative usage grows by pvc_size/len(usage_percent) per step
    usage_percent = [0, 20, 40, 60, 80]
    for usage in usage_percent:
        if usage != 0:
            for pod_obj in self.pods:
                log.info(f"Running IO on pod {pod_obj.name} to utilize {usage}%")
                pod_obj.pvc.filename = f"{pod_obj.name}_{usage}"
                pod_obj.run_io(
                    storage_type="fs",
                    size=f"{int(self.pvc_size/len(usage_percent))}G",
                    runtime=20,
                    fio_filename=pod_obj.pvc.filename,
                )
            log.info(f"IO started on all pods to utilize {usage}%")

            for pod_obj in self.pods:
                # Wait for fio to finish
                pod_obj.get_fio_results()
                log.info(
                    f"IO to utilize {usage}% finished on pod "
                    f"{pod_obj.name}"
                )
                # Calculate md5sum
                md5_sum = pod.cal_md5sum(pod_obj, pod_obj.pvc.filename)
                # Lazily create the filename->md5sum map on the PVC object
                if not getattr(pod_obj.pvc, "md5_sum", None):
                    setattr(pod_obj.pvc, "md5_sum", {})
                pod_obj.pvc.md5_sum[pod_obj.pvc.filename] = md5_sum

        # Take snapshot of all PVCs
        log.info(f"Creating snapshot of all PVCs at {usage}%")
        for pvc_obj in self.pvcs:
            log.info(f"Creating snapshot of PVC {pvc_obj.name} at {usage}%")
            snap_obj = snapshot_factory(pvc_obj, wait=False)

            # Set a dict containing filename:md5sum for later verification
            # (deepcopy so later iterations don't mutate this snapshot's map)
            setattr(snap_obj, "md5_sum", deepcopy(getattr(pvc_obj, "md5_sum", {})))
            snap_obj.usage_on_mount = get_used_space_on_mount_point(
                pvc_obj.get_attached_pods()[0]
            )
            snapshots.append(snap_obj)
            log.info(f"Created snapshot of PVC {pvc_obj.name} at {usage}%")
        log.info(f"Created snapshot of all PVCs at {usage}%")
    log.info("Snapshots creation completed.")

    # Verify snapshots are ready
    log.info("Verify snapshots are ready")
    for snapshot in snapshots:
        snapshot.ocp.wait_for_resource(
            condition="true",
            resource_name=snapshot.name,
            column=constants.STATUS_READYTOUSE,
            timeout=90,
        )

    # Delete pods
    log.info("Deleting the pods")
    for pod_obj in self.pods:
        pod_obj.delete()
        pod_obj.ocp.wait_for_delete(resource_name=pod_obj.name)
    log.info("Deleted all the pods")

    # Delete parent PVCs (snapshots must remain usable without their parent)
    log.info("Deleting parent PVCs")
    for pvc_obj in self.pvcs:
        pv_obj = pvc_obj.backed_pv_obj
        pvc_obj.delete()
        pvc_obj.ocp.wait_for_delete(resource_name=pvc_obj.name)
        log.info(
            f"Deleted PVC {pvc_obj.name}. Verifying whether PV "
            f"{pv_obj.name} is deleted."
        )
        pv_obj.ocp.wait_for_delete(resource_name=pv_obj.name)
    log.info(
        "Deleted parent PVCs before restoring snapshot. "
        "PVs are also deleted."
    )

    restore_pvc_objs = []

    # Create PVCs out of the snapshots
    log.info("Creating new PVCs from snapshots")
    for snapshot in snapshots:
        log.info(f"Creating a PVC from snapshot {snapshot.name}")
        restore_pvc_obj = snapshot_restore_factory(
            snapshot_obj=snapshot,
            size=f"{self.pvc_size}Gi",
            volume_mode=snapshot.parent_volume_mode,
            access_mode=snapshot.parent_access_mode,
            status="",
        )
        log.info(
            f"Created PVC {restore_pvc_obj.name} from snapshot "
            f"{snapshot.name}"
        )
        restore_pvc_objs.append(restore_pvc_obj)
    log.info("Created new PVCs from all the snapshots")

    # Confirm that the restored PVCs are Bound
    # Increased wait time to 600 seconds as a workaround for BZ 1899968
    # TODO: Revert wait time to 200 seconds once BZ 1899968 is fixed
    log.info("Verify the restored PVCs are Bound")
    for pvc_obj in restore_pvc_objs:
        wait_for_resource_state(
            resource=pvc_obj, state=constants.STATUS_BOUND, timeout=600
        )
        pvc_obj.reload()
    log.info("Verified: Restored PVCs are Bound.")

    snapcontent_objs = []
    # Get VolumeSnapshotContent form VolumeSnapshots and delete
    # VolumeSnapshots
    log.info("Deleting snapshots")
    for snapshot in snapshots:
        snapcontent_objs.append(get_snapshot_content_obj(snap_obj=snapshot))
        snapshot.delete()

    # Verify volume snapshots are deleted
    log.info("Verify snapshots are deleted")
    for snapshot in snapshots:
        snapshot.ocp.wait_for_delete(resource_name=snapshot.name)
    log.info("Verified: Snapshots are deleted")

    # Verify VolumeSnapshotContents are deleted
    for snapcontent_obj in snapcontent_objs:
        snapcontent_obj.ocp.wait_for_delete(
            resource_name=snapcontent_obj.name, timeout=180
        )

    # Attach the restored PVCs to pods
    log.info("Attach the restored PVCs to pods")
    restore_pod_objs = []
    for restore_pvc_obj in restore_pvc_objs:
        # Pick interface from the parent storage class name of the snapshot
        interface = (
            constants.CEPHFILESYSTEM
            if (constants.CEPHFS_INTERFACE in restore_pvc_obj.snapshot.parent_sc)
            else constants.CEPHBLOCKPOOL
        )
        restore_pod_obj = pod_factory(
            interface=interface, pvc=restore_pvc_obj, status=""
        )
        log.info(
            f"Attached the PVC {restore_pvc_obj.name} to pod "
            f"{restore_pod_obj.name}"
        )
        restore_pod_objs.append(restore_pod_obj)

    # Verify the new pods are running
    log.info("Verify the new pods are running")
    for pod_obj in restore_pod_objs:
        # IBM Cloud needs a longer timeout for pods to reach Running
        timeout = (
            300
            if config.ENV_DATA["platform"] == constants.IBMCLOUD_PLATFORM
            else 60
        )
        wait_for_resource_state(pod_obj, constants.STATUS_RUNNING, timeout)
    log.info("Verified: New pods are running")

    # Verify md5sum of files
    log.info("Verifying md5sum of files on all the pods")
    for restore_pod_obj in restore_pod_objs:
        log.info(
            f"Verifying md5sum of these files on pod "
            f"{restore_pod_obj.name}:"
            f"{restore_pod_obj.pvc.snapshot.md5_sum}"
        )
        for (
            file_name,
            actual_md5_sum,
        ) in restore_pod_obj.pvc.snapshot.md5_sum.items():
            file_path = pod.get_file_path(restore_pod_obj, file_name)
            log.info(
                f"Checking the existence of file {file_name} on pod "
                f"{restore_pod_obj.name}"
            )
            assert pod.check_file_existence(restore_pod_obj, file_path), (
                f"File {file_name} does not exist on pod "
                f"{restore_pod_obj.name}"
            )
            log.info(f"File {file_name} exists on pod {restore_pod_obj.name}")

            # Verify that the md5sum matches
            log.info(
                f"Verifying md5sum of file {file_name} on pod "
                f"{restore_pod_obj.name}"
            )
            pod.verify_data_integrity(restore_pod_obj, file_name, actual_md5_sum)
            log.info(
                f"Verified md5sum of file {file_name} on pod "
                f"{restore_pod_obj.name}"
            )
        log.info(
            f"Verified md5sum of these files on pod "
            f"{restore_pod_obj.name}:"
            f"{restore_pod_obj.pvc.snapshot.md5_sum}"
        )
    log.info("md5sum verified")

    # Verify usage on mount point
    log.info("Verify usage on new pods")
    for pod_obj in restore_pod_objs:
        usage_on_pod = get_used_space_on_mount_point(pod_obj)
        # Usage on the restored volume must match what the parent PVC had
        # when the snapshot was taken
        assert usage_on_pod == pod_obj.pvc.snapshot.usage_on_mount, (
            f"Usage on mount point is not the expected value on pod "
            f"{pod_obj.name}. Usage in percentage {usage_on_pod}. "
            f"Expected usage in percentage "
            f"{pod_obj.pvc.snapshot.usage_on_mount}"
        )
        log.info(
            f"Verified usage on new pod {pod_obj.name}. Usage in "
            f"percentage {usage_on_pod}. Expected usage in percentage "
            f"{pod_obj.pvc.snapshot.usage_on_mount}"
        )
    log.info("Verified usage on new pods")
def test_pvc_rwx_writeable_after_pod_deletions(
    self, pvc_factory, teardown_factory
):
    """
    Test assign nodeName to a pod using RWX pvc

    1. Create a new project.
    2. Create a RWX CEPHFS based PVC
    3. Attach the same PVC to multiple PODs and start IO on all the PODs
    4. Delete all but one pod.
    5. Verify mount point is still write-able.
       - Start IO again on the Running pod.
    6. Also, access the data written by deleted pods from the Running pod
    """
    worker_nodes_list = helpers.get_worker_nodes()

    # Create a RWX PVC
    pvc_obj = pvc_factory(
        interface=constants.CEPHFILESYSTEM,
        access_mode=constants.ACCESS_MODE_RWX,
        size=10, status=constants.STATUS_BOUND
    )
    logger.info(
        f"Creating pods on all worker nodes backed"
        f"with same pvc {pvc_obj.name}"
    )
    pod_list = []

    # One pod per worker node, all sharing the single RWX PVC
    for each_node in worker_nodes_list:
        pod_obj = helpers.create_pod(
            interface_type=constants.CEPHFILESYSTEM, pvc_name=pvc_obj.name,
            namespace=pvc_obj.namespace, node_name=each_node,
            pod_dict_path=constants.NGINX_POD_YAML
        )
        pod_list.append(pod_obj)
        teardown_factory(pod_obj)

    # Confirm pods are created and are running on designated nodes
    node_count = 0
    for pod_obj in pod_list:
        helpers.wait_for_resource_state(
            resource=pod_obj, state=constants.STATUS_RUNNING,
            timeout=120
        )
        pod_obj.reload()
        assert pod.verify_node_name(pod_obj, worker_nodes_list[node_count]), (
            f'Pod {pod_obj.name} is running on a different node '
            f'than the selected node'
        )
        node_count = node_count + 1

    # Run IOs on all pods. FIO Filename is kept same as pod name
    with ThreadPoolExecutor() as p:
        for pod_obj in pod_list:
            logger.info(f"Running IO on pod {pod_obj.name}")
            p.submit(
                pod_obj.run_io, storage_type='fs', size='512M',
                runtime=30, fio_filename=pod_obj.name
            )

    # Check IO from all pods
    for pod_obj in pod_list:
        pod.get_fio_rw_iops(pod_obj)

    # Calculate md5sum of each file (indexed the same as pod_list)
    md5sum_pod_data = []
    for pod_obj in pod_list:
        md5sum_pod_data.append(pod.cal_md5sum(
            pod_obj=pod_obj, file_name=pod_obj.name
        ))

    # Delete all but the last app pod.
    for index in range(node_count - 1):
        pod_list[index].delete()
        pod_list[index].ocp.wait_for_delete(
            resource_name=pod_list[index].name
        )

    # Verify presence of files written by each pod
    logger.info(
        f"Verify existence of each file from app pod "
        f"{pod_list[-1].name} "
    )
    for pod_obj in pod_list:
        file_path = pod.get_file_path(pod_list[-1], pod_obj.name)
        assert pod.check_file_existence(pod_list[-1], file_path), (
            f"File {pod_obj.name} doesnt exist"
        )
        logger.info(
            f"File {pod_obj.name} exists in {pod_list[-1].name}"
        )

    # From surviving pod, verify data integrity of files
    # written by deleted pods
    logger.info(f"verify all data from {pod_list[-1].name}")

    for index, pod_obj in enumerate(pod_list):
        assert pod.verify_data_integrity(
            pod_obj=pod_list[-1],
            file_name=pod_obj.name,
            original_md5sum=md5sum_pod_data[index]
        )

    # From surviving pod, confirm mount point is still write-able
    logger.info(f"Re-running IO on pod {pod_list[-1].name}")
    fio_new_file = f"{pod_list[-1].name}-new-file"
    pod_list[-1].run_io(
        storage_type='fs', size='512M', runtime=30,
        fio_filename=fio_new_file
    )
    pod.get_fio_rw_iops(pod_list[-1])
    file_path = pod.get_file_path(pod_list[-1], fio_new_file)
    assert pod.check_file_existence(pod_list[-1], file_path), (
        f"File {fio_new_file} doesnt exist"
    )
    logger.info(
        f"File {fio_new_file} exists in {pod_list[-1].name} "
    )
def test_rwx_dynamic_pvc(self, setup_base):
    """
    RWX Dynamic PVC creation tests with Reclaim policy set to Delete/Retain

    Flow (RWX semantics): two pods on different nodes attach the same PVC
    simultaneously, both run IO, each verifies the other's data, and each
    can rename (mutate) files written by the other pod.
    """
    # NOTE: f-prefixes removed from literals without placeholders (lint
    # fix, Ruff F541); the logged strings are unchanged.
    logger.info("CephFS RWX test")
    logger.info(
        f"Creating second pod on node: {self.worker_nodes_list[1]} "
        f"with pvc {self.pvc_obj.name}")
    pod_obj2 = helpers.create_pod(interface_type=self.interface_type,
                                  pvc_name=self.pvc_obj.name,
                                  namespace=self.namespace,
                                  node_name=self.worker_nodes_list[1],
                                  pod_dict_path=constants.NGINX_POD_YAML)
    helpers.wait_for_resource_state(pod_obj2, constants.STATUS_RUNNING)
    pod_obj2.reload()

    # RWX must allow simultaneous attach from two different nodes
    node_pod1 = self.pod_obj1.get().get('spec').get('nodeName')
    node_pod2 = pod_obj2.get().get('spec').get('nodeName')
    assert node_pod1 != node_pod2, 'Both pods are on the same node'

    # Run IO on both the pods
    logger.info(f"Running IO on pod {self.pod_obj1.name}")
    file_name1 = self.pod_obj1.name
    logger.info(file_name1)
    self.pod_obj1.run_io(storage_type=self.storage_type,
                         size=self.io_size, runtime=30,
                         fio_filename=file_name1)

    logger.info(f"Running IO on pod {pod_obj2.name}")
    file_name2 = pod_obj2.name
    pod_obj2.run_io(storage_type=self.storage_type,
                    size=self.io_size, runtime=30,
                    fio_filename=file_name2)

    # Check IO and calculate md5sum of files
    pod.get_fio_rw_iops(self.pod_obj1)
    md5sum_pod1_data = pod.cal_md5sum(pod_obj=self.pod_obj1,
                                      file_name=file_name1)

    pod.get_fio_rw_iops(pod_obj2)
    md5sum_pod2_data = pod.cal_md5sum(pod_obj=pod_obj2,
                                      file_name=file_name2)

    # Each pod must see the other pod's file with matching md5sum
    logger.info("verify data from alternate pods")

    assert pod.verify_data_integrity(pod_obj=pod_obj2,
                                     file_name=file_name1,
                                     original_md5sum=md5sum_pod1_data)

    assert pod.verify_data_integrity(pod_obj=self.pod_obj1,
                                     file_name=file_name2,
                                     original_md5sum=md5sum_pod2_data)

    # Verify that data is mutable from any pod
    logger.info("Perform modification of files from alternate pod")
    # Access and rename file written by pod-2 from pod-1
    file_path2 = pod.get_file_path(pod_obj2, file_name2)
    logger.info(file_path2)
    self.pod_obj1.exec_cmd_on_pod(
        command=f"bash -c \"mv {file_path2} {file_path2}-renamed\"",
        out_yaml_format=False)

    # Access and rename file written by pod-1 from pod-2
    file_path1 = pod.get_file_path(self.pod_obj1, file_name1)
    logger.info(file_path1)
    pod_obj2.exec_cmd_on_pod(
        command=f"bash -c \"mv {file_path1} {file_path1}-renamed\"",
        out_yaml_format=False)

    logger.info("Verify presence of renamed files from both pods")
    file_names = [f"{file_path1}-renamed", f"{file_path2}-renamed"]
    for file in file_names:
        assert pod.check_file_existence(
            self.pod_obj1, file), (f"File {file} doesn't exist")
        logger.info(f"File {file} exists in {self.pod_obj1.name} ")
        assert pod.check_file_existence(
            pod_obj2, file), (f"File {file} doesn't exist")
        logger.info(f"File {file} exists in {pod_obj2.name}")
def test_rbd_block_pvc_snapshot(self, snapshot_factory,
                                snapshot_restore_factory, pod_factory):
    """
    Test to take snapshots of RBD Block VolumeMode PVCs

    Flow: run IO on block-mode pods, snapshot the PVCs, delete the pods and
    parent PVCs (proving snapshot independence), restore the snapshots into
    new PVCs, attach them to pods and verify md5sum and writability.
    """
    # Run IO
    log.info("Find initial md5sum value and run IO on all pods")
    for pod_obj in self.pod_objs:
        # Find initial md5sum (baseline to prove IO actually changed data)
        pod_obj.md5sum_before_io = cal_md5sum(
            pod_obj=pod_obj,
            file_name=pod_obj.get_storage_path(storage_type="block"),
            block=True,
        )
        pod_obj.run_io(
            storage_type="block",
            size=f"{self.pvc_size - 1}G",
            io_direction="write",
            runtime=60,
        )
    log.info("IO started on all pods")

    # Wait for IO completion
    for pod_obj in self.pod_objs:
        pod_obj.get_fio_results()
    log.info("IO completed on all pods")

    snap_objs = []

    # Verify md5sum has changed after IO. Create snapshot
    log.info("Verify md5sum has changed after IO and create snapshot from "
             "all PVCs")
    for pod_obj in self.pod_objs:
        md5sum_after_io = cal_md5sum(
            pod_obj=pod_obj,
            file_name=pod_obj.get_storage_path(storage_type="block"),
            block=True,
        )
        assert (pod_obj.md5sum_before_io != md5sum_after_io
                ), f"md5sum has not changed after IO on pod {pod_obj.name}"
        log.info(f"Creating snapshot of PVC {pod_obj.pvc.name}")
        snap_obj = snapshot_factory(pod_obj.pvc, wait=False)
        # Remember the post-IO checksum on the snapshot for later comparison
        snap_obj.md5sum = md5sum_after_io
        snap_objs.append(snap_obj)
    log.info("Snapshots created")

    # Verify snapshots are ready
    log.info("Verify snapshots are ready")
    for snap_obj in snap_objs:
        snap_obj.ocp.wait_for_resource(
            condition="true",
            resource_name=snap_obj.name,
            column=constants.STATUS_READYTOUSE,
            timeout=180,
        )

    # Delete pods
    log.info("Deleting the pods")
    for pod_obj in self.pod_objs:
        pod_obj.delete()
        pod_obj.ocp.wait_for_delete(resource_name=pod_obj.name)
    log.info("Deleted all the pods")

    # Delete parent PVCs to verify snapshot is independent
    log.info("Deleting parent PVCs")
    for pvc_obj in self.pvc_objs:
        pv_obj = pvc_obj.backed_pv_obj
        pvc_obj.delete()
        pvc_obj.ocp.wait_for_delete(resource_name=pvc_obj.name)
        log.info(f"Deleted PVC {pvc_obj.name}. Verifying whether PV "
                 f"{pv_obj.name} is deleted.")
        pv_obj.ocp.wait_for_delete(resource_name=pv_obj.name)
    log.info("Deleted parent PVCs before restoring snapshot. "
             "PVs are also deleted.")

    restore_pvc_objs = []

    # Create PVCs out of the snapshots
    log.info("Creating new PVCs from snapshots")
    for snap_obj in snap_objs:
        log.info(f"Creating a PVC from snapshot {snap_obj.name}")
        restore_pvc_obj = snapshot_restore_factory(
            snapshot_obj=snap_obj,
            size=f"{self.pvc_size}Gi",
            volume_mode=snap_obj.parent_volume_mode,
            access_mode=snap_obj.parent_access_mode,
            status="",
        )

        log.info(f"Created PVC {restore_pvc_obj.name} from snapshot "
                 f"{snap_obj.name}")
        # Carry the expected checksum forward onto the restored PVC
        restore_pvc_obj.md5sum = snap_obj.md5sum
        restore_pvc_objs.append(restore_pvc_obj)
    log.info("Created new PVCs from all the snapshots")

    # Confirm that the restored PVCs are Bound
    log.info("Verify the restored PVCs are Bound")
    for pvc_obj in restore_pvc_objs:
        wait_for_resource_state(resource=pvc_obj,
                                state=constants.STATUS_BOUND,
                                timeout=180)
        pvc_obj.reload()
    log.info("Verified: Restored PVCs are Bound.")

    # Attach the restored PVCs to pods. Attach RWX PVC on two pods
    log.info("Attach the restored PVCs to pods")
    restore_pod_objs = create_pods(
        restore_pvc_objs,
        pod_factory,
        constants.CEPHBLOCKPOOL,
        pods_for_rwx=2,
        status="",
    )

    # Verify the new pods are running
    log.info("Verify the new pods are running")
    for pod_obj in restore_pod_objs:
        wait_for_resource_state(pod_obj, constants.STATUS_RUNNING)
    log.info("Verified: New pods are running")

    log.info("Verifying md5sum on new pods")
    for pod_obj in restore_pod_objs:
        log.info(f"Verifying md5sum on pod {pod_obj.name}")
        verify_data_integrity(
            pod_obj=pod_obj,
            file_name=pod_obj.get_storage_path(storage_type="block"),
            original_md5sum=pod_obj.pvc.md5sum,
            block=True,
        )
        log.info(f"Verified md5sum on pod {pod_obj.name}")
    log.info("Verified md5sum on all pods")

    # Run IO on new pods to confirm the restored volumes are writable
    log.info("Starting IO on new pods")
    for pod_obj in restore_pod_objs:
        pod_obj.run_io(storage_type="block", size="500M", runtime=15)

    # Wait for IO completion on new pods
    log.info("Waiting for IO completion on new pods")
    for pod_obj in restore_pod_objs:
        pod_obj.get_fio_results()
    log.info("IO completed on new pods.")
def test_rwo_pvc_fencing_node_prolonged_and_short_network_failure(
        self, nodes, setup, node_restart_teardown):
    """
    OCS-1431/OCS-1436:
    - Start DeploymentConfig based app pods on 1 node
    - Make the node (where app pods are running) unresponsive
      by bringing its main network interface down
    - Disrupt the leader provisioner pods if not running on above
      selected node
    - Check new app pods and/or mon, osd pods scheduled on another node
      are stuck due to Multi-Attach error.
    - Power off the unresponsive node
    - Force delete the app pods and/or mon,osd pods on the unresponsive node
    - Check new app pods and/or mon, osd pods scheduled on another node
      comes into Running state
    - Run IOs on new app pods
    - Again make the node (where app pods are running) unresponsive
      by bringing its main network interface down
    - Check new app pods scheduled on another node are stuck due to
      Multi-Attach error.
    - Reboot the unresponsive node
    - When unresponsive node recovers, run IOs on new app pods
    """
    ceph_cluster, dc_pods, ceph_pods, app_pod_nodes, test_nodes, disruptor = setup
    external_mode = helpers.storagecluster_independent_check()

    # Keep only one spare labeled node so rescheduling lands predictably
    extra_nodes = list(set(test_nodes) - set(app_pod_nodes))
    helpers.remove_label_from_worker_node(node_list=extra_nodes[:-1],
                                          label_key="nodetype")

    # Run IO on pods
    md5sum_data = self.run_and_verify_io(pod_list=dc_pods,
                                         fio_filename="io_file1",
                                         run_io_in_bg=True)

    # Disrupt leader plugin-provisioner pods, skip if running on node to be failed
    if disruptor:
        [disruption.delete_resource() for disruption in disruptor]

    # Induce network failure on the nodes
    node.node_network_failure(app_pod_nodes)
    logger.info(f"Waiting for {self.prolong_nw_fail_time} seconds")
    sleep(self.prolong_nw_fail_time)

    # Wait for pods to be rescheduled
    for pod_obj in dc_pods + ceph_pods:
        pod_obj.ocp.wait_for_resource(
            condition=constants.STATUS_TERMINATING,
            resource_name=pod_obj.name)

    # Fetch info of new pods and verify Multi-Attach error
    new_dc_pods = self.get_new_pods(dc_pods)
    assert len(new_dc_pods) == len(
        dc_pods), "Unexpected number of app pods"
    self.verify_multi_attach_error(new_dc_pods)

    new_ceph_pods = []
    if ceph_pods:
        new_ceph_pods = self.get_new_pods(ceph_pods)
        assert len(new_ceph_pods) > 0, "Unexpected number of osd pods"
        self.verify_multi_attach_error(new_ceph_pods)

    logger.info("Executing manual recovery steps")
    # Power off the unresponsive node
    logger.info(f"Powering off the unresponsive node: {app_pod_nodes}")
    nodes.stop_nodes(node.get_node_objs(app_pod_nodes))

    # Force delete the app pods and/or mon,osd pods on the unresponsive node
    for pod_obj in dc_pods + ceph_pods:
        pod_obj.delete(force=True)

    # Wait for new app pods to reach Running state.
    # BUGFIX: the original code had `wait_for_resource(...), (message)`
    # which built a no-op tuple — the boolean result was discarded and the
    # message could never trigger. An explicit `assert` is now used,
    # consistent with the ceph_cluster.POD.wait_for_resource checks below.
    for pod_obj in new_dc_pods:
        assert pod_obj.ocp.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            resource_name=pod_obj.name,
            timeout=1200,
            sleep=30,
        ), f"App pod with name {pod_obj.name} did not reach Running state"

    if not external_mode:
        # Wait for mon and osd pods to reach Running state
        selectors_to_check = {
            constants.MON_APP_LABEL: self.expected_mon_count,
            constants.OSD_APP_LABEL: ceph_cluster.osd_count,
        }
        for selector, count in selectors_to_check.items():
            assert ceph_cluster.POD.wait_for_resource(
                condition=constants.STATUS_RUNNING,
                selector=selector,
                resource_count=count,
                timeout=1800,
                sleep=60,
            ), f"{count} expected pods with selector {selector} are not in Running state"

        if ceph_cluster.mon_count == self.expected_mon_count:
            # Check ceph health; recreate toolbox first if it is stuck
            # Terminating on the failed node
            toolbox_status = ceph_cluster.POD.get_resource_status(
                ceph_cluster.toolbox.name)
            if toolbox_status == constants.STATUS_TERMINATING:
                ceph_cluster.toolbox.delete(force=True)

            assert ceph_health_check(), "Ceph cluster health is not OK"
            logger.info("Ceph cluster health is OK")

    # Verify data integrity from new pods
    for num, pod_obj in enumerate(new_dc_pods):
        pod.verify_data_integrity(pod_obj=pod_obj,
                                  file_name="io_file1",
                                  original_md5sum=md5sum_data[num])

    # Run IO on new pods
    md5sum_data2 = self.run_and_verify_io(pod_list=new_dc_pods,
                                          fio_filename="io_file2",
                                          run_io_in_bg=True)

    # Re-label spare nodes so the short-failure scenario can reschedule
    helpers.label_worker_node(node_list=extra_nodes[:-1],
                              label_key="nodetype",
                              label_value="app-pod")

    # Induce network failure on the node
    node.node_network_failure(extra_nodes[-1])
    logger.info(f"Waiting for {self.short_nw_fail_time} seconds")
    sleep(self.short_nw_fail_time)

    # Wait for pods to be rescheduled
    for pod_obj in new_dc_pods:
        pod_obj.ocp.wait_for_resource(
            condition=constants.STATUS_TERMINATING,
            resource_name=pod_obj.name,
            timeout=600,
            sleep=30,
        )

    # Fetch info of new pods and verify Multi-Attach error
    new_dc_pods2 = self.get_new_pods(new_dc_pods)
    assert len(new_dc_pods2) == len(
        new_dc_pods), "Unexpected number of app pods"
    self.verify_multi_attach_error(new_dc_pods2)

    # Reboot the unresponsive node
    logger.info(f"Rebooting the unresponsive node: {extra_nodes[-1]}")
    nodes.restart_nodes_by_stop_and_start(
        node.get_node_objs([extra_nodes[-1]]))
    node.wait_for_nodes_status(node_names=[extra_nodes[-1]],
                               status=constants.NODE_READY)

    # Wait for new app pods to reach Running state (same BUGFIX as above:
    # explicit assert instead of a discarded tuple)
    for pod_obj in new_dc_pods2:
        assert pod_obj.ocp.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            resource_name=pod_obj.name,
            timeout=1200,
            sleep=30,
        ), f"App pod with name {pod_obj.name} did not reach Running state"

    if not external_mode:
        # Wait for mon and osd pods to reach Running state
        for selector, count in selectors_to_check.items():
            assert ceph_cluster.POD.wait_for_resource(
                condition=constants.STATUS_RUNNING,
                selector=selector,
                resource_count=count,
                timeout=1800,
                sleep=60,
            ), f"{count} expected pods with selector {selector} are not in Running state"

        if ceph_cluster.mon_count == 3:
            # Check ceph health
            assert ceph_health_check(), "Ceph cluster health is not OK"
            logger.info("Ceph cluster health is OK")

    # Verify data integrity from new pods: files from both IO rounds must
    # survive the second failover
    for num, pod_obj in enumerate(new_dc_pods2):
        pod.verify_data_integrity(pod_obj=pod_obj,
                                  file_name="io_file2",
                                  original_md5sum=md5sum_data2[num])

    for num, pod_obj in enumerate(new_dc_pods2):
        pod.verify_data_integrity(pod_obj=pod_obj,
                                  file_name="io_file1",
                                  original_md5sum=md5sum_data[num])

    # Run IO on new pods
    self.run_and_verify_io(pod_list=new_dc_pods2,
                           fio_filename="io_file3",
                           return_md5sum=False)
def test_rwo_pvc_fencing_node_short_network_failure(
        self, nodes, setup, node_restart_teardown):
    """
    OCS-1423/OCS-1428/OCS-1426:
    - Start DeploymentConfig based app pods on 1 OCS/Non-OCS node
    - Make the node (where app pods are running) unresponsive
      by bringing its main network interface down
    - Check new app pods and/or mon, osd pods scheduled on another node
      are stuck due to Multi-Attach error.
    - Reboot the unresponsive node
    - When unresponsive node recovers, run IOs on new app pods

    OCS-1424/OCS-1434:
    - Start DeploymentConfig based app pods on multiple node
      Colocated scenario: Select 1 node where osd and/or mon is running,
      select other 2 nodes where mon/osd are not running
      Dedicated scenario: 3 Non-OCS nodes
    - Disrupt the leader provisioner pods if not running on above
      selected nodes
    - Make the nodes (where app pods are running) unresponsive
      by bringing their main network interface down
    - Check new app pods and/or mon, osd pods scheduled on another node
      and are stuck due to Multi-Attach error.
    - Reboot the unresponsive nodes
    - When unresponsive nodes recover, run IOs on new app pods
    """
    ceph_cluster, dc_pods, ceph_pods, app_pod_nodes, test_nodes, disruptor = setup

    # Run IO on pods
    md5sum_data = self.run_and_verify_io(pod_list=dc_pods,
                                         fio_filename="io_file1",
                                         run_io_in_bg=True)

    # OCS-1424/OCS-1434
    # Disrupt leader plugin-provisioner pods, skip if running on node to be failed
    if disruptor:
        [disruption.delete_resource() for disruption in disruptor]

    # Induce network failure on the nodes
    node.node_network_failure(app_pod_nodes)
    logger.info(f"Waiting for {self.short_nw_fail_time} seconds")
    sleep(self.short_nw_fail_time)

    # Wait for pods to be rescheduled
    for pod_obj in dc_pods + ceph_pods:
        pod_obj.ocp.wait_for_resource(
            condition=constants.STATUS_TERMINATING,
            resource_name=pod_obj.name,
            timeout=600,
            sleep=30,
        )

    # Fetch info of new pods and verify Multi-Attach error
    new_dc_pods = self.get_new_pods(dc_pods)
    assert len(new_dc_pods) == len(
        dc_pods), "Unexpected number of app pods"
    self.verify_multi_attach_error(new_dc_pods)

    if ceph_pods:
        new_ceph_pods = self.get_new_pods(ceph_pods)
        assert len(new_ceph_pods) > 0, "Unexpected number of osd pods"
        self.verify_multi_attach_error(new_ceph_pods)

    # Reboot the unresponsive node(s)
    logger.info(f"Rebooting the unresponsive node(s): {app_pod_nodes}")
    nodes.restart_nodes_by_stop_and_start(
        node.get_node_objs(app_pod_nodes))
    node.wait_for_nodes_status(node_names=app_pod_nodes,
                               status=constants.NODE_READY)

    # Wait for new app pods to reach Running state.
    # BUGFIX: the original code had `wait_for_resource(...), (message)`
    # which built a no-op tuple — the boolean result was discarded and the
    # message could never trigger. An explicit `assert` is now used,
    # consistent with the ceph_cluster.POD.wait_for_resource checks below.
    for pod_obj in new_dc_pods:
        assert pod_obj.ocp.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            resource_name=pod_obj.name,
            timeout=1200,
            sleep=30,
        ), f"App pod with name {pod_obj.name} did not reach Running state"

    if not helpers.storagecluster_independent_check():
        # Wait for mon and osd pods to reach Running state
        selectors_to_check = {
            constants.MON_APP_LABEL: ceph_cluster.mon_count,
            constants.OSD_APP_LABEL: ceph_cluster.osd_count,
        }
        for selector, count in selectors_to_check.items():
            assert ceph_cluster.POD.wait_for_resource(
                condition=constants.STATUS_RUNNING,
                selector=selector,
                resource_count=count,
                timeout=1800,
                sleep=60,
            ), f"{count} expected pods with selector {selector} are not in Running state"

        assert ceph_health_check(), "Ceph cluster health is not OK"
        logger.info("Ceph cluster health is OK")

    # Verify data integrity from new pods
    for num, pod_obj in enumerate(new_dc_pods):
        assert pod.verify_data_integrity(pod_obj=pod_obj,
                                         file_name="io_file1",
                                         original_md5sum=md5sum_data[num]
                                         ), "Data integrity check failed"

    # Run IO on new pods
    self.run_and_verify_io(pod_list=new_dc_pods,
                           fio_filename="io_file2",
                           return_md5sum=False)
def test_pvc_to_pvc_clone(self, kv_version, kms_provider, pod_factory):
    """
    Test to create a clone from an existing encrypted RBD PVC.
    Verify that the cloned PVC is encrypted and all the data is preserved.

    Flow: confirm the parent volumes are encrypted (crypt device visible in
    lsblk), run IO, clone each PVC, verify the clones are encrypted, have
    Vault keys and matching md5sums, then delete everything and verify the
    Vault keys are cleaned up.
    """
    log.info("Checking for encrypted device and running IO on all pods")
    for vol_handle, pod_obj in zip(self.vol_handles, self.pod_objs):
        # An encrypted volume shows a "crypt" device for its volume handle
        if pod_obj.exec_sh_cmd_on_pod(
                command=f"lsblk | grep {vol_handle} | grep crypt"):
            log.info(f"Encrypted device found in {pod_obj.name}")
        else:
            raise ResourceNotFoundError(
                f"Encrypted device not found in {pod_obj.name}")

        log.info(f"File created during IO {pod_obj.name}")
        pod_obj.run_io(
            storage_type="block",
            size="500M",
            io_direction="write",
            runtime=60,
            end_fsync=1,
            direct=1,
        )
    log.info("IO started on all pods")

    # Wait for IO completion
    for pod_obj in self.pod_objs:
        pod_obj.get_fio_results()
    log.info("IO completed on all pods")

    cloned_pvc_objs, cloned_vol_handles = ([] for i in range(2))

    # Calculate the md5sum value and create clones of existing PVCs
    log.info("Calculate the md5sum after IO and create clone of all PVCs")
    for pod_obj in self.pod_objs:
        pod_obj.md5sum_after_io = pod.cal_md5sum(
            pod_obj=pod_obj,
            file_name=pod_obj.get_storage_path(storage_type="block"),
            block=True,
        )

        cloned_pvc_obj = pvc.create_pvc_clone(
            self.sc_obj.name,
            pod_obj.pvc.name,
            constants.CSI_RBD_PVC_CLONE_YAML,
            self.proj_obj.namespace,
            volume_mode=constants.VOLUME_MODE_BLOCK,
            access_mode=pod_obj.pvc.access_mode,
        )
        helpers.wait_for_resource_state(cloned_pvc_obj, constants.STATUS_BOUND)
        cloned_pvc_obj.reload()
        # Carry the expected checksum onto the clone for later verification
        cloned_pvc_obj.md5sum = pod_obj.md5sum_after_io
        cloned_pvc_objs.append(cloned_pvc_obj)
    log.info("Clone of all PVCs created")

    # Create and attach pod to the pvc
    cloned_pod_objs = helpers.create_pods(
        cloned_pvc_objs,
        pod_factory,
        constants.CEPHBLOCKPOOL,
        pods_for_rwx=1,
        status="",
    )

    # Verify the new pods are running
    log.info("Verify the new pods are running")
    for pod_obj in cloned_pod_objs:
        helpers.wait_for_resource_state(pod_obj, constants.STATUS_RUNNING)
        pod_obj.reload()
    log.info("Verified: New pods are running")

    # Verify encryption keys are created for cloned PVCs in Vault
    for pvc_obj in cloned_pvc_objs:
        pv_obj = pvc_obj.backed_pv_obj
        vol_handle = pv_obj.get().get("spec").get("csi").get(
            "volumeHandle")
        cloned_vol_handles.append(vol_handle)

        if kms_provider == constants.VAULT_KMS_PROVIDER:
            if kms.is_key_present_in_path(
                    key=vol_handle, path=self.kms.vault_backend_path):
                log.info(
                    f"Vault: Found key for restore PVC {pvc_obj.name}")
            else:
                raise ResourceNotFoundError(
                    f"Vault: Key not found for restored PVC {pvc_obj.name}"
                )

    # Verify encrypted device is present and md5sum on all pods
    for vol_handle, pod_obj in zip(cloned_vol_handles, cloned_pod_objs):
        if pod_obj.exec_sh_cmd_on_pod(
                command=f"lsblk | grep {vol_handle} | grep crypt"):
            log.info(f"Encrypted device found in {pod_obj.name}")
        else:
            raise ResourceNotFoundError(
                f"Encrypted device not found in {pod_obj.name}")

        log.info(f"Verifying md5sum on pod {pod_obj.name}")
        pod.verify_data_integrity(
            pod_obj=pod_obj,
            file_name=pod_obj.get_storage_path(storage_type="block"),
            original_md5sum=pod_obj.pvc.md5sum,
            block=True,
        )
        log.info(f"Verified md5sum on pod {pod_obj.name}")

    # Run IO on new pods to confirm the cloned volumes are writable
    log.info("Starting IO on new pods")
    for pod_obj in cloned_pod_objs:
        pod_obj.run_io(storage_type="block", size="100M", runtime=10)

    # Wait for IO completion on new pods
    log.info("Waiting for IO completion on new pods")
    for pod_obj in cloned_pod_objs:
        pod_obj.get_fio_results()
    log.info("IO completed on new pods.")

    # Delete the restored pods, PVC and snapshots
    log.info("Deleting all pods")
    for pod_obj in cloned_pod_objs + self.pod_objs:
        pod_obj.delete()
        pod_obj.ocp.wait_for_delete(resource_name=pod_obj.name)

    log.info("Deleting all PVCs")
    for pvc_obj in cloned_pvc_objs + self.pvc_objs:
        # Capture the backing PV before deleting the PVC so its removal
        # can be awaited
        pv_obj = pvc_obj.backed_pv_obj
        pvc_obj.delete()
        pv_obj.ocp.wait_for_delete(resource_name=pv_obj.name)

    if kms_provider == constants.VAULT_KMS_PROVIDER:
        # Verify if the keys for parent and cloned PVCs are deleted from Vault
        # (key deletion on PVC removal is expected for KV v1, or for any KV
        # version from OCS 4.9 onward)
        if kv_version == "v1" or Version.coerce(
                config.ENV_DATA["ocs_version"]) >= Version.coerce("4.9"):
            log.info(
                "Verify whether the keys for cloned PVCs are deleted from vault"
            )
            for key in cloned_vol_handles + self.vol_handles:
                if not kms.is_key_present_in_path(
                        key=key, path=self.kms.vault_backend_path):
                    log.info(f"Vault: Key deleted for {key}")
                else:
                    raise KMSResourceCleaneupError(
                        f"Vault: Key deletion failed for {key}")
            log.info("All keys from vault were deleted")
def test_snapshot_restore_with_different_access_mode(
        self, pod_factory, snapshot_factory, snapshot_restore_factory):
    """
    Restore snapshot with an access mode different than parent PVC

    Flow:
        1. Run fio on all pods (self.pods) and record md5sum of the file.
        2. Snapshot every PVC (self.pvcs) and wait for ReadyToUse.
        3. Restore each snapshot once per access mode allowed for its
           interface/volume-mode combination (see access_modes_dict).
        4. Attach pods to the restored PVCs (2 pods for RWX, 1 otherwise)
           and verify data integrity via md5sum.
    """
    file_name = "fio_test"
    # Access modes to restore with, keyed by interface and then by the
    # parent PVC's volume mode.
    access_modes_dict = {
        constants.CEPHBLOCKPOOL: {
            constants.VOLUME_MODE_FILESYSTEM: [constants.ACCESS_MODE_RWO],
            constants.VOLUME_MODE_BLOCK: [
                constants.ACCESS_MODE_RWX,
                constants.ACCESS_MODE_RWO,
            ],
        },
        constants.CEPHFILESYSTEM: {
            constants.VOLUME_MODE_FILESYSTEM: [
                constants.ACCESS_MODE_RWX,
                constants.ACCESS_MODE_RWO,
            ]
        },
    }

    # Start IO
    log.info("Starting IO on all pods")
    for pod_obj in self.pods:
        # Block-mode PVCs are exercised through the raw device path
        storage_type = ("block" if pod_obj.pvc.volume_mode
                        == constants.VOLUME_MODE_BLOCK else "fs")
        pod_obj.run_io(
            storage_type=storage_type,
            size="1G",
            runtime=20,
            fio_filename=file_name,
            end_fsync=1,
        )
        log.info(f"IO started on pod {pod_obj.name}")
    log.info("Started IO on all pods")

    # Wait for IO to finish
    log.info("Wait for IO to finish on pods")
    for pod_obj in self.pods:
        pod_obj.get_fio_results()
        log.info(f"IO finished on pod {pod_obj.name}")
        # Calculate md5sum to compare after restoring. For block mode the
        # checksum is taken over the raw device path.
        file_name_pod = (file_name if
                         (pod_obj.pvc.volume_mode
                          == constants.VOLUME_MODE_FILESYSTEM) else
                         pod_obj.get_storage_path(storage_type="block"))
        pod_obj.pvc.md5sum = pod.cal_md5sum(
            pod_obj,
            file_name_pod,
            pod_obj.pvc.volume_mode == constants.VOLUME_MODE_BLOCK,
        )
    log.info("IO finished on all pods")

    # Create snapshots
    log.info("Creating snapshot of the PVCs")
    snap_objs = []
    for pvc_obj in self.pvcs:
        log.info(f"Creating snapshot of PVC {pvc_obj.name}")
        snap_obj = snapshot_factory(pvc_obj, wait=False)
        # Carry md5sum and interface on the snapshot object so the
        # restored PVC can inherit them for later verification
        snap_obj.md5sum = pvc_obj.md5sum
        snap_obj.interface = pvc_obj.interface
        snap_objs.append(snap_obj)
        log.info(f"Created snapshot of PVC {pvc_obj.name}")
    log.info(
        "Snapshots are created. Wait for the snapshots to be in Ready state"
    )
    for snap_obj in snap_objs:
        snap_obj.ocp.wait_for_resource(
            condition="true",
            resource_name=snap_obj.name,
            column=constants.STATUS_READYTOUSE,
            timeout=180,
        )
        snap_obj.reload()
    log.info("Snapshots are Ready")

    # Restore snapshots, once per applicable access mode
    log.info("Restoring snapshots to create new PVCs")
    restore_pvcs = []
    for snap_obj in snap_objs:
        access_modes = access_modes_dict[snap_obj.interface][
            snap_obj.parent_volume_mode]
        for access_mode in access_modes:
            restore_obj = snapshot_restore_factory(
                snapshot_obj=snap_obj,
                volume_mode=snap_obj.parent_volume_mode,
                access_mode=access_mode,
                status="",
            )
            restore_obj.interface = snap_obj.interface
            restore_obj.md5sum = snap_obj.md5sum
            log.info(
                f"Created PVC {restore_obj.name} with accessMode "
                f"{access_mode} from snapshot {snap_obj.name}. "
                f"Parent PVC accessMode: {snap_obj.parent_access_mode}")
            restore_pvcs.append(restore_obj)
    log.info(
        "Restored all the snapshots to create PVCs with different access modes"
    )

    log.info("Verifying restored PVCs are Bound")
    for pvc_obj in restore_pvcs:
        helpers.wait_for_resource_state(resource=pvc_obj,
                                        state=constants.STATUS_BOUND,
                                        timeout=200)
        pvc_obj.reload()
    log.info("Verified: Restored PVCs are Bound")

    # Verify restored PVC volume mode matches the parent's
    for pvc_obj in restore_pvcs:
        assert (pvc_obj.data["spec"]["volumeMode"] ==
                pvc_obj.snapshot.parent_volume_mode
                ), f"Volume mode mismatch in PVC {pvc_obj.name}"

    # Get worker node names and create an iterator to spread pods
    # round-robin across workers
    nodes_iter = cycle(node.get_worker_nodes())

    # Attach the restored PVCs to pods
    log.info("Attach the restored PVCs to pods")
    restore_pod_objs = []
    for pvc_obj in restore_pvcs:
        if pvc_obj.data["spec"]["volumeMode"] == "Block":
            pod_dict_path = constants.CSI_RBD_RAW_BLOCK_POD_YAML
        else:
            pod_dict_path = ""
        # Create 2 pods if access mode is RWX, else 1.
        # range starts at 0 for RWX (two iterations) and at 1 otherwise
        # (one iteration). NOTE(review): assumes get_pvc_access_mode is a
        # property, not a bound method — confirm, else the comparison is
        # always True and only one pod is ever created.
        for _ in range(
                int(pvc_obj.get_pvc_access_mode != constants.ACCESS_MODE_RWX),
                2):
            restore_pod_obj = pod_factory(
                interface=pvc_obj.interface,
                pvc=pvc_obj,
                status="",
                node_name=next(nodes_iter),
                pod_dict_path=pod_dict_path,
                raw_block_pv=pvc_obj.data["spec"]["volumeMode"] == "Block",
            )
            log.info(f"Attaching the PVC {pvc_obj.name} to pod "
                     f"{restore_pod_obj.name}")
            restore_pod_objs.append(restore_pod_obj)

    # Verify the new pods are running
    log.info("Verify the new pods are running")
    for pod_obj in restore_pod_objs:
        helpers.wait_for_resource_state(pod_obj, constants.STATUS_RUNNING)
    log.info("Verified: New pods are running")

    # Verify md5sum against the value recorded before the snapshot
    for pod_obj in restore_pod_objs:
        file_name_pod = (file_name if
                         (pod_obj.pvc.data["spec"]["volumeMode"]
                          == constants.VOLUME_MODE_FILESYSTEM) else
                         pod_obj.get_storage_path(storage_type="block"))
        pod.verify_data_integrity(
            pod_obj,
            file_name_pod,
            pod_obj.pvc.md5sum,
            pod_obj.pvc.data["spec"]["volumeMode"] ==
            constants.VOLUME_MODE_BLOCK,
        )
        log.info(
            f"Verified: md5sum of {file_name_pod} on pod {pod_obj.name} "
            "matches the original md5sum")
    log.info("Data integrity check passed on all pods")
def test_rwx_dynamic_pvc(self, interface_type, reclaim_policy, setup,
                         pvc_factory, pod_factory):
    """
    RWX Dynamic PVC creation tests with Reclaim policy set to Retain/Delete

    Flow:
        1. Create an RWX PVC and attach it to two pods on different nodes.
        2. Run fio from both pods and cross-verify md5sums from the
           alternate pod.
        3. Rename each pod's file from the other pod and verify both
           renamed files are visible from both pods (shared-volume
           mutability check).
    """
    access_mode = constants.ACCESS_MODE_RWX
    storage_type = "fs"
    sc_obj, worker_nodes_list = setup
    logger.info("CephFS RWX test")
    logger.info(f"Creating PVC with {access_mode} access mode")
    pvc_obj = pvc_factory(
        interface=interface_type,
        storageclass=sc_obj,
        size=self.pvc_size,
        access_mode=access_mode,
        status=constants.STATUS_BOUND,
    )

    logger.info(f"Creating first pod on node: {worker_nodes_list[0]} "
                f"with pvc {pvc_obj.name}")
    pod_obj1 = pod_factory(
        interface=interface_type,
        pvc=pvc_obj,
        status=constants.STATUS_RUNNING,
        node_name=worker_nodes_list[0],
        pod_dict_path=constants.NGINX_POD_YAML,
    )

    logger.info(f"Creating second pod on node: {worker_nodes_list[1]} "
                f"with pvc {pvc_obj.name}")
    pod_obj2 = pod_factory(
        interface=interface_type,
        pvc=pvc_obj,
        status=constants.STATUS_RUNNING,
        node_name=worker_nodes_list[1],
        pod_dict_path=constants.NGINX_POD_YAML,
    )

    # Confirm the scheduler honored the node pinning — the test is only
    # meaningful if the two pods run on different nodes
    node_pod1 = pod_obj1.get().get("spec").get("nodeName")
    node_pod2 = pod_obj2.get().get("spec").get("nodeName")
    assert node_pod1 != node_pod2, "Both pods are on the same node"

    # Run IO on both the pods; each pod writes a file named after itself
    logger.info(f"Running IO on pod {pod_obj1.name}")
    file_name1 = pod_obj1.name
    logger.info(file_name1)
    pod_obj1.run_io(storage_type=storage_type,
                    size="1G",
                    fio_filename=file_name1)

    logger.info(f"Running IO on pod {pod_obj2.name}")
    file_name2 = pod_obj2.name
    pod_obj2.run_io(storage_type=storage_type,
                    size="1G",
                    fio_filename=file_name2)

    # Check IO and calculate md5sum of files
    pod.get_fio_rw_iops(pod_obj1)
    md5sum_pod1_data = pod.cal_md5sum(pod_obj=pod_obj1,
                                      file_name=file_name1)

    pod.get_fio_rw_iops(pod_obj2)
    md5sum_pod2_data = pod.cal_md5sum(pod_obj=pod_obj2,
                                      file_name=file_name2)

    # Each pod must see the other pod's file with the original checksum
    logger.info("verify data from alternate pods")

    pod.verify_data_integrity(pod_obj=pod_obj2,
                              file_name=file_name1,
                              original_md5sum=md5sum_pod1_data)

    pod.verify_data_integrity(pod_obj=pod_obj1,
                              file_name=file_name2,
                              original_md5sum=md5sum_pod2_data)

    # Verify that data is mutable from any pod
    logger.info("Perform modification of files from alternate pod")
    # Access and rename file written by pod-2 from pod-1
    file_path2 = pod.get_file_path(pod_obj2, file_name2)
    logger.info(file_path2)
    pod_obj1.exec_cmd_on_pod(
        command=f'bash -c "mv {file_path2} {file_path2}-renamed"',
        out_yaml_format=False,
    )

    # Access and rename file written by pod-1 from pod-2
    file_path1 = pod.get_file_path(pod_obj1, file_name1)
    logger.info(file_path1)
    pod_obj2.exec_cmd_on_pod(
        command=f'bash -c "mv {file_path1} {file_path1}-renamed"',
        out_yaml_format=False,
    )

    # Both renamed files must be visible from both pods
    logger.info("Verify presence of renamed files from both pods")
    file_names = [f"{file_path1}-renamed", f"{file_path2}-renamed"]
    for file in file_names:
        assert pod.check_file_existence(pod_obj1,
                                        file), f"File {file} doesn't exist"
        logger.info(f"File {file} exists in {pod_obj1.name} ")
        assert pod.check_file_existence(pod_obj2,
                                        file), f"File {file} doesn't exist"
        logger.info(f"File {file} exists in {pod_obj2.name}")
def test_rwo_dynamic_pvc(self, interface_type, reclaim_policy, setup,
                         pvc_factory, pod_factory):
    """
    RWO Dynamic PVC creation tests with Reclaim policy set to Retain/Delete

    Flow:
        1. Create an RWO PVC, attach it to a pod; a second pod on a
           different node must stay in ContainerCreating with a
           Multi-Attach error (RWO cannot attach to two nodes).
        2. Delete the first pod; the second pod should then reach Running.
        3. Verify the data written by the first pod from the second pod.
    """
    access_mode = constants.ACCESS_MODE_RWO
    # Event message expected on the second pod while the volume is still
    # attached to the first pod's node
    expected_failure_str = "Multi-Attach error for volume"
    storage_type = "fs"
    sc_obj, worker_nodes_list = setup
    logger.info(f"Creating PVC with {access_mode} access mode")
    pvc_obj = pvc_factory(
        interface=interface_type,
        storageclass=sc_obj,
        size=self.pvc_size,
        access_mode=access_mode,
        status=constants.STATUS_BOUND,
    )

    logger.info(f"Creating first pod on node: {worker_nodes_list[0]} "
                f"with pvc {pvc_obj.name}")
    pod_obj1 = pod_factory(
        interface=interface_type,
        pvc=pvc_obj,
        status=constants.STATUS_RUNNING,
        node_name=worker_nodes_list[0],
        pod_dict_path=constants.NGINX_POD_YAML,
    )

    logger.info(f"Creating second pod on node: {worker_nodes_list[1]} "
                f"with pvc {pvc_obj.name}")
    # Second pod is expected to get stuck in ContainerCreating
    pod_obj2 = pod_factory(
        interface=interface_type,
        pvc=pvc_obj,
        status=constants.STATUS_CONTAINER_CREATING,
        node_name=worker_nodes_list[1],
        pod_dict_path=constants.NGINX_POD_YAML,
    )

    # The scenario requires the pods to be scheduled on different nodes
    node_pod1 = pod_obj1.get().get("spec").get("nodeName")
    node_pod2 = pod_obj2.get().get("spec").get("nodeName")
    assert node_pod1 != node_pod2, "Both pods are on the same node"

    logger.info(f"Running IO on first pod {pod_obj1.name}")
    file_name = pod_obj1.name
    pod_obj1.run_io(storage_type=storage_type,
                    size="1G",
                    fio_filename=file_name)
    pod.get_fio_rw_iops(pod_obj1)
    # Checksum recorded for verification after pod failover
    md5sum_pod1_data = pod.cal_md5sum(pod_obj=pod_obj1,
                                      file_name=file_name)

    # Verify that second pod is still in ContainerCreating state and not
    # able to attain Running state due to expected failure
    logger.info(
        f"Verify that second pod {pod_obj2.name} is still in ContainerCreating state"
    )
    helpers.wait_for_resource_state(
        resource=pod_obj2, state=constants.STATUS_CONTAINER_CREATING)
    self.verify_expected_failure_event(ocs_obj=pod_obj2,
                                       failure_str=expected_failure_str)

    logger.info(
        f"Deleting first pod so that second pod can attach PVC {pvc_obj.name}"
    )
    pod_obj1.delete()
    pod_obj1.ocp.wait_for_delete(resource_name=pod_obj1.name)

    # Wait for second pod to be in Running state
    helpers.wait_for_resource_state(resource=pod_obj2,
                                    state=constants.STATUS_RUNNING,
                                    timeout=240)

    logger.info(f"Verify data from second pod {pod_obj2.name}")
    pod.verify_data_integrity(pod_obj=pod_obj2,
                              file_name=file_name,
                              original_md5sum=md5sum_pod1_data)

    pod_obj2.run_io(storage_type=storage_type,
                    size="1G",
                    fio_filename=pod_obj2.name)
    pod.get_fio_rw_iops(pod_obj2)

    # Again verify data integrity — the first pod's file must survive the
    # second pod's own IO
    logger.info(f"Again verify data from second pod {pod_obj2.name}")
    pod.verify_data_integrity(pod_obj=pod_obj2,
                              file_name=file_name,
                              original_md5sum=md5sum_pod1_data)
def test_clone_when_full(self, pvc_clone_factory, pod_factory):
    """
    Create a clone from an existing PVC when the PVC is 100% utilized.
    Verify data integrity.
    Verify utilization alert in cloned PVC.
    Expand cloned PVC and ensure utilization alerts are stopped.

    Args:
        pvc_clone_factory: Fixture to create a clone of a PVC
        pod_factory: Fixture to create a pod and attach it to a PVC
    """
    pvc_size_expanded = 6
    file_name = "fio_full"
    prometheus_api = PrometheusAPI()

    # Run IO to utilize 100% of volume
    log.info("Run IO on all pods to utilise 100% of PVCs")
    for pod_obj in self.pods:
        # Get available free space in M
        df_avail_size = pod_obj.exec_cmd_on_pod(
            command=f"df {pod_obj.get_storage_path()} -B M --output=avail")
        # Get the numeral value of available space. eg: 3070 from '3070M'
        available_size = int(df_avail_size.strip().split()[1][0:-1])
        # Leave 2M headroom so fio can finish writing its file
        pod_obj.run_io(
            "fs",
            size=f"{available_size-2}M",
            runtime=20,
            rate="100M",
            fio_filename=file_name,
            end_fsync=1,
        )
    log.info("Started IO on all pods to utilise 100% of PVCs")

    # Wait for IO to finish
    log.info("Wait for IO to finish on pods")
    for pod_obj in self.pods:
        pod_obj.get_fio_results()
        log.info(f"IO finished on pod {pod_obj.name}")
        # Verify used space on pod is 100%
        used_space = pod.get_used_space_on_mount_point(pod_obj)
        assert used_space == "100%", (
            f"The used space on pod {pod_obj.name} is not 100% "
            f"but {used_space}")
        log.info(f"Verified: Used space on pod {pod_obj.name} is 100%")
        # Calculate md5sum of the file for post-clone verification
        pod_obj.pvc.md5sum = pod.cal_md5sum(pod_obj, file_name)

    log.info("Creating clone of the PVCs")
    cloned_pvcs = [pvc_clone_factory(pvc_obj) for pvc_obj in self.pvcs]
    log.info("Created clone of the PVCs. Cloned PVCs are Bound")

    # Attach the cloned PVCs to pods
    log.info("Attach the cloned PVCs to pods")
    clone_pod_objs = []
    for clone_pvc_obj in cloned_pvcs:
        # Derive the interface from the backing storage class name
        interface = (constants.CEPHFILESYSTEM if
                     (constants.CEPHFS_INTERFACE in clone_pvc_obj.backed_sc)
                     else constants.CEPHBLOCKPOOL)
        clone_pod_obj = pod_factory(interface=interface,
                                    pvc=clone_pvc_obj,
                                    status="")
        log.info(f"Attached the PVC {clone_pvc_obj.name} to pod "
                 f"{clone_pod_obj.name}")
        clone_pod_objs.append(clone_pod_obj)

    # Verify the new pods are running
    log.info("Verify the new pods are running")
    for pod_obj in clone_pod_objs:
        wait_for_resource_state(pod_obj, constants.STATUS_RUNNING)
    log.info("Verified: New pods are running")

    # Verify that the md5sum matches the parent PVC's recorded checksum
    for pod_obj in clone_pod_objs:
        log.info(f"Verifying md5sum of {file_name} "
                 f"on pod {pod_obj.name}")
        pod.verify_data_integrity(pod_obj, file_name,
                                  pod_obj.pvc.parent.md5sum)
        log.info(f"Verified: md5sum of {file_name} on pod {pod_obj.name} "
                 f"matches with the original md5sum")

    # Wait till utilization alerts starts. The sampler polls Prometheus
    # until both alerts fire for every cloned PVC (or times out).
    for response in TimeoutSampler(180, 5, prometheus_api.get, "alerts"):
        alerts = response.json()["data"]["alerts"]
        for pvc_obj in cloned_pvcs:
            alerts_pvc = [
                alert for alert in alerts if alert.get("labels", {}).get(
                    "persistentvolumeclaim") == pvc_obj.name
            ]
            # At least 2 alerts should be present
            if len(alerts_pvc) < 2:
                break

            # Verify 'PersistentVolumeUsageNearFull' alert is firing
            if not getattr(pvc_obj, "near_full_alert", False):
                try:
                    log.info(
                        f"Checking 'PersistentVolumeUsageNearFull' alert "
                        f"for PVC {pvc_obj.name}")
                    near_full_msg = (
                        f"PVC {pvc_obj.name} is nearing full. Data "
                        f"deletion or PVC expansion is required.")
                    check_alert_list(
                        label="PersistentVolumeUsageNearFull",
                        msg=near_full_msg,
                        alerts=alerts_pvc,
                        states=["firing"],
                        severity="warning",
                    )
                    pvc_obj.near_full_alert = True
                except AssertionError:
                    log.info(f"'PersistentVolumeUsageNearFull' alert not "
                             f"started firing for PVC {pvc_obj.name}")

            # Verify 'PersistentVolumeUsageCritical' alert is firing
            if not getattr(pvc_obj, "critical_alert", False):
                try:
                    log.info(
                        f"Checking 'PersistentVolumeUsageCritical' alert "
                        f"for PVC {pvc_obj.name}")
                    critical_msg = (
                        f"PVC {pvc_obj.name} is critically full. Data "
                        f"deletion or PVC expansion is required.")
                    check_alert_list(
                        label="PersistentVolumeUsageCritical",
                        msg=critical_msg,
                        alerts=alerts_pvc,
                        states=["firing"],
                        severity="error",
                    )
                    pvc_obj.critical_alert = True
                except AssertionError:
                    log.info(f"'PersistentVolumeUsageCritical' alert not "
                             f"started firing for PVC {pvc_obj.name}")

        # Collect list of PVCs for which alerts are not firing
        not_near_full_pvc = [
            pvc_ob.name for pvc_ob in cloned_pvcs
            if not getattr(pvc_ob, "near_full_alert", False)
        ]
        not_critical_pvc = [
            pvc_ob.name for pvc_ob in cloned_pvcs
            if not getattr(pvc_ob, "critical_alert", False)
        ]

        if (not not_near_full_pvc) and (not not_critical_pvc):
            log.info("'PersistentVolumeUsageNearFull' and "
                     "'PersistentVolumeUsageCritical' alerts are firing "
                     "for all cloned PVCs.")
            break
    log.info("Verified: Utilization alerts are firing")

    log.info("Expanding cloned PVCs.")
    for pvc_obj in cloned_pvcs:
        log.info(f"Expanding size of PVC {pvc_obj.name} to "
                 f"{pvc_size_expanded}Gi")
        # Expand PVC
        pvc_obj.resize_pvc(pvc_size_expanded, True)

    # Verify utilization alerts are stopped after expansion
    for response in TimeoutSampler(180, 5, prometheus_api.get, "alerts"):
        alerts = response.json()["data"]["alerts"]
        for pvc_obj in cloned_pvcs:
            alerts_pvc = [
                alert for alert in alerts if alert.get("labels", {}).get(
                    "persistentvolumeclaim") == pvc_obj.name
            ]
            # No alerts for this PVC at all — both are cleared
            if not alerts_pvc:
                pvc_obj.near_full_alert = False
                pvc_obj.critical_alert = False
                continue

            # Verify 'PersistentVolumeUsageNearFull' alert stopped firing.
            # check_alert_list raising AssertionError here means the alert
            # is no longer present — which is the desired outcome.
            if getattr(pvc_obj, "near_full_alert"):
                try:
                    # Fixed typo in log message: was 'PrsistentVolumeUsageNearFull'
                    log.info(
                        f"Checking 'PersistentVolumeUsageNearFull' alert "
                        f"is cleared for PVC {pvc_obj.name}")
                    near_full_msg = (
                        f"PVC {pvc_obj.name} is nearing full. Data "
                        f"deletion or PVC expansion is required.")
                    check_alert_list(
                        label="PersistentVolumeUsageNearFull",
                        msg=near_full_msg,
                        alerts=alerts_pvc,
                        states=["firing"],
                        severity="warning",
                    )
                    log.info(
                        f"'PersistentVolumeUsageNearFull' alert is not "
                        f"stopped for PVC {pvc_obj.name}")
                except AssertionError:
                    pvc_obj.near_full_alert = False
                    log.info(
                        f"'PersistentVolumeUsageNearFull' alert stopped "
                        f"firing for PVC {pvc_obj.name}")

            # Verify 'PersistentVolumeUsageCritical' alert stopped firing
            if getattr(pvc_obj, "critical_alert"):
                try:
                    log.info(
                        f"Checking 'PersistentVolumeUsageCritical' alert "
                        f"is cleared for PVC {pvc_obj.name}")
                    critical_msg = (
                        f"PVC {pvc_obj.name} is critically full. Data "
                        f"deletion or PVC expansion is required.")
                    check_alert_list(
                        label="PersistentVolumeUsageCritical",
                        msg=critical_msg,
                        alerts=alerts_pvc,
                        states=["firing"],
                        severity="error",
                    )
                    log.info(
                        f"'PersistentVolumeUsageCritical' alert is not "
                        f"stopped for PVC {pvc_obj.name}")
                except AssertionError:
                    pvc_obj.critical_alert = False
                    log.info(
                        f"'PersistentVolumeUsageCritical' alert stopped "
                        f"firing for PVC {pvc_obj.name}")

        # Collect list of PVCs for which alerts are still firing
        near_full_pvcs = [
            pvc_ob.name for pvc_ob in cloned_pvcs
            if getattr(pvc_ob, "near_full_alert")
        ]
        critical_pvcs = [
            pvc_ob.name for pvc_ob in cloned_pvcs
            if getattr(pvc_ob, "critical_alert")
        ]

        if (not near_full_pvcs) and (not critical_pvcs):
            log.info(
                "'PersistentVolumeUsageNearFull' and "
                "'PersistentVolumeUsageCritical' alerts are cleared for "
                "all cloned PVCs.")
            break

    log.info("Verified: Utilization alerts stopped firing")
def test_pvc_snapshot(self, interface, teardown_factory):
    """
    1. Run I/O on a pod file.
    2. Calculate md5sum of the file.
    3. Take a snapshot of the PVC.
    4. Create a new PVC out of that snapshot.
    5. Attach a new pod to it.
    6. Verify that the file is present on the new pod also.
    7. Verify that the md5sum of the file on the new pod matches
       with the md5sum of the file on the original pod.

    Args:
        interface(str): The type of the interface
            (e.g. CephBlockPool, CephFileSystem)
        pvc_factory: A fixture to create new pvc
        teardown_factory: A fixture to destroy objects
    """
    log.info(f"Running IO on pod {self.pod_obj.name}")
    file_name = self.pod_obj.name
    log.info(f"File created during IO {file_name}")
    self.pod_obj.run_io(storage_type="fs", size="1G",
                        fio_filename=file_name)

    # Wait for fio to finish
    fio_result = self.pod_obj.get_fio_results()
    err_count = fio_result.get("jobs")[0].get("error")
    assert err_count == 0, (f"IO error on pod {self.pod_obj.name}. "
                            f"FIO result: {fio_result}")
    log.info(f"Verified IO on pod {self.pod_obj.name}.")

    # Verify presence of the file
    file_path = pod.get_file_path(self.pod_obj, file_name)
    log.info(f"Actual file path on the pod {file_path}")
    assert pod.check_file_existence(
        self.pod_obj, file_path), f"File {file_name} doesn't exist"
    log.info(f"File {file_name} exists in {self.pod_obj.name}")

    # Calculate md5sum for comparison after restore
    orig_md5_sum = pod.cal_md5sum(self.pod_obj, file_name)
    # Take a snapshot; pick the snapshot YAML matching the interface
    snap_yaml = constants.CSI_RBD_SNAPSHOT_YAML
    if interface == constants.CEPHFILESYSTEM:
        snap_yaml = constants.CSI_CEPHFS_SNAPSHOT_YAML
    snap_name = helpers.create_unique_resource_name("test", "snapshot")
    snap_obj = pvc.create_pvc_snapshot(
        self.pvc_obj.name,
        snap_yaml,
        snap_name,
        self.pvc_obj.namespace,
        helpers.default_volumesnapshotclass(interface).name,
    )
    snap_obj.ocp.wait_for_resource(
        condition="true",
        resource_name=snap_obj.name,
        column=constants.STATUS_READYTOUSE,
        timeout=60,
    )
    teardown_factory(snap_obj)

    # Same Storage class of the original PVC
    sc_name = self.pvc_obj.backed_sc

    # Size should be same as of the original PVC
    pvc_size = str(self.pvc_obj.size) + "Gi"

    # Create pvc out of the snapshot
    # Both, the snapshot and the restore PVC should be in same namespace
    restore_pvc_name = helpers.create_unique_resource_name(
        "test", "restore-pvc")
    restore_pvc_yaml = constants.CSI_RBD_PVC_RESTORE_YAML
    if interface == constants.CEPHFILESYSTEM:
        restore_pvc_yaml = constants.CSI_CEPHFS_PVC_RESTORE_YAML

    restore_pvc_obj = pvc.create_restore_pvc(
        sc_name=sc_name,
        snap_name=snap_obj.name,
        namespace=snap_obj.namespace,
        size=pvc_size,
        pvc_name=restore_pvc_name,
        restore_pvc_yaml=restore_pvc_yaml,
    )
    helpers.wait_for_resource_state(restore_pvc_obj,
                                    constants.STATUS_BOUND)
    restore_pvc_obj.reload()
    teardown_factory(restore_pvc_obj)

    # Create and attach pod to the pvc
    restore_pod_obj = helpers.create_pod(
        interface_type=interface,
        pvc_name=restore_pvc_obj.name,
        namespace=snap_obj.namespace,
        pod_dict_path=constants.NGINX_POD_YAML,
    )

    # Confirm that the pod is running
    helpers.wait_for_resource_state(resource=restore_pod_obj,
                                    state=constants.STATUS_RUNNING)
    restore_pod_obj.reload()
    teardown_factory(restore_pod_obj)

    # Verify that the file is present on the new pod
    log.info(f"Checking the existence of {file_name} "
             f"on restore pod {restore_pod_obj.name}")
    assert pod.check_file_existence(
        restore_pod_obj, file_path), f"File {file_name} doesn't exist"
    log.info(f"File {file_name} exists in {restore_pod_obj.name}")

    # Verify that the md5sum matches
    log.info(f"Verifying that md5sum of {file_name} "
             f"on pod {self.pod_obj.name} matches with md5sum "
             f"of the same file on restore pod {restore_pod_obj.name}")
    assert pod.verify_data_integrity(
        restore_pod_obj, file_name,
        orig_md5_sum), "Data integrity check failed"
    log.info("Data integrity check passed, md5sum are same")

    log.info("Running IO on new pod")
    # Run IO on new pod
    restore_pod_obj.run_io(storage_type="fs", size="1G", runtime=20)

    # Wait for fio to finish
    restore_pod_obj.get_fio_results()
    # Fixed typo in log message: was "IO finished o new pod"
    log.info("IO finished on new pod")
def test_pvc_snapshot_performance(self, teardown_factory, pvc_size):
    """
    1. Run I/O on a pod file.
    2. Calculate md5sum of the file.
    3. Take a snapshot of the PVC and measure the time of creation.
    4. Restore From the snapshot and measure the time
    5. Attach a new pod to it.
    6. Verify that the file is present on the new pod also.
    7. Verify that the md5sum of the file on the new pod matches
       with the md5sum of the file on the original pod.

    This scenario run 3 times and report all results

    Args:
        teardown_factory: A fixture to destroy objects
        pvc_size: the size of the PVC to be tested - parametrize
    """
    # Getting the total Storage capacity
    ceph_cluster = CephCluster()
    ceph_capacity = ceph_cluster.get_ceph_capacity()
    log.info(f"Total capacity size is : {ceph_capacity}")
    log.info(f"PVC Size is : {pvc_size}")
    log.info(f"Needed capacity is {int(int(pvc_size) * 5)}")
    # Skip (by returning) if the cluster cannot hold ~5x the PVC size
    if int(ceph_capacity) < int(pvc_size) * 5:
        log.error(
            f"PVC size is {pvc_size}GiB and it is too large for this system"
            f" which have only {ceph_capacity}GiB")
        return
    # Calculating the file size as 25% of the PVC size
    # in the end the PVC will be 75% full
    filesize = self.pvc_obj.size * 0.25
    # Change the file size to MB and from int to str
    file_size = f"{int(filesize * 1024)}M"

    all_results = []

    for test_num in range(self.tests_numbers):
        test_results = {
            "test_num": test_num + 1,
            "dataset": (test_num + 1) * filesize * 1024,  # size in MiB
            "create": {
                "time": None,
                "speed": None
            },
            "restore": {
                "time": None,
                "speed": None
            },
        }
        log.info(f"Starting test phase number {test_num}")
        # Step 1. Run I/O on a pod file.
        file_name = f"{self.pod_obj.name}-{test_num}"
        log.info(f"Starting IO on the POD {self.pod_obj.name}")
        # Going to run only write IO to fill the PVC for the snapshot
        self.pod_obj.fillup_fs(size=file_size, fio_filename=file_name)

        # Wait for fio to finish
        fio_result = self.pod_obj.get_fio_results()
        err_count = fio_result.get("jobs")[0].get("error")
        assert (
            err_count == 0
        ), f"IO error on pod {self.pod_obj.name}. FIO result: {fio_result}"
        log.info("IO on the PVC Finished")

        # Verify presence of the file
        file_path = pod.get_file_path(self.pod_obj, file_name)
        log.info(f"Actual file path on the pod {file_path}")
        assert pod.check_file_existence(
            self.pod_obj, file_path), f"File {file_name} doesn't exist"
        log.info(f"File {file_name} exists in {self.pod_obj.name}")

        # Step 2. Calculate md5sum of the file.
        orig_md5_sum = pod.cal_md5sum(self.pod_obj, file_name)
        # Step 3. Take a snapshot of the PVC and measure the time of
        # creation.
        snap_name = self.pvc_obj.name.replace(
            "pvc-test", f"snapshot-test{test_num}")
        log.info(f"Taking snapshot of the PVC {snap_name}")
        test_results["create"]["time"] = self.measure_create_snapshot_time(
            pvc_name=self.pvc_obj.name,
            snap_name=snap_name,
            interface=self.interface,
        )
        test_results["create"]["speed"] = int(
            test_results["dataset"] / test_results["create"]["time"])
        log.info(
            f' Test {test_num} dataset is {test_results["dataset"]} MiB')
        log.info(
            f'Snapshot creation time is : {test_results["create"]["time"]} sec.'
        )
        log.info(
            f'Snapshot speed is : {test_results["create"]["speed"]} MB/sec'
        )

        # Step 4. Restore the PVC from the snapshot and measure the time
        # Same Storage class of the original PVC
        sc_name = self.pvc_obj.backed_sc

        # Size should be same as of the original PVC.
        # Renamed from `pvc_size` to avoid shadowing the test parameter,
        # which is still used for logging/capacity math above.
        restore_pvc_size = str(self.pvc_obj.size) + "Gi"

        # Create pvc out of the snapshot
        # Both, the snapshot and the restore PVC should be in same namespace
        log.info("Restoring from the Snapshot")
        restore_pvc_name = self.pvc_obj.name.replace(
            "pvc-test", f"restore-pvc{test_num}")
        restore_pvc_yaml = constants.CSI_RBD_PVC_RESTORE_YAML
        if self.interface == constants.CEPHFILESYSTEM:
            restore_pvc_yaml = constants.CSI_CEPHFS_PVC_RESTORE_YAML

        # Fixed typo in log message: was "Resorting the PVC from Snapshot"
        log.info("Restoring the PVC from Snapshot")
        # NOTE(review): self.snap_obj is presumably set by
        # measure_create_snapshot_time — confirm against the class setup.
        restore_pvc_obj = pvc.create_restore_pvc(
            sc_name=sc_name,
            snap_name=self.snap_obj.name,
            namespace=self.snap_obj.namespace,
            size=restore_pvc_size,
            pvc_name=restore_pvc_name,
            restore_pvc_yaml=restore_pvc_yaml,
        )
        helpers.wait_for_resource_state(
            restore_pvc_obj,
            constants.STATUS_BOUND,
            timeout=3600  # setting this to 60 Min.
            # since it can be take long time to restore, and we want it to finished.
        )
        teardown_factory(restore_pvc_obj)
        restore_pvc_obj.reload()
        log.info("PVC was restored from the snapshot")
        test_results["restore"][
            "time"] = helpers.measure_pvc_creation_time(
                self.interface, restore_pvc_obj.name)

        test_results["restore"]["speed"] = int(
            test_results["dataset"] / test_results["restore"]["time"])
        log.info(
            f'Snapshot restore time is : {test_results["restore"]["time"]}'
        )
        # Fixed typo in log message: was "restore sped is"
        log.info(
            f'restore speed is : {test_results["restore"]["speed"]} MB/sec')

        # Step 5. Attach a new pod to the restored PVC
        restore_pod_obj = helpers.create_pod(
            interface_type=self.interface,
            pvc_name=restore_pvc_obj.name,
            namespace=self.snap_obj.namespace,
            pod_dict_path=constants.NGINX_POD_YAML,
        )

        # Confirm that the pod is running
        helpers.wait_for_resource_state(resource=restore_pod_obj,
                                        state=constants.STATUS_RUNNING)
        teardown_factory(restore_pod_obj)
        restore_pod_obj.reload()

        # Step 6. Verify that the file is present on the new pod also.
        log.info(f"Checking the existence of {file_name} "
                 f"on restore pod {restore_pod_obj.name}")
        assert pod.check_file_existence(
            restore_pod_obj, file_path), f"File {file_name} doesn't exist"
        log.info(f"File {file_name} exists in {restore_pod_obj.name}")

        # Step 7. Verify that the md5sum matches
        log.info(f"Verifying that md5sum of {file_name} "
                 f"on pod {self.pod_obj.name} matches with md5sum "
                 f"of the same file on restore pod {restore_pod_obj.name}")
        assert pod.verify_data_integrity(
            restore_pod_obj, file_name,
            orig_md5_sum), "Data integrity check failed"
        log.info("Data integrity check passed, md5sum are same")

        all_results.append(test_results)

    # logging the test summary, all info in one place for easy log reading
    c_speed, c_runtime, r_speed, r_runtime = (0 for i in range(4))
    # Fixed typo in log message: was "Test summery :"
    log.info("Test summary :")
    for tst in all_results:
        c_speed += tst["create"]["speed"]
        c_runtime += tst["create"]["time"]
        r_speed += tst["restore"]["speed"]
        r_runtime += tst["restore"]["time"]
        log.info(
            f"Test {tst['test_num']} results : dataset is {tst['dataset']} MiB. "
            f"Take snapshot time is {tst['create']['time']} "
            f"at {tst['create']['speed']} MiB/Sec "
            f"Restore from snapshot time is {tst['restore']['time']} "
            f"at {tst['restore']['speed']} MiB/Sec ")
    log.info(
        f" Average snapshot creation time is {c_runtime / self.tests_numbers} sec."
    )
    log.info(
        f" Average snapshot creation speed is {c_speed / self.tests_numbers} MiB/sec"
    )
    log.info(
        f" Average snapshot restore time is {r_runtime / self.tests_numbers} sec."
    )
    log.info(
        f" Average snapshot restore speed is {r_speed / self.tests_numbers} MiB/sec"
    )
def test_rwo_dynamic_pvc(self, setup_base):
    """
    RWO Dynamic PVC creation test.

    Attach the same RWO PVC to two pods on different nodes: the second
    pod must stay Pending with the expected failure event. After the
    first pod is deleted, the second pod should reach Running and see
    the data written by the first pod.

    Args:
        setup_base: Fixture providing interface type, PVC, namespace
            and worker node list (consumed via self attributes)
    """
    logger.info(f"Creating two pods using same PVC {self.pvc_obj.name}")
    logger.info(f"Creating first pod on node: {self.worker_nodes_list[0]}")
    pod_obj1 = helpers.create_pod(interface_type=self.interface_type,
                                  pvc_name=self.pvc_obj.name,
                                  desired_status=constants.STATUS_RUNNING,
                                  wait=True,
                                  namespace=self.namespace,
                                  node_name=self.worker_nodes_list[0],
                                  pod_dict_path=constants.NGINX_POD_YAML)
    node_pod1 = pod_obj1.get().get('spec').get('nodeName')

    logger.info(
        f"Creating second pod on node: {self.worker_nodes_list[1]}")
    # Second pod is not waited on — it is expected to stay Pending
    pod_obj2 = helpers.create_pod(interface_type=self.interface_type,
                                  pvc_name=self.pvc_obj.name,
                                  wait=False,
                                  namespace=self.namespace,
                                  node_name=self.worker_nodes_list[1],
                                  pod_dict_path=constants.NGINX_POD_YAML)
    node_pod2 = pod_obj2.get().get('spec').get('nodeName')

    # The scenario requires the pods to land on different nodes
    assert node_pod1 != node_pod2, 'Both pods are on the same node'

    logger.info(f"Running IO on pod {pod_obj1.name}")
    file_name = pod_obj1.name
    pod_obj1.run_io(storage_type=self.storage_type,
                    size=self.io_size,
                    runtime=30,
                    fio_filename=file_name)
    pod.get_fio_rw_iops(pod_obj1)
    md5sum_pod1_data = pod.cal_md5sum(pod_obj=pod_obj1,
                                      file_name=file_name)

    # Verify that second pod is still in Pending state and not able to
    # attain Running state due to expected failure.
    # Bug fix: the original asserted the return value of
    # wait_for_resource_state, which signals failure by raising (as the
    # sibling tests in this file use it), so the assert was bogus.
    helpers.wait_for_resource_state(resource=pod_obj2,
                                    state=constants.STATUS_PENDING)
    self.verify_expected_failure_event(
        ocs_obj=pod_obj2, failure_str=self.expected_pod_failure)

    # Delete the first pod so the second one can attach the RWO volume
    pod_obj1.delete()
    pod_obj1.ocp.wait_for_delete(resource_name=pod_obj1.name)

    # Wait for second pod to be in Running state
    helpers.wait_for_resource_state(resource=pod_obj2,
                                    state=constants.STATUS_RUNNING,
                                    timeout=240)

    assert pod.verify_data_integrity(pod_obj=pod_obj2,
                                     file_name=file_name,
                                     original_md5sum=md5sum_pod1_data)

    pod_obj2.run_io(storage_type=self.storage_type,
                    size=self.io_size,
                    runtime=30,
                    fio_filename=pod_obj2.name)
    pod.get_fio_rw_iops(pod_obj2)

    # Again verify data integrity
    assert pod.verify_data_integrity(pod_obj=pod_obj2,
                                     file_name=file_name,
                                     original_md5sum=md5sum_pod1_data)

    pod_obj2.delete()
    # Bug fix: wait for deletion via pod_obj2's own ocp handle
    # (was pod_obj1.ocp — copy-paste slip)
    pod_obj2.ocp.wait_for_delete(resource_name=pod_obj2.name)
def test_snapshot_restore_using_different_sc(
    self,
    storageclass_factory,
    snapshot_factory,
    snapshot_restore_factory,
    pod_factory,
):
    """
    Test to verify snapshot restore using an SC different than that of parent

    Flow:
        1. Run fio on all pods and record md5sum per PVC.
        2. Snapshot all PVCs and wait for ReadyToUse.
        3. Create new storage classes (one per interface; an extra RBD
           class on a new pool for ODF >= 4.9 — bug 1901954).
        4. Restore each snapshot once per new storage class, attach
           pods, verify md5sum, then run fresh IO on the new pods.
    """
    snap_objs = []
    file_name = "file_snapshot"

    # Run IO
    log.info("Start IO on all pods")
    for pod_obj in self.pods:
        pod_obj.run_io(
            storage_type="fs",
            size=f"{self.pvc_size - 1}G",
            runtime=30,
            fio_filename=file_name,
        )
    log.info("IO started on all pods")

    # Wait for IO completion
    for pod_obj in self.pods:
        pod_obj.get_fio_results()
        # Get md5sum of the file for post-restore verification
        pod_obj.pvc.md5sum = cal_md5sum(pod_obj=pod_obj,
                                        file_name=file_name)
    log.info("IO completed on all pods")

    # Create snapshots
    log.info("Create snapshots of all PVCs")
    for pvc_obj in self.pvcs:
        log.info(f"Creating snapshot of PVC {pvc_obj.name}")
        snap_obj = snapshot_factory(pvc_obj, wait=False)
        # Carry checksum and interface on the snapshot for the restore
        snap_obj.md5sum = pvc_obj.md5sum
        snap_obj.interface = pvc_obj.interface
        snap_objs.append(snap_obj)
    log.info("Snapshots created")

    # Verify snapshots are Ready
    log.info("Verify snapshots are ready")
    for snap_obj in snap_objs:
        snap_obj.ocp.wait_for_resource(
            condition="true",
            resource_name=snap_obj.name,
            column=constants.STATUS_READYTOUSE,
            timeout=180,
        )

    # Create storage classes. One restore target per interface.
    sc_objs = {
        constants.CEPHBLOCKPOOL: [
            storageclass_factory(
                interface=constants.CEPHBLOCKPOOL,
            ).name
        ],
        constants.CEPHFILESYSTEM: [
            storageclass_factory(interface=constants.CEPHFILESYSTEM).name
        ],
    }

    # If ODF >=4.9 create one more storage class that will use new pool
    # to verify the bug 1901954
    if version.get_semantic_ocs_version_from_config() >= version.VERSION_4_9:
        sc_objs[constants.CEPHBLOCKPOOL].append(
            storageclass_factory(
                interface=constants.CEPHBLOCKPOOL, new_rbd_pool=True
            ).name
        )

    # Create PVCs out of the snapshots — one restore per storage class
    # applicable to the snapshot's interface
    restore_pvc_objs = []
    log.info("Creating new PVCs from snapshots")
    for snap_obj in snap_objs:
        for storageclass in sc_objs[snap_obj.interface]:
            log.info(f"Creating a PVC from snapshot {snap_obj.name}")
            restore_pvc_obj = snapshot_restore_factory(
                snapshot_obj=snap_obj,
                storageclass=storageclass,
                size=f"{self.pvc_size}Gi",
                volume_mode=snap_obj.parent_volume_mode,
                access_mode=snap_obj.parent_access_mode,
                status="",
            )

            log.info(
                f"Created PVC {restore_pvc_obj.name} from snapshot {snap_obj.name}."
                f"Used the storage class {storageclass}"
            )
            restore_pvc_obj.md5sum = snap_obj.md5sum
            restore_pvc_objs.append(restore_pvc_obj)
    log.info("Created new PVCs from all the snapshots")

    # Confirm that the restored PVCs are Bound
    log.info("Verify the restored PVCs are Bound")
    for pvc_obj in restore_pvc_objs:
        wait_for_resource_state(
            resource=pvc_obj, state=constants.STATUS_BOUND, timeout=180
        )
        pvc_obj.reload()
    log.info("Verified: Restored PVCs are Bound.")

    # Attach the restored PVCs to pods
    log.info("Attach the restored PVCs to pods")
    restore_pod_objs = []
    for restore_pvc_obj in restore_pvc_objs:
        restore_pod_obj = pod_factory(
            interface=restore_pvc_obj.snapshot.interface,
            pvc=restore_pvc_obj,
            status="",
        )
        log.info(
            f"Attached the PVC {restore_pvc_obj.name} to pod {restore_pod_obj.name}"
        )
        restore_pod_objs.append(restore_pod_obj)

    # Verify the new pods are running
    log.info("Verify the new pods are running")
    for pod_obj in restore_pod_objs:
        wait_for_resource_state(pod_obj, constants.STATUS_RUNNING)
    log.info("Verified: New pods are running")

    # Verify md5sum against the checksum recorded before the snapshot
    log.info("Verifying md5sum on new pods")
    for pod_obj in restore_pod_objs:
        log.info(f"Verifying md5sum on pod {pod_obj.name}")
        verify_data_integrity(
            pod_obj=pod_obj,
            file_name=file_name,
            original_md5sum=pod_obj.pvc.snapshot.md5sum,
        )
        log.info(f"Verified md5sum on pod {pod_obj.name}")
    log.info("Verified md5sum on all pods")

    # Run IO on new pods to confirm the restored volumes are writable
    log.info("Starting IO on new pods")
    for pod_obj in restore_pod_objs:
        pod_obj.run_io(storage_type="fs", size="500M", runtime=15)

    # Wait for IO completion on new pods
    log.info("Waiting for IO completion on new pods")
    for pod_obj in restore_pod_objs:
        pod_obj.get_fio_results()
    log.info("IO completed on new pods.")
def test_worker_node_restart_during_pvc_clone(
    self, nodes, pvc_clone_factory, pod_factory
):
    """
    Verify PVC cloning will succeed if a worker node is restarted
    while cloning is in progress

    Flow:
        1. Run fio on all pods attached to the parent PVCs and record
           md5sums of the written data.
        2. Restart one worker node asynchronously while clones of all
           PVCs are being created (also asynchronously).
        3. Verify the node recovers, the clones reach Bound, and pods
           attached to the clones see the original data (md5sum match).
        4. Run fio on the new pods as a final sanity check.

    Args:
        nodes: platform nodes fixture used to restart the worker node
        pvc_clone_factory: fixture that creates a clone of a given PVC
        pod_factory: fixture that creates a pod attached to a given PVC
    """
    file_name = "fio_test"
    # One worker per PVC clone plus one for the node restart
    executor = ThreadPoolExecutor(max_workers=len(self.pvcs) + 1)
    selected_node = node.get_nodes(
        node_type=constants.WORKER_MACHINE, num_of_nodes=1
    )

    # Run IO
    log.info("Starting IO on all pods")
    for pod_obj in self.pods:
        # Raw block PVCs need fio's block mode; filesystem PVCs use fs mode
        storage_type = (
            "block"
            if pod_obj.pvc.volume_mode == constants.VOLUME_MODE_BLOCK
            else "fs"
        )
        pod_obj.run_io(
            storage_type=storage_type,
            size="1G",
            runtime=20,
            fio_filename=file_name,
            end_fsync=1,  # fsync at end so data is on disk before cloning
        )
        log.info(f"IO started on pod {pod_obj.name}")
    log.info("Started IO on all pods")

    # Wait for IO to finish
    log.info("Wait for IO to finish on pods")
    for pod_obj in self.pods:
        pod_obj.get_fio_results()
        log.info(f"IO finished on pod {pod_obj.name}")
        # Calculate md5sum
        # For block PVCs the whole device path is checksummed, not a file
        file_name_pod = (
            file_name
            if (pod_obj.pvc.volume_mode == constants.VOLUME_MODE_FILESYSTEM)
            else pod_obj.get_storage_path(storage_type="block")
        )
        # Stash the checksum on the PVC so it survives pod deletion
        pod_obj.pvc.md5sum = pod.cal_md5sum(
            pod_obj,
            file_name_pod,
            pod_obj.pvc.volume_mode == constants.VOLUME_MODE_BLOCK,
        )

    # Restart node
    # Submitted async so cloning below overlaps with the node restart
    log.info(f"Restart node {selected_node[0].name}")
    restart_thread = executor.submit(nodes.restart_nodes, nodes=selected_node)

    log.info("Creating clone of all PVCs.")
    for pvc_obj in self.pvcs:
        log.info(f"Creating clone of {pvc_obj.name}")
        # status="" skips the Bound wait inside the factory; we wait later
        pvc_obj.clone_proc = executor.submit(
            pvc_clone_factory, pvc_obj=pvc_obj, status=""
        )

    # Check result of 'restart_nodes'
    # .result() re-raises any exception from the restart thread
    restart_thread.result()

    log.info("Verify status of node.")
    node.wait_for_nodes_status(
        node_names=[node.get_node_name(selected_node[0])],
        status=constants.NODE_READY,
        timeout=300,
    )

    # Get cloned PVCs
    cloned_pvcs = [pvc_obj.clone_proc.result() for pvc_obj in self.pvcs]

    log.info("Verifying cloned PVCs are Bound")
    for pvc_obj in cloned_pvcs:
        wait_for_resource_state(
            resource=pvc_obj, state=constants.STATUS_BOUND, timeout=540
        )
        pvc_obj.reload()
    log.info("Verified: Cloned PVCs are Bound")

    # Attach the cloned PVCs to pods
    log.info("Attach the cloned PVCs to pods")
    clone_pod_objs = []
    for pvc_obj in cloned_pvcs:
        # Raw block PVCs need the raw-block pod template
        if pvc_obj.volume_mode == "Block":
            pod_dict_path = constants.CSI_RBD_RAW_BLOCK_POD_YAML
        else:
            pod_dict_path = ""
        clone_pod_obj = pod_factory(
            interface=pvc_obj.parent.interface,
            pvc=pvc_obj,
            status="",
            pod_dict_path=pod_dict_path,
            raw_block_pv=pvc_obj.volume_mode == "Block",
        )
        log.info(f"Attaching the PVC {pvc_obj.name} to pod {clone_pod_obj.name}")
        clone_pod_objs.append(clone_pod_obj)

    # Verify the new pods are running
    log.info("Verify the new pods are running")
    for pod_obj in clone_pod_objs:
        wait_for_resource_state(pod_obj, constants.STATUS_RUNNING)
    log.info("Verified: New pods are running")

    # Verify md5sum
    # Compare against the checksum recorded on the parent PVC before cloning
    for pod_obj in clone_pod_objs:
        file_name_pod = (
            file_name
            if (pod_obj.pvc.volume_mode == constants.VOLUME_MODE_FILESYSTEM)
            else pod_obj.get_storage_path(storage_type="block")
        )
        pod.verify_data_integrity(
            pod_obj,
            file_name_pod,
            pod_obj.pvc.parent.md5sum,
            pod_obj.pvc.volume_mode == constants.VOLUME_MODE_BLOCK,
        )
        log.info(
            f"Verified: md5sum of {file_name_pod} on pod {pod_obj.name} "
            f"matches with the original md5sum"
        )
    log.info("Data integrity check passed on all pods")

    # Run IO
    log.info("Starting IO on the new pods")
    for pod_obj in clone_pod_objs:
        storage_type = (
            "block"
            if pod_obj.pvc.volume_mode == constants.VOLUME_MODE_BLOCK
            else "fs"
        )
        pod_obj.run_io(
            storage_type=storage_type,
            size="1G",
            runtime=20,
            fio_filename=f"{file_name}_1",  # new filename, keep original data intact
            end_fsync=1,
        )
        log.info(f"IO started on pod {pod_obj.name}")
    log.info("Started IO on the new pods")

    # Wait for IO to finish
    log.info("Wait for IO to finish on the new pods")
    for pod_obj in clone_pod_objs:
        pod_obj.get_fio_results()
        log.info(f"IO finished on pod {pod_obj.name}")
    log.info("IO finished on the new pods")
def test_resource_deletion_during_snapshot_restore(
        self, snapshot_factory, snapshot_restore_factory, pod_factory):
    """
    Verify PVC snapshot and restore will succeeded if rook-ceph,
    csi pods are re-spun while creating snapshot and while creating
    restore PVC

    Flow:
        1. Run fio on all pods and record md5sums.
        2. Start snapshot creation of all PVCs asynchronously, then
           delete selected rook-ceph/CSI pods while it is in progress.
        3. Wait for snapshots to be Ready.
        4. Start restoring PVCs from the snapshots asynchronously, then
           delete the same pod types again while restore is in progress.
        5. Attach the restored PVCs to pods, verify data integrity and
           run IO on the new pods.

    Args:
        snapshot_factory: fixture creating a VolumeSnapshot of a PVC
        snapshot_restore_factory: fixture creating a PVC from a snapshot
        pod_factory: fixture creating a pod attached to a given PVC
    """
    # Pod types (by disruption-helper resource name) to re-spin mid-operation
    pods_to_delete = [
        "rbdplugin_provisioner",
        "cephfsplugin_provisioner",
        "cephfsplugin",
        "rbdplugin",
        "osd",
        "mgr",
    ]
    # Enough workers to run all snapshot/restore jobs and pod deletions at once
    executor = ThreadPoolExecutor(max_workers=len(self.pvcs) + len(pods_to_delete))
    disruption_ops = [disruption_helpers.Disruptions() for _ in pods_to_delete]
    file_name = "file_snap"

    # Run IO
    log.info("Running fio on all pods to create a file")
    for pod_obj in self.pods:
        # Raw block PVCs need fio's block mode; filesystem PVCs use fs mode
        storage_type = ("block" if (pod_obj.pvc.volume_mode == constants.VOLUME_MODE_BLOCK) else "fs")
        pod_obj.run_io(
            storage_type=storage_type,
            size="1G",
            runtime=30,
            fio_filename=file_name,
            end_fsync=1,  # flush to disk before the snapshot is taken
        )

    log.info("Wait for IO to complete on pods")
    for pod_obj in self.pods:
        pod_obj.get_fio_results()
        log.info(f"Verified IO on pod {pod_obj.name}")
        # Calculate md5sum
        # Block PVCs are checksummed via the device path, not a file name
        file_name_pod = (file_name if (pod_obj.pvc.volume_mode == constants.VOLUME_MODE_FILESYSTEM) else pod_obj.get_storage_path(storage_type="block"))
        # Stored on the PVC so it can be copied onto the snapshot later
        pod_obj.pvc.md5sum = cal_md5sum(
            pod_obj,
            file_name_pod,
            pod_obj.pvc.volume_mode == constants.VOLUME_MODE_BLOCK,
        )
        log.info(f"md5sum obtained from pod {pod_obj.name}")
    log.info("IO is successful on all pods")

    # Select the pods to be deleted
    for disruption, pod_type in zip(disruption_ops, pods_to_delete):
        # Select snapshotter leader if the pod is provisioner pod
        disruption.set_resource(
            resource=pod_type,
            leader_type="snapshotter" if "provisioner" in pod_type else "",
        )

    log.info("Start taking snapshot of all PVCs.")
    for pvc_obj in self.pvcs:
        log.info(f"Taking snapshot of PVC {pvc_obj.name}")
        # wait=False: snapshots are created async; readiness is checked below
        pvc_obj.snap_proc = executor.submit(snapshot_factory, pvc_obj, wait=False)
    log.info("Started taking snapshot of all PVCs.")

    # Delete the pods 'pods_to_delete'
    # Deliberately overlaps with the in-flight snapshot creation above
    log.info(f"Deleting pods {pods_to_delete}")
    for disruption in disruption_ops:
        disruption.delete_proc = executor.submit(
            disruption.delete_resource)

    # Wait for delete and recovery
    [disruption.delete_proc.result() for disruption in disruption_ops]

    # Get snapshots
    snap_objs = []
    for pvc_obj in self.pvcs:
        snap_obj = pvc_obj.snap_proc.result()
        # Carry the parent PVC's checksum for post-restore verification
        snap_obj.md5sum = pvc_obj.md5sum
        snap_objs.append(snap_obj)

    # Wait for snapshots to be Ready
    log.info("Waiting for all snapshots to be Ready")
    for snap_obj in snap_objs:
        snap_obj.ocp.wait_for_resource(
            condition="true",
            resource_name=snap_obj.name,
            column=constants.STATUS_READYTOUSE,
            timeout=300,
        )
        log.info(f"Snapshot {snap_obj.name} is Ready")
        snap_obj.reload()
    log.info("All snapshots are Ready")

    # Select the pods to be deleted
    # Re-arm the disruption helpers for the restore phase
    for disruption, pod_type in zip(disruption_ops, pods_to_delete):
        disruption.set_resource(resource=pod_type)

    restore_pvc_objs = []

    # Create PVCs out of the snapshots
    log.info("Start creating new PVCs from snapshots")
    for snap_obj in snap_objs:
        log.info(f"Creating a PVC from snapshot {snap_obj.name}")
        # status="" skips the Bound wait inside the factory; we wait below
        snap_obj.restore_proc = executor.submit(
            snapshot_restore_factory,
            snapshot_obj=snap_obj,
            size=f"{self.pvc_size}Gi",
            volume_mode=snap_obj.parent_volume_mode,
            access_mode=snap_obj.parent_access_mode,
            status="",
        )
    log.info("Started creating new PVCs from snapshots")

    # Delete the pods 'pods_to_delete'
    # Deliberately overlaps with the in-flight restore above
    log.info(f"Deleting pods {pods_to_delete}")
    for disruption in disruption_ops:
        disruption.delete_proc = executor.submit(
            disruption.delete_resource)

    # Wait for delete and recovery
    [disruption.delete_proc.result() for disruption in disruption_ops]

    # Get restored PVCs
    for snap_obj in snap_objs:
        restore_pvc_obj = snap_obj.restore_proc.result()
        restore_pvc_objs.append(restore_pvc_obj)
        log.info(f"Created PVC {restore_pvc_obj.name} from snapshot "
                 f"{snap_obj.name}")
    log.info("Created new PVCs from all the snapshots")

    # Confirm that the restored PVCs are Bound
    log.info("Verifying the restored PVCs are Bound")
    for pvc_obj in restore_pvc_objs:
        wait_for_resource_state(resource=pvc_obj,
                                state=constants.STATUS_BOUND,
                                timeout=300)
        pvc_obj.reload()
        # Cache the volume mode from the spec for the pod-creation branch below
        pvc_obj.volume_mode = pvc_obj.data["spec"]["volumeMode"]
    log.info("Verified: Restored PVCs are Bound.")

    restore_pod_objs = []

    # Attach the restored PVCs to pods
    log.info("Attach the restored PVCs to pods")
    for pvc_obj in restore_pvc_objs:
        # Raw block PVCs need the raw-block pod template
        if pvc_obj.volume_mode == constants.VOLUME_MODE_BLOCK:
            pod_dict_path = constants.CSI_RBD_RAW_BLOCK_POD_YAML
        else:
            pod_dict_path = ""
        restore_pod_obj = pod_factory(
            interface=pvc_obj.interface,
            pvc=pvc_obj,
            status="",
            pod_dict_path=pod_dict_path,
            raw_block_pv=pvc_obj.volume_mode == constants.VOLUME_MODE_BLOCK,
        )
        restore_pod_objs.append(restore_pod_obj)
    log.info("Attach the restored PVCs to pods")

    # Verify the new pods are running
    log.info("Verify the new pods are running")
    for pod_obj in restore_pod_objs:
        wait_for_resource_state(pod_obj, constants.STATUS_RUNNING)
    log.info("Verified: New pods are running")

    # Verify md5sum
    # Compare against the checksum copied onto the snapshot earlier
    log.info("Verify md5sum")
    for pod_obj in restore_pod_objs:
        file_name_pod = (file_name if (pod_obj.pvc.volume_mode == constants.VOLUME_MODE_FILESYSTEM) else pod_obj.get_storage_path(storage_type="block"))
        verify_data_integrity(
            pod_obj,
            file_name_pod,
            pod_obj.pvc.snapshot.md5sum,
            pod_obj.pvc.volume_mode == constants.VOLUME_MODE_BLOCK,
        )
        log.info(
            f"Verified: md5sum of {file_name_pod} on pod {pod_obj.name} "
            f"matches with the original md5sum")
    log.info("Data integrity check passed on all pods")

    # Run IO
    log.info("Running IO on new pods")
    for pod_obj in restore_pod_objs:
        storage_type = ("block" if (pod_obj.pvc.volume_mode == constants.VOLUME_MODE_BLOCK) else "fs")
        pod_obj.run_io(
            storage_type=storage_type,
            size="1G",
            runtime=20,
            fio_filename=file_name,
            end_fsync=1,
        )

    log.info("Wait for IO to complete on new pods")
    for pod_obj in restore_pod_objs:
        pod_obj.get_fio_results()
        log.info(f"Verified IO on new pod {pod_obj.name}")
    log.info("IO to completed on new pods")
def test_clone_with_different_access_mode(self, pvc_clone_factory, pod_factory):
    """
    Create clone of a PVC with an access mode different than parent PVC

    Flow:
        1. Run fio on all pods and record md5sums on the parent PVCs.
        2. For each parent PVC, create one clone per access mode that
           is valid for its interface/volume-mode combination.
        3. Attach pods to the clones (two pods for RWX clones, one
           otherwise) and verify the parent's data is intact.

    Args:
        pvc_clone_factory: fixture creating a clone of a given PVC
        pod_factory: fixture creating a pod attached to a given PVC
    """
    file_name = "fio_test"
    # Valid clone access modes per interface and volume mode
    access_modes_dict = {
        constants.CEPHBLOCKPOOL: {
            constants.VOLUME_MODE_FILESYSTEM: [constants.ACCESS_MODE_RWO],
            constants.VOLUME_MODE_BLOCK: [
                constants.ACCESS_MODE_RWX,
                constants.ACCESS_MODE_RWO,
            ],
        },
        constants.CEPHFILESYSTEM: {
            constants.VOLUME_MODE_FILESYSTEM: [
                constants.ACCESS_MODE_RWX,
                constants.ACCESS_MODE_RWO,
            ]
        },
    }

    # Run IO
    log.info("Starting IO on all pods")
    for pod_obj in self.pods:
        # Raw block PVCs need fio's block mode; filesystem PVCs use fs mode
        storage_type = (
            "block"
            if pod_obj.pvc.volume_mode == constants.VOLUME_MODE_BLOCK
            else "fs"
        )
        pod_obj.run_io(
            storage_type=storage_type,
            size="1G",
            runtime=20,
            fio_filename=file_name,
            end_fsync=1,  # flush to disk so the clone captures the data
        )
        log.info(f"IO started on pod {pod_obj.name}")
    log.info("Started IO on all pods")

    # Wait for IO to finish
    log.info("Wait for IO to finish on pods")
    for pod_obj in self.pods:
        pod_obj.get_fio_results()
        log.info(f"IO finished on pod {pod_obj.name}")
        # Calculate md5sum
        # Block PVCs are checksummed via the device path, not a file name
        file_name_pod = (
            file_name
            if (pod_obj.pvc.volume_mode == constants.VOLUME_MODE_FILESYSTEM)
            else pod_obj.get_storage_path(storage_type="block")
        )
        # Stored on the PVC; clones compare against it via pvc.parent
        pod_obj.pvc.md5sum = pod.cal_md5sum(
            pod_obj,
            file_name_pod,
            pod_obj.pvc.volume_mode == constants.VOLUME_MODE_BLOCK,
        )

    log.info("Creating clone of the PVCs with different access modes")
    cloned_pvcs = []
    for pvc_obj in self.pvcs:
        access_modes = access_modes_dict[pvc_obj.interface][pvc_obj.volume_mode]
        for access_mode in access_modes:
            # status="" skips the Bound wait inside the factory; we wait below
            clone_obj = pvc_clone_factory(
                pvc_obj=pvc_obj, status="", access_mode=access_mode
            )
            clone_obj.interface = pvc_obj.interface
            log.info(
                f"Clone {clone_obj.name} created. "
                f"Parent PVC: {pvc_obj.name}. "
                f"Parent accessMode: {pvc_obj.get_pvc_access_mode}. "
                f"Cloned PVC accessMode: {access_mode}"
            )
            cloned_pvcs.append(clone_obj)
    log.info("Created clone of the PVCs with different access modes")

    log.info("Verifying cloned PVCs are Bound")
    for pvc_obj in cloned_pvcs:
        helpers.wait_for_resource_state(
            resource=pvc_obj, state=constants.STATUS_BOUND, timeout=200
        )
        pvc_obj.reload()
    log.info("Verified: Cloned PVCs are Bound")

    # Get worker node names and create an iterator
    # cycle() spreads the pods round-robin over the workers so RWX pairs
    # land on different nodes
    nodes_iter = cycle(node.get_worker_nodes())

    # Attach the cloned PVCs to pods
    log.info("Attach the cloned PVCs to pods")
    clone_pod_objs = []
    for pvc_obj in cloned_pvcs:
        # Raw block PVCs need the raw-block pod template
        if pvc_obj.volume_mode == "Block":
            pod_dict_path = constants.CSI_RBD_RAW_BLOCK_POD_YAML
        else:
            pod_dict_path = ""
        # Create 2 pods if access mode is RWX, else 1
        # range(start, 2): start is 0 for RWX (2 iterations) or 1 otherwise
        for _ in range(
            int(pvc_obj.get_pvc_access_mode != constants.ACCESS_MODE_RWX), 2
        ):
            clone_pod_obj = pod_factory(
                interface=pvc_obj.interface,
                pvc=pvc_obj,
                status="",
                node_name=next(nodes_iter),
                pod_dict_path=pod_dict_path,
                raw_block_pv=pvc_obj.volume_mode == "Block",
            )
            log.info(
                f"Attaching the PVC {pvc_obj.name} to pod "
                f"{clone_pod_obj.name}"
            )
            clone_pod_objs.append(clone_pod_obj)

    # Verify the new pods are running
    log.info("Verify the new pods are running")
    for pod_obj in clone_pod_objs:
        helpers.wait_for_resource_state(pod_obj, constants.STATUS_RUNNING)
    log.info("Verified: New pods are running")

    # Verify md5sum
    # Compare against the checksum recorded on the parent PVC before cloning
    for pod_obj in clone_pod_objs:
        file_name_pod = (
            file_name
            if (pod_obj.pvc.volume_mode == constants.VOLUME_MODE_FILESYSTEM)
            else pod_obj.get_storage_path(storage_type="block")
        )
        pod.verify_data_integrity(
            pod_obj,
            file_name_pod,
            pod_obj.pvc.parent.md5sum,
            pod_obj.pvc.volume_mode == constants.VOLUME_MODE_BLOCK,
        )
        log.info(
            f"Verified: md5sum of {file_name_pod} on pod {pod_obj.name} "
            f"matches with the original md5sum"
        )
    log.info("Data integrity check passed on all pods")
def test_rwo_pvc_fencing_node_prolonged_network_failure(
        self, nodes, setup, node_restart_teardown):
    """
    OCS-1427/OCS-1429:
    - Start DeploymentConfig based app pods on 1 OCS/Non-OCS node
    - Make the node (where app pods are running) unresponsive
        by bringing its main network interface down
    - Check new app pods and/or mon, osd pods scheduled on another node
        are stuck due to Multi-Attach error.
    - Power off the unresponsive node
    - Force delete the app pods and/or mon,osd pods on the unresponsive node
    - Check new app pods and/or mon, osd pods scheduled on another node comes
        into Running state
    - Run IOs on new app pods

    OCS-1430/OCS-1435:
    - Start DeploymentConfig based app pods on multiple node
        Colocated scenario: Select 1 node where osd and/or mon is running,
            select other 2 nodes where mon/osd are not running
        Dedicated scenario: 3 Non-OCS nodes
    - Disrupt the leader provisioner pods if not running on above selected
        nodes
    - Make the nodes (where app pods are running) unresponsive
        by bringing their main network interface down
    - Check new app pods and/or mon, osd pods scheduled on another node
        are stuck due to Multi-Attach error.
    - Power off the unresponsive nodes
    - Force delete the app pods and/or mon,osd pods on the unresponsive node
    - Check new app pods and/or mon, osd pods scheduled on another node comes
        into Running state
    - Run IOs on new app pods

    Args:
        nodes: platform nodes fixture used to power off failed nodes
        setup: fixture providing (ceph_cluster, dc_pods, ceph_pods,
            app_pod_nodes, test_nodes, disruptor)
        node_restart_teardown: fixture restoring node state after the test
    """
    ceph_cluster, dc_pods, ceph_pods, app_pod_nodes, test_nodes, disruptor = setup
    external_mode = helpers.storagecluster_independent_check()

    # Run IO on pods
    md5sum_data = self.run_and_verify_io(
        pod_list=dc_pods, fio_filename="io_file1", run_io_in_bg=True)

    # OCS-1430/OCS-1435
    # Disrupt leader plugin-provisioner pods, skip if running on node to be failed
    if disruptor:
        [disruption.delete_resource() for disruption in disruptor]

    # Induce network failure on the nodes
    node.node_network_failure(app_pod_nodes)
    logger.info(f"Waiting for {self.prolong_nw_fail_time} seconds")
    sleep(self.prolong_nw_fail_time)

    # Wait for pods to be rescheduled
    # Pods on the unreachable node go to Terminating once kubelet stops
    # reporting; the replacements will then be stuck on Multi-Attach
    for pod_obj in dc_pods + ceph_pods:
        pod_obj.ocp.wait_for_resource(
            condition=constants.STATUS_TERMINATING,
            resource_name=pod_obj.name)

    # Fetch info of new pods and verify Multi-Attach error
    new_dc_pods = self.get_new_pods(dc_pods)
    assert len(new_dc_pods) == len(dc_pods), "Unexpected number of app pods"
    self.verify_multi_attach_error(new_dc_pods)

    if ceph_pods:
        new_ceph_pods = self.get_new_pods(ceph_pods)
        assert len(new_ceph_pods) > 0, "Unexpected number of osd pods"
        self.verify_multi_attach_error(new_ceph_pods)

    logger.info("Executing manual recovery steps")
    # Power off the unresponsive node(s)
    logger.info(f"Powering off the unresponsive node(s): {app_pod_nodes}")
    nodes.stop_nodes(node.get_node_objs(app_pod_nodes))

    # Force delete the app pods and/or mon,osd pods on the unresponsive node
    # Before OCS 4.4 with 5 mons, mons on the failed node must be force
    # deleted as well
    if float(config.ENV_DATA["ocs_version"]
             ) < 4.4 and ceph_cluster.mon_count == 5:
        for pod_obj in ceph_cluster.mons:
            if pod.get_pod_node(pod_obj).name in app_pod_nodes:
                ceph_pods.append(pod_obj)

    for pod_obj in dc_pods + ceph_pods:
        pod_obj.delete(force=True)

    # Wait for new app pods to reach Running state
    # BUGFIX: the original code had `wait_for_resource(...), ("msg")` which
    # builds a discarded tuple instead of asserting; the failure message was
    # dead and a timeout result went unchecked. Assert explicitly, matching
    # the mon/osd wait below.
    for pod_obj in new_dc_pods:
        assert pod_obj.ocp.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            resource_name=pod_obj.name,
            timeout=1200,
            sleep=30,
        ), (f"App pod with name {pod_obj.name} did not reach Running state")

    if not external_mode:
        # Wait for mon and osd pods to reach Running state
        selectors_to_check = {
            constants.MON_APP_LABEL: self.expected_mon_count,
            constants.OSD_APP_LABEL: ceph_cluster.osd_count,
        }
        for selector, count in selectors_to_check.items():
            assert ceph_cluster.POD.wait_for_resource(
                condition=constants.STATUS_RUNNING,
                selector=selector,
                resource_count=count,
                timeout=1800,
                sleep=60,
            ), f"{count} expected pods with selector {selector} are not in Running state"

        if ceph_cluster.mon_count == self.expected_mon_count:
            # Check ceph health
            # A Terminating toolbox pod would block the health check; replace it
            toolbox_status = ceph_cluster.POD.get_resource_status(
                ceph_cluster.toolbox.name)
            if toolbox_status == constants.STATUS_TERMINATING:
                ceph_cluster.toolbox.delete(force=True)

            assert ceph_health_check(), "Ceph cluster health is not OK"
            logger.info("Ceph cluster health is OK")

    # Verify data integrity from new pods
    # md5sum_data order matches dc_pods order, which get_new_pods preserves
    for num, pod_obj in enumerate(new_dc_pods):
        pod.verify_data_integrity(pod_obj=pod_obj,
                                  file_name="io_file1",
                                  original_md5sum=md5sum_data[num])

    # Run IO on new pods
    self.run_and_verify_io(pod_list=new_dc_pods,
                           fio_filename="io_file2",
                           return_md5sum=False)
def expand_verify_pvcs(pvc_objs, pod_objs, pvc_size_new, file_name, fio_size):
    """
    Expands size of each PVC in the provided list of PVCs, Verifies data
    integrity by checking the existence and md5sum of file in the expanded
    PVC and Runs FIO on expanded PVCs and verifies results.

    Args:
        pvc_objs (list) : List of PVC objects which are to be expanded.
        pod_objs (list) : List of POD objects attached to the PVCs, in the
            same order as ``pvc_objs``.
        pvc_size_new (int) : Size of the expanded PVC in GB.
        file_name (str) : Name of the file on which FIO is performed.
        fio_size (int) : Size in MB of FIO.
    """
    # Expand original PVCs
    log.info("Started expansion of the PVCs.")
    for pvc_obj in pvc_objs:
        log.info(f"Expanding size of PVC {pvc_obj.name} to {pvc_size_new}G")
        pvc_obj.resize_pvc(pvc_size_new, True)
    log.info("Successfully expanded the PVCs.")

    # Verify that the fio exists and md5sum matches
    # Pair each pod with its PVC instead of indexing by position
    for pod_obj, pvc_obj in zip(pod_objs, pvc_objs):
        if pod_obj.pvc.get_pvc_vol_mode == constants.VOLUME_MODE_BLOCK:
            # Block PVCs: compare the first `fio_size` MB of the raw device
            pod.verify_data_integrity_after_expansion_for_block_pvc(
                pod_obj, pvc_obj, fio_size)
        else:
            pod.verify_data_integrity(pod_obj, file_name, pvc_obj.md5sum)

    # Run IO to utilize 50% of volume
    log.info(
        "Run IO on all pods to utilise 50% of the expanded PVC used space")
    expanded_file_name = "fio_50"
    # Loop-invariant: 50% of the new size in MB. Kept as a separate name so
    # the `fio_size` parameter (used above for block verification) is not
    # clobbered.
    half_size_mb = int(0.50 * pvc_size_new * 1000)
    for pod_obj in pod_objs:
        log.info(f"Running IO on pod {pod_obj.name}")
        log.info(f"File created during IO {expanded_file_name}")
        storage_type = ("block" if pod_obj.pvc.get_pvc_vol_mode
                        == constants.VOLUME_MODE_BLOCK else "fs")
        # Mark workload setup done and attach a WorkLoad object so run_io
        # reuses it instead of re-running setup
        pod_obj.wl_setup_done = True
        pod_obj.wl_obj = workload.WorkLoad(
            "test_workload_fio",
            pod_obj.get_storage_path(storage_type),
            "fio",
            storage_type,
            pod_obj,
            1,
        )
        pod_obj.run_io(
            storage_type=storage_type,
            size=f"{half_size_mb}M",
            runtime=20,
            fio_filename=expanded_file_name,
            end_fsync=1,
        )
    log.info("Started IO on all pods to utilise 50% of PVCs")

    for pod_obj in pod_objs:
        # Wait for IO to finish
        pod_obj.get_fio_results(3600)
        log.info(f"IO finished on pod {pod_obj.name}")
        is_block = (
            pod_obj.pvc.get_pvc_vol_mode == constants.VOLUME_MODE_BLOCK)
        # For block PVCs fio wrote to the raw device path, not a file
        expanded_file_name_pod = (expanded_file_name if not is_block else
                                  pod_obj.get_storage_path(
                                      storage_type="block"))

        # Verify presence of the file
        expanded_file_path = (expanded_file_name_pod if is_block else
                              pod.get_file_path(
                                  pod_obj, expanded_file_name_pod))
        log.info(f"Actual file path on the pod {expanded_file_path}")
        assert pod.check_file_existence(
            pod_obj, expanded_file_path
        ), f"File {expanded_file_name_pod} does not exist"
        log.info(f"File {expanded_file_name_pod} exists in {pod_obj.name}")
def test_pvc_to_pvc_clone(self, interface_type, teardown_factory):
    """
    Create a clone from an existing pvc,
    verify data is preserved in the cloning.
    """
    # Write a file into the parent PVC so the clone has data to preserve
    logger.info(f"Running IO on pod {self.pod_obj.name}")
    file_name = self.pod_obj.name
    logger.info(f"File created during IO {file_name}")
    self.pod_obj.run_io(storage_type="fs",
                        size="500M",
                        fio_filename=file_name)

    # Block until fio on the parent pod has completed
    self.pod_obj.get_fio_results()
    logger.info(f"Io completed on pod {self.pod_obj.name}.")

    # The file must exist before we checksum and clone it
    file_path = pod.get_file_path(self.pod_obj, file_name)
    logger.info(f"Actual file path on the pod {file_path}")
    assert pod.check_file_existence(
        self.pod_obj, file_path), f"File {file_name} does not exist"
    logger.info(f"File {file_name} exists in {self.pod_obj.name}")

    # Checksum of the original data, compared again after cloning
    parent_md5 = pod.cal_md5sum(self.pod_obj, file_name)

    # Clone the PVC using the interface-specific clone template
    clone_yaml = (
        constants.CSI_CEPHFS_PVC_CLONE_YAML
        if interface_type == constants.CEPHFILESYSTEM
        else constants.CSI_RBD_PVC_CLONE_YAML
    )
    pvc_clone = pvc.create_pvc_clone(
        self.pvc_obj.backed_sc, self.pvc_obj.name, clone_yaml)
    teardown_factory(pvc_clone)
    helpers.wait_for_resource_state(pvc_clone, constants.STATUS_BOUND)
    pvc_clone.reload()

    # Attach a pod to the cloned PVC and wait for it to start
    cloned_pod = helpers.create_pod(
        interface_type=interface_type,
        pvc_name=pvc_clone.name,
        namespace=pvc_clone.namespace,
        pod_dict_path=constants.NGINX_POD_YAML,
    )
    helpers.wait_for_resource_state(resource=cloned_pod,
                                    state=constants.STATUS_RUNNING)
    cloned_pod.reload()
    teardown_factory(cloned_pod)

    # The cloned volume must contain the file written to the parent
    logger.info(f"Checking the existence of {file_name} on cloned pod "
                f"{cloned_pod.name}")
    assert pod.check_file_existence(
        cloned_pod, file_path), f"File {file_path} does not exist"
    logger.info(f"File {file_name} exists in {cloned_pod.name}")

    # md5sum on the clone must match the checksum taken on the parent
    logger.info(f"Verifying that md5sum of {file_name} "
                f"on pod {self.pod_obj.name} matches with md5sum "
                f"of the same file on restore pod {cloned_pod.name}")
    assert pod.verify_data_integrity(
        cloned_pod, file_name, parent_md5), "Data integrity check failed"
    logger.info("Data integrity check passed, md5sum are same")

    # The clone must also be writable
    logger.info("Run IO on new pod")
    cloned_pod.run_io(storage_type="fs", size="100M", runtime=10)
    cloned_pod.get_fio_results()
    logger.info(f"IO completed on pod {cloned_pod.name}")
def test_pvc_snapshot_performance(self, pvc_size):
    """
    1. Run I/O on a pod file
    2. Calculate md5sum of the file
    3. Take a snapshot of the PVC
    4. Measure the total snapshot creation time and the CSI snapshot
       creation time
    4. Restore From the snapshot and measure the time
    5. Attach a new pod to it
    6. Verify that the file is present on the new pod also
    7. Verify that the md5sum of the file on the new pod matches
       with the md5sum of the file on the original pod

    This scenario run 3 times and report all the average results of the 3 runs
    and will send them to the ES
    Args:
        pvc_size: the size of the PVC to be tested - parametrize
    """

    # Getting the total Storage capacity
    ceph_cluster = CephCluster()
    ceph_capacity = ceph_cluster.get_ceph_capacity()
    log.info(f"Total capacity size is : {ceph_capacity}")
    log.info(f"PVC Size is : {pvc_size}")
    log.info(f"Needed capacity is {int(int(pvc_size) * 5)}")
    # Skip (via early return) if the cluster cannot hold 5x the PVC size
    if int(ceph_capacity) < int(pvc_size) * 5:
        log.error(
            f"PVC size is {pvc_size}GiB and it is too large for this system"
            f" which have only {ceph_capacity}GiB")
        return

    # Calculating the file size as 25% of the PVC size
    # in the end the PVC will be 75% full
    filesize = self.pvc_obj.size * 0.25
    # Change the file size to MB and from int to str
    file_size = f"{int(filesize * 1024)}M"

    all_results = []

    self.results_path = get_full_test_logs_path(cname=self)
    log.info(f"Logs file path name is : {self.full_log_path}")

    # Produce ES report
    # Collecting environment information
    self.get_env_info()

    # Initialize the results doc file.
    self.full_results = self.init_full_results(
        ResultsAnalyse(
            self.uuid,
            self.crd_data,
            self.full_log_path,
            "pvc_snapshot_perf",
        ))
    self.full_results.add_key("pvc_size", pvc_size + " GiB")
    self.full_results.add_key("interface", self.sc)
    # Per-iteration result lists, appended to in the summary loop below
    self.full_results.all_results["creation_time"] = []
    self.full_results.all_results["csi_creation_time"] = []
    self.full_results.all_results["creation_speed"] = []
    self.full_results.all_results["restore_time"] = []
    self.full_results.all_results["restore_speed"] = []
    self.full_results.all_results["restore_csi_time"] = []
    for test_num in range(self.tests_numbers):
        test_results = {
            "test_num": test_num + 1,
            # Dataset grows each iteration since fillup_fs adds another file
            "dataset": (test_num + 1) * filesize * 1024,  # size in MiB
            "create": {
                "time": None,
                "csi_time": None,
                "speed": None
            },
            "restore": {
                "time": None,
                "speed": None
            },
        }
        log.info(f"Starting test phase number {test_num}")
        # Step 1. Run I/O on a pod file.
        file_name = f"{self.pod_object.name}-{test_num}"
        log.info(f"Starting IO on the POD {self.pod_object.name}")
        # Going to run only write IO to fill the PVC for the snapshot
        self.pod_object.fillup_fs(size=file_size, fio_filename=file_name)

        # Wait for fio to finish
        fio_result = self.pod_object.get_fio_results()
        err_count = fio_result.get("jobs")[0].get("error")
        assert (
            err_count == 0
        ), f"IO error on pod {self.pod_object.name}. FIO result: {fio_result}"
        log.info("IO on the PVC Finished")

        # Verify presence of the file
        file_path = pod.get_file_path(self.pod_object, file_name)
        log.info(f"Actual file path on the pod {file_path}")
        assert pod.check_file_existence(
            self.pod_object, file_path), f"File {file_name} doesn't exist"
        log.info(f"File {file_name} exists in {self.pod_object.name}")

        # Step 2. Calculate md5sum of the file.
        orig_md5_sum = pod.cal_md5sum(self.pod_object, file_name)

        # Step 3. Take a snapshot of the PVC and measure the time of creation.
        snap_name = self.pvc_obj.name.replace("pvc-test",
                                              f"snapshot-test{test_num}")
        log.info(f"Taking snapshot of the PVC {snap_name}")

        # Timestamp taken just before creation; used as the measurement window
        # start for both total and CSI-level timing
        start_time = datetime.datetime.utcnow().strftime(
            "%Y-%m-%dT%H:%M:%SZ")

        test_results["create"]["time"] = self.measure_create_snapshot_time(
            pvc_name=self.pvc_obj.name,
            snap_name=snap_name,
            namespace=self.pod_object.namespace,
            interface=self.interface,
            start_time=start_time,
        )

        # CSI-side creation time measured separately from the k8s-level time
        test_results["create"][
            "csi_time"] = performance_lib.measure_csi_snapshot_creation_time(
                interface=self.interface,
                snapshot_id=self.snap_uid,
                start_time=start_time,
            )

        # Speed in MiB/sec = dataset size / wall-clock creation time
        test_results["create"]["speed"] = int(
            test_results["dataset"] / test_results["create"]["time"])
        log.info(
            f' Test {test_num} dataset is {test_results["dataset"]} MiB')
        log.info(
            f"Snapshot name {snap_name} and id {self.snap_uid} creation time is"
            f' : {test_results["create"]["time"]} sec.')
        log.info(
            f"Snapshot name {snap_name} and id {self.snap_uid} csi creation time is"
            f' : {test_results["create"]["csi_time"]} sec.')
        log.info(
            f'Snapshot speed is : {test_results["create"]["speed"]} MB/sec'
        )

        # Step 4. Restore the PVC from the snapshot and measure the time
        # Same Storage class of the original PVC
        sc_name = self.pvc_obj.backed_sc

        # Size should be same as of the original PVC
        # NOTE: this rebinds the `pvc_size` parameter to a "<n>Gi" string
        pvc_size = str(self.pvc_obj.size) + "Gi"

        # Create pvc out of the snapshot
        # Both, the snapshot and the restore PVC should be in same namespace
        log.info("Restoring from the Snapshot")
        restore_pvc_name = self.pvc_obj.name.replace(
            "pvc-test", f"restore-pvc{test_num}")
        restore_pvc_yaml = constants.CSI_RBD_PVC_RESTORE_YAML
        if self.interface == constants.CEPHFILESYSTEM:
            restore_pvc_yaml = constants.CSI_CEPHFS_PVC_RESTORE_YAML

        csi_start_time = self.get_time("csi")
        log.info("Restoring the PVC from Snapshot")
        restore_pvc_obj = pvc.create_restore_pvc(
            sc_name=sc_name,
            snap_name=self.snap_obj.name,
            namespace=self.snap_obj.namespace,
            size=pvc_size,
            pvc_name=restore_pvc_name,
            restore_pvc_yaml=restore_pvc_yaml,
        )
        helpers.wait_for_resource_state(
            restore_pvc_obj,
            constants.STATUS_BOUND,
            timeout=3600  # setting this to 60 Min.
            # since it can be take long time to restore, and we want it to finished.
        )
        restore_pvc_obj.reload()
        log.info("PVC was restored from the snapshot")
        test_results["restore"][
            "time"] = helpers.measure_pvc_creation_time(
                self.interface, restore_pvc_obj.name)

        test_results["restore"]["speed"] = int(
            test_results["dataset"] / test_results["restore"]["time"])
        log.info(
            f'Snapshot restore time is : {test_results["restore"]["time"]}'
        )
        log.info(
            f'restore speed is : {test_results["restore"]["speed"]} MB/sec'
        )

        # CSI-level restore time measured from the timestamp taken above
        test_results["restore"][
            "csi_time"] = performance_lib.csi_pvc_time_measure(
                self.interface, restore_pvc_obj, "create", csi_start_time)
        log.info(
            f'Snapshot csi restore time is : {test_results["restore"]["csi_time"]}'
        )

        # Step 5. Attach a new pod to the restored PVC
        restore_pod_object = helpers.create_pod(
            interface_type=self.interface,
            pvc_name=restore_pvc_obj.name,
            namespace=self.snap_obj.namespace,
        )

        # Confirm that the pod is running
        helpers.wait_for_resource_state(resource=restore_pod_object,
                                        state=constants.STATUS_RUNNING)
        restore_pod_object.reload()

        # Step 6. Verify that the file is present on the new pod also.
        log.info(f"Checking the existence of {file_name} "
                 f"on restore pod {restore_pod_object.name}")
        assert pod.check_file_existence(
            restore_pod_object, file_path), f"File {file_name} doesn't exist"
        log.info(f"File {file_name} exists in {restore_pod_object.name}")

        # Step 7. Verify that the md5sum matches
        log.info(
            f"Verifying that md5sum of {file_name} "
            f"on pod {self.pod_object.name} matches with md5sum "
            f"of the same file on restore pod {restore_pod_object.name}")
        assert pod.verify_data_integrity(
            restore_pod_object, file_name,
            orig_md5_sum), "Data integrity check failed"
        log.info("Data integrity check passed, md5sum are same")

        # Clean up the per-iteration restore resources before the next run
        restore_pod_object.delete()
        restore_pvc_obj.delete()

        all_results.append(test_results)

    # clean the enviroment
    self.pod_object.delete()
    self.pvc_obj.delete()
    self.delete_test_project()

    # logging the test summary, all info in one place for easy log reading
    c_speed, c_runtime, c_csi_runtime, r_speed, r_runtime, r_csi_runtime = (
        0 for i in range(6))

    log.info("Test summary :")
    for tst in all_results:
        c_speed += tst["create"]["speed"]
        c_runtime += tst["create"]["time"]
        c_csi_runtime += tst["create"]["csi_time"]
        r_speed += tst["restore"]["speed"]
        r_runtime += tst["restore"]["time"]
        r_csi_runtime += tst["restore"]["csi_time"]

        self.full_results.all_results["creation_time"].append(
            tst["create"]["time"])
        self.full_results.all_results["csi_creation_time"].append(
            tst["create"]["csi_time"])
        self.full_results.all_results["creation_speed"].append(
            tst["create"]["speed"])
        self.full_results.all_results["restore_time"].append(
            tst["restore"]["time"])
        self.full_results.all_results["restore_speed"].append(
            tst["restore"]["speed"])
        self.full_results.all_results["restore_csi_time"].append(
            tst["restore"]["csi_time"])
        # NOTE: overwritten each iteration — ends up holding the last
        # iteration's dataset size
        self.full_results.all_results["dataset_inMiB"] = tst["dataset"]
        log.info(
            f"Test {tst['test_num']} results : dataset is {tst['dataset']} MiB. "
            f"Take snapshot time is {tst['create']['time']} "
            f"at {tst['create']['speed']} MiB/Sec "
            f"Restore from snapshot time is {tst['restore']['time']} "
            f"at {tst['restore']['speed']} MiB/Sec ")

    # Averages over all iterations
    avg_snap_c_time = c_runtime / self.tests_numbers
    avg_snap_csi_c_time = c_csi_runtime / self.tests_numbers
    avg_snap_c_speed = c_speed / self.tests_numbers
    avg_snap_r_time = r_runtime / self.tests_numbers
    avg_snap_r_speed = r_speed / self.tests_numbers
    avg_snap_r_csi_time = r_csi_runtime / self.tests_numbers
    log.info(f" Average snapshot creation time is {avg_snap_c_time} sec.")
    log.info(
        f" Average csi snapshot creation time is {avg_snap_csi_c_time} sec."
    )
    log.info(
        f" Average snapshot creation speed is {avg_snap_c_speed} MiB/sec")
    log.info(f" Average snapshot restore time is {avg_snap_r_time} sec.")
    log.info(
        f" Average snapshot restore speed is {avg_snap_r_speed} MiB/sec")
    log.info(
        f" Average snapshot restore csi time is {avg_snap_r_csi_time} sec."
    )

    self.full_results.add_key("avg_snap_creation_time_insecs",
                              avg_snap_c_time)
    self.full_results.add_key("avg_snap_csi_creation_time_insecs",
                              avg_snap_csi_c_time)
    self.full_results.add_key("avg_snap_creation_speed", avg_snap_c_speed)
    self.full_results.add_key("avg_snap_restore_time_insecs",
                              avg_snap_r_time)
    self.full_results.add_key("avg_snap_restore_speed", avg_snap_r_speed)
    self.full_results.add_key("avg_snap_restore_csi_time_insecs",
                              avg_snap_r_csi_time)

    # Write the test results into the ES server
    log.info("writing results to elastic search server")
    if self.full_results.es_write():
        res_link = self.full_results.results_link()

        # write the ES link to the test results in the test log.
        log.info(f"The result can be found at : {res_link}")
        self.write_result_to_file(res_link)
def test_snapshot_at_different_usage_level(self, snapshot_factory,
                                           snapshot_restore_factory,
                                           pod_factory):
    """
    Test to take multiple snapshots of same PVC when the PVC usage is at
    0%, 20%, 40%, 60%, and 80%, then delete the parent PVC and restore the
    snapshots to create new PVCs. Delete snapshots and attach the restored
    PVCs to pods to verify the data.

    Args:
        snapshot_factory: fixture that creates a VolumeSnapshot of a PVC
        snapshot_restore_factory: fixture that creates a PVC from a
            VolumeSnapshot
        pod_factory: fixture that creates a pod and attaches a PVC to it

    Relies on ``self.pods``, ``self.pvcs`` and ``self.pvc_size`` prepared by
    the class-level setup (not visible here).

    """
    snapshots = []
    # Snapshots are taken once per utilization step; the 0% step skips IO.
    usage_percent = [0, 20, 40, 60, 80]
    for usage in usage_percent:
        if usage != 0:
            for pod_obj in self.pods:
                log.info(
                    f"Running IO on pod {pod_obj.name} to utilize {usage}%"
                )
                # One uniquely named fio file per utilization step so each
                # step adds (pvc_size / number_of_steps) GiB of new data.
                pod_obj.pvc.filename = f'{pod_obj.name}_{usage}'
                pod_obj.run_io(
                    storage_type='fs',
                    size=f'{int(self.pvc_size/len(usage_percent))}G',
                    runtime=20,
                    fio_filename=pod_obj.pvc.filename)
            log.info(f"IO started on all pods to utilize {usage}%")

            for pod_obj in self.pods:
                # Wait for fio to finish
                pod_obj.get_fio_results()
                log.info(f"IO to utilize {usage}% finished on pod "
                         f"{pod_obj.name}")
                # Calculate md5sum and accumulate it on the PVC keyed by
                # filename, so every file written so far can be verified
                # later on the restored PVCs.
                md5_sum = pod.cal_md5sum(pod_obj, pod_obj.pvc.filename)
                if not getattr(pod_obj.pvc, 'md5_sum', None):
                    setattr(pod_obj.pvc, 'md5_sum', {})
                pod_obj.pvc.md5_sum[pod_obj.pvc.filename] = md5_sum

        # Take snapshot of all PVCs
        log.info(f"Creating snapshot of all PVCs at {usage}%")
        for pvc_obj in self.pvcs:
            log.info(
                f"Creating snapshot of PVC {pvc_obj.name} at {usage}%")
            snap_obj = snapshot_factory(pvc_obj, wait=False)
            # Set a dict containing filename:md5sum for later verification.
            # deepcopy: the PVC's dict keeps growing on later iterations,
            # but the snapshot must keep only the files present at its
            # point in time.
            setattr(snap_obj, 'md5_sum',
                    deepcopy(getattr(pvc_obj, 'md5_sum', {})))
            snapshots.append(snap_obj)
            log.info(f"Created snapshot of PVC {pvc_obj.name} at {usage}%")
        log.info(f"Created snapshot of all PVCs at {usage}%")
    log.info("Snapshots creation completed.")

    # Verify snapshots are ready (snapshot_factory was called with
    # wait=False, so readiness is checked here in one pass)
    log.info("Verify snapshots are ready")
    for snapshot in snapshots:
        snapshot.ocp.wait_for_resource(condition='true',
                                       resource_name=snapshot.name,
                                       column=constants.STATUS_READYTOUSE,
                                       timeout=90)

    # Delete pods
    log.info("Deleting the pods")
    for pod_obj in self.pods:
        pod_obj.delete()
        pod_obj.ocp.wait_for_delete(resource_name=pod_obj.name)
    log.info("Deleted all the pods")

    # Delete parent PVCs before restoring, to prove the snapshots are
    # independent of their parents.
    log.info("Deleting parent PVCs")
    for pvc_obj in self.pvcs:
        # TODO: Unblock parent PVC deletion for cephfs PVC when the bug 1854501 is fixed
        if constants.RBD_INTERFACE in pvc_obj.backed_sc:
            pv_obj = pvc_obj.backed_pv_obj
            pvc_obj.delete()
            pvc_obj.ocp.wait_for_delete(resource_name=pvc_obj.name)
            log.info(f"Deleted PVC {pvc_obj.name}. Verifying whether PV "
                     f"{pv_obj.name} is deleted.")
            pv_obj.ocp.wait_for_delete(resource_name=pv_obj.name)
    log.info("Deleted parent PVCs before restoring snapshot. "
             "PVs are also deleted.")

    restore_pvc_objs = []

    # Create PVCs out of the snapshots
    log.info("Creating new PVCs from snapshots")
    for snapshot in snapshots:
        log.info(f"Creating a PVC from snapshot {snapshot.name}")
        restore_pvc_obj = snapshot_restore_factory(
            snapshot_obj=snapshot,
            size=f'{self.pvc_size}Gi',
            volume_mode=snapshot.parent_volume_mode,
            access_mode=snapshot.parent_access_mode,
            status='')

        log.info(f"Created PVC {restore_pvc_obj.name} from snapshot "
                 f"{snapshot.name}")
        restore_pvc_objs.append(restore_pvc_obj)
    log.info("Created new PVCs from all the snapshots")

    # Confirm that the restored PVCs are Bound
    log.info("Verify the restored PVCs are Bound")
    for pvc_obj in restore_pvc_objs:
        wait_for_resource_state(resource=pvc_obj,
                                state=constants.STATUS_BOUND,
                                timeout=90)
        pvc_obj.reload()
    log.info("Verified: Restored PVCs are Bound.")

    # Delete volume snapshots before attaching the restored PVCs, to prove
    # the restored PVCs are independent of their source snapshots.
    log.info("Deleting snapshots")
    for snapshot in snapshots:
        snapshot.delete()

    # Verify volume snapshots are deleted
    log.info("Verify snapshots are deleted")
    for snapshot in snapshots:
        snapshot.ocp.wait_for_delete(resource_name=snapshot.name)
    log.info("Verified: Snapshots are deleted")

    # Attach the restored PVCs to pods
    log.info("Attach the restored PVCs to pods")
    restore_pod_objs = []
    for restore_pvc_obj in restore_pvc_objs:
        # Pick the pod interface from the parent storage class of the
        # snapshot the PVC was restored from.
        interface = constants.CEPHFILESYSTEM if (
            constants.CEPHFS_INTERFACE in
            restore_pvc_obj.snapshot.parent_sc
        ) else constants.CEPHBLOCKPOOL
        restore_pod_obj = pod_factory(interface=interface,
                                      pvc=restore_pvc_obj, status='')
        log.info(f"Attached the PVC {restore_pvc_obj.name} to pod "
                 f"{restore_pod_obj.name}")
        restore_pod_objs.append(restore_pod_obj)

    # Verify the new pods are running
    log.info("Verify the new pods are running")
    for pod_obj in restore_pod_objs:
        wait_for_resource_state(pod_obj, constants.STATUS_RUNNING)
    log.info("Verified: New pods are running")

    # Verify md5sum of files: every file recorded on the source snapshot
    # must exist on the restored volume with an unchanged checksum.
    log.info("Verifying md5sum of files on all the pods")
    for restore_pod_obj in restore_pod_objs:
        log.info(f"Verifying md5sum of these files on pod "
                 f"{restore_pod_obj.name}:"
                 f"{restore_pod_obj.pvc.snapshot.md5_sum}")
        for file_name, actual_md5_sum in (
                restore_pod_obj.pvc.snapshot.md5_sum.items()):
            file_path = pod.get_file_path(restore_pod_obj, file_name)
            log.info(f"Checking the existence of file {file_name} on pod "
                     f"{restore_pod_obj.name}")
            assert pod.check_file_existence(
                restore_pod_obj, file_path), (
                f"File {file_name} does not exist on pod "
                f"{restore_pod_obj.name}")
            log.info(
                f"File {file_name} exists on pod {restore_pod_obj.name}")

            # Verify that the md5sum matches
            log.info(f"Verifying md5sum of file {file_name} on pod "
                     f"{restore_pod_obj.name}")
            pod.verify_data_integrity(restore_pod_obj, file_name,
                                      actual_md5_sum)
            log.info(f"Verified md5sum of file {file_name} on pod "
                     f"{restore_pod_obj.name}")
        log.info(f"Verified md5sum of these files on pod "
                 f"{restore_pod_obj.name}:"
                 f"{restore_pod_obj.pvc.snapshot.md5_sum}")
    log.info("md5sum verified")
def test_expansion_snapshot_clone(self, snapshot_factory,
                                  snapshot_restore_factory,
                                  pvc_clone_factory, pod_factory):
    """
    This test performs the following operations :
    Expand parent PVC --> Take snapshot --> Expand parent PVC --> Take clone
    --> Restore snapshot --> Expand cloned and restored PVC --> Clone
    restored PVC --> Snapshot and restore of cloned PVCs --> Expand new PVCs
    Data integrity will be checked in each stage as required. This test
    verifies that the clone, snapshot and parent PVCs are independent and
    any operation in one will not impact the other.

    Args:
        snapshot_factory: fixture that creates a VolumeSnapshot of a PVC
        snapshot_restore_factory: fixture that creates a PVC from a
            VolumeSnapshot
        pvc_clone_factory: fixture that creates a clone of a PVC
        pod_factory: fixture that creates a pod and attaches a PVC to it

    Relies on ``self.pods`` and ``self.pvcs`` prepared by class-level setup.

    """
    filename = "fio_file"
    filename_restore_clone = "fio_file_restore_clone"
    # Successive expansion targets in Gi; snapshots are taken between the
    # first and second expansion so restore size can be checked later.
    pvc_size_expand_1 = 4
    pvc_size_expand_2 = 6
    pvc_size_expand_3 = 8
    snapshots = []

    # Run IO
    log.info("Start IO on pods")
    for pod_obj in self.pods:
        log.info(f"Running IO on pod {pod_obj.name}")
        pod_obj.run_io(storage_type="fs", size="1G", runtime=20,
                       fio_filename=filename)
    log.info("IO started on all pods")

    log.info("Wait for IO completion on pods")
    for pod_obj in self.pods:
        pod_obj.get_fio_results()
        log.info(f"IO finished on pod {pod_obj.name}")
        # Calculate md5sum of the first file for later integrity checks
        md5sum = pod.cal_md5sum(pod_obj, filename)
        pod_obj.pvc.md5sum = md5sum
    log.info("IO completed on all pods")

    # Expand PVCs
    log.info(f"Expanding PVCs to {pvc_size_expand_1}Gi")
    for pvc_obj in self.pvcs:
        log.info(
            f"Expanding size of PVC {pvc_obj.name} to {pvc_size_expand_1}Gi"
        )
        pvc_obj.resize_pvc(pvc_size_expand_1, True)
    log.info(
        f"Verified: Size of all PVCs are expanded to {pvc_size_expand_1}Gi"
    )

    # Take snapshot of all PVCs
    log.info("Creating snapshot of all PVCs")
    for pvc_obj in self.pvcs:
        log.info(f"Creating snapshot of PVC {pvc_obj.name}")
        snap_obj = snapshot_factory(pvc_obj, wait=False)
        snap_obj.md5sum = pvc_obj.md5sum
        snapshots.append(snap_obj)
        log.info(f"Created snapshot of PVC {pvc_obj.name}")
    log.info("Created snapshot of all PVCs")

    # Verify snapshots are ready
    log.info("Verify snapshots are ready")
    for snap_obj in snapshots:
        snap_obj.ocp.wait_for_resource(
            condition="true",
            resource_name=snap_obj.name,
            column=constants.STATUS_READYTOUSE,
            timeout=180,
        )
        snap_obj.reload()
    log.info("Verified: Snapshots are Ready")

    # Expand PVCs again, after snapshotting. Snapshots must keep the
    # pre-expansion restore size; clones must pick up the new size.
    log.info(f"Expanding PVCs to {pvc_size_expand_2}Gi")
    for pvc_obj in self.pvcs:
        log.info(
            f"Expanding size of PVC {pvc_obj.name} to {pvc_size_expand_2}Gi"
        )
        pvc_obj.resize_pvc(pvc_size_expand_2, True)
    log.info(
        f"Verified: Size of all PVCs are expanded to {pvc_size_expand_2}Gi"
    )

    # Clone PVCs
    log.info("Creating clone of all PVCs")
    clone_objs = []
    for pvc_obj in self.pvcs:
        log.info(f"Creating clone of PVC {pvc_obj.name}")
        clone_obj = pvc_clone_factory(
            pvc_obj=pvc_obj, status="",
            volume_mode=constants.VOLUME_MODE_FILESYSTEM)
        clone_obj.md5sum = pvc_obj.md5sum
        clone_objs.append(clone_obj)
        log.info(f"Created clone of PVC {pvc_obj.name}")
    log.info("Created clone of all PVCs")

    log.info("Wait for cloned PVCs to reach Bound state and verify size")
    for pvc_obj in clone_objs:
        helpers.wait_for_resource_state(resource=pvc_obj,
                                        state=constants.STATUS_BOUND,
                                        timeout=180)
        assert pvc_obj.size == pvc_size_expand_2, (
            f"Size is not {pvc_size_expand_2} but {pvc_obj.size} in "
            f"cloned PVC {pvc_obj.name}")
    log.info(
        f"Cloned PVCs reached Bound state. Verified the size of all PVCs "
        f"as {pvc_size_expand_2}Gi")

    # Ensure restore size is not impacted by parent PVC expansion
    log.info("Verify restore size of snapshots")
    for snapshot_obj in snapshots:
        snapshot_info = snapshot_obj.get()
        assert snapshot_info["status"]["restoreSize"] == (
            f"{pvc_size_expand_1}Gi"), (
            f"Restore size mismatch in snapshot {snapshot_obj.name}\n"
            f"{snapshot_info}")
    log.info(
        f"Verified: Restore size of all snapshots are {pvc_size_expand_1}Gi"
    )

    # Restore snapshots
    log.info("Restore snapshots")
    restore_objs = []
    for snap_obj in snapshots:
        restore_obj = snapshot_restore_factory(snapshot_obj=snap_obj,
                                               status="")
        restore_obj.md5sum = snap_obj.md5sum
        restore_objs.append(restore_obj)

    log.info("Verify restored PVCs are Bound")
    for pvc_obj in restore_objs:
        helpers.wait_for_resource_state(resource=pvc_obj,
                                        state=constants.STATUS_BOUND,
                                        timeout=180)
        pvc_obj.reload()
    log.info("Verified: Restored PVCs are Bound.")

    # Attach the restored and cloned PVCs to pods
    log.info("Attach the restored and cloned PVCs to pods")
    restore_clone_pod_objs = []
    for pvc_obj in restore_objs + clone_objs:
        interface = (constants.CEPHFILESYSTEM if
                     (constants.CEPHFS_INTERFACE in pvc_obj.backed_sc) else
                     constants.CEPHBLOCKPOOL)
        pod_obj = pod_factory(interface=interface, pvc=pvc_obj, status="")
        log.info(f"Attached the PVC {pvc_obj.name} to pod {pod_obj.name}")
        restore_clone_pod_objs.append(pod_obj)

    log.info("Verify pods are Running")
    for pod_obj in restore_clone_pod_objs:
        helpers.wait_for_resource_state(resource=pod_obj,
                                        state=constants.STATUS_RUNNING,
                                        timeout=180)
        pod_obj.reload()
    log.info("Verified: Pods reached Running state")

    # Expand cloned and restored PVCs
    log.info(
        f"Expanding cloned and restored PVCs to {pvc_size_expand_3}Gi")
    for pvc_obj in clone_objs + restore_objs:
        log.info(f"Expanding size of PVC {pvc_obj.name} to "
                 f"{pvc_size_expand_3}Gi from {pvc_obj.size}")
        pvc_obj.resize_pvc(pvc_size_expand_3, True)
    log.info(
        f"Verified: Size of all cloned and restored PVCs are expanded to "
        f"{pvc_size_expand_3}G")

    # Run IO on pods attached with cloned and restored PVCs
    log.info("Starting IO on pods attached with cloned and restored PVCs")
    for pod_obj in restore_clone_pod_objs:
        log.info(f"Running IO on pod {pod_obj.name}")
        pod_obj.run_io(
            storage_type="fs",
            size="1G",
            runtime=20,
            fio_filename=filename_restore_clone,
        )
    log.info("IO started on all pods")

    log.info("Waiting for IO completion on pods attached with cloned and "
             "restored PVCs")
    for pod_obj in restore_clone_pod_objs:
        pod_obj.get_fio_results()
        log.info(f"IO finished on pod {pod_obj.name}")
        # Calculate md5sum of second file 'filename_restore_clone'
        md5sum = pod.cal_md5sum(pod_obj, filename_restore_clone)
        pod_obj.pvc.md5sum_new = md5sum
    log.info(f"IO completed on all pods. Obtained md5sum of file "
             f"{filename_restore_clone}")

    # Verify md5sum of first file 'filename'
    log.info(f"Verify md5sum of file {filename} on pods")
    for pod_obj in restore_clone_pod_objs:
        pod.verify_data_integrity(pod_obj, filename, pod_obj.pvc.md5sum)
        log.info(f"Verified: md5sum of {filename} on pod {pod_obj.name} "
                 f"matches with the original md5sum")
    log.info("Data integrity check passed on all pods where restored and "
             "cloned PVCs are attached")

    # Clone the restored PVCs
    log.info("Creating clone of restored PVCs")
    restored_clone_objs = []
    for pvc_obj in restore_objs:
        log.info(f"Creating clone of restored PVC {pvc_obj.name}")
        clone_obj = pvc_clone_factory(
            pvc_obj=pvc_obj, status="",
            volume_mode=constants.VOLUME_MODE_FILESYSTEM)
        clone_obj.md5sum = pvc_obj.md5sum
        clone_obj.md5sum_new = pvc_obj.md5sum_new
        restored_clone_objs.append(clone_obj)
        log.info(f"Created clone of restored PVC {pvc_obj.name}")
    log.info("Created clone of restored all PVCs")

    log.info("Wait for cloned PVCs to reach Bound state and verify size")
    for pvc_obj in restored_clone_objs:
        helpers.wait_for_resource_state(resource=pvc_obj,
                                        state=constants.STATUS_BOUND,
                                        timeout=180)
        assert pvc_obj.size == pvc_size_expand_3, (
            f"Size is not {pvc_size_expand_3} but {pvc_obj.size} in "
            f"cloned PVC {pvc_obj.name}")
    log.info(
        f"Cloned PVCs reached Bound state. Verified the size of all PVCs "
        f"as {pvc_size_expand_3}Gi")

    # Take snapshot of all cloned PVCs
    snapshots_new = []
    log.info("Creating snapshot of all cloned PVCs")
    for pvc_obj in clone_objs + restored_clone_objs:
        log.info(f"Creating snapshot of PVC {pvc_obj.name}")
        snap_obj = snapshot_factory(pvc_obj, wait=False)
        snap_obj.md5sum = pvc_obj.md5sum
        snap_obj.md5sum_new = pvc_obj.md5sum_new
        snapshots_new.append(snap_obj)
        log.info(f"Created snapshot of PVC {pvc_obj.name}")
    log.info("Created snapshot of all cloned PVCs")

    # Verify snapshots are ready
    log.info("Verify snapshots of cloned PVCs are Ready")
    for snap_obj in snapshots_new:
        snap_obj.ocp.wait_for_resource(
            condition="true",
            resource_name=snap_obj.name,
            column=constants.STATUS_READYTOUSE,
            timeout=180,
        )
        snap_obj.reload()
    log.info("Verified: Snapshots of cloned PVCs are Ready")

    # Restore snapshots
    log.info("Restoring snapshots of cloned PVCs")
    restore_objs_new = []
    for snap_obj in snapshots_new:
        restore_obj = snapshot_restore_factory(snap_obj, status="")
        restore_obj.md5sum = snap_obj.md5sum
        restore_obj.md5sum_new = snap_obj.md5sum_new
        restore_objs_new.append(restore_obj)

    log.info("Verify restored PVCs are Bound")
    for pvc_obj in restore_objs_new:
        helpers.wait_for_resource_state(resource=pvc_obj,
                                        state=constants.STATUS_BOUND,
                                        timeout=180)
        pvc_obj.reload()
    log.info("Verified: Restored PVCs are Bound.")

    # Delete pods to attach the cloned PVCs to new pods
    log.info("Delete pods")
    for pod_obj in restore_clone_pod_objs:
        pod_obj.delete()
    for pod_obj in restore_clone_pod_objs:
        pod_obj.ocp.wait_for_delete(resource_name=pod_obj.name)
    log.info("Pods are deleted")

    # Attach the restored and cloned PVCs to new pods
    log.info("Attach the restored and cloned PVCs to new pods")
    restore_clone_pod_objs.clear()
    for pvc_obj in restore_objs_new + clone_objs:
        interface = (constants.CEPHFILESYSTEM if
                     (constants.CEPHFS_INTERFACE in pvc_obj.backed_sc) else
                     constants.CEPHBLOCKPOOL)
        pod_obj = pod_factory(interface=interface, pvc=pvc_obj, status="")
        log.info(f"Attached the PVC {pvc_obj.name} to pod {pod_obj.name}")
        restore_clone_pod_objs.append(pod_obj)

    log.info("Verify pods are Running")
    for pod_obj in restore_clone_pod_objs:
        helpers.wait_for_resource_state(resource=pod_obj,
                                        state=constants.STATUS_RUNNING,
                                        timeout=180)
        pod_obj.reload()
    log.info("Verified: Pods reached Running state")

    # Expand PVCs
    pvc_size_expand_4 = pvc_size_expand_3 + 2
    log.info(
        f"Expanding restored and cloned PVCs to {pvc_size_expand_4}Gi")
    for pvc_obj in restore_objs_new + clone_objs:
        log.info(
            f"Expanding size of PVC {pvc_obj.name} to {pvc_size_expand_4}Gi"
        )
        pvc_obj.resize_pvc(pvc_size_expand_4, True)
    log.info(
        f"Verified: Size of all PVCs are expanded to {pvc_size_expand_4}Gi"
    )

    # Verify md5sum of both files
    log.info(
        f"Verify md5sum of files {filename} and {filename_restore_clone}")
    for pod_obj in restore_clone_pod_objs:
        pod.verify_data_integrity(pod_obj, filename, pod_obj.pvc.md5sum)
        log.info(f"Verified: md5sum of {filename} on pod {pod_obj.name} "
                 f"matches with the original md5sum")
        pod.verify_data_integrity(pod_obj, filename_restore_clone,
                                  pod_obj.pvc.md5sum_new)
        log.info(f"Verified: md5sum of {filename_restore_clone} on pod "
                 f"{pod_obj.name} matches with the original md5sum")
    log.info("Data integrity check passed on all pods where restored and "
             "cloned PVCs are attached")
def test_resource_deletion_during_pvc_clone(self, pvc_clone_factory,
                                            pod_factory):
    """
    Verify PVC clone will succeeded if rook-ceph, csi pods are re-spun
    while creating the clone

    Args:
        pvc_clone_factory: fixture that creates a clone of a PVC
        pod_factory: fixture that creates a pod and attaches a PVC to it

    Relies on ``self.pods``, ``self.pvcs``, ``self.provider_index`` and
    ``self.consumer_index`` prepared by class-level setup.

    """
    # Ceph/CSI pod types that will be re-spun while the clones are created
    pods_to_delete = [
        "rbdplugin_provisioner",
        "cephfsplugin_provisioner",
        "cephfsplugin",
        "rbdplugin",
        "osd",
        "mgr",
    ]
    # Enough workers to run every clone creation and every pod deletion
    # concurrently
    executor = ThreadPoolExecutor(max_workers=len(self.pvcs) +
                                  len(pods_to_delete))
    disruption_ops = [
        disruption_helpers.Disruptions() for _ in pods_to_delete
    ]
    file_name = "file_clone"

    # Run IO
    log.info("Running fio on all pods to create a file")
    for pod_obj in self.pods:
        storage_type = ("block" if
                        (pod_obj.pvc.volume_mode ==
                         constants.VOLUME_MODE_BLOCK) else "fs")
        pod_obj.run_io(
            storage_type=storage_type,
            size="1G",
            runtime=30,
            fio_filename=file_name,
            end_fsync=1,
        )

    log.info("Wait for IO to complete on pods")
    for pod_obj in self.pods:
        pod_obj.get_fio_results()
        log.info(f"Verified IO on pod {pod_obj.name}")
        # Calculate md5sum. For block-mode PVCs the raw device path is
        # checksummed instead of a file on a filesystem.
        file_name_pod = (file_name if
                         (pod_obj.pvc.volume_mode ==
                          constants.VOLUME_MODE_FILESYSTEM) else
                         pod_obj.get_storage_path(storage_type="block"))
        pod_obj.pvc.md5sum = cal_md5sum(
            pod_obj,
            file_name_pod,
            pod_obj.pvc.volume_mode == constants.VOLUME_MODE_BLOCK,
        )
        log.info(f"md5sum obtained from pod {pod_obj.name}")
    log.info("IO is successful on all pods")

    # Select the pods to be deleted
    for disruption, pod_type in zip(disruption_ops, pods_to_delete):
        cluster_index = None
        # 'provider_index' will not be None if the platform is Managed Services
        if self.provider_index is not None:
            # osd and mgr pods run on the provider cluster; CSI pods run
            # on the consumer cluster
            if pod_type in ["osd", "mgr"]:
                cluster_index = self.provider_index
                config.switch_to_provider()
            else:
                cluster_index = self.consumer_index
                config.switch_ctx(cluster_index)

        disruption.set_resource(resource=pod_type,
                                cluster_index=cluster_index)

    # Switch cluster context if the platform is MS. 'provider_index' will not be None if platform is MS.
    if self.provider_index is not None:
        config.switch_ctx(self.consumer_index)

    # Clone PVCs
    log.info("Start creating clone of PVCs")
    for pvc_obj in self.pvcs:
        log.info(f"Creating clone of PVC {pvc_obj.name}")
        pvc_obj.clone_proc = executor.submit(
            pvc_clone_factory,
            pvc_obj=pvc_obj,
            status="",
            access_mode=pvc_obj.get_pvc_access_mode,
            volume_mode=pvc_obj.volume_mode,
        )
    log.info("Started creating clone")

    # Delete the pods 'pods_to_delete' while the clones are being created
    log.info(f"Deleting pods {pods_to_delete}")
    for disruption in disruption_ops:
        disruption.delete_proc = executor.submit(
            disruption.delete_resource)

    # Wait for delete and recovery
    for disruption in disruption_ops:
        disruption.delete_proc.result()

    # Get cloned PVCs
    clone_pvc_objs = []
    for pvc_obj in self.pvcs:
        clone_obj = pvc_obj.clone_proc.result()
        clone_pvc_objs.append(clone_obj)
        log.info(f"Created clone {clone_obj.name} of PVC {pvc_obj.name}")
    log.info("Created clone of all PVCs")

    # Confirm that the cloned PVCs are Bound
    log.info("Verifying the cloned PVCs are Bound")
    for pvc_obj in clone_pvc_objs:
        wait_for_resource_state(resource=pvc_obj,
                                state=constants.STATUS_BOUND,
                                timeout=300)
        pvc_obj.reload()
        pvc_obj.volume_mode = pvc_obj.data["spec"]["volumeMode"]
    log.info("Verified: Cloned PVCs are Bound.")

    clone_pod_objs = []

    # Attach the cloned PVCs to pods
    log.info("Attach the cloned PVCs to pods")
    for pvc_obj in clone_pvc_objs:
        if pvc_obj.volume_mode == constants.VOLUME_MODE_BLOCK:
            pod_dict_path = constants.CSI_RBD_RAW_BLOCK_POD_YAML
        else:
            pod_dict_path = ""
        restore_pod_obj = pod_factory(
            interface=pvc_obj.interface,
            pvc=pvc_obj,
            status="",
            pod_dict_path=pod_dict_path,
            raw_block_pv=pvc_obj.volume_mode == constants.VOLUME_MODE_BLOCK,
        )
        clone_pod_objs.append(restore_pod_obj)

    # Verify the new pods are running
    log.info("Verify the new pods are running")
    for pod_obj in clone_pod_objs:
        wait_for_resource_state(pod_obj, constants.STATUS_RUNNING)
    log.info("Verified: New pods are running")

    # Verify md5sum of the cloned volumes against the parent PVC checksum
    log.info("Verify md5sum")
    for pod_obj in clone_pod_objs:
        file_name_pod = (file_name if
                         (pod_obj.pvc.volume_mode ==
                          constants.VOLUME_MODE_FILESYSTEM) else
                         pod_obj.get_storage_path(storage_type="block"))
        verify_data_integrity(
            pod_obj,
            file_name_pod,
            pod_obj.pvc.parent.md5sum,
            pod_obj.pvc.volume_mode == constants.VOLUME_MODE_BLOCK,
        )
        log.info(
            f"Verified: md5sum of {file_name_pod} on pod {pod_obj.name} "
            f"matches with the original md5sum")
    log.info("Data integrity check passed on all pods")

    # Run IO on the cloned volumes to confirm they are usable
    log.info("Running IO on new pods")
    for pod_obj in clone_pod_objs:
        storage_type = ("block" if
                        (pod_obj.pvc.volume_mode ==
                         constants.VOLUME_MODE_BLOCK) else "fs")
        pod_obj.run_io(
            storage_type=storage_type,
            size="1G",
            runtime=20,
            fio_filename=file_name,
            end_fsync=1,
        )

    log.info("Wait for IO to complete on new pods")
    for pod_obj in clone_pod_objs:
        pod_obj.get_fio_results()
        log.info(f"Verified IO on new pod {pod_obj.name}")
    log.info("IO completed on new pods")
def test_encrypted_rbd_block_pvc_snapshot(
    self,
    kms_provider,
    snapshot_factory,
    snapshot_restore_factory,
    pod_factory,
    kv_version,
):
    """
    Test to take snapshots of encrypted RBD Block VolumeMode PVCs

    Args:
        kms_provider: KMS provider in use (compared against
            constants.VAULT_KMS_PROVIDER for Vault-specific checks)
        snapshot_factory: fixture that creates a VolumeSnapshot of a PVC
        snapshot_restore_factory: fixture that creates a PVC from a
            VolumeSnapshot
        pod_factory: fixture that creates a pod and attaches a PVC to it
        kv_version: Vault KV secrets engine version ("v1" or "v2")

    Relies on ``self.vol_handles``, ``self.pod_objs``, ``self.pvc_objs``,
    ``self.pvc_size``, ``self.sc_obj`` and ``self.kms`` from class setup.

    Raises:
        ResourceNotFoundError: if an encrypted device or an expected Vault
            key is missing
        KMSResourceCleaneupError: if a Vault key is not deleted after the
            corresponding resource is removed

    """
    log.info(
        "Check for encrypted device, find initial md5sum value and run IO on all pods"
    )
    for vol_handle, pod_obj in zip(self.vol_handles, self.pod_objs):
        # Verify whether encrypted device is present inside the pod
        # (the dm-crypt mapping shows up as a 'crypt' entry in lsblk)
        if pod_obj.exec_sh_cmd_on_pod(
                command=f"lsblk | grep {vol_handle} | grep crypt"):
            log.info(f"Encrypted device found in {pod_obj.name}")
        else:
            raise ResourceNotFoundError(
                f"Encrypted device not found in {pod_obj.name}")

        # Find initial md5sum of the raw block device, to confirm later
        # that the IO actually changed the data
        pod_obj.md5sum_before_io = cal_md5sum(
            pod_obj=pod_obj,
            file_name=pod_obj.get_storage_path(storage_type="block"),
            block=True,
        )
        pod_obj.run_io(
            storage_type="block",
            size=f"{self.pvc_size - 1}G",
            io_direction="write",
            runtime=60,
        )
    log.info("IO started on all pods")

    # Wait for IO completion
    for pod_obj in self.pod_objs:
        pod_obj.get_fio_results()
    log.info("IO completed on all pods")

    snap_objs, snap_handles = ([] for i in range(2))

    # Verify md5sum has changed after IO. Create snapshot
    log.info(
        "Verify md5sum has changed after IO and create snapshot from all PVCs"
    )
    for pod_obj in self.pod_objs:
        md5sum_after_io = cal_md5sum(
            pod_obj=pod_obj,
            file_name=pod_obj.get_storage_path(storage_type="block"),
            block=True,
        )
        assert (pod_obj.md5sum_before_io != md5sum_after_io
                ), f"md5sum has not changed after IO on pod {pod_obj.name}"
        log.info(f"Creating snapshot of PVC {pod_obj.pvc.name}")
        snap_obj = snapshot_factory(pod_obj.pvc, wait=False)
        # Keep the post-IO checksum on the snapshot object for the
        # data-integrity check after restore
        snap_obj.md5sum = md5sum_after_io
        snap_objs.append(snap_obj)
    log.info("Snapshots created")

    # Verify snapshots are ready and verify if encryption key is created in vault
    log.info("Verify snapshots are ready")
    for snap_obj in snap_objs:
        snap_obj.ocp.wait_for_resource(
            condition="true",
            resource_name=snap_obj.name,
            column=constants.STATUS_READYTOUSE,
            timeout=180,
        )
        snapshot_content = get_snapshot_content_obj(snap_obj=snap_obj)
        snap_handle = snapshot_content.get().get("status").get(
            "snapshotHandle")
        if kms_provider == constants.VAULT_KMS_PROVIDER:
            # Each snapshot should get its own encryption key in Vault,
            # stored under the snapshot handle
            if kms.is_key_present_in_path(
                    key=snap_handle, path=self.kms.vault_backend_path):
                log.info(f"Vault: Found key for snapshot {snap_obj.name}")
            else:
                raise ResourceNotFoundError(
                    f"Vault: Key not found for snapshot {snap_obj.name}")
        snap_handles.append(snap_handle)

    # Delete pods
    log.info("Deleting the pods")
    for pod_obj in self.pod_objs:
        pod_obj.delete()
        pod_obj.ocp.wait_for_delete(resource_name=pod_obj.name)
    log.info("Deleted all the pods")

    # Delete parent PVCs to verify snapshot is independent
    log.info("Deleting parent PVCs")
    for pvc_obj in self.pvc_objs:
        pv_obj = pvc_obj.backed_pv_obj
        pvc_obj.delete()
        pvc_obj.ocp.wait_for_delete(resource_name=pvc_obj.name)
        log.info(f"Deleted PVC {pvc_obj.name}. Verifying whether PV "
                 f"{pv_obj.name} is deleted.")
        pv_obj.ocp.wait_for_delete(resource_name=pv_obj.name)
    log.info(
        "All parent PVCs and PVs are deleted before restoring snapshot.")

    restore_pvc_objs, restore_vol_handles = ([] for i in range(2))

    # Create PVCs out of the snapshots
    log.info("Creating new PVCs from snapshots")
    for snap_obj in snap_objs:
        log.info(f"Creating a PVC from snapshot {snap_obj.name}")
        restore_pvc_obj = snapshot_restore_factory(
            snapshot_obj=snap_obj,
            storageclass=self.sc_obj.name,
            size=f"{self.pvc_size}Gi",
            volume_mode=snap_obj.parent_volume_mode,
            access_mode=snap_obj.parent_access_mode,
            status="",
        )

        log.info(f"Created PVC {restore_pvc_obj.name} from snapshot "
                 f"{snap_obj.name}")
        restore_pvc_obj.md5sum = snap_obj.md5sum
        restore_pvc_objs.append(restore_pvc_obj)
    log.info("Created new PVCs from all the snapshots")

    # Confirm that the restored PVCs are Bound
    log.info("Verify the restored PVCs are Bound")
    for pvc_obj in restore_pvc_objs:
        wait_for_resource_state(resource=pvc_obj,
                                state=constants.STATUS_BOUND,
                                timeout=180)
        pvc_obj.reload()
    log.info("Verified: Restored PVCs are Bound.")

    # Attach the restored PVCs to pods. Attach RWX PVC on two pods
    log.info("Attach the restored PVCs to pods")
    restore_pod_objs = create_pods(
        restore_pvc_objs,
        pod_factory,
        constants.CEPHBLOCKPOOL,
        pods_for_rwx=1,
        status="",
    )

    # Verify the new pods are running
    log.info("Verify the new pods are running")
    for pod_obj in restore_pod_objs:
        # IBM Cloud needs a longer timeout for pods to start
        timeout = (300 if config.ENV_DATA["platform"] ==
                   constants.IBMCLOUD_PLATFORM else 60)
        wait_for_resource_state(pod_obj, constants.STATUS_RUNNING, timeout)
    log.info("Verified: New pods are running")

    # Verify encryption keys are created for restored PVCs in Vault
    for pvc_obj in restore_pvc_objs:
        pv_obj = pvc_obj.backed_pv_obj
        vol_handle = pv_obj.get().get("spec").get("csi").get(
            "volumeHandle")
        restore_vol_handles.append(vol_handle)
        if kms_provider == constants.VAULT_KMS_PROVIDER:
            if kms.is_key_present_in_path(
                    key=vol_handle, path=self.kms.vault_backend_path):
                log.info(
                    f"Vault: Found key for restore PVC {pvc_obj.name}")
            else:
                raise ResourceNotFoundError(
                    f"Vault: Key not found for restored PVC {pvc_obj.name}"
                )

    # Verify encrypted device is present and md5sum on all pods
    for vol_handle, pod_obj in zip(restore_vol_handles, restore_pod_objs):
        if pod_obj.exec_sh_cmd_on_pod(
                command=f"lsblk | grep {vol_handle} | grep crypt"):
            log.info(f"Encrypted device found in {pod_obj.name}")
        else:
            raise ResourceNotFoundError(
                f"Encrypted device not found in {pod_obj.name}")

        log.info(f"Verifying md5sum on pod {pod_obj.name}")
        verify_data_integrity(
            pod_obj=pod_obj,
            file_name=pod_obj.get_storage_path(storage_type="block"),
            original_md5sum=pod_obj.pvc.md5sum,
            block=True,
        )
        log.info(f"Verified md5sum on pod {pod_obj.name}")

    # Run IO on new pods
    log.info("Starting IO on new pods")
    for pod_obj in restore_pod_objs:
        pod_obj.run_io(storage_type="block", size="500M", runtime=15)

    # Wait for IO completion on new pods
    log.info("Waiting for IO completion on new pods")
    for pod_obj in restore_pod_objs:
        pod_obj.get_fio_results()
    log.info("IO completed on new pods.")

    # Delete the restored pods, PVC and snapshots
    log.info("Deleting pods using restored PVCs")
    for pod_obj in restore_pod_objs:
        pod_obj.delete()
        pod_obj.ocp.wait_for_delete(resource_name=pod_obj.name)

    log.info("Deleting restored PVCs")
    for pvc_obj in restore_pvc_objs:
        pv_obj = pvc_obj.backed_pv_obj
        pvc_obj.delete()
        pv_obj.ocp.wait_for_delete(resource_name=pv_obj.name)

    log.info("Deleting the snapshots")
    for snap_obj in snap_objs:
        snapcontent_obj = get_snapshot_content_obj(snap_obj=snap_obj)
        snap_obj.delete()
        snapcontent_obj.ocp.wait_for_delete(
            resource_name=snapcontent_obj.name)

    if kms_provider == constants.VAULT_KMS_PROVIDER:
        # Verify if keys for PVCs and snapshots are deleted from Vault.
        # NOTE(review): the version gate suggests KV v2 key deletion is
        # only verified on OCS >= 4.9 — presumably due to a product
        # limitation on older releases; confirm against release notes.
        if kv_version == "v1" or Version.coerce(
                config.ENV_DATA["ocs_version"]) >= Version.coerce("4.9"):
            log.info(
                "Verify whether the keys for PVCs and snapshots are deleted in vault"
            )
            for key in self.vol_handles + snap_handles + restore_vol_handles:
                if not kms.is_key_present_in_path(
                        key=key, path=self.kms.vault_backend_path):
                    log.info(f"Vault: Key deleted for {key}")
                else:
                    raise KMSResourceCleaneupError(
                        f"Vault: Key deletion failed for {key}")
            log.info("All keys from vault were deleted")