def test_pv_scale_out_create_pvcs_and_respin_ceph_pods(
    self,
    fioscale,
    resource_to_delete,
):
    """
    Test case to scale PVC+POD with multi projects and reach expected PVC count
    """
    # Get info from SCALE_DATA_FILE for validation
    if os.path.exists(SCALE_DATA_FILE):
        file_data = templating.load_yaml(SCALE_DATA_FILE)
        namespace = file_data.get("NAMESPACE")
        pod_scale_list = file_data.get("POD_SCALE_LIST")
        pvc_scale_list = file_data.get("PVC_SCALE_LIST")
    else:
        raise FileNotFoundError(f"{SCALE_DATA_FILE} file not found")

    disruption = disruption_helpers.Disruptions()
    disruption.set_resource(resource=resource_to_delete)
    no_of_resource = disruption.resource_count
    for i in range(0, no_of_resource):
        disruption.delete_resource(resource_id=i)

    utils.ceph_health_check()

    # Validate all PVCs from namespace are in Bound state
    assert scale_lib.validate_all_pvcs_and_check_state(
        namespace=namespace, pvc_scale_list=pvc_scale_list
    )

    # Validate all PODs from namespace are up and running
    assert scale_lib.validate_all_pods_and_check_state(
        namespace=namespace, pod_scale_list=pod_scale_list
    )
def test_respin_ceph_pods(self, resource_to_delete):
    """
    Test re-spin of Ceph daemon pods, Operator and CSI pods in a scaled cluster
    """
    # Get info from SCALE_DATA_FILE for validation
    if os.path.exists(SCALE_DATA_FILE):
        file_data = templating.load_yaml(SCALE_DATA_FILE)
        namespace = file_data.get("NAMESPACE")
        pod_scale_list = file_data.get("POD_SCALE_LIST")
        pvc_scale_list = file_data.get("PVC_SCALE_LIST")
    else:
        raise FileNotFoundError(f"{SCALE_DATA_FILE} file not found")

    # Perform disruption test
    disruption = disruption_helpers.Disruptions()
    disruption.set_resource(resource=resource_to_delete)
    no_of_resource = disruption.resource_count
    for i in range(0, no_of_resource):
        disruption.delete_resource(resource_id=i)

    utils.ceph_health_check()

    # Validate all PVCs from namespace are in Bound state
    assert scale_lib.validate_all_pvcs_and_check_state(
        namespace=namespace, pvc_scale_list=pvc_scale_list
    )

    # Validate all PODs from namespace are up and running
    assert scale_lib.validate_all_pods_and_check_state(
        namespace=namespace, pod_scale_list=pod_scale_list
    )

    # Check ceph health status
    utils.ceph_health_check(tries=20)
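# For reference, a minimal sketch of the SCALE_DATA_FILE layout the two tests
# above expect (hypothetical values; only the three keys read above matter):
#
#     NAMESPACE: scale-namespace
#     POD_SCALE_LIST:
#       - scale-pod-1
#       - scale-pod-2
#     PVC_SCALE_LIST:
#       - scale-pvc-1
#       - scale-pvc-2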
def disrupt_plugin_provisioner_pods(self, node_list):
    """
    Set leader plugin-provisioner resources for disruption, skip if
    running on a node from the node_list

    Args:
        node_list (list): list of node names to check

    Returns:
        list: list of Disruption objects

    """
    provisioner_resource = []
    for interface in [constants.CEPHBLOCKPOOL, constants.CEPHFILESYSTEM]:
        provisioner_pod = pod.get_plugin_provisioner_leader(interface=interface)
        node_name = pod.get_pod_node(provisioner_pod).name
        if node_name not in node_list:
            if interface == constants.CEPHBLOCKPOOL:
                provisioner_resource.append("rbdplugin_provisioner")
            else:
                provisioner_resource.append("cephfsplugin_provisioner")

    disruptor = []
    for resource in provisioner_resource:
        disruption = disruption_helpers.Disruptions()
        disruption.set_resource(resource=resource)
        disruptor.append(disruption)

    return disruptor
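# A minimal usage sketch for the helper above (hypothetical node names and
# driver code, not part of the suite): each returned Disruptions object is
# consumed by calling delete_resource(), which deletes the selected leader
# provisioner pod and waits for its replacement to come back up.
#
#     disruptors = self.disrupt_plugin_provisioner_pods(node_list=["worker-0"])
#     for disruption in disruptors:
#         disruption.delete_resource()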
def test_registry_respin_pod(self, pod_name):
    """
    Test the registry workload when backed by OCS while the given Ceph pod
    is respun
    """
    # Respin relevant pod
    log.info(f"Respin Ceph pod {pod_name}")
    disruption = disruption_helpers.Disruptions()
    disruption.set_resource(resource=pod_name)
    disruption.delete_resource()

    # Pull and push images to registries
    log.info("Pull and push images to registries")
    image_pull_and_push(
        project_name=self.project_name,
        template="eap-cd-basic-s2i",
        image="registry.redhat.io/jboss-eap-7-tech-preview/eap-cd-openshift-rhel8:latest",
        pattern="eap-app",
    )

    # Validate image exists in registries path
    validate_image_exists(namespace=self.project_name)

    # Validate image registry pods
    validate_registry_pod_status()

    # Validate cluster health ok and all pods are running
    self.sanity_helpers.health_check()
def test_scale_million_cephfs_files(
    self,
    million_file_cephfs,
    resource_to_delete,
):
    """
    Add a million files to the ceph filesystem.

    Delete each instance of the parametrized ceph pod once the ceph
    cluster is healthy. Make sure the ceph cluster comes back up and
    that rename operations function as expected.

    Args:
        million_file_cephfs (MillionFilesOnCephfs object):
            Tracks cephfs pod, pvcs, and list of files to rename.
        resource_to_delete (str): resource deleted for each testcase

    """
    logging.info(f"Testing respin of {resource_to_delete}")
    disruption = disruption_helpers.Disruptions()
    disruption.set_resource(resource=resource_to_delete)
    disruption.delete_resource()

    ocp_obj = million_file_cephfs.ocp_obj
    for sfile in million_file_cephfs.test_file_list:
        sample = os.sep.join([constants.MOUNT_POINT, "x", sfile])
        newname = str(uuid.uuid4())
        fullnew = os.sep.join([constants.MOUNT_POINT, "x", newname])
        # Rename each file and then restore its original name to prove that
        # metadata operations still work after the respin
        ocp_obj.exec_oc_cmd(
            f"exec {million_file_cephfs.pod_name} -- mv {sample} {fullnew}"
        )
        ocp_obj.exec_oc_cmd(
            f"exec {million_file_cephfs.pod_name} -- mv {fullnew} {sample}"
        )
    logging.info("Tests complete")
def test_scale_endpoint_and_respin_ceph_pods(
    self,
    mcg_job_factory,
    resource_to_delete,
    worker_node,
):
    """
    Generate S3 workload to trigger autoscale to increase the endpoint
    count from 1 to 2, then respin ceph pods
    """
    # Add worker nodes to the cluster
    scale_pgsql.add_worker_node()

    # Check autoscale endpoint count before starting the s3 load
    self._assert_endpoint_count(desired_count=self.MIN_ENDPOINT_COUNT)

    endpoint_cnt = get_endpoint_pod_count(constants.OPENSHIFT_STORAGE_NAMESPACE)
    get_hpa_utilization(constants.OPENSHIFT_STORAGE_NAMESPACE)

    wait_time = 30
    job_list = list()
    while endpoint_cnt < self.MAX_ENDPOINT_COUNT:
        # Keep references to the created job objects so they can be
        # deleted explicitly later
        job_list.append(mcg_job_factory(custom_options=options))
        time.sleep(wait_time)
        endpoint_cnt = get_endpoint_pod_count(constants.OPENSHIFT_STORAGE_NAMESPACE)
        hpa_cpu_utilization = get_hpa_utilization(
            constants.OPENSHIFT_STORAGE_NAMESPACE
        )
        log.info(
            f"HPA CPU utilization by noobaa-endpoint is {hpa_cpu_utilization}%"
        )
        if endpoint_cnt == self.MAX_ENDPOINT_COUNT:
            break

    # Validate autoscale endpoint count
    self._assert_endpoint_count(desired_count=self.MAX_ENDPOINT_COUNT)

    # Respin ceph pods
    disruption = disruption_helpers.Disruptions()
    disruption.set_resource(resource=resource_to_delete)
    number_of_resource = disruption.resource_count
    for i in range(0, number_of_resource):
        disruption.delete_resource(resource_id=i)

    # Delete the jobs created with mcg_job_factory
    for job in job_list:
        job.delete()
        job.ocp.wait_for_delete(resource_name=job.name, timeout=60)

    # Validate autoscale endpoint count
    self._assert_endpoint_count(desired_count=self.MIN_ENDPOINT_COUNT)

    # Check ceph health status
    utils.ceph_health_check()
def respin_ceph_pod(self, resource_to_delete):
    """
    Respin ceph pods of the given type one by one. The delete_resource
    function checks that each deleted pod comes back up and running.

    Args:
        resource_to_delete (str): Ceph resource type to be deleted,
            eg: mgr/mon/osd/mds

    """
    disruption = disruption_helpers.Disruptions()
    disruption.set_resource(resource=resource_to_delete)
    no_of_resource = disruption.resource_count
    for i in range(0, no_of_resource):
        disruption.delete_resource(resource_id=i)
def test_pv_scale_out_create_pvcs_and_respin_ceph_pods(
    self,
    fioscale,
    resource_to_delete,
):
    """
    Test case to scale PVC+POD with multi projects and reach expected PVC count
    """
    disruption = disruption_helpers.Disruptions()
    disruption.set_resource(resource=resource_to_delete)
    no_of_resource = disruption.resource_count
    for i in range(0, no_of_resource):
        disruption.delete_resource(resource_id=i)

    utils.ceph_health_check()
def test_run_pgsql_respin_pod(self, pgsql, transactions, pod_name):
    """
    Test pgsql workload while the given pod is respun
    """
    # Create pgbench benchmark
    pgsql.create_pgbench_benchmark(
        replicas=3, transactions=transactions, clients=3
    )

    # Start measuring time
    start_time = datetime.now()

    # Wait for pgbench pod to reach running state
    pgsql.wait_for_pgbench_status(status=constants.STATUS_RUNNING)

    # Check worker node utilization (adm_top)
    get_node_resource_utilization_from_adm_top(
        node_type="worker", print_table=True
    )

    # Respin relevant pod
    if pod_name == "postgres":
        pgsql.respin_pgsql_app_pod()
    else:
        log.info(f"Respin Ceph pod {pod_name}")
        disruption = disruption_helpers.Disruptions()
        disruption.set_resource(resource=pod_name)
        disruption.delete_resource()

    # Wait for pgbench pod to complete
    pgsql.wait_for_pgbench_status(status=constants.STATUS_COMPLETED)

    # Calculate the time from running state to completed state
    end_time = datetime.now()
    diff_time = end_time - start_time
    log.info(
        f"pgbench pod reached completed state after {diff_time.seconds} seconds"
    )

    # Get pgbench pods
    pgbench_pods = pgsql.get_pgbench_pods()

    # Validate pgbench run and parse logs
    pgsql.validate_pgbench_run(pgbench_pods)

    # Perform cluster and Ceph health checks
    self.sanity_helpers.health_check(tries=40)
def test_respin_osd_pods_to_verify_logging(
    self, create_pvc_and_deploymentconfig_pod
):
    """
    Creates projects before and after a respin of an OSD pod and verifies
    project existence in the EFK stack.

    1. Creates a new project with PVC and app pods
    2. Respins an OSD pod
    3. Logs into the EFK stack and checks the health of cluster-logging
    4. Logs into the EFK stack and checks project existence
    5. Checks for the shards of the project in the EFK stack
    6. Creates a new project and checks the existence again
    """
    # Create 1st project and app_pod
    dc_pod_obj, dc_pvc_obj = create_pvc_and_deploymentconfig_pod
    project1 = dc_pvc_obj.project.namespace

    # Delete the OSD pod
    disruption = disruption_helpers.Disruptions()
    disruption.set_resource(resource="osd")
    disruption.delete_resource()

    # Check the health of the cluster-logging
    assert ocp_logging_obj.check_health_of_clusterlogging()

    # Check for the 1st project created in EFK stack before the respin
    self.validate_project_exists(project1)

    # Check the files in the project
    self.check_filecount_in_project(project1)

    # Create another app_pod in a new project
    pod_obj, pvc_obj = create_pvc_and_deploymentconfig_pod
    project2 = pvc_obj.project.namespace

    # Check the 2nd project exists in the EFK stack
    self.validate_project_exists(project2)
    self.check_filecount_in_project(project2)
def test_scale_endpoint_and_respin_ceph_pods(
    self, mcg_job_factory, resource_to_delete
):
    """
    Generate S3 workload to trigger autoscale to increase the endpoint
    count from 1 to 2, then respin ceph pods
    """
    # Add worker nodes to the cluster
    scale_pgsql.add_worker_node()

    # Check autoscale endpoint count before starting the s3 load
    self._assert_endpoint_count(desired_count=1)

    # Create s3 workload using mcg_job_factory, keeping references to the
    # job objects so they can be deleted later
    job_list = [mcg_job_factory(custom_options=options) for _ in range(10)]

    # Validate autoscale endpoint count
    self._assert_endpoint_count(desired_count=2)

    # Respin ceph pods
    disruption = disruption_helpers.Disruptions()
    disruption.set_resource(resource=resource_to_delete)
    number_of_resource = disruption.resource_count
    for i in range(0, number_of_resource):
        disruption.delete_resource(resource_id=i)

    # Delete the jobs created with mcg_job_factory
    for job in job_list:
        job.delete()
        job.ocp.wait_for_delete(resource_name=job.name, timeout=60)

    # Validate autoscale endpoint count
    self._assert_endpoint_count(desired_count=1)

    # Delete the worker nodes added to the cluster
    scale_pgsql.delete_worker_node()

    # Check ceph health status
    utils.ceph_health_check()
def test_run_jenkins_respin_pod(self, jenkins, pod_name, num_projects, num_of_builds):
    """
    Test jenkins workload while the given pod is respun
    """
    # Init number of projects
    jenkins.number_projects = num_projects

    # Create app jenkins
    jenkins.create_app_jenkins()

    # Create jenkins pvc
    jenkins.create_jenkins_pvc()

    # Create jenkins build config
    jenkins.create_jenkins_build_config()

    # Wait for the jenkins deploy pod to reach completed state
    jenkins.wait_for_jenkins_deploy_status(status=STATUS_COMPLETED)

    # Init number of builds per project
    jenkins.number_builds_per_project = num_of_builds

    # Start builds
    jenkins.start_build()

    # Respin pod
    log.info(f"Respin pod {pod_name}")
    disruption = disruption_helpers.Disruptions()
    disruption.set_resource(resource=pod_name)
    disruption.delete_resource()

    # Wait for the builds to reach 'Complete' state
    jenkins.wait_for_build_to_complete()

    # Print table of builds
    jenkins.print_completed_builds_results()

    # Perform cluster and Ceph health checks
    self.sanity_helpers.health_check(tries=40)
def test_registry_respin_pod(self, pod_name):
    """
    Test the registry workload when backed by OCS while the given Ceph pod
    is respun
    """
    # Respin relevant pod
    log.info(f"Respin Ceph pod {pod_name}")
    disruption = disruption_helpers.Disruptions()
    disruption.set_resource(resource=pod_name)
    disruption.delete_resource()

    # Pull and push images to registries
    log.info("Pull and push images to registries")
    image_pull_and_push(project_name=self.project_name)

    # Validate image exists in registries path
    validate_image_exists()

    # Validate image registry pods
    validate_registry_pod_status()

    # Validate cluster health ok and all pods are running
    self.sanity_helpers.health_check(tries=40)
def test_daemon_kill_during_pvc_pod_creation_and_io(
    self, interface, resource_name, setup, multi_pvc_factory, pod_factory
):
    """
    Kill the 'resource_name' daemon while PVC creation, pod creation and
    IO operations are progressing.
    """
    num_of_new_pvcs = 5
    pvc_objs, io_pods, pvc_objs_new_pods, access_modes = setup
    proj_obj = pvc_objs[0].project
    storageclass = pvc_objs[0].storageclass

    pod_functions = {
        "mds": partial(get_mds_pods),
        "mon": partial(get_mon_pods),
        "mgr": partial(get_mgr_pods),
        "osd": partial(get_osd_pods),
        "rbdplugin": partial(get_plugin_pods, interface=interface),
        "cephfsplugin": partial(get_plugin_pods, interface=interface),
        "cephfsplugin_provisioner": partial(get_cephfsplugin_provisioner_pods),
        "rbdplugin_provisioner": partial(get_rbdfsplugin_provisioner_pods),
        "operator": partial(get_operator_pods),
    }

    executor = ThreadPoolExecutor(max_workers=len(io_pods))

    disruption = disruption_helpers.Disruptions()
    disruption.set_resource(resource=resource_name)

    # Get number of pods of type 'resource_name'
    resource_pods_num = len(pod_functions[resource_name]())

    # Do setup for running IO on pods
    log.info("Setting up pods for running IO")
    for pod_obj in io_pods:
        storage_type = "block" if pod_obj.pvc.volume_mode == "Block" else "fs"
        executor.submit(pod_obj.workload_setup, storage_type=storage_type)

    # Wait for setup on pods to complete
    for pod_obj in io_pods:
        log.info(f"Waiting for IO setup to complete on pod {pod_obj.name}")
        for sample in TimeoutSampler(180, 2, getattr, pod_obj, "wl_setup_done"):
            if sample:
                log.info(
                    f"Setup for running IO is completed on pod {pod_obj.name}."
                )
                break
    log.info("Setup for running IO is completed on pods")

    # Set daemon to be killed
    disruption.select_daemon()

    # Start creating new pods
    log.info("Start creating new pods.")
    bulk_pod_create = executor.submit(
        helpers.create_pods, pvc_objs_new_pods, pod_factory, interface, 2
    )

    # Start creation of new PVCs
    log.info("Start creating new PVCs.")
    bulk_pvc_create = executor.submit(
        multi_pvc_factory,
        interface=interface,
        project=proj_obj,
        storageclass=storageclass,
        size=self.pvc_size,
        access_modes=access_modes,
        access_modes_selection="distribute_random",
        status="",
        num_of_pvc=num_of_new_pvcs,
        wait_each=False,
    )

    # Start IO on each pod
    log.info("Start IO on pods")
    for pod_obj in io_pods:
        storage_type = "block" if pod_obj.pvc.volume_mode == "Block" else "fs"
        pod_obj.run_io(
            storage_type=storage_type,
            size="1G",
            runtime=10,
            fio_filename=f"{pod_obj.name}_io_file1",
        )
    log.info("IO started on all pods.")

    # Kill daemon
    disruption.kill_daemon()

    # Getting result of PVC creation as list of PVC objects
    pvc_objs_new = bulk_pvc_create.result()

    # Confirm PVCs are Bound
    for pvc_obj in pvc_objs_new:
        helpers.wait_for_resource_state(
            resource=pvc_obj, state=constants.STATUS_BOUND, timeout=180
        )
        pvc_obj.reload()
    log.info("Verified: New PVCs are Bound.")

    # Getting result of pods creation as list of Pod objects
    pod_objs_new = bulk_pod_create.result()

    # Verify new pods are Running
    for pod_obj in pod_objs_new:
        helpers.wait_for_resource_state(
            resource=pod_obj, state=constants.STATUS_RUNNING
        )
        pod_obj.reload()
    log.info("Verified: All new pods are Running.")

    # Verify IO
    log.info("Fetching IO results from IO pods.")
    for pod_obj in io_pods:
        fio_result = pod_obj.get_fio_results()
        err_count = fio_result.get("jobs")[0].get("error")
        assert (
            err_count == 0
        ), f"FIO error on pod {pod_obj.name}. FIO result: {fio_result}"
        log.info(f"IOPs after FIO on pod {pod_obj.name}:")
        log.info(f"Read: {fio_result.get('jobs')[0].get('read').get('iops')}")
        log.info(f"Write: {fio_result.get('jobs')[0].get('write').get('iops')}")
    log.info("Verified IO result on IO pods.")

    all_pod_objs = io_pods + pod_objs_new

    # Fetch volume details from pods for the purpose of verification
    node_pv_dict = {}
    for pod_obj in all_pod_objs:
        pod_info = pod_obj.get()
        node = pod_info["spec"]["nodeName"]
        pvc = pod_info["spec"]["volumes"][0]["persistentVolumeClaim"]["claimName"]
        for pvc_obj in pvc_objs:
            if pvc_obj.name == pvc:
                pvc_obj.reload()
                pv = pvc_obj.backed_pv
                break
        if node in node_pv_dict:
            node_pv_dict[node].append(pv)
        else:
            node_pv_dict[node] = [pv]

    # Delete pods
    for pod_obj in all_pod_objs:
        pod_obj.delete(wait=False)

    # Verify pods are deleted
    for pod_obj in all_pod_objs:
        pod_obj.ocp.wait_for_delete(resource_name=pod_obj.name)

    # Verify number of 'resource_name' type pods
    final_resource_pods_num = len(pod_functions[resource_name]())
    assert final_resource_pods_num == resource_pods_num, (
        f"Total number of {resource_name} pods is not matching with "
        f"initial value. Total number of pods before daemon kill: "
        f"{resource_pods_num}. Total number of pods present now: "
        f"{final_resource_pods_num}"
    )

    # Verify volumes are unmapped from nodes after deleting the pods
    node_pv_mounted = helpers.verify_pv_mounted_on_node(node_pv_dict)
    for node, pvs in node_pv_mounted.items():
        assert not pvs, (
            f"PVs {pvs} are still present on node {node} after "
            f"deleting the pods."
        )
    log.info(
        "Verified: mount points are removed from nodes after deleting the pods"
    )

    # Set volume mode on PVC objects
    for pvc_obj in pvc_objs_new:
        pvc_info = pvc_obj.get()
        setattr(pvc_obj, "volume_mode", pvc_info["spec"]["volumeMode"])

    # Verify that PVCs are reusable by creating new pods
    all_pvc_objs = pvc_objs + pvc_objs_new
    pod_objs_re = helpers.create_pods(all_pvc_objs, pod_factory, interface, 2)

    # Verify pods are Running
    for pod_obj in pod_objs_re:
        helpers.wait_for_resource_state(
            resource=pod_obj, state=constants.STATUS_RUNNING
        )
        pod_obj.reload()
    log.info("Successfully created new pods using all PVCs.")

    # Run IO on each of the newly created pods
    for pod_obj in pod_objs_re:
        storage_type = "block" if pod_obj.pvc.volume_mode == "Block" else "fs"
        pod_obj.run_io(
            storage_type=storage_type,
            size="1G",
            runtime=10,
            fio_filename=f"{pod_obj.name}_io_file2",
        )

    log.info("Fetching IO results from newly created pods")
    for pod_obj in pod_objs_re:
        fio_result = pod_obj.get_fio_results()
        err_count = fio_result.get("jobs")[0].get("error")
        assert (
            err_count == 0
        ), f"FIO error on pod {pod_obj.name}. FIO result: {fio_result}"
        log.info(f"IOPs after FIO on pod {pod_obj.name}:")
        log.info(f"Read: {fio_result.get('jobs')[0].get('read').get('iops')}")
        log.info(f"Write: {fio_result.get('jobs')[0].get('write').get('iops')}")
    log.info("Verified IO result on newly created pods.")
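# The "wait for workload setup" loops above rely on TimeoutSampler polling an
# attribute until it turns truthy. A standalone sketch of that pattern
# (assumption: TimeoutSampler(timeout, sleep, func, *args) yields func(*args)
# every `sleep` seconds and raises TimeoutExpiredError once `timeout` seconds
# pass, as in ocs_ci.utility.utils):
from ocs_ci.utility.utils import TimeoutSampler


def wait_for_attribute(obj, attr_name, timeout=180, sleep=2):
    """Poll getattr(obj, attr_name) until it is truthy, or let the sampler
    raise TimeoutExpiredError after `timeout` seconds."""
    for sample in TimeoutSampler(timeout, sleep, getattr, obj, attr_name):
        if sample:
            return True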
def test_resource_deletion_during_pvc_expansion(self, resource_to_delete):
    """
    Verify PVC expansion will succeed when rook-ceph, csi pods are re-spun
    during expansion
    """
    pvc_size_expanded = 30
    executor = ThreadPoolExecutor(max_workers=len(self.pvcs))
    disruption_ops = disruption_helpers.Disruptions()

    # Run IO to fill some data
    log.info("Running IO on all pods to fill some data before PVC expansion.")
    for pod_obj in self.pods:
        storage_type = "block" if pod_obj.pvc.volume_mode == "Block" else "fs"
        pod_obj.run_io(
            storage_type=storage_type,
            size="4G",
            io_direction="write",
            runtime=30,
            rate="10M",
            fio_filename=f"{pod_obj.name}_f1",
        )

    log.info("Wait for IO to complete on pods")
    for pod_obj in self.pods:
        fio_result = pod_obj.get_fio_results()
        err_count = fio_result.get("jobs")[0].get("error")
        assert err_count == 0, (
            f"IO error on pod {pod_obj.name}. FIO result: {fio_result}"
        )
        log.info(f"Verified IO on pod {pod_obj.name}.")
    log.info("IO is successful on all pods before PVC expansion.")

    # Select the pod to be deleted
    disruption_ops.set_resource(resource=resource_to_delete)

    log.info("Expanding all PVCs.")
    for pvc_obj in self.pvcs:
        log.info(f"Expanding size of PVC {pvc_obj.name} to {pvc_size_expanded}G")
        pvc_obj.expand_proc = executor.submit(
            pvc_obj.resize_pvc, pvc_size_expanded, True
        )

    # Delete the pod 'resource_to_delete'
    disruption_ops.delete_resource()

    # Verify pvc expand status
    for pvc_obj in self.pvcs:
        assert pvc_obj.expand_proc.result(), (
            f"Expansion failed for PVC {pvc_obj.name}"
        )
    log.info("PVC expansion was successful on all PVCs")

    # Run IO to fill more data
    log.info("Write more data after PVC expansion.")
    for pod_obj in self.pods:
        storage_type = "block" if pod_obj.pvc.volume_mode == "Block" else "fs"
        pod_obj.run_io(
            storage_type=storage_type,
            size="10G",
            io_direction="write",
            runtime=30,
            rate="10M",
            fio_filename=f"{pod_obj.name}_f2",
        )

    log.info("Wait for IO to complete on all pods")
    for pod_obj in self.pods:
        fio_result = pod_obj.get_fio_results()
        err_count = fio_result.get("jobs")[0].get("error")
        assert err_count == 0, (
            f"IO error on pod {pod_obj.name}. FIO result: {fio_result}"
        )
        log.info(f"Verified IO on pod {pod_obj.name}.")
    log.info("IO is successful on all pods after PVC expansion.")
def test_resource_deletion_during_pvc_expansion(self, resource_to_delete):
    """
    Verify PVC expansion will succeed when rook-ceph, csi pods are re-spun
    during expansion
    """
    pvc_size_expanded = 30
    executor = ThreadPoolExecutor(max_workers=len(self.pvcs))
    disruption_ops = disruption_helpers.Disruptions()

    # Run IO to fill some data
    log.info("Running IO on all pods to fill some data before PVC expansion.")
    for pod_obj in self.pods:
        storage_type = "block" if pod_obj.pvc.volume_mode == "Block" else "fs"
        pod_obj.run_io(
            storage_type=storage_type,
            size="4G",
            io_direction="write",
            runtime=30,
            rate="10M",
            fio_filename=f"{pod_obj.name}_f1",
        )

    log.info("Wait for IO to complete on pods")
    for pod_obj in self.pods:
        fio_result = pod_obj.get_fio_results()
        err_count = fio_result.get("jobs")[0].get("error")
        assert err_count == 0, (
            f"IO error on pod {pod_obj.name}. FIO result: {fio_result}"
        )
        log.info(f"Verified IO on pod {pod_obj.name}.")
    log.info("IO is successful on all pods before PVC expansion.")

    if self.provider_index is not None:
        # Switch to provider cluster context to get ceph pods
        config.switch_to_provider()

    # Select the pod to be deleted
    disruption_ops.set_resource(resource=resource_to_delete)

    if self.provider_index is not None:
        config.switch_ctx(self.consumer_index)

    log.info("Expanding all PVCs.")
    for pvc_obj in self.pvcs:
        log.info(f"Expanding size of PVC {pvc_obj.name} to {pvc_size_expanded}G")
        pvc_obj.expand_proc = executor.submit(
            pvc_obj.resize_pvc, pvc_size_expanded, True
        )

    # Delete the pod 'resource_to_delete'
    disruption_ops.delete_resource()

    # Verify pvc expand status
    for pvc_obj in self.pvcs:
        assert pvc_obj.expand_proc.result(), (
            f"Expansion failed for PVC {pvc_obj.name}"
        )
    log.info("PVC expansion was successful on all PVCs")

    log.info("Verifying new size on pods.")
    for pod_obj in self.pods:
        if pod_obj.pvc.volume_mode == "Block":
            log.info(
                f"Skipping check on pod {pod_obj.name} as volume mode is Block."
            )
            continue

        # Wait up to 240 seconds for the change to be reflected on the pod
        log.info(f"Checking pod {pod_obj.name} to verify the change.")
        for df_out in TimeoutSampler(
            240, 3, pod_obj.exec_cmd_on_pod, command="df -kh"
        ):
            if not df_out:
                continue
            df_out = df_out.split()
            # In the whitespace-split `df -kh` output, the size column sits
            # 4 fields before the mount point
            new_size_mount = df_out[df_out.index(pod_obj.get_storage_path()) - 4]
            if new_size_mount in [
                f"{pvc_size_expanded - 0.1}G",
                f"{float(pvc_size_expanded)}G",
                f"{pvc_size_expanded}G",
            ]:
                log.info(
                    f"Verified: Expanded size of PVC {pod_obj.pvc.name} "
                    f"is reflected on pod {pod_obj.name}"
                )
                break
            log.info(
                f"Expanded size of PVC {pod_obj.pvc.name} is not reflected"
                f" on pod {pod_obj.name}. New size on mount is not "
                f"{pvc_size_expanded}G as expected, but {new_size_mount}. "
                f"Checking again."
            )
    log.info(
        f"Verified: Modified size {pvc_size_expanded}G is reflected on all pods."
    )

    # Run IO to fill more data
    log.info("Write more data after PVC expansion.")
    for pod_obj in self.pods:
        storage_type = "block" if pod_obj.pvc.volume_mode == "Block" else "fs"
        pod_obj.run_io(
            storage_type=storage_type,
            size="10G",
            io_direction="write",
            runtime=30,
            rate="10M",
            fio_filename=f"{pod_obj.name}_f2",
            end_fsync=1,
        )

    log.info("Wait for IO to complete on all pods")
    for pod_obj in self.pods:
        fio_result = pod_obj.get_fio_results()
        err_count = fio_result.get("jobs")[0].get("error")
        assert err_count == 0, (
            f"IO error on pod {pod_obj.name}. FIO result: {fio_result}"
        )
        log.info(f"Verified IO on pod {pod_obj.name}.")
    log.info("IO is successful on all pods after PVC expansion.")
def test_disruptive_during_pod_pvc_deletion_and_io(
    self, interface, resource_to_delete, setup_base
):
    """
    Delete a ceph/rook pod while PVC deletion, pod deletion and IO are
    progressing
    """
    pvc_objs, pod_objs, rwx_pod_objs = setup_base
    namespace = pvc_objs[0].project.namespace

    num_of_pods_to_delete = 3
    num_of_io_pods = 1

    # Select pods to be deleted
    pods_to_delete = pod_objs[:num_of_pods_to_delete]
    pods_to_delete.extend(
        [
            pod
            for pod in rwx_pod_objs
            for pod_obj in pods_to_delete
            if (pod_obj.pvc == pod.pvc)
        ]
    )

    # Select pods to run IO
    io_pods = pod_objs[
        num_of_pods_to_delete : num_of_pods_to_delete + num_of_io_pods
    ]
    io_pods.extend(
        [
            pod
            for pod in rwx_pod_objs
            for pod_obj in io_pods
            if (pod_obj.pvc == pod.pvc)
        ]
    )

    # Select pods which have PVCs to delete
    pods_for_pvc = pod_objs[num_of_pods_to_delete + num_of_io_pods :]
    pvcs_to_delete = [pod_obj.pvc for pod_obj in pods_for_pvc]
    pods_for_pvc.extend(
        [
            pod
            for pod in rwx_pod_objs
            for pod_obj in pods_for_pvc
            if (pod_obj.pvc == pod.pvc)
        ]
    )

    log.info(
        f"{len(pods_to_delete)} pods selected for deletion in which "
        f"{len(pods_to_delete) - num_of_pods_to_delete} pairs of pod "
        f"share same RWX PVC"
    )
    log.info(
        f"{len(io_pods)} pods selected for running IO in which "
        f"{len(io_pods) - num_of_io_pods} pairs of pod share same "
        f"RWX PVC"
    )
    no_of_rwx_pvcs_delete = len(pods_for_pvc) - len(pvcs_to_delete)
    log.info(
        f"{len(pvcs_to_delete)} PVCs selected for deletion. "
        f"RWO PVCs: {len(pvcs_to_delete) - no_of_rwx_pvcs_delete}, "
        f"RWX PVCs: {no_of_rwx_pvcs_delete}"
    )

    pod_functions = {
        "mds": partial(get_mds_pods),
        "mon": partial(get_mon_pods),
        "mgr": partial(get_mgr_pods),
        "osd": partial(get_osd_pods),
        "rbdplugin": partial(get_plugin_pods, interface=interface),
        "cephfsplugin": partial(get_plugin_pods, interface=interface),
        "cephfsplugin_provisioner": partial(get_cephfsplugin_provisioner_pods),
        "rbdplugin_provisioner": partial(get_rbdfsplugin_provisioner_pods),
        "operator": partial(get_operator_pods),
    }

    disruption = disruption_helpers.Disruptions()
    disruption.set_resource(resource=resource_to_delete)
    executor = ThreadPoolExecutor(max_workers=len(pod_objs) + len(rwx_pod_objs))

    # Get number of pods of type 'resource_to_delete'
    num_of_resource_to_delete = len(pod_functions[resource_to_delete]())

    # Fetch the number of Pods and PVCs
    initial_num_of_pods = len(get_all_pods(namespace=namespace))
    initial_num_of_pvc = len(get_all_pvcs(namespace=namespace)["items"])

    # Fetch PV names to verify after deletion
    pv_objs = []
    for pvc_obj in pvcs_to_delete:
        pvc_obj.reload()
        pv_objs.append(pvc_obj.backed_pv_obj)

    # Fetch volume details from pods for the purpose of verification
    node_pv_dict = {}
    for pod_obj in pods_to_delete:
        pod_info = pod_obj.get()
        node = pod_info["spec"]["nodeName"]
        pvc = pod_info["spec"]["volumes"][0]["persistentVolumeClaim"]["claimName"]
        for pvc_obj in pvc_objs:
            if pvc_obj.name == pvc:
                pvc_obj.reload()
                pv = pvc_obj.backed_pv
                break
        if node in node_pv_dict:
            node_pv_dict[node].append(pv)
        else:
            node_pv_dict[node] = [pv]

    # Fetch image uuid associated with PVCs to be deleted
    pvc_uuid_map = {}
    for pvc_obj in pvcs_to_delete:
        pvc_uuid_map[pvc_obj.name] = pvc_obj.image_uuid
    log.info("Fetched image uuid associated with each PVC")

    # Do setup on pods for running IO
    log.info("Setting up pods for running IO.")
    for pod_obj in pod_objs + rwx_pod_objs:
        pvc_info = pod_obj.pvc.get()
        if pvc_info["spec"]["volumeMode"] == "Block":
            storage_type = "block"
        else:
            storage_type = "fs"
        executor.submit(pod_obj.workload_setup, storage_type=storage_type)

    # Wait for setup on pods to complete
    for pod_obj in pod_objs + rwx_pod_objs:
        log.info(f"Waiting for IO setup to complete on pod {pod_obj.name}")
        for sample in TimeoutSampler(180, 2, getattr, pod_obj, "wl_setup_done"):
            if sample:
                log.info(
                    f"Setup for running IO is completed on pod {pod_obj.name}."
                )
                break
    log.info("Setup for running IO is completed on all pods.")

    # Start IO on pods having PVCs to delete to load data
    log.info("Starting IO on pods having PVCs to delete.")
    self.run_io_on_pods(pods_for_pvc)
    log.info("IO started on pods having PVCs to delete.")

    log.info("Fetching IO results from the pods having PVCs to delete.")
    for pod_obj in pods_for_pvc:
        get_fio_rw_iops(pod_obj)
    log.info("Verified IO result on pods having PVCs to delete.")

    # Delete the pods which have PVCs to delete
    delete_pods(pods_for_pvc)
    for pod_obj in pods_for_pvc:
        pod_obj.ocp.wait_for_delete(pod_obj.name)
    log.info("Verified: Deleted pods which are having PVCs to delete.")

    # Start IO on pods to be deleted
    log.info("Starting IO on pods to be deleted.")
    self.run_io_on_pods(pods_to_delete)
    log.info("IO started on pods to be deleted.")

    # Start deleting PVCs
    pvc_bulk_delete = executor.submit(delete_pvcs, pvcs_to_delete)
    log.info("Started deleting PVCs")

    # Start deleting pods
    pod_bulk_delete = executor.submit(delete_pods, pods_to_delete, wait=False)
    log.info("Started deleting pods")

    # Start IO on IO pods
    self.run_io_on_pods(io_pods)
    log.info("Started IO on IO pods")

    # Verify pvc deletion has started
    pvc_deleting = executor.submit(
        wait_for_resource_count_change,
        func_to_use=get_all_pvcs,
        previous_num=initial_num_of_pvc,
        namespace=namespace,
        change_type="decrease",
        min_difference=1,
        timeout=30,
        interval=0.01,
    )

    # Verify pod deletion has started
    pod_deleting = executor.submit(
        wait_for_resource_count_change,
        func_to_use=get_all_pods,
        previous_num=initial_num_of_pods,
        namespace=namespace,
        change_type="decrease",
        min_difference=1,
        timeout=30,
        interval=0.01,
    )

    assert pvc_deleting.result(), "Wait timeout: PVCs are not being deleted."
    log.info("PVCs deletion has started.")

    assert pod_deleting.result(), "Wait timeout: Pods are not being deleted."
    log.info("Pods deletion has started.")

    # Delete pod of type 'resource_to_delete'
    disruption.delete_resource()

    pod_bulk_delete.result()

    # Verify pods are deleted
    for pod_obj in pods_to_delete:
        pod_obj.ocp.wait_for_delete(pod_obj.name, 300)
    log.info("Verified: Pods are deleted.")

    # Verify that the mount points are removed from nodes after deleting the pods
    node_pv_mounted = verify_pv_mounted_on_node(node_pv_dict)
    for node, pvs in node_pv_mounted.items():
        assert not pvs, (
            f"PVs {pvs} are still present on node {node} after "
            f"deleting the pods."
        )
    log.info(
        "Verified: mount points are removed from nodes after deleting the pods"
    )

    pvcs_deleted = pvc_bulk_delete.result()
    assert pvcs_deleted, "Deletion of PVCs failed."

    # Verify PVCs are deleted
    for pvc_obj in pvcs_to_delete:
        pvc_obj.ocp.wait_for_delete(pvc_obj.name)
    log.info("Verified: PVCs are deleted.")

    # Verify PVs are deleted
    for pv_obj in pv_objs:
        pv_obj.ocp.wait_for_delete(resource_name=pv_obj.name, timeout=300)
    log.info("Verified: PVs are deleted.")

    # Verify PV using ceph toolbox. Image/Subvolume should be deleted.
    pool_name = default_ceph_block_pool()
    for pvc_name, uuid in pvc_uuid_map.items():
        if interface == constants.CEPHBLOCKPOOL:
            ret = verify_volume_deleted_in_backend(
                interface=interface, image_uuid=uuid, pool_name=pool_name
            )
        if interface == constants.CEPHFILESYSTEM:
            ret = verify_volume_deleted_in_backend(
                interface=interface, image_uuid=uuid
            )
        assert ret, (
            f"Volume associated with PVC {pvc_name} still exists in backend"
        )

    log.info("Fetching IO results from the pods.")
    for pod_obj in io_pods:
        fio_result = pod_obj.get_fio_results()
        err_count = fio_result.get("jobs")[0].get("error")
        assert (
            err_count == 0
        ), f"FIO error on pod {pod_obj.name}. FIO result: {fio_result}"
    log.info("Verified IO result on pods.")

    # Verify number of pods of type 'resource_to_delete'
    final_num_resource_to_delete = len(pod_functions[resource_to_delete]())
    assert final_num_resource_to_delete == num_of_resource_to_delete, (
        f"Total number of {resource_to_delete} pods is not matching with "
        f"initial value. Total number of pods before deleting a pod: "
        f"{num_of_resource_to_delete}. Total number of pods present now: "
        f"{final_num_resource_to_delete}"
    )

    # Check ceph status
    ceph_health_check(namespace=config.ENV_DATA["cluster_namespace"])
    log.info("Ceph cluster health is OK")
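# Sketch of the "confirm deletion has started before disrupting" pattern used
# above (illustrative wrapper; wait_for_resource_count_change and its keyword
# arguments are the ones already imported and used by this test):
def deletion_started(func_to_use, previous_num, namespace):
    """Return True once the resource count reported by func_to_use drops
    below previous_num, polling for up to 30 seconds."""
    return wait_for_resource_count_change(
        func_to_use=func_to_use,
        previous_num=previous_num,
        namespace=namespace,
        change_type="decrease",
        min_difference=1,
        timeout=30,
        interval=0.01,
    )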
def disruptive_base(self, interface, operation_to_disrupt, resource_to_delete):
    """
    Base function for disruptive tests. Deletion of 'resource_to_delete'
    will be introduced while 'operation_to_disrupt' is progressing.
    """
    pod_functions = {
        "mds": partial(get_mds_pods),
        "mon": partial(get_mon_pods),
        "mgr": partial(get_mgr_pods),
        "osd": partial(get_osd_pods),
        "rbdplugin": partial(get_plugin_pods, interface=interface),
        "cephfsplugin": partial(get_plugin_pods, interface=interface),
        "cephfsplugin_provisioner": partial(get_cephfsplugin_provisioner_pods),
        "rbdplugin_provisioner": partial(get_rbdfsplugin_provisioner_pods),
        "operator": partial(get_operator_pods),
    }

    disruption = disruption_helpers.Disruptions()
    disruption.set_resource(resource=resource_to_delete)
    executor = ThreadPoolExecutor(max_workers=1)

    # Get number of pods of type 'resource_to_delete'
    num_of_resource_to_delete = len(pod_functions[resource_to_delete]())

    # Fetch the number of Pods and PVCs
    initial_num_of_pods = len(get_all_pods(namespace=self.namespace))
    initial_num_of_pvc = len(get_all_pvcs(namespace=self.namespace)["items"])

    # Fetch PV names
    pv_objs = []
    for pvc_obj in self.pvc_objs:
        pvc_obj.reload()
        pv_objs.append(pvc_obj.backed_pv_obj)

    # Fetch volume details from pods for the purpose of verification
    node_pv_dict = {}
    for pod_obj in self.pod_objs:
        pod_info = pod_obj.get()
        node = pod_info["spec"]["nodeName"]
        pvc = pod_info["spec"]["volumes"][0]["persistentVolumeClaim"]["claimName"]
        for pvc_obj in self.pvc_objs:
            if pvc_obj.name == pvc:
                pvc_obj.reload()
                pv = pvc_obj.backed_pv
                break
        if node in node_pv_dict:
            node_pv_dict[node].append(pv)
        else:
            node_pv_dict[node] = [pv]

    # Do setup for running IO on pods
    log.info("Setting up pods for running IO")
    for pod_obj in self.pod_objs:
        pvc_info = pod_obj.pvc.get()
        if pvc_info["spec"]["volumeMode"] == "Block":
            pod_obj.pvc.storage_type = "block"
        else:
            pod_obj.pvc.storage_type = "fs"
        pod_obj.workload_setup(storage_type=pod_obj.pvc.storage_type)
    log.info("Setup for running IO is completed on pods")

    # Start IO on each pod. An RWX PVC is used by two pods, so split the
    # size accordingly
    log.info("Starting IO on pods")
    for pod_obj in self.pod_objs:
        if pod_obj.pvc.access_mode == constants.ACCESS_MODE_RWX:
            io_size = int((self.pvc_size - 1) / 2)
        else:
            io_size = self.pvc_size - 1
        pod_obj.run_io(
            storage_type=pod_obj.pvc.storage_type,
            size=f"{io_size}G",
            fio_filename=f"{pod_obj.name}_io",
            end_fsync=1,
        )
    log.info("IO started on all pods.")

    # Start deleting pods
    pod_bulk_delete = executor.submit(delete_pods, self.pod_objs, wait=False)

    if operation_to_disrupt == "delete_pods":
        ret = wait_for_resource_count_change(
            get_all_pods,
            initial_num_of_pods,
            self.namespace,
            "decrease",
            timeout=50,
        )
        assert ret, "Wait timeout: Pods are not being deleted."
        log.info("Pods deletion has started.")
        disruption.delete_resource()

    pod_bulk_delete.result()

    # Verify pods are deleted
    for pod_obj in self.pod_objs:
        assert pod_obj.ocp.wait_for_delete(
            pod_obj.name, 180
        ), f"Pod {pod_obj.name} is not deleted"
    log.info("Verified: Pods are deleted.")

    # Verify that the mount points are removed from nodes after deleting the pods
    for node, pvs in node_pv_dict.items():
        cmd = f"oc debug nodes/{node} -- df"
        df_on_node = run_cmd(cmd)
        for pv in pvs:
            assert pv not in df_on_node, (
                f"{pv} is still present on node {node} after deleting the pods."
            )
    log.info(
        "Verified: mount points are removed from nodes after deleting the pods"
    )

    # Fetch image uuid associated with PVCs
    pvc_uuid_map = {}
    for pvc_obj in self.pvc_objs:
        pvc_uuid_map[pvc_obj.name] = pvc_obj.image_uuid
    log.info("Fetched image uuid associated with each PVC")

    # Start deleting PVCs
    pvc_bulk_delete = executor.submit(delete_pvcs, self.pvc_objs)

    if operation_to_disrupt == "delete_pvcs":
        ret = wait_for_resource_count_change(
            get_all_pvcs, initial_num_of_pvc, self.namespace, "decrease", timeout=50
        )
        assert ret, "Wait timeout: PVCs are not being deleted."
        log.info("PVCs deletion has started.")
        disruption.delete_resource()

    pvcs_deleted = pvc_bulk_delete.result()
    assert pvcs_deleted, "Deletion of PVCs failed."

    # Verify PVCs are deleted
    for pvc_obj in self.pvc_objs:
        assert pvc_obj.ocp.wait_for_delete(
            pvc_obj.name
        ), f"PVC {pvc_obj.name} is not deleted"
    log.info("Verified: PVCs are deleted.")

    # Verify PVs are deleted
    for pv_obj in pv_objs:
        assert pv_obj.ocp.wait_for_delete(
            pv_obj.name, 120
        ), f"PV {pv_obj.name} is not deleted"
    log.info("Verified: PVs are deleted.")

    # Verify PV using ceph toolbox. Image/Subvolume should be deleted.
    pool_name = default_ceph_block_pool()
    for pvc_name, uuid in pvc_uuid_map.items():
        if interface == constants.CEPHBLOCKPOOL:
            ret = verify_volume_deleted_in_backend(
                interface=interface, image_uuid=uuid, pool_name=pool_name
            )
        if interface == constants.CEPHFILESYSTEM:
            ret = verify_volume_deleted_in_backend(
                interface=interface, image_uuid=uuid
            )
        assert ret, (
            f"Volume associated with PVC {pvc_name} still exists in backend"
        )

    # Verify number of pods of type 'resource_to_delete'
    final_num_resource_to_delete = len(pod_functions[resource_to_delete]())
    assert final_num_resource_to_delete == num_of_resource_to_delete, (
        f"Total number of {resource_to_delete} pods is not matching with "
        f"initial value. Total number of pods before deleting a pod: "
        f"{num_of_resource_to_delete}. Total number of pods present now: "
        f"{final_num_resource_to_delete}"
    )

    # Check ceph status
    ceph_health_check(namespace=config.ENV_DATA["cluster_namespace"])
    log.info("Ceph cluster health is OK")
def test_ceph_daemon_kill_during_resource_creation(
    self,
    interface,
    operation_to_disrupt,
    resource_to_delete,
    multi_pvc_factory,
    pod_factory,
):
    """
    Base function for ceph daemon kill disruptive tests.
    Killing the 'resource_to_delete' daemon will be introduced while
    'operation_to_disrupt' is progressing.
    """
    disruption = disruption_helpers.Disruptions()
    pod_functions = {
        "mds": partial(pod.get_mds_pods),
        "mon": partial(pod.get_mon_pods),
        "mgr": partial(pod.get_mgr_pods),
        "osd": partial(pod.get_osd_pods),
        "rbdplugin": partial(pod.get_plugin_pods, interface=interface),
        "cephfsplugin": partial(pod.get_plugin_pods, interface=interface),
        "cephfsplugin_provisioner": partial(pod.get_cephfsplugin_provisioner_pods),
        "rbdplugin_provisioner": partial(pod.get_rbdfsplugin_provisioner_pods),
        "operator": partial(pod.get_operator_pods),
    }

    # Get number of pods of type 'resource_to_delete'
    num_of_resource_to_delete = len(pod_functions[resource_to_delete]())

    num_of_pvc = 12
    namespace = self.proj_obj.namespace

    # Fetch the number of Pods and PVCs
    initial_num_of_pods = len(pod.get_all_pods(namespace=namespace))
    initial_num_of_pvc = len(get_all_pvcs(namespace=namespace)["items"])

    executor = ThreadPoolExecutor(max_workers=(2 * num_of_pvc))

    disruption.set_resource(resource=resource_to_delete)
    disruption.select_daemon()

    access_modes = [constants.ACCESS_MODE_RWO]
    if interface == constants.CEPHFILESYSTEM:
        access_modes.append(constants.ACCESS_MODE_RWX)

    # Modify access_modes list to create rbd `block` type volume with
    # RWX access mode. RWX is not supported in non-block type rbd
    if interface == constants.CEPHBLOCKPOOL:
        access_modes.extend(
            [
                f"{constants.ACCESS_MODE_RWO}-Block",
                f"{constants.ACCESS_MODE_RWX}-Block",
            ]
        )

    # Start creation of PVCs
    bulk_pvc_create = executor.submit(
        multi_pvc_factory,
        interface=interface,
        project=self.proj_obj,
        size=8,
        access_modes=access_modes,
        access_modes_selection="distribute_random",
        status=constants.STATUS_BOUND,
        num_of_pvc=num_of_pvc,
        wait_each=False,
        timeout=90,
    )

    if operation_to_disrupt == "create_pvc":
        # Ensure PVCs are being created before killing the daemon
        ret = helpers.wait_for_resource_count_change(
            get_all_pvcs, initial_num_of_pvc, namespace, "increase"
        )
        assert ret, "Wait timeout: PVCs are not being created."
        log.info("PVCs creation has started.")
        disruption.kill_daemon()

    pvc_objs = bulk_pvc_create.result()

    # Confirm that PVCs are Bound
    for pvc_obj in pvc_objs:
        helpers.wait_for_resource_state(
            resource=pvc_obj, state=constants.STATUS_BOUND, timeout=120
        )
        pvc_obj.reload()
    log.info("Verified: PVCs are Bound.")

    # Start creating pods
    bulk_pod_create = executor.submit(
        helpers.create_pods, pvc_objs, pod_factory, interface, 2
    )

    if operation_to_disrupt == "create_pod":
        # Ensure that pods are being created before killing the daemon
        ret = helpers.wait_for_resource_count_change(
            pod.get_all_pods, initial_num_of_pods, namespace, "increase"
        )
        assert ret, "Wait timeout: Pods are not being created."
        log.info("Pods creation has started.")
        disruption.kill_daemon()

    pod_objs = bulk_pod_create.result()

    # Verify pods are Running
    for pod_obj in pod_objs:
        helpers.wait_for_resource_state(
            resource=pod_obj, state=constants.STATUS_RUNNING, timeout=180
        )
        pod_obj.reload()
    log.info("Verified: All pods are Running.")

    # Do setup on pods for running IO
    log.info("Setting up pods for running IO.")
    for pod_obj in pod_objs:
        pvc_info = pod_obj.pvc.get()
        if pvc_info["spec"]["volumeMode"] == "Block":
            storage_type = "block"
        else:
            storage_type = "fs"
        executor.submit(pod_obj.workload_setup, storage_type=storage_type)

    # Wait for setup on pods to complete
    for pod_obj in pod_objs:
        log.info(f"Waiting for IO setup to complete on pod {pod_obj.name}")
        for sample in TimeoutSampler(180, 2, getattr, pod_obj, "wl_setup_done"):
            if sample:
                log.info(
                    f"Setup for running IO is completed on pod {pod_obj.name}."
                )
                break
    log.info("Setup for running IO is completed on all pods.")

    # Start IO on each pod
    for pod_obj in pod_objs:
        pvc_info = pod_obj.pvc.get()
        if pvc_info["spec"]["volumeMode"] == "Block":
            storage_type = "block"
        else:
            storage_type = "fs"
        pod_obj.run_io(
            storage_type=storage_type,
            size="2G",
            runtime=30,
            fio_filename=f"{pod_obj.name}_io_file1",
        )
    log.info("FIO started on all pods.")

    if operation_to_disrupt == "run_io":
        disruption.kill_daemon()

    log.info("Fetching FIO results.")
    for pod_obj in pod_objs:
        fio_result = pod_obj.get_fio_results()
        err_count = fio_result.get("jobs")[0].get("error")
        assert (
            err_count == 0
        ), f"FIO error on pod {pod_obj.name}. FIO result: {fio_result}"
        log.info(f"FIO is successful on pod {pod_obj.name}")
    log.info("Verified FIO result on pods.")

    # Delete pods
    for pod_obj in pod_objs:
        pod_obj.delete(wait=True)
    for pod_obj in pod_objs:
        pod_obj.ocp.wait_for_delete(pod_obj.name)

    # Verify that PVCs are reusable by creating new pods
    pod_objs = helpers.create_pods(pvc_objs, pod_factory, interface, 2)

    # Verify new pods are Running
    for pod_obj in pod_objs:
        helpers.wait_for_resource_state(
            resource=pod_obj, state=constants.STATUS_RUNNING
        )
        pod_obj.reload()
    log.info("Verified: All new pods are Running.")

    # Run IO on each of the new pods
    for pod_obj in pod_objs:
        pvc_info = pod_obj.pvc.get()
        if pvc_info["spec"]["volumeMode"] == "Block":
            storage_type = "block"
        else:
            storage_type = "fs"
        pod_obj.run_io(
            storage_type=storage_type,
            size="1G",
            runtime=10,
            fio_filename=f"{pod_obj.name}_io_file2",
        )

    log.info("Fetching FIO results from new pods")
    for pod_obj in pod_objs:
        fio_result = pod_obj.get_fio_results()
        err_count = fio_result.get("jobs")[0].get("error")
        assert (
            err_count == 0
        ), f"FIO error on pod {pod_obj.name}. FIO result: {fio_result}"
        log.info(f"FIO is successful on pod {pod_obj.name}")
    log.info("Verified FIO result on new pods.")

    # Verify number of pods of type 'resource_to_delete'
    final_num_resource_to_delete = len(pod_functions[resource_to_delete]())
    assert final_num_resource_to_delete == num_of_resource_to_delete, (
        f"Total number of {resource_to_delete} pods is not matching with "
        f"initial value. Total number of pods before deleting a pod: "
        f"{num_of_resource_to_delete}. Total number of pods present now: "
        f"{final_num_resource_to_delete}"
    )

    # Check ceph status
    ceph_health_check(namespace=config.ENV_DATA["cluster_namespace"])
    log.info("Ceph cluster health is OK")
import logging
from concurrent.futures import ThreadPoolExecutor
from functools import partial

import pytest

from ocs_ci.framework.testlib import ManageTest, tier4, tier4a, ignore_leftover_label
from ocs_ci.framework import config
from ocs_ci.ocs import constants
from ocs_ci.ocs.resources.pvc import get_all_pvcs
from ocs_ci.ocs.resources import pod
from ocs_ci.utility.utils import TimeoutSampler, ceph_health_check
from ocs_ci.helpers import helpers, disruption_helpers

logger = logging.getLogger(__name__)

DISRUPTION_OPS = disruption_helpers.Disruptions()


@tier4
@tier4a
@ignore_leftover_label(constants.drain_canary_pod_label)
@pytest.mark.parametrize(
    argnames=["interface", "operation_to_disrupt", "resource_to_delete"],
    argvalues=[
        pytest.param(
            *[constants.CEPHBLOCKPOOL, "create_pvc", "mgr"],
            marks=pytest.mark.polarion_id("OCS-568"),
        ),
        pytest.param(
            *[constants.CEPHBLOCKPOOL, "create_pod", "mgr"],
            marks=pytest.mark.polarion_id("OCS-569"),
def test_resource_deletion_during_pvc_clone(self, pvc_clone_factory, pod_factory):
    """
    Verify PVC clone will succeed if rook-ceph and csi pods are re-spun
    while creating the clone
    """
    pods_to_delete = [
        "rbdplugin_provisioner",
        "cephfsplugin_provisioner",
        "cephfsplugin",
        "rbdplugin",
        "osd",
        "mgr",
    ]
    executor = ThreadPoolExecutor(max_workers=len(self.pvcs) + len(pods_to_delete))
    disruption_ops = [disruption_helpers.Disruptions() for _ in pods_to_delete]
    file_name = "file_clone"

    # Run IO
    log.info("Running fio on all pods to create a file")
    for pod_obj in self.pods:
        storage_type = (
            "block"
            if (pod_obj.pvc.volume_mode == constants.VOLUME_MODE_BLOCK)
            else "fs"
        )
        pod_obj.run_io(
            storage_type=storage_type,
            size="1G",
            runtime=30,
            fio_filename=file_name,
            end_fsync=1,
        )

    log.info("Wait for IO to complete on pods")
    for pod_obj in self.pods:
        pod_obj.get_fio_results()
        log.info(f"Verified IO on pod {pod_obj.name}")
        # Calculate md5sum
        file_name_pod = (
            file_name
            if (pod_obj.pvc.volume_mode == constants.VOLUME_MODE_FILESYSTEM)
            else pod_obj.get_storage_path(storage_type="block")
        )
        pod_obj.pvc.md5sum = cal_md5sum(
            pod_obj,
            file_name_pod,
            pod_obj.pvc.volume_mode == constants.VOLUME_MODE_BLOCK,
        )
        log.info(f"md5sum obtained from pod {pod_obj.name}")
    log.info("IO is successful on all pods")

    # Select the pods to be deleted
    for disruption, pod_type in zip(disruption_ops, pods_to_delete):
        cluster_index = None
        # 'provider_index' will not be None if the platform is Managed Services
        if self.provider_index is not None:
            if pod_type in ["osd", "mgr"]:
                cluster_index = self.provider_index
                config.switch_to_provider()
            else:
                cluster_index = self.consumer_index
                config.switch_ctx(cluster_index)
        disruption.set_resource(resource=pod_type, cluster_index=cluster_index)

    # Switch cluster context if the platform is MS. 'provider_index' will
    # not be None if the platform is MS.
    if self.provider_index is not None:
        config.switch_ctx(self.consumer_index)

    # Clone PVCs
    log.info("Start creating clone of PVCs")
    for pvc_obj in self.pvcs:
        log.info(f"Creating clone of PVC {pvc_obj.name}")
        pvc_obj.clone_proc = executor.submit(
            pvc_clone_factory,
            pvc_obj=pvc_obj,
            status="",
            access_mode=pvc_obj.get_pvc_access_mode,
            volume_mode=pvc_obj.volume_mode,
        )
    log.info("Started creating clone")

    # Delete the pods 'pods_to_delete'
    log.info(f"Deleting pods {pods_to_delete}")
    for disruption in disruption_ops:
        disruption.delete_proc = executor.submit(disruption.delete_resource)

    # Wait for delete and recovery
    [disruption.delete_proc.result() for disruption in disruption_ops]

    # Get cloned PVCs
    clone_pvc_objs = []
    for pvc_obj in self.pvcs:
        clone_obj = pvc_obj.clone_proc.result()
        clone_pvc_objs.append(clone_obj)
        log.info(f"Created clone {clone_obj.name} of PVC {pvc_obj.name}")
    log.info("Created clone of all PVCs")

    # Confirm that the cloned PVCs are Bound
    log.info("Verifying the cloned PVCs are Bound")
    for pvc_obj in clone_pvc_objs:
        wait_for_resource_state(
            resource=pvc_obj, state=constants.STATUS_BOUND, timeout=300
        )
        pvc_obj.reload()
        pvc_obj.volume_mode = pvc_obj.data["spec"]["volumeMode"]
    log.info("Verified: Cloned PVCs are Bound.")

    clone_pod_objs = []

    # Attach the cloned PVCs to pods
    log.info("Attach the cloned PVCs to pods")
    for pvc_obj in clone_pvc_objs:
        if pvc_obj.volume_mode == constants.VOLUME_MODE_BLOCK:
            pod_dict_path = constants.CSI_RBD_RAW_BLOCK_POD_YAML
        else:
            pod_dict_path = ""
        restore_pod_obj = pod_factory(
            interface=pvc_obj.interface,
            pvc=pvc_obj,
            status="",
            pod_dict_path=pod_dict_path,
            raw_block_pv=pvc_obj.volume_mode == constants.VOLUME_MODE_BLOCK,
        )
        clone_pod_objs.append(restore_pod_obj)

    # Verify the new pods are running
    log.info("Verify the new pods are running")
    for pod_obj in clone_pod_objs:
        wait_for_resource_state(pod_obj, constants.STATUS_RUNNING)
    log.info("Verified: New pods are running")

    # Verify md5sum
    log.info("Verify md5sum")
    for pod_obj in clone_pod_objs:
        file_name_pod = (
            file_name
            if (pod_obj.pvc.volume_mode == constants.VOLUME_MODE_FILESYSTEM)
            else pod_obj.get_storage_path(storage_type="block")
        )
        verify_data_integrity(
            pod_obj,
            file_name_pod,
            pod_obj.pvc.parent.md5sum,
            pod_obj.pvc.volume_mode == constants.VOLUME_MODE_BLOCK,
        )
        log.info(
            f"Verified: md5sum of {file_name_pod} on pod {pod_obj.name} "
            f"matches the original md5sum"
        )
    log.info("Data integrity check passed on all pods")

    # Run IO
    log.info("Running IO on new pods")
    for pod_obj in clone_pod_objs:
        storage_type = (
            "block"
            if (pod_obj.pvc.volume_mode == constants.VOLUME_MODE_BLOCK)
            else "fs"
        )
        pod_obj.run_io(
            storage_type=storage_type,
            size="1G",
            runtime=20,
            fio_filename=file_name,
            end_fsync=1,
        )

    log.info("Wait for IO to complete on new pods")
    for pod_obj in clone_pod_objs:
        pod_obj.get_fio_results()
        log.info(f"Verified IO on new pod {pod_obj.name}")
    log.info("IO completed on new pods")
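# The clone test above follows a submit-then-disrupt shape: kick off the slow
# operations on an executor, delete the Ceph/CSI pods while they run, then
# gather the futures. A stripped-down sketch of that shape (hypothetical
# `operations` and `disruptions` arguments; only ThreadPoolExecutor and the
# Disruptions.delete_resource call used above are assumed):
from concurrent.futures import ThreadPoolExecutor


def run_with_disruptions(operations, disruptions):
    """Run each zero-argument callable in `operations` on worker threads,
    delete the selected pods while they execute, and return the results."""
    with ThreadPoolExecutor(
        max_workers=len(operations) + len(disruptions)
    ) as executor:
        op_futures = [executor.submit(op) for op in operations]
        delete_futures = [executor.submit(d.delete_resource) for d in disruptions]
        # Wait for every deleted pod to be respun before collecting results
        for future in delete_futures:
            future.result()
        return [future.result() for future in op_futures]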
def test_resource_deletion_during_snapshot_restore(
    self, snapshot_factory, snapshot_restore_factory, pod_factory
):
    """
    Verify PVC snapshot and restore will succeed if rook-ceph and csi pods
    are re-spun while creating the snapshot and while creating the restore
    PVC
    """
    pods_to_delete = [
        "rbdplugin_provisioner",
        "cephfsplugin_provisioner",
        "cephfsplugin",
        "rbdplugin",
        "osd",
        "mgr",
    ]
    executor = ThreadPoolExecutor(max_workers=len(self.pvcs) + len(pods_to_delete))
    disruption_ops = [disruption_helpers.Disruptions() for _ in pods_to_delete]
    file_name = "file_snap"

    # Run IO
    log.info("Running fio on all pods to create a file")
    for pod_obj in self.pods:
        storage_type = (
            "block"
            if (pod_obj.pvc.volume_mode == constants.VOLUME_MODE_BLOCK)
            else "fs"
        )
        pod_obj.run_io(
            storage_type=storage_type,
            size="1G",
            runtime=30,
            fio_filename=file_name,
            end_fsync=1,
        )

    log.info("Wait for IO to complete on pods")
    for pod_obj in self.pods:
        pod_obj.get_fio_results()
        log.info(f"Verified IO on pod {pod_obj.name}")
        # Calculate md5sum
        file_name_pod = (
            file_name
            if (pod_obj.pvc.volume_mode == constants.VOLUME_MODE_FILESYSTEM)
            else pod_obj.get_storage_path(storage_type="block")
        )
        pod_obj.pvc.md5sum = cal_md5sum(
            pod_obj,
            file_name_pod,
            pod_obj.pvc.volume_mode == constants.VOLUME_MODE_BLOCK,
        )
        log.info(f"md5sum obtained from pod {pod_obj.name}")
    log.info("IO is successful on all pods")

    # Select the pods to be deleted
    for disruption, pod_type in zip(disruption_ops, pods_to_delete):
        # Select the snapshotter leader if the pod is a provisioner pod
        disruption.set_resource(
            resource=pod_type,
            leader_type="snapshotter" if "provisioner" in pod_type else "",
        )

    log.info("Start taking snapshot of all PVCs.")
    for pvc_obj in self.pvcs:
        log.info(f"Taking snapshot of PVC {pvc_obj.name}")
        pvc_obj.snap_proc = executor.submit(snapshot_factory, pvc_obj, wait=False)
    log.info("Started taking snapshot of all PVCs.")

    # Delete the pods 'pods_to_delete'
    log.info(f"Deleting pods {pods_to_delete}")
    for disruption in disruption_ops:
        disruption.delete_proc = executor.submit(disruption.delete_resource)

    # Wait for delete and recovery
    [disruption.delete_proc.result() for disruption in disruption_ops]

    # Get snapshots
    snap_objs = []
    for pvc_obj in self.pvcs:
        snap_obj = pvc_obj.snap_proc.result()
        snap_obj.md5sum = pvc_obj.md5sum
        snap_objs.append(snap_obj)

    # Wait for snapshots to be Ready
    log.info("Waiting for all snapshots to be Ready")
    for snap_obj in snap_objs:
        snap_obj.ocp.wait_for_resource(
            condition="true",
            resource_name=snap_obj.name,
            column=constants.STATUS_READYTOUSE,
            timeout=300,
        )
        log.info(f"Snapshot {snap_obj.name} is Ready")
        snap_obj.reload()
    log.info("All snapshots are Ready")

    # Select the pods to be deleted
    for disruption, pod_type in zip(disruption_ops, pods_to_delete):
        disruption.set_resource(resource=pod_type)

    restore_pvc_objs = []

    # Create PVCs out of the snapshots
    log.info("Start creating new PVCs from snapshots")
    for snap_obj in snap_objs:
        log.info(f"Creating a PVC from snapshot {snap_obj.name}")
        snap_obj.restore_proc = executor.submit(
            snapshot_restore_factory,
            snapshot_obj=snap_obj,
            size=f"{self.pvc_size}Gi",
            volume_mode=snap_obj.parent_volume_mode,
            access_mode=snap_obj.parent_access_mode,
            status="",
        )
    log.info("Started creating new PVCs from snapshots")

    # Delete the pods 'pods_to_delete'
    log.info(f"Deleting pods {pods_to_delete}")
    for disruption in disruption_ops:
        disruption.delete_proc = executor.submit(disruption.delete_resource)

    # Wait for delete and recovery
    [disruption.delete_proc.result() for disruption in disruption_ops]

    # Get restored PVCs
    for snap_obj in snap_objs:
        restore_pvc_obj = snap_obj.restore_proc.result()
        restore_pvc_objs.append(restore_pvc_obj)
        log.info(
            f"Created PVC {restore_pvc_obj.name} from snapshot {snap_obj.name}"
        )
    log.info("Created new PVCs from all the snapshots")

    # Confirm that the restored PVCs are Bound
    log.info("Verifying the restored PVCs are Bound")
    for pvc_obj in restore_pvc_objs:
        wait_for_resource_state(
            resource=pvc_obj, state=constants.STATUS_BOUND, timeout=300
        )
        pvc_obj.reload()
        pvc_obj.volume_mode = pvc_obj.data["spec"]["volumeMode"]
    log.info("Verified: Restored PVCs are Bound.")

    restore_pod_objs = []

    # Attach the restored PVCs to pods
    log.info("Attach the restored PVCs to pods")
    for pvc_obj in restore_pvc_objs:
        if pvc_obj.volume_mode == constants.VOLUME_MODE_BLOCK:
            pod_dict_path = constants.CSI_RBD_RAW_BLOCK_POD_YAML
        else:
            pod_dict_path = ""
        restore_pod_obj = pod_factory(
            interface=pvc_obj.interface,
            pvc=pvc_obj,
            status="",
            pod_dict_path=pod_dict_path,
            raw_block_pv=pvc_obj.volume_mode == constants.VOLUME_MODE_BLOCK,
        )
        restore_pod_objs.append(restore_pod_obj)
    log.info("Attached the restored PVCs to pods")

    # Verify the new pods are running
    log.info("Verify the new pods are running")
    for pod_obj in restore_pod_objs:
        wait_for_resource_state(pod_obj, constants.STATUS_RUNNING)
    log.info("Verified: New pods are running")

    # Verify md5sum
    log.info("Verify md5sum")
    for pod_obj in restore_pod_objs:
        file_name_pod = (
            file_name
            if (pod_obj.pvc.volume_mode == constants.VOLUME_MODE_FILESYSTEM)
            else pod_obj.get_storage_path(storage_type="block")
        )
        verify_data_integrity(
            pod_obj,
            file_name_pod,
            pod_obj.pvc.snapshot.md5sum,
            pod_obj.pvc.volume_mode == constants.VOLUME_MODE_BLOCK,
        )
        log.info(
            f"Verified: md5sum of {file_name_pod} on pod {pod_obj.name} "
            f"matches the original md5sum"
        )
    log.info("Data integrity check passed on all pods")

    # Run IO
    log.info("Running IO on new pods")
    for pod_obj in restore_pod_objs:
        storage_type = (
            "block"
            if (pod_obj.pvc.volume_mode == constants.VOLUME_MODE_BLOCK)
            else "fs"
        )
        pod_obj.run_io(
            storage_type=storage_type,
            size="1G",
            runtime=20,
            fio_filename=file_name,
            end_fsync=1,
        )

    log.info("Wait for IO to complete on new pods")
    for pod_obj in restore_pod_objs:
        pod_obj.get_fio_results()
        log.info(f"Verified IO on new pod {pod_obj.name}")
    log.info("IO completed on new pods")
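# Data integrity in the snapshot test above hinges on comparing md5sums taken
# before the disruption with sums computed on the restored volumes. A compact
# sketch of that round trip (assumption: cal_md5sum and verify_data_integrity
# behave as used above, taking the pod, the file path, and a flag marking
# block volume mode):
#
#     md5 = cal_md5sum(source_pod, file_name, False)
#     # ... snapshot, disrupt, restore, attach new pod ...
#     verify_data_integrity(restored_pod, file_name, md5, False)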
def test_pod_disruptions(self, create_pvcs_and_pods): """ Test to perform pod disruption in consumer and provider cluster """ # List of pods to be disrupted. Using different list for consumer and provider for the easy implementation pods_on_consumer = [ "alertmanager_managed_ocs_alertmanager", "ocs_osd_controller_manager", "prometheus_managed_ocs_prometheus", "prometheus_operator", "ocs_operator", ] pods_on_provider = [ "alertmanager_managed_ocs_alertmanager", "ocs_osd_controller_manager", "prometheus_managed_ocs_prometheus", "prometheus_operator", "ocs_provider_server", "ocs_operator", ] disruption_on_consumer = [] disruption_on_provider = [] # Start I/O log.info("Starting fio on all pods") for pod_obj in self.io_pods: if pod_obj.pvc.volume_mode == constants.VOLUME_MODE_BLOCK: storage_type = "block" direct = 1 else: storage_type = "fs" direct = 0 pod_obj.run_io( storage_type=storage_type, size="10G", fio_filename=f"{pod_obj.name}", runtime=320, end_fsync=1, direct=direct, invalidate=0, fio_installed=True, ) consumer_index_iter = cycle(self.consumer_indexes) # Create Disruptions instance for each pod to be disrupted on consumer for pod_type in pods_on_consumer: consumer_index = next(consumer_index_iter) config.switch_ctx(consumer_index) disruption_obj = disruption_helpers.Disruptions() # Select each pod to be disrupted from different consumers disruption_obj.set_resource(resource=pod_type, cluster_index=consumer_index) disruption_obj.index_of_consumer = consumer_index disruption_on_consumer.append(disruption_obj) # Create Disruptions instance for each pod to be disrupted on provider config.switch_to_provider() for pod_type in pods_on_provider: disruption_obj = disruption_helpers.Disruptions() disruption_obj.set_resource( resource=pod_type, cluster_index=self.provider_cluster_index) disruption_on_provider.append(disruption_obj) # Delete pods on consumer one at a time log.info("Starting pod disruptions on consumer clusters") for disruptions_obj in disruption_on_consumer: disruptions_obj.delete_resource() # ocs-operator respin will trigger rook-ceph-tools pod respin. # Patch rook-ceph-tools pod to run ceph commands. 
        if disruptions_obj.resource == "ocs_operator":
            config.switch_ctx(disruptions_obj.index_of_consumer)
            patch_consumer_toolbox()

    # Delete pods on the provider one at a time
    log.info("Starting pod disruptions on provider cluster")
    for disruptions_obj in disruption_on_provider:
        disruptions_obj.delete_resource()

    log.info("Wait for IO to complete on pods")
    for pod_obj in self.io_pods:
        pod_obj.get_fio_results()
        log.info(f"Verified IO on pod {pod_obj.name}")
    log.info("IO is successful on all pods")

    # Perform checks on each of the clusters
    for cluster_index in [self.provider_cluster_index] + self.consumer_indexes:
        config.switch_ctx(cluster_index)

        # Verify managedocs components are Ready
        log.info("Verifying managedocs components state")
        managedocs_obj = OCP(
            kind="managedocs",
            resource_name="managedocs",
            namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
        )
        for component in {"alertmanager", "prometheus", "storageCluster"}:
            assert (
                managedocs_obj.get()["status"]["components"][component]["state"]
                == "Ready"
            ), (
                f"{component} status is "
                f"{managedocs_obj.get()['status']['components'][component]['state']}"
            )

        # Verify storagecluster status
        log.info("Verifying storagecluster status")
        verify_storage_cluster()

        # Verify CSV status
        for managed_csv in {
            constants.OCS_CSV_PREFIX,
            constants.OSD_DEPLOYER,
            constants.OSE_PROMETHEUS_OPERATOR,
        }:
            csvs = csv.get_csvs_start_with_prefix(
                managed_csv, constants.OPENSHIFT_STORAGE_NAMESPACE
            )
            assert (
                len(csvs) == 1
            ), f"Unexpected number of CSVs with {managed_csv} prefix: {len(csvs)}"
            csv_name = csvs[0]["metadata"]["name"]
            csv_obj = csv.CSV(
                resource_name=csv_name,
                namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
            )
            log.info(f"Check if {csv_name} is in Succeeded phase.")
            csv_obj.wait_for_phase(phase="Succeeded", timeout=600)

        # Verify the phase of the ceph cluster
        log.info("Verify the phase of the ceph cluster")
        cephcluster = OCP(
            kind="CephCluster", namespace=constants.OPENSHIFT_STORAGE_NAMESPACE
        )
        cephcluster_yaml = cephcluster.get().get("items")[0]
        expected_phase = "Connected"
        if cluster_index == self.provider_cluster_index:
            expected_phase = "Ready"
        assert cephcluster_yaml["status"]["phase"] == expected_phase, (
            f"Status of cephcluster {cephcluster_yaml['metadata']['name']} is "
            f"{cephcluster_yaml['status']['phase']}"
        )

    # Create PVCs and pods on all consumer clusters
    log.info("Creating new PVCs and pods")
    pods = []
    for cluster_index in self.consumer_indexes:
        config.switch_ctx(cluster_index)
        consumer_cluster_kubeconfig = os.path.join(
            config.clusters[cluster_index].ENV_DATA["cluster_path"],
            config.clusters[cluster_index].RUN.get("kubeconfig_location"),
        )
        pvcs, io_pods = create_pvcs_and_pods(
            pvc_size=self.pvc_size,
            replica_count=1,
            pod_dict_path=constants.PERF_POD_YAML,
        )
        # Set the consumer kubeconfig on each resource so later operations
        # run against the correct cluster regardless of the current context
        for pvc_obj in pvcs:
            pvc_obj.ocp.cluster_kubeconfig = consumer_cluster_kubeconfig
        for io_pod in io_pods:
            io_pod.ocp.cluster_kubeconfig = consumer_cluster_kubeconfig
        pvcs[0].project.cluster_kubeconfig = consumer_cluster_kubeconfig
        pods.extend(io_pods)

    # Run I/O on the new pods
    log.info("Running I/O on new pods")
    for pod_obj in pods:
        if pod_obj.pvc.volume_mode == constants.VOLUME_MODE_BLOCK:
            storage_type = "block"
            direct = 1
        else:
            storage_type = "fs"
            direct = 0
        pod_obj.run_io(
            storage_type=storage_type,
            size="10G",
            fio_filename=f"{pod_obj.name}",
            runtime=320,
            end_fsync=1,
            direct=direct,
            invalidate=0,
            fio_installed=True,
        )

    log.info("Wait for I/O to complete on new pods")
    for pod_obj in pods:
        pod_obj.get_fio_results()
        log.info(f"Verified IO on the new pod {pod_obj.name}")
    log.info("IO is successful on new pods")
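# A minimal sketch of the round-robin selection used above to spread the
# consumer-side disruptions across clusters: itertools.cycle yields the
# consumer indexes repeatedly, so each pod type is paired with the next
# consumer in turn. The helper `_assign_round_robin` is an illustrative
# stand-in, not an ocs-ci function.
from itertools import cycle


def _assign_round_robin(pod_types, consumer_indexes):
    """Pair each pod type with the next consumer index, wrapping around."""
    index_iter = cycle(consumer_indexes)
    return [(pod_type, next(index_iter)) for pod_type in pod_types]


# Example: five pod types over two consumers ->
# [("a", 1), ("b", 2), ("c", 1), ("d", 2), ("e", 1)]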
def test_daemon_kill_during_pvc_pod_creation_deletion_and_io(
    self, setup_base, multi_pvc_factory, pod_factory
):
    """
    Kill Ceph daemons while PVC creation, PVC deletion, pod creation,
    pod deletion and IO are in progress
    """
    daemons_to_kill = [
        "mgr",
        "mon",
        "osd",
        "mds",
    ]

    (
        pvc_objs,
        pod_objs,
        rwx_pod_objs,
        cephfs_pvc_for_pods,
        rbd_pvc_for_pods,
    ) = setup_base

    num_of_pods_to_delete = 3
    num_of_io_pods = 1
    num_pvc_create_during_disruption = len(
        self.access_modes_cephfs + self.access_modes_rbd
    )

    # Select pods to be deleted
    pods_to_delete = pod_objs[:num_of_pods_to_delete]
    pods_to_delete.extend(
        [
            pod
            for pod in rwx_pod_objs
            for pod_obj in pods_to_delete
            if (pod_obj.pvc == pod.pvc)
        ]
    )

    # Select pods to run IO
    io_pods = pod_objs[
        num_of_pods_to_delete : num_of_pods_to_delete + num_of_io_pods
    ]
    io_pods.extend(
        [
            pod
            for pod in rwx_pod_objs
            for pod_obj in io_pods
            if (pod_obj.pvc == pod.pvc)
        ]
    )

    # Select pods whose PVCs will be deleted
    pods_for_pvc = pod_objs[num_of_pods_to_delete + num_of_io_pods :]
    pvcs_to_delete = [pod_obj.pvc for pod_obj in pods_for_pvc]
    pods_for_pvc.extend(
        [
            pod
            for pod in rwx_pod_objs
            for pod_obj in pods_for_pvc
            if (pod_obj.pvc == pod.pvc)
        ]
    )

    io_pods = [
        pod_obj
        for pod_obj in io_pods
        if pod_obj.pvc in select_unique_pvcs([pod_obj.pvc for pod_obj in io_pods])
    ]

    log.info(
        f"{len(pods_to_delete)} pods selected for deletion, of which "
        f"{len(pods_to_delete) - num_of_pods_to_delete} pairs of pods "
        f"share the same RWX PVC"
    )
    log.info(
        f"{len(io_pods)} pods selected for running IO, of which one "
        f"pair of pods shares the same RWX PVC"
    )
    no_of_rwx_pvcs_delete = len(pods_for_pvc) - len(pvcs_to_delete)
    log.info(
        f"{len(pvcs_to_delete)} PVCs selected for deletion. "
        f"RWO PVCs: {len(pvcs_to_delete) - no_of_rwx_pvcs_delete}, "
        f"RWX PVCs: {no_of_rwx_pvcs_delete}"
    )

    pod_functions = {
        "mds": partial(get_mds_pods),
        "mon": partial(get_mon_pods),
        "mgr": partial(get_mgr_pods),
        "osd": partial(get_osd_pods),
    }

    # One Disruptions object for each daemon type
    disruption_ops = [disruption_helpers.Disruptions() for _ in daemons_to_kill]

    # Select the resource of each type for disruption
    for disruption, pod_type in zip(disruption_ops, daemons_to_kill):
        disruption.set_resource(resource=pod_type)

    executor = ThreadPoolExecutor(
        max_workers=len(pod_objs)
        + len(rwx_pod_objs)
        + len(rbd_pvc_for_pods)
        + len(cephfs_pvc_for_pods)
        + len(daemons_to_kill)
        + num_pvc_create_during_disruption
    )

    # Get the initial number of pods of each type given in daemons_to_kill
    num_of_resource_pods = [
        len(pod_functions[resource_name]()) for resource_name in daemons_to_kill
    ]

    # Fetch PV names to verify after deletion
    pv_objs = []
    for pvc_obj in pvcs_to_delete:
        pv_objs.append(pvc_obj.backed_pv_obj)

    # Fetch volume details from pods for the purpose of verification
    node_pv_dict = {}
    for pod_obj in pods_to_delete:
        pod_info = pod_obj.get()
        node = pod_info["spec"]["nodeName"]
        pvc = pod_info["spec"]["volumes"][0]["persistentVolumeClaim"]["claimName"]
        for pvc_obj in pvc_objs:
            if pvc_obj.name == pvc:
                pv = pvc_obj.backed_pv
                break
        if node in node_pv_dict:
            node_pv_dict[node].append(pv)
        else:
            node_pv_dict[node] = [pv]

    # Fetch the image uuid associated with each PVC to be deleted
    pvc_uuid_map = {}
    for pvc_obj in pvcs_to_delete:
        pvc_uuid_map[pvc_obj] = pvc_obj.image_uuid
    log.info("Fetched image uuid associated with each PVC")

    # Do setup on pods for running IO
    log.info("Setting up pods for running IO.")
    for pod_obj in pod_objs + rwx_pod_objs:
        if pod_obj.pvc.get_pvc_vol_mode == "Block":
            storage_type = "block"
        else:
            storage_type = "fs"
        executor.submit(pod_obj.workload_setup, storage_type=storage_type)
    # Wait for setup on pods to complete
    for pod_obj in pod_objs + rwx_pod_objs:
        log.info(f"Waiting for IO setup to complete on pod {pod_obj.name}")
        for sample in TimeoutSampler(360, 2, getattr, pod_obj, "wl_setup_done"):
            if sample:
                log.info(
                    f"Setup for running IO is completed on pod {pod_obj.name}."
                )
                break
    log.info("Setup for running IO is completed on all pods.")

    # Start IO on the pods having PVCs to delete, in order to load data
    pods_for_pvc_io = [
        pod_obj
        for pod_obj in pods_for_pvc
        if pod_obj.pvc
        in select_unique_pvcs([pod_obj.pvc for pod_obj in pods_for_pvc])
    ]
    log.info("Starting IO on pods having PVCs to delete.")
    self.run_io_on_pods(pods_for_pvc_io)
    log.info("IO started on pods having PVCs to delete.")

    log.info("Fetching IO results from the pods having PVCs to delete.")
    for pod_obj in pods_for_pvc_io:
        get_fio_rw_iops(pod_obj)
    log.info("Verified IO result on pods having PVCs to delete.")

    # Delete the pods having PVCs to delete
    assert self.delete_pods(
        pods_for_pvc
    ), "Couldn't delete the pods having PVCs to delete."
    for pod_obj in pods_for_pvc:
        pod_obj.ocp.wait_for_delete(pod_obj.name)
    log.info("Verified: Deleted the pods having PVCs to delete.")

    # Select a daemon of each resource type and identify the daemons
    # running on each node
    nodes_and_pids = {}
    for disruption in disruption_ops:
        disruption.select_daemon()
        node_name = disruption.resource_obj[0].pod_data.get("spec").get("nodeName")
        # Create a node-daemons dict. The value is kept as a string so it can
        # be passed directly to the 'kill' command
        nodes_and_pids[
            node_name
        ] = f"{nodes_and_pids.get(node_name, '')} {disruption.daemon_pid}"

    # Start IO on the pods to be deleted
    pods_to_delete_io = [
        pod_obj
        for pod_obj in pods_to_delete
        if pod_obj.pvc
        in select_unique_pvcs([pod_obj.pvc for pod_obj in pods_to_delete])
    ]
    log.info("Starting IO on selected pods to be deleted.")
    self.run_io_on_pods(pods_to_delete_io)
    log.info("IO started on selected pods to be deleted.")

    # Start creating new pods
    log.info("Start creating new pods.")
    pod_create_rbd = executor.submit(
        helpers.create_pods,
        rbd_pvc_for_pods,
        pod_factory,
        constants.CEPHBLOCKPOOL,
        2,
    )
    pod_create_cephfs = executor.submit(
        helpers.create_pods,
        cephfs_pvc_for_pods,
        pod_factory,
        constants.CEPHFILESYSTEM,
        2,
    )

    # Start creation of new CephFS PVCs
    log.info("Start creating new CephFS PVCs.")
    pvc_create_cephfs = executor.submit(
        multi_pvc_factory,
        interface=constants.CEPHFILESYSTEM,
        project=self.project,
        storageclass=None,
        size=self.pvc_size,
        access_modes=self.access_modes_cephfs,
        access_modes_selection="distribute_random",
        status="",
        num_of_pvc=len(self.access_modes_cephfs),
        wait_each=False,
    )

    # Start creation of new RBD PVCs
    log.info("Start creating new RBD PVCs.")
    pvc_create_rbd = executor.submit(
        multi_pvc_factory,
        interface=constants.CEPHBLOCKPOOL,
        project=self.project,
        storageclass=None,
        size=self.pvc_size,
        access_modes=self.access_modes_rbd,
        access_modes_selection="distribute_random",
        status="",
        num_of_pvc=len(self.access_modes_rbd),
        wait_each=False,
    )

    # Start deleting PVCs
    pvc_bulk_delete = executor.submit(delete_pvcs, pvcs_to_delete)
    log.info("Started deleting PVCs")

    # Start deleting pods
    pod_bulk_delete = executor.submit(self.delete_pods, pods_to_delete)
    log.info("Started deleting pods")

    # Start IO on the IO pods
    self.run_io_on_pods(io_pods)
    log.info("Started IO on IO pods")

    # Wait for 1 second before killing the daemons, to give the
    # create/delete operations time to start
    sleep(1)

    # Kill daemons
    node_and_kill_proc = {}
    log.info(f"Killing daemons of {daemons_to_kill}")
    for node_name, pids in nodes_and_pids.items():
        # Command to kill the daemons
        kill_cmd = f"oc debug node/{node_name} -- chroot /host kill -9 {pids}"
        # Create a node-kill process map for verifying the result
        node_and_kill_proc[node_name] = executor.submit(run_cmd, kill_cmd)

    # Verify the daemon kill processes
    for node_name, daemon_kill_proc in node_and_kill_proc.items():
        # Get the types of daemons killed on this particular node
        resources = [
            disruption.resource
            for disruption in disruption_ops
            if disruption.daemon_pid in nodes_and_pids[node_name]
        ]
        # 'daemon_kill_proc' result will be an empty string if the command succeeds
        cmd_out = daemon_kill_proc.result()
        assert isinstance(cmd_out, str) and (not cmd_out), (
            f"Failed to kill {resources} daemons on the node {node_name}. "
            f"Daemon kill command output - {cmd_out}"
        )

    # Wait for the new daemons to come up
    for disruption in disruption_ops:
        disruption.check_new_pid()
    log.info("Verified daemon kill")

    pods_deleted = pod_bulk_delete.result()
    assert pods_deleted, "Deletion of pods failed."

    # Verify pods are deleted
    for pod_obj in pods_to_delete:
        pod_obj.ocp.wait_for_delete(pod_obj.name, 300)
    log.info("Verified: Pods are deleted.")

    # Verify that the mount points are removed from the nodes after deleting the pods
    node_pv_mounted = verify_pv_mounted_on_node(node_pv_dict)
    for node, pvs in node_pv_mounted.items():
        assert not pvs, (
            f"PVs {pvs} are still present on node {node} after "
            f"deleting the pods."
        )
    log.info(
        "Verified: mount points are removed from nodes after deleting the pods"
    )

    pvcs_deleted = pvc_bulk_delete.result()
    assert pvcs_deleted, "Deletion of PVCs failed."

    # Verify PVCs are deleted
    for pvc_obj in pvcs_to_delete:
        pvc_obj.ocp.wait_for_delete(pvc_obj.name)
    log.info("Verified: PVCs are deleted.")

    # Get the result of PVC creation as a list of PVC objects
    log.info("Getting the result of the CephFS PVC creation process")
    pvc_objs_cephfs_new = pvc_create_cephfs.result()
    log.info("Getting the result of the RBD PVC creation process")
    pvc_objs_rbd_new = pvc_create_rbd.result()

    # Set the interface argument for reference
    for pvc_obj in pvc_objs_cephfs_new:
        pvc_obj.interface = constants.CEPHFILESYSTEM
    for pvc_obj in pvc_objs_rbd_new:
        pvc_obj.interface = constants.CEPHBLOCKPOOL

    # Confirm the new PVCs are Bound
    log.info("Verifying the new CephFS and RBD PVCs are Bound")
    for pvc_obj in pvc_objs_cephfs_new + pvc_objs_rbd_new:
        helpers.wait_for_resource_state(
            resource=pvc_obj, state=constants.STATUS_BOUND, timeout=180
        )
        pvc_obj.reload()
    log.info("Verified: New CephFS and RBD PVCs are Bound.")

    # Get the result of pod creation as a list of Pod objects
    log.info("Getting the result of the pod creation process")
    pod_objs_rbd_new = pod_create_rbd.result()
    pod_objs_cephfs_new = pod_create_cephfs.result()

    # Verify the new pods are Running
    log.info("Verifying the new pods are Running")
    for pod_obj in pod_objs_rbd_new + pod_objs_cephfs_new:
        helpers.wait_for_resource_state(
            resource=pod_obj, state=constants.STATUS_RUNNING, timeout=90
        )
        pod_obj.reload()
    log.info("Verified: All new pods are Running.")

    # Verify PVs are deleted
    for pv_obj in pv_objs:
        pv_obj.ocp.wait_for_delete(resource_name=pv_obj.name, timeout=300)
    log.info("Verified: PVs are deleted.")

    # Verify PVs using the ceph toolbox. The image/subvolume should be deleted.
    pool_name = default_ceph_block_pool()
    for pvc_obj, uuid in pvc_uuid_map.items():
        if pvc_obj.interface == constants.CEPHBLOCKPOOL:
            ret = verify_volume_deleted_in_backend(
                interface=constants.CEPHBLOCKPOOL,
                image_uuid=uuid,
                pool_name=pool_name,
            )
        elif pvc_obj.interface == constants.CEPHFILESYSTEM:
            ret = verify_volume_deleted_in_backend(
                interface=constants.CEPHFILESYSTEM, image_uuid=uuid
            )
        assert (
            ret
        ), f"Volume associated with PVC {pvc_obj.name} still exists in the backend"

    log.info("Fetching IO results from the pods.")
    for pod_obj in io_pods:
        fio_result = pod_obj.get_fio_results()
        err_count = fio_result.get("jobs")[0].get("error")
        assert (
            err_count == 0
        ), f"FIO error on pod {pod_obj.name}. FIO result: {fio_result}"
    log.info("Verified IO result on pods.")

    # Verify that the new PVCs are usable by creating new pods
    log.info("Verify that the new PVCs are usable by creating new pods")
    pod_objs_rbd_re = helpers.create_pods(
        pvc_objs_rbd_new, pod_factory, constants.CEPHBLOCKPOOL, 2
    )
    pod_objs_cephfs_re = helpers.create_pods(
        pvc_objs_cephfs_new, pod_factory, constants.CEPHFILESYSTEM, 2
    )

    # Verify the pods are Running
    log.info("Verifying the pods are Running")
    for pod_obj in pod_objs_rbd_re + pod_objs_cephfs_re:
        helpers.wait_for_resource_state(
            resource=pod_obj, state=constants.STATUS_RUNNING, timeout=90
        )
        pod_obj.reload()
    log.info(
        "Successfully created and verified the status of the pods using the "
        "new CephFS and RBD PVCs."
    )

    new_pods = (
        pod_objs_rbd_new + pod_objs_cephfs_new + pod_objs_rbd_re + pod_objs_cephfs_re
    )

    # Do setup on the new pods for running IO
    log.info("Setting up the new pods for running IO.")
    for pod_obj in new_pods:
        if pod_obj.pvc.get_pvc_vol_mode == "Block":
            storage_type = "block"
        else:
            storage_type = "fs"
        executor.submit(pod_obj.workload_setup, storage_type=storage_type)

    # Wait for setup on the new pods to complete
    for pod_obj in new_pods:
        log.info(f"Waiting for IO setup to complete on pod {pod_obj.name}")
        for sample in TimeoutSampler(360, 2, getattr, pod_obj, "wl_setup_done"):
            if sample:
                log.info(
                    f"Setup for running IO is completed on pod {pod_obj.name}."
                )
                break
    log.info("Setup for running IO is completed on the new pods.")

    # Start IO on the new pods
    log.info("Start IO on the new pods")
    self.run_io_on_pods(new_pods)
    log.info("Started IO on the new pods")

    log.info("Fetching IO results from the new pods.")
    for pod_obj in new_pods:
        get_fio_rw_iops(pod_obj)
    log.info("Verified IO result on the new pods.")

    # Verify the number of pods of each daemon type
    final_num_resource_name = [
        len(pod_functions[resource_name]()) for resource_name in daemons_to_kill
    ]
    assert final_num_resource_name == num_of_resource_pods, (
        f"Total number of pods of each type does not match the initial value. "
        f"Pods of each type before daemon kill: {num_of_resource_pods}. "
        f"Pods of each type present now: {final_num_resource_name}"
    )

    # Check ceph status
    ceph_health_check(namespace=config.ENV_DATA["cluster_namespace"])
    log.info("Ceph cluster health is OK")
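# A minimal sketch of the daemon-kill step above: PIDs are grouped per node
# and a single 'oc debug' command is issued for each node, so all selected
# daemons on that node are killed together. The helper `_kill_cmds_by_node`
# and the {node: [pids]} input shape are illustrative assumptions; the test
# itself builds the per-node PID strings while selecting daemons.
def _kill_cmds_by_node(daemon_pids_by_node):
    """Build one 'oc debug ... kill -9' command per node from {node: [pids]}."""
    cmds = {}
    for node_name, pids in daemon_pids_by_node.items():
        pid_args = " ".join(str(pid) for pid in pids)
        # chroot /host runs the kill against the node's own process table
        cmds[node_name] = (
            f"oc debug node/{node_name} -- chroot /host kill -9 {pid_args}"
        )
    return cmds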