Example #1
    def test_pv_scale_out_create_pvcs_and_respin_ceph_pods(
        self,
        fioscale,
        resource_to_delete,
    ):
        """
        Test case to scale PVCs and pods across multiple projects and reach the expected PVC count
        """

        # Get info from SCALE_DATA_FILE for validation
        if os.path.exists(SCALE_DATA_FILE):
            file_data = templating.load_yaml(SCALE_DATA_FILE)
            namespace = file_data.get("NAMESPACE")
            pod_scale_list = file_data.get("POD_SCALE_LIST")
            pvc_scale_list = file_data.get("PVC_SCALE_LIST")
        else:
            raise FileNotFoundError(f"{SCALE_DATA_FILE} file not found")

        disruption = disruption_helpers.Disruptions()
        disruption.set_resource(resource=resource_to_delete)
        no_of_resource = disruption.resource_count
        for i in range(0, no_of_resource):
            disruption.delete_resource(resource_id=i)

        utils.ceph_health_check()

        # Validate all PVCs from namespace are in Bound state
        assert scale_lib.validate_all_pvcs_and_check_state(
            namespace=namespace, pvc_scale_list=pvc_scale_list)

        # Validate all PODs from namespace are up and running
        assert scale_lib.validate_all_pods_and_check_state(
            namespace=namespace, pod_scale_list=pod_scale_list)

    def test_respin_ceph_pods(self, resource_to_delete):
        """
        Test re-spin of Ceph daemon pods, Operator and CSI pods
        in a scaled cluster
        """

        # Get info from SCALE_DATA_FILE for validation
        if os.path.exists(SCALE_DATA_FILE):
            file_data = templating.load_yaml(SCALE_DATA_FILE)
            namespace = file_data.get("NAMESPACE")
            pod_scale_list = file_data.get("POD_SCALE_LIST")
            pvc_scale_list = file_data.get("PVC_SCALE_LIST")
        else:
            raise FileNotFoundError(f"{SCALE_DATA_FILE} file not found")

        # perform disruption test
        disruption = disruption_helpers.Disruptions()
        disruption.set_resource(resource=resource_to_delete)
        no_of_resource = disruption.resource_count
        for i in range(0, no_of_resource):
            disruption.delete_resource(resource_id=i)

        utils.ceph_health_check()

        # Validate all PVCs from namespace are in Bound state
        assert scale_lib.validate_all_pvcs_and_check_state(
            namespace=namespace, pvc_scale_list=pvc_scale_list)

        # Validate all PODs from namespace are up and running
        assert scale_lib.validate_all_pods_and_check_state(
            namespace=namespace, pod_scale_list=pod_scale_list)

        # Check ceph health status
        utils.ceph_health_check(tries=20)
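
Every example on this page drives the helper the same way: choose a resource type with set_resource, then respin each pod of that type with delete_resource, which waits for the replacement pod to come back up (see the respin_ceph_pod docstring further down). A condensed sketch of that shared pattern; the wrapper function name is hypothetical, and the imports follow Example #20:

from ocs_ci.helpers import disruption_helpers
from ocs_ci.utility.utils import ceph_health_check


def respin_all_pods_of_type(resource_to_delete):
    # Hypothetical helper; resource_to_delete is a type string such as
    # "mgr", "mon", "osd" or "mds", as parametrized in the tests here
    disruption = disruption_helpers.Disruptions()
    disruption.set_resource(resource=resource_to_delete)
    for i in range(disruption.resource_count):
        # delete_resource waits for the deleted pod to be back up and running
        disruption.delete_resource(resource_id=i)
    # Confirm the cluster recovered after the respins
    ceph_health_check()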
Example #3
    def disrupt_plugin_provisioner_pods(self, node_list):
        """
        Set leader plugin-provisioner resources for disruption; skip if the
        leader pod is running on a node from node_list

        Args:
            node_list (list): list of node names to check

        Returns:
            list: list of Disruption objects

        """
        provisioner_resource = []
        for interface in [constants.CEPHBLOCKPOOL, constants.CEPHFILESYSTEM]:
            provisioner_pod = pod.get_plugin_provisioner_leader(
                interface=interface)
            node_name = pod.get_pod_node(provisioner_pod).name
            if node_name not in node_list:
                if interface == constants.CEPHBLOCKPOOL:
                    provisioner_resource.append("rbdplugin_provisioner")
                else:
                    provisioner_resource.append("cephfsplugin_provisioner")

        disruptor = []
        for resource in provisioner_resource:
            disruption = disruption_helpers.Disruptions()
            disruption.set_resource(resource=resource)
            disruptor.append(disruption)

        return disruptor
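
A minimal usage sketch for the Disruption objects returned above (the call site is an assumption; delete_resource without a resource_id mirrors its use elsewhere on this page):

        # Hypothetical call site inside the same test class
        for disruption in self.disrupt_plugin_provisioner_pods(node_list=node_list):
            disruption.delete_resource()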
Example #4
    def test_registry_respin_pod(self, pod_name):
        """
        Test registry workload backed by OCS while respinning Ceph pods
        """

        # Respin relevant pod
        log.info(f"Respin Ceph pod {pod_name}")
        disruption = disruption_helpers.Disruptions()
        disruption.set_resource(resource=pod_name)
        disruption.delete_resource()

        # Pull and push images to registries
        log.info("Pull and push images to registries")
        image_pull_and_push(
            project_name=self.project_name,
            template="eap-cd-basic-s2i",
            image="registry.redhat.io/jboss-eap-7-tech-preview/eap-cd-openshift-rhel8:latest",
            pattern="eap-app",
        )

        # Validate image exists in registries path
        validate_image_exists(namespace=self.project_name)

        # Validate image registry pods
        validate_registry_pod_status()

        # Validate cluster health ok and all pods are running
        self.sanity_helpers.health_check()
Example #5
    def test_scale_million_cephfs_files(
        self,
        million_file_cephfs,
        resource_to_delete,
    ):
        """
        Add a million files to the ceph filesystem.
        Delete each instance of the parametrized ceph pod once
        the ceph cluster is healthy. Make sure the ceph cluster comes back
        up and that rename operations function as expected.

        Args:
            million_file_cephfs (MillionFilesOnCephfs object):
                Tracks cephfs pod, pvcs, and list of files to rename.
            resource_to_delete (str): resource deleted for each testcase

        """
        logging.info(f"Testing respin of {resource_to_delete}")
        disruption = disruption_helpers.Disruptions()
        disruption.set_resource(resource=resource_to_delete)
        disruption.delete_resource()
        ocp_obj = million_file_cephfs.ocp_obj
        for sfile in million_file_cephfs.test_file_list:
            sample = os.sep.join([constants.MOUNT_POINT, "x", sfile])
            newname = str(uuid.uuid4())
            fullnew = os.sep.join([constants.MOUNT_POINT, "x", newname])
            ocp_obj.exec_oc_cmd(
                f"exec {million_file_cephfs.pod_name} -- mv {sample} {fullnew}"
            )
            ocp_obj.exec_oc_cmd(
                f"exec {million_file_cephfs.pod_name} -- mv {fullnew} {sample}"
            )
        logging.info("Tests complete")
Example #6
    def test_scale_endpoint_and_respin_ceph_pods(
        self,
        mcg_job_factory,
        resource_to_delete,
        worker_node,
    ):
        """
        Generate S3 workload to trigger autoscale to increase from 1 to 2
        endpoints, then respin Ceph pods
        """
        # Add worker nodes to the cluster
        scale_pgsql.add_worker_node()

        # Check autoscale endpoint count before starting the S3 load
        self._assert_endpoint_count(desired_count=self.MIN_ENDPOINT_COUNT)

        endpoint_cnt = get_endpoint_pod_count(
            constants.OPENSHIFT_STORAGE_NAMESPACE)
        get_hpa_utilization(constants.OPENSHIFT_STORAGE_NAMESPACE)
        wait_time = 30
        job_list = list()

        while endpoint_cnt < self.MAX_ENDPOINT_COUNT:
            # Keep the job objects themselves; exec'ing dynamically named
            # variables is unreliable inside a function scope
            job_list.append(mcg_job_factory(custom_options=options))
            time.sleep(wait_time)
            endpoint_cnt = get_endpoint_pod_count(
                constants.OPENSHIFT_STORAGE_NAMESPACE)
            hpa_cpu_utilization = get_hpa_utilization(
                constants.OPENSHIFT_STORAGE_NAMESPACE)
            log.info(
                f"HPA CPU utilization by noobaa-endpoint is {hpa_cpu_utilization}%"
            )

        # Validate autoscale endpoint count
        self._assert_endpoint_count(desired_count=self.MAX_ENDPOINT_COUNT)

        # Respin ceph pods
        disruption = disruption_helpers.Disruptions()
        disruption.set_resource(resource=resource_to_delete)
        number_of_resource = disruption.resource_count
        for i in range(0, number_of_resource):
            disruption.delete_resource(resource_id=i)

        # Delete the jobs created by mcg_job_factory
        for job in job_list:
            job.delete()
            job.ocp.wait_for_delete(resource_name=job.name, timeout=60)

        # Validate autoscale endpoint count
        self._assert_endpoint_count(desired_count=self.MIN_ENDPOINT_COUNT)

        # Check ceph health status
        utils.ceph_health_check()

    def respin_ceph_pod(self, resource_to_delete):
        """
        Respin Ceph pods one by one;
        delete_resource checks that each deleted pod comes back up and running

        Args:
            resource_to_delete (str): Ceph resource type to be deleted, eg: mgr/mon/osd/mds
        """
        disruption = disruption_helpers.Disruptions()
        disruption.set_resource(resource=resource_to_delete)
        no_of_resource = disruption.resource_count
        for i in range(0, no_of_resource):
            disruption.delete_resource(resource_id=i)

    def test_pv_scale_out_create_pvcs_and_respin_ceph_pods(
        self,
        fioscale,
        resource_to_delete,
    ):
        """
        Test case to scale PVCs and pods across multiple projects and reach the expected PVC count
        """

        disruption = disruption_helpers.Disruptions()
        disruption.set_resource(resource=resource_to_delete)
        no_of_resource = disruption.resource_count
        for i in range(0, no_of_resource):
            disruption.delete_resource(resource_id=i)

        utils.ceph_health_check()
Example #9
    def test_run_pgsql_respin_pod(self, pgsql, transactions, pod_name):
        """
        Test pgsql workload
        """
        # Create pgbench benchmark
        pgsql.create_pgbench_benchmark(replicas=3,
                                       transactions=transactions,
                                       clients=3)
        # Start measuring time
        start_time = datetime.now()

        # Wait for pgbench pod to reach running state
        pgsql.wait_for_pgbench_status(status=constants.STATUS_RUNNING)

        # Check worker node utilization(adm_top)
        get_node_resource_utilization_from_adm_top(node_type="worker",
                                                   print_table=True)

        # Respin relevant pod
        if pod_name == "postgres":
            pgsql.respin_pgsql_app_pod()
        else:
            log.info(f"Respin Ceph pod {pod_name}")
            disruption = disruption_helpers.Disruptions()
            disruption.set_resource(resource=pod_name)
            disruption.delete_resource()

        # Wait for pg_bench pod to complete
        pgsql.wait_for_pgbench_status(status=constants.STATUS_COMPLETED)

        # Calculate the time from running state to completed state
        end_time = datetime.now()
        diff_time = end_time - start_time
        log.info(
            f"\npgbench pod reached to completed state after {diff_time.seconds} seconds\n"
        )

        # Get pgbench pods
        pgbench_pods = pgsql.get_pgbench_pods()

        # Validate pgbench run and parse logs
        pgsql.validate_pgbench_run(pgbench_pods)

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check(tries=40)

    def test_respin_osd_pods_to_verify_logging(
        self, create_pvc_and_deploymentconfig_pod
    ):
        """
        This function creates projects before and after respin of osd
        and verifies project existence in the EFK stack.
        1. Creates new project with PVC and app-pods
        2. Respins osd
        3. Logs into the EFK stack and checks for the health of cluster-logging
        4. Logs into the EFK stack and checks project existence
        5. Checks for the shards of the project in the EFK stack
        6. Creates new project and checks the existence again
        """

        # Create 1st project and app_pod
        dc_pod_obj, dc_pvc_obj = create_pvc_and_deploymentconfig_pod

        project1 = dc_pvc_obj.project.namespace

        # Delete the OSD pod
        disruption = disruption_helpers.Disruptions()
        disruption.set_resource(resource="osd")
        disruption.delete_resource()

        # Check the health of the cluster-logging
        assert ocp_logging_obj.check_health_of_clusterlogging()

        # Check for the 1st project created in EFK stack before the respin
        self.validate_project_exists(project1)

        # Check the files in the project
        self.check_filecount_in_project(project1)

        # Create another app_pod in new project
        pod_obj, pvc_obj = create_pvc_and_deploymentconfig_pod

        project2 = pvc_obj.project.namespace

        # Check the 2nd project exists in the EFK stack
        self.validate_project_exists(project2)

        self.check_filecount_in_project(project2)
Example #11
    def test_scale_endpoint_and_respin_ceph_pods(self, mcg_job_factory,
                                                 resource_to_delete):
        """
        Generate S3 workload to trigger autoscale to increase from 1 to 2
        endpoints, then respin Ceph pods
        """
        # Add worker nodes to the cluster
        scale_pgsql.add_worker_node()

        # Check autoscale endpoint count before starting the S3 load
        self._assert_endpoint_count(desired_count=1)

        # Create s3 workload using mcg_job_factory; keep the job objects
        # instead of exec'ing dynamically named variables
        job_list = [mcg_job_factory(custom_options=options) for _ in range(10)]

        # Validate autoscale endpoint count
        self._assert_endpoint_count(desired_count=2)

        # Respin ceph pods
        disruption = disruption_helpers.Disruptions()
        disruption.set_resource(resource=resource_to_delete)
        number_of_resource = disruption.resource_count
        for i in range(0, number_of_resource):
            disruption.delete_resource(resource_id=i)

        # Delete the jobs created by mcg_job_factory
        for job in job_list:
            job.delete()
            job.ocp.wait_for_delete(resource_name=job.name, timeout=60)

        # Validate autoscale endpoint count
        self._assert_endpoint_count(desired_count=1)

        # Delete workers node in the cluster
        scale_pgsql.delete_worker_node()

        # Check ceph health status
        utils.ceph_health_check()
Example #12
    def test_run_jenkins_respin_pod(self, jenkins, pod_name, num_projects,
                                    num_of_builds):
        """
        Test jenkins workload
        """
        # Init number of projects
        jenkins.number_projects = num_projects

        # Create app jenkins
        jenkins.create_app_jenkins()

        # Create jenkins pvc
        jenkins.create_jenkins_pvc()

        # Create jenkins build config
        jenkins.create_jenkins_build_config()

        # Wait for the jenkins deploy pod to reach Completed state
        jenkins.wait_for_jenkins_deploy_status(status=STATUS_COMPLETED)

        # Init number of builds per project
        jenkins.number_builds_per_project = num_of_builds

        # Start Builds
        jenkins.start_build()

        # Respin pod
        log.info(f"Respin pod {pod_name}")
        disruption = disruption_helpers.Disruptions()
        disruption.set_resource(resource=pod_name)
        disruption.delete_resource()

        # Wait for builds to reach 'Complete' state
        jenkins.wait_for_build_to_complete()

        # Print table of builds
        jenkins.print_completed_builds_results()

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check(tries=40)
Example #13
    def test_registry_respin_pod(self, pod_name):
        """
        Test registry workload backed by OCS while respinning Ceph pods
        """

        # Respin relevant pod
        log.info(f"Respin Ceph pod {pod_name}")
        disruption = disruption_helpers.Disruptions()
        disruption.set_resource(resource=pod_name)
        disruption.delete_resource()

        # Pull and push images to registries
        log.info("Pull and push images to registries")
        image_pull_and_push(project_name=self.project_name)

        # Validate image exists in registries path
        validate_image_exists()

        # Validate image registry pods
        validate_registry_pod_status()

        # Validate cluster health ok and all pods are running
        self.sanity_helpers.health_check(tries=40)

    def test_daemon_kill_during_pvc_pod_creation_and_io(
        self, interface, resource_name, setup, multi_pvc_factory, pod_factory
    ):
        """
        Kill the 'resource_name' daemon while PVC creation, pod
        creation and IO operations are progressing.
        """
        num_of_new_pvcs = 5
        pvc_objs, io_pods, pvc_objs_new_pods, access_modes = setup
        proj_obj = pvc_objs[0].project
        storageclass = pvc_objs[0].storageclass

        pod_functions = {
            "mds": partial(get_mds_pods),
            "mon": partial(get_mon_pods),
            "mgr": partial(get_mgr_pods),
            "osd": partial(get_osd_pods),
            "rbdplugin": partial(get_plugin_pods, interface=interface),
            "cephfsplugin": partial(get_plugin_pods, interface=interface),
            "cephfsplugin_provisioner": partial(get_cephfsplugin_provisioner_pods),
            "rbdplugin_provisioner": partial(get_rbdfsplugin_provisioner_pods),
            "operator": partial(get_operator_pods),
        }

        executor = ThreadPoolExecutor(max_workers=len(io_pods))

        disruption = disruption_helpers.Disruptions()
        disruption.set_resource(resource=resource_name)

        # Get number of pods of type 'resource_name'
        resource_pods_num = len(pod_functions[resource_name]())

        # Do setup for running IO on pods
        log.info("Setting up pods for running IO")
        for pod_obj in io_pods:
            if pod_obj.pvc.volume_mode == "Block":
                storage_type = "block"
            else:
                storage_type = "fs"
            executor.submit(pod_obj.workload_setup, storage_type=storage_type)

        # Wait for setup on pods to complete
        for pod_obj in io_pods:
            log.info(f"Waiting for IO setup to complete on pod {pod_obj.name}")
            for sample in TimeoutSampler(180, 2, getattr, pod_obj, "wl_setup_done"):
                if sample:
                    log.info(
                        f"Setup for running IO is completed on pod " f"{pod_obj.name}."
                    )
                    break
        log.info("Setup for running IO is completed on pods")

        # Set daemon to be killed
        disruption.select_daemon()

        # Start creating new pods
        log.info("Start creating new pods.")
        bulk_pod_create = executor.submit(
            helpers.create_pods, pvc_objs_new_pods, pod_factory, interface, 2
        )

        # Start creation of new PVCs
        log.info("Start creating new PVCs.")
        bulk_pvc_create = executor.submit(
            multi_pvc_factory,
            interface=interface,
            project=proj_obj,
            storageclass=storageclass,
            size=self.pvc_size,
            access_modes=access_modes,
            access_modes_selection="distribute_random",
            status="",
            num_of_pvc=num_of_new_pvcs,
            wait_each=False,
        )

        # Start IO on each pod
        log.info("Start IO on pods")
        for pod_obj in io_pods:
            if pod_obj.pvc.volume_mode == "Block":
                storage_type = "block"
            else:
                storage_type = "fs"
            pod_obj.run_io(
                storage_type=storage_type,
                size="1G",
                runtime=10,
                fio_filename=f"{pod_obj.name}_io_file1",
            )
        log.info("IO started on all pods.")

        # Kill daemon
        disruption.kill_daemon()

        # Getting result of PVC creation as list of PVC objects
        pvc_objs_new = bulk_pvc_create.result()

        # Confirm PVCs are Bound
        for pvc_obj in pvc_objs_new:
            helpers.wait_for_resource_state(
                resource=pvc_obj, state=constants.STATUS_BOUND, timeout=180
            )
            pvc_obj.reload()
        log.info("Verified: New PVCs are Bound.")

        # Getting result of pods creation as list of Pod objects
        pod_objs_new = bulk_pod_create.result()

        # Verify new pods are Running
        for pod_obj in pod_objs_new:
            helpers.wait_for_resource_state(
                resource=pod_obj, state=constants.STATUS_RUNNING
            )
            pod_obj.reload()
        log.info("Verified: All new pods are Running.")

        # Verify IO
        log.info("Fetching IO results from IO pods.")
        for pod_obj in io_pods:
            fio_result = pod_obj.get_fio_results()
            err_count = fio_result.get("jobs")[0].get("error")
            assert (
                err_count == 0
            ), f"FIO error on pod {pod_obj.name}. FIO result: {fio_result}"
            log.info(f"IOPs after FIO on pod {pod_obj.name}:")
            log.info(f"Read: {fio_result.get('jobs')[0].get('read').get('iops')}")
            log.info(f"Write: {fio_result.get('jobs')[0].get('write').get('iops')}")
        log.info("Verified IO result on IO pods.")

        all_pod_objs = io_pods + pod_objs_new

        # Fetch volume details from pods for the purpose of verification
        node_pv_dict = {}
        for pod in all_pod_objs:
            pod_info = pod.get()
            node = pod_info["spec"]["nodeName"]
            pvc = pod_info["spec"]["volumes"][0]["persistentVolumeClaim"]["claimName"]
            for pvc_obj in pvc_objs:
                if pvc_obj.name == pvc:
                    pvc_obj.reload()
                    pv = pvc_obj.backed_pv
                    break
            if node in node_pv_dict:
                node_pv_dict[node].append(pv)
            else:
                node_pv_dict[node] = [pv]

        # Delete pods
        for pod_obj in all_pod_objs:
            pod_obj.delete(wait=False)

        # Verify pods are deleted
        for pod_obj in all_pod_objs:
            pod_obj.ocp.wait_for_delete(resource_name=pod_obj.name)

        # Verify number of 'resource_name' type pods
        final_resource_pods_num = len(pod_functions[resource_name]())
        assert final_resource_pods_num == resource_pods_num, (
            f"Total number of {resource_name} pods is not matching with "
            f"initial value. Total number of pods before daemon kill: "
            f"{resource_pods_num}. Total number of pods present now: "
            f"{final_resource_pods_num}"
        )

        # Verify volumes are unmapped from nodes after deleting the pods
        node_pv_mounted = helpers.verify_pv_mounted_on_node(node_pv_dict)
        for node, pvs in node_pv_mounted.items():
            assert not pvs, (
                f"PVs {pvs} is still present on node {node} after "
                f"deleting the pods."
            )
        log.info(
            "Verified: mount points are removed from nodes after deleting " "the pods"
        )

        # Set volume mode on PVC objects
        for pvc_obj in pvc_objs_new:
            pvc_info = pvc_obj.get()
            setattr(pvc_obj, "volume_mode", pvc_info["spec"]["volumeMode"])

        # Verify that PVCs are reusable by creating new pods
        all_pvc_objs = pvc_objs + pvc_objs_new
        pod_objs_re = helpers.create_pods(all_pvc_objs, pod_factory, interface, 2)

        # Verify pods are Running
        for pod_obj in pod_objs_re:
            helpers.wait_for_resource_state(
                resource=pod_obj, state=constants.STATUS_RUNNING
            )
            pod_obj.reload()
        log.info("Successfully created new pods using all PVCs.")

        # Run IO on each of the newly created pods
        for pod_obj in pod_objs_re:
            if pod_obj.pvc.volume_mode == "Block":
                storage_type = "block"
            else:
                storage_type = "fs"
            pod_obj.run_io(
                storage_type=storage_type,
                size="1G",
                runtime=10,
                fio_filename=f"{pod_obj.name}_io_file2",
            )

        log.info("Fetching IO results from newly created pods")
        for pod_obj in pod_objs_re:
            fio_result = pod_obj.get_fio_results()
            err_count = fio_result.get("jobs")[0].get("error")
            assert (
                err_count == 0
            ), f"FIO error on pod {pod_obj.name}. FIO result: {fio_result}"
            log.info(f"IOPs after FIO on pod {pod_obj.name}:")
            log.info(f"Read: {fio_result.get('jobs')[0].get('read').get('iops')}")
            log.info(f"Write: {fio_result.get('jobs')[0].get('write').get('iops')}")
        log.info("Verified IO result on newly created pods.")

    def test_resource_deletion_during_pvc_expansion(self, resource_to_delete):
        """
        Verify PVC expansion will succeed when rook-ceph and csi pods are
        re-spun during expansion

        """
        pvc_size_expanded = 30
        executor = ThreadPoolExecutor(max_workers=len(self.pvcs))
        disruption_ops = disruption_helpers.Disruptions()

        # Run IO to fill some data
        log.info(
            "Running IO on all pods to fill some data before PVC expansion.")
        for pod_obj in self.pods:
            storage_type = "block" if pod_obj.pvc.volume_mode == "Block" else "fs"
            pod_obj.run_io(
                storage_type=storage_type,
                size="4G",
                io_direction="write",
                runtime=30,
                rate="10M",
                fio_filename=f"{pod_obj.name}_f1",
            )

        log.info("Wait for IO to complete on pods")
        for pod_obj in self.pods:
            fio_result = pod_obj.get_fio_results()
            err_count = fio_result.get("jobs")[0].get("error")
            assert err_count == 0, (f"IO error on pod {pod_obj.name}. "
                                    f"FIO result: {fio_result}")
            log.info(f"Verified IO on pod {pod_obj.name}.")
        log.info("IO is successful on all pods before PVC expansion.")

        # Select the pod to be deleted
        disruption_ops.set_resource(resource=resource_to_delete)

        log.info("Expanding all PVCs.")
        for pvc_obj in self.pvcs:
            log.info(
                f"Expanding size of PVC {pvc_obj.name} to {pvc_size_expanded}G"
            )
            pvc_obj.expand_proc = executor.submit(pvc_obj.resize_pvc,
                                                  pvc_size_expanded, True)

        # Delete the pod 'resource_to_delete'
        disruption_ops.delete_resource()

        # Verify pvc expand status
        for pvc_obj in self.pvcs:
            assert (pvc_obj.expand_proc.result()
                    ), f"Expansion failed for PVC {pvc_obj.name}"
        log.info("PVC expansion was successful on all PVCs")

        # Run IO to fill more data
        log.info("Write more data after PVC expansion.")
        for pod_obj in self.pods:
            storage_type = "block" if pod_obj.pvc.volume_mode == "Block" else "fs"
            pod_obj.run_io(
                storage_type=storage_type,
                size="10G",
                io_direction="write",
                runtime=30,
                rate="10M",
                fio_filename=f"{pod_obj.name}_f2",
            )

        log.info("Wait for IO to complete on all pods")
        for pod_obj in self.pods:
            fio_result = pod_obj.get_fio_results()
            err_count = fio_result.get("jobs")[0].get("error")
            assert err_count == 0, (f"IO error on pod {pod_obj.name}. "
                                    f"FIO result: {fio_result}")
            log.info(f"Verified IO on pod {pod_obj.name}.")
        log.info("IO is successful on all pods after PVC expansion.")

    def test_resource_deletion_during_pvc_expansion(self, resource_to_delete):
        """
        Verify PVC expansion will succeed when rook-ceph and csi pods are
        re-spun during expansion

        """
        pvc_size_expanded = 30
        executor = ThreadPoolExecutor(max_workers=len(self.pvcs))
        disruption_ops = disruption_helpers.Disruptions()

        # Run IO to fill some data
        log.info("Running IO on all pods to fill some data before PVC expansion.")
        for pod_obj in self.pods:
            storage_type = "block" if pod_obj.pvc.volume_mode == "Block" else "fs"
            pod_obj.run_io(
                storage_type=storage_type,
                size="4G",
                io_direction="write",
                runtime=30,
                rate="10M",
                fio_filename=f"{pod_obj.name}_f1",
            )

        log.info("Wait for IO to complete on pods")
        for pod_obj in self.pods:
            fio_result = pod_obj.get_fio_results()
            err_count = fio_result.get("jobs")[0].get("error")
            assert err_count == 0, (
                f"IO error on pod {pod_obj.name}. " f"FIO result: {fio_result}"
            )
            log.info(f"Verified IO on pod {pod_obj.name}.")
        log.info("IO is successful on all pods before PVC expansion.")

        if self.provider_index is not None:
            # Switch to provider cluster context to get ceph pods
            config.switch_to_provider()

        # Select the pod to be deleted
        disruption_ops.set_resource(resource=resource_to_delete)

        if self.provider_index is not None:
            config.switch_ctx(self.consumer_index)

        log.info("Expanding all PVCs.")
        for pvc_obj in self.pvcs:
            log.info(f"Expanding size of PVC {pvc_obj.name} to {pvc_size_expanded}G")
            pvc_obj.expand_proc = executor.submit(
                pvc_obj.resize_pvc, pvc_size_expanded, True
            )

        # Delete the pod 'resource_to_delete'
        disruption_ops.delete_resource()

        # Verify pvc expand status
        for pvc_obj in self.pvcs:
            assert (
                pvc_obj.expand_proc.result()
            ), f"Expansion failed for PVC {pvc_obj.name}"
        log.info("PVC expansion was successful on all PVCs")

        log.info("Verifying new size on pods.")
        for pod_obj in self.pods:
            if pod_obj.pvc.volume_mode == "Block":
                log.info(
                    f"Skipping check on pod {pod_obj.name} as volume mode is Block."
                )
                continue

            # Wait up to 240 seconds for the change to be reflected on the pod
            log.info(f"Checking pod {pod_obj.name} to verify the change.")
            for df_out in TimeoutSampler(
                240, 3, pod_obj.exec_cmd_on_pod, command="df -kh"
            ):
                if not df_out:
                    continue
                df_out = df_out.split()
                new_size_mount = df_out[df_out.index(pod_obj.get_storage_path()) - 4]
                if new_size_mount in [
                    f"{pvc_size_expanded - 0.1}G",
                    f"{float(pvc_size_expanded)}G",
                    f"{pvc_size_expanded}G",
                ]:
                    log.info(
                        f"Verified: Expanded size of PVC {pod_obj.pvc.name} "
                        f"is reflected on pod {pod_obj.name}"
                    )
                    break
                log.info(
                    f"Expanded size of PVC {pod_obj.pvc.name} is not reflected"
                    f" on pod {pod_obj.name}. New size on mount is not "
                    f"{pvc_size_expanded}G as expected, but {new_size_mount}. "
                    f"Checking again."
                )
        log.info(
            f"Verified: Modified size {pvc_size_expanded}G is reflected on all pods."
        )

        # Run IO to fill more data
        log.info("Write more data after PVC expansion.")
        for pod_obj in self.pods:
            storage_type = "block" if pod_obj.pvc.volume_mode == "Block" else "fs"
            pod_obj.run_io(
                storage_type=storage_type,
                size="10G",
                io_direction="write",
                runtime=30,
                rate="10M",
                fio_filename=f"{pod_obj.name}_f2",
                end_fsync=1,
            )

        log.info("Wait for IO to complete on all pods")
        for pod_obj in self.pods:
            fio_result = pod_obj.get_fio_results()
            err_count = fio_result.get("jobs")[0].get("error")
            assert err_count == 0, (
                f"IO error on pod {pod_obj.name}. " f"FIO result: {fio_result}"
            )
            log.info(f"Verified IO on pod {pod_obj.name}.")
        log.info("IO is successful on all pods after PVC expansion.")

    def test_disruptive_during_pod_pvc_deletion_and_io(
        self, interface, resource_to_delete, setup_base
    ):
        """
        Delete a ceph/rook pod while PVC deletion, pod deletion and IO are
        progressing
        """
        pvc_objs, pod_objs, rwx_pod_objs = setup_base
        namespace = pvc_objs[0].project.namespace

        num_of_pods_to_delete = 3
        num_of_io_pods = 1

        # Select pods to be deleted
        pods_to_delete = pod_objs[:num_of_pods_to_delete]
        pods_to_delete.extend(
            [
                pod
                for pod in rwx_pod_objs
                for pod_obj in pods_to_delete
                if (pod_obj.pvc == pod.pvc)
            ]
        )

        # Select pods to run IO
        io_pods = pod_objs[
            num_of_pods_to_delete : num_of_pods_to_delete + num_of_io_pods
        ]
        io_pods.extend(
            [
                pod
                for pod in rwx_pod_objs
                for pod_obj in io_pods
                if (pod_obj.pvc == pod.pvc)
            ]
        )

        # Select pods which are having PVCs to delete
        pods_for_pvc = pod_objs[num_of_pods_to_delete + num_of_io_pods :]
        pvcs_to_delete = [pod_obj.pvc for pod_obj in pods_for_pvc]
        pods_for_pvc.extend(
            [
                pod
                for pod in rwx_pod_objs
                for pod_obj in pods_for_pvc
                if (pod_obj.pvc == pod.pvc)
            ]
        )

        log.info(
            f"{len(pods_to_delete)} pods selected for deletion in which "
            f"{len(pods_to_delete) - num_of_pods_to_delete} pairs of pod "
            f"share same RWX PVC"
        )
        log.info(
            f"{len(io_pods)} pods selected for running IO in which "
            f"{len(io_pods) - num_of_io_pods} pairs of pod share same "
            f"RWX PVC"
        )
        no_of_rwx_pvcs_delete = len(pods_for_pvc) - len(pvcs_to_delete)
        log.info(
            f"{len(pvcs_to_delete)} PVCs selected for deletion. "
            f"RWO PVCs: {len(pvcs_to_delete) - no_of_rwx_pvcs_delete}, "
            f"RWX PVCs: {no_of_rwx_pvcs_delete}"
        )

        pod_functions = {
            "mds": partial(get_mds_pods),
            "mon": partial(get_mon_pods),
            "mgr": partial(get_mgr_pods),
            "osd": partial(get_osd_pods),
            "rbdplugin": partial(get_plugin_pods, interface=interface),
            "cephfsplugin": partial(get_plugin_pods, interface=interface),
            "cephfsplugin_provisioner": partial(get_cephfsplugin_provisioner_pods),
            "rbdplugin_provisioner": partial(get_rbdfsplugin_provisioner_pods),
            "operator": partial(get_operator_pods),
        }

        disruption = disruption_helpers.Disruptions()
        disruption.set_resource(resource=resource_to_delete)
        executor = ThreadPoolExecutor(max_workers=len(pod_objs) + len(rwx_pod_objs))

        # Get number of pods of type 'resource_to_delete'
        num_of_resource_to_delete = len(pod_functions[resource_to_delete]())

        # Fetch the number of Pods and PVCs
        initial_num_of_pods = len(get_all_pods(namespace=namespace))
        initial_num_of_pvc = len(get_all_pvcs(namespace=namespace)["items"])

        # Fetch PV names to verify after deletion
        pv_objs = []
        for pvc_obj in pvcs_to_delete:
            pvc_obj.reload()
            pv_objs.append(pvc_obj.backed_pv_obj)

        # Fetch volume details from pods for the purpose of verification
        node_pv_dict = {}
        for pod_obj in pods_to_delete:
            pod_info = pod_obj.get()
            node = pod_info["spec"]["nodeName"]
            pvc = pod_info["spec"]["volumes"][0]["persistentVolumeClaim"]["claimName"]
            for pvc_obj in pvc_objs:
                if pvc_obj.name == pvc:
                    pvc_obj.reload()
                    pv = pvc_obj.backed_pv
                    break
            if node in node_pv_dict:
                node_pv_dict[node].append(pv)
            else:
                node_pv_dict[node] = [pv]

        # Fetch image uuid associated with PVCs to be deleted
        pvc_uuid_map = {}
        for pvc_obj in pvcs_to_delete:
            pvc_uuid_map[pvc_obj.name] = pvc_obj.image_uuid
        log.info("Fetched image uuid associated with each PVC")

        # Do setup on pods for running IO
        log.info("Setting up pods for running IO.")
        for pod_obj in pod_objs + rwx_pod_objs:
            pvc_info = pod_obj.pvc.get()
            if pvc_info["spec"]["volumeMode"] == "Block":
                storage_type = "block"
            else:
                storage_type = "fs"
            executor.submit(pod_obj.workload_setup, storage_type=storage_type)

        # Wait for setup on pods to complete
        for pod_obj in pod_objs + rwx_pod_objs:
            log.info(f"Waiting for IO setup to complete on pod {pod_obj.name}")
            for sample in TimeoutSampler(180, 2, getattr, pod_obj, "wl_setup_done"):
                if sample:
                    log.info(
                        f"Setup for running IO is completed on pod " f"{pod_obj.name}."
                    )
                    break
        log.info("Setup for running IO is completed on all pods.")

        # Start IO on pods having PVCs to delete to load data
        log.info("Starting IO on pods having PVCs to delete.")
        self.run_io_on_pods(pods_for_pvc)
        log.info("IO started on pods having PVCs to delete.")

        log.info("Fetching IO results from the pods having PVCs to delete.")
        for pod_obj in pods_for_pvc:
            get_fio_rw_iops(pod_obj)
        log.info("Verified IO result on pods having PVCs to delete.")

        # Delete pods having PVCs to delete.
        delete_pods(pods_for_pvc)
        for pod_obj in pods_for_pvc:
            pod_obj.ocp.wait_for_delete(pod_obj.name)
        log.info("Verified: Deleted pods which are having PVCs to delete.")

        # Start IO on pods to be deleted
        log.info("Starting IO on pods to be deleted.")
        self.run_io_on_pods(pods_to_delete)
        log.info("IO started on pods to be deleted.")

        # Start deleting PVCs
        pvc_bulk_delete = executor.submit(delete_pvcs, pvcs_to_delete)
        log.info("Started deleting PVCs")

        # Start deleting pods
        pod_bulk_delete = executor.submit(delete_pods, pods_to_delete, wait=False)
        log.info("Started deleting pods")

        # Start IO on IO pods
        self.run_io_on_pods(io_pods)
        log.info("Started IO on IO pods")

        # Verify pvc deletion has started
        pvc_deleting = executor.submit(
            wait_for_resource_count_change,
            func_to_use=get_all_pvcs,
            previous_num=initial_num_of_pvc,
            namespace=namespace,
            change_type="decrease",
            min_difference=1,
            timeout=30,
            interval=0.01,
        )

        # Verify pod deletion has started
        pod_deleting = executor.submit(
            wait_for_resource_count_change,
            func_to_use=get_all_pods,
            previous_num=initial_num_of_pods,
            namespace=namespace,
            change_type="decrease",
            min_difference=1,
            timeout=30,
            interval=0.01,
        )

        assert pvc_deleting.result(), "Wait timeout: PVCs are not being deleted."
        log.info("PVCs deletion has started.")

        assert pod_deleting.result(), "Wait timeout: Pods are not being deleted."
        log.info("Pods deletion has started.")

        # Delete pod of type 'resource_to_delete'
        disruption.delete_resource()

        pod_bulk_delete.result()

        # Verify pods are deleted
        for pod_obj in pods_to_delete:
            pod_obj.ocp.wait_for_delete(pod_obj.name, 300)
        log.info("Verified: Pods are deleted.")

        # Verify that the mount point is removed from nodes after deleting pod
        node_pv_mounted = verify_pv_mounted_on_node(node_pv_dict)
        for node, pvs in node_pv_mounted.items():
            assert not pvs, (
                f"PVs {pvs} is still present on node {node} after "
                f"deleting the pods."
            )
        log.info(
            "Verified: mount points are removed from nodes after deleting " "the pods"
        )

        pvcs_deleted = pvc_bulk_delete.result()
        assert pvcs_deleted, "Deletion of PVCs failed."

        # Verify PVCs are deleted
        for pvc_obj in pvcs_to_delete:
            pvc_obj.ocp.wait_for_delete(pvc_obj.name)
        log.info("Verified: PVCs are deleted.")

        # Verify PVs are deleted
        for pv_obj in pv_objs:
            pv_obj.ocp.wait_for_delete(resource_name=pv_obj.name, timeout=300)
        log.info("Verified: PVs are deleted.")

        # Verify PV using ceph toolbox. Image/Subvolume should be deleted.
        pool_name = default_ceph_block_pool()
        for pvc_name, uuid in pvc_uuid_map.items():
            if interface == constants.CEPHBLOCKPOOL:
                ret = verify_volume_deleted_in_backend(
                    interface=interface, image_uuid=uuid, pool_name=pool_name
                )
            if interface == constants.CEPHFILESYSTEM:
                ret = verify_volume_deleted_in_backend(
                    interface=interface, image_uuid=uuid
                )
            assert ret, (
                f"Volume associated with PVC {pvc_name} still exists " f"in backend"
            )

        log.info("Fetching IO results from the pods.")
        for pod_obj in io_pods:
            fio_result = pod_obj.get_fio_results()
            err_count = fio_result.get("jobs")[0].get("error")
            assert (
                err_count == 0
            ), f"FIO error on pod {pod_obj.name}. FIO result: {fio_result}"
        log.info("Verified IO result on pods.")

        # Verify number of pods of type 'resource_to_delete'
        final_num_resource_to_delete = len(pod_functions[resource_to_delete]())
        assert final_num_resource_to_delete == num_of_resource_to_delete, (
            f"Total number of {resource_to_delete} pods is not matching with "
            f"initial value. Total number of pods before deleting a pod: "
            f"{num_of_resource_to_delete}. Total number of pods present now: "
            f"{final_num_resource_to_delete}"
        )

        # Check ceph status
        ceph_health_check(namespace=config.ENV_DATA["cluster_namespace"])
        log.info("Ceph cluster health is OK")

    def disruptive_base(self, interface, operation_to_disrupt,
                        resource_to_delete):
        """
        Base function for disruptive tests.
        Deletion of 'resource_to_delete' will be introduced while
        'operation_to_disrupt' is progressing.
        """
        pod_functions = {
            "mds": partial(get_mds_pods),
            "mon": partial(get_mon_pods),
            "mgr": partial(get_mgr_pods),
            "osd": partial(get_osd_pods),
            "rbdplugin": partial(get_plugin_pods, interface=interface),
            "cephfsplugin": partial(get_plugin_pods, interface=interface),
            "cephfsplugin_provisioner":
            partial(get_cephfsplugin_provisioner_pods),
            "rbdplugin_provisioner": partial(get_rbdfsplugin_provisioner_pods),
            "operator": partial(get_operator_pods),
        }
        disruption = disruption_helpers.Disruptions()
        disruption.set_resource(resource=resource_to_delete)
        executor = ThreadPoolExecutor(max_workers=1)

        # Get number of pods of type 'resource_to_delete'
        num_of_resource_to_delete = len(pod_functions[resource_to_delete]())

        # Fetch the number of Pods and PVCs
        initial_num_of_pods = len(get_all_pods(namespace=self.namespace))
        initial_num_of_pvc = len(
            get_all_pvcs(namespace=self.namespace)["items"])

        # Fetch PV names
        pv_objs = []
        for pvc_obj in self.pvc_objs:
            pvc_obj.reload()
            pv_objs.append(pvc_obj.backed_pv_obj)

        # Fetch volume details from pods for the purpose of verification
        node_pv_dict = {}
        for pod_obj in self.pod_objs:
            pod_info = pod_obj.get()
            node = pod_info["spec"]["nodeName"]
            pvc = pod_info["spec"]["volumes"][0]["persistentVolumeClaim"][
                "claimName"]
            for pvc_obj in self.pvc_objs:
                if pvc_obj.name == pvc:
                    pvc_obj.reload()
                    pv = pvc_obj.backed_pv
                    break
            if node in node_pv_dict:
                node_pv_dict[node].append(pv)
            else:
                node_pv_dict[node] = [pv]

        # Do setup for running IO on pods
        log.info("Setting up pods for running IO")
        for pod_obj in self.pod_objs:
            pvc_info = pod_obj.pvc.get()
            if pvc_info["spec"]["volumeMode"] == "Block":
                pod_obj.pvc.storage_type = "block"
            else:
                pod_obj.pvc.storage_type = "fs"
            pod_obj.workload_setup(storage_type=pod_obj.pvc.storage_type)
        log.info("Setup for running IO is completed on pods")

        # Start IO on each pod. RWX PVC will be used on two pods. So split the
        # size accordingly
        log.info("Starting IO on pods")
        for pod_obj in self.pod_objs:
            if pod_obj.pvc.access_mode == constants.ACCESS_MODE_RWX:
                io_size = int((self.pvc_size - 1) / 2)
            else:
                io_size = self.pvc_size - 1
            pod_obj.run_io(
                storage_type=pod_obj.pvc.storage_type,
                size=f"{io_size}G",
                fio_filename=f"{pod_obj.name}_io",
                end_fsync=1,
            )
        log.info("IO started on all pods.")

        # Start deleting pods
        pod_bulk_delete = executor.submit(delete_pods,
                                          self.pod_objs,
                                          wait=False)

        if operation_to_disrupt == "delete_pods":
            ret = wait_for_resource_count_change(
                get_all_pods,
                initial_num_of_pods,
                self.namespace,
                "decrease",
                timeout=50,
            )
            assert ret, "Wait timeout: Pods are not being deleted."
            log.info("Pods deletion has started.")
            disruption.delete_resource()

        pod_bulk_delete.result()

        # Verify pods are deleted
        for pod_obj in self.pod_objs:
            assert pod_obj.ocp.wait_for_delete(
                pod_obj.name, 180), f"Pod {pod_obj.name} is not deleted"
        log.info("Verified: Pods are deleted.")

        # Verify that the mount point is removed from nodes after deleting pod
        for node, pvs in node_pv_dict.items():
            cmd = f"oc debug nodes/{node} -- df"
            df_on_node = run_cmd(cmd)
            for pv in pvs:
                assert pv not in df_on_node, (
                    f"{pv} is still present on node {node} after "
                    f"deleting the pods.")
        log.info(
            "Verified: mount points are removed from nodes after deleting "
            "the pods")

        # Fetch image uuid associated with PVCs
        pvc_uuid_map = {}
        for pvc_obj in self.pvc_objs:
            pvc_uuid_map[pvc_obj.name] = pvc_obj.image_uuid
        log.info("Fetched image uuid associated with each PVC")

        # Start deleting PVCs
        pvc_bulk_delete = executor.submit(delete_pvcs, self.pvc_objs)

        if operation_to_disrupt == "delete_pvcs":
            ret = wait_for_resource_count_change(get_all_pvcs,
                                                 initial_num_of_pvc,
                                                 self.namespace,
                                                 "decrease",
                                                 timeout=50)
            assert ret, "Wait timeout: PVCs are not being deleted."
            log.info("PVCs deletion has started.")
            disruption.delete_resource()

        pvcs_deleted = pvc_bulk_delete.result()

        assert pvcs_deleted, "Deletion of PVCs failed."

        # Verify PVCs are deleted
        for pvc_obj in self.pvc_objs:
            assert pvc_obj.ocp.wait_for_delete(
                pvc_obj.name), f"PVC {pvc_obj.name} is not deleted"
        log.info("Verified: PVCs are deleted.")

        # Verify PVs are deleted
        for pv_obj in pv_objs:
            assert pv_obj.ocp.wait_for_delete(
                pv_obj.name, 120), f"PV {pv_obj.name} is not deleted"
        log.info("Verified: PVs are deleted.")

        # Verify PV using ceph toolbox. Image/Subvolume should be deleted.
        pool_name = default_ceph_block_pool()
        for pvc_name, uuid in pvc_uuid_map.items():
            if interface == constants.CEPHBLOCKPOOL:
                ret = verify_volume_deleted_in_backend(interface=interface,
                                                       image_uuid=uuid,
                                                       pool_name=pool_name)
            if interface == constants.CEPHFILESYSTEM:
                ret = verify_volume_deleted_in_backend(interface=interface,
                                                       image_uuid=uuid)
            assert ret, (f"Volume associated with PVC {pvc_name} still exists "
                         f"in backend")

        # Verify number of pods of type 'resource_to_delete'
        final_num_resource_to_delete = len(pod_functions[resource_to_delete]())
        assert final_num_resource_to_delete == num_of_resource_to_delete, (
            f"Total number of {resource_to_delete} pods is not matching with "
            f"initial value. Total number of pods before deleting a pod: "
            f"{num_of_resource_to_delete}. Total number of pods present now: "
            f"{final_num_resource_to_delete}")

        # Check ceph status
        ceph_health_check(namespace=config.ENV_DATA["cluster_namespace"])
        log.info("Ceph cluster health is OK")
Example #19
    def test_ceph_daemon_kill_during_resource_creation(
        self,
        interface,
        operation_to_disrupt,
        resource_to_delete,
        multi_pvc_factory,
        pod_factory,
    ):
        """
        Base function for ceph daemon kill disruptive tests.
        Deletion of 'resource_to_delete' daemon will be introduced while
        'operation_to_disrupt' is progressing.
        """
        disruption = disruption_helpers.Disruptions()
        pod_functions = {
            "mds": partial(pod.get_mds_pods),
            "mon": partial(pod.get_mon_pods),
            "mgr": partial(pod.get_mgr_pods),
            "osd": partial(pod.get_osd_pods),
            "rbdplugin": partial(pod.get_plugin_pods, interface=interface),
            "cephfsplugin": partial(pod.get_plugin_pods, interface=interface),
            "cephfsplugin_provisioner": partial(pod.get_cephfsplugin_provisioner_pods),
            "rbdplugin_provisioner": partial(pod.get_rbdfsplugin_provisioner_pods),
            "operator": partial(pod.get_operator_pods),
        }

        # Get number of pods of type 'resource_to_delete'
        num_of_resource_to_delete = len(pod_functions[resource_to_delete]())

        num_of_pvc = 12
        namespace = self.proj_obj.namespace

        # Fetch the number of Pods and PVCs
        initial_num_of_pods = len(pod.get_all_pods(namespace=namespace))
        initial_num_of_pvc = len(get_all_pvcs(namespace=namespace)["items"])

        executor = ThreadPoolExecutor(max_workers=(2 * num_of_pvc))

        disruption.set_resource(resource=resource_to_delete)
        disruption.select_daemon()

        access_modes = [constants.ACCESS_MODE_RWO]
        if interface == constants.CEPHFILESYSTEM:
            access_modes.append(constants.ACCESS_MODE_RWX)

        # Modify access_modes list to create rbd `block` type volume with
        # RWX access mode. RWX is not supported in non-block type rbd
        if interface == constants.CEPHBLOCKPOOL:
            access_modes.extend([
                f"{constants.ACCESS_MODE_RWO}-Block",
                f"{constants.ACCESS_MODE_RWX}-Block",
            ])

        # Start creation of PVCs
        bulk_pvc_create = executor.submit(
            multi_pvc_factory,
            interface=interface,
            project=self.proj_obj,
            size=8,
            access_modes=access_modes,
            access_modes_selection="distribute_random",
            status=constants.STATUS_BOUND,
            num_of_pvc=num_of_pvc,
            wait_each=False,
            timeout=90,
        )

        if operation_to_disrupt == "create_pvc":
            # Ensure PVCs are being created before deleting the resource
            ret = helpers.wait_for_resource_count_change(
                get_all_pvcs, initial_num_of_pvc, namespace, "increase")
            assert ret, "Wait timeout: PVCs are not being created."
            log.info("PVCs creation has started.")
            disruption.kill_daemon()

        pvc_objs = bulk_pvc_create.result()

        # Confirm that PVCs are Bound
        for pvc_obj in pvc_objs:
            helpers.wait_for_resource_state(resource=pvc_obj,
                                            state=constants.STATUS_BOUND,
                                            timeout=120)
            pvc_obj.reload()
        log.info("Verified: PVCs are Bound.")

        # Start creating pods
        bulk_pod_create = executor.submit(helpers.create_pods, pvc_objs,
                                          pod_factory, interface, 2)

        if operation_to_disrupt == "create_pod":
            # Ensure that pods are being created before deleting the resource
            ret = helpers.wait_for_resource_count_change(
                pod.get_all_pods, initial_num_of_pods, namespace, "increase")
            assert ret, "Wait timeout: Pods are not being created."
            log.info("Pods creation has started.")
            disruption.kill_daemon()

        pod_objs = bulk_pod_create.result()

        # Verify pods are Running
        for pod_obj in pod_objs:
            helpers.wait_for_resource_state(resource=pod_obj,
                                            state=constants.STATUS_RUNNING,
                                            timeout=180)
            pod_obj.reload()
        log.info("Verified: All pods are Running.")

        # Do setup on pods for running IO
        log.info("Setting up pods for running IO.")
        for pod_obj in pod_objs:
            pvc_info = pod_obj.pvc.get()
            if pvc_info["spec"]["volumeMode"] == "Block":
                storage_type = "block"
            else:
                storage_type = "fs"
            executor.submit(pod_obj.workload_setup, storage_type=storage_type)

        # Wait for setup on pods to complete
        for pod_obj in pod_objs:
            log.info(f"Waiting for IO setup to complete on pod {pod_obj.name}")
            for sample in TimeoutSampler(180, 2, getattr, pod_obj,
                                         "wl_setup_done"):
                if sample:
                    log.info(f"Setup for running IO is completed on pod "
                             f"{pod_obj.name}.")
                    break
        log.info("Setup for running IO is completed on all pods.")

        # Start IO on each pod
        for pod_obj in pod_objs:
            pvc_info = pod_obj.pvc.get()
            if pvc_info["spec"]["volumeMode"] == "Block":
                storage_type = "block"
            else:
                storage_type = "fs"
            pod_obj.run_io(
                storage_type=storage_type,
                size="2G",
                runtime=30,
                fio_filename=f"{pod_obj.name}_io_file1",
            )
        log.info("FIO started on all pods.")

        if operation_to_disrupt == "run_io":
            disruption.kill_daemon()

        log.info("Fetching FIO results.")
        for pod_obj in pod_objs:
            fio_result = pod_obj.get_fio_results()
            err_count = fio_result.get("jobs")[0].get("error")
            assert (
                err_count == 0
            ), f"FIO error on pod {pod_obj.name}. FIO result: {fio_result}"
            log.info(f"FIO is success on pod {pod_obj.name}")
        log.info("Verified FIO result on pods.")

        # Delete pods
        for pod_obj in pod_objs:
            pod_obj.delete(wait=True)
        for pod_obj in pod_objs:
            pod_obj.ocp.wait_for_delete(pod_obj.name)

        # Verify that PVCs are reusable by creating new pods
        pod_objs = helpers.create_pods(pvc_objs, pod_factory, interface, 2)

        # Verify new pods are Running
        for pod_obj in pod_objs:
            helpers.wait_for_resource_state(resource=pod_obj,
                                            state=constants.STATUS_RUNNING)
            pod_obj.reload()
        log.info("Verified: All new pods are Running.")

        # Run IO on each of the new pods
        for pod_obj in pod_objs:
            pvc_info = pod_obj.pvc.get()
            if pvc_info["spec"]["volumeMode"] == "Block":
                storage_type = "block"
            else:
                storage_type = "fs"
            pod_obj.run_io(
                storage_type=storage_type,
                size="1G",
                runtime=10,
                fio_filename=f"{pod_obj.name}_io_file2",
            )

        log.info("Fetching FIO results from new pods")
        for pod_obj in pod_objs:
            fio_result = pod_obj.get_fio_results()
            err_count = fio_result.get("jobs")[0].get("error")
            assert (
                err_count == 0
            ), f"FIO error on pod {pod_obj.name}. FIO result: {fio_result}"
            log.info(f"FIO is success on pod {pod_obj.name}")
        log.info("Verified FIO result on new pods.")

        # Verify number of pods of type 'resource_to_delete'
        final_num_resource_to_delete = len(pod_functions[resource_to_delete]())
        assert final_num_resource_to_delete == num_of_resource_to_delete, (
            f"Total number of {resource_to_delete} pods is not matching with "
            f"initial value. Total number of pods before deleting a pod: "
            f"{num_of_resource_to_delete}. Total number of pods present now: "
            f"{final_num_resource_to_delete}")

        # Check ceph status
        ceph_health_check(namespace=config.ENV_DATA["cluster_namespace"])
        log.info("Ceph cluster health is OK")
Example #20
import logging
from concurrent.futures import ThreadPoolExecutor
import pytest
from functools import partial

from ocs_ci.framework.testlib import ManageTest, tier4, tier4a, ignore_leftover_label
from ocs_ci.framework import config
from ocs_ci.ocs import constants
from ocs_ci.ocs.resources.pvc import get_all_pvcs
from ocs_ci.ocs.resources import pod
from ocs_ci.utility.utils import TimeoutSampler, ceph_health_check
from ocs_ci.helpers import helpers, disruption_helpers

logger = logging.getLogger(__name__)

DISRUPTION_OPS = disruption_helpers.Disruptions()


@tier4
@tier4a
@ignore_leftover_label(constants.drain_canary_pod_label)
@pytest.mark.parametrize(
    argnames=["interface", "operation_to_disrupt", "resource_to_delete"],
    argvalues=[
        pytest.param(
            *[constants.CEPHBLOCKPOOL, "create_pvc", "mgr"],
            marks=pytest.mark.polarion_id("OCS-568"),
        ),
        pytest.param(
            *[constants.CEPHBLOCKPOOL, "create_pod", "mgr"],
            marks=pytest.mark.polarion_id("OCS-569"),
Example #21
    def test_resource_deletion_during_pvc_clone(self, pvc_clone_factory,
                                                pod_factory):
        """
        Verify that PVC clone will succeed if rook-ceph and csi pods are re-spun
        while creating the clone

        """
        pods_to_delete = [
            "rbdplugin_provisioner",
            "cephfsplugin_provisioner",
            "cephfsplugin",
            "rbdplugin",
            "osd",
            "mgr",
        ]
        executor = ThreadPoolExecutor(max_workers=len(self.pvcs) +
                                      len(pods_to_delete))
        disruption_ops = [
            disruption_helpers.Disruptions() for _ in pods_to_delete
        ]
        file_name = "file_clone"

        # Run IO
        log.info("Running fio on all pods to create a file")
        for pod_obj in self.pods:
            storage_type = ("block" if
                            (pod_obj.pvc.volume_mode
                             == constants.VOLUME_MODE_BLOCK) else "fs")
            pod_obj.run_io(
                storage_type=storage_type,
                size="1G",
                runtime=30,
                fio_filename=file_name,
                end_fsync=1,
            )

        log.info("Wait for IO to complete on pods")
        for pod_obj in self.pods:
            pod_obj.get_fio_results()
            log.info(f"Verified IO on pod {pod_obj.name}")
            # Calculate md5sum
            file_name_pod = (file_name if
                             (pod_obj.pvc.volume_mode
                              == constants.VOLUME_MODE_FILESYSTEM) else
                             pod_obj.get_storage_path(storage_type="block"))
            pod_obj.pvc.md5sum = cal_md5sum(
                pod_obj,
                file_name_pod,
                pod_obj.pvc.volume_mode == constants.VOLUME_MODE_BLOCK,
            )
            log.info(f"md5sum obtained from pod {pod_obj.name}")
        log.info("IO is successful on all pods")

        # Select the pods to be deleted
        for disruption, pod_type in zip(disruption_ops, pods_to_delete):
            cluster_index = None
            # 'provider_index' will not be None if the platform is Managed Services
            if self.provider_index is not None:
                if pod_type in ["osd", "mgr"]:
                    cluster_index = self.provider_index
                    config.switch_to_provider()
                else:
                    cluster_index = self.consumer_index
                    config.switch_ctx(cluster_index)

            disruption.set_resource(resource=pod_type,
                                    cluster_index=cluster_index)

        # Switch cluster context if the platform is Managed Services
        # ('provider_index' is not None in that case)
        if self.provider_index is not None:
            config.switch_ctx(self.consumer_index)

        # Clone PVCs
        log.info("Start creating clone of PVCs")
        for pvc_obj in self.pvcs:
            log.info(f"Creating clone of PVC {pvc_obj.name}")
            pvc_obj.clone_proc = executor.submit(
                pvc_clone_factory,
                pvc_obj=pvc_obj,
                status="",
                access_mode=pvc_obj.get_pvc_access_mode,
                volume_mode=pvc_obj.volume_mode,
            )
        log.info("Started creating clone")

        # Delete the pods 'pods_to_delete'
        log.info(f"Deleting pods {pods_to_delete}")
        for disruption in disruption_ops:
            disruption.delete_proc = executor.submit(
                disruption.delete_resource)

        # Wait for delete and recovery
        [disruption.delete_proc.result() for disruption in disruption_ops]

        # Get cloned PVCs
        clone_pvc_objs = []
        for pvc_obj in self.pvcs:
            clone_obj = pvc_obj.clone_proc.result()
            clone_pvc_objs.append(clone_obj)
            log.info(f"Created clone {clone_obj.name} of PVC {pvc_obj.name}")
        log.info("Created clone of all PVCs")

        # Confirm that the cloned PVCs are Bound
        log.info("Verifying the cloned PVCs are Bound")
        for pvc_obj in clone_pvc_objs:
            wait_for_resource_state(resource=pvc_obj,
                                    state=constants.STATUS_BOUND,
                                    timeout=300)
            pvc_obj.reload()
            pvc_obj.volume_mode = pvc_obj.data["spec"]["volumeMode"]
        log.info("Verified: Cloned PVCs are Bound.")

        clone_pod_objs = []

        # Attach the cloned PVCs to pods
        log.info("Attach the cloned PVCs to pods")
        for pvc_obj in clone_pvc_objs:
            if pvc_obj.volume_mode == constants.VOLUME_MODE_BLOCK:
                pod_dict_path = constants.CSI_RBD_RAW_BLOCK_POD_YAML
            else:
                pod_dict_path = ""
            restore_pod_obj = pod_factory(
                interface=pvc_obj.interface,
                pvc=pvc_obj,
                status="",
                pod_dict_path=pod_dict_path,
                raw_block_pv=pvc_obj.volume_mode ==
                constants.VOLUME_MODE_BLOCK,
            )
            clone_pod_objs.append(restore_pod_obj)

        # Verify the new pods are running
        log.info("Verify the new pods are running")
        for pod_obj in clone_pod_objs:
            wait_for_resource_state(pod_obj, constants.STATUS_RUNNING)
        log.info("Verified: New pods are running")

        # Verify md5sum
        log.info("Verify md5sum")
        for pod_obj in clone_pod_objs:
            file_name_pod = (file_name if
                             (pod_obj.pvc.volume_mode
                              == constants.VOLUME_MODE_FILESYSTEM) else
                             pod_obj.get_storage_path(storage_type="block"))
            verify_data_integrity(
                pod_obj,
                file_name_pod,
                pod_obj.pvc.parent.md5sum,
                pod_obj.pvc.volume_mode == constants.VOLUME_MODE_BLOCK,
            )
            log.info(
                f"Verified: md5sum of {file_name_pod} on pod {pod_obj.name} "
                f"matches with the original md5sum")
        log.info("Data integrity check passed on all pods")

        # Run IO
        log.info("Running IO on new pods")
        for pod_obj in clone_pod_objs:
            storage_type = ("block" if
                            (pod_obj.pvc.volume_mode
                             == constants.VOLUME_MODE_BLOCK) else "fs")
            pod_obj.run_io(
                storage_type=storage_type,
                size="1G",
                runtime=20,
                fio_filename=file_name,
                end_fsync=1,
            )

        log.info("Wait for IO to complete on new pods")
        for pod_obj in clone_pod_objs:
            pod_obj.get_fio_results()
            log.info(f"Verified IO on new pod {pod_obj.name}")
        log.info("IO to completed on new pods")
    def test_resource_deletion_during_snapshot_restore(
            self, snapshot_factory, snapshot_restore_factory, pod_factory):
        """
        Verify that PVC snapshot and restore will succeed if rook-ceph and
        csi pods are re-spun while creating the snapshot and while creating
        the restore PVC

        """
        pods_to_delete = [
            "rbdplugin_provisioner",
            "cephfsplugin_provisioner",
            "cephfsplugin",
            "rbdplugin",
            "osd",
            "mgr",
        ]
        executor = ThreadPoolExecutor(max_workers=len(self.pvcs) +
                                      len(pods_to_delete))
        disruption_ops = [
            disruption_helpers.Disruptions() for _ in pods_to_delete
        ]
        file_name = "file_snap"

        # Run IO
        log.info("Running fio on all pods to create a file")
        for pod_obj in self.pods:
            storage_type = ("block" if
                            (pod_obj.pvc.volume_mode
                             == constants.VOLUME_MODE_BLOCK) else "fs")
            pod_obj.run_io(
                storage_type=storage_type,
                size="1G",
                runtime=30,
                fio_filename=file_name,
                end_fsync=1,
            )

        log.info("Wait for IO to complete on pods")
        for pod_obj in self.pods:
            pod_obj.get_fio_results()
            log.info(f"Verified IO on pod {pod_obj.name}")
            # Calculate md5sum
            file_name_pod = (file_name if
                             (pod_obj.pvc.volume_mode
                              == constants.VOLUME_MODE_FILESYSTEM) else
                             pod_obj.get_storage_path(storage_type="block"))
            pod_obj.pvc.md5sum = cal_md5sum(
                pod_obj,
                file_name_pod,
                pod_obj.pvc.volume_mode == constants.VOLUME_MODE_BLOCK,
            )
            log.info(f"md5sum obtained from pod {pod_obj.name}")
        log.info("IO is successful on all pods")

        # Select the pods to be deleted
        for disruption, pod_type in zip(disruption_ops, pods_to_delete):
            # Select snapshotter leader if the pod is provisioner pod
            disruption.set_resource(
                resource=pod_type,
                leader_type="snapshotter" if "provisioner" in pod_type else "",
            )

        log.info("Start taking snapshot of all PVCs.")
        for pvc_obj in self.pvcs:
            log.info(f"Taking snapshot of PVC {pvc_obj.name}")
            pvc_obj.snap_proc = executor.submit(snapshot_factory,
                                                pvc_obj,
                                                wait=False)
        log.info("Started taking snapshot of all PVCs.")

        # Delete the pods 'pods_to_delete'
        log.info(f"Deleting pods {pods_to_delete}")
        for disruption in disruption_ops:
            disruption.delete_proc = executor.submit(
                disruption.delete_resource)

        # Wait for delete and recovery
        [disruption.delete_proc.result() for disruption in disruption_ops]

        # Get snapshots
        snap_objs = []
        for pvc_obj in self.pvcs:
            snap_obj = pvc_obj.snap_proc.result()
            snap_obj.md5sum = pvc_obj.md5sum
            snap_objs.append(snap_obj)

        # Wait for snapshots to be Ready
        log.info("Waiting for all snapshots to be Ready")
        for snap_obj in snap_objs:
            snap_obj.ocp.wait_for_resource(
                condition="true",
                resource_name=snap_obj.name,
                column=constants.STATUS_READYTOUSE,
                timeout=300,
            )
            log.info(f"Snapshot {snap_obj.name} is Ready")
            snap_obj.reload()
        log.info("All snapshots are Ready")

        # Select the pods to be deleted
        for disruption, pod_type in zip(disruption_ops, pods_to_delete):
            disruption.set_resource(resource=pod_type)

        restore_pvc_objs = []

        # Create PVCs out of the snapshots
        log.info("Start creating new PVCs from snapshots")
        for snap_obj in snap_objs:
            log.info(f"Creating a PVC from snapshot {snap_obj.name}")
            snap_obj.restore_proc = executor.submit(
                snapshot_restore_factory,
                snapshot_obj=snap_obj,
                size=f"{self.pvc_size}Gi",
                volume_mode=snap_obj.parent_volume_mode,
                access_mode=snap_obj.parent_access_mode,
                status="",
            )
        log.info("Started creating new PVCs from snapshots")

        # Delete the pods 'pods_to_delete'
        log.info(f"Deleting pods {pods_to_delete}")
        for disruption in disruption_ops:
            disruption.delete_proc = executor.submit(
                disruption.delete_resource)

        # Wait for delete and recovery
        [disruption.delete_proc.result() for disruption in disruption_ops]

        # Get restored PVCs
        for snap_obj in snap_objs:
            restore_pvc_obj = snap_obj.restore_proc.result()
            restore_pvc_objs.append(restore_pvc_obj)
            log.info(f"Created PVC {restore_pvc_obj.name} from snapshot "
                     f"{snap_obj.name}")
        log.info("Created new PVCs from all the snapshots")

        # Confirm that the restored PVCs are Bound
        log.info("Verifying the restored PVCs are Bound")
        for pvc_obj in restore_pvc_objs:
            wait_for_resource_state(resource=pvc_obj,
                                    state=constants.STATUS_BOUND,
                                    timeout=300)
            pvc_obj.reload()
            pvc_obj.volume_mode = pvc_obj.data["spec"]["volumeMode"]
        log.info("Verified: Restored PVCs are Bound.")

        restore_pod_objs = []

        # Attach the restored PVCs to pods
        log.info("Attach the restored PVCs to pods")
        for pvc_obj in restore_pvc_objs:
            if pvc_obj.volume_mode == constants.VOLUME_MODE_BLOCK:
                pod_dict_path = constants.CSI_RBD_RAW_BLOCK_POD_YAML
            else:
                pod_dict_path = ""
            restore_pod_obj = pod_factory(
                interface=pvc_obj.interface,
                pvc=pvc_obj,
                status="",
                pod_dict_path=pod_dict_path,
                raw_block_pv=pvc_obj.volume_mode ==
                constants.VOLUME_MODE_BLOCK,
            )
            restore_pod_objs.append(restore_pod_obj)
        log.info("Attach the restored PVCs to pods")

        # Verify the new pods are running
        log.info("Verify the new pods are running")
        for pod_obj in restore_pod_objs:
            wait_for_resource_state(pod_obj, constants.STATUS_RUNNING)
        log.info("Verified: New pods are running")

        # Verify md5sum
        log.info("Verify md5sum")
        for pod_obj in restore_pod_objs:
            file_name_pod = (file_name if
                             (pod_obj.pvc.volume_mode
                              == constants.VOLUME_MODE_FILESYSTEM) else
                             pod_obj.get_storage_path(storage_type="block"))
            verify_data_integrity(
                pod_obj,
                file_name_pod,
                pod_obj.pvc.snapshot.md5sum,
                pod_obj.pvc.volume_mode == constants.VOLUME_MODE_BLOCK,
            )
            log.info(
                f"Verified: md5sum of {file_name_pod} on pod {pod_obj.name} "
                f"matches with the original md5sum")
        log.info("Data integrity check passed on all pods")

        # Run IO
        log.info("Running IO on new pods")
        for pod_obj in restore_pod_objs:
            storage_type = ("block" if
                            (pod_obj.pvc.volume_mode
                             == constants.VOLUME_MODE_BLOCK) else "fs")
            pod_obj.run_io(
                storage_type=storage_type,
                size="1G",
                runtime=20,
                fio_filename=file_name,
                end_fsync=1,
            )

        log.info("Wait for IO to complete on new pods")
        for pod_obj in restore_pod_objs:
            pod_obj.get_fio_results()
            log.info(f"Verified IO on new pod {pod_obj.name}")
        log.info("IO to completed on new pods")
Example #23
    def test_pod_disruptions(self, create_pvcs_and_pods):
        """
        Test to perform pod disruption in consumer and provider cluster

        """
        # List of pods to be disrupted. Separate lists are used for the
        # consumer and provider clusters to keep the implementation simple
        pods_on_consumer = [
            "alertmanager_managed_ocs_alertmanager",
            "ocs_osd_controller_manager",
            "prometheus_managed_ocs_prometheus",
            "prometheus_operator",
            "ocs_operator",
        ]
        pods_on_provider = [
            "alertmanager_managed_ocs_alertmanager",
            "ocs_osd_controller_manager",
            "prometheus_managed_ocs_prometheus",
            "prometheus_operator",
            "ocs_provider_server",
            "ocs_operator",
        ]
        disruption_on_consumer = []
        disruption_on_provider = []

        # Start I/O
        log.info("Starting fio on all pods")
        for pod_obj in self.io_pods:
            if pod_obj.pvc.volume_mode == constants.VOLUME_MODE_BLOCK:
                storage_type = "block"
                direct = 1
            else:
                storage_type = "fs"
                direct = 0
            pod_obj.run_io(
                storage_type=storage_type,
                size="10G",
                fio_filename=f"{pod_obj.name}",
                runtime=320,
                end_fsync=1,
                direct=direct,
                invalidate=0,
                fio_installed=True,
            )

        consumer_index_iter = cycle(self.consumer_indexes)

        # Create Disruptions instance for each pod to be disrupted on consumer
        for pod_type in pods_on_consumer:
            consumer_index = next(consumer_index_iter)
            config.switch_ctx(consumer_index)
            disruption_obj = disruption_helpers.Disruptions()
            # Select each pod to be disrupted from different consumers
            disruption_obj.set_resource(resource=pod_type,
                                        cluster_index=consumer_index)
            disruption_obj.index_of_consumer = consumer_index
            disruption_on_consumer.append(disruption_obj)

        # Create Disruptions instance for each pod to be disrupted on provider
        config.switch_to_provider()
        for pod_type in pods_on_provider:
            disruption_obj = disruption_helpers.Disruptions()
            disruption_obj.set_resource(
                resource=pod_type, cluster_index=self.provider_cluster_index)
            disruption_on_provider.append(disruption_obj)

        # Delete pods on consumer one at a time
        log.info("Starting pod disruptions on consumer clusters")
        for disruptions_obj in disruption_on_consumer:
            disruptions_obj.delete_resource()
            # ocs-operator respin will trigger rook-ceph-tools pod respin.
            # Patch rook-ceph-tools pod to run ceph commands.
            if disruptions_obj.resource == "ocs_operator":
                config.switch_ctx(disruptions_obj.index_of_consumer)
                patch_consumer_toolbox()

        # Delete pods on provider one at a time
        log.info("Starting pod disruptions on provider cluster")
        for disruptions_obj in disruption_on_provider:
            disruptions_obj.delete_resource()

        log.info("Wait for IO to complete on pods")
        for pod_obj in self.io_pods:
            pod_obj.get_fio_results()
            log.info(f"Verified IO on pod {pod_obj.name}")
        log.info("IO is successful on all pods")

        # Performs different checks in the clusters
        for cluster_index in [self.provider_cluster_index
                              ] + self.consumer_indexes:
            config.switch_ctx(cluster_index)

            # Verify managedocs components are Ready
            log.info("Verifying managedocs components state")
            managedocs_obj = OCP(
                kind="managedocs",
                resource_name="managedocs",
                namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
            )
            for component in {"alertmanager", "prometheus", "storageCluster"}:
                assert (
                    managedocs_obj.get()["status"]["components"][component]
                    ["state"] == "Ready"
                ), f"{component} status is {managedocs_obj.get()['status']['components'][component]['state']}"

            # Verify storagecluster status
            log.info("Verifying storagecluster status")
            verify_storage_cluster()

            # Verify CSV status
            for managed_csv in {
                    constants.OCS_CSV_PREFIX,
                    constants.OSD_DEPLOYER,
                    constants.OSE_PROMETHEUS_OPERATOR,
            }:
                csvs = csv.get_csvs_start_with_prefix(
                    managed_csv, constants.OPENSHIFT_STORAGE_NAMESPACE)
                assert (
                    len(csvs) == 1
                ), f"Unexpected number of CSVs with {managed_csv} prefix: {len(csvs)}"
                csv_name = csvs[0]["metadata"]["name"]
                csv_obj = csv.CSV(
                    resource_name=csv_name,
                    namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
                )
                log.info(f"Check if {csv_name} is in Succeeded phase.")
                csv_obj.wait_for_phase(phase="Succeeded", timeout=600)

            # Verify the phase of ceph cluster
            log.info("Verify the phase of ceph cluster")
            cephcluster = OCP(kind="CephCluster",
                              namespace=constants.OPENSHIFT_STORAGE_NAMESPACE)
            cephcluster_yaml = cephcluster.get().get("items")[0]
            expected_phase = "Connected"
            if cluster_index == self.provider_cluster_index:
                expected_phase = "Ready"
            assert (
                cephcluster_yaml["status"]["phase"] == expected_phase
            ), f"Status of cephcluster {cephcluster_yaml['metadata']['name']} is {cephcluster_yaml['status']['phase']}"

        # Create PVC and pods on all consumer clusters
        log.info("Creating new PVCs and pods")
        pods = list()
        for cluster_index in self.consumer_indexes:
            config.switch_ctx(cluster_index)
            consumer_cluster_kubeconfig = os.path.join(
                config.clusters[cluster_index].ENV_DATA["cluster_path"],
                config.clusters[cluster_index].RUN.get("kubeconfig_location"),
            )
            pvcs, io_pods = create_pvcs_and_pods(
                pvc_size=self.pvc_size,
                replica_count=1,
                pod_dict_path=constants.PERF_POD_YAML,
            )
            for pvc_obj in pvcs:
                pvc_obj.ocp.cluster_kubeconfig = consumer_cluster_kubeconfig
            for io_pod in io_pods:
                io_pod.ocp.cluster_kubeconfig = consumer_cluster_kubeconfig
            pvcs[0].project.cluster_kubeconfig = consumer_cluster_kubeconfig
            pods.extend(io_pods)

        # Run I/O on new pods
        log.info("Running I/O on new pods")
        for pod_obj in pods:
            if pod_obj.pvc.volume_mode == constants.VOLUME_MODE_BLOCK:
                storage_type = "block"
                direct = 1
            else:
                storage_type = "fs"
                direct = 0
            pod_obj.run_io(
                storage_type=storage_type,
                size="10G",
                fio_filename=f"{pod_obj.name}",
                runtime=320,
                end_fsync=1,
                direct=direct,
                invalidate=0,
                fio_installed=True,
            )

        log.info("Wait for I/O to complete on new pods")
        for pod_obj in pods:
            pod_obj.get_fio_results()
            log.info(f"Verified IO on the new pod {pod_obj.name}")
        log.info("IO is successful on new pods")
Example #24
    def test_daemon_kill_during_pvc_pod_creation_deletion_and_io(
        self, setup_base, multi_pvc_factory, pod_factory
    ):
        """
        Kill ceph daemons while PVC creation, PVC deletion, pod creation,
        pod deletion and IO are in progress
        """
        daemons_to_kill = [
            "mgr",
            "mon",
            "osd",
            "mds",
        ]

        (
            pvc_objs,
            pod_objs,
            rwx_pod_objs,
            cephfs_pvc_for_pods,
            rbd_pvc_for_pods,
        ) = setup_base

        num_of_pods_to_delete = 3
        num_of_io_pods = 1
        num_pvc_create_during_disruption = len(
            self.access_modes_cephfs + self.access_modes_rbd
        )

        # Select pods to be deleted
        pods_to_delete = pod_objs[:num_of_pods_to_delete]
        pods_to_delete.extend(
            [
                pod
                for pod in rwx_pod_objs
                for pod_obj in pods_to_delete
                if (pod_obj.pvc == pod.pvc)
            ]
        )

        # Select pods to run IO
        io_pods = pod_objs[
            num_of_pods_to_delete : num_of_pods_to_delete + num_of_io_pods
        ]
        io_pods.extend(
            [
                pod
                for pod in rwx_pod_objs
                for pod_obj in io_pods
                if (pod_obj.pvc == pod.pvc)
            ]
        )

        # Select pods which are having PVCs to delete
        pods_for_pvc = pod_objs[num_of_pods_to_delete + num_of_io_pods :]
        pvcs_to_delete = [pod_obj.pvc for pod_obj in pods_for_pvc]
        pods_for_pvc.extend(
            [
                pod
                for pod in rwx_pod_objs
                for pod_obj in pods_for_pvc
                if (pod_obj.pvc == pod.pvc)
            ]
        )

        io_pods = [
            pod_obj
            for pod_obj in io_pods
            if pod_obj.pvc in select_unique_pvcs([pod_obj.pvc for pod_obj in io_pods])
        ]

        log.info(
            f"{len(pods_to_delete)} pods selected for deletion in which "
            f"{len(pods_to_delete) - num_of_pods_to_delete} pairs of pod "
            f"share same RWX PVC"
        )
        log.info(
            f"{len(io_pods)} pods selected for running IO in which one "
            f"pair of pod share same RWX PVC"
        )
        no_of_rwx_pvcs_delete = len(pods_for_pvc) - len(pvcs_to_delete)
        log.info(
            f"{len(pvcs_to_delete)} PVCs selected for deletion. "
            f"RWO PVCs: {len(pvcs_to_delete) - no_of_rwx_pvcs_delete}, "
            f"RWX PVCs: {no_of_rwx_pvcs_delete}"
        )

        pod_functions = {
            "mds": partial(get_mds_pods),
            "mon": partial(get_mon_pods),
            "mgr": partial(get_mgr_pods),
            "osd": partial(get_osd_pods),
        }

        # Disruption object for each daemon type
        disruption_ops = [disruption_helpers.Disruptions() for _ in daemons_to_kill]

        # Select the resource of each type
        for disruption, pod_type in zip(disruption_ops, daemons_to_kill):
            disruption.set_resource(resource=pod_type)
        executor = ThreadPoolExecutor(
            max_workers=len(pod_objs)
            + len(rwx_pod_objs)
            + len(rbd_pvc_for_pods)
            + len(cephfs_pvc_for_pods)
            + len(daemons_to_kill)
            + num_pvc_create_during_disruption
        )

        # Get number of pods of the type given in daemons_to_kill list
        num_of_resource_pods = [
            len(pod_functions[resource_name]()) for resource_name in daemons_to_kill
        ]

        # Fetch PV names to verify after deletion
        pv_objs = []
        for pvc_obj in pvcs_to_delete:
            pv_objs.append(pvc_obj.backed_pv_obj)

        # Fetch volume details from pods for the purpose of verification
        node_pv_dict = {}
        for pod_obj in pods_to_delete:
            pod_info = pod_obj.get()
            node = pod_info["spec"]["nodeName"]
            pvc = pod_info["spec"]["volumes"][0]["persistentVolumeClaim"]["claimName"]
            for pvc_obj in pvc_objs:
                if pvc_obj.name == pvc:
                    pv = pvc_obj.backed_pv
                    break
            if node in node_pv_dict:
                node_pv_dict[node].append(pv)
            else:
                node_pv_dict[node] = [pv]

        # Fetch image uuid associated with PVCs to be deleted
        pvc_uuid_map = {}
        for pvc_obj in pvcs_to_delete:
            pvc_uuid_map[pvc_obj] = pvc_obj.image_uuid
        log.info("Fetched image uuid associated with each PVC")

        # Do setup on pods for running IO
        log.info("Setting up pods for running IO.")
        for pod_obj in pod_objs + rwx_pod_objs:
            if pod_obj.pvc.get_pvc_vol_mode == "Block":
                storage_type = "block"
            else:
                storage_type = "fs"
            executor.submit(pod_obj.workload_setup, storage_type=storage_type)

        # Wait for setup on pods to complete
        for pod_obj in pod_objs + rwx_pod_objs:
            log.info(f"Waiting for IO setup to complete on pod {pod_obj.name}")
            for sample in TimeoutSampler(360, 2, getattr, pod_obj, "wl_setup_done"):
                if sample:
                    log.info(
                        f"Setup for running IO is completed on pod {pod_obj.name}."
                    )
                    break
        log.info("Setup for running IO is completed on all pods.")

        # Start IO on pods having PVCs to delete to load data
        pods_for_pvc_io = [
            pod_obj
            for pod_obj in pods_for_pvc
            if pod_obj.pvc
            in select_unique_pvcs([pod_obj.pvc for pod_obj in pods_for_pvc])
        ]
        log.info("Starting IO on pods having PVCs to delete.")
        self.run_io_on_pods(pods_for_pvc_io)
        log.info("IO started on pods having PVCs to delete.")

        log.info("Fetching IO results from the pods having PVCs to delete.")
        for pod_obj in pods_for_pvc_io:
            get_fio_rw_iops(pod_obj)
        log.info("Verified IO result on pods having PVCs to delete.")

        # Delete pods having PVCs to delete.
        assert self.delete_pods(
            pods_for_pvc
        ), "Couldn't delete pods which are having PVCs to delete."
        for pod_obj in pods_for_pvc:
            pod_obj.ocp.wait_for_delete(pod_obj.name)
        log.info("Verified: Deleted pods which are having PVCs to delete.")

        # Select daemon of each type of resource and identify the daemons running on each node
        nodes_and_pids = {}
        for disruption in disruption_ops:
            disruption.select_daemon()
            node_name = disruption.resource_obj[0].pod_data.get("spec").get("nodeName")
            # Create node-daemons dict. The value is a string of PIDs to pass to the 'kill' command
            nodes_and_pids[
                node_name
            ] = f"{nodes_and_pids.get(node_name, '')} {disruption.daemon_pid}"

        # Start IO on pods to be deleted
        pods_to_delete_io = [
            pod_obj
            for pod_obj in pods_to_delete
            if pod_obj.pvc
            in select_unique_pvcs([pod_obj.pvc for pod_obj in pods_to_delete])
        ]
        log.info("Starting IO on selected pods to be deleted.")
        self.run_io_on_pods(pods_to_delete_io)
        log.info("IO started on selected pods to be deleted.")

        # Start creating new pods
        log.info("Start creating new pods.")
        pod_create_rbd = executor.submit(
            helpers.create_pods,
            rbd_pvc_for_pods,
            pod_factory,
            constants.CEPHBLOCKPOOL,
            2,
        )
        pod_create_cephfs = executor.submit(
            helpers.create_pods,
            cephfs_pvc_for_pods,
            pod_factory,
            constants.CEPHFILESYSTEM,
            2,
        )

        # Start creation of new CephFS PVCs.
        log.info("Start creating new CephFS PVCs.")
        pvc_create_cephfs = executor.submit(
            multi_pvc_factory,
            interface=constants.CEPHFILESYSTEM,
            project=self.project,
            storageclass=None,
            size=self.pvc_size,
            access_modes=self.access_modes_cephfs,
            access_modes_selection="distribute_random",
            status="",
            num_of_pvc=len(self.access_modes_cephfs),
            wait_each=False,
        )

        # Start creation of new RBD PVCs
        log.info("Start creating new RBD PVCs.")
        pvc_create_rbd = executor.submit(
            multi_pvc_factory,
            interface=constants.CEPHBLOCKPOOL,
            project=self.project,
            storageclass=None,
            size=self.pvc_size,
            access_modes=self.access_modes_rbd,
            access_modes_selection="distribute_random",
            status="",
            num_of_pvc=len(self.access_modes_rbd),
            wait_each=False,
        )

        # Start deleting PVCs
        pvc_bulk_delete = executor.submit(delete_pvcs, pvcs_to_delete)
        log.info("Started deleting PVCs")

        # Start deleting pods
        pod_bulk_delete = executor.submit(self.delete_pods, pods_to_delete)
        log.info("Started deleting pods")

        # Start IO on IO pods
        self.run_io_on_pods(io_pods)
        log.info("Started IO on IO pods")

        # Wait for 1 second before killing daemons to give the
        # create/delete operations time to start
        sleep(1)

        # Kill daemons
        node_and_kill_proc = {}
        log.info(f"Killing daemons of {daemons_to_kill}")
        for node_name, pids in nodes_and_pids.items():
            # Command to kill the daemon
            kill_cmd = f"oc debug node/{node_name} -- chroot /host kill -9 {pids}"
            # Create node-kill process map for verifying the result
            node_and_kill_proc[node_name] = executor.submit(run_cmd, kill_cmd)

        # Verify daemon kill process
        for node_name, daemon_kill_proc in node_and_kill_proc.items():
            # Get the type of daemons killed on the particular node
            resources = [
                disruption.resource
                for disruption in disruption_ops
                if disruption.daemon_pid in nodes_and_pids[node_name]
            ]
            # 'daemon_kill_proc' result will be an empty string if the command succeeded
            cmd_out = daemon_kill_proc.result()
            assert isinstance(cmd_out, str) and (not cmd_out), (
                f"Failed to kill {resources } daemons in the node {node_name}. "
                f"Daemon kill command output - {cmd_out}"
            )

        # Wait for new daemon to come up
        [disruption.check_new_pid() for disruption in disruption_ops]
        log.info("Verified daemons kill")

        pods_deleted = pod_bulk_delete.result()
        assert pods_deleted, "Deletion of pods failed."

        # Verify pods are deleted
        for pod_obj in pods_to_delete:
            pod_obj.ocp.wait_for_delete(pod_obj.name, 300)
        log.info("Verified: Pods are deleted.")

        # Verify that the mount point is removed from nodes after deleting pod
        node_pv_mounted = verify_pv_mounted_on_node(node_pv_dict)
        for node, pvs in node_pv_mounted.items():
            assert not pvs, (
                f"PVs {pvs} is still present on node {node} after "
                f"deleting the pods."
            )
        log.info(
            "Verified: mount points are removed from nodes after deleting the pods"
        )

        pvcs_deleted = pvc_bulk_delete.result()
        assert pvcs_deleted, "Deletion of PVCs failed."

        # Verify PVCs are deleted
        for pvc_obj in pvcs_to_delete:
            pvc_obj.ocp.wait_for_delete(pvc_obj.name)
        log.info("Verified: PVCs are deleted.")

        # Getting result of PVC creation as list of PVC objects
        log.info("Getting the result of CephFS PVC creation process")
        pvc_objs_cephfs_new = pvc_create_cephfs.result()

        log.info("Getting the result of RBD PVC creation process")
        pvc_objs_rbd_new = pvc_create_rbd.result()

        # Set interface argument for reference
        for pvc_obj in pvc_objs_cephfs_new:
            pvc_obj.interface = constants.CEPHFILESYSTEM

        # Set interface argument for reference
        for pvc_obj in pvc_objs_rbd_new:
            pvc_obj.interface = constants.CEPHBLOCKPOOL

        # Confirm PVCs are Bound
        log.info("Verifying the new CephFS and RBD PVCs are Bound")
        for pvc_obj in pvc_objs_cephfs_new + pvc_objs_rbd_new:
            helpers.wait_for_resource_state(
                resource=pvc_obj, state=constants.STATUS_BOUND, timeout=180
            )
            pvc_obj.reload()
        log.info("Verified: New CephFS and RBD PVCs are Bound.")

        # Getting result of pods creation as list of Pod objects
        log.info("Getting the result of pods creation process")
        pod_objs_rbd_new = pod_create_rbd.result()
        pod_objs_cephfs_new = pod_create_cephfs.result()

        # Verify new pods are Running
        log.info("Verifying the new pods are Running")
        for pod_obj in pod_objs_rbd_new + pod_objs_cephfs_new:
            helpers.wait_for_resource_state(
                resource=pod_obj, state=constants.STATUS_RUNNING, timeout=90
            )
            pod_obj.reload()
        log.info("Verified: All new pods are Running.")

        # Verify PVs are deleted
        for pv_obj in pv_objs:
            pv_obj.ocp.wait_for_delete(resource_name=pv_obj.name, timeout=300)
        log.info("Verified: PVs are deleted.")

        # Verify PV using ceph toolbox. Image/Subvolume should be deleted.
        pool_name = default_ceph_block_pool()
        for pvc_obj, uuid in pvc_uuid_map.items():
            if pvc_obj.interface == constants.CEPHBLOCKPOOL:
                ret = verify_volume_deleted_in_backend(
                    interface=constants.CEPHBLOCKPOOL,
                    image_uuid=uuid,
                    pool_name=pool_name,
                )
            if pvc_obj.interface == constants.CEPHFILESYSTEM:
                ret = verify_volume_deleted_in_backend(
                    interface=constants.CEPHFILESYSTEM, image_uuid=uuid
                )
            assert (
                ret
            ), f"Volume associated with PVC {pvc_obj.name} still exists in the backend"

        log.info("Fetching IO results from the pods.")
        for pod_obj in io_pods:
            fio_result = pod_obj.get_fio_results()
            err_count = fio_result.get("jobs")[0].get("error")
            assert (
                err_count == 0
            ), f"FIO error on pod {pod_obj.name}. FIO result: {fio_result}"
        log.info("Verified IO result on pods.")

        # Verify that the new PVCs are usable by creating new pods
        log.info("Verify that the new PVCs are usable by creating new pods")
        pod_objs_rbd_re = helpers.create_pods(
            pvc_objs_rbd_new, pod_factory, constants.CEPHBLOCKPOOL, 2
        )
        pod_objs_cephfs_re = helpers.create_pods(
            pvc_objs_cephfs_new, pod_factory, constants.CEPHFILESYSTEM, 2
        )

        # Verify pods are Running
        log.info("Verifying the pods are Running")
        for pod_obj in pod_objs_rbd_re + pod_objs_cephfs_re:
            helpers.wait_for_resource_state(
                resource=pod_obj, state=constants.STATUS_RUNNING, timeout=90
            )
            pod_obj.reload()
        log.info(
            "Successfully created and verified the status of the pods using the new CephFS and RBD PVCs."
        )

        new_pods = (
            pod_objs_rbd_new
            + pod_objs_cephfs_new
            + pod_objs_rbd_re
            + pod_objs_cephfs_re
        )

        # Do setup on the new pods for running IO
        log.info("Setting up the new pods for running IO.")
        for pod_obj in new_pods:
            if pod_obj.pvc.get_pvc_vol_mode == "Block":
                storage_type = "block"
            else:
                storage_type = "fs"
            executor.submit(pod_obj.workload_setup, storage_type=storage_type)

        # Wait for setup on the new pods to complete
        for pod_obj in new_pods:
            log.info(f"Waiting for IO setup to complete on pod {pod_obj.name}")
            for sample in TimeoutSampler(360, 2, getattr, pod_obj, "wl_setup_done"):
                if sample:
                    log.info(
                        f"Setup for running IO is completed on pod {pod_obj.name}."
                    )
                    break
        log.info("Setup for running IO is completed on the new pods.")

        # Start IO on the new pods
        log.info("Start IO on the new pods")
        self.run_io_on_pods(new_pods)
        log.info("Started IO on the new pods")

        log.info("Fetching IO results from the new pods.")
        for pod_obj in new_pods:
            get_fio_rw_iops(pod_obj)
        log.info("Verified IO result on the new pods.")

        # Verify number of pods of each daemon type
        final_num_resource_name = [
            len(pod_functions[resource_name]()) for resource_name in daemons_to_kill
        ]
        assert final_num_resource_name == num_of_resource_pods, (
            f"Total number of pods of each type is not matching with "
            f"initial value. Total number of pods of each type before daemon kill: "
            f"{num_of_resource_pods}. Total number of pods of each type present now: "
            f"{final_num_resource_name}"
        )

        # Check ceph status
        ceph_health_check(namespace=config.ENV_DATA["cluster_namespace"])
        log.info("Ceph cluster health is OK")