Example #1
    def test_node_maintenance(self, node_type, pvc_factory, pod_factory):
        """
        OCS-1269/OCS-1272:
        - Maintenance (mark as unschedulable and drain) 1 worker/master node
        - Check cluster functionality by creating resources
          (pools, storageclasses, PVCs, pods - both CephFS and RBD)
        - Mark the node as schedulable
        - Check cluster and Ceph health

        """
        # Get a list of 2 nodes. Pick one of them after checking
        # which one doesn't have the rook operator running on it
        typed_nodes = get_typed_nodes(node_type=node_type, num_of_nodes=2)
        typed_node_name = typed_nodes[0].name
        # Workaround for BZ 1778488 - https://github.com/red-hat-storage/ocs-ci/issues/1222
        rook_operator_pod = pod.get_operator_pods()[0]
        operator_node = pod.get_pod_node(rook_operator_pod)
        if operator_node.get().get('metadata').get('name') == typed_node_name:
            typed_node_name = typed_nodes[1].name
        # End of workaround for BZ 1778488

        # Put the node into maintenance (unschedule and drain)
        drain_nodes([typed_node_name])

        # Check basic cluster functionality by creating resources
        # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
        # run IO and delete the resources
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()

        # Mark the node back to schedulable
        schedule_nodes([typed_node_name])

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()
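For orientation, here is a minimal sketch (an assumed equivalent, not the ocs-ci implementation) of the oc commands that a drain/uncordon cycle like the one above boils down to; drain_nodes and schedule_nodes wrap similar calls.

import subprocess

def drain_node_sketch(node_name):
    # 'oc adm drain' marks the node unschedulable and evicts its pods
    subprocess.run(
        ["oc", "adm", "drain", node_name, "--ignore-daemonsets", "--force"],
        check=True,
    )

def schedule_node_sketch(node_name):
    # 'oc adm uncordon' marks the node schedulable again
    subprocess.run(["oc", "adm", "uncordon", node_name], check=True)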
Example #2
        def finalizer():
            try:
                # Validate all mon pods are running
                log.info("Validate all mons are up and running")
                POD_OBJ.wait_for_resource(
                    condition=STATUS_RUNNING,
                    selector=MON_APP_LABEL,
                    resource_count=len(mon_pod),
                )
                log.info("All mons are up and running")

            except (TimeoutExpiredError, ResourceWrongStatusException) as ex:
                log.error(f"{ex}")
                # Restart operator
                operator_pod_obj = get_operator_pods()
                delete_pods(pod_objs=operator_pod_obj)

                # Wait until the mon pods recover
                POD_OBJ.wait_for_resource(
                    condition=STATUS_RUNNING,
                    selector=MON_APP_LABEL,
                    resource_count=len(mon_pod),
                    timeout=3600,
                    sleep=5,
                )
                log.info("All mons are up and running")

                # Check that the Ceph health is OK
                ceph_health_check(tries=90, delay=15)
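A finalizer like the one above is typically registered from a pytest fixture so it runs at teardown even if the test body fails; a minimal sketch of that wiring (the fixture name is hypothetical):

import pytest

@pytest.fixture()
def validate_mons_at_teardown(request):
    def finalizer():
        ...  # the mon validation / operator restart logic shown above
    request.addfinalizer(finalizer)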
Example #3
    def set_resource(self, resource, leader_type="provisioner"):
        self.resource = resource
        if (config.ENV_DATA["platform"]
                in constants.MANAGED_SERVICE_PLATFORMS) and (resource
                                                             in CEPH_PODS):
            # If the platform is Managed Services, then the ceph pods will be present in the provider cluster.
            # Consumer cluster will be the primary cluster context in a multicluster run. Setting 'cluster_kubeconfig'
            # attribute to use as the value of the parameter '--kubeconfig' in the 'oc' commands to get ceph pods.
            provider_kubeconfig = os.path.join(
                config.clusters[
                    config.get_provider_index()].ENV_DATA["cluster_path"],
                config.clusters[config.get_provider_index()].RUN.get(
                    "kubeconfig_location"),
            )
            self.cluster_kubeconfig = provider_kubeconfig
        resource_count = 0
        if self.resource == "mgr":
            self.resource_obj = pod.get_mgr_pods()
            self.selector = constants.MGR_APP_LABEL
        if self.resource == "mon":
            self.resource_obj = pod.get_mon_pods()
            self.selector = constants.MON_APP_LABEL
        if self.resource == "osd":
            self.resource_obj = pod.get_osd_pods()
            self.selector = constants.OSD_APP_LABEL
        if self.resource == "mds":
            self.resource_obj = pod.get_mds_pods()
            self.selector = constants.MDS_APP_LABEL
        if self.resource == "cephfsplugin":
            self.resource_obj = pod.get_plugin_pods(
                interface=constants.CEPHFILESYSTEM)
            self.selector = constants.CSI_CEPHFSPLUGIN_LABEL
        if self.resource == "rbdplugin":
            self.resource_obj = pod.get_plugin_pods(
                interface=constants.CEPHBLOCKPOOL)
            self.selector = constants.CSI_RBDPLUGIN_LABEL
        if self.resource == "cephfsplugin_provisioner":
            self.resource_obj = [
                pod.get_plugin_provisioner_leader(
                    interface=constants.CEPHFILESYSTEM,
                    leader_type=leader_type)
            ]
            self.selector = constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL
            resource_count = len(pod.get_cephfsplugin_provisioner_pods())
        if self.resource == "rbdplugin_provisioner":
            self.resource_obj = [
                pod.get_plugin_provisioner_leader(
                    interface=constants.CEPHBLOCKPOOL, leader_type=leader_type)
            ]
            self.selector = constants.CSI_RBDPLUGIN_PROVISIONER_LABEL
            resource_count = len(pod.get_rbdfsplugin_provisioner_pods())
        if self.resource == "operator":
            self.resource_obj = pod.get_operator_pods()
            self.selector = constants.OPERATOR_LABEL

        self.resource_count = resource_count or len(self.resource_obj)
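The 'cluster_kubeconfig' attribute set above is meant to be passed to 'oc' via '--kubeconfig' so that Ceph pods are queried on the provider cluster; a rough sketch of such a call (the helper name and the OSD label value are assumptions):

import subprocess

def get_ceph_osd_pod_names(kubeconfig_path, namespace="openshift-storage"):
    # Query the provider cluster instead of the current (consumer) context
    result = subprocess.run(
        ["oc", "--kubeconfig", kubeconfig_path, "-n", namespace,
         "get", "pods", "-l", "app=rook-ceph-osd", "-o", "name"],
        check=True, capture_output=True, text=True,
    )
    return result.stdout.splitlines()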
Example #4
        def finalizer():
            # Use provider cluster in managed service platform
            if self.consumer_cluster_index is not None:
                config.switch_to_provider()

            # Validate all mon services are running
            if len(mon_svc_list) != len(
                    get_services_by_label(
                        label=constants.MON_APP_LABEL,
                        namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
                    )):

                # Restart the rook-operator pod
                operator_pod_obj = get_operator_pods()
                delete_pods(pod_objs=operator_pod_obj)
                POD_OBJ.wait_for_resource(
                    condition=constants.STATUS_RUNNING,
                    selector=constants.OPERATOR_LABEL,
                )

                # Wait until all mon services are up
                for svc_list in TimeoutSampler(
                        1200,
                        len(mon_svc_list),
                        get_services_by_label,
                        constants.MON_APP_LABEL,
                        constants.OPENSHIFT_STORAGE_NAMESPACE,
                ):
                    try:
                        if len(svc_list) == len(mon_svc_list):
                            log.info("All expected mon services are up")
                            break
                    except IndexError:
                        log.error(
                            f"Not all expected mon services are up; only found: {svc_list}. "
                            f"Expected: {mon_svc_list}")

                # Wait until all mon pods are running
                POD_OBJ.wait_for_resource(
                    condition=constants.STATUS_RUNNING,
                    selector=constants.MON_APP_LABEL,
                    resource_count=len(mon_pods_list),
                    timeout=600,
                    sleep=3,
                )

                # Check that the Ceph health is OK
                ceph_health_check(tries=90, delay=15)

            # Switch the context to consumer cluster if needed
            if self.consumer_cluster_index is not None:
                config.switch_to_consumer(self.consumer_cluster_index)
Example #5
File: pgsql.py  Project: ramkiperiy/ocs-ci
    def respin_pgsql_app_pod(self):
        """
        Respin the pgsql app pod

        Returns:
            pod status

        """
        app_pod_list = get_operator_pods(constants.PGSQL_APP_LABEL, BMO_NAME)
        app_pod = app_pod_list[random.randint(0, len(app_pod_list) - 1)]
        log.info(f"respin pod {app_pod.name}")
        app_pod.delete(wait=True, force=False)
        wait_for_resource_state(resource=app_pod,
                                state=constants.STATUS_RUNNING,
                                timeout=300)
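A small aside: the index-based random pick above can be written more directly with random.choice (behaviourally equivalent):

import random

app_pod = random.choice(app_pod_list)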
Example #6
    def set_resource(self, resource, leader_type="provisioner"):
        self.resource = resource
        resource_count = 0
        if self.resource == "mgr":
            self.resource_obj = pod.get_mgr_pods()
            self.selector = constants.MGR_APP_LABEL
        if self.resource == "mon":
            self.resource_obj = pod.get_mon_pods()
            self.selector = constants.MON_APP_LABEL
        if self.resource == "osd":
            self.resource_obj = pod.get_osd_pods()
            self.selector = constants.OSD_APP_LABEL
        if self.resource == "mds":
            self.resource_obj = pod.get_mds_pods()
            self.selector = constants.MDS_APP_LABEL
        if self.resource == "cephfsplugin":
            self.resource_obj = pod.get_plugin_pods(
                interface=constants.CEPHFILESYSTEM)
            self.selector = constants.CSI_CEPHFSPLUGIN_LABEL
        if self.resource == "rbdplugin":
            self.resource_obj = pod.get_plugin_pods(
                interface=constants.CEPHBLOCKPOOL)
            self.selector = constants.CSI_RBDPLUGIN_LABEL
        if self.resource == "cephfsplugin_provisioner":
            self.resource_obj = [
                pod.get_plugin_provisioner_leader(
                    interface=constants.CEPHFILESYSTEM,
                    leader_type=leader_type)
            ]
            self.selector = constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL
            resource_count = len(pod.get_cephfsplugin_provisioner_pods())
        if self.resource == "rbdplugin_provisioner":
            self.resource_obj = [
                pod.get_plugin_provisioner_leader(
                    interface=constants.CEPHBLOCKPOOL, leader_type=leader_type)
            ]
            self.selector = constants.CSI_RBDPLUGIN_PROVISIONER_LABEL
            resource_count = len(pod.get_rbdfsplugin_provisioner_pods())
        if self.resource == "operator":
            self.resource_obj = pod.get_operator_pods()
            self.selector = constants.OPERATOR_LABEL

        self.resource_count = resource_count or len(self.resource_obj)
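The chain of if-statements in these set_resource variants could also be expressed as a dispatch table; a sketch of that refactoring idea (not the project's code), reusing the pod getters and label constants from the snippet, with import paths as in ocs-ci:

from ocs_ci.ocs import constants
from ocs_ci.ocs.resources import pod

RESOURCE_DISPATCH = {
    "mgr": (pod.get_mgr_pods, constants.MGR_APP_LABEL),
    "mon": (pod.get_mon_pods, constants.MON_APP_LABEL),
    "osd": (pod.get_osd_pods, constants.OSD_APP_LABEL),
    "mds": (pod.get_mds_pods, constants.MDS_APP_LABEL),
    "operator": (pod.get_operator_pods, constants.OPERATOR_LABEL),
}

def resolve_resource(resource):
    # Returns (pod objects, label selector) for the simple resource kinds
    getter, selector = RESOURCE_DISPATCH[resource]
    return getter(), selector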
Example #7
    def test_node_maintenance_restart_activate(self, nodes, pvc_factory,
                                               pod_factory, node_type):
        """
        OCS-1292/OCS-1293:
        - Maintenance (mark as unschedulable and drain) 1 worker/master node
        - Restart the node
        - Mark the node as schedulable
        - Check cluster and Ceph health
        - Check cluster functionality by creating and deleting resources
          (pools, storageclasses, PVCs, pods - both CephFS and RBD)

        """
        # Get a list of 2 nodes. Pick one of them after checking
        # which one doesn't have the rook operator running on it
        typed_nodes = get_typed_nodes(node_type=node_type, num_of_nodes=2)
        assert typed_nodes, f"Failed to find a {node_type} node for the test"
        typed_node_name = typed_nodes[0].name

        # Workaround for BZ 1778488 - https://github.com/red-hat-storage/ocs-ci/issues/1222
        rook_operator_pod = pod.get_operator_pods()[0]
        operator_node = pod.get_pod_node(rook_operator_pod)
        if operator_node.get().get('metadata').get('name') == typed_node_name:
            typed_node_name = typed_nodes[1].name
        # End of workaround for BZ 1778488

        # Put the node into maintenance (unschedule and drain). The function logs its progress
        drain_nodes([typed_node_name])

        # Restarting the node
        nodes.restart_nodes(nodes=typed_nodes, wait=True)

        wait_for_nodes_status(node_names=[typed_node_name],
                              status=constants.NODE_READY_SCHEDULING_DISABLED)
        # Mark the node back to schedulable
        schedule_nodes([typed_node_name])

        # Check cluster and Ceph health and checking basic cluster
        # functionality by creating resources (pools, storageclasses,
        # PVCs, pods - both CephFS and RBD), run IO and delete the resources
        self.sanity_helpers.health_check()
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()
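As a manual counterpart to the wait_for_nodes_status call above: after the drain plus restart, the node is expected to report 'Ready,SchedulingDisabled' until it is uncordoned. A quick way to eyeball that (a sketch, not part of the test):

import subprocess

out = subprocess.run(
    ["oc", "get", "nodes", "--no-headers"],
    check=True, capture_output=True, text=True,
).stdout
for line in out.splitlines():
    name, status = line.split()[:2]
    print(name, status)  # e.g. 'compute-0 Ready,SchedulingDisabled'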
Example #8
    def set_resource(self, resource):
        self.resource = resource
        resource_count = 0
        if self.resource == 'mgr':
            self.resource_obj = pod.get_mgr_pods()
            self.selector = constants.MGR_APP_LABEL
        if self.resource == 'mon':
            self.resource_obj = pod.get_mon_pods()
            self.selector = constants.MON_APP_LABEL
        if self.resource == 'osd':
            self.resource_obj = pod.get_osd_pods()
            self.selector = constants.OSD_APP_LABEL
        if self.resource == 'mds':
            self.resource_obj = pod.get_mds_pods()
            self.selector = constants.MDS_APP_LABEL
        if self.resource == 'cephfsplugin':
            self.resource_obj = pod.get_plugin_pods(
                interface=constants.CEPHFILESYSTEM)
            self.selector = constants.CSI_CEPHFSPLUGIN_LABEL
        if self.resource == 'rbdplugin':
            self.resource_obj = pod.get_plugin_pods(
                interface=constants.CEPHBLOCKPOOL)
            self.selector = constants.CSI_RBDPLUGIN_LABEL
        if self.resource == 'cephfsplugin_provisioner':
            self.resource_obj = [
                pod.plugin_provisioner_leader(
                    interface=constants.CEPHFILESYSTEM)
            ]
            self.selector = constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL
            resource_count = len(pod.get_cephfsplugin_provisioner_pods())
        if self.resource == 'rbdplugin_provisioner':
            self.resource_obj = [
                pod.plugin_provisioner_leader(
                    interface=constants.CEPHBLOCKPOOL)
            ]
            self.selector = constants.CSI_RBDPLUGIN_PROVISIONER_LABEL
            resource_count = len(pod.get_rbdfsplugin_provisioner_pods())
        if self.resource == 'operator':
            self.resource_obj = pod.get_operator_pods()
            self.selector = constants.OPERATOR_LABEL

        self.resource_count = resource_count or len(self.resource_obj)
Example #9
    def test_recovery_from_volume_deletion(self, nodes, pvc_factory,
                                           pod_factory):
        """
        Test cluster recovery from disk deletion from the platform side.
        Based on documented procedure detailed in
        https://bugzilla.redhat.com/show_bug.cgi?id=1823183

        """
        logger.info("Picking a PV which to be deleted from the platform side")
        osd_pvs = get_deviceset_pvs()
        osd_pv = random.choice(osd_pvs)
        osd_pv_name = osd_pv.name
        # get the claim name
        logger.info(f"Getting the claim name for OSD PV {osd_pv_name}")
        claim_name = osd_pv.get().get("spec").get("claimRef").get("name")

        # Get the backing volume name
        logger.info(f"Getting the backing volume name for PV {osd_pv_name}")
        backing_volume = nodes.get_data_volumes(pvs=[osd_pv])[0]

        # Get the corresponding PVC
        logger.info(f"Getting the corresponding PVC of PV {osd_pv_name}")
        osd_pvcs = get_deviceset_pvcs()
        osd_pvcs_count = len(osd_pvcs)
        osd_pvc = [
            ds for ds in osd_pvcs
            if ds.get().get("metadata").get("name") == claim_name
        ][0]

        # Get the corresponding OSD pod and ID
        logger.info(f"Getting the OSD pod using PVC {osd_pvc.name}")
        osd_pods = get_osd_pods()
        osd_pods_count = len(osd_pods)
        osd_pod = [
            osd_pod for osd_pod in osd_pods
            if osd_pod.get().get("metadata").get("labels").get(
                constants.CEPH_ROOK_IO_PVC_LABEL) == claim_name
        ][0]
        logger.info(f"OSD_POD {osd_pod.name}")
        osd_id = osd_pod.get().get("metadata").get("labels").get("ceph-osd-id")

        # Get the node that the OSD pod is running on
        logger.info(
            f"Getting the node that the OSD pod {osd_pod.name} is running on")
        osd_node = get_pod_node(osd_pod)
        osd_prepare_pods = get_osd_prepare_pods()
        osd_prepare_pod = [
            pod for pod in osd_prepare_pods if pod.get().get("metadata").get(
                "labels").get(constants.CEPH_ROOK_IO_PVC_LABEL) == claim_name
        ][0]
        osd_prepare_job_name = (osd_prepare_pod.get().get("metadata").get(
            "labels").get("job-name"))
        osd_prepare_job = get_job_obj(osd_prepare_job_name)

        # Get the corresponding OSD deployment
        logger.info(f"Getting the OSD deployment for OSD PVC {claim_name}")
        osd_deployment = [
            osd_pod for osd_pod in get_osd_deployments()
            if osd_pod.get().get("metadata").get("labels").get(
                constants.CEPH_ROOK_IO_PVC_LABEL) == claim_name
        ][0]
        osd_deployment_name = osd_deployment.name

        # Delete the volume from the platform side
        logger.info(f"Deleting {backing_volume} from the platform side")
        nodes.detach_volume(backing_volume, osd_node)

        # Scale down OSD deployment
        logger.info(f"Scaling down OSD deployment {osd_deployment_name} to 0")
        ocp.OCP().exec_oc_cmd(
            f"scale --replicas=0 deployment/{osd_deployment_name}")

        # Force delete OSD pod if necessary
        osd_pod_name = osd_pod.name
        logger.info(f"Waiting for OSD pod {osd_pod.name} to get deleted")
        try:
            osd_pod.ocp.wait_for_delete(resource_name=osd_pod_name)
        except TimeoutError:
            osd_pod.delete(force=True)
            osd_pod.ocp.wait_for_delete(resource_name=osd_pod_name)

        # Run ocs-osd-removal job
        ocp_version = float(get_ocp_version())
        if ocp_version >= 4.6:
            cmd = f"process ocs-osd-removal -p FAILED_OSD_IDS={osd_id} -o yaml"
        else:
            cmd = f"process ocs-osd-removal -p FAILED_OSD_ID={osd_id} -o yaml"

        logger.info(f"Executing OSD removal job on OSD-{osd_id}")
        ocp_obj = ocp.OCP(namespace=config.ENV_DATA["cluster_namespace"])
        osd_removal_job_yaml = ocp_obj.exec_oc_cmd(cmd)
        osd_removal_job = OCS(**osd_removal_job_yaml)
        osd_removal_job.create(do_reload=False)

        # Get ocs-osd-removal pod name
        logger.info("Getting the ocs-osd-removal pod name")
        osd_removal_pod_name = get_osd_removal_pod_name(osd_id)
        osd_removal_pod_obj = get_pod_obj(osd_removal_pod_name,
                                          namespace="openshift-storage")
        osd_removal_pod_obj.ocp.wait_for_resource(
            condition=constants.STATUS_COMPLETED,
            resource_name=osd_removal_pod_name)

        # Verify OSD removal from the ocs-osd-removal pod logs
        logger.info(
            f"Verifying removal of OSD from {osd_removal_pod_name} pod logs")
        logs = get_pod_logs(osd_removal_pod_name)
        pattern = f"purged osd.{osd_id}"
        assert re.search(pattern, logs)

        osd_pvc_name = osd_pvc.name

        if ocp_version < 4.6:
            # Delete the OSD prepare job
            logger.info(f"Deleting OSD prepare job {osd_prepare_job_name}")
            osd_prepare_job.delete()
            osd_prepare_job.ocp.wait_for_delete(
                resource_name=osd_prepare_job_name, timeout=120)

            # Delete the OSD PVC
            logger.info(f"Deleting OSD PVC {osd_pvc_name}")
            osd_pvc.delete()
            osd_pvc.ocp.wait_for_delete(resource_name=osd_pvc_name)

            # Delete the OSD deployment
            logger.info(f"Deleting OSD deployment {osd_deployment_name}")
            osd_deployment.delete()
            osd_deployment.ocp.wait_for_delete(
                resource_name=osd_deployment_name, timeout=120)
        else:
            # If the OCP version is 4.6 or above, the OSD removal job should
            # delete the OSD prepare job, OSD PVC, and OSD deployment
            logger.info(
                f"Verifying deletion of OSD prepare job {osd_prepare_job_name}"
            )
            osd_prepare_job.ocp.wait_for_delete(
                resource_name=osd_prepare_job_name, timeout=30)
            logger.info(f"Verifying deletion of OSD PVC {osd_pvc_name}")
            osd_pvc.ocp.wait_for_delete(resource_name=osd_pvc_name, timeout=30)
            logger.info(
                f"Verifying deletion of OSD deployment {osd_deployment_name}")
            osd_deployment.ocp.wait_for_delete(
                resource_name=osd_deployment_name, timeout=30)

        # Delete PV
        logger.info(f"Verifying deletion of PV {osd_pv_name}")
        try:
            osd_pv.ocp.wait_for_delete(resource_name=osd_pv_name)
        except TimeoutError:
            osd_pv.delete()
            osd_pv.ocp.wait_for_delete(resource_name=osd_pv_name)

        if ocp_version < 4.6:
            # Delete the rook ceph operator pod to trigger reconciliation
            rook_operator_pod = get_operator_pods()[0]
            logger.info(
                f"deleting Rook Ceph operator pod {rook_operator_pod.name}")
            rook_operator_pod.delete()

        # Delete the OSD removal job
        logger.info(f"Deleting OSD removal job ocs-osd-removal-{osd_id}")
        osd_removal_job = get_job_obj(f"ocs-osd-removal-{osd_id}")
        osd_removal_job.delete()
        osd_removal_job.ocp.wait_for_delete(
            resource_name=f"ocs-osd-removal-{osd_id}")

        timeout = 600
        # Wait for OSD PVC to get created and reach Bound state
        logger.info(
            "Waiting for a new OSD PVC to get created and reach Bound state")
        assert osd_pvc.ocp.wait_for_resource(
            timeout=timeout,
            condition=constants.STATUS_BOUND,
            selector=constants.OSD_PVC_GENERIC_LABEL,
            resource_count=osd_pvcs_count,
        ), (f"Cluster recovery failed after {timeout} seconds. "
            f"Expected to have {osd_pvcs_count} OSD PVCs in status Bound. Current OSD PVCs status: "
            f"{[pvc.ocp.get_resource(pvc.get().get('metadata').get('name'), 'STATUS') for pvc in get_deviceset_pvcs()]}"
            )
        # Wait for OSD pod to get created and reach Running state
        logger.info(
            "Waiting for a new OSD pod to get created and reach Running state")
        assert osd_pod.ocp.wait_for_resource(
            timeout=timeout,
            condition=constants.STATUS_RUNNING,
            selector=constants.OSD_APP_LABEL,
            resource_count=osd_pods_count,
        ), (f"Cluster recovery failed after {timeout} seconds. "
            f"Expected to have {osd_pods_count} OSD pods in status Running. Current OSD pods status: "
            f"{[osd_pod.ocp.get_resource(pod.get().get('metadata').get('name'), 'STATUS') for pod in get_osd_pods()]}"
            )

        # We need to silence the old osd crash warning due to BZ https://bugzilla.redhat.com/show_bug.cgi?id=1896810
        # This is a workaround - issue for tracking: https://github.com/red-hat-storage/ocs-ci/issues/3438
        if ocp_version >= 4.6:
            silence_osd_crash = cluster.wait_for_silence_ceph_osd_crash_warning(
                osd_pod_name)
            if not silence_osd_crash:
                logger.info("Didn't find ceph osd crash warning")

        # Validate cluster is still functional
        self.sanity_helpers.health_check(tries=100)
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
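The OSD removal step in the test maps to the documented manual procedure of processing the ocs-osd-removal template and creating the resulting job; a rough CLI-equivalent sketch (values are placeholders):

import subprocess

failed_osd_id = "0"  # hypothetical failed OSD id
removal_yaml = subprocess.run(
    ["oc", "process", "ocs-osd-removal", "-n", "openshift-storage",
     "-p", f"FAILED_OSD_IDS={failed_osd_id}", "-o", "yaml"],
    check=True, capture_output=True, text=True,
).stdout
subprocess.run(
    ["oc", "create", "-f", "-"],
    input=removal_yaml, check=True, text=True,
)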
Example #10
    def test_rook_operator_restart_during_mon_failover(self,
                                                       node_drain_teardown):
        """
        Verify that the number of mon pods stays at three when a node is drained

        """
        sample = TimeoutSampler(
            timeout=100,
            sleep=10,
            func=verify_pdb_mon,
            disruptions_allowed=1,
            max_unavailable_mon=1,
        )
        assert sample.wait_for_func_status(result=True), (
            "The expected PDB state is not equal to the actual PDB state"
        )

        log.info("Get worker node name where monitoring pod run")
        mon_pod_objs = get_mon_pods()
        node_name = mon_pod_objs[0].data["spec"]["nodeName"]

        drain_nodes([node_name])

        sample = TimeoutSampler(
            timeout=100,
            sleep=10,
            func=verify_pdb_mon,
            disruptions_allowed=0,
            max_unavailable_mon=1,
        )
        assert sample.wait_for_func_status(result=True), (
            "The expected PDB state is not equal to the actual PDB state"
        )

        timeout = 1400
        log.info(f"Verify the number of mon pods is 3 for {timeout} seconds")
        sample = TimeoutSampler(timeout=timeout,
                                sleep=10,
                                func=check_number_of_mon_pods)
        assert not sample.wait_for_func_status(result=False), (
            "There are more than 3 mon pods."
        )

        log.info("Respin pod rook-ceph operator pod")
        rook_ceph_operator_pod_obj = get_operator_pods()
        rook_ceph_operator_pod_obj[0].delete()

        schedule_nodes([node_name])

        log.info("Wait for all the pods in openshift-storage to be running.")
        assert wait_for_pods_to_be_running(timeout=300)

        sample = TimeoutSampler(
            timeout=100,
            sleep=10,
            func=verify_pdb_mon,
            disruptions_allowed=1,
            max_unavailable_mon=1,
        )
        assert sample.wait_for_func_status(result=True), (
            "The expected PDB state is not equal to the actual PDB state"
        )

        ceph_health_check()

        assert check_number_of_mon_pods(), "The number of mon pods is not equal to 3"
Example #11
    def test_recovery_from_volume_deletion(self, nodes, pvc_factory, pod_factory):
        """
        Test cluster recovery from disk deletion from the platform side.
        Based on documented procedure detailed in
        https://bugzilla.redhat.com/show_bug.cgi?id=1787236#c16

        """
        logger.info("Picking a PV which will be deleted from the platform side")
        osd_pvs = get_deviceset_pvs()
        osd_pv = random.choice(osd_pvs)
        osd_pv_name = osd_pv.name
        # get the claim name
        logger.info(f"Getting the claim name for OSD PV {osd_pv_name}")
        claim_name = osd_pv.get().get('spec').get('claimRef').get('name')

        # Get the backing volume name
        logger.info(f"Getting the backing volume name for PV {osd_pv_name}")
        backing_volume = nodes.get_data_volumes(pvs=[osd_pv])[0]

        # Get the corresponding PVC
        logger.info(f"Getting the corresponding PVC of PV {osd_pv_name}")
        osd_pvcs = get_deviceset_pvcs()
        osd_pvcs_count = len(osd_pvcs)
        osd_pvc = [ds for ds in osd_pvcs if ds.get().get('metadata').get('name') == claim_name][0]

        # Get the corresponding OSD pod
        logger.info(f"Getting the corresponding OSD pod of PVC {osd_pvc.name}")
        osd_pods = get_osd_pods()
        osd_pods_count = len(osd_pods)
        osd_pod = [
            osd_pod for osd_pod in osd_pods if osd_pod.get()
            .get('metadata').get('labels').get(constants.CEPH_ROOK_IO_PVC_LABEL) == claim_name
        ][0]

        # Get the node that the OSD pod is running on
        logger.info(f"Getting the node that the OSD pod {osd_pod.name} is running on")
        osd_node = get_pod_node(osd_pod)
        volume_size = osd_pvc.size
        osd_prepare_pods = get_osd_prepare_pods()
        osd_prepare_pod = [
            pod for pod in osd_prepare_pods if pod.get().get('metadata')
            .get('labels').get(constants.CEPH_ROOK_IO_PVC_LABEL) == claim_name
        ][0]
        osd_prepare_job_name = osd_prepare_pod.get().get('metadata').get('labels').get('job-name')
        osd_prepare_job = get_job_obj(osd_prepare_job_name)

        # Get the corresponding OSD deployment
        logger.info(f"Getting the corresponding OSD deployment for OSD PVC {claim_name}")
        osd_deployment = [
            osd_pod for osd_pod in get_osd_deployments() if osd_pod.get()
            .get('metadata').get('labels').get(constants.CEPH_ROOK_IO_PVC_LABEL) == claim_name
        ][0]

        # Delete the volume from the platform side
        logger.info(f"Deleting volume {backing_volume} from the platform side")
        nodes.detach_volume(backing_volume, osd_node)

        # Delete the OSD deployment
        osd_deployment_name = osd_deployment.name
        logger.info(f"Deleting OSD deployment {osd_deployment_name}")
        osd_deployment.delete()
        osd_deployment.ocp.wait_for_delete(resource_name=osd_deployment_name, timeout=120)

        # Delete the OSD prepare job
        osd_prepare_job.delete()
        osd_prepare_job.ocp.wait_for_delete(resource_name=osd_prepare_job_name, timeout=120)

        # Delete the OSD PVC
        osd_pvc_name = osd_pvc.name
        logger.info(f"Deleting OSD PVC {osd_pvc_name}")
        osd_pvc.delete()
        osd_pvc.ocp.wait_for_delete(resource_name=osd_pvc_name)

        # Recreate a volume from the platform side
        logger.info("Creating a replacing volume from the platform side")
        nodes.create_and_attach_volume(osd_node, volume_size)

        # Delete the rook ceph operator pod to trigger reconciliation
        rook_operator_pod = get_operator_pods()[0]
        logger.info(f"deleting Rook Ceph operator pod {rook_operator_pod.name}")
        rook_operator_pod.delete()

        timeout = 600
        # Wait for OSD PVC to get created and reach Bound state
        logger.info("Waiting for a new OSD PVC to get created and reach Bound state")
        assert osd_pvc.ocp.wait_for_resource(
            timeout=timeout, condition=constants.STATUS_BOUND, selector=constants.OSD_PVC_GENERIC_LABEL,
            resource_count=osd_pvcs_count
        ), (
            f"Cluster recovery failed after {timeout} seconds. "
            f"Expected to have {osd_pvcs_count} OSD PVCs in status Bound. Current OSD PVCs status: "
            f"{[pvc.ocp.get_resource(pvc.get().get('metadata').get('name'), 'STATUS') for pvc in get_deviceset_pvcs()]}"
        )
        # Wait for OSD pod to get created and reach Running state
        logger.info("Waiting for a new OSD pod to get created and reach Running state")
        assert osd_pod.ocp.wait_for_resource(
            timeout=timeout, condition=constants.STATUS_RUNNING, selector=constants.OSD_APP_LABEL,
            resource_count=osd_pods_count
        ), (
            f"Cluster recovery failed after {timeout} seconds. "
            f"Expected to have {osd_pods_count} OSD pods in status Running. Current OSD pods status: "
            f"{[osd_pod.ocp.get_resource(pod.get().get('metadata').get('name'), 'STATUS') for pod in get_osd_pods()]}"
        )

        # Validate cluster is still functional
        self.sanity_helpers.health_check(tries=80)
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
Example #12
    def test_multiple_mon_pod_stays_on_same_node(self):
        """
        A test case to verify that multiple mon pods stay on the same node

        1. Edit the rook-ceph-mon-endpoints configmap
           say, assign mon-a to another node that would be on
           the same node as another mon (compute-1 instead of compute-0)
        2. Delete the mon-a deployment
        3. Edit the mon-b deployment to remove the required mon anti-affinity
        4. Restart the operator
        5. Edit the mon-a deployment to remove the required mon anti-affinity
        6. See mon-a start on compute-1 with mon-b
        7. Soon after, see the operator fail over one of these mons onto the
        node that doesn't currently have a mon (compute-0) and start mon-d

        """
        ocs_version = config.ENV_DATA["ocs_version"]
        # Check that we have LSO cluster and OCS version is 4.8 and below
        # This is a workaround due to issue https://github.com/red-hat-storage/ocs-ci/issues/4937
        if not (is_lso_cluster()
                and Version.coerce(ocs_version) <= Version.coerce("4.8")):
            pytest.skip(
                "Skip the test because mons do not get node assignment from Rook "
                "if the cluster is not LSO based. Also, currently we want to run "
                "the test only with OCS 4.8 and below. This is a workaround due to issue "
                "https://github.com/red-hat-storage/ocs-ci/issues/4937")
        # Initialize
        rook_ceph_mon = "rook-ceph-mon"

        # Get mons running on pod
        mon_pods = get_mon_pods()
        mon_name_to_del = mon_pods[0].get().get("metadata").get("labels").get(
            "mon")
        mon_name_to_edit = mon_pods[1].get().get("metadata").get("labels").get(
            "mon")
        mon_node = get_pod_node(mon_pods[1])

        # Edit the rook-ceph-mon-endpoints
        log.info(f"Edit the configmap {ROOK_CEPH_MON_ENDPOINTS}")
        configmap_obj = OCP(kind=CONFIGMAP,
                            namespace=OPENSHIFT_STORAGE_NAMESPACE)
        rook_ceph_mon_configmap = configmap_obj.get(
            resource_name=ROOK_CEPH_MON_ENDPOINTS)
        json_val = json.loads(rook_ceph_mon_configmap["data"]["mapping"])
        json_val["node"][mon_name_to_del].update(
            json_val["node"][mon_name_to_edit])
        rook_ceph_mon_configmap["data"]["mapping"] = json.dumps(json_val)
        new_data = rook_ceph_mon_configmap["data"]
        params = f'{{"data": {json.dumps(new_data)}}}'
        configmap_obj.patch(
            resource_name=ROOK_CEPH_MON_ENDPOINTS,
            params=params,
            format_type="strategic",
        )
        log.info(f"Configmap {ROOK_CEPH_MON_ENDPOINTS} edited successfully")
        log.info(
            f"Rook-ceph-mon-endpoints updated configmap: {rook_ceph_mon_configmap}"
        )

        # Delete one mon deployment which had been edited
        dep_obj = OCP(kind=DEPLOYMENT, namespace=OPENSHIFT_STORAGE_NAMESPACE)
        mon_deployment_name_to_del = f"{rook_ceph_mon}-{mon_name_to_del}"
        log.info(f"Deleting mon {mon_deployment_name_to_del} deployments")
        dep_obj.delete(resource_name=mon_deployment_name_to_del)

        # Edit other mon deployment to remove mon anti-affinity
        mon_deployment_name_to_edit = f"{rook_ceph_mon}-{mon_name_to_edit}"
        log.info(f"Edit mon {mon_deployment_name_to_edit} deployment "
                 "to remove the required mon anti-affinity")
        params = '[{"op": "remove", "path": "/spec/template/spec/affinity"}]'
        dep_obj.patch(resource_name=mon_deployment_name_to_edit,
                      params=params,
                      format_type="json")
        log.info(
            f"Successfully removed defined mon anti-affinity {mon_deployment_name_to_edit}"
        )

        # Restart operator
        operator_pod_obj = get_operator_pods()
        delete_pods(pod_objs=operator_pod_obj)
        POD_OBJ.wait_for_resource(condition=STATUS_RUNNING,
                                  selector=OPERATOR_LABEL)

        # Validate that the deleted mon deployment came up and is in Pending state
        # Initially the mon is stuck in Pending state until the defined anti-affinity is removed
        POD_OBJ.wait_for_resource(
            condition=STATUS_PENDING,
            resource_count=1,
            selector=MON_APP_LABEL,
            timeout=1200,
        )
        # Edit mon deployment to remove mon anti-affinity
        log.info(f"Edit mon {mon_deployment_name_to_del} deployment "
                 "to remove the required mon anti-affinity")
        params = '[{"op": "remove", "path": "/spec/template/spec/affinity"}]'
        dep_obj.patch(resource_name=mon_deployment_name_to_del,
                      params=params,
                      format_type="json")
        log.info(
            f"Successfully removed defined mon anti-affinity {mon_deployment_name_to_del}"
        )

        # Validate mon pod moved to another node such that 2 mons are running on same node
        log.info("Waiting for 5 seconds for mon recovery")
        time.sleep(5)
        new_mon_pods = get_mon_pods()
        new_node = [
            get_pod_node(mon) for mon in new_mon_pods if mon.get().get(
                "metadata").get("labels").get("mon") == mon_name_to_del
        ]
        assert (
            new_node[0].name == mon_node.name
        ), f"Mon did not move to node {mon_node.name}; expected 2 mons running on the same node"

        # Verify rook deletes one of the mon and move to another node
        timeout = 60
        log.info(f"Waiting for {timeout} seconds for mon recovery")
        time.sleep(timeout)

        POD_OBJ.wait_for_resource(
            condition=STATUS_RUNNING,
            resource_count=len(mon_pods),
            selector=MON_APP_LABEL,
            timeout=3600,
            sleep=5,
        )
        log.info(
            "Mons are up and in Running state; validating that they are running on different nodes"
        )
        mon_pods_running_on_same_node()
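A quick manual spot check (not part of the test) for the node placement that mon_pods_running_on_same_node() validates:

import subprocess

# List the mon pods with their nodes to confirm whether two mons share a node
subprocess.run(
    ["oc", "-n", "openshift-storage", "get", "pods",
     "-l", "app=rook-ceph-mon", "-o", "wide"],
    check=True,
)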
Example #13
    def set_resource(self, resource, leader_type="provisioner", cluster_index=None):
        self.resource = resource
        if (config.ENV_DATA["platform"] in constants.MANAGED_SERVICE_PLATFORMS) and (
            resource in CEPH_PODS
        ):
            # If the platform is Managed Services, then the ceph pods will be present in the provider cluster.
            # Consumer cluster will be the primary cluster context in a multicluster run. Setting 'cluster_kubeconfig'
            # attribute to use as the value of the parameter '--kubeconfig' in the 'oc' commands to get ceph pods.
            provider_kubeconfig = os.path.join(
                config.clusters[config.get_provider_index()].ENV_DATA["cluster_path"],
                config.clusters[config.get_provider_index()].RUN.get(
                    "kubeconfig_location"
                ),
            )
            self.cluster_kubeconfig = provider_kubeconfig
        elif config.ENV_DATA["platform"] in constants.MANAGED_SERVICE_PLATFORMS:
            # cluster_index is used to identify the cluster in which the pod is residing. If cluster_index is not
            # passed, assume that the context is already changed to the cluster where the pod is residing.
            cluster_index = (
                cluster_index if cluster_index is not None else config.cur_index
            )
            self.cluster_kubeconfig = os.path.join(
                config.clusters[cluster_index].ENV_DATA["cluster_path"],
                config.clusters[cluster_index].RUN.get("kubeconfig_location"),
            )
        resource_count = 0
        if self.resource == "mgr":
            self.resource_obj = pod.get_mgr_pods()
            self.selector = constants.MGR_APP_LABEL
        if self.resource == "mon":
            self.resource_obj = pod.get_mon_pods()
            self.selector = constants.MON_APP_LABEL
        if self.resource == "osd":
            self.resource_obj = pod.get_osd_pods()
            self.selector = constants.OSD_APP_LABEL
        if self.resource == "mds":
            self.resource_obj = pod.get_mds_pods()
            self.selector = constants.MDS_APP_LABEL
        if self.resource == "cephfsplugin":
            self.resource_obj = pod.get_plugin_pods(interface=constants.CEPHFILESYSTEM)
            self.selector = constants.CSI_CEPHFSPLUGIN_LABEL
        if self.resource == "rbdplugin":
            self.resource_obj = pod.get_plugin_pods(interface=constants.CEPHBLOCKPOOL)
            self.selector = constants.CSI_RBDPLUGIN_LABEL
        if self.resource == "cephfsplugin_provisioner":
            self.resource_obj = [
                pod.get_plugin_provisioner_leader(
                    interface=constants.CEPHFILESYSTEM, leader_type=leader_type
                )
            ]
            self.selector = constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL
            resource_count = len(pod.get_cephfsplugin_provisioner_pods())
        if self.resource == "rbdplugin_provisioner":
            self.resource_obj = [
                pod.get_plugin_provisioner_leader(
                    interface=constants.CEPHBLOCKPOOL, leader_type=leader_type
                )
            ]
            self.selector = constants.CSI_RBDPLUGIN_PROVISIONER_LABEL
            resource_count = len(pod.get_rbdfsplugin_provisioner_pods())
        if self.resource == "operator":
            self.resource_obj = pod.get_operator_pods()
            self.selector = constants.OPERATOR_LABEL
        if self.resource == "ocs_operator":
            self.resource_obj = [pod.get_ocs_operator_pod()]
            self.selector = constants.OCS_OPERATOR_LABEL
        if self.resource == "alertmanager_managed_ocs_alertmanager":
            self.resource_obj = pod.get_alertmanager_managed_ocs_alertmanager_pods()
            self.selector = constants.MANAGED_ALERTMANAGER_LABEL
        if self.resource == "ocs_osd_controller_manager":
            self.resource_obj = [pod.get_ocs_osd_controller_manager_pod()]
            self.selector = constants.MANAGED_CONTROLLER_LABEL
            # Setting resource_count because the odf-operator-controller-manager pod also has the same label.
            resource_count = len(
                pod.get_pods_having_label(
                    constants.MANAGED_CONTROLLER_LABEL,
                    config.ENV_DATA["cluster_namespace"],
                )
            )
        if self.resource == "prometheus_managed_ocs_prometheus":
            self.resource_obj = [pod.get_prometheus_managed_ocs_prometheus_pod()]
            self.selector = constants.MANAGED_PROMETHEUS_LABEL
        if self.resource == "prometheus_operator":
            self.resource_obj = [pod.get_prometheus_operator_pod()]
            self.selector = constants.PROMETHEUS_OPERATOR_LABEL
        if self.resource == "ocs_provider_server":
            self.resource_obj = [pod.get_ocs_provider_server_pod()]
            self.selector = constants.PROVIDER_SERVER_LABEL

        self.resource_count = resource_count or len(self.resource_obj)
Example #14
    def test_pvc_creation_after_del_mon_services(self, interface, pod_factory):
        """
        1. Delete one mon service
        2. Edit the configmap rook-ceph-mon-endpoints to
           remove the deleted mon service's entries
        3. Delete deployment, pvc of deleted mon service
        4. Restart rook-ceph-operator
        5. Make sure all mon pods are running
        6. Make sure ceph health Ok and storage pods are running
        7. Sleep for 300 seconds before deleting another mon
        8. Repeat the above steps for all mons; at the
           end each mon should have a different endpoint
        9. Create a PVC; it should succeed.

        """

        pod_obj = pod_factory(interface=interface)
        run_io_in_bg(pod_obj)

        # Get all mon services
        mon_svc = get_services_by_label(
            label=constants.MON_APP_LABEL,
            namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
        )

        # Get all mon pods
        mon_pods = get_mon_pods()
        mon_count = len(mon_pods)

        list_old_svc = []
        for svc in mon_svc:

            # Get rook-ceph-operator pod obj
            operator_pod_obj = get_operator_pods()
            operator_name = operator_pod_obj[0].name

            # Scale down rook-ceph-operator
            log.info("Scale down rook-ceph-operator")
            assert modify_deployment_replica_count(
                deployment_name="rook-ceph-operator", replica_count=0
            ), "Failed to scale down rook-ceph-operator to 0"
            log.info("Successfully scaled down rook-ceph-operator to 0")

            # Validate rook-ceph-operator pod not running
            POD_OBJ.wait_for_delete(resource_name=operator_name)

            svc_name = svc["metadata"]["name"]
            cluster_ip = svc["spec"]["clusterIP"]
            port = svc["spec"]["ports"][0]["port"]
            mon_endpoint = f"{cluster_ip}:{port}"
            mon_id = svc["spec"]["selector"]["mon"]
            list_old_svc.append(cluster_ip)

            # Delete deployment
            log.info("Delete mon deployments")
            del_obj = OCP(
                kind=constants.DEPLOYMENT,
                namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
            )
            mon_info = del_obj.get(resource_name=svc_name)
            del_obj.delete(resource_name=svc_name)

            # Delete pvc
            if is_lso_cluster():
                mon_data_path = f"/var/lib/rook/mon-{mon_id}"
                mon_node = mon_info["spec"]["template"]["spec"]["nodeSelector"][
                    "kubernetes.io/hostname"
                ]
                log.info(f"Delete the directory `{mon_data_path}` from {mon_node}")
                cmd = f"rm -rf {mon_data_path}"
                ocp_obj = OCP(namespace=constants.OPENSHIFT_STORAGE_NAMESPACE)
                ocp_obj.exec_oc_debug_cmd(node=mon_node, cmd_list=[cmd])
            else:
                log.info("Delete mon PVC")
                pvc_name = svc["metadata"]["labels"]["pvc_name"]
                pvc_obj = OCP(
                    kind=constants.PVC, namespace=constants.OPENSHIFT_STORAGE_NAMESPACE
                )
                pvc_obj.delete(resource_name=pvc_name)

            # Delete the mon service
            log.info("Delete mon service")
            svc_obj = OCP(
                kind=constants.SERVICE, namespace=constants.OPENSHIFT_STORAGE_NAMESPACE
            )
            svc_obj.delete(resource_name=svc_name)

            # Edit the cm
            log.info(f"Edit the configmap {constants.ROOK_CEPH_MON_ENDPOINTS}")
            configmap_obj = OCP(
                kind=constants.CONFIGMAP,
                namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
            )
            output_get = configmap_obj.get(
                resource_name=constants.ROOK_CEPH_MON_ENDPOINTS
            )
            new_data = output_get["data"]
            new_data["csi-cluster-config-json"] = (
                new_data["csi-cluster-config-json"].replace(f'"{mon_endpoint}",', "")
                if new_data["csi-cluster-config-json"].find(f'"{mon_endpoint}",') != 1
                else new_data["csi-cluster-config-json"].replace(
                    f',"{mon_endpoint}"', ""
                )
            )
            new_data["data"] = ",".join(
                [
                    value
                    for value in new_data["data"].split(",")
                    if f"{mon_id}=" not in value
                ]
            )
            new_data["mapping"] = (
                new_data["mapping"].replace(f'"{mon_id}":null,', "")
                if new_data["mapping"].find(f'"{mon_id}":null,') != -1
                else new_data["mapping"].replace(f',"{mon_id}":null', "")
            )
            params = f'{{"data": {json.dumps(new_data)}}}'
            log.info(f"Removing {mon_id} entries from configmap")
            configmap_obj.patch(
                resource_name=constants.ROOK_CEPH_MON_ENDPOINTS,
                params=params,
                format_type="strategic",
            )
            log.info(
                f"Configmap {constants.ROOK_CEPH_MON_ENDPOINTS} edited successfully"
            )

            # Scale up rook-ceph-operator
            log.info("Scale up rook-ceph-operator")
            assert modify_deployment_replica_count(
                deployment_name="rook-ceph-operator", replica_count=1
            ), "Failed to scale up rook-ceph-operator to 1"
            log.info("Successfully scaled up rook-ceph-operator to 1")
            log.info("Validate rook-ceph-operator pod is running")
            POD_OBJ.wait_for_resource(
                condition=constants.STATUS_RUNNING,
                selector=constants.OPERATOR_LABEL,
                resource_count=1,
                timeout=600,
                sleep=5,
            )

            # Validate all mons are running
            log.info("Validate all mons are up and running")
            POD_OBJ.wait_for_resource(
                condition=constants.STATUS_RUNNING,
                selector=constants.MON_APP_LABEL,
                resource_count=mon_count,
                timeout=1200,
                sleep=5,
            )
            log.info("All mons are up and running")

            # Check that the Ceph health is OK
            ceph_health_check(tries=90, delay=15)

            # Validate all storage pods are running
            wait_for_storage_pods()

            # Sleep for some seconds before deleting another mon
            sleep_time = 300
            log.info(f"Waiting for {sleep_time} seconds before deleting another mon")
            time.sleep(sleep_time)

        # Check the endpoints are different
        log.info("Validate the mon endpoints are changed")
        new_mon_svc = get_services_by_label(
            label=constants.MON_APP_LABEL,
            namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
        )
        list_new_svc = []
        for new_svc in new_mon_svc:
            cluster_ip = new_svc["spec"]["clusterIP"]
            list_new_svc.append(cluster_ip)
        diff = set(list_new_svc) ^ set(list_old_svc)
        assert len(diff) == len(list_old_svc + list_new_svc), (
            f"Not all endpoints are changed. Set of old "
            f"endpoints {list_old_svc} and new endpoints {list_new_svc}"
        )
        log.info(f"All new mon endpoints are created {list_new_svc}")

        # Create PVC and pods
        log.info(f"Create {interface} PVC")
        pod_obj = pod_factory(interface=interface)
        pod_obj.run_io(storage_type="fs", size="500M")
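The scale-down/scale-up of rook-ceph-operator around the configmap edit has a direct CLI equivalent; a sketch (hypothetical helper):

import subprocess

def scale_rook_operator(replicas, namespace="openshift-storage"):
    subprocess.run(
        ["oc", "-n", namespace, "scale",
         "deployment/rook-ceph-operator", f"--replicas={replicas}"],
        check=True,
    )

scale_rook_operator(0)  # stop the operator before removing the mon entries
# ... delete the mon deployment, PVC/data dir, and service; edit the configmap ...
scale_rook_operator(1)  # let the operator reconcile a replacement mon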
Example #15
    def test_del_mon_svc(
        self, multi_pvc_factory, validate_all_mon_svc_are_up_at_teardown
    ):
        """
        Test to verify that the same mons come up and run
        after deleting the mon services manually, and that they rejoin the quorum

        1. Delete the mon services
        2. Restart the rook operator
        3. Make sure all mon pods are running,
        and the same services/endpoints are present
        4. Make sure ceph health Ok and storage pods are running
        5. Create a PVC; it should succeed.

        """

        self.sanity_helpers = Sanity()

        # Get all mon services
        mon_svc_before = get_services_by_label(
            label=constants.MON_APP_LABEL,
            namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
        )

        # Get all mon pods
        mon_pods = get_mon_pods()

        # Delete the mon services one by one
        svc_obj = OCP(
            kind=constants.SERVICE, namespace=constants.OPENSHIFT_STORAGE_NAMESPACE
        )
        mon_svc_ip_before = []
        for svc in mon_svc_before:
            svc_name = svc["metadata"]["name"]
            mon_svc_ip_before.append(svc["spec"]["clusterIP"])
            log.info(f"Delete mon service {svc_name}")
            svc_obj.delete(resource_name=svc_name)
            # Verify mon services deleted
            svc_obj.wait_for_delete(resource_name=svc_name)

        # Restart the rook-operator pod
        operator_pod_obj = get_operator_pods()
        delete_pods(pod_objs=operator_pod_obj)
        POD_OBJ.wait_for_resource(
            condition=constants.STATUS_RUNNING, selector=constants.OPERATOR_LABEL
        )

        # Verify same mon services are created again
        for svc in mon_svc_before:
            svc_name = svc["metadata"]["name"]
            svc_obj.check_resource_existence(
                should_exist=True, timeout=300, resource_name=svc_name
            )
        log.info("Same old mon services are recreated")

        # Validate all mons are running
        log.info("Validate all mons are up and running")
        POD_OBJ.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            selector=constants.MON_APP_LABEL,
            resource_count=len(mon_pods),
            timeout=600,
            sleep=3,
        )

        # Validate same mon services are running
        log.info("Validate same mon services are running")
        mon_svc_after = get_services_by_label(
            label=constants.MON_APP_LABEL,
            namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
        )
        mon_svc_ip_after = [svc["spec"]["clusterIP"] for svc in mon_svc_after]
        assert len(set(mon_svc_ip_after) ^ set(mon_svc_ip_before)) == 0, (
            "Different mon services are running. "
            f"Before mon services list: {mon_svc_ip_before}, "
            f"After mon services list: {mon_svc_ip_after}"
        )
        log.info("Same old mon services are running and all mons are in running state")

        # Verify everything running fine
        log.info("Verifying All resources are Running and matches expected result")
        self.sanity_helpers.health_check(tries=120)

        # Validate all storage pods are running
        wait_for_storage_pods()

        # Create and delete resources
        self.sanity_helpers.create_pvc_delete(multi_pvc_factory=multi_pvc_factory)
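A condensed sketch (hypothetical helper) of the 'same clusterIP before and after' check that the test performs:

import subprocess

def mon_service_ips(namespace="openshift-storage"):
    out = subprocess.run(
        ["oc", "-n", namespace, "get", "svc", "-l", "app=rook-ceph-mon",
         "-o", "jsonpath={.items[*].spec.clusterIP}"],
        check=True, capture_output=True, text=True,
    ).stdout
    return set(out.split())

ips_before = mon_service_ips()
# ... delete the mon services and restart the rook operator ...
ips_after = mon_service_ips()
assert ips_before == ips_after, f"Mon service IPs changed: {ips_before} vs {ips_after}"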
Example #16
    def test_pv_provisioning_under_degraded_state(self, nodes, pvc_factory,
                                                  pod_factory, interface,
                                                  operation):
        """
        Test PV provisioning under degraded state

        OCS-1138:
        - Stop 1 worker node that has the RBD provisioner
          pod running on it
        - Wait for the RBD pod provisioner to come up again to running status
        - Validate cluster functionality, without checking cluster and Ceph
          health by creating resources and running IO
        - Start the worker node
        - Check cluster and Ceph health

        OCS-1241:
        - Stop 1 worker node that has the RBD provisioner
          pod running on it
        - Wait for the RBD pod provisioner to come up again to running status
        - Validate cluster functionality, without checking cluster and Ceph
          health by deleting resources and running IO
        - Start the worker node
        - Check cluster and Ceph health

        OCS-1139:
        - Stop 1 worker node that has the CephFS provisioner
          pod running on it
        - Wait for the CephFS pod provisioner to come up again to running status
        - Validate cluster functionality, without checking cluster and Ceph
          health by creating resources and running IO
        - Start the worker node
        - Check cluster and Ceph health

        OCS-1242:
        - Stop 1 worker node that has the CephFS provisioner
          pod running on it
        - Wait for the CephFS pod provisioner to come up again to running status
        - Validate cluster functionality, without checking cluster and Ceph
          health by deleting resources and running IO
        - Start the worker node
        - Check cluster and Ceph health

        """
        if operation == 'delete_resources':
            # Create resources whose deletion will be tested later
            self.sanity_helpers.create_resources(pvc_factory, pod_factory)

        provisioner_pods = None
        # Get the provisioner pod according to the interface
        if interface == 'rbd':
            provisioner_pods = pod.get_rbdfsplugin_provisioner_pods()
        elif interface == 'cephfs':
            provisioner_pods = pod.get_cephfsplugin_provisioner_pods()
        provisioner_pod = provisioner_pods[0]
        # Workaround for BZ 1778488 - https://github.com/red-hat-storage/ocs-ci/issues/1222
        provisioner_node = pod.get_pod_node(provisioner_pod)
        rook_operator_pod = pod.get_operator_pods()[0]
        operator_node = pod.get_pod_node(rook_operator_pod)
        if operator_node.get().get('metadata').get(
                'name') == provisioner_node.get().get('metadata').get('name'):
            provisioner_pod = provisioner_pods[1]
        # End of workaround for BZ 1778488

        provisioner_pod_name = provisioner_pod.name
        logger.info(
            f"{interface} provisioner pod found: {provisioner_pod_name}")

        # Get the node name that has the provisioner pod running on
        provisioner_node = pod.get_pod_node(provisioner_pod)
        provisioner_node_name = provisioner_node.get().get('metadata').get(
            'name')
        logger.info(
            f"{interface} provisioner pod is running on node {provisioner_node_name}"
        )

        # Stopping the nodes
        nodes.stop_nodes(nodes=[provisioner_node])

        # Wait for the provisioner pod to get to running status
        selector = (constants.CSI_RBDPLUGIN_PROVISIONER_LABEL if
                    (interface == 'rbd') else
                    constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL)

        # Wait for the provisioner pod to reach Terminating status
        logger.info(
            f"Waiting for pod {provisioner_pod_name} to reach status Terminating"
        )
        assert provisioner_pod.ocp.wait_for_resource(
            timeout=600,
            resource_name=provisioner_pod.name,
            condition=constants.STATUS_TERMINATING
        ), f"{interface} provisioner pod failed to reach status Terminating"
        logger.info(
            f"Pod {provisioner_pod_name} has reached status Terminating")

        # Wait for the provisioner pod to be started and reach running status
        logger.info(
            f"Waiting for pod {provisioner_pod_name} to reach status Running")

        # After this change https://github.com/rook/rook/pull/3642/, there are
        # 2 provisioners for each interface
        assert provisioner_pod.ocp.wait_for_resource(
            timeout=600,
            condition=constants.STATUS_RUNNING,
            selector=selector,
            resource_count=2
        ), f"{interface} provisioner pod failed to reach status Running"
        logger.info(f"{interface} provisioner pod has reached status Running")

        assert wait_for_ct_pod_recovery(
        ), "Ceph tools pod failed to come up on another node"

        if operation == 'create_resources':
            # Cluster validation (resources creation and IO running)
            self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        elif operation == 'delete_resources':
            # Cluster validation (resources deletion)
            self.sanity_helpers.delete_resources()

        # Starting the node
        nodes.start_nodes(nodes=[provisioner_node])

        # Checking cluster and Ceph health
        self.sanity_helpers.health_check()
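
The workaround for BZ 1778488 in this example (skipping a provisioner pod that is scheduled on the same node as the rook-ceph operator) is repeated in Example #19 below as a plain co-location check. A minimal sketch of that check extracted into a reusable helper follows; the helper name is hypothetical, while pod.get_operator_pods, pod.get_pod_node and the .get() access pattern are the same ocs_ci utilities already used above.

from ocs_ci.ocs.resources import pod


def pick_pod_not_on_operator_node(candidate_pods):
    """
    Return the first pod from candidate_pods that is not running on the
    node hosting the rook-ceph operator pod (workaround for BZ 1778488).
    Falls back to the first candidate if every pod shares that node.
    """
    rook_operator_pod = pod.get_operator_pods()[0]
    operator_node_name = (
        pod.get_pod_node(rook_operator_pod).get().get("metadata").get("name")
    )
    for candidate in candidate_pods:
        candidate_node_name = (
            pod.get_pod_node(candidate).get().get("metadata").get("name")
        )
        if candidate_node_name != operator_node_name:
            return candidate
    return candidate_pods[0]

With such a helper, the test would call pick_pod_not_on_operator_node(provisioner_pods) instead of comparing node names inline.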
Example #17
def osd_device_replacement(nodes):
    """
    Replacing randomly picked osd device
    Args:
        node (OCS): The OCS object representing the node
    """
    logger.info("Picking a PV which to be deleted from the platform side")
    osd_pvs = get_deviceset_pvs()
    osd_pv = random.choice(osd_pvs)
    osd_pv_name = osd_pv.name
    # get the claim name
    logger.info(f"Getting the claim name for OSD PV {osd_pv_name}")
    claim_name = osd_pv.get().get("spec").get("claimRef").get("name")

    # Get the backing volume name
    logger.info(f"Getting the backing volume name for PV {osd_pv_name}")
    backing_volume = nodes.get_data_volumes(pvs=[osd_pv])[0]

    # Get the corresponding PVC
    logger.info(f"Getting the corresponding PVC of PV {osd_pv_name}")
    osd_pvcs = get_deviceset_pvcs()
    osd_pvcs_count = len(osd_pvcs)
    osd_pvc = [
        ds for ds in osd_pvcs
        if ds.get().get("metadata").get("name") == claim_name
    ][0]

    # Get the corresponding OSD pod and ID
    logger.info(f"Getting the OSD pod using PVC {osd_pvc.name}")
    osd_pods = get_osd_pods()
    osd_pods_count = len(osd_pods)
    osd_pod = [
        osd_pod for osd_pod in osd_pods if osd_pod.get().get("metadata").get(
            "labels").get(constants.CEPH_ROOK_IO_PVC_LABEL) == claim_name
    ][0]
    logger.info(f"OSD_POD {osd_pod.name}")
    osd_id = get_osd_pod_id(osd_pod)

    # Get the node that the OSD pod is running on
    logger.info(
        f"Getting the node that the OSD pod {osd_pod.name} is running on")
    osd_node = get_pod_node(osd_pod)
    ocp_version = get_ocp_version()
    if Version.coerce(ocp_version) < Version.coerce("4.6"):
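        # On OCP versions below 4.6 the ocs-osd-removal job does not clean up
        # the OSD prepare job, PVC and deployment, so keep references to them
        # for the manual deletion performed later in this function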
        osd_prepare_pods = get_osd_prepare_pods()
        osd_prepare_pod = [
            pod for pod in osd_prepare_pods if pod.get().get("metadata").get(
                "labels").get(constants.CEPH_ROOK_IO_PVC_LABEL) == claim_name
        ][0]
        osd_prepare_job_name = (osd_prepare_pod.get().get("metadata").get(
            "labels").get("job-name"))
        osd_prepare_job = get_job_obj(osd_prepare_job_name)

    # Get the corresponding OSD deployment
    logger.info(f"Getting the OSD deployment for OSD PVC {claim_name}")
    osd_deployment = [
        osd_pod for osd_pod in get_osd_deployments()
        if osd_pod.get().get("metadata").get("labels").get(
            constants.CEPH_ROOK_IO_PVC_LABEL) == claim_name
    ][0]
    osd_deployment_name = osd_deployment.name

    # Delete the volume from the platform side
    logger.info(f"Deleting {backing_volume} from the platform side")
    nodes.detach_volume(backing_volume, osd_node)

    # Scale down OSD deployment
    logger.info(f"Scaling down OSD deployment {osd_deployment_name} to 0")
    ocp_obj = ocp.OCP(namespace=config.ENV_DATA["cluster_namespace"])
    ocp_obj.exec_oc_cmd(f"scale --replicas=0 deployment/{osd_deployment_name}")

    # Force delete OSD pod if necessary
    osd_pod_name = osd_pod.name
    logger.info(f"Waiting for OSD pod {osd_pod.name} to get deleted")
    try:
        osd_pod.ocp.wait_for_delete(resource_name=osd_pod_name)
    except TimeoutError:
        osd_pod.delete(force=True)
        osd_pod.ocp.wait_for_delete(resource_name=osd_pod_name)

    # Run ocs-osd-removal job
    osd_removal_job = run_osd_removal_job([osd_id])
    assert osd_removal_job, "ocs-osd-removal failed to create"
    is_completed = verify_osd_removal_job_completed_successfully(osd_id)
    assert is_completed, "ocs-osd-removal-job is not in status 'completed'"
    logger.info("ocs-osd-removal-job completed successfully")

    osd_pvc_name = osd_pvc.name

    if Version.coerce(ocp_version) < Version.coerce("4.6"):
        # Delete the OSD prepare job
        logger.info(f"Deleting OSD prepare job {osd_prepare_job_name}")
        osd_prepare_job.delete()
        osd_prepare_job.ocp.wait_for_delete(resource_name=osd_prepare_job_name,
                                            timeout=120)

        # Delete the OSD PVC
        logger.info(f"Deleting OSD PVC {osd_pvc_name}")
        osd_pvc.delete()
        osd_pvc.ocp.wait_for_delete(resource_name=osd_pvc_name)

        # Delete the OSD deployment
        logger.info(f"Deleting OSD deployment {osd_deployment_name}")
        osd_deployment.delete()
        osd_deployment.ocp.wait_for_delete(resource_name=osd_deployment_name,
                                           timeout=120)
    else:
        # If ocp version is '4.6' and above the osd removal job should
        # delete the OSD prepare job, OSD PVC, OSD deployment
        # We just need to verify the old PV is in the expected status
        logger.info(
            f"Verify that the old PV '{osd_pv_name}' is in the expected status"
        )
        if cluster.is_lso_cluster():
            expected_old_pv_statuses = [constants.STATUS_RELEASED]
        else:
            expected_old_pv_statuses = [
                constants.STATUS_RELEASED,
                constants.STATUS_FAILED,
            ]

        assert (osd_pv.ocp.get_resource_status(osd_pv_name)
                in expected_old_pv_statuses), (
                    f"The old PV '{osd_pv_name}' is not in "
                    f"the expected statuses: {expected_old_pv_statuses}")

    # Verify the old PV is deleted; delete it explicitly if needed
    logger.info(f"Verifying deletion of PV {osd_pv_name}")
    try:
        osd_pv.ocp.wait_for_delete(resource_name=osd_pv_name)
    except TimeoutError:
        osd_pv.delete()
        osd_pv.ocp.wait_for_delete(resource_name=osd_pv_name)

    # If we use LSO, we need to create and attach a new disk manually
    if cluster.is_lso_cluster():
        node.add_disk_to_node(osd_node)

    if Version.coerce(ocp_version) < Version.coerce("4.6"):
        # Delete the rook ceph operator pod to trigger reconciliation
        rook_operator_pod = get_operator_pods()[0]
        logger.info(
            f"deleting Rook Ceph operator pod {rook_operator_pod.name}")
        rook_operator_pod.delete()

    # Delete the OSD removal job
    logger.info(f"Deleting OSD removal job ocs-osd-removal-{osd_id}")
    is_deleted = delete_osd_removal_job(osd_id)
    assert is_deleted, "Failed to delete ocs-osd-removal-job"
    logger.info("ocs-osd-removal-job deleted successfully")

    timeout = 600
    # Wait for OSD PVC to get created and reach Bound state
    logger.info(
        "Waiting for a new OSD PVC to get created and reach Bound state")
    assert osd_pvc.ocp.wait_for_resource(
        timeout=timeout,
        condition=constants.STATUS_BOUND,
        selector=constants.OSD_PVC_GENERIC_LABEL,
        resource_count=osd_pvcs_count,
    ), (f"Cluster recovery failed after {timeout} seconds. "
        f"Expected to have {osd_pvcs_count} OSD PVCs in status Bound. Current OSD PVCs status: "
        f"{[pvc.ocp.get_resource(pvc.get().get('metadata').get('name'), 'STATUS') for pvc in get_deviceset_pvcs()]}"
        )
    # Wait for OSD pod to get created and reach Running state
    logger.info(
        "Waiting for a new OSD pod to get created and reach Running state")
    assert osd_pod.ocp.wait_for_resource(
        timeout=timeout,
        condition=constants.STATUS_RUNNING,
        selector=constants.OSD_APP_LABEL,
        resource_count=osd_pods_count,
    ), (f"Cluster recovery failed after {timeout} seconds. "
        f"Expected to have {osd_pods_count} OSD pods in status Running. Current OSD pods status: "
        f"{[osd_pod.ocp.get_resource(pod.get().get('metadata').get('name'), 'STATUS') for pod in get_osd_pods()]}"
        )

    # We need to silence the old osd crash warning due to BZ https://bugzilla.redhat.com/show_bug.cgi?id=1896810
    # This is a workaround - issue for tracking: https://github.com/red-hat-storage/ocs-ci/issues/3438
    if Version.coerce(ocp_version) >= Version.coerce("4.6"):
        silence_osd_crash = cluster.wait_for_silence_ceph_osd_crash_warning(
            osd_pod_name)
        if not silence_osd_crash:
            logger.info("Didn't find ceph osd crash warning")
    sanity_helpers = Sanity()
    sanity_helpers.health_check(tries=120)
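
osd_device_replacement() selects the OSD pod, the OSD prepare pod and the OSD deployment with the same list-comprehension pattern keyed on constants.CEPH_ROOK_IO_PVC_LABEL. A minimal sketch of that lookup factored into one helper follows; the helper name is hypothetical, and it assumes the resource objects expose the same .get() dictionary access used above.

from ocs_ci.ocs import constants


def find_resource_by_osd_pvc(resources, claim_name):
    """
    Return the first resource (pod, job or deployment object) whose
    CEPH_ROOK_IO_PVC_LABEL label matches the given OSD PVC claim name,
    or None when there is no match.
    """
    for resource in resources:
        labels = resource.get().get("metadata").get("labels", {})
        if labels.get(constants.CEPH_ROOK_IO_PVC_LABEL) == claim_name:
            return resource
    return None

With it, each selection becomes a single call such as find_resource_by_osd_pvc(get_osd_pods(), claim_name).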
Example #18
    def test_pv_provisioning_under_degraded_state_stop_rook_operator_pod_node(
        self,
        nodes,
        pvc_factory,
        pod_factory,
        operation,
        bucket_factory,
        rgw_bucket_factory,
    ):
        """
        Test PV provisioning under degraded state -
        stop the node that has the rook operator pod running on it

        OCS-2016:
        - Stop 1 worker node that has the rook ceph operator pod running on it
        - Wait for the rook ceph operator pod to come up again to Running status
        - Validate cluster functionality, without checking cluster and Ceph
          health, by creating resources and running IO
        - Start the worker node
        - Check cluster and Ceph health

        OCS-2017:
        - Stop 1 worker node that has the rook ceph operator pod running on it
        - Wait for the rook ceph operator pod to come up again to Running status
        - Validate cluster functionality, without checking cluster and Ceph
          health, by deleting resources
        - Start the worker node
        - Check cluster and Ceph health
        """
        if operation == "delete_resources":
            # Create resources whose deletion will be tested later
            self.sanity_helpers.create_resources(pvc_factory, pod_factory,
                                                 bucket_factory,
                                                 rgw_bucket_factory)

        rook_operator_pods = pod.get_operator_pods()
        rook_operator_pod = rook_operator_pods[0]

        rook_operator_pod_name = rook_operator_pod.name
        logger.info(f"rook operator pod found: {rook_operator_pod_name}")

        # Get the name of the node that the rook operator pod is running on
        operator_node = pod.get_pod_node(rook_operator_pod)
        operator_node_name = operator_node.get().get("metadata").get("name")
        logger.info(
            f"{rook_operator_pod_name} pod is running on node {operator_node_name}"
        )

        # Stopping the node
        nodes.stop_nodes(nodes=[operator_node])

        # Wait for the rook operator pod to get to running status
        selector = constants.OPERATOR_LABEL

        # Wait for the rook operator pod to reach Terminating status
        logger.info(
            f"Waiting for pod {rook_operator_pod_name} to reach status Terminating"
        )
        assert rook_operator_pod.ocp.wait_for_resource(
            timeout=600,
            resource_name=rook_operator_pod_name,
            condition=constants.STATUS_TERMINATING,
        ), "rook operator pod failed to reach status Terminating"
        logger.info(
            f"Pod {rook_operator_pod_name} has reached status Terminating")

        # Wait for the rook operator pod to be started and reach running status
        logger.info(
            f"Waiting for pod {rook_operator_pod_name} to reach status Running"
        )

        assert rook_operator_pod.ocp.wait_for_resource(
            timeout=600,
            condition=constants.STATUS_RUNNING,
            selector=selector,
            resource_count=1,
        ), "rook operator pod failed to reach status Running"
        logger.info("rook operator pod has reached status Running")

        assert (wait_for_ct_pod_recovery()
                ), "Ceph tools pod failed to come up on another node"

        if operation == "create_resources":
            # Cluster validation (resources creation and IO running)

            self.sanity_helpers.create_resources(pvc_factory, pod_factory,
                                                 bucket_factory,
                                                 rgw_bucket_factory)
        elif operation == "delete_resources":
            # Cluster validation (resources deletion)
            self.sanity_helpers.delete_resources()

        # Starting the node
        nodes.start_nodes(nodes=[operator_node])

        # Checking cluster and Ceph health
        self.sanity_helpers.health_check()
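
Examples #16, #18 and #19 all repeat the same wait sequence once the node is stopped: wait for the original pod to reach Terminating, then wait until the expected number of replacement pods matching a label selector are Running. A minimal sketch of that sequence as one helper is shown below; the helper name is hypothetical, and the wait_for_resource calls and their return handling mirror the assertions already made in these tests.

from ocs_ci.ocs import constants


def wait_for_pod_respin(pod_obj, selector, expected_count, timeout=600):
    """
    Wait for pod_obj to reach Terminating after its node goes down, then
    wait until expected_count pods matching selector are Running again.
    Return a truthy value only if both waits succeed.
    """
    terminated = pod_obj.ocp.wait_for_resource(
        timeout=timeout,
        resource_name=pod_obj.name,
        condition=constants.STATUS_TERMINATING,
    )
    if not terminated:
        return False
    return pod_obj.ocp.wait_for_resource(
        timeout=timeout,
        condition=constants.STATUS_RUNNING,
        selector=selector,
        resource_count=expected_count,
    )

Example #18 would call it as wait_for_pod_respin(rook_operator_pod, constants.OPERATOR_LABEL, expected_count=1), and Examples #16 and #19 as wait_for_pod_respin(provisioner_pod, selector, expected_count=2).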
Example #19
    def test_pv_provisioning_under_degraded_state_stop_provisioner_pod_node(
        self,
        nodes,
        pvc_factory,
        pod_factory,
        interface,
        operation,
        bucket_factory,
        rgw_bucket_factory,
    ):
        """
        Test PV provisioning under degraded state -
        stop the node that has the provisioner pod running on it

        OCS-1138:
        - Stop 1 worker node that has the RBD provisioner
          pod running on it
        - Wait for the RBD provisioner pod to come up again to Running status
        - Validate cluster functionality, without checking cluster and Ceph
          health, by creating resources and running IO
        - Start the worker node
        - Check cluster and Ceph health

        OCS-1241:
        - Stop 1 worker node that has the RBD provisioner
          pod running on it
        - Wait for the RBD provisioner pod to come up again to Running status
        - Validate cluster functionality, without checking cluster and Ceph
          health, by deleting resources and running IO
        - Start the worker node
        - Check cluster and Ceph health

        OCS-1139:
        - Stop 1 worker node that has the CephFS provisioner
          pod running on it
        - Wait for the CephFS provisioner pod to come up again to Running status
        - Validate cluster functionality, without checking cluster and Ceph
          health, by creating resources and running IO
        - Start the worker node
        - Check cluster and Ceph health

        OCS-1242:
        - Stop 1 worker node that has the CephFS provisioner
          pod running on it
        - Wait for the CephFS provisioner pod to come up again to Running status
        - Validate cluster functionality, without checking cluster and Ceph
          health, by deleting resources and running IO
        - Start the worker node
        - Check cluster and Ceph health

        """
        if operation == "delete_resources":
            # Create resources whose deletion will be tested later
            self.sanity_helpers.create_resources(pvc_factory, pod_factory,
                                                 bucket_factory,
                                                 rgw_bucket_factory)

        provisioner_pods = None
        # Get the provisioner pod according to the interface
        if interface == "rbd":
            provisioner_pods = pod.get_rbdfsplugin_provisioner_pods()
        elif interface == "cephfs":
            provisioner_pods = pod.get_cephfsplugin_provisioner_pods()
        provisioner_pod = provisioner_pods[0]

        # Making sure that the node is not running the rook operator pod:
        provisioner_node = pod.get_pod_node(provisioner_pod)
        rook_operator_pod = pod.get_operator_pods()[0]
        operator_node = pod.get_pod_node(rook_operator_pod)
        if operator_node.get().get("metadata").get(
                "name") == provisioner_node.get().get("metadata").get("name"):
            provisioner_pod = provisioner_pods[1]

        provisioner_pod_name = provisioner_pod.name
        logger.info(
            f"{interface} provisioner pod found: {provisioner_pod_name}")

        # Get the name of the node that the provisioner pod is running on
        provisioner_node = pod.get_pod_node(provisioner_pod)
        provisioner_node_name = provisioner_node.get().get("metadata").get(
            "name")
        logger.info(
            f"{interface} provisioner pod is running on node {provisioner_node_name}"
        )

        # Stopping the node
        nodes.stop_nodes(nodes=[provisioner_node])

        # Wait for the provisioner pod to get to running status
        selector = (constants.CSI_RBDPLUGIN_PROVISIONER_LABEL if
                    (interface == "rbd") else
                    constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL)

        # Wait for the provisioner pod to reach Terminating status
        logger.info(
            f"Waiting for pod {provisioner_pod_name} to reach status Terminating"
        )
        assert provisioner_pod.ocp.wait_for_resource(
            timeout=600,
            resource_name=provisioner_pod.name,
            condition=constants.STATUS_TERMINATING,
        ), f"{interface} provisioner pod failed to reach status Terminating"
        logger.info(
            f"Pod {provisioner_pod_name} has reached status Terminating")

        # Wait for the provisioner pod to be started and reach running status
        logger.info(
            f"Waiting for {interface} provisioner pod to reach status Running")
        # After this change https://github.com/rook/rook/pull/3642/, there are
        # 2 provisioners for each interface
        assert provisioner_pod.ocp.wait_for_resource(
            timeout=600,
            condition=constants.STATUS_RUNNING,
            selector=selector,
            resource_count=2,
        ), f"{interface} provisioner pod failed to reach status Running"

        logger.info(f"{interface} provisioner pod has reached status Running")
        if operation == "create_resources":
            # Cluster validation (resources creation and IO running)
            self.sanity_helpers.create_resources(pvc_factory, pod_factory,
                                                 bucket_factory,
                                                 rgw_bucket_factory)
        elif operation == "delete_resources":
            # Cluster validation (resources deletion)
            self.sanity_helpers.delete_resources()

        # Starting the node
        nodes.start_nodes(nodes=[provisioner_node])

        # Checking cluster and Ceph health
        self.sanity_helpers.health_check()
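
Examples #16 and #19 map the interface parameter to a provisioner label selector with an inline conditional and an initial provisioner_pods = None placeholder. A minimal sketch of the same mapping as a lookup table is shown below; the function name and the dictionary are illustrative, while the label constants are the ones referenced in the tests above.

from ocs_ci.ocs import constants

# Mapping from CSI interface name to the provisioner pod label selector
INTERFACE_TO_PROVISIONER_SELECTOR = {
    "rbd": constants.CSI_RBDPLUGIN_PROVISIONER_LABEL,
    "cephfs": constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL,
}


def get_provisioner_selector(interface):
    """Return the provisioner pod label selector for the given interface."""
    try:
        return INTERFACE_TO_PROVISIONER_SELECTOR[interface]
    except KeyError:
        raise ValueError(f"Unsupported interface: {interface}")

An unknown interface then fails immediately with a clear error instead of surfacing later as a TypeError when provisioner_pods is still None.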