class TestNodeReplacementWithIO(ManageTest):
    """
    Knip-894: proactive node replacement while IO is running

    """

    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Attach a fresh Sanity helper to the test instance before each test

        """
        self.sanity_helpers = Sanity()

    def test_nodereplacement_proactive_with_io_running(
        self, pvc_factory, pod_factory, dc_pod_factory
    ):
        """
        Knip-894: replace an OSD node proactively while background IO runs
        on DC pods placed on every other worker node

        Flow:
            - start rbd-backed and cephfs-backed DC pods with background IO
              on all worker nodes except the selected OSD node
            - delete and re-create the selected OSD node
            - create and delete resources through the sanity helpers
            - run the health check

        """
        worker_node_list = node.get_worker_nodes()
        log.info(f"Current available worker nodes are {worker_node_list}")

        osd_node_name = select_osd_node_name()

        # IO pods go on every worker node except the one being replaced
        io_nodes = [name for name in worker_node_list if name != osd_node_name]

        # One entry per storage interface: (log message, interface constant).
        # The rbd workloads are all started before the cephfs ones.
        io_workloads = (
            (
                "Creating dc pod backed with rbd pvc and running io in bg",
                constants.CEPHBLOCKPOOL,
            ),
            (
                "Creating dc pod backed with cephfs pvc and running io in bg",
                constants.CEPHFILESYSTEM,
            ),
        )
        for message, interface in io_workloads:
            log.info(message)
            for io_node in io_nodes:
                dc_pod = dc_pod_factory(
                    interface=interface, node_name=io_node, size=20
                )
                pod.run_io_in_bg(dc_pod, expect_to_fail=False, fedora_dc=True)

        delete_and_create_osd_node(osd_node_name)

        # Exercise basic functionality: create resources, then remove them
        sanity = self.sanity_helpers
        log.info("Creating Resources using sanity helpers")
        sanity.create_resources(pvc_factory, pod_factory)
        sanity.delete_resources()

        # Final verification that everything is running fine
        log.info("Verifying All resources are Running and matches expected result")
        sanity.health_check(tries=120)
# Example #2
# 0
class TestNoobaaBackupAndRecovery(E2ETest):
    """
    Verify backup and recovery of the noobaa db

    """

    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Attach a fresh Sanity helper to the test instance before each test

        """
        self.sanity_helpers = Sanity()

    def test_noobaa_db_backup_and_recovery(
        self,
        pvc_factory,
        pod_factory,
        snapshot_factory,
        bucket_factory,
        rgw_bucket_factory,
        noobaa_db_backup_and_recovery,
    ):
        """
        Verify noobaa backup and recovery

        Procedure under test (presumably carried out by the
        noobaa_db_backup_and_recovery fixture - its body is not part of
        this class, confirm against the fixture):

        1. Take a snapshot of the db-noobaa-db-0 PVC and restore it to a PVC
        2. Scale down the statefulset noobaa-db
        3. Get the yaml of the current PVC, db-noobaa-db-0, and change the
           parameter persistentVolumeReclaimPolicy to Retain for the
           restored PVC
        4. Delete both PVCs; the PV for the original claim db-noobaa-db-0
           will be removed. The PV for claim
           db-noobaa-db-0-snapshot-restore will move to 'Released'
        5. Edit the restored PV again and remove the claimRef section.
           The volume will transition to Available.
        6. Edit the yaml db-noobaa-db-0.yaml and change the setting
           volumeName to the restored PVC.
        7. Scale up the statefulset again; the pod should be running

        After the procedure the test waits for the storage pods, creates
        and deletes resources through the sanity helpers and runs the
        health check.

        """
        noobaa_db_backup_and_recovery(snapshot_factory=snapshot_factory)

        # All storage pods have to be running before going on
        wait_for_storage_pods()

        sanity = self.sanity_helpers
        resource_factories = (
            pvc_factory,
            pod_factory,
            bucket_factory,
            rgw_bucket_factory,
        )

        # Exercise basic functionality: create resources, then remove them
        log.info("Creating Resources using sanity helpers")
        sanity.create_resources(*resource_factories)
        sanity.delete_resources()

        # Final verification that everything is running fine
        log.info("Verifying All resources are Running and matches expected result")
        sanity.health_check(tries=120)
# Example #3
# 0
class TestRollingWorkerNodeShutdownAndRecovery(ManageTest):
    """
    Test rolling shutdown and recovery of OCS pods running worker nodes
    """

    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Initialize Sanity instance

        """
        self.sanity_helpers = Sanity()

    @pytest.fixture(autouse=True)
    def teardown(self, request, nodes):
        """
        Make sure all nodes are up again

        Registers a finalizer that calls
        nodes.restart_nodes_by_stop_and_start_teardown() when the test ends.

        """

        def finalizer():
            nodes.restart_nodes_by_stop_and_start_teardown()

        request.addfinalizer(finalizer)

    def test_rolling_shutdown_and_recovery(
        self, nodes, pvc_factory, pod_factory, bucket_factory, rgw_bucket_factory
    ):
        """
        Test rolling shutdown and recovery of OCS worker nodes

        Each OCS worker node is, one at a time, stopped, kept down for a
        fixed period and started again; after every node the health check
        (without the cluster check) is run and the pods are awaited.
        Finally resources are created to check basic cluster functionality.

        """
        # Time, in seconds, each node is kept in stopped state
        seconds_to_wait = 180

        # Get OCS worker node objects
        ocs_node_objs = get_ocs_nodes()

        # Start rolling shutdown and recovery of OCS worker nodes
        log.info("ShutDown OCS worker")
        for node_obj in ocs_node_objs:
            nodes.stop_nodes(nodes=[node_obj])
            # The value is handed to time.sleep, so the unit is seconds
            log.info(f"Keeping node in stopped state for {seconds_to_wait} seconds")
            time.sleep(seconds_to_wait)
            nodes.start_nodes(nodes=[node_obj])
            self.sanity_helpers.health_check(cluster_check=False, tries=60)
            log.info("Checking storage pods status")
            # Validate storage pods are running
            wait_for_pods_to_be_running(timeout=600)

        # Check basic cluster functionality by creating some resources
        self.sanity_helpers.create_resources(
            pvc_factory, pod_factory, bucket_factory, rgw_bucket_factory
        )
# Example #4
# 0
class TestDiskFailures(ManageTest):
    """
    Test class for detach and attach worker volume

    """

    def detach_volume_and_wait_for_attach(self, nodes, data_volume, worker_node):
        """
        Detach an EBS volume from an AWS instance and wait for the volume
        to be re-attached

        Args:
            nodes: The platform nodes object used to detach the volume and
                to wait for it to be attached again
            data_volume (Volume): The ec2 volume to detach
            worker_node (OCS): The OCS object of the EC2 instance

        Raises:
            AWSTimeoutException: If detaching failed with a timeout whose
                message does not report the volume as "in-use"
            AssertionError: If the volume was not re-attached

        """
        try:
            # Detach volume (logging is done inside the function)
            nodes.detach_volume(data_volume, worker_node)
        except AWSTimeoutException as e:
            # The text has to be taken from str(e): testing membership on
            # the exception object itself raises TypeError
            if "Volume state: in-use" not in str(e):
                raise
            logger.info(
                f"Volume {data_volume} re-attached successfully to worker"
                f" node {worker_node}"
            )
        else:
            # Wait for worker volume to be re-attached automatically
            # to the node
            assert nodes.wait_for_volume_attach(data_volume), (
                f"Volume {data_volume} failed to be re-attached to worker "
                f"node {worker_node}"
            )

    @pytest.fixture(autouse=True)
    def teardown(self, request, nodes):
        """
        Restart nodes that are in status NotReady, for situations in
        which the test failed before restarting the node after detach volume,
        which leaves nodes in NotReady

        """

        def finalizer():
            not_ready_nodes = [
                n
                for n in node.get_node_objs()
                if n.ocp.get_resource_status(n.name) == constants.NODE_NOT_READY
            ]
            # Only report (and restart) when something actually is NotReady
            if not_ready_nodes:
                logger.warning(
                    f"Nodes in NotReady status found: {[n.name for n in not_ready_nodes]}"
                )
                nodes.restart_nodes(not_ready_nodes)
                node.wait_for_nodes_status()

            # Restart node if the osd stays at CLBO state
            # NOTE(review): the whole "state" entry of the first container
            # is compared against constants.STATUS_CLBO - confirm this
            # matches the structure returned for a pod in that state
            for osd_pod in get_osd_pods():
                container_state = (
                    osd_pod.get().get("status").get("containerStatuses")[0].get("state")
                )
                if container_state == constants.STATUS_CLBO:
                    node_obj = get_pod_node(osd_pod)
                    nodes.restart_nodes([node_obj])
                    node.wait_for_nodes_status([node_obj.name])

        request.addfinalizer(finalizer)

    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Initialize Sanity instance

        """
        self.sanity_helpers = Sanity()

    @cloud_platform_required
    @pytest.mark.polarion_id("OCS-1085")
    @bugzilla("1825675")
    def test_detach_attach_worker_volume(self, nodes, pvc_factory, pod_factory):
        """
        Detach and attach worker volume

        - Detach the data volume from one of the worker nodes
        - Wait for the volumes to be re-attached back to the worker node
        - Validate cluster functionality, without checking cluster and Ceph
          health (as one node volume is detached, the cluster will be
          unhealthy) by creating resources and running IO
        - Restart the node so the volume will get re-mounted

        """
        # Get a data volume
        data_volume = nodes.get_data_volumes()[0]
        # Get the worker node according to the volume attachment
        worker = nodes.get_node_by_attached_volume(data_volume)

        # Detach volume and wait for the volume to attach
        self.detach_volume_and_wait_for_attach(nodes, data_volume, worker)

        # Validate cluster is still functional
        # In case the selected node that its volume disk was detached was the one
        # running the ceph tools pod, we'll need to wait for a new ct pod to start.
        # For that, a function that connects to the ct pod is being used to check if
        # it's alive
        assert (
            wait_for_ct_pod_recovery()
        ), "Ceph tools pod failed to come up on another node"

        self.sanity_helpers.create_resources(pvc_factory, pod_factory)

        # Restart the instance so the volume will get re-mounted
        nodes.restart_nodes([worker])

        # Cluster health check
        # W/A: For the investigation of BZ 1825675, timeout is increased to see if cluster
        # becomes healthy eventually
        # TODO: Remove 'tries=100'
        self.sanity_helpers.health_check(tries=100)

    @cloud_platform_required
    @pytest.mark.polarion_id("OCS-1086")
    def test_detach_attach_2_data_volumes(self, nodes, pvc_factory, pod_factory):
        """
        Detach and attach disk from 2 worker nodes

        - Detach 2 of the data volumes from their worker nodes
        - Wait for the volumes to be re-attached back to the worker nodes
        - Restart the nodes so the volume will get re-mounted in each node
        - Check cluster health and functionality to make sure detach,
          attach and restart did not affect the cluster

        """
        # Get 2 data volumes
        data_volumes = nodes.get_data_volumes()[:2]
        workers_and_volumes = [
            {"worker": nodes.get_node_by_attached_volume(vol), "volume": vol}
            for vol in data_volumes
        ]
        for worker_and_volume in workers_and_volumes:
            # Detach volume and wait for the volume to attach
            self.detach_volume_and_wait_for_attach(
                nodes, worker_and_volume["volume"], worker_and_volume["worker"]
            )
        # Restart the instances so the volume will get re-mounted
        nodes.restart_nodes(
            [worker_and_volume["worker"] for worker_and_volume in workers_and_volumes]
        )

        # Validate cluster is still functional
        self.sanity_helpers.health_check()
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)

    @bugzilla("1830702")
    @vsphere_platform_required
    @pytest.mark.polarion_id("OCS-2172")
    def test_recovery_from_volume_deletion(self, nodes, pvc_factory, pod_factory):
        """
        Test cluster recovery from disk deletion from the platform side.
        Based on documented procedure detailed in
        https://bugzilla.redhat.com/show_bug.cgi?id=1823183

        The flow depends on the OCP version: from 4.6 on, the
        ocs-osd-removal job is expected to delete the OSD prepare job, the
        OSD PVC and the OSD deployment itself; on older versions the test
        deletes them and restarts the rook ceph operator pod. The version
        is compared as a numeric tuple so that e.g. "4.10" is ordered
        after "4.6".

        """
        logger.info("Picking a PV which to be deleted from the platform side")
        osd_pvs = get_deviceset_pvs()
        osd_pv = random.choice(osd_pvs)
        osd_pv_name = osd_pv.name
        # get the claim name
        logger.info(f"Getting the claim name for OSD PV {osd_pv_name}")
        claim_name = osd_pv.get().get("spec").get("claimRef").get("name")

        # Get the backing volume name
        logger.info(f"Getting the backing volume name for PV {osd_pv_name}")
        backing_volume = nodes.get_data_volumes(pvs=[osd_pv])[0]

        # Get the corresponding PVC
        logger.info(f"Getting the corresponding PVC of PV {osd_pv_name}")
        osd_pvcs = get_deviceset_pvcs()
        osd_pvcs_count = len(osd_pvcs)
        osd_pvc = [
            ds for ds in osd_pvcs if ds.get().get("metadata").get("name") == claim_name
        ][0]

        # Get the corresponding OSD pod and ID
        logger.info(f"Getting the OSD pod using PVC {osd_pvc.name}")
        osd_pods = get_osd_pods()
        osd_pods_count = len(osd_pods)
        osd_pod = [
            candidate
            for candidate in osd_pods
            if candidate.get()
            .get("metadata")
            .get("labels")
            .get(constants.CEPH_ROOK_IO_PVC_LABEL)
            == claim_name
        ][0]
        logger.info(f"OSD_POD {osd_pod.name}")
        osd_id = osd_pod.get().get("metadata").get("labels").get("ceph-osd-id")

        # Get the node that has the OSD pod running on
        logger.info(f"Getting the node that has the OSD pod {osd_pod.name} running on")
        osd_node = get_pod_node(osd_pod)
        osd_prepare_pods = get_osd_prepare_pods()
        osd_prepare_pod = [
            prepare_pod
            for prepare_pod in osd_prepare_pods
            if prepare_pod.get()
            .get("metadata")
            .get("labels")
            .get(constants.CEPH_ROOK_IO_PVC_LABEL)
            == claim_name
        ][0]
        osd_prepare_job_name = (
            osd_prepare_pod.get().get("metadata").get("labels").get("job-name")
        )
        osd_prepare_job = get_job_obj(osd_prepare_job_name)

        # Get the corresponding OSD deployment
        logger.info(f"Getting the OSD deployment for OSD PVC {claim_name}")
        osd_deployment = [
            deployment
            for deployment in get_osd_deployments()
            if deployment.get()
            .get("metadata")
            .get("labels")
            .get(constants.CEPH_ROOK_IO_PVC_LABEL)
            == claim_name
        ][0]
        osd_deployment_name = osd_deployment.name

        # Remove the volume from the node on the platform side
        logger.info(f"Deleting {backing_volume} from the platform side")
        nodes.detach_volume(backing_volume, osd_node)

        # Scale down OSD deployment
        logger.info(f"Scaling down OSD deployment {osd_deployment_name} to 0")
        ocp.OCP().exec_oc_cmd(f"scale --replicas=0 deployment/{osd_deployment_name}")

        # Force delete OSD pod if necessary
        osd_pod_name = osd_pod.name
        logger.info(f"Waiting for OSD pod {osd_pod.name} to get deleted")
        try:
            osd_pod.ocp.wait_for_delete(resource_name=osd_pod_name)
        except TimeoutError:
            osd_pod.delete(force=True)
            osd_pod.ocp.wait_for_delete(resource_name=osd_pod_name)

        # Run ocs-osd-removal job
        # The version is compared as a numeric tuple: converting it to float
        # would turn "4.10" into 4.1 and order it before 4.6
        ocp_version = tuple(
            int(part) for part in str(get_ocp_version()).strip().split(".")
        )
        removal_job_cleans_up = ocp_version >= (4, 6)
        if removal_job_cleans_up:
            cmd = f"process ocs-osd-removal -p FAILED_OSD_IDS={osd_id} -o yaml"
        else:
            cmd = f"process ocs-osd-removal -p FAILED_OSD_ID={osd_id} -o yaml"

        logger.info(f"Executing OSD removal job on OSD-{osd_id}")
        cluster_namespace = config.ENV_DATA["cluster_namespace"]
        ocp_obj = ocp.OCP(namespace=cluster_namespace)
        osd_removal_job_yaml = ocp_obj.exec_oc_cmd(cmd)
        osd_removal_job = OCS(**osd_removal_job_yaml)
        osd_removal_job.create(do_reload=False)

        # Get ocs-osd-removal pod name
        logger.info("Getting the ocs-osd-removal pod name")
        osd_removal_pod_name = get_osd_removal_pod_name(osd_id)
        # The pod is looked up in the namespace the removal job was created in
        osd_removal_pod_obj = get_pod_obj(
            osd_removal_pod_name, namespace=cluster_namespace
        )
        osd_removal_pod_obj.ocp.wait_for_resource(
            condition=constants.STATUS_COMPLETED, resource_name=osd_removal_pod_name
        )

        # Verify OSD removal from the ocs-osd-removal pod logs
        logger.info(f"Verifying removal of OSD from {osd_removal_pod_name} pod logs")
        logs = get_pod_logs(osd_removal_pod_name)
        pattern = f"purged osd.{osd_id}"
        assert re.search(pattern, logs)

        osd_pvc_name = osd_pvc.name

        if not removal_job_cleans_up:
            # Delete the OSD prepare job
            logger.info(f"Deleting OSD prepare job {osd_prepare_job_name}")
            osd_prepare_job.delete()
            osd_prepare_job.ocp.wait_for_delete(
                resource_name=osd_prepare_job_name, timeout=120
            )

            # Delete the OSD PVC
            logger.info(f"Deleting OSD PVC {osd_pvc_name}")
            osd_pvc.delete()
            osd_pvc.ocp.wait_for_delete(resource_name=osd_pvc_name)

            # Delete the OSD deployment
            logger.info(f"Deleting OSD deployment {osd_deployment_name}")
            osd_deployment.delete()
            osd_deployment.ocp.wait_for_delete(
                resource_name=osd_deployment_name, timeout=120
            )
        else:
            # If ocp version is '4.6' and above the osd removal job should
            # delete the OSD prepare job, OSD PVC, OSD deployment
            logger.info(f"Verifying deletion of OSD prepare job {osd_prepare_job_name}")
            osd_prepare_job.ocp.wait_for_delete(
                resource_name=osd_prepare_job_name, timeout=30
            )
            logger.info(f"Verifying deletion of OSD PVC {osd_pvc_name}")
            osd_pvc.ocp.wait_for_delete(resource_name=osd_pvc_name, timeout=30)
            logger.info(f"Verifying deletion of OSD deployment {osd_deployment_name}")
            osd_deployment.ocp.wait_for_delete(
                resource_name=osd_deployment_name, timeout=30
            )

        # Delete PV (only explicitly when it does not go away by itself)
        logger.info(f"Verifying deletion of PV {osd_pv_name}")
        try:
            osd_pv.ocp.wait_for_delete(resource_name=osd_pv_name)
        except TimeoutError:
            osd_pv.delete()
            osd_pv.ocp.wait_for_delete(resource_name=osd_pv_name)

        if not removal_job_cleans_up:
            # Delete the rook ceph operator pod to trigger reconciliation
            rook_operator_pod = get_operator_pods()[0]
            logger.info(f"deleting Rook Ceph operator pod {rook_operator_pod.name}")
            rook_operator_pod.delete()

        # Delete the OSD removal job
        logger.info(f"Deleting OSD removal job ocs-osd-removal-{osd_id}")
        osd_removal_job = get_job_obj(f"ocs-osd-removal-{osd_id}")
        osd_removal_job.delete()
        osd_removal_job.ocp.wait_for_delete(resource_name=f"ocs-osd-removal-{osd_id}")

        timeout = 600
        # Wait for OSD PVC to get created and reach Bound state
        logger.info("Waiting for a new OSD PVC to get created and reach Bound state")
        assert osd_pvc.ocp.wait_for_resource(
            timeout=timeout,
            condition=constants.STATUS_BOUND,
            selector=constants.OSD_PVC_GENERIC_LABEL,
            resource_count=osd_pvcs_count,
        ), (
            f"Cluster recovery failed after {timeout} seconds. "
            f"Expected to have {osd_pvcs_count} OSD PVCs in status Bound. Current OSD PVCs status: "
            f"{[pvc.ocp.get_resource(pvc.get().get('metadata').get('name'), 'STATUS') for pvc in get_deviceset_pvcs()]}"
        )
        # Wait for OSD pod to get created and reach Running state
        logger.info("Waiting for a new OSD pod to get created and reach Running state")
        assert osd_pod.ocp.wait_for_resource(
            timeout=timeout,
            condition=constants.STATUS_RUNNING,
            selector=constants.OSD_APP_LABEL,
            resource_count=osd_pods_count,
        ), (
            f"Cluster recovery failed after {timeout} seconds. "
            f"Expected to have {osd_pods_count} OSD pods in status Running. Current OSD pods status: "
            f"{[osd_pod.ocp.get_resource(current.get().get('metadata').get('name'), 'STATUS') for current in get_osd_pods()]}"
        )

        # We need to silence the old osd crash warning due to BZ https://bugzilla.redhat.com/show_bug.cgi?id=1896810
        # This is a workaround - issue for tracking: https://github.com/red-hat-storage/ocs-ci/issues/3438
        if removal_job_cleans_up:
            silence_osd_crash = cluster.wait_for_silence_ceph_osd_crash_warning(
                osd_pod_name
            )
            if not silence_osd_crash:
                logger.info("Didn't find ceph osd crash warning")

        # Validate cluster is still functional
        self.sanity_helpers.health_check(tries=100)
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
class TestAutomatedRecoveryFromFailedNodes(ManageTest):
    """
    Knip-678: automated recovery from failed nodes
    """

    @pytest.fixture(autouse=True)
    def teardown(self, request):
        """
        Register a finalizer that removes the "dc" label from all
        worker nodes

        """

        def finalizer():
            # Drop the label the test put on the worker nodes
            remove_label_from_worker_node(get_worker_nodes(), label_key="dc")

        request.addfinalizer(finalizer)

    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Attach a fresh Sanity helper to the test instance before each test

        """
        self.sanity_helpers = Sanity()

    @pytest.mark.parametrize(
        argnames=["interface"],
        argvalues=[
            pytest.param("rbd", marks=pytest.mark.polarion_id("OCS-2100")),
            pytest.param("cephfs", marks=pytest.mark.polarion_id("OCS-2101")),
        ],
    )
    def test_automated_recovery_from_failed_nodes_IPI_proactive(
        self, interface, pvc_factory, pod_factory, dc_pod_factory
    ):
        """
        Knip-678: automated recovery from failed nodes, proactive case - IPI

        Flow:
            - label the OSD nodes and start two DC app pods with background
              IO on them
            - pick a node that runs both an OSD and an app pod
            - add a new node to that node's machineset and label it
            - delete the machine of the picked node
            - wait for the DC app pods and the storage pods
            - create and delete resources, then run the health checks

        """
        # Nodes that currently host OSDs get the label the app pods select on
        osd_running_nodes = get_osd_running_nodes()
        log.info(f"OSDs are running on nodes {osd_running_nodes}")
        label_worker_node(osd_running_nodes, label_key="dc", label_value="fedora")

        # Start the DC app pods, each with IO running in the background
        log.info("Creating DC based app pods")
        if interface == "rbd":
            interface = constants.CEPHBLOCKPOOL
        else:
            interface = constants.CEPHFILESYSTEM
        dc_pod_obj = []
        for _ in range(2):
            app_pod = dc_pod_factory(
                interface=interface, node_selector={"dc": "fedora"}
            )
            pod.run_io_in_bg(app_pod, fedora_dc=True)
            dc_pod_obj.append(app_pod)

        # Nodes the app pods ended up on
        dc_pod_node_name = get_app_pod_running_nodes(dc_pod_obj)
        log.info(f"DC app pod running nodes are {dc_pod_node_name}")

        # Nodes hosting both an OSD and an app pod; at least one is required
        common_nodes = get_both_osd_and_app_pod_running_node(
            osd_running_nodes, dc_pod_node_name
        )
        assert len(common_nodes) > 0, "Common OSD and app running node(s) NOT found"
        log.info(f"Common OSD and app pod running nodes are {common_nodes}")

        # The first common node is the one whose machine gets deleted
        target_node = common_nodes[0]

        # Resolve node -> machine -> machineset
        machine_name = machine.get_machine_from_node_name(target_node)
        log.info(f"{target_node} associated machine is {machine_name}")

        machineset_name = machine.get_machineset_from_machine_name(machine_name)
        log.info(f"{target_node} associated machineset is {machineset_name}")

        # Bring up the replacement node first, then remove the old machine
        add_new_node_and_label_it(machineset_name)

        machine.delete_machine(machine_name)
        log.info(f"Successfully deleted machine {machine_name}")

        # The DC app pods of the removed node are expected to be created on
        # another running node; wait until all of them are running
        pod.wait_for_dc_app_pods_to_reach_running_state(dc_pod_obj)
        log.info("All the dc pods reached running state")

        pod.wait_for_storage_pods()

        # Check basic cluster functionality by creating resources
        # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
        # run IO and delete the resources
        sanity = self.sanity_helpers
        sanity.create_resources(pvc_factory, pod_factory)
        sanity.delete_resources()

        # Cluster and Ceph health checks
        sanity.health_check()
class TestNodeReplacement(ManageTest):
    """
    Knip-894 Node replacement - AWS-IPI-Reactive
    """

    @pytest.fixture(autouse=True)
    def teardown(self, request):
        """
        Register a finalizer that removes the "dc" label from all worker
        nodes and, when encryption at rest is configured, verifies the OSD
        encryption

        """

        def finalizer():
            worker_nodes = get_worker_nodes()
            # Removing created label on all worker nodes
            remove_label_from_worker_node(worker_nodes, label_key="dc")
            # Verify OSD encrypted
            if config.ENV_DATA.get("encryption_at_rest"):
                osd_encryption_verification()

        request.addfinalizer(finalizer)

    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Initialize Sanity instance

        """
        self.sanity_helpers = Sanity()

    @pytest.mark.parametrize(
        argnames=["interface", "failure"],
        argvalues=[
            pytest.param(
                *["rbd", "power off"], marks=pytest.mark.polarion_id("OCS-2118")
            ),
            pytest.param(
                *["rbd", "network failure"],
                marks=pytest.mark.polarion_id("OCS-2120"),
            ),
            pytest.param(
                *["cephfs", "power off"], marks=pytest.mark.polarion_id("OCS-2119")
            ),
            pytest.param(
                *["cephfs", "network failure"],
                marks=pytest.mark.polarion_id("OCS-2121"),
            ),
        ],
    )
    def test_node_replacement_reactive_aws_ipi(
        self,
        nodes,
        pvc_factory,
        pod_factory,
        dc_pod_factory,
        failure,
        interface,
        bucket_factory,
        rgw_bucket_factory,
    ):
        """
        Knip-894 Node replacement - AWS-IPI-Reactive

        Flow:
            - label the OSD nodes and start two DC app pods with background
              IO on them
            - pick a node that runs both an OSD and an app pod and induce
              the parametrized failure ("power off" or "network failure")
            - annotate and delete the machine of that node and wait for the
              new node to be ready
            - label the new node with the OCS storage label
            - wait for the DC app pods and the storage pods
            - create and delete resources, then run the health checks

        """
        # Get worker nodes
        initial_nodes = get_worker_nodes()

        # Get OSD running nodes
        osd_running_nodes = get_osd_running_nodes()
        log.info(f"OSDs are running on nodes {osd_running_nodes}")

        # Label osd nodes with fedora app
        label_worker_node(osd_running_nodes, label_key="dc", label_value="fedora")

        # Create DC app pods
        log.info("Creating DC based app pods")
        if interface == "rbd":
            interface = constants.CEPHBLOCKPOOL
        elif interface == "cephfs":
            interface = constants.CEPHFILESYSTEM
        dc_pod_obj = []
        for _ in range(2):
            dc_pod = dc_pod_factory(interface=interface, node_selector={"dc": "fedora"})
            pod.run_io_in_bg(dc_pod, fedora_dc=True)
            dc_pod_obj.append(dc_pod)

        # Get app pods running nodes
        dc_pod_node_name = get_app_pod_running_nodes(dc_pod_obj)
        log.info(f"DC app pod running nodes are {dc_pod_node_name}")

        # Get both osd and app pod running node
        common_nodes = get_both_osd_and_app_pod_running_node(
            osd_running_nodes, dc_pod_node_name
        )
        # Fail with a clear message instead of an IndexError further down
        # when no node runs both an OSD and an app pod
        assert len(common_nodes) > 0, "Common OSD and app running node(s) NOT found"
        log.info(f"Both OSD and app pod is running on nodes {common_nodes}")

        # Get the machine name using the node name
        machine_name = machine.get_machine_from_node_name(common_nodes[0])
        log.info(f"{common_nodes[0]} associated machine is {machine_name}")

        # Get the machineset name using machine name
        machineset_name = machine.get_machineset_from_machine_name(machine_name)
        log.info(f"{common_nodes[0]} associated machineset is {machineset_name}")

        # Get the failure node obj
        failure_node_obj = get_node_objs(node_names=[common_nodes[0]])

        # Induce failure on the selected failure node
        log.info(f"Inducing failure on node {failure_node_obj[0].name}")
        if failure == "power off":
            # Power off AWS worker node instance
            nodes.stop_nodes(failure_node_obj, wait=True)
            log.info(f"Successfully powered off node: {failure_node_obj[0].name}")
        elif failure == "network failure":
            # Induce Network failure
            node_network_failure([failure_node_obj[0].name])

        # Add annotation to the failed node
        annotation = "machine.openshift.io/exclude-node-draining=''"
        machine.add_annotation_to_machine(
            annotation=annotation, machine_name=machine_name
        )

        # Delete the machine
        machine.delete_machine(machine_name)
        log.info(f"Successfully deleted machine {machine_name}")

        # Wait for the new machine to spin
        log.info("Waiting for the new node to be in ready state")
        machine.wait_for_new_node_to_be_ready(machineset_name)

        # Get the node name of new spun node: whatever worker node was not
        # present when the test started
        nodes_after_new_spun_node = get_worker_nodes()
        new_spun_node = list(set(nodes_after_new_spun_node) - set(initial_nodes))
        # Fail with a clear message instead of an IndexError when labeling
        assert new_spun_node, "No new worker node found after replacing the machine"
        log.info(f"New spun node is {new_spun_node}")

        # Label it
        node_obj = ocp.OCP(kind="node")
        node_obj.add_label(
            resource_name=new_spun_node[0], label=constants.OPERATOR_NODE_LABEL
        )
        log.info(f"Successfully labeled {new_spun_node} with OCS storage label")

        # DC app pods on the failed node will get automatically created on other
        # running node. Waiting for all dc app pod to reach running state
        pod.wait_for_dc_app_pods_to_reach_running_state(dc_pod_obj, timeout=1200)
        log.info("All the dc pods reached running state")

        pod.wait_for_storage_pods()

        # Check basic cluster functionality by creating resources
        # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
        # run IO and delete the resources
        self.sanity_helpers.create_resources(
            pvc_factory, pod_factory, bucket_factory, rgw_bucket_factory
        )
        self.sanity_helpers.delete_resources()

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()
# Example #7
# 0
class TestDiskFailures(ManageTest):
    """
    Test class for detach and attach worker volume

    """
    def detach_volume_and_wait_for_attach(self, nodes, data_volume,
                                          worker_node):
        """
        Detach a data volume from a worker node and wait for the volume
        to be re-attached

        Args:
            nodes: The platform nodes object; its 'detach_volume' and
                'wait_for_volume_attach' methods are used
            data_volume (Volume): The volume to detach
            worker_node (OCS): The OCS object of the node the volume is
                attached to

        Raises:
            AWSTimeoutException: If the detach timed out and the exception
                message does not report the volume as 'in-use'
            AssertionError: If the volume was detached but
                'wait_for_volume_attach' did not report it as re-attached

        """
        try:
            # Detach volume (logging is done inside the function)
            nodes.detach_volume(data_volume, worker_node)
        except AWSTimeoutException as e:
            # An exception object does not support the 'in' operator, so the
            # message has to be inspected through its string form
            if "Volume state: in-use" not in str(e):
                raise
            logger.info(
                f"Volume {data_volume} re-attached successfully to worker"
                f" node {worker_node}")
        else:
            # Wait for worker volume to be re-attached automatically
            # to the node
            assert nodes.wait_for_volume_attach(data_volume), (
                f"Volume {data_volume} failed to be re-attached to worker "
                f"node {worker_node}")

    @pytest.fixture(autouse=True)
    def teardown(self, request, nodes):
        """
        Register a finalizer that recovers the cluster after the test

        The finalizer:
        - Restarts nodes that are in status NotReady, for situations in
          which the test failed before restarting the node after detach
          volume, which leaves nodes in NotReady
        - Restarts the node of every OSD pod whose first container is in
          CLBO state
        - Verifies OSD encryption when 'encryption_at_rest' is set in
          config.ENV_DATA
        - Clears crash warnings and OSD removal leftovers

        """
        def finalizer():
            not_ready_nodes = [
                n for n in node.get_node_objs() if n.ocp.get_resource_status(
                    n.name) == constants.NODE_NOT_READY
            ]
            if not_ready_nodes:
                # Warn only when there is actually something to report
                logger.warning(
                    f"Nodes in NotReady status found: {[n.name for n in not_ready_nodes]}"
                )
                nodes.restart_nodes(not_ready_nodes)
                node.wait_for_nodes_status()

            # Restart node if the osd stays at CLBO state.
            # The loop variable is not named 'pod' to avoid shadowing the
            # 'pod' module used elsewhere in this file.
            for osd_pod in get_osd_pods():
                container_state = (osd_pod.get().get("status").get(
                    "containerStatuses")[0].get("state"))
                # A container 'state' is a mapping such as
                # {"waiting": {"reason": ...}}, so comparing it directly to
                # a string can never match; read the waiting reason instead.
                # NOTE(review): assumes constants.STATUS_CLBO holds the
                # waiting reason string - confirm against constants.
                waiting_reason = None
                if isinstance(container_state, dict):
                    waiting_reason = (container_state.get("waiting")
                                      or {}).get("reason")
                # The direct comparison is kept for backward compatibility
                if constants.STATUS_CLBO in (container_state, waiting_reason):
                    node_obj = get_pod_node(osd_pod)
                    nodes.restart_nodes([node_obj])
                    node.wait_for_nodes_status([node_obj.name])

            # Verify OSD encrypted
            if config.ENV_DATA.get("encryption_at_rest"):
                osd_encryption_verification()

            logger.info("Clear crash warnings and osd removal leftovers")
            clear_crash_warning_and_osd_removal_leftovers()

        request.addfinalizer(finalizer)

    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Create the Sanity helper object used by the tests of this class

        """
        sanity = Sanity()
        self.sanity_helpers = sanity

    @cloud_platform_required
    @pytest.mark.polarion_id("OCS-1085")
    @bugzilla("1825675")
    def test_detach_attach_worker_volume(
        self,
        nodes,
        pvc_factory,
        pod_factory,
        bucket_factory,
        rgw_bucket_factory,
    ):
        """
        Detach and attach worker volume

        Steps:
        1. Detach one data volume from the worker node it is attached to
        2. Wait for the volume to be re-attached back to the worker node
        3. Validate cluster functionality by creating resources. Cluster
           and Ceph health are not checked at this point (as one node
           volume is detached, the cluster will be unhealthy)
        4. Restart the node so the volume will get re-mounted
        5. Run the cluster health check

        """
        # Pick the first data volume and the worker it is attached to
        volume = nodes.get_data_volumes()[0]
        attached_worker = nodes.get_node_by_attached_volume(volume)

        self.detach_volume_and_wait_for_attach(nodes, volume, attached_worker)

        # The ceph tools pod may have been running on the node whose volume
        # was detached. In that case a new ct pod has to start, so check
        # that the ct pod is alive before using the cluster.
        ct_pod_recovered = wait_for_ct_pod_recovery()
        assert ct_pod_recovered, (
            "Ceph tools pod failed to come up on another node")

        # Validate cluster is still functional
        self.sanity_helpers.create_resources(
            pvc_factory, pod_factory, bucket_factory, rgw_bucket_factory
        )

        # Restarting the instance gets the volume re-mounted
        nodes.restart_nodes([attached_worker])

        # W/A: For the investigation of BZ 1825675, timeout is increased to
        # see if cluster becomes healthy eventually
        # TODO: Remove 'tries=100'
        self.sanity_helpers.health_check(tries=100)

    @cloud_platform_required
    @pytest.mark.polarion_id("OCS-1086")
    @skipif_ibm_cloud
    def test_detach_attach_2_data_volumes(
        self,
        nodes,
        pvc_factory,
        pod_factory,
        bucket_factory,
        rgw_bucket_factory,
    ):
        """
        Detach and attach disk from 2 worker nodes

        Steps:
        1. Detach 2 of the data volumes from their worker nodes, waiting
           for each volume to be re-attached back to its worker node
        2. Restart the nodes so the volume will get re-mounted in each node
        3. Check cluster health and functionality to make sure detach,
           attach and restart did not affect the cluster

        """
        # (worker, volume) pairs for the first 2 data volumes
        worker_volume_pairs = [
            (nodes.get_node_by_attached_volume(volume), volume)
            for volume in nodes.get_data_volumes()[:2]
        ]

        # Detach each volume and wait for it to attach again
        for worker, volume in worker_volume_pairs:
            self.detach_volume_and_wait_for_attach(nodes, volume, worker)

        # Restart the instances so the volume will get re-mounted
        workers = [worker for worker, _ in worker_volume_pairs]
        nodes.restart_nodes(workers)

        # Validate cluster is still functional
        self.sanity_helpers.health_check()
        self.sanity_helpers.create_resources(
            pvc_factory, pod_factory, bucket_factory, rgw_bucket_factory
        )

    @bugzilla("1830702")
    @vsphere_platform_required
    @pytest.mark.polarion_id("OCS-2172")
    def test_recovery_from_volume_deletion(
        self,
        nodes,
        pvc_factory,
        pod_factory,
        bucket_factory,
        rgw_bucket_factory,
    ):
        """
        Test cluster recovery from disk deletion from the platform side.
        Based on documented procedure detailed in
        https://bugzilla.redhat.com/show_bug.cgi?id=1823183

        Runs the OSD device replacement flow and then validates cluster
        functionality by creating resources.

        """
        osd_operations.osd_device_replacement(nodes)

        sanity = self.sanity_helpers
        sanity.create_resources(
            pvc_factory, pod_factory, bucket_factory, rgw_bucket_factory
        )
Пример #8
0
class TestAutomatedRecoveryFromFailedNodes(ManageTest):
    """
    Knip-678 Automated recovery from failed nodes - Reactive
    """

    threads = []

    @pytest.fixture(autouse=True)
    def teardown(self, request):
        """
        Register a finalizer that removes the "dc" label from all worker
        nodes, joins the threads collected in 'self.threads' and runs a
        Ceph health check

        """
        def cleanup():
            # Removing created label on all worker nodes
            remove_label_from_worker_node(get_worker_nodes(), label_key="dc")
            for io_thread in self.threads:
                io_thread.join()
            ceph_health_check()

        request.addfinalizer(cleanup)

    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Set up the Sanity helpers instance for the tests of this class

        """
        helpers_instance = Sanity()
        self.sanity_helpers = helpers_instance

    @pytest.mark.parametrize(
        argnames=["interface", "failure"],
        argvalues=[
            pytest.param(
                *["rbd", "shutdown"],
                marks=[
                    pytest.mark.polarion_id("OCS-2102"),
                    pytest.mark.bugzilla("1845666"),
                ],
            ),
            pytest.param(*["rbd", "terminate"],
                         marks=pytest.mark.polarion_id("OCS-2103")),
            pytest.param(
                *["cephfs", "shutdown"],
                marks=[
                    pytest.mark.polarion_id("OCS-2104"),
                    pytest.mark.bugzilla("1845666"),
                ],
            ),
            pytest.param(*["cephfs", "terminate"],
                         marks=pytest.mark.polarion_id("OCS-2105")),
        ],
    )
    def test_automated_recovery_from_failed_nodes_IPI_reactive(
        self,
        nodes,
        pvc_factory,
        pod_factory,
        failure,
        dc_pod_factory,
        interface,
        bucket_factory,
        rgw_bucket_factory,
    ):
        """
        Knip-678 Automated recovery from failed nodes
        Reactive case - IPI

        Steps:
        - Label the OSD running nodes and create 2 DC app pods on them,
          with IO running in the background
        - Pick a node that runs both an OSD and an app pod
        - Add a new node to the machineset of the picked node and label it
        - Power off or terminate the picked node, according to 'failure'
        - Wait for the DC app pods and the storage pods
        - Create and delete resources, then run cluster and Ceph health
          checks

        Args:
            failure (str): "shutdown" or "terminate"
            interface (str): "rbd" or "cephfs"

        """
        # Get OSD running nodes
        osd_running_nodes = get_osd_running_nodes()
        log.info(f"OSDs are running on nodes {osd_running_nodes}")
        # Label osd nodes with fedora app
        label_worker_node(osd_running_nodes,
                          label_key="dc",
                          label_value="fedora")

        # Create DC app pods
        log.info("Creating DC based app pods")
        if interface == "rbd":
            interface = constants.CEPHBLOCKPOOL
        elif interface == "cephfs":
            interface = constants.CEPHFILESYSTEM
        dc_pod_obj = []
        for _ in range(2):
            dc_pod = dc_pod_factory(interface=interface,
                                    node_selector={"dc": "fedora"})
            # Keep the returned value so the teardown can join it
            self.threads.append(pod.run_io_in_bg(dc_pod, fedora_dc=True))
            dc_pod_obj.append(dc_pod)

        # Get app pods running nodes
        dc_pod_node_name = get_app_pod_running_nodes(dc_pod_obj)
        log.info(f"DC app pod running nodes are {dc_pod_node_name}")

        # Get both osd and app pod running node
        common_nodes = get_both_osd_and_app_pod_running_node(
            osd_running_nodes, dc_pod_node_name)
        log.info(f"Both OSD and app pod is running on nodes {common_nodes}")

        # Fail with a clear message instead of an IndexError below
        assert common_nodes, (
            "Failed to find a node running both an OSD and a DC app pod")
        failure_node_name = common_nodes[0]

        # Get the machine name using the node name
        machine_name = machine.get_machine_from_node_name(failure_node_name)
        log.info(f"{failure_node_name} associated machine is {machine_name}")

        # Get the machineset name using machine name
        machineset_name = machine.get_machineset_from_machine_name(
            machine_name)
        log.info(
            f"{failure_node_name} associated machineset is {machineset_name}")

        # Add a new node and label it
        add_new_node_and_label_it(machineset_name)
        # Get the failure node obj
        failure_node_obj = get_node_objs(node_names=[failure_node_name])

        # Induce failure on the selected failure node
        log.info(f"Inducing failure on node {failure_node_obj[0].name}")
        if failure == "shutdown":
            nodes.stop_nodes(failure_node_obj, wait=True)
            log.info(f"Successfully powered off node: "
                     f"{failure_node_obj[0].name}")
        elif failure == "terminate":
            nodes.terminate_nodes(failure_node_obj, wait=True)
            log.info(f"Successfully terminated node : "
                     f"{failure_node_obj[0].name} instance")

        try:
            # DC app pods on the failed node will get automatically created on other
            # running node. Waiting for all dc app pod to reach running state
            pod.wait_for_dc_app_pods_to_reach_running_state(dc_pod_obj,
                                                            timeout=720)
            log.info("All the dc pods reached running state")
            pod.wait_for_storage_pods()

        except ResourceWrongStatusException:
            # A node that was only powered off is terminated before the
            # failure is propagated
            if failure == "shutdown":
                nodes.terminate_nodes(failure_node_obj, wait=True)
                log.info(f"Successfully terminated node : "
                         f"{failure_node_obj[0].name} instance")
            raise

        # Check basic cluster functionality by creating resources
        # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
        # run IO and delete the resources
        self.sanity_helpers.create_resources(pvc_factory, pod_factory,
                                             bucket_factory,
                                             rgw_bucket_factory)
        self.sanity_helpers.delete_resources()

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()
Пример #9
0
class TestNodesMaintenance(ManageTest):
    """
    Test basic flows of maintenance (unschedule and drain) and
    activate operations, followed by cluster functionality and health checks

    """
    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Create the sanity helpers instance: SanityExternalCluster when
        storagecluster_independent_check() is true, Sanity otherwise

        """
        helpers_cls = (SanityExternalCluster
                       if storagecluster_independent_check() else Sanity)
        self.sanity_helpers = helpers_cls()

    @pytest.fixture(autouse=True)
    def health_checker(self):
        """
        Check Ceph health and skip the test when Ceph is not healthy

        """
        try:
            healthy = ceph_health_check_base()
        except CephHealthException as health_err:
            # skip because ceph is not in good health
            pytest.skip(str(health_err))
        else:
            if healthy:
                log.info("Health check passed")

    @tier1
    @pytest.mark.parametrize(
        argnames=["node_type"],
        argvalues=[
            pytest.param(*["worker"],
                         marks=pytest.mark.polarion_id("OCS-1269")),
            pytest.param(*["master"],
                         marks=pytest.mark.polarion_id("OCS-1272")),
        ],
    )
    def test_node_maintenance(
        self,
        reduce_and_resume_cluster_load,
        node_type,
        pvc_factory,
        pod_factory,
        bucket_factory,
        rgw_bucket_factory,
    ):
        """
        OCS-1269/OCS-1272:
        - Maintenance (mark as unschedulable and drain) 1 worker/master node
        - Check cluster functionality by creating and deleting resources
          (pools, storageclasses, PVCs, pods - both CephFS and RBD)
        - Mark the node as schedulable
        - Check cluster and Ceph health

        """
        # Select a single node of the requested type
        candidates = get_nodes(node_type=node_type, num_of_nodes=1)
        assert candidates, f"Failed to find a {node_type} node for the test"
        target_name = candidates[0].name

        # Put the node into maintenance (unschedule and drain)
        drain_nodes([target_name])

        # Basic cluster functionality: create resources, run IO and
        # delete the resources
        sanity = self.sanity_helpers
        sanity.create_resources(
            pvc_factory, pod_factory, bucket_factory, rgw_bucket_factory
        )
        sanity.delete_resources()

        # Make the node schedulable again
        schedule_nodes([target_name])

        # Cluster and Ceph health checks
        sanity.health_check(tries=90)

    @tier4
    @tier4b
    @skipif_bm
    @pytest.mark.parametrize(
        argnames=["node_type"],
        argvalues=[
            pytest.param(*["worker"],
                         marks=pytest.mark.polarion_id("OCS-1292")),
            pytest.param(*["master"],
                         marks=pytest.mark.polarion_id("OCS-1293")),
        ],
    )
    def test_node_maintenance_restart_activate(
        self,
        nodes,
        pvc_factory,
        pod_factory,
        node_type,
        bucket_factory,
        rgw_bucket_factory,
    ):
        """
        OCS-1292/OCS-1293:
        - Maintenance (mark as unschedulable and drain) 1 worker/master node
        - Restart the node
        - Mark the node as schedulable
        - Check cluster and Ceph health
        - Check cluster functionality by creating and deleting resources
          (pools, storageclasses, PVCs, pods - both CephFS and RBD)

        """
        # Get 1 node of the type needed for the test iteration
        typed_nodes = get_nodes(node_type=node_type, num_of_nodes=1)
        assert typed_nodes, f"Failed to find a {node_type} node for the test"
        typed_node = typed_nodes[0]
        typed_node_name = typed_node.name

        reboot_events_cmd = (
            f"get events -A --field-selector involvedObject.name="
            f"{typed_node_name},reason=Rebooted -o yaml")

        def count_reboot_events():
            # Number of items returned by the reboot events query
            return len(typed_node.ocp.exec_oc_cmd(reboot_events_cmd)["items"])

        # Find the number of reboot events in 'typed_node_name'
        num_events = count_reboot_events()

        # Maintenance the node (unschedule and drain). The function contains logging
        drain_nodes([typed_node_name])

        # Restarting the node
        nodes.restart_nodes(nodes=typed_nodes, wait=False)

        try:
            wait_for_nodes_status(
                node_names=[typed_node_name],
                status=constants.NODE_NOT_READY_SCHEDULING_DISABLED,
            )
        except ResourceWrongStatusException:
            # Sometimes, the node will be back to running state quickly so
            # that the status change won't be detected. Verify the node was
            # actually restarted by checking the reboot events count
            new_num_events = count_reboot_events()
            # The two sentences need a separating space, otherwise the
            # message reads "Reboot event not found.Node ..."
            assert new_num_events > num_events, (
                "Reboot event not found. "
                f"Node {typed_node_name} did not restart.")

        wait_for_nodes_status(
            node_names=[typed_node_name],
            status=constants.NODE_READY_SCHEDULING_DISABLED,
        )

        # Mark the node back to schedulable
        schedule_nodes([typed_node_name])

        # Check cluster and Ceph health and checking basic cluster
        # functionality by creating resources (pools, storageclasses,
        # PVCs, pods - both CephFS and RBD), run IO and delete the resources
        self.sanity_helpers.health_check()
        self.sanity_helpers.create_resources(pvc_factory, pod_factory,
                                             bucket_factory,
                                             rgw_bucket_factory)
        self.sanity_helpers.delete_resources()

    @tier3
    @pytest.mark.parametrize(
        argnames=["nodes_type"],
        argvalues=[
            pytest.param(*["worker"],
                         marks=pytest.mark.polarion_id("OCS-1273")),
            pytest.param(*["master"],
                         marks=pytest.mark.polarion_id("OCS-1271")),
        ],
    )
    def test_2_nodes_maintenance_same_type(self, nodes_type):
        """
        OCS-1273/OCS-1271:
        - Try draining 2 nodes from the same type - should fail
        - Mark the nodes as schedulable
        - Check cluster and Ceph health

        """
        # Select 2 nodes of the requested type
        node_pair = get_nodes(node_type=nodes_type, num_of_nodes=2)
        assert node_pair, f"Failed to find a {nodes_type} node for the test"

        typed_node_names = []
        for selected in node_pair:
            typed_node_names.append(selected.name)

        # Draining both nodes is expected to time out
        try:
            drain_nodes(typed_node_names)
        except TimeoutExpired:
            log.info(
                f"Draining of nodes {typed_node_names} failed as expected")

        # Make the nodes schedulable again
        schedule_nodes(typed_node_names)

        # Cluster and Ceph health checks
        self.sanity_helpers.health_check()

    @tier2
    @pytest.mark.polarion_id("OCS-1274")
    def test_2_nodes_different_types(
        self,
        pvc_factory,
        pod_factory,
        bucket_factory,
        rgw_bucket_factory,
    ):
        """
        OCS-1274:
        - Maintenance (mark as unschedulable and drain) 1 worker node and 1
          master node
        - Check cluster functionality by creating and deleting resources
          (pools, storageclasses, PVCs, pods - both CephFS and RBD)
        - Mark the nodes as schedulable
        - Check cluster and Ceph health

        """
        # Select one node of each type
        selected_nodes = []
        for node_type in ("worker", "master"):
            selected_nodes.append(
                get_nodes(node_type=node_type, num_of_nodes=1)[0])
        assert selected_nodes, "Failed to find a nodes for the test"

        node_names = [selected.name for selected in selected_nodes]

        # Put the nodes into maintenance (unschedule and drain)
        drain_nodes(node_names)

        # Basic cluster functionality: create resources, run IO and
        # delete the resources
        sanity = self.sanity_helpers
        sanity.create_resources(
            pvc_factory, pod_factory, bucket_factory, rgw_bucket_factory
        )
        sanity.delete_resources()

        # Make the nodes schedulable again
        schedule_nodes(node_names)

        # Cluster and Ceph health checks
        sanity.health_check()

    @tier4
    @tier4b
    @aws_platform_required
    @ipi_deployment_required
    @pytest.mark.parametrize(
        argnames=["interface"],
        argvalues=[
            pytest.param(*["rbd"], marks=pytest.mark.polarion_id("OCS-2128")),
            pytest.param(*["cephfs"],
                         marks=pytest.mark.polarion_id("OCS-2129")),
        ],
    )
    def test_simultaneous_drain_of_two_ocs_nodes(
        self,
        pvc_factory,
        pod_factory,
        dc_pod_factory,
        interface,
        bucket_factory,
        rgw_bucket_factory,
    ):
        """
        OCS-2128/OCS-2129:
        - Create PVCs and start IO on DC based app pods
        - Add one extra node in two of the AZs and label the nodes
          with OCS storage label
        - Maintenance (mark as unschedulable and drain) 2 worker nodes
          simultaneously
        - Confirm that OCS and DC pods are in running state
        - Remove unscheduled nodes
        - Check cluster functionality by creating resources
          (pools, storageclasses, PVCs, pods - both CephFS and RBD)
        - Check cluster and Ceph health

        Args:
            interface (str): "rbd" selects constants.CEPHBLOCKPOOL, any
                other value selects constants.CEPHFILESYSTEM

        """
        # Get OSD running nodes
        osd_running_worker_nodes = get_osd_running_nodes()
        log.info(f"OSDs are running on nodes {osd_running_worker_nodes}")

        # Label osd nodes with fedora app
        label_worker_node(osd_running_worker_nodes,
                          label_key="dc",
                          label_value="fedora")
        log.info("Successfully labeled worker nodes with {dc:fedora}")

        # Create DC app pods
        log.info("Creating DC based app pods and starting IO in background")
        interface = (constants.CEPHBLOCKPOOL
                     if interface == "rbd" else constants.CEPHFILESYSTEM)
        dc_pod_obj = []
        for _ in range(2):
            dc_pod = dc_pod_factory(interface=interface,
                                    node_selector={"dc": "fedora"})
            pod.run_io_in_bg(dc_pod, fedora_dc=True)
            dc_pod_obj.append(dc_pod)

        # Only the first 2 OSD running nodes are drained
        nodes_to_drain = osd_running_worker_nodes[:2]

        # Get the machine name using the node name
        machine_names = [
            machine.get_machine_from_node_name(osd_running_worker_node)
            for osd_running_worker_node in nodes_to_drain
        ]
        log.info(f"{osd_running_worker_nodes} associated "
                 f"machine are {machine_names}")

        # Get the machineset name using machine name
        machineset_names = [
            machine.get_machineset_from_machine_name(machine_name)
            for machine_name in machine_names
        ]
        log.info(f"{osd_running_worker_nodes} associated machineset "
                 f"is {machineset_names}")

        # Add a new node and label it
        add_new_node_and_label_it(machineset_names[0])
        add_new_node_and_label_it(machineset_names[1])

        # Drain 2 nodes
        drain_nodes(nodes_to_drain)

        # Check the pods should be in running state.
        # Pods whose name contains one of these parts are excluded from the
        # check. Each part has to be tested separately: the expression
        # ("-1-deploy" or "ocs-deviceset") evaluates to "-1-deploy" only.
        excluded_name_parts = ("-1-deploy", "ocs-deviceset")
        all_pod_obj = pod.get_all_pods(wait=True)
        for pod_obj in all_pod_obj:
            if any(part in pod_obj.name for part in excluded_name_parts):
                continue
            try:
                helpers.wait_for_resource_state(
                    resource=pod_obj,
                    state=constants.STATUS_RUNNING,
                    timeout=200)
            except ResourceWrongStatusException:
                # 'rook-ceph-crashcollector' on the failed node stucks at
                # pending state. BZ 1810014 tracks it.
                # Ignoring 'rook-ceph-crashcollector' pod health check as
                # WA and deleting its deployment so that the pod
                # disappears. Will revert this WA once the BZ is fixed
                if "rook-ceph-crashcollector" in pod_obj.name:
                    ocp_obj = ocp.OCP(
                        namespace=defaults.ROOK_CLUSTER_NAMESPACE)
                    pod_name = pod_obj.name
                    deployment_name = "-".join(pod_name.split("-")[:-2])
                    command = f"delete deployment {deployment_name}"
                    ocp_obj.exec_oc_cmd(command=command)
                    log.info(f"Deleted deployment for pod {pod_obj.name}")
                else:
                    # The failure is still tolerated, but no longer
                    # silently
                    log.warning(
                        f"Pod {pod_obj.name} did not reach "
                        f"{constants.STATUS_RUNNING} state")

        # DC app pods on the drained node will get automatically created on other
        # running node in same AZ. Waiting for all dc app pod to reach running state
        pod.wait_for_dc_app_pods_to_reach_running_state(dc_pod_obj,
                                                        timeout=1200)
        log.info("All the dc pods reached running state")

        # Remove unscheduled nodes
        # In scenarios where the drain is attempted on >3 worker setup,
        # post completion of drain we are removing the unscheduled nodes so
        # that we maintain 3 worker nodes.
        log.info(f"Removing unscheduled nodes {nodes_to_drain}")
        remove_node_objs = get_node_objs(nodes_to_drain)
        remove_nodes(remove_node_objs)

        # Check basic cluster functionality by creating resources
        # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
        # run IO and delete the resources
        self.sanity_helpers.create_resources(pvc_factory, pod_factory,
                                             bucket_factory,
                                             rgw_bucket_factory)
        self.sanity_helpers.delete_resources()

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()

    @bugzilla("1861104")
    @pytest.mark.polarion_id("OCS-2524")
    @tier4b
    def test_pdb_check_simultaneous_node_drains(
        self,
        pvc_factory,
        pod_factory,
        bucket_factory,
        rgw_bucket_factory,
        node_drain_teardown,
    ):
        """
        - Check for OSD PDBs before drain
        - Maintenance (mark as unschedulable and drain) 2 worker nodes with
          a delay of 30 secs between the drains
        - Drain will be completed on worker node A
        - Drain will be pending on worker node B due to blocking PDBs
        - Check the OSD PDBs
        - Mark the node A as schedulable
        - Let drain finish on Node B
        - Mark the node B as schedulable
        - Check cluster and Ceph health
        - Check cluster functionality by creating and deleting resources

        """
        # No blocking PDBs may exist before the drain operation
        blocking_pdb_exists = validate_existence_of_blocking_pdb()
        assert not blocking_pdb_exists, (
            "Blocking PDBs exist, Can't perform drain")

        # Select the 2 worker nodes to drain
        workers = get_nodes(num_of_nodes=2)
        assert len(workers) == 2, "Failed to find worker nodes for the test"
        first_node, second_node = [worker.name for worker in workers]

        # Drain Node A and validate blocking PDBs
        drain_nodes([first_node])
        blocking_pdb_exists = validate_existence_of_blocking_pdb()
        assert blocking_pdb_exists, "Blocking PDBs not created post drain"

        # Delay between the 2 drains. The drain of Node B is expected to
        # stay pending due to blocking PDBs
        time.sleep(30)
        try:
            drain_nodes([second_node])
        except TimeoutExpired:
            # Mark the node-A back to schedulable and let drain finish in Node-B
            schedule_nodes([first_node])

        time.sleep(40)

        # Validate OSD PDBs
        blocking_pdb_exists = validate_existence_of_blocking_pdb()
        assert blocking_pdb_exists, (
            "Blocking PDBs not created post second drain")

        # Mark the node-B back to schedulable and recover the cluster
        schedule_nodes([second_node])

        # Poll until the blocking PDB check returns False; a timeout is
        # only logged
        pdb_sampler = TimeoutSampler(
            timeout=100,
            sleep=10,
            func=validate_existence_of_blocking_pdb,
        )
        blocking_pdb_cleared = pdb_sampler.wait_for_func_status(result=False)
        if not blocking_pdb_cleared:
            log.error("Blocking PDBs still exist")

        # wait for storage pods
        pod.wait_for_storage_pods()

        # Cluster and Ceph health checks
        sanity = self.sanity_helpers
        sanity.health_check(tries=50)

        # Basic cluster functionality: create resources, run IO and
        # delete the resources
        sanity.create_resources(
            pvc_factory, pod_factory, bucket_factory, rgw_bucket_factory
        )
        sanity.delete_resources()
Пример #10
0
class TestNodesRestart(ManageTest):
    """
    Test ungraceful cluster shutdown
    """
    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Build the Sanity helper used by the tests of this class

        The fixture is autouse, so ``self.sanity_helpers`` is set for every
        test method without the test requesting it explicitly.

        """
        helper = Sanity()
        self.sanity_helpers = helper

    @pytest.fixture(autouse=True)
    def teardown(self, request, nodes):
        """
        Register a finalizer that makes sure all nodes are up again

        Args:
            request: pytest request object the finalizer is registered on
            nodes: platform nodes fixture whose
                restart_nodes_by_stop_and_start_teardown() runs at teardown

        """
        def restore_nodes():
            nodes.restart_nodes_by_stop_and_start_teardown()

        request.addfinalizer(restore_nodes)

    @pytest.mark.parametrize(
        argnames=["force"],
        argvalues=[
            pytest.param(True, marks=pytest.mark.polarion_id("OCS-894")),
            pytest.param(
                False,
                marks=[
                    pytest.mark.polarion_id("OCS-895"),
                    cloud_platform_required,
                ],
            ),
        ],
    )
    def test_nodes_restart(self, nodes, pvc_factory, pod_factory, force,
                           bucket_factory, rgw_bucket_factory):
        """
        Stop and start the nodes from the platform layer (i.e, EC2 instances,
        VMWare VMs) and validate the cluster afterwards

        The nodes returned by get_node_objs() are restarted by stop and start,
        then the sanity health check runs and resources are created.

        Args:
            force (bool): forwarded as ``force`` to
                restart_nodes_by_stop_and_start (True for OCS-894,
                False for OCS-895)

        """
        nodes_to_restart = get_node_objs()
        nodes.restart_nodes_by_stop_and_start(
            nodes=nodes_to_restart, force=force
        )

        sanity = self.sanity_helpers
        sanity.health_check()
        sanity.create_resources(
            pvc_factory, pod_factory, bucket_factory, rgw_bucket_factory
        )

    @bugzilla("1754287")
    @pytest.mark.polarion_id("OCS-2015")
    def test_rolling_nodes_restart(self, nodes, pvc_factory, pod_factory,
                                   bucket_factory, rgw_bucket_factory):
        """
        Restart the nodes one after the other and check health in between

        Each node is restarted without waiting (wait=False); the sanity health
        check (with cluster_check=False) runs after every restart. Once all
        nodes were restarted, resources are created through the sanity helpers.

        """
        for ocp_node in get_node_objs():
            nodes.restart_nodes(nodes=[ocp_node], wait=False)
            self.sanity_helpers.health_check(cluster_check=False, tries=60)

        self.sanity_helpers.create_resources(
            pvc_factory, pod_factory, bucket_factory, rgw_bucket_factory
        )

    @pytest.mark.parametrize(
        argnames=["interface", "operation"],
        argvalues=[
            pytest.param(
                "rbd",
                "create_resources",
                marks=pytest.mark.polarion_id("OCS-1138"),
            ),
            pytest.param(
                "rbd",
                "delete_resources",
                marks=pytest.mark.polarion_id("OCS-1241"),
            ),
            pytest.param(
                "cephfs",
                "create_resources",
                marks=pytest.mark.polarion_id("OCS-1139"),
            ),
            pytest.param(
                "cephfs",
                "delete_resources",
                marks=pytest.mark.polarion_id("OCS-1242"),
            ),
        ],
    )
    def test_pv_provisioning_under_degraded_state_stop_provisioner_pod_node(
        self,
        nodes,
        pvc_factory,
        pod_factory,
        interface,
        operation,
        bucket_factory,
        rgw_bucket_factory,
    ):
        """
        Test PV provisioning under degraded state -
        stop the node that has the provisioner pod running on

        Flow shared by all parametrizations:
        - (delete_resources only) Create the resources whose deletion is
          tested later
        - Pick the first provisioner pod of the interface; if it runs on the
          same node as the first rook operator pod, pick the second one
        - Stop the worker node that the chosen provisioner pod runs on
        - Wait for that pod to reach Terminating status, then for 2
          provisioner pods of the interface to reach Running status
        - Validate cluster functionality, without checking cluster and Ceph
          health, by running the requested operation
        - Start the worker node
        - Check cluster and Ceph health

        Polarion IDs:
        - OCS-1138: RBD provisioner, create resources
        - OCS-1241: RBD provisioner, delete resources
        - OCS-1139: CephFS provisioner, create resources
        - OCS-1242: CephFS provisioner, delete resources

        """
        def node_name_of(node_obj):
            return node_obj.get().get("metadata").get("name")

        deleting = operation == "delete_resources"
        if deleting:
            # The resources must exist before their deletion can be tested
            self.sanity_helpers.create_resources(
                pvc_factory, pod_factory, bucket_factory, rgw_bucket_factory
            )

        # Provisioner pods of the requested interface
        is_rbd = interface == "rbd"
        candidates = None
        if is_rbd:
            candidates = pod.get_rbdfsplugin_provisioner_pods()
        elif interface == "cephfs":
            candidates = pod.get_cephfsplugin_provisioner_pods()
        chosen_pod = candidates[0]

        # Avoid stopping the node that runs the rook operator pod
        candidate_node = pod.get_pod_node(chosen_pod)
        operator_node = pod.get_pod_node(pod.get_operator_pods()[0])
        if node_name_of(operator_node) == node_name_of(candidate_node):
            chosen_pod = candidates[1]

        chosen_pod_name = chosen_pod.name
        logger.info(f"{interface} provisioner pod found: {chosen_pod_name}")

        # Node that hosts the chosen provisioner pod
        provisioner_node = pod.get_pod_node(chosen_pod)
        provisioner_node_name = node_name_of(provisioner_node)
        logger.info(
            f"{interface} provisioner pod is running on node {provisioner_node_name}"
        )

        nodes.stop_nodes(nodes=[provisioner_node])

        # Label used below to wait for the provisioner pods to be Running
        if is_rbd:
            selector = constants.CSI_RBDPLUGIN_PROVISIONER_LABEL
        else:
            selector = constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL

        # The pod on the stopped node is expected to go Terminating first
        logger.info(
            f"Waiting for pod {chosen_pod_name} to reach status Terminating"
        )
        reached_terminating = chosen_pod.ocp.wait_for_resource(
            timeout=600,
            resource_name=chosen_pod.name,
            condition=constants.STATUS_TERMINATING,
        )
        assert (
            reached_terminating
        ), f"{interface} provisioner pod failed to reach status Terminating"
        logger.info(f"Pod {chosen_pod_name} has reached status Terminating")

        # After this change https://github.com/rook/rook/pull/3642/, there are
        # 2 provisioners for each interface
        logger.info(
            f"Waiting for {interface} provisioner pod to reach status Running"
        )
        reached_running = chosen_pod.ocp.wait_for_resource(
            timeout=600,
            condition=constants.STATUS_RUNNING,
            selector=selector,
            resource_count=2,
        )
        assert (
            reached_running
        ), f"{interface} provisioner pod failed to reach status Running"
        logger.info(f"{interface} provisioner pod has reached status Running")

        if operation == "create_resources":
            # Cluster validation (resources creation and IO running)
            self.sanity_helpers.create_resources(
                pvc_factory, pod_factory, bucket_factory, rgw_bucket_factory
            )
        elif deleting:
            # Cluster validation (deletion of the resources created earlier)
            self.sanity_helpers.delete_resources()

        nodes.start_nodes(nodes=[provisioner_node])

        # Checking cluster and Ceph health
        self.sanity_helpers.health_check()

    @pytest.mark.parametrize(
        argnames=["operation"],
        argvalues=[
            pytest.param(
                "create_resources",
                marks=[pytest.mark.polarion_id("OCS-2016")],
            ),
            pytest.param(
                "delete_resources",
                marks=[pytest.mark.polarion_id("OCS-2017")],
            ),
        ],
    )
    def test_pv_provisioning_under_degraded_state_stop_rook_operator_pod_node(
        self,
        nodes,
        pvc_factory,
        pod_factory,
        operation,
        bucket_factory,
        rgw_bucket_factory,
    ):
        """
        Test PV provisioning under degraded state -
        stop the node that has the rook operator pod running on

        Flow shared by both parametrizations:
        - (delete_resources only) Create the resources whose deletion is
          tested later
        - Stop 1 worker node that has the rook ceph operator pod running on
        - Wait for that pod to reach Terminating status, then for 1 rook
          operator pod to reach Running status
        - Wait for the Ceph tools pod recovery
        - Validate cluster functionality, without checking cluster and Ceph
          health, by running the requested operation
        - Start the worker node
        - Check cluster and Ceph health

        Polarion IDs:
        - OCS-2016: create resources and run IO
        - OCS-2017: delete resources

        """
        deleting = operation == "delete_resources"
        if deleting:
            # The resources must exist before their deletion can be tested
            self.sanity_helpers.create_resources(
                pvc_factory, pod_factory, bucket_factory, rgw_bucket_factory
            )

        operator_pod = pod.get_operator_pods()[0]
        operator_pod_name = operator_pod.name
        logger.info(f"rook operator pod found: {operator_pod_name}")

        # Node that hosts the rook operator pod
        operator_node = pod.get_pod_node(operator_pod)
        operator_node_name = operator_node.get().get("metadata").get("name")
        logger.info(
            f"{operator_pod_name} pod is running on node {operator_node_name}"
        )

        nodes.stop_nodes(nodes=[operator_node])

        # Label used below to wait for the operator pod to be Running
        selector = constants.OPERATOR_LABEL

        # The pod on the stopped node is expected to go Terminating first
        logger.info(
            f"Waiting for pod {operator_pod_name} to reach status Terminating"
        )
        reached_terminating = operator_pod.ocp.wait_for_resource(
            timeout=600,
            resource_name=operator_pod_name,
            condition=constants.STATUS_TERMINATING,
        )
        assert (
            reached_terminating
        ), "rook operator pod failed to reach status Terminating"
        logger.info(f"Pod {operator_pod_name} has reached status Terminating")

        # Then a rook operator pod has to be started and reach Running
        logger.info(
            f"Waiting for pod {operator_pod_name} to reach status Running"
        )
        reached_running = operator_pod.ocp.wait_for_resource(
            timeout=600,
            condition=constants.STATUS_RUNNING,
            selector=selector,
            resource_count=1,
        )
        assert reached_running, "rook operator pod failed to reach status Running"
        logger.info("rook operator pod has reached status Running")

        ct_pod_recovered = wait_for_ct_pod_recovery()
        assert ct_pod_recovered, "Ceph tools pod failed to come up on another node"

        if operation == "create_resources":
            # Cluster validation (resources creation and IO running)
            self.sanity_helpers.create_resources(
                pvc_factory, pod_factory, bucket_factory, rgw_bucket_factory
            )
        elif deleting:
            # Cluster validation (deletion of the resources created earlier)
            self.sanity_helpers.delete_resources()

        nodes.start_nodes(nodes=[operator_node])

        # Checking cluster and Ceph health
        self.sanity_helpers.health_check()

    @skipif_no_lso
    @bugzilla("1873938")
    @pytest.mark.polarion_id("OCS-2448")
    def test_pv_after_reboot_node(self, nodes):
        """
        Verify unexpected PV is not created after node reboot on LSO cluster

        Each of 3 worker nodes is restarted in turn. After every restart the
        PV names are compared with the ones collected before the test; the
        test fails if any new PV uses the "localblock" storage class.

        """
        pv_before_reset = get_pv_names()
        worker_nodes = get_nodes(
            node_type=constants.WORKER_MACHINE, num_of_nodes=3
        )
        ocp_obj = OCP(kind=constants.PV)

        for worker_node in worker_nodes:
            # Restart one worker node and wait for it
            nodes.restart_nodes(nodes=[worker_node], wait=True)
            self.sanity_helpers.health_check(cluster_check=False, tries=60)

            logger.info(f"Verify PV after reboot {worker_node}")
            new_pv_names = set(get_pv_names()) - set(pv_before_reset)
            # Only new PVs of the localblock storage class are unexpected
            pv_new = [
                pv_name
                for pv_name in new_pv_names
                if ocp_obj.get(resource_name=pv_name)["spec"]["storageClassName"]
                == "localblock"
            ]
            assert (
                len(pv_new) == 0
            ), f"Unexpected PV {pv_new} created after reboot {worker_node}"

        logger.info("SUCCESS - No new PV was created.")
class TestRestoreCephMonQuorum(E2ETest):
    """
    The objective of this test case is to verify that
    mons can be brought to quorum successfully by
    following the steps mentioned in
    https://access.redhat.com/solutions/5898541, when
    mon(s) are out of quorum on a cluster

    """

    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Build the Sanity helper used by the mon quorum test

        The fixture is autouse, so ``self.sanity_helpers`` is set for every
        test method without the test requesting it explicitly.

        """
        helper = Sanity()
        self.sanity_helpers = helper

    @pytest.fixture(autouse=True)
    def base_set_up(self):
        """
        Placeholder for the test preconditions (nothing is set up yet)

        The fixture body currently holds only ToDo notes. The intended
        preconditions are:

        a) Have a cluster with FIPS and Hugepages enabled
        b) Create some resources (s3 objects, buckets) and write data to them
        c) Create RGW kafka notification & see the objects are notified to kafka
        d) Perform mcg bucket replication (bidirectional) and see the objects are synced
        e) Perform noobaa caching
        f) Have snapshots and clones for a few PVCs
        g) Have background IOs (fio pods) running
        """

        # ToDo: Create objects and write data to them

        # ToDo: Create RGW kafka notification

        # ToDo: Perform mcg bucket replication

        # ToDo: Noobaa caching

        # ToDo: Create PVC, snapshot and restore into new PVC, clone PVC

        # ToDo: Run background IOs

    @pytest.fixture(autouse=True)
    def rook_operator_teardown(self, request):
        """
        Register a finalizer that restores the rook-ceph-operator and the mons

        At teardown the finalizer:
        - Scales the rook-ceph-operator deployment back to 1 replica when its
          spec reports a different replica count, and fails if the scale up
          is reported as unsuccessful
        - Waits up to 60 seconds for 3 mon pods to be Running
        - If that wait raises TimeoutExpiredError or
          ResourceWrongStatusException, deletes the rook-ceph-operator
          resource and every mon pod, then waits up to 360 seconds for
          3 mon pods to be Running

        Args:
            request: pytest request object the finalizer is registered on

        """
        def finalizer():
            op_obj = OCP(
                kind=constants.DEPLOYMENT,
                namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
            )
            pod_obj = OCP(
                kind=constants.POD, namespace=constants.OPENSHIFT_STORAGE_NAMESPACE
            )
            operator_obj = op_obj.get(resource_name=constants.ROOK_CEPH_OPERATOR)
            if operator_obj.get("spec").get("replicas") != 1:
                # The result has to be asserted: a bare `call(), "message"`
                # only builds a tuple and silently ignores a failed scale up
                assert modify_deployment_replica_count(
                    deployment_name=constants.ROOK_CEPH_OPERATOR, replica_count=1
                ), "Failed to scale up rook-ceph-operator to 1"

            log.info("Validate all mons are up and running")
            try:
                pod_obj.wait_for_resource(
                    condition=constants.STATUS_RUNNING,
                    selector=constants.MON_APP_LABEL,
                    resource_count=3,
                    timeout=60,
                    sleep=5,
                )
            except (TimeoutExpiredError, ResourceWrongStatusException) as ex:
                # Mons did not come up on their own: remove the operator
                # resource and the mon pods, then wait again with a longer
                # timeout
                log.warning(ex)
                op_obj.delete(resource_name=constants.ROOK_CEPH_OPERATOR)
                for mon_pod in get_mon_pods():
                    mon_pod.delete()
                pod_obj.wait_for_resource(
                    condition=constants.STATUS_RUNNING,
                    selector=constants.MON_APP_LABEL,
                    resource_count=3,
                    timeout=360,
                    sleep=5,
                )
            log.info("All mons are up and running")

        request.addfinalizer(finalizer)

    def test_restore_ceph_mon_quorum(
        self, pvc_factory, pod_factory, bucket_factory, rgw_bucket_factory
    ):
        """
        Take the mons out of quorum, recover them and validate the cluster

        Test Procedure:

        1) With all the preconditions met, take mons out of quorum
        2) Follow the procedure mentioned in
           https://access.redhat.com/solutions/5898541 to recover the quorum
        3) Wait for the storage pods, archive the ceph crash reports and
           run the health check
        4) Make sure basic functionality is working fine by creating and
           deleting resources

        Checking data access and integrity of block, object and file based
        data (No DU/DL/DC) is still a ToDo.

        """

        # Take mons out of the quorum and confirm it
        quorum_loss_state = induce_mon_quorum_loss()
        (
            self.mon_pod_obj_list,
            mon_pod_running,
            ceph_mon_daemon_id,
        ) = quorum_loss_state

        # Recover mon quorum
        recover_mon_quorum(
            self.mon_pod_obj_list,
            mon_pod_running,
            ceph_mon_daemon_id,
        )

        # Validate storage pods are running
        wait_for_storage_pods()

        # Remove crash list from ceph health
        log.info("Silence the ceph warnings by “archiving” the crash")
        get_ceph_tools_pod().exec_ceph_cmd(
            ceph_cmd="ceph crash archive-all", format=None
        )
        log.info("Removed ceph crash warnings. Check for ceph and cluster health")

        # Validate cluster health
        sanity = self.sanity_helpers
        sanity.health_check(tries=40)

        # ToDo: Common system test case validation: Check for data integrity and corruption after mon recovery

        # Basic functionality: create the resources, then delete them
        log.info("Creating Resources using sanity helpers")
        sanity.create_resources(
            pvc_factory, pod_factory, bucket_factory, rgw_bucket_factory
        )
        sanity.delete_resources()
# Example #12
# 0
class TestNoobaaBackupAndRecovery(E2ETest):
    """
    Test to verify noobaa backup and recovery

    """
    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Build the Sanity helper used by the noobaa backup and recovery test

        The fixture is autouse, so ``self.sanity_helpers`` is set for every
        test method without the test requesting it explicitly.

        """
        helper = Sanity()
        self.sanity_helpers = helper

    @pytest.fixture(autouse=True)
    def teardown_fixture(self, request):
        """
        Register a finalizer that restores the noobaa DB statefulset and
        deletes the restored PVC

        At teardown the finalizer:
        - Scales the statefulset named by ``self.noobaa_db_sst_name`` back to
          1 replica when its spec reports a different replica count, and
          fails if the scale up is reported as unsuccessful
        - Deletes ``self.restore_pvc_obj``; a CommandFailed error saying the
          PVC was not found is ignored, any other one is re-raised

        Both attributes are set by the test body. When the test stopped before
        setting one of them, the matching cleanup step is skipped.

        Args:
            request: pytest request object the finalizer is registered on

        """
        def finalizer():
            # Attributes are read with a default so that an early test failure
            # is not masked by an AttributeError raised during teardown
            sst_name = getattr(self, "noobaa_db_sst_name", None)
            if sst_name is not None:
                # Get the statefulset replica count
                sst_obj = OCP(
                    kind=constants.STATEFULSET,
                    namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
                )
                noobaa_db_sst_obj = sst_obj.get(resource_name=sst_name)
                if noobaa_db_sst_obj["spec"]["replicas"] != 1:
                    # The result has to be asserted: a bare `call(), "message"`
                    # only builds a tuple and ignores a failed scale up
                    assert modify_statefulset_replica_count(
                        statefulset_name=sst_name, replica_count=1
                    ), f"Failed to scale up the statefulset {sst_name}"

            restore_pvc_obj = getattr(self, "restore_pvc_obj", None)
            if restore_pvc_obj is None:
                return
            try:
                restore_pvc_obj.delete()
            except CommandFailed as ex:
                # The test deletes the restored PVC itself on the happy path,
                # so "not found" is expected here
                if f'"{restore_pvc_obj.name}" not found' not in str(ex):
                    raise

        request.addfinalizer(finalizer)

    def test_noobaa_db_backup_and_recovery(
        self,
        pvc_factory,
        pod_factory,
        snapshot_factory,
        bucket_factory,
        rgw_bucket_factory,
    ):
        """
        Test case to verify noobaa backup and recovery

        1. Take a snapshot of the db-noobaa-db-pg-0 PVC and restore it to a
           new PVC
        2. Scale down the statefulset noobaa-db-pg
        3. Get the yaml of the current PVC, db-noobaa-db-pg-0, and change the
           parameter persistentVolumeReclaimPolicy to Retain for the PV of
           the restored PVC
        4. Delete both PVCs, the PV of the original claim will be removed.
           The PV of the restored claim will move to 'Released'
        5. Edit again the restored PV and remove the claimRef section.
           The volume will transition to Available.
        6. Edit the yaml of db-noobaa-db-pg-0 and change the setting
           volumeName to the restored PV, then create the PVC again
        7. Scale up the statefulset again and the pod should be running
        8. Change persistentVolumeReclaimPolicy back to Delete, then create
           and delete resources and run the health check

        """

        # Initialise variable
        self.noobaa_db_sst_name = "noobaa-db-pg"

        # Get noobaa pods before execution
        noobaa_pods = get_noobaa_pods()

        # Get noobaa PVC before execution
        noobaa_pvc_obj = get_pvc_objs(pvc_names=["db-noobaa-db-pg-0"])
        # NOTE(review): "spec" is handed to the PVC object's get() as a
        # positional argument here, while L2126-style calls use
        # get().get("spec") - presumably equivalent, confirm against get()
        noobaa_pv_name = noobaa_pvc_obj[0].get("spec").get("spec").get(
            "volumeName")

        # Take snapshot of the noobaa DB PVC
        log.info(f"Creating snapshot of the {noobaa_pvc_obj[0].name} PVC")
        snap_obj = snapshot_factory(
            pvc_obj=noobaa_pvc_obj[0],
            wait=True,
            snapshot_name=f"{noobaa_pvc_obj[0].name}-snapshot",
        )
        log.info(
            f"Successfully created snapshot {snap_obj.name} and in Ready state"
        )

        # Restore it to PVC
        log.info(f"Restoring snapshot {snap_obj.name} to create new PVC")
        sc_name = noobaa_pvc_obj[0].get().get("spec").get("storageClassName")
        pvc_size = (noobaa_pvc_obj[0].get().get("spec").get("resources").get(
            "requests").get("storage"))
        self.restore_pvc_obj = create_restore_pvc(
            sc_name=sc_name,
            snap_name=snap_obj.name,
            namespace=snap_obj.namespace,
            size=pvc_size,
            pvc_name=f"{snap_obj.name}-restore",
            volume_mode=snap_obj.parent_volume_mode,
            access_mode=snap_obj.parent_access_mode,
        )
        wait_for_resource_state(self.restore_pvc_obj, constants.STATUS_BOUND)
        self.restore_pvc_obj.reload()
        log.info(f"Succeesfuly created PVC {self.restore_pvc_obj.name} "
                 f"from snapshot {snap_obj.name}")

        # Scale down the statefulset noobaa-db-pg.
        # The result has to be asserted (as done for the scale up below):
        # a bare `call(), "message"` only builds a tuple and would let the
        # test continue with the DB still running
        assert modify_statefulset_replica_count(
            statefulset_name=self.noobaa_db_sst_name, replica_count=0
        ), f"Failed to scale down the statefulset {self.noobaa_db_sst_name}"

        # Get the noobaa-db PVC yaml; it is re-created later on top of the
        # restored PV
        pvc_obj = OCP(kind=constants.PVC,
                      namespace=constants.OPENSHIFT_STORAGE_NAMESPACE)
        noobaa_pvc_yaml = pvc_obj.get(resource_name=noobaa_pvc_obj[0].name)

        # Get the restored noobaa PVC and
        # change the parameter persistentVolumeReclaimPolicy to Retain
        # so that its PV is kept when the PVC is deleted
        restored_noobaa_pvc_obj = get_pvc_objs(
            pvc_names=[f"{snap_obj.name}-restore"])
        restored_noobaa_pv_name = (restored_noobaa_pvc_obj[0].get("spec").get(
            "spec").get("volumeName"))
        pv_obj = OCP(kind=constants.PV,
                     namespace=constants.OPENSHIFT_STORAGE_NAMESPACE)
        params = '{"spec":{"persistentVolumeReclaimPolicy":"Retain"}}'
        assert pv_obj.patch(
            resource_name=restored_noobaa_pv_name, params=params), (
                "Failed to change the parameter persistentVolumeReclaimPolicy"
                f" to Retain {restored_noobaa_pv_name}")

        # Delete both PVCs
        delete_pvcs(pvc_objs=[noobaa_pvc_obj[0], restored_noobaa_pvc_obj[0]])

        # Validate the PV of the original claim is removed
        assert validate_pv_delete(
            pv_name=noobaa_pv_name
        ), f"PV not deleted, still exist {noobaa_pv_name}"

        # Validate the PV of the restored claim is in Released state
        pv_obj.wait_for_resource(condition=constants.STATUS_RELEASED,
                                 resource_name=restored_noobaa_pv_name)

        # Edit again the restored PV and remove the claimRef section
        log.info(
            f"Remove the claimRef section from PVC {restored_noobaa_pv_name}")
        params = '[{"op": "remove", "path": "/spec/claimRef"}]'
        pv_obj.patch(resource_name=restored_noobaa_pv_name,
                     params=params,
                     format_type="json")
        log.info(
            f"Successfully removed claimRef section from PVC {restored_noobaa_pv_name}"
        )

        # Validate PV is in Available state
        pv_obj.wait_for_resource(condition=constants.STATUS_AVAILABLE,
                                 resource_name=restored_noobaa_pv_name)

        # Point the saved noobaa DB PVC yaml at the restored PV
        # and create the PVC again
        noobaa_pvc_yaml["spec"]["volumeName"] = restored_noobaa_pv_name
        noobaa_pvc_yaml = OCS(**noobaa_pvc_yaml)
        noobaa_pvc_yaml.create()

        # Validate noobaa PVC is in bound state
        pvc_obj.wait_for_resource(
            condition=constants.STATUS_BOUND,
            resource_name=noobaa_pvc_obj[0].name,
            timeout=120,
        )

        # Scale up the statefulset again
        assert modify_statefulset_replica_count(
            statefulset_name=self.noobaa_db_sst_name, replica_count=1
        ), f"Failed to scale up the statefulset {self.noobaa_db_sst_name}"

        # Validate noobaa pods are up and running (same count as before)
        pod_obj = OCP(kind=constants.POD,
                      namespace=defaults.ROOK_CLUSTER_NAMESPACE)
        pod_obj.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            resource_count=len(noobaa_pods),
            selector=constants.NOOBAA_APP_LABEL,
        )

        # Change the parameter persistentVolumeReclaimPolicy to Delete again
        params = '{"spec":{"persistentVolumeReclaimPolicy":"Delete"}}'
        assert pv_obj.patch(
            resource_name=restored_noobaa_pv_name, params=params), (
                "Failed to change the parameter persistentVolumeReclaimPolicy"
                f" to Delete {restored_noobaa_pv_name}")
        log.info(
            "Changed the parameter persistentVolumeReclaimPolicy to Delete again"
        )

        # Verify all storage pods are running
        wait_for_storage_pods()

        # Creating Resources
        log.info("Creating Resources using sanity helpers")
        self.sanity_helpers.create_resources(pvc_factory, pod_factory,
                                             bucket_factory,
                                             rgw_bucket_factory)
        # Deleting Resources
        self.sanity_helpers.delete_resources()

        # Verify everything running fine
        log.info(
            "Verifying All resources are Running and matches expected result")
        self.sanity_helpers.health_check(tries=120)
class TestRestartMgrWhileTwoMonsDown(ManageTest):
    """
    Restart mgr pod while two mon pods are down

    """
    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Build the Sanity helper used by the mgr restart test

        The fixture is autouse, so ``self.sanity_helpers`` is set for every
        test method without the test requesting it explicitly.

        """
        helper = Sanity()
        self.sanity_helpers = helper

    @pytest.fixture(scope="function", autouse=True)
    def teardown(self, request):
        """
        Register a finalizer that scales the mons back up and waits for the
        pods to be running

        At teardown every mon deployment listed in ``self.mons_scale`` is
        scaled to 1 replica through ``self.oc``, then
        wait_for_pods_to_be_running() is called with a 600 seconds timeout.

        Args:
            request: pytest request object the finalizer is registered on

        """
        def finalizer():
            # mons_scale is stored on the instance by the test body. If the
            # test failed before that point nothing was scaled down, and
            # reading the attribute directly would raise AttributeError and
            # skip the pod wait below.
            for mon_scale in getattr(self, "mons_scale", ()):
                self.oc.exec_oc_cmd(
                    f"scale --replicas=1 deployment/{mon_scale}")
            wait_for_pods_to_be_running(timeout=600)

        request.addfinalizer(finalizer)

    def test_restart_mgr_while_two_mons_down(self, pvc_factory, pod_factory,
                                             bucket_factory,
                                             rgw_bucket_factory):
        """
        Restart the mgr pod repeatedly while two mon deployments are scaled down

        The first two mon deployments found by label are used. The following
        cycle runs 10 times:
        1.Scaling down the two mons:
        oc scale --replicas=0 deployment/<mon>

        2.Restarting mgr by deleting the first mgr pod and waiting for it

        3.sleep 5 seconds

        4.Scaling the two mons back up:
        oc scale --replicas=1 deployment/<mon>

        5.sleep 10

        6.Waiting up to 100 seconds for 1 mgr pod to move to Running state

        After the last cycle, resources are created and deleted through the
        sanity helpers.

        """
        self.oc = ocp.OCP(
            kind=constants.DEPLOYMENT,
            namespace=config.ENV_DATA["cluster_namespace"],
        )
        mon_deployments = get_deployments_having_label(
            constants.MON_APP_LABEL, defaults.ROOK_CLUSTER_NAMESPACE
        )
        mon_names = [deployment["metadata"]["name"] for deployment in mon_deployments]
        self.mons_scale = mon_names[:2]

        def scale_selected_mons(replicas):
            for mon_name in self.mons_scale:
                self.oc.exec_oc_cmd(
                    f"scale --replicas={replicas} deployment/{mon_name}"
                )

        cycles = 10
        for index in range(1, cycles + 1):
            log.info(f"Scaling down two mons {self.mons_scale}, index={index}")
            scale_selected_mons(0)

            log.info(f"Restarting mgr pod, index={index}")
            get_mgr_pods()[0].delete(wait=True)

            time.sleep(5)

            log.info(f"Scaling up two mons {self.mons_scale}, index={index}")
            scale_selected_mons(1)

            time.sleep(10)

            log.info(
                f"Waiting for mgr pod move to Running state, index={index}")
            mgr_pod_obj = ocp.OCP(
                kind=constants.POD, namespace=defaults.ROOK_CLUSTER_NAMESPACE
            )
            mgr_running = mgr_pod_obj.wait_for_resource(
                condition=constants.STATUS_RUNNING,
                selector=constants.MGR_APP_LABEL,
                resource_count=1,
                timeout=100,
            )
            assert (
                mgr_running
            ), f"Mgr pod did'nt move to Running state after 100 seconds, index={index}"

        sanity = self.sanity_helpers
        log.info("Creating Resources using sanity helpers")
        sanity.create_resources(
            pvc_factory, pod_factory, bucket_factory, rgw_bucket_factory
        )
        log.info("Deleting Resources using sanity helpers")
        sanity.delete_resources()
# Example #14
# 0
class TestHugePages(E2ETest):
    """
    Enable huge pages post ODF installation

    The autouse fixture creates the Sanity helper and registers a finalizer
    that disables huge pages again; the test enables huge pages, checks the
    value reported by every node and then creates/deletes resources through
    the sanity helpers.

    """

    @pytest.fixture(scope="function", autouse=True)
    def huge_pages_setup(self, request):
        """
        Initializes sanity and registers the huge pages cleanup finalizer

        Args:
            request: pytest request object used to register the finalizer

        """
        self.sanity_helpers = Sanity()

        def finalizer():
            """
            Removes huge pages on worker nodes and verifies all pods are up

            """
            disable_huge_pages()

            wait_for_nodes_status(status=constants.NODE_READY, timeout=600)

            # After disabling, every node must report zero allocatable
            # 2Mi huge pages again.
            for node_obj in get_nodes():
                allocatable = node_obj.get()["status"]["allocatable"]
                assert (
                    allocatable["hugepages-2Mi"] == "0"
                ), f"Huge pages is not removed on {node_obj.name}"

            log.info("Wait for all pods to be in running state")
            wait_for_pods_to_be_running(timeout=600)
            # NOTE(review): this calls the bare name `sanity_helpers`, not
            # the `self.sanity_helpers` instance created above - confirm a
            # module-level `sanity_helpers` exposing `ceph_health_check` is
            # in scope, otherwise teardown fails with NameError here.
            sanity_helpers.ceph_health_check(tries=120)

        request.addfinalizer(finalizer)

    def test_hugepages_post_odf_deployment(
        self,
        pvc_factory,
        pod_factory,
        bucket_factory,
        rgw_bucket_factory,
        node_restart_teardown,
    ):
        """
        Test to verify that after enabling huge pages the nodes come up with
        higher page size and all odf cluster pods come back up.

        Args:
            pvc_factory: fixture passed through to the sanity helpers
            pod_factory: fixture passed through to the sanity helpers
            bucket_factory: fixture passed through to the sanity helpers
            rgw_bucket_factory: fixture passed through to the sanity helpers
            node_restart_teardown: fixture that is requested but never
                referenced in the test body

        """
        # Applies huge pages on the cluster nodes
        enable_huge_pages()

        log.info("Wait for all worker node to be READY state")
        wait_for_nodes_status(status=constants.NODE_READY, timeout=600)

        # Every node is expected to report 64Mi of allocatable 2Mi huge pages
        for node_obj in get_nodes():
            allocatable = node_obj.get()["status"]["allocatable"]
            assert (
                allocatable["hugepages-2Mi"] == "64Mi"
            ), f"Huge pages is not applied on {node_obj.name}"

        log.info("Wait for all storage cluster pods to be in running state")
        wait_for_pods_to_be_running(timeout=600)

        # Creating Resources
        # NOTE(review): the trailing positional False presumably disables IO
        # on the created pods - confirm against Sanity.create_resources.
        log.info("Creating Resources using sanity helpers")
        self.sanity_helpers.create_resources(
            pvc_factory,
            pod_factory,
            bucket_factory,
            rgw_bucket_factory,
            False,
        )

        # Deleting Resources
        log.info("Deleting the resources created")
        self.sanity_helpers.delete_resources()

        # Verify everything running fine
        log.info(
            "Verifying All resources are Running and matches expected result")
        self.sanity_helpers.health_check(tries=120)