Example #1
def finalizer():
    worker_nodes = get_worker_nodes()
    # Removing created label on all worker nodes
    remove_label_from_worker_node(worker_nodes, label_key="dc")
    for thread in self.threads:
        thread.join()
    ceph_health_check()
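Every snippet on this page calls remove_label_from_worker_node(node_list, label_key) from the ocs-ci helpers. Its real implementation is not shown here; purely as an illustration, a minimal stand-alone sketch (assuming the oc CLI is configured against the target cluster) could look like this:

import subprocess

def remove_label_from_worker_node(node_list, label_key):
    # Hypothetical stand-in for the ocs-ci helper: appending "-" to the
    # label key asks `oc label` to drop that label from the node.
    for node_name in node_list:
        subprocess.run(
            ["oc", "label", "node", node_name, f"{label_key}-"],
            check=True,
        )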
Example #2
    def cleanup(self):
        """
        Function to tear down
        """
        # Delete all pods, pvcs and namespaces
        for namespace in self.namespace_list:
            delete_objs_parallel(
                obj_list=pod.get_all_pods(namespace=namespace.namespace),
                namespace=namespace.namespace,
                kind=self.kind,
            )
            delete_objs_parallel(
                obj_list=pvc.get_all_pvc_objs(namespace=namespace.namespace),
                namespace=namespace.namespace,
                kind=constants.PVC,
            )
            ocp = OCP(kind=constants.NAMESPACE)
            ocp.delete(resource_name=namespace.namespace)

        # Remove scale label from worker nodes in cleanup
        scale_workers = machine.get_labeled_nodes(constants.SCALE_LABEL)
        helpers.remove_label_from_worker_node(node_list=scale_workers,
                                              label_key="scale-label")

        # Delete machineset which will delete respective nodes too for aws-ipi platform
        if self.ms_name:
            for name in self.ms_name:
                machine.delete_custom_machineset(name)
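machine.get_labeled_nodes(constants.SCALE_LABEL) above returns the nodes that currently carry the scale label. Its ocs-ci implementation is not part of this page; as a rough, hypothetical stand-in (assuming the constant is a label selector string and the oc CLI is available):

import subprocess

def get_labeled_nodes(label_selector):
    # Hypothetical sketch: list the names of nodes matching a label
    # selector, e.g. "scale-label=app-scale", via the oc CLI.
    result = subprocess.run(
        ["oc", "get", "nodes", "-l", label_selector,
         "-o", "jsonpath={.items[*].metadata.name}"],
        check=True, capture_output=True, text=True,
    )
    return result.stdout.split()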
Example #3
def finalizer():
    worker_nodes = get_worker_nodes()
    # Removing created label on all worker nodes
    remove_label_from_worker_node(worker_nodes, label_key="dc")
    # Verify OSD encrypted
    if config.ENV_DATA.get("encryption_at_rest"):
        osd_encryption_verification()
Example #4
        def finalizer():
            helpers.remove_label_from_worker_node(
                node_list=test_nodes, label_key="nodetype"
            )

            # Check ceph health
            ceph_health_check(tries=40)
Example #5
def delete_worker_node():
    # Remove scale label from worker nodes
    scale_workers = machine.get_labeled_nodes(constants.SCALE_LABEL)
    if scale_workers:
        helpers.remove_label_from_worker_node(node_list=scale_workers,
                                              label_key="scale-label")
    # Delete machineset
    if ms_name:
        for name in ms_name:
            machine.delete_custom_machineset(name)
Example #6
        def finalizer():
            worker_nodes = get_worker_nodes()
            # Removing created label on all worker nodes
            remove_label_from_worker_node(worker_nodes, label_key="dc")
            for thread in self.threads:
                thread.join()

            log.info("Get the machine set name from one of the worker node names")
            machine_name = machine.get_machine_from_node_name(worker_nodes[0])
            machineset_name = machine.get_machineset_from_machine_name(machine_name)
            log.info(
                "Verify that the current replica count is equal to the ready replica count"
            )
            machine.change_current_replica_count_to_ready_replica_count(machineset_name)

            ceph_health_check()
Example #7
def cleanup(self):
    run(f"oc delete -f {self.crd}", shell=True, cwd=self.dir)
    run(f"oc delete -f {self.operator}", shell=True, cwd=self.dir)
    run("oc delete -f deploy", shell=True, cwd=self.dir)
    run_cmd(f"oc delete project {self.namespace}")
    run(
        "oc delete -f resources/kernel-cache-drop-clusterrole.yaml",
        shell=True,
        check=True,
        cwd=self.dir,
    )
    self.ns_obj.wait_for_delete(resource_name=self.namespace, timeout=180)
    # Reset namespace to default
    switch_to_default_rook_cluster_project()
    helpers.remove_label_from_worker_node(self.worker_nodes,
                                          label_key="kernel-cache-dropper")
Example #8
    def finalizer():
        """
        Make sure that all cluster's nodes are in 'Ready' state and if not,
        change them back to 'Ready' state by marking them as schedulable
        """
        scheduling_disabled_nodes = [
            n.name for n in get_node_objs() if n.ocp.get_resource_status(
                n.name) == constants.NODE_READY_SCHEDULING_DISABLED
        ]
        if scheduling_disabled_nodes:
            schedule_nodes(scheduling_disabled_nodes)

        # Remove label created for DC app pods on all worker nodes
        node_objs = get_node_objs()
        for node_obj in node_objs:
            if "dc" in node_obj.get().get("metadata").get("labels").keys():
                remove_label_from_worker_node([node_obj.name], label_key="dc")
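schedule_nodes() in this finalizer marks cordoned nodes as schedulable again so they return to the plain 'Ready' state. A hypothetical stand-alone equivalent (again assuming the oc CLI) might be:

import subprocess

def schedule_nodes(node_names):
    # Hypothetical sketch: `oc adm uncordon` marks a node schedulable,
    # clearing the Ready,SchedulingDisabled status checked above.
    for name in node_names:
        subprocess.run(["oc", "adm", "uncordon", name], check=True)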
Example #9
    def cleanup(self):
        """
        Clean up the cluster from the benchmark operator project

        """
        # Reset namespace to default
        switch_to_default_rook_cluster_project()

        log.info("Delete the benchmark-operator project")
        run("make undeploy", shell=True, check=True, cwd=self.dir)
        # Wait until the benchmark-operator project deleted
        self.ns_obj.wait_for_delete(resource_name=self.namespace, timeout=180)

        # remove from workers the label used for cache dropping
        log.info("Remove labels from worker nodes.")
        helpers.remove_label_from_worker_node(self.worker_nodes, label_key=BMO_LABEL)

        # wait another 10 sec. after cleanup done.
        time.sleep(10)
Example #10
def finalizer():
    worker_nodes = get_worker_nodes()
    # Removing created label on all worker nodes
    remove_label_from_worker_node(worker_nodes, label_key="dc")
Example #11
    def test_rwo_pvc_fencing_node_prolonged_and_short_network_failure(
            self, nodes, setup, node_restart_teardown):
        """
        OCS-1431/OCS-1436:
        - Start DeploymentConfig based app pods on 1 node
        - Make the node (where app pods are running) unresponsive
            by bringing its main network interface down
        - Disrupt the leader provisioner pods if not running on above selected
            node
        - Check new app pods and/or mon, osd pods scheduled on another node
            are stuck due to Multi-Attach error.
        - Power off the unresponsive node
        - Force delete the app pods and/or mon,osd pods on the unresponsive node
        - Check new app pods and/or mon, osd pods scheduled on another node
            come into Running state
        - Run IOs on new app pods
        - Again make the node (where app pods are running) unresponsive
            by bringing its main network interface down
        - Check new app pods scheduled on another node are stuck due to
            Multi-Attach error.
        - Reboot the unresponsive node
        - When unresponsive node recovers, run IOs on new app pods

        """
        ceph_cluster, dc_pods, ceph_pods, app_pod_nodes, test_nodes, disruptor = setup

        external_mode = helpers.storagecluster_independent_check()
        extra_nodes = list(set(test_nodes) - set(app_pod_nodes))
        helpers.remove_label_from_worker_node(node_list=extra_nodes[:-1],
                                              label_key="nodetype")

        # Run IO on pods
        md5sum_data = self.run_and_verify_io(pod_list=dc_pods,
                                             fio_filename="io_file1",
                                             run_io_in_bg=True)

        # Disrupt leader plugin-provisioner pods, skip if running on node to be failed
        if disruptor:
            [disruption.delete_resource() for disruption in disruptor]

        # Induce network failure on the nodes
        node.node_network_failure(app_pod_nodes)
        logger.info(f"Waiting for {self.prolong_nw_fail_time} seconds")
        sleep(self.prolong_nw_fail_time)

        # Wait for pods to be rescheduled
        for pod_obj in dc_pods + ceph_pods:
            pod_obj.ocp.wait_for_resource(
                condition=constants.STATUS_TERMINATING,
                resource_name=pod_obj.name)

        # Fetch info of new pods and verify Multi-Attach error
        new_dc_pods = self.get_new_pods(dc_pods)
        assert len(new_dc_pods) == len(
            dc_pods), "Unexpected number of app pods"
        self.verify_multi_attach_error(new_dc_pods)

        new_ceph_pods = []
        if ceph_pods:
            new_ceph_pods = self.get_new_pods(ceph_pods)
            assert len(new_ceph_pods) > 0, "Unexpected number of osd pods"
            self.verify_multi_attach_error(new_ceph_pods)

        logger.info("Executing manual recovery steps")
        # Power off the unresponsive node
        logger.info(f"Powering off the unresponsive node: {app_pod_nodes}")
        nodes.stop_nodes(node.get_node_objs(app_pod_nodes))

        # Force delete the app pods and/or mon,osd pods on the unresponsive node
        for pod_obj in dc_pods + ceph_pods:
            pod_obj.delete(force=True)

        # Wait for new app pods to reach Running state
        for pod_obj in new_dc_pods:
            assert pod_obj.ocp.wait_for_resource(
                condition=constants.STATUS_RUNNING,
                resource_name=pod_obj.name,
                timeout=1200,
                sleep=30,
            ), f"App pod with name {pod_obj.name} did not reach Running state"

        if not external_mode:
            # Wait for mon and osd pods to reach Running state
            selectors_to_check = {
                constants.MON_APP_LABEL: self.expected_mon_count,
                constants.OSD_APP_LABEL: ceph_cluster.osd_count,
            }
            for selector, count in selectors_to_check.items():
                assert ceph_cluster.POD.wait_for_resource(
                    condition=constants.STATUS_RUNNING,
                    selector=selector,
                    resource_count=count,
                    timeout=1800,
                    sleep=60,
                ), f"{count} expected pods with selector {selector} are not in Running state"

            if ceph_cluster.mon_count == self.expected_mon_count:
                # Check ceph health
                toolbox_status = ceph_cluster.POD.get_resource_status(
                    ceph_cluster.toolbox.name)
                if toolbox_status == constants.STATUS_TERMINATING:
                    ceph_cluster.toolbox.delete(force=True)

                assert ceph_health_check(), "Ceph cluster health is not OK"
                logger.info("Ceph cluster health is OK")

        # Verify data integrity from new pods
        for num, pod_obj in enumerate(new_dc_pods):
            pod.verify_data_integrity(pod_obj=pod_obj,
                                      file_name="io_file1",
                                      original_md5sum=md5sum_data[num])

        # Run IO on new pods
        md5sum_data2 = self.run_and_verify_io(pod_list=new_dc_pods,
                                              fio_filename="io_file2",
                                              run_io_in_bg=True)

        helpers.label_worker_node(node_list=extra_nodes[:-1],
                                  label_key="nodetype",
                                  label_value="app-pod")

        # Induce network failure on the node
        node.node_network_failure(extra_nodes[-1])
        logger.info(f"Waiting for {self.short_nw_fail_time} seconds")
        sleep(self.short_nw_fail_time)

        # Wait for pods to be rescheduled
        for pod_obj in new_dc_pods:
            pod_obj.ocp.wait_for_resource(
                condition=constants.STATUS_TERMINATING,
                resource_name=pod_obj.name,
                timeout=600,
                sleep=30,
            )

        # Fetch info of new pods and verify Multi-Attach error
        new_dc_pods2 = self.get_new_pods(new_dc_pods)
        assert len(new_dc_pods2) == len(
            new_dc_pods), "Unexpected number of app pods"
        self.verify_multi_attach_error(new_dc_pods2)

        # Reboot the unresponsive node
        logger.info(f"Rebooting the unresponsive node: {extra_nodes[-1]}")
        nodes.restart_nodes_by_stop_and_start(
            node.get_node_objs([extra_nodes[-1]]))
        node.wait_for_nodes_status(node_names=[extra_nodes[-1]],
                                   status=constants.NODE_READY)

        # Wait for new app pods to reach Running state
        for pod_obj in new_dc_pods2:
            assert pod_obj.ocp.wait_for_resource(
                condition=constants.STATUS_RUNNING,
                resource_name=pod_obj.name,
                timeout=1200,
                sleep=30,
            ), f"App pod with name {pod_obj.name} did not reach Running state"

        if not external_mode:
            # Wait for mon and osd pods to reach Running state
            for selector, count in selectors_to_check.items():
                assert ceph_cluster.POD.wait_for_resource(
                    condition=constants.STATUS_RUNNING,
                    selector=selector,
                    resource_count=count,
                    timeout=1800,
                    sleep=60,
                ), f"{count} expected pods with selector {selector} are not in Running state"

            if ceph_cluster.mon_count == 3:
                # Check ceph health
                assert ceph_health_check(), "Ceph cluster health is not OK"
                logger.info("Ceph cluster health is OK")

        # Verify data integrity from new pods
        for num, pod_obj in enumerate(new_dc_pods2):
            pod.verify_data_integrity(pod_obj=pod_obj,
                                      file_name="io_file2",
                                      original_md5sum=md5sum_data2[num])

        for num, pod_obj in enumerate(new_dc_pods2):
            pod.verify_data_integrity(pod_obj=pod_obj,
                                      file_name="io_file1",
                                      original_md5sum=md5sum_data[num])

        # Run IO on new pods
        self.run_and_verify_io(pod_list=new_dc_pods2,
                               fio_filename="io_file3",
                               return_md5sum=False)
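This example pairs remove_label_from_worker_node with helpers.label_worker_node(node_list, label_key, label_value). A hypothetical stand-alone counterpart to the removal helper sketched earlier, again assuming the oc CLI, could be:

import subprocess

def label_worker_node(node_list, label_key, label_value):
    # Hypothetical sketch: `oc label` with key=value adds the label;
    # --overwrite updates it if the node already carries that key.
    for node_name in node_list:
        subprocess.run(
            ["oc", "label", "node", node_name,
             f"{label_key}={label_value}", "--overwrite"],
            check=True,
        )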
Example #12
    def setup(
        self,
        request,
        scenario,
        num_of_nodes,
        num_of_fail_nodes,
        disrupt_provisioner,
        project_factory,
        multi_pvc_factory,
        dc_pod_factory,
    ):
        """
        Identify the nodes and start DeploymentConfig based app pods using
        PVC with ReadWriteOnce (RWO) access mode on selected nodes

        Args:
            scenario (str): Scenario of app pods running on OCS or dedicated nodes
                (eg., 'colocated', 'dedicated')
            num_of_nodes (int): number of nodes required for running test
            num_of_fail_nodes (int): number of nodes to make unresponsive during test
            disrupt_provisioner (bool): True to disrupt the leader provisioner
                pods if not running on selected nodes, else False
            project_factory: A fixture to create a new project
            multi_pvc_factory: A fixture to create a set of new PVCs
            dc_pod_factory: A fixture to create deploymentconfig pods

        Returns:
            tuple: containing the params used in test cases

        """
        ocs_nodes, non_ocs_nodes = self.identify_and_add_nodes(
            scenario, num_of_nodes)
        test_nodes = ocs_nodes if (scenario == "colocated") else non_ocs_nodes
        logger.info(f"Using nodes {test_nodes} for running test")

        def finalizer():
            helpers.remove_label_from_worker_node(node_list=test_nodes,
                                                  label_key="nodetype")

            # Check ceph health
            ceph_health_check(tries=40)

        request.addfinalizer(finalizer)

        project = project_factory()

        if helpers.storagecluster_independent_check():
            ceph_cluster = CephClusterExternal()
        else:
            ceph_cluster = CephCluster()
            # Wait for mon pods to reach expected count
            # Bug 1778273 - [RFE]: Configure 5 MONs for OCS cluster with 5 or more nodes
            # This wait is required for some of the previous OCS versions (< 4.5)
            current_mon_count = int(
                ceph_cluster.CEPHCLUSTER.get_resource(resource_name="",
                                                      column="MONCOUNT"))
            assert ceph_cluster.POD.wait_for_resource(
                condition=constants.STATUS_RUNNING,
                selector=constants.MON_APP_LABEL,
                resource_count=current_mon_count,
                timeout=900,
            )
            ceph_cluster.mons = []
            ceph_cluster.scan_cluster()

        # Select nodes for running app pods and inducing network failure later
        app_pod_nodes = self.select_nodes_for_app_pods(scenario, ceph_cluster,
                                                       ocs_nodes,
                                                       non_ocs_nodes,
                                                       num_of_fail_nodes)

        # Create multiple RBD and CephFS backed PVCs with RWO accessmode
        num_of_pvcs = self.num_of_app_pods_per_node * num_of_fail_nodes
        rbd_pvcs = multi_pvc_factory(
            interface=constants.CEPHBLOCKPOOL,
            project=project,
            size=self.pvc_size,
            access_modes=[constants.ACCESS_MODE_RWO],
            num_of_pvc=num_of_pvcs,
        )
        cephfs_pvcs = multi_pvc_factory(
            interface=constants.CEPHFILESYSTEM,
            project=project,
            size=self.pvc_size,
            access_modes=[constants.ACCESS_MODE_RWO],
            num_of_pvc=num_of_pvcs,
        )

        # Create deploymentconfig based pods
        dc_pods = []
        # Start app-pods on selected node(s)
        for node_name in app_pod_nodes:
            logger.info(f"Starting app pods on the node {node_name}")
            helpers.label_worker_node(node_list=[node_name],
                                      label_key="nodetype",
                                      label_value="app-pod")

            for num in range(self.num_of_app_pods_per_node):
                dc_pods.append(
                    dc_pod_factory(
                        interface=constants.CEPHBLOCKPOOL,
                        pvc=rbd_pvcs.pop(0),
                        node_selector={"nodetype": "app-pod"},
                    ))
                assert pod.verify_node_name(
                    dc_pods[-1], node_name
                ), f"Pod {dc_pods[-1].name} is not running on labeled node {node_name}"
                dc_pods.append(
                    dc_pod_factory(
                        interface=constants.CEPHFILESYSTEM,
                        pvc=cephfs_pvcs.pop(0),
                        node_selector={"nodetype": "app-pod"},
                    ))
                assert pod.verify_node_name(
                    dc_pods[-1], node_name
                ), f"Pod {dc_pods[-1].name} is not running on labeled node {node_name}"
            helpers.remove_label_from_worker_node(node_list=[node_name],
                                                  label_key="nodetype")

        # Label other test nodes to be able to run app pods later
        helpers.label_worker_node(node_list=test_nodes,
                                  label_key="nodetype",
                                  label_value="app-pod")

        # Get ceph mon,osd pods running on selected node if colocated scenario
        # and extra OCS nodes are present
        # Recovery steps for MON and OSDS not required from OCS 4.4 onwards
        # Refer to BZ 1830015 and BZ 1835908
        ceph_pods = []
        if float(config.ENV_DATA["ocs_version"]) < 4.4 and (
                scenario == "colocated" and len(test_nodes) > 3):
            pods_to_check = ceph_cluster.osds
            # Skip mon pods if mon_count is 5 as there may not be enough nodes
            # for all mons to run after multiple node failures
            if ceph_cluster.mon_count == 3:
                pods_to_check.extend(ceph_cluster.mons)
            for pod_obj in pods_to_check:
                if pod.get_pod_node(pod_obj).name in app_pod_nodes[0]:
                    ceph_pods.append(pod_obj)
            logger.info(
                f"Colocated Mon, OSD pods: {[pod_obj.name for pod_obj in ceph_pods]}"
            )

        disruptor = []
        if disrupt_provisioner:
            disruptor = self.disrupt_plugin_provisioner_pods(app_pod_nodes)

        return ceph_cluster, dc_pods, ceph_pods, app_pod_nodes, test_nodes, disruptor
Example #13
def finalizer():
    helpers.remove_label_from_worker_node(node_list=test_nodes,
                                          label_key="nodetype")