def finalizer():
    worker_nodes = get_worker_nodes()
    # Removing created label on all worker nodes
    remove_label_from_worker_node(worker_nodes, label_key="dc")
    for thread in self.threads:
        thread.join()
    ceph_health_check()
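Most of these snippets revolve around the label-removal helper itself. For orientation, a minimal sketch of what such a helper could look like is shown below, assuming `oc` is on the PATH and logged in to the cluster and that labels are removed with the trailing-dash syntax (`oc label node <name> <key>-`); the real helpers.remove_label_from_worker_node in ocs-ci may differ.

# Minimal sketch only; assumes `oc` is available and authenticated.
import subprocess

def remove_label_from_worker_node(node_list, label_key):
    """Remove `label_key` from every node in `node_list` (the trailing '-' unlabels)."""
    for node_name in node_list:
        subprocess.run(
            ["oc", "label", "node", node_name, f"{label_key}-"],
            check=True,
        )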
Example #2
    def cleanup(self):
        """
        Function to tear down
        """
        # Delete all pods, pvcs and namespaces
        for namespace in self.namespace_list:
            delete_objs_parallel(
                obj_list=pod.get_all_pods(namespace=namespace.namespace),
                namespace=namespace.namespace, kind=self.kind
            )
            delete_objs_parallel(
                obj_list=pvc.get_all_pvc_objs(namespace=namespace.namespace),
                namespace=namespace.namespace, kind=constants.PVC
            )
            ocp = OCP(kind=constants.NAMESPACE)
            ocp.delete(resource_name=namespace.namespace)

        # Remove scale label from worker nodes in cleanup
        scale_workers = machine.get_labeled_nodes(constants.SCALE_LABEL)
        helpers.remove_label_from_worker_node(
            node_list=scale_workers, label_key='scale-label'
        )

        # Delete the machinesets, which also deletes their nodes (AWS IPI platform)
        if self.ms_name:
            for name in self.ms_name:
                machine.delete_custom_machineset(name)
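Example #2 fans resource deletion out through delete_objs_parallel. A hedged sketch of one possible shape for such a helper, using a thread pool; the signature below only mirrors the call sites above, and the actual ocs-ci implementation may differ.

# Sketch under assumptions: each obj exposes a delete() method, as the pod/PVC
# wrappers above appear to. namespace/kind are unused and kept only to mirror the calls.
from concurrent.futures import ThreadPoolExecutor

def delete_objs_parallel(obj_list, namespace, kind):
    """Delete the given resource objects concurrently and wait for all of them."""
    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = [executor.submit(obj.delete) for obj in obj_list]
        for future in futures:
            future.result()  # propagate any deletion error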
Example #3
def delete_worker_node():
    # Remove scale label from worker nodes
    scale_workers = machine.get_labeled_nodes(constants.SCALE_LABEL)
    if scale_workers:
        helpers.remove_label_from_worker_node(node_list=scale_workers,
                                              label_key='scale-label')
    # Delete machineset
    if ms_name:
        for name in ms_name:
            machine.delete_custom_machineset(name)
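Examples #2 and #3 both finish by deleting a custom machineset. A minimal sketch of that operation, assuming the machineset lives in the standard openshift-machine-api namespace; the real machine.delete_custom_machineset may also wait for the backing nodes to be removed.

# Sketch only: deleting a MachineSet lets the machine-api controller remove its nodes.
import subprocess

def delete_custom_machineset(name, namespace="openshift-machine-api"):
    subprocess.run(
        ["oc", "delete", "machineset", name, "-n", namespace],
        check=True,
    )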
Example #4
    def teardown():

        if with_ocs:
            return

        if m_set != '':
            log.info(f'Destroy {m_set}')
            machine.delete_custom_machineset(m_set)
        else:
            log.info('Clear label from worker (Application) nodes')
            # Getting all Application nodes
            app_nodes = machine.get_labeled_nodes(constants.APP_NODE_LABEL)
            log.debug(f'The application nodes are : {app_nodes}')
            helpers.remove_label_from_worker_node(app_nodes,
                                                  constants.VDBENCH_NODE_LABEL)
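The teardown above first looks nodes up by label via machine.get_labeled_nodes. A minimal sketch of that lookup, assuming a plain label-selector query through `oc`; the real machine module likely goes through the ocs-ci OCP wrappers instead.

# Sketch only: returns the names of nodes matching a label selector
# (e.g. whatever constants.APP_NODE_LABEL expands to).
import subprocess

def get_labeled_nodes(label_selector):
    out = subprocess.run(
        ["oc", "get", "nodes", "-l", label_selector,
         "-o", "jsonpath={.items[*].metadata.name}"],
        capture_output=True, text=True, check=True,
    ).stdout
    return out.split()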
Example #5
    def finalizer():
        """
        Make sure that all of the cluster's nodes are in 'Ready' state and,
        if not, bring them back to 'Ready' state by marking them as schedulable
        """
        scheduling_disabled_nodes = [
            n.name for n in get_node_objs() if n.ocp.get_resource_status(
                n.name) == constants.NODE_READY_SCHEDULING_DISABLED
        ]
        if scheduling_disabled_nodes:
            schedule_nodes(scheduling_disabled_nodes)

        # Remove label created for DC app pods on all worker nodes
        node_objs = get_node_objs()
        for node_obj in node_objs:
            if 'dc' in node_obj.get().get('metadata', {}).get('labels', {}):
                remove_label_from_worker_node([node_obj.name], label_key="dc")
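The finalizer above relies on schedule_nodes to bring 'Ready,SchedulingDisabled' nodes back. A hedged sketch, assuming it simply uncordons each node; the real node module may use its API wrappers and wait for the status change.

# Sketch only: `oc adm uncordon` marks a SchedulingDisabled node schedulable again.
import subprocess

def schedule_nodes(node_names):
    for name in node_names:
        subprocess.run(["oc", "adm", "uncordon", name], check=True)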
def teardown():
    log.info('Clear label from worker (Application) nodes')
    # Getting all Application nodes
    app_nodes = machine.get_labeled_nodes(constants.APP_NODE_LABEL)
    helpers.remove_label_from_worker_node(app_nodes,
                                          constants.APP_NODE_LABEL)
    def test_rwo_pvc_fencing_node_prolonged_and_short_network_failure(
        self, nodes, setup, teardown
    ):
        """
        OCS-1431/OCS-1436:
        - Start DeploymentConfig based app pods on 1 node
        - Make the node (where app pods are running) unresponsive
            by bringing its main network interface down
        - Disrupt the leader provisioner pods if not running on above selected
            node
        - Check new app pods and/or mon, osd pods scheduled on another node
            are stuck due to Multi-Attach error.
        - Power off the unresponsive node
        - Force delete the app pods and/or mon,osd pods on the unresponsive node
        - Check new app pods and/or mon, osd pods scheduled on another node come
            into Running state
        - Run IOs on new app pods
        - Again make the node (where app pods are running) unresponsive
            by bringing its main network interface down
        - Check new app pods scheduled on another node are stuck due to
            Multi-Attach error.
        - Reboot the unresponsive node
        - When unresponsive node recovers, run IOs on new app pods
        """
        ceph_cluster, dc_pods, ceph_pods, app_pod_nodes, test_nodes, disruptor = setup

        extra_nodes = list(set(test_nodes) - set(app_pod_nodes))
        helpers.remove_label_from_worker_node(
            node_list=extra_nodes[:-1], label_key="nodetype"
        )

        # Run IO on pods
        md5sum_data = self.run_and_verify_io(
            pod_list=dc_pods, fio_filename='io_file1', run_io_in_bg=True
        )

        # Disrupt leader plugin-provisioner pods, skip if running on node to be failed
        if disruptor:
            for disruption in disruptor:
                disruption.delete_resource()

        # Induce network failure on the nodes
        node.node_network_failure(app_pod_nodes)
        logger.info(f"Waiting for {self.prolong_nw_fail_time} seconds")
        sleep(self.prolong_nw_fail_time)

        # Wait for pods to be rescheduled
        for pod_obj in (dc_pods + ceph_pods):
            pod_obj.ocp.wait_for_resource(
                condition=constants.STATUS_TERMINATING,
                resource_name=pod_obj.name
            )

        # Fetch info of new pods and verify Multi-Attach error
        new_dc_pods = self.get_new_pods(dc_pods)
        assert len(new_dc_pods) == len(dc_pods), 'Unexpected number of app pods'
        self.verify_multi_attach_error(new_dc_pods)

        new_ceph_pods = []
        if ceph_pods:
            new_ceph_pods = self.get_new_pods(ceph_pods)
            assert len(new_ceph_pods) > 0, 'Unexpected number of osd pods'
            self.verify_multi_attach_error(new_ceph_pods)

        logger.info(f"Executing manual recovery steps")
        # Power off the unresponsive node
        logger.info(
            f"Powering off the unresponsive node: {app_pod_nodes}"
        )
        nodes.stop_nodes(node.get_node_objs(app_pod_nodes))

        # Force delete the app pods and/or mon,osd pods on the unresponsive node
        for pod_obj in (dc_pods + ceph_pods):
            pod_obj.delete(force=True)

        # Wait for new app pods to reach Running state
        for pod_obj in new_dc_pods:
            assert pod_obj.ocp.wait_for_resource(
                condition=constants.STATUS_RUNNING, resource_name=pod_obj.name,
                timeout=1200, sleep=30
            ), (
                f"App pod with name {pod_obj.name} did not reach Running state"
            )

        # Wait for mon and osd pods to reach Running state
        selectors_to_check = [constants.MON_APP_LABEL, constants.OSD_APP_LABEL]
        for selector in selectors_to_check:
            assert ceph_cluster.POD.wait_for_resource(
                condition=constants.STATUS_RUNNING, selector=selector,
                resource_count=3, timeout=1800, sleep=60
            ), (
                f"3 expected pods with selector {selector} are not in Running state"
            )

        if ceph_cluster.mon_count == 3:
            # Check ceph health
            toolbox_status = ceph_cluster.POD.get_resource_status(
                ceph_cluster.toolbox.name
            )
            if toolbox_status == constants.STATUS_TERMINATING:
                ceph_cluster.toolbox.delete(force=True)

            assert ceph_health_check(), f"Ceph cluster health is not OK"
            logger.info("Ceph cluster health is OK")

        # Verify data integrity from new pods
        for num, pod_obj in enumerate(new_dc_pods):
            assert pod.verify_data_integrity(
                pod_obj=pod_obj, file_name='io_file1',
                original_md5sum=md5sum_data[num]
            ), 'Data integrity check failed'

        # Run IO on new pods
        md5sum_data2 = self.run_and_verify_io(
            pod_list=new_dc_pods, fio_filename='io_file2', run_io_in_bg=True
        )

        helpers.label_worker_node(
            node_list=extra_nodes[:-1], label_key="nodetype", label_value="app-pod"
        )

        # Induce network failure on the node
        node.node_network_failure(extra_nodes[-1])
        logger.info(f"Waiting for {self.short_nw_fail_time} seconds")
        sleep(self.short_nw_fail_time)

        # Wait for pods to be rescheduled
        for pod_obj in new_dc_pods:
            pod_obj.ocp.wait_for_resource(
                condition=constants.STATUS_TERMINATING,
                resource_name=pod_obj.name, timeout=600, sleep=30
            )

        # Fetch info of new pods and verify Multi-Attach error
        new_dc_pods2 = self.get_new_pods(new_dc_pods)
        assert len(new_dc_pods2) == len(new_dc_pods), 'Unexpected number of app pods'
        self.verify_multi_attach_error(new_dc_pods2)

        # Reboot the unresponsive node
        logger.info(f"Rebooting the unresponsive node: {extra_nodes[-1]}")
        nodes.restart_nodes(node.get_node_objs([extra_nodes[-1]]))
        node.wait_for_nodes_status(
            node_names=[extra_nodes[-1]], status=constants.NODE_READY
        )

        # Wait for new app pods to reach Running state
        for pod_obj in new_dc_pods2:
            assert pod_obj.ocp.wait_for_resource(
                condition=constants.STATUS_RUNNING, resource_name=pod_obj.name,
                timeout=1200, sleep=30
            ), (
                f"App pod with name {pod_obj.name} did not reach Running state"
            )

        # Wait for mon and osd pods to reach Running state
        for selector in selectors_to_check:
            assert ceph_cluster.POD.wait_for_resource(
                condition=constants.STATUS_RUNNING, selector=selector,
                resource_count=3, timeout=1800, sleep=60
            ), (
                f"3 expected pods with selector {selector} are not in Running state"
            )

        if ceph_cluster.mon_count == 3:
            # Check ceph health
            assert ceph_health_check(), f"Ceph cluster health is not OK"
            logger.info("Ceph cluster health is OK")

        # Verify data integrity from new pods
        for num, pod_obj in enumerate(new_dc_pods2):
            assert pod.verify_data_integrity(
                pod_obj=pod_obj, file_name='io_file2',
                original_md5sum=md5sum_data2[num]
            ), 'Data integrity check for files written before the second node failure failed'

        for num, pod_obj in enumerate(new_dc_pods2):
            assert pod.verify_data_integrity(
                pod_obj=pod_obj, file_name='io_file1',
                original_md5sum=md5sum_data[num]
            ), 'Data integrity check for files written before the first node failure failed'

        # Run IO on new pods
        self.run_and_verify_io(
            pod_list=new_dc_pods2, fio_filename='io_file3', return_md5sum=False
        )
    def setup(
        self, request, scenario, num_of_nodes, num_of_fail_nodes,
        disrupt_provisioner, project_factory, multi_pvc_factory, dc_pod_factory
    ):
        """
        Identify the nodes and start DeploymentConfig based app pods using
        PVC with ReadWriteOnce (RWO) access mode on selected nodes

        Args:
            scenario (str): Scenario of app pods running on OCS or dedicated nodes
                (e.g., 'colocated', 'dedicated')
            num_of_nodes (int): number of nodes required for running the test
            num_of_fail_nodes (int): number of nodes to make unresponsive during the test
            disrupt_provisioner (bool): True to disrupt the leader provisioner
                pods if not running on selected nodes, else False
            project_factory: A fixture to create a new project
            multi_pvc_factory: A fixture to create a set of new PVCs
            dc_pod_factory: A fixture to create DeploymentConfig based pods

        Returns:
            tuple: containing the params used in test cases
        """
        ocs_nodes, non_ocs_nodes = self.identify_and_add_nodes(
            scenario, num_of_nodes
        )
        test_nodes = ocs_nodes if (scenario == "colocated") else non_ocs_nodes
        logger.info(f"Using nodes {test_nodes} for running test")

        def finalizer():
            helpers.remove_label_from_worker_node(
                node_list=test_nodes, label_key="nodetype"
            )

        request.addfinalizer(finalizer)

        if len(ocs_nodes) > 4 and float(config.ENV_DATA['ocs_version']) >= 4.3:
            pod_obj = ocp.OCP(
                kind=constants.POD, namespace=config.ENV_DATA['cluster_namespace']
            )
            assert pod_obj.wait_for_resource(
                condition=constants.STATUS_RUNNING,
                selector=constants.MON_APP_LABEL, resource_count=5, timeout=900
            )

        ceph_cluster = CephCluster()
        project = project_factory()

        # Select nodes for running app pods and inducing network failure later
        app_pod_nodes = self.select_nodes_for_app_pods(
            scenario, ceph_cluster, ocs_nodes, non_ocs_nodes,
            num_of_fail_nodes
        )

        # Create multiple RBD and CephFS backed PVCs with RWO accessmode
        num_of_pvcs = self.num_of_app_pods_per_node * num_of_fail_nodes
        rbd_pvcs = multi_pvc_factory(
            interface=constants.CEPHBLOCKPOOL, project=project, size=self.pvc_size,
            access_modes=[constants.ACCESS_MODE_RWO], num_of_pvc=num_of_pvcs
        )
        cephfs_pvcs = multi_pvc_factory(
            interface=constants.CEPHFILESYSTEM, project=project, size=self.pvc_size,
            access_modes=[constants.ACCESS_MODE_RWO], num_of_pvc=num_of_pvcs
        )

        # Create deploymentconfig based pods
        dc_pods = []
        # Start app-pods on selected node(s)
        for node_name in app_pod_nodes:
            logger.info(f"Starting app pods on the node {node_name}")
            helpers.label_worker_node(
                node_list=[node_name], label_key="nodetype",
                label_value="app-pod"
            )

            for num in range(self.num_of_app_pods_per_node):
                dc_pods.append(
                    dc_pod_factory(
                        interface=constants.CEPHBLOCKPOOL, pvc=rbd_pvcs.pop(0),
                        node_selector={'nodetype': 'app-pod'}
                    )
                )
                assert pod.verify_node_name(dc_pods[-1], node_name), (
                    f"Pod {dc_pods[-1].name} is not running on labeled node {node_name}"
                )
                dc_pods.append(
                    dc_pod_factory(
                        interface=constants.CEPHFILESYSTEM, pvc=cephfs_pvcs.pop(0),
                        node_selector={'nodetype': 'app-pod'}
                    )
                )
                assert pod.verify_node_name(dc_pods[-1], node_name), (
                    f"Pod {dc_pods[-1].name} is not running on labeled node {node_name}"
                )
            helpers.remove_label_from_worker_node(
                node_list=[node_name], label_key="nodetype"
            )

        # Label other test nodes to be able to run app pods later
        helpers.label_worker_node(
            node_list=test_nodes, label_key="nodetype", label_value="app-pod"
        )

        # Get ceph mon,osd pods running on selected node if colocated scenario
        # and extra OCS nodes are present
        ceph_pods = []
        if scenario == "colocated" and len(test_nodes) > len(ceph_cluster.osds):
            pods_to_check = ceph_cluster.osds
            # Skip mon pods if mon_count is 5 as there may not be enough nodes
            # for all mons to run after multiple node failures
            if ceph_cluster.mon_count == 3:
                pods_to_check.extend(ceph_cluster.mons)
            for pod_obj in pods_to_check:
                if pod.get_pod_node(pod_obj).name in app_pod_nodes[0]:
                    ceph_pods.append(pod_obj)
            logger.info(
                f"Colocated Mon, OSD pods: {[pod_obj.name for pod_obj in ceph_pods]}"
            )

        disruptor = []
        if disrupt_provisioner:
            disruptor = self.disrupt_plugin_provisioner_pods(app_pod_nodes)

        return ceph_cluster, dc_pods, ceph_pods, app_pod_nodes, test_nodes, disruptor
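The setup above pins app pods with node_selector={'nodetype': 'app-pod'} after labeling the chosen nodes via helpers.label_worker_node. A minimal sketch of that labeling counterpart, assuming a plain `oc label` call with --overwrite; the real helper may differ.

# Sketch only: applies key=value to each node so pods with a matching nodeSelector land there.
import subprocess

def label_worker_node(node_list, label_key, label_value):
    for node_name in node_list:
        subprocess.run(
            ["oc", "label", "node", node_name, f"{label_key}={label_value}", "--overwrite"],
            check=True,
        )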
Example #11
        def finalizer():
            helpers.remove_label_from_worker_node(node_list=worker_nodes,
                                                  label_key="nodetype")

            # Check ceph health
            ceph_health_check(tries=80)
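Several of these finalizers end with ceph_health_check. For orientation only, a hedged sketch of such a probe, assuming a rook-ceph-tools deployment is reachable in the openshift-storage namespace; the real ocs-ci utility takes a namespace, retries, and tolerates more transient states than this.

# Sketch only: polls `ceph health` through the toolbox until HEALTH_OK or the tries run out.
import subprocess
import time

def ceph_health_check(namespace="openshift-storage", tries=20, delay=30):
    status = ""
    for _ in range(tries):
        status = subprocess.run(
            ["oc", "-n", namespace, "rsh", "deploy/rook-ceph-tools", "ceph", "health"],
            capture_output=True, text=True,
        ).stdout.strip()
        if status.startswith("HEALTH_OK"):
            return True
        time.sleep(delay)
    raise RuntimeError(f"Ceph health is still '{status}' after {tries} tries")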