def finalizer():
    """Strip the test label from workers, wait for background threads, then verify cluster health."""
    # Drop the "dc" label the test added to every worker node
    remove_label_from_worker_node(get_worker_nodes(), label_key="dc")
    # Let the background app/IO threads finish before checking health
    for t in self.threads:
        t.join()
    ceph_health_check()
def cleanup(self):
    """
    Tear down everything created during the scale run.

    For every tracked namespace: delete its pods, then its PVCs, then the
    namespace itself. Afterwards clear the scale label from worker nodes
    and delete any custom machinesets (which also removes their nodes on
    the aws-ipi platform).
    """
    for ns in self.namespace_list:
        ns_name = ns.namespace
        # Pods first, then the PVCs they consumed, then the namespace
        delete_objs_parallel(
            obj_list=pod.get_all_pods(namespace=ns_name),
            namespace=ns_name, kind=self.kind
        )
        delete_objs_parallel(
            obj_list=pvc.get_all_pvc_objs(namespace=ns_name),
            namespace=ns_name, kind=constants.PVC
        )
        ocp = OCP(kind=constants.NAMESPACE)
        ocp.delete(resource_name=ns_name)
    # Drop the scale label from any workers still carrying it
    labeled_workers = machine.get_labeled_nodes(constants.SCALE_LABEL)
    helpers.remove_label_from_worker_node(
        node_list=labeled_workers, label_key='scale-label'
    )
    # Deleting a machineset also deletes the nodes it owns (aws-ipi)
    if self.ms_name:
        for ms in self.ms_name:
            machine.delete_custom_machineset(ms)
def delete_worker_node():
    """Unlabel scale workers and delete any custom machinesets (and their nodes)."""
    # Remove the scale label from every worker that carries it
    labeled = machine.get_labeled_nodes(constants.SCALE_LABEL)
    if labeled:
        helpers.remove_label_from_worker_node(
            node_list=labeled, label_key='scale-label'
        )
    # Deleting a machineset also deletes the respective worker nodes
    if ms_name:
        for ms in ms_name:
            machine.delete_custom_machineset(ms)
def teardown():
    """
    Tear down resources created for the vdbench workload.

    Does nothing when the workload ran on existing OCS nodes
    (``with_ocs``). Otherwise deletes the dedicated machineset if one was
    created, or strips the vdbench label from the application nodes.
    """
    if with_ocs:
        # Workload ran on existing OCS nodes - nothing to clean up
        return
    if m_set != '':
        # A dedicated machineset was created for the workload - delete it
        log.info(f'Destroy {m_set}')
        machine.delete_custom_machineset(m_set)
    else:
        # Fix: log-message typo ("form" -> "from")
        log.info('Clear label from worker (Application) nodes')
        # Getting all Application nodes
        app_nodes = machine.get_labeled_nodes(constants.APP_NODE_LABEL)
        log.debug(f'The application nodes are : {app_nodes}')
        helpers.remove_label_from_worker_node(
            app_nodes, constants.VDBENCH_NODE_LABEL
        )
def finalizer():
    """
    Make sure that all cluster's nodes are in 'Ready' state and if not,
    change them back to 'Ready' state by marking them as schedulable.
    Also removes the 'dc' label created for DC app pods from worker nodes.
    """
    scheduling_disabled_nodes = [
        n.name for n in get_node_objs() if n.ocp.get_resource_status(
            n.name) == constants.NODE_READY_SCHEDULING_DISABLED
    ]
    if scheduling_disabled_nodes:
        schedule_nodes(scheduling_disabled_nodes)
    # Remove label created for DC app pods on all worker nodes.
    # Fix: the original chained .get('labels').keys() and raised
    # AttributeError for a node without any labels; default to {} and
    # test membership on the dict directly.
    for node_obj in get_node_objs():
        labels = node_obj.get().get('metadata', {}).get('labels') or {}
        if 'dc' in labels:
            remove_label_from_worker_node([node_obj.name], label_key="dc")
def finalizer():
    """Remove the 'dc' label that the test added to every worker node."""
    remove_label_from_worker_node(get_worker_nodes(), label_key="dc")
def teardown():
    """Remove the application-node label from all labeled worker nodes."""
    # Fix: log-message typo ("form" -> "from")
    log.info('Clear label from worker (Application) nodes')
    # Getting all Application nodes
    app_nodes = machine.get_labeled_nodes(constants.APP_NODE_LABEL)
    helpers.remove_label_from_worker_node(app_nodes, constants.APP_NODE_LABEL)
def test_rwo_pvc_fencing_node_prolonged_and_short_network_failure(
    self, nodes, setup, teardown
):
    """
    OCS-1431/OCS-1436:
    - Start DeploymentConfig based app pods on 1 node
    - Make the node (where app pods are running) unresponsive by bringing
      its main network interface down
    - Disrupt the leader provisioner pods if not running on above selected
      node
    - Check new app pods and/or mon, osd pods scheduled on another node
      are stuck due to Multi-Attach error.
    - Power off the unresponsive node
    - Force delete the app pods and/or mon,osd pods on the unresponsive node
    - Check new app pods and/or mon, osd pods scheduled on another node
      comes into Running state
    - Run IOs on new app pods
    - Again make the node (where app pods are running) unresponsive by
      bringing its main network interface down
    - Check new app pods scheduled on another node are stuck due to
      Multi-Attach error.
    - Reboot the unresponsive node
    - When unresponsive node recovers, run IOs on new app pods
    """
    ceph_cluster, dc_pods, ceph_pods, app_pod_nodes, test_nodes, disruptor = setup

    # Unlabel all spare nodes but one so rescheduled app pods land on a
    # single predictable node after the first failure
    extra_nodes = list(set(test_nodes) - set(app_pod_nodes))
    helpers.remove_label_from_worker_node(
        node_list=extra_nodes[:-1], label_key="nodetype"
    )

    # Run IO on pods
    md5sum_data = self.run_and_verify_io(
        pod_list=dc_pods, fio_filename='io_file1', run_io_in_bg=True
    )

    # Disrupt leader plugin-provisioner pods, skip if running on node to be failed
    if disruptor:
        # Fix: plain loop instead of a side-effect-only list comprehension
        for disruption in disruptor:
            disruption.delete_resource()

    # Induce network failure on the nodes
    node.node_network_failure(app_pod_nodes)
    logger.info(f"Waiting for {self.prolong_nw_fail_time} seconds")
    sleep(self.prolong_nw_fail_time)

    # Wait for pods to be rescheduled
    for pod_obj in (dc_pods + ceph_pods):
        pod_obj.ocp.wait_for_resource(
            condition=constants.STATUS_TERMINATING, resource_name=pod_obj.name
        )

    # Fetch info of new pods and verify Multi-Attach error
    new_dc_pods = self.get_new_pods(dc_pods)
    assert len(new_dc_pods) == len(dc_pods), 'Unexpected number of app pods'
    self.verify_multi_attach_error(new_dc_pods)

    new_ceph_pods = []
    if ceph_pods:
        new_ceph_pods = self.get_new_pods(ceph_pods)
        assert len(new_ceph_pods) > 0, 'Unexpected number of osd pods'
        self.verify_multi_attach_error(new_ceph_pods)

    logger.info("Executing manual recovery steps")
    # Power off the unresponsive node
    logger.info(
        f"Powering off the unresponsive node: {app_pod_nodes}"
    )
    nodes.stop_nodes(node.get_node_objs(app_pod_nodes))

    # Force delete the app pods and/or mon,osd pods on the unresponsive node
    for pod_obj in (dc_pods + ceph_pods):
        pod_obj.delete(force=True)

    # Wait for new app pods to reach Running state
    for pod_obj in new_dc_pods:
        # Fix: the original evaluated `wait_for_resource(...), (msg)` as a
        # no-op tuple, so the timeout was never asserted - add the assert
        # to match the mon/osd waits below
        assert pod_obj.ocp.wait_for_resource(
            condition=constants.STATUS_RUNNING, resource_name=pod_obj.name,
            timeout=1200, sleep=30
        ), (
            f"App pod with name {pod_obj.name} did not reach Running state"
        )

    # Wait for mon and osd pods to reach Running state
    selectors_to_check = [constants.MON_APP_LABEL, constants.OSD_APP_LABEL]
    for selector in selectors_to_check:
        assert ceph_cluster.POD.wait_for_resource(
            condition=constants.STATUS_RUNNING, selector=selector,
            resource_count=3, timeout=1800, sleep=60
        ), (
            f"3 expected pods with selector {selector} are not in Running state"
        )

    if ceph_cluster.mon_count == 3:
        # Check ceph health. The toolbox pod may be stuck Terminating on
        # the powered-off node; force delete it so the check can run.
        toolbox_status = ceph_cluster.POD.get_resource_status(
            ceph_cluster.toolbox.name
        )
        if toolbox_status == constants.STATUS_TERMINATING:
            ceph_cluster.toolbox.delete(force=True)
        assert ceph_health_check(), "Ceph cluster health is not OK"
        logger.info("Ceph cluster health is OK")

    # Verify data integrity from new pods
    for num, pod_obj in enumerate(new_dc_pods):
        assert pod.verify_data_integrity(
            pod_obj=pod_obj, file_name='io_file1',
            original_md5sum=md5sum_data[num]
        ), 'Data integrity check failed'

    # Run IO on new pods
    md5sum_data2 = self.run_and_verify_io(
        pod_list=new_dc_pods, fio_filename='io_file2', run_io_in_bg=True
    )

    # Re-label the spare nodes so pods can move there after the next failure
    helpers.label_worker_node(
        node_list=extra_nodes[:-1], label_key="nodetype",
        label_value="app-pod"
    )

    # Induce network failure on the node
    node.node_network_failure(extra_nodes[-1])
    logger.info(f"Waiting for {self.short_nw_fail_time} seconds")
    sleep(self.short_nw_fail_time)

    # Wait for pods to be rescheduled
    for pod_obj in new_dc_pods:
        pod_obj.ocp.wait_for_resource(
            condition=constants.STATUS_TERMINATING,
            resource_name=pod_obj.name, timeout=600, sleep=30
        )

    # Fetch info of new pods and verify Multi-Attach error
    new_dc_pods2 = self.get_new_pods(new_dc_pods)
    assert len(new_dc_pods2) == len(new_dc_pods), 'Unexpected number of app pods'
    self.verify_multi_attach_error(new_dc_pods2)

    # Reboot the unresponsive node
    logger.info(f"Rebooting the unresponsive node: {extra_nodes[-1]}")
    nodes.restart_nodes(node.get_node_objs([extra_nodes[-1]]))
    node.wait_for_nodes_status(
        node_names=[extra_nodes[-1]], status=constants.NODE_READY
    )

    # Wait for new app pods to reach Running state
    for pod_obj in new_dc_pods2:
        # Fix: add the missing assert (original built a no-op tuple)
        assert pod_obj.ocp.wait_for_resource(
            condition=constants.STATUS_RUNNING, resource_name=pod_obj.name,
            timeout=1200, sleep=30
        ), (
            f"App pod with name {pod_obj.name} did not reach Running state"
        )

    # Wait for mon and osd pods to reach Running state
    for selector in selectors_to_check:
        assert ceph_cluster.POD.wait_for_resource(
            condition=constants.STATUS_RUNNING, selector=selector,
            resource_count=3, timeout=1800, sleep=60
        ), (
            f"3 expected pods with selector {selector} are not in Running state"
        )

    if ceph_cluster.mon_count == 3:
        # Check ceph health
        assert ceph_health_check(), "Ceph cluster health is not OK"
        logger.info("Ceph cluster health is OK")

    # Verify data integrity from new pods
    for num, pod_obj in enumerate(new_dc_pods2):
        assert pod.verify_data_integrity(
            pod_obj=pod_obj, file_name='io_file2',
            original_md5sum=md5sum_data2[num]
        ), 'Data integrity check for files written before second node failures failed'

    for num, pod_obj in enumerate(new_dc_pods2):
        assert pod.verify_data_integrity(
            pod_obj=pod_obj, file_name='io_file1',
            original_md5sum=md5sum_data[num]
        ), 'Data integrity check for files written before first node failures failed'

    # Run IO on new pods
    self.run_and_verify_io(
        pod_list=new_dc_pods2, fio_filename='io_file3', return_md5sum=False
    )
def finalizer():
    """Drop the temporary 'nodetype' label from the nodes used by the test."""
    helpers.remove_label_from_worker_node(test_nodes, label_key="nodetype")
def setup(
    self, request, scenario, num_of_nodes, num_of_fail_nodes,
    disrupt_provisioner, project_factory, multi_pvc_factory, dc_pod_factory
):
    """
    Identify the nodes and start DeploymentConfig based app pods using PVC
    with ReadWriteOnce (RWO) access mode on selected nodes

    Args:
        scenario (str): Scenario of app pods running on OCS or dedicated
            nodes (eg., 'colocated', 'dedicated')
        num_of_nodes (int): number of nodes required for running test
        num_of_fail_nodes (int): number of nodes to make unresponsive
            during test
        disrupt_provisioner (bool): True to disrupt the leader provisioner
            pods if not running on selected nodes, else False
        project_factory: A fixture to create new project
        multi_pvc_factory: A fixture create a set of new PVCs
        dc_pod_factory: A fixture to create deploymentconfig pods

    Returns:
        tuple: containing the params used in test cases
    """
    ocs_nodes, non_ocs_nodes = self.identify_and_add_nodes(
        scenario, num_of_nodes
    )
    test_nodes = ocs_nodes if (scenario == "colocated") else non_ocs_nodes
    logger.info(f"Using nodes {test_nodes} for running test")

    def finalizer():
        # Drop the temporary scheduling label added below
        helpers.remove_label_from_worker_node(
            node_list=test_nodes, label_key="nodetype"
        )

    request.addfinalizer(finalizer)

    # Fix: comparing versions as floats breaks for e.g. "4.10" (-> 4.1,
    # which compares below 4.3); compare (major, minor) int tuples instead
    ocs_version = tuple(
        int(part)
        for part in str(config.ENV_DATA['ocs_version']).split('.')[:2]
    )
    if len(ocs_nodes) > 4 and ocs_version >= (4, 3):
        # Expect 5 mons when enough OCS nodes are available on OCS >= 4.3
        pod_obj = ocp.OCP(
            kind=constants.POD,
            namespace=config.ENV_DATA['cluster_namespace']
        )
        assert pod_obj.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            selector=constants.MON_APP_LABEL, resource_count=5, timeout=900
        )

    ceph_cluster = CephCluster()
    project = project_factory()

    # Select nodes for running app pods and inducing network failure later
    app_pod_nodes = self.select_nodes_for_app_pods(
        scenario, ceph_cluster, ocs_nodes, non_ocs_nodes, num_of_fail_nodes
    )

    # Create multiple RBD and CephFS backed PVCs with RWO accessmode
    num_of_pvcs = self.num_of_app_pods_per_node * num_of_fail_nodes
    rbd_pvcs = multi_pvc_factory(
        interface=constants.CEPHBLOCKPOOL, project=project,
        size=self.pvc_size,
        access_modes=[constants.ACCESS_MODE_RWO], num_of_pvc=num_of_pvcs
    )
    cephfs_pvcs = multi_pvc_factory(
        interface=constants.CEPHFILESYSTEM, project=project,
        size=self.pvc_size, access_modes=[constants.ACCESS_MODE_RWO],
        num_of_pvc=num_of_pvcs
    )

    # Create deploymentconfig based pods
    dc_pods = []
    # Start app-pods on selected node(s)
    for node_name in app_pod_nodes:
        logger.info(f"Starting app pods on the node {node_name}")
        helpers.label_worker_node(
            node_list=[node_name], label_key="nodetype",
            label_value="app-pod"
        )
        for _ in range(self.num_of_app_pods_per_node):
            # One RBD-backed and one CephFS-backed pod per iteration,
            # pinned to the labeled node via node_selector
            dc_pods.append(
                dc_pod_factory(
                    interface=constants.CEPHBLOCKPOOL, pvc=rbd_pvcs.pop(0),
                    node_selector={'nodetype': 'app-pod'}
                )
            )
            assert pod.verify_node_name(dc_pods[-1], node_name), (
                f"Pod {dc_pods[-1].name} is not running on labeled node {node_name}"
            )
            dc_pods.append(
                dc_pod_factory(
                    interface=constants.CEPHFILESYSTEM,
                    pvc=cephfs_pvcs.pop(0),
                    node_selector={'nodetype': 'app-pod'}
                )
            )
            assert pod.verify_node_name(dc_pods[-1], node_name), (
                f"Pod {dc_pods[-1].name} is not running on labeled node {node_name}"
            )
        helpers.remove_label_from_worker_node(
            node_list=[node_name], label_key="nodetype"
        )

    # Label other test nodes to be able to run app pods later
    helpers.label_worker_node(
        node_list=test_nodes, label_key="nodetype", label_value="app-pod"
    )

    # Get ceph mon,osd pods running on selected node if colocated scenario
    # and extra OCS nodes are present
    ceph_pods = []
    if scenario == "colocated" and len(test_nodes) > len(ceph_cluster.osds):
        # Fix: copy the list - the original aliased ceph_cluster.osds and
        # then extend()-ed it, mutating the CephCluster object's own state
        pods_to_check = list(ceph_cluster.osds)
        # Skip mon pods if mon_count is 5 as there may not be enough nodes
        # for all mons to run after multiple node failures
        if ceph_cluster.mon_count == 3:
            pods_to_check.extend(ceph_cluster.mons)
        for pod_obj in pods_to_check:
            # NOTE(review): `in app_pod_nodes[0]` is a substring test
            # against the first selected node name - presumably only one
            # node is selected in this scenario; confirm against callers
            if pod.get_pod_node(pod_obj).name in app_pod_nodes[0]:
                ceph_pods.append(pod_obj)
        logger.info(
            f"Colocated Mon, OSD pods: {[pod_obj.name for pod_obj in ceph_pods]}"
        )

    disruptor = []
    if disrupt_provisioner:
        disruptor = self.disrupt_plugin_provisioner_pods(app_pod_nodes)

    return ceph_cluster, dc_pods, ceph_pods, app_pod_nodes, test_nodes, disruptor
def finalizer():
    """Unlabel the worker nodes used by the test, then confirm Ceph health."""
    helpers.remove_label_from_worker_node(
        worker_nodes, label_key="nodetype"
    )
    # Generous retry count: the cluster may still be recovering
    ceph_health_check(tries=80)