def finalizer():
    worker_nodes = get_worker_nodes()
    # Removing created label on all worker nodes
    remove_label_from_worker_node(worker_nodes, label_key="dc")
    for thread in self.threads:
        thread.join()
    ceph_health_check()
def cleanup(self):
    """
    Function to tear down
    """
    # Delete all pods, pvcs and namespaces
    for namespace in self.namespace_list:
        delete_objs_parallel(
            obj_list=pod.get_all_pods(namespace=namespace.namespace),
            namespace=namespace.namespace,
            kind=self.kind,
        )
        delete_objs_parallel(
            obj_list=pvc.get_all_pvc_objs(namespace=namespace.namespace),
            namespace=namespace.namespace,
            kind=constants.PVC,
        )
        ocp = OCP(kind=constants.NAMESPACE)
        ocp.delete(resource_name=namespace.namespace)

    # Remove scale label from worker nodes in cleanup
    scale_workers = machine.get_labeled_nodes(constants.SCALE_LABEL)
    helpers.remove_label_from_worker_node(
        node_list=scale_workers, label_key="scale-label"
    )

    # Delete machineset which will delete respective nodes too for aws-ipi platform
    if self.ms_name:
        for name in self.ms_name:
            machine.delete_custom_machineset(name)
def finalizer():
    worker_nodes = get_worker_nodes()
    # Removing created label on all worker nodes
    remove_label_from_worker_node(worker_nodes, label_key="dc")

    # Verify OSD encryption
    if config.ENV_DATA.get("encryption_at_rest"):
        osd_encryption_verification()
def finalizer():
    helpers.remove_label_from_worker_node(
        node_list=test_nodes, label_key="nodetype"
    )
    # Check ceph health
    ceph_health_check(tries=40)
def delete_worker_node():
    # Remove scale label from worker nodes
    scale_workers = machine.get_labeled_nodes(constants.SCALE_LABEL)
    if scale_workers:
        helpers.remove_label_from_worker_node(
            node_list=scale_workers, label_key="scale-label"
        )
    # Delete machineset
    if ms_name:
        for name in ms_name:
            machine.delete_custom_machineset(name)
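For reference, deleting a custom MachineSet removes the Machines (and therefore the worker nodes) it owns; MachineSets live in the openshift-machine-api namespace. The sketch below is a hypothetical stand-in for machine.delete_custom_machineset, shown only to illustrate the underlying operation; the actual ocs-ci helper may differ.

# Hypothetical stand-in for machine.delete_custom_machineset (illustration only).
import subprocess


def delete_machineset_sketch(machineset_name, namespace="openshift-machine-api"):
    # Deleting the MachineSet lets the machine-api controller scale down and
    # remove the backing Machines/nodes.
    subprocess.run(
        ["oc", "delete", "machineset", machineset_name, "-n", namespace],
        check=True,
    )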
def finalizer():
    worker_nodes = get_worker_nodes()
    # Removing created label on all worker nodes
    remove_label_from_worker_node(worker_nodes, label_key="dc")
    for thread in self.threads:
        thread.join()
    log.info("Get the machine set name from one of the worker node names")
    machine_name = machine.get_machine_from_node_name(worker_nodes[0])
    machineset_name = machine.get_machineset_from_machine_name(machine_name)
    log.info(
        "Verify that the current replica count is equal to the ready replica count"
    )
    machine.change_current_replica_count_to_ready_replica_count(machineset_name)
    ceph_health_check()
def cleanup(self):
    run(f"oc delete -f {self.crd}", shell=True, cwd=self.dir)
    run(f"oc delete -f {self.operator}", shell=True, cwd=self.dir)
    run("oc delete -f deploy", shell=True, cwd=self.dir)
    run_cmd(f"oc delete project {self.namespace}")
    run(
        "oc delete -f resources/kernel-cache-drop-clusterrole.yaml",
        shell=True,
        check=True,
        cwd=self.dir,
    )
    self.ns_obj.wait_for_delete(resource_name=self.namespace, timeout=180)
    # Reset namespace to default
    switch_to_default_rook_cluster_project()
    helpers.remove_label_from_worker_node(
        self.worker_nodes, label_key="kernel-cache-dropper"
    )
def finalizer():
    """
    Make sure that all cluster's nodes are in 'Ready' state and if not,
    change them back to 'Ready' state by marking them as schedulable
    """
    scheduling_disabled_nodes = [
        n.name
        for n in get_node_objs()
        if n.ocp.get_resource_status(n.name)
        == constants.NODE_READY_SCHEDULING_DISABLED
    ]
    if scheduling_disabled_nodes:
        schedule_nodes(scheduling_disabled_nodes)

    # Remove label created for DC app pods on all worker nodes
    node_objs = get_node_objs()
    for node_obj in node_objs:
        if "dc" in node_obj.get().get("metadata").get("labels").keys():
            remove_label_from_worker_node([node_obj.name], label_key="dc")
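As a point of reference, marking a node schedulable again (what schedule_nodes is expected to do for nodes in Ready,SchedulingDisabled state) corresponds to `oc adm uncordon <node>`. The sketch below is an assumption-level illustration, not the ocs-ci helper itself.

# Illustrative sketch only: uncordon nodes that are Ready,SchedulingDisabled.
import subprocess


def schedule_nodes_sketch(node_names):
    for name in node_names:
        # `oc adm uncordon` marks the node schedulable again.
        subprocess.run(["oc", "adm", "uncordon", name], check=True)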
def cleanup(self):
    """
    Clean up the cluster from the benchmark-operator project
    """
    # Reset namespace to default
    switch_to_default_rook_cluster_project()

    log.info("Delete the benchmark-operator project")
    run("make undeploy", shell=True, check=True, cwd=self.dir)

    # Wait until the benchmark-operator project is deleted
    self.ns_obj.wait_for_delete(resource_name=self.namespace, timeout=180)

    # Remove from the workers the label used for cache dropping
    log.info("Remove labels from worker nodes.")
    helpers.remove_label_from_worker_node(self.worker_nodes, label_key=BMO_LABEL)

    # Wait another 10 seconds after cleanup is done.
    time.sleep(10)
def finalizer():
    worker_nodes = get_worker_nodes()
    # Removing created label on all worker nodes
    remove_label_from_worker_node(worker_nodes, label_key="dc")
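For context, removing a node label with `oc`/`kubectl` is done by appending `-` to the label key (for example `oc label node <node> dc-`). The function below is a hypothetical stand-in for remove_label_from_worker_node, shown only to illustrate the underlying operation, not the actual ocs-ci implementation.

# Hypothetical stand-in for remove_label_from_worker_node (illustration only).
import subprocess


def remove_label_from_worker_node_sketch(node_list, label_key):
    for node_name in node_list:
        # Appending "-" to the key removes the label from the node.
        subprocess.run(
            ["oc", "label", "node", node_name, f"{label_key}-"], check=True
        )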
def test_rwo_pvc_fencing_node_prolonged_and_short_network_failure(
    self, nodes, setup, node_restart_teardown
):
    """
    OCS-1431/OCS-1436:
    - Start DeploymentConfig based app pods on 1 node
    - Make the node (where app pods are running) unresponsive by bringing
      its main network interface down
    - Disrupt the leader provisioner pods if not running on above selected node
    - Check new app pods and/or mon, osd pods scheduled on another node are
      stuck due to Multi-Attach error.
    - Power off the unresponsive node
    - Force delete the app pods and/or mon,osd pods on the unresponsive node
    - Check new app pods and/or mon, osd pods scheduled on another node come
      into Running state
    - Run IOs on new app pods
    - Again make the node (where app pods are running) unresponsive by
      bringing its main network interface down
    - Check new app pods scheduled on another node are stuck due to
      Multi-Attach error.
    - Reboot the unresponsive node
    - When unresponsive node recovers, run IOs on new app pods
    """
    ceph_cluster, dc_pods, ceph_pods, app_pod_nodes, test_nodes, disruptor = setup

    external_mode = helpers.storagecluster_independent_check()
    extra_nodes = list(set(test_nodes) - set(app_pod_nodes))
    helpers.remove_label_from_worker_node(
        node_list=extra_nodes[:-1], label_key="nodetype"
    )

    # Run IO on pods
    md5sum_data = self.run_and_verify_io(
        pod_list=dc_pods, fio_filename="io_file1", run_io_in_bg=True
    )

    # Disrupt leader plugin-provisioner pods, skip if running on node to be failed
    if disruptor:
        [disruption.delete_resource() for disruption in disruptor]

    # Induce network failure on the nodes
    node.node_network_failure(app_pod_nodes)
    logger.info(f"Waiting for {self.prolong_nw_fail_time} seconds")
    sleep(self.prolong_nw_fail_time)

    # Wait for pods to be rescheduled
    for pod_obj in dc_pods + ceph_pods:
        pod_obj.ocp.wait_for_resource(
            condition=constants.STATUS_TERMINATING, resource_name=pod_obj.name
        )

    # Fetch info of new pods and verify Multi-Attach error
    new_dc_pods = self.get_new_pods(dc_pods)
    assert len(new_dc_pods) == len(dc_pods), "Unexpected number of app pods"
    self.verify_multi_attach_error(new_dc_pods)

    new_ceph_pods = []
    if ceph_pods:
        new_ceph_pods = self.get_new_pods(ceph_pods)
        assert len(new_ceph_pods) > 0, "Unexpected number of osd pods"
        self.verify_multi_attach_error(new_ceph_pods)

    logger.info("Executing manual recovery steps")
    # Power off the unresponsive node
    logger.info(f"Powering off the unresponsive node: {app_pod_nodes}")
    nodes.stop_nodes(node.get_node_objs(app_pod_nodes))

    # Force delete the app pods and/or mon,osd pods on the unresponsive node
    for pod_obj in dc_pods + ceph_pods:
        pod_obj.delete(force=True)

    # Wait for new app pods to reach Running state
    for pod_obj in new_dc_pods:
        assert pod_obj.ocp.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            resource_name=pod_obj.name,
            timeout=1200,
            sleep=30,
        ), f"App pod with name {pod_obj.name} did not reach Running state"

    if not external_mode:
        # Wait for mon and osd pods to reach Running state
        selectors_to_check = {
            constants.MON_APP_LABEL: self.expected_mon_count,
            constants.OSD_APP_LABEL: ceph_cluster.osd_count,
        }
        for selector, count in selectors_to_check.items():
            assert ceph_cluster.POD.wait_for_resource(
                condition=constants.STATUS_RUNNING,
                selector=selector,
                resource_count=count,
                timeout=1800,
                sleep=60,
            ), f"{count} expected pods with selector {selector} are not in Running state"

        if ceph_cluster.mon_count == self.expected_mon_count:
            # Check ceph health
            toolbox_status = ceph_cluster.POD.get_resource_status(
                ceph_cluster.toolbox.name
            )
            if toolbox_status == constants.STATUS_TERMINATING:
                ceph_cluster.toolbox.delete(force=True)

            assert ceph_health_check(), "Ceph cluster health is not OK"
            logger.info("Ceph cluster health is OK")

    # Verify data integrity from new pods
    for num, pod_obj in enumerate(new_dc_pods):
        pod.verify_data_integrity(
            pod_obj=pod_obj, file_name="io_file1", original_md5sum=md5sum_data[num]
        )

    # Run IO on new pods
    md5sum_data2 = self.run_and_verify_io(
        pod_list=new_dc_pods, fio_filename="io_file2", run_io_in_bg=True
    )

    helpers.label_worker_node(
        node_list=extra_nodes[:-1], label_key="nodetype", label_value="app-pod"
    )

    # Induce network failure on the node
    node.node_network_failure(extra_nodes[-1])
    logger.info(f"Waiting for {self.short_nw_fail_time} seconds")
    sleep(self.short_nw_fail_time)

    # Wait for pods to be rescheduled
    for pod_obj in new_dc_pods:
        pod_obj.ocp.wait_for_resource(
            condition=constants.STATUS_TERMINATING,
            resource_name=pod_obj.name,
            timeout=600,
            sleep=30,
        )

    # Fetch info of new pods and verify Multi-Attach error
    new_dc_pods2 = self.get_new_pods(new_dc_pods)
    assert len(new_dc_pods2) == len(new_dc_pods), "Unexpected number of app pods"
    self.verify_multi_attach_error(new_dc_pods2)

    # Reboot the unresponsive node
    logger.info(f"Rebooting the unresponsive node: {extra_nodes[-1]}")
    nodes.restart_nodes_by_stop_and_start(node.get_node_objs([extra_nodes[-1]]))
    node.wait_for_nodes_status(
        node_names=[extra_nodes[-1]], status=constants.NODE_READY
    )

    # Wait for new app pods to reach Running state
    for pod_obj in new_dc_pods2:
        assert pod_obj.ocp.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            resource_name=pod_obj.name,
            timeout=1200,
            sleep=30,
        ), f"App pod with name {pod_obj.name} did not reach Running state"

    if not external_mode:
        # Wait for mon and osd pods to reach Running state
        for selector, count in selectors_to_check.items():
            assert ceph_cluster.POD.wait_for_resource(
                condition=constants.STATUS_RUNNING,
                selector=selector,
                resource_count=count,
                timeout=1800,
                sleep=60,
            ), f"{count} expected pods with selector {selector} are not in Running state"

        if ceph_cluster.mon_count == 3:
            # Check ceph health
            assert ceph_health_check(), "Ceph cluster health is not OK"
            logger.info("Ceph cluster health is OK")

    # Verify data integrity from new pods
    for num, pod_obj in enumerate(new_dc_pods2):
        pod.verify_data_integrity(
            pod_obj=pod_obj, file_name="io_file2", original_md5sum=md5sum_data2[num]
        )

    for num, pod_obj in enumerate(new_dc_pods2):
        pod.verify_data_integrity(
            pod_obj=pod_obj, file_name="io_file1", original_md5sum=md5sum_data[num]
        )

    # Run IO on new pods
    self.run_and_verify_io(
        pod_list=new_dc_pods2, fio_filename="io_file3", return_md5sum=False
    )
def setup(
    self,
    request,
    scenario,
    num_of_nodes,
    num_of_fail_nodes,
    disrupt_provisioner,
    project_factory,
    multi_pvc_factory,
    dc_pod_factory,
):
    """
    Identify the nodes and start DeploymentConfig based app pods using PVC
    with ReadWriteOnce (RWO) access mode on selected nodes

    Args:
        scenario (str): Scenario of app pods running on OCS or dedicated nodes
            (e.g., 'colocated', 'dedicated')
        num_of_nodes (int): number of nodes required for running test
        num_of_fail_nodes (int): number of nodes to make unresponsive during test
        disrupt_provisioner (bool): True to disrupt the leader provisioner
            pods if not running on selected nodes, else False
        project_factory: A fixture to create new project
        multi_pvc_factory: A fixture to create a set of new PVCs
        dc_pod_factory: A fixture to create deploymentconfig pods

    Returns:
        tuple: containing the params used in test cases
    """
    ocs_nodes, non_ocs_nodes = self.identify_and_add_nodes(scenario, num_of_nodes)
    test_nodes = ocs_nodes if (scenario == "colocated") else non_ocs_nodes
    logger.info(f"Using nodes {test_nodes} for running test")

    def finalizer():
        helpers.remove_label_from_worker_node(
            node_list=test_nodes, label_key="nodetype"
        )

        # Check ceph health
        ceph_health_check(tries=40)

    request.addfinalizer(finalizer)

    project = project_factory()

    if helpers.storagecluster_independent_check():
        ceph_cluster = CephClusterExternal()
    else:
        ceph_cluster = CephCluster()
        # Wait for mon pods to reach expected count
        # Bug 1778273 - [RFE]: Configure 5 MONs for OCS cluster with 5 or more nodes
        # This wait is required for some of the previous OCS versions (< 4.5)
        current_mon_count = int(
            ceph_cluster.CEPHCLUSTER.get_resource(
                resource_name="", column="MONCOUNT"
            )
        )
        assert ceph_cluster.POD.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            selector=constants.MON_APP_LABEL,
            resource_count=current_mon_count,
            timeout=900,
        )

        ceph_cluster.mons = []
        ceph_cluster.scan_cluster()

    # Select nodes for running app pods and inducing network failure later
    app_pod_nodes = self.select_nodes_for_app_pods(
        scenario, ceph_cluster, ocs_nodes, non_ocs_nodes, num_of_fail_nodes
    )

    # Create multiple RBD and CephFS backed PVCs with RWO accessmode
    num_of_pvcs = self.num_of_app_pods_per_node * num_of_fail_nodes
    rbd_pvcs = multi_pvc_factory(
        interface=constants.CEPHBLOCKPOOL,
        project=project,
        size=self.pvc_size,
        access_modes=[constants.ACCESS_MODE_RWO],
        num_of_pvc=num_of_pvcs,
    )
    cephfs_pvcs = multi_pvc_factory(
        interface=constants.CEPHFILESYSTEM,
        project=project,
        size=self.pvc_size,
        access_modes=[constants.ACCESS_MODE_RWO],
        num_of_pvc=num_of_pvcs,
    )

    # Create deploymentconfig based pods
    dc_pods = []
    # Start app-pods on selected node(s)
    for node_name in app_pod_nodes:
        logger.info(f"Starting app pods on the node {node_name}")
        helpers.label_worker_node(
            node_list=[node_name], label_key="nodetype", label_value="app-pod"
        )

        for num in range(self.num_of_app_pods_per_node):
            dc_pods.append(
                dc_pod_factory(
                    interface=constants.CEPHBLOCKPOOL,
                    pvc=rbd_pvcs.pop(0),
                    node_selector={"nodetype": "app-pod"},
                )
            )
            assert pod.verify_node_name(
                dc_pods[-1], node_name
            ), f"Pod {dc_pods[-1].name} is not running on labeled node {node_name}"
            dc_pods.append(
                dc_pod_factory(
                    interface=constants.CEPHFILESYSTEM,
                    pvc=cephfs_pvcs.pop(0),
                    node_selector={"nodetype": "app-pod"},
                )
            )
            assert pod.verify_node_name(
                dc_pods[-1], node_name
            ), f"Pod {dc_pods[-1].name} is not running on labeled node {node_name}"
        helpers.remove_label_from_worker_node(
            node_list=[node_name], label_key="nodetype"
        )

    # Label other test nodes to be able to run app pods later
    helpers.label_worker_node(
        node_list=test_nodes, label_key="nodetype", label_value="app-pod"
    )

    # Get ceph mon,osd pods running on selected node if colocated scenario
    # and extra OCS nodes are present
    # Recovery steps for MON and OSDS not required from OCS 4.4 onwards
    # Refer to BZ 1830015 and BZ 1835908
    ceph_pods = []
    if float(config.ENV_DATA["ocs_version"]) < 4.4 and (
        scenario == "colocated" and len(test_nodes) > 3
    ):
        pods_to_check = ceph_cluster.osds
        # Skip mon pods if mon_count is 5 as there may not be enough nodes
        # for all mons to run after multiple node failures
        if ceph_cluster.mon_count == 3:
            pods_to_check.extend(ceph_cluster.mons)
        for pod_obj in pods_to_check:
            if pod.get_pod_node(pod_obj).name in app_pod_nodes[0]:
                ceph_pods.append(pod_obj)
        logger.info(
            f"Colocated Mon, OSD pods: {[pod_obj.name for pod_obj in ceph_pods]}"
        )

    disruptor = []
    if disrupt_provisioner:
        disruptor = self.disrupt_plugin_provisioner_pods(app_pod_nodes)

    return ceph_cluster, dc_pods, ceph_pods, app_pod_nodes, test_nodes, disruptor
def finalizer():
    helpers.remove_label_from_worker_node(node_list=test_nodes, label_key="nodetype")