class TestNodeReplacementWithIO(ManageTest):
    """
    Knip-894 Node replacement proactive with IO
    """

    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Attach a fresh Sanity helper to the test instance
        """
        self.sanity_helpers = Sanity()

    def test_nodereplacement_proactive_with_io_running(
        self, pvc_factory, pod_factory, dc_pod_factory
    ):
        """
        Knip-894 Node Replacement proactive when IO running in the background

        Flow:
            - Start dc pods with background IO (RBD first, then CephFS) on
              every worker node except the selected OSD node
            - Delete and re-create the selected OSD node
            - Create and delete resources through the sanity helpers
            - Run the health check
        """
        # Collect the worker nodes and pick the OSD node to be replaced
        worker_node_list = get_worker_nodes()
        log.info(f"Current available worker nodes are {worker_node_list}")
        osd_node_name = select_osd_node_name()

        # IO pods go on every worker node except the one being replaced
        io_nodes = [
            worker_node for worker_node in worker_node_list
            if worker_node != osd_node_name
        ]

        # One pass per interface, in the order RBD -> CephFS
        workloads = (
            (
                "Creating dc pod backed with rbd pvc and running io in bg",
                constants.CEPHBLOCKPOOL,
            ),
            (
                "Creating dc pod backed with cephfs pvc and running io in bg",
                constants.CEPHFILESYSTEM,
            ),
        )
        for message, interface in workloads:
            log.info(message)
            for io_node in io_nodes:
                dc_pod = dc_pod_factory(
                    interface=interface, node_name=io_node, size=20
                )
                pod.run_io_in_bg(dc_pod, expect_to_fail=False, fedora_dc=True)

        delete_and_create_osd_node(osd_node_name)

        # Create and then delete resources via the sanity helpers
        log.info("Creating Resources using sanity helpers")
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()

        # Final verification
        log.info(
            "Verifying All resources are Running and matches expected result"
        )
        self.sanity_helpers.health_check(tries=120)
def test_deployment(pvc_factory, pod_factory):
    """
    Verify the deployed cluster and run a basic sanity flow on it.

    When the run is not teardown-only (or a deploy was requested), the test
    asserts the OCP cluster is running. Unless OCS deployment is skipped it
    also verifies the OCS installation, adjusts the noobaa endpoints count
    when more than one endpoint is configured, and checks basic cluster
    functionality by creating and deleting resources.

    Args:
        pvc_factory: PVC factory fixture passed to the sanity helpers
        pod_factory: Pod factory fixture passed to the sanity helpers

    """
    deploy = config.RUN['cli_params'].get('deploy')
    teardown = config.RUN['cli_params'].get('teardown')
    if not teardown or deploy:
        log.info("Verifying OCP cluster is running")
        assert is_cluster_running(config.ENV_DATA['cluster_path'])
        if not config.ENV_DATA['skip_ocs_deployment']:
            ocs_registry_image = config.DEPLOYMENT.get('ocs_registry_image')
            ocs_install_verification(ocs_registry_image=ocs_registry_image)

            # 'noobaa_endpoints' may be absent from the config, in which case
            # .get() returns None and a bare `None > 1` comparison raises
            # TypeError on Python 3. Only scale when a count is configured.
            nb_eps = config.DEPLOYMENT.get('noobaa_endpoints')
            if nb_eps is not None and nb_eps > 1:
                change_noobaa_endpoints_count(nb_eps)

            # Check basic cluster functionality by creating resources
            # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
            # run IO and delete the resources
            sanity_helpers = Sanity()
            sanity_helpers.health_check()
            sanity_helpers.create_resources(pvc_factory, pod_factory)
            sanity_helpers.delete_resources()

    if teardown:
        log.info(
            "Cluster will be destroyed during teardown part of this test."
        )
class TestAutomatedRecoveryFromFailedNodes(ManageTest):
    """
    Knip-678 Automated recovery from failed nodes
    """

    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Attach a fresh Sanity helper to the test instance
        """
        self.sanity_helpers = Sanity()

    def test_automated_recovery_from_failed_nodes_IPI_proactive(
        self, pvc_factory, pod_factory
    ):
        """
        Knip-678 Automated recovery from failed nodes
        Proactive case - IPI

        Flow:
            - Pick the node of a randomly chosen OSD pod
            - Resolve its machine and machineset
            - Add a new labeled node to that machineset, then delete the
              original machine
            - Wait for all pods to be Running, then run the sanity flow and
              the health check
        """
        # Pick a random OSD pod and resolve the node it runs on
        selected_osd_pod = random.choice(pod.get_osd_pods())
        osd_node_name = pod.get_pod_node(selected_osd_pod).name
        log.info(f"Selected OSD is {osd_node_name}")

        # Node name -> machine name
        machine_name = machine.get_machine_from_node_name(osd_node_name)
        log.info(f"{osd_node_name} associated machine is {machine_name}")

        # Machine name -> machineset name
        machineset_name = machine.get_machineset_from_machine_name(
            machine_name
        )
        log.info(f"{osd_node_name} associated machineset is {machineset_name}")

        # Bring up the replacement node before removing the old machine
        add_new_node_and_label_it(machineset_name)

        machine.delete_machine(machine_name)
        log.info(f"Successfully deleted machine {machine_name}")

        # Every pod is expected to reach Running state
        for current_pod in pod.get_all_pods(wait=True):
            wait_for_resource_state(
                resource=current_pod,
                state=constants.STATUS_RUNNING,
                timeout=200,
            )

        # Basic cluster functionality: create resources (pools,
        # storageclasses, PVCs, pods - both CephFS and RBD), run IO and
        # delete the resources
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()

        # Cluster and Ceph health checks
        self.sanity_helpers.health_check()
class TestDetachAttachWorkerVolume(ManageTest):
    """
    Test class for detach and attach worker volume
    """

    @pytest.fixture(autouse=True)
    def teardown(self, request, nodes):
        """
        Restart nodes that are in status NotReady, for situations in which
        the test failed before restarting the node after detach volume,
        which leaves nodes in NotReady
        """

        def finalizer():
            not_ready_nodes = [
                n for n in node.get_node_objs()
                if n.ocp.get_resource_status(n.name) == constants.NODE_NOT_READY
            ]
            # Only warn (and act) when there really are NotReady nodes;
            # an unconditional warning with an empty list is misleading.
            if not_ready_nodes:
                logger.warning(
                    f"Nodes in NotReady status found: "
                    f"{[n.name for n in not_ready_nodes]}"
                )
                nodes.restart_nodes(not_ready_nodes)
                node.wait_for_nodes_status()

        request.addfinalizer(finalizer)

    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Initialize Sanity instance
        """
        self.sanity_helpers = Sanity()

    @pytest.mark.polarion_id("OCS-1085")
    def test_detach_attach_worker_volume(self, nodes, pvc_factory, pod_factory):
        """
        Detach and attach worker volume

        - Detach the data volume from one of the worker nodes
        - Validate cluster functionality, without checking cluster and Ceph
          health (as one node volume is detached, the cluster will be
          unhealthy) by creating resources and running IO
        - Wait for the volumes to be re-attached back to the worker node
        - Restart the node so the volume will get re-mounted

        """
        # Get a data volume
        data_volume = nodes.get_data_volumes()[0]
        # Get the worker node according to the volume attachment
        worker = nodes.get_node_by_attached_volume(data_volume)

        # Detach volume (logging is done inside the function)
        nodes.detach_volume(data_volume)

        # Validate cluster is still functional
        try:
            # In case the selected node that its volume disk was detached was
            # the one running the ceph tools pod, we'll need to wait for a new
            # ct pod to start. For that, a function that connects to the ct
            # pod is being used to check if it's alive
            _ = get_admin_key()
        except CommandFailed as ex:
            if "connection timed out" not in str(ex):
                raise
            logger.info(
                "Ceph tools box was running on the node that its data "
                "volume has been detached. Hence, waiting for a new "
                "Ceph tools box pod to spin up"
            )
            wait_for_resource_count_change(
                func_to_use=get_all_pods,
                previous_num=1,
                namespace=config.ENV_DATA['cluster_namespace'],
                timeout=120,
                selector='app=rook-ceph-tools'
            )

        # Runs only when the tools pod is reachable (directly or after the
        # wait above). Previously this lived in a `finally:` and therefore
        # also ran while an unexpected error was propagating, which could
        # mask that error with a resource-creation failure.
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)

        # Wait for worker volume to be re-attached automatically to the node
        assert nodes.wait_for_volume_attach(data_volume), (
            f"Volume {data_volume.id} failed to be re-attached to a worker node"
        )

        # Restart the instance so the volume will get re-mounted
        nodes.restart_nodes([worker])

        # Cluster health check
        self.sanity_helpers.health_check()

    @pytest.mark.polarion_id("OCS-1086")
    def test_detach_attach_2_data_volumes(self, nodes, pvc_factory, pod_factory):
        """
        Detach and attach disk from 2 worker nodes

        - Detach 2 of the data volumes from their worker nodes
        - Wait for the volumes to be re-attached back to the worker nodes
        - Restart the nodes so the volume will get re-mounted in each node
        - Check cluster health and functionality to make sure detach,
          attach and restart did not affect the cluster

        """
        # Get 2 data volumes
        data_volumes = nodes.get_data_volumes()[:2]
        # Remember each volume's worker before detaching it
        workers_and_volumes = [
            {'worker': nodes.get_node_by_attached_volume(vol), 'volume': vol}
            for vol in data_volumes
        ]

        for worker_and_volume in workers_and_volumes:
            # Detach the volume (logging is done inside the function)
            nodes.detach_volume(worker_and_volume['volume'])

        for worker_and_volume in workers_and_volumes:
            # Wait for worker volume to be re-attached automatically
            # to the node
            assert nodes.wait_for_volume_attach(worker_and_volume['volume']), (
                f"Volume {worker_and_volume['volume']} "
                f"failed to be re-attached to a worker node"
            )

        # Restart the instances so the volume will get re-mounted
        nodes.restart_nodes(
            [
                worker_and_volume['worker']
                for worker_and_volume in workers_and_volumes
            ]
        )

        # Validate cluster is still functional
        self.sanity_helpers.health_check()
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
class TestNodesMaintenance(ManageTest):
    """
    Test basic flows of maintenance (unschedule and drain) and
    activate operations, followed by cluster functionality and health checks

    """

    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Initialize Sanity instance

        """
        self.sanity_helpers = Sanity()

    @tier1
    @pytest.mark.parametrize(
        argnames=["node_type"],
        argvalues=[
            pytest.param(*['worker'], marks=pytest.mark.polarion_id("OCS-1269")),
            pytest.param(*['master'], marks=pytest.mark.polarion_id("OCS-1272"))
        ]
    )
    def test_node_maintenance(self, node_type, pvc_factory, pod_factory):
        """
        OCS-1269/OCS-1272:
        - Maintenance (mark as unschedulable and drain) 1 worker/master node
        - Check cluster functionality by creating resources
          (pools, storageclasses, PVCs, pods - both CephFS and RBD)
        - Mark the node as schedulable
        - Check cluster and Ceph health

        """
        # Get 1 node
        typed_nodes = node.get_typed_nodes(node_type=node_type, num_of_nodes=1)
        # Fail with a clear message instead of an IndexError below,
        # matching the other tests in this class
        assert typed_nodes, f"Failed to find a {node_type} node for the test"
        typed_node_name = typed_nodes[0].name

        # Maintenance the node (unschedule and drain)
        node.drain_nodes([typed_node_name])

        # Check basic cluster functionality by creating resources
        # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
        # run IO and delete the resources
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()

        # Mark the node back to schedulable
        node.schedule_nodes([typed_node_name])

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()

    @tier2
    @pytest.mark.skipif(
        condition=config.ENV_DATA['platform'] != 'AWS',
        reason="Tests are not running on AWS deployed cluster"
    )
    @pytest.mark.parametrize(
        argnames=["node_type"],
        argvalues=[
            pytest.param(*['worker'], marks=pytest.mark.polarion_id("OCS-1292")),
            pytest.param(*['master'], marks=pytest.mark.polarion_id("OCS-1293"))
        ]
    )
    def test_node_maintenance_restart_activate(
        self, ec2_instances, aws_obj, pvc_factory, pod_factory, node_type
    ):
        """
        OCS-1292/OCS-1293:
        - Maintenance (mark as unschedulable and drain) 1 worker/master node
        - Restart the node's ec2 instance
        - Mark the node as schedulable
        - Check cluster and Ceph health
        - Check cluster functionality by creating and deleting resources
          (pools, storageclasses, PVCs, pods - both CephFS and RBD)

        """
        # Get 1 node
        typed_node = node.get_typed_nodes(node_type=node_type, num_of_nodes=1)
        assert typed_node, f"Failed to find a {node_type} node for the test"
        typed_node_name = typed_node[0].name

        # Maintenance the node (unschedule and drain). The function
        # contains logging
        node.drain_nodes([typed_node_name])

        instance = aws.get_instances_ids_and_names(typed_node)
        assert instance, (
            f"Failed to get ec2 instances for node {typed_node_name}"
        )

        # Restarting ec2 instance
        aws_obj.restart_ec2_instances(instances=instance, wait=True)

        # The node is still cordoned after the restart, so wait for the
        # "ready, scheduling disabled" status
        node.wait_for_nodes_status(
            node_names=[typed_node_name],
            status=constants.NODE_READY_SCHEDULING_DISABLED
        )

        # Mark the node back to schedulable
        node.schedule_nodes([typed_node_name])

        # Check cluster and Ceph health and checking basic cluster
        # functionality by creating resources (pools, storageclasses,
        # PVCs, pods - both CephFS and RBD), run IO and delete the resources
        self.sanity_helpers.health_check()
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()

    @tier2
    @pytest.mark.parametrize(
        argnames=["nodes_type"],
        argvalues=[
            pytest.param(*['worker'], marks=pytest.mark.polarion_id("OCS-1273")),
            pytest.param(*['master'], marks=pytest.mark.polarion_id("OCS-1271"))
        ]
    )
    def test_2_nodes_maintenance_same_type(
        self, pvc_factory, pod_factory, nodes_type
    ):
        """
        OCS-1273/OCS-1271:
        - Maintenance (mark as unschedulable and drain) 2 worker/master nodes
        - Mark the nodes as schedulable
        - Check cluster and Ceph health
        - Check cluster functionality by creating resources
          (pools, storageclasses, PVCs, pods - both CephFS and RBD)

        """
        # Get 2 nodes
        typed_nodes = node.get_typed_nodes(node_type=nodes_type, num_of_nodes=2)
        assert typed_nodes, f"Failed to find a {nodes_type} node for the test"

        typed_node_names = [typed_node.name for typed_node in typed_nodes]

        # Maintenance the nodes (unschedule and drain)
        node.drain_nodes(typed_node_names)

        # Mark the nodes back to schedulable
        node.schedule_nodes(typed_node_names)

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()

        # Check basic cluster functionality by creating resources
        # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
        # run IO and delete the resources
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()

    @tier2
    @pytest.mark.polarion_id("OCS-1274")
    def test_2_nodes_different_types(self, pvc_factory, pod_factory):
        """
        OCS-1274:
        - Maintenance (mark as unschedulable and drain) 1 worker node and 1
          master node
        - Check cluster functionality by creating resources
          (pools, storageclasses, PVCs, pods - both CephFS and RBD)
        - Mark the nodes as schedulable
        - Check cluster and Ceph health

        """
        # Get 1 node from each type. The emptiness check happens per type
        # and before indexing; a single assert after building the list with
        # `[0]` could never fire because IndexError would be raised first.
        selected_nodes = []
        for node_type in ['worker', 'master']:
            typed_nodes = node.get_typed_nodes(
                node_type=node_type, num_of_nodes=1
            )
            assert typed_nodes, (
                f"Failed to find a {node_type} node for the test"
            )
            selected_nodes.append(typed_nodes[0])

        node_names = [typed_node.name for typed_node in selected_nodes]

        # Maintenance the nodes (unschedule and drain)
        node.drain_nodes(node_names)

        # Check basic cluster functionality by creating resources
        # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
        # run IO and delete the resources
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()

        # Mark the nodes back to schedulable
        node.schedule_nodes(node_names)

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()
class TestAutomatedRecoveryFromFailedNodes(ManageTest):
    """
    Knip-678 Automated recovery from failed nodes
    """

    @pytest.fixture(autouse=True)
    def teardown(self, request):
        """
        Register a finalizer that removes the 'dc' label from all
        worker nodes
        """

        def finalizer():
            worker_nodes = get_worker_nodes()
            # Removing created label on all worker nodes
            remove_label_from_worker_node(worker_nodes, label_key="dc")

        request.addfinalizer(finalizer)

    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Initialize Sanity instance

        """
        self.sanity_helpers = Sanity()

    @pytest.mark.parametrize(
        argnames=["interface"],
        argvalues=[
            pytest.param(*['rbd'], marks=pytest.mark.polarion_id("OCS-2100")),
            pytest.param(*['cephfs'], marks=pytest.mark.polarion_id("OCS-2101")),
        ]
    )
    def test_automated_recovery_from_failed_nodes_IPI_proactive(
        self, interface, pvc_factory, pod_factory, dc_pod_factory
    ):
        """
        Knip-678 Automated recovery from failed nodes
        Proactive case - IPI

        Args:
            interface (str): 'rbd' or 'cephfs'; selects the interface used
                for the DC app pods
            pvc_factory: PVC factory fixture passed to the sanity helpers
            pod_factory: Pod factory fixture passed to the sanity helpers
            dc_pod_factory: Factory fixture used to create the DC app pods

        """
        # Get OSD running nodes
        osd_running_nodes = get_osd_running_nodes()
        log.info(f"OSDs are running on nodes {osd_running_nodes}")
        # Label osd nodes with fedora app
        label_worker_node(osd_running_nodes, label_key='dc', label_value='fedora')

        # Create DC app pods
        log.info("Creating DC based app pods")
        interface = (
            constants.CEPHBLOCKPOOL if interface == 'rbd'
            else constants.CEPHFILESYSTEM
        )
        dc_pod_obj = []
        for _ in range(2):
            dc_pod = dc_pod_factory(
                interface=interface, node_selector={'dc': 'fedora'}
            )
            pod.run_io_in_bg(dc_pod, fedora_dc=True)
            dc_pod_obj.append(dc_pod)

        # Get app pods running nodes
        dc_pod_node_name = get_app_pod_running_nodes(dc_pod_obj)
        log.info(f"DC app pod running nodes are {dc_pod_node_name}")

        # Get both osd and app pod running node
        common_nodes = get_both_osd_and_app_pod_running_node(
            osd_running_nodes, dc_pod_node_name
        )
        msg = "Common OSD and app running node(s) NOT found"
        assert (len(common_nodes) > 0), msg
        log.info(f"Common OSD and app pod running nodes are {common_nodes}")

        # Get the machine name using the node name
        machine_name = machine.get_machine_from_node_name(common_nodes[0])
        log.info(f"{common_nodes[0]} associated machine is {machine_name}")

        # Get the machineset name using machine name
        machineset_name = machine.get_machineset_from_machine_name(
            machine_name
        )
        log.info(
            f"{common_nodes[0]} associated machineset is {machineset_name}"
        )

        # Add a new node and label it
        add_new_node_and_label_it(machineset_name)

        # Delete the machine
        machine.delete_machine(machine_name)
        log.info(f"Successfully deleted machine {machine_name}")

        # DC app pods on the failed node will get automatically created on
        # other running node. Waiting for all dc app pod to reach running
        # state
        pod.wait_for_dc_app_pods_to_reach_running_state(dc_pod_obj)
        log.info("All the dc pods reached running state")

        # Check all OCS pods status, they should be in running state
        all_pod_obj = pod.get_all_pods(
            namespace=defaults.ROOK_CLUSTER_NAMESPACE
        )
        # Pods whose name contains one of these markers are excluded from
        # the Running-state check. The previous condition,
        # `('-1-deploy' or 'ocs-deviceset') not in name`, evaluated the
        # `or` first and therefore only ever tested '-1-deploy'.
        skipped_name_markers = ('-1-deploy', 'ocs-deviceset')
        for pod_obj in all_pod_obj:
            if any(marker in pod_obj.name for marker in skipped_name_markers):
                continue
            try:
                helpers.wait_for_resource_state(
                    resource=pod_obj, state=constants.STATUS_RUNNING,
                    timeout=200
                )
            except ResourceWrongStatusException:
                # 'rook-ceph-crashcollector' on the failed node stucks at
                # pending state. BZ 1810014 tracks it.
                # Ignoring 'rook-ceph-crashcollector' pod health check as
                # WA and deleting its deployment so that the pod
                # disappears. Will revert this WA once the BZ is fixed
                if 'rook-ceph-crashcollector' not in pod_obj.name:
                    raise
                ocp_obj = ocp.OCP(
                    namespace=defaults.ROOK_CLUSTER_NAMESPACE
                )
                pod_name = pod_obj.name
                # Deployment name is the pod name minus its last two
                # dash-separated segments
                deployment_name = '-'.join(pod_name.split("-")[:-2])
                command = f"delete deployment {deployment_name}"
                ocp_obj.exec_oc_cmd(command=command)
                log.info(f"Deleted deployment for pod {pod_obj.name}")

        # Check basic cluster functionality by creating resources
        # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
        # run IO and delete the resources
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()
class TestNodesRestart(ManageTest):
    """
    Test ungraceful cluster shutdown
    """

    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Attach a fresh Sanity helper to the test instance

        """
        self.sanity_helpers = Sanity()

    @pytest.fixture(autouse=True)
    def teardown(self, request, nodes):
        """
        Make sure all nodes are up again

        """

        def finalizer():
            nodes.restart_nodes_by_stop_and_start_teardown()

        request.addfinalizer(finalizer)

    @pytest.mark.parametrize(
        argnames=["force"],
        argvalues=[
            pytest.param(*[True], marks=pytest.mark.polarion_id("OCS-894")),
            pytest.param(
                *[False],
                marks=[
                    pytest.mark.polarion_id("OCS-895"),
                    aws_platform_required
                ]
            )
        ]
    )
    def test_nodes_restart(self, nodes, pvc_factory, pod_factory, force):
        """
        Test nodes restart (from the platform layer, i.e, EC2 instances,
        VMWare VMs)

        """
        all_ocp_nodes = get_node_objs()
        nodes.restart_nodes_by_stop_and_start(nodes=all_ocp_nodes, force=force)
        self.sanity_helpers.health_check()
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)

    @bugzilla('1754287')
    @pytest.mark.polarion_id("OCS-2015")
    def test_rolling_nodes_restart(self, nodes, pvc_factory, pod_factory):
        """
        Test restart nodes one after the other and check health status
        in between

        """
        for ocp_node in get_node_objs():
            nodes.restart_nodes(nodes=[ocp_node], wait=False)
            self.sanity_helpers.health_check(cluster_check=False, tries=60)
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)

    @pytest.mark.parametrize(
        argnames=["interface", "operation"],
        argvalues=[
            pytest.param(
                *['rbd', 'create_resources'],
                marks=pytest.mark.polarion_id("OCS-1138")
            ),
            pytest.param(
                *['rbd', 'delete_resources'],
                marks=pytest.mark.polarion_id("OCS-1241")
            ),
            pytest.param(
                *['cephfs', 'create_resources'],
                marks=pytest.mark.polarion_id("OCS-1139")
            ),
            pytest.param(
                *['cephfs', 'delete_resources'],
                marks=pytest.mark.polarion_id("OCS-1242")
            )
        ]
    )
    def test_pv_provisioning_under_degraded_state_stop_provisioner_pod_node(
        self, nodes, pvc_factory, pod_factory, interface, operation
    ):
        """
        Test PV provisioning under degraded state -
        stop the node that has the provisioner pod running on

        OCS-1138:
        - Stop 1 worker node that has the RBD provisioner
          pod running on
        - Wait for the RBD pod provisioner to come up again to running status
        - Validate cluster functionality, without checking cluster and Ceph
          health by creating resources and running IO
        - Start the worker node
        - Check cluster and Ceph health

        OCS-1241:
        - Stop 1 worker node that has the RBD provisioner
          pod running on
        - Wait for the RBD pod provisioner to come up again to running status
        - Validate cluster functionality, without checking cluster and Ceph
          health by deleting resources and running IO
        - Start the worker node
        - Check cluster and Ceph health

        OCS-1139:
        - Stop 1 worker node that has the CephFS provisioner
          pod running on
        - Wait for the CephFS pod provisioner to come up again to running
          status
        - Validate cluster functionality, without checking cluster and Ceph
          health by creating resources and running IO
        - Start the worker node
        - Check cluster and Ceph health

        OCS-1242:
        - Stop 1 worker node that has the CephFS provisioner
          pod running on
        - Wait for the CephFS pod provisioner to come up again to running
          status
        - Validate cluster functionality, without checking cluster and Ceph
          health by deleting resources and running IO
        - Start the worker node
        - Check cluster and Ceph health

        """
        if operation == 'delete_resources':
            # Resources must exist up front so their deletion can be
            # exercised while the node is down
            self.sanity_helpers.create_resources(pvc_factory, pod_factory)

        # Fetch the provisioner pods matching the requested interface
        provisioner_pods = None
        if interface == 'rbd':
            provisioner_pods = pod.get_rbdfsplugin_provisioner_pods()
        elif interface == 'cephfs':
            provisioner_pods = pod.get_cephfsplugin_provisioner_pods()
        provisioner_pod = provisioner_pods[0]

        # Avoid stopping the node that also hosts the rook operator pod:
        # if the first provisioner pod shares the operator's node, use the
        # second provisioner pod instead
        provisioner_node = pod.get_pod_node(provisioner_pod)
        rook_operator_pod = pod.get_operator_pods()[0]
        operator_node = pod.get_pod_node(rook_operator_pod)
        operator_host = operator_node.get().get('metadata').get('name')
        provisioner_host = provisioner_node.get().get('metadata').get('name')
        if operator_host == provisioner_host:
            provisioner_pod = provisioner_pods[1]

        provisioner_pod_name = provisioner_pod.name
        logger.info(
            f"{interface} provisioner pod found: {provisioner_pod_name}"
        )

        # Resolve the node (and its name) hosting the chosen provisioner pod
        provisioner_node = pod.get_pod_node(provisioner_pod)
        provisioner_node_name = (
            provisioner_node.get().get('metadata').get('name')
        )
        logger.info(
            f"{interface} provisioner pod is running on node {provisioner_node_name}"
        )

        # Stop the node hosting the provisioner pod
        nodes.stop_nodes(nodes=[provisioner_node])

        # Label selector used below to wait for the provisioner pods
        if interface == 'rbd':
            selector = constants.CSI_RBDPLUGIN_PROVISIONER_LABEL
        else:
            selector = constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL

        # First the old provisioner pod should reach Terminating status
        logger.info(
            f"Waiting for pod {provisioner_pod_name} to reach status Terminating"
        )
        reached_terminating = provisioner_pod.ocp.wait_for_resource(
            timeout=600, resource_name=provisioner_pod.name,
            condition=constants.STATUS_TERMINATING
        )
        assert reached_terminating, (
            f"{interface} provisioner pod failed to reach status Terminating"
        )
        logger.info(
            f"Pod {provisioner_pod_name} has reached status Terminating"
        )

        # Then the provisioner pods should be Running again
        logger.info(
            f"Waiting for {interface} provisioner pod to reach status Running"
        )
        # After this change https://github.com/rook/rook/pull/3642/, there are
        # 2 provisioners for each interface
        reached_running = provisioner_pod.ocp.wait_for_resource(
            timeout=600, condition=constants.STATUS_RUNNING,
            selector=selector, resource_count=2
        )
        assert reached_running, (
            f"{interface} provisioner pod failed to reach status Running"
        )
        logger.info(f"{interface} provisioner pod has reached status Running")

        if operation == 'create_resources':
            # Cluster validation (resources creation and IO running)
            self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        elif operation == 'delete_resources':
            # Cluster validation (deletion of the resources created earlier)
            self.sanity_helpers.delete_resources()

        # Bring the stopped node back
        nodes.start_nodes(nodes=[provisioner_node])

        # Cluster and Ceph health
        self.sanity_helpers.health_check()

    @pytest.mark.parametrize(
        argnames=["operation"],
        argvalues=[
            pytest.param(
                *['create_resources'],
                marks=[pytest.mark.polarion_id("OCS-2016")]
            ),
            pytest.param(
                *['delete_resources'],
                marks=[pytest.mark.polarion_id("OCS-2017")]
            ),
        ]
    )
    def test_pv_provisioning_under_degraded_state_stop_rook_operator_pod_node(
        self, nodes, pvc_factory, pod_factory, operation
    ):
        """
        Test PV provisioning under degraded state -
        stop the node that has the rook operator pod running on

        OCS-2016:
        - Stop 1 worker node that has the rook ceph operator pod running on
        - Wait for the rook ceph operator pod to come up again to running
          status
        - Validate cluster functionality, without checking cluster and Ceph
          health by creating resources and running IO
        - Start the worker node
        - Check cluster and Ceph health

        OCS-2017:
        - Stop 1 worker node that has the rook ceph operator pod running on
        - Wait for the rook ceph operator pod to come up again to running
          status
        - Validate cluster functionality, without checking cluster and Ceph
          health by deleting resources
        - Start the worker node
        - Check cluster and Ceph health

        """
        if operation == 'delete_resources':
            # Resources must exist up front so their deletion can be
            # exercised while the node is down
            self.sanity_helpers.create_resources(pvc_factory, pod_factory)

        rook_operator_pod = pod.get_operator_pods()[0]
        rook_operator_pod_name = rook_operator_pod.name
        logger.info(f"rook operator pod found: {rook_operator_pod_name}")

        # Resolve the node (and its name) hosting the rook operator pod
        operator_node = pod.get_pod_node(rook_operator_pod)
        operator_node_name = operator_node.get().get('metadata').get('name')
        logger.info(
            f"{rook_operator_pod_name} pod is running on node {operator_node_name}"
        )

        # Stop the node hosting the operator pod
        nodes.stop_nodes(nodes=[operator_node])

        # Label selector used below to wait for the operator pod
        selector = constants.OPERATOR_LABEL

        # First the old operator pod should reach Terminating status
        logger.info(
            f"Waiting for pod {rook_operator_pod_name} to reach status Terminating"
        )
        reached_terminating = rook_operator_pod.ocp.wait_for_resource(
            timeout=600, resource_name=rook_operator_pod_name,
            condition=constants.STATUS_TERMINATING
        )
        assert reached_terminating, (
            "rook operator pod failed to reach status Terminating"
        )
        logger.info(
            f"Pod {rook_operator_pod_name} has reached status Terminating"
        )

        # Then an operator pod should be Running again
        logger.info(
            f"Waiting for pod {rook_operator_pod_name} to reach status Running"
        )
        reached_running = rook_operator_pod.ocp.wait_for_resource(
            timeout=600, condition=constants.STATUS_RUNNING,
            selector=selector, resource_count=1
        )
        assert reached_running, (
            "rook operator pod failed to reach status Running"
        )
        logger.info("rook operator pod has reached status Running")

        assert wait_for_ct_pod_recovery(), (
            "Ceph tools pod failed to come up on another node"
        )

        if operation == 'create_resources':
            # Cluster validation (resources creation and IO running)
            self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        elif operation == 'delete_resources':
            # Cluster validation (deletion of the resources created earlier)
            self.sanity_helpers.delete_resources()

        # Bring the stopped node back
        nodes.start_nodes(nodes=[operator_node])

        # Cluster and Ceph health
        self.sanity_helpers.health_check()
class TestAutomatedRecoveryFromFailedNodes(ManageTest):
    """
    Knip-678 Automated recovery from failed nodes - Reactive
    """
    # Background IO handles collected by the test and joined in teardown.
    # This is a class attribute, so the same list object is shared by every
    # parametrized run of the test.
    threads = []

    @pytest.fixture(autouse=True)
    def teardown(self, request):
        """
        Register a finalizer that removes the 'dc' label from all worker
        nodes, joins the background IO threads and runs a Ceph health check
        """

        def finalizer():
            worker_nodes = get_worker_nodes()
            # Removing created label on all worker nodes
            remove_label_from_worker_node(worker_nodes, label_key="dc")
            for thread in self.threads:
                thread.join()
            # The list is shared at class level; empty it so handles from
            # this run are not carried into (and re-joined by) later runs
            self.threads.clear()
            ceph_health_check()

        request.addfinalizer(finalizer)

    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Initialize Sanity instance

        """
        self.sanity_helpers = Sanity()

    @pytest.mark.parametrize(
        argnames=["interface", "failure"],
        argvalues=[
            pytest.param(
                *['rbd', 'shutdown'],
                marks=[
                    pytest.mark.polarion_id("OCS-2102"),
                    pytest.mark.bugzilla("1830015")
                ]
            ),
            pytest.param(
                *['rbd', 'terminate'],
                marks=pytest.mark.polarion_id("OCS-2103")
            ),
            pytest.param(
                *['cephfs', 'shutdown'],
                marks=[
                    pytest.mark.polarion_id("OCS-2104"),
                    pytest.mark.bugzilla("1830015")
                ]
            ),
            pytest.param(
                *['cephfs', 'terminate'],
                marks=pytest.mark.polarion_id("OCS-2105")
            ),
        ]
    )
    def test_automated_recovery_from_failed_nodes_IPI_reactive(
        self, nodes, pvc_factory, pod_factory, failure, dc_pod_factory,
        interface
    ):
        """
        Knip-678 Automated recovery from failed nodes
        Reactive case - IPI

        Args:
            nodes: Platform nodes fixture used to stop/terminate nodes
            pvc_factory: PVC factory fixture passed to the sanity helpers
            pod_factory: Pod factory fixture passed to the sanity helpers
            failure (str): 'shutdown' or 'terminate'; the failure to induce
            dc_pod_factory: Factory fixture used to create the DC app pods
            interface (str): 'rbd' or 'cephfs'; selects the interface used
                for the DC app pods

        """
        # Get OSD running nodes
        osd_running_nodes = get_osd_running_nodes()
        log.info(f"OSDs are running on nodes {osd_running_nodes}")
        # Label osd nodes with fedora app
        label_worker_node(osd_running_nodes, label_key='dc', label_value='fedora')

        # Create DC app pods
        log.info("Creating DC based app pods")
        if interface == 'rbd':
            interface = constants.CEPHBLOCKPOOL
        elif interface == 'cephfs':
            interface = constants.CEPHFILESYSTEM
        dc_pod_obj = []
        for _ in range(2):
            dc_pod = dc_pod_factory(
                interface=interface, node_selector={'dc': 'fedora'}
            )
            self.threads.append(pod.run_io_in_bg(dc_pod, fedora_dc=True))
            dc_pod_obj.append(dc_pod)

        # Get app pods running nodes
        dc_pod_node_name = get_app_pod_running_nodes(dc_pod_obj)
        log.info(f"DC app pod running nodes are {dc_pod_node_name}")

        # Get both osd and app pod running node
        common_nodes = get_both_osd_and_app_pod_running_node(
            osd_running_nodes, dc_pod_node_name
        )
        # Fail with a clear message rather than an IndexError on
        # common_nodes[0] below (same check as the proactive variant)
        assert len(common_nodes) > 0, (
            "Common OSD and app running node(s) NOT found"
        )
        log.info(f"Both OSD and app pod is running on nodes {common_nodes}")

        # Get the machine name using the node name
        machine_name = machine.get_machine_from_node_name(common_nodes[0])
        log.info(f"{common_nodes[0]} associated machine is {machine_name}")

        # Get the machineset name using machine name
        machineset_name = machine.get_machineset_from_machine_name(
            machine_name
        )
        log.info(
            f"{common_nodes[0]} associated machineset is {machineset_name}"
        )

        # Add a new node and label it
        add_new_node_and_label_it(machineset_name)

        # Get the failure node obj
        failure_node_obj = get_node_objs(node_names=[common_nodes[0]])

        # Induce failure on the selected failure node
        log.info(f"Inducing failure on node {failure_node_obj[0].name}")
        if failure == "shutdown":
            nodes.stop_nodes(failure_node_obj, wait=True)
            log.info(
                f"Successfully powered off node: "
                f"{failure_node_obj[0].name}"
            )
        elif failure == "terminate":
            nodes.terminate_nodes(failure_node_obj, wait=True)
            log.info(
                f"Successfully terminated node : "
                f"{failure_node_obj[0].name} instance"
            )

        # Pods whose name contains one of these markers are excluded from
        # the Running-state check. The previous condition,
        # `'-1-deploy' and 'ocs-deviceset' not in name`, reduced to testing
        # 'ocs-deviceset' only because a non-empty string is always truthy.
        skipped_name_markers = ('-1-deploy', 'ocs-deviceset')

        try:
            # DC app pods on the failed node will get automatically created
            # on other running node. Waiting for all dc app pod to reach
            # running state
            pod.wait_for_dc_app_pods_to_reach_running_state(
                dc_pod_obj, timeout=720
            )
            log.info("All the dc pods reached running state")

            # Check all OCS pods status, they should be in running state
            all_pod_obj = pod.get_all_pods(
                namespace=defaults.ROOK_CLUSTER_NAMESPACE
            )
            for pod_obj in all_pod_obj:
                if any(
                    marker in pod_obj.name for marker in skipped_name_markers
                ):
                    continue

                # 'rook-ceph-crashcollector' on the failed node stucks at
                # pending state. BZ 1810014 tracks it.
                # Ignoring 'rook-ceph-crashcollector' pod health check as
                # WA and deleting its deployment so that the pod
                # disappears. Will revert this WA once the BZ is fixed
                if 'rook-ceph-crashcollector' in pod_obj.name:
                    ocp_obj = ocp.OCP(
                        namespace=defaults.ROOK_CLUSTER_NAMESPACE
                    )
                    pod_name = pod_obj.name
                    # Deployment name is the pod name minus its last two
                    # dash-separated segments
                    deployment_name = '-'.join(pod_name.split("-")[:-2])
                    command = f"delete deployment {deployment_name}"
                    ocp_obj.exec_oc_cmd(command=command)
                    log.info(f"Deleted deployment for pod {pod_obj.name}")
                    continue

                helpers.wait_for_resource_state(
                    resource=pod_obj, state=constants.STATUS_RUNNING,
                    timeout=240
                )
        except ResourceWrongStatusException:
            # In the shutdown case the node is only powered off; terminate
            # it before propagating the failure
            if failure == "shutdown":
                nodes.terminate_nodes(failure_node_obj, wait=True)
                log.info(
                    f"Successfully terminated node : "
                    f"{failure_node_obj[0].name} instance"
                )
            raise

        # Check basic cluster functionality by creating resources
        # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
        # run IO and delete the resources
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()
class TestAutomatedRecoveryFromFailedNodes(ManageTest):
    """
    Knip-678 Automated recovery from failed nodes
    """

    @pytest.fixture(autouse=True)
    def teardown(self, request):
        """
        Register a finalizer that removes the 'dc' label from all worker nodes
        """
        def remove_dc_label():
            # The test labels the OSD nodes with 'dc'; the cleanup goes over
            # every worker node
            remove_label_from_worker_node(get_worker_nodes(), label_key="dc")

        request.addfinalizer(remove_dc_label)

    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Initialize Sanity instance
        """
        self.sanity_helpers = Sanity()

    @pytest.mark.parametrize(
        argnames=["interface"],
        argvalues=[
            pytest.param(
                *['rbd'], marks=pytest.mark.polarion_id("OCS-2100")
            ),
            pytest.param(
                *['cephfs'], marks=pytest.mark.polarion_id("OCS-2101")
            ),
        ]
    )
    def test_automated_recovery_from_failed_nodes_IPI_proactive(
        self, interface, pvc_factory, pod_factory, dc_pod_factory
    ):
        """
        Knip-678 Automated recovery from failed nodes
        Proactive case - IPI

        Flow: run DC app pods with IO on the OSD nodes, add a replacement
        node to the machineset, delete the machine of a node hosting both an
        OSD and an app pod, then verify that the pods recover and the
        cluster is functional and healthy.
        """
        # Nodes that currently host OSDs
        osd_running_nodes = get_osd_running_nodes()
        log.info(f"OSDs are running on nodes {osd_running_nodes}")

        # Label the OSD nodes; the DC pods below select nodes by this label
        label_worker_node(
            osd_running_nodes, label_key='dc', label_value='fedora'
        )

        # Create two DC app pods and start IO on each of them
        log.info("Creating DC based app pods")
        if interface == 'rbd':
            interface = constants.CEPHBLOCKPOOL
        else:
            interface = constants.CEPHFILESYSTEM
        dc_pod_obj = []
        for _ in range(2):
            dc_pod = dc_pod_factory(
                interface=interface, node_selector={'dc': 'fedora'}
            )
            pod.run_io_in_bg(dc_pod, fedora_dc=True)
            dc_pod_obj.append(dc_pod)

        # Nodes on which the app pods landed
        dc_pod_node_name = get_app_pod_running_nodes(dc_pod_obj)
        log.info(f"DC app pod running nodes are {dc_pod_node_name}")

        # Nodes hosting both an OSD and an app pod; at least one is required
        common_nodes = get_both_osd_and_app_pod_running_node(
            osd_running_nodes, dc_pod_node_name
        )
        assert len(common_nodes) > 0, (
            "Common OSD and app running node(s) NOT found"
        )
        log.info(f"Common OSD and app pod running nodes are {common_nodes}")

        # Resolve node -> machine -> machineset for the first common node
        target_node = common_nodes[0]
        machine_name = machine.get_machine_from_node_name(target_node)
        log.info(f"{target_node} associated machine is {machine_name}")

        machineset_name = machine.get_machineset_from_machine_name(
            machine_name
        )
        log.info(f"{target_node} associated machineset is {machineset_name}")

        # Proactive case: the new node is added before the machine is deleted
        add_new_node_and_label_it(machineset_name)

        machine.delete_machine(machine_name)
        log.info(f"Successfully deleted machine {machine_name}")

        # DC app pods on the failed node will get automatically created on
        # other running node. Waiting for all dc app pod to reach running
        # state
        pod.wait_for_dc_app_pods_to_reach_running_state(dc_pod_obj)
        log.info("All the dc pods reached running state")

        pod.wait_for_storage_pods()

        # Basic cluster functionality: create resources (pools,
        # storageclasses, PVCs, pods - both CephFS and RBD), run IO and
        # delete the resources
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()

        # Cluster and Ceph health checks
        self.sanity_helpers.health_check()
class TestDetachAttachWorkerVolumeAWS(ManageTest):
    """
    Test class for detach and attach worker volume
    """

    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Initialize Sanity instance
        """
        self.sanity_helpers = Sanity()

    @pytest.mark.polarion_id("OCS-1085")
    def test_detach_attach_worker_volume(
        self, aws_obj, pvc_factory, pod_factory
    ):
        """
        Detach and attach worker volume

        - Detach the data volume from one of the worker nodes
        - Validate cluster functionality, without checking cluster and Ceph
          health (as one node volume is detached, the cluster will be
          unhealthy) by creating resources and running IO
        - Attach back the volume to the node
        - Restart the node so the volume will get re-mounted

        """
        # A single worker node is enough: only one data volume is detached
        workers = node.get_typed_nodes(num_of_nodes=1)
        assert workers, "Failed to find a worker node for the test"
        worker = workers[0]

        # ec2 instances (IDs and names) of the worker; the first key of the
        # returned mapping is used as the instance ID
        instance = aws.get_instances_ids_and_names([worker])
        assert instance, f"Failed to get ec2 instances for node {worker.name}"
        instance_id = list(instance)[0]

        # First data volume of that ec2 instance
        ec2_volume = aws.get_data_volumes(instance_id)[0]

        # Detach volume (logging is done inside the function)
        aws_obj.detach_volume(ec2_volume)

        # The cluster must stay functional while the volume is detached
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)

        # Attach volume (logging is done inside the function)
        aws_obj.attach_volume(ec2_volume, instance_id)

        # Restart the instance so the volume will get re-mounted
        aws_obj.restart_ec2_instances(instances=instance, wait=True)

        # Cluster health check
        self.sanity_helpers.health_check()

    @pytest.mark.polarion_id("OCS-1086")
    def test_detach_attach_2_workers_volumes(
        self, aws_obj, pvc_factory, pod_factory
    ):
        """
        Detach and attach disk from 2 worker nodes

        - Detach the data volume from 2 of the worker nodes
        - Attach back the volume to the worker nodes
        - Restart the nodes so the volume will get re-mounted in each node
        - Check cluster health and functionality to make sure detach,
          attach and restart did not affect the cluster

        """
        # Two worker nodes are requested: the data volume of each of them is
        # detached and attached back
        workers = node.get_typed_nodes(num_of_nodes=2)
        assert workers, "Failed to find worker nodes for the test"

        # ec2 instances (IDs and names) of the selected workers
        instances = aws.get_instances_ids_and_names(workers)
        assert instances, (
            f"Failed to get ec2 instances for node {[w.name for w in workers]}"
        )

        # The mapping is keyed by instance ID
        for instance_id in instances:
            # First data volume of the ec2 instance
            ec2_volume = aws.get_data_volumes(instance_id)[0]

            # Detach volume (logging is done inside the function)
            aws_obj.detach_volume(ec2_volume)

            # Attach volume (logging is done inside the function)
            aws_obj.attach_volume(ec2_volume, instance_id)

        # Restart the instances so the volume will get re-mounted
        aws_obj.restart_ec2_instances(instances=instances, wait=True)

        # Validate cluster is still functional
        self.sanity_helpers.health_check()
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
class TestNodeReplacement(ManageTest):
    """
    Knip-894 Node replacement - AWS-IPI-Reactive
    """

    @pytest.fixture(autouse=True)
    def teardown(self, request):
        """
        Register a finalizer that removes the 'dc' label from all worker nodes
        """
        def finalizer():
            worker_nodes = get_worker_nodes()
            # Removing created label on all worker nodes
            remove_label_from_worker_node(worker_nodes, label_key="dc")

        request.addfinalizer(finalizer)

    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Initialize Sanity instance
        """
        self.sanity_helpers = Sanity()

    @pytest.mark.parametrize(
        argnames=["interface", "failure"],
        argvalues=[
            pytest.param(
                *['rbd', 'power off'],
                marks=pytest.mark.polarion_id("OCS-2118")
            ),
            pytest.param(
                *['rbd', 'network failure'],
                marks=pytest.mark.polarion_id("OCS-2120")
            ),
            pytest.param(
                *['cephfs', 'power off'],
                marks=pytest.mark.polarion_id("OCS-2119")
            ),
            pytest.param(
                *['cephfs', 'network failure'],
                marks=pytest.mark.polarion_id("OCS-2121")
            ),
        ]
    )
    def test_node_replacement_reactive_aws_ipi(
        self, nodes, pvc_factory, pod_factory, dc_pod_factory,
        failure, interface
    ):
        """
        Knip-894 Node replacement - AWS-IPI-Reactive

        Flow: run DC app pods with IO on the OSD nodes, induce the
        parametrized failure ('power off' or 'network failure') on a node
        hosting both an OSD and an app pod, delete its machine, label the
        replacement node and verify that the pods recover and the cluster
        is functional and healthy.
        """
        # Get worker nodes
        initial_nodes = get_worker_nodes()

        # Get OSD running nodes
        osd_running_nodes = get_osd_running_nodes()
        log.info(f"OSDs are running on nodes {osd_running_nodes}")

        # Label osd nodes with fedora app
        label_worker_node(
            osd_running_nodes, label_key='dc', label_value='fedora'
        )

        # Create DC app pods
        log.info("Creating DC based app pods")
        if interface == 'rbd':
            interface = constants.CEPHBLOCKPOOL
        elif interface == 'cephfs':
            interface = constants.CEPHFILESYSTEM
        dc_pod_obj = []
        for _ in range(2):
            dc_pod = dc_pod_factory(
                interface=interface, node_selector={'dc': 'fedora'}
            )
            pod.run_io_in_bg(dc_pod, fedora_dc=True)
            dc_pod_obj.append(dc_pod)

        # Get app pods running nodes
        dc_pod_node_name = get_app_pod_running_nodes(dc_pod_obj)
        log.info(f"DC app pod running nodes are {dc_pod_node_name}")

        # Get both osd and app pod running node
        common_nodes = get_both_osd_and_app_pod_running_node(
            osd_running_nodes, dc_pod_node_name
        )
        # Fail with a clear message instead of an IndexError on
        # common_nodes[0] below
        assert common_nodes, "Common OSD and app running node(s) NOT found"
        log.info(f"Both OSD and app pod is running on nodes {common_nodes}")

        # Get the machine name using the node name
        machine_name = machine.get_machine_from_node_name(common_nodes[0])
        log.info(f"{common_nodes[0]} associated machine is {machine_name}")

        # Get the machineset name using machine name
        machineset_name = machine.get_machineset_from_machine_name(
            machine_name
        )
        log.info(
            f"{common_nodes[0]} associated machineset is {machineset_name}"
        )

        # Get the failure node obj
        failure_node_obj = get_node_objs(node_names=[common_nodes[0]])

        # Induce failure on the selected failure node
        log.info(f"Inducing failure on node {failure_node_obj[0].name}")
        if failure == "power off":
            # Power off AWS worker node instance
            nodes.stop_nodes(failure_node_obj, wait=True)
            log.info(
                f"Successfully powered off node: {failure_node_obj[0].name}"
            )
        elif failure == "network failure":
            # Induce Network failure
            node_network_failure([failure_node_obj[0].name])

        # Add annotation to the failed node
        annotation = "machine.openshift.io/exclude-node-draining=''"
        machine.add_annotation_to_machine(
            annotation=annotation, machine_name=machine_name
        )

        # Delete the machine
        machine.delete_machine(machine_name)
        log.info(f"Successfully deleted machine {machine_name}")

        # Wait for the new machine to spin
        log.info("Waiting for the new node to be in ready state")
        machine.wait_for_new_node_to_be_ready(machineset_name)

        # Get the node name of new spun node
        nodes_after_new_spun_node = get_worker_nodes()
        new_spun_node = list(
            set(nodes_after_new_spun_node) - set(initial_nodes)
        )
        # Fail with a clear message instead of an IndexError on
        # new_spun_node[0] below
        assert new_spun_node, (
            "No new worker node was found after the machine deletion"
        )
        log.info(f"New spun node is {new_spun_node}")

        # Label it
        node_obj = ocp.OCP(kind='node')
        node_obj.add_label(
            resource_name=new_spun_node[0],
            label=constants.OPERATOR_NODE_LABEL
        )
        log.info(
            f"Successfully labeled {new_spun_node} with OCS storage label"
        )

        # DC app pods on the failed node will get automatically created on
        # other running node. Waiting for all dc app pod to reach running
        # state
        pod.wait_for_dc_app_pods_to_reach_running_state(
            dc_pod_obj, timeout=1200
        )
        log.info("All the dc pods reached running state")

        # Check all OCS pods status, they should be in running state
        all_pod_obj = pod.get_all_pods(
            namespace=defaults.ROOK_CLUSTER_NAMESPACE
        )
        for pod_obj in all_pod_obj:
            # Pods whose name contains '-1-deploy' or 'ocs-deviceset' are
            # excluded from the Running check. Both substrings must be
            # tested explicitly: "'-1-deploy' and 'x' not in name" only
            # evaluates the second operand, since a non-empty literal is
            # always truthy.
            if (
                '-1-deploy' in pod_obj.name
                or 'ocs-deviceset' in pod_obj.name
            ):
                continue
            try:
                helpers.wait_for_resource_state(
                    resource=pod_obj, state=constants.STATUS_RUNNING,
                    timeout=1800
                )
            except ResourceWrongStatusException:
                # 'rook-ceph-crashcollector' on the failed node stucks at
                # pending state. BZ 1810014 tracks it.
                # Ignoring 'rook-ceph-crashcollector' pod health check as
                # WA and deleting its deployment so that the pod
                # disappears. Will revert this WA once the BZ is fixed
                if 'rook-ceph-crashcollector' in pod_obj.name:
                    ocp_obj = ocp.OCP(
                        namespace=defaults.ROOK_CLUSTER_NAMESPACE
                    )
                    pod_name = pod_obj.name
                    # Drop the two trailing name components to get the
                    # deployment name
                    deployment_name = '-'.join(pod_name.split("-")[:-2])
                    command = f"delete deployment {deployment_name}"
                    ocp_obj.exec_oc_cmd(command=command)
                    log.info(f"Deleted deployment for pod {pod_obj.name}")
                else:
                    # The test flow is kept as is (the health check at the
                    # end is the final verdict), but the failure is no
                    # longer hidden
                    log.warning(
                        f"Pod {pod_obj.name} did not reach status "
                        f"{constants.STATUS_RUNNING}"
                    )

        # Check basic cluster functionality by creating resources
        # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
        # run IO and delete the resources
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()
class TestNodesRestart(ManageTest):
    """
    Test ungraceful cluster shutdown
    """

    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Initialize Sanity instance
        """
        self.sanity_helpers = Sanity()

    @pytest.mark.parametrize(
        argnames=["force"],
        argvalues=[
            pytest.param(*[True], marks=pytest.mark.polarion_id("OCS-894")),
            pytest.param(*[False], marks=pytest.mark.polarion_id("OCS-895"))
        ]
    )
    def test_nodes_restart_aws(
        self, ec2_instances, aws_obj, pvc_factory, pod_factory, force
    ):
        """
        Test ungraceful cluster shutdown - AWS

        Restart all the given ec2 instances (forcefully or not, according to
        ``force``), then check cluster health and functionality.
        """
        aws_obj.restart_ec2_instances(
            instances=ec2_instances, wait=True, force=force
        )
        self.sanity_helpers.health_check()
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)

    @pytest.mark.parametrize(
        argnames=["interface", "operation"],
        argvalues=[
            pytest.param(
                *['rbd', 'create_resources'],
                marks=pytest.mark.polarion_id("OCS-1138")
            ),
            pytest.param(
                *['rbd', 'delete_resources'],
                marks=pytest.mark.polarion_id("OCS-1241")
            ),
            pytest.param(
                *['cephfs', 'create_resources'],
                marks=pytest.mark.polarion_id("OCS-1139")
            ),
            pytest.param(
                *['cephfs', 'delete_resources'],
                marks=pytest.mark.polarion_id("OCS-1242")
            )
        ]
    )
    def test_pv_provisioning_under_degraded_state(
        self, ec2_instances, aws_obj, pvc_factory, pod_factory,
        interface, operation
    ):
        """
        Test PV provisioning under degraded state

        OCS-1138:
        - Stop 1 ec2 instance worker node that has the RBD provisioner
          pod running on
        - Wait for the RBD pod provisioner to come up again to running status
        - Validate cluster functionality, without checking cluster and Ceph
          health by creating resources and running IO
        - Start the worker node ec2 instance
        - Check cluster and Ceph health

        OCS-1241:
        - Stop 1 ec2 instance worker node that has the RBD provisioner
          pod running on
        - Wait for the RBD pod provisioner to come up again to running status
        - Validate cluster functionality, without checking cluster and Ceph
          health by deleting resources and running IO
        - Start the worker node ec2 instance
        - Check cluster and Ceph health

        OCS-1139:
        - Stop 1 ec2 instance worker node that has the CephFS provisioner
          pod running on
        - Wait for the CephFS pod provisioner to come up again to running
          status
        - Validate cluster functionality, without checking cluster and Ceph
          health by creating resources and running IO
        - Start the worker node ec2 instance
        - Check cluster and Ceph health

        OCS-1242:
        - Stop 1 ec2 instance worker node that has the CephFS provisioner
          pod running on
        - Wait for the CephFS pod provisioner to come up again to running
          status
        - Validate cluster functionality, without checking cluster and Ceph
          health by deleting resources and running IO
        - Start the worker node ec2 instance
        - Check cluster and Ceph health

        """
        if operation == 'delete_resources':
            # Create resources that their deletion will be tested later
            self.sanity_helpers.create_resources(pvc_factory, pod_factory)

        provisioner_pod = None

        # Get the provisioner pod according to the interface
        if interface == 'rbd':
            provisioner_pod = pod.get_rbdfsplugin_provisioner_pods()[0]
        elif interface == 'cephfs':
            provisioner_pod = pod.get_cephfsplugin_provisioner_pods()[0]
        provisioner_pod_name = provisioner_pod.name
        logger.info(
            f"{interface} provisioner pod found: {provisioner_pod_name}"
        )

        # Get the node name that has the provisioner pod running on
        provisioner_node = pod.get_pod_node(provisioner_pod)
        provisioner_node_name = provisioner_node.get('metadata').get('name')
        logger.info(
            f"{interface} provisioner pod is running on node "
            f"{provisioner_node_name}"
        )

        # Get the ec2 instance of the node
        instances = aws.get_instances_ids_and_names([provisioner_node])
        assert instances, (
            f"Failed to get ec2 instances for node {provisioner_node_name}"
        )

        # Stopping the nodes
        aws_obj.stop_ec2_instances(instances=instances, wait=True)

        # From here on the instance is stopped. The 'finally' clause makes
        # sure it is started again even if one of the checks below fails.
        try:
            # Wait for the provisioner pod to reach Terminating status
            logger.info(
                f"Waiting for pod {provisioner_pod_name} to reach status "
                f"Terminating"
            )
            assert provisioner_pod.ocp.wait_for_resource(
                timeout=600, resource_name=provisioner_pod.name,
                condition=constants.STATUS_TERMINATING
            ), f"{interface} provisioner pod failed to reach status Terminating"
            logger.info(
                f"Pod {provisioner_pod_name} has reached status Terminating"
            )

            # Wait for the provisioner pods to be started and reach running
            # status. The pods are matched by label selector.
            selector = (
                constants.CSI_RBDPLUGIN_PROVISIONER_LABEL
                if interface == 'rbd'
                else constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL
            )
            logger.info(
                f"Waiting for pod {provisioner_pod_name} to reach status "
                f"Running"
            )
            # After this change https://github.com/rook/rook/pull/3642/,
            # there are 2 provisioners for each interface
            assert provisioner_pod.ocp.wait_for_resource(
                timeout=600, condition=constants.STATUS_RUNNING,
                selector=selector, resource_count=2
            ), f"{interface} provisioner pod failed to reach status Running"
            # Logged only after the wait has actually succeeded
            logger.info(
                f"Pod {provisioner_pod_name} has reached status Running"
            )

            if operation == 'create_resources':
                # Cluster validation (resources creation and IO running)
                self.sanity_helpers.create_resources(pvc_factory, pod_factory)
            elif operation == 'delete_resources':
                # Cluster validation (deletion of the resources created at
                # the beginning of the test)
                self.sanity_helpers.delete_resources()
        finally:
            # Starting the nodes
            aws_obj.start_ec2_instances(instances=instances, wait=True)

        # Checking cluster and Ceph health
        self.sanity_helpers.health_check()
class TestNodesMaintenance(ManageTest):
    """
    Test basic flows of maintenance (unschedule and drain) and
    activate operations, followed by cluster functionality and health checks

    """

    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Initialize Sanity instance
        """
        self.sanity_helpers = Sanity()

    @tier1
    @pytest.mark.parametrize(
        argnames=["node_type"],
        argvalues=[
            pytest.param(
                *['worker'], marks=pytest.mark.polarion_id("OCS-1269")
            ),
            pytest.param(
                *['master'], marks=pytest.mark.polarion_id("OCS-1272")
            )
        ]
    )
    def test_node_maintenance(self, node_type, pvc_factory, pod_factory):
        """
        OCS-1269/OCS-1272:
        - Maintenance (mark as unscheduable and drain) 1 worker/master node
        - Check cluster functionality by creating resources
          (pools, storageclasses, PVCs, pods - both CephFS and RBD)
        - Mark the node as scheduable
        - Check cluster and Ceph health

        """
        # Get 1 node of the type needed for the test iteration
        typed_nodes = get_typed_nodes(node_type=node_type, num_of_nodes=1)
        assert typed_nodes, f"Failed to find a {node_type} node for the test"
        typed_node_name = typed_nodes[0].name

        # The node is marked back as schedulable in 'finally', so a failure
        # during drain or validation does not leave it unschedulable
        try:
            # Maintenance the node (unschedule and drain)
            drain_nodes([typed_node_name])

            # Check basic cluster functionality by creating resources
            # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
            # run IO and delete the resources
            self.sanity_helpers.create_resources(pvc_factory, pod_factory)
            self.sanity_helpers.delete_resources()
        finally:
            # Mark the node back to schedulable
            schedule_nodes([typed_node_name])

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()

    @tier4
    @tier4b
    @aws_platform_required
    @pytest.mark.parametrize(
        argnames=["node_type"],
        argvalues=[
            pytest.param(
                *['worker'], marks=pytest.mark.polarion_id("OCS-1292")
            ),
            pytest.param(
                *['master'], marks=[
                    pytest.mark.polarion_id("OCS-1293"),
                    bugzilla('1754287')
                ]
            )
        ]
    )
    def test_node_maintenance_restart_activate(
        self, nodes, pvc_factory, pod_factory, node_type
    ):
        """
        OCS-1292/OCS-1293:
        - Maintenance (mark as unscheduable and drain) 1 worker/master node
        - Restart the node
        - Mark the node as scheduable
        - Check cluster and Ceph health
        - Check cluster functionality by creating and deleting resources
          (pools, storageclasses, PVCs, pods - both CephFS and RBD)

        """
        # Get 1 node of the type needed for the test iteration
        typed_nodes = get_typed_nodes(node_type=node_type, num_of_nodes=1)
        assert typed_nodes, f"Failed to find a {node_type} node for the test"
        typed_node_name = typed_nodes[0].name

        # Maintenance the node (unschedule and drain). The function contains
        # logging
        drain_nodes([typed_node_name])

        # Restarting the node
        nodes.restart_nodes(nodes=typed_nodes, wait=True)

        # After the restart the node is expected to be Ready but still
        # unschedulable
        wait_for_nodes_status(
            node_names=[typed_node_name],
            status=constants.NODE_READY_SCHEDULING_DISABLED
        )

        # Mark the node back to schedulable
        schedule_nodes([typed_node_name])

        # Check cluster and Ceph health and checking basic cluster
        # functionality by creating resources (pools, storageclasses,
        # PVCs, pods - both CephFS and RBD), run IO and delete the resources
        self.sanity_helpers.health_check()
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()

    @tier3
    @pytest.mark.parametrize(
        argnames=["nodes_type"],
        argvalues=[
            pytest.param(
                *['worker'], marks=pytest.mark.polarion_id("OCS-1273")
            ),
            pytest.param(
                *['master'], marks=pytest.mark.polarion_id("OCS-1271")
            )
        ]
    )
    def test_2_nodes_maintenance_same_type(self, nodes_type):
        """
        OCS-1273/OCs-1271:
        - Try draining 2 nodes from the same type - should fail
        - Check cluster and Ceph health

        """
        # Get 2 nodes
        typed_nodes = get_typed_nodes(node_type=nodes_type, num_of_nodes=2)
        assert typed_nodes, f"Failed to find a {nodes_type} node for the test"

        typed_node_names = [typed_node.name for typed_node in typed_nodes]

        # Try draining 2 nodes - should fail
        try:
            drain_nodes(typed_node_names)
        except TimeoutExpired:
            logger.info(
                f"Draining of nodes {typed_node_names} failed as expected"
            )
        else:
            # The test verdict is unchanged, but the deviation from the
            # expected flow is made visible in the log
            logger.warning(
                f"Draining of nodes {typed_node_names} succeeded although "
                f"it was expected to fail"
            )
        finally:
            # Mark the nodes back to schedulable in any case
            schedule_nodes(typed_node_names)

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()

    @tier2
    @pytest.mark.polarion_id("OCS-1274")
    def test_2_nodes_different_types(self, pvc_factory, pod_factory):
        """
        OCS-1274:
        - Maintenance (mark as unscheduable and drain) 1 worker node and 1
          master node
        - Check cluster functionality by creating resources
          (pools, storageclasses, PVCs, pods - both CephFS and RBD)
        - Mark the nodes as scheduable
        - Check cluster and Ceph health

        """
        # Get 1 node from each type. Each lookup is validated before it is
        # indexed, so a missing node type produces a clear assertion message
        # rather than an IndexError
        nodes = []
        for node_type in ['worker', 'master']:
            typed_nodes = get_typed_nodes(node_type=node_type, num_of_nodes=1)
            assert typed_nodes, (
                f"Failed to find a {node_type} node for the test"
            )
            nodes.append(typed_nodes[0])

        node_names = [typed_node.name for typed_node in nodes]

        # The nodes are marked back as schedulable in 'finally', so a failure
        # during drain or validation does not leave them unschedulable
        try:
            # Maintenance the nodes (unschedule and drain)
            drain_nodes(node_names)

            # Check basic cluster functionality by creating resources
            # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
            # run IO and delete the resources
            self.sanity_helpers.create_resources(pvc_factory, pod_factory)
            self.sanity_helpers.delete_resources()
        finally:
            # Mark the nodes back to schedulable
            schedule_nodes(node_names)

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()
class TestAutomatedRecoveryFromFailedNodes(ManageTest):
    """
    Knip-678 Automated recovery from failed nodes
    """

    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Initialize Sanity instance
        """
        self.sanity_helpers = Sanity()

    def test_automated_recovery_from_failed_nodes_IPI_proactive(
        self, pvc_factory, pod_factory
    ):
        """
        Knip-678 Automated recovery from failed nodes
        Proactive case - IPI

        Flow: pick the node of a random OSD pod, add a replacement node to
        its machineset, delete the machine of the selected node, then verify
        that the pods are running and the cluster is functional and healthy.
        """
        # Get the osd associated node name
        osd_pods_obj = pod.get_osd_pods()
        osd_node_name = pod.get_pod_node(random.choice(osd_pods_obj)).name
        log.info(f"Selected OSD is {osd_node_name}")

        # Get the machine name using the node name
        machine_name = machine.get_machine_from_node_name(osd_node_name)
        log.info(f"{osd_node_name} associated machine is {machine_name}")

        # Get the machineset name using machine name
        machineset_name = machine.get_machineset_from_machine_name(
            machine_name
        )
        log.info(f"{osd_node_name} associated machineset is {machineset_name}")

        # Add a new node and label it
        add_new_node_and_label_it(machineset_name)

        # Delete the machine
        machine.delete_machine(machine_name)
        log.info(f"Successfully deleted machine {machine_name}")

        # Check the pods should be in running state
        all_pod_obj = pod.get_all_pods(wait=True)
        for pod_obj in all_pod_obj:
            # Pods whose name contains '-1-deploy' or 'ocs-deviceset' are
            # excluded from the Running check. Both substrings must be
            # tested explicitly: "'-1-deploy' and 'x' not in name" only
            # evaluates the second operand, since a non-empty literal is
            # always truthy.
            if (
                '-1-deploy' in pod_obj.name
                or 'ocs-deviceset' in pod_obj.name
            ):
                continue
            try:
                wait_for_resource_state(
                    resource=pod_obj, state=constants.STATUS_RUNNING,
                    timeout=200
                )
            except ResourceWrongStatusException:
                # 'rook-ceph-crashcollector' on the failed node stucks at
                # pending state. BZ 1810014 tracks it.
                # Ignoring 'rook-ceph-crashcollector' pod health check as WA
                # and deleting its deployment so that the pod disappears
                # Will revert this WA once the BZ is fixed
                if 'rook-ceph-crashcollector' in pod_obj.name:
                    ocp_obj = ocp.OCP()
                    # Drop the two trailing name components to get the
                    # deployment name; this does not rely on a fixed suffix
                    # length the way a hard-coded slice does
                    deployment_name = '-'.join(pod_obj.name.split("-")[:-2])
                    command = f"delete deployment {deployment_name}"
                    ocp_obj.exec_oc_cmd(command=command)
                    log.info(f"Deleted deployment for pod {pod_obj.name}")
                else:
                    # The test flow is kept as is (the health check at the
                    # end is the final verdict), but the failure is no
                    # longer hidden
                    log.warning(
                        f"Pod {pod_obj.name} did not reach status "
                        f"{constants.STATUS_RUNNING}"
                    )

        # Check basic cluster functionality by creating resources
        # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
        # run IO and delete the resources
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()
class TestDiskFailures(ManageTest):
    """
    Test class for disk failure scenarios: detach and attach of worker data
    volumes and recovery from a volume deletion
    """

    @pytest.fixture(autouse=True)
    def teardown(self, request, nodes):
        """
        Restart nodes that are in status NotReady, for situations in
        which the test failed before restarting the node after detach volume,
        which leaves nodes in NotReady

        """
        def finalizer():
            not_ready_nodes = [
                n for n in node.get_node_objs()
                if n.ocp.get_resource_status(n.name) == constants.NODE_NOT_READY
            ]
            # Warn and restart only when there is actually something to fix
            if not_ready_nodes:
                logger.warning(
                    f"Nodes in NotReady status found: "
                    f"{[n.name for n in not_ready_nodes]}"
                )
                nodes.restart_nodes(not_ready_nodes)
                node.wait_for_nodes_status()

        request.addfinalizer(finalizer)

    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Initialize Sanity instance
        """
        self.sanity_helpers = Sanity()

    @aws_platform_required
    @pytest.mark.polarion_id("OCS-1085")
    def test_detach_attach_worker_volume(
        self, nodes, pvc_factory, pod_factory
    ):
        """
        Detach and attach worker volume

        - Detach the data volume from one of the worker nodes
        - Validate cluster functionality, without checking cluster and Ceph
          health (as one node volume is detached, the cluster will be
          unhealthy) by creating resources and running IO
        - Wait for the volumes to be re-attached back to the worker node
        - Restart the node so the volume will get re-mounted

        """
        # Get a data volume
        data_volume = nodes.get_data_volumes()[0]

        # Get the worker node according to the volume attachment
        worker = nodes.get_node_by_attached_volume(data_volume)

        # Detach volume (logging is done inside the function)
        nodes.detach_volume(data_volume, worker)

        # Validate cluster is still functional
        # In case the selected node that its volume disk was detached was the
        # one running the ceph tools pod, we'll need to wait for a new ct pod
        # to start. For that, a function that connects to the ct pod is being
        # used to check if it's alive
        assert wait_for_ct_pod_recovery(), (
            "Ceph tools pod failed to come up on another node"
        )

        self.sanity_helpers.create_resources(pvc_factory, pod_factory)

        # Wait for worker volume to be re-attached automatically to the node
        assert nodes.wait_for_volume_attach(data_volume), (
            "Volume failed to be re-attached to a worker node"
        )

        # Restart the instance so the volume will get re-mounted
        nodes.restart_nodes([worker])

        # Cluster health check
        # W/A: For the investigation of BZ 1825675, timeout is increased to
        # see if cluster becomes healthy eventually
        # TODO: Remove 'tries=100'
        self.sanity_helpers.health_check(tries=100)

    @aws_platform_required
    @pytest.mark.polarion_id("OCS-1086")
    def test_detach_attach_2_data_volumes(
        self, nodes, pvc_factory, pod_factory
    ):
        """
        Detach and attach disk from 2 worker nodes

        - Detach 2 of the data volumes from their worker nodes
        - Wait for the volumes to be re-attached back to the worker nodes
        - Restart the nodes so the volume will get re-mounted in each node
        - Check cluster health and functionality to make sure detach,
          attach and restart did not affect the cluster

        """
        # Get 2 data volumes, each paired with the worker it is attached to
        data_volumes = nodes.get_data_volumes()[:2]
        workers_and_volumes = [
            {'worker': nodes.get_node_by_attached_volume(vol), 'volume': vol}
            for vol in data_volumes
        ]

        for worker_and_volume in workers_and_volumes:
            # Detach the volume (logging is done inside the function).
            # The call takes the volume and the worker it is attached to.
            nodes.detach_volume(
                worker_and_volume['volume'], worker_and_volume['worker']
            )

        for worker_and_volume in workers_and_volumes:
            # Wait for worker volume to be re-attached automatically
            # to the node
            assert nodes.wait_for_volume_attach(
                worker_and_volume['volume']
            ), (
                f"Volume {worker_and_volume['volume']} "
                f"failed to be re-attached to a worker node"
            )

        # Restart the instances so the volume will get re-mounted
        nodes.restart_nodes(
            [
                worker_and_volume['worker']
                for worker_and_volume in workers_and_volumes
            ]
        )

        # Validate cluster is still functional
        self.sanity_helpers.health_check()
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)

    @bugzilla('1830702')
    @vsphere_platform_required
    @pytest.mark.polarion_id("OCS-2172")
    def test_recovery_from_volume_deletion(
        self, nodes, pvc_factory, pod_factory
    ):
        """
        Test cluster recovery from disk deletion from the platform side.
        Based on documented procedure detailed in
        https://bugzilla.redhat.com/show_bug.cgi?id=1787236#c16

        """
        logger.info("Picking a PV which will be deleted from the platform side")
        osd_pvs = get_deviceset_pvs()
        osd_pv = random.choice(osd_pvs)
        osd_pv_name = osd_pv.name

        # get the claim name
        logger.info(f"Getting the claim name for OSD PV {osd_pv_name}")
        claim_name = osd_pv.get().get('spec').get('claimRef').get('name')

        # Get the backing volume name
        logger.info(f"Getting the backing volume name for PV {osd_pv_name}")
        backing_volume = nodes.get_data_volumes(pvs=[osd_pv])[0]

        # Get the corresponding PVC
        logger.info(f"Getting the corresponding PVC of PV {osd_pv_name}")
        osd_pvcs = get_deviceset_pvcs()
        # The count is kept to verify later that all OSD PVCs are back
        osd_pvcs_count = len(osd_pvcs)
        osd_pvc = [
            ds for ds in osd_pvcs
            if ds.get().get('metadata').get('name') == claim_name
        ][0]

        # Get the corresponding OSD pod
        logger.info(f"Getting the corresponding OSD pod of PVC {osd_pvc.name}")
        osd_pods = get_osd_pods()
        # The count is kept to verify later that all OSD pods are back
        osd_pods_count = len(osd_pods)
        osd_pod = [
            candidate for candidate in osd_pods
            if candidate.get().get('metadata').get('labels').get(
                constants.CEPH_ROOK_IO_PVC_LABEL
            ) == claim_name
        ][0]

        # Get the node that has the OSD pod running on
        logger.info(
            f"Getting the node that has the OSD pod {osd_pod.name} running on"
        )
        osd_node = get_pod_node(osd_pod)
        volume_size = osd_pvc.size

        # Get the OSD prepare pod of the same PVC and the job that owns it
        osd_prepare_pods = get_osd_prepare_pods()
        osd_prepare_pod = [
            prepare_pod for prepare_pod in osd_prepare_pods
            if prepare_pod.get().get('metadata').get('labels').get(
                constants.CEPH_ROOK_IO_PVC_LABEL
            ) == claim_name
        ][0]
        osd_prepare_job_name = (
            osd_prepare_pod.get().get('metadata').get('labels').get('job-name')
        )
        osd_prepare_job = get_job_obj(osd_prepare_job_name)

        # Get the corresponding OSD deployment
        logger.info(
            f"Getting the corresponding OSD deployment for OSD PVC {claim_name}"
        )
        osd_deployment = [
            deployment for deployment in get_osd_deployments()
            if deployment.get().get('metadata').get('labels').get(
                constants.CEPH_ROOK_IO_PVC_LABEL
            ) == claim_name
        ][0]

        # Delete the volume from the platform side
        logger.info(f"Deleting volume {backing_volume} from the platform side")
        nodes.detach_volume(backing_volume, osd_node)

        # Delete the OSD deployment
        osd_deployment_name = osd_deployment.name
        logger.info(f"Deleting OSD deployment {osd_deployment_name}")
        osd_deployment.delete()
        osd_deployment.ocp.wait_for_delete(
            resource_name=osd_deployment_name, timeout=120
        )

        # Delete the OSD prepare job
        logger.info(f"Deleting OSD prepare job {osd_prepare_job_name}")
        osd_prepare_job.delete()
        osd_prepare_job.ocp.wait_for_delete(
            resource_name=osd_prepare_job_name, timeout=120
        )

        # Delete the OSD PVC
        osd_pvc_name = osd_pvc.name
        logger.info(f"Deleting OSD PVC {osd_pvc_name}")
        osd_pvc.delete()
        osd_pvc.ocp.wait_for_delete(resource_name=osd_pvc_name)

        # Recreate a volume from the platform side
        logger.info("Creating a replacing volume from the platform side")
        nodes.create_and_attach_volume(osd_node, volume_size)

        # Delete the rook ceph operator pod to trigger reconciliation
        rook_operator_pod = get_operator_pods()[0]
        logger.info(f"deleting Rook Ceph operator pod {rook_operator_pod.name}")
        rook_operator_pod.delete()

        timeout = 600
        # Wait for OSD PVC to get created and reach Bound state.
        # The status listing in the message is only evaluated on failure.
        logger.info(
            "Waiting for a new OSD PVC to get created and reach Bound state"
        )
        assert osd_pvc.ocp.wait_for_resource(
            timeout=timeout, condition=constants.STATUS_BOUND,
            selector=constants.OSD_PVC_GENERIC_LABEL,
            resource_count=osd_pvcs_count
        ), (
            f"Cluster recovery failed after {timeout} seconds. "
            f"Expected to have {osd_pvcs_count} OSD PVCs in status Bound. "
            f"Current OSD PVCs status: "
            f"{[pvc.ocp.get_resource(pvc.get().get('metadata').get('name'), 'STATUS') for pvc in get_deviceset_pvcs()]}"
        )

        # Wait for OSD pod to get created and reach Running state.
        # The status listing in the message is only evaluated on failure.
        logger.info(
            "Waiting for a new OSD pod to get created and reach Running state"
        )
        assert osd_pod.ocp.wait_for_resource(
            timeout=timeout, condition=constants.STATUS_RUNNING,
            selector=constants.OSD_APP_LABEL, resource_count=osd_pods_count
        ), (
            f"Cluster recovery failed after {timeout} seconds. "
            f"Expected to have {osd_pods_count} OSD pods in status Running. "
            f"Current OSD pods status: "
            f"{[osd_pod.ocp.get_resource(osd.get().get('metadata').get('name'), 'STATUS') for osd in get_osd_pods()]}"
        )

        # Validate cluster is still functional
        self.sanity_helpers.health_check(tries=80)
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
class TestNodesMaintenance(ManageTest):
    """
    Test basic flows of maintenance (unschedule and drain) and
    activate operations, followed by cluster functionality and health checks

    """
    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Initialize Sanity instance

        """
        self.sanity_helpers = Sanity()

    @pytest.fixture(autouse=True)
    def health_checker(self):
        """
        Check Ceph health before each test and skip the test when Ceph
        is not healthy

        """
        try:
            status = ceph_health_check_base()
            if status:
                log.info("Health check passed")
        except CephHealthException as e:
            # skip because ceph is not in good health
            pytest.skip(str(e))

    @tier1
    @pytest.mark.parametrize(
        argnames=["node_type"],
        argvalues=[
            pytest.param(*['worker'], marks=pytest.mark.polarion_id("OCS-1269")),
            pytest.param(*['master'], marks=pytest.mark.polarion_id("OCS-1272"))
        ]
    )
    def test_node_maintenance(self, node_type, pvc_factory, pod_factory):
        """
        OCS-1269/OCS-1272:
        - Maintenance (mark as unschedulable and drain) 1 worker/master node
        - Check cluster functionality by creating resources
          (pools, storageclasses, PVCs, pods - both CephFS and RBD)
        - Mark the node as schedulable
        - Check cluster and Ceph health

        """
        # Get 1 node of the type needed for the test iteration
        typed_nodes = get_typed_nodes(node_type=node_type, num_of_nodes=1)
        assert typed_nodes, f"Failed to find a {node_type} node for the test"
        typed_node_name = typed_nodes[0].name

        # Maintenance the node (unschedule and drain)
        drain_nodes([typed_node_name])

        # Check basic cluster functionality by creating resources
        # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
        # run IO and delete the resources
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()

        # Mark the node back to schedulable
        schedule_nodes([typed_node_name])

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()

    @tier4
    @tier4b
    @aws_platform_required
    @pytest.mark.parametrize(
        argnames=["node_type"],
        argvalues=[
            pytest.param(*['worker'], marks=pytest.mark.polarion_id("OCS-1292")),
            pytest.param(*['master'], marks=[
                pytest.mark.polarion_id("OCS-1293"),
                bugzilla('1754287')
            ])
        ]
    )
    def test_node_maintenance_restart_activate(self, nodes, pvc_factory, pod_factory, node_type):
        """
        OCS-1292/OCS-1293:
        - Maintenance (mark as unschedulable and drain) 1 worker/master node
        - Restart the node
        - Mark the node as schedulable
        - Check cluster and Ceph health
        - Check cluster functionality by creating and deleting resources
          (pools, storageclasses, PVCs, pods - both CephFS and RBD)

        """
        # Get 1 node of the type needed for the test iteration
        typed_nodes = get_typed_nodes(node_type=node_type, num_of_nodes=1)
        assert typed_nodes, f"Failed to find a {node_type} node for the test"
        typed_node_name = typed_nodes[0].name

        # Maintenance the node (unschedule and drain). The function contains logging
        drain_nodes([typed_node_name])

        # Restarting the node
        nodes.restart_nodes(nodes=typed_nodes, wait=True)

        # The node is expected to come back Ready but still unschedulable
        wait_for_nodes_status(
            node_names=[typed_node_name],
            status=constants.NODE_READY_SCHEDULING_DISABLED
        )

        # Mark the node back to schedulable
        schedule_nodes([typed_node_name])

        # Check cluster and Ceph health and checking basic cluster
        # functionality by creating resources (pools, storageclasses,
        # PVCs, pods - both CephFS and RBD), run IO and delete the resources
        self.sanity_helpers.health_check()
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()

    @tier3
    @pytest.mark.parametrize(
        argnames=["nodes_type"],
        argvalues=[
            pytest.param(*['worker'], marks=pytest.mark.polarion_id("OCS-1273")),
            pytest.param(*['master'], marks=pytest.mark.polarion_id("OCS-1271"))
        ]
    )
    def test_2_nodes_maintenance_same_type(self, nodes_type):
        """
        OCS-1273/OCS-1271:
        - Try draining 2 nodes from the same type - should fail
        - Check cluster and Ceph health

        """
        # Get 2 nodes
        typed_nodes = get_typed_nodes(node_type=nodes_type, num_of_nodes=2)
        assert typed_nodes, f"Failed to find a {nodes_type} node for the test"

        typed_node_names = [typed_node.name for typed_node in typed_nodes]

        # Try draining 2 nodes - should fail
        try:
            drain_nodes(typed_node_names)
        except TimeoutExpired:
            log.info(
                f"Draining of nodes {typed_node_names} failed as expected"
            )

        # Nodes are marked schedulable again whether or not the drain timed out
        schedule_nodes(typed_node_names)

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()

    @tier2
    @pytest.mark.polarion_id("OCS-1274")
    def test_2_nodes_different_types(self, pvc_factory, pod_factory):
        """
        OCS-1274:
        - Maintenance (mark as unschedulable and drain) 1 worker node and 1
          master node
        - Check cluster functionality by creating resources
          (pools, storageclasses, PVCs, pods - both CephFS and RBD)
        - Mark the nodes as schedulable
        - Check cluster and Ceph health

        """
        # Get 1 node from each type
        nodes = [
            get_typed_nodes(node_type=node_type, num_of_nodes=1)[0]
            for node_type in ['worker', 'master']
        ]
        assert nodes, "Failed to find a nodes for the test"

        node_names = [typed_node.name for typed_node in nodes]

        # Maintenance the nodes (unschedule and drain)
        drain_nodes(node_names)

        # Check basic cluster functionality by creating resources
        # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
        # run IO and delete the resources
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()

        # Mark the nodes back to schedulable
        schedule_nodes(node_names)

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()

    @tier4
    @tier4b
    @aws_platform_required
    @pytest.mark.parametrize(
        argnames=["interface"],
        argvalues=[
            pytest.param(*['rbd'], marks=pytest.mark.polarion_id("OCS-2128")),
            pytest.param(*['cephfs'], marks=pytest.mark.polarion_id("OCS-2129")),
        ]
    )
    def test_simultaneous_drain_of_two_ocs_nodes(self, pvc_factory, pod_factory, dc_pod_factory, interface):
        """
        OCS-2128/OCS-2129:
        - Create PVCs and start IO on DC based app pods
        - Add one extra node in two of the AZs and label the nodes
          with OCS storage label
        - Maintenance (mark as unschedulable and drain) 2 worker nodes
          simultaneously
        - Confirm that OCS and DC pods are in running state
        - Remove unscheduled nodes
        - Check cluster functionality by creating resources
          (pools, storageclasses, PVCs, pods - both CephFS and RBD)
        - Check cluster and Ceph health

        """
        # Get OSD running nodes
        osd_running_worker_nodes = get_osd_running_nodes()
        log.info(f"OSDs are running on nodes {osd_running_worker_nodes}")

        # Label osd nodes with fedora app
        label_worker_node(osd_running_worker_nodes, label_key='dc', label_value='fedora')
        log.info("Successfully labeled worker nodes with {dc:fedora}")

        # Create DC app pods
        log.info("Creating DC based app pods and starting IO in background")
        interface = (
            constants.CEPHBLOCKPOOL if interface == 'rbd'
            else constants.CEPHFILESYSTEM
        )
        dc_pod_obj = []
        for _ in range(2):
            dc_pod = dc_pod_factory(interface=interface, node_selector={'dc': 'fedora'})
            pod.run_io_in_bg(dc_pod, fedora_dc=True)
            dc_pod_obj.append(dc_pod)

        # Get the machine name using the node name
        machine_names = [
            machine.get_machine_from_node_name(osd_running_worker_node)
            for osd_running_worker_node in osd_running_worker_nodes[:2]
        ]
        log.info(f"{osd_running_worker_nodes} associated "
                 f"machine are {machine_names}")

        # Get the machineset name using machine name
        machineset_names = [
            machine.get_machineset_from_machine_name(machine_name)
            for machine_name in machine_names
        ]
        log.info(f"{osd_running_worker_nodes} associated machineset "
                 f"is {machineset_names}")

        # Add a new node and label it
        add_new_node_and_label_it(machineset_names[0])
        add_new_node_and_label_it(machineset_names[1])

        # Drain 2 nodes
        drain_nodes(osd_running_worker_nodes[:2])

        # Check the pods should be in running state.
        # Pods whose name contains any of these markers are not waited on.
        # The previous form, ('-1-deploy' or 'ocs-deviceset') not in name,
        # only ever tested '-1-deploy' because `or` returns its first truthy
        # operand.
        skipped_name_markers = ('-1-deploy', 'ocs-deviceset')
        all_pod_obj = pod.get_all_pods(wait=True)
        for pod_obj in all_pod_obj:
            if any(marker in pod_obj.name for marker in skipped_name_markers):
                continue
            try:
                helpers.wait_for_resource_state(
                    resource=pod_obj, state=constants.STATUS_RUNNING, timeout=200
                )
            except ResourceWrongStatusException:
                # 'rook-ceph-crashcollector' on the failed node stucks at
                # pending state. BZ 1810014 tracks it.
                # Ignoring 'rook-ceph-crashcollector' pod health check as
                # WA and deleting its deployment so that the pod
                # disappears. Will revert this WA once the BZ is fixed
                if 'rook-ceph-crashcollector' in pod_obj.name:
                    ocp_obj = ocp.OCP(namespace=defaults.ROOK_CLUSTER_NAMESPACE)
                    pod_name = pod_obj.name
                    # Drop the two trailing name segments to get the deployment name
                    deployment_name = '-'.join(pod_name.split("-")[:-2])
                    command = f"delete deployment {deployment_name}"
                    ocp_obj.exec_oc_cmd(command=command)
                    log.info(f"Deleted deployment for pod {pod_obj.name}")
                else:
                    # Still tolerated (as before), but no longer silent
                    log.warning(
                        f"Pod {pod_obj.name} did not reach "
                        f"{constants.STATUS_RUNNING} state"
                    )

        # DC app pods on the drained node will get automatically created on other
        # running node in same AZ. Waiting for all dc app pod to reach running state
        pod.wait_for_dc_app_pods_to_reach_running_state(dc_pod_obj)
        log.info("All the dc pods reached running state")

        # Remove unscheduled nodes
        # In scenarios where the drain is attempted on >3 worker setup,
        # post completion of drain we are removing the unscheduled nodes so
        # that we maintain 3 worker nodes.
        log.info(f"Removing scheduled nodes {osd_running_worker_nodes[:2]}")
        remove_node_objs = get_node_objs(osd_running_worker_nodes[:2])
        remove_nodes(remove_node_objs)

        # Check basic cluster functionality by creating resources
        # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
        # run IO and delete the resources
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()
class TestAutomatedRecoveryFromFailedNodes(ManageTest):
    """
    Knip-678 Automated recovery from failed nodes - Reactive
    """
    # Background IO threads started by the test. This list lives on the
    # class, so it is shared by every parametrized run; the teardown
    # finalizer empties it after joining.
    threads = []

    @pytest.fixture(autouse=True)
    def teardown(self, request):
        """
        Register a finalizer that removes the 'dc' label from the worker
        nodes, joins the background IO threads and checks Ceph health

        """
        def finalizer():
            worker_nodes = get_worker_nodes()
            # Removing created label on all worker nodes
            remove_label_from_worker_node(worker_nodes, label_key="dc")
            for thread in self.threads:
                thread.join()
            # Drop the joined threads so the next parametrized run starts
            # with an empty list instead of re-joining stale threads
            self.threads.clear()
            ceph_health_check()

        request.addfinalizer(finalizer)

    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Initialize Sanity instance

        """
        self.sanity_helpers = Sanity()

    @pytest.mark.parametrize(
        argnames=["interface", "failure"],
        argvalues=[
            pytest.param(*['rbd', 'shutdown'], marks=[
                pytest.mark.polarion_id("OCS-2102"),
                pytest.mark.bugzilla("1830015")
            ]),
            pytest.param(*['rbd', 'terminate'], marks=pytest.mark.polarion_id("OCS-2103")),
            pytest.param(*['cephfs', 'shutdown'], marks=[
                pytest.mark.polarion_id("OCS-2104"),
                pytest.mark.bugzilla("1830015")
            ]),
            pytest.param(*['cephfs', 'terminate'], marks=pytest.mark.polarion_id("OCS-2105")),
        ]
    )
    def test_automated_recovery_from_failed_nodes_IPI_reactive(
        self, nodes, pvc_factory, pod_factory, failure, dc_pod_factory, interface
    ):
        """
        Knip-678 Automated recovery from failed nodes
        Reactive case - IPI

        Steps:
        - Label the OSD nodes and create 2 DC app pods with IO running
        - Add a new labeled node to the machineset of a node that runs
          both an OSD and an app pod
        - Shut down or terminate that node (per the 'failure' parameter)
        - Wait for the DC app pods and the storage pods to recover
        - Check cluster functionality and health

        """
        # Get OSD running nodes
        osd_running_nodes = get_osd_running_nodes()
        log.info(f"OSDs are running on nodes {osd_running_nodes}")

        # Label osd nodes with fedora app
        label_worker_node(osd_running_nodes, label_key='dc', label_value='fedora')

        # Create DC app pods
        log.info("Creating DC based app pods")
        if interface == 'rbd':
            interface = constants.CEPHBLOCKPOOL
        elif interface == 'cephfs':
            interface = constants.CEPHFILESYSTEM
        dc_pod_obj = []
        for _ in range(2):
            dc_pod = dc_pod_factory(interface=interface, node_selector={'dc': 'fedora'})
            # Keep the IO thread so the teardown finalizer can join it
            self.threads.append(pod.run_io_in_bg(dc_pod, fedora_dc=True))
            dc_pod_obj.append(dc_pod)

        # Get app pods running nodes
        dc_pod_node_name = get_app_pod_running_nodes(dc_pod_obj)
        log.info(f"DC app pod running nodes are {dc_pod_node_name}")

        # Get both osd and app pod running node
        common_nodes = get_both_osd_and_app_pod_running_node(
            osd_running_nodes, dc_pod_node_name
        )
        log.info(f"Both OSD and app pod is running on nodes {common_nodes}")

        # Get the machine name using the node name
        machine_name = machine.get_machine_from_node_name(common_nodes[0])
        log.info(f"{common_nodes[0]} associated machine is {machine_name}")

        # Get the machineset name using machine name
        machineset_name = machine.get_machineset_from_machine_name(machine_name)
        log.info(
            f"{common_nodes[0]} associated machineset is {machineset_name}"
        )

        # Add a new node and label it
        add_new_node_and_label_it(machineset_name)

        # Get the failure node obj
        failure_node_obj = get_node_objs(node_names=[common_nodes[0]])

        # Induce failure on the selected failure node
        log.info(f"Inducing failure on node {failure_node_obj[0].name}")
        if failure == "shutdown":
            nodes.stop_nodes(failure_node_obj, wait=True)
            log.info(f"Successfully powered off node: "
                     f"{failure_node_obj[0].name}")
        elif failure == "terminate":
            nodes.terminate_nodes(failure_node_obj, wait=True)
            log.info(f"Successfully terminated node : "
                     f"{failure_node_obj[0].name} instance")

        try:
            # DC app pods on the failed node will get automatically created on other
            # running node. Waiting for all dc app pod to reach running state
            pod.wait_for_dc_app_pods_to_reach_running_state(dc_pod_obj, timeout=720)
            log.info("All the dc pods reached running state")
            pod.wait_for_storage_pods()
        except ResourceWrongStatusException:
            # Terminate the node that was only powered off before
            # re-raising the failure
            if failure == "shutdown":
                nodes.terminate_nodes(failure_node_obj, wait=True)
                log.info(f"Successfully terminated node : "
                         f"{failure_node_obj[0].name} instance")
            raise

        # Check basic cluster functionality by creating resources
        # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
        # run IO and delete the resources
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()
class TestNodeReplacement(ManageTest):
    """
    Knip-894 Node replacement - AWS-IPI-Proactive

    """
    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Initialize Sanity instance

        """
        self.sanity_helpers = Sanity()

    def test_nodereplacement_proactive(self, pvc_factory, pod_factory, dc_pod_factory):
        """
        Knip-894 Node Replacement proactive

        Steps:
        - Pick the node of a randomly chosen OSD pod
        - Start RBD and CephFS backed DC pods with IO on every other
          worker node
        - Unschedule and drain the chosen node, then delete its machine
        - Wait for the replacement node and label it with the OCS
          storage label
        - Check cluster functionality and health

        """
        # Get worker nodes
        worker_node_list = get_worker_nodes()
        log.info(f"Current available worker nodes are {worker_node_list}")

        osd_pods_obj = pod.get_osd_pods()
        osd_node_name = pod.get_pod_node(random.choice(osd_pods_obj)).name
        log.info(f"Selected OSD is {osd_node_name}")

        log.info("Creating dc pod backed with rbd pvc and running io in bg")
        for worker_node in worker_node_list:
            if worker_node != osd_node_name:
                rbd_dc_pod = dc_pod_factory(
                    interface=constants.CEPHBLOCKPOOL, node_name=worker_node, size=20
                )
                pod.run_io_in_bg(rbd_dc_pod, expect_to_fail=False, fedora_dc=True)

        log.info("Creating dc pod backed with cephfs pvc and running io in bg")
        for worker_node in worker_node_list:
            if worker_node != osd_node_name:
                cephfs_dc_pod = dc_pod_factory(
                    interface=constants.CEPHFILESYSTEM, node_name=worker_node, size=20
                )
                pod.run_io_in_bg(cephfs_dc_pod, expect_to_fail=False, fedora_dc=True)

        # Unscheduling node
        node.unschedule_nodes([osd_node_name])
        # Draining Node
        node.drain_nodes([osd_node_name])
        log.info("Getting machine name from specified node name")
        machine_name = machine.get_machine_from_node_name(osd_node_name)
        log.info(f"Node {osd_node_name} associated machine is {machine_name}")
        log.info(
            f"Deleting machine {machine_name} and waiting for new machine to come up"
        )
        machine.delete_machine_and_check_state_of_new_spinned_machine(machine_name)

        # Find the replacement machine: one whose name, minus its last 6
        # characters, is a prefix of the deleted machine's name.
        # eg:- machine_name:- prsurve-40-ocs-43-kbrvf-worker-us-east-2b-nlgkr
        # After trimming:- prsurve-40-ocs-43-kbrvf-worker-us-east-2b
        # A literal prefix comparison is used instead of re.match so that
        # characters in the name are never interpreted as regex syntax.
        new_machine_name = None
        for candidate in machine.get_machines():
            if machine_name.startswith(candidate.name[:-6]):
                # The last matching machine wins, as before
                new_machine_name = candidate.name
        assert new_machine_name, (
            f"Failed to find a replacement machine for deleted machine "
            f"{machine_name}"
        )

        machineset_name = machine.get_machineset_from_machine_name(new_machine_name)
        log.info("Waiting for new worker node to be in ready state")
        machine.wait_for_new_node_to_be_ready(machineset_name)
        new_node_name = node.get_node_from_machine_name(new_machine_name)
        log.info("Adding ocs label to newly created worker node")
        node_obj = ocp.OCP(kind='node')
        node_obj.add_label(
            resource_name=new_node_name, label=constants.OPERATOR_NODE_LABEL
        )
        log.info(
            f"Successfully labeled {new_node_name} with OCS storage label"
        )

        # Creating Resources
        log.info("Creating Resources using sanity helpers")
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        # Deleting Resources
        self.sanity_helpers.delete_resources()
        # Verify everything running fine
        log.info(
            "Verifying All resources are Running and matches expected result"
        )
        self.sanity_helpers.health_check()
class TestNodeReplacement(ManageTest):
    """
    Knip-894 Node replacement - AWS-IPI-Proactive

    """
    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Create the Sanity helper used by the test

        """
        self.sanity_helpers = Sanity()

    def test_nodereplacement_proactive(self, pvc_factory, pod_factory, dc_pod_factory):
        """
        Knip-894 Node Replacement proactive

        Starts RBD and CephFS backed DC pods with IO on every worker node
        except the selected OSD node, replaces the OSD node through the
        platform specific helper (AWS IPI/UPI; skipped on vSphere) and
        finally verifies cluster functionality and health.

        """
        # Get worker nodes
        worker_node_list = get_worker_nodes()
        log.info(f"Current available worker nodes are {worker_node_list}")

        # Pick the node hosting a randomly chosen OSD pod
        chosen_osd_pod = random.choice(pod.get_osd_pods())
        osd_node_name = pod.get_pod_node(chosen_osd_pod).name
        log.info(f"Selected OSD is {osd_node_name}")

        # RBD pods are created on all eligible nodes first, then CephFS pods
        io_rounds = (
            (
                "Creating dc pod backed with rbd pvc and running io in bg",
                constants.CEPHBLOCKPOOL,
            ),
            (
                "Creating dc pod backed with cephfs pvc and running io in bg",
                constants.CEPHFILESYSTEM,
            ),
        )
        for announcement, storage_interface in io_rounds:
            log.info(announcement)
            for worker_node in worker_node_list:
                if worker_node == osd_node_name:
                    continue
                dc_pod = dc_pod_factory(
                    interface=storage_interface, node_name=worker_node, size=20
                )
                pod.run_io_in_bg(dc_pod, expect_to_fail=False, fedora_dc=True)

        platform = config.ENV_DATA['platform'].lower()
        if platform == constants.AWS_PLATFORM:
            deployment_type = config.ENV_DATA['deployment_type']
            if deployment_type == 'ipi':
                node.delete_and_create_osd_node_aws_ipi(osd_node_name)
            elif deployment_type == 'upi':
                node.delete_and_create_osd_node_aws_upi(osd_node_name)
            else:
                pytest.fail(
                    f"ocs-ci config 'deployment_type' value '{deployment_type}' is not valid, "
                    f"results of this test run are all invalid."
                )
        elif platform == constants.VSPHERE_PLATFORM:
            pytest.skip(
                "Skipping add node in Vmware platform due to "
                "https://bugzilla.redhat.com/show_bug.cgi?id=1844521"
            )

        # Creating Resources
        log.info("Creating Resources using sanity helpers")
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        # Deleting Resources
        self.sanity_helpers.delete_resources()
        # Verify everything running fine
        log.info("Verifying All resources are Running and matches expected result")
        self.sanity_helpers.health_check(tries=30)
class TestDiskFailures(ManageTest):
    """
    Test class for detach and attach worker volume

    """
    def detach_volume_and_wait_for_attach(self, nodes, data_volume, worker_node):
        """
        Detach an EBS volume from an AWS instance and wait for the volume
        to be re-attached

        Args:
            nodes: The platform nodes object providing detach_volume() and
                wait_for_volume_attach()
            data_volume (Volume): The ec2 volume to delete
            worker_node (OCS): The OCS object of the EC2 instance

        Raises:
            AWSTimeoutException: If the detach timed out for a reason other
                than the volume being back "in-use"
            AssertionError: If the volume was not re-attached

        """
        try:
            # Detach volume (logging is done inside the function)
            nodes.detach_volume(data_volume, worker_node)
        except AWSTimeoutException as e:
            # Inspect the exception text; a membership test on the exception
            # object itself raises TypeError
            if "Volume state: in-use" in str(e):
                logger.info(
                    f"Volume {data_volume} re-attached successfully to worker"
                    f" node {worker_node}"
                )
            else:
                raise
        else:
            # Wait for worker volume to be re-attached automatically
            # to the node
            assert nodes.wait_for_volume_attach(data_volume), (
                f"Volume {data_volume} failed to be re-attached to worker "
                f"node {worker_node}"
            )

    @pytest.fixture(autouse=True)
    def teardown(self, request, nodes):
        """
        Restart nodes that are in status NotReady, for situations in
        which the test failed before restarting the node after detach volume,
        which leaves nodes in NotReady

        """
        def finalizer():
            not_ready_nodes = [
                n for n in node.get_node_objs()
                if n.ocp.get_resource_status(n.name) == constants.NODE_NOT_READY
            ]
            if not_ready_nodes:
                # Only warn when there actually are NotReady nodes
                logger.warning(
                    f"Nodes in NotReady status found: {[n.name for n in not_ready_nodes]}"
                )
                nodes.restart_nodes(not_ready_nodes)
                node.wait_for_nodes_status()

            # Restart node if the osd stays at CLBO state
            for osd_pod in get_osd_pods():
                state = osd_pod.get().get('status').get(
                    'containerStatuses')[0].get('state')
                # NOTE(review): the Kubernetes API reports container state as
                # a mapping such as {'waiting': {'reason': ...}}; the waiting
                # reason is checked in addition to the original direct
                # comparison - confirm against constants.STATUS_CLBO
                waiting_reason = None
                if isinstance(state, dict):
                    waiting_reason = (state.get('waiting') or {}).get('reason')
                if (
                    state == constants.STATUS_CLBO
                    or waiting_reason == constants.STATUS_CLBO
                ):
                    node_obj = get_pod_node(osd_pod)
                    nodes.restart_nodes([node_obj])
                    node.wait_for_nodes_status([node_obj.name])

        request.addfinalizer(finalizer)

    @pytest.fixture(autouse=True)
    def init_sanity(self):
        """
        Initialize Sanity instance

        """
        self.sanity_helpers = Sanity()

    @aws_platform_required
    @pytest.mark.polarion_id("OCS-1085")
    @bugzilla('1825675')
    def test_detach_attach_worker_volume(self, nodes, pvc_factory, pod_factory):
        """
        Detach and attach worker volume

        - Detach the data volume from one of the worker nodes
        - Wait for the volumes to be re-attached back to the worker node
        - Validate cluster functionality, without checking cluster and Ceph
          health (as one node volume is detached, the cluster will be unhealthy)
          by creating resources and running IO
        - Restart the node so the volume will get re-mounted

        """
        # Get a data volume
        data_volume = nodes.get_data_volumes()[0]
        # Get the worker node according to the volume attachment
        worker = nodes.get_node_by_attached_volume(data_volume)

        # Detach volume and wait for the volume to attach
        self.detach_volume_and_wait_for_attach(nodes, data_volume, worker)

        # Validate cluster is still functional
        # In case the selected node that its volume disk was detached was the one
        # running the ceph tools pod, we'll need to wait for a new ct pod to start.
        # For that, a function that connects to the ct pod is being used to check if
        # it's alive
        assert wait_for_ct_pod_recovery(), "Ceph tools pod failed to come up on another node"

        self.sanity_helpers.create_resources(pvc_factory, pod_factory)

        # Restart the instance so the volume will get re-mounted
        nodes.restart_nodes([worker])

        # Cluster health check
        # W/A: For the investigation of BZ 1825675, timeout is increased to see if cluster
        # becomes healthy eventually
        # TODO: Remove 'tries=100'
        self.sanity_helpers.health_check(tries=100)

    @aws_platform_required
    @pytest.mark.polarion_id("OCS-1086")
    def test_detach_attach_2_data_volumes(self, nodes, pvc_factory, pod_factory):
        """
        Detach and attach disk from 2 worker nodes

        - Detach 2 of the data volumes from their worker nodes
        - Wait for the volumes to be re-attached back to the worker nodes
        - Restart the nodes so the volume will get re-mounted in each node
        - Check cluster health and functionality to make sure detach,
          attach and restart did not affect the cluster

        """
        # Get 2 data volumes
        data_volumes = nodes.get_data_volumes()[:2]
        workers_and_volumes = [
            {'worker': nodes.get_node_by_attached_volume(vol), 'volume': vol}
            for vol in data_volumes
        ]
        for worker_and_volume in workers_and_volumes:
            # Detach volume and wait for the volume to attach
            self.detach_volume_and_wait_for_attach(
                nodes, worker_and_volume['volume'], worker_and_volume['worker']
            )
        # Restart the instances so the volume will get re-mounted
        nodes.restart_nodes(
            [worker_and_volume['worker'] for worker_and_volume in workers_and_volumes]
        )

        # Validate cluster is still functional
        self.sanity_helpers.health_check()
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)

    @bugzilla('1830702')
    @vsphere_platform_required
    @pytest.mark.polarion_id("OCS-2172")
    def test_recovery_from_volume_deletion(self, nodes, pvc_factory, pod_factory):
        """
        Test cluster recovery from disk deletion from the platform side.
        Based on documented procedure detailed in
        https://bugzilla.redhat.com/show_bug.cgi?id=1823183

        """
        logger.info("Picking a PV which to be deleted from the platform side")
        osd_pvs = get_deviceset_pvs()
        osd_pv = random.choice(osd_pvs)
        osd_pv_name = osd_pv.name
        # get the claim name
        logger.info(f"Getting the claim name for OSD PV {osd_pv_name}")
        claim_name = osd_pv.get().get('spec').get('claimRef').get('name')

        # Get the backing volume name
        logger.info(f"Getting the backing volume name for PV {osd_pv_name}")
        backing_volume = nodes.get_data_volumes(pvs=[osd_pv])[0]

        # Get the corresponding PVC
        logger.info(f"Getting the corresponding PVC of PV {osd_pv_name}")
        osd_pvcs = get_deviceset_pvcs()
        osd_pvcs_count = len(osd_pvcs)
        osd_pvc = [
            ds for ds in osd_pvcs
            if ds.get().get('metadata').get('name') == claim_name
        ][0]

        # Get the corresponding OSD pod and ID
        logger.info(f"Getting the OSD pod using PVC {osd_pvc.name}")
        osd_pods = get_osd_pods()
        osd_pods_count = len(osd_pods)
        osd_pod = [
            candidate for candidate in osd_pods
            if candidate.get().get('metadata').get('labels').get(
                constants.CEPH_ROOK_IO_PVC_LABEL) == claim_name
        ][0]
        logger.info(f"OSD_POD {osd_pod.name}")
        osd_id = osd_pod.get().get('metadata').get('labels').get('ceph-osd-id')

        # Get the node that has the OSD pod running on
        logger.info(
            f"Getting the node that has the OSD pod {osd_pod.name} running on"
        )
        osd_node = get_pod_node(osd_pod)
        osd_prepare_pods = get_osd_prepare_pods()
        osd_prepare_pod = [
            prepare_pod for prepare_pod in osd_prepare_pods
            if prepare_pod.get().get('metadata').get('labels').get(
                constants.CEPH_ROOK_IO_PVC_LABEL) == claim_name
        ][0]
        osd_prepare_job_name = osd_prepare_pod.get().get('metadata').get(
            'labels').get('job-name')
        osd_prepare_job = get_job_obj(osd_prepare_job_name)

        # Get the corresponding OSD deployment
        logger.info(f"Getting the OSD deployment for OSD PVC {claim_name}")
        osd_deployment = [
            deployment for deployment in get_osd_deployments()
            if deployment.get().get('metadata').get('labels').get(
                constants.CEPH_ROOK_IO_PVC_LABEL) == claim_name
        ][0]
        osd_deployment_name = osd_deployment.name

        # Delete the volume from the platform side
        logger.info(f"Deleting {backing_volume} from the platform side")
        nodes.detach_volume(backing_volume, osd_node)

        # Scale down OSD deployment
        logger.info(f"Scaling down OSD deployment {osd_deployment_name} to 0")
        ocp.OCP().exec_oc_cmd(
            f"scale --replicas=0 deployment/{osd_deployment_name}"
        )

        # Force delete OSD pod if necessary
        osd_pod_name = osd_pod.name
        logger.info(f"Waiting for OSD pod {osd_pod.name} to get deleted")
        try:
            osd_pod.ocp.wait_for_delete(resource_name=osd_pod_name)
        except TimeoutError:
            osd_pod.delete(force=True)
            osd_pod.ocp.wait_for_delete(resource_name=osd_pod_name)

        # Run ocs-osd-removal job
        logger.info(f"Executing OSD removal job on OSD-{osd_id}")
        osd_removal_job_yaml = ocp.OCP(
            namespace=config.ENV_DATA['cluster_namespace']
        ).exec_oc_cmd(
            f"process ocs-osd-removal"
            f" -p FAILED_OSD_ID={osd_id} -o yaml"
        )
        osd_removal_job = OCS(**osd_removal_job_yaml)
        osd_removal_job.create(do_reload=False)

        # Get ocs-osd-removal pod name
        logger.info("Getting the ocs-osd-removal pod name")
        osd_removal_pod_name = get_pod_name_by_pattern(
            f"ocs-osd-removal-{osd_id}")[0]
        osd_removal_pod_obj = get_pod_obj(
            osd_removal_pod_name, namespace='openshift-storage'
        )
        osd_removal_pod_obj.ocp.wait_for_resource(
            condition=constants.STATUS_COMPLETED,
            resource_name=osd_removal_pod_name
        )

        # Verify OSD removal from the ocs-osd-removal pod logs
        logger.info(
            f"Verifying removal of OSD from {osd_removal_pod_name} pod logs"
        )
        logs = get_pod_logs(osd_removal_pod_name)
        pattern = f"purged osd.{osd_id}"
        assert re.search(pattern, logs)

        # Delete the OSD prepare job
        logger.info(f"Deleting OSD prepare job {osd_prepare_job_name}")
        osd_prepare_job.delete()
        osd_prepare_job.ocp.wait_for_delete(
            resource_name=osd_prepare_job_name, timeout=120
        )

        # Delete the OSD PVC
        osd_pvc_name = osd_pvc.name
        logger.info(f"Deleting OSD PVC {osd_pvc_name}")
        osd_pvc.delete()
        osd_pvc.ocp.wait_for_delete(resource_name=osd_pvc_name)

        # Delete the OSD deployment
        logger.info(f"Deleting OSD deployment {osd_deployment_name}")
        osd_deployment.delete()
        osd_deployment.ocp.wait_for_delete(
            resource_name=osd_deployment_name, timeout=120
        )

        # Delete PV (only explicitly when it does not go away by itself)
        logger.info(f"Verifying deletion of PV {osd_pv_name}")
        try:
            osd_pv.ocp.wait_for_delete(resource_name=osd_pv_name)
        except TimeoutError:
            osd_pv.delete()
            osd_pv.ocp.wait_for_delete(resource_name=osd_pv_name)

        # Delete the rook ceph operator pod to trigger reconciliation
        rook_operator_pod = get_operator_pods()[0]
        logger.info(
            f"deleting Rook Ceph operator pod {rook_operator_pod.name}"
        )
        rook_operator_pod.delete()

        # Delete the OSD removal job
        logger.info(f"Deleting OSD removal job ocs-osd-removal-{osd_id}")
        osd_removal_job = get_job_obj(f"ocs-osd-removal-{osd_id}")
        osd_removal_job.delete()
        osd_removal_job.ocp.wait_for_delete(
            resource_name=f"ocs-osd-removal-{osd_id}"
        )

        timeout = 600

        # Wait for OSD PVC to get created and reach Bound state
        logger.info(
            "Waiting for a new OSD PVC to get created and reach Bound state"
        )
        assert osd_pvc.ocp.wait_for_resource(
            timeout=timeout, condition=constants.STATUS_BOUND,
            selector=constants.OSD_PVC_GENERIC_LABEL,
            resource_count=osd_pvcs_count
        ), (
            f"Cluster recovery failed after {timeout} seconds. "
            f"Expected to have {osd_pvcs_count} OSD PVCs in status Bound. Current OSD PVCs status: "
            f"{[pvc.ocp.get_resource(pvc.get().get('metadata').get('name'), 'STATUS') for pvc in get_deviceset_pvcs()]}"
        )

        # Wait for OSD pod to get created and reach Running state
        logger.info(
            "Waiting for a new OSD pod to get created and reach Running state"
        )
        assert osd_pod.ocp.wait_for_resource(
            timeout=timeout, condition=constants.STATUS_RUNNING,
            selector=constants.OSD_APP_LABEL,
            resource_count=osd_pods_count
        ), (
            f"Cluster recovery failed after {timeout} seconds. "
            f"Expected to have {osd_pods_count} OSD pods in status Running. Current OSD pods status: "
            f"{[osd_pod.ocp.get_resource(pod.get().get('metadata').get('name'), 'STATUS') for pod in get_osd_pods()]}"
        )

        # Validate cluster is still functional
        self.sanity_helpers.health_check(tries=80)
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)