def test_2_nodes_maintenance_same_type(self, resources, schedule_nodes, nodes_type):
    """
    OCS-1273/OCS-1271:
    Put 2 nodes of the same type into maintenance and bring them back.

    Steps:
    - Maintenance (mark as unschedulable and drain) 2 worker/master nodes
    - Mark the nodes as schedulable again
    - Check cluster and Ceph health
    - Check cluster functionality by creating resources
      (pools, storageclasses, PVCs, pods - both CephFS and RBD)
      and deleting them afterwards

    Args:
        resources: Passed through to the sanity helpers that create and
            delete the validation resources.
        schedule_nodes: Not referenced in the body; presumably a fixture
            requested for its setup/teardown effect - confirm in conftest.
        nodes_type: Node type forwarded to ``node.get_typed_nodes``.
    """
    # Select the 2 nodes this test operates on and collect their names
    selected_nodes = node.get_typed_nodes(node_type=nodes_type, num_of_nodes=2)
    selected_names = []
    for selected in selected_nodes:
        selected_names.append(selected.name)

    # Take both nodes into maintenance, then immediately make them
    # schedulable again
    node.maintenance_nodes(selected_names)
    node.schedule_nodes(selected_names)

    # Cluster and Ceph health checks come before the functional validation
    sanity_helpers.health_check(selected_names)

    # Functional validation: create the resources, then clean them up
    sanity_helpers.create_resources(resources)
    sanity_helpers.delete_resources(resources)
def test_node_maintenance(self, resources, schedule_nodes, node_type):
    """
    OCS-1269/OCS-1272:
    Validate the cluster while a single node is in maintenance.

    Steps:
    - Maintenance (mark as unschedulable and drain) 1 worker/master node
    - Check cluster functionality by creating resources
      (pools, storageclasses, PVCs, pods - both CephFS and RBD)
      and deleting them
    - Mark the node as schedulable again
    - Check cluster and Ceph health

    Args:
        resources: Passed through to the sanity helpers that create and
            delete the validation resources.
        schedule_nodes: Not referenced in the body; presumably a fixture
            requested for its setup/teardown effect - confirm in conftest.
        node_type: Node type forwarded to ``node.get_typed_nodes``.
    """
    # Pick a single node of the requested type and remember its name
    target_name = node.get_typed_nodes(
        node_type=node_type, num_of_nodes=1
    )[0].name

    # Take the node into maintenance
    node.maintenance_nodes([target_name])

    # Functional validation runs while the node is still in maintenance
    sanity_helpers.create_resources(resources)
    sanity_helpers.delete_resources(resources)

    # Bring the node back and only then verify cluster and Ceph health
    node.schedule_nodes([target_name])
    sanity_helpers.health_check([target_name])
def test_2_nodes_different_types(self, resources, schedule_nodes):
    """
    OCS-1274:
    Validate the cluster while one worker and one master are in maintenance.

    Steps:
    - Maintenance (mark as unschedulable and drain) 1 worker node and
      1 master node
    - Check cluster functionality by creating resources
      (pools, storageclasses, PVCs, pods - both CephFS and RBD)
      and deleting them
    - Mark the nodes as schedulable again
    - Check cluster and Ceph health

    Args:
        resources: Passed through to the sanity helpers that create and
            delete the validation resources.
        schedule_nodes: Not referenced in the body; presumably a fixture
            requested for its setup/teardown effect - confirm in conftest.
    """
    # One node per type: first a worker, then a master
    selected_nodes = []
    for node_type in ('worker', 'master'):
        candidates = node.get_typed_nodes(node_type=node_type, num_of_nodes=1)
        selected_nodes.append(candidates[0])
    node_names = [selected.name for selected in selected_nodes]

    # Take both nodes into maintenance
    node.maintenance_nodes(node_names)

    # Functional validation runs while the nodes are still in maintenance
    sanity_helpers.create_resources(resources)
    sanity_helpers.delete_resources(resources)

    # Bring the nodes back and only then verify cluster and Ceph health
    node.schedule_nodes(node_names)
    sanity_helpers.health_check(node_names)
def test_nodes_restart_aws(self, resources, instances, aws_obj, force):
    """
    Test cluster recovery after restarting ec2 instances - AWS

    Steps:
    - Restart the given ec2 instances and wait for the restart to finish
    - Check cluster and Ceph health for the restarted nodes
    - Check cluster functionality by creating resources

    Args:
        resources: Passed through to ``sanity_helpers.create_resources``.
        instances: Mapping whose values are handed to the health check as
            node identifiers (presumably instance ID -> node name -
            confirm against the fixture).
        aws_obj: Object exposing ``restart_ec2_instances``.
        force: Forwarded as the ``force`` argument of the restart.
    """
    # Restart and block until the operation has completed
    aws_obj.restart_ec2_instances(instances=instances, wait=True, force=force)

    # Health is verified for the nodes behind the restarted instances
    restarted_nodes = list(instances.values())
    sanity_helpers.health_check(nodes=restarted_nodes)

    # Functional validation
    sanity_helpers.create_resources(resources=resources)
def test_detach_attach_worker_volume(self, aws_obj, resources):
    """
    Detach and attach worker volume

    Steps:
    - Detach the data volume from one of the worker nodes
    - Validate cluster functionality by creating resources and running IO,
      without checking cluster and Ceph health (with one node volume
      detached, the cluster will be unhealthy)
    - Attach the volume back to the node
    - Restart the node so the volume will get re-mounted
    - Check cluster and Ceph health

    Args:
        aws_obj: Object exposing ``detach_volume``, ``attach_volume`` and
            ``restart_ec2_instances``.
        resources: Passed through to ``sanity_helpers.create_resources``.
    """
    # A single node is enough here; no node type is passed, so the helper's
    # default applies (a worker, going by the test's intent - confirm)
    worker_node = node.get_typed_nodes(num_of_nodes=1)[0]

    # Resolve the node to its ec2 instance; the first key of the returned
    # mapping is used as the instance ID
    instance_map = aws.get_instances_ids_and_names([worker_node])
    instance_id = list(instance_map)[0]

    # First data volume of that instance
    data_volume = aws.get_data_volumes(instance_id)[0]

    # Detach, then validate functionality while the volume is missing
    aws_obj.detach_volume(data_volume)
    sanity_helpers.create_resources(resources=resources)

    # Re-attach and restart the instance so the volume gets re-mounted
    aws_obj.attach_volume(data_volume, instance_id)
    aws_obj.restart_ec2_instances(instances=instance_map, wait=True)

    # Only now is a healthy cluster expected
    sanity_helpers.health_check(nodes=list(instance_map.values()))
def test_detach_attach_2_workers_volumes(self, aws_obj, resources):
    """
    Detach and attach disk from 2 worker nodes

    Steps:
    - For each of 2 worker nodes, detach its data volume and attach it back
    - Restart the nodes so the volume will get re-mounted in each node
    - Check cluster health and functionality to make sure detach, attach
      and restart did not affect the cluster

    Args:
        aws_obj: Object exposing ``detach_volume``, ``attach_volume`` and
            ``restart_ec2_instances``.
        resources: Passed through to ``sanity_helpers.create_resources``.
    """
    # Two nodes are needed; no node type is passed, so the helper's default
    # applies (workers, going by the test's intent - confirm)
    worker_nodes = node.get_typed_nodes(num_of_nodes=2)

    # Mapping of the nodes' ec2 instances; its keys are used as instance IDs
    instances = aws.get_instances_ids_and_names(worker_nodes)

    # One instance at a time: detach its first data volume and attach it
    # right back before moving on to the next instance
    for instance_id in instances:
        data_volume = aws.get_data_volumes(instance_id)[0]
        aws_obj.detach_volume(data_volume)
        aws_obj.attach_volume(data_volume, instance_id)

    # Restart all touched instances so the volumes get re-mounted
    aws_obj.restart_ec2_instances(instances=instances, wait=True)

    # Health first, then functional validation
    sanity_helpers.health_check(nodes=list(instances.values()))
    sanity_helpers.create_resources(resources=resources)
def test_pv_provisioning_under_degraded_state(self, resources, instances, aws_obj, interface, operation):
    """
    Test PV provisioning under degraded state

    The same flow covers four cases, selected by ``interface`` and
    ``operation``:

    - OCS-1138: interface 'rbd', operation 'create_resources'
    - OCS-1241: interface 'rbd', operation 'delete_resources'
    - OCS-1139: interface 'cephfs', operation 'create_resources'
    - OCS-1242: interface 'cephfs', operation 'delete_resources'

    Steps:
    - (delete_resources only) Create the resources whose deletion is
      validated later
    - Stop the ec2 instance of the node that has the provisioner pod of
      the selected interface running on it
    - Wait for that pod to reach Terminating status, then for a
      provisioner pod to come up again and reach Running status
    - Validate cluster functionality, without checking cluster and Ceph
      health, by creating (or deleting) resources
    - Start the ec2 instance again
    - Check cluster and Ceph health

    Args:
        resources: Passed through to the sanity helpers that create and
            delete the validation resources.
        instances: Not referenced in the body; presumably a fixture
            requested for its setup/teardown effect - confirm in conftest.
        aws_obj: Object exposing ``stop_ec2_instances`` and
            ``start_ec2_instances``.
        interface (str): 'rbd' or 'cephfs'.
        operation (str): 'create_resources' or 'delete_resources'.

    Raises:
        ValueError: If ``interface`` or ``operation`` is not one of the
            supported values.
        AssertionError: If the provisioner pod does not reach the expected
            status in time.
    """
    # Reject bad parameters before anything is created or stopped, so a
    # typo cannot leave a node down or skip the validation step silently
    supported_interfaces = ('rbd', 'cephfs')
    supported_operations = ('create_resources', 'delete_resources')
    if interface not in supported_interfaces:
        raise ValueError(
            f"Unsupported interface {interface!r}, "
            f"expected one of {supported_interfaces}"
        )
    if operation not in supported_operations:
        raise ValueError(
            f"Unsupported operation {operation!r}, "
            f"expected one of {supported_operations}"
        )

    if operation == 'delete_resources':
        # Create resources that their deletion will be tested later
        sanity_helpers.create_resources(resources=resources)

    # Get the provisioner pod, and the label selector used later to wait
    # for its replacement, according to the interface
    if interface == 'rbd':
        provisioner_pod = pod.get_rbdfsplugin_provisioner_pods()[0]
        selector = constants.CSI_RBDPLUGIN_PROVISIONER_LABEL
    else:
        provisioner_pod = pod.get_cephfsplugin_provisioner_pods()[0]
        selector = constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL
    provisioner_pod_name = provisioner_pod.name
    logger.info(
        f"{interface} provisioner pod found: {provisioner_pod_name}"
    )

    # Get the node name that has the provisioner pod running on
    provisioner_node = pod.get_pod_node(provisioner_pod)
    provisioner_node_name = provisioner_node.get('metadata').get('name')
    logger.info(
        f"{interface} provisioner pod is running on node {provisioner_node_name}"
    )

    # Get the ec2 instance of the node. A dedicated local name is used so
    # the ``instances`` argument is not overwritten.
    provisioner_instances = aws.get_instances_ids_and_names([provisioner_node])

    # Stopping the node
    aws_obj.stop_ec2_instances(instances=provisioner_instances, wait=True)

    # Wait for the original provisioner pod to reach Terminating status
    logger.info(
        f"Waiting for pod {provisioner_pod_name} to reach status Terminating"
    )
    assert provisioner_pod.ocp.wait_for_resource(
        timeout=300,
        resource_name=provisioner_pod_name,
        condition=constants.STATUS_TERMINATING
    ), f"{interface} provisioner pod failed to reach status Terminating"

    # Wait for a provisioner pod (matched by label, as the replacement has
    # a different name) to be started and reach Running status
    logger.info(
        f"Waiting for pod {provisioner_pod_name} to reach status Running"
    )
    assert provisioner_pod.ocp.wait_for_resource(
        timeout=600,
        condition=constants.STATUS_RUNNING,
        selector=selector,
        resource_count=1
    ), f"{interface} provisioner pod failed to reach status Running"

    if operation == 'create_resources':
        # Cluster validation (resources creation and IO running)
        sanity_helpers.create_resources(resources=resources)
    else:
        # Cluster validation (deletion of the resources created above)
        sanity_helpers.delete_resources(resources=resources)

    # Starting the node
    aws_obj.start_ec2_instances(instances=provisioner_instances, wait=True)

    # Checking cluster and Ceph health
    sanity_helpers.health_check(nodes=[provisioner_node_name])
def test_monitoring_when_one_of_the_prometheus_node_down(
    self, test_fixture):
    """
    Test case to validate when the prometheus pod is down
    and interaction with prometheus

    Steps:
    - For every prometheus pod in the monitoring namespace, force-restart
      the ec2 instance of the worker node hosting it and wait for the
      nodes status
    - Check cluster health and that all monitoring pods are Running
    - Check that metrics of the pre-existing PVCs are collected on
      prometheus
    - Create a new project, PVCs and app pods, and check that metrics of
      the new PVCs are collected as well

    Args:
        test_fixture: 4-item sequence unpacked as
            (namespace_list, pvc_objs, pod_objs, sc). The first three are
            extended in place with the objects created by this test
            (presumably so the fixture can clean them up - confirm).
    """
    namespace_list, pvc_objs, pod_objs, sc = test_fixture

    aws_obj = aws.AWS()

    # Get all the openshift-monitoring pods
    # (captured before any restart; the same objects are re-checked later)
    monitoring_pod_obj_list = pod.get_all_pods(
        namespace=defaults.OCS_MONITORING_NAMESPACE)

    # Get the worker node list
    workers = get_typed_nodes(node_type='worker')

    # Get all prometheus pods
    pod_obj_list = pod.get_all_pods(
        namespace=defaults.OCS_MONITORING_NAMESPACE, selector=['prometheus'])

    # NOTE(review): the extent of this loop body was reconstructed from a
    # whitespace-collapsed source - confirm that the health check below
    # really belongs after the loop and not inside it.
    for pod_obj in pod_obj_list:
        # Get the node where the prometheus pod is hosted
        prometheus_pod_obj = pod_obj.get()
        prometheus_node = prometheus_pod_obj['spec']['nodeName']

        # From here on ``prometheus_node`` is a list of the worker node
        # objects whose name matches the hosting node name
        prometheus_node = [
            node for node in workers if node.get().get('metadata').get('name') == prometheus_node
        ]

        # Make one of the node down where the prometheus pod is hosted
        instances = aws.get_instances_ids_and_names(prometheus_node)
        aws_obj.restart_ec2_instances(instances=instances, wait=True, force=True)

        # Validate all nodes are in READY state
        wait_for_nodes_status()

    # Check the node are Ready state and check cluster is health ok
    # (``instances`` here is the value left by the last loop iteration)
    sanity_helpers.health_check(nodes=list(instances.values()))

    # Check all the monitoring pods are up
    for pod_obj in monitoring_pod_obj_list:
        wait_for_resource_state(resource=pod_obj, state=constants.STATUS_RUNNING)

    # Check for the created pvc metrics after nodes restarting
    for pvc_obj in pvc_objs:
        assert check_pvcdata_collected_on_prometheus(pvc_obj.name), (
            f"On prometheus pod for created pvc {pvc_obj.name} related data is not collected"
        )

    # Create projects after restarting nodes
    namespaces = helpers.create_multilpe_projects(number_of_project=1)
    namespace_list.extend(namespaces)

    # Create pvcs after restarting nodes
    pvcs = [
        helpers.create_pvc(sc_name=sc.name, namespace=each_namespace.namespace) for each_namespace in namespaces
    ]
    for pvc_obj in pvcs:
        helpers.wait_for_resource_state(pvc_obj, constants.STATUS_BOUND)
        pvc_obj.reload()
    pvc_objs.extend(pvcs)

    # Create app pods after restarting nodes
    pods = [
        helpers.create_pod(interface_type=constants.CEPHBLOCKPOOL, pvc_name=each_pvc.name, namespace=each_pvc.namespace) for each_pvc in pvcs
    ]
    for pod_obj in pods:
        helpers.wait_for_resource_state(pod_obj, constants.STATUS_RUNNING)
        pod_obj.reload()
    pod_objs.extend(pods)

    # Check for the created pvc metrics on prometheus pod after restarting nodes
    for pvc_obj in pvcs:
        assert check_pvcdata_collected_on_prometheus(pvc_obj.name), (
            f"On prometheus pod for created pvc {pvc_obj.name} related data is not collected"
        )