def test_monitoring_after_rebooting_node_where_mgr_is_running(self):
    """
    Test case to validate that rebooting the node where the mgr pod is
    running does not delete the data collected on the prometheus pod
    """
    aws_obj = aws.AWS()

    # Get the mgr pod obj
    mgr_pod_obj = pod.get_mgr_pods()

    # Get the node where the mgr pod is hosted
    mgr_node_obj = pod.get_pod_node(mgr_pod_obj[0])

    # Reboot the node where the mgr pod is hosted
    instances = aws.get_instances_ids_and_names([mgr_node_obj])
    aws_obj.restart_ec2_instances(instances=instances, wait=True, force=True)

    # Validate all nodes are in READY state
    wait_for_nodes_status()

    # Check the nodes are in Ready state and the cluster health is OK
    self.sanity_helpers.health_check()

    # Check that the ceph health check metrics are updated with the new mgr pod
    wait_to_update_mgrpod_info_prometheus_pod()

    # Check for the created pvc metrics after rebooting the node where the mgr pod was running
    for pod_obj in self.pod_objs:
        assert check_pvcdata_collected_on_prometheus(pod_obj.pvc.name), (
            f"Prometheus did not collect data for created pvc {pod_obj.pvc.name}"
        )
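# The PVC-metrics assertion loop above recurs throughout these monitoring
# tests. A minimal sketch of it factored into a helper, assuming the same
# check_pvcdata_collected_on_prometheus() helper used above; the name
# assert_pvc_metrics_collected is hypothetical.
def assert_pvc_metrics_collected(pod_objs):
    """Assert that prometheus collected data for every pod's PVC."""
    for pod_obj in pod_objs:
        assert check_pvcdata_collected_on_prometheus(pod_obj.pvc.name), (
            f"Prometheus did not collect data for created pvc {pod_obj.pvc.name}"
        )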
def instances(request, aws_obj):
    """
    Get cluster instances

    Returns:
        dict: The ID keys and the name values of the instances

    """
    nodes = ocp.get_node_objs()
    ec2_instances = aws.get_instances_ids_and_names(nodes)

    def finalizer():
        """
        Make sure all instances are running
        """
        stopping_instances = {
            key: val for key, val in ec2_instances.items() if (
                aws_obj.get_instances_status_by_id(key) == constants.INSTANCE_STOPPING
            )
        }
        if stopping_instances:
            # Iterating the dict yields its keys, i.e. the instance IDs
            for stopping_instance_id in stopping_instances:
                instance = aws_obj.get_ec2_instance(stopping_instance_id)
                instance.wait_until_stopped()
        stopped_instances = {
            key: val for key, val in ec2_instances.items() if (
                aws_obj.get_instances_status_by_id(key) == constants.INSTANCE_STOPPED
            )
        }
        if stopped_instances:
            aws_obj.start_ec2_instances(instances=stopped_instances, wait=True)

    request.addfinalizer(finalizer)
    return ec2_instances
def get_ec2_instances(self, nodes):
    """
    Get the EC2 instances dicts

    Args:
        nodes (list): The OCS objects of the nodes

    Returns:
        dict: The EC2 instances dicts (IDs and names)

    """
    return aws.get_instances_ids_and_names(nodes)
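# A minimal usage sketch (inside a test method) for get_ec2_instances() above,
# assuming the {instance_id: instance_name} dict shape implied by how the
# return value is consumed elsewhere in these tests (IDs via [*instance][0],
# names via instances.values()).
workers = node.get_typed_nodes(num_of_nodes=2)
ec2_instances = self.get_ec2_instances(workers)
for instance_id, instance_name in ec2_instances.items():
    logger.info(f"Node {instance_name} is backed by ec2 instance {instance_id}")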
def ec2_instances(request, aws_obj):
    """
    Get cluster instances

    Returns:
        dict: The ID keys and the name values of the instances

    """
    # Get all cluster nodes objects
    nodes = node.get_node_objs()

    # Get the cluster nodes ec2 instances
    ec2_instances = aws.get_instances_ids_and_names(nodes)
    assert ec2_instances, (
        f"Failed to get ec2 instances for nodes {[n.name for n in nodes]}"
    )

    def finalizer():
        """
        Make sure all instances are running
        """
        # Getting the instances that are in status 'stopping' (if there are
        # any), to wait for them to get to status 'stopped' so it will be
        # possible to start them
        stopping_instances = {
            key: val for key, val in ec2_instances.items()
            if (aws_obj.get_instances_status_by_id(key) == constants.INSTANCE_STOPPING)
        }

        # Waiting for the instances that are in status 'stopping'
        # (if there are any) to reach 'stopped'
        if stopping_instances:
            # Iterating the dict yields its keys, i.e. the instance IDs
            for stopping_instance_id in stopping_instances:
                instance = aws_obj.get_ec2_instance(stopping_instance_id)
                instance.wait_until_stopped()
        stopped_instances = {
            key: val for key, val in ec2_instances.items()
            if (aws_obj.get_instances_status_by_id(key) == constants.INSTANCE_STOPPED)
        }

        # Start the instances
        if stopped_instances:
            aws_obj.start_ec2_instances(instances=stopped_instances, wait=True)

    request.addfinalizer(finalizer)
    return ec2_instances
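# A hedged sketch of the finalizer's recovery logic above (shared by both
# fixtures), factored into a standalone helper so it could be reused. It
# assumes the same aws_obj API used above (get_instances_status_by_id,
# get_ec2_instance, start_ec2_instances) and the constants.INSTANCE_STOPPING /
# INSTANCE_STOPPED values; the name ensure_instances_running is hypothetical.
def ensure_instances_running(aws_obj, ec2_instances):
    """Wait out 'stopping' instances, then start any 'stopped' ones."""
    for instance_id in ec2_instances:
        if aws_obj.get_instances_status_by_id(instance_id) == constants.INSTANCE_STOPPING:
            # Instances can't be started while 'stopping' - wait for 'stopped'
            aws_obj.get_ec2_instance(instance_id).wait_until_stopped()
    stopped_instances = {
        instance_id: name for instance_id, name in ec2_instances.items()
        if aws_obj.get_instances_status_by_id(instance_id) == constants.INSTANCE_STOPPED
    }
    if stopped_instances:
        aws_obj.start_ec2_instances(instances=stopped_instances, wait=True)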
def test_detach_attach_worker_volume(self, aws_obj, pvc_factory, pod_factory):
    """
    Detach and attach worker volume

    - Detach the data volume from one of the worker nodes
    - Validate cluster functionality, without checking cluster and Ceph
      health (as one node volume is detached, the cluster will be unhealthy)
      by creating resources and running IO
    - Attach back the volume to the node
    - Restart the node so the volume will get re-mounted

    """
    # Requesting 1 worker node for the test as this case includes detach and
    # attach of data volume of 1 worker node
    worker = node.get_typed_nodes(num_of_nodes=1)
    assert worker, "Failed to find a worker node for the test"
    worker = worker[0]

    # Get the worker node's ec2 instance ID and name
    instance = aws.get_instances_ids_and_names([worker])
    assert instance, f"Failed to get the ec2 instance for node {worker.name}"
    instance_id = [*instance][0]

    # Get the ec2 instance's data volume
    ec2_volume = aws.get_data_volumes(instance_id)[0]

    # Detach volume (logging is done inside the function)
    aws_obj.detach_volume(ec2_volume)

    # Validate cluster is still functional
    self.sanity_helpers.create_resources(pvc_factory, pod_factory)

    # Attach volume (logging is done inside the function)
    aws_obj.attach_volume(ec2_volume, instance_id)

    # Restart the instance so the volume will get re-mounted
    aws_obj.restart_ec2_instances(instances=instance, wait=True)

    # Cluster health check
    self.sanity_helpers.health_check()
def test_detach_attach_2_workers_volumes(self, aws_obj, pvc_factory, pod_factory):
    """
    Detach and attach disk from 2 worker nodes

    - Detach the data volume from 2 of the worker nodes
    - Attach back the volume to the worker nodes
    - Restart the nodes so the volume will get re-mounted in each node
    - Check cluster health and functionality to make sure detach,
      attach and restart did not affect the cluster

    """
    # Requesting 2 worker nodes for the test as this case includes
    # detach and attach of the data volumes of 2 worker nodes
    workers = node.get_typed_nodes(num_of_nodes=2)
    assert workers, "Failed to find worker nodes for the test"

    # Get the worker nodes ec2 instance IDs and names
    instances = aws.get_instances_ids_and_names(workers)
    assert instances, (
        f"Failed to get ec2 instances for nodes {[w.name for w in workers]}"
    )

    # Iterating the instances dict yields its keys, i.e. the instance IDs
    for instance_id in instances:
        # Get the ec2 instance's data volume
        ec2_volume = aws.get_data_volumes(instance_id)[0]

        # Detach volume (logging is done inside the function)
        aws_obj.detach_volume(ec2_volume)

        # Attach volume (logging is done inside the function)
        aws_obj.attach_volume(ec2_volume, instance_id)

    # Restart the instances so the volumes will get re-mounted
    aws_obj.restart_ec2_instances(instances=instances, wait=True)

    # Validate cluster is still functional
    self.sanity_helpers.health_check()
    self.sanity_helpers.create_resources(pvc_factory, pod_factory)
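# A hedged sketch of the per-instance detach/attach sequence above as a
# reusable helper, assuming the same aws helpers used in this test
# (get_data_volumes, detach_volume, attach_volume); the name
# recycle_data_volume is hypothetical. It covers only the back-to-back
# detach/attach flow of this test, not the detach-validate-attach flow of
# test_detach_attach_worker_volume.
def recycle_data_volume(aws_obj, instance_id):
    """Detach the first data volume of an instance and attach it back."""
    ec2_volume = aws.get_data_volumes(instance_id)[0]
    aws_obj.detach_volume(ec2_volume)
    aws_obj.attach_volume(ec2_volume, instance_id)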
def test_node_maintenance_restart_activate(
    self, ec2_instances, aws_obj, pvc_factory, pod_factory, node_type
):
    """
    OCS-1292/OCS-1293:
    - Maintenance (mark as unschedulable and drain) 1 worker/master node
    - Restart the node's ec2 instance
    - Mark the node as schedulable
    - Check cluster and Ceph health
    - Check cluster functionality by creating and deleting resources
      (pools, storageclasses, PVCs, pods - both CephFS and RBD)

    """
    # Get 1 node
    typed_node = node.get_typed_nodes(node_type=node_type, num_of_nodes=1)
    assert typed_node, f"Failed to find a {node_type} node for the test"
    typed_node_name = typed_node[0].name

    # Maintenance the node (unschedule and drain). The function contains logging
    node.drain_nodes([typed_node_name])

    instance = aws.get_instances_ids_and_names(typed_node)
    assert instance, f"Failed to get the ec2 instance for node {typed_node_name}"

    # Restarting the ec2 instance
    aws_obj.restart_ec2_instances(instances=instance, wait=True)

    node.wait_for_nodes_status(
        node_names=[typed_node_name],
        status=constants.NODE_READY_SCHEDULING_DISABLED
    )

    # Mark the node back to schedulable
    node.schedule_nodes([typed_node_name])

    # Check cluster and Ceph health, and check basic cluster functionality by
    # creating resources (pools, storageclasses, PVCs, pods - both CephFS and
    # RBD), running IO and deleting the resources
    self.sanity_helpers.health_check()
    self.sanity_helpers.create_resources(pvc_factory, pod_factory)
    self.sanity_helpers.delete_resources()
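# A minimal sketch of the maintenance-restart-activate sequence above as one
# helper, assuming the node and aws helpers used in this test (drain_nodes,
# get_instances_ids_and_names, restart_ec2_instances, wait_for_nodes_status,
# schedule_nodes); the name maintenance_restart_activate is hypothetical.
def maintenance_restart_activate(aws_obj, typed_node):
    """Drain a node, restart its ec2 instance, then mark it schedulable again."""
    node.drain_nodes([typed_node.name])
    instance = aws.get_instances_ids_and_names([typed_node])
    aws_obj.restart_ec2_instances(instances=instance, wait=True)
    node.wait_for_nodes_status(
        node_names=[typed_node.name],
        status=constants.NODE_READY_SCHEDULING_DISABLED
    )
    node.schedule_nodes([typed_node.name])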
def test_detach_attach_worker_volume(self, aws_obj, resources):
    """
    Detach and attach worker volume

    - Detach the data volume from one of the worker nodes
    - Validate cluster functionality, without checking cluster and Ceph
      health (as one node volume is detached, the cluster will be unhealthy)
      by creating resources and running IO
    - Attach back the volume to the node
    - Restart the node so the volume will get re-mounted

    """
    # Requesting 1 worker node for the test as this case includes detach and
    # attach of data volume of 1 worker node
    worker = node.get_typed_nodes(num_of_nodes=1)[0]

    # Get the worker node's ec2 instance ID and name
    instance = aws.get_instances_ids_and_names([worker])
    instance_id = [*instance][0]

    # Get the ec2 instance's data volume
    ec2_volume = aws.get_data_volumes(instance_id)[0]

    # Detach volume (logging is done inside the function)
    aws_obj.detach_volume(ec2_volume)

    # Validate cluster is still functional
    self.validate_cluster(
        resources=resources, nodes=list(instance.values()), health_check=False
    )

    # Attach volume (logging is done inside the function)
    aws_obj.attach_volume(ec2_volume, instance_id)

    # Restart the instance so the volume will get re-mounted
    aws_obj.restart_ec2_instances(instances=instance, wait=True)

    # Cluster health check
    self.health_check()
def test_monitoring_after_rebooting_master_node(self, pod_factory):
    """
    Test case to validate rebooting the master nodes and their
    interaction with the prometheus pods
    """
    aws_obj = aws.AWS()

    # Get the master node list
    master_nodes = get_typed_nodes(node_type='master')

    # Reboot the master nodes one after the other
    for master_node in master_nodes:
        instances = aws.get_instances_ids_and_names([master_node])
        aws_obj.restart_ec2_instances(instances=instances, wait=True, force=True)

        # Validate all nodes are in READY state
        wait_for_master_node_to_be_running_state()

    # Check the nodes are in Ready state and the cluster health is OK
    self.sanity_helpers.health_check()

    # Check for the created pvc metrics after rebooting the master nodes
    for pod_obj in self.pod_objs:
        assert check_pvcdata_collected_on_prometheus(pod_obj.pvc.name), (
            f"Prometheus did not collect data for created pvc {pod_obj.pvc.name}"
        )

    pod_obj = pod_factory(
        interface=constants.CEPHBLOCKPOOL, status=constants.STATUS_RUNNING
    )
    self.pod_objs.extend([pod_obj])

    # Check for the newly created pvc metrics on the prometheus pod
    assert check_pvcdata_collected_on_prometheus(pod_obj.pvc.name), (
        f"Prometheus did not collect data for created pvc {pod_obj.pvc.name}"
    )
def test_detach_attach_2_workers_volumes(self, aws_obj, resources):
    """
    Detach and attach disk from 2 worker nodes

    - Detach the data volume from 2 of the worker nodes
    - Attach back the volume to the worker nodes
    - Restart the nodes so the volume will get re-mounted in each node
    - Check cluster health and functionality to make sure detach,
      attach and restart did not affect the cluster

    """
    # Requesting 2 worker nodes for the test as this case includes
    # detach and attach of the data volumes of 2 worker nodes
    workers = node.get_typed_nodes(num_of_nodes=2)

    # Get the worker nodes ec2 instance IDs and names
    instances = aws.get_instances_ids_and_names(workers)

    # Iterating the instances dict yields its keys, i.e. the instance IDs
    for instance_id in instances:
        # Get the ec2 instance's data volume
        ec2_volume = aws.get_data_volumes(instance_id)[0]

        # Detach volume (logging is done inside the function)
        aws_obj.detach_volume(ec2_volume)

        # Attach volume (logging is done inside the function)
        aws_obj.attach_volume(ec2_volume, instance_id)

    # Restart the instances so the volumes will get re-mounted
    aws_obj.restart_ec2_instances(instances=instances, wait=True)

    # Validate cluster is still functional
    self.validate_cluster(resources=resources, nodes=list(instances.values()))
def test_monitoring_when_one_of_the_prometheus_node_down(self, test_fixture):
    """
    Test case to validate that monitoring is unaffected when a node
    hosting a prometheus pod goes down
    """
    namespace_list, pvc_objs, pod_objs, sc = test_fixture
    aws_obj = aws.AWS()

    # Get all the openshift-monitoring pods
    monitoring_pod_obj_list = pod.get_all_pods(
        namespace=defaults.OCS_MONITORING_NAMESPACE
    )

    # Get the worker node list
    workers = get_typed_nodes(node_type='worker')

    # Get all prometheus pods
    pod_obj_list = pod.get_all_pods(
        namespace=defaults.OCS_MONITORING_NAMESPACE, selector=['prometheus']
    )

    for pod_obj in pod_obj_list:
        # Get the node where the prometheus pod is hosted
        prometheus_pod_obj = pod_obj.get()
        prometheus_node_name = prometheus_pod_obj['spec']['nodeName']
        prometheus_node = [
            worker for worker in workers
            if worker.get().get('metadata').get('name') == prometheus_node_name
        ]

        # Take down the node where the prometheus pod is hosted
        instances = aws.get_instances_ids_and_names(prometheus_node)
        aws_obj.restart_ec2_instances(instances=instances, wait=True, force=True)

        # Validate all nodes are in READY state
        wait_for_nodes_status()

    # Check the nodes are in Ready state and the cluster health is OK
    self.sanity_helpers.health_check()

    # Check all the monitoring pods are up
    for pod_obj in monitoring_pod_obj_list:
        wait_for_resource_state(resource=pod_obj, state=constants.STATUS_RUNNING)

    # Check for the created pvc metrics after restarting the nodes
    for pvc_obj in pvc_objs:
        assert check_pvcdata_collected_on_prometheus(pvc_obj.name), (
            f"Prometheus did not collect data for created pvc {pvc_obj.name}"
        )

    # Create projects after restarting the nodes
    namespaces = helpers.create_multilpe_projects(number_of_project=1)
    namespace_list.extend(namespaces)

    # Create pvcs after restarting the nodes
    pvcs = [
        helpers.create_pvc(sc_name=sc.name, namespace=each_namespace.namespace)
        for each_namespace in namespaces
    ]
    for pvc_obj in pvcs:
        helpers.wait_for_resource_state(pvc_obj, constants.STATUS_BOUND)
        pvc_obj.reload()
    pvc_objs.extend(pvcs)

    # Create app pods after restarting the nodes
    pods = [
        helpers.create_pod(
            interface_type=constants.CEPHBLOCKPOOL,
            pvc_name=each_pvc.name,
            namespace=each_pvc.namespace
        )
        for each_pvc in pvcs
    ]
    for pod_obj in pods:
        helpers.wait_for_resource_state(pod_obj, constants.STATUS_RUNNING)
        pod_obj.reload()
    pod_objs.extend(pods)

    # Check for the created pvc metrics on the prometheus pod after restarting the nodes
    for pvc_obj in pvcs:
        assert check_pvcdata_collected_on_prometheus(pvc_obj.name), (
            f"Prometheus did not collect data for created pvc {pvc_obj.name}"
        )
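# A hedged sketch of the pod-to-hosting-node lookup used above, assuming the
# same OCP object API (pod_obj.get() returning the pod dict, node objects
# exposing .get()['metadata']['name']); the name get_hosting_nodes is
# hypothetical.
def get_hosting_nodes(pod_obj, candidate_nodes):
    """Return the node objects (usually one) hosting the given pod."""
    node_name = pod_obj.get()['spec']['nodeName']
    return [n for n in candidate_nodes if n.get()['metadata']['name'] == node_name]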
def test_pv_provisioning_under_degraded_state(
    self, resources, instances, aws_obj, interface, operation
):
    """
    Test PV provisioning under degraded state

    OCS-1138:
    - Stop 1 ec2 instance worker node that has the RBD provisioner
      pod running on it
    - Wait for the RBD pod provisioner to come up again to running status
    - Validate cluster functionality, without checking cluster and Ceph
      health, by creating resources and running IO
    - Start the worker node ec2 instance
    - Check cluster and Ceph health

    OCS-1241:
    - Stop 1 ec2 instance worker node that has the RBD provisioner
      pod running on it
    - Wait for the RBD pod provisioner to come up again to running status
    - Validate cluster functionality, without checking cluster and Ceph
      health, by deleting resources and running IO
    - Start the worker node ec2 instance
    - Check cluster and Ceph health

    OCS-1139:
    - Stop 1 ec2 instance worker node that has the CephFS provisioner
      pod running on it
    - Wait for the CephFS pod provisioner to come up again to running status
    - Validate cluster functionality, without checking cluster and Ceph
      health, by creating resources and running IO
    - Start the worker node ec2 instance
    - Check cluster and Ceph health

    OCS-1242:
    - Stop 1 ec2 instance worker node that has the CephFS provisioner
      pod running on it
    - Wait for the CephFS pod provisioner to come up again to running status
    - Validate cluster functionality, without checking cluster and Ceph
      health, by deleting resources and running IO
    - Start the worker node ec2 instance
    - Check cluster and Ceph health

    """
    if operation == 'delete_resources':
        # Create the resources whose deletion will be tested later
        sanity_helpers.create_resources(resources=resources)

    provisioner_pod = None

    # Get the provisioner pod according to the interface
    if interface == 'rbd':
        provisioner_pod = pod.get_rbdfsplugin_provisioner_pods()[0]
    elif interface == 'cephfs':
        provisioner_pod = pod.get_cephfsplugin_provisioner_pods()[0]
    provisioner_pod_name = provisioner_pod.name
    logger.info(f"{interface} provisioner pod found: {provisioner_pod_name}")

    # Get the name of the node that the provisioner pod is running on
    provisioner_node = pod.get_pod_node(provisioner_pod)
    provisioner_node_name = provisioner_node.get().get('metadata').get('name')
    logger.info(
        f"{interface} provisioner pod is running on node {provisioner_node_name}"
    )

    # Get the ec2 instance of the node
    instances = aws.get_instances_ids_and_names([provisioner_node])

    # Stop the node's ec2 instance
    aws_obj.stop_ec2_instances(instances=instances, wait=True)

    # The selector of the provisioner pods, according to the interface
    selector = constants.CSI_RBDPLUGIN_PROVISIONER_LABEL if (
        interface == 'rbd'
    ) else constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL

    # Wait for the provisioner pod to reach Terminating status
    logger.info(
        f"Waiting for pod {provisioner_pod_name} to reach status Terminating"
    )
    assert provisioner_pod.ocp.wait_for_resource(
        timeout=300, resource_name=provisioner_pod.name,
        condition=constants.STATUS_TERMINATING
    ), f"{interface} provisioner pod failed to reach status Terminating"

    # Wait for a new provisioner pod to be started and reach Running status
    logger.info(f"Waiting for pod {provisioner_pod_name} to reach status Running")
    assert provisioner_pod.ocp.wait_for_resource(
        timeout=600, condition=constants.STATUS_RUNNING, selector=selector,
        resource_count=1
    ), f"{interface} provisioner pod failed to reach status Running"

    if operation == 'create_resources':
        # Cluster validation (resources creation and IO running)
        sanity_helpers.create_resources(resources=resources)
    elif operation == 'delete_resources':
        # Cluster validation (resources deletion and IO running)
        sanity_helpers.delete_resources(resources=resources)

    # Start the node's ec2 instance
    aws_obj.start_ec2_instances(instances=instances, wait=True)

    # Check cluster and Ceph health
    sanity_helpers.health_check(nodes=[provisioner_node_name])
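# A hedged sketch unifying the two interface dispatches above (the if/elif for
# the provisioner pod getter and the conditional expression for the selector)
# into one lookup table; the PROVISIONER_DISPATCH name is hypothetical, while
# the pod helpers and constants are the ones used in the test.
PROVISIONER_DISPATCH = {
    'rbd': (
        pod.get_rbdfsplugin_provisioner_pods,
        constants.CSI_RBDPLUGIN_PROVISIONER_LABEL,
    ),
    'cephfs': (
        pod.get_cephfsplugin_provisioner_pods,
        constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL,
    ),
}

# Example usage with interface = 'rbd'
get_provisioner_pods, selector = PROVISIONER_DISPATCH['rbd']
provisioner_pod = get_provisioner_pods()[0]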