def test_monitoring_after_rebooting_node_where_mgr_is_running(self):
        """
        Test case to validate that rebooting the node where the mgr is
        running does not delete the data collected on the prometheus pod
        """

        aws_obj = aws.AWS()

        # Get the mgr pod obj
        mgr_pod_obj = pod.get_mgr_pods()

        # Get the node where the mgr pod is hosted
        mgr_node_obj = pod.get_pod_node(mgr_pod_obj[0])

        # Reboot the node where the mgr pod is hosted
        instances = aws.get_instances_ids_and_names([mgr_node_obj])
        aws_obj.restart_ec2_instances(instances=instances,
                                      wait=True,
                                      force=True)

        # Validate all nodes are in READY state
        wait_for_nodes_status()

        # Check the nodes are in Ready state and the cluster health is OK
        self.sanity_helpers.health_check()

        # Check that the ceph health check metrics are updated with the new mgr pod
        wait_to_update_mgrpod_info_prometheus_pod()

        # Check the created PVC metrics after rebooting the node where the mgr pod was running
        for pod_obj in self.pod_objs:
            assert check_pvcdata_collected_on_prometheus(pod_obj.pvc.name), (
                f"On prometheus pod for created pvc {pod_obj.pvc.name} related data is not collected"
            )
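check_pvcdata_collected_on_prometheus above is an ocs-ci helper. As a rough illustration of the underlying idea only, here is a minimal, hypothetical sketch of querying Prometheus's HTTP API for kubelet volume metrics of a PVC; prom_url, token, and the exact metric/query are assumptions, not ocs-ci's implementation:

import requests

def pvc_data_on_prometheus(prom_url, token, pvc_name):
    """Return True if Prometheus holds kubelet volume stats for the PVC."""
    # 'kubelet_volume_stats_used_bytes' is a standard kubelet metric;
    # the query ocs-ci actually issues may differ
    query = f'kubelet_volume_stats_used_bytes{{persistentvolumeclaim="{pvc_name}"}}'
    resp = requests.get(
        f"{prom_url}/api/v1/query",
        params={'query': query},
        headers={'Authorization': f'Bearer {token}'},
        verify=False,  # test-cluster routes often use self-signed certs
    )
    resp.raise_for_status()
    return bool(resp.json()['data']['result'])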
Example #2
def instances(request, aws_obj):
    """
    Get cluster instances

    Returns:
        dict: The ID keys and the name values of the instances

    """
    nodes = ocp.get_node_objs()
    ec2_instances = aws.get_instances_ids_and_names(nodes)

    def finalizer():
        """
        Make sure all instances are running
        """
        stopping_instances = {
            key: val for key, val in ec2_instances.items() if (
                aws_obj.get_instances_status_by_id(key) == constants.INSTANCE_STOPPING
            )
        }
        if stopping_instances:
            for instance_id in stopping_instances:
                # iterating the dict yields instance IDs (its keys)
                instance = aws_obj.get_ec2_instance(instance_id)
                instance.wait_until_stopped()
        stopped_instances = {
            key: val for key, val in ec2_instances.items() if (
                aws_obj.get_instances_status_by_id(key) == constants.INSTANCE_STOPPED
            )
        }
        if stopped_instances:
            aws_obj.start_ec2_instances(instances=stopped_instances, wait=True)

    request.addfinalizer(finalizer)

    return ec2_instances
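The finalizer above encodes a small state machine: instances in 'stopping' must first reach 'stopped' before they can be started again. A minimal boto3 sketch of the same flow, assuming plain instance IDs and a hypothetical region:

import boto3

def ensure_instances_running(instance_ids, region_name='us-east-2'):
    """Bring every instance back to 'running' (same flow as the finalizer)."""
    ec2 = boto3.resource('ec2', region_name=region_name)
    for instance_id in instance_ids:
        instance = ec2.Instance(instance_id)
        state = instance.state['Name']
        if state == 'stopping':
            # an instance cannot be started while it is still stopping
            instance.wait_until_stopped()
            state = 'stopped'
        if state == 'stopped':
            instance.start()
            instance.wait_until_running()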
Example #3
    def get_ec2_instances(self, nodes):
        """
        Get the EC2 instances dicts

        Args:
            nodes (list): The OCS objects of the nodes

        Returns:
            dict: The EC2 instances dicts (IDs and names)

        """
        return aws.get_instances_ids_and_names(nodes)
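For reference, a rough boto3 equivalent of what a helper like aws.get_instances_ids_and_names could do, assuming OCP node names match the instances' private DNS names; the real ocs-ci helper takes node objects and may resolve instances differently:

import boto3

def instances_ids_and_names(node_names, region_name='us-east-2'):
    """Map EC2 instance IDs to their 'Name' tags for the given nodes."""
    ec2 = boto3.resource('ec2', region_name=region_name)
    matching = ec2.instances.filter(
        Filters=[{'Name': 'private-dns-name', 'Values': list(node_names)}]
    )
    return {
        instance.id: next(
            (tag['Value'] for tag in instance.tags or [] if tag['Key'] == 'Name'),
            '',
        )
        for instance in matching
    }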
Example #4
def ec2_instances(request, aws_obj):
    """
    Get cluster instances

    Returns:
        dict: The ID keys and the name values of the instances

    """
    # Get all cluster node objects
    nodes = node.get_node_objs()

    # Get the cluster nodes ec2 instances
    ec2_instances = aws.get_instances_ids_and_names(nodes)
    assert ec2_instances, f"Failed to get ec2 instances for nodes {[n.name for n in nodes]}"

    def finalizer():
        """
        Make sure all instances are running
        """
        # Get the instances in 'stopping' status (if any); they must reach
        # 'stopped' status before it is possible to start them
        stopping_instances = {
            key: val
            for key, val in ec2_instances.items()
            if (aws_obj.get_instances_status_by_id(key) ==
                constants.INSTANCE_STOPPING)
        }

        # Waiting for the instances that are in status 'stopping'
        # (if there are any) to reach 'stopped'
        if stopping_instances:
            for instance_id in stopping_instances:
                # iterating the dict yields instance IDs (its keys)
                instance = aws_obj.get_ec2_instance(instance_id)
                instance.wait_until_stopped()
        stopped_instances = {
            key: val
            for key, val in ec2_instances.items()
            if (aws_obj.get_instances_status_by_id(key) ==
                constants.INSTANCE_STOPPED)
        }

        # Start the instances
        if stopped_instances:
            aws_obj.start_ec2_instances(instances=stopped_instances, wait=True)

    request.addfinalizer(finalizer)

    return ec2_instances
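As a usage note, a hypothetical test consuming this fixture only needs to request it; the finalizer guarantees that any instance the test leaves stopped is started again on teardown:

def test_restart_all_nodes(ec2_instances, aws_obj):
    # ec2_instances is {instance_id: node_name}; restart them all and let
    # the fixture's finalizer recover anything left in a stopped state
    aws_obj.restart_ec2_instances(instances=ec2_instances, wait=True)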
Example #5
    def test_detach_attach_worker_volume(self, aws_obj, pvc_factory, pod_factory):
        """
        Detach and attach worker volume

        - Detach the data volume from one of the worker nodes
        - Validate cluster functionality, without checking cluster and Ceph
          health (as one node volume is detached, the cluster will be unhealthy)
          by creating resources and running IO
        - Attach back the volume to the node
        - Restart the node so the volume will get re-mounted

        """
        # Requesting 1 worker node for the test as this case includes detach and
        # attach of data volume of 1 worker node
        worker = node.get_typed_nodes(num_of_nodes=1)
        assert worker, "Failed to find a worker node for the test"
        worker = worker[0]

        # Get the worker node's ec2 instance ID and name
        instance = aws.get_instances_ids_and_names([worker])
        assert instance, f"Failed to get ec2 instances for node {worker.name}"

        instance_id = next(iter(instance))  # the dict's only key is the instance ID

        # Get the ec2 instance's data volume (a Volume object)
        ec2_volume = aws.get_data_volumes(instance_id)[0]

        # Detach volume (logging is done inside the function)
        aws_obj.detach_volume(ec2_volume)

        # Validate cluster is still functional
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)

        # Attach volume (logging is done inside the function)
        aws_obj.attach_volume(ec2_volume, instance_id)

        # Restart the instance so the volume will get re-mounted
        aws_obj.restart_ec2_instances(instances=instance, wait=True)

        # Cluster health check
        self.sanity_helpers.health_check()
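detach_volume and attach_volume are ocs-ci wrappers. A minimal boto3 sketch of the same detach/re-attach cycle, with a hypothetical device name and region:

import boto3

def cycle_data_volume(volume_id, instance_id, device='/dev/sdb',
                      region_name='us-east-2'):
    """Detach a data volume from an instance, then attach it back."""
    ec2 = boto3.resource('ec2', region_name=region_name)
    volume = ec2.Volume(volume_id)
    volume.detach_from_instance(InstanceId=instance_id)
    # the volume must be 'available' before it can be re-attached
    ec2.meta.client.get_waiter('volume_available').wait(VolumeIds=[volume_id])
    volume.attach_to_instance(InstanceId=instance_id, Device=device)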
Example #6
    def test_detach_attach_2_workers_volumes(self, aws_obj, pvc_factory, pod_factory):
        """
        Detach and attach disk from 2 worker nodes

        - Detach the data volume from 2 of the worker nodes
        - Attach back the volume to the worker nodes
        - Restart the nodes so the volume will get re-mounted in each node
        - Check cluster health and functionality to make sure detach,
          attach and restart did not affect the cluster

        """
        # Requesting 2 worker nodes for the test as this case includes
        # detach and attach of the data volumes of 2 worker nodes
        workers = node.get_typed_nodes(num_of_nodes=2)
        assert workers, "Failed to find worker nodes for the test"

        # Get the worker nodes ec2 instance IDs and names
        instances = aws.get_instances_ids_and_names(workers)
        assert instances, (
            f"Failed to get ec2 instances for node {[w.name for w in workers]}"
        )

        for instance_id in instances:

            # Get the ec2 instance's data volume (a Volume object)
            ec2_volume = aws.get_data_volumes(instance_id)[0]

            # Detach volume (logging is done inside the function)
            aws_obj.detach_volume(ec2_volume)

            # Attach volume (logging is done inside the function)
            aws_obj.attach_volume(ec2_volume, instance_id)

        # Restart the instances so the volume will get re-mounted
        aws_obj.restart_ec2_instances(instances=instances, wait=True)

        # Validate cluster is still functional
        self.sanity_helpers.health_check()
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
Example #7
    def test_node_maintenance_restart_activate(
        self, ec2_instances, aws_obj, pvc_factory, pod_factory, node_type
    ):
        """
        OCS-1292/OCS-1293:
        - Maintenance (mark as unschedulable and drain) 1 worker/master node
        - Restart the node's ec2 instance
        - Mark the node as schedulable
        - Check cluster and Ceph health
        - Check cluster functionality by creating and deleting resources
          (pools, storageclasses, PVCs, pods - both CephFS and RBD)

        """
        # Get 1 node
        typed_node = node.get_typed_nodes(node_type=node_type, num_of_nodes=1)
        assert typed_node, f"Failed to find a {node_type} node for the test"
        typed_node_name = typed_node[0].name

        # Put the node into maintenance (mark unschedulable and drain). The function contains logging
        node.drain_nodes([typed_node_name])

        instance = aws.get_instances_ids_and_names(typed_node)
        assert instance, f"Failed to get ec2 instances for node {typed_node_name}"

        # Restarting ec2 instance
        aws_obj.restart_ec2_instances(instances=instance, wait=True)

        node.wait_for_nodes_status(
            node_names=[typed_node_name], status=constants.NODE_READY_SCHEDULING_DISABLED
        )
        # Mark the node back to schedulable
        node.schedule_nodes([typed_node_name])

        # Check cluster and Ceph health, then check basic cluster
        # functionality by creating resources (pools, storageclasses,
        # PVCs, pods - both CephFS and RBD), running IO and deleting the resources
        self.sanity_helpers.health_check()
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()
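node.drain_nodes() and node.schedule_nodes() are ocs-ci helpers; conceptually they wrap the standard oc commands, roughly as in this sketch (exact flags may vary by oc version):

import subprocess

def drain_node(node_name):
    """Cordon and drain a node before maintenance."""
    subprocess.run(
        ['oc', 'adm', 'drain', node_name,
         '--ignore-daemonsets', '--delete-emptydir-data', '--force'],
        check=True,
    )

def schedule_node(node_name):
    """Mark the node schedulable again after maintenance."""
    subprocess.run(['oc', 'adm', 'uncordon', node_name], check=True)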
Example #8
    def test_detach_attach_worker_volume(self, aws_obj, resources):
        """
        Detach and attach worker volume

        - Detach the data volume from one of the worker nodes
        - Validate cluster functionality, without checking cluster and Ceph
          health (as one node volume is detached, the cluster will be unhealthy)
          by creating resources and running IO
        - Attach back the volume to the node
        - Restart the node so the volume will get re-mounted

        """
        # Requesting 1 worker node for the test as this case includes detach and
        # attach of data volume of 1 worker node
        worker = node.get_typed_nodes(num_of_nodes=1)[0]

        # Get the worker node's ec2 instance ID and name
        instance = aws.get_instances_ids_and_names([worker])
        instance_id = next(iter(instance))  # the dict's only key is the instance ID

        # Get the ec2 instance's data volume (a Volume object)
        ec2_volume = aws.get_data_volumes(instance_id)[0]

        # Detach volume (logging is done inside the function)
        aws_obj.detach_volume(ec2_volume)

        # Validate cluster is still functional
        self.validate_cluster(resources=resources,
                              nodes=list(instance.values()),
                              health_check=False)

        # Attach volume (logging is done inside the function)
        aws_obj.attach_volume(ec2_volume, instance_id)

        # Restart the instance so the volume will get re-mounted
        aws_obj.restart_ec2_instances(instances=instance, wait=True)

        # Cluster health check
        self.health_check()
Example #9
    def test_monitoring_after_rebooting_master_node(self, pod_factory):
        """
        Test case to validate rebooting the master nodes and their
        interaction with the prometheus pods
        """
        aws_obj = aws.AWS()

        # Get the master node list
        master_nodes = get_typed_nodes(node_type='master')

        # Reboot the master nodes one after another
        for node in master_nodes:
            instances = aws.get_instances_ids_and_names([node])
            aws_obj.restart_ec2_instances(instances=instances,
                                          wait=True,
                                          force=True)

            # Wait for the master node to get back to a running state
            wait_for_master_node_to_be_running_state()

        # Check the nodes are in Ready state and the cluster health is OK
        self.sanity_helpers.health_check()

        # Check for the created pvc metrics after rebooting the master nodes
        for pod_obj in self.pod_objs:
            assert check_pvcdata_collected_on_prometheus(pod_obj.pvc.name), (
                f"On prometheus pod for created pvc {pod_obj.pvc.name} related data is not collected"
            )

        pod_obj = pod_factory(interface=constants.CEPHBLOCKPOOL,
                              status=constants.STATUS_RUNNING)
        self.pod_objs.append(pod_obj)

        # Check for the new created pvc metrics on prometheus pod
        assert check_pvcdata_collected_on_prometheus(pod_obj.pvc.name), (
            f"On prometheus pod for created pvc {pod_obj.pvc.name} related data is not collected"
        )
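restart_ec2_instances(..., force=True) performs an ungraceful restart. A minimal boto3 sketch of that behavior, with a hypothetical region, is a force-stop followed by a start:

import boto3

def force_restart_instances(instance_ids, region_name='us-east-2'):
    """Hard-restart instances: force-stop, wait, then start again."""
    ec2 = boto3.resource('ec2', region_name=region_name)
    for instance_id in instance_ids:
        instance = ec2.Instance(instance_id)
        instance.stop(Force=True)  # ungraceful, like pulling the power
        instance.wait_until_stopped()
        instance.start()
        instance.wait_until_running()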
Example #10
    def test_detach_attach_2_workers_volumes(self, aws_obj, resources):
        """
        Detach and attach disk from 2 worker nodes

        - Detach the data volume from 2 of the worker nodes
        - Attach back the volume to the worker nodes
        - Restart the nodes so the volume will get re-mounted in each node
        - Check cluster health and functionality to make sure detach,
          attach and restart did not affect the cluster

        """
        # Requesting 2 worker nodes for the test as this case includes
        # detach and attach of the data volumes of 2 worker nodes
        workers = node.get_typed_nodes(num_of_nodes=2)

        # Get the worker nodes ec2 instance IDs and names
        instances = aws.get_instances_ids_and_names(workers)

        for instance_id in instances:

            # Get the ec2 instance's data volume (a Volume object)
            ec2_volume = aws.get_data_volumes(instance_id)[0]

            # Detach volume (logging is done inside the function)
            aws_obj.detach_volume(ec2_volume)

            # Attach volume (logging is done inside the function)
            aws_obj.attach_volume(ec2_volume, instance_id)

        # Restart the instances so the volume will get re-mounted
        aws_obj.restart_ec2_instances(instances=instances, wait=True)

        # Validate cluster is still functional
        self.validate_cluster(resources=resources,
                              nodes=list(instances.values()))
Example #11
    def test_monitoring_when_one_of_the_prometheus_node_down(
            self, test_fixture):
        """
        Test case to validate the interaction with prometheus
        when a node hosting a prometheus pod goes down
        """
        namespace_list, pvc_objs, pod_objs, sc = test_fixture

        aws_obj = aws.AWS()

        # Get all the openshift-monitoring pods
        monitoring_pod_obj_list = pod.get_all_pods(
            namespace=defaults.OCS_MONITORING_NAMESPACE)

        # Get the worker node list
        workers = get_typed_nodes(node_type='worker')

        # Get all prometheus pods
        pod_obj_list = pod.get_all_pods(
            namespace=defaults.OCS_MONITORING_NAMESPACE,
            selector=['prometheus'])

        for pod_obj in pod_obj_list:

            # Get the node where the prometheus pod is hosted
            prometheus_pod_obj = pod_obj.get()
            prometheus_node_name = prometheus_pod_obj['spec']['nodeName']

            # Find the worker node object that matches the pod's node name
            prometheus_node = [
                node for node in workers
                if node.get().get('metadata').get('name') == prometheus_node_name
            ]

            # Bring down the node where the prometheus pod is hosted
            instances = aws.get_instances_ids_and_names(prometheus_node)
            aws_obj.restart_ec2_instances(instances=instances,
                                          wait=True,
                                          force=True)

            # Validate all nodes are in READY state
            wait_for_nodes_status()

        # Check the nodes are in Ready state and the cluster health is OK
        self.sanity_helpers.health_check()

        # Check all the monitoring pods are up
        for pod_obj in monitoring_pod_obj_list:
            wait_for_resource_state(resource=pod_obj,
                                    state=constants.STATUS_RUNNING)

        # Check for the created pvc metrics after nodes restarting
        for pvc_obj in pvc_objs:
            assert check_pvcdata_collected_on_prometheus(pvc_obj.name), (
                f"On prometheus pod for created pvc {pvc_obj.name} related data is not collected"
            )

        # Create projects after restarting nodes
        namespaces = helpers.create_multilpe_projects(number_of_project=1)
        namespace_list.extend(namespaces)

        # Create pvcs after restarting nodes
        pvcs = [
            helpers.create_pvc(sc_name=sc.name,
                               namespace=each_namespace.namespace)
            for each_namespace in namespaces
        ]
        for pvc_obj in pvcs:
            helpers.wait_for_resource_state(pvc_obj, constants.STATUS_BOUND)
            pvc_obj.reload()
        pvc_objs.extend(pvcs)

        # Create app pods after restarting nodes
        pods = [
            helpers.create_pod(interface_type=constants.CEPHBLOCKPOOL,
                               pvc_name=each_pvc.name,
                               namespace=each_pvc.namespace)
            for each_pvc in pvcs
        ]
        for pod_obj in pods:
            helpers.wait_for_resource_state(pod_obj, constants.STATUS_RUNNING)
            pod_obj.reload()
        pod_objs.extend(pods)

        # Check for the created pvc metrics on prometheus pod after restarting nodes
        for pvc_obj in pvcs:
            assert check_pvcdata_collected_on_prometheus(pvc_obj.name), (
                f"On prometheus pod for created pvc {pvc_obj.name} related data is not collected"
            )
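The node lookup above reads spec.nodeName from the pod resource. Outside ocs-ci, the same lookup with the plain kubernetes client would look like this sketch (the namespace default is an assumption):

from kubernetes import client, config

def node_hosting_pod(pod_name, namespace='openshift-monitoring'):
    """Return the name of the node a pod is scheduled on."""
    config.load_kube_config()  # assumes a reachable kubeconfig
    v1 = client.CoreV1Api()
    pod = v1.read_namespaced_pod(name=pod_name, namespace=namespace)
    return pod.spec.node_name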
Example #12
    def test_pv_provisioning_under_degraded_state(self, resources, instances,
                                                  aws_obj, interface,
                                                  operation):
        """
        Test PV provisioning under degraded state

        OCS-1138:
        - Stop 1 ec2 instance worker node that has the RBD provisioner
          pod running on
        - Wait for the RBD pod provisioner to come up again to running status
        - Validate cluster functionality, without checking cluster and Ceph
          health by creating resources and running IO
        - Start the worker node ec2 instance
        - Check cluster and Ceph health

        OCS-1241:
        - Stop 1 ec2 instance worker node that has the RBD provisioner
          pod running on
        - Wait for the RBD pod provisioner to come up again to running status
        - Validate cluster functionality, without checking cluster and Ceph
          health by deleting resources and running IO
        - Start the worker node ec2 instance
        - Check cluster and Ceph health

        OCS-1139:
        - Stop 1 ec2 instance worker node that has the CephFS provisioner
          pod running on
        - Wait for the CephFS pod provisioner to come up again to running status
        - Validate cluster functionality, without checking cluster and Ceph
          health by creating resources and running IO
        - Start the worker node ec2 instance
        - Check cluster and Ceph health

        OCS-1242:
        - Stop 1 ec2 instance worker node that has the CephFS provisioner
          pod running on
        - Wait for the CephFS pod provisioner to come up again to running status
        - Validate cluster functionality, without checking cluster and Ceph
          health by deleting resources and running IO
        - Start the worker node ec2 instance
        - Check cluster and Ceph health

        """
        if operation == 'delete_resources':
            # Create resources whose deletion will be tested later
            sanity_helpers.create_resources(resources=resources)

        provisioner_pod = None

        # Get the provisioner pod according to the interface
        if interface == 'rbd':
            provisioner_pod = pod.get_rbdfsplugin_provisioner_pods()[0]
        elif interface == 'cephfs':
            provisioner_pod = pod.get_cephfsplugin_provisioner_pods()[0]
        provisioner_pod_name = provisioner_pod.name
        logger.info(
            f"{interface} provisioner pod found: {provisioner_pod_name}")

        # Get the node name that has the provisioner pod running on
        provisioner_node = pod.get_pod_node(provisioner_pod)
        provisioner_node_name = provisioner_node.name
        logger.info(
            f"{interface} provisioner pod is running on node {provisioner_node_name}"
        )

        # Get the ec2 instance of the node (not to be confused with the
        # 'instances' fixture)
        instance = aws.get_instances_ids_and_names([provisioner_node])

        # Stop the node's ec2 instance
        aws_obj.stop_ec2_instances(instances=instance, wait=True)

        # Selector used later to wait for the provisioner pod to get back
        # to Running status, chosen according to the interface
        selector = (
            constants.CSI_RBDPLUGIN_PROVISIONER_LABEL if interface == 'rbd'
            else constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL
        )

        # Wait for the provisioner pod to reach Terminating status
        logger.info(
            f"Waiting for pod {provisioner_pod_name} to reach status Terminating"
        )
        assert provisioner_pod.ocp.wait_for_resource(
            timeout=300,
            resource_name=provisioner_pod.name,
            condition=constants.STATUS_TERMINATING
        ), f"{interface} provisioner pod failed to reach status Terminating"

        # Wait for the provisioner pod to be started and reach running status
        logger.info(
            f"Waiting for pod {provisioner_pod_name} to reach status Running")
        assert provisioner_pod.ocp.wait_for_resource(
            timeout=600,
            condition=constants.STATUS_RUNNING,
            selector=selector,
            resource_count=1
        ), f"{interface} provisioner pod failed to reach status Running"

        if operation == 'create_resources':
            # Cluster validation (resources creation and IO running)
            sanity_helpers.create_resources(resources=resources)
        elif operation == 'delete_resources':
            # Cluster validation (resources deletion and IO running)
            sanity_helpers.delete_resources(resources=resources)

        # Start the node's ec2 instance back
        aws_obj.start_ec2_instances(instances=instance, wait=True)

        # Checking cluster and Ceph health
        sanity_helpers.health_check(nodes=[provisioner_node_name])
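The wait_for_resource() calls above follow the usual bounded-polling shape. Stripped of ocs-ci specifics, the pattern is a retry loop like this sketch:

import time

def wait_for(check, timeout=600, interval=10):
    """Poll `check` until it returns a truthy value or `timeout` expires."""
    deadline = time.time() + timeout
    while time.time() < deadline:
        result = check()
        if result:
            return result
        time.sleep(interval)
    raise TimeoutError(f'condition not met within {timeout} seconds')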