示例#1
0
    def test_node_maintenance_restart_activate(self, nodes, pvc_factory,
                                               pod_factory, node_type):
        """
        OCS-1292/OCS-1293: drain a node, restart it and re-activate it.

        - Put 1 worker/master node into maintenance (mark it as
          unschedulable and drain it)
        - Restart the node
        - Mark the node as schedulable again
        - Check cluster and Ceph health
        - Check cluster functionality by creating and deleting resources
          (pools, storageclasses, PVCs, pods - both CephFS and RBD)

        """
        # Pick a single node of the type requested for this iteration
        selected_nodes = get_typed_nodes(node_type=node_type, num_of_nodes=1)
        assert selected_nodes, f"Failed to find a {node_type} node for the test"
        target_name = selected_nodes[0].name

        # Maintenance: unschedule and drain the node
        drain_nodes([target_name])

        # Restart the drained node
        nodes.restart_nodes(nodes=selected_nodes, wait=True)

        # The node must report the "ready, scheduling disabled" status
        # before it is handed back to the scheduler
        wait_for_nodes_status(
            node_names=[target_name],
            status=constants.NODE_READY_SCHEDULING_DISABLED,
        )
        schedule_nodes([target_name])

        # Health check first, then basic functionality: create resources
        # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
        # run IO and delete the resources
        sanity = self.sanity_helpers
        sanity.health_check()
        sanity.create_resources(pvc_factory, pod_factory)
        sanity.delete_resources()
    def test_node_maintenance(self, reduce_and_resume_cluster_load, node_type,
                              pvc_factory, pod_factory):
        """
        OCS-1269/OCS-1272: the cluster stays usable while a node is drained.

        - Put 1 worker/master node into maintenance (mark it as
          unschedulable and drain it)
        - Check cluster functionality by creating resources
          (pools, storageclasses, PVCs, pods - both CephFS and RBD)
        - Mark the node as schedulable again
        - Check cluster and Ceph health

        """
        # One node of the requested type is enough for this scenario
        selected_nodes = get_nodes(node_type=node_type, num_of_nodes=1)
        assert selected_nodes, f"Failed to find a {node_type} node for the test"
        drained_node = selected_nodes[0].name

        # Maintenance: unschedule and drain the node
        drain_nodes([drained_node])

        # While the node is drained, exercise basic functionality: create
        # resources (pools, storageclasses, PVCs, pods - both CephFS and
        # RBD), run IO and delete the resources
        sanity = self.sanity_helpers
        sanity.create_resources(pvc_factory, pod_factory)
        sanity.delete_resources()

        # Hand the node back to the scheduler
        schedule_nodes([drained_node])

        # Cluster and Ceph health checks, with a raised retry count
        sanity.health_check(tries=90)
    def test_node_maintenance_restart_activate(self, nodes, pvc_factory,
                                               pod_factory, node_type):
        """
        OCS-1292/OCS-1293: drain a node, restart it and re-activate it.

        - Maintenance (mark as unschedulable and drain) 1 worker/master node
        - Restart the node (without waiting) and verify it really restarted,
          either by observing the "not ready" status or by a new
          'Rebooted' event
        - Mark the node as schedulable
        - Check cluster and Ceph health
        - Check cluster functionality by creating and deleting resources
          (pools, storageclasses, PVCs, pods - both CephFS and RBD)

        """
        # Get 1 node of the type needed for the test iteration
        typed_nodes = get_nodes(node_type=node_type, num_of_nodes=1)
        assert typed_nodes, f"Failed to find a {node_type} node for the test"
        typed_node = typed_nodes[0]
        typed_node_name = typed_node.name

        reboot_events_cmd = (
            "get events -A --field-selector involvedObject.name="
            f"{typed_node_name},reason=Rebooted -o yaml")

        def count_reboot_events():
            """Return how many 'Rebooted' events the node currently has."""
            return len(typed_node.ocp.exec_oc_cmd(reboot_events_cmd)["items"])

        # Baseline number of reboot events, taken before the restart
        num_events = count_reboot_events()

        # Maintenance the node (unschedule and drain)
        drain_nodes([typed_node_name])

        # Restarting the node
        nodes.restart_nodes(nodes=typed_nodes, wait=False)

        try:
            wait_for_nodes_status(
                node_names=[typed_node_name],
                status=constants.NODE_NOT_READY_SCHEDULING_DISABLED,
            )
        except ResourceWrongStatusException:
            # Sometimes, the node will be back to running state quickly so
            # that the status change won't be detected. Verify the node was
            # actually restarted by checking the reboot events count
            new_num_events = count_reboot_events()
            assert new_num_events > num_events, (
                "Reboot event not found. "
                f"Node {typed_node_name} did not restart.")

        wait_for_nodes_status(
            node_names=[typed_node_name],
            status=constants.NODE_READY_SCHEDULING_DISABLED,
        )

        # Mark the node back to schedulable
        schedule_nodes([typed_node_name])

        # Check cluster and Ceph health and checking basic cluster
        # functionality by creating resources (pools, storageclasses,
        # PVCs, pods - both CephFS and RBD), run IO and delete the resources
        self.sanity_helpers.health_check()
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()
    def test_run_couchbase_node_drain(self, cb_setup, node_type='master'):
        """
        Run the couchbase workload while a node is drained and rescheduled.

        - Print worker node utilization (adm_top)
        - Drain 1 node of the given type and make it schedulable again
        - Check cluster and Ceph health
        - Wait for the couchbase workload result to report done
        """
        # Worker node utilization (adm_top), printed as a table
        get_node_resource_utilization_from_adm_top(
            node_type='worker', print_table=True
        )

        # Pick the node to drain
        selected_nodes = node.get_typed_nodes(
            node_type=node_type, num_of_nodes=1
        )
        drained_node = selected_nodes[0].name

        # Gracefully terminate all pods on the node, then bring it back
        node.drain_nodes([drained_node])
        node.schedule_nodes([drained_node])

        # Cluster and Ceph health checks
        self.sanity_helpers.health_check()

        # Poll the workload result until it reports completion
        for finished in TimeoutSampler(300, 5, self.cb.result.done):
            if not finished:
                logging.info(
                    "#### ....Waiting for couchbase threads to complete..."
                )
                continue
            break
        utils.ceph_health_check()
示例#5
0
    def test_2_nodes_maintenance_same_type(
        self, pvc_factory, pod_factory, nodes_type
    ):
        """
        OCS-1273/OCS-1271: drain two nodes of the same type together.

        - Put 2 worker/master nodes into maintenance (mark them as
          unschedulable and drain them)
        - Mark the nodes as schedulable again
        - Check cluster and Ceph health
        - Check cluster functionality by creating resources
          (pools, storageclasses, PVCs, pods - both CephFS and RBD)

        """
        # Fetch two nodes of the requested type
        selected_nodes = node.get_typed_nodes(
            node_type=nodes_type, num_of_nodes=2
        )
        assert selected_nodes, f"Failed to find a {nodes_type} node for the test"

        names_to_drain = [candidate.name for candidate in selected_nodes]

        # Maintenance (unschedule and drain), then hand the nodes back
        node.drain_nodes(names_to_drain)
        node.schedule_nodes(names_to_drain)

        # Cluster and Ceph health checks
        sanity = self.sanity_helpers
        sanity.health_check()

        # Basic functionality: create resources (pools, storageclasses,
        # PVCs, pods - both CephFS and RBD), run IO and delete the resources
        sanity.create_resources(pvc_factory, pod_factory)
        sanity.delete_resources()
    def test_node_maintenance(self, node_type, pvc_factory, pod_factory):
        """
        OCS-1269/OCS-1272: the cluster stays usable while a node is drained.

        - Put 1 worker/master node into maintenance (mark it as
          unschedulable and drain it)
        - Check cluster functionality by creating resources
          (pools, storageclasses, PVCs, pods - both CephFS and RBD)
        - Mark the node as schedulable again
        - Check cluster and Ceph health

        """
        # Fetch two candidates so that the node hosting the rook operator
        # can be avoided
        candidates = get_typed_nodes(node_type=node_type, num_of_nodes=2)
        drain_target = candidates[0].name

        # Workaround for BZ 1778488 - https://github.com/red-hat-storage/ocs-ci/issues/1222
        # If the first candidate hosts the rook operator, use the second one
        operator_pod = pod.get_operator_pods()[0]
        operator_node = pod.get_pod_node(operator_pod)
        operator_node_name = operator_node.get().get('metadata').get('name')
        if operator_node_name == drain_target:
            drain_target = candidates[1].name
        # End of workaround for BZ 1778488

        # Maintenance: unschedule and drain the node
        drain_nodes([drain_target])

        # While the node is drained, exercise basic functionality: create
        # resources (pools, storageclasses, PVCs, pods - both CephFS and
        # RBD), run IO and delete the resources
        sanity = self.sanity_helpers
        sanity.create_resources(pvc_factory, pod_factory)
        sanity.delete_resources()

        # Hand the node back to the scheduler
        schedule_nodes([drain_target])

        # Cluster and Ceph health checks
        sanity.health_check()
示例#7
0
    def test_2_nodes_different_types(self, pvc_factory, pod_factory):
        """
        OCS-1274: drain one worker and one master node at the same time.

        - Maintenance (mark as unschedulable and drain) 1 worker node and 1
          master node
        - Check cluster functionality by creating resources
          (pools, storageclasses, PVCs, pods - both CephFS and RBD)
        - Mark the nodes as schedulable
        - Check cluster and Ceph health

        """
        # Get 1 node from each type. Each lookup is validated before it is
        # indexed, so a missing node type produces a clear assertion
        # message instead of a bare IndexError.
        nodes = []
        for node_type in ('worker', 'master'):
            typed_nodes = node.get_typed_nodes(
                node_type=node_type, num_of_nodes=1
            )
            assert typed_nodes, (
                f"Failed to find a {node_type} node for the test"
            )
            nodes.append(typed_nodes[0])

        node_names = [typed_node.name for typed_node in nodes]

        # Maintenance the nodes (unschedule and drain)
        node.drain_nodes(node_names)

        # Check basic cluster functionality by creating resources
        # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
        # run IO and delete the resources
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()

        # Mark the nodes back to schedulable
        node.schedule_nodes(node_names)

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()
示例#8
0
    def test_node_maintenance(self, node_type, pvc_factory, pod_factory):
        """
        OCS-1269/OCS-1272: the cluster stays usable while a node is drained.

        - Put 1 worker/master node into maintenance (mark it as
          unschedulable and drain it)
        - Check cluster functionality by creating resources
          (pools, storageclasses, PVCs, pods - both CephFS and RBD)
        - Mark the node as schedulable again
        - Check cluster and Ceph health

        """
        # Pick a single node of the requested type
        selected_nodes = node.get_typed_nodes(
            node_type=node_type, num_of_nodes=1
        )
        drained_node = selected_nodes[0].name

        # Maintenance: unschedule and drain the node
        node.drain_nodes([drained_node])

        # While the node is drained, exercise basic functionality: create
        # resources (pools, storageclasses, PVCs, pods - both CephFS and
        # RBD), run IO and delete the resources
        sanity = self.sanity_helpers
        sanity.create_resources(pvc_factory, pod_factory)
        sanity.delete_resources()

        # Hand the node back to the scheduler
        node.schedule_nodes([drained_node])

        # Cluster and Ceph health checks
        sanity.health_check()
    def test_drain_mcg_pod_node(self, node_drain_teardown,
                                reduce_and_resume_cluster_load, pod_to_drain):
        """
        Test drainage of the node that hosts a given NooBaa (NB) pod

        The pod is looked up through ``self.labels_map[pod_to_drain]``, its
        node is drained, and the pod found under the same label afterwards
        must reach the running state before NooBaa health is verified.

        """
        def lookup_labelled_pod():
            # First pod carrying the label mapped to ``pod_to_drain``
            matches = pod.get_pods_having_label(
                label=self.labels_map[pod_to_drain],
                namespace=defaults.ROOK_CLUSTER_NAMESPACE,
            )
            return pod.Pod(**matches[0])

        # Node on which the pod currently resides
        hosting_node = lookup_labelled_pod().get()["spec"]["nodeName"]

        # Drain the node and verify it reports "ready, scheduling disabled"
        drain_nodes([hosting_node])
        wait_for_nodes_status(
            [hosting_node],
            status=constants.NODE_READY_SCHEDULING_DISABLED,
        )

        # Look the pod up again; a replacement should exist post-drainage
        replacement_pod = lookup_labelled_pod()

        # The pod must reach the 'Running' status again
        wait_for_resource_state(
            replacement_pod, constants.STATUS_RUNNING, timeout=120
        )

        # Check the NB status to verify the system is healthy
        self.cl_obj.wait_for_noobaa_health_ok()
示例#10
0
    def test_run_pgsql_node_drain(self,
                                  pgsql,
                                  transactions=5600,
                                  node_type="worker"):
        """
        Test pgsql workload while a node not running pgbench is drained.

        - Create a pgbench benchmark and wait for it to be running
        - Print worker node utilization (adm_top)
        - Drain a random node of ``node_type`` that is not running pgbench,
          then make it schedulable again
        - Check cluster and Ceph health
        - Wait for pgbench to complete and validate the run
        """
        # Create pgbench benchmark
        pgsql.create_pgbench_benchmark(replicas=3,
                                       transactions=transactions,
                                       clients=3)

        # Start measuring time
        start_time = datetime.now()

        # Wait for pgbench pod to reach running state
        pgsql.wait_for_pgbench_status(status=constants.STATUS_RUNNING)

        # Check worker node utilization (adm_top)
        get_node_resource_utilization_from_adm_top(node_type="worker",
                                                   print_table=True)

        # Select a node where pgbench is not running for drain
        typed_nodes = [
            node1.name for node1 in node.get_nodes(node_type=node_type)
        ]
        filter_list = pgsql.filter_pgbench_nodes_from_nodeslist(typed_nodes)
        # Fail with a clear message rather than an obscure error from the
        # random module when every candidate node is running pgbench
        assert filter_list, (
            f"Failed to find a {node_type} node without pgbench pods "
            f"for the drain operation"
        )
        typed_node_name = random.choice(filter_list)
        log.info(f"Selected node {typed_node_name} for node drain operation")

        # Node maintenance - to gracefully terminate all pods on the node
        node.drain_nodes([typed_node_name])

        # Make the node schedulable again
        node.schedule_nodes([typed_node_name])

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check(tries=40)

        # Wait for pg_bench pod to complete
        pgsql.wait_for_pgbench_status(status=constants.STATUS_COMPLETED)

        # Elapsed time since ``start_time``. total_seconds() is used because
        # timedelta.seconds silently drops whole days from the duration.
        end_time = datetime.now()
        elapsed_seconds = int((end_time - start_time).total_seconds())
        log.info(
            f"\npgbench pod reached to completed state after {elapsed_seconds} seconds\n"
        )

        # Get pgbench pods
        pgbench_pods = pgsql.get_pgbench_pods()

        # Validate pgbench run and parse logs
        pgsql.validate_pgbench_run(pgbench_pods)
示例#11
0
    def test_run_pgsql_node_drain(self,
                                  pgsql,
                                  transactions=900,
                                  node_type="master"):
        """
        Test pgsql workload while a node is drained and rescheduled.

        - Create a pgbench benchmark and wait for it to be running
        - Print worker node utilization (adm_top)
        - Drain 1 node of ``node_type``, then make it schedulable again
        - Check cluster and Ceph health
        - Wait for pgbench to complete and validate the run
        """
        # Create pgbench benchmark
        pgsql.create_pgbench_benchmark(replicas=3,
                                       transactions=transactions,
                                       clients=3)

        # Start measuring time
        start_time = datetime.now()

        # Wait for pgbench pod to reach running state
        pgsql.wait_for_pgbench_status(status=constants.STATUS_RUNNING)

        # Check worker node utilization (adm_top)
        get_node_resource_utilization_from_adm_top(node_type="worker",
                                                   print_table=True)

        # Node drain with specific node type
        typed_nodes = node.get_nodes(node_type=node_type, num_of_nodes=1)
        assert typed_nodes, f"Failed to find a {node_type} node for the test"
        typed_node_name = typed_nodes[0].name

        # Node maintenance - to gracefully terminate all pods on the node
        node.drain_nodes([typed_node_name])

        # Make the node schedulable again
        node.schedule_nodes([typed_node_name])

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()

        # Wait for pg_bench pod to complete
        pgsql.wait_for_pgbench_status(status=constants.STATUS_COMPLETED)

        # Elapsed time since ``start_time``. total_seconds() is used because
        # timedelta.seconds silently drops whole days from the duration.
        end_time = datetime.now()
        elapsed_seconds = int((end_time - start_time).total_seconds())
        log.info(
            f"\npgbench pod reached to completed state after {elapsed_seconds} seconds\n"
        )

        # Get pgbench pods
        pgbench_pods = pgsql.get_pgbench_pods()

        # Validate pgbench run and parse logs
        pgsql.validate_pgbench_run(pgbench_pods)
示例#12
0
    def remove_rhcos_workers(self):
        """
        Remove the RHCOS workers from the cluster (used after RHEL workers
        were added).

        Each RHCOS worker is cordoned, drained and then deleted.

        Raises:
            FailedToRemoveNodeException: if RHCOS workers are still
                reported after the removal loop
        """
        for worker in self.get_rhcos_workers():
            run_cmd(f"oc adm cordon {worker}")
            drain_nodes([worker])
            run_cmd(f"oc delete nodes {worker}")

        # Query again: anything still listed means the removal failed
        leftover_workers = self.get_rhcos_workers()
        if len(leftover_workers) > 0:
            raise exceptions.FailedToRemoveNodeException()
示例#13
0
    def test_run_jenkins_drain_node(
        self, jenkins, node_type, num_projects, num_of_builds
    ):
        """
        Test node drain while jenkins builds are running

        - Deploy jenkins (app, PVC, build config) and wait for the deploy
          pod to complete
        - Start the builds
        - Drain and reschedule a node that does not host a jenkins pod,
          if one was found
        - Wait for the builds to complete and check cluster health
        """
        # Number of projects to create
        jenkins.number_projects = num_projects

        # Deploy jenkins: app, PVC and build config
        jenkins.create_app_jenkins()
        jenkins.create_jenkins_pvc()
        jenkins.create_jenkins_build_config()

        # Wait for the jenkins deploy pod to reach the completed state
        jenkins.wait_for_jenkins_deploy_status(status=STATUS_COMPLETED)

        # Node(s) of the requested type not hosting a jenkins pod
        drain_candidates = jenkins.get_node_name_where_jenkins_pod_not_hosted(
            node_type=node_type, num_of_nodes=1
        )

        # Number of builds per project, then kick the builds off
        jenkins.number_builds_per_project = num_of_builds
        jenkins.start_build()

        has_candidates = len(drain_candidates) > 0
        if has_candidates:
            # Gracefully terminate all pods on the node, then make the
            # node schedulable again
            drain_nodes(drain_candidates)
            schedule_nodes(drain_candidates)

        # Wait for the builds to reach the 'Complete' state
        jenkins.wait_for_build_to_complete()

        # Print table of builds
        jenkins.print_completed_builds_results()

        # Cluster and Ceph health checks
        self.sanity_helpers.health_check(tries=40)
示例#14
0
    def test_node_maintenance_restart_activate(self, nodes, pvc_factory,
                                               pod_factory, node_type):
        """
        OCS-1292/OCS-1293: drain a node, restart it and re-activate it.

        - Maintenance (mark as unschedulable and drain) 1 worker/master node
        - Restart the node
        - Mark the node as schedulable
        - Check cluster and Ceph health
        - Check cluster functionality by creating and deleting resources
          (pools, storageclasses, PVCs, pods - both CephFS and RBD)

        """
        # Get a list of 2 nodes. Pick one of them after checking
        # which one doesn't have the rook operator running on
        typed_nodes = get_typed_nodes(node_type=node_type, num_of_nodes=2)
        assert typed_nodes, f"Failed to find a {node_type} node for the test"
        typed_node = typed_nodes[0]

        # Workaround for BZ 1778488 - https://github.com/red-hat-storage/ocs-ci/issues/1222
        rook_operator_pod = pod.get_operator_pods()[0]
        operator_node = pod.get_pod_node(rook_operator_pod)
        if operator_node.get().get('metadata').get('name') == typed_node.name:
            assert len(typed_nodes) > 1, (
                f"Failed to find a {node_type} node that does not host "
                f"the rook operator"
            )
            typed_node = typed_nodes[1]
        # End of workaround for BZ 1778488
        typed_node_name = typed_node.name

        # Maintenance the node (unschedule and drain)
        drain_nodes([typed_node_name])

        # Restart only the node that was drained. Passing the whole
        # 2-node candidate list here would also restart the node that was
        # deliberately left alone (possibly the one hosting the rook
        # operator), which the workaround above is meant to avoid.
        nodes.restart_nodes(nodes=[typed_node], wait=True)

        wait_for_nodes_status(node_names=[typed_node_name],
                              status=constants.NODE_READY_SCHEDULING_DISABLED)
        # Mark the node back to schedulable
        schedule_nodes([typed_node_name])

        # Check cluster and Ceph health and checking basic cluster
        # functionality by creating resources (pools, storageclasses,
        # PVCs, pods - both CephFS and RBD), run IO and delete the resources
        self.sanity_helpers.health_check()
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        self.sanity_helpers.delete_resources()
    def test_run_couchbase_node_drain(self, cb_setup, node_type="master"):
        """
        Run the couchbase workload while a node is drained and rescheduled.

        - Print worker node utilization (adm_top)
        - Drain 1 node of the given type and make it schedulable again
        - Wait for the couchbase background operation to finish
        - Check cluster and Ceph health
        """
        # Worker node utilization (adm_top), printed as a table
        get_node_resource_utilization_from_adm_top(
            node_type="worker", print_table=True
        )

        # Pick the node to drain
        selected_nodes = node.get_nodes(node_type=node_type, num_of_nodes=1)
        drained_node = selected_nodes[0].name

        # Gracefully terminate all pods on the node, then bring it back
        node.drain_nodes([drained_node])
        node.schedule_nodes([drained_node])

        # Wait for the couchbase workload running in the background, then
        # perform cluster and Ceph health checks
        flowtest.BackgroundOps().wait_for_bg_operations(
            [self.cb.result], timeout=3600
        )
        self.sanity_helpers.health_check()
    def test_2_nodes_maintenance_same_type(self, nodes_type):
        """
        OCS-1273/OCS-1271: draining two nodes of the same type is expected
        to fail.

        - Try draining 2 nodes from the same type - should fail
        - Mark the nodes as schedulable again
        - Check cluster and Ceph health

        """
        # Fetch two nodes of the requested type
        selected_nodes = get_nodes(node_type=nodes_type, num_of_nodes=2)
        assert selected_nodes, f"Failed to find a {nodes_type} node for the test"

        names_to_drain = [candidate.name for candidate in selected_nodes]

        # The drain is expected to time out.
        # NOTE(review): a drain that succeeds is not treated as a failure
        # here - confirm whether that is intended.
        try:
            drain_nodes(names_to_drain)
        except TimeoutExpired:
            log.info(f"Draining of nodes {names_to_drain} failed as expected")

        # Hand the nodes back to the scheduler
        schedule_nodes(names_to_drain)

        # Cluster and Ceph health checks
        self.sanity_helpers.health_check()
示例#17
0
    def test_run_pgsql_node_drain(
        self, pgsql, transactions=900, node_type='master'
    ):
        """
        Test pgsql workload while a node is drained and rescheduled

        - Create a pgbench benchmark and wait for it to be running
        - Drain 1 node of ``node_type`` and make it schedulable again
        - Check cluster and Ceph health
        - Wait for pgbench to complete and validate the run
        """
        # Start the pgbench benchmark and wait until it is running
        pgsql.create_pgbench_benchmark(
            replicas=3, transactions=transactions, clients=3
        )
        pgsql.wait_for_pgbench_status(status=constants.STATUS_RUNNING)

        # Pick the node to drain
        selected_nodes = node.get_typed_nodes(
            node_type=node_type, num_of_nodes=1
        )
        drained_node = selected_nodes[0].name

        # Gracefully terminate all pods on the node, then bring it back
        node.drain_nodes([drained_node])
        node.schedule_nodes([drained_node])

        # Cluster and Ceph health checks
        self.sanity_helpers.health_check()

        # Wait for the pgbench pod to complete
        pgsql.wait_for_pgbench_status(status=constants.STATUS_COMPLETED)

        # Validate the pgbench run and parse the logs of its pods
        pgsql.validate_pgbench_run(pgsql.get_pgbench_pods())
示例#18
0
    def test_pvc_snapshot_and_clone(
        self, nodes, multiple_snapshot_and_clone_of_postgres_pvc_factory
    ):
        """
        1. Deploy PGSQL workload
        2. Take a snapshot of the pgsql PVC.
        3. Create a new PVC out of that snapshot or restore snapshot
        4. Create a clone of restored snapshot
        5. Attach a new pgsql pod to it.
        6. Resize cloned pvc
        7. Create snapshots of cloned pvc and restore those snapshots
        8. Attach a new pgsql pod to it and Resize the new restored pvc
        9. Repeat the above steps in bg when performing base operation:
            restart pods, worker node reboot, node drain, device replacement

        """

        # Submit the snapshot/clone factory to a single background worker
        log.info("Starting multiple creation & clone of postgres PVC in Background")
        bg_handler = flowtest.BackgroundOps()
        bg_executor = ThreadPoolExecutor(max_workers=1)
        snapshot_clone_future = bg_executor.submit(
            bg_handler.handler,
            multiple_snapshot_and_clone_of_postgres_pvc_factory,
            pvc_size_new=25,
            pgsql=self.pgsql,
            iterations=1,
        )
        log.info("Started creation of snapshots & clones in background")

        flow_ops = flowtest.FlowOperations()

        # Operation 1: delete one resource of each listed kind
        log.info("Starting operation 1: Pod Restarts")
        disruption = Disruptions()
        resource_kinds = [
            "osd",
            "mon",
            "mgr",
            "operator",
            "rbdplugin",
            "rbdplugin_provisioner",
        ]
        for resource_kind in resource_kinds:
            disruption.set_resource(resource=f"{resource_kind}")
            disruption.delete_resource()
        log.info("Verifying exit criteria for operation 1: Pod Restarts")
        flow_ops.validate_cluster(
            node_status=True, pod_status=True, operation_name="Pod Restarts"
        )

        # Operation 2: restart the selected worker nodes
        log.info("Starting operation 2: Node Reboot")
        reboot_targets = flow_ops.node_operations_entry_criteria(
            node_type="worker", number_of_nodes=3, operation_name="Node Reboot"
        )
        nodes.restart_nodes(reboot_targets)
        log.info("Verifying exit criteria for operation 2: Node Reboot")
        flow_ops.validate_cluster(
            node_status=True, pod_status=True, operation_name="Node Reboot"
        )

        # Operation 3: drain one worker node and make it schedulable again
        log.info("Starting operation 3: Node Drain")
        drain_targets = flow_ops.node_operations_entry_criteria(
            node_type="worker", number_of_nodes=1, operation_name="Node Drain"
        )
        drain_nodes([drain_targets[0].name])
        schedule_nodes([drain_targets[0].name])
        log.info("Verifying exit criteria for operation 3: Node Drain")
        flow_ops.validate_cluster(
            node_status=True, pod_status=True, operation_name="Node Drain"
        )

        log.info("Waiting for background operations to be completed")
        bg_handler.wait_for_bg_operations([snapshot_clone_future], timeout=600)
示例#19
0
    def test_rook_operator_restart_during_mon_failover(self,
                                                       node_drain_teardown):
        """
        Verify the number of mon pods stays at three when a node hosting a
        mon pod is drained and the rook-ceph operator is respun.

        - Verify the mon PDB state before the drain
        - Drain the node hosting the first mon pod and verify the PDB state
        - Verify the mon pod count check never fails during the timeout
        - Delete the rook-ceph operator pod and reschedule the node
        - Verify pods are running, the PDB state, Ceph health and mon count

        """
        def assert_pdb_mon_state(disruptions_allowed):
            """Fail unless verify_pdb_mon reports the expected PDB state."""
            sample = TimeoutSampler(
                timeout=100,
                sleep=10,
                func=verify_pdb_mon,
                disruptions_allowed=disruptions_allowed,
                max_unavailable_mon=1,
            )
            # Assert on the sampler result itself: asserting a non-empty
            # string literal is always true and can never fail the test.
            assert sample.wait_for_func_status(result=True), (
                "the expected pdb state is not equal to actual pdb state")

        assert_pdb_mon_state(disruptions_allowed=1)

        log.info("Get worker node name where monitoring pod run")
        mon_pod_objs = get_mon_pods()
        node_name = mon_pod_objs[0].data["spec"]["nodeName"]

        drain_nodes([node_name])

        assert_pdb_mon_state(disruptions_allowed=0)

        timeout = 1400
        log.info(f"Verify the number of mon pods is 3 for {timeout} seconds")
        sample = TimeoutSampler(timeout=timeout,
                                sleep=10,
                                func=check_number_of_mon_pods)
        # The sampler waits for check_number_of_mon_pods to return False;
        # if it reports that this happened, the mon count check failed.
        assert not sample.wait_for_func_status(result=False), (
            "There are more than 3 mon pods.")

        log.info("Respin pod rook-ceph operator pod")
        rook_ceph_operator_pod_obj = get_operator_pods()
        rook_ceph_operator_pod_obj[0].delete()

        schedule_nodes([node_name])

        log.info("Wait for all the pods in openshift-storage to be running.")
        assert wait_for_pods_to_be_running(timeout=300)

        assert_pdb_mon_state(disruptions_allowed=1)

        ceph_health_check()

        assert check_number_of_mon_pods(
        ), "The number of mon pods not equal to 3"
    def test_monitoring_after_draining_node_where_prometheus_hosted(
            self, pods):
        """
        Test case to validate when node is drained where prometheus
        is hosted, prometheus pod should re-spin on new healthy node
        and shouldn't be any data/metrics loss

        For every prometheus pod: drain its node, verify the pod is running
        on a different node with the same PVC, verify prometheus health,
        then reschedule the node and verify Ceph health. Finally verify
        that PVC data of the given ``pods`` is collected on prometheus.

        """

        # Get the prometheus pod
        pod_obj_list = pod.get_all_pods(
            namespace=defaults.OCS_MONITORING_NAMESPACE,
            selector=['prometheus'])

        for pod_obj in pod_obj_list:
            # Get the pvc mounted on the prometheus pod and the node where
            # the pod is hosted (both come from the same pod description)
            pod_info = pod_obj.get()
            pvc_name = pod_info['spec']['volumes'][0]['persistentVolumeClaim'][
                'claimName']
            prometheus_node = pod_info['spec']['nodeName']

            # Drain node where the prometheus pod hosted
            drain_nodes([prometheus_node])

            # Validate node is in SchedulingDisabled state
            wait_for_nodes_status(
                [prometheus_node],
                status=constants.NODE_READY_SCHEDULING_DISABLED)

            # Validate all prometheus pod is running
            pod_ocp = ocp.OCP(kind=constants.POD,
                              namespace=defaults.OCS_MONITORING_NAMESPACE)
            assert pod_ocp.wait_for_resource(
                condition='Running', selector='app=prometheus', timeout=180), (
                    "One or more prometheus pods are not in running state")

            # Validate prometheus pod is re-spinned on new healthy node.
            # Names are compared for (in)equality: a substring test would
            # wrongly treat e.g. "node-1" and "node-10" as the same node.
            pod_info = pod_obj.get()
            new_node = pod_info['spec']['nodeName']
            assert new_node != prometheus_node, (
                'Promethues pod not re-spinned on new node')
            log.info(f"Prometheus pod re-spinned on new node {new_node}")

            # Validate same pvc is mounted on prometheus pod
            new_pvc_name = pod_info['spec']['volumes'][0][
                'persistentVolumeClaim']['claimName']
            assert new_pvc_name == pvc_name, (
                f"Old pvc not found after restarting the prometheus pod {pod_obj.name}"
            )

            # Validate the prometheus health is ok
            assert prometheus_health_check(), (
                "Prometheus cluster health is not OK")

            # Mark the nodes back to schedulable
            schedule_nodes([prometheus_node])

            # Wait some time after node scheduling back
            waiting_time = 30
            log.info(f"Waiting {waiting_time} seconds...")
            time.sleep(waiting_time)

            # Validate node is in Ready State
            wait_for_nodes_status([prometheus_node],
                                  status=constants.NODE_READY)

            # Validate ceph health OK
            ceph_health_check(tries=40, delay=30)

        # Check the node are Ready state and check cluster is health ok
        self.sanity_helpers.health_check()

        # Check for the created pvc metrics after draining the nodes
        for pod_obj in pods:
            assert check_pvcdata_collected_on_prometheus(pod_obj.pvc.name), (
                f"On prometheus pod for created pvc {pod_obj.pvc.name} related data is not collected"
            )
示例#21
0
    def test_pdb_check_simultaneous_node_drains(
        self,
        pvc_factory,
        pod_factory,
        bucket_factory,
        rgw_bucket_factory,
        node_drain_teardown,
    ):
        """
        Drain two worker nodes with a delay and verify PDB and mon behavior.

        - Check for OSD PDBs before drain
        - Maintenance (mark as unschedulable and drain) 2 worker node with delay of 30 secs
        - Drain will be completed on worker node A
        - Drain will be pending on worker node B due to blocking PDBs
        - Check mon failover in first 10 mins, then 15 and 20 mins
        - Check the OSD PDBs
        - Mark the node A as schedulable
        - Let drain finish on Node B
        - Again check mon failover in first 10 mins and then in intervals
        - Mark the node B as schedulable
        - Check cluster and Ceph health

        """

        def assert_mon_count_recovers():
            # Sample the mon pod count in three consecutive windows
            # (600s, 300s, 300s) and fail when a window does not report
            # the expected status.
            for window in (600, 300, 300):
                mon_sampler = TimeoutSampler(
                    timeout=window,
                    sleep=10,
                    func=helpers.check_number_of_mon_pods,
                )
                # A real assertion: `assert "<text>"` would always pass
                # because a non-empty string is truthy.
                assert mon_sampler.wait_for_func_status(result=True), (
                    "Number of mon pods not equal to expected_mon_count=3")

        # Validate OSD PDBs before drain operation
        assert (not validate_existence_of_blocking_pdb()
                ), "Blocking PDBs exist, Can't perform drain"

        # Get 2 worker nodes to drain
        typed_nodes = get_nodes(num_of_nodes=2)
        assert len(
            typed_nodes) == 2, "Failed to find worker nodes for the test"
        node_A = typed_nodes[0].name
        node_B = typed_nodes[1].name

        # Drain Node A and validate blocking PDBs
        drain_nodes([node_A])
        assert (validate_existence_of_blocking_pdb()
                ), "Blocking PDBs not created post drain"

        # Inducing delay between 2 drains
        # Node-B drain expected to be in pending due to blocking PDBs
        time.sleep(30)
        try:
            drain_nodes([node_B])
            # After the drain check Mon failover in 10th, 15th and 20th min
            assert_mon_count_recovers()
        except TimeoutExpired:
            # Mark the node-A back to schedulable and let drain finish in Node-B
            schedule_nodes([node_A])

        time.sleep(40)

        # Validate OSD PDBs
        assert (validate_existence_of_blocking_pdb()
                ), "Blocking PDBs not created post second drain"

        # Mark the node-B back to schedulable and recover the cluster
        schedule_nodes([node_B])

        sample = TimeoutSampler(
            timeout=100,
            sleep=10,
            func=validate_existence_of_blocking_pdb,
        )
        # Lingering blocking PDBs are only reported here; the health checks
        # further down decide whether the test fails.
        if not sample.wait_for_func_status(result=False):
            log.error("Blocking PDBs still exist")

        # After the drain check mon failover in 10th, 15th and 20th Min
        assert_mon_count_recovers()

        sample = TimeoutSampler(
            timeout=100,
            sleep=10,
            func=verify_pdb_mon,
            disruptions_allowed=1,
            max_unavailable_mon=1,
        )
        assert sample.wait_for_func_status(result=True), (
            "The expected mon-pdb is not equal to actual mon pdb")

        # wait for storage pods
        pod.wait_for_storage_pods()

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check(tries=50)

        # Check basic cluster functionality by creating resources
        # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
        # run IO and delete the resources
        self.sanity_helpers.create_resources(pvc_factory, pod_factory,
                                             bucket_factory,
                                             rgw_bucket_factory)
        self.sanity_helpers.delete_resources()
示例#22
0
    def test_mcg_namespace_disruptions_crd(
        self,
        mcg_obj,
        cld_mgr,
        awscli_pod,
        bucketclass_dict,
        bucket_factory,
        node_drain_teardown,
    ):
        """
        Test MCG namespace disruption flow

        1. Create NS resources with CRDs
        2. Create NS bucket with CRDs
        3. Upload to NS bucket
        4. Delete noobaa related pods and verify integrity of objects
        5. Create public access policy on NS bucket and verify Get op
        6. Drain nodes containing noobaa pods and verify integrity of objects
        7. Perform put operation to validate public access denial
        8. Edit/verify and remove objects on NS bucket

        """
        data = "Sample string content to write to a S3 object"
        object_key = "ObjKey-" + str(uuid.uuid4().hex)
        # Node hosting the awscli pod; it is excluded from draining below
        awscli_node_name = awscli_pod.get()["spec"]["nodeName"]

        # Credentials used for the signed upload straight to the NS target
        aws_s3_creds = {
            "access_key_id": cld_mgr.aws_client.access_key,
            "access_key": cld_mgr.aws_client.secret_key,
            "endpoint": constants.MCG_NS_AWS_ENDPOINT,
            "region": config.ENV_DATA["region"],
        }

        # S3 account details
        user_name = "nb-user" + str(uuid.uuid4().hex)
        email = user_name + "@mail.com"

        logger.info("Setting up test files for upload, to the bucket/resources")
        setup_base_objects(awscli_pod, MCG_NS_ORIGINAL_DIR, MCG_NS_RESULT_DIR, amount=3)

        # Create a single namespace bucket through the bucket factory
        ns_buc = bucket_factory(
            amount=1,
            interface=bucketclass_dict["interface"],
            bucketclass=bucketclass_dict,
        )[0]
        ns_bucket = ns_buc.name

        # Underlying storage name of the first namespacestore of the bucket
        aws_target_bucket = ns_buc.bucketclass.namespacestores[0].uls_name

        logger.info(f"Namespace bucket: {ns_bucket} created")

        logger.info(f"Uploading objects to ns bucket: {ns_bucket}")
        sync_object_directory(
            awscli_pod,
            src=MCG_NS_ORIGINAL_DIR,
            target=f"s3://{ns_bucket}",
            s3_obj=mcg_obj,
        )

        # Disruption 1: force-delete each pod listed in self.labels_map and
        # verify the objects can still be downloaded intact afterwards
        for pod_to_respin in self.labels_map:
            logger.info(f"Re-spinning mcg resource: {self.labels_map[pod_to_respin]}")
            pod_obj = pod.Pod(
                **pod.get_pods_having_label(
                    label=self.labels_map[pod_to_respin],
                    namespace=defaults.ROOK_CLUSTER_NAMESPACE,
                )[0]
            )

            pod_obj.delete(force=True)

            # Wait until exactly one pod with that label is Running again
            assert pod_obj.ocp.wait_for_resource(
                condition=constants.STATUS_RUNNING,
                selector=self.labels_map[pod_to_respin],
                resource_count=1,
                timeout=300,
            )

            logger.info(
                f"Downloading objects from ns bucket: {ns_bucket} "
                f"after re-spinning: {self.labels_map[pod_to_respin]}"
            )
            sync_object_directory(
                awscli_pod,
                src=f"s3://{ns_bucket}",
                target=MCG_NS_RESULT_DIR,
                s3_obj=mcg_obj,
            )

            logger.info(
                f"Verifying integrity of objects "
                f"after re-spinning: {self.labels_map[pod_to_respin]}"
            )
            compare_directory(
                awscli_pod, MCG_NS_ORIGINAL_DIR, MCG_NS_RESULT_DIR, amount=3
            )

        # S3 account
        user = NoobaaAccount(mcg_obj, name=user_name, email=email, buckets=[ns_bucket])
        logger.info(f"Noobaa account: {user.email_id} with S3 access created")

        # Admin sets Public access policy(*)
        bucket_policy_generated = gen_bucket_policy(
            user_list=["*"],
            actions_list=["GetObject"],
            resources_list=[f'{ns_bucket}/{"*"}'],
        )
        bucket_policy = json.dumps(bucket_policy_generated)

        logger.info(
            f"Creating bucket policy on bucket: {ns_bucket} with wildcard (*) Principal"
        )
        put_policy = put_bucket_policy(mcg_obj, ns_bucket, bucket_policy)
        logger.info(f"Put bucket policy response from Admin: {put_policy}")

        logger.info(f"Getting bucket policy on bucket: {ns_bucket}")
        get_policy = get_bucket_policy(mcg_obj, ns_bucket)
        logger.info(f"Got bucket policy: {get_policy['Policy']}")

        # MCG admin writes an object to bucket
        logger.info(f"Writing object on bucket: {ns_bucket} by admin")
        assert s3_put_object(mcg_obj, ns_bucket, object_key, data), "Failed: PutObject"

        # Verifying whether Get operation is allowed to any S3 user
        logger.info(
            f"Get object action on namespace bucket: {ns_bucket} "
            f"with user: {user.email_id}"
        )
        assert s3_get_object(user, ns_bucket, object_key), "Failed: GetObject"

        # Upload files to NS target
        logger.info(
            f"Uploading objects directly to ns resource target: {aws_target_bucket}"
        )
        sync_object_directory(
            awscli_pod,
            src=MCG_NS_ORIGINAL_DIR,
            target=f"s3://{aws_target_bucket}",
            signed_request_creds=aws_s3_creds,
        )

        # Disruption 2: drain the node hosting each pod listed in
        # self.labels_map, then verify object integrity again
        for pod_to_drain in self.labels_map:
            pod_obj = pod.Pod(
                **pod.get_pods_having_label(
                    label=self.labels_map[pod_to_drain],
                    namespace=defaults.ROOK_CLUSTER_NAMESPACE,
                )[0]
            )

            # Retrieve the node name on which the pod resides
            node_name = pod_obj.get()["spec"]["nodeName"]

            # The awscli pod is needed for the downloads below, so its node
            # is never drained
            if awscli_node_name == node_name:
                logger.info(
                    f"Skipping node drain since aws cli pod node: "
                    f"{awscli_node_name} is same as {pod_to_drain} "
                    f"pod node: {node_name}"
                )
                continue

            # Drain the node
            drain_nodes([node_name])
            wait_for_nodes_status(
                [node_name], status=constants.NODE_READY_SCHEDULING_DISABLED
            )
            schedule_nodes([node_name])
            # NOTE(review): called without node names or status; presumably
            # waits for all nodes to be Ready - confirm against the helper
            wait_for_nodes_status(timeout=300)

            # Retrieve the new pod
            pod_obj = pod.Pod(
                **pod.get_pods_having_label(
                    label=self.labels_map[pod_to_drain],
                    namespace=defaults.ROOK_CLUSTER_NAMESPACE,
                )[0]
            )
            wait_for_resource_state(pod_obj, constants.STATUS_RUNNING, timeout=120)

            # Verify all storage pods are running
            wait_for_storage_pods()

            logger.info(
                f"Downloading objects from ns bucket: {ns_bucket} "
                f"after draining node: {node_name} with pod {pod_to_drain}"
            )
            sync_object_directory(
                awscli_pod,
                src=f"s3://{ns_bucket}",
                target=MCG_NS_RESULT_DIR,
                s3_obj=mcg_obj,
            )

            logger.info(
                f"Verifying integrity of objects "
                f"after draining node with pod: {pod_to_drain}"
            )
            compare_directory(
                awscli_pod, MCG_NS_ORIGINAL_DIR, MCG_NS_RESULT_DIR, amount=3
            )

        logger.info(f"Editing the namespace resource bucket: {ns_bucket}")
        namespace_bucket_update(
            mcg_obj,
            bucket_name=ns_bucket,
            read_resource=[aws_target_bucket],
            write_resource=aws_target_bucket,
        )

        logger.info(f"Verifying object download after edit on ns bucket: {ns_bucket}")
        sync_object_directory(
            awscli_pod,
            src=f"s3://{ns_bucket}",
            target=MCG_NS_RESULT_DIR,
            s3_obj=mcg_obj,
        )

        # Verifying whether Put object action is denied
        # NOTE(review): only a log message is emitted here; no PutObject is
        # attempted as the user, so step 7 of the docstring is not exercised
        logger.info(
            f"Verifying whether user: {user.email_id} has only public read access"
        )

        logger.info(f"Removing objects from ns bucket: {ns_bucket}")
        rm_object_recursive(awscli_pod, target=ns_bucket, mcg_obj=mcg_obj)
    def test_crashcollector_pod_existence_on_ceph_pods_running_nodes(
            self, add_nodes, node_drain_teardown):
        """
        Add node with OCS label and verify crashcollector created on new node

        Flow:
        - Add one OCS-labelled worker node
        - Wait until the rack/zone of the new node is reported
        - Drain an old node: for "zone"/"rack" failure domains an old node
          from the same rack/zone as the new node, otherwise the first old node
        - Wait for 3 running mon pods and verify the crashcollector pods
        - Schedule the drained node again, wait for 3 running osd pods and
          verify the crashcollector pods once more

        """
        failure_domain = get_failure_domin()
        logger.info(f"The failure domain is {failure_domain}")
        is_rack_or_zone = failure_domain in ("zone", "rack")

        if is_rack_or_zone:
            old_node_rack_zone_dict = get_node_rack_or_zone_dict(
                failure_domain)
            logger.info(
                f"The old node rack/zone dict is {old_node_rack_zone_dict}")

        old_nodes = get_node_names()

        logger.info("Add one worker node with OCS label")
        add_nodes(ocs_nodes=True, node_count=1)

        # Fail with a clear message instead of an IndexError when the added
        # node is not visible in the node list
        added_node_names = list(set(get_node_names()) - set(old_nodes))
        assert added_node_names, "No new worker node found after adding a node"
        new_node_name = added_node_names[0]
        new_node = get_node_objs([new_node_name])[0]
        logger.info(f"New worker node is {new_node_name}")

        logger.info(
            f"Checking if the rack/zone of the node {new_node_name} is exist")
        timeout = 120
        sample = TimeoutSampler(
            timeout=timeout,
            sleep=10,
            func=self.is_node_rack_or_zone_exist,
            node_obj=get_node_objs([new_node_name])[0],
            failure_domain=failure_domain,
        )
        assert sample.wait_for_func_status(
            result=True
        ), f"Didn't find the node rack/zone after {timeout} seconds"

        if is_rack_or_zone:
            new_node_rack_zone_dict = get_node_rack_or_zone_dict(
                failure_domain)
            logger.info(
                f"The new node rack/zone dict is {new_node_rack_zone_dict}")

            new_rack_zone = get_node_rack_or_zone(failure_domain, new_node)
            logger.info(
                f"New worker node {new_node_name} in zone/rack {new_rack_zone}"
            )

            # Pick an old node from the same rack/zone as the new node
            # (the last match wins, as before)
            drain_node = None
            for old_node_name, rack_zone in old_node_rack_zone_dict.items():
                if rack_zone == new_rack_zone:
                    drain_node = old_node_name
            # Without this check a missing match surfaced as an
            # UnboundLocalError at the drain call
            assert drain_node is not None, (
                f"No old node found in zone/rack {new_rack_zone} to drain")
        else:
            drain_node = old_nodes[0]

        drain_nodes([drain_node])

        logger.info("Wait for 3 mon pods to be on running state")
        pod_ocp = OCP(kind=constants.POD,
                      namespace=config.ENV_DATA["cluster_namespace"])
        assert pod_ocp.wait_for_resource(
            condition="Running",
            selector=constants.MON_APP_LABEL,
            resource_count=3,
            timeout=1400,
        )
        assert verify_rook_ceph_crashcollector_pods_where_rook_ceph_pods_are_running(
        )

        schedule_nodes([drain_node])

        logger.info("Wait for 3 osd pods to be on running state")
        assert pod_ocp.wait_for_resource(
            condition="Running",
            selector=constants.OSD_APP_LABEL,
            resource_count=3,
            timeout=600,
        )

        assert verify_rook_ceph_crashcollector_pods_where_rook_ceph_pods_are_running(
        )
    def run_in_bg(self,
                  nodes,
                  multiple_snapshot_and_clone_of_postgres_pvc_factory,
                  sc_name=None):
        """
        Run disruptive operations while postgres PVC snapshots and clones
        are created in a background thread.

        The snapshot/clone factory is submitted to a single-worker thread
        pool, then three operations run in the foreground, each followed by
        a cluster validation: pod restarts, reboot of worker nodes and drain
        of one worker node. Finally the background operation is awaited.

        Args:
            nodes: Platform nodes object; its ``restart_nodes`` is used
            multiple_snapshot_and_clone_of_postgres_pvc_factory: Factory
                handed to the background handler
            sc_name: Value forwarded as ``sc_name`` to the background handler

        """
        log.info(
            "Starting multiple creation & clone of postgres PVC in Background")
        background_ops = flowtest.BackgroundOps()
        bg_executor = ThreadPoolExecutor(max_workers=1)
        snapshot_clone_future = bg_executor.submit(
            background_ops.handler,
            multiple_snapshot_and_clone_of_postgres_pvc_factory,
            pvc_size_new=25,
            pgsql=self.pgsql,
            sc_name=sc_name,
            iterations=1,
        )
        log.info("Started creation of snapshots & clones in background")

        flow_ops = flowtest.FlowOperations()
        # Every operation is validated with the same exit criteria
        exit_criteria = {"node_status": True, "pod_status": True}

        # Operation 1: delete each listed resource, one after another
        log.info("Starting operation 1: Pod Restarts")
        disruption = Disruptions()
        for resource_name in (
            "osd",
            "mon",
            "mgr",
            "operator",
            "rbdplugin",
            "rbdplugin_provisioner",
        ):
            disruption.set_resource(resource=resource_name)
            disruption.delete_resource()
        log.info("Verifying exit criteria for operation 1: Pod Restarts")
        flow_ops.validate_cluster(operation_name="Pod Restarts",
                                  **exit_criteria)

        # Operation 2: reboot the nodes returned by the entry criteria
        log.info("Starting operation 2: Node Reboot")
        reboot_targets = flow_ops.node_operations_entry_criteria(
            node_type="worker",
            number_of_nodes=3,
            operation_name="Node Reboot",
        )
        nodes.restart_nodes(reboot_targets)
        log.info("Verifying exit criteria for operation 2: Node Reboot")
        flow_ops.validate_cluster(operation_name="Node Reboot",
                                  **exit_criteria)

        # Operation 3: drain one worker node, then make it schedulable again
        log.info("Starting operation 3: Node Drain")
        drain_candidates = flow_ops.node_operations_entry_criteria(
            node_type="worker",
            number_of_nodes=1,
            operation_name="Node Drain",
        )
        drain_target = drain_candidates[0].name
        drain_nodes([drain_target])
        schedule_nodes([drain_target])
        log.info("Verifying exit criteria for operation 3: Node Drain")
        flow_ops.validate_cluster(operation_name="Node Drain",
                                  **exit_criteria)

        log.info("Waiting for background operations to be completed")
        background_ops.wait_for_bg_operations([snapshot_clone_future],
                                              timeout=600)
    def test_crashcollector_pod_existence_on_ceph_pods_running_nodes(
            self, add_nodes, node_drain_teardown):
        """
        Add node with OCS label and verify crashcollector created on new node

        Flow:
        - Add one OCS-labelled worker node
        - Drain an old node: for "zone"/"rack" failure domains an old node
          from the same rack/zone as the new node, otherwise the first old node
        - Wait for 3 running mon pods and compare the crashcollector nodes
          with the nodes where OCS pods are running
        - Schedule the drained node again, wait for 3 running osd pods and
          repeat the comparison

        """
        failure_domain = get_failure_domin()
        logger.info(f"The failure domain is {failure_domain}")
        is_rack_or_zone = failure_domain in ("zone", "rack")

        if is_rack_or_zone:
            old_node_rack_zone = (get_node_zone() if failure_domain.lower()
                                  == "zone" else get_node_rack())
            logger.info(f"The old node rack/zone is {old_node_rack_zone}")

        old_nodes = get_node_names()

        logger.info("Add one worker node with OCS label")
        add_nodes(ocs_nodes=True, node_count=1)

        # Fail with a clear message instead of an IndexError when the added
        # node is not visible in the node list
        new_node = list(set(get_node_names()) - set(old_nodes))
        assert new_node, "No new worker node found after adding a node"
        logger.info(f"New worker node is {new_node[0]}")

        if is_rack_or_zone:
            new_node_rack_zone = (get_node_zone() if failure_domain.lower()
                                  == "zone" else get_node_rack())
            logger.info(f"The new node rack/zone is {new_node_rack_zone}")

            new_rack_zone = new_node_rack_zone[new_node[0]]
            logger.info(
                f"New worker node {new_node[0]} in zone/rack {new_rack_zone}")

            # Pick an old node from the same rack/zone as the new node
            # (the last match wins, as before)
            drain_node = None
            for old_node_name, rack_zone in old_node_rack_zone.items():
                if rack_zone == new_rack_zone:
                    drain_node = old_node_name
            # Without this check a missing match surfaced as an
            # UnboundLocalError at the drain call
            assert drain_node is not None, (
                f"No old node found in zone/rack {new_rack_zone} to drain")
        else:
            drain_node = old_nodes[0]

        drain_nodes([drain_node])

        logger.info("Wait for 3 mon pods to be on running state")
        pod_ocp = OCP(kind=constants.POD,
                      namespace=config.ENV_DATA["cluster_namespace"])
        assert pod_ocp.wait_for_resource(
            condition="Running",
            selector=constants.MON_APP_LABEL,
            resource_count=3,
            timeout=1400,
        )
        logger.info(
            "Verify rook-ceph-crashcollector pod running on worker node"
            " where rook-ceph pods are running.")
        assert sorted(get_crashcollector_nodes()) == sorted(
            get_nodes_where_ocs_pods_running()
        ), (f"The crashcollector pod exists on "
            f"{get_crashcollector_nodes() - get_nodes_where_ocs_pods_running()} "
            f"even though rook-ceph pods are not running on this node")

        schedule_nodes([drain_node])

        logger.info("Wait for 3 osd pods to be on running state")
        assert pod_ocp.wait_for_resource(
            condition="Running",
            selector=constants.OSD_APP_LABEL,
            resource_count=3,
            timeout=600,
        )

        logger.info(
            "Verify rook-ceph-crashcollector pod running on worker node where rook-ceph pods are running."
        )
        assert sorted(get_crashcollector_nodes()) == sorted(
            get_nodes_where_ocs_pods_running()
        ), (f"The crashcollector pod exists on "
            f"{get_crashcollector_nodes() - get_nodes_where_ocs_pods_running()} "
            f"even though rook-ceph pods are not running on this node")
示例#26
0
    def test_nodereplacement_proactive(self, pvc_factory, pod_factory,
                                       dc_pod_factory):
        """
        Knip-894 Node Replacement proactive

        Flow:
        - Pick the node of a randomly chosen OSD pod
        - Start IO on RBD and CephFS backed DC pods on the other worker nodes
        - Unschedule and drain the selected node, then delete its machine
        - Wait for the replacement node and label it with the OCS label
        - Create and delete resources and run the health check

        """

        # Get worker nodes
        worker_node_list = get_worker_nodes()
        log.info(f"Current available worker nodes are {worker_node_list}")

        osd_pods_obj = pod.get_osd_pods()
        osd_node_name = pod.get_pod_node(random.choice(osd_pods_obj)).name
        log.info(f"Selected OSD is {osd_node_name}")

        log.info("Creating dc pod backed with rbd pvc and running io in bg")
        for worker_node in worker_node_list:
            if worker_node != osd_node_name:
                rbd_dc_pod = dc_pod_factory(interface=constants.CEPHBLOCKPOOL,
                                            node_name=worker_node,
                                            size=20)
                pod.run_io_in_bg(rbd_dc_pod,
                                 expect_to_fail=False,
                                 fedora_dc=True)

        log.info("Creating dc pod backed with cephfs pvc and running io in bg")
        for worker_node in worker_node_list:
            if worker_node != osd_node_name:
                cephfs_dc_pod = dc_pod_factory(
                    interface=constants.CEPHFILESYSTEM,
                    node_name=worker_node,
                    size=20)
                pod.run_io_in_bg(cephfs_dc_pod,
                                 expect_to_fail=False,
                                 fedora_dc=True)

        # Unscheduling node
        node.unschedule_nodes([osd_node_name])
        # Draining Node
        node.drain_nodes([osd_node_name])
        log.info("Getting machine name from specified node name")
        machine_name = machine.get_machine_from_node_name(osd_node_name)
        log.info(f"Node {osd_node_name} associated machine is {machine_name}")
        log.info(
            f"Deleting machine {machine_name} and waiting for new machine to come up"
        )
        machine.delete_machine_and_check_state_of_new_spinned_machine(
            machine_name)
        new_machine_list = machine.get_machines()
        new_machine_name = None
        for machine_obj in new_machine_list:
            # Trimming is done to get just machine name
            # eg:- machine_name:- prsurve-40-ocs-43-kbrvf-worker-us-east-2b-nlgkr
            # After trimming:- prsurve-40-ocs-43-kbrvf-worker-us-east-2b
            # A plain prefix comparison is used so that characters of the
            # machine name are never interpreted as regex syntax
            if machine_name.startswith(machine_obj.name[:-6]):
                new_machine_name = machine_obj.name
        # Without this check a missing match surfaced as an
        # UnboundLocalError on the next statement
        assert new_machine_name is not None, (
            f"No new machine found for the deleted machine {machine_name}")
        machineset_name = machine.get_machineset_from_machine_name(
            new_machine_name)
        log.info("Waiting for new worker node to be in ready state")
        machine.wait_for_new_node_to_be_ready(machineset_name)
        new_node_name = node.get_node_from_machine_name(new_machine_name)
        log.info("Adding ocs label to newly created worker node")
        node_obj = ocp.OCP(kind='node')
        node_obj.add_label(resource_name=new_node_name,
                           label=constants.OPERATOR_NODE_LABEL)
        log.info(
            f"Successfully labeled {new_node_name} with OCS storage label")
        # Creating Resources
        log.info("Creating Resources using sanity helpers")
        self.sanity_helpers.create_resources(pvc_factory, pod_factory)
        # Deleting Resources
        self.sanity_helpers.delete_resources()
        # Verify everything running fine
        log.info(
            "Verifying All resources are Running and matches expected result")
        self.sanity_helpers.health_check()
示例#27
0
    def test_simultaneous_drain_of_two_ocs_nodes(
        self,
        pvc_factory,
        pod_factory,
        dc_pod_factory,
        interface,
        bucket_factory,
        rgw_bucket_factory,
    ):
        """
        OCS-2128/OCS-2129:
        - Create PVCs and start IO on DC based app pods
        - Add one extra node in two of the AZs and label the nodes
          with OCS storage label
        - Maintenance (mark as unschedulable and drain) 2 worker nodes
          simultaneously
        - Confirm that OCS and DC pods are in running state
        - Remove unscheduled nodes
        - Check cluster functionality by creating resources
          (pools, storageclasses, PVCs, pods - both CephFS and RBD)
        - Check cluster and Ceph health

        """
        # Get OSD running nodes
        osd_running_worker_nodes = get_osd_running_nodes()
        log.info(f"OSDs are running on nodes {osd_running_worker_nodes}")

        # Label osd nodes with fedora app
        label_worker_node(osd_running_worker_nodes,
                          label_key="dc",
                          label_value="fedora")
        log.info("Successfully labeled worker nodes with {dc:fedora}")

        # Create DC app pods
        log.info("Creating DC based app pods and starting IO in background")
        interface = (constants.CEPHBLOCKPOOL
                     if interface == "rbd" else constants.CEPHFILESYSTEM)
        dc_pod_obj = []
        for _ in range(2):
            dc_pod = dc_pod_factory(interface=interface,
                                    node_selector={"dc": "fedora"})
            pod.run_io_in_bg(dc_pod, fedora_dc=True)
            dc_pod_obj.append(dc_pod)

        # Get the machine name using the node name
        machine_names = [
            machine.get_machine_from_node_name(osd_running_worker_node)
            for osd_running_worker_node in osd_running_worker_nodes[:2]
        ]
        log.info(f"{osd_running_worker_nodes} associated "
                 f"machine are {machine_names}")

        # Get the machineset name using machine name
        machineset_names = [
            machine.get_machineset_from_machine_name(machine_name)
            for machine_name in machine_names
        ]
        log.info(f"{osd_running_worker_nodes} associated machineset "
                 f"is {machineset_names}")

        # Add a new node and label it
        add_new_node_and_label_it(machineset_names[0])
        add_new_node_and_label_it(machineset_names[1])

        # Drain 2 nodes
        drain_nodes(osd_running_worker_nodes[:2])

        # Check the pods should be in running state.
        # Pods whose name contains one of these parts are not checked.
        # The previous `("-1-deploy" or "ocs-deviceset") not in name` only
        # ever tested "-1-deploy", because `or` returns its first truthy
        # operand.
        skipped_name_parts = ("-1-deploy", "ocs-deviceset")
        all_pod_obj = pod.get_all_pods(wait=True)
        for pod_obj in all_pod_obj:
            if any(part in pod_obj.name for part in skipped_name_parts):
                continue
            try:
                helpers.wait_for_resource_state(
                    resource=pod_obj,
                    state=constants.STATUS_RUNNING,
                    timeout=200)
            except ResourceWrongStatusException:
                # 'rook-ceph-crashcollector' on the failed node stucks at
                # pending state. BZ 1810014 tracks it.
                # Ignoring 'rook-ceph-crashcollector' pod health check as
                # WA and deleting its deployment so that the pod
                # disappears. Will revert this WA once the BZ is fixed
                if "rook-ceph-crashcollector" in pod_obj.name:
                    ocp_obj = ocp.OCP(
                        namespace=defaults.ROOK_CLUSTER_NAMESPACE)
                    pod_name = pod_obj.name
                    # Deployment name is the pod name without its last
                    # two dash-separated parts
                    deployment_name = "-".join(pod_name.split("-")[:-2])
                    command = f"delete deployment {deployment_name}"
                    ocp_obj.exec_oc_cmd(command=command)
                    log.info(f"Deleted deployment for pod {pod_obj.name}")
                else:
                    # Still tolerated as before, but no longer silent
                    log.warning(
                        f"Pod {pod_obj.name} did not reach "
                        f"{constants.STATUS_RUNNING} state")

        # DC app pods on the drained node will get automatically created on other
        # running node in same AZ. Waiting for all dc app pod to reach running state
        pod.wait_for_dc_app_pods_to_reach_running_state(dc_pod_obj,
                                                        timeout=1200)
        log.info("All the dc pods reached running state")

        # Remove unscheduled nodes
        # In scenarios where the drain is attempted on >3 worker setup,
        # post completion of drain we are removing the unscheduled nodes so
        # that we maintain 3 worker nodes.
        log.info(f"Removing scheduled nodes {osd_running_worker_nodes[:2]}")
        remove_node_objs = get_node_objs(osd_running_worker_nodes[:2])
        remove_nodes(remove_node_objs)

        # Check basic cluster functionality by creating resources
        # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
        # run IO and delete the resources
        self.sanity_helpers.create_resources(pvc_factory, pod_factory,
                                             bucket_factory,
                                             rgw_bucket_factory)
        self.sanity_helpers.delete_resources()

        # Perform cluster and Ceph health checks
        self.sanity_helpers.health_check()
示例#28
0
    def test_pdb_check_simultaneous_node_drains(
        self,
        pvc_factory,
        pod_factory,
        bucket_factory,
        rgw_bucket_factory,
        node_drain_teardown,
    ):
        """
        Drain two worker nodes 30 seconds apart and verify the OSD PDBs.

        Steps:
        - Check for OSD PDBs before drain
        - Maintenance (mark as unschedulable and drain) 2 worker node with delay of 30 secs
        - Drain will be completed on worker node A
        - Drain will be pending on worker node B due to blocking PDBs
        - Check the OSD PDBs
        - Mark the node A as schedulable
        - Let drain finish on Node B
        - Mark the node B as schedulable
        - Check cluster and Ceph health

        """
        # Precondition: no blocking PDB may exist before the first drain
        blocking_before_drain = validate_existence_of_blocking_pdb()
        assert not blocking_before_drain, "Blocking PDBs exist, Can't perform drain"

        # Two worker nodes are needed for the scenario
        worker_nodes = get_nodes(num_of_nodes=2)
        assert len(worker_nodes) == 2, "Failed to find worker nodes for the test"
        node_A, node_B = worker_nodes[0].name, worker_nodes[1].name

        # First drain: blocking PDBs must show up afterwards
        drain_nodes([node_A])
        blocking_after_first_drain = validate_existence_of_blocking_pdb()
        assert blocking_after_first_drain, "Blocking PDBs not created post drain"

        # Second drain after a 30 second gap; it is expected to stay pending
        # because of the blocking PDBs
        time.sleep(30)
        try:
            drain_nodes([node_B])
        except TimeoutExpired:
            # Make node A schedulable again so the drain of node B can finish
            schedule_nodes([node_A])

        time.sleep(40)

        # Blocking PDBs must still be present after the second drain
        blocking_after_second_drain = validate_existence_of_blocking_pdb()
        assert blocking_after_second_drain, (
            "Blocking PDBs not created post second drain")

        # Make node B schedulable again and let the cluster recover
        schedule_nodes([node_B])

        # Lingering blocking PDBs are only logged, not asserted
        pdbs_cleared = TimeoutSampler(
            timeout=100,
            sleep=10,
            func=validate_existence_of_blocking_pdb,
        ).wait_for_func_status(result=False)
        if not pdbs_cleared:
            log.error("Blocking PDBs still exist")

        # Wait for the storage pods
        pod.wait_for_storage_pods()

        # Cluster and Ceph health checks
        self.sanity_helpers.health_check(tries=50)

        # Basic functionality check: create resources (pools, storageclasses,
        # PVCs, pods - both CephFS and RBD), run IO and delete them again
        self.sanity_helpers.create_resources(
            pvc_factory,
            pod_factory,
            bucket_factory,
            rgw_bucket_factory,
        )
        self.sanity_helpers.delete_resources()
    def test_base_operation_node_drain(
        self,
        node_drain_teardown,
        node_restart_teardown,
        nodes,
        pgsql_factory_fixture,
        project_factory,
        multi_pvc_factory,
        mcg_obj,
        bucket_factory,
    ):
        """
        Run node and capacity operations while workloads run in the background.

        Three background jobs (pgsql workload, OBC object put/create/delete
        and PVC create/delete) are submitted to a thread pool. The following
        operations are then executed in order, each followed by its exit
        criteria check:

        1. Node drain
        2. Add capacity
        3. Node reboot
        4. Node n/w failure (followed by a stop/start of the node)

        Finally the test waits for the background jobs to complete.

        Args:
            node_drain_teardown: Not referenced in the body; presumably a
                teardown fixture - confirm against conftest
            node_restart_teardown: Not referenced in the body; presumably a
                teardown fixture - confirm against conftest
            nodes: Object used to restart nodes (``restart_nodes`` and
                ``restart_nodes_by_stop_and_start``)
            pgsql_factory_fixture: Callable handed to the background handler
                to run the pgsql workload
            project_factory: Callable that creates the project used by the
                PVC create/delete job
            multi_pvc_factory: Factory handed to the PVC create/delete job
            mcg_obj: Object handed to the OBC IO job
            bucket_factory: Factory handed to the OBC IO job

        """
        logger.info("Starting IO operations in Background")
        project = project_factory()
        bg_handler = flowtest.BackgroundOps()
        executor_run_bg_ios_ops = ThreadPoolExecutor(max_workers=3)

        try:
            pgsql_workload = executor_run_bg_ios_ops.submit(
                bg_handler.handler,
                pgsql_factory_fixture,
                replicas=1,
                clients=1,
                transactions=100,
                timeout=100,
                iterations=1,
            )
            logger.info("Started pgsql workload in background")

            flow_ops = flowtest.FlowOperations()

            obc_ios = executor_run_bg_ios_ops.submit(
                bg_handler.handler,
                flow_ops.sanity_helpers.obc_put_obj_create_delete,
                mcg_obj,
                bucket_factory,
                iterations=30,
            )
            logger.info("Started object IOs in background")

            pvc_create_delete = executor_run_bg_ios_ops.submit(
                bg_handler.handler,
                flow_ops.sanity_helpers.create_pvc_delete,
                multi_pvc_factory,
                project,
                iterations=70,
            )
            logger.info("Started pvc create and delete in background")

            logger.info("Starting operation 1: Node Drain")
            node_name = flow_ops.node_operations_entry_criteria(
                node_type="worker",
                number_of_nodes=1,
                operation_name="Node Drain",
            )
            # Node maintenance - to gracefully terminate all pods on the node
            node.drain_nodes([node_name[0].name])
            # Make the node schedulable again
            node.schedule_nodes([node_name[0].name])
            logger.info("Verifying exit criteria for operation 1: Node Drain")
            flow_ops.validate_cluster(
                node_status=True,
                pod_status=True,
                operation_name="Node Drain",
            )

            logger.info("Starting operation 2: Add Capacity")
            (
                osd_pods_before,
                restart_count_before,
            ) = flow_ops.add_capacity_entry_criteria()
            # Add capacity
            osd_size = storage_cluster.get_osd_size()
            result = storage_cluster.add_capacity(osd_size)
            osd_pod_ocp = OCP(
                kind=constants.POD,
                namespace=config.ENV_DATA["cluster_namespace"],
            )
            # NOTE(review): is_flexible_scaling_enabled is tested without
            # being called. If it is a function (not a boolean flag) this
            # condition is always true and replica_count is always 1 -
            # confirm against its definition and call it if needed.
            replica_count = 1 if is_flexible_scaling_enabled else 3
            # Expected number of running OSD pods is the add_capacity result
            # multiplied by the replica count
            osd_pod_ocp.wait_for_resource(
                timeout=300,
                condition=constants.STATUS_RUNNING,
                selector="app=rook-ceph-osd",
                resource_count=result * replica_count,
            )
            logger.info(
                "Verifying exit criteria for operation 2: Add Capacity"
            )
            flow_ops.add_capacity_exit_criteria(
                restart_count_before, osd_pods_before
            )

            logger.info("Starting operation 3: Node Restart")
            node_name = flow_ops.node_operations_entry_criteria(
                node_type="worker",
                number_of_nodes=1,
                operation_name="Node Restart",
            )
            # Node failure (reboot)
            nodes.restart_nodes(nodes=node_name)
            logger.info(
                "Verifying exit criteria for operation 3: Node Restart"
            )
            flow_ops.validate_cluster(
                node_status=True,
                pod_status=True,
                operation_name="Node Restart",
            )

            logger.info("Starting operation 4: Node network fail")
            # With network_fail_time given, the entry criteria call is
            # unpacked into the node list and the time to wait
            node_name, nw_fail_time = flow_ops.node_operations_entry_criteria(
                node_type="worker",
                number_of_nodes=1,
                network_fail_time=300,
                operation_name="Node N/W failure",
            )
            # Node n/w interface failure
            node.node_network_failure(node_name[0].name)
            logger.info(f"Waiting for {nw_fail_time} seconds")
            sleep(nw_fail_time)
            # Reboot the unresponsive node(s)
            logger.info(
                f"Stop and start the unresponsive node(s): {node_name[0].name}"
            )
            nodes.restart_nodes_by_stop_and_start(nodes=node_name)
            logger.info(
                "Verifying exit criteria for operation 4: Node network fail"
            )
            flow_ops.validate_cluster(
                node_status=True,
                pod_status=True,
                operation_name="Node N/W failure",
            )

            logger.info(
                "Waiting for final iteration of background operations to be completed"
            )
            bg_ops = [pvc_create_delete, obc_ios, pgsql_workload]
            bg_handler.wait_for_bg_operations(bg_ops, timeout=600)
        finally:
            # Release the pool on every exit path. wait=False keeps a failing
            # test from blocking here on still-running background jobs;
            # already submitted jobs are not cancelled.
            executor_run_bg_ios_ops.shutdown(wait=False)
示例#30
0
    def test_run_pgsql_node_drain(self, transactions=900, node_type='master'):
        """
        Run a pgbench workload and drain a node while it is running.

        Flow:
        - Create the pgbench benchmark from the PGSQL benchmark YAML
        - Wait for the pgbench client pod to appear and reach Running
        - Drain one node of the requested type and mark it schedulable again
        - Check cluster and Ceph health
        - Wait for the pgbench client pod to reach Completed
        - Parse the pod logs, validate them and export the results to the
          Google spreadsheet
        - Delete the benchmark (also when one of the steps above fails)

        Args:
            transactions (int): Number of pgbench transactions written into
                the benchmark spec; also used to derive the wait timeouts
            node_type (str): Type of the node to drain, passed to
                ``node.get_typed_nodes``

        Raises:
            UnexpectedBehaviour: If a parsed result entry has no
                ``latency_avg`` value

        """
        # Create pgbench benchmark
        log.info("Create resource file for pgbench workload")
        pg_trans = transactions
        timeout = pg_trans * 3
        pg_data = templating.load_yaml(constants.PGSQL_BENCHMARK_YAML)
        pg_data['spec']['workload']['args']['transactions'] = pg_trans
        pg_obj = OCS(**pg_data)
        pg_obj.create()

        # From here on the benchmark exists, so it has to be removed on
        # every exit path, not only when the test passes
        try:
            # Getting pgbench podname
            # NOTE(review): positional arguments are presumably
            # (timeout, sleep, func, *func_args) - confirm against
            # TimeoutSampler
            for pgbench_pod in TimeoutSampler(
                pg_trans, 3, get_pod_name_by_pattern,
                'pgbench', 'my-ripsaw'
            ):
                try:
                    if pgbench_pod[0] is not None:
                        pgbench_client_pod = pgbench_pod[0]
                        break
                except IndexError:
                    # Empty sample: no matching pod yet, keep sampling
                    log.info("Bench pod is not found")

            # Wait for pg_bench pod to be in running state
            log.info("Waiting for pgbench_pod to be in running state")
            pod_obj = OCP(kind='pod')
            pod_obj.wait_for_resource(
                condition='Running',
                resource_name=pgbench_client_pod,
                timeout=timeout,
                sleep=5,
            )

            # Node drain with specific node type
            typed_nodes = node.get_typed_nodes(
                node_type=node_type, num_of_nodes=1
            )
            typed_node_name = typed_nodes[0].name

            # Node maintenance - to gracefully terminate all pods on the node
            node.drain_nodes([typed_node_name])

            # Make the node schedulable again
            node.schedule_nodes([typed_node_name])

            # Perform cluster and Ceph health checks
            self.sanity_helpers.health_check()

            # Wait for pg_bench pod to complete workload
            log.info("Waiting for pgbench_client to complete")
            pod_obj.wait_for_resource(
                condition='Completed',
                resource_name=pgbench_client_pod,
                timeout=timeout,
                sleep=10,
            )

            # Parsing the results
            output = run_cmd(f'oc logs {pgbench_client_pod}')
            pg_output = utils.parse_pgsql_logs(output)
            log.info(
                "*******PGBench output log*********\n"
                f"{pg_output}"
            )
            # Every parsed entry must carry a latency value, otherwise the
            # run is treated as failed before anything is exported
            for data in pg_output:
                latency_avg = data['latency_avg']
                if not latency_avg:
                    raise UnexpectedBehaviour(
                        "PGBench failed to run, no data found on latency_avg"
                    )
            log.info("PGBench has completed successfully")

            # Collect data and export to Google doc spreadsheet
            g_sheet = GoogleSpreadSheetAPI(
                sheet_name="OCS PGSQL", sheet_index=3
            )
            for lat in pg_output:
                lat_avg = lat['latency_avg']
                lat_stddev = lat['lat_stddev']
                tps_incl = lat['tps_incl']
                tps_excl = lat['tps_excl']
                g_sheet.insert_row(
                    [int(lat_avg),
                     int(lat_stddev),
                     int(tps_incl),
                     int(tps_excl)], 2
                )
        finally:
            # Clean up pgbench benchmark
            log.info("Deleting PG bench benchmark")
            pg_obj.delete()