Example #1
    def kill_resource_repeatedly(self,
                                 resource_name,
                                 resource_id,
                                 max_iterations=30):
        """
        The function gets the resource name and id, and kills the resource repeatedly
        until the new OSD pods reach status Running.

        Args:
            resource_name (str): the name of the resource to kill
            resource_id (int): the id of the resource to kill
            max_iterations (int): Maximum times of iterations to delete the given resource

        """
        d = Disruptions()

        for i in range(max_iterations):
            logging.info(
                f"iteration {i}: Delete resource {resource_name} with id {resource_id}"
            )
            d.set_resource(resource_name)
            d.delete_resource(resource_id)
            if self.new_pods_in_status_running:
                logging.info("New osd pods reached status running")
                break

        if not self.new_pods_in_status_running:
            logging.warning(
                f"New osd pods didn't reach status running after {max_iterations} iterations"
            )
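
For reference, the single-shot form of this pattern is just the two Disruptions calls that the loop above repeats; a minimal sketch, with the import path assumed rather than taken from the examples:

    # Minimal sketch; the import path is an assumption
    from ocs_ci.helpers.disruption_helpers import Disruptions

    def kill_resource_once(resource_name, resource_id=0):
        """Delete one pod of the given resource type (e.g. "osd", "mon", "mgr")."""
        d = Disruptions()
        d.set_resource(resource_name)   # choose which resource type to disrupt
        d.delete_resource(resource_id)  # delete the pod at this index of that resource's pod list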
Example #2
    def test_run_amq_respin_pod(self, pod_name):
        """
        Test the AMQ workload while respinning Ceph pods
        and restarting AMQ pods

        """
        # Respin relevant pod
        if pod_name == "amq":
            pod_pattern_list = [
                "cluster-operator",
                "my-cluster-kafka",
                "my-cluster-zookeeper",
                "my-connect-cluster-connect",
                "my-bridge-bridge",
            ]
            for pod_pattern in pod_pattern_list:
                respin_amq_app_pod(kafka_namespace=constants.AMQ_NAMESPACE,
                                   pod_pattern=pod_pattern)
        else:
            log.info(f"Respin Ceph pod {pod_name}")
            disruption = Disruptions()
            disruption.set_resource(resource=f"{pod_name}")
            disruption.delete_resource()

        # Validate the results
        log.info("Validate message run completely")
        for thread in self.threads:
            thread.result(timeout=1800)
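
How pod_name reaches this test is not shown here; a typical pytest parametrization driving it would look roughly like the following sketch (the value set is an assumption, not taken from the suite):

    import pytest

    @pytest.mark.parametrize(
        "pod_name",
        ["amq", "osd", "mon", "mgr"],  # assumed values; the real suite may differ
    )
    def test_run_amq_respin_pod(self, pod_name):
        ...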
Example #3
    def test_add_capacity_with_resource_delete(
        self,
        add_capacity_setup,
        workload_storageutilization_rbd,
        resource_name,
        resource_id,
        is_kill_resource_repeatedly,
    ):
        """
        The function gets the resource name and id, adds capacity to the cluster,
        and then deletes the resource while the storage capacity is being increased.

        Args:
            resource_name (str): the name of the resource to delete
            resource_id (int): the id of the resource to delete
            is_kill_resource_repeatedly (bool): If True then kill the resource repeatedly. Else, if False
                delete the resource only once.

        """
        used_percentage = get_percent_used_capacity()
        logging.info(
            f"storageutilization is completed. used capacity = {used_percentage}"
        )

        osd_pods_before = pod_helpers.get_osd_pods()
        number_of_osd_pods_before = len(osd_pods_before)

        d = Disruptions()
        d.set_resource(resource_name)

        self.new_pods_in_status_running = False

        osd_size = storage_cluster.get_osd_size()
        logging.info(f"Adding one new set of OSDs. osd size = {osd_size}")
        storagedeviceset_count = storage_cluster.add_capacity(osd_size)
        logging.info("Adding one new set of OSDs was issued without problems")

        # Wait for the new OSDs to come up. Once the first new OSD is in status Init, delete the resource.
        # After deleting the resource we expect all the new OSDs to reach status Running,
        # and the replacement pod of the deleted resource to be Running as well.
        pod_helpers.wait_for_new_osd_pods_to_come_up(number_of_osd_pods_before)
        logging.info(
            f"Delete a {resource_name} pod while storage capacity is getting increased"
        )
        if is_kill_resource_repeatedly:
            with ThreadPoolExecutor() as executor:
                executor.submit(self.kill_resource_repeatedly, resource_name,
                                resource_id)
                self.wait_for_osd_pods_to_be_running(storagedeviceset_count)
        else:
            d.delete_resource(resource_id)
            self.wait_for_osd_pods_to_be_running(storagedeviceset_count)

        self.new_pods_in_status_running = True
        logging.info(
            "Finished verifying add capacity when one of the pods gets deleted"
        )
        logging.info("Waiting for ceph health check to finished...")
        check_ceph_health_after_add_capacity()
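
The wait_for_osd_pods_to_be_running helper used above is not shown in these examples; a hypothetical sketch of such a wait, reusing the TimeoutSampler pattern from the later examples (the 3-OSDs-per-device-set arithmetic and the .get() pod access are assumptions):

    from ocs_ci.utility.utils import TimeoutSampler  # assumed import path

    def wait_for_osd_pods_to_be_running(self, storagedeviceset_count):
        """Poll until the expected number of OSD pods reports phase Running."""
        expected_osd_count = storagedeviceset_count * 3  # assumption: 3 OSDs per device set

        def _all_osds_running():
            osd_pods = pod_helpers.get_osd_pods()
            running = [
                p for p in osd_pods
                if p.get().get("status", {}).get("phase") == "Running"
            ]
            return len(running) >= expected_osd_count

        sample = TimeoutSampler(timeout=1800, sleep=30, func=_all_osds_running)
        if not sample.wait_for_func_status(True):
            raise Exception("New OSD pods did not reach status Running")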
Example #4
    def test_run_couchbase_respin_pod(self, cb_setup, pod_name):
        log.info(f"Respin Ceph pod {pod_name}")

        if pod_name == "couchbase":
            self.cb.respin_couchbase_app_pod()
        else:
            disruption = Disruptions()
            disruption.set_resource(resource=f"{pod_name}")
            disruption.delete_resource()

        bg_handler = flowtest.BackgroundOps()
        bg_ops = [self.cb.result]
        bg_handler.wait_for_bg_operations(bg_ops, timeout=3600)
        self.sanity_helpers.health_check(tries=40)
Example #5
    def test_monitoring_after_respinning_ceph_pods(self, pods):
        """
        Test case to validate respinning the Ceph pods and
        their interaction with the Prometheus pod

        """

        # Re-spin the ceph pods(i.e mgr, mon, osd, mds) one by one
        resource_to_delete = ["mgr", "mon", "osd"]
        disruption = Disruptions()
        for res_to_del in resource_to_delete:
            disruption.set_resource(resource=res_to_del)
            disruption.delete_resource()

        # Check for the created pvc metrics on prometheus pod
        for pod_obj in pods:
            assert check_pvcdata_collected_on_prometheus(
                pod_obj.pvc.name
            ), f"On prometheus pod for created pvc {pod_obj.pvc.name} related data is not collected"
Example #6
    def test_coredump_check_for_ceph_daemon_crash(self):
        """
        Verify that the coredumpctl list is updated after killing a daemon

        """
        log.info("Get Node name where mon pod running")
        mon_pod_nodes = [get_pod_node(pod) for pod in get_mon_pods()]
        mon_pod_node_names = [node.name for node in mon_pod_nodes]

        log.info("Get Node name where mgr pod running")
        mgr_pod_nodes = [get_pod_node(pod) for pod in get_mgr_pods()]
        mgr_pod_node_names = [node.name for node in mgr_pod_nodes]

        log.info("Get Node name where osd pod running")
        osd_pod_nodes = [get_pod_node(pod) for pod in get_osd_pods()]
        osd_pod_node_names = [node.name for node in osd_pod_nodes]

        node_mgr_mon_osd_names = set(mgr_pod_node_names).intersection(
            osd_pod_node_names, mon_pod_node_names)
        node_mgr_osd_names = set(mgr_pod_node_names).intersection(
            osd_pod_node_names)
        node_mgr_mon_names = set(mgr_pod_node_names).intersection(
            mon_pod_node_names)

        if len(node_mgr_mon_osd_names) > 0:
            daemon_types = ["mgr", "osd", "mon"]
            node_name = list(node_mgr_mon_osd_names)[0]
        elif len(node_mgr_osd_names) > 0:
            daemon_types = ["mgr", "osd"]
            node_name = list(node_mgr_osd_names)[0]
        elif len(node_mgr_mon_names) > 0:
            daemon_types = ["mgr", "mon"]
            node_name = list(node_mgr_mon_names)[0]
        else:
            daemon_types = ["mgr"]
            node_name = mgr_pod_node_names[0]
        log.info(f"Test the daemon_types {daemon_types} on node {node_name}")

        log.info("Delete the contents of 'posted' directory "
                 "`/var/lib/rook/openshift-storage/crash/posted/`")
        cmd_bash = f"oc debug nodes/{node_name} -- chroot /host /bin/bash -c "
        cmd_delete_files = '"rm -rf /var/lib/rook/openshift-storage/crash/posted/*"'
        cmd = cmd_bash + cmd_delete_files
        run_cmd(cmd=cmd)

        for daemon_type in daemon_types:
            log.info(f"find ceph-{daemon_type} process-id")
            cmd_pid = f"pidof ceph-{daemon_type}"
            cmd_gen = "oc debug node/" + node_name + " -- chroot /host "
            cmd = cmd_gen + cmd_pid
            out = run_cmd(cmd=cmd)
            pids = out.strip().split()
            pid = pids[0]
            if not pid.isnumeric():
                raise Exception(
                    f"The ceph-{daemon_type} process-id was not found.")

            log.info(f"Kill ceph-{daemon_type} process-id {pid}")
            disruptions_obj = Disruptions()
            disruptions_obj.daemon_pid = pid
            disruptions_obj.kill_daemon(node_name=node_name,
                                        check_new_pid=False,
                                        kill_signal="11")

        log.info(
            f"Verify that we have a crash event for ceph-{daemon_types} crash (tool pod)"
        )
        sample = TimeoutSampler(
            timeout=600,
            sleep=10,
            func=run_cmd_verify_cli_output,
            cmd="ceph crash ls",
            expected_output_lst=daemon_types,
            cephtool_cmd=True,
        )
        if not sample.wait_for_func_status(True):
            raise Exception(
                f"ceph-{daemon_types} process does not exist on crash list (tool pod)"
            )

        log.info(
            f"Verify coredumpctl list updated after killing {daemon_types} daemons on {node_name}"
        )
        sample = TimeoutSampler(
            timeout=600,
            sleep=10,
            func=run_cmd_verify_cli_output,
            cmd="coredumpctl list",
            expected_output_lst=daemon_types,
            debug_node=node_name,
        )
        if not sample.wait_for_func_status(True):
            raise Exception(
                f"coredump not getting generated for ceph-{daemon_types} daemon crash"
            )

        log.info(
            f"Verify the 'posted' crash directory is not empty on {node_name}"
        )
        sample = TimeoutSampler(
            timeout=600,
            sleep=10,
            func=run_cmd_verify_cli_output,
            cmd="ls -ltr /var/lib/rook/openshift-storage/crash/posted/",
            expected_output_lst=[":"],
            debug_node=node_name,
        )
        if not sample.wait_for_func_status(True):
            raise Exception(
                f"coredump not getting generated for {daemon_types} daemons crash"
            )

        log.info(
            "Verify ceph status moved to HEALTH_WARN state with the relevant "
            "information (daemons have recently crashed)")
        sample = TimeoutSampler(
            timeout=20,
            sleep=5,
            func=run_cmd_verify_cli_output,
            cmd="ceph health detail",
            expected_output_lst=daemon_types +
            ["HEALTH_WARN", "daemons have recently crashed"],
            cephtool_cmd=True,
        )
        if not sample.wait_for_func_status(True):
            raise Exception(
                "The output of command ceph health detail did not show "
                "warning 'daemons have recently crashed'")
Example #7
    def test_coredump_check_for_ceph_daemon_crash(self, daemon_type):
        """
        Verify that the coredumpctl list is updated after killing a daemon

        """
        log.info(f"Get Node name where {daemon_type} pod running")
        if daemon_type == "mon":
            mon_pod_nodes = [get_pod_node(pod) for pod in get_mon_pods()]
            node_obj = mon_pod_nodes[0]
        elif daemon_type == "mgr":
            mgr_pod_nodes = [get_pod_node(pod) for pod in get_mgr_pods()]
            node_obj = mgr_pod_nodes[0]
        elif daemon_type == "osd":
            osd_pod_nodes = [get_pod_node(pod) for pod in get_osd_pods()]
            node_obj = osd_pod_nodes[0]
        node_name = node_obj.name

        log.info("Delete the contents of 'posted' directory "
                 "`/var/lib/rook/openshift-storage/crash/posted/`")
        cmd_bash = f"oc debug nodes/{node_name} -- chroot /host /bin/bash -c "
        cmd_delete_files = '"rm -rf /var/lib/rook/openshift-storage/crash/posted/*"'
        cmd = cmd_bash + cmd_delete_files
        run_cmd(cmd=cmd)

        log.info(f"find ceph-{daemon_type} process-id")
        cmd_pid = f"pidof ceph-{daemon_type}"
        cmd_gen = "oc debug node/" + node_name + " -- chroot /host "
        cmd = cmd_gen + cmd_pid
        out = run_cmd(cmd=cmd)
        pids = out.strip().split()  # pidof can return several PIDs if multiple daemons of this type run on the node
        pid = pids[0] if pids else ""
        if not pid.isnumeric():
            raise Exception(
                f"The ceph-{daemon_type} process-id was not found.")

        log.info(f"Kill ceph-{daemon_type} process-id {pid}")
        disruptions_obj = Disruptions()
        disruptions_obj.daemon_pid = pid
        disruptions_obj.kill_daemon(node_name=node_name,
                                    check_new_pid=False,
                                    kill_signal="11")

        log.info(
            f"Verify that we have a crash event for ceph-{daemon_type} crash")
        sample = TimeoutSampler(
            timeout=600,
            sleep=10,
            func=run_cmd_verify_cli_output,
            cmd="ceph crash ls",
            expected_output_lst=[daemon_type],
            cephtool_cmd=True,
        )
        if not sample.wait_for_func_status(True):
            raise Exception(
                f"ceph-{daemon_type} process does not exist on crash list (tool pod)"
            )

        log.info(
            f"Verify coredumpctl list updated after killing ceph-{daemon_type} daemon"
        )
        sample = TimeoutSampler(
            timeout=600,
            sleep=10,
            func=run_cmd_verify_cli_output,
            cmd="coredumpctl list",
            expected_output_lst=[daemon_type],
            debug_node=node_name,
        )
        if not sample.wait_for_func_status(True):
            raise Exception(
                f"coredump not getting generated for ceph-{daemon_type} daemon crash"
            )

        sample = TimeoutSampler(
            timeout=600,
            sleep=10,
            func=run_cmd_verify_cli_output,
            cmd="ls -ltr /var/lib/rook/openshift-storage/crash/posted/",
            expected_output_lst=[":"],
            debug_node=node_name,
        )
        if not sample.wait_for_func_status(True):
            raise Exception(
                f"coredump not getting generated for ceph-{daemon_type} daemon crash"
            )
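
Both crash tests build every node-level command from the same 'oc debug nodes/<name> -- chroot /host' prefix; a small hypothetical helper keeps that string building in one place:

    from ocs_ci.utility.utils import run_cmd  # assumed import path

    def run_on_node(node_name, command):
        """Run a shell command on a node through a debug pod (hypothetical helper)."""
        cmd = f'oc debug nodes/{node_name} -- chroot /host /bin/bash -c "{command}"'
        return run_cmd(cmd=cmd)

    # Usage, matching the cleanup step in the examples above:
    run_on_node(node_name, "rm -rf /var/lib/rook/openshift-storage/crash/posted/*")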
Example #8
    def test_pvc_snapshot_and_clone(
        self, nodes, multiple_snapshot_and_clone_of_postgres_pvc_factory
    ):
        """
        1. Deploy PGSQL workload
        2. Take a snapshot of the pgsql PVC.
        3. Create a new PVC out of that snapshot or restore snapshot
        4. Create a clone of restored snapshot
        5. Attach a new pgsql pod to it.
        6. Resize cloned pvc
        7. Create snapshots of cloned pvc and restore those snapshots
        8. Attach a new pgsql pod to it and resize the new restored pvc
        9. Repeat the above steps in the background while performing base operations:
            restart pods, worker node reboot, node drain, device replacement

        """

        log.info("Starting multiple creation & clone of postgres PVC in Background")
        bg_handler = flowtest.BackgroundOps()
        executor_run_bg_ops = ThreadPoolExecutor(max_workers=1)
        pgsql_snapshot_and_clone = executor_run_bg_ops.submit(
            bg_handler.handler,
            multiple_snapshot_and_clone_of_postgres_pvc_factory,
            pvc_size_new=25,
            pgsql=self.pgsql,
            iterations=1,
        )
        log.info("Started creation of snapshots & clones in background")

        flow_ops = flowtest.FlowOperations()
        log.info("Starting operation 1: Pod Restarts")
        disruption = Disruptions()
        pod_obj_list = [
            "osd",
            "mon",
            "mgr",
            "operator",
            "rbdplugin",
            "rbdplugin_provisioner",
        ]
        for pod in pod_obj_list:
            disruption.set_resource(resource=f"{pod}")
            disruption.delete_resource()
        log.info("Verifying exit criteria for operation 1: Pod Restarts")
        flow_ops.validate_cluster(
            node_status=True, pod_status=True, operation_name="Pod Restarts"
        )

        log.info("Starting operation 2: Node Reboot")
        node_names = flow_ops.node_operations_entry_criteria(
            node_type="worker", number_of_nodes=3, operation_name="Node Reboot"
        )
        # Reboot node
        nodes.restart_nodes(node_names)
        log.info("Verifying exit criteria for operation 2: Node Reboot")
        flow_ops.validate_cluster(
            node_status=True, pod_status=True, operation_name="Node Reboot"
        )

        log.info("Starting operation 3: Node Drain")
        node_name = flow_ops.node_operations_entry_criteria(
            node_type="worker", number_of_nodes=1, operation_name="Node Drain"
        )
        # Node maintenance - to gracefully terminate all pods on the node
        drain_nodes([node_name[0].name])
        # Make the node schedulable again
        schedule_nodes([node_name[0].name])
        log.info("Verifying exit criteria for operation 3: Node Drain")
        flow_ops.validate_cluster(
            node_status=True, pod_status=True, operation_name="Node Drain"
        )

        log.info("Waiting for background operations to be completed")
        bg_handler.wait_for_bg_operations([pgsql_snapshot_and_clone], timeout=600)
Example #9
    def run_in_bg(self,
                  nodes,
                  multiple_snapshot_and_clone_of_postgres_pvc_factory,
                  sc_name=None):
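        """
        Run the snapshot-and-clone factory in the background while performing
        disruptive operations (pod restarts, worker node reboot, node drain),
        optionally against a specific storage class.

        Args:
            nodes: nodes object used to reboot the worker nodes
            multiple_snapshot_and_clone_of_postgres_pvc_factory: background workload factory
            sc_name (str): storage class to use for the new PVCs (optional)

        """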
        log.info(
            "Starting multiple creation & clone of postgres PVC in Background")
        bg_handler = flowtest.BackgroundOps()
        executor_run_bg_ops = ThreadPoolExecutor(max_workers=1)
        pgsql_snapshot_and_clone = executor_run_bg_ops.submit(
            bg_handler.handler,
            multiple_snapshot_and_clone_of_postgres_pvc_factory,
            pvc_size_new=25,
            pgsql=self.pgsql,
            sc_name=sc_name,
            iterations=1,
        )
        log.info("Started creation of snapshots & clones in background")

        flow_ops = flowtest.FlowOperations()
        log.info("Starting operation 1: Pod Restarts")
        disruption = Disruptions()
        pod_obj_list = [
            "osd",
            "mon",
            "mgr",
            "operator",
            "rbdplugin",
            "rbdplugin_provisioner",
        ]
        for pod in pod_obj_list:
            disruption.set_resource(resource=f"{pod}")
            disruption.delete_resource()
        log.info("Verifying exit criteria for operation 1: Pod Restarts")
        flow_ops.validate_cluster(node_status=True,
                                  pod_status=True,
                                  operation_name="Pod Restarts")

        log.info("Starting operation 2: Node Reboot")
        node_names = flow_ops.node_operations_entry_criteria(
            node_type="worker",
            number_of_nodes=3,
            operation_name="Node Reboot")
        # Reboot node
        nodes.restart_nodes(node_names)
        log.info("Verifying exit criteria for operation 2: Node Reboot")
        flow_ops.validate_cluster(node_status=True,
                                  pod_status=True,
                                  operation_name="Node Reboot")

        log.info("Starting operation 3: Node Drain")
        node_name = flow_ops.node_operations_entry_criteria(
            node_type="worker", number_of_nodes=1, operation_name="Node Drain")
        # Node maintenance - to gracefully terminate all pods on the node
        drain_nodes([node_name[0].name])
        # Make the node schedulable again
        schedule_nodes([node_name[0].name])
        log.info("Verifying exit criteria for operation 3: Node Drain")
        flow_ops.validate_cluster(node_status=True,
                                  pod_status=True,
                                  operation_name="Node Drain")

        log.info("Waiting for background operations to be completed")
        bg_handler.wait_for_bg_operations([pgsql_snapshot_and_clone],
                                          timeout=600)
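
Since run_in_bg differs from test_pvc_snapshot_and_clone above only by accepting a storage class name, a caller can reuse the whole flow against a non-default storage class; a hypothetical usage sketch:

    def test_pvc_snapshot_and_clone_on_custom_sc(
        self, nodes, multiple_snapshot_and_clone_of_postgres_pvc_factory
    ):
        # "my-custom-rbd-sc" is a placeholder storage class name
        self.run_in_bg(
            nodes,
            multiple_snapshot_and_clone_of_postgres_pvc_factory,
            sc_name="my-custom-rbd-sc",
        )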