Example #1
    def kill_resource_repeatedly(self,
                                 resource_name,
                                 resource_id,
                                 max_iterations=30):
        """
        Get the resource name and id, and kill the resource repeatedly
        until the new OSD pods reach status Running.

        Args:
            resource_name (str): the name of the resource to kill
            resource_id (int): the id of the resource to kill
            max_iterations (int): maximum number of iterations for deleting the given resource

        """
        d = Disruptions()

        for i in range(max_iterations):
            logging.info(
                f"iteration {i}: Delete resource {resource_name} with id {resource_id}"
            )
            d.set_resource(resource_name)
            d.delete_resource(resource_id)
            if self.new_pods_in_status_running:
                logging.info("New osd pods reached status running")
                break

        if not self.new_pods_in_status_running:
            logging.warning(
                f"New osd pods didn't reach status running after {max_iterations} iterations"
            )
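The loop above only reads `self.new_pods_in_status_running`; another thread has to flip it to `True` (Example #2 initialises it to `False` before adding capacity and relies on the OSD waiter to end the loop). A hedged usage sketch of that pairing, written as a method of the same test class; the resource name "osd" and id 0 are illustrative, not taken from the source:

    def _kill_osd_in_background_sketch(self, storagedeviceset_count):
        # Hedged sketch (not the source's code): run kill_resource_repeatedly in a
        # worker thread while this thread waits for the new OSD pods. The waiter is
        # assumed to set self.new_pods_in_status_running = True, which stops the loop.
        from concurrent.futures import ThreadPoolExecutor

        self.new_pods_in_status_running = False
        with ThreadPoolExecutor() as executor:
            executor.submit(self.kill_resource_repeatedly, "osd", 0)
            self.wait_for_osd_pods_to_be_running(storagedeviceset_count)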
Example #2
    def test_add_capacity_with_resource_delete(
        self,
        add_capacity_setup,
        workload_storageutilization_rbd,
        resource_name,
        resource_id,
        is_kill_resource_repeatedly,
    ):
        """
        Get the resource name and id, add capacity to the cluster, and then
        delete the resource while the storage capacity is being increased.

        Args:
            resource_name (str): the name of the resource to delete
            resource_id (int): the id of the resource to delete
            is_kill_resource_repeatedly (bool): if True, kill the resource repeatedly;
                if False, delete the resource only once.

        """
        used_percentage = get_percent_used_capacity()
        logging.info(
            f"storageutilization is completed. used capacity = {used_percentage}"
        )

        osd_pods_before = pod_helpers.get_osd_pods()
        number_of_osd_pods_before = len(osd_pods_before)

        d = Disruptions()
        d.set_resource(resource_name)

        self.new_pods_in_status_running = False

        osd_size = storage_cluster.get_osd_size()
        logging.info(f"Adding one new set of OSDs. osd size = {osd_size}")
        storagedeviceset_count = storage_cluster.add_capacity(osd_size)
        logging.info("Adding one new set of OSDs was issued without problems")

        # Wait for the new OSDs to come up. Once the first new OSD reaches status Init,
        # delete the resource. After deleting the resource, we expect all the new OSDs,
        # as well as the deleted resource's replacement pod, to reach status Running.
        pod_helpers.wait_for_new_osd_pods_to_come_up(number_of_osd_pods_before)
        logging.info(
            f"Delete a {resource_name} pod while storage capacity is getting increased"
        )
        if is_kill_resource_repeatedly:
            with ThreadPoolExecutor() as executor:
                executor.submit(self.kill_resource_repeatedly, resource_name,
                                resource_id)
                self.wait_for_osd_pods_to_be_running(storagedeviceset_count)
        else:
            d.delete_resource(resource_id)
            self.wait_for_osd_pods_to_be_running(storagedeviceset_count)

        self.new_pods_in_status_running = True
        logging.info(
            "Finished verifying add capacity when one of the pods gets deleted"
        )
        logging.info("Waiting for ceph health check to finished...")
        check_ceph_health_after_add_capacity()
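Both examples above depend on `wait_for_osd_pods_to_be_running`, which is not shown. A minimal sketch of what such a waiter could look like, using only `pod_helpers.get_osd_pods()` from the example above; the `expected_count` semantics, the `p.get()["status"]["phase"]` lookup and the timeout values are assumptions, not taken from the source:

    def wait_for_osd_pods_to_be_running_sketch(self, expected_count, timeout=600, interval=30):
        # Hedged sketch (not the source's implementation): poll the OSD pods until
        # at least expected_count of them report a Running phase, then flip the
        # flag that kill_resource_repeatedly checks.
        import time

        end_time = time.time() + timeout
        while time.time() < end_time:
            osd_pods = pod_helpers.get_osd_pods()
            running = [
                p for p in osd_pods if p.get()["status"]["phase"] == "Running"
            ]
            if len(running) >= expected_count:
                self.new_pods_in_status_running = True
                logging.info("New osd pods reached status running")
                return
            time.sleep(interval)
        logging.warning(f"New osd pods didn't reach status running within {timeout}s")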
Example #3
    def test_run_amq_respin_pod(self, pod_name):
        """
        Test the AMQ workload while re-spinning Ceph pods
        and restarting AMQ pods

        """
        # Respin relevant pod
        if pod_name == "amq":
            pod_pattern_list = [
                "cluster-operator",
                "my-cluster-kafka",
                "my-cluster-zookeeper",
                "my-connect-cluster-connect",
                "my-bridge-bridge",
            ]
            for pod_pattern in pod_pattern_list:
                respin_amq_app_pod(kafka_namespace=constants.AMQ_NAMESPACE,
                                   pod_pattern=pod_pattern)
        else:
            log.info(f"Respin Ceph pod {pod_name}")
            disruption = Disruptions()
            disruption.set_resource(resource=f"{pod_name}")
            disruption.delete_resource()


        # Validate the results
        log.info("Validate message run completely")
        for thread in self.threads:
            thread.result(timeout=1800)
Example #4
    def test_run_couchbase_respin_pod(self, cb_setup, pod_name):
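        """
        Re-spin the given Ceph pod (or the Couchbase app pod) while the
        Couchbase workload runs, then wait for the background operations
        to finish and verify cluster health.

        """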
        log.info(f"Respin Ceph pod {pod_name}")

        if pod_name == "couchbase":
            self.cb.respin_couchbase_app_pod()
        else:
            disruption = Disruptions()
            disruption.set_resource(resource=f"{pod_name}")
            disruption.delete_resource()

        bg_handler = flowtest.BackgroundOps()
        bg_ops = [self.cb.result]
        bg_handler.wait_for_bg_operations(bg_ops, timeout=3600)
        self.sanity_helpers.health_check(tries=40)
Example #5
    def test_monitoring_after_respinning_ceph_pods(self, pods):
        """
        Test case to validate re-spinning the Ceph pods and
        their interaction with the Prometheus pod

        """

        # Re-spin the Ceph pods (i.e. mgr, mon, osd) one by one
        resource_to_delete = ["mgr", "mon", "osd"]
        disruption = Disruptions()
        for res_to_del in resource_to_delete:
            disruption.set_resource(resource=res_to_del)
            disruption.delete_resource()

        # Check for the created pvc metrics on prometheus pod
        for pod_obj in pods:
            assert check_pvcdata_collected_on_prometheus(
                pod_obj.pvc.name
            ), f"Data for the created PVC {pod_obj.pvc.name} was not collected on the Prometheus pod"
Example #6
    def test_pvc_snapshot_and_clone(
        self, nodes, multiple_snapshot_and_clone_of_postgres_pvc_factory
    ):
        """
        1. Deploy PGSQL workload
        2. Take a snapshot of the pgsql PVC.
        3. Create a new PVC out of that snapshot or restore snapshot
        4. Create a clone of restored snapshot
        5. Attach a new pgsql pod to it.
        6. Resize cloned pvc
        7. Create snapshots of cloned pvc and restore those snapshots
        8. Attach a new pgsql pod to it and Resize the new restored pvc
        9. Repeat the above steps in the background while performing the base operations:
            restart pods, worker node reboot, node drain, device replacement

        """

        log.info("Starting multiple creation & clone of postgres PVC in Background")
        bg_handler = flowtest.BackgroundOps()
        executor_run_bg_ops = ThreadPoolExecutor(max_workers=1)
        pgsql_snapshot_and_clone = executor_run_bg_ops.submit(
            bg_handler.handler,
            multiple_snapshot_and_clone_of_postgres_pvc_factory,
            pvc_size_new=25,
            pgsql=self.pgsql,
            iterations=1,
        )
        log.info("Started creation of snapshots & clones in background")

        flow_ops = flowtest.FlowOperations()
        log.info("Starting operation 1: Pod Restarts")
        disruption = Disruptions()
        pod_obj_list = [
            "osd",
            "mon",
            "mgr",
            "operator",
            "rbdplugin",
            "rbdplugin_provisioner",
        ]
        for pod in pod_obj_list:
            disruption.set_resource(resource=f"{pod}")
            disruption.delete_resource()
        log.info("Verifying exit criteria for operation 1: Pod Restarts")
        flow_ops.validate_cluster(
            node_status=True, pod_status=True, operation_name="Pod Restarts"
        )

        log.info("Starting operation 2: Node Reboot")
        node_names = flow_ops.node_operations_entry_criteria(
            node_type="worker", number_of_nodes=3, operation_name="Node Reboot"
        )
        # Reboot the worker nodes
        nodes.restart_nodes(node_names)
        log.info("Verifying exit criteria for operation 2: Node Reboot")
        flow_ops.validate_cluster(
            node_status=True, pod_status=True, operation_name="Node Reboot"
        )

        log.info("Starting operation 3: Node Drain")
        node_name = flow_ops.node_operations_entry_criteria(
            node_type="worker", number_of_nodes=1, operation_name="Node Drain"
        )
        # Node maintenance - to gracefully terminate all pods on the node
        drain_nodes([node_name[0].name])
        # Make the node schedulable again
        schedule_nodes([node_name[0].name])
        log.info("Verifying exit criteria for operation 3: Node Drain")
        flow_ops.validate_cluster(
            node_status=True, pod_status=True, operation_name="Node Drain"
        )

        log.info("Waiting for background operations to be completed")
        bg_handler.wait_for_bg_operations([pgsql_snapshot_and_clone], timeout=600)
Example #7
    def run_in_bg(self,
                  nodes,
                  multiple_snapshot_and_clone_of_postgres_pvc_factory,
                  sc_name=None):
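        """
        Run multiple snapshot & clone creations of the postgres PVC in the
        background (optionally on the given storage class) while performing
        pod restarts, a worker node reboot and a node drain, then wait for
        the background operations to complete.

        """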
        log.info(
            "Starting multiple creation & clone of postgres PVC in Background")
        bg_handler = flowtest.BackgroundOps()
        executor_run_bg_ops = ThreadPoolExecutor(max_workers=1)
        pgsql_snapshot_and_clone = executor_run_bg_ops.submit(
            bg_handler.handler,
            multiple_snapshot_and_clone_of_postgres_pvc_factory,
            pvc_size_new=25,
            pgsql=self.pgsql,
            sc_name=sc_name,
            iterations=1,
        )
        log.info("Started creation of snapshots & clones in background")

        flow_ops = flowtest.FlowOperations()
        log.info("Starting operation 1: Pod Restarts")
        disruption = Disruptions()
        pod_obj_list = [
            "osd",
            "mon",
            "mgr",
            "operator",
            "rbdplugin",
            "rbdplugin_provisioner",
        ]
        for pod in pod_obj_list:
            disruption.set_resource(resource=f"{pod}")
            disruption.delete_resource()
        log.info("Verifying exit criteria for operation 1: Pod Restarts")
        flow_ops.validate_cluster(node_status=True,
                                  pod_status=True,
                                  operation_name="Pod Restarts")

        log.info("Starting operation 2: Node Reboot")
        node_names = flow_ops.node_operations_entry_criteria(
            node_type="worker",
            number_of_nodes=3,
            operation_name="Node Reboot")
        # Reboot the worker nodes
        nodes.restart_nodes(node_names)
        log.info("Verifying exit criteria for operation 2: Node Reboot")
        flow_ops.validate_cluster(node_status=True,
                                  pod_status=True,
                                  operation_name="Node Reboot")

        log.info("Starting operation 3: Node Drain")
        node_name = flow_ops.node_operations_entry_criteria(
            node_type="worker", number_of_nodes=1, operation_name="Node Drain")
        # Node maintenance - to gracefully terminate all pods on the node
        drain_nodes([node_name[0].name])
        # Make the node schedulable again
        schedule_nodes([node_name[0].name])
        log.info("Verifying exit criteria for operation 3: Node Drain")
        flow_ops.validate_cluster(node_status=True,
                                  pod_status=True,
                                  operation_name="Node Drain")

        log.info("Waiting for background operations to be completed")
        bg_handler.wait_for_bg_operations([pgsql_snapshot_and_clone],
                                          timeout=600)
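Examples #6 and #7 drive their background work through the same pattern: submit a long-running handler to a single-worker executor, run the disruptive foreground steps, then block on the future with a timeout. A condensed, hedged sketch; `some_factory` and `pgsql` are illustrative placeholders for the fixtures used above:

# Hedged sketch of the background-operation pattern in Examples #6 and #7:
# submit the long-running handler to a one-worker executor, perform the
# disruptive foreground operations, then wait on the future with a timeout.
from concurrent.futures import ThreadPoolExecutor

bg_handler = flowtest.BackgroundOps()
executor_run_bg_ops = ThreadPoolExecutor(max_workers=1)
bg_future = executor_run_bg_ops.submit(
    bg_handler.handler,
    some_factory,  # illustrative placeholder for the pgsql snapshot/clone factory
    pvc_size_new=25,
    pgsql=pgsql,   # illustrative: the PGSQL workload object (self.pgsql in the examples)
    iterations=1,
)
# ... perform pod restarts / node reboot / node drain here, as in the examples above ...
bg_handler.wait_for_bg_operations([bg_future], timeout=600)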