Example #1
0
 def finalizer():
     worker_nodes = get_worker_nodes()
     # Removing created label on all worker nodes
     remove_label_from_worker_node(worker_nodes, label_key="dc")
     for thread in self.threads:
         thread.join()
     ceph_health_check()
Example #2
0
 def finalizer():
     worker_nodes = get_worker_nodes()
     # Removing created label on all worker nodes
     remove_label_from_worker_node(worker_nodes, label_key="dc")
     # Verify OSD encrypted
     if config.ENV_DATA.get("encryption_at_rest"):
         osd_encryption_verification()
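Finalizers like the two above are normally registered from inside a pytest fixture via request.addfinalizer, as Example #24 below does. A minimal sketch of that wiring; the fixture name and the "fedora" label value are illustrative, and the ocs-ci helpers are assumed to be imported as in the snippets above:

import pytest


@pytest.fixture()
def dc_labeled_workers(request):
    # illustrative fixture; label value "fedora" is an assumption,
    # helpers come from ocs-ci as used in the snippets above
    worker_nodes = get_worker_nodes()
    label_worker_node(worker_nodes, label_key="dc", label_value="fedora")

    def finalizer():
        # Removing created label on all worker nodes
        remove_label_from_worker_node(worker_nodes, label_key="dc")
        ceph_health_check()

    request.addfinalizer(finalizer)
    return worker_nodes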
Example #3
0
    def get_node_info(self, node_type="master"):
        """
        Get hardware information for the given node type and update the main
        environment dictionary.

        Args:
            node_type (str): the node type to collect data about,
              can be: master / worker - the default is master

        """
        if node_type == "master":
            nodes = node.get_master_nodes()
        elif node_type == "worker":
            nodes = node.get_worker_nodes()
        else:
            log.warning(f"Node type ({node_type}) is invalid")
            return

        oc_cmd = OCP(namespace=defaults.ROOK_CLUSTER_NAMESPACE)
        self.environment[f"{node_type}_nodes_num"] = len(nodes)
        self.environment[f"{node_type}_nodes_cpu_num"] = oc_cmd.exec_oc_debug_cmd(
            node=nodes[0],
            cmd_list=["lscpu | grep '^CPU(s):' | awk '{print $NF}'"],
        ).rstrip()
        self.environment[f"{node_type}_nodes_memory"] = oc_cmd.exec_oc_debug_cmd(
            node=nodes[0], cmd_list=["free | grep Mem | awk '{print $2}'"]
        ).rstrip()
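A brief usage sketch for the method above; "collector" stands for whatever instance defines this method and carries the environment dict (the name is illustrative), and the keys follow directly from the code:

# collect CPU and memory info for both node types; an invalid type only logs a warning
collector.get_node_info(node_type="master")
collector.get_node_info(node_type="worker")

# afterwards the environment dict holds entries such as:
#   collector.environment["worker_nodes_num"]      -> number of worker nodes
#   collector.environment["worker_nodes_cpu_num"]  -> CPU count reported by lscpu
#   collector.environment["worker_nodes_memory"]   -> total memory reported by free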
Example #4
0
def create_dummy_zone_labels():
    """
    Create dummy zone labels on cluster nodes: try to label all master and
    worker nodes based on values of ``worker_availability_zones`` and
    ``master_availability_zones`` options, but only if there are no zone
    labels already defined.

    Raises:
        UnexpectedDeploymentConfiguration: when either the cluster or the ocs-ci
            config file is in conflict with dummy zone labels.
    """
    logger.info("trying to setup dummy_zone_node_labels")
    if are_zone_labels_missing():
        to_label = [
            ("master_availability_zones", get_master_nodes()),
            ("worker_availability_zones", get_worker_nodes()),
        ]
        for zone_opt, nodes in to_label:
            zones = config.ENV_DATA.get(zone_opt)
            if zones is None:
                msg = f"{zone_opt} is not defined in ENV_DATA conf"
                logger.error(msg)
                raise exceptions.UnexpectedDeploymentConfiguration(msg)
            assign_dummy_zones(zones, nodes)
    else:
        # don't use dummy zone labeling on a cluster with actual zones
        msg = ("Cluster in unexpected state before dummy zone labeling: "
               "at least one node already has a zone label.")
        logger.error(msg)
        raise exceptions.UnexpectedDeploymentConfiguration(msg)
Example #5
0
    def start_baremetal_machines_with_ipmi_ctx(self, ipmi_ctxs, wait=True):
        """
        Start Baremetal Machines using IPMI ctx

        Args:
            ipmi_ctxs (list): List of BM ipmi_ctx
            wait (bool): Wait for BMs to start

        """
        for ipmi_ctx in ipmi_ctxs:
            ipmi_ctx.chassis_control_power_up()

        if wait:
            for ipmi_ctx in ipmi_ctxs:
                for status in TimeoutSampler(600, 5, self.get_power_status, ipmi_ctx):
                    logger.info(
                        f"Waiting for Baremetal Machine to power on. "
                        f"Current Baremetal status: {status}"
                    )
                    if status == VM_POWERED_ON:
                        logger.info("Baremetal Machine reached poweredOn status")
                        break

        wait_for_cluster_connectivity(tries=400)
        wait_for_nodes_status(
            node_names=get_master_nodes(), status=constants.NODE_READY, timeout=800
        )
        wait_for_nodes_status(
            node_names=get_worker_nodes(), status=constants.NODE_READY, timeout=800
        )
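The loop above is the common TimeoutSampler polling pattern; judging from its use here and in Example #9, the sampler yields the return value of func(*args) every sleep seconds until the timeout expires (how it terminates on timeout is an assumption here). A minimal sketch of the same pattern, with the helper name illustrative:

def wait_until_powered_on(ipmi_ctx, get_power_status):
    # poll get_power_status(ipmi_ctx) every 5 seconds, for up to 600 seconds,
    # mirroring the arguments used above; return True once the machine is powered on
    for status in TimeoutSampler(600, 5, get_power_status, ipmi_ctx):
        logger.info(f"Current Baremetal status: {status}")
        if status == VM_POWERED_ON:
            return True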
Example #6
0
    def setup(self, project_factory, pvc_factory, pod_factory):
        """
        Create PVC and pods

        """
        self.pvc_size = 10

        self.pvc_obj = pvc_factory(
            interface=constants.CEPHBLOCKPOOL,
            size=self.pvc_size,
            access_mode=constants.ACCESS_MODE_RWX,
            status=constants.STATUS_BOUND,
            volume_mode=constants.VOLUME_MODE_BLOCK,
            size_unit="Mi",
        )

        worker_nodes_list = node.get_worker_nodes()

        self.pod_objs = []
        for node_name in worker_nodes_list:
            pod_obj = pod_factory(
                interface=constants.CEPHBLOCKPOOL,
                pvc=self.pvc_obj,
                status=constants.STATUS_RUNNING,
                node_name=node_name,
                pod_dict_path=constants.CSI_RBD_RAW_BLOCK_POD_YAML,
                raw_block_pv=True,
            )
            self.pod_objs.append(pod_obj)
Example #7
0
    def setup(self, add_nodes):
        """
        Check that we have the right configurations before we start the test
        """
        osd_pods_before = pod_helpers.get_osd_pods()
        number_of_osd_pods_before = len(osd_pods_before)
        if number_of_osd_pods_before >= constants.MAX_OSDS:
            pytest.skip("We have the maximum number of OSDs in the cluster")

        # If we use vSphere we may need to add more worker nodes
        # to the cluster before starting the test
        if (config.ENV_DATA["platform"].lower() == constants.VSPHERE_PLATFORM
                and number_of_osd_pods_before >= 9):
            num_of_expected_wnodes = 6
            wnodes = node.get_worker_nodes()
            num_of_wnodes = len(wnodes)
            logging.info(
                f"We have {number_of_osd_pods_before} OSDs in the cluster, "
                f"and {num_of_wnodes} worker nodes in the cluster")
            if num_of_wnodes < num_of_expected_wnodes:
                num_of_wnodes_to_add = num_of_expected_wnodes - num_of_wnodes
                logging.info(
                    f"Adding more {num_of_wnodes_to_add} worker nodes to the cluster"
                )
                add_nodes(ocs_nodes=False, node_count=num_of_wnodes_to_add)

            wnodes_not_in_ocs = node.get_worker_nodes_not_in_ocs()
            if wnodes_not_in_ocs:
                logging.info("Label the worker nodes that are not in OCS")
                node.label_nodes(wnodes_not_in_ocs)
Example #8
0
    def start_powernodes_machines(self,
                                  powernode_machines,
                                  timeout=900,
                                  wait=True,
                                  force=True):
        """
        Start PowerNode Machines

        Args:
            powernode_machines (list): List of PowerNode machines
            timeout (int): time in seconds to wait for node to reach 'not ready' state,
                and 'ready' state.
            wait (bool): Wait for PowerNodes to start - for future use
            force (bool): True for PowerNode ungraceful power off, False for
                graceful PowerNode shutdown - for future use
        """
        ocpversion = get_ocp_version("-")
        for node in powernode_machines:
            result = exec_cmd(
                f"sudo virsh start test-ocp{ocpversion}-{node.name}")
            logger.info(f"Result of shutdown {result}")

        wait_for_cluster_connectivity(tries=900)
        wait_for_nodes_status(node_names=get_master_nodes(),
                              status=constants.NODE_READY,
                              timeout=timeout)
        wait_for_nodes_status(node_names=get_worker_nodes(),
                              status=constants.NODE_READY,
                              timeout=timeout)
Example #9
0
    def verify_disks_lso_attached(self, timeout=600, sleep=20):
        """
        Verify that the expected capacity of the attached LSO disks is displayed

        Args:
            timeout (int): Time in seconds to wait
            sleep (int): Sampling time in seconds

        """
        osd_size = config.ENV_DATA.get("device_size", defaults.DEVICE_SIZE)
        worker_nodes = get_worker_nodes()
        capacity = int(osd_size) * len(worker_nodes)
        if capacity >= 1024:
            capacity_str = str(
                capacity / 1024).rstrip("0").rstrip(".") + " TiB"
        else:
            capacity_str = str(capacity) + " GiB"
        sample = TimeoutSampler(
            timeout=timeout,
            sleep=sleep,
            func=self.check_element_text,
            expected_text=capacity_str,
        )
        if not sample.wait_for_func_status(result=True):
            logger.error(f" after {timeout} seconds")
            raise TimeoutExpiredError
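The capacity string built above follows a simple rule: multiply the per-node device size by the number of workers, then render it in GiB, or in TiB with trailing zeros stripped once it reaches 1024. A small standalone check of that formatting logic; the helper name is illustrative:

def expected_capacity_str(osd_size_gib, num_workers):
    # mirrors the formatting logic used in verify_disks_lso_attached
    capacity = int(osd_size_gib) * num_workers
    if capacity >= 1024:
        return str(capacity / 1024).rstrip("0").rstrip(".") + " TiB"
    return str(capacity) + " GiB"

assert expected_capacity_str(512, 3) == "1.5 TiB"   # 1536 GiB
assert expected_capacity_str(2048, 2) == "4 TiB"    # trailing ".0" stripped
assert expected_capacity_str(100, 3) == "300 GiB"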
Example #10
0
    def get_node_name_where_jenkins_pod_not_hosted(self,
                                                   node_type=constants.
                                                   WORKER_MACHINE,
                                                   num_of_nodes=1):
        """
        Get node names where the Jenkins pods are not hosted

        Args:
            node_type (str): The node type  (e.g. worker, master)
            num_of_nodes (int): The number of nodes to be returned

        Returns:
            list: List of compute node names
        """
        if node_type == constants.MASTER_MACHINE:
            nodes_drain = [
                node.name for node in get_nodes(node_type=node_type,
                                                num_of_nodes=num_of_nodes)
            ]
        elif node_type == constants.WORKER_MACHINE:
            pod_objs = []
            for project in self.projects:
                pod_names = get_pod_name_by_pattern(pattern="jenkins",
                                                    namespace=project)
                pod_obj = [
                    get_pod_obj(name=pod_name, namespace=project)
                    for pod_name in pod_names
                ]
                pod_objs += pod_obj
            nodes_app_name = set(get_app_pod_running_nodes(pod_objs))
            nodes_worker_name = set(get_worker_nodes())
            nodes_drain = nodes_worker_name - nodes_app_name
        else:
            raise ValueError("The node type must be 'worker' or 'master'")
        return list(nodes_drain)[:num_of_nodes]
Example #11
0
    def test_automated_recovery_from_stopped_node_and_start(
        self, nodes, additional_node
    ):
        """
        Knip-678 Automated recovery from failed nodes
        Reactive case - IPI

        0) A - add new node, B - don't add new node
        1) Stop node
        2) Validate result:
             A - pods should respin on the new node
             B - pods should remain in Pending state on the stopped node
        3) Start node
        4) Validate result:
             A - pods should start on the new node
             B - pods should start on the stopped node after starting it
        """
        wnode_name = get_worker_nodes()[0]
        machine_name = machine.get_machine_from_node_name(wnode_name)
        self.machineset_name = machine.get_machineset_from_machine_name(machine_name)
        self.start_ready_replica_count = machine.get_ready_replica_count(
            self.machineset_name
        )

        temp_osd = get_osd_pods()[0]
        osd_real_name = "-".join(temp_osd.name.split("-")[:-1])
        self.osd_worker_node = [get_pod_node(temp_osd)]
        if additional_node:
            self.add_new_storage_node(self.osd_worker_node[0].name)
            self.extra_node = True
        nodes.stop_nodes(self.osd_worker_node, wait=True)
        log.info(f"Successfully powered off node: {self.osd_worker_node[0].name}")

        timeout = 420
        assert wait_for_rook_ceph_pod_status(
            temp_osd, constants.STATUS_TERMINATING, timeout
        ), (
            f"The pod {osd_real_name} didn't reach the status {constants.STATUS_TERMINATING} "
            f"after {timeout} seconds"
        )

        # Validate that the OSD in Terminating state has a new OSD pod in Pending state
        all_pod_obj = get_all_pods(namespace=defaults.ROOK_CLUSTER_NAMESPACE)
        new_osd = None
        for pod_obj in all_pod_obj:
            if osd_real_name == "-".join(pod_obj.name.split("-")[:-1]) and (
                temp_osd.name != pod_obj.name
            ):
                new_osd = pod_obj
                break

        nodes.start_nodes(nodes=self.osd_worker_node, wait=True)
        log.info(f"Successfully powered on node: {self.osd_worker_node[0].name}")
        wait_for_resource_state(new_osd, constants.STATUS_RUNNING, timeout=180)
        if additional_node:
            new_osd_node = get_pod_node(new_osd)
            assert (
                new_osd_node.name != self.osd_worker_node[0].name
            ), "New OSD is expected to run on the new additional node"
Example #12
0
    def test_nodereplacement_proactive_with_io_running(
        self,
        pvc_factory,
        pod_factory,
        dc_pod_factory,
        bucket_factory,
        rgw_bucket_factory,
    ):
        """
        Knip-894 Node Replacement proactive when IO running in the background

        """

        # Get worker nodes
        worker_node_list = node.get_worker_nodes()
        log.info(f"Current available worker nodes are {worker_node_list}")

        osd_node_name = select_osd_node_name()

        log.info("Creating dc pod backed with rbd pvc and running io in bg")
        for worker_node in worker_node_list:
            if worker_node != osd_node_name:
                rbd_dc_pod = dc_pod_factory(interface=constants.CEPHBLOCKPOOL,
                                            node_name=worker_node,
                                            size=20)
                pod.run_io_in_bg(rbd_dc_pod,
                                 expect_to_fail=False,
                                 fedora_dc=True)

        log.info("Creating dc pod backed with cephfs pvc and running io in bg")
        for worker_node in worker_node_list:
            if worker_node != osd_node_name:
                cephfs_dc_pod = dc_pod_factory(
                    interface=constants.CEPHFILESYSTEM,
                    node_name=worker_node,
                    size=20)
                pod.run_io_in_bg(cephfs_dc_pod,
                                 expect_to_fail=False,
                                 fedora_dc=True)

        delete_and_create_osd_node(osd_node_name)

        # Creating Resources
        log.info("Creating Resources using sanity helpers")
        self.sanity_helpers.create_resources(pvc_factory, pod_factory,
                                             bucket_factory,
                                             rgw_bucket_factory)
        # Deleting Resources
        self.sanity_helpers.delete_resources()

        # Verify everything running fine
        log.info(
            "Verifying All resources are Running and matches expected result")
        self.sanity_helpers.health_check(tries=120)

        # Verify OSD is encrypted
        if config.ENV_DATA.get("encryption_at_rest"):
            osd_encryption_verification()
Example #13
0
    def test_node_kernel_crash_ceph_fsync(self, pvc_factory, teardown_factory,
                                          dc_pod_factory, interface_type):
        """
        1. Create 1GiB PVC
        2. Attach PVC to an application pod
        3. Copy file fsync.py to pod
        4. Execute file create/delete operations in parallel with fsync.py
        5. Check whether the node gets a kernel panic
        """

        worker_nodes_list = get_worker_nodes()

        # Create a CephFS or RBD PVC based on the interface type
        pvc_obj = pvc_factory(interface=interface_type)

        # Create a pod on a particular node
        selected_node = random.choice(worker_nodes_list)
        log.info(
            f"Creating a pod on node: {selected_node} with pvc {pvc_obj.name}")

        pod_obj = dc_pod_factory(
            interface=interface_type,
            pvc=pvc_obj,
            node_name=selected_node,
        )

        file = constants.FSYNC
        cmd = f"oc cp {file} {pvc_obj.namespace}/{pod_obj.name}:/"
        helpers.run_cmd(cmd=cmd)
        log.info("Files copied successfully ")

        command = f"mkdir {self.result_dir}"
        pod_obj.exec_cmd_on_pod(command=command)
        log.info("Starting creation and deletion of files on volume")

        # Create and delete files on mount point
        create_executor = ThreadPoolExecutor(max_workers=1)
        self.create_thread = create_executor.submit(self.creates_files,
                                                    pod_obj)
        sleep(3)

        log.info("Started deletion of files on volume")
        delete_executor = ThreadPoolExecutor(max_workers=1)
        self.delete_thread = delete_executor.submit(self.remove_files, pod_obj)

        fsync_executor = ThreadPoolExecutor(max_workers=1)
        self.fsync_thread = fsync_executor.submit(pod_obj.exec_sh_cmd_on_pod,
                                                  command="python3 fsync.py")

        # Check whether the node gets a kernel panic
        try:
            node.wait_for_nodes_status(selected_node,
                                       status=constants.NODE_NOT_READY,
                                       timeout=60)

        except ResourceWrongStatusException:
            log.info(f"No kernel panic observed on {selected_node}")
        else:
            assert False, f"{selected_node} is in NotReady state, kernel panic suspected"
Example #14
0
    def __init__(self, **kwargs):
        """
        Initializer function.

        Initialize object variables, clone the benchmark-operator repo
        and label the worker nodes.

        Args:
            kwargs (dict):
                Following kwargs are valid
                repo: benchmark-operator repo to use - a GitHub link
                branch: branch to use from the repo

        Example Usage:
            r1 = BenchmarkOperator()
            r1.deploy()
            # use oc apply to apply a custom modified benchmark
            my_custom_bench = "my_custom_bench.yaml"
            run_cmd(f"oc apply -f {my_custom_bench}")

        """
        log.info("Initialize the benchmark-operator object")
        self.args = kwargs
        self.repo = self.args.get("repo", BMO_REPO)
        self.branch = self.args.get("branch", "master")
        # the namespace is a constant for the benchmark-operator
        self.namespace = BMO_NAME
        self.pgsql_is_setup = False
        self.ocp = OCP()
        self.ns_obj = OCP(kind="namespace")
        self.pod_obj = OCP(namespace=BMO_NAME, kind="pod")
        # list of worker nodes to label
        self.worker_nodes = get_worker_nodes()
        self._clone_operator()
        self.dir += f"/{BMO_NAME}"

        # to use the cache dropping pod, worker nodes need to be labeled.
        log.info("Labeling the worker nodes for cache-dropping enable.")
        try:
            helpers.label_worker_node(self.worker_nodes,
                                      label_key=BMO_LABEL,
                                      label_value="yes")
        except CommandFailed:
            # this is probably because the nodes are already labeled, so
            # check if the nodes are labeled and continue anyway.
            result = self.pod_obj.exec_oc_cmd(f"get node -l {BMO_LABEL}")
            found = [
                node for node in self.worker_nodes if re.search(node, result)
            ]
            if len(found) == len(self.worker_nodes):
                log.info("All worker nodes are labeled")
            else:
                log.warning(
                    "Labeling nodes failed, Not all workers node are labeled !"
                )
Example #15
0
def check_automated_recovery_from_stopped_node(nodes):
    """
    1) Stop node.
    2) The rook ceph pods associated with the node should change to a Terminating state.
    3) The node should power on automatically, or if it was removed from the cluster,
       a new node should be created automatically.
    4) The new osd pods with the same ids should start on the stopped node after it powers on,
       or on the new osd node.

    """
    old_wnodes = get_worker_nodes()
    log.info(f"Current worker nodes: {old_wnodes}")

    osd_node_name = random.choice(get_osd_running_nodes())
    osd_node = get_node_objs([osd_node_name])[0]

    machine_name = machine.get_machine_from_node_name(osd_node_name)
    machineset = machine.get_machineset_from_machine_name(machine_name)
    log.info(f"machineset name: {machineset}")

    old_osd_pod_ids = get_node_osd_ids(osd_node_name)
    log.info(f"osd pod ids: {old_osd_pod_ids}")

    pod_names_expected_to_terminate = get_node_pod_names_expected_to_terminate(
        osd_node_name)

    nodes.stop_nodes([osd_node], wait=True)
    log.info(f"Successfully powered off node: {osd_node_name}")

    log.info("Verify the node rook ceph pods go into a Terminating state")
    res = wait_for_pods_to_be_in_statuses([constants.STATUS_TERMINATING],
                                          pod_names_expected_to_terminate)
    assert res, "Not all the node rook ceph pods are in a Terminating state"

    try:
        log.info(f"Wait for the node: {osd_node_name} to power on")
        wait_for_nodes_status([osd_node_name])
        log.info(f"Successfully powered on node {osd_node_name}")
    except ResourceWrongStatusException as e:
        log.info(
            f"The worker node {osd_node_name} didn't start due to the exception {str(e)} "
            f"Probably it has been removed from the cluster. Waiting for a new node to come up..."
        )
        new_wnode = wait_for_new_worker_node_ipi(machineset, old_wnodes)
        osd_node_name = new_wnode.name

    assert wait_for_osd_ids_come_up_on_node(osd_node_name,
                                            old_osd_pod_ids,
                                            timeout=300)
    log.info(
        f"the osd ids {old_osd_pod_ids} Successfully come up on the node {osd_node_name}"
    )
Example #16
0
def check_automated_recovery_from_drain_node(nodes):
    """
    1) Drain one worker node.
    2) Delete the OSD pods associated with the node.
    3) The new OSD pods that come up with the same ids should be in a Pending state.
    4) Schedule the worker node.
    5) The OSD pods associated with the node should go back into a Running state and come up
        on the same node.

    """
    osd_node_name = random.choice(get_osd_running_nodes())
    old_osd_pod_ids = get_node_osd_ids(osd_node_name)
    log.info(f"osd pod ids: {old_osd_pod_ids}")
    node_osd_pods = get_osd_pods_having_ids(old_osd_pod_ids)

    unschedule_nodes([osd_node_name])
    log.info(f"Successfully unschedule the node: {osd_node_name}")

    log.info("Delete the node osd pods")
    delete_pods(node_osd_pods)

    new_osd_pods = wait_for_osd_pods_having_ids(osd_ids=old_osd_pod_ids)
    new_osd_pod_names = [p.name for p in new_osd_pods]

    wnodes = get_worker_nodes()
    if len(wnodes) <= 3:
        expected_pods_status = constants.STATUS_PENDING
    else:
        expected_pods_status = constants.STATUS_RUNNING

    log.info(
        f"Verify the new osd pods {new_osd_pod_names} go into a {expected_pods_status} state"
    )
    res = wait_for_pods_to_be_in_statuses(
        [expected_pods_status],
        new_osd_pod_names,
        raise_pod_not_found_error=True,
    )
    assert res, f"Not all the node osd pods are in a {expected_pods_status} state"

    log.info(f"Wait for the node: {osd_node_name} to be scheduled")
    schedule_nodes([osd_node_name])
    log.info(f"Successfully scheduled the node {osd_node_name}")

    if len(wnodes) <= 3:
        assert wait_for_osd_ids_come_up_on_node(osd_node_name, old_osd_pod_ids)
        log.info(
            f"the osd ids {old_osd_pod_ids} Successfully come up on the node {osd_node_name}"
        )
Example #17
0
        def finalizer():
            worker_nodes = get_worker_nodes()
            # Removing created label on all worker nodes
            remove_label_from_worker_node(worker_nodes, label_key="dc")
            for thread in self.threads:
                thread.join()

            log.info("Get the machine set name from one of the worker node names")
            machine_name = machine.get_machine_from_node_name(worker_nodes[0])
            machineset_name = machine.get_machineset_from_machine_name(machine_name)
            log.info(
                "Verify that the current replica count is equal to the ready replica count"
            )
            machine.change_current_replica_count_to_ready_replica_count(machineset_name)

            ceph_health_check()
Example #18
0
    def start_baremetal_machines(self, baremetal_machine, wait=True):
        """
        Start Baremetal Machines

        Args:
            baremetal_machine (list): BM objects
            wait (bool): Wait for BMs to start

        """
        for node in baremetal_machine:
            if self.mgmt_details[node.name]:
                ipmi_ctx = self.get_ipmi_ctx(
                    host=self.mgmt_details[node.name]["mgmt_console"],
                    user=self.mgmt_details[node.name]["mgmt_username"],
                    password=self.mgmt_details[node.name]["mgmt_password"],
                )
                logger.info(f"Powering On {node.name}")
                ipmi_ctx.chassis_control_power_up()
            if wait:
                if self.mgmt_details[node.name]:
                    ipmi_ctx = self.get_ipmi_ctx(
                        host=self.mgmt_details[node.name]["mgmt_console"],
                        user=self.mgmt_details[node.name]["mgmt_username"],
                        password=self.mgmt_details[node.name]["mgmt_password"],
                    )
                    for status in TimeoutSampler(
                        600, 5, self.get_power_status, ipmi_ctx
                    ):
                        logger.info(
                            f"Waiting for Baremetal Machine {node.name} to power on. "
                            f"Current Baremetal status: {status}"
                        )
                        if status == VM_POWERED_ON:
                            logger.info(
                                f"Baremetal Machine {node.name} reached poweredOn status"
                            )
                            ipmi_ctx.session.close()
                            break

        wait_for_cluster_connectivity(tries=400)
        wait_for_nodes_status(
            node_names=get_master_nodes(), status=constants.NODE_READY, timeout=800
        )
        wait_for_nodes_status(
            node_names=get_worker_nodes(), status=constants.NODE_READY, timeout=800
        )
Example #19
0
    def test_rwo_pvc_assign_pod_node(self, interface, pvc_factory,
                                     teardown_factory):
        """
        Test assign nodeName to a pod using RWO pvc
        """
        worker_nodes_list = get_worker_nodes()

        # Create a RWO PVC
        pvc_obj = pvc_factory(
            interface=interface,
            access_mode=constants.ACCESS_MODE_RWO,
            status=constants.STATUS_BOUND,
        )

        # Create a pod on a particular node
        selected_node = random.choice(worker_nodes_list)
        logger.info(
            f"Creating a pod on node: {selected_node} with pvc {pvc_obj.name}")

        pod_obj = helpers.create_pod(
            interface_type=interface,
            pvc_name=pvc_obj.name,
            namespace=pvc_obj.namespace,
            node_name=selected_node,
            pod_dict_path=constants.NGINX_POD_YAML,
        )
        teardown_factory(pod_obj)

        # Confirm that the pod is running on the selected_node
        helpers.wait_for_resource_state(resource=pod_obj,
                                        state=constants.STATUS_RUNNING,
                                        timeout=120)
        pod_obj.reload()
        assert pod.verify_node_name(
            pod_obj, selected_node
        ), "Pod is running on a different node than the selected node"

        # Run IO
        logger.info(f"Running IO on pod {pod_obj.name}")
        pod_obj.run_io(storage_type="fs",
                       size="512M",
                       runtime=30,
                       invalidate=0)
        pod.get_fio_rw_iops(pod_obj)
Example #20
0
    def test_scale_node_and_capacity(self):
        """
        Test for scaling the cluster to 12 OCS worker nodes
        and 12*3 = 36 OSDs
        """

        expected_worker_count = 12
        osds_per_node = 3

        try:
            # Gather existing deviceset, OSD and node count in setup
            existing_ocs_worker_list = get_worker_nodes()
            existing_deviceset_count = storage_cluster.get_deviceset_count()
            osd_replication_count = storage_cluster.get_osd_replica_count()
            expected_deviceset_count = (expected_worker_count /
                                        osds_per_node) * osd_replication_count

            # Check existing OCS worker node count and add nodes if required
            if len(existing_ocs_worker_list) < expected_worker_count:
                scale_worker_count = expected_worker_count - len(
                    existing_ocs_worker_list)
                if not scale_lib.scale_ocs_node(node_count=scale_worker_count):
                    raise OCSWorkerScaleFailed(
                        "OCS worker nodes scaling Failed")

            # Check existing OSD count and add OSDs if required
            if existing_deviceset_count < expected_deviceset_count:
                additional_deviceset = int(expected_deviceset_count -
                                           existing_deviceset_count)
                if not scale_lib.scale_capacity_with_deviceset(
                        add_deviceset_count=additional_deviceset, timeout=600):
                    raise OSDScaleFailed("Scaling OSDs Failed")

            # Check ceph health status
            utils.ceph_health_check(tries=30)

        except (OCSWorkerScaleFailed, OSDScaleFailed, Exception) as ex:
            TestAddNode.skip_all = True
            logging.warning(
                f"Due to Exception set TestAddNode.skip_all to {TestAddNode.skip_all}"
            )
            logging.error(f"Cluster not in expected state. {ex}")
Example #21
0
    def test_scale_node_and_capacity(self):
        """
        Test for scaling the cluster to 12 OCS worker nodes
        and 12*3 = 36 OSDs
        """

        expected_worker_count = 12
        osds_per_node = 3

        try:
            # Gather existing deviceset, OSD and node count in setup
            existing_ocs_worker_list = get_worker_nodes()
            existing_deviceset_count = storage_cluster.get_deviceset_count()
            osd_replication_count = storage_cluster.get_osd_replica_count()
            expected_deviceset_count = (
                expected_worker_count / osds_per_node
            ) * osd_replication_count

            # Check existing OCS worker node count and add nodes if required
            if len(existing_ocs_worker_list) < expected_worker_count:
                scale_worker_count = expected_worker_count - len(
                    existing_ocs_worker_list
                )
                assert scale_lib.scale_ocs_node(node_count=scale_worker_count)

            # Check existing OSD count and add OSDs if required
            if existing_deviceset_count < expected_deviceset_count:
                add_deviceset_count = (
                    expected_deviceset_count - existing_deviceset_count
                )
                assert scale_lib.scale_capacity_with_deviceset(
                    add_deviceset_count=add_deviceset_count
                )

            # Check ceph health status
            utils.ceph_health_check(tries=30)

        except UnexpectedBehaviour:
            TestAddNode.skip_all = True
            logging.info("Cluster is not in expected state, unexpected behaviour")
            raise
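A worked example of the arithmetic used in both variants of this test, assuming an OSD replication count of 3 and that the total OSD count equals deviceset count times replica, which is consistent with the "12*3 = 36 OSDs" note in the docstring:

expected_worker_count = 12
osds_per_node = 3
osd_replication_count = 3  # assumed typical value

# 12 workers / 3 OSDs per node * 3 replicas = 12 devicesets
expected_deviceset_count = (expected_worker_count / osds_per_node) * osd_replication_count
assert expected_deviceset_count == 12

# assuming each deviceset contributes one OSD per replica: 12 * 3 = 36 OSDs
assert expected_deviceset_count * osd_replication_count == 36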
Example #22
0
def check_automated_recovery_from_terminated_node(nodes):
    """
    1) Terminate node.
    2) The rook ceph pods associated with the node should change to a Terminating state.
    3) A new node should be created automatically.
    4) The new osd pods with the same ids as on the terminated node should start on the new osd node.

    """
    old_wnodes = get_worker_nodes()
    log.info(f"Current worker nodes: {old_wnodes}")

    osd_node_name = random.choice(get_osd_running_nodes())
    osd_node = get_node_objs([osd_node_name])[0]

    machine_name = machine.get_machine_from_node_name(osd_node_name)
    machineset = machine.get_machineset_from_machine_name(machine_name)
    log.info(f"machineset name: {machineset}")

    old_osd_pod_ids = get_node_osd_ids(osd_node_name)
    log.info(f"osd pod ids: {old_osd_pod_ids}")

    pod_names_expected_to_terminate = get_node_pod_names_expected_to_terminate(
        osd_node.name)

    nodes.terminate_nodes([osd_node], wait=True)
    log.info(f"Successfully terminated the node: {osd_node_name}")

    log.info("Verify the node rook ceph pods go into a Terminating state")
    res = wait_for_pods_to_be_in_statuses([constants.STATUS_TERMINATING],
                                          pod_names_expected_to_terminate)
    assert res, "Not all the node rook ceph pods are in a Terminating state"

    new_wnode = wait_for_new_worker_node_ipi(machineset, old_wnodes)

    wait_for_osd_ids_come_up_on_node(new_wnode.name,
                                     old_osd_pod_ids,
                                     timeout=300)
    log.info(
        f"the osd ids {old_osd_pod_ids} Successfully come up on the node {new_wnode.name}"
    )
Example #23
0
    def setup(self, interface_type, reclaim_policy, storageclass_factory):
        """
        Creates storage class with specified interface and reclaim policy.
        Fetches all worker nodes

        Args:
            interface_type (str): The type of the interface
                (e.g. CephBlockPool, CephFileSystem)
            reclaim_policy (str): The type of reclaim policy
                (eg., 'Delete', 'Retain')
            storageclass_factory: A fixture to create new storage class

        Returns:
            tuple: containing the storage class instance and list of worker nodes

        """
        # Create storage class
        sc_obj = storageclass_factory(interface=interface_type,
                                      reclaim_policy=reclaim_policy)
        worker_nodes_list = node.get_worker_nodes()

        return sc_obj, worker_nodes_list
Example #24
0
    def workloads_dir_setup(self, request):
        """
        Setting up the environment for the test

        """
        if config.DEPLOYMENT.get("local_storage"):
            self.worker_node = node.get_worker_nodes()[0]
            self.oc_cmd = OCP(namespace=defaults.ROOK_CLUSTER_NAMESPACE)
            mon_pod_name = self.oc_cmd.exec_oc_debug_cmd(
                node=self.worker_node,
                cmd_list=["ls /var/lib/rook/ | grep mon"],
            )
            mon_pod_id = mon_pod_name.split("-")[1].replace("\n", "")

            mon_pods_info = pod.get_pods_having_label(
                label=f"ceph_daemon_id={mon_pod_id}",
                namespace=defaults.ROOK_CLUSTER_NAMESPACE,
            )
            self.mon_pod = pod.get_pod_obj(
                name=mon_pods_info[0]["metadata"]["name"],
                namespace=defaults.ROOK_CLUSTER_NAMESPACE,
            )
        else:
            self.mon_pod = random.choice(pod.get_mon_pods())
        self.mon_suffix = self.mon_pod.get().get("metadata").get("labels").get(
            "mon")

        self.workloads_dir = f"/var/lib/ceph/mon/ceph-{self.mon_suffix}/workloads"
        log.info(f"Selected mon '{self.mon_pod.name}'")
        self.mon_pod.exec_cmd_on_pod(f"mkdir {self.workloads_dir}")
        self.mon_pod.exec_cmd_on_pod(f"touch {self.workloads_dir}/{TEMP_FILE}")

        def finalizer():
            self.mon_pod.exec_cmd_on_pod(f"rm -rf {self.workloads_dir}")
            time.sleep(SLEEP_TIMEOUT)
            utils.ceph_health_check()

        request.addfinalizer(finalizer)
Example #25
0
def check_and_add_enough_worker(worker_count):
    """
    Function to check if there are enough workers available to scale pods.
    If there are not enough workers, workers will be added on the supported platforms.
    Function also adds the scale label to the respective worker nodes.

    Args:
        worker_count (int): Expected worker count to be present in the setup

    Returns:
        bool: True if there is enough worker count, else raise an exception.

    """
    # Check either to use OCS workers for scaling app pods
    # Further continue to label the worker with scale label else not
    worker_list = node.get_worker_nodes()
    ocs_worker_list = machine.get_labeled_nodes(constants.OPERATOR_NODE_LABEL)
    scale_worker = machine.get_labeled_nodes(constants.SCALE_LABEL)
    if config.RUN.get("use_ocs_worker_for_scale"):
        if not scale_worker:
            helpers.label_worker_node(node_list=worker_list,
                                      label_key="scale-label",
                                      label_value="app-scale")
    else:
        if not scale_worker:
            for node_item in ocs_worker_list:
                worker_list.remove(node_item)
            if worker_list:
                helpers.label_worker_node(
                    node_list=worker_list,
                    label_key="scale-label",
                    label_value="app-scale",
                )
    scale_worker_list = machine.get_labeled_nodes(constants.SCALE_LABEL)
    logging.info(f"Print existing scale worker {scale_worker_list}")

    # Check if there is enough nodes to continue scaling of app pods
    if len(scale_worker_list) >= worker_count:
        logging.info(f"Setup has expected worker count {worker_count} "
                     "to continue scale of pods")
        return True
    else:
        logging.info(
            "There is no enough worker in the setup, will add enough worker "
            "for the automation supported platforms")
        # Add enough worker for AWS
        if (config.ENV_DATA["deployment_type"] == "ipi"
                and config.ENV_DATA["platform"].lower() == "aws"):
            # Create machineset for app worker nodes on each aws zone
            # Each zone will have one app worker node
            ms_name = list()
            labels = [("node-role.kubernetes.io/app", "app-scale")]
            for obj in machine.get_machineset_objs():
                if "app" in obj.name:
                    ms_name.append(obj.name)
            if not ms_name:
                if len(machine.get_machineset_objs()) == 3:
                    for zone in ["a", "b", "c"]:
                        ms_name.append(
                            machine.create_custom_machineset(
                                instance_type="m5.4xlarge",
                                labels=labels,
                                zone=zone,
                            ))
                else:
                    ms_name.append(
                        machine.create_custom_machineset(
                            instance_type="m5.4xlarge",
                            labels=labels,
                            zone="a",
                        ))
                for ms in ms_name:
                    machine.wait_for_new_node_to_be_ready(ms)
            if len(ms_name) == 3:
                exp_count = int(worker_count / 3)
            else:
                exp_count = worker_count
            for name in ms_name:
                machine.add_node(machine_set=name, count=exp_count)
            for ms in ms_name:
                machine.wait_for_new_node_to_be_ready(ms)
            worker_list = node.get_worker_nodes()
            ocs_worker_list = machine.get_labeled_nodes(
                constants.OPERATOR_NODE_LABEL)
            scale_label_worker = machine.get_labeled_nodes(
                constants.SCALE_LABEL)
            ocs_worker_list.extend(scale_label_worker)
            final_list = list(dict.fromkeys(ocs_worker_list))
            for node_item in final_list:
                if node_item in worker_list:
                    worker_list.remove(node_item)
            if worker_list:
                helpers.label_worker_node(
                    node_list=worker_list,
                    label_key="scale-label",
                    label_value="app-scale",
                )
            return True
        elif (config.ENV_DATA["deployment_type"] == "upi"
              and config.ENV_DATA["platform"].lower() == "vsphere"):
            raise UnsupportedPlatformError(
                "Unsupported Platform to add worker")
        elif (config.ENV_DATA["deployment_type"] == "upi"
              and config.ENV_DATA["platform"].lower() == "baremetal"):
            raise UnsupportedPlatformError(
                "Unsupported Platform to add worker")
        elif (config.ENV_DATA["deployment_type"] == "upi"
              and config.ENV_DATA["platform"].lower() == "azure"):
            raise UnsupportedPlatformError(
                "Unsupported Platform to add worker")
        else:
            raise UnavailableResourceException(
                "There is no enough worker nodes to continue app pod scaling")
Example #26
0
    def log_reader_writer_parallel(self):
        """
        Write and read logfile stored on cephfs volume, from all worker nodes of a
        cluster via k8s Deployment.

        Raises:
            NotFoundError: When given volume is not found in given spec
            UnexpectedBehaviour: When an unexpected problem with starting the workload occurred

        """

        # get deployment dict for the reproducer logwriter workload
        with open(constants.LOGWRITER_CEPHFS_REPRODUCER,
                  "r") as deployment_file:
            self.deploy_dict = yaml.safe_load(deployment_file.read())
        # if we are running in disconnected environment, we need to mirror the
        # container image first, and then use the mirror instead of the original
        if config.DEPLOYMENT.get("disconnected"):
            update_container_with_mirrored_image(
                self.deploy_dict["spec"]["template"])
        # we need to match deployment replicas with number of worker nodes
        self.deploy_dict["spec"]["replicas"] = len(get_worker_nodes())
        # drop topology spread constraints related to zones
        topology.drop_topology_constraint(
            self.deploy_dict["spec"]["template"]["spec"], topology.ZONE_LABEL)
        # and link the deployment with the pvc
        try:
            link_spec_volume(
                self.deploy_dict["spec"]["template"]["spec"],
                "logwriter-cephfs-volume",
                self.pvc_dict["metadata"]["name"],
            )
        except (exceptions.NotFoundError, KeyError) as ex:
            logger.warning(
                "Failed to link the deployment with the pvc. We may need to check if the "
                "LOGWRITER_CEPHFS_REPRODUCER still matches the code of this test"
            )
            raise ex

        # prepare k8s yaml file for deployment
        self.workload_file = ObjectConfFile(
            "log_reader_writer_parallel",
            [self.pvc_dict, self.deploy_dict],
            self.project,
            self.tmp_path,
        )
        # deploy the workload, starting the log reader/writer pods
        logger.info(
            "starting log reader/writer workload via Deployment, one pod per worker"
        )
        self.workload_file.create()

        logger.info("waiting for all pods of the workload Deployment to run")
        self.ocp_pod = ocp.OCP(kind="Pod", namespace=self.project.namespace)
        try:
            self.ocp_pod.wait_for_resource(
                resource_count=self.deploy_dict["spec"]["replicas"],
                condition=constants.STATUS_RUNNING,
                error_condition=constants.STATUS_ERROR,
                timeout=300,
                sleep=30,
            )
        except Exception as ex:
            # this is not a problem with feature under test, but with infra,
            # cluster configuration or unrelated bug which must have happened
            # before this test case
            error_msg = "unexpected problem with start of the workload, cluster is either misconfigured or broken"
            logger.exception(error_msg)
            logger.debug(self.workload_file.describe())
            raise exceptions.UnexpectedBehaviour(error_msg) from ex
Example #27
0
    def test_pod_reattach_time_performance(
        self, storageclass_factory, copies, timeout, total_time_limit
    ):
        """
        Test assigning nodeName to a pod using an RWX pvc.
        Each kernel (unzipped) is 892M and 61694 files.
        The test creates samples_num pvcs and pods, writes kernel files multiplied by the number of copies,
        and calculates the average total and csi reattach times and the standard deviation.
        """
        kernel_url = "https://cdn.kernel.org/pub/linux/kernel/v4.x/linux-4.19.5.tar.gz"
        download_path = "tmp"

        samples_num = 7
        if self.dev_mode:
            samples_num = 3

        test_start_time = PASTest.get_time()
        helpers.pull_images(constants.PERF_IMAGE)
        # Download a linux Kernel

        dir_path = os.path.join(os.getcwd(), download_path)
        file_path = os.path.join(dir_path, "file.gz")
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)
        urllib.request.urlretrieve(kernel_url, file_path)

        worker_nodes_list = node.get_worker_nodes()
        assert len(worker_nodes_list) > 1
        node_one = worker_nodes_list[0]
        node_two = worker_nodes_list[1]

        time_measures, csi_time_measures, files_written_list, data_written_list = (
            [],
            [],
            [],
            [],
        )

        self.sc_obj = storageclass_factory(self.interface)
        for sample_index in range(1, samples_num + 1):

            csi_start_time = self.get_time("csi")

            logger.info(f"Start creating PVC number {sample_index}.")
            pvc_obj = helpers.create_pvc(
                sc_name=self.sc_obj.name, size="100Gi", namespace=self.namespace
            )
            helpers.wait_for_resource_state(pvc_obj, constants.STATUS_BOUND)

            # Create a pod on one node
            logger.info(f"Creating Pod with pvc {pvc_obj.name} on node {node_one}")

            pvc_obj.reload()
            self.pvc_list.append(pvc_obj)

            try:
                pod_obj1 = helpers.create_pod(
                    interface_type=self.interface,
                    pvc_name=pvc_obj.name,
                    namespace=pvc_obj.namespace,
                    node_name=node_one,
                    pod_dict_path=constants.PERF_POD_YAML,
                )
            except Exception as e:
                logger.error(
                    f"Pod on PVC {pvc_obj.name} was not created, exception {str(e)}"
                )
                raise PodNotCreated("Pod on PVC was not created.")

            # Confirm that pod is running on the selected_nodes
            logger.info("Checking whether pods are running on the selected nodes")
            helpers.wait_for_resource_state(
                resource=pod_obj1, state=constants.STATUS_RUNNING, timeout=timeout
            )

            pod_name = pod_obj1.name
            pod_path = "/mnt"

            _ocp = OCP(namespace=pvc_obj.namespace)

            rsh_cmd = f"rsync {dir_path} {pod_name}:{pod_path}"
            _ocp.exec_oc_cmd(rsh_cmd)

            rsh_cmd = (
                f"exec {pod_name} -- tar xvf {pod_path}/tmp/file.gz -C {pod_path}/tmp"
            )
            _ocp.exec_oc_cmd(rsh_cmd)

            for x in range(copies):
                rsh_cmd = f"exec {pod_name} -- mkdir -p {pod_path}/folder{x}"
                _ocp.exec_oc_cmd(rsh_cmd)
                rsh_cmd = (
                    f"exec {pod_name} -- cp -r {pod_path}/tmp {pod_path}/folder{x}"
                )
                _ocp.exec_oc_cmd(rsh_cmd)
                rsh_cmd = f"exec {pod_name} -- sync"
                _ocp.exec_oc_cmd(rsh_cmd)

            logger.info("Getting the amount of data written to the PVC")
            rsh_cmd = f"exec {pod_name} -- df -h {pod_path}"
            data_written_str = _ocp.exec_oc_cmd(rsh_cmd).split()[-4]
            logger.info(f"The amount of written data is {data_written_str}")
            data_written = float(data_written_str[:-1])

            rsh_cmd = f"exec {pod_name} -- find {pod_path} -type f"
            files_written = len(_ocp.exec_oc_cmd(rsh_cmd).split())
            logger.info(
                f"For {self.interface} - The number of files written to the pod is {files_written}"
            )
            files_written_list.append(files_written)
            data_written_list.append(data_written)

            logger.info("Deleting the pod")
            rsh_cmd = f"delete pod {pod_name}"
            _ocp.exec_oc_cmd(rsh_cmd)

            logger.info(f"Creating Pod with pvc {pvc_obj.name} on node {node_two}")

            try:
                pod_obj2 = helpers.create_pod(
                    interface_type=self.interface,
                    pvc_name=pvc_obj.name,
                    namespace=pvc_obj.namespace,
                    node_name=node_two,
                    pod_dict_path=constants.PERF_POD_YAML,
                )
            except Exception as e:
                logger.error(
                    f"Pod on PVC {pvc_obj.name} was not created, exception {str(e)}"
                )
                raise PodNotCreated("Pod on PVC was not created.")

            start_time = time.time()

            pod_name = pod_obj2.name
            helpers.wait_for_resource_state(
                resource=pod_obj2, state=constants.STATUS_RUNNING, timeout=timeout
            )
            end_time = time.time()
            total_time = end_time - start_time
            if total_time > total_time_limit:
                logger.error(
                    f"Pod creation time is {total_time} and greater than {total_time_limit} seconds"
                )
                raise ex.PerformanceException(
                    f"Pod creation time is {total_time} and greater than {total_time_limit} seconds"
                )

            csi_time = performance_lib.pod_attach_csi_time(
                self.interface, pvc_obj.backed_pv, csi_start_time, pvc_obj.namespace
            )[0]
            csi_time_measures.append(csi_time)
            logger.info(
                f"PVC #{pvc_obj.name} pod {pod_name} creation time took {total_time} seconds, "
                f"csi time is {csi_time} seconds"
            )
            time_measures.append(total_time)

            logger.info("Deleting the pod")
            rsh_cmd = f"delete pod {pod_name}"
            _ocp.exec_oc_cmd(rsh_cmd)
            # teardown_factory(pod_obj2)

        average = statistics.mean(time_measures)
        logger.info(
            f"The average time of {self.interface} pod creation on {samples_num} PVCs is {average} seconds"
        )

        st_deviation = statistics.stdev(time_measures)
        logger.info(
            f"The standard deviation of {self.interface} pod creation time on {samples_num} PVCs is {st_deviation}"
        )

        csi_average = statistics.mean(csi_time_measures)
        logger.info(
            f"The average csi time of {self.interface} pod creation on {samples_num} PVCs is {csi_average} seconds"
        )

        csi_st_deviation = statistics.stdev(csi_time_measures)
        logger.info(
            f"The standard deviation of {self.interface} csi pod creation time on {samples_num} PVCs "
            f"is {csi_st_deviation}"
        )

        files_written_average = statistics.mean(files_written_list)
        data_written_average = statistics.mean(data_written_list)

        os.remove(file_path)
        os.rmdir(dir_path)

        # Produce ES report

        # Collecting environment information
        self.get_env_info()

        # Initialize the results doc file.
        full_results = self.init_full_results(
            ResultsAnalyse(
                self.uuid,
                self.crd_data,
                self.full_log_path,
                "pod_reattach_time_fullres",
            )
        )

        full_results.add_key("storageclass", self.sc)
        full_results.add_key("pod_reattach_time", time_measures)
        full_results.add_key("copies_number", copies)
        full_results.add_key("files_number_average", files_written_average)
        full_results.add_key("data_average", data_written_average)
        full_results.add_key("pod_reattach_time_average", average)
        full_results.add_key("pod_reattach_standard_deviation", st_deviation)
        full_results.add_key("pod_csi_reattach_time_average", csi_average)
        full_results.add_key("pod_csi_reattach_standard_deviation", csi_st_deviation)

        test_end_time = PASTest.get_time()

        # Add the test time to the ES report
        full_results.add_key(
            "test_time", {"start": test_start_time, "end": test_end_time}
        )

        # Write the test results into the ES server
        if full_results.es_write():
            res_link = full_results.results_link()
            logger.info(f"The Result can be found at : {res_link}")

            # Create a text file with the results of all subtests (4 - according to the parameters)
            self.results_path = get_full_test_logs_path(
                cname=self, fname="test_pod_reattach_time_performance"
            )
            self.write_result_to_file(res_link)
Example #28
0
    def test_pvc_disruptive(
        self,
        interface,
        operation_to_disrupt,
        resource_to_delete,
        multi_pvc_factory,
        pod_factory,
    ):
        """
        Base function for PVC disruptive tests.
        Deletion of 'resource_to_delete' will be introduced while
        'operation_to_disrupt' is progressing.
        """
        pod_functions = {
            "mds": partial(pod.get_mds_pods),
            "mon": partial(pod.get_mon_pods),
            "mgr": partial(pod.get_mgr_pods),
            "osd": partial(pod.get_osd_pods),
            "rbdplugin": partial(pod.get_plugin_pods, interface=interface),
            "cephfsplugin": partial(pod.get_plugin_pods, interface=interface),
            "cephfsplugin_provisioner": partial(pod.get_cephfsplugin_provisioner_pods),
            "rbdplugin_provisioner": partial(pod.get_rbdfsplugin_provisioner_pods),
            "operator": partial(pod.get_operator_pods),
        }

        # Get number of pods of type 'resource_to_delete'
        num_of_resource_to_delete = len(pod_functions[resource_to_delete]())

        namespace = self.proj_obj.namespace

        # Fetch the number of Pods and PVCs
        initial_num_of_pods = len(pod.get_all_pods(namespace=namespace))
        initial_num_of_pvc = len(get_all_pvcs(namespace=namespace)["items"])

        DISRUPTION_OPS.set_resource(resource=resource_to_delete)

        access_modes = [constants.ACCESS_MODE_RWO]
        if interface == constants.CEPHFILESYSTEM:
            access_modes.append(constants.ACCESS_MODE_RWX)
            num_of_pvc = 8
            access_mode_dist_ratio = [6, 2]

        # Modify access_modes list to create rbd `block` type volume with
        # RWX access mode. RWX is not supported in non-block type rbd
        if interface == constants.CEPHBLOCKPOOL:
            access_modes.extend([
                f"{constants.ACCESS_MODE_RWO}-Block",
                f"{constants.ACCESS_MODE_RWX}-Block",
            ])
            num_of_pvc = 9
            access_mode_dist_ratio = [4, 3, 2]

        executor = ThreadPoolExecutor(max_workers=(2 * num_of_pvc))

        # Start creation of PVCs
        bulk_pvc_create = executor.submit(
            multi_pvc_factory,
            interface=interface,
            project=self.proj_obj,
            size=5,
            access_modes=access_modes,
            access_modes_selection="distribute_random",
            access_mode_dist_ratio=access_mode_dist_ratio,
            status=constants.STATUS_BOUND,
            num_of_pvc=num_of_pvc,
            wait_each=False,
            timeout=90,
        )

        if operation_to_disrupt == "create_pvc":
            # Ensure PVCs are being created before deleting the resource
            ret = helpers.wait_for_resource_count_change(
                get_all_pvcs, initial_num_of_pvc, namespace, "increase")
            assert ret, "Wait timeout: PVCs are not being created."
            logger.info("PVCs creation has started.")
            DISRUPTION_OPS.delete_resource()

        pvc_objs = bulk_pvc_create.result()

        # Confirm that PVCs are Bound
        for pvc_obj in pvc_objs:
            helpers.wait_for_resource_state(resource=pvc_obj,
                                            state=constants.STATUS_BOUND,
                                            timeout=120)
            pvc_obj.reload()
        logger.info("Verified: PVCs are Bound.")

        # Start creating pods
        bulk_pod_create = executor.submit(
            helpers.create_pods,
            pvc_objs,
            pod_factory,
            interface,
            2,
            nodes=node.get_worker_nodes(),
        )

        if operation_to_disrupt == "create_pod":
            # Ensure that pods are being created before deleting the resource
            ret = helpers.wait_for_resource_count_change(
                pod.get_all_pods, initial_num_of_pods, namespace, "increase")
            assert ret, "Wait timeout: Pods are not being created."
            logger.info("Pods creation has started.")
            DISRUPTION_OPS.delete_resource()

        pod_objs = bulk_pod_create.result()

        # Verify pods are Running
        for pod_obj in pod_objs:
            helpers.wait_for_resource_state(resource=pod_obj,
                                            state=constants.STATUS_RUNNING,
                                            timeout=90)
            pod_obj.reload()
        logger.info("Verified: All pods are Running.")

        # Do setup on pods for running IO
        logger.info("Setting up pods for running IO.")
        for pod_obj in pod_objs:
            pvc_info = pod_obj.pvc.get()
            if pvc_info["spec"]["volumeMode"] == "Block":
                storage_type = "block"
            else:
                storage_type = "fs"
            executor.submit(pod_obj.workload_setup, storage_type=storage_type)

        # Wait for setup on pods to complete
        for pod_obj in pod_objs:
            logger.info(
                f"Waiting for IO setup to complete on pod {pod_obj.name}")
            for sample in TimeoutSampler(360, 2, getattr, pod_obj,
                                         "wl_setup_done"):
                if sample:
                    logger.info(f"Setup for running IO is completed on pod "
                                f"{pod_obj.name}.")
                    break
        logger.info("Setup for running IO is completed on all pods.")

        # Start IO on each pod
        for pod_obj in pod_objs:
            pvc_info = pod_obj.pvc.get()
            if pvc_info["spec"]["volumeMode"] == "Block":
                storage_type = "block"
            else:
                storage_type = "fs"
            pod_obj.run_io(
                storage_type=storage_type,
                size="1G",
                runtime=10,
                fio_filename=f"{pod_obj.name}_io_file1",
            )
        logger.info("FIO started on all pods.")

        if operation_to_disrupt == "run_io":
            DISRUPTION_OPS.delete_resource()

        logger.info("Fetching FIO results.")
        for pod_obj in pod_objs:
            fio_result = pod_obj.get_fio_results()
            err_count = fio_result.get("jobs")[0].get("error")
            assert (
                err_count == 0
            ), f"FIO error on pod {pod_obj.name}. FIO result: {fio_result}"
        logger.info("Verified FIO result on pods.")

        # Delete pods
        for pod_obj in pod_objs:
            pod_obj.delete(wait=True)
        for pod_obj in pod_objs:
            pod_obj.ocp.wait_for_delete(pod_obj.name)

        # Verify that PVCs are reusable by creating new pods
        pod_objs = helpers.create_pods(
            pvc_objs,
            pod_factory,
            interface,
            2,
            nodes=node.get_worker_nodes(),
        )

        # Verify new pods are Running
        for pod_obj in pod_objs:
            helpers.wait_for_resource_state(resource=pod_obj,
                                            state=constants.STATUS_RUNNING,
                                            timeout=90)
            pod_obj.reload()
        logger.info("Verified: All new pods are Running.")

        # Run IO on each of the new pods
        for pod_obj in pod_objs:
            pvc_info = pod_obj.pvc.get()
            if pvc_info["spec"]["volumeMode"] == "Block":
                storage_type = "block"
            else:
                storage_type = "fs"
            pod_obj.run_io(
                storage_type=storage_type,
                size="1G",
                runtime=10,
                fio_filename=f"{pod_obj.name}_io_file2",
            )

        logger.info("Fetching FIO results from new pods")
        for pod_obj in pod_objs:
            fio_result = pod_obj.get_fio_results()
            err_count = fio_result.get("jobs")[0].get("error")
            assert (
                err_count == 0
            ), f"FIO error on pod {pod_obj.name}. FIO result: {fio_result}"
        logger.info("Verified FIO result on new pods.")

        # Verify number of pods of type 'resource_to_delete'
        final_num_resource_to_delete = len(pod_functions[resource_to_delete]())
        assert final_num_resource_to_delete == num_of_resource_to_delete, (
            f"Total number of {resource_to_delete} pods is not matching with "
            f"initial value. Total number of pods before deleting a pod: "
            f"{num_of_resource_to_delete}. Total number of pods present now: "
            f"{final_num_resource_to_delete}")

        # Check ceph status
        ceph_health_check(namespace=config.ENV_DATA["cluster_namespace"])
        logger.info("Ceph cluster health is OK")
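
For reference, a disruptive test like the one above is typically driven by pytest parametrization over the interface, the operation to disrupt, and the resource to delete. The sketch below is illustrative only; the value lists and the test name are assumptions, not taken from the suite above.

# A minimal, illustrative sketch of pytest parametrization for a disruptive
# matrix; the value lists and test name are assumptions, not part of the
# original suite.
import pytest
from itertools import product

INTERFACES = ["CephBlockPool", "CephFileSystem"]
OPERATIONS_TO_DISRUPT = ["create_pvc", "create_pod", "run_io"]
RESOURCES_TO_DELETE = ["mgr", "mon", "osd"]


@pytest.mark.parametrize(
    "interface,operation_to_disrupt,resource_to_delete",
    list(product(INTERFACES, OPERATIONS_TO_DISRUPT, RESOURCES_TO_DELETE)),
)
def test_pvc_disruptive_matrix(interface, operation_to_disrupt, resource_to_delete):
    # Each combination would drive one run of the base function shown above.
    pass
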
    def setup_base(self, interface, multi_pvc_factory, pod_factory):
        """
        Create PVCs and pods
        """
        access_modes = [constants.ACCESS_MODE_RWO]
        if interface == constants.CEPHFILESYSTEM:
            access_modes.append(constants.ACCESS_MODE_RWX)
            self.num_of_pvcs = 10
            access_mode_dist_ratio = [8, 2]

        # Modify access_modes list to create rbd `block` type volume with
        # RWX access mode. RWX is not supported in filesystem type rbd
        if interface == constants.CEPHBLOCKPOOL:
            access_modes.extend(
                [
                    f"{constants.ACCESS_MODE_RWO}-Block",
                    f"{constants.ACCESS_MODE_RWX}-Block",
                ]
            )
            self.num_of_pvcs = 12
            access_mode_dist_ratio = [5, 5, 2]

        pvc_objs = multi_pvc_factory(
            interface=interface,
            project=None,
            storageclass=None,
            size=self.pvc_size,
            access_modes=access_modes,
            access_modes_selection="distribute_random",
            access_mode_dist_ratio=access_mode_dist_ratio,
            status=constants.STATUS_BOUND,
            num_of_pvc=self.num_of_pvcs,
            wait_each=False,
        )

        pod_objs = []
        rwx_pod_objs = []

        nodes_iter = cycle(node.get_worker_nodes())

        # Create one pod using each RWO PVC and two pods using each RWX PVC
        for pvc_obj in pvc_objs:
            pvc_info = pvc_obj.get()
            if pvc_info["spec"]["volumeMode"] == "Block":
                pod_dict = constants.CSI_RBD_RAW_BLOCK_POD_YAML
                raw_block_pv = True
            else:
                raw_block_pv = False
                pod_dict = ""
            if pvc_obj.access_mode == constants.ACCESS_MODE_RWX:
                pod_obj = pod_factory(
                    interface=interface,
                    pvc=pvc_obj,
                    status="",
                    node_name=next(nodes_iter),
                    pod_dict_path=pod_dict,
                    raw_block_pv=raw_block_pv,
                )
                rwx_pod_objs.append(pod_obj)
            pod_obj = pod_factory(
                interface=interface,
                pvc=pvc_obj,
                status="",
                node_name=next(nodes_iter),
                pod_dict_path=pod_dict,
                raw_block_pv=raw_block_pv,
            )
            pod_objs.append(pod_obj)

        # Wait for pods to be in Running state
        for pod_obj in pod_objs + rwx_pod_objs:
            wait_for_resource_state(resource=pod_obj, state=constants.STATUS_RUNNING)
            pod_obj.reload()
        log.info(f"Created {len(pod_objs) + len(rwx_pod_objs)} pods.")

        return pvc_objs, pod_objs, rwx_pod_objs
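
The node_name=next(nodes_iter) pattern above spreads pods round-robin across the worker nodes via itertools.cycle. A stand-alone sketch of that distribution (node names are made up for illustration):

from itertools import cycle

worker_nodes = ["worker-0", "worker-1", "worker-2"]
nodes_iter = cycle(worker_nodes)

# Assign seven hypothetical PVC-backed pods to nodes in round-robin order
assignment = {f"pod-{i}": next(nodes_iter) for i in range(7)}
print(assignment)
# {'pod-0': 'worker-0', 'pod-1': 'worker-1', 'pod-2': 'worker-2',
#  'pod-3': 'worker-0', ...}
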
    def test_non_ocs_taint_and_tolerations(self):
        """
        Test runs the following steps:
        1. Taint OCS nodes with a non-OCS taint
        2. Set tolerations on the storagecluster, subscription, configmap and ocsinit
        3. Respin all OCS pods and check that they run on OCS nodes with the tolerations
        4. Add capacity

        """

        # Taint all worker nodes with a non-OCS taint
        ocs_nodes = get_worker_nodes()
        taint_nodes(nodes=ocs_nodes, taint_label="xyz=true:NoSchedule")

        # Add tolerations to the storagecluster
        storagecluster_obj = ocp.OCP(
            resource_name=constants.DEFAULT_CLUSTERNAME,
            namespace=defaults.ROOK_CLUSTER_NAMESPACE,
            kind=constants.STORAGECLUSTER,
        )
        tolerations = (
            '{"tolerations": [{"effect": "NoSchedule", "key": "xyz",'
            '"operator": "Equal", "value": "true"}, '
            '{"effect": "NoSchedule", "key": "node.ocs.openshift.io/storage", '
            '"operator": "Equal", "value": "true"}]}')
        param = (
            f'{{"spec": {{"placement": {{"all": {tolerations}, "mds": {tolerations}, '
            f'"noobaa-core": {tolerations}, "rgw": {tolerations}}}}}}}')
        storagecluster_obj.patch(params=param, format_type="merge")

        # Add tolerations to the subscription
        sub_list = ocp.get_all_resource_names_of_a_kind(
            kind=constants.SUBSCRIPTION)
        param = (
            '{"spec": {"config":  {"tolerations": '
            '[{"effect": "NoSchedule", "key": "xyz", "operator": "Equal", '
            '"value": "true"}]}}}')
        for sub in sub_list:
            sub_obj = ocp.OCP(
                resource_name=sub,
                namespace=defaults.ROOK_CLUSTER_NAMESPACE,
                kind=constants.SUBSCRIPTION,
            )
            sub_obj.patch(params=param, format_type="merge")

        # Add tolerations to the ocsinitializations.ocs.openshift.io
        param = (
            '{"spec":  {"tolerations": '
            '[{"effect": "NoSchedule", "key": "xyz", "operator": "Equal", '
            '"value": "true"}]}}')

        ocsini_obj = ocp.OCP(
            resource_name=constants.OCSINIT,
            namespace=defaults.ROOK_CLUSTER_NAMESPACE,
            kind=constants.OCSINITIALIZATION,
        )
        ocsini_obj.patch(params=param, format_type="merge")

        # Add tolerations to the configmap rook-ceph-operator-config
        configmap_obj = ocp.OCP(
            kind=constants.CONFIGMAP,
            namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
            resource_name=constants.ROOK_OPERATOR_CONFIGMAP,
        )
        toleration = configmap_obj.get().get("data").get(
            "CSI_PLUGIN_TOLERATIONS")
        toleration += (
            '\n- key: xyz\n  operator: Equal\n  value: "true"\n  effect: NoSchedule'
        )
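        # The tolerations live as a YAML string inside the configmap data, so
        # quotes and newlines have to be escaped before the string can be
        # embedded as a value in the JSON patch built below.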
        toleration = toleration.replace('"', '\\"').replace("\n", "\\n")
        param_cmd = (
            f'[{{"op": "replace", "path": "/data/CSI_PLUGIN_TOLERATIONS", "value": "{toleration}" }}, '
            f'{{"op": "replace", "path": "/data/CSI_PROVISIONER_TOLERATIONS", "value": "{toleration}" }}]'
        )
        configmap_obj.patch(params=param_cmd, format_type="json")

        # After the edit, a few pod respins are expected
        assert wait_for_pods_to_be_running(timeout=600, sleep=15)

        # Respin all pods and check that they are still running
        pod_list = get_all_pods(namespace=defaults.ROOK_CLUSTER_NAMESPACE)
        for pod_obj in pod_list:
            pod_obj.delete(wait=False)

        assert wait_for_pods_to_be_running(timeout=600, sleep=15)
        self.sanity_helpers.health_check()

        # Add capacity to check that the new OSDs have the toleration
        osd_size = storage_cluster.get_osd_size()
        count = storage_cluster.add_capacity(osd_size)
        pod = ocp.OCP(kind=constants.POD,
                      namespace=config.ENV_DATA["cluster_namespace"])
        if is_flexible_scaling_enabled():
            replica_count = 1
        else:
            replica_count = 3
        assert pod.wait_for_resource(
            timeout=300,
            condition=constants.STATUS_RUNNING,
            selector=constants.OSD_APP_LABEL,
            resource_count=count * replica_count,
        ), "New OSDs failed to reach running state"
        check_ceph_health_after_add_capacity(ceph_rebalance_timeout=2500)
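
Tests that taint worker nodes like this are normally paired with a teardown that removes the taint again. A minimal sketch of such a cleanup, assuming the `oc` CLI is available on PATH (the helper name is illustrative, not part of the test above):

import subprocess


def untaint_worker_nodes(node_names, taint_key="xyz"):
    """Remove the non-OCS taint added during the test from the given nodes."""
    for name in node_names:
        # The trailing '-' after the key/effect removes the taint
        # (standard `oc adm taint` / `kubectl taint` syntax).
        subprocess.run(
            ["oc", "adm", "taint", "nodes", name, f"{taint_key}:NoSchedule-"],
            check=True,
        )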