def test_rolling_shutdown_and_recovery(self, nodes, pvc_factory,
                                           pod_factory, bucket_factory,
                                           rgw_bucket_factory):
        """
        Test rolling shutdown and recovery of OCS worker nodes

        """
        SECONDS_TO_WAIT = 180
        # Get OCS worker node objects
        ocs_node_objs = get_ocs_nodes()

        # Start rolling shutdown and recovery of OCS worker nodes
        log.info("ShutDown OCS worker")
        for node_obj in ocs_node_objs:
            nodes.stop_nodes(nodes=[node_obj])
            log.info(
                f"Keeping node in stopped state for {SECONDS_TO_WAIT} mins")
            time.sleep(SECONDS_TO_WAIT)
            nodes.start_nodes(nodes=[node_obj])
            self.sanity_helpers.health_check(cluster_check=False, tries=60)
            log.info("Checking storage pods status")
            # Validate storage pods are running
            wait_for_pods_to_be_running(timeout=600)

        # Check basic cluster functionality by creating some resources
        self.sanity_helpers.create_resources(pvc_factory, pod_factory,
                                             bucket_factory,
                                             rgw_bucket_factory)
Example #2
    def restart_ocs_operator_node(self):
        """
        Restart node that runs OCS operator pod
        """

        pod_obj = pod.get_ocs_operator_pod()
        node_obj = pod.get_pod_node(pod_obj)

        self.nodes.restart_nodes([node_obj])

        wait_for_nodes_status()

        pod.wait_for_pods_to_be_running(
            namespace=constants.OPENSHIFT_STORAGE_NAMESPACE, pod_names=[pod_obj.name]
        )
Example #3
    def is_cluster_healthy(self):
        """
        Wrapper function for cluster health check

        Returns:
            bool: True if ALL checks passed, False otherwise
        """
        return self.ceph_not_health_error() and pod.wait_for_pods_to_be_running()
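Since is_cluster_healthy() returns a plain boolean, it pairs naturally with the TimeoutSampler polling helper used in Examples #11 and #16 below. A minimal usage sketch, not taken from the project; the 600-second timeout and 30-second sleep are assumptions:

    # Poll the health wrapper until it reports a healthy cluster or the timeout expires
    sample = TimeoutSampler(timeout=600, sleep=30, func=self.is_cluster_healthy)
    assert sample.wait_for_func_status(result=True), (
        "Cluster did not become healthy within 600 seconds"
    )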
Example #4
        def finalizer():
            """
            Removes huge pages on worker nodes and verifies all pods are up

            """
            disable_huge_pages()

            wait_for_nodes_status(status=constants.NODE_READY, timeout=600)

            nodes = get_nodes()
            for node in nodes:
                assert (node.get()["status"]["allocatable"]["hugepages-2Mi"] ==
                        "0"), f"Huge pages is not applied on {node.name}"

            log.info("Wait for all pods to be in running state")
            wait_for_pods_to_be_running(timeout=600)
            sanity_helpers.ceph_health_check(tries=120)
Example #5
def node_replacement_verification_steps_user_side(old_node_name, new_node_name,
                                                  new_osd_node_name,
                                                  old_osd_id):
    """
    Check the verification steps that the user should perform after the process
    of node replacement as described in the docs

    Args:
        old_node_name (str): The name of the old node that has been deleted
        new_node_name (str): The name of the new node that has been created
        new_osd_node_name (str): The name of the new node that has been added to osd nodes
        old_osd_id (str): The old osd id

    Returns:
        bool: True if all the verification steps passed. False otherwise

    """
    ocs_nodes = get_ocs_nodes()
    ocs_node_names = [n.name for n in ocs_nodes]
    if new_node_name not in ocs_node_names:
        log.warning("The new node not found in ocs nodes")
        return False
    if old_node_name in ocs_node_names:
        log.warning("The old node name found in ocs nodes")
        return False

    csi_cephfsplugin_pods = pod.get_plugin_pods(
        interface=constants.CEPHFILESYSTEM)
    csi_rbdplugin_pods = pod.get_plugin_pods(interface=constants.CEPHBLOCKPOOL)
    csi_plugin_pods = csi_cephfsplugin_pods + csi_rbdplugin_pods
    if not all(
        [p.status() == constants.STATUS_RUNNING for p in csi_plugin_pods]):
        log.warning("Not all csi rbd and cephfs plugin pods in status running")
        return False

    # It can take some time until all the ocs pods are up and running
    # after the process of node replacement
    if not pod.wait_for_pods_to_be_running():
        log.warning("Not all the pods in running state")
        return False

    # Check the list before indexing it to avoid an IndexError when no osd pod
    # is found on the new node
    new_osd_pods = get_node_pods(new_osd_node_name,
                                 pods_to_search=pod.get_osd_pods())
    if not new_osd_pods:
        log.warning("Didn't find any osd pods running on the new node")
        return False
    new_osd_pod = new_osd_pods[0]

    new_osd_id = pod.get_osd_pod_id(new_osd_pod)
    if old_osd_id != new_osd_id:
        log.warning(
            f"The osd pod, that associated to the new node, has the id {new_osd_id} "
            f"instead of the expected osd id {old_osd_id}")
        return False

    log.info("Verification steps from the user side finish successfully")
    return True
Example #6
    def test_hugepages_post_odf_deployment(
        self,
        pvc_factory,
        pod_factory,
        bucket_factory,
        rgw_bucket_factory,
        node_restart_teardown,
    ):
        """
        Test to verify that after enabling huge pages the nodes come up with
        higher page size and all odf cluster pods come back up.

        """
        # Applies huge pages on the cluster nodes
        enable_huge_pages()

        log.info("Wait for all worker node to be READY state")
        wait_for_nodes_status(status=constants.NODE_READY, timeout=600)

        nodes = get_nodes()
        for node in nodes:
            assert (node.get()["status"]["allocatable"]["hugepages-2Mi"] ==
                    "64Mi"), f"Huge pages is not applied on {node.name}"

        log.info("Wait for all storage cluster pods to be in running state")
        wait_for_pods_to_be_running(timeout=600)

        # Creating Resources
        log.info("Creating Resources using sanity helpers")
        self.sanity_helpers.create_resources(pvc_factory, pod_factory,
                                             bucket_factory,
                                             rgw_bucket_factory, False)

        # Deleting Resources
        log.info("Deleting the resources created")
        self.sanity_helpers.delete_resources()

        # Verify everything running fine
        log.info(
            "Verifying All resources are Running and matches expected result")
        self.sanity_helpers.health_check(tries=120)
Example #7
File: node.py Project: petr-balogh/ocs-ci
def node_replacement_verification_steps_ceph_side(
    old_node_name, new_node_name, new_osd_node_name
):
    """
    Check the verification steps from the Ceph side, after the process
    of node replacement as described in the docs

    Args:
        old_node_name (str): The name of the old node that has been deleted
        new_node_name (str): The name of the new node that has been created
        new_osd_node_name (str): The name of the new node that has been added to osd nodes

    Returns:
        bool: True if all the verification steps passed. False otherwise

    """
    if old_node_name == new_node_name:
        log.warning("Hostname didn't change")
        return False

    wait_for_nodes_status([new_node_name, new_osd_node_name])
    # It can take some time until all the ocs pods are up and running
    # after the process of node replacement
    if not pod.wait_for_pods_to_be_running():
        log.warning("Not all the pods in running state")
        return False

    ct_pod = pod.get_ceph_tools_pod()
    ceph_osd_status = ct_pod.exec_ceph_cmd(ceph_cmd="ceph osd status")
    if new_osd_node_name not in ceph_osd_status:
        log.warning("new osd node name not found in 'ceph osd status' output")
        return False
    if old_node_name in ceph_osd_status:
        log.warning("old node name found in 'ceph osd status' output")
        return False

    osd_node_names = get_osd_running_nodes()
    if new_osd_node_name not in osd_node_names:
        log.warning("the new osd hostname not found in osd node names")
        return False
    if old_node_name in osd_node_names:
        log.warning("the old hostname found in osd node names")
        return False

    from ocs_ci.ocs.cluster import check_ceph_osd_tree_after_node_replacement

    if not check_ceph_osd_tree_after_node_replacement():
        return False

    log.info("Verification steps from the ceph side finish successfully")
    return True
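Examples #5 and #7 are the two halves of the node-replacement verification, so a caller would typically run them back to back. A hypothetical usage sketch with placeholder argument values, not taken from the project:

    # Run both halves of the node-replacement verification and fail if either reports a problem
    assert node_replacement_verification_steps_user_side(
        old_node_name, new_node_name, new_osd_node_name, old_osd_id
    ), "User-side node replacement verification failed"
    assert node_replacement_verification_steps_ceph_side(
        old_node_name, new_node_name, new_osd_node_name
    ), "Ceph-side node replacement verification failed"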
Example #8
    def test_toleration(self):
        """
        1. Check if nodes are tainted
        2. Taint ocs nodes if not tainted
        3. Check for tolerations on all pod
        4. Respin all ocs pods and check if it runs on ocs nodes
        5. Untaint nodes

        """
        # taint nodes if not already tainted
        taint_ocs_nodes()

        # Check tolerations on pods under openshift-storage
        check_toleration_on_pods()

        # Respin all pods and check if they are still running
        pod_list = get_all_pods(namespace=defaults.ROOK_CLUSTER_NAMESPACE)
        for pod in pod_list:
            pod.delete(wait=False)
        assert wait_for_pods_to_be_running(timeout=300)
Example #9
    def test_check_pods_status_after_node_failure(self, nodes,
                                                  node_restart_teardown):
        """
        Check pod statuses after a node failure event.
        All the rook ceph pods should be in "Running" or "Completed"
        state after a node failure event.

        """
        ocs_nodes = get_ocs_nodes()
        if not ocs_nodes:
            pytest.skip("We don't have ocs nodes in the cluster")

        ocs_node = random.choice(ocs_nodes)
        node_name = ocs_node.name
        log.info(f"Selected node is '{node_name}'")
        # Save the rook ceph pods, the osd ids, and the mon ids before shutting down the node
        rook_ceph_pod_names_not_in_node = get_rook_ceph_pod_names_not_in_node(
            node_name)
        node_osd_ids = get_node_osd_ids(node_name)
        node_mon_ids = get_node_mon_ids(node_name)

        log.info(f"Shutting down node '{node_name}'")
        nodes.stop_nodes([ocs_node])
        wait_for_nodes_status(node_names=[node_name],
                              status=constants.NODE_NOT_READY)
        log.info(
            f"The node '{node_name}' reached '{constants.NODE_NOT_READY}' status"
        )

        log.info("Wait for a change in the rook ceph pod statuses...")
        timeout = 480
        is_rook_ceph_pods_status_changed = wait_for_change_in_rook_ceph_pods(
            node_name, timeout=timeout)
        assert (
            is_rook_ceph_pods_status_changed
        ), f"Rook Ceph pods status didn't change after {timeout} seconds"

        log.info(
            "Check the rook ceph pods are in 'Running' or 'Completed' state")
        timeout = 480
        are_pods_running = wait_for_pods_to_be_running(
            pod_names=rook_ceph_pod_names_not_in_node,
            timeout=timeout,
            sleep=30)
        assert are_pods_running, f"The pods are not 'Running' after {timeout} seconds"

        # Get the rook ceph pod names, excluding the osd and mon pods that have the old node ids
        osd_pods = get_osd_pods()
        new_node_osd_id_names_set = {
            p.name
            for p in osd_pods if get_osd_pod_id(p) in node_osd_ids
        }
        mon_pods = get_mon_pods()
        new_node_mon_id_names_set = {
            p.name
            for p in mon_pods if get_mon_pod_id(p) in node_mon_ids
        }

        new_node_osd_mon_id_names_set = new_node_osd_id_names_set.union(
            new_node_mon_id_names_set)
        rook_ceph_pod_names_set = set(get_rook_ceph_pod_names())
        new_rook_ceph_pod_names = list(rook_ceph_pod_names_set -
                                       new_node_osd_mon_id_names_set)

        log.info(
            "Verify that the new rook ceph pods are in 'Running' or 'Completed' state"
        )
        timeout = 300
        are_new_pods_running = wait_for_pods_to_be_running(
            pod_names=new_rook_ceph_pod_names, timeout=timeout, sleep=20)
        assert (are_new_pods_running
                ), f"The new pods are not 'Running' after {timeout} seconds"

        log.info("All the pods are in 'Running' or 'Completed' state")
        log.info(f"Starting the node '{node_name}' again...")
        nodes.start_nodes(nodes=[ocs_node])
        wait_for_nodes_status(node_names=[node_name])

        log.info(
            "Waiting for all the pods to be running and cluster health to be OK..."
        )
        wait_for_pods_to_be_running(timeout=600)
        self.sanity_helpers.health_check(tries=40)
Example #10
    def test_all_worker_nodes_short_network_failure(
        self, nodes, setup, node_restart_teardown
    ):
        """
        OCS-1432/OCS-1433:
        - Start DeploymentConfig based app pods
        - Make all the worker nodes unresponsive by inducing an abrupt network failure
        - Reboot the unresponsive nodes after a short duration of ~300 seconds
        - When the unresponsive nodes recover, app pods and ceph cluster should recover
        - Again run IOs from app pods
        """
        pod_objs = setup
        worker_nodes = node.get_worker_nodes()

        # Run IO on pods
        logger.info(f"Starting IO on {len(pod_objs)} app pods")
        with ThreadPoolExecutor() as executor:
            for pod_obj in pod_objs:
                logger.info(f"Starting IO on pod {pod_obj.name}")
                storage_type = (
                    "block" if pod_obj.pvc.get_pvc_vol_mode == "Block" else "fs"
                )
                executor.submit(
                    pod_obj.run_io,
                    storage_type=storage_type,
                    size="2G",
                    runtime=30,
                    fio_filename=f"{pod_obj.name}_io_f1",
                )

        logger.info(f"IO started on all {len(pod_objs)} app pods")

        # Wait for IO results
        for pod_obj in pod_objs:
            pod.get_fio_rw_iops(pod_obj)

        # Induce network failure on all worker nodes
        with ThreadPoolExecutor() as executor:
            for node_name in worker_nodes:
                executor.submit(node.node_network_failure, node_name, False)

        node.wait_for_nodes_status(
            node_names=worker_nodes, status=constants.NODE_NOT_READY
        )

        logger.info(f"Waiting for {self.short_nw_fail_time} seconds")
        sleep(self.short_nw_fail_time)

        # Reboot the worker nodes
        logger.info(f"Stop and start the worker nodes: {worker_nodes}")
        nodes.restart_nodes_by_stop_and_start(node.get_node_objs(worker_nodes))

        try:
            node.wait_for_nodes_status(
                node_names=worker_nodes, status=constants.NODE_READY
            )
            logger.info("Wait for OCS pods to be in running state")
            if not pod.wait_for_pods_to_be_running(timeout=720):
                raise ResourceWrongStatusException("Pods are not in running state")
        except ResourceWrongStatusException:
            # Restart nodes
            nodes.restart_nodes(node.get_node_objs(worker_nodes))

        ceph_health_check(tries=80)

        # Get current info of app pods
        new_pod_objs = list()
        for pod_obj in pod_objs:
            pod_label = pod_obj.labels.get("deploymentconfig")
            pods_data = pod.get_pods_having_label(
                f"deploymentconfig={pod_label}", pod_obj.namespace
            )
            current_pods = [
                pod_data.get("metadata").get("name")
                for pod_data in pods_data
                if "-deploy" not in pod_data.get("metadata").get("name")
            ]
            logger.info(f"Pods with label {pod_label}: {current_pods}")

            # Remove the older pod from the list if pod is rescheduled
            if len(current_pods) > 1:
                current_pods.remove(pod_obj.name)

            new_pod_obj = pod.get_pod_obj(current_pods.pop(), pod_obj.namespace)
            new_pod_obj.pvc = pod_obj.pvc
            new_pod_objs.append(new_pod_obj)

        logger.info("Wait for app pods are in running state")
        for pod_obj in new_pod_objs:
            pod_obj.ocp.wait_for_resource(
                condition=constants.STATUS_RUNNING,
                resource_name=pod_obj.name,
                timeout=720,
                sleep=20,
            )
        logger.info("All the app pods reached running state")

        # Run more IOs on app pods
        with ThreadPoolExecutor() as executor:
            for pod_obj in new_pod_objs:
                logger.info(f"Starting IO on pod {pod_obj.name}")
                pod_obj.wl_setup_done = False
                storage_type = (
                    "block" if pod_obj.pvc.get_pvc_vol_mode == "Block" else "fs"
                )
                executor.submit(
                    pod_obj.run_io,
                    storage_type=storage_type,
                    size="1G",
                    runtime=30,
                    fio_filename=f"{pod_obj.name}_io_f2",
                )

        for pod_obj in new_pod_objs:
            pod.get_fio_rw_iops(pod_obj)
Example #11
    def test_noobaa_sts_host_node_failure(
        self,
        noobaa_sts,
        respin_noobaa_operator,
        mcg_obj,
        bucket_factory,
        nodes,
        node_restart_teardown,
    ):
        """
        Test case to fail node where NooBaa Statefulset pod (noobaa-core, noobaa-db)
        is hosted and verify the pod is rescheduled on a healthy node

        """
        executor = ThreadPoolExecutor(max_workers=1)
        pod_obj = OCP(kind=constants.POD,
                      namespace=constants.OPENSHIFT_STORAGE_NAMESPACE)

        # Get noobaa statefulset pod and node where it is hosted
        noobaa_sts_pod = get_noobaa_pods(
            noobaa_label=self.labels_map[noobaa_sts])[0]
        noobaa_sts_pod_node = get_pod_node(noobaa_sts_pod)
        log.info(
            f"{noobaa_sts_pod.name} is running on {noobaa_sts_pod_node.name}")

        # Get the NooBaa operator pod and node where it is hosted
        # Check if NooBaa operator and statefulset pod are hosted on same node
        noobaa_operator_pod = get_noobaa_pods(noobaa_label=self.labels_map[
            constants.NOOBAA_OPERATOR_DEPLOYMENT])[0]
        noobaa_operator_pod_node = get_pod_node(noobaa_operator_pod)
        log.info(
            f"{noobaa_operator_pod.name} is running on {noobaa_operator_pod_node.name}"
        )
        if noobaa_sts_pod_node.name == noobaa_operator_pod_node.name:
            operator_on_same_node = True
            log.info(
                f"{noobaa_sts_pod.name} and {noobaa_operator_pod.name} are running on same node."
            )
        else:
            operator_on_same_node = False
            log.info(
                f"{noobaa_sts_pod.name} and {noobaa_operator_pod.name} are running on different node."
            )

        # Stop the node
        log.info(
            f"Stopping {noobaa_sts_pod_node.name} where {noobaa_sts_pod.name} is hosted"
        )
        stop_thread = executor.submit(nodes.stop_nodes,
                                      nodes=[noobaa_sts_pod_node])
        node.wait_for_nodes_status(node_names=[noobaa_sts_pod_node.name],
                                   status=constants.NODE_NOT_READY)

        # Disrupt NooBaa operator
        if respin_noobaa_operator:
            noobaa_operator_pod.delete(force=True)

        # Check result of 'stop_thread'
        stop_thread.result()

        # Wait for NooBaa operator pod to reach terminating state if on same node
        # and not respun
        if operator_on_same_node and not respin_noobaa_operator:
            wait_for_resource_state(
                resource=noobaa_operator_pod,
                state=constants.STATUS_TERMINATING,
                timeout=360,
            )

        # Wait for NooBaa operator pod to reach running state
        pod_obj.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            selector=self.labels_map[constants.NOOBAA_OPERATOR_DEPLOYMENT],
            resource_count=1,
        )

        # Verify NooBaa statefulset pod reschedules on another node
        try:
            for pod_list in TimeoutSampler(
                    60,
                    3,
                    get_noobaa_pods,
                    noobaa_label=self.labels_map[noobaa_sts],
            ):
                if len(pod_list) == 1:
                    pod_node = get_pod_node(pod_list[0])
                    if pod_node.name != noobaa_sts_pod_node.name:
                        log.info(
                            f"{pod_list[0].name} has been rescheduled on {pod_node.name}"
                        )
                        break
                    log.info(
                        f"Waiting for {noobaa_sts_pod.name} pod to be rescheduled"
                    )
        except TimeoutExpiredError:
            raise TimeoutExpiredError(
                f"{noobaa_sts_pod.name} pod not rescheduled within 60 seconds")

        # Wait for rescheduled pod to reach Running state.
        # For noobaa-db pod which is attached to a PV it may take more time (~8 minutes)
        # until the new pod can attach to the PV
        pod_obj.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            selector=self.labels_map[noobaa_sts],
            resource_count=1,
            timeout=800
            if noobaa_sts == constants.NOOBAA_DB_STATEFULSET else 60,
            sleep=30 if noobaa_sts == constants.NOOBAA_DB_STATEFULSET else 3,
        )

        # Start the node
        log.info(
            f"Starting {noobaa_sts_pod_node.name} where {noobaa_sts_pod.name} was hosted"
        )
        nodes.start_nodes(nodes=[noobaa_sts_pod_node])
        node.wait_for_nodes_status(node_names=[noobaa_sts_pod_node.name],
                                   status=constants.NODE_READY)

        log.info("Wait for all pods to be in running state")
        wait_for_pods_to_be_running(timeout=300)

        # Check cluster health
        self.sanity_helpers.health_check()

        # Creates bucket then writes, reads and deletes objects
        self.sanity_helpers.obc_put_obj_create_delete(mcg_obj, bucket_factory)
Example #12
        def finalizer():
            for mon_scale in self.mons_scale:
                self.oc.exec_oc_cmd(
                    f"scale --replicas=1 deployment/{mon_scale}")
            wait_for_pods_to_be_running(timeout=600)
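The finalizer above restores each mon deployment to one replica, which implies the test body scaled them down first. A hedged sketch of that scale-down step, reusing the same self.mons_scale and self.oc attributes; it is an assumption, not taken from the example itself:

        # Assumed scale-down step that the finalizer above undoes
        for mon_scale in self.mons_scale:
            self.oc.exec_oc_cmd(f"scale --replicas=0 deployment/{mon_scale}")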
Example #13
    def test_pvpool_cpu_and_memory_modifications(
        self,
        awscli_pod_session,
        backingstore_factory,
        bucket_factory,
        test_directory_setup,
        mcg_obj_session,
    ):
        """
        Test to modify the CPU and memory resource limits for the backingstore and verify that the change is reflected
        """
        bucketclass_dict = {
            "interface": "OC",
            "backingstore_dict": {
                "pv": [(
                    1,
                    MIN_PV_BACKINGSTORE_SIZE_IN_GB,
                    "ocs-storagecluster-ceph-rbd",
                )]
            },
        }
        bucket = bucket_factory(1, "OC", bucketclass=bucketclass_dict)[0]
        bucket_name = bucket.name
        pv_backingstore = bucket.bucketclass.backingstores[0]
        pv_bs_name = pv_backingstore.name
        pv_pod_label = f"pool={pv_bs_name}"
        pv_pod_info = get_pods_having_label(
            label=pv_pod_label,
            namespace=config.ENV_DATA["cluster_namespace"])[0]
        pv_pod_obj = Pod(**pv_pod_info)
        pv_pod_name = pv_pod_obj.name
        logger.info(
            f"Pod created for PV Backingstore {pv_bs_name}: {pv_pod_name}")
        new_cpu = "500m"
        new_mem = "500Mi"
        new_resource_patch = {
            "spec": {
                "pvPool": {
                    "resources": {
                        "limits": {
                            "cpu": f"{new_cpu}",
                            "memory": f"{new_mem}",
                        },
                        "requests": {
                            "cpu": f"{new_cpu}",
                            "memory": f"{new_mem}",
                        },
                    }
                }
            }
        }
        try:
            OCP(
                namespace=config.ENV_DATA["cluster_namespace"],
                kind="backingstore",
                resource_name=pv_bs_name,
            ).patch(params=json.dumps(new_resource_patch), format_type="merge")
        except CommandFailed as e:
            logger.error(f"[ERROR] Failed to patch: {e}")
        else:
            logger.info("Patched new resource limits")
        wait_for_pods_to_be_running(
            namespace=config.ENV_DATA["cluster_namespace"],
            pod_names=[pv_pod_name])
        pv_pod_ocp_obj = OCP(namespace=config.ENV_DATA["cluster_namespace"],
                             kind="pod").get(resource_name=pv_pod_name)
        resource_dict = pv_pod_ocp_obj["spec"]["containers"][0]["resources"]
        assert (
            resource_dict["limits"]["cpu"] == new_cpu
            and resource_dict["limits"]["memory"] == new_mem
            and resource_dict["requests"]["cpu"] == new_cpu
            and resource_dict["requests"]["memory"] == new_mem
        ), "New resource modification in Backingstore is not reflected in PV Backingstore Pod!!"
        logger.info(
            "Resource modification reflected in the PV Backingstore Pod!!")

        # push some data to the bucket
        file_dir = test_directory_setup.origin_dir
        copy_random_individual_objects(
            podobj=awscli_pod_session,
            file_dir=file_dir,
            target=f"s3://{bucket_name}",
            amount=1,
            s3_obj=OBC(bucket_name),
        )
Example #14
    def test_replication_with_disruptions(
        self,
        awscli_pod_session,
        mcg_obj_session,
        cld_mgr,
        bucket_factory,
        source_bucketclass,
        target_bucketclass,
        test_directory_setup,
        nodes,
    ):

        # check uni bucket replication from multi (aws+azure) namespace bucket to s3-compatible namespace bucket
        target_bucket_name = bucket_factory(
            bucketclass=target_bucketclass)[0].name
        replication_policy = ("basic-replication-rule", target_bucket_name,
                              None)
        source_bucket_name = bucket_factory(
            bucketclass=source_bucketclass,
            replication_policy=replication_policy)[0].name
        written_random_objects = write_random_test_objects_to_bucket(
            awscli_pod_session,
            source_bucket_name,
            test_directory_setup.origin_dir,
            mcg_obj=mcg_obj_session,
            amount=5,
            pattern="first-write-",
        )
        logger.info(f"Written objects: {written_random_objects}")

        compare_bucket_object_list(mcg_obj_session, source_bucket_name,
                                   target_bucket_name)
        logger.info("Uni-directional bucket replication working as expected")

        # change from uni-directional to bi-directional replication policy
        logger.info(
            "Changing the replication policy from uni to bi-directional!")
        bi_replication_policy_dict = {
            "spec": {
                "additionalConfig": {
                    "replicationPolicy":
                    json.dumps([{
                        "rule_id": "basic-replication-rule-2",
                        "destination_bucket": source_bucket_name,
                    }])
                }
            }
        }
        OCP(
            namespace=config.ENV_DATA["cluster_namespace"],
            kind="obc",
            resource_name=target_bucket_name,
        ).patch(params=json.dumps(bi_replication_policy_dict),
                format_type="merge")
        logger.info(
            "Patch ran successfully! Changed the replication policy from uni to bi directional"
        )

        # write objects to the second bucket and see if it's replicated on the other
        logger.info("checking if bi-directional replication works!!")
        written_random_objects = write_random_test_objects_to_bucket(
            awscli_pod_session,
            target_bucket_name,
            test_directory_setup.origin_dir,
            mcg_obj=mcg_obj_session,
            amount=3,
            pattern="second-write-",
        )
        logger.info(f"Written objects: {written_random_objects}")
        compare_bucket_object_list(mcg_obj_session, source_bucket_name,
                                   target_bucket_name)
        logger.info("Bi directional bucket replication working as expected")

        # delete all the objects in the s3-compatible namespace bucket and then
        # recover them from the other namespace bucket on a new write
        logger.info(
            "checking replication when one of the bucket's objects are deleted!!"
        )
        try:
            mcg_obj_session.s3_resource.Bucket(
                target_bucket_name).objects.all().delete()
        except CommandFailed as e:
            logger.error(f"[Error] while deleting objects: {e}")
        assert (
            len(mcg_obj_session.s3_list_all_objects_in_bucket(target_bucket_name))
            == 0
        ), f"[Error] Unexpectedly, objects were not deleted from {target_bucket_name}"
        logger.info("All the objects in RGW namespace buckets are deleted!!!")

        written_random_objects = write_random_test_objects_to_bucket(
            awscli_pod_session,
            target_bucket_name,
            test_directory_setup.origin_dir,
            mcg_obj=mcg_obj_session,
            amount=1,
            pattern="third-write-",
        )
        logger.info(f"Written objects: {written_random_objects}")

        compare_bucket_object_list(mcg_obj_session, source_bucket_name,
                                   target_bucket_name)
        logger.info(
            "All the objects retrieved back to s3-compatible bucket on new write!!"
        )

        # restart RGW pods and then see if object sync still works
        logger.info(
            "Checking if the replication works when there is RGW pod restarts!!"
        )
        written_random_objects = write_random_test_objects_to_bucket(
            awscli_pod_session,
            target_bucket_name,
            test_directory_setup.origin_dir,
            mcg_obj=mcg_obj_session,
            amount=1,
            pattern="fourth-write-",
        )
        logger.info(f"Written objects: {written_random_objects}")

        pod_names = get_pod_name_by_pattern(
            "rgw", namespace=config.ENV_DATA["cluster_namespace"])
        pod_objs = get_rgw_pods(namespace=config.ENV_DATA["cluster_namespace"])
        delete_pods(pod_objs=pod_objs)
        wait_for_pods_to_be_running(
            pod_names=pod_names,
            namespace=config.ENV_DATA["cluster_namespace"])

        compare_bucket_object_list(mcg_obj_session, source_bucket_name,
                                   target_bucket_name)
        logger.info("Object sync works after the RGW pod restarted!!")

        # write some object to any of the bucket, followed by immediate cluster restart
        logger.info("Checking replication when there is a cluster reboot!!")
        written_random_objects = write_random_test_objects_to_bucket(
            awscli_pod_session,
            target_bucket_name,
            test_directory_setup.origin_dir,
            mcg_obj=mcg_obj_session,
            amount=1,
            pattern="fifth-write-",
        )
        logger.info(f"Written objects: {written_random_objects}")

        node_list = get_worker_nodes()
        node_objs = get_node_objs(node_list)
        nodes.restart_nodes(node_objs, timeout=500)
        # retry() returns a decorator: wrap the function first, then call the
        # wrapped function with its own arguments
        retry(
            (CommandFailed, TimeoutError, AssertionError,
             ResourceWrongStatusException),
            tries=60,
            delay=15,
        )(ocp.wait_for_cluster_connectivity)(tries=400)
        wait_for_pods_to_be_running(
            namespace=config.ENV_DATA["cluster_namespace"], timeout=800)
        logger.info("Nodes rebooted successfully!!")

        compare_bucket_object_list(mcg_obj_session, source_bucket_name,
                                   target_bucket_name)
        logger.info("Objects sync works even when the cluster is rebooted")
Example #15
    def test_non_ocs_taint_and_tolerations(self):
        """
        Test runs the following steps
        1. Taint ocs nodes with non-ocs taint
        2. Set tolerations on storagecluster, subscription, configmap and ocsinit
        3. Respin all ocs pods and check if it runs on ocs nodes with tolerations
        4. Add Capacity

        """

        # Taint all nodes with non-ocs taint
        ocs_nodes = get_worker_nodes()
        taint_nodes(nodes=ocs_nodes, taint_label="xyz=true:NoSchedule")

        # Add tolerations to the storagecluster
        storagecluster_obj = ocp.OCP(
            resource_name=constants.DEFAULT_CLUSTERNAME,
            namespace=defaults.ROOK_CLUSTER_NAMESPACE,
            kind=constants.STORAGECLUSTER,
        )
        tolerations = (
            '{"tolerations": [{"effect": "NoSchedule", "key": "xyz",'
            '"operator": "Equal", "value": "true"}, '
            '{"effect": "NoSchedule", "key": "node.ocs.openshift.io/storage", '
            '"operator": "Equal", "value": "true"}]}')
        param = (
            f'{{"spec": {{"placement": {{"all": {tolerations}, "mds": {tolerations}, '
            f'"noobaa-core": {tolerations}, "rgw": {tolerations}}}}}}}')
        storagecluster_obj.patch(params=param, format_type="merge")

        # Add tolerations to the subscription
        sub_list = ocp.get_all_resource_names_of_a_kind(
            kind=constants.SUBSCRIPTION)
        param = (
            '{"spec": {"config":  {"tolerations": '
            '[{"effect": "NoSchedule", "key": "xyz", "operator": "Equal", '
            '"value": "true"}]}}}')
        for sub in sub_list:
            sub_obj = ocp.OCP(
                resource_name=sub,
                namespace=defaults.ROOK_CLUSTER_NAMESPACE,
                kind=constants.SUBSCRIPTION,
            )
            sub_obj.patch(params=param, format_type="merge")

        # Add tolerations to the ocsinitializations.ocs.openshift.io
        param = (
            '{"spec":  {"tolerations": '
            '[{"effect": "NoSchedule", "key": "xyz", "operator": "Equal", '
            '"value": "true"}]}}')

        ocsini_obj = ocp.OCP(
            resource_name=constants.OCSINIT,
            namespace=defaults.ROOK_CLUSTER_NAMESPACE,
            kind=constants.OCSINITIALIZATION,
        )
        ocsini_obj.patch(params=param, format_type="merge")

        # Add tolerations to the configmap rook-ceph-operator-config
        configmap_obj = ocp.OCP(
            kind=constants.CONFIGMAP,
            namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
            resource_name=constants.ROOK_OPERATOR_CONFIGMAP,
        )
        toleration = configmap_obj.get().get("data").get(
            "CSI_PLUGIN_TOLERATIONS")
        toleration += (
            '\n- key: xyz\n  operator: Equal\n  value: "true"\n  effect: NoSchedule'
        )
        toleration = toleration.replace('"', '\\"').replace("\n", "\\n")
        param_cmd = (
            f'[{{"op": "replace", "path": "/data/CSI_PLUGIN_TOLERATIONS", "value": "{toleration}" }}, '
            f'{{"op": "replace", "path": "/data/CSI_PROVISIONER_TOLERATIONS", "value": "{toleration}" }}]'
        )
        configmap_obj.patch(params=param_cmd, format_type="json")

        # After the edit, a few pod respins are expected
        assert wait_for_pods_to_be_running(timeout=600, sleep=15)

        # Respin all pods and check if they are still running
        pod_list = get_all_pods(namespace=defaults.ROOK_CLUSTER_NAMESPACE)
        for pod in pod_list:
            pod.delete(wait=False)

        assert wait_for_pods_to_be_running(timeout=600, sleep=15)
        self.sanity_helpers.health_check()

        # Add capacity to check if the new osds have tolerations
        osd_size = storage_cluster.get_osd_size()
        count = storage_cluster.add_capacity(osd_size)
        pod = ocp.OCP(kind=constants.POD,
                      namespace=config.ENV_DATA["cluster_namespace"])
        if is_flexible_scaling_enabled():
            replica_count = 1
        else:
            replica_count = 3
        assert pod.wait_for_resource(
            timeout=300,
            condition=constants.STATUS_RUNNING,
            selector=constants.OSD_APP_LABEL,
            resource_count=count * replica_count,
        ), "New OSDs failed to reach running state"
        check_ceph_health_after_add_capacity(ceph_rebalance_timeout=2500)
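The test above only asserts that the respun and newly added pods reach the Running state; Example #8 uses check_toleration_on_pods() for the toleration check itself. A hypothetical spot check, not part of the example, that the custom toleration actually landed on a respun pod; the "xyz" key matches the taint applied earlier:

        # Hypothetical spot check: a respun pod should carry the custom toleration
        sample_pod = get_all_pods(namespace=defaults.ROOK_CLUSTER_NAMESPACE)[0]
        pod_tolerations = sample_pod.get()["spec"]["tolerations"]
        assert any(t.get("key") == "xyz" for t in pod_tolerations), (
            "Expected the custom 'xyz' toleration on the respun pod"
        )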
Example #16
    def test_rook_operator_restart_during_mon_failover(self,
                                                       node_drain_teardown):
        """
        Verify that the number of mon pods remains three when a node is drained

        """
        sample = TimeoutSampler(
            timeout=100,
            sleep=10,
            func=verify_pdb_mon,
            disruptions_allowed=1,
            max_unavailable_mon=1,
        )
        assert sample.wait_for_func_status(
            result=True
        ), "The expected pdb state is not equal to the actual pdb state"

        log.info("Get worker node name where monitoring pod run")
        mon_pod_objs = get_mon_pods()
        node_name = mon_pod_objs[0].data["spec"]["nodeName"]

        drain_nodes([node_name])

        sample = TimeoutSampler(
            timeout=100,
            sleep=10,
            func=verify_pdb_mon,
            disruptions_allowed=0,
            max_unavailable_mon=1,
        )
        assert sample.wait_for_func_status(
            result=True
        ), "The expected pdb state is not equal to the actual pdb state"

        timeout = 1400
        log.info(f"Verify the number of mon pods is 3 for {timeout} seconds")
        sample = TimeoutSampler(timeout=timeout,
                                sleep=10,
                                func=check_number_of_mon_pods)
        assert not sample.wait_for_func_status(
            result=False
        ), "There are more than 3 mon pods"

        log.info("Respin pod rook-ceph operator pod")
        rook_ceph_operator_pod_obj = get_operator_pods()
        rook_ceph_operator_pod_obj[0].delete()

        schedule_nodes([node_name])

        log.info("Wait for all the pods in openshift-storage to be running.")
        assert wait_for_pods_to_be_running(timeout=300)

        sample = TimeoutSampler(
            timeout=100,
            sleep=10,
            func=verify_pdb_mon,
            disruptions_allowed=1,
            max_unavailable_mon=1,
        )
        assert sample.wait_for_func_status(
            result=True
        ), "The expected pdb state is not equal to the actual pdb state"

        ceph_health_check()

        assert check_number_of_mon_pods(
        ), "The number of mon pods is not equal to 3"
Example #17
    def test_check_pods_status_after_node_failure(self, nodes, node_restart_teardown):
        """
        Check pod statuses after a node failure event.
        All the rook ceph pods should be in "Running" or "Completed"
        state after a node failure event.

        """
        ocs_nodes = get_ocs_nodes()
        if not ocs_nodes:
            pytest.skip("We don't have ocs nodes in the cluster")

        wnodes = get_worker_nodes()

        ocs_node = random.choice(ocs_nodes)
        node_name = ocs_node.name
        log.info(f"Selected node is '{node_name}'")
        # Save the rook ceph pods, the osd ids, and the mon ids before shutting down the node
        rook_ceph_pod_names_not_in_node = get_rook_ceph_pod_names_not_in_node(node_name)
        node_osd_ids = get_node_osd_ids(node_name)
        node_mon_ids = get_node_mon_ids(node_name)

        log.info(f"Shutting down node '{node_name}'")
        nodes.stop_nodes([ocs_node])
        wait_for_nodes_status(node_names=[node_name], status=constants.NODE_NOT_READY)
        log.info(f"The node '{node_name}' reached '{constants.NODE_NOT_READY}' status")

        log.info("Wait for a change in the rook ceph pod statuses...")
        timeout = 480
        is_rook_ceph_pods_status_changed = wait_for_change_in_rook_ceph_pods(
            node_name, timeout=timeout
        )
        assert (
            is_rook_ceph_pods_status_changed
        ), f"Rook Ceph pods status didn't change after {timeout} seconds"

        log.info("Check the rook ceph pods are in 'Running' or 'Completed' state")
        timeout = 600
        are_pods_running = wait_for_pods_to_be_running(
            pod_names=rook_ceph_pod_names_not_in_node, timeout=timeout, sleep=30
        )
        assert (
            are_pods_running
        ), f"The pods are not 'Running' even after {timeout} seconds"

        # Get the rook ceph pod names, excluding the osd and mon pods that have the old node ids
        osd_pods = get_osd_pods()
        new_node_osd_id_names_set = {
            p.name for p in osd_pods if get_osd_pod_id(p) in node_osd_ids
        }
        mon_pods = get_mon_pods()
        new_node_mon_id_names_set = {
            p.name for p in mon_pods if get_mon_pod_id(p) in node_mon_ids
        }

        new_node_osd_mon_id_names_set = new_node_osd_id_names_set.union(
            new_node_mon_id_names_set
        )
        rook_ceph_pod_names_set = set(get_rook_ceph_pod_names())
        new_rook_ceph_pod_names = list(
            rook_ceph_pod_names_set - new_node_osd_mon_id_names_set
        )

        log.info(
            "Verify that the new rook ceph pods are in 'Running' or 'Completed' state"
        )
        timeout = 300
        are_new_pods_running = wait_for_pods_to_be_running(
            pod_names=new_rook_ceph_pod_names, timeout=timeout, sleep=20
        )
        assert (
            are_new_pods_running
        ), f"The new pods are not 'Running' after {timeout} seconds"

        log.info("All the pods are in 'Running' or 'Completed' state")

        if is_managed_service_cluster():
            log.info(
                "When we use the managed service, the worker node should recover automatically "
                "by starting the node or removing it, and creating a new one."
                "Waiting for all the worker nodes to be ready..."
            )
            wait_for_node_count_to_reach_status(node_count=len(wnodes), timeout=900)
            log.info("Waiting for all the pods to be running")
            assert check_pods_after_node_replacement(), "Not all the pods are running"
        else:
            log.info(f"Starting the node '{node_name}' again...")
            nodes.start_nodes(nodes=[ocs_node])
            wait_for_nodes_status(node_names=[node_name])
            log.info("Waiting for all the pods to be running")
            wait_for_pods_to_be_running(timeout=600)

        log.info("Checking that the cluster health is OK...")
        self.sanity_helpers.health_check(tries=40)
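Taken together, the call sites above exercise the namespace, pod_names, timeout and sleep keyword arguments of wait_for_pods_to_be_running. A stub summarizing the interface as inferred from these examples only; the defaults shown are assumptions, not the actual ocs-ci definition:

    def wait_for_pods_to_be_running(
        namespace=None,   # e.g. constants.OPENSHIFT_STORAGE_NAMESPACE
        pod_names=None,   # optionally restrict the check to specific pods
        timeout=300,      # seconds to keep polling
        sleep=20,         # seconds between polls
    ):
        """Return True if all selected pods reach 'Running' (or 'Completed') in time."""
        ...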