def finalizer():
    try:
        # Validate all mon pods are running
        log.info("Validate all mons are up and running")
        POD_OBJ.wait_for_resource(
            condition=STATUS_RUNNING,
            selector=MON_APP_LABEL,
            resource_count=len(mon_pod),
        )
        log.info("All mons are up and running")

    except (TimeoutExpiredError, ResourceWrongStatusException) as ex:
        log.error(f"{ex}")
        # Restart the rook-ceph operator
        operator_pod_obj = get_operator_pods()
        delete_pods(pod_objs=operator_pod_obj)

        # Wait until the mon pods recover
        POD_OBJ.wait_for_resource(
            condition=STATUS_RUNNING,
            selector=MON_APP_LABEL,
            resource_count=len(mon_pod),
            timeout=3600,
            sleep=5,
        )
        log.info("All mons are up and running")

        # Check that ceph health is OK
        ceph_health_check(tries=90, delay=15)
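
A finalizer like this is typically registered from inside a pytest fixture. A minimal sketch of that wiring, assuming the usual ocs-ci helpers (the fixture name and import paths here are assumptions for illustration):

import logging

import pytest

from ocs_ci.ocs import constants
from ocs_ci.ocs.ocp import OCP
from ocs_ci.ocs.resources.pod import get_mon_pods

log = logging.getLogger(__name__)


@pytest.fixture()
def validate_mons_at_teardown(request):
    """Hypothetical fixture: verify all mons recover after the test."""
    pod_obj = OCP(kind=constants.POD, namespace=constants.OPENSHIFT_STORAGE_NAMESPACE)
    mon_pod = get_mon_pods()

    def finalizer():
        # Stripped-down version of the finalizer above: just wait for the mons
        pod_obj.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            selector=constants.MON_APP_LABEL,
            resource_count=len(mon_pod),
            timeout=600,
            sleep=5,
        )
        log.info("All mons are up and running")

    request.addfinalizer(finalizer)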
Example #2
    def delete_pods(self):
        """
        Try to delete pods:
            - Rook operator
            - OSD
            - MGR
            - MON
        """
        pod_list = []
        rook_operator_pod = pod.get_ocs_operator_pod(
            ocs_label=constants.OPERATOR_LABEL,
            namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
        )
        pod_list.append(rook_operator_pod)

        osd_pods = pod.get_osd_pods()
        pod_list.extend(osd_pods)

        mgr_pods = pod.get_mgr_pods()
        pod_list.extend(mgr_pods)

        mon_pods = pod.get_mon_pods()
        pod_list.extend(mon_pods)

        logger.info(f"Deleting pods: {[p.name for p in pod_list]}")
        pod.delete_pods(pod_objs=pod_list)
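
After a bulk delete like this, callers usually wait for the replacement pods before continuing. A hedged follow-up sketch (the import paths and timeout are assumptions):

from ocs_ci.framework import config
from ocs_ci.ocs.resources.pod import wait_for_pods_to_be_running

assert wait_for_pods_to_be_running(
    namespace=config.ENV_DATA["cluster_namespace"], timeout=600
), "Not all storage pods came back to Running after the bulk pod deletion"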
Example #3
        def finalizer():
            # Use provider cluster in managed service platform
            if self.consumer_cluster_index is not None:
                config.switch_to_provider()

            # Validate all mon services are running
            if len(mon_svc_list) != len(
                    get_services_by_label(
                        label=constants.MON_APP_LABEL,
                        namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
                    )):

                # Restart the rook-operator pod
                operator_pod_obj = get_operator_pods()
                delete_pods(pod_objs=operator_pod_obj)
                POD_OBJ.wait_for_resource(
                    condition=constants.STATUS_RUNNING,
                    selector=constants.OPERATOR_LABEL,
                )

                # Wait till all mon services are up
                for svc_list in TimeoutSampler(
                        1200,
                        len(mon_svc_list),
                        get_services_by_label,
                        constants.MON_APP_LABEL,
                        constants.OPENSHIFT_STORAGE_NAMESPACE,
                ):
                    try:
                        if len(svc_list) == len(mon_svc_list):
                            log.info("All expected mon services are up")
                            break
                    except IndexError:
                        log.error(
                            f"All expected mon services are not up only found :{svc_list}. "
                            f"Expected: {mon_svc_list}")

                # Wait till all mon pods running
                POD_OBJ.wait_for_resource(
                    condition=constants.STATUS_RUNNING,
                    selector=constants.MON_APP_LABEL,
                    resource_count=len(mon_pods_list),
                    timeout=600,
                    sleep=3,
                )

                # Check the ceph health OK
                ceph_health_check(tries=90, delay=15)

            # Switch the context to consumer cluster if needed
            if self.consumer_cluster_index is not None:
                config.switch_to_consumer(self.consumer_cluster_index)
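
TimeoutSampler, as used above, keeps calling a function and yields each result until a timeout expires. A standalone sketch of the same polling pattern using only the standard library (purely illustrative, not the ocs-ci implementation):

import time


def poll_until(timeout, sleep, func, *args, **kwargs):
    """Yield func(*args, **kwargs) every `sleep` seconds until `timeout` runs out."""
    deadline = time.time() + timeout
    while time.time() < deadline:
        yield func(*args, **kwargs)
        time.sleep(sleep)
    raise TimeoutError(f"{func.__name__} did not produce the expected result in {timeout}s")


# Usage mirroring the loop above:
# for svc_list in poll_until(1200, 5, get_services_by_label,
#                            constants.MON_APP_LABEL,
#                            constants.OPENSHIFT_STORAGE_NAMESPACE):
#     if len(svc_list) == len(mon_svc_list):
#         break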
Example #4
def check_automated_recovery_from_drain_node(nodes):
    """
    1) Drain one worker node.
    2) Delete the OSD pods associated with the node.
    3) The new OSD pods with the same ids that come up, should be in a Pending state.
    4) Schedule the worker node.
    5) The OSD pods associated with the node should go back into a Running state and come up
        on the same node.

    """
    osd_node_name = random.choice(get_osd_running_nodes())
    old_osd_pod_ids = get_node_osd_ids(osd_node_name)
    log.info(f"osd pod ids: {old_osd_pod_ids}")
    node_osd_pods = get_osd_pods_having_ids(old_osd_pod_ids)

    unschedule_nodes([osd_node_name])
    log.info(f"Successfully unschedule the node: {osd_node_name}")

    log.info("Delete the node osd pods")
    delete_pods(node_osd_pods)

    new_osd_pods = wait_for_osd_pods_having_ids(osd_ids=old_osd_pod_ids)
    new_osd_pod_names = [p.name for p in new_osd_pods]

    wnodes = get_worker_nodes()
    if len(wnodes) <= 3:
        expected_pods_status = constants.STATUS_PENDING
    else:
        expected_pods_status = constants.STATUS_RUNNING

    log.info(
        f"Verify the new osd pods {new_osd_pod_names} go into a {expected_pods_status} state"
    )
    res = wait_for_pods_to_be_in_statuses(
        [expected_pods_status],
        new_osd_pod_names,
        raise_pod_not_found_error=True,
    )
    assert res, f"Not all the node osd pods are in a {expected_pods_status} state"

    log.info(f"Wait for the node: {osd_node_name} to be scheduled")
    schedule_nodes([osd_node_name])
    log.info(f"Successfully scheduled the node {osd_node_name}")

    if len(wnodes) <= 3:
        assert wait_for_osd_ids_come_up_on_node(osd_node_name, old_osd_pod_ids)
        log.info(
            f"the osd ids {old_osd_pod_ids} Successfully come up on the node {osd_node_name}"
        )
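
unschedule_nodes() and schedule_nodes() used above are roughly equivalent to cordoning and uncordoning the node. A hedged sketch of that underlying operation (the import path is an assumption; this is not the ocs-ci implementation):

from ocs_ci.utility.utils import run_cmd


def cordon(node_name):
    # Mark the node unschedulable so respawned OSD pods cannot land on it
    run_cmd(f"oc adm cordon {node_name}")


def uncordon(node_name):
    # Make the node schedulable again so Pending OSD pods can be placed on it
    run_cmd(f"oc adm uncordon {node_name}")

Example #5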
    def restart_osd_pod(self):
        """
        Repeatedly delete a randomly picked OSD pod (forcing a restart)
        until self.stop_checking_mon_db is set
        """
        num_of_deletions = 0
        while not self.stop_checking_mon_db:
            osd_pod_list = pod.get_osd_pods()
            selected_osd_pod_obj = random.choice(osd_pod_list)
            log.info(
                f"Deleting osd pod {selected_osd_pod_obj.get().get('metadata').get('name')}. "
                f"Deletion #{num_of_deletions + 1}"
            )
            pod.delete_pods(pod_objs=[selected_osd_pod_obj])
            num_of_deletions = num_of_deletions + 1
        log.info(f"Number of osd deletions: {num_of_deletions}")
Example #6
    def teardown():

        # Delete created app pods and pvcs
        assert pod.delete_pods(pod_objs)
        assert pvc.delete_pvcs(pvc_objs)

        # Switch to default project
        ret = ocp.switch_to_default_rook_cluster_project()
        assert ret, 'Failed to switch to default rook cluster project'

        # Delete created projects
        for prj in namespace_list:
            prj.delete(resource_name=prj.namespace)
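
A hedged follow-up sketch: after prj.delete(), the teardown can confirm the namespaces are actually gone before the next test starts (the kind string and timeout are assumptions):

from ocs_ci.ocs.ocp import OCP

ns_obj = OCP(kind="Namespace")
for prj in namespace_list:
    # Block until the project/namespace object disappears from the cluster
    ns_obj.wait_for_delete(resource_name=prj.namespace, timeout=300)

Example #7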
    def test_disruptive_during_pod_pvc_deletion_and_io(
        self, interface, resource_to_delete, setup_base
    ):
        """
        Delete a ceph/rook pod while PVC deletion, pod deletion and IO are
        in progress
        """
        pvc_objs, pod_objs, rwx_pod_objs = setup_base
        namespace = pvc_objs[0].project.namespace

        num_of_pods_to_delete = 3
        num_of_io_pods = 1

        # Select pods to be deleted
        pods_to_delete = pod_objs[:num_of_pods_to_delete]
        pods_to_delete.extend(
            [
                pod
                for pod in rwx_pod_objs
                for pod_obj in pods_to_delete
                if (pod_obj.pvc == pod.pvc)
            ]
        )

        # Select pods to run IO
        io_pods = pod_objs[
            num_of_pods_to_delete : num_of_pods_to_delete + num_of_io_pods
        ]
        io_pods.extend(
            [
                pod
                for pod in rwx_pod_objs
                for pod_obj in io_pods
                if (pod_obj.pvc == pod.pvc)
            ]
        )

        # Select pods which are having PVCs to delete
        pods_for_pvc = pod_objs[num_of_pods_to_delete + num_of_io_pods :]
        pvcs_to_delete = [pod_obj.pvc for pod_obj in pods_for_pvc]
        pods_for_pvc.extend(
            [
                pod
                for pod in rwx_pod_objs
                for pod_obj in pods_for_pvc
                if (pod_obj.pvc == pod.pvc)
            ]
        )

        log.info(
            f"{len(pods_to_delete)} pods selected for deletion in which "
            f"{len(pods_to_delete) - num_of_pods_to_delete} pairs of pod "
            f"share same RWX PVC"
        )
        log.info(
            f"{len(io_pods)} pods selected for running IO in which "
            f"{len(io_pods) - num_of_io_pods} pairs of pod share same "
            f"RWX PVC"
        )
        no_of_rwx_pvcs_delete = len(pods_for_pvc) - len(pvcs_to_delete)
        log.info(
            f"{len(pvcs_to_delete)} PVCs selected for deletion. "
            f"RWO PVCs: {len(pvcs_to_delete) - no_of_rwx_pvcs_delete}, "
            f"RWX PVCs: {no_of_rwx_pvcs_delete}"
        )

        pod_functions = {
            "mds": partial(get_mds_pods),
            "mon": partial(get_mon_pods),
            "mgr": partial(get_mgr_pods),
            "osd": partial(get_osd_pods),
            "rbdplugin": partial(get_plugin_pods, interface=interface),
            "cephfsplugin": partial(get_plugin_pods, interface=interface),
            "cephfsplugin_provisioner": partial(get_cephfsplugin_provisioner_pods),
            "rbdplugin_provisioner": partial(get_rbdfsplugin_provisioner_pods),
            "operator": partial(get_operator_pods),
        }

        disruption = disruption_helpers.Disruptions()
        disruption.set_resource(resource=resource_to_delete)
        executor = ThreadPoolExecutor(max_workers=len(pod_objs) + len(rwx_pod_objs))

        # Get number of pods of type 'resource_to_delete'
        num_of_resource_to_delete = len(pod_functions[resource_to_delete]())

        # Fetch the number of Pods and PVCs
        initial_num_of_pods = len(get_all_pods(namespace=namespace))
        initial_num_of_pvc = len(get_all_pvcs(namespace=namespace)["items"])

        # Fetch PV names to verify after deletion
        pv_objs = []
        for pvc_obj in pvcs_to_delete:
            pvc_obj.reload()
            pv_objs.append(pvc_obj.backed_pv_obj)

        # Fetch volume details from pods for the purpose of verification
        node_pv_dict = {}
        for pod_obj in pods_to_delete:
            pod_info = pod_obj.get()
            node = pod_info["spec"]["nodeName"]
            pvc = pod_info["spec"]["volumes"][0]["persistentVolumeClaim"]["claimName"]
            for pvc_obj in pvc_objs:
                if pvc_obj.name == pvc:
                    pvc_obj.reload()
                    pv = pvc_obj.backed_pv
                    break
            if node in node_pv_dict:
                node_pv_dict[node].append(pv)
            else:
                node_pv_dict[node] = [pv]

        # Fetch image uuid associated with PVCs to be deleted
        pvc_uuid_map = {}
        for pvc_obj in pvcs_to_delete:
            pvc_uuid_map[pvc_obj.name] = pvc_obj.image_uuid
        log.info("Fetched image uuid associated with each PVC")

        # Do setup on pods for running IO
        log.info("Setting up pods for running IO.")
        for pod_obj in pod_objs + rwx_pod_objs:
            pvc_info = pod_obj.pvc.get()
            if pvc_info["spec"]["volumeMode"] == "Block":
                storage_type = "block"
            else:
                storage_type = "fs"
            executor.submit(pod_obj.workload_setup, storage_type=storage_type)

        # Wait for setup on pods to complete
        for pod_obj in pod_objs + rwx_pod_objs:
            log.info(f"Waiting for IO setup to complete on pod {pod_obj.name}")
            for sample in TimeoutSampler(180, 2, getattr, pod_obj, "wl_setup_done"):
                if sample:
                    log.info(
                        f"Setup for running IO is completed on pod " f"{pod_obj.name}."
                    )
                    break
        log.info("Setup for running IO is completed on all pods.")

        # Start IO on pods having PVCs to delete to load data
        log.info("Starting IO on pods having PVCs to delete.")
        self.run_io_on_pods(pods_for_pvc)
        log.info("IO started on pods having PVCs to delete.")

        log.info("Fetching IO results from the pods having PVCs to delete.")
        for pod_obj in pods_for_pvc:
            get_fio_rw_iops(pod_obj)
        log.info("Verified IO result on pods having PVCs to delete.")

        # Delete pods having PVCs to delete.
        delete_pods(pods_for_pvc)
        for pod_obj in pods_for_pvc:
            pod_obj.ocp.wait_for_delete(pod_obj.name)
        log.info("Verified: Deleted pods which are having PVCs to delete.")

        # Start IO on pods to be deleted
        log.info("Starting IO on pods to be deleted.")
        self.run_io_on_pods(pods_to_delete)
        log.info("IO started on pods to be deleted.")

        # Start deleting PVCs
        pvc_bulk_delete = executor.submit(delete_pvcs, pvcs_to_delete)
        log.info("Started deleting PVCs")

        # Start deleting pods
        pod_bulk_delete = executor.submit(delete_pods, pods_to_delete, wait=False)
        log.info("Started deleting pods")

        # Start IO on IO pods
        self.run_io_on_pods(io_pods)
        log.info("Started IO on IO pods")

        # Verify pvc deletion has started
        pvc_deleting = executor.submit(
            wait_for_resource_count_change,
            func_to_use=get_all_pvcs,
            previous_num=initial_num_of_pvc,
            namespace=namespace,
            change_type="decrease",
            min_difference=1,
            timeout=30,
            interval=0.01,
        )

        # Verify pod deletion has started
        pod_deleting = executor.submit(
            wait_for_resource_count_change,
            func_to_use=get_all_pods,
            previous_num=initial_num_of_pods,
            namespace=namespace,
            change_type="decrease",
            min_difference=1,
            timeout=30,
            interval=0.01,
        )

        assert pvc_deleting.result(), "Wait timeout: PVCs are not being deleted."
        log.info("PVCs deletion has started.")

        assert pod_deleting.result(), "Wait timeout: Pods are not being deleted."
        log.info("Pods deletion has started.")

        # Delete pod of type 'resource_to_delete'
        disruption.delete_resource()

        pod_bulk_delete.result()

        # Verify pods are deleted
        for pod_obj in pods_to_delete:
            pod_obj.ocp.wait_for_delete(pod_obj.name, 300)
        log.info("Verified: Pods are deleted.")

        # Verify that the mount point is removed from nodes after deleting pod
        node_pv_mounted = verify_pv_mounted_on_node(node_pv_dict)
        for node, pvs in node_pv_mounted.items():
            assert not pvs, (
                f"PVs {pvs} is still present on node {node} after "
                f"deleting the pods."
            )
        log.info(
            "Verified: mount points are removed from nodes after deleting " "the pods"
        )

        pvcs_deleted = pvc_bulk_delete.result()
        assert pvcs_deleted, "Deletion of PVCs failed."

        # Verify PVCs are deleted
        for pvc_obj in pvcs_to_delete:
            pvc_obj.ocp.wait_for_delete(pvc_obj.name)
        log.info("Verified: PVCs are deleted.")

        # Verify PVs are deleted
        for pv_obj in pv_objs:
            pv_obj.ocp.wait_for_delete(resource_name=pv_obj.name, timeout=300)
        log.info("Verified: PVs are deleted.")

        # Verify PV using ceph toolbox. Image/Subvolume should be deleted.
        pool_name = default_ceph_block_pool()
        for pvc_name, uuid in pvc_uuid_map.items():
            if interface == constants.CEPHBLOCKPOOL:
                ret = verify_volume_deleted_in_backend(
                    interface=interface, image_uuid=uuid, pool_name=pool_name
                )
            if interface == constants.CEPHFILESYSTEM:
                ret = verify_volume_deleted_in_backend(
                    interface=interface, image_uuid=uuid
                )
            assert ret, (
                f"Volume associated with PVC {pvc_name} still exists " f"in backend"
            )

        log.info("Fetching IO results from the pods.")
        for pod_obj in io_pods:
            fio_result = pod_obj.get_fio_results()
            err_count = fio_result.get("jobs")[0].get("error")
            assert (
                err_count == 0
            ), f"FIO error on pod {pod_obj.name}. FIO result: {fio_result}"
        log.info("Verified IO result on pods.")

        # Verify number of pods of type 'resource_to_delete'
        final_num_resource_to_delete = len(pod_functions[resource_to_delete]())
        assert final_num_resource_to_delete == num_of_resource_to_delete, (
            f"Total number of {resource_to_delete} pods is not matching with "
            f"initial value. Total number of pods before deleting a pod: "
            f"{num_of_resource_to_delete}. Total number of pods present now: "
            f"{final_num_resource_to_delete}"
        )

        # Check ceph status
        ceph_health_check(namespace=config.ENV_DATA["cluster_namespace"])
        log.info("Ceph cluster health is OK")
    def test_multiple_mon_pod_stays_on_same_node(self):
        """
        A test case to verify that multiple mon pods stay on the same node

        1. Edit the rook-ceph-mon-endpoints configmap,
           e.g. assign mon-a to a node that already hosts
           another mon (compute-1 instead of compute-0)
        2. Delete the mon-a deployment
        3. Edit the mon-b deployment to remove the required mon anti-affinity
        4. Restart the operator
        5. Edit the mon-a deployment to remove the required mon anti-affinity
        6. See mon-a start on compute-1 with mon-b
        7. Soon after, see the operator fail over one of these mons onto the
           node that doesn't currently have a mon (compute-0) and start mon-d

        """
        ocs_version = config.ENV_DATA["ocs_version"]
        # Check that we have an LSO cluster and the OCS version is 4.8 or below
        # This is a workaround due to issue https://github.com/red-hat-storage/ocs-ci/issues/4937
        if not (is_lso_cluster()
                and Version.coerce(ocs_version) <= Version.coerce("4.8")):
            pytest.skip(
                "Skip the test because mons are not node assignment from Rook, if cluster is not "
                "LSO based. And also currently, we want to run the test only with OCS 4.8 and "
                "below. This is a workaround due to issue "
                "https://github.com/red-hat-storage/ocs-ci/issues/4937")
        # Initialize
        rook_ceph_mon = "rook-ceph-mon"

        # Get the running mon pods
        mon_pods = get_mon_pods()
        mon_name_to_del = mon_pods[0].get().get("metadata").get("labels").get(
            "mon")
        mon_name_to_edit = mon_pods[1].get().get("metadata").get("labels").get(
            "mon")
        mon_node = get_pod_node(mon_pods[1])

        # Edit the rook-ceph-mon-endpoints
        log.info(f"Edit the configmap {ROOK_CEPH_MON_ENDPOINTS}")
        configmap_obj = OCP(kind=CONFIGMAP,
                            namespace=OPENSHIFT_STORAGE_NAMESPACE)
        rook_ceph_mon_configmap = configmap_obj.get(
            resource_name=ROOK_CEPH_MON_ENDPOINTS)
        json_val = json.loads(rook_ceph_mon_configmap["data"]["mapping"])
        json_val["node"][mon_name_to_del].update(
            json_val["node"][mon_name_to_edit])
        rook_ceph_mon_configmap["data"]["mapping"] = json.dumps(json_val)
        new_data = rook_ceph_mon_configmap["data"]
        params = f'{{"data": {json.dumps(new_data)}}}'
        configmap_obj.patch(
            resource_name=ROOK_CEPH_MON_ENDPOINTS,
            params=params,
            format_type="strategic",
        )
        log.info(f"Configmap {ROOK_CEPH_MON_ENDPOINTS} edited successfully")
        log.info(
            f"Rook-ceph-mon-endpoints updated configmap: {rook_ceph_mon_configmap}"
        )

        # Delete one mon deployment which had been edited
        dep_obj = OCP(kind=DEPLOYMENT, namespace=OPENSHIFT_STORAGE_NAMESPACE)
        mon_deployment_name_to_del = f"{rook_ceph_mon}-{mon_name_to_del}"
        log.info(f"Deleting mon {mon_deployment_name_to_del} deployments")
        dep_obj.delete(resource_name=mon_deployment_name_to_del)

        # Edit other mon deployment to remove mon anti-affinity
        mon_deployment_name_to_edit = f"{rook_ceph_mon}-{mon_name_to_edit}"
        log.info(f"Edit mon {mon_deployment_name_to_edit} deployment "
                 "to remove the required mon anti-affinity")
        params = '[{"op": "remove", "path": "/spec/template/spec/affinity"}]'
        dep_obj.patch(resource_name=mon_deployment_name_to_edit,
                      params=params,
                      format_type="json")
        log.info(
            f"Successfully removed defined mon anti-affinity {mon_deployment_name_to_edit}"
        )

        # Restart operator
        operator_pod_obj = get_operator_pods()
        delete_pods(pod_objs=operator_pod_obj)
        POD_OBJ.wait_for_resource(condition=STATUS_RUNNING,
                                  selector=OPERATOR_LABEL)

        # Validate the mon from the deleted deployment comes up in Pending state
        # Initially the mon is stuck in Pending; remove the defined anti-affinity to unblock it
        POD_OBJ.wait_for_resource(
            condition=STATUS_PENDING,
            resource_count=1,
            selector=MON_APP_LABEL,
            timeout=1200,
        )
        # Edit mon deployment to remove mon anti-affinity
        log.info(f"Edit mon {mon_deployment_name_to_del} deployment "
                 "to remove the required mon anti-affinity")
        params = '[{"op": "remove", "path": "/spec/template/spec/affinity"}]'
        dep_obj.patch(resource_name=mon_deployment_name_to_del,
                      params=params,
                      format_type="json")
        log.info(
            f"Successfully removed defined mon anti-affinity {mon_deployment_name_to_del}"
        )

        # Validate mon pod moved to another node such that 2 mons are running on same node
        log.info("Waiting for 5 seconds for mon recovery")
        time.sleep(5)
        new_mon_pods = get_mon_pods()
        new_node = [
            get_pod_node(mon) for mon in new_mon_pods if mon.get().get(
                "metadata").get("labels").get("mon") == mon_name_to_del
        ]
        assert (
            new_node[0].name == mon_node.name
        ), f"Mon did not move to node {mon_node.name}; expected 2 mons running on the same node"

        # Verify rook deletes one of the mon and move to another node
        timeout = 60
        log.info(f"Waiting for {timeout} seconds for mon recovery")
        time.sleep(timeout)

        POD_OBJ.wait_for_resource(
            condition=STATUS_RUNNING,
            resource_count=len(mon_pods),
            selector=MON_APP_LABEL,
            timeout=3600,
            sleep=5,
        )
        log.info(
            "Mons are up and in Running state; validating that they run on different nodes"
        )
        mon_pods_running_on_same_node()
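
mon_pods_running_on_same_node() is an ocs-ci helper; a minimal hedged sketch of the distinct-node check the final log message refers to, reusing get_mon_pods() and get_pod_node() from the example (a sketch only, not the actual implementation):

    def assert_mons_on_distinct_nodes(self):
        """Fail if any two mon pods are scheduled on the same node."""
        mon_nodes = [get_pod_node(mon).name for mon in get_mon_pods()]
        assert len(mon_nodes) == len(set(mon_nodes)), (
            f"Some mons share a node: {mon_nodes}"
        )

Example #9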
    def test_del_mon_svc(
        self, multi_pvc_factory, validate_all_mon_svc_are_up_at_teardown
    ):
        """
        Test to verify that the same mons come up, rejoin the quorum,
        and keep running after their services are deleted manually

        1. Delete the mon services
        2. Restart the rook operator
        3. Make sure all mon pods are running,
        and the same services and endpoints are recreated
        4. Make sure ceph health is OK and storage pods are running
        5. Create PVCs; creation should succeed.

        """

        self.sanity_helpers = Sanity()

        # Get all mon services
        mon_svc_before = get_services_by_label(
            label=constants.MON_APP_LABEL,
            namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
        )

        # Get all mon pods
        mon_pods = get_mon_pods()

        # Delete the mon services one by one
        svc_obj = OCP(
            kind=constants.SERVICE, namespace=constants.OPENSHIFT_STORAGE_NAMESPACE
        )
        mon_svc_ip_before = []
        for svc in mon_svc_before:
            svc_name = svc["metadata"]["name"]
            mon_svc_ip_before.append(svc["spec"]["clusterIP"])
            log.info(f"Delete mon service {svc_name}")
            svc_obj.delete(resource_name=svc_name)
            # Verify mon services deleted
            svc_obj.wait_for_delete(resource_name=svc_name)

        # Restart the rook-operator pod
        operator_pod_obj = get_operator_pods()
        delete_pods(pod_objs=operator_pod_obj)
        POD_OBJ.wait_for_resource(
            condition=constants.STATUS_RUNNING, selector=constants.OPERATOR_LABEL
        )

        # Verify same mon services are created again
        for svc in mon_svc_before:
            svc_name = svc["metadata"]["name"]
            svc_obj.check_resource_existence(
                should_exist=True, timeout=300, resource_name=svc_name
            )
        log.info("Same old mon services are recreated")

        # Validate all mons are running
        log.info("Validate all mons are up and running")
        POD_OBJ.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            selector=constants.MON_APP_LABEL,
            resource_count=len(mon_pods),
            timeout=600,
            sleep=3,
        )

        # Validate same mon services are running
        log.info("Validate same mon services are running")
        mon_svc_after = get_services_by_label(
            label=constants.MON_APP_LABEL,
            namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
        )
        mon_svc_ip_after = [svc["spec"]["clusterIP"] for svc in mon_svc_after]
        assert len(set(mon_svc_ip_after) ^ set(mon_svc_ip_before)) == 0, (
            "Different mon services are running. "
            f"Before mon services list: {mon_svc_ip_before}, "
            f"After mon services list: {mon_svc_ip_after}"
        )
        log.info("Same old mon services are running and all mons are in running state")

        # Verify everything running fine
        log.info("Verifying All resources are Running and matches expected result")
        self.sanity_helpers.health_check(tries=120)

        # Validate all storage pods are running
        wait_for_storage_pods()

        # Create and delete resources
        self.sanity_helpers.create_pvc_delete(multi_pvc_factory=multi_pvc_factory)
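
The identity check on the recreated services relies on a set symmetric difference; a tiny standalone illustration of that idiom (the IPs are made up):

before = {"172.30.10.1", "172.30.10.2", "172.30.10.3"}
after = {"172.30.10.1", "172.30.10.2", "172.30.10.3"}
# An empty symmetric difference means exactly the same clusterIPs exist on both sides
assert len(before ^ after) == 0, "mon services changed clusterIP after recovery"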
Example #10
    def test_sc_reclaim_policy_retain_rep2_comp(
        self,
        storageclass_factory,
        pvc_factory,
        pod_factory,
    ):
        """
        This test function does the following:
        *. Create a storageclass with reclaim policy Retain
            and a pool with rep2 and compression
        *. Create a PVC and pod
        *. Run IO on the pod
        *. Verify compression and replication
        *. Delete the pod, PVC, PV and RBD image
        """

        log.info(f"Creating storageclass with replica {self.replica}"
                 f", compression {self.compression} and"
                 f"reclaim policy {self.reclaim_policy}")
        sc_obj = storageclass_factory(
            interface=CEPHBLOCKPOOL,
            new_rbd_pool=True,
            replica=self.replica,
            compression=self.compression,
            reclaim_policy=self.reclaim_policy,
        )
        pool = sc_obj.get()["parameters"]["pool"]

        log.info("Creating PVCs and PODs")
        pvc_obj = pvc_factory(interface=CEPHBLOCKPOOL,
                              storageclass=sc_obj,
                              size=10)
        pod_obj = pod_factory(interface=CEPHBLOCKPOOL, pvc=pvc_obj)

        log.info("Running IO on pod")
        pod_obj.run_io(
            "fs",
            size="1G",
            rate="1500m",
            runtime=60,
            buffer_compress_percentage=60,
            buffer_pattern="0xdeadface",
            bs="8K",
            jobs=5,
            readwrite="readwrite",
        )

        log.info(f"validating info on pool {pool}")
        validate_rep_result = validate_replica_data(pool, self.replica)
        if validate_rep_result is False:
            raise PoolNotReplicatedAsNeeded(
                f"pool {pool} not replicated as expected")
        validate_comp_result = validate_compression(pool)
        if validate_comp_result is False:
            raise PoolNotCompressedAsExpected(
                f"pool {pool} not compressed as expected")

        log.info("Deleting pod")
        pod_obj_list = [pod_obj]
        delete_pods(pod_obj_list, wait=True)

        log.info("Deleting pvc, pv and rbd image")
        pvc_obj.reload()
        pvc_uuid_map = pvc_obj.image_uuid
        pv_obj = pvc_obj.backed_pv_obj
        pvc_obj.delete()
        pv_obj.delete()
        delete_results = delete_volume_in_backend(img_uuid=pvc_uuid_map,
                                                  pool_name=pool)
        if not delete_results:
            raise ImageIsNotDeletedOrNotFound(
                f"Could not delete or find image csi-vol-{pvc_uuid_map}")
Example #11
    def test_multiple_sc_comp_rep_data_deletion(self, storageclass_factory,
                                                pvc_factory, pod_factory):
        """
        This test function does the following:
        *. Creates 2 storageclasses, each creating a new RBD pool
        *. Creates PVCs using the new storageclasses
        *. Mounts each PVC to an app pod
        *. Runs IO on the app pods
        *. Deletes the pods and PVCs
        *. Verifies that the data is deleted

        """
        log.info("Creating storageclasses with compression and replica3")
        interface_type = constants.CEPHBLOCKPOOL
        sc_obj1 = storageclass_factory(
            interface=interface_type,
            new_rbd_pool=True,
            replica=3,
            compression="aggressive",
        )
        log.info("Creating storageclasses with compression and replica2")
        sc_obj2 = storageclass_factory(
            interface=interface_type,
            new_rbd_pool=True,
            replica=2,
            compression="aggressive",
        )

        sc_obj_list = [sc_obj1, sc_obj2]
        pod_obj_list = []
        pvc_obj_list = []

        log.info("Creating PVCs and PODs")
        for sc_obj in sc_obj_list:
            pvc_obj = pvc_factory(interface=interface_type,
                                  storageclass=sc_obj)
            pvc_obj_list.append(pvc_obj)
            pod_obj_list.append(
                pod_factory(interface=interface_type, pvc=pvc_obj))

        log.info("Running IO on pods")
        for pod_obj in pod_obj_list:
            pod_obj.run_io("fs", size="1G")

        for pod_obj in pod_obj_list:
            get_fio_rw_iops(pod_obj)

        log.info("deleting PODs and PVCs")
        delete_pods(pod_obj_list, wait=True)
        delete_pvcs(pvc_obj_list, concurrent=True)

        log.info("Wait for 15 seconds for all data to delete")
        sleep(15)
        log.info("Checking stats after deleting PODs and PVCs")
        for sc_obj in sc_obj_list:
            pvc_list = get_all_pvcs_in_storageclass(sc_obj.name)
            if len(pvc_list) == 0:
                cbp_name = sc_obj.get()["parameters"]["pool"]
                ceph_pool_byte_used = get_byte_used_by_pool(cbp_name)
                log.info(
                    f"pool {cbp_name} has {ceph_pool_byte_used} bytes used")
                if ceph_pool_byte_used > MAX_BYTES_IN_POOL_AFTER_DATA_DELETE:
                    raise PoolDataNotErased(
                        f"Pool {cbp_name} has {ceph_pool_byte_used} bytes which were not deleted"
                    )
            else:
                raise PvcNotDeleted(f"PVC {pvc_list} were not deleted")
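
Instead of the fixed 15-second sleep, the pool usage could be polled until it drops below the threshold; a hedged sketch reusing the helpers from the example above (the TimeoutSampler import path and timing values are assumptions):

from ocs_ci.utility.utils import TimeoutSampler

for used_bytes in TimeoutSampler(300, 10, get_byte_used_by_pool, cbp_name):
    if used_bytes <= MAX_BYTES_IN_POOL_AFTER_DATA_DELETE:
        break

Example #12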
    def test_replication_with_disruptions(
        self,
        awscli_pod_session,
        mcg_obj_session,
        cld_mgr,
        bucket_factory,
        source_bucketclass,
        target_bucketclass,
        test_directory_setup,
        nodes,
    ):

        # Check uni-directional bucket replication from a multi-cloud (aws+azure)
        # namespace bucket to an s3-compatible namespace bucket
        target_bucket_name = bucket_factory(
            bucketclass=target_bucketclass)[0].name
        replication_policy = ("basic-replication-rule", target_bucket_name,
                              None)
        source_bucket_name = bucket_factory(
            bucketclass=source_bucketclass,
            replication_policy=replication_policy)[0].name
        written_random_objects = write_random_test_objects_to_bucket(
            awscli_pod_session,
            source_bucket_name,
            test_directory_setup.origin_dir,
            mcg_obj=mcg_obj_session,
            amount=5,
            pattern="first-write-",
        )
        logger.info(f"Written objects: {written_random_objects}")

        compare_bucket_object_list(mcg_obj_session, source_bucket_name,
                                   target_bucket_name)
        logger.info("Uni-directional bucket replication working as expected")

        # change from uni-directional to bi-directional replication policy
        logger.info(
            "Changing the replication policy from uni to bi-directional!")
        bi_replication_policy_dict = {
            "spec": {
                "additionalConfig": {
                    "replicationPolicy":
                    json.dumps([{
                        "rule_id": "basic-replication-rule-2",
                        "destination_bucket": source_bucket_name,
                    }])
                }
            }
        }
        OCP(
            namespace=config.ENV_DATA["cluster_namespace"],
            kind="obc",
            resource_name=target_bucket_name,
        ).patch(params=json.dumps(bi_replication_policy_dict),
                format_type="merge")
        logger.info(
            "Patch ran successfully! Changed the replication policy from uni to bi directional"
        )

        # write objects to the second bucket and see if it's replicated on the other
        logger.info("checking if bi-directional replication works!!")
        written_random_objects = write_random_test_objects_to_bucket(
            awscli_pod_session,
            target_bucket_name,
            test_directory_setup.origin_dir,
            mcg_obj=mcg_obj_session,
            amount=3,
            pattern="second-write-",
        )
        logger.info(f"Written objects: {written_random_objects}")
        compare_bucket_object_list(mcg_obj_session, source_bucket_name,
                                   target_bucket_name)
        logger.info("Bi directional bucket replication working as expected")

        # Delete all objects in the s3-compatible namespace bucket and then
        # recover them from the other namespace bucket on a new write
        logger.info(
            "checking replication when one of the bucket's objects are deleted!!"
        )
        try:
            mcg_obj_session.s3_resource.Bucket(
                target_bucket_name).objects.all().delete()
        except CommandFailed as e:
            logger.error(f"[Error] while deleting objects: {e}")
        assert (
            len(mcg_obj_session.s3_list_all_objects_in_bucket(target_bucket_name)) == 0
        ), f"[Error] Unexpectedly objects were not deleted from {target_bucket_name}"
        logger.info("All the objects in RGW namespace buckets are deleted!!!")

        written_random_objects = write_random_test_objects_to_bucket(
            awscli_pod_session,
            target_bucket_name,
            test_directory_setup.origin_dir,
            mcg_obj=mcg_obj_session,
            amount=1,
            pattern="third-write-",
        )
        logger.info(f"Written objects: {written_random_objects}")

        compare_bucket_object_list(mcg_obj_session, source_bucket_name,
                                   target_bucket_name)
        logger.info(
            "All the objects retrieved back to s3-compatible bucket on new write!!"
        )

        # restart RGW pods and then see if object sync still works
        logger.info(
            "Checking if the replication works when there is RGW pod restarts!!"
        )
        written_random_objects = write_random_test_objects_to_bucket(
            awscli_pod_session,
            target_bucket_name,
            test_directory_setup.origin_dir,
            mcg_obj=mcg_obj_session,
            amount=1,
            pattern="fourth-write-",
        )
        logger.info(f"Written objects: {written_random_objects}")

        pod_names = get_pod_name_by_pattern(
            "rgw", namespace=config.ENV_DATA["cluster_namespace"])
        pod_objs = get_rgw_pods(namespace=config.ENV_DATA["cluster_namespace"])
        delete_pods(pod_objs=pod_objs)
        wait_for_pods_to_be_running(
            pod_names=pod_names,
            namespace=config.ENV_DATA["cluster_namespace"])

        compare_bucket_object_list(mcg_obj_session, source_bucket_name,
                                   target_bucket_name)
        logger.info("Object sync works after the RGW pod restarted!!")

        # write some object to any of the bucket, followed by immediate cluster restart
        logger.info("Checking replication when there is a cluster reboot!!")
        written_random_objects = write_random_test_objects_to_bucket(
            awscli_pod_session,
            target_bucket_name,
            test_directory_setup.origin_dir,
            mcg_obj=mcg_obj_session,
            amount=1,
            pattern="fifth-write-",
        )
        logger.info(f"Written objects: {written_random_objects}")

        node_list = get_worker_nodes()
        node_objs = get_node_objs(node_list)
        nodes.restart_nodes(node_objs, timeout=500)
        retry(
            (CommandFailed, TimeoutError, AssertionError,
             ResourceWrongStatusException),
            tries=60,
            delay=15,
        )(ocp.wait_for_cluster_connectivity)(tries=400)
        wait_for_pods_to_be_running(
            namespace=config.ENV_DATA["cluster_namespace"], timeout=800)
        logger.info("Nodes rebooted successfully!!")

        compare_bucket_object_list(mcg_obj_session, source_bucket_name,
                                   target_bucket_name)
        logger.info("Objects sync works even when the cluster is rebooted")
Example #13
    def test_no_volume_mounted(self):
        """
        Test reclaimspace job with no volume mounted

        Steps:
        1. Create and attach RBD PVC of size 25 GiB to an app pod.
        2. Get the used size of the RBD pool
        3. Create a file of size 10GiB
        4. Delete the file
        5. Delete the pod
        6. Create ReclaimSpaceJob
        7. No errors should be seen in reclaim space job

        """
        pvc_obj = self.pvc[0]
        pod_obj = self.pod[0]

        fio_filename1 = "fio_file1"

        # Fetch the used size of pool
        cbp_name = self.sc_obj.get().get("parameters").get("pool")
        used_size_before_io = fetch_used_size(cbp_name)
        log.info(f"Used size before IO is {used_size_before_io}")

        # Create a 10 GiB file
        pod_obj.run_io(
            storage_type="fs",
            size="10G",
            runtime=120,
            fio_filename=fio_filename1,
            end_fsync=1,
        )
        pod_obj.get_fio_results()

        # Verify used size after IO
        exp_used_size_after_io = used_size_before_io + (10 * self.pool_replica)
        used_size_after_io = fetch_used_size(cbp_name, exp_used_size_after_io)
        log.info(f"Used size after IO is {used_size_after_io}")

        # Delete the file
        file_path = get_file_path(pod_obj, fio_filename1)
        pod_obj.exec_cmd_on_pod(command=f"rm -f {file_path}",
                                out_yaml_format=False)

        # Verify whether file is deleted
        try:
            check_file_existence(pod_obj=pod_obj, file_path=file_path)
        except CommandFailed as cmdfail:
            if "No such file or directory" not in str(cmdfail):
                raise
            log.info(f"Verified: File {file_path} deleted.")

        # Delete the pod
        log.info(f"Deleting the pod {pod_obj}")
        delete_pods([pod_obj])

        # Create ReclaimSpaceJob
        reclaim_space_job = pvc_obj.create_reclaim_space_job()

        # Verify Succeeded result of ReclaimSpaceJob
        self.reclaim_space_job(reclaim_space_job)
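
The final helper call presumably verifies the ReclaimSpaceJob result; a hedged sketch of such a check, polling the job CR until its status reports Succeeded (field names follow the csi-addons ReclaimSpaceJob status; the helper itself is an assumption):

import time


def wait_for_reclaim_space_job_succeeded(job_obj, timeout=120, sleep=5):
    """Poll the ReclaimSpaceJob CR until status.result is 'Succeeded'."""
    deadline = time.time() + timeout
    while time.time() < deadline:
        result = job_obj.get().get("status", {}).get("result")
        if result == "Succeeded":
            return True
        time.sleep(sleep)
    return False


# Usage (hypothetical):
# assert wait_for_reclaim_space_job_succeeded(reclaim_space_job), "ReclaimSpaceJob did not succeed"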