Example #1
    def deploy_ocs(self):
        """
        Handle OCS deployment. Since the OCS deployment steps are common to
        all platforms, the deployment is implemented here in the base class.
        """
        _templating = templating.Templating()

        ceph_cluster = ocp.OCP(
            kind='CephCluster', namespace=config.ENV_DATA['cluster_namespace']
        )
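        # A pre-existing CephCluster in the target namespace means OCS is
        # already deployed, so the method returns early; the IndexError /
        # CommandFailed path below means no cluster was found and the
        # installation proceeds.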
        try:
            ceph_cluster.get().get('items')[0]
            logger.warning("OCS cluster already exists")
            return
        except (IndexError, CommandFailed):
            logger.info("Running OCS basic installation")

        create_oc_resource(
            'common.yaml', self.cluster_path, _templating, config.ENV_DATA
        )

        run_cmd(
            f'oc label namespace {config.ENV_DATA["cluster_namespace"]} '
            f'"openshift.io/cluster-monitoring=true"'
        )
        run_cmd(
            f"oc policy add-role-to-user view "
            f"system:serviceaccount:openshift-monitoring:prometheus-k8s "
            f"-n {config.ENV_DATA['cluster_namespace']}"
        )
        # HACK: If you would like to drop this hack, make sure that you also
        # updated docs and write appropriate unit/integration tests for config
        # processing.
        if config.ENV_DATA.get('monitoring_enabled') in ("true", "True", True):
            # RBAC rules for monitoring, based on documentation change in rook:
            # https://github.com/rook/rook/commit/1b6fe840f6ae7372a9675ba727ecc65326708aa8
            # HACK: This should be dropped when OCS is managed by OLM
            apply_oc_resource(
                'rbac.yaml',
                self.cluster_path,
                _templating,
                config.ENV_DATA,
                template_dir="monitoring"
            )
        # Increased to 15 seconds as 10 is not enough
        # TODO: use a sampler function to check if the resource exists
        wait_time = 15
        logger.info(f"Waiting {wait_time} seconds...")
        time.sleep(wait_time)
        create_oc_resource(
            'operator-openshift.yaml', self.cluster_path,
            _templating, config.ENV_DATA
        )
        logger.info(f"Waiting {wait_time} seconds...")
        time.sleep(wait_time)
        run_cmd(
            f"oc wait --for condition=ready pod "
            f"-l app=rook-ceph-operator "
            f"-n {config.ENV_DATA['cluster_namespace']} "
            f"--timeout=120s"
        )
        run_cmd(
            f"oc wait --for condition=ready pod "
            f"-l app=rook-discover "
            f"-n {config.ENV_DATA['cluster_namespace']} "
            f"--timeout=120s"
        )
        create_oc_resource(
            'cluster.yaml', self.cluster_path, _templating, config.ENV_DATA
        )

        pod = ocp.OCP(
            kind=constants.POD, namespace=config.ENV_DATA['cluster_namespace']
        )
        cfs = ocp.OCP(
            kind=constants.CEPHFILESYSTEM,
            namespace=config.ENV_DATA['cluster_namespace']
        )
        # Check for Ceph pods
        assert pod.wait_for_resource(
            condition='Running', selector='app=rook-ceph-mon',
            resource_count=3, timeout=600
        )
        assert pod.wait_for_resource(
            condition='Running', selector='app=rook-ceph-mgr',
            timeout=600
        )
        assert pod.wait_for_resource(
            condition='Running', selector='app=rook-ceph-osd',
            resource_count=3, timeout=600
        )

        create_oc_resource(
            'toolbox.yaml', self.cluster_path, _templating, config.ENV_DATA
        )
        logger.info(f"Waiting {wait_time} seconds...")
        time.sleep(wait_time)
        # HACK: This should be dropped (including service-monitor.yaml and
        # prometheus-rules.yaml files) when OCS is managed by OLM
        if config.ENV_DATA.get('monitoring_enabled') not in ("true", "True", True):
            # HACK: skip creation of rook-ceph-mgr service monitor when monitoring
            # is enabled (if this were not skipped, the step would fail because
            # rook would create the service monitor at this point already)
            create_oc_resource(
                "service-monitor.yaml", self.cluster_path, _templating,
                config.ENV_DATA
            )
            # HACK: skip creation of prometheus-rules; rook-ceph takes care
            # of its setup now, based on clarification from Umanga Chapagain
            create_oc_resource(
                "prometheus-rules.yaml", self.cluster_path, _templating,
                config.ENV_DATA
            )
        logger.info(f"Waiting {wait_time} seconds...")
        time.sleep(wait_time)

        # Create MDS pods for CephFileSystem
        fs_data = templating.load_yaml_to_dict(constants.CEPHFILESYSTEM_YAML)
        fs_data['metadata']['namespace'] = config.ENV_DATA['cluster_namespace']

        ceph_obj = OCS(**fs_data)
        ceph_obj.create()
        assert pod.wait_for_resource(
            condition=constants.STATUS_RUNNING, selector='app=rook-ceph-mds',
            resource_count=2, timeout=600
        )

        # Check for CephFilesystem creation in ocp
        cfs_data = cfs.get()
        cfs_name = cfs_data['items'][0]['metadata']['name']

        if helpers.validate_cephfilesystem(cfs_name):
            logger.info(f"MDS deployment is successful!")
            defaults.CEPHFILESYSTEM_NAME = cfs_name
        else:
            logger.error("MDS deployment Failed! Please check logs!")

        # Verify health of ceph cluster
        # TODO: move destroy cluster logic to new CLI usage pattern?
        logger.info("Done creating rook resources, waiting for HEALTH_OK")
        assert ceph_health_check(
            namespace=config.ENV_DATA['cluster_namespace']
        )
        # patch gp2 (EBS) storage class as 'non-default'
        logger.info("Patch gp2 storageclass as non-default")
        patch = " '{\"metadata\": {\"annotations\":{\"storageclass.kubernetes.io/is-default-class\":\"false\"}}}' "
        run_cmd(
            f"oc patch storageclass gp2 "
            f"-p {patch} "
            f"--request-timeout=120s"
        )
Example #2
    def test_rwo_pvc_fencing_node_prolonged_network_failure(
            self, nodes, setup, teardown):
        """
        OCS-1427/OCS-1429:
        - Start DeploymentConfig based app pods on 1 OCS/Non-OCS node
        - Make the node (where app pods are running) unresponsive
            by bringing its main network interface down
        - Check new app pods and/or mon, osd pods scheduled on another node
            are stuck due to Multi-Attach error.
        - Power off the unresponsive node
        - Force delete the app pods and/or mon,osd pods on the unresponsive node
        - Check that new app pods and/or mon/osd pods scheduled on another node
            come into Running state
        - Run IOs on new app pods

        OCS-1430/OCS-1435:
        - Start DeploymentConfig based app pods on multiple nodes
            Colocated scenario: Select 1 node where an osd and/or mon is
                running, and 2 other nodes where mon/osd are not running
            Dedicated scenario: 3 Non-OCS nodes
        - Disrupt the leader provisioner pods if they are not running on the
            nodes selected above
        - Make the nodes (where app pods are running) unresponsive
            by bringing their main network interface down
        - Check new app pods and/or mon, osd pods scheduled on another node
            are stuck due to Multi-Attach error.
        - Power off the unresponsive nodes
        - Force delete the app pods and/or mon,osd pods on the unresponsive node
        - Check that new app pods and/or mon/osd pods scheduled on another node
            come into Running state
        - Run IOs on new app pods

        """
        ceph_cluster, dc_pods, ceph_pods, app_pod_nodes, test_nodes, disruptor = setup

        # Run IO on pods
        md5sum_data = self.run_and_verify_io(pod_list=dc_pods,
                                             fio_filename="io_file1",
                                             run_io_in_bg=True)

        # OCS-1430/OCS-1435
        # Disrupt leader plugin-provisioner pods, skip if running on node to be failed
        if disruptor:
            for disruption in disruptor:
                disruption.delete_resource()

        # Induce network failure on the nodes
        node.node_network_failure(app_pod_nodes)
        logger.info(f"Waiting for {self.prolong_nw_fail_time} seconds")
        sleep(self.prolong_nw_fail_time)

        # Wait for pods to be rescheduled
        for pod_obj in dc_pods + ceph_pods:
            pod_obj.ocp.wait_for_resource(
                condition=constants.STATUS_TERMINATING,
                resource_name=pod_obj.name)

        # Fetch info of new pods and verify Multi-Attach error
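        # RWO volumes remain attached to the unresponsive node, so the
        # replacement pods scheduled on other nodes hit a Multi-Attach error
        # until the old pods are force deleted.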
        new_dc_pods = self.get_new_pods(dc_pods)
        assert len(new_dc_pods) == len(
            dc_pods), "Unexpected number of app pods"
        self.verify_multi_attach_error(new_dc_pods)

        if ceph_pods:
            new_ceph_pods = self.get_new_pods(ceph_pods)
            assert len(new_ceph_pods) > 0, "Unexpected number of osd pods"
            self.verify_multi_attach_error(new_ceph_pods)

        logger.info("Executing manual recovery steps")
        # Power off the unresponsive node(s)
        logger.info(f"Powering off the unresponsive node(s): {app_pod_nodes}")
        nodes.stop_nodes(node.get_node_objs(app_pod_nodes))

        # Force delete the app pods and/or mon,osd pods on the unresponsive node
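        # (On 5-mon clusters running OCS < 4.4, mons hosted on the failed
        # nodes are added to the force-delete list as well.)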
        if ceph_cluster.mon_count == 5 and float(
                config.ENV_DATA["ocs_version"]) < 4.4:
            for pod_obj in ceph_cluster.mons:
                if pod.get_pod_node(pod_obj).name in app_pod_nodes:
                    ceph_pods.append(pod_obj)

        for pod_obj in dc_pods + ceph_pods:
            pod_obj.delete(force=True)

        # Wait for new app pods to reach Running state
        for pod_obj in new_dc_pods:
            assert pod_obj.ocp.wait_for_resource(
                condition=constants.STATUS_RUNNING,
                resource_name=pod_obj.name,
                timeout=1200,
                sleep=30,
            ), f"App pod with name {pod_obj.name} did not reach Running state"

        # Wait for mon and osd pods to reach Running state
        selectors_to_check = [constants.MON_APP_LABEL, constants.OSD_APP_LABEL]
        for selector in selectors_to_check:
            assert ceph_cluster.POD.wait_for_resource(
                condition=constants.STATUS_RUNNING,
                selector=selector,
                resource_count=3,
                timeout=1800,
                sleep=60,
            ), f"3 expected pods with selector {selector} are not in Running state"

        if ceph_cluster.mon_count == 3:
            # Check ceph health
            toolbox_status = ceph_cluster.POD.get_resource_status(
                ceph_cluster.toolbox.name)
            if toolbox_status == constants.STATUS_TERMINATING:
                ceph_cluster.toolbox.delete(force=True)

            assert ceph_health_check(), "Ceph cluster health is not OK"
            logger.info("Ceph cluster health is OK")

        # Verify data integrity from new pods
        for num, pod_obj in enumerate(new_dc_pods):
            pod.verify_data_integrity(pod_obj=pod_obj,
                                      file_name="io_file1",
                                      original_md5sum=md5sum_data[num])

        # Run IO on new pods
        self.run_and_verify_io(pod_list=new_dc_pods,
                               fio_filename="io_file2",
                               return_md5sum=False)
Example #3
def ocs_install_verification(
    timeout=600,
    skip_osd_distribution_check=False,
    ocs_registry_image=None,
    post_upgrade_verification=False,
    version_before_upgrade=None,
):
    """
    Perform steps necessary to verify a successful OCS installation

    Args:
        timeout (int): Number of seconds for timeout which will be used in the
            checks used in this function.
        skip_osd_distribution_check (bool): If true skip the check for osd
            distribution.
        ocs_registry_image (str): Specific image to check if it was installed
            properly.
        post_upgrade_verification (bool): Set to True if this function is
            called after upgrade.
        version_before_upgrade (float): Set to OCS version before upgrade

    """
    from ocs_ci.ocs.node import get_nodes
    from ocs_ci.ocs.resources.pvc import get_deviceset_pvcs
    from ocs_ci.ocs.resources.pod import get_ceph_tools_pod, get_all_pods
    from ocs_ci.ocs.cluster import validate_cluster_on_pvc
    from ocs_ci.ocs.resources.fips import check_fips_enabled

    number_of_worker_nodes = len(get_nodes())
    namespace = config.ENV_DATA["cluster_namespace"]
    log.info("Verifying OCS installation")

    # Verify OCS CSV is in Succeeded phase
    log.info("verifying ocs csv")
    ocs_csv = get_ocs_csv()
    # Verify if OCS CSV has proper version.
    csv_version = ocs_csv.data["spec"]["version"]
    ocs_version = config.ENV_DATA["ocs_version"]
    log.info(
        f"Check if OCS version: {ocs_version} matches with CSV: {csv_version}")
    assert (
        ocs_version in csv_version
    ), f"OCS version: {ocs_version} mismatch with CSV version {csv_version}"
    # Verify if OCS CSV has the same version in provided CI build.
    ocs_registry_image = ocs_registry_image or config.DEPLOYMENT.get(
        "ocs_registry_image")
    if ocs_registry_image and ocs_registry_image.endswith(".ci"):
        ocs_registry_image = ocs_registry_image.split(":")[1]
        log.info(
            f"Check if OCS registry image: {ocs_registry_image} matches with "
            f"CSV: {csv_version}")
        ignore_csv_mismatch = config.DEPLOYMENT.get("ignore_csv_mismatch")
        if ignore_csv_mismatch:
            log.info(
                "The possible mismatch will be ignored as you deployed "
                "the different version than the default version from the CSV")
        else:
            assert ocs_registry_image in csv_version, (
                f"OCS registry image version: {ocs_registry_image} mismatch "
                f"with CSV version {csv_version}")

    # Verify OCS Cluster Service (ocs-storagecluster) is Ready
    storage_cluster_name = config.ENV_DATA["storage_cluster_name"]
    log.info("Verifying status of storage cluster: %s", storage_cluster_name)
    storage_cluster = StorageCluster(
        resource_name=storage_cluster_name,
        namespace=namespace,
    )
    log.info(f"Check if StorageCluster: {storage_cluster_name} is in"
             f"Succeeded phase")
    storage_cluster.wait_for_phase(phase="Ready", timeout=timeout)

    # Verify pods in running state and proper counts
    log.info("Verifying pod states and counts")
    pod = OCP(kind=constants.POD, namespace=namespace)
    if not config.DEPLOYMENT["external_mode"]:
        osd_count = int(
            storage_cluster.data["spec"]["storageDeviceSets"][0]["count"]
        ) * int(
            storage_cluster.data["spec"]["storageDeviceSets"][0]["replica"])
    rgw_count = None
    if config.ENV_DATA.get("platform") in constants.ON_PREM_PLATFORMS:
        #  RGW count is 1 if OCS version < 4.5 or the cluster was upgraded from version <= 4.4
        if (float(config.ENV_DATA["ocs_version"]) < 4.5
                or float(config.ENV_DATA["ocs_version"]) == 4.5 and
            (post_upgrade_verification
             and float(version_before_upgrade) < 4.5)):
            rgw_count = 1
        else:
            rgw_count = 2

    # With a 4.4 OCS cluster deployed over Azure, RGW is the default backingstore
    if config.ENV_DATA.get("platform") == constants.AZURE_PLATFORM:
        if float(config.ENV_DATA["ocs_version"]) == 4.4 or (
                float(config.ENV_DATA["ocs_version"]) == 4.5 and
            (post_upgrade_verification
             and float(version_before_upgrade) < 4.5)):
            rgw_count = 1

    min_eps = constants.MIN_NB_ENDPOINT_COUNT_POST_DEPLOYMENT
    max_eps = (constants.MAX_NB_ENDPOINT_COUNT
               if float(config.ENV_DATA["ocs_version"]) >= 4.6 else 1)

    if config.ENV_DATA.get("platform") == constants.IBM_POWER_PLATFORM:
        min_eps = 1
        max_eps = 1

    nb_db_label = (constants.NOOBAA_DB_LABEL_46_AND_UNDER
                   if float(config.ENV_DATA["ocs_version"]) < 4.7 else
                   constants.NOOBAA_DB_LABEL_47_AND_ABOVE)
    resources_dict = {
        nb_db_label: 1,
        constants.OCS_OPERATOR_LABEL: 1,
        constants.OPERATOR_LABEL: 1,
        constants.NOOBAA_OPERATOR_POD_LABEL: 1,
        constants.NOOBAA_CORE_POD_LABEL: 1,
        constants.NOOBAA_ENDPOINT_POD_LABEL: min_eps,
    }
    if not config.DEPLOYMENT["external_mode"]:
        resources_dict.update({
            constants.MON_APP_LABEL: 3,
            constants.CSI_CEPHFSPLUGIN_LABEL: number_of_worker_nodes,
            constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL: 2,
            constants.CSI_RBDPLUGIN_LABEL: number_of_worker_nodes,
            constants.CSI_RBDPLUGIN_PROVISIONER_LABEL: 2,
            constants.OSD_APP_LABEL: osd_count,
            constants.MGR_APP_LABEL: 1,
            constants.MDS_APP_LABEL: 2,
            constants.RGW_APP_LABEL: rgw_count,
        })

    for label, count in resources_dict.items():
        if label == constants.RGW_APP_LABEL:
            if config.ENV_DATA.get(
                    "platform") not in constants.ON_PREM_PLATFORMS:
                continue
        assert pod.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            selector=label,
            resource_count=count,
            timeout=timeout,
        )

    nb_ep_pods = get_pods_having_label(
        label=constants.NOOBAA_ENDPOINT_POD_LABEL,
        namespace=defaults.ROOK_CLUSTER_NAMESPACE,
    )
    assert len(nb_ep_pods) <= max_eps, (
        f"The number of running NooBaa endpoint pods ({len(nb_ep_pods)}) "
        f"is greater than the maximum defined in the NooBaa CR ({max_eps})")

    # Verify StorageClasses (1 ceph-fs, 1 ceph-rbd)
    log.info("Verifying storage classes")
    storage_class = OCP(kind=constants.STORAGECLASS, namespace=namespace)
    storage_cluster_name = config.ENV_DATA["storage_cluster_name"]
    required_storage_classes = {
        f"{storage_cluster_name}-cephfs",
        f"{storage_cluster_name}-ceph-rbd",
    }
    if config.DEPLOYMENT["external_mode"]:
        required_storage_classes.update({
            f"{storage_cluster_name}-ceph-rgw",
            f'{config.ENV_DATA["cluster_namespace"]}.noobaa.io',
        })
    storage_classes = storage_class.get()
    storage_class_names = {
        item["metadata"]["name"]
        for item in storage_classes["items"]
    }
    assert required_storage_classes.issubset(storage_class_names)

    # Verify OSDs are distributed
    if not config.DEPLOYMENT["external_mode"]:
        if not skip_osd_distribution_check:
            log.info(
                "Verifying OSDs are distributed evenly across worker nodes")
            ocp_pod_obj = OCP(kind=constants.POD, namespace=namespace)
            osds = ocp_pod_obj.get(selector=constants.OSD_APP_LABEL)["items"]
            deviceset_count = get_deviceset_count()
            node_names = [osd["spec"]["nodeName"] for osd in osds]
            for node in node_names:
                assert (
                    not node_names.count(node) > deviceset_count
                ), "OSDs are not distributed evenly across worker nodes"

    # Verify that CSI driver object contains provisioner names
    log.info("Verifying CSI driver object contains provisioner names.")
    csi_driver = OCP(kind="CSIDriver")
    csi_drivers = {
        item["metadata"]["name"]
        for item in csi_driver.get()["items"]
    }
    assert defaults.CSI_PROVISIONERS.issubset(csi_drivers)

    # Verify node and provisioner secret names in storage class
    log.info("Verifying node and provisioner secret names in storage class.")
    if config.DEPLOYMENT["external_mode"]:
        sc_rbd = storage_class.get(
            resource_name=constants.DEFAULT_EXTERNAL_MODE_STORAGECLASS_RBD)
        sc_cephfs = storage_class.get(resource_name=(
            constants.DEFAULT_EXTERNAL_MODE_STORAGECLASS_CEPHFS))
    else:
        sc_rbd = storage_class.get(
            resource_name=constants.DEFAULT_STORAGECLASS_RBD)
        sc_cephfs = storage_class.get(
            resource_name=constants.DEFAULT_STORAGECLASS_CEPHFS)
    assert (sc_rbd["parameters"]["csi.storage.k8s.io/node-stage-secret-name"]
            == constants.RBD_NODE_SECRET)
    assert (sc_rbd["parameters"]["csi.storage.k8s.io/provisioner-secret-name"]
            == constants.RBD_PROVISIONER_SECRET)
    assert (
        sc_cephfs["parameters"]["csi.storage.k8s.io/node-stage-secret-name"] ==
        constants.CEPHFS_NODE_SECRET)
    assert (
        sc_cephfs["parameters"]["csi.storage.k8s.io/provisioner-secret-name"]
        == constants.CEPHFS_PROVISIONER_SECRET)
    log.info("Verified node and provisioner secret names in storage class.")

    # Verify ceph osd tree output
    if not config.DEPLOYMENT["external_mode"]:
        log.info(
            "Verifying ceph osd tree output and checking for device set PVC names "
            "in the output.")

        if config.DEPLOYMENT.get("local_storage"):
            deviceset_pvcs = [osd.get_node() for osd in get_osd_pods()]
        else:
            deviceset_pvcs = [pvc.name for pvc in get_deviceset_pvcs()]

        ct_pod = get_ceph_tools_pod()
        osd_tree = ct_pod.exec_ceph_cmd(ceph_cmd="ceph osd tree",
                                        format="json")
        schemas = {
            "root": constants.OSD_TREE_ROOT,
            "rack": constants.OSD_TREE_RACK,
            "host": constants.OSD_TREE_HOST,
            "osd": constants.OSD_TREE_OSD,
            "region": constants.OSD_TREE_REGION,
            "zone": constants.OSD_TREE_ZONE,
        }
        schemas["host"]["properties"]["name"] = {"enum": deviceset_pvcs}
        for item in osd_tree["nodes"]:
            validate(instance=item, schema=schemas[item["type"]])
            if item["type"] == "host":
                deviceset_pvcs.remove(item["name"])
        assert not deviceset_pvcs, (
            f"These device set PVCs are not given in ceph osd tree output "
            f"- {deviceset_pvcs}")
        log.info(
            "Verified ceph osd tree output. Device set PVC names are given in the "
            "output.")

    # TODO: Verify ceph osd tree output have osd listed as ssd
    # TODO: Verify ceph osd tree output have zone or rack based on AZ

    # Verify CSI snapshotter sidecar container is not present
    # if the OCS version is < 4.6
    if float(config.ENV_DATA["ocs_version"]) < 4.6:
        log.info("Verifying CSI snapshotter is not present.")
        provisioner_pods = get_all_pods(
            namespace=defaults.ROOK_CLUSTER_NAMESPACE,
            selector=[
                constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL,
                constants.CSI_RBDPLUGIN_PROVISIONER_LABEL,
            ],
        )
        for pod_obj in provisioner_pods:
            pod_info = pod_obj.get()
            for container, image in get_images(data=pod_info).items():
                assert ("snapshot" not in container) and (
                    "snapshot" not in image
                ), (f"Snapshot container is present in {pod_obj.name} pod. "
                    f"Container {container}. Image {image}")
        deployments = ocs_csv.get()["spec"]["install"]["spec"]["deployments"]
        rook_ceph_operator_deployment = [
            deployment_val for deployment_val in deployments
            if deployment_val["name"] == "rook-ceph-operator"
        ]
        assert {
            "name": "CSI_ENABLE_SNAPSHOTTER",
            "value": "false"
        } in (rook_ceph_operator_deployment[0]["spec"]["template"]["spec"]
              ["containers"][0]["env"]
              ), "CSI_ENABLE_SNAPSHOTTER value is not set to 'false'."
        log.info("Verified: CSI snapshotter is not present.")

    # Verify pool crush rule is with "type": "zone"
    if utils.get_az_count() == 3:
        log.info("Verifying pool crush rule is with type: zone")
        crush_dump = ct_pod.exec_ceph_cmd(ceph_cmd="ceph osd crush dump",
                                          format="")
        pool_names = [
            constants.METADATA_POOL,
            constants.DEFAULT_BLOCKPOOL,
            constants.DATA_POOL,
        ]
        crush_rules = [
            rule for rule in crush_dump["rules"]
            if rule["rule_name"] in pool_names
        ]
        for crush_rule in crush_rules:
            assert [
                item for item in crush_rule["steps"]
                if item.get("type") == "zone"
            ], f"{crush_rule['rule_name']} is not with type as zone"
        log.info("Verified - pool crush rule is with type: zone")
    log.info("Validate cluster on PVC")
    validate_cluster_on_pvc()

    # Verify ceph health
    log.info("Verifying ceph health")
    health_check_tries = 20
    health_check_delay = 30
    if post_upgrade_verification:
        # In case of upgrade with FIO we have to wait longer time to see
        # health OK. See discussion in BZ:
        # https://bugzilla.redhat.com/show_bug.cgi?id=1817727
        health_check_tries = 180
    assert utils.ceph_health_check(namespace, health_check_tries,
                                   health_check_delay)
    if config.ENV_DATA.get("fips"):
        # If FIPS was enabled at deployment time, verify that FIPS is
        # enabled on all pods in the Running state
        check_fips_enabled()
    if config.ENV_DATA.get("encryption_at_rest"):
        osd_encryption_verification()
Example #4
def finalizer():
    nodes.restart_nodes_by_stop_and_start_teardown()
    assert ceph_health_check(), "Ceph cluster health is not OK"
    log.info("Ceph cluster health is OK")
Example #5
    def deploy_ocs(self):
        """
        Handle OCS deployment. Since the OCS deployment steps are common to
        all platforms, the deployment is implemented here in the base class.
        """
        set_registry_to_managed_state()
        image = None
        ceph_cluster = ocp.OCP(kind="CephCluster", namespace=self.namespace)
        try:
            ceph_cluster.get().get("items")[0]
            logger.warning("OCS cluster already exists")
            return
        except (IndexError, CommandFailed):
            logger.info("Running OCS basic installation")

        # disconnected installation?
        load_cluster_info()
        if config.DEPLOYMENT.get("disconnected"):
            image = prepare_disconnected_ocs_deployment()

        if config.DEPLOYMENT["external_mode"]:
            self.deploy_with_external_mode()
        else:
            self.deploy_ocs_via_operator(image)
            pod = ocp.OCP(kind=constants.POD, namespace=self.namespace)
            cfs = ocp.OCP(kind=constants.CEPHFILESYSTEM,
                          namespace=self.namespace)
            # Check for Ceph pods
            mon_pod_timeout = (900 if self.platform
                               == constants.IBMCLOUD_PLATFORM else 600)
            assert pod.wait_for_resource(
                condition="Running",
                selector="app=rook-ceph-mon",
                resource_count=3,
                timeout=mon_pod_timeout,
            )
            assert pod.wait_for_resource(condition="Running",
                                         selector="app=rook-ceph-mgr",
                                         timeout=600)
            assert pod.wait_for_resource(
                condition="Running",
                selector="app=rook-ceph-osd",
                resource_count=3,
                timeout=600,
            )

            # validate ceph mon/osd volumes are backed by pvc
            validate_cluster_on_pvc()

            # validate PDB creation of MON, MDS, OSD pods
            validate_pdb_creation()

            # Creating toolbox pod
            setup_ceph_toolbox()

            assert pod.wait_for_resource(
                condition=constants.STATUS_RUNNING,
                selector="app=rook-ceph-tools",
                resource_count=1,
                timeout=600,
            )

            if not config.COMPONENTS["disable_cephfs"]:
                # Check for CephFilesystem creation in ocp
                cfs_data = cfs.get()
                cfs_name = cfs_data["items"][0]["metadata"]["name"]

                if helpers.validate_cephfilesystem(cfs_name):
                    logger.info("MDS deployment is successful!")
                    defaults.CEPHFILESYSTEM_NAME = cfs_name
                else:
                    logger.error("MDS deployment Failed! Please check logs!")

        # Change monitoring backend to OCS
        if config.ENV_DATA.get("monitoring_enabled") and config.ENV_DATA.get(
                "persistent-monitoring"):
            setup_persistent_monitoring()
        elif config.ENV_DATA.get("monitoring_enabled") and config.ENV_DATA.get(
                "telemeter_server_url"):
            # Create configmap cluster-monitoring-config to reconfigure
            # telemeter server url when 'persistent-monitoring' is False
            create_configmap_cluster_monitoring_pod(
                telemeter_server_url=config.ENV_DATA["telemeter_server_url"])

        if not config.COMPONENTS["disable_cephfs"]:
            # Change registry backend to OCS CEPHFS RWX PVC
            registry.change_registry_backend_to_ocs()

        # Verify health of ceph cluster
        logger.info("Done creating rook resources, waiting for HEALTH_OK")
        try:
            ceph_health_check(namespace=self.namespace, tries=30, delay=10)
        except CephHealthException as ex:
            err = str(ex)
            logger.warning(f"Ceph health check failed with {err}")
            if "clock skew detected" in err:
                logger.info(f"Changing NTP on compute nodes to"
                            f" {constants.RH_NTP_CLOCK}")
                if self.platform == constants.VSPHERE_PLATFORM:
                    update_ntp_compute_nodes()
                assert ceph_health_check(namespace=self.namespace,
                                         tries=60,
                                         delay=10)

        # patch gp2/thin storage class as 'non-default'
        self.patch_default_sc_to_non_default()
Example #6
    def deploy_ocs(self):
        """
        Handle OCS deployment. Since the OCS deployment steps are common to
        all platforms, the deployment is implemented here in the base class.
        """
        _templating = templating.Templating()

        ceph_cluster = ocp.OCP(kind='CephCluster', namespace=self.namespace)
        try:
            ceph_cluster.get().get('items')[0]
            logger.warning("OCS cluster already exists")
            return
        except (IndexError, CommandFailed):
            logger.info("Running OCS basic installation")

        if not self.ocs_operator_deployment:
            create_oc_resource('common.yaml', self.cluster_path, _templating,
                               config.ENV_DATA)
            run_cmd(
                f'oc label namespace {config.ENV_DATA["cluster_namespace"]} '
                f'"openshift.io/cluster-monitoring=true"')
            run_cmd(
                f"oc policy add-role-to-user view "
                f"system:serviceaccount:openshift-monitoring:prometheus-k8s "
                f"-n {self.namespace}")
            # HACK: If you would like to drop this hack, make sure that you
            # also updated docs and write appropriate unit/integration tests
            # for config processing.
            if config.ENV_DATA.get('monitoring_enabled') in ("true", "True",
                                                             True):
                # RBAC rules for monitoring, based on documentation change in
                # rook:
                # https://github.com/rook/rook/commit/1b6fe840f6ae7372a9675ba727ecc65326708aa8
                # HACK: This should be dropped when OCS is managed by OLM
                apply_oc_resource('rbac.yaml',
                                  self.cluster_path,
                                  _templating,
                                  config.ENV_DATA,
                                  template_dir="monitoring")
            # Increased to 15 seconds as 10 is not enough
            # TODO: use a sampler function to check if the resource exists
            wait_time = 15
            logger.info(f"Waiting {wait_time} seconds...")
            time.sleep(wait_time)
            create_oc_resource('operator-openshift.yaml', self.cluster_path,
                               _templating, config.ENV_DATA)
            logger.info(f"Waiting {wait_time} seconds...")
            time.sleep(wait_time)
            run_cmd(f"oc wait --for condition=ready pod "
                    f"-l app=rook-ceph-operator "
                    f"-n {self.namespace} "
                    f"--timeout=120s")
            run_cmd(f"oc wait --for condition=ready pod "
                    f"-l app=rook-discover "
                    f"-n {self.namespace} "
                    f"--timeout=120s")
            create_oc_resource('cluster.yaml', self.cluster_path, _templating,
                               config.ENV_DATA)
        else:
            self.deploy_ocs_via_operator()

        pod = ocp.OCP(kind=constants.POD, namespace=self.namespace)
        cfs = ocp.OCP(kind=constants.CEPHFILESYSTEM, namespace=self.namespace)
        # Check for Ceph pods
        assert pod.wait_for_resource(condition='Running',
                                     selector='app=rook-ceph-mon',
                                     resource_count=3,
                                     timeout=600)
        assert pod.wait_for_resource(condition='Running',
                                     selector='app=rook-ceph-mgr',
                                     timeout=600)
        assert pod.wait_for_resource(condition='Running',
                                     selector='app=rook-ceph-osd',
                                     resource_count=3,
                                     timeout=600)
        # Validation for cluster on pvc
        logger.info("Validate mon and OSD are backed by PVCs")
        validate_cluster_on_pvc(label=constants.MON_APP_LABEL)
        validate_cluster_on_pvc(label=constants.DEFAULT_DEVICESET_LABEL)

        # Creating toolbox pod
        setup_ceph_toolbox()

        assert pod.wait_for_resource(condition=constants.STATUS_RUNNING,
                                     selector='app=rook-ceph-tools',
                                     resource_count=1,
                                     timeout=600)

        if not self.ocs_operator_deployment:
            logger.info(f"Waiting {wait_time} seconds...")
            time.sleep(wait_time)
            # HACK: This should be dropped (including service-monitor.yaml and
            # prometheus-rules.yaml files) when OCS is managed by OLM
            if config.ENV_DATA.get('monitoring_enabled') not in ("true",
                                                                 "True", True):
                # HACK: skip creation of rook-ceph-mgr service monitor when
                # monitoring is enabled (if this were not skipped, the step
                # would fail because rook would create the service monitor at
                # this point already)
                create_oc_resource("service-monitor.yaml", self.cluster_path,
                                   _templating, config.ENV_DATA)
                # HACK: skip creation of prometheus-rules; rook-ceph takes
                # care of its setup now, based on clarification from Umanga
                # Chapagain
                create_oc_resource("prometheus-rules.yaml", self.cluster_path,
                                   _templating, config.ENV_DATA)
            logger.info(f"Waiting {wait_time} seconds...")
            time.sleep(wait_time)

            # Create MDS pods for CephFileSystem
            fs_data = templating.load_yaml(constants.CEPHFILESYSTEM_YAML)
            fs_data['metadata']['namespace'] = self.namespace

            ceph_obj = OCS(**fs_data)
            ceph_obj.create()
            assert pod.wait_for_resource(condition=constants.STATUS_RUNNING,
                                         selector='app=rook-ceph-mds',
                                         resource_count=2,
                                         timeout=600)

        # Check for CephFilesystem creation in ocp
        cfs_data = cfs.get()
        cfs_name = cfs_data['items'][0]['metadata']['name']

        if helpers.validate_cephfilesystem(cfs_name):
            logger.info(f"MDS deployment is successful!")
            defaults.CEPHFILESYSTEM_NAME = cfs_name
        else:
            logger.error(f"MDS deployment Failed! Please check logs!")

        if config.ENV_DATA.get('monitoring_enabled') and config.ENV_DATA.get(
                'persistent-monitoring'):
            # Create a pool, secrets and sc
            secret_obj = helpers.create_secret(
                interface_type=constants.CEPHBLOCKPOOL)
            cbj_obj = helpers.create_ceph_block_pool()
            sc_obj = helpers.create_storage_class(
                interface_type=constants.CEPHBLOCKPOOL,
                interface_name=cbj_obj.name,
                secret_name=secret_obj.name)

            # Get the list of monitoring pods
            pods_list = get_all_pods(
                namespace=defaults.OCS_MONITORING_NAMESPACE,
                selector=['prometheus', 'alertmanager'])

            # Create configmap cluster-monitoring-config
            create_configmap_cluster_monitoring_pod(sc_obj.name)

            # Take some time to respin the pod
            waiting_time = 30
            logger.info(f"Waiting {waiting_time} seconds...")
            time.sleep(waiting_time)

            # Validate the pods are respinned and in running state
            validate_pods_are_respinned_and_running_state(pods_list)

            # Validate the pvc is created on monitoring pods
            validate_pvc_created_and_bound_on_monitoring_pods()

            # Validate the PVCs are mounted on the monitoring pods
            validate_pvc_are_mounted_on_monitoring_pods(pods_list)

        # Verify health of ceph cluster
        # TODO: move destroy cluster logic to new CLI usage pattern?
        logger.info("Done creating rook resources, waiting for HEALTH_OK")
        assert ceph_health_check(namespace=self.namespace)
        # patch gp2/thin storage class as 'non-default'
        self.patch_default_sc_to_non_default()
Example #7
File: ocs.py  Project: tiffanyn108/ocs-ci
def ocs_install_verification(timeout=600, skip_osd_distribution_check=False):
    """
    Perform steps necessary to verify a successful OCS installation

    Args:
        timeout (int): Number of seconds for timeout which will be used in the
            checks used in this function.
        skip_osd_distribution_check (bool): If true skip the check for osd
            distribution.

    """
    from ocs_ci.ocs.node import get_typed_nodes
    from ocs_ci.ocs.resources.pvc import get_deviceset_pvcs
    from ocs_ci.ocs.resources.pod import get_ceph_tools_pod, get_all_pods
    number_of_worker_nodes = len(get_typed_nodes())
    namespace = config.ENV_DATA['cluster_namespace']
    log.info("Verifying OCS installation")

    # Verify OCS CSV is in Succeeded phase
    log.info("verifying ocs csv")
    operator_selector = get_selector_for_ocs_operator()
    ocs_package_manifest = PackageManifest(
        resource_name=defaults.OCS_OPERATOR_NAME,
        selector=operator_selector,
    )
    ocs_csv_name = ocs_package_manifest.get_current_csv()
    ocs_csv = CSV(resource_name=ocs_csv_name, namespace=namespace)
    log.info(f"Check if OCS operator: {ocs_csv_name} is in Succeeded phase.")
    ocs_csv.wait_for_phase(phase="Succeeded", timeout=timeout)

    # Verify OCS Cluster Service (ocs-storagecluster) is Ready
    storage_cluster_name = config.ENV_DATA['storage_cluster_name']
    log.info("Verifying status of storage cluster: %s", storage_cluster_name)
    storage_cluster = StorageCluster(
        resource_name=storage_cluster_name,
        namespace=namespace,
    )
    log.info(f"Check if StorageCluster: {storage_cluster_name} is in"
             f"Succeeded phase")
    storage_cluster.wait_for_phase(phase='Ready', timeout=timeout)

    # Verify pods in running state and proper counts
    log.info("Verifying pod states and counts")
    pod = OCP(kind=constants.POD, namespace=namespace)
    # ocs-operator
    assert pod.wait_for_resource(condition=constants.STATUS_RUNNING,
                                 selector=constants.OCS_OPERATOR_LABEL,
                                 timeout=timeout)
    # rook-ceph-operator
    assert pod.wait_for_resource(condition=constants.STATUS_RUNNING,
                                 selector=constants.OPERATOR_LABEL,
                                 timeout=timeout)
    # noobaa
    assert pod.wait_for_resource(condition=constants.STATUS_RUNNING,
                                 selector=constants.NOOBAA_APP_LABEL,
                                 resource_count=2,
                                 timeout=timeout)
    # mons
    assert pod.wait_for_resource(condition=constants.STATUS_RUNNING,
                                 selector=constants.MON_APP_LABEL,
                                 resource_count=3,
                                 timeout=timeout)
    # csi-cephfsplugin
    assert pod.wait_for_resource(condition=constants.STATUS_RUNNING,
                                 selector=constants.CSI_CEPHFSPLUGIN_LABEL,
                                 resource_count=number_of_worker_nodes,
                                 timeout=timeout)
    # csi-cephfsplugin-provisioner
    assert pod.wait_for_resource(
        condition=constants.STATUS_RUNNING,
        selector=constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL,
        resource_count=2,
        timeout=timeout)
    # csi-rbdplugin
    assert pod.wait_for_resource(condition=constants.STATUS_RUNNING,
                                 selector=constants.CSI_RBDPLUGIN_LABEL,
                                 resource_count=number_of_worker_nodes,
                                 timeout=timeout)
    # csi-rbdplugin-provisioner
    assert pod.wait_for_resource(
        condition=constants.STATUS_RUNNING,
        selector=constants.CSI_RBDPLUGIN_PROVISIONER_LABEL,
        resource_count=2,
        timeout=timeout)
    # osds
    osd_count = (
        int(storage_cluster.data['spec']['storageDeviceSets'][0]['count']) *
        int(storage_cluster.data['spec']['storageDeviceSets'][0]['replica']))
    assert pod.wait_for_resource(condition=constants.STATUS_RUNNING,
                                 selector=constants.OSD_APP_LABEL,
                                 resource_count=osd_count,
                                 timeout=timeout)
    # mgr
    assert pod.wait_for_resource(condition=constants.STATUS_RUNNING,
                                 selector=constants.MGR_APP_LABEL,
                                 timeout=timeout)
    # mds
    assert pod.wait_for_resource(condition=constants.STATUS_RUNNING,
                                 selector=constants.MDS_APP_LABEL,
                                 resource_count=2,
                                 timeout=timeout)

    # rgw check only for VMware
    if config.ENV_DATA.get('platform') == constants.VSPHERE_PLATFORM:
        assert pod.wait_for_resource(condition=constants.STATUS_RUNNING,
                                     selector=constants.RGW_APP_LABEL,
                                     resource_count=1,
                                     timeout=timeout)

    # Verify ceph health
    log.info("Verifying ceph health")
    assert utils.ceph_health_check(namespace=namespace)

    # Verify StorageClasses (1 ceph-fs, 1 ceph-rbd)
    log.info("Verifying storage classes")
    storage_class = OCP(kind=constants.STORAGECLASS, namespace=namespace)
    storage_cluster_name = config.ENV_DATA['storage_cluster_name']
    required_storage_classes = {
        f'{storage_cluster_name}-cephfs', f'{storage_cluster_name}-ceph-rbd'
    }
    storage_classes = storage_class.get()
    storage_class_names = {
        item['metadata']['name']
        for item in storage_classes['items']
    }
    assert required_storage_classes.issubset(storage_class_names)

    # Verify OSDs are distributed
    if not skip_osd_distribution_check:
        log.info("Verifying OSDs are distributed evenly across worker nodes")
        ocp_pod_obj = OCP(kind=constants.POD, namespace=namespace)
        osds = ocp_pod_obj.get(selector=constants.OSD_APP_LABEL)['items']
        node_names = [osd['spec']['nodeName'] for osd in osds]
        for node in node_names:
            assert not node_names.count(node) > 1, (
                "OSDs are not distributed evenly across worker nodes")

    # Verify that CSI driver object contains provisioner names
    log.info("Verifying CSI driver object contains provisioner names.")
    csi_driver = OCP(kind="CSIDriver")
    assert {defaults.CEPHFS_PROVISIONER, defaults.RBD_PROVISIONER} == ({
        item['metadata']['name']
        for item in csi_driver.get()['items']
    })

    # Verify node and provisioner secret names in storage class
    log.info("Verifying node and provisioner secret names in storage class.")
    sc_rbd = storage_class.get(
        resource_name=constants.DEFAULT_STORAGECLASS_RBD)
    sc_cephfs = storage_class.get(
        resource_name=constants.DEFAULT_STORAGECLASS_CEPHFS)
    assert sc_rbd['parameters'][
        'csi.storage.k8s.io/node-stage-secret-name'] == constants.RBD_NODE_SECRET
    assert sc_rbd['parameters'][
        'csi.storage.k8s.io/provisioner-secret-name'] == constants.RBD_PROVISIONER_SECRET
    assert sc_cephfs['parameters'][
        'csi.storage.k8s.io/node-stage-secret-name'] == constants.CEPHFS_NODE_SECRET
    assert sc_cephfs['parameters'][
        'csi.storage.k8s.io/provisioner-secret-name'] == constants.CEPHFS_PROVISIONER_SECRET
    log.info("Verified node and provisioner secret names in storage class.")

    # Verify ceph osd tree output
    log.info(
        "Verifying ceph osd tree output and checking for device set PVC names "
        "in the output.")
    deviceset_pvcs = [pvc.name for pvc in get_deviceset_pvcs()]
    ct_pod = get_ceph_tools_pod()
    osd_tree = ct_pod.exec_ceph_cmd(ceph_cmd='ceph osd tree', format='json')
    schemas = {
        'root': constants.OSD_TREE_ROOT,
        'rack': constants.OSD_TREE_RACK,
        'host': constants.OSD_TREE_HOST,
        'osd': constants.OSD_TREE_OSD,
        'region': constants.OSD_TREE_REGION,
        'zone': constants.OSD_TREE_ZONE
    }
    schemas['host']['properties']['name'] = {'enum': deviceset_pvcs}
    for item in osd_tree['nodes']:
        validate(instance=item, schema=schemas[item['type']])
        if item['type'] == 'host':
            deviceset_pvcs.remove(item['name'])
    assert not deviceset_pvcs, (
        f"These device set PVCs are not given in ceph osd tree output "
        f"- {deviceset_pvcs}")
    log.info(
        "Verified ceph osd tree output. Device set PVC names are given in the "
        "output.")

    # TODO: Verify ceph osd tree output have osd listed as ssd
    # TODO: Verify ceph osd tree output have zone or rack based on AZ

    # Verify CSI snapshotter sidecar container is not present
    log.info("Verifying CSI snapshotter is not present.")
    provisioner_pods = get_all_pods(
        namespace=defaults.ROOK_CLUSTER_NAMESPACE,
        selector=[
            constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL,
            constants.CSI_RBDPLUGIN_PROVISIONER_LABEL
        ])
    for pod_obj in provisioner_pods:
        pod_info = pod_obj.get()
        for container, image in get_images(data=pod_info).items():
            assert ('snapshot' not in container) and (
                'snapshot' not in image), (
                    f"Snapshot container is present in {pod_obj.name} pod. "
                    f"Container {container}. Image {image}")
    assert {
        'name': 'CSI_ENABLE_SNAPSHOTTER',
        'value': 'false'
    } in (ocs_csv.get()['spec']['install']['spec']['deployments'][0]['spec']
          ['template']['spec']['containers'][0]['env']
          ), "CSI_ENABLE_SNAPSHOTTER value is not set to 'false'."
    log.info("Verified: CSI snapshotter is not present.")

    # Verify pool crush rule is with "type": "zone"
    if utils.get_az_count() == 3:
        log.info("Verifying pool crush rule is with type: zone")
        crush_dump = ct_pod.exec_ceph_cmd(ceph_cmd='ceph osd crush dump',
                                          format='')
        pool_names = [
            constants.METADATA_POOL, constants.DEFAULT_BLOCKPOOL,
            constants.DATA_POOL
        ]
        crush_rules = [
            rule for rule in crush_dump['rules']
            if rule['rule_name'] in pool_names
        ]
        for crush_rule in crush_rules:
            assert [
                item for item in crush_rule['steps']
                if item.get('type') == 'zone'
            ], f"{crush_rule['rule_name']} is not with type as zone"
        log.info("Verified - pool crush rule is with type: zone")
Example #8
    def test_upgrade_ocp(self, reduce_and_resume_cluster_load):
        """
        Tests OCS stability when upgrading OCP

        """

        ceph_cluster = CephCluster()
        with CephHealthMonitor(ceph_cluster):

            ocp_channel = config.UPGRADE.get("ocp_channel",
                                             ocp.get_ocp_upgrade_channel())
            ocp_upgrade_version = config.UPGRADE.get("ocp_upgrade_version")
            if not ocp_upgrade_version:
                ocp_upgrade_version = get_latest_ocp_version(
                    channel=ocp_channel)
                ocp_arch = config.UPGRADE["ocp_arch"]
                target_image = f"{ocp_upgrade_version}-{ocp_arch}"
            elif ocp_upgrade_version.endswith(".nightly"):
                target_image = expose_ocp_version(ocp_upgrade_version)

            logger.info(f"Target image; {target_image}")

            image_path = config.UPGRADE["ocp_upgrade_path"]
            cluster_operators = ocp.get_all_cluster_operators()
            logger.info(f" oc version: {ocp.get_current_oc_version()}")
            # Verify Upgrade subscription channel:
            ocp.patch_ocp_upgrade_channel(ocp_channel)
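            # TimeoutSampler repeatedly calls the wrapped function and yields
            # its return value; the loop breaks on the first truthy sample,
            # i.e. once the subscription channel matches ocp_channel.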
            for sampler in TimeoutSampler(
                    timeout=250,
                    sleep=15,
                    func=ocp.verify_ocp_upgrade_channel,
                    channel_variable=ocp_channel,
            ):
                if sampler:
                    logger.info(f"OCP Channel:{ocp_channel}")
                    break

            # Upgrade OCP
            logger.info(f"full upgrade path: {image_path}:{target_image}")
            ocp.upgrade_ocp(image=target_image, image_path=image_path)

            # Wait for upgrade
            for ocp_operator in cluster_operators:
                logger.info(f"Checking upgrade status of {ocp_operator}:")
                # ############ Workaround for issue 2624 #######
                name_changed_between_versions = (
                    "service-catalog-apiserver",
                    "service-catalog-controller-manager",
                )
                if ocp_operator in name_changed_between_versions:
                    logger.info(f"{ocp_operator} upgrade will not be verified")
                    continue
                # ############ End of Workaround ###############
                ver = ocp.get_cluster_operator_version(ocp_operator)
                logger.info(f"current {ocp_operator} version: {ver}")
                for sampler in TimeoutSampler(
                        timeout=2700,
                        sleep=60,
                        func=ocp.confirm_cluster_operator_version,
                        target_version=target_image,
                        cluster_operator=ocp_operator,
                ):
                    if sampler:
                        logger.info(f"{ocp_operator} upgrade completed!")
                        break
                    else:
                        logger.info(
                            f"{ocp_operator} upgrade has not completed yet!")

            # post upgrade validation: check cluster operator status
            cluster_operators = ocp.get_all_cluster_operators()
            for ocp_operator in cluster_operators:
                logger.info(f"Checking cluster status of {ocp_operator}")
                for sampler in TimeoutSampler(
                        timeout=2700,
                        sleep=60,
                        func=ocp.verify_cluster_operator_status,
                        cluster_operator=ocp_operator,
                ):
                    if sampler:
                        break
                    else:
                        logger.info(f"{ocp_operator} status is not valid")
            # Post upgrade validation: check cluster version status
            logger.info("Checking clusterversion status")
            for sampler in TimeoutSampler(
                    timeout=900,
                    sleep=15,
                    func=ocp.validate_cluster_version_status):
                if sampler:
                    logger.info("Upgrade Completed Successfully!")
                    break

        # load new config file
        self.load_ocp_version_config_file(ocp_upgrade_version)

        new_ceph_cluster = CephCluster()
        new_ceph_cluster.wait_for_rebalance(timeout=1800)
        ceph_health_check(tries=90, delay=30)
Example #9
    def test_add_capacity_with_resource_delete(
        self,
        workload_storageutilization_rbd,
        resource_name,
        resource_id,
        is_kill_resource_repeatedly,
    ):
        """
        The function gets the resource name and id, adds capacity to the
        cluster, and then deletes the resource while the storage capacity
        is being increased.

        Args:
            resource_name (str): the name of the resource to delete
            resource_id (int): the id of the resource to delete
            is_kill_resource_repeatedly (bool): If True, kill the resource
                repeatedly; if False, delete the resource only once.

        """
        used_percentage = get_percent_used_capacity()
        logging.info(
            f"storageutilization is completed. used capacity = {used_percentage}"
        )

        osd_pods_before = pod_helpers.get_osd_pods()
        number_of_osd_pods_before = len(osd_pods_before)
        if number_of_osd_pods_before >= constants.MAX_OSDS:
            pytest.skip("We have maximum of OSDs in the cluster")

        d = Disruptions()
        d.set_resource(resource_name)

        self.new_pods_in_status_running = False

        osd_size = storage_cluster.get_osd_size()
        logging.info(f"Adding one new set of OSDs. osd size = {osd_size}")
        storagedeviceset_count = storage_cluster.add_capacity(osd_size)
        logging.info("Adding one new set of OSDs was issued without problems")

        # Wait for the new OSDs to come up. Once the first new OSD reaches the
        # Init status, delete the resource. After deleting the resource we
        # expect all the new OSDs, as well as the deleted resource's new pod,
        # to be in Running status.
        pod_helpers.wait_for_new_osd_pods_to_come_up(number_of_osd_pods_before)
        logging.info(
            f"Delete a {resource_name} pod while storage capacity is getting increased"
        )
        if is_kill_resource_repeatedly:
            with ThreadPoolExecutor() as executor:
                executor.submit(self.kill_resource_repeatedly, resource_name,
                                resource_id)
                self.wait_for_osd_pods_to_be_running(storagedeviceset_count)
        else:
            d.delete_resource(resource_id)
            self.wait_for_osd_pods_to_be_running(storagedeviceset_count)

        self.new_pods_in_status_running = True
        logging.info(
            "Finished verifying add capacity when one of the pods gets deleted"
        )
        logging.info("Waiting for ceph health check to finished...")
        ceph_health_check(namespace=config.ENV_DATA["cluster_namespace"],
                          tries=90)
        ceph_cluster_obj = CephCluster()
        assert ceph_cluster_obj.wait_for_rebalance(
            timeout=1800), "Data re-balance failed to complete"
Example #10
    def disruptive_base(self, interface, operation_to_disrupt,
                        resource_to_delete):
        """
        Base function for disruptive tests.
        Deletion of 'resource_to_delete' will be introduced while
        'operation_to_disrupt' is progressing.
        """
        pod_functions = {
            'mds': get_mds_pods,
            'mon': get_mon_pods,
            'mgr': get_mgr_pods,
            'osd': get_osd_pods
        }
        disruption = disruption_helpers.Disruptions()
        disruption.set_resource(resource=resource_to_delete)
        executor = ThreadPoolExecutor(max_workers=1)

        # Get number of pods of type 'resource_to_delete'
        num_of_resource_to_delete = len(pod_functions[resource_to_delete]())

        # Fetch the number of Pods and PVCs
        initial_num_of_pods = len(get_all_pods(namespace=self.namespace))
        initial_num_of_pvc = len(
            get_all_pvcs(namespace=self.namespace)['items'])

        # Fetch PV names
        pv_objs = []
        for pvc_obj in self.pvc_objs:
            pvc_obj.reload()
            pv_objs.append(pvc_obj.backed_pv_obj)

        # Fetch volume details from pods for the purpose of verification
        node_pv_dict = {}
        for pod_obj in self.pod_objs:
            pod_info = pod_obj.get()
            node = pod_info['spec']['nodeName']
            pvc = pod_info['spec']['volumes'][0]['persistentVolumeClaim'][
                'claimName']
            for pvc_obj in self.pvc_objs:
                if pvc_obj.name == pvc:
                    pvc_obj.reload()
                    pv = pvc_obj.backed_pv
                    break
            if node in node_pv_dict:
                node_pv_dict[node].append(pv)
            else:
                node_pv_dict[node] = [pv]

        # Do setup for running IO on pods
        log.info("Setting up pods for running IO")
        for pod_obj in self.pod_objs:
            pod_obj.workload_setup(storage_type='fs')
        log.info("Setup for running IO is completed on pods")

        # Start IO on each pod. RWX PVC will be used on two pods. So split the
        # size accordingly
        log.info("Starting IO on pods")
        for pod_obj in self.pod_objs:
            if pod_obj.pvc.access_mode == constants.ACCESS_MODE_RWX:
                io_size = int((self.pvc_size - 1) / 2)
            else:
                io_size = self.pvc_size - 1
            pod_obj.run_io(storage_type='fs',
                           size=f'{io_size}G',
                           fio_filename=f'{pod_obj.name}_io')
        log.info("IO started on all pods.")

        # Start deleting pods
        pod_bulk_delete = executor.submit(self.delete_pods)

        if operation_to_disrupt == 'delete_pods':
            ret = self.verify_resource_deletion(get_all_pods,
                                                initial_num_of_pods)
            assert ret, "Wait timeout: Pods are not being deleted."
            logging.info(f"Pods deletion has started.")
            disruption.delete_resource()

        pods_deleted = pod_bulk_delete.result()

        assert pods_deleted, "Deletion of pods failed."

        # Verify pods are deleted
        for pod_obj in self.pod_objs:
            assert pod_obj.ocp.wait_for_delete(
                pod_obj.name), (f"Pod {pod_obj.name} is not deleted")
        logging.info("Verified: Pods are deleted.")

        # Verify that the mount point is removed from nodes after deleting pod
        for node, pvs in node_pv_dict.items():
            cmd = f'oc debug nodes/{node} -- df'
            df_on_node = run_cmd(cmd)
            for pv in pvs:
                assert pv not in df_on_node, (
                    f"{pv} is still present on node {node} after "
                    f"deleting the pods.")
        log.info(
            "Verified: mount points are removed from nodes after deleting "
            "the pods")

        # Fetch image uuid associated with PVCs
        pvc_uuid_map = {}
        for pvc_obj in self.pvc_objs:
            pvc_uuid_map[pvc_obj.name] = pvc_obj.image_uuid
        log.info("Fetched image uuid associated with each PVC")

        # Start deleting PVCs
        pvc_bulk_delete = executor.submit(delete_pvcs, self.pvc_objs)

        if operation_to_disrupt == 'delete_pvcs':
            ret = self.verify_resource_deletion(get_all_pvcs,
                                                initial_num_of_pvc)
            assert ret, "Wait timeout: PVCs are not being deleted."
            logging.info(f"PVCs deletion has started.")
            disruption.delete_resource()

        pvcs_deleted = pvc_bulk_delete.result()

        assert pvcs_deleted, "Deletion of PVCs failed."

        # Verify PVCs are deleted
        for pvc_obj in self.pvc_objs:
            assert pvc_obj.ocp.wait_for_delete(
                pvc_obj.name), (f"PVC {pvc_obj.name} is not deleted")
        logging.info("Verified: PVCs are deleted.")

        # Verify PVs are deleted
        for pv_obj in pv_objs:
            assert pv_obj.ocp.wait_for_delete(
                pv_obj.name), (f"PV {pv_obj.name} is not deleted")
        logging.info("Verified: PVs are deleted.")

        # Verify PV using ceph toolbox. Image/Subvolume should be deleted.
        for pvc_name, uuid in pvc_uuid_map.items():
            if interface == constants.CEPHBLOCKPOOL:
                ret = verify_volume_deleted_in_backend(
                    interface=interface,
                    image_uuid=uuid,
                    pool_name=self.sc_obj.ceph_pool.name)
            elif interface == constants.CEPHFILESYSTEM:
                ret = verify_volume_deleted_in_backend(interface=interface,
                                                       image_uuid=uuid)
            assert ret, (f"Volume associated with PVC {pvc_name} still exists "
                         f"in backend")

        # Verify number of pods of type 'resource_to_delete'
        final_num_resource_to_delete = len(pod_functions[resource_to_delete]())
        assert final_num_resource_to_delete == num_of_resource_to_delete, (
            f"Total number of {resource_to_delete} pods is not matching with "
            f"initial value. Total number of pods before deleting a pod: "
            f"{num_of_resource_to_delete}. Total number of pods present now: "
            f"{final_num_resource_to_delete}")

        # Check ceph status
        ceph_health_check(namespace=config.ENV_DATA['cluster_namespace'])
        log.info("Ceph cluster health is OK")
Example #11
    def test_bulk_clone_performance(self, namespace, tmp_path):
        """
        Creates a number of PVCs in bulk using a kube job,
        writes 60% of the PVC capacity to each one of the created PVCs,
        creates one clone per PVC, all of them together in a bulk,
        and measures the time taken for the bulk clone creation.

        """
        pvc_count = 50
        vol_size = "5Gi"
        job_pod_file, job_pvc_file, job_clone_file = [None, None, None]
        log.info(f"Start creating {self.interface} {pvc_count} PVC")
        if self.interface == constants.CEPHBLOCKPOOL:
            sc_name = constants.DEFAULT_STORAGECLASS_RBD
            clone_yaml = constants.CSI_RBD_PVC_CLONE_YAML
        elif self.interface == constants.CEPHFILESYSTEM:
            sc_name = constants.DEFAULT_STORAGECLASS_CEPHFS
            clone_yaml = constants.CSI_CEPHFS_PVC_CLONE_YAML

        try:
            pvc_dict_list = scale_lib.construct_pvc_creation_yaml_bulk_for_kube_job(
                no_of_pvc=pvc_count,
                access_mode=constants.ACCESS_MODE_RWO,
                sc_name=sc_name,
                pvc_size=vol_size,
            )

            job_pvc_file = ObjectConfFile(
                name="job_profile_pvc",
                obj_dict_list=pvc_dict_list,
                project=self.namespace,
                tmp_path=tmp_path,
            )

            # Create kube_job
            job_pvc_file.create(namespace=self.namespace)

            # Check all the PVC reached Bound state
            pvc_bound_list = scale_lib.check_all_pvc_reached_bound_state_in_kube_job(
                kube_job_obj=job_pvc_file,
                namespace=self.namespace,
                no_of_pvc=pvc_count,
            )

            logging.info(f"Number of PVCs in Bound state {len(pvc_bound_list)}")

            # Kube_job to Create pod
            pod_dict_list = scale_lib.attach_multiple_pvc_to_pod_dict(
                pvc_list=pvc_bound_list,
                namespace=self.namespace,
                pvcs_per_pod=1,
                start_io=False,
                pod_yaml=constants.NGINX_POD_YAML,
            )
            job_pod_file = ObjectConfFile(
                name="job_profile_pod",
                obj_dict_list=pod_dict_list,
                project=self.namespace,
                tmp_path=tmp_path,
            )
            job_pod_file.create(namespace=self.namespace)

            # Check all PODs in Running state
            scale_lib.check_all_pod_reached_running_state_in_kube_job(
                kube_job_obj=job_pod_file,
                namespace=self.namespace,
                no_of_pod=len(pod_dict_list),
                timeout=90,
            )
            logging.info(f"Number of PODs in Running state {len(pod_dict_list)}")

            total_files_size = self.run_fio_on_pvcs(vol_size)

            clone_dict_list = scale_lib.construct_pvc_clone_yaml_bulk_for_kube_job(
                pvc_dict_list, clone_yaml, sc_name
            )

            logging.info("Created clone dict list")

            job_clone_file = ObjectConfFile(
                name="job_profile_clone",
                obj_dict_list=clone_dict_list,
                project=self.namespace,
                tmp_path=tmp_path,
            )

            # Create kube_job that creates clones
            job_clone_file.create(namespace=self.namespace)

            logging.info("Going to check bound status for clones")
            # Check all the clones reached Bound state
            clone_bound_list = scale_lib.check_all_pvc_reached_bound_state_in_kube_job(
                kube_job_obj=job_clone_file,
                namespace=self.namespace,
                no_of_pvc=pvc_count,
                timeout=180,
            )

            logging.info(f"Number of clones in Bound state {len(clone_bound_list)}")

            clone_objs = []
            all_pvc_objs = pvc.get_all_pvc_objs(namespace=self.namespace)
            for clone_dict in clone_dict_list:
                name = clone_dict["metadata"]["name"]
                size = clone_dict["spec"]["resources"]["requests"]["storage"]
                logging.info(f"Clone {name} of size {size} created")
                for pvc_obj in all_pvc_objs:
                    if pvc_obj.name == name:
                        clone_objs.append(pvc_obj)

            assert len(clone_bound_list) == len(
                clone_objs
            ), "Not all clones reached BOUND state, cannot measure time"
            start_time = helpers.get_provision_time(
                self.interface, clone_objs, status="start"
            )
            end_time = helpers.get_provision_time(
                self.interface, clone_objs, status="end"
            )
            total_time = (end_time - start_time).total_seconds()
            speed = round(total_files_size / total_time, 2)
            logging.info(
                f"Total creation time = {total_time} secs, data size = {total_files_size} MB, speed = {speed} MB/sec "
                f"for {self.interface} clone in bulk of {pvc_count} clones."
            )

            # Produce ES report
            # Collecting environment information
            self.get_env_info()

            # Initialize the results doc file.
            full_results = self.init_full_results(
                ResultsAnalyse(
                    self.uuid,
                    self.crd_data,
                    self.full_log_path,
                    "bulk_clone_perf_fullres",
                )
            )

            full_results.add_key("interface", self.interface)
            full_results.add_key("bulk_size", pvc_count)
            full_results.add_key("clone_size", vol_size)
            full_results.add_key("bulk_creation_time", total_time)
            full_results.add_key("data_size(MB)", total_files_size)
            full_results.add_key("speed", speed)
            full_results.add_key("es_results_link", full_results.results_link())

            # Write the test results into the ES server
            full_results.es_write()
            # write the ES link to the test results in the test log.
            logging.info(f"The result can be found at : {full_results.results_link()}")

        # The finally block cleans up the resources created during the test.
        # It is executed irrespective of whether the try block passed or failed.
        finally:
            # Cleanup activities
            logging.info("Cleanup of all the resources created during test execution")
            if job_pod_file:
                job_pod_file.delete(namespace=self.namespace)
                job_pod_file.wait_for_delete(
                    resource_name=job_pod_file.name, namespace=self.namespace
                )

            if job_clone_file:
                job_clone_file.delete(namespace=self.namespace)
                job_clone_file.wait_for_delete(
                    resource_name=job_clone_file.name, namespace=self.namespace
                )

            if job_pvc_file:
                job_pvc_file.delete(namespace=self.namespace)
                job_pvc_file.wait_for_delete(
                    resource_name=job_pvc_file.name, namespace=self.namespace
                )

            # Check ceph health status
            utils.ceph_health_check(tries=20)
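
# Minimal worked example (with made-up timestamps and data size) of the speed
# calculation done at the end of the bulk clone test above: elapsed seconds
# between the earliest provisioning "start" and the latest "end", then MB/sec
# for the data written by fio.
from datetime import datetime

start_time = datetime(2024, 1, 1, 10, 0, 0)       # example value
end_time = datetime(2024, 1, 1, 10, 4, 10)        # example value
total_files_size = 150_000                        # MB, example value

total_time = (end_time - start_time).total_seconds()
speed = round(total_files_size / total_time, 2)
print(f"Total creation time = {total_time} secs, speed = {speed} MB/sec")
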
        def finalizer():
            helpers.remove_label_from_worker_node(node_list=worker_nodes,
                                                  label_key="nodetype")

            # Check ceph health
            ceph_health_check(tries=80)
    def test_all_worker_nodes_short_network_failure(self, nodes, setup,
                                                    node_restart_teardown):
        """
        OCS-1432/OCS-1433:
        - Start DeploymentConfig based app pods
        - Make all the worker nodes unresponsive by doing abrupt network failure
        - Reboot the unresponsive node after short duration of ~300 seconds
        - When unresponsive node recovers, app pods and ceph cluster should recover
        - Again run IOs from app pods
        """
        pod_objs = setup
        worker_nodes = node.get_worker_nodes()

        # Run IO on pods
        logger.info(f"Starting IO on {len(pod_objs)} app pods")
        with ThreadPoolExecutor() as executor:
            for pod_obj in pod_objs:
                logger.info(f"Starting IO on pod {pod_obj.name}")
                storage_type = "block" if pod_obj.pvc.volume_mode == "Block" else "fs"
                executor.submit(
                    pod_obj.run_io,
                    storage_type=storage_type,
                    size="2G",
                    runtime=30,
                    fio_filename=f"{pod_obj.name}_io_f1",
                )

        logger.info(f"IO started on all {len(pod_objs)} app pods")

        # Wait for IO results
        for pod_obj in pod_objs:
            pod.get_fio_rw_iops(pod_obj)

        # Induce network failure on all worker nodes
        with ThreadPoolExecutor() as executor:
            for node_name in worker_nodes:
                executor.submit(node.node_network_failure, node_name, False)

        node.wait_for_nodes_status(node_names=worker_nodes,
                                   status=constants.NODE_NOT_READY)

        logger.info(f"Waiting for {self.short_nw_fail_time} seconds")
        sleep(self.short_nw_fail_time)

        # Reboot the worker nodes
        logger.info(f"Stop and start the worker nodes: {worker_nodes}")
        nodes.restart_nodes_by_stop_and_start(node.get_node_objs(worker_nodes))

        try:
            node.wait_for_nodes_status(node_names=worker_nodes,
                                       status=constants.NODE_READY)
            logger.info(
                "Verifying StorageCluster pods are in running/completed state")
            pod.wait_for_storage_pods(timeout=720)
        except ResourceWrongStatusException:
            # Restart nodes
            nodes.restart_nodes(node.get_node_objs(worker_nodes))

        assert ceph_health_check(tries=80), "Ceph cluster health is not OK"
        logger.info("Ceph cluster health is OK")

        # Get current info of app pods
        new_pod_objs = list()
        for pod_obj in pod_objs:
            pod_label = pod_obj.labels.get("deploymentconfig")
            pods_data = pod.get_pods_having_label(
                f"deploymentconfig={pod_label}", pod_obj.namespace)
            current_pods = [
                pod_data.get("metadata").get("name") for pod_data in pods_data
                if "-deploy" not in pod_data.get("metadata").get("name")
            ]
            logger.info(f"Pods with label {pod_label}: {current_pods}")

            # Remove the older pod from the list if pod is rescheduled
            if len(current_pods) > 1:
                current_pods.remove(pod_obj.name)

            new_pod_obj = pod.get_pod_obj(current_pods.pop(),
                                          pod_obj.namespace)
            new_pod_obj.pvc = pod_obj.pvc
            new_pod_objs.append(new_pod_obj)

        logger.info("Wait for app pods are in running state")
        for pod_obj in new_pod_objs:
            pod_obj.ocp.wait_for_resource(
                condition=constants.STATUS_RUNNING,
                resource_name=pod_obj.name,
                timeout=720,
                sleep=20,
            )
        logger.info("All the app pods reached running state")

        # Run more IOs on app pods
        with ThreadPoolExecutor() as executor:
            for pod_obj in new_pod_objs:
                logger.info(f"Starting IO on pod {pod_obj.name}")
                pod_obj.wl_setup_done = False
                storage_type = "block" if pod_obj.pvc.volume_mode == "Block" else "fs"
                executor.submit(
                    pod_obj.run_io,
                    storage_type=storage_type,
                    size="1G",
                    runtime=30,
                    fio_filename=f"{pod_obj.name}_io_f2",
                )

        for pod_obj in new_pod_objs:
            pod.get_fio_rw_iops(pod_obj)
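
# Self-contained sketch of the pod re-resolution logic used above after the
# network outage: given all pod names carrying the deploymentconfig label,
# drop the deployer pods ("<dc>-deploy") and the old pod name, keeping the
# rescheduled app pod.
def resolve_new_pod(pod_names, old_pod_name):
    candidates = [name for name in pod_names if "-deploy" not in name]
    if len(candidates) > 1 and old_pod_name in candidates:
        candidates.remove(old_pod_name)
    return candidates.pop()


# Example: the DeploymentConfig rescheduled the app pod on a recovered node.
print(resolve_new_pod(
    ["app-1-deploy", "app-1-abcde", "app-1-fghij"], "app-1-abcde"))
# -> app-1-fghij
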
    def test_scale_osds_reboot_nodes(self, interface, project_factory,
                                     multi_pvc_factory, dc_pod_factory):
        """
        Check storage utilization; if it is below the threshold, run IO.
        Scale OSDs from 3 to 6, check for rebalance and reboot worker nodes.
        """
        current_osd_count = count_cluster_osd()
        proj_obj = project_factory()
        if current_osd_count == 3:
            while not validate_osd_utilization(osd_used=10):
                # Create pvc
                pvc_objs = multi_pvc_factory(project=proj_obj,
                                             interface=interface,
                                             size=self.pvc_size,
                                             num_of_pvc=self.num_of_pvcs)

                dc_pod_objs = list()
                for pvc_obj in pvc_objs:
                    dc_pod_objs.append(dc_pod_factory(pvc=pvc_obj))

                wait_for_dc_app_pods_to_reach_running_state(dc_pod_objs,
                                                            timeout=1200)

                for pod_obj in dc_pod_objs:
                    pod_obj.run_io(storage_type='fs',
                                   size='3G',
                                   runtime='60',
                                   fio_filename=f'{pod_obj.name}_io')

        # Add capacity
        osd_size = storage_cluster.get_osd_size()
        count = storage_cluster.add_capacity(osd_size)
        pod = OCP(kind=constants.POD,
                  namespace=config.ENV_DATA['cluster_namespace'])
        pod.wait_for_resource(timeout=300,
                              condition=constants.STATUS_RUNNING,
                              selector='app=rook-ceph-osd',
                              resource_count=count * 3)
        assert ceph_health_check(
            delay=120, tries=50), "New OSDs failed to reach running state"

        cluster = CephCluster()

        # Get rebalance status
        rebalance_status = cluster.get_rebalance_status()
        logger.info(rebalance_status)
        if rebalance_status:
            time_taken = cluster.time_taken_to_complete_rebalance()
            logger.info(f"The time taken to complete rebalance {time_taken}")

        # Rolling reboot on worker nodes
        worker_nodes = get_typed_nodes(node_type='worker')

        factory = platform_nodes.PlatformNodesFactory()
        nodes = factory.get_nodes_platform()

        for node in worker_nodes:
            nodes.restart_nodes(nodes=[node])
            wait_for_nodes_status()

        assert ceph_health_check(
            delay=180), "Failed, Ceph health bad after nodes reboot"
Example #15
def measure_corrupt_pg(measurement_dir):
    """
    Create a Ceph pool and corrupt a Placement Group on one of the OSDs,
    measure the time when it was corrupted and record the alerts that were
    triggered during this event.

    Returns:
        dict: Contains information about `start` and `stop` time for
        corrupting Ceph Placement Group
    """
    oc = ocp.OCP(kind=constants.DEPLOYMENT,
                 namespace=config.ENV_DATA.get("cluster_namespace"))
    osd_deployments = oc.get(selector=constants.OSD_APP_LABEL).get("items")
    osd_deployment = osd_deployments[0].get("metadata").get("name")
    ct_pod = pod.get_ceph_tools_pod()
    pool_name = helpers.create_unique_resource_name("corrupted", "pool")
    ct_pod.exec_ceph_cmd(f"ceph osd pool create {pool_name} 1 1")
    logger.info("Setting osd noout flag")
    ct_pod.exec_ceph_cmd("ceph osd set noout")
    logger.info(f"Put object into {pool_name}")
    pool_object = "test_object"
    ct_pod.exec_ceph_cmd(f"rados -p {pool_name} put {pool_object} /etc/passwd")
    logger.info(f"Looking for Placement Group with {pool_object} object")
    pg = ct_pod.exec_ceph_cmd(
        f"ceph osd map {pool_name} {pool_object}")["pgid"]
    logger.info(f"Found Placement Group: {pg}")

    dummy_deployment, dummy_pod = helpers.create_dummy_osd(osd_deployment)

    def corrupt_pg():
        """
        Corrupt PG on one OSD in Ceph pool for 12 minutes and measure it.
        There should be only a CephPGRepairTakingTooLong alert in Pending
        state, as it takes 2 hours for it to become Firing.
        This alert configuration can be observed in ceph-mixins, which
        is used in the project:
            https://github.com/ceph/ceph-mixins/blob/d22afe8c0da34490cb77e52a202eefcf4f62a869/config.libsonnet#L23
        There should also be a CephClusterErrorState alert, which takes 10
        minutes to start firing.

        Returns:
            str: Name of corrupted deployment
        """
        # run_time of operation
        run_time = 60 * 12
        nonlocal oc
        nonlocal pool_name
        nonlocal pool_object
        nonlocal dummy_pod
        nonlocal pg
        nonlocal osd_deployment
        nonlocal dummy_deployment

        logger.info(f"Corrupting {pg} PG on {osd_deployment}")
        dummy_pod.exec_sh_cmd_on_pod(
            f"ceph-objectstore-tool --data-path /var/lib/ceph/osd/ceph-"
            f"{osd_deployment.split('-')[-1]} --pgid {pg} {pool_object} "
            f"set-bytes /etc/shadow --no-mon-config")
        logger.info("Unsetting osd noout flag")
        ct_pod.exec_ceph_cmd("ceph osd unset noout")
        ct_pod.exec_ceph_cmd(f"ceph pg deep-scrub {pg}")
        oc.exec_oc_cmd(f"scale --replicas=0 deployment/{dummy_deployment}")
        oc.exec_oc_cmd(f"scale --replicas=1 deployment/{osd_deployment}")
        logger.info(f"Waiting for {run_time} seconds")
        time.sleep(run_time)
        return osd_deployment

    test_file = os.path.join(measurement_dir, "measure_corrupt_pg.json")
    measured_op = measure_operation(corrupt_pg, test_file)
    logger.info(f"Deleting pool {pool_name}")
    ct_pod.exec_ceph_cmd(f"ceph osd pool delete {pool_name} {pool_name} "
                         f"--yes-i-really-really-mean-it")
    logger.info(f"Checking that pool {pool_name} is deleted")

    logger.info(f"Deleting deployment {dummy_deployment}")
    oc.delete(resource_name=dummy_deployment)

    # wait for ceph to return into HEALTH_OK state after osd deployment
    # is returned back to normal
    ceph_health_check(tries=20, delay=15)

    return measured_op
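
# Simplified stand-in for the measure_operation helper used above: run the
# operation, record wall-clock start/stop timestamps, and cache them as JSON
# so repeated runs reuse the first measurement (the real ocs-ci helper also
# collects the Prometheus alerts fired during the window).
import json
import os
import time


def measure_operation(operation, result_file):
    if os.path.exists(result_file):
        with open(result_file) as f:
            cached = json.load(f)
        cached["first_run"] = False
        return cached
    start = time.time()
    result = operation()
    stop = time.time()
    record = {"start": start, "stop": stop, "result": result, "first_run": True}
    with open(result_file, "w") as f:
        json.dump(record, f)
    return record
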
    def test_pvc_creation_after_del_mon_services(self, interface, pod_factory):
        """
        1. Delete one mon service
        2. Edit the configmap rook-ceph-mon-endpoints and
           remove all the deleted mon service entries
        3. Delete deployment, pvc of deleted mon service
        4. Restart rook-ceph-operator
        5. Make sure all mon pods are running
        6. Make sure ceph health is OK and storage pods are running
        7. Sleep for 300 seconds before deleting another mon
        8. Repeat above steps for all mons and at the
           end each mon should contain different endpoints
        9. Create PVC; creation should succeed.

        """

        pod_obj = pod_factory(interface=interface)
        run_io_in_bg(pod_obj)

        # Get all mon services
        mon_svc = get_services_by_label(
            label=constants.MON_APP_LABEL,
            namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
        )

        # Get all mon pods
        mon_pods = get_mon_pods()
        mon_count = len(mon_pods)

        list_old_svc = []
        for svc in mon_svc:

            # Get rook-ceph-operator pod obj
            operator_pod_obj = get_operator_pods()
            operator_name = operator_pod_obj[0].name

            # Scale down rook-ceph-operator
            log.info("Scale down rook-ceph-operator")
            assert modify_deployment_replica_count(
                deployment_name="rook-ceph-operator", replica_count=0
            ), "Failed to scale down rook-ceph-operator to 0"
            log.info("Successfully scaled down rook-ceph-operator to 0")

            # Validate rook-ceph-operator pod not running
            POD_OBJ.wait_for_delete(resource_name=operator_name)

            pvc_name = svc["metadata"]["labels"]["pvc_name"]
            cluster_ip = svc["spec"]["clusterIP"]
            port = svc["spec"]["ports"][0]["port"]
            mon_endpoint = f"{cluster_ip}:{port}"
            mon_id = svc["spec"]["selector"]["mon"]
            list_old_svc.append(cluster_ip)

            # Delete deployment
            log.info("Delete mon deployments")
            del_obj = OCP(
                kind=constants.DEPLOYMENT,
                namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
            )
            del_obj.delete(resource_name=pvc_name)

            # Delete pvc
            log.info("Delete mon PVC")
            pvc_obj = OCP(kind=constants.PVC,
                          namespace=constants.OPENSHIFT_STORAGE_NAMESPACE)
            pvc_obj.delete(resource_name=pvc_name)

            # Delete the mon service
            log.info("Delete mon service")
            svc_obj = OCP(kind=constants.SERVICE,
                          namespace=constants.OPENSHIFT_STORAGE_NAMESPACE)
            svc_obj.delete(resource_name=pvc_name)

            # Edit the cm
            log.info(f"Edit the configmap {constants.ROOK_CEPH_MON_ENDPOINTS}")
            configmap_obj = OCP(
                kind=constants.CONFIGMAP,
                namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
            )
            output_get = configmap_obj.get(
                resource_name=constants.ROOK_CEPH_MON_ENDPOINTS)
            new_data = output_get["data"]
            new_data["csi-cluster-config-json"] = (
                new_data["csi-cluster-config-json"].replace(
                    f'"{mon_endpoint}",', "") if
                new_data["csi-cluster-config-json"].find(f'"{mon_endpoint}",')
                != -1 else new_data["csi-cluster-config-json"].replace(
                    f',"{mon_endpoint}"', ""))
            new_data["data"] = ",".join([
                value for value in new_data["data"].split(",")
                if f"{mon_id}=" not in value
            ])
            new_data["mapping"] = (
                new_data["mapping"].replace(f'"{mon_id}":null,', "")
                if new_data["mapping"].find(f'"{mon_id}":null,') != -1 else
                new_data["mapping"].replace(f',"{mon_id}":null', ""))
            params = f'{{"data": {json.dumps(new_data)}}}'
            log.info(f"Removing {mon_id} entries from configmap")
            configmap_obj.patch(
                resource_name=constants.ROOK_CEPH_MON_ENDPOINTS,
                params=params,
                format_type="strategic",
            )
            log.info(
                f"Configmap {constants.ROOK_CEPH_MON_ENDPOINTS} edited successfully"
            )

            # Scale up rook-ceph-operator
            log.info("Scale up rook-ceph-operator")
            assert modify_deployment_replica_count(
                deployment_name="rook-ceph-operator",
                replica_count=1), "Failed to scale up rook-ceph-operator to 1"
            log.info("Successfully scaled up rook-ceph-operator to 1")
            log.info("Validate rook-ceph-operator pod is running")
            POD_OBJ.wait_for_resource(
                condition=constants.STATUS_RUNNING,
                selector=constants.OPERATOR_LABEL,
                resource_count=1,
                timeout=600,
                sleep=5,
            )

            # Validate all mons are running
            log.info("Validate all mons are up and running")
            POD_OBJ.wait_for_resource(
                condition=constants.STATUS_RUNNING,
                selector=constants.MON_APP_LABEL,
                resource_count=mon_count,
                timeout=1200,
                sleep=5,
            )
            log.info("All mons are up and running")

            # Check the ceph health OK
            ceph_health_check(tries=90, delay=15)

            # Validate all storage pods are running
            wait_for_storage_pods()

            # Sleep for some seconds before deleting another mon
            sleep_time = 300
            log.info(
                f"Waiting for {sleep_time} seconds before deleting another mon"
            )
            time.sleep(sleep_time)

        # Check the endpoints are different
        log.info("Validate the mon endpoints are changed")
        new_mon_svc = get_services_by_label(
            label=constants.MON_APP_LABEL,
            namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
        )
        list_new_svc = []
        for new_svc in new_mon_svc:
            cluster_ip = new_svc["spec"]["clusterIP"]
            list_new_svc.append(cluster_ip)
        diff = set(list_new_svc) ^ set(list_old_svc)
        assert len(diff) == len(list_old_svc + list_new_svc), (
            f"Not all endpoints are changed. Set of old "
            f"endpoints {list_old_svc} and new endpoints {list_new_svc}")
        log.info(f"All new mon endpoints are created {list_new_svc}")

        # Create PVC and pods
        log.info(f"Create {interface} PVC")
        pod_obj = pod_factory(interface=interface)
        pod_obj.run_io(storage_type="fs", size="500M")
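
# Self-contained illustration of the configmap surgery in the mon-deletion
# test above, using a toy rook-ceph-mon-endpoints "data" payload: remove one
# mon's entry from the comma-separated endpoint list, from the mapping, and
# from the CSI cluster config string (all values here are made up).
mon_id = "b"
mon_endpoint = "172.30.10.2:6789"
data = {
    "data": "a=172.30.10.1:6789,b=172.30.10.2:6789,c=172.30.10.3:6789",
    "mapping": '{"a":null,"b":null,"c":null}',
    "csi-cluster-config-json":
        '[{"monitors":["172.30.10.1:6789","172.30.10.2:6789","172.30.10.3:6789"]}]',
}

data["data"] = ",".join(
    value for value in data["data"].split(",") if f"{mon_id}=" not in value
)
data["mapping"] = data["mapping"].replace(f'"{mon_id}":null,', "")
data["csi-cluster-config-json"] = data["csi-cluster-config-json"].replace(
    f'"{mon_endpoint}",', ""
)
print(data)
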
Example #17
def measure_stop_ceph_mon(measurement_dir):
    """
    Downscales Ceph Monitor deployment, measures the time when it was
    downscaled and monitors alerts that were triggered during this event.

    Returns:
        dict: Contains information about `start` and `stop` time for stopping
            Ceph Monitor pod
    """
    oc = ocp.OCP(kind=constants.DEPLOYMENT,
                 namespace=config.ENV_DATA["cluster_namespace"])
    mon_deployments = oc.get(selector=constants.MON_APP_LABEL)["items"]
    mons = [deployment["metadata"]["name"] for deployment in mon_deployments]

    # get monitor deployments to stop, leave even number of monitors
    split_index = len(mons) // 2 if len(mons) > 3 else 2
    mons_to_stop = mons[split_index:]
    logger.info(f"Monitors to stop: {mons_to_stop}")
    logger.info(f"Monitors left to run: {mons[:split_index]}")

    # run_time of operation
    run_time = 60 * 14

    def stop_mon():
        """
        Downscale Ceph Monitor deployments for 14 minutes. For the first 15
        minutes the CephMonQuorumAtRisk alert should be in 'Pending' state.
        After 15 minutes the alert turns into 'Firing' state.
        This alert configuration can be observed in ceph-mixins, which
        is used in the project:
            https://github.com/ceph/ceph-mixins/blob/d22afe8c0da34490cb77e52a202eefcf4f62a869/config.libsonnet#L16
        The `Firing` state shouldn't actually be reached because the monitor
        should be automatically redeployed shortly after 10 minutes.

        Returns:
            str: Names of downscaled deployments
        """
        nonlocal oc
        nonlocal mons_to_stop
        for mon in mons_to_stop:
            logger.info(f"Downscaling deployment {mon} to 0")
            oc.exec_oc_cmd(f"scale --replicas=0 deployment/{mon}")
        logger.info(f"Waiting for {run_time} seconds")
        time.sleep(run_time)
        return mons_to_stop

    test_file = os.path.join(measurement_dir, "measure_stop_ceph_mon.json")
    measured_op = measure_operation(stop_mon, test_file)

    # expected minimal downtime of a mon inflicted by this fixture
    measured_op["min_downtime"] = run_time - (60 * 2)

    # get new list of monitors to make sure that new monitors were deployed
    mon_deployments = oc.get(selector=constants.MON_APP_LABEL)["items"]
    mons = [deployment["metadata"]["name"] for deployment in mon_deployments]

    # check that downscaled monitors are removed as OCS should redeploy them
    # but only when we are running this for the first time
    check_old_mons_deleted = all(mon not in mons for mon in mons_to_stop)
    if measured_op["first_run"] and not check_old_mons_deleted:
        for mon in mons_to_stop:
            logger.info(f"Upscaling deployment {mon} back to 1")
            oc.exec_oc_cmd(f"scale --replicas=1 deployment/{mon}")
        msg = f"Downscaled monitors {mons_to_stop} were not replaced"
        assert check_old_mons_deleted, msg

    # wait for ceph to return into HEALTH_OK state after mon deployment
    # is returned back to normal
    ceph_health_check(tries=20, delay=15)

    return measured_op
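
# Worked example of the monitor-selection arithmetic above: keep roughly
# half of the monitors running (at least two), downscale the rest.
def pick_mons_to_stop(mons):
    split_index = len(mons) // 2 if len(mons) > 3 else 2
    return mons[split_index:]


print(pick_mons_to_stop(["a", "b", "c"]))            # ['c']
print(pick_mons_to_stop(["a", "b", "c", "d", "e"]))  # ['c', 'd', 'e']
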
Example #18
def ocs_install_verification(
    timeout=600,
    skip_osd_distribution_check=False,
    ocs_registry_image=None,
    post_upgrade_verification=False,
    version_before_upgrade=None,
):
    """
    Perform steps necessary to verify a successful OCS installation

    Args:
        timeout (int): Number of seconds for timeout which will be used in the
            checks used in this function.
        skip_osd_distribution_check (bool): If true skip the check for osd
            distribution.
        ocs_registry_image (str): Specific image to check if it was installed
            properly.
        post_upgrade_verification (bool): Set to True if this function is
            called after upgrade.
        version_before_upgrade (float): Set to OCS version before upgrade

    """
    from ocs_ci.ocs.node import get_nodes
    from ocs_ci.ocs.resources.pvc import get_deviceset_pvcs
    from ocs_ci.ocs.resources.pod import get_ceph_tools_pod, get_all_pods
    from ocs_ci.ocs.cluster import validate_cluster_on_pvc
    from ocs_ci.ocs.resources.fips import check_fips_enabled

    number_of_worker_nodes = len(get_nodes())
    namespace = config.ENV_DATA["cluster_namespace"]
    log.info("Verifying OCS installation")
    if config.ENV_DATA.get("disable_components"):
        for component in config.ENV_DATA["disable_components"]:
            config.COMPONENTS[f"disable_{component}"] = True
    disable_noobaa = config.COMPONENTS["disable_noobaa"]
    disable_rgw = config.COMPONENTS["disable_rgw"]
    disable_blockpools = config.COMPONENTS["disable_blockpools"]
    disable_cephfs = config.COMPONENTS["disable_cephfs"]
    managed_service = (config.ENV_DATA["platform"].lower()
                       in constants.MANAGED_SERVICE_PLATFORMS)
    ocs_version = version.get_semantic_ocs_version_from_config()

    # Basic Verification for cluster
    basic_verification(ocs_registry_image)

    # Verify pods in running state and proper counts
    log.info("Verifying pod states and counts")
    storage_cluster_name = config.ENV_DATA["storage_cluster_name"]
    storage_cluster = StorageCluster(
        resource_name=storage_cluster_name,
        namespace=namespace,
    )
    pod = OCP(kind=constants.POD, namespace=namespace)
    if not config.DEPLOYMENT["external_mode"]:
        osd_count = int(
            storage_cluster.data["spec"]["storageDeviceSets"][0]["count"]
        ) * int(
            storage_cluster.data["spec"]["storageDeviceSets"][0]["replica"])
    rgw_count = None
    if config.ENV_DATA.get("platform") in constants.ON_PREM_PLATFORMS:
        if not disable_rgw:
            rgw_count = get_rgw_count(f"{ocs_version}",
                                      post_upgrade_verification,
                                      version_before_upgrade)

    min_eps = constants.MIN_NB_ENDPOINT_COUNT_POST_DEPLOYMENT

    if config.ENV_DATA.get("platform") == constants.IBM_POWER_PLATFORM:
        min_eps = 1

    nb_db_label = (constants.NOOBAA_DB_LABEL_46_AND_UNDER
                   if ocs_version < version.VERSION_4_7 else
                   constants.NOOBAA_DB_LABEL_47_AND_ABOVE)
    resources_dict = {
        nb_db_label: 1,
        constants.OCS_OPERATOR_LABEL: 1,
        constants.OPERATOR_LABEL: 1,
        constants.NOOBAA_OPERATOR_POD_LABEL: 1,
        constants.NOOBAA_CORE_POD_LABEL: 1,
        constants.NOOBAA_ENDPOINT_POD_LABEL: min_eps,
    }
    if not config.DEPLOYMENT["external_mode"]:
        resources_dict.update({
            constants.MON_APP_LABEL: 3,
            constants.CSI_CEPHFSPLUGIN_LABEL: number_of_worker_nodes,
            constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL: 2,
            constants.CSI_RBDPLUGIN_LABEL: number_of_worker_nodes,
            constants.CSI_RBDPLUGIN_PROVISIONER_LABEL: 2,
            constants.OSD_APP_LABEL: osd_count,
            constants.MGR_APP_LABEL: 1,
            constants.MDS_APP_LABEL: 2,
            constants.RGW_APP_LABEL: rgw_count,
        })

    if ocs_version >= version.VERSION_4_9:
        resources_dict.update({
            constants.ODF_OPERATOR_CONTROL_MANAGER_LABEL: 1,
        })

    for label, count in resources_dict.items():
        if label == constants.RGW_APP_LABEL:
            if (not config.ENV_DATA.get("platform")
                    in constants.ON_PREM_PLATFORMS or managed_service
                    or disable_rgw):
                continue
        if "noobaa" in label and (disable_noobaa or managed_service):
            continue
        if "mds" in label and disable_cephfs:
            continue

        assert pod.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            selector=label,
            resource_count=count,
            timeout=timeout,
        )

    # Verify StorageClasses (1 ceph-fs, 1 ceph-rbd)
    log.info("Verifying storage classes")
    storage_class = OCP(kind=constants.STORAGECLASS, namespace=namespace)
    storage_cluster_name = config.ENV_DATA["storage_cluster_name"]
    required_storage_classes = {
        f"{storage_cluster_name}-cephfs",
        f"{storage_cluster_name}-ceph-rbd",
    }
    skip_storage_classes = set()
    if disable_cephfs:
        skip_storage_classes.update({
            f"{storage_cluster_name}-cephfs",
        })
    if disable_blockpools:
        skip_storage_classes.update({
            f"{storage_cluster_name}-ceph-rbd",
        })
    required_storage_classes = required_storage_classes.difference(
        skip_storage_classes)

    if config.DEPLOYMENT["external_mode"]:
        required_storage_classes.update({
            f"{storage_cluster_name}-ceph-rgw",
            f'{config.ENV_DATA["cluster_namespace"]}.noobaa.io',
        })
    storage_classes = storage_class.get()
    storage_class_names = {
        item["metadata"]["name"]
        for item in storage_classes["items"]
    }
    # required storage class names should be observed in the cluster under test
    missing_scs = required_storage_classes.difference(storage_class_names)
    if len(missing_scs) > 0:
        log.error("few storage classess are not present: %s", missing_scs)
    assert list(missing_scs) == []

    # Verify OSDs are distributed
    if not config.DEPLOYMENT["external_mode"]:
        if not skip_osd_distribution_check:
            log.info(
                "Verifying OSDs are distributed evenly across worker nodes")
            ocp_pod_obj = OCP(kind=constants.POD, namespace=namespace)
            osds = ocp_pod_obj.get(selector=constants.OSD_APP_LABEL)["items"]
            deviceset_count = get_deviceset_count()
            node_names = [osd["spec"]["nodeName"] for osd in osds]
            for node in node_names:
                assert (
                    not node_names.count(node) > deviceset_count
                ), "OSD's are not distributed evenly across worker nodes"

    # Verify that CSI driver object contains provisioner names
    log.info("Verifying CSI driver object contains provisioner names.")
    csi_driver = OCP(kind="CSIDriver")
    csi_drivers = {
        item["metadata"]["name"]
        for item in csi_driver.get()["items"]
    }
    assert defaults.CSI_PROVISIONERS.issubset(csi_drivers)

    # Verify node and provisioner secret names in storage class
    log.info("Verifying node and provisioner secret names in storage class.")
    if config.DEPLOYMENT["external_mode"]:
        sc_rbd = storage_class.get(
            resource_name=constants.DEFAULT_EXTERNAL_MODE_STORAGECLASS_RBD)
        sc_cephfs = storage_class.get(resource_name=(
            constants.DEFAULT_EXTERNAL_MODE_STORAGECLASS_CEPHFS))
    else:
        if not disable_blockpools:
            sc_rbd = storage_class.get(
                resource_name=constants.DEFAULT_STORAGECLASS_RBD)
        if not disable_cephfs:
            sc_cephfs = storage_class.get(
                resource_name=constants.DEFAULT_STORAGECLASS_CEPHFS)
    if not disable_blockpools:
        assert (
            sc_rbd["parameters"]["csi.storage.k8s.io/node-stage-secret-name"]
            == constants.RBD_NODE_SECRET)
        assert (
            sc_rbd["parameters"]["csi.storage.k8s.io/provisioner-secret-name"]
            == constants.RBD_PROVISIONER_SECRET)
    if not disable_cephfs:
        assert (sc_cephfs["parameters"]
                ["csi.storage.k8s.io/node-stage-secret-name"] ==
                constants.CEPHFS_NODE_SECRET)
        assert (sc_cephfs["parameters"]
                ["csi.storage.k8s.io/provisioner-secret-name"] ==
                constants.CEPHFS_PROVISIONER_SECRET)
    log.info("Verified node and provisioner secret names in storage class.")

    ct_pod = get_ceph_tools_pod()

    # https://github.com/red-hat-storage/ocs-ci/issues/3820
    # Verify ceph osd tree output
    if not (config.DEPLOYMENT.get("ui_deployment")
            or config.DEPLOYMENT["external_mode"] or managed_service):
        log.info(
            "Verifying ceph osd tree output and checking for device set PVC names "
            "in the output.")
        if config.DEPLOYMENT.get("local_storage"):
            deviceset_pvcs = [osd.get_node() for osd in get_osd_pods()]
            # removes duplicate hostname
            deviceset_pvcs = list(set(deviceset_pvcs))
            if config.ENV_DATA.get(
                    "platform") == constants.BAREMETAL_PLATFORM or (
                        config.ENV_DATA.get("flexy_deployment")
                        and config.ENV_DATA.get("platform")
                        == constants.AWS_PLATFORM):
                deviceset_pvcs = [
                    deviceset.replace(".", "-") for deviceset in deviceset_pvcs
                ]
        else:
            deviceset_pvcs = [pvc.name for pvc in get_deviceset_pvcs()]

        osd_tree = ct_pod.exec_ceph_cmd(ceph_cmd="ceph osd tree",
                                        format="json")
        schemas = {
            "root": constants.OSD_TREE_ROOT,
            "rack": constants.OSD_TREE_RACK,
            "host": constants.OSD_TREE_HOST,
            "osd": constants.OSD_TREE_OSD,
            "region": constants.OSD_TREE_REGION,
            "zone": constants.OSD_TREE_ZONE,
        }
        schemas["host"]["properties"]["name"] = {"enum": deviceset_pvcs}
        for item in osd_tree["nodes"]:
            validate(instance=item, schema=schemas[item["type"]])
            if item["type"] == "host":
                deviceset_pvcs.remove(item["name"])
        assert not deviceset_pvcs, (
            f"These device set PVCs are not given in ceph osd tree output "
            f"- {deviceset_pvcs}")
        log.info(
            "Verified ceph osd tree output. Device set PVC names are given in the "
            "output.")

    # TODO: Verify ceph osd tree output have osd listed as ssd
    # TODO: Verify ceph osd tree output have zone or rack based on AZ

    # Verify CSI snapshotter sidecar container is not present
    # if the OCS version is < 4.6
    if ocs_version < version.VERSION_4_6:
        log.info("Verifying CSI snapshotter is not present.")
        provisioner_pods = get_all_pods(
            namespace=defaults.ROOK_CLUSTER_NAMESPACE,
            selector=[
                constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL,
                constants.CSI_RBDPLUGIN_PROVISIONER_LABEL,
            ],
        )
        for pod_obj in provisioner_pods:
            pod_info = pod_obj.get()
            for container, image in get_images(data=pod_info).items():
                assert ("snapshot" not in container) and (
                    "snapshot" not in image
                ), (f"Snapshot container is present in {pod_obj.name} pod. "
                    f"Container {container}. Image {image}")
        ocs_csv = get_ocs_csv()
        deployments = ocs_csv.get()["spec"]["install"]["spec"]["deployments"]
        rook_ceph_operator_deployment = [
            deployment_val for deployment_val in deployments
            if deployment_val["name"] == "rook-ceph-operator"
        ]
        assert {
            "name": "CSI_ENABLE_SNAPSHOTTER",
            "value": "false"
        } in (rook_ceph_operator_deployment[0]["spec"]["template"]["spec"]
              ["containers"][0]["env"]
              ), "CSI_ENABLE_SNAPSHOTTER value is not set to 'false'."
        log.info("Verified: CSI snapshotter is not present.")

    # Verify pool crush rule is with "type": "zone"
    if utils.get_az_count() == 3:
        log.info("Verifying pool crush rule is with type: zone")
        crush_dump = ct_pod.exec_ceph_cmd(ceph_cmd="ceph osd crush dump",
                                          format="")
        pool_names = [
            constants.METADATA_POOL,
            constants.DEFAULT_BLOCKPOOL,
            constants.DATA_POOL,
        ]
        crush_rules = [
            rule for rule in crush_dump["rules"]
            if rule["rule_name"] in pool_names
        ]
        for crush_rule in crush_rules:
            assert [
                item for item in crush_rule["steps"]
                if item.get("type") == "zone"
            ], f"{crush_rule['rule_name']} is not with type as zone"
        log.info("Verified - pool crush rule is with type: zone")
    # TODO: update pvc validation for managed services
    if not managed_service:
        log.info("Validate cluster on PVC")
        validate_cluster_on_pvc()

    # Verify ceph health
    log.info("Verifying ceph health")
    health_check_tries = 20
    health_check_delay = 30
    if post_upgrade_verification:
        # In case of upgrade with FIO we have to wait longer time to see
        # health OK. See discussion in BZ:
        # https://bugzilla.redhat.com/show_bug.cgi?id=1817727
        health_check_tries = 180
    assert utils.ceph_health_check(namespace, health_check_tries,
                                   health_check_delay)
    if config.ENV_DATA.get("fips"):
        # In case FIPS was enabled during deployment, verify that it is
        # active on all pods in Running state
        check_fips_enabled()
    if config.ENV_DATA.get("encryption_at_rest"):
        osd_encryption_verification()
        if config.DEPLOYMENT.get("kms_deployment"):
            kms = KMS.get_kms_deployment()
            kms.post_deploy_verification()
            if config.ENV_DATA.get("VAULT_CA_ONLY", None):
                verify_kms_ca_only()

    storage_cluster_obj = get_storage_cluster()
    is_flexible_scaling = (
        storage_cluster_obj.get()["items"][0].get("spec").get(
            "flexibleScaling", False))
    if is_flexible_scaling is True:
        failure_domain = storage_cluster_obj.data["items"][0]["status"][
            "failureDomain"]
        assert failure_domain == "host", (
            f"The expected failure domain on cluster with flexible scaling is 'host',"
            f" the actaul failure domain is {failure_domain}")

    if config.ENV_DATA.get("is_multus_enabled"):
        verify_multus_network()
    if managed_service:
        verify_managed_service_resources()
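
# Standalone sketch of the OSD distribution check performed above: no worker
# node may host more OSD pods than there are device sets.
from collections import Counter


def osds_distributed_evenly(osd_node_names, deviceset_count):
    return all(
        count <= deviceset_count
        for count in Counter(osd_node_names).values()
    )


# Example: three OSDs spread over three workers with one device set is fine;
# two OSDs landing on the same worker is not.
print(osds_distributed_evenly(["w1", "w2", "w3"], deviceset_count=1))  # True
print(osds_distributed_evenly(["w1", "w1", "w3"], deviceset_count=1))  # False
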
Example #19
    def finalizer():
        assert ceph_health_check(), "Ceph cluster health is not OK"
        log.info("Ceph cluster health is OK")
    def test_monitoring_after_draining_node_where_prometheus_hosted(self, pods):
        """
        Test case to validate that when the node hosting prometheus is
        drained, the prometheus pod should re-spin on a new healthy node
        and there shouldn't be any data/metrics loss

        """

        # Get the prometheus pod
        pod_obj_list = pod.get_all_pods(
            namespace=defaults.OCS_MONITORING_NAMESPACE, selector=["prometheus"]
        )

        for pod_obj in pod_obj_list:
            # Get the pvc which mounted on prometheus pod
            pod_info = pod_obj.get()
            pvc_name = pod_info["spec"]["volumes"][0]["persistentVolumeClaim"][
                "claimName"
            ]

            # Get the node where the prometheus pod is hosted
            prometheus_pod_obj = pod_obj.get()
            prometheus_node = prometheus_pod_obj["spec"]["nodeName"]

            # Drain node where the prometheus pod hosted
            drain_nodes([prometheus_node])

            # Validate node is in SchedulingDisabled state
            wait_for_nodes_status(
                [prometheus_node], status=constants.NODE_READY_SCHEDULING_DISABLED
            )

            # Validate all prometheus pod is running
            POD = ocp.OCP(
                kind=constants.POD, namespace=defaults.OCS_MONITORING_NAMESPACE
            )
            assert POD.wait_for_resource(
                condition="Running", selector="app=prometheus", timeout=180
            ), "One or more prometheus pods are not in running state"

            # Validate prometheus pod is re-spinned on new healthy node
            pod_info = pod_obj.get()
            new_node = pod_info["spec"]["nodeName"]
            assert (
                new_node not in prometheus_node
            ), "Promethues pod not re-spinned on new node"
            log.info(f"Prometheus pod re-spinned on new node {new_node}")

            # Validate same pvc is mounted on prometheus pod
            assert (
                pod_info["spec"]["volumes"][0]["persistentVolumeClaim"]["claimName"]
                in pvc_name
            ), f"Old pvc not found after restarting the prometheus pod {pod_obj.name}"

            # Validate the prometheus health is ok
            assert prometheus_health_check(), "Prometheus cluster health is not OK"

            # Mark the nodes back to schedulable
            schedule_nodes([prometheus_node])

            # Wait some time after node scheduling back
            waiting_time = 30
            log.info(f"Waiting {waiting_time} seconds...")
            time.sleep(waiting_time)

            # Validate node is in Ready State
            wait_for_nodes_status([prometheus_node], status=constants.NODE_READY)

            # Validate ceph health OK
            ceph_health_check(tries=40, delay=30)

        # Check the node are Ready state and check cluster is health ok
        self.sanity_helpers.health_check(tries=40)

        # Check for the created pvc metrics after rebooting the master nodes
        for pod_obj in pods:
            assert check_pvcdata_collected_on_prometheus(
                pod_obj.pvc.name
            ), f"On prometheus pod for created pvc {pod_obj.pvc.name} related data is not collected"
Example #21
    def test_add_capacity(
        self,
        project_factory,
        multi_dc_pod,
        multi_pvc_factory,
        pod_factory,
        mcg_obj,
        awscli_pod,
        bucket_factory,
        percent_to_fill,
    ):

        #####################################
        #           ENTRY CRITERIA          #
        #####################################
        # Prepare initial configuration: logger, cluster filling, loop for creating & deleting PVCs and Pods,
        # noobaa IOs etc.

        # Perform Health checks:
        # Make sure cluster is healthy
        assert ceph_health_check(
            defaults.ROOK_CLUSTER_NAMESPACE
        ), "Entry criteria FAILED: Cluster is Unhealthy"

        # All OCS pods are in running state:
        # ToDo https://github.com/red-hat-storage/ocs-ci/issues/2361
        assert (
            pod_helpers.check_pods_in_running_state()
        ), "Entry criteria FAILED: one or more OCS pods are not in running state"
        # Create the namespace under which this test will execute:
        project = project_factory()

        # Total PVCs created will be 'num_of_pvcs' * 4 types of PVCs
        # (rbd-rwo, rbd-rwx & cephfs-rwo, cephfs-rwx)
        num_of_pvcs = 40

        rwo_rbd_pods = multi_dc_pod(
            num_of_pvcs=num_of_pvcs,
            pvc_size=175,
            project=project,
            access_mode="RWO",
            pool_type="rbd",
            timeout=360,
        )
        # Note: Skipping cephfs pods creation
        # observing bug https://bugzilla.redhat.com/show_bug.cgi?id=1785399,
        # https://bugzilla.redhat.com/show_bug.cgi?id=1779421#c14
        # Todo: https://github.com/red-hat-storage/ocs-ci/issues/2360

        # Create rwx-rbd pods
        pods_ios_rwx_rbd = multi_dc_pod(
            num_of_pvcs=10,
            pvc_size=175,
            project=project,
            access_mode="RWX-BLK",
            pool_type="rbd",
            timeout=360,
        )

        cluster_fill_io_pods = rwo_rbd_pods
        logger.info("The DC pods are up. Running IOs from them to fill the cluster")
        filler = cluster_exp_helpers.ClusterFiller(
            cluster_fill_io_pods, percent_to_fill, project.namespace
        )
        assert filler.cluster_filler(), "IOs failed"

        # create separate threadpool for running IOs in the background
        executor_run_bg_ios_ops = ThreadPoolExecutor()

        bg_wrap = cluster_exp_helpers.BackgroundOps()
        status_cluster_ios = []
        pods_for_copy = rwo_rbd_pods[0:5] + pods_ios_rwx_rbd

        for p in pods_for_copy:
            logger.info(f"running IOs on {p.name}")
            if p.pod_type == "rbd_block_rwx":
                status_cluster_ios.append(
                    executor_run_bg_ios_ops.submit(
                        bg_wrap.wrap, cluster_exp_helpers.raw_block_io, p, iterations=10
                    )
                )
            else:
                status_cluster_ios.append(
                    executor_run_bg_ios_ops.submit(
                        bg_wrap.wrap,
                        cluster_exp_helpers.cluster_copy_ops,
                        p,
                        iterations=200,
                    )
                )

        # Start pvc ops in the background.:
        logger.info("Started pvc create delete operations")
        executor_run_bg_ios_ops.submit(
            bg_wrap.wrap,
            test_create_delete_pvcs,
            multi_pvc_factory,
            pod_factory,
            project,
            iterations=200,
        )

        # Start NooBaa IOs in the background.:
        logger.info("Started s3_io_create_delete...")

        executor_run_bg_ios_ops.submit(
            bg_wrap.wrap,
            s3_io_create_delete,
            mcg_obj,
            awscli_pod,
            bucket_factory,
            iterations=200,
        )

        logger.info("Started obc_io_create_delete...")

        executor_run_bg_ios_ops.submit(
            bg_wrap.wrap,
            obc_io_create_delete,
            mcg_obj,
            awscli_pod,
            bucket_factory,
            iterations=200,
        )

        # Verify all OCS nodes (including masters) remain in Ready state:
        executor_run_bg_ios_ops.submit(
            bg_wrap.wrap, cluster_exp_helpers.check_nodes_status, iterations=100
        )

        # Get restart count of OCS pods before expansion
        restart_count_before = pod_helpers.get_pod_restarts_count(
            defaults.ROOK_CLUSTER_NAMESPACE
        )

        # Get osd pods before expansion
        osd_pods_before = pod_helpers.get_osd_pods()

        # Get the total space in cluster before expansion
        ct_pod = pod_helpers.get_ceph_tools_pod()
        output = ct_pod.exec_ceph_cmd(ceph_cmd="ceph osd df")
        total_space_b4_expansion = int(output.get("summary").get("total_kb"))
        logger.info(f"total_space_b4_expansion == {total_space_b4_expansion}")

        logger.info("############## Calling add_capacity $$$$$$$$$$")

        #####################
        # Call add_capacity #
        #####################
        osd_size = storage_cluster.get_osd_size()
        result = storage_cluster.add_capacity(osd_size)
        pod = OCP(kind=constants.POD, namespace=config.ENV_DATA["cluster_namespace"])

        # All new OSD pods corresponding to the additional capacity should be
        # in Running state
        pod.wait_for_resource(
            timeout=1200,
            condition=constants.STATUS_RUNNING,
            selector="app=rook-ceph-osd",
            resource_count=result * 3,
        )

        #################################
        # Exit criteria verification:   #
        #################################
        cluster_exp_helpers.BackgroundOps.EXPANSION_COMPLETED = True

        # No OCS pods should get restarted unexpectedly.
        # Get the restart count of OCS pods after expansion and check whether
        # any pods got restarted
        restart_count_after = pod_helpers.get_pod_restarts_count(
            defaults.ROOK_CLUSTER_NAMESPACE
        )
        # TODO:
        # Handle Bug 1814254 - All mons respinned during add capacity and
        # OSDs took a long time to come up.
        # Implement a function to make sure no pods are respun after expansion

        logger.info(
            f"sum(restart_count_before.values()) = {sum(restart_count_before.values())}"
        )
        logger.info(
            f"sum(restart_count_after.values()) = {sum(restart_count_after.values())}"
        )
        assert sum(restart_count_before.values()) == sum(
            restart_count_after.values()
        ), "Exit criteria verification FAILED: One or more pods got restarted"

        logger.info("Exit criteria verification Success: No pods were restarted")
        # Make sure the right number of OSDs is added:
        #   Get osd pods after expansion
        osd_pods_after = pod_helpers.get_osd_pods()
        number_of_osds_added = len(osd_pods_after) - len(osd_pods_before)
        logger.info(
            f"### number_of_osds_added = {number_of_osds_added}, "
            f"before = {len(osd_pods_before)}, after = {len(osd_pods_after)}"
        )
        # If the difference between the updated OSD count and the old OSD
        # count is not 3, the expansion failed
        assert (
            number_of_osds_added == 3
        ), "Exit criteria verification FAILED: osd count mismatch"

        logger.info(
            "Exit criteria verification Success: Correct number of OSDs are added"
        )

        # Verify that the newly added capacity has taken effect at the storage level
        ct_pod = pod_helpers.get_ceph_tools_pod()
        output = ct_pod.exec_ceph_cmd(ceph_cmd="ceph osd df")
        total_space_after_expansion = int(output.get("summary").get("total_kb"))
        osd_size = int(output.get("nodes")[0].get("kb"))
        expanded_space = osd_size * 3  # 3 OSDs of size 'osd_size' are added
        logger.info(f"space output == {output} ")
        logger.info(f"osd size == {osd_size} ")
        logger.info(f"total_space_after_expansion == {total_space_after_expansion} ")
        expected_total_space_after_expansion = total_space_b4_expansion + expanded_space
        logger.info(
            f"expected_total_space_after_expansion == {expected_total_space_after_expansion} "
        )
        assert (
            total_space_after_expansion == expected_total_space_after_expansion
        ), "Exit criteria verification FAILED: Expected capacity mismatch"

        logger.info(
            "Exit criteria verification Success: Newly added capacity has taken effect"
        )

        logger.info("Exit criteria verification Success: IOs completed successfully")
        # 'ceph osd tree' should show the new osds under the right nodes/hosts
        #   Verification is different for 3 AZ and 1 AZ configs
        ct_pod = pod_helpers.get_ceph_tools_pod()
        tree_output = ct_pod.exec_ceph_cmd(ceph_cmd="ceph osd tree")
        logger.info(f"### OSD tree output = {tree_output}")
        if config.ENV_DATA["platform"].lower() == constants.VSPHERE_PLATFORM:
            assert cluster_helpers.check_osd_tree_1az_vmware(
                tree_output, len(osd_pods_after)
            ), "Exit criteria verification FAILED: Incorrect ceph osd tree formation found"

        aws_number_of_zones = 3
        if config.ENV_DATA["platform"].lower() == constants.AWS_PLATFORM:
            # Parse the osd tree. If it contains a 'rack' node, it is an
            # AWS 1-AZ cluster; otherwise it is an AWS 3-AZ cluster
            for i in range(len(tree_output["nodes"])):
                if tree_output["nodes"][i]["name"] in "rack0":
                    aws_number_of_zones = 1
            if aws_number_of_zones == 1:
                assert cluster_helpers.check_osd_tree_1az_aws(
                    tree_output, len(osd_pods_after)
                ), "Exit criteria verification FAILED: Incorrect ceph osd tree formation found"
            else:
                assert cluster_helpers.check_osd_tree_3az_aws(
                    tree_output, len(osd_pods_after)
                ), "Exit criteria verification FAILED: Incorrect ceph osd tree formation found"

        logger.info("Exit criteria verification Success: osd tree verification success")

        # Make sure new pvcs and pods can be created and IOs can be run from
        # the pods
        num_of_pvcs = 1
        rwo_rbd_pods = multi_dc_pod(
            num_of_pvcs=num_of_pvcs,
            pvc_size=5,
            project=project,
            access_mode="RWO",
            pool_type="rbd",
        )
        rwo_cephfs_pods = multi_dc_pod(
            num_of_pvcs=num_of_pvcs,
            pvc_size=5,
            project=project,
            access_mode="RWO",
            pool_type="cephfs",
        )
        rwx_cephfs_pods = multi_dc_pod(
            num_of_pvcs=num_of_pvcs,
            pvc_size=5,
            project=project,
            access_mode="RWX",
            pool_type="cephfs",
        )
        # Create rwx-rbd pods
        pods_ios_rwx_rbd = multi_dc_pod(
            num_of_pvcs=num_of_pvcs,
            pvc_size=5,
            project=project,
            access_mode="RWX-BLK",
            pool_type="rbd",
        )
        cluster_io_pods = (
            rwo_rbd_pods + rwo_cephfs_pods + rwx_cephfs_pods + pods_ios_rwx_rbd
        )

        with ThreadPoolExecutor() as pod_ios_executor:
            for p in cluster_io_pods:
                if p.pod_type == "rbd_block_rwx":
                    logger.info(f"Calling block fio on pod {p.name}")
                    pod_ios_executor.submit(cluster_exp_helpers.raw_block_io, p, "100M")
                else:
                    logger.info(f"calling file fio on pod {p.name}")
                    pod_ios_executor.submit(p.run_io, "fs", "100M")

        for pod_io in cluster_io_pods:
            pod_helpers.get_fio_rw_iops(pod_io)

        cluster_obj = cluster_helpers.CephCluster()
        assert (
            cluster_obj.get_ceph_health() != "HEALTH_ERR"
        ), "Ceph cluster health checking failed"
        logger.info("ALL Exit criteria verification successfully")
        logger.info(
            "********************** TEST PASSED *********************************"
        )
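The background-IO helpers above are driven through BackgroundOps.wrap, and the main thread stops them by flipping EXPANSION_COMPLETED once add_capacity has been verified. A minimal sketch of that pattern, assuming a simplified stand-in for the ocs-ci helper (the class, flag handling and noisy_io are illustrative, not the real implementation):

from concurrent.futures import ThreadPoolExecutor


class BackgroundOps:
    # Class-level stop flag, mirroring the EXPANSION_COMPLETED flag set above.
    EXPANSION_COMPLETED = False

    def wrap(self, func, *args, iterations=1, **kwargs):
        """Run 'func' repeatedly until the budget is spent or the flag is set."""
        for _ in range(iterations):
            if BackgroundOps.EXPANSION_COMPLETED:
                break
            func(*args, **kwargs)


def noisy_io(name):
    # Stand-in for raw_block_io / cluster_copy_ops.
    print(f"running one IO iteration on {name}")


if __name__ == "__main__":
    bg = BackgroundOps()
    with ThreadPoolExecutor() as executor:
        future = executor.submit(bg.wrap, noisy_io, "pod-1", iterations=5)
        future.result()
    BackgroundOps.EXPANSION_COMPLETED = True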
Example #22
0
    def deploy_ocs(self):
        """
        Handle OCS deployment, since OCS deployment steps are common to any
        platform, implementing OCS deployment here in base class.
        """
        _templating = templating.Templating()

        ceph_cluster = ocp.OCP(kind='CephCluster',
                               namespace=config.ENV_DATA['cluster_namespace'])
        try:
            ceph_cluster.get().get('items')[0]
            logger.warning("OCS cluster already exists")
            return
        except (IndexError, CommandFailed):
            logger.info("Running OCS basic installation")

        create_oc_resource('common.yaml', self.cluster_path, _templating,
                           config.ENV_DATA)

        run_cmd(f'oc label namespace {config.ENV_DATA["cluster_namespace"]} '
                f'"openshift.io/cluster-monitoring=true"')
        run_cmd(f"oc policy add-role-to-user view "
                f"system:serviceaccount:openshift-monitoring:prometheus-k8s "
                f"-n {config.ENV_DATA['cluster_namespace']}")
        apply_oc_resource('csi-nodeplugin-rbac_rbd.yaml',
                          self.cluster_path,
                          _templating,
                          config.ENV_DATA,
                          template_dir="ocs-deployment/csi/rbd/")
        apply_oc_resource('csi-provisioner-rbac_rbd.yaml',
                          self.cluster_path,
                          _templating,
                          config.ENV_DATA,
                          template_dir="ocs-deployment/csi/rbd/")
        apply_oc_resource('csi-nodeplugin-rbac_cephfs.yaml',
                          self.cluster_path,
                          _templating,
                          config.ENV_DATA,
                          template_dir="ocs-deployment/csi/cephfs/")
        apply_oc_resource('csi-provisioner-rbac_cephfs.yaml',
                          self.cluster_path,
                          _templating,
                          config.ENV_DATA,
                          template_dir="ocs-deployment/csi/cephfs/")
        # Increased to 15 seconds as 10 is not enough
        # TODO: do the sampler function and check if resource exist
        wait_time = 15
        logger.info(f"Waiting {wait_time} seconds...")
        time.sleep(wait_time)
        create_oc_resource('operator-openshift-with-csi.yaml',
                           self.cluster_path, _templating, config.ENV_DATA)
        logger.info(f"Waiting {wait_time} seconds...")
        time.sleep(wait_time)
        run_cmd(f"oc wait --for condition=ready pod "
                f"-l app=rook-ceph-operator "
                f"-n {config.ENV_DATA['cluster_namespace']} "
                f"--timeout=120s")
        run_cmd(f"oc wait --for condition=ready pod "
                f"-l app=rook-discover "
                f"-n {config.ENV_DATA['cluster_namespace']} "
                f"--timeout=120s")
        create_oc_resource('cluster.yaml', self.cluster_path, _templating,
                           config.ENV_DATA)

        pod = ocp.OCP(kind=constants.POD,
                      namespace=config.ENV_DATA['cluster_namespace'])
        cfs = ocp.OCP(kind=constants.CEPHFILESYSTEM,
                      namespace=config.ENV_DATA['cluster_namespace'])

        # Check for the Running status of Ceph Pods
        run_cmd(f"oc wait --for condition=ready pod "
                f"-l app=rook-ceph-agent "
                f"-n {config.ENV_DATA['cluster_namespace']} "
                f"--timeout=120s")
        assert pod.wait_for_resource(condition='Running',
                                     selector='app=rook-ceph-mon',
                                     resource_count=3,
                                     timeout=600)
        assert pod.wait_for_resource(condition='Running',
                                     selector='app=rook-ceph-mgr',
                                     timeout=600)
        assert pod.wait_for_resource(condition='Running',
                                     selector='app=rook-ceph-osd',
                                     resource_count=3,
                                     timeout=600)

        create_oc_resource('toolbox.yaml', self.cluster_path, _templating,
                           config.ENV_DATA)
        logger.info(f"Waiting {wait_time} seconds...")
        time.sleep(wait_time)
        create_oc_resource('storage-manifest.yaml', self.cluster_path,
                           _templating, config.ENV_DATA)
        create_oc_resource("service-monitor.yaml", self.cluster_path,
                           _templating, config.ENV_DATA)
        create_oc_resource("prometheus-rules.yaml", self.cluster_path,
                           _templating, config.ENV_DATA)
        logger.info(f"Waiting {wait_time} seconds...")
        time.sleep(wait_time)

        # Create MDS pods for CephFileSystem
        fs_data = templating.load_yaml_to_dict(constants.CEPHFILESYSTEM_YAML)
        fs_data['metadata']['namespace'] = config.ENV_DATA['cluster_namespace']

        ceph_obj = OCS(**fs_data)
        ceph_obj.create()
        assert pod.wait_for_resource(condition=constants.STATUS_RUNNING,
                                     selector='app=rook-ceph-mds',
                                     resource_count=2,
                                     timeout=600)

        # Check for CephFilesystem creation in ocp
        cfs_data = cfs.get()
        cfs_name = cfs_data['items'][0]['metadata']['name']

        if helpers.validate_cephfilesystem(cfs_name):
            logger.info("MDS deployment is successful!")
            defaults.CEPHFILESYSTEM_NAME = cfs_name
        else:
            logger.error("MDS deployment failed! Please check logs!")

        # Verify health of ceph cluster
        # TODO: move destroy cluster logic to new CLI usage pattern?
        logger.info("Done creating rook resources, waiting for HEALTH_OK")
        assert ceph_health_check(
            namespace=config.ENV_DATA['cluster_namespace'])
        # patch gp2 (EBS) storage class as 'non-default'
        logger.info("Patch gp2 storageclass as non-default")
        patch = " '{\"metadata\": {\"annotations\":{\"storageclass.kubernetes.io/is-default-class\":\"false\"}}}' "
        run_cmd(f"oc patch storageclass gp2 "
                f"-p {patch} "
                f"--request-timeout=120s")
Example #23
0
    def deploy_ocs(self):
        """
        Handle OCS deployment, since OCS deployment steps are common to any
        platform, implementing OCS deployment here in base class.
        """
        ceph_cluster = ocp.OCP(kind='CephCluster', namespace=self.namespace)
        try:
            ceph_cluster.get().get('items')[0]
            logger.warning("OCS cluster already exists")
            return
        except (IndexError, CommandFailed):
            logger.info("Running OCS basic installation")

        self.deploy_ocs_via_operator()
        if config.DEPLOYMENT.get('ui_deployment'):
            config.ENV_DATA['skip_ocs_deployment'] = True
            return

        pod = ocp.OCP(kind=constants.POD, namespace=self.namespace)
        cfs = ocp.OCP(kind=constants.CEPHFILESYSTEM, namespace=self.namespace)
        # Check for Ceph pods
        assert pod.wait_for_resource(condition='Running',
                                     selector='app=rook-ceph-mon',
                                     resource_count=3,
                                     timeout=600)
        assert pod.wait_for_resource(condition='Running',
                                     selector='app=rook-ceph-mgr',
                                     timeout=600)
        assert pod.wait_for_resource(condition='Running',
                                     selector='app=rook-ceph-osd',
                                     resource_count=3,
                                     timeout=600)

        # validate ceph mon/osd volumes are backed by pvc
        validate_cluster_on_pvc()

        # Creating toolbox pod
        setup_ceph_toolbox()

        assert pod.wait_for_resource(condition=constants.STATUS_RUNNING,
                                     selector='app=rook-ceph-tools',
                                     resource_count=1,
                                     timeout=600)

        # Check for CephFilesystem creation in ocp
        cfs_data = cfs.get()
        cfs_name = cfs_data['items'][0]['metadata']['name']

        if helpers.validate_cephfilesystem(cfs_name):
            logger.info("MDS deployment is successful!")
            defaults.CEPHFILESYSTEM_NAME = cfs_name
        else:
            logger.error("MDS deployment failed! Please check logs!")

        # Change monitoring backend to OCS
        if config.ENV_DATA.get('monitoring_enabled') and config.ENV_DATA.get(
                'persistent-monitoring'):

            sc_name = f"{config.ENV_DATA['storage_cluster_name']}-{constants.DEFAULT_SC_RBD}"

            # Get the list of monitoring pods
            pods_list = get_all_pods(
                namespace=defaults.OCS_MONITORING_NAMESPACE,
                selector=['prometheus', 'alertmanager'])

            # Create configmap cluster-monitoring-config
            create_configmap_cluster_monitoring_pod(sc_name)

            # Give the pods some time to respin
            waiting_time = 45
            logger.info(f"Waiting {waiting_time} seconds...")
            time.sleep(waiting_time)

            # Validate the pods are respinned and in running state
            validate_pods_are_respinned_and_running_state(pods_list)

            # Validate the pvc is created on monitoring pods
            validate_pvc_created_and_bound_on_monitoring_pods()

            # Validate the pvc are mounted on pods
            validate_pvc_are_mounted_on_monitoring_pods(pods_list)

        # Change registry backend to OCS CEPHFS RWX PVC
        registry.change_registry_backend_to_ocs()

        # Verify health of ceph cluster
        # TODO: move destroy cluster logic to new CLI usage pattern?
        logger.info("Done creating rook resources, waiting for HEALTH_OK")
        assert ceph_health_check(namespace=self.namespace)
        # patch gp2/thin storage class as 'non-default'
        self.patch_default_sc_to_non_default()
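The monitoring-backend switch above sleeps a fixed 45 seconds before validating the respun pods. A polling loop bounded by a timeout is usually more robust; here is a minimal sketch, assuming a hypothetical pods_ready predicate standing in for validate_pods_are_respinned_and_running_state:

import time


def wait_for(predicate, timeout=300, interval=5):
    """Poll 'predicate' until it returns True or 'timeout' seconds elapse."""
    deadline = time.time() + timeout
    while time.time() < deadline:
        if predicate():
            return True
        time.sleep(interval)
    return False


if __name__ == "__main__":
    attempts = {"count": 0}

    def pods_ready():
        # Hypothetical readiness check that succeeds on the third poll.
        attempts["count"] += 1
        return attempts["count"] >= 3

    assert wait_for(pods_ready, timeout=30, interval=1)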
    def test_daemon_kill_during_pvc_pod_deletion_and_io(
        self, interface, resource_name, setup_base
    ):
        """
        Kill 'resource_name' daemon while PVCs deletion, pods deletion
        and IO are progressing
        """
        pvc_objs, pod_objs, rwx_pod_objs = setup_base
        namespace = pvc_objs[0].project.namespace

        num_of_pods_to_delete = 10
        num_of_io_pods = 5

        # Select pods to be deleted
        pods_to_delete = pod_objs[:num_of_pods_to_delete]
        pods_to_delete.extend(
            [
                pod
                for pod in rwx_pod_objs
                for pod_obj in pods_to_delete
                if (pod_obj.pvc == pod.pvc)
            ]
        )

        # Select pods to run IO
        io_pods = pod_objs[
            num_of_pods_to_delete : num_of_pods_to_delete + num_of_io_pods
        ]
        io_pods.extend(
            [
                pod
                for pod in rwx_pod_objs
                for pod_obj in io_pods
                if (pod_obj.pvc == pod.pvc)
            ]
        )

        # Select pods which are having PVCs to delete
        pods_for_pvc = pod_objs[num_of_pods_to_delete + num_of_io_pods :]
        pvcs_to_delete = [pod_obj.pvc for pod_obj in pods_for_pvc]
        pods_for_pvc.extend(
            [
                pod
                for pod in rwx_pod_objs
                for pod_obj in pods_for_pvc
                if (pod_obj.pvc == pod.pvc)
            ]
        )

        log.info(
            f"{len(pods_to_delete)} pods selected for deletion, of which "
            f"{len(pods_to_delete) - num_of_pods_to_delete} pairs of pods "
            f"share the same RWX PVC"
        )
        log.info(
            f"{len(io_pods)} pods selected for running IO, of which "
            f"{len(io_pods) - num_of_io_pods} pairs of pods share the same "
            f"RWX PVC"
        )
        no_of_rwx_pvcs_delete = len(pods_for_pvc) - len(pvcs_to_delete)
        log.info(
            f"{len(pvcs_to_delete)} PVCs selected for deletion. "
            f"RWO PVCs: {len(pvcs_to_delete) - no_of_rwx_pvcs_delete}, "
            f"RWX PVCs: {no_of_rwx_pvcs_delete}"
        )

        pod_functions = {
            "mds": partial(get_mds_pods),
            "mon": partial(get_mon_pods),
            "mgr": partial(get_mgr_pods),
            "osd": partial(get_osd_pods),
            "rbdplugin": partial(get_plugin_pods, interface=interface),
            "cephfsplugin": partial(get_plugin_pods, interface=interface),
            "cephfsplugin_provisioner": partial(get_cephfsplugin_provisioner_pods),
            "rbdplugin_provisioner": partial(get_rbdfsplugin_provisioner_pods),
            "operator": partial(get_operator_pods),
        }

        disruption = disruption_helpers.Disruptions()
        disruption.set_resource(resource=resource_name)
        executor = ThreadPoolExecutor(max_workers=len(pod_objs) + len(rwx_pod_objs))

        # Get number of pods of type 'resource_name'
        num_of_resource_pods = len(pod_functions[resource_name]())

        # Fetch the number of Pods and PVCs
        initial_num_of_pods = len(get_all_pods(namespace=namespace))
        initial_num_of_pvc = len(get_all_pvcs(namespace=namespace)["items"])

        # Fetch PV names to verify after deletion
        pv_objs = []
        for pvc_obj in pvcs_to_delete:
            pvc_obj.reload()
            pv_objs.append(pvc_obj.backed_pv_obj)

        # Fetch volume details from pods for the purpose of verification
        node_pv_dict = {}
        for pod_obj in pods_to_delete:
            pod_info = pod_obj.get()
            node = pod_info["spec"]["nodeName"]
            pvc = pod_info["spec"]["volumes"][0]["persistentVolumeClaim"]["claimName"]
            for pvc_obj in pvc_objs:
                if pvc_obj.name == pvc:
                    pvc_obj.reload()
                    pv = pvc_obj.backed_pv
                    break
            if node in node_pv_dict:
                node_pv_dict[node].append(pv)
            else:
                node_pv_dict[node] = [pv]

        # Fetch image uuid associated with PVCs to be deleted
        pvc_uuid_map = {}
        for pvc_obj in pvcs_to_delete:
            pvc_uuid_map[pvc_obj.name] = pvc_obj.image_uuid
        log.info("Fetched image uuid associated with each PVC")

        # Do setup on pods for running IO
        log.info("Setting up pods for running IO.")
        for pod_obj in pod_objs + rwx_pod_objs:
            pvc_info = pod_obj.pvc.get()
            if pvc_info["spec"]["volumeMode"] == "Block":
                storage_type = "block"
            else:
                storage_type = "fs"
            executor.submit(pod_obj.workload_setup, storage_type=storage_type)

        # Wait for setup on pods to complete
        for pod_obj in pod_objs + rwx_pod_objs:
            log.info(f"Waiting for IO setup to complete on pod {pod_obj.name}")
            for sample in TimeoutSampler(180, 2, getattr, pod_obj, "wl_setup_done"):
                if sample:
                    log.info(
                        f"Setup for running IO is completed on pod " f"{pod_obj.name}."
                    )
                    break
        log.info("Setup for running IO is completed on all pods.")

        # Start IO on pods having PVCs to delete to load data
        log.info("Starting IO on pods having PVCs to delete.")
        self.run_io_on_pods(pods_for_pvc)
        log.info("IO started on pods having PVCs to delete.")

        log.info("Fetching IO results from the pods having PVCs to delete.")
        for pod_obj in pods_for_pvc:
            get_fio_rw_iops(pod_obj)
        log.info("Verified IO result on pods having PVCs to delete.")

        # Delete pods having PVCs to delete.
        assert self.delete_pods(
            pods_for_pvc
        ), "Couldn't delete pods which are having PVCs to delete."
        for pod_obj in pods_for_pvc:
            pod_obj.ocp.wait_for_delete(pod_obj.name)
        log.info("Verified: Deleted pods which are having PVCs to delete.")

        # Select daemon
        disruption.select_daemon()

        # Start IO on pods to be deleted
        log.info("Starting IO on pods to be deleted.")
        self.run_io_on_pods(pods_to_delete)
        log.info("IO started on pods to be deleted.")

        # Start deleting PVCs
        pvc_bulk_delete = executor.submit(delete_pvcs, pvcs_to_delete)
        log.info("Started deleting PVCs")

        # Start deleting pods
        pod_bulk_delete = executor.submit(self.delete_pods, pods_to_delete)
        log.info("Started deleting pods")

        # Start IO on IO pods
        self.run_io_on_pods(io_pods)
        log.info("Started IO on IO pods")

        # Verify pvc deletion has started
        pvc_deleting = executor.submit(
            wait_for_resource_count_change,
            func_to_use=get_all_pvcs,
            previous_num=initial_num_of_pvc,
            namespace=namespace,
            change_type="decrease",
            min_difference=1,
            timeout=30,
            interval=0.01,
        )

        # Verify pod deletion has started
        pod_deleting = executor.submit(
            wait_for_resource_count_change,
            func_to_use=get_all_pods,
            previous_num=initial_num_of_pods,
            namespace=namespace,
            change_type="decrease",
            min_difference=1,
            timeout=30,
            interval=0.01,
        )

        assert pvc_deleting.result(), "Wait timeout: PVCs are not being deleted."
        log.info("PVCs deletion has started.")

        assert pod_deleting.result(), "Wait timeout: Pods are not being deleted."
        log.info("Pods deletion has started.")

        # Kill daemon
        disruption.kill_daemon()

        pods_deleted = pod_bulk_delete.result()
        assert pods_deleted, "Deletion of pods failed."

        # Verify pods are deleted
        for pod_obj in pods_to_delete:
            pod_obj.ocp.wait_for_delete(pod_obj.name, 300)
        log.info("Verified: Pods are deleted.")

        # Verify that the mount point is removed from nodes after deleting pod
        node_pv_mounted = verify_pv_mounted_on_node(node_pv_dict)
        for node, pvs in node_pv_mounted.items():
            assert not pvs, (
                f"PVs {pvs} is still present on node {node} after "
                f"deleting the pods."
            )
        log.info(
            "Verified: mount points are removed from nodes after deleting " "the pods"
        )

        pvcs_deleted = pvc_bulk_delete.result()
        assert pvcs_deleted, "Deletion of PVCs failed."

        # Verify PVCs are deleted
        for pvc_obj in pvcs_to_delete:
            pvc_obj.ocp.wait_for_delete(pvc_obj.name)
        log.info("Verified: PVCs are deleted.")

        # Verify PVs are deleted
        for pv_obj in pv_objs:
            pv_obj.ocp.wait_for_delete(resource_name=pv_obj.name, timeout=300)
        log.info("Verified: PVs are deleted.")

        # Verify PV using ceph toolbox. Image/Subvolume should be deleted.
        pool_name = default_ceph_block_pool()
        for pvc_name, uuid in pvc_uuid_map.items():
            if interface == constants.CEPHBLOCKPOOL:
                ret = verify_volume_deleted_in_backend(
                    interface=interface, image_uuid=uuid, pool_name=pool_name
                )
            if interface == constants.CEPHFILESYSTEM:
                ret = verify_volume_deleted_in_backend(
                    interface=interface, image_uuid=uuid
                )
            assert ret, (
                f"Volume associated with PVC {pvc_name} still exists " f"in backend"
            )

        log.info("Fetching IO results from the pods.")
        for pod_obj in io_pods:
            fio_result = pod_obj.get_fio_results()
            err_count = fio_result.get("jobs")[0].get("error")
            assert (
                err_count == 0
            ), f"FIO error on pod {pod_obj.name}. FIO result: {fio_result}"
        log.info("Verified IO result on pods.")

        # Verify number of pods of type 'resource_name'
        final_num_resource_name = len(pod_functions[resource_name]())
        assert final_num_resource_name == num_of_resource_pods, (
            f"Total number of {resource_name} pods is not matching with "
            f"initial value. Total number of pods before daemon kill: "
            f"{num_of_resource_pods}. Total number of pods present now: "
            f"{final_num_resource_name}"
        )

        # Check ceph status
        ceph_health_check(namespace=config.ENV_DATA["cluster_namespace"])
        log.info("Ceph cluster health is OK")
    def test_rwo_pvc_fencing_node_short_network_failure(
            self, nodes, setup, teardown):
        """
        OCS-1423/OCS-1428/OCS-1426:
        - Start DeploymentConfig based app pods on 1 OCS/Non-OCS node
        - Make the node (where app pods are running) unresponsive
            by bringing its main network interface down
        - Check new app pods and/or mon, osd pods scheduled on another node
            are stuck due to Multi-Attach error.
        - Reboot the unresponsive node
        - When unresponsive node recovers, run IOs on new app pods

        OCS-1424/OCS-1434:
        - Start DeploymentConfig based app pods on multiple nodes
            Colocated scenario: Select 1 node where osd and/or mon is running,
                select other 2 nodes where mon/osd are not running
            Dedicated scenario: 3 Non-OCS nodes
        - Disrupt the leader provisioner pods if not running on above selected
            nodes
        - Make the nodes (where app pods are running) unresponsive
            by bringing their main network interface down
        - Check new app pods and/or mon, osd pods scheduled on another node
            are stuck due to Multi-Attach error.
        - Reboot the unresponsive nodes
        - When unresponsive nodes recover, run IOs on new app pods

        """
        ceph_cluster, dc_pods, ceph_pods, app_pod_nodes, test_nodes, disruptor = setup

        # Run IO on pods
        md5sum_data = self.run_and_verify_io(pod_list=dc_pods,
                                             fio_filename="io_file1",
                                             run_io_in_bg=True)

        # OCS-1424/OCS-1434
        # Disrupt leader plugin-provisioner pods, skip if running on node to be failed
        if disruptor:
            [disruption.delete_resource() for disruption in disruptor]

        # Induce network failure on the nodes
        node.node_network_failure(app_pod_nodes)
        logger.info(f"Waiting for {self.short_nw_fail_time} seconds")
        sleep(self.short_nw_fail_time)

        # Wait for pods to be rescheduled
        for pod_obj in dc_pods + ceph_pods:
            pod_obj.ocp.wait_for_resource(
                condition=constants.STATUS_TERMINATING,
                resource_name=pod_obj.name,
                timeout=600,
                sleep=30,
            )

        # Fetch info of new pods and verify Multi-Attach error
        new_dc_pods = self.get_new_pods(dc_pods)
        assert len(new_dc_pods) == len(
            dc_pods), "Unexpected number of app pods"
        self.verify_multi_attach_error(new_dc_pods)

        if ceph_pods:
            new_ceph_pods = self.get_new_pods(ceph_pods)
            assert len(new_ceph_pods) > 0, "Unexpected number of osd pods"
            self.verify_multi_attach_error(new_ceph_pods)

        # Reboot the unresponsive node(s)
        logger.info(f"Rebooting the unresponsive node(s): {app_pod_nodes}")
        nodes.restart_nodes_by_stop_and_start(
            node.get_node_objs(app_pod_nodes))
        node.wait_for_nodes_status(node_names=app_pod_nodes,
                                   status=constants.NODE_READY)

        # Wait for new app pods to reach Running state
        for pod_obj in new_dc_pods:
            assert pod_obj.ocp.wait_for_resource(
                condition=constants.STATUS_RUNNING,
                resource_name=pod_obj.name,
                timeout=1200,
                sleep=30,
            ), f"App pod with name {pod_obj.name} did not reach Running state"

        # Wait for mon and osd pods to reach Running state
        selectors_to_check = {
            constants.MON_APP_LABEL: ceph_cluster.mon_count,
            constants.OSD_APP_LABEL: ceph_cluster.osd_count,
        }
        for selector, count in selectors_to_check.items():
            assert ceph_cluster.POD.wait_for_resource(
                condition=constants.STATUS_RUNNING,
                selector=selector,
                resource_count=count,
                timeout=1800,
                sleep=60,
            ), f"{count} expected pods with selector {selector} are not in Running state"

        assert ceph_health_check(), "Ceph cluster health is not OK"
        logger.info("Ceph cluster health is OK")

        # Verify data integrity from new pods
        for num, pod_obj in enumerate(new_dc_pods):
            assert pod.verify_data_integrity(pod_obj=pod_obj,
                                             file_name="io_file1",
                                             original_md5sum=md5sum_data[num]
                                             ), "Data integrity check failed"

        # Run IO on new pods
        self.run_and_verify_io(pod_list=new_dc_pods,
                               fio_filename="io_file2",
                               return_md5sum=False)
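verify_data_integrity above compares an md5sum captured before the network failure with one read back from the rescheduled pod. A local sketch of the same idea using hashlib (the temporary file stands in for io_file1 on the PVC):

import hashlib
import tempfile


def md5sum(path, chunk_size=65536):
    """Return the hex md5 digest of the file at 'path'."""
    digest = hashlib.md5()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()


if __name__ == "__main__":
    with tempfile.NamedTemporaryFile(delete=False) as f:
        f.write(b"io_file1 contents")
        path = f.name
    original_md5sum = md5sum(path)
    # ... node failure and pod rescheduling would happen here ...
    assert md5sum(path) == original_md5sum, "Data integrity check failed"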
Example #26
0
    def test_add_capacity_node_restart(
        self,
        nodes,
        multi_pvc_factory,
        pod_factory,
        workload_storageutilization_rbd,
        num_of_nodes,
    ):
        """
        Test add capacity when one of the worker nodes gets restarted in the middle of the process
        """
        logging.info(
            "Condition 1 to start the test is met: storageutilization is completed"
        )
        # Please note: when the branch 'wip-add-capacity-e_e' is merged into master,
        # the test will include much more data both before and after calling the 'add_capacity' function.

        node_list = get_ocs_nodes(num_of_nodes=num_of_nodes)
        assert node_list, "Condition 2 to start test failed: No node to restart"

        max_osds = 15
        osd_pods_before = pod_helpers.get_osd_pods()
        assert (
            len(osd_pods_before) < max_osds
        ), "Condition 3 to start test failed: We have maximum of osd's in the cluster"
        logging.info("All start conditions are met!")

        osd_size = storage_cluster.get_osd_size()
        logging.info("Calling add_capacity function...")
        result = storage_cluster.add_capacity(osd_size)
        if result:
            logging.info("add capacity finished successfully")
        else:
            logging.info("add capacity failed")

        # Restart nodes while additional storage is being added
        logging.info("Restart nodes:")
        logging.info([n.name for n in node_list])
        nodes.restart_nodes(nodes=node_list, wait=True, timeout=420)
        logging.info("Finished restarting the node list")

        # The exit criteria verification conditions here are not complete. When the branch
        # 'wip-add-capacity-e_e' is merged into master, the functions from that branch will be used.

        pod = OCP(kind=constants.POD,
                  namespace=config.ENV_DATA["cluster_namespace"])
        pod.wait_for_resource(
            timeout=600,
            condition=constants.STATUS_RUNNING,
            selector="app=rook-ceph-osd",
            resource_count=result * 3,
        )

        # Verify OSDs are encrypted
        if config.ENV_DATA.get("encryption_at_rest"):
            osd_encryption_verification()

        logging.info(
            "Finished verifying add capacity osd storage with node restart")
        logging.info("Waiting for ceph health check to finished...")
        ceph_health_check(namespace=config.ENV_DATA["cluster_namespace"],
                          tries=90)
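add_capacity above returns the updated count of storage device sets, and the test then waits for result * 3 OSD pods, on the assumption that each device set is replicated across three OSDs. A small arithmetic sketch under that same assumption (names are illustrative):

def expected_osd_pods(device_set_count, replicas_per_set=3):
    """OSD pods expected once every device set has its replicas running."""
    return device_set_count * replicas_per_set


if __name__ == "__main__":
    # e.g. add_capacity() reporting 2 device sets -> wait for 6 OSD pods
    assert expected_osd_pods(2) == 6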
    def test_rwo_pvc_fencing_node_prolonged_and_short_network_failure(
            self, nodes, setup, teardown):
        """
        OCS-1431/OCS-1436:
        - Start DeploymentConfig based app pods on 1 node
        - Make the node (where app pods are running) unresponsive
            by bringing its main network interface down
        - Disrupt the leader provisioner pods if not running on above selected
            node
        - Check new app pods and/or mon, osd pods scheduled on another node
            are stuck due to Multi-Attach error.
        - Power off the unresponsive node
        - Force delete the app pods and/or mon,osd pods on the unresponsive node
        - Check new app pods and/or mon, osd pods scheduled on another node comes
            into Running state
        - Run IOs on new app pods
        - Again make the node (where app pods are running) unresponsive
            by bringing its main network interface down
        - Check new app pods scheduled on another node are stuck due to
            Multi-Attach error.
        - Reboot the unresponsive node
        - When unresponsive node recovers, run IOs on new app pods

        """
        ceph_cluster, dc_pods, ceph_pods, app_pod_nodes, test_nodes, disruptor = setup

        extra_nodes = list(set(test_nodes) - set(app_pod_nodes))
        helpers.remove_label_from_worker_node(node_list=extra_nodes[:-1],
                                              label_key="nodetype")

        # Run IO on pods
        md5sum_data = self.run_and_verify_io(pod_list=dc_pods,
                                             fio_filename="io_file1",
                                             run_io_in_bg=True)

        # Disrupt leader plugin-provisioner pods, skip if running on node to be failed
        if disruptor:
            [disruption.delete_resource() for disruption in disruptor]

        # Induce network failure on the nodes
        node.node_network_failure(app_pod_nodes)
        logger.info(f"Waiting for {self.prolong_nw_fail_time} seconds")
        sleep(self.prolong_nw_fail_time)

        # Wait for pods to be rescheduled
        for pod_obj in dc_pods + ceph_pods:
            pod_obj.ocp.wait_for_resource(
                condition=constants.STATUS_TERMINATING,
                resource_name=pod_obj.name)

        # Fetch info of new pods and verify Multi-Attach error
        new_dc_pods = self.get_new_pods(dc_pods)
        assert len(new_dc_pods) == len(
            dc_pods), "Unexpected number of app pods"
        self.verify_multi_attach_error(new_dc_pods)

        new_ceph_pods = []
        if ceph_pods:
            new_ceph_pods = self.get_new_pods(ceph_pods)
            assert len(new_ceph_pods) > 0, "Unexpected number of osd pods"
            self.verify_multi_attach_error(new_ceph_pods)

        logger.info("Executing manual recovery steps")
        # Power off the unresponsive node
        logger.info(f"Powering off the unresponsive node: {app_pod_nodes}")
        nodes.stop_nodes(node.get_node_objs(app_pod_nodes))

        # Force delete the app pods and/or mon,osd pods on the unresponsive node
        for pod_obj in dc_pods + ceph_pods:
            pod_obj.delete(force=True)

        # Wait for new app pods to reach Running state
        for pod_obj in new_dc_pods:
            assert pod_obj.ocp.wait_for_resource(
                condition=constants.STATUS_RUNNING,
                resource_name=pod_obj.name,
                timeout=1200,
                sleep=30,
            ), f"App pod with name {pod_obj.name} did not reach Running state"

        # Wait for mon and osd pods to reach Running state
        selectors_to_check = [constants.MON_APP_LABEL, constants.OSD_APP_LABEL]
        for selector in selectors_to_check:
            assert ceph_cluster.POD.wait_for_resource(
                condition=constants.STATUS_RUNNING,
                selector=selector,
                resource_count=3,
                timeout=1800,
                sleep=60,
            ), f"3 expected pods with selector {selector} are not in Running state"

        if ceph_cluster.mon_count == 3:
            # Check ceph health
            toolbox_status = ceph_cluster.POD.get_resource_status(
                ceph_cluster.toolbox.name)
            if toolbox_status == constants.STATUS_TERMINATING:
                ceph_cluster.toolbox.delete(force=True)

            assert ceph_health_check(), "Ceph cluster health is not OK"
            logger.info("Ceph cluster health is OK")

        # Verify data integrity from new pods
        for num, pod_obj in enumerate(new_dc_pods):
            pod.verify_data_integrity(pod_obj=pod_obj,
                                      file_name="io_file1",
                                      original_md5sum=md5sum_data[num])

        # Run IO on new pods
        md5sum_data2 = self.run_and_verify_io(pod_list=new_dc_pods,
                                              fio_filename="io_file2",
                                              run_io_in_bg=True)

        helpers.label_worker_node(node_list=extra_nodes[:-1],
                                  label_key="nodetype",
                                  label_value="app-pod")

        # Induce network failure on the node
        node.node_network_failure(extra_nodes[-1])
        logger.info(f"Waiting for {self.short_nw_fail_time} seconds")
        sleep(self.short_nw_fail_time)

        # Wait for pods to be rescheduled
        for pod_obj in new_dc_pods:
            pod_obj.ocp.wait_for_resource(
                condition=constants.STATUS_TERMINATING,
                resource_name=pod_obj.name,
                timeout=600,
                sleep=30,
            )

        # Fetch info of new pods and verify Multi-Attach error
        new_dc_pods2 = self.get_new_pods(new_dc_pods)
        assert len(new_dc_pods2) == len(
            new_dc_pods), "Unexpected number of app pods"
        self.verify_multi_attach_error(new_dc_pods2)

        # Reboot the unresponsive node
        logger.info(f"Rebooting the unresponsive node: {extra_nodes[-1]}")
        nodes.restart_nodes_by_stop_and_start(
            node.get_node_objs([extra_nodes[-1]]))
        node.wait_for_nodes_status(node_names=[extra_nodes[-1]],
                                   status=constants.NODE_READY)

        # Wait for new app pods to reach Running state
        for pod_obj in new_dc_pods2:
            assert pod_obj.ocp.wait_for_resource(
                condition=constants.STATUS_RUNNING,
                resource_name=pod_obj.name,
                timeout=1200,
                sleep=30,
            ), f"App pod with name {pod_obj.name} did not reach Running state"

        # Wait for mon and osd pods to reach Running state
        for selector in selectors_to_check:
            assert ceph_cluster.POD.wait_for_resource(
                condition=constants.STATUS_RUNNING,
                selector=selector,
                resource_count=3,
                timeout=1800,
                sleep=60,
            ), f"3 expected pods with selector {selector} are not in Running state"

        if ceph_cluster.mon_count == 3:
            # Check ceph health
            assert ceph_health_check(), "Ceph cluster health is not OK"
            logger.info("Ceph cluster health is OK")

        # Verify data integrity from new pods
        for num, pod_obj in enumerate(new_dc_pods2):
            pod.verify_data_integrity(pod_obj=pod_obj,
                                      file_name="io_file2",
                                      original_md5sum=md5sum_data2[num])

        for num, pod_obj in enumerate(new_dc_pods2):
            pod.verify_data_integrity(pod_obj=pod_obj,
                                      file_name="io_file1",
                                      original_md5sum=md5sum_data[num])

        # Run IO on new pods
        self.run_and_verify_io(pod_list=new_dc_pods2,
                               fio_filename="io_file3",
                               return_md5sum=False)
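The manual recovery path above force-deletes pods left behind on the powered-off node. A hedged sketch of the equivalent CLI call driven through subprocess; the pod name and namespace are placeholders, and --force --grace-period=0 is the standard way to skip graceful termination for a pod whose node is unreachable:

import shlex
import subprocess


def force_delete_pod(pod_name, namespace, dry_run=True):
    """Force-delete a pod stuck in Terminating because its node is down."""
    cmd = f"oc delete pod {pod_name} -n {namespace} --force --grace-period=0"
    if dry_run:
        # Print instead of executing so the sketch stays side-effect free.
        print(cmd)
        return 0
    return subprocess.run(shlex.split(cmd), check=True).returncode


if __name__ == "__main__":
    force_delete_pod("example-app-1-abcde", "openshift-storage")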
Example #28
0
    def test_rgw_host_node_failure(self, nodes, node_restart_teardown,
                                   node_drain_teardown, mcg_obj,
                                   bucket_factory):
        """
        Test case to fail the node where the RGW and NooBaa DB pods are hosted
        and verify that the new pods spin up on a healthy node

        """

        # Get NooBaa pods
        noobaa_pod_obj = get_noobaa_pods()

        # Get the node where the noobaa-db pod is hosted
        noobaa_pod_node = None
        for noobaa_pod in noobaa_pod_obj:
            if noobaa_pod.name in [
                    constants.NB_DB_NAME_46_AND_BELOW,
                    constants.NB_DB_NAME_47_AND_ABOVE,
            ]:
                noobaa_pod_node = get_pod_node(noobaa_pod)
        assert noobaa_pod_node is not None, "Could not find the NooBaa DB pod"

        # Validate whether the RGW pod and noobaa-db are hosted on the same node.
        # If not, make sure both pods are hosted on the same node
        log.info("Validate if RGW pod and noobaa-db are hosted on same node")
        rgw_pod_obj = get_rgw_pods()
        rgw_pod_node_list = [
            rgw_pod.get().get("spec").get("nodeName")
            for rgw_pod in rgw_pod_obj
        ]
        if not list(
                set(rgw_pod_node_list).intersection(
                    noobaa_pod_node.name.split())):
            log.info("Unschedule other two nodes such that RGW "
                     "pod moves to node where NooBaa DB pod hosted")
            worker_node_list = get_worker_nodes()
            node_names = list(
                set(worker_node_list) - set(noobaa_pod_node.name.split()))
            unschedule_nodes(node_names=node_names)
            ocp_obj = OCP(kind=constants.POD,
                          namespace=defaults.ROOK_CLUSTER_NAMESPACE)
            rgw_pod_obj[0].delete()
            ocp_obj.wait_for_resource(
                condition=constants.STATUS_RUNNING,
                resource_count=len(rgw_pod_obj),
                selector=constants.RGW_APP_LABEL,
                timeout=300,
                sleep=5,
            )
            log.info("Schedule those nodes again")
            schedule_nodes(node_names=node_names)

            # Check the ceph health OK
            ceph_health_check(tries=90, delay=15)

            # Verify all storage pods are running
            wait_for_storage_pods()

            # Check again that the RGW pod moved to the node where the NooBaa DB pod is hosted
            rgw_pod_obj_list = get_rgw_pods()
            rgw_pod_node_list = [
                get_pod_node(rgw_pod_obj) for rgw_pod_obj in rgw_pod_obj_list
            ]
            value = [
                rgw_pod_node.name == noobaa_pod_node.name
                for rgw_pod_node in rgw_pod_node_list
            ]
            assert any(value), (
                "RGW pod didn't move to the node where the NooBaa DB pod is "
                "hosted even after cordoning and uncordoning nodes. "
                f"RGW pods hosted on: {[n.name for n in rgw_pod_node_list]}. "
                f"NooBaa DB pod hosted on: {noobaa_pod_node.name}")

        log.info(
            "RGW and noobaa-db are hosted on the same node; starting the test execution"
        )
        rgw_pod_obj = get_rgw_pods()
        for rgw_pod in rgw_pod_obj:
            pod_node = rgw_pod.get().get("spec").get("nodeName")
            if pod_node == noobaa_pod_node.name:
                # Stop the node
                log.info(f"Stopping node {pod_node} where"
                         f" rgw pod {rgw_pod.name} and NooBaa DB are hosted")
                node_obj = get_node_objs(node_names=[pod_node])
                nodes.stop_nodes(node_obj)

                # Validate the old RGW pod went into Terminating state
                wait_for_resource_state(resource=rgw_pod,
                                        state=constants.STATUS_TERMINATING,
                                        timeout=720)

                # Validate that a new RGW pod spun up
                ocp_obj = OCP(kind=constants.POD,
                              namespace=defaults.ROOK_CLUSTER_NAMESPACE)
                ocp_obj.wait_for_resource(
                    condition=constants.STATUS_RUNNING,
                    resource_count=len(rgw_pod_obj),
                    selector=constants.RGW_APP_LABEL,
                )

                # Start the node
                nodes.start_nodes(node_obj)

                # Check the ceph health OK
                ceph_health_check(tries=90, delay=15)

                # Verify all storage pods are running
                wait_for_storage_pods()

                # Create an OBC and perform read and write
                self.create_obc_creation(bucket_factory, mcg_obj,
                                         "Object-key-2")

        # Verify cluster health
        self.sanity_helpers.health_check()
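unschedule_nodes / schedule_nodes above cordon and uncordon worker nodes so that the respun RGW pod lands on the node hosting the NooBaa DB. A minimal sketch of that step using the standard oc adm cordon/uncordon commands; the node names are placeholders and the sketch only prints the commands rather than running them:

def cordon_commands(node_names, uncordon=False):
    """Build 'oc adm cordon/uncordon' commands for the given nodes."""
    verb = "uncordon" if uncordon else "cordon"
    return [f"oc adm {verb} {name}" for name in node_names]


if __name__ == "__main__":
    workers = ["worker-0", "worker-1"]
    for cmd in cordon_commands(workers):
        print(cmd)  # mark the nodes unschedulable
    for cmd in cordon_commands(workers, uncordon=True):
        print(cmd)  # allow scheduling on them again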
    def test_ceph_daemon_kill_during_resource_creation(self, interface,
                                                       operation_to_disrupt,
                                                       resource_to_delete,
                                                       multi_pvc_factory,
                                                       pod_factory):
        """
        Base function for ceph daemon kill disruptive tests.
        Deletion of 'resource_to_delete' daemon will be introduced while
        'operation_to_disrupt' is progressing.
        """
        disruption = disruption_helpers.Disruptions()
        pod_functions = {
            'mds': partial(pod.get_mds_pods),
            'mon': partial(pod.get_mon_pods),
            'mgr': partial(pod.get_mgr_pods),
            'osd': partial(pod.get_osd_pods),
            'rbdplugin': partial(pod.get_plugin_pods, interface=interface),
            'cephfsplugin': partial(pod.get_plugin_pods, interface=interface),
            'cephfsplugin_provisioner': partial(
                pod.get_cephfsplugin_provisioner_pods
            ),
            'rbdplugin_provisioner': partial(
                pod.get_rbdfsplugin_provisioner_pods
            ),
            'operator': partial(pod.get_operator_pods),
        }

        # Get number of pods of type 'resource_to_delete'
        num_of_resource_to_delete = len(pod_functions[resource_to_delete]())

        num_of_pvc = 12
        namespace = self.proj_obj.namespace

        # Fetch the number of Pods and PVCs
        initial_num_of_pods = len(pod.get_all_pods(namespace=namespace))
        initial_num_of_pvc = len(get_all_pvcs(namespace=namespace)['items'])

        executor = ThreadPoolExecutor(max_workers=(2 * num_of_pvc))

        disruption.set_resource(resource=resource_to_delete)
        disruption.select_daemon()

        access_modes = [constants.ACCESS_MODE_RWO]
        if interface == constants.CEPHFILESYSTEM:
            access_modes.append(constants.ACCESS_MODE_RWX)

        # Modify access_modes list to create rbd `block` type volume with
        # RWX access mode. RWX is not supported in non-block type rbd
        if interface == constants.CEPHBLOCKPOOL:
            access_modes.extend([
                f'{constants.ACCESS_MODE_RWO}-Block',
                f'{constants.ACCESS_MODE_RWX}-Block'
            ])

        # Start creation of PVCs
        bulk_pvc_create = executor.submit(
            multi_pvc_factory,
            interface=interface,
            project=self.proj_obj,
            storageclass=self.sc_obj,
            size=8,
            access_modes=access_modes,
            access_modes_selection='distribute_random',
            status=constants.STATUS_BOUND,
            num_of_pvc=num_of_pvc,
            wait_each=False)

        if operation_to_disrupt == 'create_pvc':
            # Ensure PVCs are being created before deleting the resource
            ret = self.verify_resource_creation(get_all_pvcs,
                                                initial_num_of_pvc, namespace)
            assert ret, "Wait timeout: PVCs are not being created."
            log.info("PVCs creation has started.")
            disruption.kill_daemon()

        pvc_objs = bulk_pvc_create.result()

        # Confirm that PVCs are Bound
        for pvc_obj in pvc_objs:
            helpers.wait_for_resource_state(resource=pvc_obj,
                                            state=constants.STATUS_BOUND,
                                            timeout=120)
            pvc_obj.reload()
        log.info("Verified: PVCs are Bound.")

        # Start creating pods
        bulk_pod_create = executor.submit(self.pods_creation, pvc_objs,
                                          pod_factory, interface)

        if operation_to_disrupt == 'create_pod':
            # Ensure that pods are being created before deleting the resource
            ret = self.verify_resource_creation(pod.get_all_pods,
                                                initial_num_of_pods, namespace)
            assert ret, "Wait timeout: Pods are not being created."
            log.info(f"Pods creation has started.")
            disruption.kill_daemon()

        pod_objs = bulk_pod_create.result()

        # Verify pods are Running
        for pod_obj in pod_objs:
            helpers.wait_for_resource_state(resource=pod_obj,
                                            state=constants.STATUS_RUNNING)
            pod_obj.reload()
        log.info("Verified: All pods are Running.")

        # Do setup on pods for running IO
        log.info("Setting up pods for running IO.")
        for pod_obj in pod_objs:
            pvc_info = pod_obj.pvc.get()
            if pvc_info['spec']['volumeMode'] == 'Block':
                storage_type = 'block'
            else:
                storage_type = 'fs'
            executor.submit(pod_obj.workload_setup, storage_type=storage_type)

        # Wait for setup on pods to complete
        for pod_obj in pod_objs:
            for sample in TimeoutSampler(180, 2, getattr, pod_obj,
                                         'wl_setup_done'):
                if sample:
                    log.info(f"Setup for running IO is completed on pod "
                             f"{pod_obj.name}.")
                    break
        log.info("Setup for running IO is completed on all pods.")

        # Start IO on each pod
        for pod_obj in pod_objs:
            pvc_info = pod_obj.pvc.get()
            if pvc_info['spec']['volumeMode'] == 'Block':
                storage_type = 'block'
            else:
                storage_type = 'fs'
            pod_obj.run_io(storage_type=storage_type,
                           size='2G',
                           runtime=30,
                           fio_filename=f'{pod_obj.name}_io_file1')
        log.info("FIO started on all pods.")

        if operation_to_disrupt == 'run_io':
            disruption.kill_daemon()

        log.info("Fetching FIO results.")
        for pod_obj in pod_objs:
            fio_result = pod_obj.get_fio_results()
            err_count = fio_result.get('jobs')[0].get('error')
            assert err_count == 0, (
                f"FIO error on pod {pod_obj.name}. FIO result: {fio_result}")
        log.info("Verified FIO result on pods.")

        # Delete pods
        for pod_obj in pod_objs:
            pod_obj.delete(wait=True)
        for pod_obj in pod_objs:
            pod_obj.ocp.wait_for_delete(pod_obj.name)

        # Verify that PVCs are reusable by creating new pods
        create_pods = executor.submit(self.pods_creation, pvc_objs,
                                      pod_factory, interface)
        pod_objs = create_pods.result()

        # Verify new pods are Running
        for pod_obj in pod_objs:
            helpers.wait_for_resource_state(resource=pod_obj,
                                            state=constants.STATUS_RUNNING)
            pod_obj.reload()
        log.info("Verified: All new pods are Running.")

        # Run IO on each of the new pods
        for pod_obj in pod_objs:
            pvc_info = pod_obj.pvc.get()
            if pvc_info['spec']['volumeMode'] == 'Block':
                storage_type = 'block'
            else:
                storage_type = 'fs'
            pod_obj.run_io(storage_type=storage_type,
                           size='1G',
                           runtime=10,
                           fio_filename=f'{pod_obj.name}_io_file2')

        log.info("Fetching FIO results from new pods")
        for pod_obj in pod_objs:
            fio_result = pod_obj.get_fio_results()
            err_count = fio_result.get('jobs')[0].get('error')
            assert err_count == 0, (
                f"FIO error on pod {pod_obj.name}. FIO result: {fio_result}")
        log.info("Verified FIO result on new pods.")

        # Verify number of pods of type 'resource_to_delete'
        final_num_resource_to_delete = len(pod_functions[resource_to_delete]())
        assert final_num_resource_to_delete == num_of_resource_to_delete, (
            f"Total number of {resource_to_delete} pods does not match the "
            f"initial value. Number of pods before deleting a pod: "
            f"{num_of_resource_to_delete}. Number of pods present now: "
            f"{final_num_resource_to_delete}")

        # Check ceph status
        ceph_health_check(namespace=config.ENV_DATA['cluster_namespace'])
        log.info("Ceph cluster health is OK")
Example #30
0
    def teardown(self):
        """
        Cleaning up the environment :
            Delete all snapshot
            Delete the POD
            Delete the PVC and the PV
            Delete the StorageClass
            Delete the VolumeSnapshotClass
            Delete the data pool
            Switch to the default namespace
            Delete the tested namespace

        """
        log.info("Cleanup the test environment")

        # Getting the name of the PVC's backing PV
        pv = None
        try:
            pv = self.pvc_obj.get("spec")["spec"]["volumeName"]
        except KeyError:
            log.error(
                f"Cannot find key in the PVC object {json.dumps(self.pvc_obj.get('spec').get('spec'), indent=3)}"
            )

        # Getting the list of all snapshots
        try:
            snapshot_list = self.snapshot.get(all_namespaces=True)["items"]
        except Exception as err:
            log.error(f"Cannot get the list of snapshots : {err}")
            snapshot_list = []

        # Deleting all snapshots from the cluster
        log.info(f"Trying to delete all ({len(snapshot_list)}) Snapshots")
        log.debug(
            f"The list of all snapshots is : {json.dumps(snapshot_list, indent=3)}"
        )
        for vs in snapshot_list:
            snap_name = vs["metadata"]["name"]
            log.info(f"Try to delete {snap_name}")
            try:
                self.snapshot.delete(resource_name=snap_name)
            except Exception as err:
                log.error(f"Cannot delete {snap_name} : {err}")

        # Deleting the pod which wrote data to the pvc
        log.info(f"Deleting the test POD : {self.pod_obj.name}")
        try:
            self.pod_obj.delete()
            log.info("Wait until the pod is deleted.")
            self.pod_obj.ocp.wait_for_delete(resource_name=self.pod_obj.name)
        except Exception as ex:
            log.error(f"Cannot delete the test pod : {ex}")

        # Deleting the PVC which was used in the test
        log.info(f"Delete the PVC : {self.pvc_obj.name}")
        try:
            self.pvc_obj.delete()
            log.info("Wait until the pvc is deleted.")
            self.pvc_obj.ocp.wait_for_delete(resource_name=self.pvc_obj.name)
        except Exception as ex:
            log.error(f"Cannot delete the test pvc : {ex}")

        # Delete the backing PV of the PVC
        if pv is not None:
            log.info(f"Trying to delete the backing PV : {pv}")
            try:
                run_oc_command(f"delete pv {pv}")
            except Exception as ex:
                log.error(f"Cannot delete PV {pv} - [{ex}]")

        # Deleting the StorageClass used in the test
        log.info(f"Deleting the test StorageClass : {self.sc_obj.name}")
        try:
            self.sc_obj.delete()
            log.info("Wait until the SC is deleted.")
            self.sc_obj.ocp.wait_for_delete(resource_name=self.sc_obj.name)
        except Exception as ex:
            log.error(f"Can not delete the test sc : {ex}")

        # Deleting the VolumeSnapshotClass used in the test
        log.info(f"Deleting the test Snapshot Class : {self.snap_class.name}")
        try:
            self.snap_class.delete()
            log.info("Wait until the VSC is deleted.")
            self.snap_class.ocp.wait_for_delete(
                resource_name=self.snap_class.name)
        except Exception as ex:
            log.error(f"Can not delete the test vsc : {ex}")

        # Deleting the Data pool
        log.info(f"Deleting the test storage pool : {self.sc_name}")
        self.delete_ceph_pool(self.sc_name)
        # Verify deletion by checking the backend CEPH pools using the toolbox
        results = self.ceph_cluster.toolbox.exec_cmd_on_pod("ceph osd pool ls")
        log.debug(f"Existing pools are : {results}")
        if self.sc_name in results.split():
            log.warning(
                "The pool was not deleted by CSI, deleting it manually")
            self.ceph_cluster.toolbox.exec_cmd_on_pod(
                f"ceph osd pool delete {self.sc_name} {self.sc_name} "
                "--yes-i-really-really-mean-it")
        else:
            log.info(f"The pool {self.sc_name} was deleted successfully")

        # Deleting the namespace used by the test
        log.info(f"Deleting the test namespace : {self.nss_name}")
        switch_to_default_rook_cluster_project()
        try:
            self.proj.delete(resource_name=self.nss_name)
            self.proj.wait_for_delete(resource_name=self.nss_name,
                                      timeout=60,
                                      sleep=10)
        except CommandFailed:
            log.error(f"Can not delete project {self.nss_name}")
            raise CommandFailed(f"{self.nss_name} was not created")

        # After deleting all data from the cluster, wait until it re-balances
        ceph_health_check(namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
                          tries=30,
                          delay=60)

        super(TestPvcMultiSnapshotPerformance, self).teardown()
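
# Minimal sketch (not from the example above): the teardown repeats the same
# delete -> wait_for_delete -> log-on-failure pattern for the pod, PVC,
# StorageClass and VolumeSnapshotClass. A hypothetical helper such as the one
# below could express that pattern once; it only assumes the resource objects
# expose .name, .delete() and .ocp.wait_for_delete() as they do in the example.
def delete_and_wait(resource_obj, kind):
    """Delete a test resource object and wait until it is gone."""
    log.info(f"Deleting the test {kind} : {resource_obj.name}")
    try:
        resource_obj.delete()
        log.info(f"Wait until the {kind} is deleted.")
        resource_obj.ocp.wait_for_delete(resource_name=resource_obj.name)
    except Exception as err:
        log.error(f"Cannot delete the test {kind} : {err}")

# Usage (hypothetical), condensing the per-resource blocks above:
# delete_and_wait(self.pod_obj, "POD")
# delete_and_wait(self.pvc_obj, "PVC")
# delete_and_wait(self.sc_obj, "StorageClass")
# delete_and_wait(self.snap_class, "VolumeSnapshotClass")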