Example #1
    def _prepare_upi_rhel_node(self, node_conf):
        """
        Handle RHEL worker instance creation
        1. Create RHEL worker instance, copy required AWS tags from existing
        worker instances to new RHEL instance
        2. Copy IAM role from existing worker to new RHEL worker

        """
        cluster_id = get_infra_id(self.cluster_path)
        node_id = node_conf['node_id']
        if not node_conf.get('zone'):
            num_zone = get_az_count()
            # randint is inclusive on both ends; use num_zone - 1 so the
            # chosen zone index stays within the existing availability zones
            zone = random.randint(0, num_zone - 1)
        else:
            zone = node_conf.get('zone')
        logger.info("Creating RHEL worker node")
        self.gather_worker_data(f'no{zone}')
        response = self.client.run_instances(
            BlockDeviceMappings=[
                {
                    'DeviceName': node_conf['root_disk'],
                    'Ebs': {
                        'DeleteOnTermination': True,
                        'VolumeSize': node_conf['root_disk_size'],
                        'VolumeType': 'gp2'
                    },
                },
            ],
            ImageId=node_conf['rhel_worker_ami'],
            SubnetId=self.worker_subnet,
            InstanceType=node_conf['rhel_worker_instance_type'],
            MaxCount=1,
            MinCount=1,
            Monitoring={
                'Enabled': False
            },
            SecurityGroupIds=[
                self.worker_security_group[0]['GroupId'],
            ],
            KeyName='openshift-dev'
        )
        inst_id = response['Instances'][0]['InstanceId']
        worker_ec2 = boto3.resource('ec2', region_name=self.region)
        worker_instance = worker_ec2.Instance(inst_id)
        worker_instance.wait_until_running()
        worker_name = f'{cluster_id}-rhel-worker-{node_id}'
        worker_ec2.create_tags(
            Resources=[inst_id],
            Tags=[
                {'Key': 'Name', 'Value': f'{worker_name}'},
                {'Key': self.worker_tag[0], 'Value': self.worker_tag[1]}
            ]
        )
        logger.info(self.worker_iam_role)
        self.client.associate_iam_instance_profile(
            IamInstanceProfile=self.worker_iam_role,
            InstanceId=inst_id,
        )
        return worker_instance
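
A minimal usage sketch (hypothetical values; it assumes `aws_upi` is an
instance of the deployment class that defines this method and that the ocs-ci
framework config is already initialized):

    node_conf = {
        'node_id': 0,
        'zone': 1,  # omit to let the method pick a random availability zone
        'root_disk': '/dev/sda1',
        'root_disk_size': 120,
        'rhel_worker_ami': 'ami-0123456789abcdef0',
        'rhel_worker_instance_type': 'm4.xlarge',
    }
    rhel_instance = aws_upi._prepare_upi_rhel_node(node_conf)
    logger.info(f"Created RHEL worker instance {rhel_instance.id}")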
Example #2
def ocs_install_verification(timeout=600, skip_osd_distribution_check=False):
    """
    Perform steps necessary to verify a successful OCS installation

    Args:
        timeout (int): Number of seconds for timeout which will be used in the
            checks used in this function.
        skip_osd_distribution_check (bool): If true skip the check for osd
            distribution.

    """
    from ocs_ci.ocs.node import get_typed_nodes
    from ocs_ci.ocs.resources.pvc import get_deviceset_pvcs
    from ocs_ci.ocs.resources.pod import get_ceph_tools_pod, get_all_pods
    number_of_worker_nodes = len(get_typed_nodes())
    namespace = config.ENV_DATA['cluster_namespace']
    log.info("Verifying OCS installation")

    # Verify OCS CSV is in Succeeded phase
    log.info("verifying ocs csv")
    operator_selector = get_selector_for_ocs_operator()
    ocs_package_manifest = PackageManifest(
        resource_name=defaults.OCS_OPERATOR_NAME,
        selector=operator_selector,
    )
    ocs_csv_name = ocs_package_manifest.get_current_csv()
    ocs_csv = CSV(resource_name=ocs_csv_name, namespace=namespace)
    log.info(f"Check if OCS operator: {ocs_csv_name} is in Succeeded phase.")
    ocs_csv.wait_for_phase(phase="Succeeded", timeout=timeout)

    # Verify OCS Cluster Service (ocs-storagecluster) is Ready
    storage_cluster_name = config.ENV_DATA['storage_cluster_name']
    log.info("Verifying status of storage cluster: %s", storage_cluster_name)
    storage_cluster = StorageCluster(
        resource_name=storage_cluster_name,
        namespace=namespace,
    )
    log.info(f"Check if StorageCluster: {storage_cluster_name} is in"
             f"Succeeded phase")
    storage_cluster.wait_for_phase(phase='Ready', timeout=timeout)

    # Verify pods in running state and proper counts
    log.info("Verifying pod states and counts")
    pod = OCP(kind=constants.POD, namespace=namespace)
    # ocs-operator
    assert pod.wait_for_resource(condition=constants.STATUS_RUNNING,
                                 selector=constants.OCS_OPERATOR_LABEL,
                                 timeout=timeout)
    # rook-ceph-operator
    assert pod.wait_for_resource(condition=constants.STATUS_RUNNING,
                                 selector=constants.OPERATOR_LABEL,
                                 timeout=timeout)
    # noobaa
    assert pod.wait_for_resource(condition=constants.STATUS_RUNNING,
                                 selector=constants.NOOBAA_APP_LABEL,
                                 resource_count=2,
                                 timeout=timeout)
    # mons
    assert pod.wait_for_resource(condition=constants.STATUS_RUNNING,
                                 selector=constants.MON_APP_LABEL,
                                 resource_count=3,
                                 timeout=timeout)
    # csi-cephfsplugin
    assert pod.wait_for_resource(condition=constants.STATUS_RUNNING,
                                 selector=constants.CSI_CEPHFSPLUGIN_LABEL,
                                 resource_count=number_of_worker_nodes,
                                 timeout=timeout)
    # csi-cephfsplugin-provisioner
    assert pod.wait_for_resource(
        condition=constants.STATUS_RUNNING,
        selector=constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL,
        resource_count=2,
        timeout=timeout)
    # csi-rbdplugin
    assert pod.wait_for_resource(condition=constants.STATUS_RUNNING,
                                 selector=constants.CSI_RBDPLUGIN_LABEL,
                                 resource_count=number_of_worker_nodes,
                                 timeout=timeout)
    # csi-rbdplugin-provisioner
    assert pod.wait_for_resource(
        condition=constants.STATUS_RUNNING,
        selector=constants.CSI_RBDPLUGIN_PROVISIONER_LABEL,
        resource_count=2,
        timeout=timeout)
    # osds
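    # Expected OSD pod count is count * replica of the first storage device
    # set, e.g. count=1 and replica=3 gives 3 OSD pods.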
    osd_count = (
        int(storage_cluster.data['spec']['storageDeviceSets'][0]['count']) *
        int(storage_cluster.data['spec']['storageDeviceSets'][0]['replica']))
    assert pod.wait_for_resource(condition=constants.STATUS_RUNNING,
                                 selector=constants.OSD_APP_LABEL,
                                 resource_count=osd_count,
                                 timeout=timeout)
    # mgr
    assert pod.wait_for_resource(condition=constants.STATUS_RUNNING,
                                 selector=constants.MGR_APP_LABEL,
                                 timeout=timeout)
    # mds
    assert pod.wait_for_resource(condition=constants.STATUS_RUNNING,
                                 selector=constants.MDS_APP_LABEL,
                                 resource_count=2,
                                 timeout=timeout)

    # rgw check only for VMware (vSphere)
    if config.ENV_DATA.get('platform') == constants.VSPHERE_PLATFORM:
        assert pod.wait_for_resource(condition=constants.STATUS_RUNNING,
                                     selector=constants.RGW_APP_LABEL,
                                     resource_count=1,
                                     timeout=timeout)

    # Verify ceph health
    log.info("Verifying ceph health")
    assert utils.ceph_health_check(namespace=namespace)

    # Verify StorageClasses (1 ceph-fs, 1 ceph-rbd)
    log.info("Verifying storage classes")
    storage_class = OCP(kind=constants.STORAGECLASS, namespace=namespace)
    storage_cluster_name = config.ENV_DATA['storage_cluster_name']
    required_storage_classes = {
        f'{storage_cluster_name}-cephfs', f'{storage_cluster_name}-ceph-rbd'
    }
    storage_classes = storage_class.get()
    storage_class_names = {
        item['metadata']['name']
        for item in storage_classes['items']
    }
    assert required_storage_classes.issubset(storage_class_names)

    # Verify OSDs are distributed
    if not skip_osd_distribution_check:
        log.info("Verifying OSD's are distributed evenly across worker nodes")
        ocp_pod_obj = OCP(kind=constants.POD, namespace=namespace)
        osds = ocp_pod_obj.get(selector=constants.OSD_APP_LABEL)['items']
        node_names = [osd['spec']['nodeName'] for osd in osds]
        for node in node_names:
            assert not node_names.count(node) > 1, (
                "OSD's are not distributed evenly across worker nodes")

    # Verify that CSI driver object contains provisioner names
    log.info("Verifying CSI driver object contains provisioner names.")
    csi_driver = OCP(kind="CSIDriver")
    assert {defaults.CEPHFS_PROVISIONER, defaults.RBD_PROVISIONER} == ({
        item['metadata']['name']
        for item in csi_driver.get()['items']
    })

    # Verify node and provisioner secret names in storage class
    log.info("Verifying node and provisioner secret names in storage class.")
    sc_rbd = storage_class.get(
        resource_name=constants.DEFAULT_STORAGECLASS_RBD)
    sc_cephfs = storage_class.get(
        resource_name=constants.DEFAULT_STORAGECLASS_CEPHFS)
    assert sc_rbd['parameters'][
        'csi.storage.k8s.io/node-stage-secret-name'] == constants.RBD_NODE_SECRET
    assert sc_rbd['parameters'][
        'csi.storage.k8s.io/provisioner-secret-name'] == constants.RBD_PROVISIONER_SECRET
    assert sc_cephfs['parameters'][
        'csi.storage.k8s.io/node-stage-secret-name'] == constants.CEPHFS_NODE_SECRET
    assert sc_cephfs['parameters'][
        'csi.storage.k8s.io/provisioner-secret-name'] == constants.CEPHFS_PROVISIONER_SECRET
    log.info("Verified node and provisioner secret names in storage class.")

    # Verify ceph osd tree output
    log.info(
        "Verifying ceph osd tree output and checking for device set PVC names "
        "in the output.")
    deviceset_pvcs = [pvc.name for pvc in get_deviceset_pvcs()]
    ct_pod = get_ceph_tools_pod()
    osd_tree = ct_pod.exec_ceph_cmd(ceph_cmd='ceph osd tree', format='json')
    schemas = {
        'root': constants.OSD_TREE_ROOT,
        'rack': constants.OSD_TREE_RACK,
        'host': constants.OSD_TREE_HOST,
        'osd': constants.OSD_TREE_OSD,
        'region': constants.OSD_TREE_REGION,
        'zone': constants.OSD_TREE_ZONE
    }
    schemas['host']['properties']['name'] = {'enum': deviceset_pvcs}
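    # Each 'host' bucket in the osd tree output is expected to be named after
    # one of the device set PVCs, so restricting the schema's name enum to
    # deviceset_pvcs makes the validation below flag any unexpected host name.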
    for item in osd_tree['nodes']:
        validate(instance=item, schema=schemas[item['type']])
        if item['type'] == 'host':
            deviceset_pvcs.remove(item['name'])
    assert not deviceset_pvcs, (
        f"These device set PVCs are not given in ceph osd tree output "
        f"- {deviceset_pvcs}")
    log.info(
        "Verified ceph osd tree output. Device set PVC names are given in the "
        "output.")

    # TODO: Verify ceph osd tree output have osd listed as ssd
    # TODO: Verify ceph osd tree output have zone or rack based on AZ

    # Verify CSI snapshotter sidecar container is not present
    log.info("Verifying CSI snapshotter is not present.")
    provisioner_pods = get_all_pods(
        namespace=defaults.ROOK_CLUSTER_NAMESPACE,
        selector=[
            constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL,
            constants.CSI_RBDPLUGIN_PROVISIONER_LABEL
        ])
    for pod_obj in provisioner_pods:
        pod_info = pod_obj.get()
        for container, image in get_images(data=pod_info).items():
            assert ('snapshot' not in container) and (
                'snapshot' not in image), (
                    f"Snapshot container is present in {pod_obj.name} pod. "
                    f"Container {container}. Image {image}")
    assert {
        'name': 'CSI_ENABLE_SNAPSHOTTER',
        'value': 'false'
    } in (ocs_csv.get()['spec']['install']['spec']['deployments'][0]['spec']
          ['template']['spec']['containers'][0]['env']
          ), "CSI_ENABLE_SNAPSHOTTER value is not set to 'false'."
    log.info("Verified: CSI snapshotter is not present.")

    # Verify pool crush rule is with "type": "zone"
    if utils.get_az_count() == 3:
        log.info("Verifying pool crush rule is with type: zone")
        crush_dump = ct_pod.exec_ceph_cmd(ceph_cmd='ceph osd crush dump',
                                          format='')
        pool_names = [
            constants.METADATA_POOL, constants.DEFAULT_BLOCKPOOL,
            constants.DATA_POOL
        ]
        crush_rules = [
            rule for rule in crush_dump['rules']
            if rule['rule_name'] in pool_names
        ]
        for crush_rule in crush_rules:
            assert [
                item for item in crush_rule['steps']
                if item.get('type') == 'zone'
            ], f"{crush_rule['rule_name']} is not with type as zone"
        log.info("Verified - pool crush rule is with type: zone")
Example #3
def ocs_install_verification(
    timeout=600, skip_osd_distribution_check=False, ocs_registry_image=None,
    post_upgrade_verification=False,
):
    """
    Perform steps necessary to verify a successful OCS installation

    Args:
        timeout (int): Number of seconds for timeout which will be used in the
            checks used in this function.
        skip_osd_distribution_check (bool): If true skip the check for osd
            distribution.
        ocs_registry_image (str): Specific image to check if it was installed
            properly.
        post_upgrade_verification (bool): Set to True if this function is
            called after upgrade.

    """
    from ocs_ci.ocs.node import get_typed_nodes
    from ocs_ci.ocs.resources.pvc import get_deviceset_pvcs
    from ocs_ci.ocs.resources.pod import get_ceph_tools_pod, get_all_pods
    from ocs_ci.ocs.cluster import validate_cluster_on_pvc
    number_of_worker_nodes = len(get_typed_nodes())
    namespace = config.ENV_DATA['cluster_namespace']
    log.info("Verifying OCS installation")

    # Verify OCS CSV is in Succeeded phase
    log.info("verifying ocs csv")
    ocs_csv = get_ocs_csv()
    # Verify if OCS CSV has proper version.
    csv_version = ocs_csv.data['spec']['version']
    ocs_version = config.ENV_DATA['ocs_version']
    log.info(
        f"Check if OCS version: {ocs_version} matches with CSV: {csv_version}"
    )
    assert ocs_version in csv_version, (
        f"OCS version: {ocs_version} mismatch with CSV version {csv_version}"
    )
    # Verify if OCS CSV has the same version in provided CI build.
    ocs_registry_image = ocs_registry_image or config.DEPLOYMENT.get(
        'ocs_registry_image'
    )
    if ocs_registry_image and ocs_registry_image.endswith(".ci"):
        ocs_registry_image = ocs_registry_image.split(":")[1]
        log.info(
            f"Check if OCS registry image: {ocs_registry_image} matches with "
            f"CSV: {csv_version}"
        )
        ignore_csv_mismatch = config.DEPLOYMENT.get('ignore_csv_mismatch')
        if ignore_csv_mismatch:
            log.info(
                "The possible mismatch will be ignored as you deployed "
                "the different version than the default version from the CSV"
            )
        else:
            assert ocs_registry_image in csv_version, (
                f"OCS registry image version: {ocs_registry_image} mismatch "
                f"with CSV version {csv_version}"
            )

    # Verify OCS Cluster Service (ocs-storagecluster) is Ready
    storage_cluster_name = config.ENV_DATA['storage_cluster_name']
    log.info("Verifying status of storage cluster: %s", storage_cluster_name)
    storage_cluster = StorageCluster(
        resource_name=storage_cluster_name,
        namespace=namespace,
    )
    log.info(
        f"Check if StorageCluster: {storage_cluster_name} is in"
        f"Succeeded phase"
    )
    storage_cluster.wait_for_phase(phase='Ready', timeout=timeout)

    # Verify pods in running state and proper counts
    log.info("Verifying pod states and counts")
    pod = OCP(
        kind=constants.POD, namespace=namespace
    )
    osd_count = (
        int(storage_cluster.data['spec']['storageDeviceSets'][0]['count'])
        * int(storage_cluster.data['spec']['storageDeviceSets'][0]['replica'])
    )

    # check noobaa CR for min/max number of noobaa endpoint pods
    nb_obj = OCP(kind='noobaa', namespace=defaults.ROOK_CLUSTER_NAMESPACE)
    min_eps = nb_obj.get().get('items')[0].get('spec').get('endpoints').get('minCount')
    max_eps = nb_obj.get().get('items')[0].get('spec').get('endpoints').get('maxCount')

    resources_dict = {
        constants.OCS_OPERATOR_LABEL: 1,
        constants.OPERATOR_LABEL: 1,
        constants.NOOBAA_DB_LABEL: 1,
        constants.NOOBAA_OPERATOR_POD_LABEL: 1,
        constants.NOOBAA_CORE_POD_LABEL: 1,
        constants.MON_APP_LABEL: 3,
        constants.CSI_CEPHFSPLUGIN_LABEL: number_of_worker_nodes,
        constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL: 2,
        constants.CSI_RBDPLUGIN_LABEL: number_of_worker_nodes,
        constants.CSI_RBDPLUGIN_PROVISIONER_LABEL: 2,
        constants.OSD_APP_LABEL: osd_count,
        constants.MGR_APP_LABEL: 1,
        constants.MDS_APP_LABEL: 2,
        constants.NOOBAA_ENDPOINT_POD_LABEL: min_eps
    }
    if config.ENV_DATA.get('platform') in constants.ON_PREM_PLATFORMS:
        # Workaround for https://bugzilla.redhat.com/show_bug.cgi?id=1857802 - RGW count is 1
        # post upgrade to OCS 4.5. Tracked with
        # https://github.com/red-hat-storage/ocs-ci/issues/2532
        rgw_count = 2 if float(config.ENV_DATA['ocs_version']) >= 4.5 and not (
            post_upgrade_verification
        ) else 1
        resources_dict.update({constants.RGW_APP_LABEL: rgw_count})
    for label, count in resources_dict.items():
        assert pod.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            selector=label,
            resource_count=count,
            timeout=timeout
        )

    nb_ep_pods = get_pods_having_label(
        label=constants.NOOBAA_ENDPOINT_POD_LABEL, namespace=defaults.ROOK_CLUSTER_NAMESPACE
    )
    assert len(nb_ep_pods) <= max_eps, (
        f"The number of running NooBaa endpoint pods ({len(nb_ep_pods)}) "
        f"is greater than the maximum defined in the NooBaa CR ({max_eps})"
    )

    # Verify StorageClasses (1 ceph-fs, 1 ceph-rbd)
    log.info("Verifying storage classes")
    storage_class = OCP(
        kind=constants.STORAGECLASS, namespace=namespace
    )
    storage_cluster_name = config.ENV_DATA['storage_cluster_name']
    required_storage_classes = {
        f'{storage_cluster_name}-cephfs',
        f'{storage_cluster_name}-ceph-rbd'
    }
    storage_classes = storage_class.get()
    storage_class_names = {
        item['metadata']['name'] for item in storage_classes['items']
    }
    assert required_storage_classes.issubset(storage_class_names)

    # Verify OSDs are distributed
    if not skip_osd_distribution_check:
        log.info("Verifying OSDs are distributed evenly across worker nodes")
        ocp_pod_obj = OCP(kind=constants.POD, namespace=namespace)
        osds = ocp_pod_obj.get(selector=constants.OSD_APP_LABEL)['items']
        deviceset_count = get_deviceset_count()
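        # With multiple device sets a node may legitimately host up to one OSD
        # per device set, so the per-node limit below is the device set count.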
        node_names = [osd['spec']['nodeName'] for osd in osds]
        for node in node_names:
            assert not node_names.count(node) > deviceset_count, (
                "OSD's are not distributed evenly across worker nodes"
            )

    # Verify that CSI driver object contains provisioner names
    log.info("Verifying CSI driver object contains provisioner names.")
    csi_driver = OCP(kind="CSIDriver")
    assert {defaults.CEPHFS_PROVISIONER, defaults.RBD_PROVISIONER} == (
        {item['metadata']['name'] for item in csi_driver.get()['items']}
    )

    # Verify node and provisioner secret names in storage class
    log.info("Verifying node and provisioner secret names in storage class.")
    sc_rbd = storage_class.get(
        resource_name=constants.DEFAULT_STORAGECLASS_RBD
    )
    sc_cephfs = storage_class.get(
        resource_name=constants.DEFAULT_STORAGECLASS_CEPHFS
    )
    assert sc_rbd['parameters']['csi.storage.k8s.io/node-stage-secret-name'] == constants.RBD_NODE_SECRET
    assert sc_rbd['parameters']['csi.storage.k8s.io/provisioner-secret-name'] == constants.RBD_PROVISIONER_SECRET
    assert sc_cephfs['parameters']['csi.storage.k8s.io/node-stage-secret-name'] == constants.CEPHFS_NODE_SECRET
    assert sc_cephfs['parameters']['csi.storage.k8s.io/provisioner-secret-name'] == constants.CEPHFS_PROVISIONER_SECRET
    log.info("Verified node and provisioner secret names in storage class.")

    # Verify ceph osd tree output
    log.info(
        "Verifying ceph osd tree output and checking for device set PVC names "
        "in the output."
    )

    if (
        config.DEPLOYMENT.get('local_storage')
        and config.ENV_DATA['platform'] != constants.BAREMETALPSI_PLATFORM
    ):
        deviceset_pvcs = get_compute_node_names()
    else:
        deviceset_pvcs = [pvc.name for pvc in get_deviceset_pvcs()]

    ct_pod = get_ceph_tools_pod()
    osd_tree = ct_pod.exec_ceph_cmd(ceph_cmd='ceph osd tree', format='json')
    schemas = {
        'root': constants.OSD_TREE_ROOT,
        'rack': constants.OSD_TREE_RACK,
        'host': constants.OSD_TREE_HOST,
        'osd': constants.OSD_TREE_OSD,
        'region': constants.OSD_TREE_REGION,
        'zone': constants.OSD_TREE_ZONE
    }
    schemas['host']['properties']['name'] = {'enum': deviceset_pvcs}
    for item in osd_tree['nodes']:
        validate(instance=item, schema=schemas[item['type']])
        if item['type'] == 'host':
            deviceset_pvcs.remove(item['name'])
    assert not deviceset_pvcs, (
        f"These device set PVCs are not given in ceph osd tree output "
        f"- {deviceset_pvcs}"
    )
    log.info(
        "Verified ceph osd tree output. Device set PVC names are given in the "
        "output."
    )

    # TODO: Verify ceph osd tree output have osd listed as ssd
    # TODO: Verify ceph osd tree output have zone or rack based on AZ

    # Verify CSI snapshotter sidecar container is not present
    log.info("Verifying CSI snapshotter is not present.")
    provisioner_pods = get_all_pods(
        namespace=defaults.ROOK_CLUSTER_NAMESPACE,
        selector=[
            constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL,
            constants.CSI_RBDPLUGIN_PROVISIONER_LABEL
        ]
    )
    for pod_obj in provisioner_pods:
        pod_info = pod_obj.get()
        for container, image in get_images(data=pod_info).items():
            assert ('snapshot' not in container) and ('snapshot' not in image), (
                f"Snapshot container is present in {pod_obj.name} pod. "
                f"Container {container}. Image {image}"
            )
    deployments = ocs_csv.get()['spec']['install']['spec']['deployments']
    rook_ceph_operator_deployment = [
        deployment_val for deployment_val in deployments if deployment_val['name'] == 'rook-ceph-operator'
    ]
    assert {'name': 'CSI_ENABLE_SNAPSHOTTER', 'value': 'false'} in (
        rook_ceph_operator_deployment[0]['spec']['template']['spec']['containers'][0]['env']
    ), "CSI_ENABLE_SNAPSHOTTER value is not set to 'false'."
    log.info("Verified: CSI snapshotter is not present.")

    # Verify pool crush rule is with "type": "zone"
    if utils.get_az_count() == 3:
        log.info("Verifying pool crush rule is with type: zone")
        crush_dump = ct_pod.exec_ceph_cmd(
            ceph_cmd='ceph osd crush dump', format=''
        )
        pool_names = [
            constants.METADATA_POOL, constants.DEFAULT_BLOCKPOOL,
            constants.DATA_POOL
        ]
        crush_rules = [rule for rule in crush_dump['rules'] if rule['rule_name'] in pool_names]
        for crush_rule in crush_rules:
            assert [
                item for item in crush_rule['steps'] if item.get('type') == 'zone'
            ], f"{crush_rule['rule_name']} is not with type as zone"
        log.info("Verified - pool crush rule is with type: zone")
    log.info("Validate cluster on PVC")
    validate_cluster_on_pvc()

    # Verify ceph health
    log.info("Verifying ceph health")
    health_check_tries = 20
    health_check_delay = 30
    if post_upgrade_verification:
        # In case of upgrade with FIO we have to wait longer time to see
        # health OK. See discussion in BZ:
        # https://bugzilla.redhat.com/show_bug.cgi?id=1817727
        health_check_tries = 180
    assert utils.ceph_health_check(
        namespace, health_check_tries, health_check_delay
    )
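
A hedged sketch of calling this variant after an upgrade from a CI build (the
registry image tag below is a made-up example):

    ocs_install_verification(
        timeout=600,
        ocs_registry_image='quay.io/rhceph-dev/ocs-registry:4.5.0-123.ci',
        post_upgrade_verification=True,
    )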
Example #4
def ocs_install_verification(
    timeout=600,
    skip_osd_distribution_check=False,
    ocs_registry_image=None,
    post_upgrade_verification=False,
    version_before_upgrade=None,
):
    """
    Perform steps necessary to verify a successful OCS installation

    Args:
        timeout (int): Number of seconds for timeout which will be used in the
            checks used in this function.
        skip_osd_distribution_check (bool): If true skip the check for osd
            distribution.
        ocs_registry_image (str): Specific image to check if it was installed
            properly.
        post_upgrade_verification (bool): Set to True if this function is
            called after upgrade.
        version_before_upgrade (float): Set to OCS version before upgrade

    """
    from ocs_ci.ocs.node import get_nodes
    from ocs_ci.ocs.resources.pvc import get_deviceset_pvcs
    from ocs_ci.ocs.resources.pod import get_ceph_tools_pod, get_all_pods
    from ocs_ci.ocs.cluster import validate_cluster_on_pvc
    from ocs_ci.ocs.resources.fips import check_fips_enabled

    number_of_worker_nodes = len(get_nodes())
    namespace = config.ENV_DATA["cluster_namespace"]
    log.info("Verifying OCS installation")
    if config.ENV_DATA.get("disable_components"):
        for component in config.ENV_DATA["disable_components"]:
            config.COMPONENTS[f"disable_{component}"] = True
    disable_noobaa = config.COMPONENTS["disable_noobaa"]
    disable_rgw = config.COMPONENTS["disable_rgw"]
    disable_blockpools = config.COMPONENTS["disable_blockpools"]
    disable_cephfs = config.COMPONENTS["disable_cephfs"]

    # Verify OCS CSV is in Succeeded phase
    log.info("verifying ocs csv")
    ocs_csv = get_ocs_csv()
    # Verify if OCS CSV has proper version.
    csv_version = ocs_csv.data["spec"]["version"]
    ocs_version = version.get_semantic_ocs_version_from_config()
    log.info(
        f"Check if OCS version: {ocs_version} matches with CSV: {csv_version}")
    assert (
        f"{ocs_version}" in csv_version
    ), f"OCS version: {ocs_version} mismatch with CSV version {csv_version}"
    # Verify if OCS CSV has the same version in provided CI build.
    ocs_registry_image = ocs_registry_image or config.DEPLOYMENT.get(
        "ocs_registry_image")
    if ocs_registry_image and ocs_registry_image.endswith(".ci"):
        ocs_registry_image = ocs_registry_image.rsplit(":", 1)[1]
        log.info(
            f"Check if OCS registry image: {ocs_registry_image} matches with "
            f"CSV: {csv_version}")
        ignore_csv_mismatch = config.DEPLOYMENT.get("ignore_csv_mismatch")
        if ignore_csv_mismatch:
            log.info(
                "The possible mismatch will be ignored as you deployed "
                "the different version than the default version from the CSV")
        else:
            assert ocs_registry_image in csv_version, (
                f"OCS registry image version: {ocs_registry_image} mismatch "
                f"with CSV version {csv_version}")

    # Verify Storage System status
    if ocs_version >= version.VERSION_4_9:
        log.info("Verifying storage system status")
        storage_system = OCP(kind=constants.STORAGESYSTEM, namespace=namespace)
        storage_system_data = storage_system.get()
        storage_system_status = {}
        for condition in storage_system_data["items"][0]["status"][
                "conditions"]:
            storage_system_status[condition["type"]] = condition["status"]
        log.debug(f"storage system status: {storage_system_status}")
        assert storage_system_status == constants.STORAGE_SYSTEM_STATUS, (
            f"Storage System status is not in expected state. Expected {constants.STORAGE_SYSTEM_STATUS}"
            f" but found {storage_system_status}")

    # Verify OCS Cluster Service (ocs-storagecluster) is Ready
    storage_cluster_name = config.ENV_DATA["storage_cluster_name"]
    log.info("Verifying status of storage cluster: %s", storage_cluster_name)
    storage_cluster = StorageCluster(
        resource_name=storage_cluster_name,
        namespace=namespace,
    )
    log.info(f"Check if StorageCluster: {storage_cluster_name} is in"
             f"Succeeded phase")
    storage_cluster.wait_for_phase(phase="Ready", timeout=timeout)

    # Verify pods in running state and proper counts
    log.info("Verifying pod states and counts")
    pod = OCP(kind=constants.POD, namespace=namespace)
    if not config.DEPLOYMENT["external_mode"]:
        osd_count = int(
            storage_cluster.data["spec"]["storageDeviceSets"][0]["count"]
        ) * int(
            storage_cluster.data["spec"]["storageDeviceSets"][0]["replica"])
    rgw_count = None
    if config.ENV_DATA.get("platform") in constants.ON_PREM_PLATFORMS:
        if not disable_rgw:
            rgw_count = get_rgw_count(f"{ocs_version}",
                                      post_upgrade_verification,
                                      version_before_upgrade)

    min_eps = constants.MIN_NB_ENDPOINT_COUNT_POST_DEPLOYMENT
    max_eps = (constants.MAX_NB_ENDPOINT_COUNT
               if ocs_version >= version.VERSION_4_6 else 1)

    if config.ENV_DATA.get("platform") == constants.IBM_POWER_PLATFORM:
        min_eps = 1
        max_eps = 1

    nb_db_label = (constants.NOOBAA_DB_LABEL_46_AND_UNDER
                   if ocs_version < version.VERSION_4_7 else
                   constants.NOOBAA_DB_LABEL_47_AND_ABOVE)
    resources_dict = {
        nb_db_label: 1,
        constants.OCS_OPERATOR_LABEL: 1,
        constants.OPERATOR_LABEL: 1,
        constants.NOOBAA_OPERATOR_POD_LABEL: 1,
        constants.NOOBAA_CORE_POD_LABEL: 1,
        constants.NOOBAA_ENDPOINT_POD_LABEL: min_eps,
    }
    if not config.DEPLOYMENT["external_mode"]:
        resources_dict.update({
            constants.MON_APP_LABEL: 3,
            constants.CSI_CEPHFSPLUGIN_LABEL: number_of_worker_nodes,
            constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL: 2,
            constants.CSI_RBDPLUGIN_LABEL: number_of_worker_nodes,
            constants.CSI_RBDPLUGIN_PROVISIONER_LABEL: 2,
            constants.OSD_APP_LABEL: osd_count,
            constants.MGR_APP_LABEL: 1,
            constants.MDS_APP_LABEL: 2,
            constants.RGW_APP_LABEL: rgw_count,
        })

    if ocs_version >= version.VERSION_4_9:
        resources_dict.update({
            constants.ODF_OPERATOR_CONTROL_MANAGER_LABEL: 1,
        })

    for label, count in resources_dict.items():
        if label == constants.RGW_APP_LABEL:
            if (not config.ENV_DATA.get("platform")
                    in constants.ON_PREM_PLATFORMS or disable_rgw):
                continue
        if "noobaa" in label and disable_noobaa:
            continue
        if "mds" in label and disable_cephfs:
            continue

        assert pod.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            selector=label,
            resource_count=count,
            timeout=timeout,
        )

    if not disable_noobaa:
        nb_ep_pods = get_pods_having_label(
            label=constants.NOOBAA_ENDPOINT_POD_LABEL,
            namespace=defaults.ROOK_CLUSTER_NAMESPACE,
        )
        assert len(nb_ep_pods) <= max_eps, (
            f"The number of running NooBaa endpoint pods ({len(nb_ep_pods)}) "
            f"is greater than the maximum defined in the NooBaa CR ({max_eps})"
        )

    # Verify StorageClasses (1 ceph-fs, 1 ceph-rbd)
    log.info("Verifying storage classes")
    storage_class = OCP(kind=constants.STORAGECLASS, namespace=namespace)
    storage_cluster_name = config.ENV_DATA["storage_cluster_name"]
    required_storage_classes = {
        f"{storage_cluster_name}-cephfs",
        f"{storage_cluster_name}-ceph-rbd",
    }
    if ocs_version >= version.VERSION_4_10:
        # TODO: Add rbd-thick storage class verification in external mode cluster upgraded
        # to OCS 4.8 when the bug 1978542 is fixed
        # Skip rbd-thick storage class verification in external mode upgraded cluster. This is blocked by bug 1978542
        if not (config.DEPLOYMENT["external_mode"]
                and post_upgrade_verification):
            required_storage_classes.update(
                {f"{storage_cluster_name}-ceph-rbd-thick"})
    skip_storage_classes = set()
    if disable_cephfs:
        skip_storage_classes.update({
            f"{storage_cluster_name}-cephfs",
        })
    if disable_blockpools:
        skip_storage_classes.update({
            f"{storage_cluster_name}-ceph-rbd",
        })
    required_storage_classes = required_storage_classes.difference(
        skip_storage_classes)

    if config.DEPLOYMENT["external_mode"]:
        required_storage_classes.update({
            f"{storage_cluster_name}-ceph-rgw",
            f'{config.ENV_DATA["cluster_namespace"]}.noobaa.io',
        })
    storage_classes = storage_class.get()
    storage_class_names = {
        item["metadata"]["name"]
        for item in storage_classes["items"]
    }
    # required storage class names should be observed in the cluster under test
    missing_scs = required_storage_classes.difference(storage_class_names)
    if len(missing_scs) > 0:
        log.error("few storage classess are not present: %s", missing_scs)
    assert list(missing_scs) == []

    # Verify OSDs are distributed
    if not config.DEPLOYMENT["external_mode"]:
        if not skip_osd_distribution_check:
            log.info(
                "Verifying OSDs are distributed evenly across worker nodes")
            ocp_pod_obj = OCP(kind=constants.POD, namespace=namespace)
            osds = ocp_pod_obj.get(selector=constants.OSD_APP_LABEL)["items"]
            deviceset_count = get_deviceset_count()
            node_names = [osd["spec"]["nodeName"] for osd in osds]
            for node in node_names:
                assert (
                    not node_names.count(node) > deviceset_count
                ), "OSD's are not distributed evenly across worker nodes"

    # Verify that CSI driver object contains provisioner names
    log.info("Verifying CSI driver object contains provisioner names.")
    csi_driver = OCP(kind="CSIDriver")
    csi_drivers = {
        item["metadata"]["name"]
        for item in csi_driver.get()["items"]
    }
    assert defaults.CSI_PROVISIONERS.issubset(csi_drivers)

    # Verify node and provisioner secret names in storage class
    log.info("Verifying node and provisioner secret names in storage class.")
    if config.DEPLOYMENT["external_mode"]:
        sc_rbd = storage_class.get(
            resource_name=constants.DEFAULT_EXTERNAL_MODE_STORAGECLASS_RBD)
        sc_cephfs = storage_class.get(resource_name=(
            constants.DEFAULT_EXTERNAL_MODE_STORAGECLASS_CEPHFS))
    else:
        if not disable_blockpools:
            sc_rbd = storage_class.get(
                resource_name=constants.DEFAULT_STORAGECLASS_RBD)
        if not disable_cephfs:
            sc_cephfs = storage_class.get(
                resource_name=constants.DEFAULT_STORAGECLASS_CEPHFS)
    if not disable_blockpools:
        assert (
            sc_rbd["parameters"]["csi.storage.k8s.io/node-stage-secret-name"]
            == constants.RBD_NODE_SECRET)
        assert (
            sc_rbd["parameters"]["csi.storage.k8s.io/provisioner-secret-name"]
            == constants.RBD_PROVISIONER_SECRET)
    if not disable_cephfs:
        assert (sc_cephfs["parameters"]
                ["csi.storage.k8s.io/node-stage-secret-name"] ==
                constants.CEPHFS_NODE_SECRET)
        assert (sc_cephfs["parameters"]
                ["csi.storage.k8s.io/provisioner-secret-name"] ==
                constants.CEPHFS_PROVISIONER_SECRET)
    log.info("Verified node and provisioner secret names in storage class.")

    ct_pod = get_ceph_tools_pod()

    # https://github.com/red-hat-storage/ocs-ci/issues/3820
    # Verify ceph osd tree output
    if not (config.DEPLOYMENT.get("ui_deployment")
            or config.DEPLOYMENT["external_mode"]):
        log.info(
            "Verifying ceph osd tree output and checking for device set PVC names "
            "in the output.")
        if config.DEPLOYMENT.get("local_storage"):
            deviceset_pvcs = [osd.get_node() for osd in get_osd_pods()]
            # remove duplicate hostnames
            deviceset_pvcs = list(set(deviceset_pvcs))
            if config.ENV_DATA.get("platform") == constants.BAREMETAL_PLATFORM:
                deviceset_pvcs = [
                    deviceset.replace(".", "-") for deviceset in deviceset_pvcs
                ]
        else:
            deviceset_pvcs = [pvc.name for pvc in get_deviceset_pvcs()]

        osd_tree = ct_pod.exec_ceph_cmd(ceph_cmd="ceph osd tree",
                                        format="json")
        schemas = {
            "root": constants.OSD_TREE_ROOT,
            "rack": constants.OSD_TREE_RACK,
            "host": constants.OSD_TREE_HOST,
            "osd": constants.OSD_TREE_OSD,
            "region": constants.OSD_TREE_REGION,
            "zone": constants.OSD_TREE_ZONE,
        }
        schemas["host"]["properties"]["name"] = {"enum": deviceset_pvcs}
        for item in osd_tree["nodes"]:
            validate(instance=item, schema=schemas[item["type"]])
            if item["type"] == "host":
                deviceset_pvcs.remove(item["name"])
        assert not deviceset_pvcs, (
            f"These device set PVCs are not given in ceph osd tree output "
            f"- {deviceset_pvcs}")
        log.info(
            "Verified ceph osd tree output. Device set PVC names are given in the "
            "output.")

    # TODO: Verify ceph osd tree output have osd listed as ssd
    # TODO: Verify ceph osd tree output have zone or rack based on AZ

    # Verify CSI snapshotter sidecar container is not present
    # if the OCS version is < 4.6
    if ocs_version < version.VERSION_4_6:
        log.info("Verifying CSI snapshotter is not present.")
        provisioner_pods = get_all_pods(
            namespace=defaults.ROOK_CLUSTER_NAMESPACE,
            selector=[
                constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL,
                constants.CSI_RBDPLUGIN_PROVISIONER_LABEL,
            ],
        )
        for pod_obj in provisioner_pods:
            pod_info = pod_obj.get()
            for container, image in get_images(data=pod_info).items():
                assert ("snapshot" not in container) and (
                    "snapshot" not in image
                ), (f"Snapshot container is present in {pod_obj.name} pod. "
                    f"Container {container}. Image {image}")
        deployments = ocs_csv.get()["spec"]["install"]["spec"]["deployments"]
        rook_ceph_operator_deployment = [
            deployment_val for deployment_val in deployments
            if deployment_val["name"] == "rook-ceph-operator"
        ]
        assert {
            "name": "CSI_ENABLE_SNAPSHOTTER",
            "value": "false"
        } in (rook_ceph_operator_deployment[0]["spec"]["template"]["spec"]
              ["containers"][0]["env"]
              ), "CSI_ENABLE_SNAPSHOTTER value is not set to 'false'."
        log.info("Verified: CSI snapshotter is not present.")

    # Verify pool crush rule is with "type": "zone"
    if utils.get_az_count() == 3:
        log.info("Verifying pool crush rule is with type: zone")
        crush_dump = ct_pod.exec_ceph_cmd(ceph_cmd="ceph osd crush dump",
                                          format="")
        pool_names = [
            constants.METADATA_POOL,
            constants.DEFAULT_BLOCKPOOL,
            constants.DATA_POOL,
        ]
        crush_rules = [
            rule for rule in crush_dump["rules"]
            if rule["rule_name"] in pool_names
        ]
        for crush_rule in crush_rules:
            assert [
                item for item in crush_rule["steps"]
                if item.get("type") == "zone"
            ], f"{crush_rule['rule_name']} is not with type as zone"
        log.info("Verified - pool crush rule is with type: zone")
    log.info("Validate cluster on PVC")
    validate_cluster_on_pvc()

    # Verify ceph health
    log.info("Verifying ceph health")
    health_check_tries = 20
    health_check_delay = 30
    if post_upgrade_verification:
        # In case of upgrade with FIO we have to wait longer time to see
        # health OK. See discussion in BZ:
        # https://bugzilla.redhat.com/show_bug.cgi?id=1817727
        health_check_tries = 180
    assert utils.ceph_health_check(namespace, health_check_tries,
                                   health_check_delay)
    if config.ENV_DATA.get("fips"):
        # In case that fips is enabled when deploying,
        # a verification of the installation of it will run
        # on all running state pods
        check_fips_enabled()
    if config.ENV_DATA.get("encryption_at_rest"):
        osd_encryption_verification()
        if config.DEPLOYMENT.get("kms_deployment"):
            kms = KMS.get_kms_deployment()
            kms.post_deploy_verification()

    storage_cluster_obj = get_storage_cluster()
    is_flexible_scaling = (
        storage_cluster_obj.get()["items"][0].get("spec").get(
            "flexibleScaling", False))
    if is_flexible_scaling is True:
        failure_domain = storage_cluster_obj.data["items"][0]["status"][
            "failureDomain"]
        assert failure_domain == "host", (
            f"The expected failure domain on cluster with flexible scaling is 'host',"
            f" the actaul failure domain is {failure_domain}")

    if ocs_version >= version.VERSION_4_7:
        log.info("Verifying images in storage cluster")
        verify_sc_images(storage_cluster)

    if config.ENV_DATA.get("is_multus_enabled"):
        verify_multus_network()
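
A sketch of a post-upgrade call for this variant (the version value is
illustrative):

    ocs_install_verification(
        timeout=600,
        post_upgrade_verification=True,
        version_before_upgrade=4.8,
    )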
Example #5
    def select_nodes_for_app_pods(self, scenario, ceph_cluster, ocs_nodes,
                                  non_ocs_nodes, num_of_nodes):
        """
        Select nodes for running app pods
        Colocated scenario: Select 1 OCS node where osd and/or mon is running,
            select other nodes where mon/osd are not running
        Dedicated scenario: Select non-OCS nodes

        Args:
            scenario (str): Scenario of app pods running on OCS or dedicated nodes
                (e.g., 'colocated', 'dedicated')
            ceph_cluster (obj): CephCluster object
            ocs_nodes (list): list of OCS node names
            non_ocs_nodes (list): list of non-OCS node names
            num_of_nodes (int): number of nodes to be selected

        Returns:
            list: list of selected node names for running app pods

        """
        selected_nodes = []
        if scenario == "colocated":
            logger.info(f"Selecting {num_of_nodes} OCS node from {ocs_nodes}")
            if len(ocs_nodes) == 3:
                selected_nodes.append(random.choice(ocs_nodes))
            else:
                az_count = get_az_count()
                logger.info(f"AZ count: {az_count}")
                if az_count == 1:
                    label_to_search = "topology.rook.io/rack"
                else:
                    label_to_search = "failure-domain.beta.kubernetes.io/zone"

                mon_pod_nodes = [
                    pod.get_pod_node(pod_obj).name
                    for pod_obj in ceph_cluster.mons
                ]
                logger.info(f"Mon pods are running on {mon_pod_nodes}")
                osd_pod_nodes = [
                    pod.get_pod_node(pod_obj).name
                    for pod_obj in ceph_cluster.osds
                ]
                logger.info(f"OSD pods are running on {osd_pod_nodes}")

                # Nodes having both mon and osd pods
                ceph_pod_nodes = list(set(mon_pod_nodes) & set(osd_pod_nodes))

                fd_worker_nodes = {}
                nodes_objs = node.get_node_objs(ocs_nodes)
                for wnode in nodes_objs:
                    fd = wnode.get().get("metadata").get("labels").get(
                        label_to_search)
                    fd_node_list = fd_worker_nodes.get(fd, [])
                    fd_node_list.append(wnode.name)
                    fd_worker_nodes[fd] = fd_node_list

                fd_sorted = sorted(fd_worker_nodes,
                                   key=lambda k: len(fd_worker_nodes[k]),
                                   reverse=True)

                worker_nodes = fd_worker_nodes.get(fd_sorted[0])
                logger.info(
                    f"Selecting 1 OCS node where OSD and/or Mon are running from {worker_nodes}"
                )
                common_nodes = list(set(worker_nodes) & set(ceph_pod_nodes))
                if len(common_nodes) == 0:
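                    # No node in this failure domain runs both a mon and an
                    # osd pod, so fall back to nodes that run an osd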
                    common_nodes = list(set(worker_nodes) & set(osd_pod_nodes))
                selected_nodes.append(random.choice(common_nodes))

                logger.info(f"Selected 1 OCS node {selected_nodes}")

                if num_of_nodes > 1:
                    available_nodes = list()
                    for fd in fd_sorted:
                        worker_nodes = fd_worker_nodes.get(fd)
                        # Remove the already selected node and reserve 1 extra
                        # node so that an osd pod can later move onto it
                        if selected_nodes[0] in worker_nodes:
                            worker_nodes.remove(selected_nodes[0])
                            worker_nodes = worker_nodes[1:]
                        available_nodes += worker_nodes

                    logger.info(
                        f"Selecting {num_of_nodes - 1} OCS node from {available_nodes}"
                    )
                    preferred_nodes = list(
                        set(available_nodes) - set(osd_pod_nodes))
                    if len(preferred_nodes) < (num_of_nodes - 1):
                        preferred_nodes += list(
                            set(available_nodes) - set(preferred_nodes))

                    selected_nodes += preferred_nodes[0:num_of_nodes - 1]
                    logger.info(
                        f"Selected {num_of_nodes - 1} OCS node {selected_nodes[1:]}"
                    )

        else:
            logger.info(
                f"Selecting {num_of_nodes} non-OCS node from {non_ocs_nodes}")
            selected_nodes += non_ocs_nodes[0:num_of_nodes]

        logger.info(f"Selected nodes for running app pods: {selected_nodes}")
        return selected_nodes
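
A hypothetical call, assuming `failure_obj` is an instance of the class that
defines this method and `ceph_cluster` is an existing CephCluster object (the
node names are made up):

    app_pod_nodes = failure_obj.select_nodes_for_app_pods(
        scenario='colocated',
        ceph_cluster=ceph_cluster,
        ocs_nodes=['worker-0', 'worker-1', 'worker-2', 'worker-3'],
        non_ocs_nodes=['worker-4', 'worker-5'],
        num_of_nodes=2,
    )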
Example #6
    def deploy_ocs_via_operator(self, image=None):
        """
        Method to deploy OCS via the OCS operator

        Args:
            image (str): Image of ocs registry.

        """
        ui_deployment = config.DEPLOYMENT.get("ui_deployment")
        live_deployment = config.DEPLOYMENT.get("live_deployment")
        arbiter_deployment = config.DEPLOYMENT.get("arbiter_deployment")

        if ui_deployment:
            self.deployment_with_ui()
            # Skip the rest of the deployment when deploying via UI
            return
        else:
            logger.info("Deployment of OCS via OCS operator")
            self.label_and_taint_nodes()

        if config.DEPLOYMENT.get("local_storage"):
            setup_local_storage(storageclass=self.DEFAULT_STORAGECLASS_LSO)

        logger.info("Creating namespace and operator group.")
        run_cmd(f"oc create -f {constants.OLM_YAML}")
        if config.ENV_DATA["platform"] == constants.IBMCLOUD_PLATFORM:
            ibmcloud.add_deployment_dependencies()
            if not live_deployment:
                create_ocs_secret(self.namespace)
        if not live_deployment:
            create_catalog_source(image)
        self.subscribe_ocs()
        operator_selector = get_selector_for_ocs_operator()
        subscription_plan_approval = config.DEPLOYMENT.get(
            "subscription_plan_approval")
        package_manifest = PackageManifest(
            resource_name=defaults.OCS_OPERATOR_NAME,
            selector=operator_selector,
            subscription_plan_approval=subscription_plan_approval,
        )
        package_manifest.wait_for_resource(timeout=300)
        channel = config.DEPLOYMENT.get("ocs_csv_channel")
        csv_name = package_manifest.get_current_csv(channel=channel)
        csv = CSV(resource_name=csv_name, namespace=self.namespace)
        if (config.ENV_DATA["platform"] == constants.IBMCLOUD_PLATFORM
                and not live_deployment):
            csv.wait_for_phase("Installing", timeout=720)
            logger.info("Sleeping for 30 seconds before applying SA")
            time.sleep(30)
            link_all_sa_and_secret(constants.OCS_SECRET, self.namespace)
            logger.info("Deleting all pods in openshift-storage namespace")
            exec_cmd(f"oc delete pod --all -n {self.namespace}")
        csv.wait_for_phase("Succeeded", timeout=720)
        ocp_version = float(get_ocp_version())
        if config.ENV_DATA["platform"] == constants.IBMCLOUD_PLATFORM:
            config_map = ocp.OCP(
                kind="configmap",
                namespace=self.namespace,
                resource_name=constants.ROOK_OPERATOR_CONFIGMAP,
            )
            config_map.get(retry=10, wait=5)
            config_map_patch = (
                '\'{"data": {"ROOK_CSI_KUBELET_DIR_PATH": "/var/data/kubelet"}}\''
            )
            logger.info("Patching config map to change KUBLET DIR PATH")
            exec_cmd(
                f"oc patch configmap -n {self.namespace} "
                f"{constants.ROOK_OPERATOR_CONFIGMAP} -p {config_map_patch}")
            if config.DEPLOYMENT.get("create_ibm_cos_secret", True):
                logger.info("Creating secret for IBM Cloud Object Storage")
                with open(constants.IBM_COS_SECRET_YAML, "r") as cos_secret_fd:
                    cos_secret_data = yaml.load(cos_secret_fd,
                                                Loader=yaml.SafeLoader)
                key_id = config.AUTH["ibmcloud"]["ibm_cos_access_key_id"]
                key_secret = config.AUTH["ibmcloud"][
                    "ibm_cos_secret_access_key"]
                cos_secret_data["data"]["IBM_COS_ACCESS_KEY_ID"] = key_id
                cos_secret_data["data"][
                    "IBM_COS_SECRET_ACCESS_KEY"] = key_secret
                cos_secret_data_yaml = tempfile.NamedTemporaryFile(
                    mode="w+", prefix="cos_secret", delete=False)
                templating.dump_data_to_temp_yaml(cos_secret_data,
                                                  cos_secret_data_yaml.name)
                exec_cmd(f"oc create -f {cos_secret_data_yaml.name}")

        # Modify the CSV with custom values if required
        if all(key in config.DEPLOYMENT
               for key in ("csv_change_from", "csv_change_to")):
            modify_csv(
                csv=csv_name,
                replace_from=config.DEPLOYMENT["csv_change_from"],
                replace_to=config.DEPLOYMENT["csv_change_to"],
            )

        # create custom storage class for StorageCluster CR if necessary
        if self.CUSTOM_STORAGE_CLASS_PATH is not None:
            with open(self.CUSTOM_STORAGE_CLASS_PATH, "r") as custom_sc_fo:
                custom_sc = yaml.load(custom_sc_fo, Loader=yaml.SafeLoader)
            # set value of DEFAULT_STORAGECLASS to match the custom storage class
            self.DEFAULT_STORAGECLASS = custom_sc["metadata"]["name"]
            run_cmd(f"oc create -f {self.CUSTOM_STORAGE_CLASS_PATH}")

        # creating StorageCluster
        if config.DEPLOYMENT.get("kms_deployment"):
            kms = KMS.get_kms_deployment()
            kms.deploy()
        cluster_data = templating.load_yaml(constants.STORAGE_CLUSTER_YAML)

        # Figure out all the OCS modules enabled/disabled
        # CLI parameter --disable-components takes the precedence over
        # anything which comes from config file
        if config.ENV_DATA.get("disable_components"):
            for component in config.ENV_DATA["disable_components"]:
                config.COMPONENTS[f"disable_{component}"] = True
                logger.warning(f"disabling: {component}")

        # Update cluster_data with respective component enable/disable
        for key in config.COMPONENTS.keys():
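            # keys have the form "disable_<component>"; map the component to
            # the name used in the StorageCluster spec below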
            comp_name = constants.OCS_COMPONENTS_MAP[key.split("_")[1]]
            if config.COMPONENTS[key]:
                if "noobaa" in key:
                    merge_dict(
                        cluster_data,
                        {
                            "spec": {
                                "multiCloudGateway": {
                                    "reconcileStrategy": "ignore"
                                }
                            }
                        },
                    )
                else:
                    merge_dict(
                        cluster_data,
                        {
                            "spec": {
                                "managedResources": {
                                    f"{comp_name}": {
                                        "reconcileStrategy": "ignore"
                                    }
                                }
                            }
                        },
                    )

        if arbiter_deployment:
            cluster_data["spec"]["arbiter"] = {}
            cluster_data["spec"]["nodeTopologies"] = {}
            cluster_data["spec"]["arbiter"]["enable"] = True
            cluster_data["spec"]["nodeTopologies"][
                "arbiterLocation"] = self.get_arbiter_location()
            cluster_data["spec"]["storageDeviceSets"][0][
                "replica"] = config.DEPLOYMENT.get(
                    "ocs_operator_nodes_to_label", 4)

        cluster_data["metadata"]["name"] = config.ENV_DATA[
            "storage_cluster_name"]

        deviceset_data = cluster_data["spec"]["storageDeviceSets"][0]
        device_size = int(
            config.ENV_DATA.get("device_size", defaults.DEVICE_SIZE))

        logger.info(
            "Flexible scaling is available from version 4.7 on LSO cluster with less than 3 zones"
        )
        ocs_version = config.ENV_DATA["ocs_version"]
        zone_num = get_az_count()
        if (config.DEPLOYMENT.get("local_storage")
                and Version.coerce(ocs_version) >= Version.coerce("4.7")
                and zone_num < 3):
            cluster_data["spec"]["flexibleScaling"] = True
            # https://bugzilla.redhat.com/show_bug.cgi?id=1921023
            cluster_data["spec"]["storageDeviceSets"][0]["count"] = 3
            cluster_data["spec"]["storageDeviceSets"][0]["replica"] = 1

        # set size of request for storage
        if self.platform.lower() == constants.BAREMETAL_PLATFORM:
            pv_size_list = helpers.get_pv_size(
                storageclass=self.DEFAULT_STORAGECLASS_LSO)
            pv_size_list.sort()
            deviceset_data["dataPVCTemplate"]["spec"]["resources"]["requests"][
                "storage"] = f"{pv_size_list[0]}"
        else:
            deviceset_data["dataPVCTemplate"]["spec"]["resources"]["requests"][
                "storage"] = f"{device_size}Gi"

        # set storage class to OCS default on current platform
        if self.DEFAULT_STORAGECLASS:
            deviceset_data["dataPVCTemplate"]["spec"][
                "storageClassName"] = self.DEFAULT_STORAGECLASS

        ocs_version = float(config.ENV_DATA["ocs_version"])

        # StorageCluster tweaks for LSO
        if config.DEPLOYMENT.get("local_storage"):
            cluster_data["spec"]["manageNodes"] = False
            cluster_data["spec"]["monDataDirHostPath"] = "/var/lib/rook"
            deviceset_data["name"] = constants.DEFAULT_DEVICESET_LSO_PVC_NAME
            deviceset_data["portable"] = False
            deviceset_data["dataPVCTemplate"]["spec"][
                "storageClassName"] = self.DEFAULT_STORAGECLASS_LSO
            lso_type = config.DEPLOYMENT.get("type")
            if (self.platform.lower() == constants.AWS_PLATFORM
                    and not lso_type == constants.AWS_EBS):
                deviceset_data["count"] = 2
            if (ocp_version >= 4.6) and (ocs_version >= 4.6):
                cluster_data["metadata"]["annotations"] = {
                    "cluster.ocs.openshift.io/local-devices": "true"
                }

        # Allow lower instance requests and limits for OCS deployment
        # The resources we need to change can be found here:
        # https://github.com/openshift/ocs-operator/blob/release-4.5/pkg/deploy-manager/storagecluster.go#L88-L116
        if config.DEPLOYMENT.get("allow_lower_instance_requirements"):
            none_resources = {"Requests": None, "Limits": None}
            deviceset_data["resources"] = deepcopy(none_resources)
            resources = [
                "mon",
                "mds",
                "rgw",
                "mgr",
                "noobaa-core",
                "noobaa-db",
            ]
            if ocs_version >= 4.5:
                resources.append("noobaa-endpoint")
            cluster_data["spec"]["resources"] = {
                resource: deepcopy(none_resources)
                for resource in resources
            }
            if ocs_version >= 4.5:
                cluster_data["spec"]["resources"]["noobaa-endpoint"] = {
                    "limits": {
                        "cpu": 1,
                        "memory": "500Mi"
                    },
                    "requests": {
                        "cpu": 1,
                        "memory": "500Mi"
                    },
                }
        else:
            local_storage = config.DEPLOYMENT.get("local_storage")
            platform = config.ENV_DATA.get("platform", "").lower()
            if local_storage and platform == "aws":
                resources = {
                    "mds": {
                        "limits": {
                            "cpu": 3,
                            "memory": "8Gi"
                        },
                        "requests": {
                            "cpu": 1,
                            "memory": "8Gi"
                        },
                    }
                }
                if ocs_version < 4.5:
                    resources["noobaa-core"] = {
                        "limits": {
                            "cpu": 2,
                            "memory": "8Gi"
                        },
                        "requests": {
                            "cpu": 1,
                            "memory": "8Gi"
                        },
                    }
                    resources["noobaa-db"] = {
                        "limits": {
                            "cpu": 2,
                            "memory": "8Gi"
                        },
                        "requests": {
                            "cpu": 1,
                            "memory": "8Gi"
                        },
                    }
                cluster_data["spec"]["resources"] = resources

        # Enable host networking if requested in config (this requires all the
        # relevant rules to be enabled on the underlying platform).
        if config.DEPLOYMENT.get("host_network"):
            cluster_data["spec"]["hostNetwork"] = True

        cluster_data["spec"]["storageDeviceSets"] = [deviceset_data]

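        # On IBM Cloud, back the mons with PVCs from the default storage
        # class; manageNodes stays disabled (see the note below).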
        if self.platform == constants.IBMCLOUD_PLATFORM:
            mon_pvc_template = {
                "spec": {
                    "accessModes": ["ReadWriteOnce"],
                    "resources": {
                        "requests": {
                            "storage": "20Gi"
                        }
                    },
                    "storageClassName": self.DEFAULT_STORAGECLASS,
                    "volumeMode": "Filesystem",
                }
            }
            cluster_data["spec"]["monPVCTemplate"] = mon_pvc_template
            # TODO: Check whether manageNodes needs to be set for IBM Cloud
            cluster_data["spec"]["manageNodes"] = False

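        # Encryption at rest requires OCS >= 4.6; optionally enable KMS-based
        # encryption when kms_deployment is set.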
        if config.ENV_DATA.get("encryption_at_rest"):
            if ocs_version < 4.6:
                error_message = "Encryption at REST can be enabled only on OCS >= 4.6!"
                logger.error(error_message)
                raise UnsupportedFeatureError(error_message)
            logger.info("Enabling encryption at REST!")
            cluster_data["spec"]["encryption"] = {
                "enable": True,
            }
            if config.DEPLOYMENT.get("kms_deployment"):
                cluster_data["spec"]["encryption"]["kms"] = {
                    "enable": True,
                }

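        # For Ceph debugging, apply the debug settings and set the cephConfig
        # reconcileStrategy to "ignore" so the operator does not overwrite them.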
        if config.DEPLOYMENT.get("ceph_debug"):
            setup_ceph_debug()
            cluster_data["spec"]["managedResources"] = {
                "cephConfig": {
                    "reconcileStrategy": "ignore"
                }
            }

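        # Render the StorageCluster CR to a temporary YAML file, create it with
        # oc, and annotate the namespace when infra nodes are requested.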
        cluster_data_yaml = tempfile.NamedTemporaryFile(
            mode="w+", prefix="cluster_storage", delete=False)
        templating.dump_data_to_temp_yaml(cluster_data, cluster_data_yaml.name)
        run_cmd(f"oc create -f {cluster_data_yaml.name}", timeout=1200)
        if config.DEPLOYMENT["infra_nodes"]:
            _ocp = ocp.OCP(kind="node")
            _ocp.exec_oc_cmd(
                command=f"annotate namespace {defaults.ROOK_CLUSTER_NAMESPACE} "
                f"{constants.NODE_SELECTOR_ANNOTATION}")
Exemplo n.º 7
0
def ocs_install_verification(
    timeout=600,
    skip_osd_distribution_check=False,
    ocs_registry_image=None,
    post_upgrade_verification=False,
    version_before_upgrade=None,
):
    """
    Perform steps necessary to verify a successful OCS installation

    Args:
        timeout (int): Number of seconds for timeout which will be used in the
            checks used in this function.
        skip_osd_distribution_check (bool): If true skip the check for osd
            distribution.
        ocs_registry_image (str): Specific image to check if it was installed
            properly.
        post_upgrade_verification (bool): Set to True if this function is
            called after upgrade.
        version_before_upgrade (float): Set to OCS version before upgrade

    """
    from ocs_ci.ocs.node import get_nodes
    from ocs_ci.ocs.resources.pvc import get_deviceset_pvcs
    from ocs_ci.ocs.resources.pod import get_ceph_tools_pod, get_all_pods
    from ocs_ci.ocs.cluster import validate_cluster_on_pvc
    from ocs_ci.ocs.resources.fips import check_fips_enabled

    number_of_worker_nodes = len(get_nodes())
    namespace = config.ENV_DATA["cluster_namespace"]
    log.info("Verifying OCS installation")

    # Verify OCS CSV is in Succeeded phase
    log.info("verifying ocs csv")
    ocs_csv = get_ocs_csv()
    # Verify if OCS CSV has proper version.
    csv_version = ocs_csv.data["spec"]["version"]
    ocs_version = config.ENV_DATA["ocs_version"]
    log.info(
        f"Check if OCS version: {ocs_version} matches with CSV: {csv_version}")
    assert (
        ocs_version in csv_version
    ), f"OCS version: {ocs_version} mismatch with CSV version {csv_version}"
    # Verify if OCS CSV has the same version in provided CI build.
    ocs_registry_image = ocs_registry_image or config.DEPLOYMENT.get(
        "ocs_registry_image")
    if ocs_registry_image and ocs_registry_image.endswith(".ci"):
        ocs_registry_image = ocs_registry_image.split(":")[1]
        log.info(
            f"Check if OCS registry image: {ocs_registry_image} matches with "
            f"CSV: {csv_version}")
        ignore_csv_mismatch = config.DEPLOYMENT.get("ignore_csv_mismatch")
        if ignore_csv_mismatch:
            log.info(
                "The possible mismatch will be ignored as you deployed "
                "the different version than the default version from the CSV")
        else:
            assert ocs_registry_image in csv_version, (
                f"OCS registry image version: {ocs_registry_image} mismatch "
                f"with CSV version {csv_version}")

    # Verify OCS Cluster Service (ocs-storagecluster) is Ready
    storage_cluster_name = config.ENV_DATA["storage_cluster_name"]
    log.info("Verifying status of storage cluster: %s", storage_cluster_name)
    storage_cluster = StorageCluster(
        resource_name=storage_cluster_name,
        namespace=namespace,
    )
    log.info(f"Check if StorageCluster: {storage_cluster_name} is in"
             f"Succeeded phase")
    storage_cluster.wait_for_phase(phase="Ready", timeout=timeout)

    # Verify pods in running state and proper counts
    log.info("Verifying pod states and counts")
    pod = OCP(kind=constants.POD, namespace=namespace)
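    # Expected OSD pod count = storage device set count * replica
    # (internal mode only)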
    if not config.DEPLOYMENT["external_mode"]:
        osd_count = int(
            storage_cluster.data["spec"]["storageDeviceSets"][0]["count"]
        ) * int(
            storage_cluster.data["spec"]["storageDeviceSets"][0]["replica"])
    rgw_count = None
    if config.ENV_DATA.get("platform") in constants.ON_PREM_PLATFORMS:
        #  RGW count is 1 if OCS version < 4.5 or the cluster was upgraded from version <= 4.4
        if (float(config.ENV_DATA["ocs_version"]) < 4.5
                or (float(config.ENV_DATA["ocs_version"]) == 4.5
                    and post_upgrade_verification
                    and float(version_before_upgrade) < 4.5)):
            rgw_count = 1
        else:
            rgw_count = 2

    # With a 4.4 OCS cluster deployed on Azure, RGW is the default backingstore
    if config.ENV_DATA.get("platform") == constants.AZURE_PLATFORM:
        if (float(config.ENV_DATA["ocs_version"]) == 4.4
                or (float(config.ENV_DATA["ocs_version"]) == 4.5
                    and post_upgrade_verification
                    and float(version_before_upgrade) < 4.5)):
            rgw_count = 1

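    # NooBaa endpoint pods autoscale; the allowed range checked here depends
    # on the OCS version and platform.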
    min_eps = constants.MIN_NB_ENDPOINT_COUNT_POST_DEPLOYMENT
    max_eps = (constants.MAX_NB_ENDPOINT_COUNT
               if float(config.ENV_DATA["ocs_version"]) >= 4.6 else 1)

    if config.ENV_DATA.get("platform") == constants.IBM_POWER_PLATFORM:
        min_eps = 1
        max_eps = 1

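    # Expected number of running pods per label selector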
    resources_dict = {
        constants.OCS_OPERATOR_LABEL: 1,
        constants.OPERATOR_LABEL: 1,
        constants.NOOBAA_DB_LABEL: 1,
        constants.NOOBAA_OPERATOR_POD_LABEL: 1,
        constants.NOOBAA_CORE_POD_LABEL: 1,
        constants.NOOBAA_ENDPOINT_POD_LABEL: min_eps,
    }
    if not config.DEPLOYMENT["external_mode"]:
        resources_dict.update({
            constants.MON_APP_LABEL: 3,
            constants.CSI_CEPHFSPLUGIN_LABEL: number_of_worker_nodes,
            constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL: 2,
            constants.CSI_RBDPLUGIN_LABEL: number_of_worker_nodes,
            constants.CSI_RBDPLUGIN_PROVISIONER_LABEL: 2,
            constants.OSD_APP_LABEL: osd_count,
            constants.MGR_APP_LABEL: 1,
            constants.MDS_APP_LABEL: 2,
            constants.RGW_APP_LABEL: rgw_count,
        })

    for label, count in resources_dict.items():
        if label == constants.RGW_APP_LABEL:
            if config.ENV_DATA.get(
                    "platform") not in constants.ON_PREM_PLATFORMS:
                continue
        assert pod.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            selector=label,
            resource_count=count,
            timeout=timeout,
        )

    nb_ep_pods = get_pods_having_label(
        label=constants.NOOBAA_ENDPOINT_POD_LABEL,
        namespace=defaults.ROOK_CLUSTER_NAMESPACE,
    )
    assert len(nb_ep_pods) <= max_eps, (
        f"The number of running NooBaa endpoint pods ({len(nb_ep_pods)}) "
        f"is greater than the maximum defined in the NooBaa CR ({max_eps})")

    # Verify StorageClasses (1 ceph-fs, 1 ceph-rbd)
    log.info("Verifying storage classes")
    storage_class = OCP(kind=constants.STORAGECLASS, namespace=namespace)
    storage_cluster_name = config.ENV_DATA["storage_cluster_name"]
    required_storage_classes = {
        f"{storage_cluster_name}-cephfs",
        f"{storage_cluster_name}-ceph-rbd",
    }
    if config.DEPLOYMENT["external_mode"]:
        required_storage_classes.update({
            f"{storage_cluster_name}-ceph-rgw",
            f'{config.ENV_DATA["cluster_namespace"]}.noobaa.io',
        })
    storage_classes = storage_class.get()
    storage_class_names = {
        item["metadata"]["name"]
        for item in storage_classes["items"]
    }
    assert required_storage_classes.issubset(storage_class_names)

    # Verify OSDs are distributed
    if not config.DEPLOYMENT["external_mode"]:
        if not skip_osd_distribution_check:
            log.info(
                "Verifying OSDs are distributed evenly across worker nodes")
            ocp_pod_obj = OCP(kind=constants.POD, namespace=namespace)
            osds = ocp_pod_obj.get(selector=constants.OSD_APP_LABEL)["items"]
            deviceset_count = get_deviceset_count()
            node_names = [osd["spec"]["nodeName"] for osd in osds]
            for node in node_names:
                assert (
                    node_names.count(node) <= deviceset_count
                ), "OSDs are not distributed evenly across worker nodes"

    # Verify that CSI driver object contains provisioner names
    log.info("Verifying CSI driver object contains provisioner names.")
    csi_driver = OCP(kind="CSIDriver")
    csi_drivers = {
        item["metadata"]["name"]
        for item in csi_driver.get()["items"]
    }
    assert defaults.CSI_PROVISIONERS.issubset(csi_drivers)

    # Verify node and provisioner secret names in storage class
    log.info("Verifying node and provisioner secret names in storage class.")
    if config.DEPLOYMENT["external_mode"]:
        sc_rbd = storage_class.get(
            resource_name=constants.DEFAULT_EXTERNAL_MODE_STORAGECLASS_RBD)
        sc_cephfs = storage_class.get(resource_name=(
            constants.DEFAULT_EXTERNAL_MODE_STORAGECLASS_CEPHFS))
    else:
        sc_rbd = storage_class.get(
            resource_name=constants.DEFAULT_STORAGECLASS_RBD)
        sc_cephfs = storage_class.get(
            resource_name=constants.DEFAULT_STORAGECLASS_CEPHFS)
    assert (sc_rbd["parameters"]["csi.storage.k8s.io/node-stage-secret-name"]
            == constants.RBD_NODE_SECRET)
    assert (sc_rbd["parameters"]["csi.storage.k8s.io/provisioner-secret-name"]
            == constants.RBD_PROVISIONER_SECRET)
    assert (
        sc_cephfs["parameters"]["csi.storage.k8s.io/node-stage-secret-name"] ==
        constants.CEPHFS_NODE_SECRET)
    assert (
        sc_cephfs["parameters"]["csi.storage.k8s.io/provisioner-secret-name"]
        == constants.CEPHFS_PROVISIONER_SECRET)
    log.info("Verified node and provisioner secret names in storage class.")

    # Verify ceph osd tree output
    if not config.DEPLOYMENT["external_mode"]:
        log.info(
            "Verifying ceph osd tree output and checking for device set PVC names "
            "in the output.")

        if config.DEPLOYMENT.get("local_storage"):
            deviceset_pvcs = get_compute_node_names()
        else:
            deviceset_pvcs = [pvc.name for pvc in get_deviceset_pvcs()]

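        # Validate every node of the `ceph osd tree` JSON output against the
        # schema for its type; each host entry must map to a device set PVC
        # (or to a compute node name on LSO deployments).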
        ct_pod = get_ceph_tools_pod()
        osd_tree = ct_pod.exec_ceph_cmd(ceph_cmd="ceph osd tree",
                                        format="json")
        schemas = {
            "root": constants.OSD_TREE_ROOT,
            "rack": constants.OSD_TREE_RACK,
            "host": constants.OSD_TREE_HOST,
            "osd": constants.OSD_TREE_OSD,
            "region": constants.OSD_TREE_REGION,
            "zone": constants.OSD_TREE_ZONE,
        }
        schemas["host"]["properties"]["name"] = {"enum": deviceset_pvcs}
        for item in osd_tree["nodes"]:
            validate(instance=item, schema=schemas[item["type"]])
            if item["type"] == "host":
                deviceset_pvcs.remove(item["name"])
        assert not deviceset_pvcs, (
            f"These device set PVCs are not present in the ceph osd tree "
            f"output - {deviceset_pvcs}")
        log.info(
            "Verified ceph osd tree output. Device set PVC names are present "
            "in the output.")

    # TODO: Verify ceph osd tree output have osd listed as ssd
    # TODO: Verify ceph osd tree output have zone or rack based on AZ

    # Verify CSI snapshotter sidecar container is not present
    # if the OCS version is < 4.6
    if float(config.ENV_DATA["ocs_version"]) < 4.6:
        log.info("Verifying CSI snapshotter is not present.")
        provisioner_pods = get_all_pods(
            namespace=defaults.ROOK_CLUSTER_NAMESPACE,
            selector=[
                constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL,
                constants.CSI_RBDPLUGIN_PROVISIONER_LABEL,
            ],
        )
        for pod_obj in provisioner_pods:
            pod_info = pod_obj.get()
            for container, image in get_images(data=pod_info).items():
                assert ("snapshot" not in container) and (
                    "snapshot" not in image
                ), (f"Snapshot container is present in {pod_obj.name} pod. "
                    f"Container {container}. Image {image}")
        deployments = ocs_csv.get()["spec"]["install"]["spec"]["deployments"]
        rook_ceph_operator_deployment = [
            deployment_val for deployment_val in deployments
            if deployment_val["name"] == "rook-ceph-operator"
        ]
        assert {
            "name": "CSI_ENABLE_SNAPSHOTTER",
            "value": "false"
        } in (rook_ceph_operator_deployment[0]["spec"]["template"]["spec"]
              ["containers"][0]["env"]
              ), "CSI_ENABLE_SNAPSHOTTER value is not set to 'false'."
        log.info("Verified: CSI snapshotter is not present.")

    # Verify pool crush rule is with "type": "zone"
    if utils.get_az_count() == 3:
        log.info("Verifying pool crush rule is with type: zone")
        crush_dump = ct_pod.exec_ceph_cmd(ceph_cmd="ceph osd crush dump",
                                          format="")
        pool_names = [
            constants.METADATA_POOL,
            constants.DEFAULT_BLOCKPOOL,
            constants.DATA_POOL,
        ]
        crush_rules = [
            rule for rule in crush_dump["rules"]
            if rule["rule_name"] in pool_names
        ]
        for crush_rule in crush_rules:
            assert [
                item for item in crush_rule["steps"]
                if item.get("type") == "zone"
            ], f"{crush_rule['rule_name']} is not with type as zone"
        log.info("Verified - pool crush rule is with type: zone")
    log.info("Validate cluster on PVC")
    validate_cluster_on_pvc()

    # Verify ceph health
    log.info("Verifying ceph health")
    health_check_tries = 20
    health_check_delay = 30
    if post_upgrade_verification:
        # In case of upgrade with FIO we have to wait longer time to see
        # health OK. See discussion in BZ:
        # https://bugzilla.redhat.com/show_bug.cgi?id=1817727
        health_check_tries = 180
    assert utils.ceph_health_check(namespace, health_check_tries,
                                   health_check_delay)
    if config.ENV_DATA.get("fips"):
        # If FIPS was enabled at deployment time, verify that it is
        # enabled on all pods in Running state
        check_fips_enabled()
    if config.ENV_DATA.get("encryption_at_rest"):
        osd_encryption_verification()
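
For reference, a post-upgrade call to this helper could look roughly like the snippet below; the timeout, image tag, and version values are placeholders, not taken from any real run.

# Hypothetical post-upgrade invocation; all values are illustrative.
ocs_install_verification(
    timeout=900,
    skip_osd_distribution_check=False,
    ocs_registry_image="quay.io/rhceph-dev/ocs-registry:4.6.0-123.ci",
    post_upgrade_verification=True,
    version_before_upgrade=4.5,
)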