def deploy_ocs(self):
    """
    Handle OCS deployment. Since the OCS deployment steps are common to all
    platforms, the deployment is implemented here in the base class.
    """
    ceph_cluster = ocp.OCP(kind='CephCluster', namespace=self.namespace)
    try:
        ceph_cluster.get().get('items')[0]
        logger.warning("OCS cluster already exists")
        return
    except (IndexError, CommandFailed):
        logger.info("Running OCS basic installation")
    self.deploy_ocs_via_operator()
    pod = ocp.OCP(kind=constants.POD, namespace=self.namespace)
    cfs = ocp.OCP(kind=constants.CEPHFILESYSTEM, namespace=self.namespace)

    # Check for Ceph pods
    assert pod.wait_for_resource(
        condition='Running', selector='app=rook-ceph-mon',
        resource_count=3, timeout=600
    )
    assert pod.wait_for_resource(
        condition='Running', selector='app=rook-ceph-mgr', timeout=600
    )
    assert pod.wait_for_resource(
        condition='Running', selector='app=rook-ceph-osd',
        resource_count=3, timeout=600
    )

    # Validate that ceph mon/osd volumes are backed by PVC
    validate_cluster_on_pvc()

    # Validate PDB creation for MON, MDS and OSD pods
    validate_pdb_creation()

    # Create the toolbox pod
    setup_ceph_toolbox()
    assert pod.wait_for_resource(
        condition=constants.STATUS_RUNNING,
        selector='app=rook-ceph-tools',
        resource_count=1,
        timeout=600
    )

    # Check for CephFilesystem creation in OCP
    cfs_data = cfs.get()
    cfs_name = cfs_data['items'][0]['metadata']['name']

    if helpers.validate_cephfilesystem(cfs_name):
        logger.info("MDS deployment is successful!")
        defaults.CEPHFILESYSTEM_NAME = cfs_name
    else:
        logger.error("MDS deployment failed! Please check logs!")

    # Change monitoring backend to OCS
    if config.ENV_DATA.get('monitoring_enabled') and config.ENV_DATA.get(
        'persistent-monitoring'
    ):
        sc = helpers.default_storage_class(
            interface_type=constants.CEPHBLOCKPOOL
        )

        # Get the list of monitoring pods
        pods_list = get_all_pods(
            namespace=defaults.OCS_MONITORING_NAMESPACE,
            selector=['prometheus', 'alertmanager']
        )

        # Create configmap cluster-monitoring-config and reconfigure
        # storage class and telemeter server (if the url is specified in a
        # config file)
        create_configmap_cluster_monitoring_pod(
            sc_name=sc.name,
            telemeter_server_url=config.ENV_DATA.get("telemeter_server_url")
        )

        # Give the monitoring pods some time to respin
        waiting_time = 45
        logger.info(f"Waiting {waiting_time} seconds...")
        time.sleep(waiting_time)

        # Validate the pods are respinned and in running state
        retry(
            (CommandFailed, ResourceWrongStatusException), tries=3, delay=15
        )(validate_pods_are_respinned_and_running_state)(pods_list)

        # Validate the PVCs are created and bound on the monitoring pods
        validate_pvc_created_and_bound_on_monitoring_pods()

        # Validate the PVCs are mounted on the monitoring pods
        retry(
            (CommandFailed, AssertionError), tries=3, delay=15
        )(validate_pvc_are_mounted_on_monitoring_pods)(pods_list)
    elif config.ENV_DATA.get('monitoring_enabled') and config.ENV_DATA.get(
        "telemeter_server_url"
    ):
        # Create configmap cluster-monitoring-config to reconfigure
        # telemeter server url when 'persistent-monitoring' is False
        create_configmap_cluster_monitoring_pod(
            telemeter_server_url=config.ENV_DATA["telemeter_server_url"]
        )

    # Change registry backend to OCS CEPHFS RWX PVC
    registry.change_registry_backend_to_ocs()

    # Verify health of ceph cluster
    # TODO: move destroy cluster logic to new CLI usage pattern?
logger.info("Done creating rook resources, waiting for HEALTH_OK") try: ceph_health_check(namespace=self.namespace, tries=30, delay=10) except CephHealthException as ex: err = str(ex) logger.warning(f"Ceph health check failed with {err}") if "clock skew detected" in err: logger.info(f"Changing NTP on compute nodes to" f" {constants.RH_NTP_CLOCK}") update_ntp_compute_nodes() assert ceph_health_check(namespace=self.namespace, tries=60, delay=10) # patch gp2/thin storage class as 'non-default' self.patch_default_sc_to_non_default() if check_nodes_specs(min_cpu=constants.MIN_NODE_CPU, min_memory=constants.MIN_NODE_MEMORY): logger.info("The cluster specs meet the minimum requirements and " "therefore, NooBaa auto scale will be enabled") min_nb_eps = config.DEPLOYMENT.get('min_noobaa_endpoints') max_nb_eps = config.DEPLOYMENT.get('max_noobaa_endpoints') change_noobaa_endpoints_count(min_nb_eps=min_nb_eps, max_nb_eps=max_nb_eps) else: logger.warning( "The cluster specs do not meet the minimum requirements" " and therefore, NooBaa auto scale will remain disabled") change_noobaa_endpoints_count(min_nb_eps=1, max_nb_eps=1)
def ocs_install_verification(
    timeout=600,
    skip_osd_distribution_check=False,
    ocs_registry_image=None,
    post_upgrade_verification=False,
):
    """
    Perform steps necessary to verify a successful OCS installation

    Args:
        timeout (int): Number of seconds for timeout which will be used in the
            checks used in this function.
        skip_osd_distribution_check (bool): If true, skip the check for OSD
            distribution.
        ocs_registry_image (str): Specific image to check if it was installed
            properly.
        post_upgrade_verification (bool): Set to True if this function is
            called after upgrade.

    """
    from ocs_ci.ocs.node import get_typed_nodes
    from ocs_ci.ocs.resources.pvc import get_deviceset_pvcs
    from ocs_ci.ocs.resources.pod import get_ceph_tools_pod, get_all_pods
    from ocs_ci.ocs.cluster import validate_cluster_on_pvc
    from ocs_ci.ocs.resources.fips import check_fips_enabled

    number_of_worker_nodes = len(get_typed_nodes())
    namespace = config.ENV_DATA['cluster_namespace']
    log.info("Verifying OCS installation")

    # Verify OCS CSV is in Succeeded phase
    log.info("Verifying OCS CSV")
    ocs_csv = get_ocs_csv()

    # Verify if OCS CSV has the proper version.
    csv_version = ocs_csv.data['spec']['version']
    ocs_version = config.ENV_DATA['ocs_version']
    log.info(
        f"Check if OCS version: {ocs_version} matches with CSV: {csv_version}"
    )
    assert ocs_version in csv_version, (
        f"OCS version: {ocs_version} mismatch with CSV version {csv_version}"
    )

    # Verify if OCS CSV has the same version as the provided CI build.
    ocs_registry_image = ocs_registry_image or config.DEPLOYMENT.get(
        'ocs_registry_image'
    )
    if ocs_registry_image and ocs_registry_image.endswith(".ci"):
        ocs_registry_image = ocs_registry_image.split(":")[1]
        log.info(
            f"Check if OCS registry image: {ocs_registry_image} matches with "
            f"CSV: {csv_version}"
        )
        ignore_csv_mismatch = config.DEPLOYMENT.get('ignore_csv_mismatch')
        if ignore_csv_mismatch:
            log.info(
                "The possible mismatch will be ignored as you deployed "
                "a different version than the default version from the CSV"
            )
        else:
            assert ocs_registry_image in csv_version, (
                f"OCS registry image version: {ocs_registry_image} mismatch "
                f"with CSV version {csv_version}"
            )

    # Verify OCS Cluster Service (ocs-storagecluster) is Ready
    storage_cluster_name = config.ENV_DATA['storage_cluster_name']
    log.info("Verifying status of storage cluster: %s", storage_cluster_name)
    storage_cluster = StorageCluster(
        resource_name=storage_cluster_name,
        namespace=namespace,
    )
    log.info(
        f"Check if StorageCluster: {storage_cluster_name} is in Ready phase"
    )
    storage_cluster.wait_for_phase(phase='Ready', timeout=timeout)

    # Verify pods in running state and proper counts
    log.info("Verifying pod states and counts")
    pod = OCP(kind=constants.POD, namespace=namespace)
    if not config.DEPLOYMENT['external_mode']:
        osd_count = int(
            storage_cluster.data['spec']['storageDeviceSets'][0]['count']
        ) * int(
            storage_cluster.data['spec']['storageDeviceSets'][0]['replica']
        )
    rgw_count = None
    if config.ENV_DATA.get('platform') in constants.ON_PREM_PLATFORMS:
        # Workaround for https://bugzilla.redhat.com/show_bug.cgi?id=1857802 -
        # RGW count is 1 post upgrade to OCS 4.5. Tracked with
        # https://github.com/red-hat-storage/ocs-ci/issues/2532
        rgw_count = 2 if float(config.ENV_DATA['ocs_version']) >= 4.5 and not (
            post_upgrade_verification
        ) else 1

    # With a 4.4 OCS cluster deployed over Azure, RGW is the default backingstore
    if float(config.ENV_DATA['ocs_version']) == 4.4 and config.ENV_DATA.get(
        'platform'
    ) == constants.AZURE_PLATFORM:
        rgw_count = 1
    if float(config.ENV_DATA['ocs_version']) == 4.5 and config.ENV_DATA.get(
        'platform'
    ) == constants.AZURE_PLATFORM and post_upgrade_verification:
        rgw_count = 1

    # Fetch the min and max NooBaa endpoints from the run config
    if check_nodes_specs(
        min_cpu=constants.MIN_NODE_CPU, min_memory=constants.MIN_NODE_MEMORY
    ):
        min_eps = config.DEPLOYMENT.get('min_noobaa_endpoints')
        max_eps = config.DEPLOYMENT.get('max_noobaa_endpoints')
    else:
        min_eps = 1
        max_eps = 1 if float(config.ENV_DATA['ocs_version']) < 4.6 else 2

    resources_dict = {
        constants.OCS_OPERATOR_LABEL: 1,
        constants.OPERATOR_LABEL: 1,
        constants.NOOBAA_DB_LABEL: 1,
        constants.NOOBAA_OPERATOR_POD_LABEL: 1,
        constants.NOOBAA_CORE_POD_LABEL: 1,
        constants.NOOBAA_ENDPOINT_POD_LABEL: min_eps
    }
    if not config.DEPLOYMENT['external_mode']:
        resources_dict.update({
            constants.MON_APP_LABEL: 3,
            constants.CSI_CEPHFSPLUGIN_LABEL: number_of_worker_nodes,
            constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL: 2,
            constants.CSI_RBDPLUGIN_LABEL: number_of_worker_nodes,
            constants.CSI_RBDPLUGIN_PROVISIONER_LABEL: 2,
            constants.OSD_APP_LABEL: osd_count,
            constants.MGR_APP_LABEL: 1,
            constants.MDS_APP_LABEL: 2,
            constants.RGW_APP_LABEL: rgw_count
        })

    for label, count in resources_dict.items():
        if label == constants.RGW_APP_LABEL:
            if config.ENV_DATA.get(
                'platform'
            ) not in constants.ON_PREM_PLATFORMS:
                continue
        assert pod.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            selector=label,
            resource_count=count,
            timeout=timeout
        )

    nb_ep_pods = get_pods_having_label(
        label=constants.NOOBAA_ENDPOINT_POD_LABEL,
        namespace=defaults.ROOK_CLUSTER_NAMESPACE
    )
    assert len(nb_ep_pods) <= max_eps, (
        f"The number of running NooBaa endpoint pods ({len(nb_ep_pods)}) "
        f"is greater than the maximum defined in the NooBaa CR ({max_eps})"
    )

    # Verify StorageClasses (1 ceph-fs, 1 ceph-rbd)
    log.info("Verifying storage classes")
    storage_class = OCP(kind=constants.STORAGECLASS, namespace=namespace)
    storage_cluster_name = config.ENV_DATA['storage_cluster_name']
    required_storage_classes = {
        f'{storage_cluster_name}-cephfs',
        f'{storage_cluster_name}-ceph-rbd'
    }
    if config.DEPLOYMENT['external_mode']:
        required_storage_classes.update({
            f'{storage_cluster_name}-ceph-rgw',
            f'{config.ENV_DATA["cluster_namespace"]}.noobaa.io'
        })
    storage_classes = storage_class.get()
    storage_class_names = {
        item['metadata']['name'] for item in storage_classes['items']
    }
    assert required_storage_classes.issubset(storage_class_names)

    # Verify OSDs are distributed
    if not config.DEPLOYMENT['external_mode']:
        if not skip_osd_distribution_check:
            log.info(
                "Verifying OSDs are distributed evenly across worker nodes"
            )
            ocp_pod_obj = OCP(kind=constants.POD, namespace=namespace)
            osds = ocp_pod_obj.get(selector=constants.OSD_APP_LABEL)['items']
            deviceset_count = get_deviceset_count()
            node_names = [osd['spec']['nodeName'] for osd in osds]
            for node in node_names:
                assert not node_names.count(node) > deviceset_count, (
                    "OSDs are not distributed evenly across worker nodes"
                )

    # Verify that CSI driver object contains provisioner names
    log.info("Verifying CSI driver object contains provisioner names.")
    csi_driver = OCP(kind="CSIDriver")
    csi_drivers = {
        item['metadata']['name'] for item in csi_driver.get()['items']
    }
    assert defaults.CSI_PROVISIONERS.issubset(csi_drivers)

    # Verify node and provisioner secret names in storage class
    log.info("Verifying node and provisioner secret names in storage class.")
    if config.DEPLOYMENT['external_mode']:
        sc_rbd = storage_class.get(
            resource_name=constants.DEFAULT_EXTERNAL_MODE_STORAGECLASS_RBD
        )
        sc_cephfs = storage_class.get(
            resource_name=constants.DEFAULT_EXTERNAL_MODE_STORAGECLASS_CEPHFS
        )
    else:
        sc_rbd = storage_class.get(
            resource_name=constants.DEFAULT_STORAGECLASS_RBD
        )
        sc_cephfs = storage_class.get(
            resource_name=constants.DEFAULT_STORAGECLASS_CEPHFS
        )
    assert sc_rbd['parameters'][
        'csi.storage.k8s.io/node-stage-secret-name'
    ] == constants.RBD_NODE_SECRET
    assert sc_rbd['parameters'][
        'csi.storage.k8s.io/provisioner-secret-name'
    ] == constants.RBD_PROVISIONER_SECRET
    assert sc_cephfs['parameters'][
        'csi.storage.k8s.io/node-stage-secret-name'
    ] == constants.CEPHFS_NODE_SECRET
    assert sc_cephfs['parameters'][
        'csi.storage.k8s.io/provisioner-secret-name'
    ] == constants.CEPHFS_PROVISIONER_SECRET
    log.info("Verified node and provisioner secret names in storage class.")

    # Verify ceph osd tree output
    if not config.DEPLOYMENT['external_mode']:
        log.info(
            "Verifying ceph osd tree output and checking for device set PVC "
            "names in the output."
        )
        if (
            config.DEPLOYMENT.get('local_storage')
            and config.ENV_DATA['platform'] != constants.BAREMETALPSI_PLATFORM
        ):
            deviceset_pvcs = get_compute_node_names()
        else:
            deviceset_pvcs = [pvc.name for pvc in get_deviceset_pvcs()]

        ct_pod = get_ceph_tools_pod()
        osd_tree = ct_pod.exec_ceph_cmd(ceph_cmd='ceph osd tree', format='json')

        schemas = {
            'root': constants.OSD_TREE_ROOT,
            'rack': constants.OSD_TREE_RACK,
            'host': constants.OSD_TREE_HOST,
            'osd': constants.OSD_TREE_OSD,
            'region': constants.OSD_TREE_REGION,
            'zone': constants.OSD_TREE_ZONE
        }
        schemas['host']['properties']['name'] = {'enum': deviceset_pvcs}
        for item in osd_tree['nodes']:
            validate(instance=item, schema=schemas[item['type']])
            if item['type'] == 'host':
                deviceset_pvcs.remove(item['name'])
        assert not deviceset_pvcs, (
            f"These device set PVCs are not given in ceph osd tree output "
            f"- {deviceset_pvcs}"
        )
        log.info(
            "Verified ceph osd tree output. Device set PVC names are given in "
            "the output."
        )

    # TODO: Verify ceph osd tree output have osd listed as ssd
    # TODO: Verify ceph osd tree output have zone or rack based on AZ

    # Verify CSI snapshotter sidecar container is not present
    # if the OCS version is < 4.6
    if float(config.ENV_DATA['ocs_version']) < 4.6:
        log.info("Verifying CSI snapshotter is not present.")
        provisioner_pods = get_all_pods(
            namespace=defaults.ROOK_CLUSTER_NAMESPACE,
            selector=[
                constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL,
                constants.CSI_RBDPLUGIN_PROVISIONER_LABEL
            ]
        )
        for pod_obj in provisioner_pods:
            pod_info = pod_obj.get()
            for container, image in get_images(data=pod_info).items():
                assert ('snapshot' not in container) and (
                    'snapshot' not in image
                ), (
                    f"Snapshot container is present in {pod_obj.name} pod. "
                    f"Container {container}. Image {image}"
                )
        deployments = ocs_csv.get()['spec']['install']['spec']['deployments']
        rook_ceph_operator_deployment = [
            deployment_val for deployment_val in deployments
            if deployment_val['name'] == 'rook-ceph-operator'
        ]
        assert {'name': 'CSI_ENABLE_SNAPSHOTTER', 'value': 'false'} in (
            rook_ceph_operator_deployment[0]['spec']['template']['spec']
            ['containers'][0]['env']
        ), "CSI_ENABLE_SNAPSHOTTER value is not set to 'false'."
log.info("Verified: CSI snapshotter is not present.") # Verify pool crush rule is with "type": "zone" if utils.get_az_count() == 3: log.info("Verifying pool crush rule is with type: zone") crush_dump = ct_pod.exec_ceph_cmd(ceph_cmd='ceph osd crush dump', format='') pool_names = [ constants.METADATA_POOL, constants.DEFAULT_BLOCKPOOL, constants.DATA_POOL ] crush_rules = [ rule for rule in crush_dump['rules'] if rule['rule_name'] in pool_names ] for crush_rule in crush_rules: assert [ item for item in crush_rule['steps'] if item.get('type') == 'zone' ], f"{crush_rule['rule_name']} is not with type as zone" log.info("Verified - pool crush rule is with type: zone") log.info("Validate cluster on PVC") validate_cluster_on_pvc() # Verify ceph health log.info("Verifying ceph health") health_check_tries = 20 health_check_delay = 30 if post_upgrade_verification: # In case of upgrade with FIO we have to wait longer time to see # health OK. See discussion in BZ: # https://bugzilla.redhat.com/show_bug.cgi?id=1817727 health_check_tries = 180 assert utils.ceph_health_check(namespace, health_check_tries, health_check_delay) if config.ENV_DATA.get('fips'): # In case that fips is enabled when deploying, # a verification of the installation of it will run # on all running state pods check_fips_enabled()