def get_osd_size():
    """
    Get osd size from Storage cluster

    The requested storage of the first storage device set is read from the
    first StorageCluster item. When that value is a bare number, or when the
    deployment uses local storage, the capacity reported by the first device
    set PVC is used instead.

    Returns:
        int: osd size

    """
    sc = get_storage_cluster()
    size = (
        sc.get()
        .get("items")[0]
        .get("spec")
        .get("storageDeviceSets")[0]
        .get("dataPVCTemplate")
        .get("spec")
        .get("resources")
        .get("requests")
        .get("storage")
    )
    # NOTE: isdigit must be *called*; the bare bound method is always truthy,
    # which made the else branch below unreachable.
    if size.isdigit() or config.DEPLOYMENT.get("local_storage"):
        # In the case of UI deployment of LSO cluster, the value in StorageCluster CR
        # is set to 1, so we can not take OSD size from there. For LSO we will return
        # the size from PVC.
        pvc = get_deviceset_pvcs()[0]
        # The last two characters are dropped before the int conversion;
        # presumably a two-letter unit suffix such as "Gi" - TODO confirm.
        return int(pvc.get()["status"]["capacity"]["storage"][:-2])
    else:
        # Same two-character suffix stripping as above.
        return int(size[:-2])
def test_delete_local_volume_sym_link(self):
    """
    Remove the local-volume sym link of an OSD PV on an LSO cluster and
    verify that the crashcollector pods, the storage pods and ceph health
    come back to a good state.
    """
    # Build pod objects for every rook-ceph-crashcollector pod
    collector_names = get_pod_name_by_pattern(
        pattern="rook-ceph-crashcollector", namespace=ROOK_CLUSTER_NAMESPACE
    )
    collector_pods = [
        get_pod_obj(name=pod_name, namespace=ROOK_CLUSTER_NAMESPACE)
        for pod_name in collector_names
    ]

    # The node hosting the first crashcollector pod is the one we act on
    target_node = get_pod_node(pod_obj=collector_pods[0])

    # Resolve the local path of the PV bound to the first device set PVC
    deviceset_pvcs = get_deviceset_pvcs()
    volume_name = deviceset_pvcs[0].data["spec"]["volumeName"]
    pv_client = ocp.OCP(namespace=ROOK_CLUSTER_NAMESPACE, kind=constants.PV)
    pv_data = pv_client.get(resource_name=volume_name)
    link_path = pv_data["spec"]["local"]["path"]

    log.info("Delete sym link")
    debug_client = ocp.OCP(namespace=ROOK_CLUSTER_NAMESPACE)
    remove_cmd = f"rm -rfv {link_path}"
    debug_client.exec_oc_debug_cmd(node=target_node.name, cmd_list=[remove_cmd])

    log.info(
        "Waiting for rook-ceph-crashcollector pods to be reach Running state"
    )
    for collector_pod in collector_pods:
        wait_for_resource_state(
            resource=collector_pod, state=constants.STATUS_RUNNING
        )

    # Check all OCS pods status, they should be in Running or Completed state
    wait_for_storage_pods()

    # Check ceph status
    ceph_health_check(namespace=config.ENV_DATA["cluster_namespace"])
def test_recovery_from_volume_deletion(self, nodes, pvc_factory, pod_factory):
    """
    Test cluster recovery from disk deletion from the platform side.
    Based on documented procedure detailed in
    https://bugzilla.redhat.com/show_bug.cgi?id=1823183

    Flow: pick a random OSD PV, detach its backing volume, scale the OSD
    deployment down, run the ocs-osd-removal job, clean up (or verify the
    clean up of) the related resources, then wait for a replacement OSD PVC
    and OSD pod and validate the cluster.

    Args:
        nodes: Platform nodes helper; ``get_data_volumes`` and
            ``detach_volume`` are called on it.
        pvc_factory: Passed through to ``sanity_helpers.create_resources``.
        pod_factory: Passed through to ``sanity_helpers.create_resources``.

    Raises:
        AssertionError: If the removal log line is missing or the new OSD
            PVC / pod do not reach the expected state in time.

    """
    logger.info("Picking a PV which to be deleted from the platform side")
    osd_pvs = get_deviceset_pvs()
    osd_pv = random.choice(osd_pvs)
    osd_pv_name = osd_pv.name

    # get the claim name
    logger.info(f"Getting the claim name for OSD PV {osd_pv_name}")
    claim_name = osd_pv.get().get("spec").get("claimRef").get("name")

    # Get the backing volume name
    logger.info(f"Getting the backing volume name for PV {osd_pv_name}")
    backing_volume = nodes.get_data_volumes(pvs=[osd_pv])[0]

    # Get the corresponding PVC
    logger.info(f"Getting the corresponding PVC of PV {osd_pv_name}")
    osd_pvcs = get_deviceset_pvcs()
    osd_pvcs_count = len(osd_pvcs)
    osd_pvc = [
        ds
        for ds in osd_pvcs
        if ds.get().get("metadata").get("name") == claim_name
    ][0]

    # Get the corresponding OSD pod and ID
    logger.info(f"Getting the OSD pod using PVC {osd_pvc.name}")
    osd_pods = get_osd_pods()
    osd_pods_count = len(osd_pods)
    osd_pod = [
        osd_pod
        for osd_pod in osd_pods
        if osd_pod.get().get("metadata").get("labels").get(
            constants.CEPH_ROOK_IO_PVC_LABEL
        ) == claim_name
    ][0]
    logger.info(f"OSD_POD {osd_pod.name}")
    osd_id = osd_pod.get().get("metadata").get("labels").get("ceph-osd-id")

    # Get the node that has the OSD pod running on
    logger.info(
        f"Getting the node that has the OSD pod {osd_pod.name} running on"
    )
    osd_node = get_pod_node(osd_pod)

    osd_prepare_pods = get_osd_prepare_pods()
    osd_prepare_pod = [
        pod
        for pod in osd_prepare_pods
        if pod.get().get("metadata").get("labels").get(
            constants.CEPH_ROOK_IO_PVC_LABEL
        ) == claim_name
    ][0]
    osd_prepare_job_name = (
        osd_prepare_pod.get().get("metadata").get("labels").get("job-name")
    )
    osd_prepare_job = get_job_obj(osd_prepare_job_name)

    # Get the corresponding OSD deployment
    logger.info(f"Getting the OSD deployment for OSD PVC {claim_name}")
    osd_deployment = [
        osd_pod
        for osd_pod in get_osd_deployments()
        if osd_pod.get().get("metadata").get("labels").get(
            constants.CEPH_ROOK_IO_PVC_LABEL
        ) == claim_name
    ][0]
    osd_deployment_name = osd_deployment.name

    # Delete the volume from the platform side
    logger.info(f"Deleting {backing_volume} from the platform side")
    nodes.detach_volume(backing_volume, osd_node)

    # Scale down OSD deployment
    logger.info(f"Scaling down OSD deployment {osd_deployment_name} to 0")
    ocp.OCP().exec_oc_cmd(
        f"scale --replicas=0 deployment/{osd_deployment_name}"
    )

    # Force delete OSD pod if necessary
    osd_pod_name = osd_pod.name
    logger.info(f"Waiting for OSD pod {osd_pod.name} to get deleted")
    try:
        osd_pod.ocp.wait_for_delete(resource_name=osd_pod_name)
    except TimeoutError:
        osd_pod.delete(force=True)
        osd_pod.ocp.wait_for_delete(resource_name=osd_pod_name)

    # Run ocs-osd-removal job
    # Compare (major, minor) as integers: a float conversion would turn
    # "4.10" into 4.1 and wrongly treat it as older than 4.6.
    ocp_version = tuple(
        int(part) for part in str(get_ocp_version()).split(".")[:2]
    )
    ocp_4_6_or_newer = ocp_version >= (4, 6)
    if ocp_4_6_or_newer:
        cmd = f"process ocs-osd-removal -p FAILED_OSD_IDS={osd_id} -o yaml"
    else:
        cmd = f"process ocs-osd-removal -p FAILED_OSD_ID={osd_id} -o yaml"

    logger.info(f"Executing OSD removal job on OSD-{osd_id}")
    ocp_obj = ocp.OCP(namespace=config.ENV_DATA["cluster_namespace"])
    osd_removal_job_yaml = ocp_obj.exec_oc_cmd(cmd)
    osd_removal_job = OCS(**osd_removal_job_yaml)
    osd_removal_job.create(do_reload=False)

    # Get ocs-osd-removal pod name
    logger.info("Getting the ocs-osd-removal pod name")
    osd_removal_pod_name = get_osd_removal_pod_name(osd_id)
    osd_removal_pod_obj = get_pod_obj(
        osd_removal_pod_name, namespace="openshift-storage"
    )
    osd_removal_pod_obj.ocp.wait_for_resource(
        condition=constants.STATUS_COMPLETED,
        resource_name=osd_removal_pod_name,
    )

    # Verify OSD removal from the ocs-osd-removal pod logs
    logger.info(
        f"Verifying removal of OSD from {osd_removal_pod_name} pod logs"
    )
    logs = get_pod_logs(osd_removal_pod_name)
    pattern = f"purged osd.{osd_id}"
    assert re.search(pattern, logs)

    osd_pvc_name = osd_pvc.name

    if not ocp_4_6_or_newer:
        # Delete the OSD prepare job
        logger.info(f"Deleting OSD prepare job {osd_prepare_job_name}")
        osd_prepare_job.delete()
        osd_prepare_job.ocp.wait_for_delete(
            resource_name=osd_prepare_job_name, timeout=120
        )

        # Delete the OSD PVC
        logger.info(f"Deleting OSD PVC {osd_pvc_name}")
        osd_pvc.delete()
        osd_pvc.ocp.wait_for_delete(resource_name=osd_pvc_name)

        # Delete the OSD deployment
        logger.info(f"Deleting OSD deployment {osd_deployment_name}")
        osd_deployment.delete()
        osd_deployment.ocp.wait_for_delete(
            resource_name=osd_deployment_name, timeout=120
        )
    else:
        # If ocp version is '4.6' and above the osd removal job should
        # delete the OSD prepare job, OSD PVC, OSD deployment
        logger.info(
            f"Verifying deletion of OSD prepare job {osd_prepare_job_name}"
        )
        osd_prepare_job.ocp.wait_for_delete(
            resource_name=osd_prepare_job_name, timeout=30
        )
        logger.info(f"Verifying deletion of OSD PVC {osd_pvc_name}")
        osd_pvc.ocp.wait_for_delete(resource_name=osd_pvc_name, timeout=30)
        logger.info(
            f"Verifying deletion of OSD deployment {osd_deployment_name}"
        )
        osd_deployment.ocp.wait_for_delete(
            resource_name=osd_deployment_name, timeout=30
        )

    # Delete PV
    logger.info(f"Verifying deletion of PV {osd_pv_name}")
    try:
        osd_pv.ocp.wait_for_delete(resource_name=osd_pv_name)
    except TimeoutError:
        # The PV did not go away on its own; delete it explicitly
        osd_pv.delete()
        osd_pv.ocp.wait_for_delete(resource_name=osd_pv_name)

    if not ocp_4_6_or_newer:
        # Delete the rook ceph operator pod to trigger reconciliation
        rook_operator_pod = get_operator_pods()[0]
        logger.info(
            f"deleting Rook Ceph operator pod {rook_operator_pod.name}"
        )
        rook_operator_pod.delete()

    # Delete the OSD removal job
    logger.info(f"Deleting OSD removal job ocs-osd-removal-{osd_id}")
    osd_removal_job = get_job_obj(f"ocs-osd-removal-{osd_id}")
    osd_removal_job.delete()
    osd_removal_job.ocp.wait_for_delete(
        resource_name=f"ocs-osd-removal-{osd_id}"
    )

    timeout = 600

    # Wait for OSD PVC to get created and reach Bound state
    logger.info(
        "Waiting for a new OSD PVC to get created and reach Bound state"
    )
    assert osd_pvc.ocp.wait_for_resource(
        timeout=timeout,
        condition=constants.STATUS_BOUND,
        selector=constants.OSD_PVC_GENERIC_LABEL,
        resource_count=osd_pvcs_count,
    ), (
        f"Cluster recovery failed after {timeout} seconds. "
        f"Expected to have {osd_pvcs_count} OSD PVCs in status Bound. Current OSD PVCs status: "
        f"{[pvc.ocp.get_resource(pvc.get().get('metadata').get('name'), 'STATUS') for pvc in get_deviceset_pvcs()]}"
    )

    # Wait for OSD pod to get created and reach Running state
    logger.info(
        "Waiting for a new OSD pod to get created and reach Running state"
    )
    assert osd_pod.ocp.wait_for_resource(
        timeout=timeout,
        condition=constants.STATUS_RUNNING,
        selector=constants.OSD_APP_LABEL,
        resource_count=osd_pods_count,
    ), (
        f"Cluster recovery failed after {timeout} seconds. "
        f"Expected to have {osd_pods_count} OSD pods in status Running. Current OSD pods status: "
        f"{[osd_pod.ocp.get_resource(pod.get().get('metadata').get('name'), 'STATUS') for pod in get_osd_pods()]}"
    )

    # We need to silence the old osd crash warning due to BZ https://bugzilla.redhat.com/show_bug.cgi?id=1896810
    # This is a workaround - issue for tracking: https://github.com/red-hat-storage/ocs-ci/issues/3438
    if ocp_4_6_or_newer:
        silence_osd_crash = cluster.wait_for_silence_ceph_osd_crash_warning(
            osd_pod_name
        )
        if not silence_osd_crash:
            logger.info("Didn't find ceph osd crash warning")

    # Validate cluster is still functional
    self.sanity_helpers.health_check(tries=100)
    self.sanity_helpers.create_resources(pvc_factory, pod_factory)
def ocs_install_verification(timeout=600, skip_osd_distribution_check=False):
    """
    Perform steps necessary to verify a successful OCS installation

    Checks, in order: the OCS CSV phase, the StorageCluster phase, the
    expected pods, ceph health, storage classes, OSD distribution, CSI driver
    names, storage class secret names, ``ceph osd tree`` output, absence of
    the CSI snapshotter, and (with 3 availability zones) the pool crush rules.

    Args:
        timeout (int): Number of seconds for timeout which will be used in the
            checks used in this function.
        skip_osd_distribution_check (bool): If true skip the check for osd
            distribution.

    Raises:
        AssertionError: If one of the verification checks fails.

    """
    from ocs_ci.ocs.node import get_typed_nodes
    from ocs_ci.ocs.resources.pvc import get_deviceset_pvcs
    from ocs_ci.ocs.resources.pod import get_ceph_tools_pod, get_all_pods

    number_of_worker_nodes = len(get_typed_nodes())
    namespace = config.ENV_DATA['cluster_namespace']
    log.info("Verifying OCS installation")

    # Verify OCS CSV is in Succeeded phase
    log.info("verifying ocs csv")
    operator_selector = get_selector_for_ocs_operator()
    ocs_package_manifest = PackageManifest(
        resource_name=defaults.OCS_OPERATOR_NAME,
        selector=operator_selector,
    )
    ocs_csv_name = ocs_package_manifest.get_current_csv()
    ocs_csv = CSV(resource_name=ocs_csv_name, namespace=namespace)
    log.info(f"Check if OCS operator: {ocs_csv_name} is in Succeeded phase.")
    ocs_csv.wait_for_phase(phase="Succeeded", timeout=timeout)

    # Verify OCS Cluster Service (ocs-storagecluster) is Ready
    storage_cluster_name = config.ENV_DATA['storage_cluster_name']
    log.info("Verifying status of storage cluster: %s", storage_cluster_name)
    storage_cluster = StorageCluster(
        resource_name=storage_cluster_name,
        namespace=namespace,
    )
    # The awaited phase is 'Ready'; the message previously read
    # "is inSucceeded phase" (missing space, wrong phase name).
    log.info(
        f"Check if StorageCluster: {storage_cluster_name} is in "
        f"Ready phase"
    )
    storage_cluster.wait_for_phase(phase='Ready', timeout=timeout)

    # Verify pods in running state and proper counts
    log.info("Verifying pod states and counts")
    pod = OCP(kind=constants.POD, namespace=namespace)
    # ocs-operator
    assert pod.wait_for_resource(
        condition=constants.STATUS_RUNNING,
        selector=constants.OCS_OPERATOR_LABEL,
        timeout=timeout
    )
    # rook-ceph-operator
    assert pod.wait_for_resource(
        condition=constants.STATUS_RUNNING,
        selector=constants.OPERATOR_LABEL,
        timeout=timeout
    )
    # noobaa
    assert pod.wait_for_resource(
        condition=constants.STATUS_RUNNING,
        selector=constants.NOOBAA_APP_LABEL,
        resource_count=2,
        timeout=timeout
    )
    # mons
    assert pod.wait_for_resource(
        condition=constants.STATUS_RUNNING,
        selector=constants.MON_APP_LABEL,
        resource_count=3,
        timeout=timeout
    )
    # csi-cephfsplugin
    assert pod.wait_for_resource(
        condition=constants.STATUS_RUNNING,
        selector=constants.CSI_CEPHFSPLUGIN_LABEL,
        resource_count=number_of_worker_nodes,
        timeout=timeout
    )
    # csi-cephfsplugin-provisioner
    assert pod.wait_for_resource(
        condition=constants.STATUS_RUNNING,
        selector=constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL,
        resource_count=2,
        timeout=timeout
    )
    # csi-rbdplugin
    assert pod.wait_for_resource(
        condition=constants.STATUS_RUNNING,
        selector=constants.CSI_RBDPLUGIN_LABEL,
        resource_count=number_of_worker_nodes,
        timeout=timeout
    )
    # csi-rbdplugin-provisioner
    assert pod.wait_for_resource(
        condition=constants.STATUS_RUNNING,
        selector=constants.CSI_RBDPLUGIN_PROVISIONER_LABEL,
        resource_count=2,
        timeout=timeout
    )
    # osds: count * replica of the first storage device set
    osd_count = (
        int(storage_cluster.data['spec']['storageDeviceSets'][0]['count'])
        * int(storage_cluster.data['spec']['storageDeviceSets'][0]['replica'])
    )
    assert pod.wait_for_resource(
        condition=constants.STATUS_RUNNING,
        selector=constants.OSD_APP_LABEL,
        resource_count=osd_count,
        timeout=timeout
    )
    # mgr
    assert pod.wait_for_resource(
        condition=constants.STATUS_RUNNING,
        selector=constants.MGR_APP_LABEL,
        timeout=timeout
    )
    # mds
    assert pod.wait_for_resource(
        condition=constants.STATUS_RUNNING,
        selector=constants.MDS_APP_LABEL,
        resource_count=2,
        timeout=timeout
    )

    # rgw check only for VmWare
    if config.ENV_DATA.get('platform') == constants.VSPHERE_PLATFORM:
        assert pod.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            selector=constants.RGW_APP_LABEL,
            resource_count=1,
            timeout=timeout
        )

    # Verify ceph health
    log.info("Verifying ceph health")
    assert utils.ceph_health_check(namespace=namespace)

    # Verify StorageClasses (1 ceph-fs, 1 ceph-rbd)
    log.info("Verifying storage classes")
    storage_class = OCP(kind=constants.STORAGECLASS, namespace=namespace)
    required_storage_classes = {
        f'{storage_cluster_name}-cephfs',
        f'{storage_cluster_name}-ceph-rbd'
    }
    storage_classes = storage_class.get()
    storage_class_names = {
        item['metadata']['name'] for item in storage_classes['items']
    }
    assert required_storage_classes.issubset(storage_class_names)

    # Verify OSD's are distributed
    if not skip_osd_distribution_check:
        log.info("Verifying OSD's are distributed evenly across worker nodes")
        ocp_pod_obj = OCP(kind=constants.POD, namespace=namespace)
        osds = ocp_pod_obj.get(selector=constants.OSD_APP_LABEL)['items']
        node_names = [osd['spec']['nodeName'] for osd in osds]
        # No node may host more than one OSD pod
        for node in node_names:
            assert not node_names.count(node) > 1, (
                "OSD's are not distributed evenly across worker nodes"
            )

    # Verify that CSI driver object contains provisioner names
    log.info("Verifying CSI driver object contains provisioner names.")
    csi_driver = OCP(kind="CSIDriver")
    assert {defaults.CEPHFS_PROVISIONER, defaults.RBD_PROVISIONER} == (
        {item['metadata']['name'] for item in csi_driver.get()['items']}
    )

    # Verify node and provisioner secret names in storage class
    log.info("Verifying node and provisioner secret names in storage class.")
    sc_rbd = storage_class.get(
        resource_name=constants.DEFAULT_STORAGECLASS_RBD
    )
    sc_cephfs = storage_class.get(
        resource_name=constants.DEFAULT_STORAGECLASS_CEPHFS
    )
    assert sc_rbd['parameters'][
        'csi.storage.k8s.io/node-stage-secret-name'
    ] == constants.RBD_NODE_SECRET
    assert sc_rbd['parameters'][
        'csi.storage.k8s.io/provisioner-secret-name'
    ] == constants.RBD_PROVISIONER_SECRET
    assert sc_cephfs['parameters'][
        'csi.storage.k8s.io/node-stage-secret-name'
    ] == constants.CEPHFS_NODE_SECRET
    assert sc_cephfs['parameters'][
        'csi.storage.k8s.io/provisioner-secret-name'
    ] == constants.CEPHFS_PROVISIONER_SECRET
    log.info("Verified node and provisioner secret names in storage class.")

    # Verify ceph osd tree output
    log.info(
        "Verifying ceph osd tree output and checking for device set PVC names "
        "in the output."
    )
    deviceset_pvcs = [pvc.name for pvc in get_deviceset_pvcs()]
    ct_pod = get_ceph_tools_pod()
    osd_tree = ct_pod.exec_ceph_cmd(ceph_cmd='ceph osd tree', format='json')
    schemas = {
        'root': constants.OSD_TREE_ROOT,
        'rack': constants.OSD_TREE_RACK,
        'host': constants.OSD_TREE_HOST,
        'osd': constants.OSD_TREE_OSD,
        'region': constants.OSD_TREE_REGION,
        'zone': constants.OSD_TREE_ZONE
    }
    # NOTE: the enum references the same list object that is shrunk in the
    # loop below, so each host name is accepted at most once.
    schemas['host']['properties']['name'] = {'enum': deviceset_pvcs}
    for item in osd_tree['nodes']:
        validate(instance=item, schema=schemas[item['type']])
        if item['type'] == 'host':
            deviceset_pvcs.remove(item['name'])
    # Anything left over was never reported as a host in the osd tree
    assert not deviceset_pvcs, (
        f"These device set PVCs are not given in ceph osd tree output "
        f"- {deviceset_pvcs}"
    )
    log.info(
        "Verified ceph osd tree output. Device set PVC names are given in the "
        "output."
    )

    # TODO: Verify ceph osd tree output have osd listed as ssd
    # TODO: Verify ceph osd tree output have zone or rack based on AZ

    # Verify CSI snapshotter sidecar container is not present
    log.info("Verifying CSI snapshotter is not present.")
    provisioner_pods = get_all_pods(
        namespace=defaults.ROOK_CLUSTER_NAMESPACE,
        selector=[
            constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL,
            constants.CSI_RBDPLUGIN_PROVISIONER_LABEL
        ]
    )
    for pod_obj in provisioner_pods:
        pod_info = pod_obj.get()
        for container, image in get_images(data=pod_info).items():
            assert ('snapshot' not in container) and (
                'snapshot' not in image
            ), (
                f"Snapshot container is present in {pod_obj.name} pod. "
                f"Container {container}. Image {image}"
            )
    # The env of the first container of the first CSV deployment is inspected
    assert {
        'name': 'CSI_ENABLE_SNAPSHOTTER',
        'value': 'false'
    } in (
        ocs_csv.get()['spec']['install']['spec']['deployments'][0]['spec']
        ['template']['spec']['containers'][0]['env']
    ), "CSI_ENABLE_SNAPSHOTTER value is not set to 'false'."
    log.info("Verified: CSI snapshotter is not present.")

    # Verify pool crush rule is with "type": "zone"
    if utils.get_az_count() == 3:
        log.info("Verifying pool crush rule is with type: zone")
        crush_dump = ct_pod.exec_ceph_cmd(
            ceph_cmd='ceph osd crush dump', format=''
        )
        pool_names = [
            constants.METADATA_POOL,
            constants.DEFAULT_BLOCKPOOL,
            constants.DATA_POOL
        ]
        crush_rules = [
            rule for rule in crush_dump['rules']
            if rule['rule_name'] in pool_names
        ]
        for crush_rule in crush_rules:
            # At least one step of the rule must be of type 'zone'
            assert [
                item for item in crush_rule['steps']
                if item.get('type') == 'zone'
            ], f"{crush_rule['rule_name']} is not with type as zone"
        log.info("Verified - pool crush rule is with type: zone")
def ocs_install_verification(
    timeout=600,
    skip_osd_distribution_check=False,
    ocs_registry_image=None,
    post_upgrade_verification=False,
):
    """
    Perform steps necessary to verify a successful OCS installation

    Checks, in order: the OCS CSV version (and CI registry image tag), the
    StorageCluster phase, the expected pods, NooBaa endpoint count, storage
    classes, OSD distribution, CSI driver names, storage class secret names,
    ``ceph osd tree`` output, absence of the CSI snapshotter, pool crush
    rules (with 3 availability zones), cluster-on-PVC validation and finally
    ceph health.

    Args:
        timeout (int): Number of seconds for timeout which will be used in the
            checks used in this function.
        skip_osd_distribution_check (bool): If true skip the check for osd
            distribution.
        ocs_registry_image (str): Specific image to check if it was installed
            properly.
        post_upgrade_verification (bool): Set to True if this function is
            called after upgrade.

    Raises:
        AssertionError: If one of the verification checks fails.

    """
    from ocs_ci.ocs.node import get_typed_nodes
    from ocs_ci.ocs.resources.pvc import get_deviceset_pvcs
    from ocs_ci.ocs.resources.pod import get_ceph_tools_pod, get_all_pods
    from ocs_ci.ocs.cluster import validate_cluster_on_pvc

    number_of_worker_nodes = len(get_typed_nodes())
    namespace = config.ENV_DATA['cluster_namespace']
    log.info("Verifying OCS installation")

    # Verify OCS CSV is in Succeeded phase
    log.info("verifying ocs csv")
    ocs_csv = get_ocs_csv()
    # Verify if OCS CSV has proper version.
    csv_version = ocs_csv.data['spec']['version']
    ocs_version = config.ENV_DATA['ocs_version']
    log.info(
        f"Check if OCS version: {ocs_version} matches with CSV: {csv_version}"
    )
    assert ocs_version in csv_version, (
        f"OCS version: {ocs_version} mismatch with CSV version {csv_version}"
    )
    # Verify if OCS CSV has the same version in provided CI build.
    ocs_registry_image = ocs_registry_image or config.DEPLOYMENT.get(
        'ocs_registry_image'
    )
    if ocs_registry_image and ocs_registry_image.endswith(".ci"):
        # Keep only the tag, i.e. the text after the LAST ':'; splitting on
        # the first ':' returns the wrong field when the registry host
        # carries a port (registry:5000/image:tag).
        ocs_registry_image = ocs_registry_image.rsplit(":", 1)[1]
        log.info(
            f"Check if OCS registry image: {ocs_registry_image} matches with "
            f"CSV: {csv_version}"
        )
        ignore_csv_mismatch = config.DEPLOYMENT.get('ignore_csv_mismatch')
        if ignore_csv_mismatch:
            log.info(
                "The possible mismatch will be ignored as you deployed "
                "the different version than the default version from the CSV"
            )
        else:
            assert ocs_registry_image in csv_version, (
                f"OCS registry image version: {ocs_registry_image} mismatch "
                f"with CSV version {csv_version}"
            )

    # Verify OCS Cluster Service (ocs-storagecluster) is Ready
    storage_cluster_name = config.ENV_DATA['storage_cluster_name']
    log.info("Verifying status of storage cluster: %s", storage_cluster_name)
    storage_cluster = StorageCluster(
        resource_name=storage_cluster_name,
        namespace=namespace,
    )
    # The awaited phase is 'Ready'; the message previously read
    # "is inSucceeded phase" (missing space, wrong phase name).
    log.info(
        f"Check if StorageCluster: {storage_cluster_name} is in "
        f"Ready phase"
    )
    storage_cluster.wait_for_phase(phase='Ready', timeout=timeout)

    # Verify pods in running state and proper counts
    log.info("Verifying pod states and counts")
    pod = OCP(
        kind=constants.POD, namespace=namespace
    )
    # Expected OSD pods: count * replica of the first storage device set
    osd_count = (
        int(storage_cluster.data['spec']['storageDeviceSets'][0]['count'])
        * int(storage_cluster.data['spec']['storageDeviceSets'][0]['replica'])
    )

    # check noobaa CR for min number of noobaa endpoint pods
    nb_obj = OCP(kind='noobaa', namespace=defaults.ROOK_CLUSTER_NAMESPACE)
    # Fetch the CR once and read both limits from the same snapshot
    nb_endpoints = nb_obj.get().get('items')[0].get('spec').get('endpoints')
    min_eps = nb_endpoints.get('minCount')
    max_eps = nb_endpoints.get('maxCount')

    resources_dict = {
        constants.OCS_OPERATOR_LABEL: 1,
        constants.OPERATOR_LABEL: 1,
        constants.NOOBAA_DB_LABEL: 1,
        constants.NOOBAA_OPERATOR_POD_LABEL: 1,
        constants.NOOBAA_CORE_POD_LABEL: 1,
        constants.MON_APP_LABEL: 3,
        constants.CSI_CEPHFSPLUGIN_LABEL: number_of_worker_nodes,
        constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL: 2,
        constants.CSI_RBDPLUGIN_LABEL: number_of_worker_nodes,
        constants.CSI_RBDPLUGIN_PROVISIONER_LABEL: 2,
        constants.OSD_APP_LABEL: osd_count,
        constants.MGR_APP_LABEL: 1,
        constants.MDS_APP_LABEL: 2,
        constants.NOOBAA_ENDPOINT_POD_LABEL: min_eps
    }
    if config.ENV_DATA.get('platform') in constants.ON_PREM_PLATFORMS:
        # Workaround for https://bugzilla.redhat.com/show_bug.cgi?id=1857802 - RGW count is 1
        # post upgrade to OCS 4.5. Tracked with
        # https://github.com/red-hat-storage/ocs-ci/issues/2532
        # Compare (major, minor) as integers: a float conversion would turn
        # "4.10" into 4.1 and wrongly treat it as older than 4.5.
        ocs_major_minor = tuple(
            int(part) for part in str(ocs_version).split(".")[:2]
        )
        rgw_count = 2 if ocs_major_minor >= (4, 5) and not (
            post_upgrade_verification
        ) else 1
        resources_dict.update({constants.RGW_APP_LABEL: rgw_count})
    for label, count in resources_dict.items():
        assert pod.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            selector=label,
            resource_count=count,
            timeout=timeout
        )

    nb_ep_pods = get_pods_having_label(
        label=constants.NOOBAA_ENDPOINT_POD_LABEL,
        namespace=defaults.ROOK_CLUSTER_NAMESPACE
    )
    assert len(nb_ep_pods) <= max_eps, (
        f"The number of running NooBaa endpoint pods ({len(nb_ep_pods)}) "
        f"is greater than the maximum defined in the NooBaa CR ({max_eps})"
    )

    # Verify StorageClasses (1 ceph-fs, 1 ceph-rbd)
    log.info("Verifying storage classes")
    storage_class = OCP(
        kind=constants.STORAGECLASS, namespace=namespace
    )
    required_storage_classes = {
        f'{storage_cluster_name}-cephfs',
        f'{storage_cluster_name}-ceph-rbd'
    }
    storage_classes = storage_class.get()
    storage_class_names = {
        item['metadata']['name'] for item in storage_classes['items']
    }
    assert required_storage_classes.issubset(storage_class_names)

    # Verify OSDs are distributed
    if not skip_osd_distribution_check:
        log.info("Verifying OSDs are distributed evenly across worker nodes")
        ocp_pod_obj = OCP(kind=constants.POD, namespace=namespace)
        osds = ocp_pod_obj.get(selector=constants.OSD_APP_LABEL)['items']
        deviceset_count = get_deviceset_count()
        node_names = [osd['spec']['nodeName'] for osd in osds]
        # No node may host more OSD pods than there are device sets
        for node in node_names:
            assert not node_names.count(node) > deviceset_count, (
                "OSD's are not distributed evenly across worker nodes"
            )

    # Verify that CSI driver object contains provisioner names
    log.info("Verifying CSI driver object contains provisioner names.")
    csi_driver = OCP(kind="CSIDriver")
    assert {defaults.CEPHFS_PROVISIONER, defaults.RBD_PROVISIONER} == (
        {item['metadata']['name'] for item in csi_driver.get()['items']}
    )

    # Verify node and provisioner secret names in storage class
    log.info("Verifying node and provisioner secret names in storage class.")
    sc_rbd = storage_class.get(
        resource_name=constants.DEFAULT_STORAGECLASS_RBD
    )
    sc_cephfs = storage_class.get(
        resource_name=constants.DEFAULT_STORAGECLASS_CEPHFS
    )
    assert sc_rbd['parameters']['csi.storage.k8s.io/node-stage-secret-name'] == constants.RBD_NODE_SECRET
    assert sc_rbd['parameters']['csi.storage.k8s.io/provisioner-secret-name'] == constants.RBD_PROVISIONER_SECRET
    assert sc_cephfs['parameters']['csi.storage.k8s.io/node-stage-secret-name'] == constants.CEPHFS_NODE_SECRET
    assert sc_cephfs['parameters']['csi.storage.k8s.io/provisioner-secret-name'] == constants.CEPHFS_PROVISIONER_SECRET
    log.info("Verified node and provisioner secret names in storage class.")

    # Verify ceph osd tree output
    log.info(
        "Verifying ceph osd tree output and checking for device set PVC names "
        "in the output."
    )

    # With local storage (except on the baremetal PSI platform) the compute
    # node names are used as the expected host names instead of PVC names.
    if (
        config.DEPLOYMENT.get('local_storage')
        and config.ENV_DATA['platform'] != constants.BAREMETALPSI_PLATFORM
    ):
        deviceset_pvcs = get_compute_node_names()
    else:
        deviceset_pvcs = [pvc.name for pvc in get_deviceset_pvcs()]

    ct_pod = get_ceph_tools_pod()
    osd_tree = ct_pod.exec_ceph_cmd(ceph_cmd='ceph osd tree', format='json')
    schemas = {
        'root': constants.OSD_TREE_ROOT,
        'rack': constants.OSD_TREE_RACK,
        'host': constants.OSD_TREE_HOST,
        'osd': constants.OSD_TREE_OSD,
        'region': constants.OSD_TREE_REGION,
        'zone': constants.OSD_TREE_ZONE
    }
    # NOTE: the enum references the same list object that is shrunk in the
    # loop below, so each host name is accepted at most once.
    schemas['host']['properties']['name'] = {'enum': deviceset_pvcs}
    for item in osd_tree['nodes']:
        validate(instance=item, schema=schemas[item['type']])
        if item['type'] == 'host':
            deviceset_pvcs.remove(item['name'])
    # Anything left over was never reported as a host in the osd tree
    assert not deviceset_pvcs, (
        f"These device set PVCs are not given in ceph osd tree output "
        f"- {deviceset_pvcs}"
    )
    log.info(
        "Verified ceph osd tree output. Device set PVC names are given in the "
        "output."
    )

    # TODO: Verify ceph osd tree output have osd listed as ssd
    # TODO: Verify ceph osd tree output have zone or rack based on AZ

    # Verify CSI snapshotter sidecar container is not present
    log.info("Verifying CSI snapshotter is not present.")
    provisioner_pods = get_all_pods(
        namespace=defaults.ROOK_CLUSTER_NAMESPACE,
        selector=[
            constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL,
            constants.CSI_RBDPLUGIN_PROVISIONER_LABEL
        ]
    )
    for pod_obj in provisioner_pods:
        pod_info = pod_obj.get()
        for container, image in get_images(data=pod_info).items():
            assert ('snapshot' not in container) and ('snapshot' not in image), (
                f"Snapshot container is present in {pod_obj.name} pod. "
                f"Container {container}. Image {image}"
            )
    # Look up the rook-ceph-operator deployment by name in the CSV
    deployments = ocs_csv.get()['spec']['install']['spec']['deployments']
    rook_ceph_operator_deployment = [
        deployment_val for deployment_val in deployments
        if deployment_val['name'] == 'rook-ceph-operator'
    ]
    assert {'name': 'CSI_ENABLE_SNAPSHOTTER', 'value': 'false'} in (
        rook_ceph_operator_deployment[0]['spec']['template']['spec']['containers'][0]['env']
    ), "CSI_ENABLE_SNAPSHOTTER value is not set to 'false'."
    log.info("Verified: CSI snapshotter is not present.")

    # Verify pool crush rule is with "type": "zone"
    if utils.get_az_count() == 3:
        log.info("Verifying pool crush rule is with type: zone")
        crush_dump = ct_pod.exec_ceph_cmd(
            ceph_cmd='ceph osd crush dump', format=''
        )
        pool_names = [
            constants.METADATA_POOL, constants.DEFAULT_BLOCKPOOL,
            constants.DATA_POOL
        ]
        crush_rules = [rule for rule in crush_dump['rules'] if rule['rule_name'] in pool_names]
        for crush_rule in crush_rules:
            # At least one step of the rule must be of type 'zone'
            assert [
                item for item in crush_rule['steps']
                if item.get('type') == 'zone'
            ], f"{crush_rule['rule_name']} is not with type as zone"
        log.info("Verified - pool crush rule is with type: zone")

    log.info("Validate cluster on PVC")
    validate_cluster_on_pvc()

    # Verify ceph health
    log.info("Verifying ceph health")
    health_check_tries = 20
    health_check_delay = 30
    if post_upgrade_verification:
        # In case of upgrade with FIO we have to wait longer time to see
        # health OK. See discussion in BZ:
        # https://bugzilla.redhat.com/show_bug.cgi?id=1817727
        health_check_tries = 180
    assert utils.ceph_health_check(
        namespace, health_check_tries, health_check_delay
    )
def test_recovery_from_volume_deletion(self, nodes, pvc_factory, pod_factory):
    """
    Verify that the cluster recovers after an OSD backing disk is removed
    from the platform side.

    The procedure follows
    https://bugzilla.redhat.com/show_bug.cgi?id=1787236#c16 :
    detach the volume, delete the OSD deployment, the OSD prepare job and
    the OSD PVC, attach a replacement volume, restart the rook operator and
    wait for a new OSD PVC and OSD pod.

    """

    def _claim_label(resource):
        # Value of the rook PVC label carried by the given resource
        return resource.get().get('metadata').get('labels').get(
            constants.CEPH_ROOK_IO_PVC_LABEL
        )

    logger.info("Picking a PV which will be deleted from the platform side")
    candidate_pvs = get_deviceset_pvs()
    osd_pv = random.choice(candidate_pvs)
    osd_pv_name = osd_pv.name

    # Name of the claim the chosen PV is bound to
    logger.info(f"Getting the claim name for OSD PV {osd_pv_name}")
    pv_spec = osd_pv.get().get('spec')
    claim_name = pv_spec.get('claimRef').get('name')

    # Platform volume backing the PV
    logger.info(f"Getting the backing volume name for PV {osd_pv_name}")
    data_volumes = nodes.get_data_volumes(pvs=[osd_pv])
    backing_volume = data_volumes[0]

    # PVC whose name equals the claim name
    logger.info(f"Getting the corresponding PVC of PV {osd_pv_name}")
    osd_pvcs = get_deviceset_pvcs()
    osd_pvcs_count = len(osd_pvcs)
    matching_pvcs = [
        candidate for candidate in osd_pvcs
        if candidate.get().get('metadata').get('name') == claim_name
    ]
    osd_pvc = matching_pvcs[0]

    # OSD pod labelled with the claim name
    logger.info(f"Getting the corresponding OSD pod of PVC {osd_pvc.name}")
    osd_pods = get_osd_pods()
    osd_pods_count = len(osd_pods)
    matching_pods = [
        candidate for candidate in osd_pods
        if _claim_label(candidate) == claim_name
    ]
    osd_pod = matching_pods[0]

    # Node the OSD pod runs on
    logger.info(f"Getting the node that has the OSD pod {osd_pod.name} running on")
    osd_node = get_pod_node(osd_pod)
    volume_size = osd_pvc.size

    # OSD prepare pod for the claim, and the job that owns it
    prepare_pods = get_osd_prepare_pods()
    matching_prepare_pods = [
        candidate for candidate in prepare_pods
        if _claim_label(candidate) == claim_name
    ]
    osd_prepare_pod = matching_prepare_pods[0]
    prepare_labels = osd_prepare_pod.get().get('metadata').get('labels')
    osd_prepare_job_name = prepare_labels.get('job-name')
    osd_prepare_job = get_job_obj(osd_prepare_job_name)

    # OSD deployment labelled with the claim name
    logger.info(f"Getting the corresponding OSD deployment for OSD PVC {claim_name}")
    matching_deployments = [
        candidate for candidate in get_osd_deployments()
        if _claim_label(candidate) == claim_name
    ]
    osd_deployment = matching_deployments[0]

    # Remove the disk on the platform side
    logger.info(f"Deleting volume {backing_volume} from the platform side")
    nodes.detach_volume(backing_volume, osd_node)

    # Remove the OSD deployment
    osd_deployment_name = osd_deployment.name
    logger.info(f"Deleting OSD deployment {osd_deployment_name}")
    osd_deployment.delete()
    osd_deployment.ocp.wait_for_delete(
        resource_name=osd_deployment_name, timeout=120
    )

    # Remove the OSD prepare job
    osd_prepare_job.delete()
    osd_prepare_job.ocp.wait_for_delete(
        resource_name=osd_prepare_job_name, timeout=120
    )

    # Remove the OSD PVC
    osd_pvc_name = osd_pvc.name
    logger.info(f"Deleting OSD PVC {osd_pvc_name}")
    osd_pvc.delete()
    osd_pvc.ocp.wait_for_delete(resource_name=osd_pvc_name)

    # Attach a replacement volume of the same size to the same node
    logger.info("Creating a replacing volume from the platform side")
    nodes.create_and_attach_volume(osd_node, volume_size)

    # Restart the rook ceph operator pod to trigger reconciliation
    operator_pods = get_operator_pods()
    rook_operator_pod = operator_pods[0]
    logger.info(f"deleting Rook Ceph operator pod {rook_operator_pod.name}")
    rook_operator_pod.delete()

    timeout = 600

    # A new OSD PVC must show up and become Bound
    logger.info("Waiting for a new OSD PVC to get created and reach Bound state")
    pvcs_bound = osd_pvc.ocp.wait_for_resource(
        timeout=timeout,
        condition=constants.STATUS_BOUND,
        selector=constants.OSD_PVC_GENERIC_LABEL,
        resource_count=osd_pvcs_count
    )
    assert pvcs_bound, (
        f"Cluster recovery failed after {timeout} seconds. "
        f"Expected to have {osd_pvcs_count} OSD PVCs in status Bound. Current OSD PVCs status: "
        f"{[pvc.ocp.get_resource(pvc.get().get('metadata').get('name'), 'STATUS') for pvc in get_deviceset_pvcs()]}"
    )

    # A new OSD pod must show up and become Running
    logger.info("Waiting for a new OSD pod to get created and reach Running state")
    pods_running = osd_pod.ocp.wait_for_resource(
        timeout=timeout,
        condition=constants.STATUS_RUNNING,
        selector=constants.OSD_APP_LABEL,
        resource_count=osd_pods_count
    )
    assert pods_running, (
        f"Cluster recovery failed after {timeout} seconds. "
        f"Expected to have {osd_pods_count} OSD pods in status Running. Current OSD pods status: "
        f"{[osd_pod.ocp.get_resource(pod.get().get('metadata').get('name'), 'STATUS') for pod in get_osd_pods()]}"
    )

    # The cluster must still be healthy and able to serve new resources
    self.sanity_helpers.health_check(tries=80)
    self.sanity_helpers.create_resources(pvc_factory, pod_factory)
def ocs_install_verification(
    timeout=600,
    skip_osd_distribution_check=False,
    ocs_registry_image=None,
    post_upgrade_verification=False,
    version_before_upgrade=None,
):
    """
    Perform steps necessary to verify a successful OCS installation

    The checks run in this order: OCS CSV version, storage system status
    (OCS >= 4.9), StorageCluster phase, pod states and counts, storage
    classes, OSD distribution, CSI drivers, storage class secret names,
    ``ceph osd tree`` output, absence of the CSI snapshotter (OCS < 4.6),
    pool crush rules (3 AZs), cluster on PVC, ceph health, FIPS,
    encryption at rest / KMS, flexible scaling failure domain, storage
    cluster images (OCS >= 4.7) and multus network.

    Args:
        timeout (int): Number of seconds for timeout which will be used in the
            checks used in this function.
        skip_osd_distribution_check (bool): If true skip the check for osd
            distribution.
        ocs_registry_image (str): Specific image to check if it was installed
            properly.
        post_upgrade_verification (bool): Set to True if this function is
            called after upgrade.
        version_before_upgrade (float): Set to OCS version before upgrade

    Raises:
        AssertionError: If one of the verification steps fails.

    """
    from ocs_ci.ocs.node import get_nodes
    from ocs_ci.ocs.resources.pvc import get_deviceset_pvcs
    from ocs_ci.ocs.resources.pod import get_ceph_tools_pod, get_all_pods
    from ocs_ci.ocs.cluster import validate_cluster_on_pvc
    from ocs_ci.ocs.resources.fips import check_fips_enabled

    number_of_worker_nodes = len(get_nodes())
    namespace = config.ENV_DATA["cluster_namespace"]
    log.info("Verifying OCS installation")
    # Propagate the list of disabled components into config.COMPONENTS
    # before the individual disable_* flags are read.
    if config.ENV_DATA.get("disable_components"):
        for component in config.ENV_DATA["disable_components"]:
            config.COMPONENTS[f"disable_{component}"] = True
    disable_noobaa = config.COMPONENTS["disable_noobaa"]
    disable_rgw = config.COMPONENTS["disable_rgw"]
    disable_blockpools = config.COMPONENTS["disable_blockpools"]
    disable_cephfs = config.COMPONENTS["disable_cephfs"]

    # Verify OCS CSV is in Succeeded phase
    log.info("verifying ocs csv")
    ocs_csv = get_ocs_csv()
    # Verify if OCS CSV has proper version.
    csv_version = ocs_csv.data["spec"]["version"]
    ocs_version = version.get_semantic_ocs_version_from_config()
    log.info(
        f"Check if OCS version: {ocs_version} matches with CSV: {csv_version}"
    )
    assert (
        f"{ocs_version}" in csv_version
    ), f"OCS version: {ocs_version} mismatch with CSV version {csv_version}"
    # Verify if OCS CSV has the same version in provided CI build.
    ocs_registry_image = ocs_registry_image or config.DEPLOYMENT.get(
        "ocs_registry_image"
    )
    if ocs_registry_image and ocs_registry_image.endswith(".ci"):
        # Keep only the tag: split on the last ":" so that a registry host
        # with a port number does not confuse the extraction.
        ocs_registry_image = ocs_registry_image.rsplit(":", 1)[1]
        log.info(
            f"Check if OCS registry image: {ocs_registry_image} matches with "
            f"CSV: {csv_version}"
        )
        ignore_csv_mismatch = config.DEPLOYMENT.get("ignore_csv_mismatch")
        if ignore_csv_mismatch:
            log.info(
                "The possible mismatch will be ignored as you deployed "
                "the different version than the default version from the CSV"
            )
        else:
            assert ocs_registry_image in csv_version, (
                f"OCS registry image version: {ocs_registry_image} mismatch "
                f"with CSV version {csv_version}"
            )

    # Verify Storage System status
    if ocs_version >= version.VERSION_4_9:
        log.info("Verifying storage system status")
        storage_system = OCP(kind=constants.STORAGESYSTEM, namespace=namespace)
        storage_system_data = storage_system.get()
        storage_system_status = {}
        for condition in storage_system_data["items"][0]["status"][
            "conditions"
        ]:
            storage_system_status[condition["type"]] = condition["status"]
        log.debug(f"storage system status: {storage_system_status}")
        assert storage_system_status == constants.STORAGE_SYSTEM_STATUS, (
            f"Storage System status is not in expected state. Expected {constants.STORAGE_SYSTEM_STATUS}"
            f" but found {storage_system_status}"
        )

    # Verify OCS Cluster Service (ocs-storagecluster) is Ready
    storage_cluster_name = config.ENV_DATA["storage_cluster_name"]
    log.info("Verifying status of storage cluster: %s", storage_cluster_name)
    storage_cluster = StorageCluster(
        resource_name=storage_cluster_name,
        namespace=namespace,
    )
    # The message names the phase that is actually awaited below.
    log.info(
        f"Check if StorageCluster: {storage_cluster_name} is in Ready phase"
    )
    storage_cluster.wait_for_phase(phase="Ready", timeout=timeout)

    # Verify pods in running state and proper counts
    log.info("Verifying pod states and counts")
    pod = OCP(kind=constants.POD, namespace=namespace)
    external_mode = config.DEPLOYMENT["external_mode"]
    platform = config.ENV_DATA.get("platform")
    on_prem = platform in constants.ON_PREM_PLATFORMS
    if not external_mode:
        device_set = storage_cluster.data["spec"]["storageDeviceSets"][0]
        osd_count = int(device_set["count"]) * int(device_set["replica"])
    rgw_count = None
    if on_prem:
        if not disable_rgw:
            rgw_count = get_rgw_count(
                f"{ocs_version}",
                post_upgrade_verification,
                version_before_upgrade,
            )

    min_eps = constants.MIN_NB_ENDPOINT_COUNT_POST_DEPLOYMENT
    max_eps = (
        constants.MAX_NB_ENDPOINT_COUNT
        if ocs_version >= version.VERSION_4_6
        else 1
    )

    if platform == constants.IBM_POWER_PLATFORM:
        min_eps = 1
        max_eps = 1

    nb_db_label = (
        constants.NOOBAA_DB_LABEL_46_AND_UNDER
        if ocs_version < version.VERSION_4_7
        else constants.NOOBAA_DB_LABEL_47_AND_ABOVE
    )
    # Mapping of pod label selector -> expected number of Running pods.
    resources_dict = {
        nb_db_label: 1,
        constants.OCS_OPERATOR_LABEL: 1,
        constants.OPERATOR_LABEL: 1,
        constants.NOOBAA_OPERATOR_POD_LABEL: 1,
        constants.NOOBAA_CORE_POD_LABEL: 1,
        constants.NOOBAA_ENDPOINT_POD_LABEL: min_eps,
    }
    if not external_mode:
        resources_dict.update(
            {
                constants.MON_APP_LABEL: 3,
                constants.CSI_CEPHFSPLUGIN_LABEL: number_of_worker_nodes,
                constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL: 2,
                constants.CSI_RBDPLUGIN_LABEL: number_of_worker_nodes,
                constants.CSI_RBDPLUGIN_PROVISIONER_LABEL: 2,
                constants.OSD_APP_LABEL: osd_count,
                constants.MGR_APP_LABEL: 1,
                constants.MDS_APP_LABEL: 2,
                constants.RGW_APP_LABEL: rgw_count,
            }
        )

    if ocs_version >= version.VERSION_4_9:
        resources_dict.update(
            {
                constants.ODF_OPERATOR_CONTROL_MANAGER_LABEL: 1,
            }
        )

    for label, count in resources_dict.items():
        # RGW pods are only verified on on-prem platforms with RGW enabled;
        # rgw_count stays None otherwise.
        if label == constants.RGW_APP_LABEL:
            if not on_prem or disable_rgw:
                continue
        if "noobaa" in label and disable_noobaa:
            continue
        if "mds" in label and disable_cephfs:
            continue

        assert pod.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            selector=label,
            resource_count=count,
            timeout=timeout,
        )

    if not disable_noobaa:
        nb_ep_pods = get_pods_having_label(
            label=constants.NOOBAA_ENDPOINT_POD_LABEL,
            namespace=defaults.ROOK_CLUSTER_NAMESPACE,
        )
        assert len(nb_ep_pods) <= max_eps, (
            f"The number of running NooBaa endpoint pods ({len(nb_ep_pods)}) "
            f"is greater than the maximum defined in the NooBaa CR ({max_eps})"
        )

    # Verify StorageClasses (1 ceph-fs, 1 ceph-rbd)
    log.info("Verifying storage classes")
    storage_class = OCP(kind=constants.STORAGECLASS, namespace=namespace)
    storage_cluster_name = config.ENV_DATA["storage_cluster_name"]
    required_storage_classes = {
        f"{storage_cluster_name}-cephfs",
        f"{storage_cluster_name}-ceph-rbd",
    }
    if ocs_version >= version.VERSION_4_10:
        # TODO: Add rbd-thick storage class verification in external mode cluster upgraded
        # to OCS 4.8 when the bug 1978542 is fixed
        # Skip rbd-thick storage class verification in external mode upgraded cluster.
        # This is blocked by bug 1978542
        if not (external_mode and post_upgrade_verification):
            required_storage_classes.update(
                {f"{storage_cluster_name}-ceph-rbd-thick"}
            )
    skip_storage_classes = set()
    if disable_cephfs:
        skip_storage_classes.update(
            {
                f"{storage_cluster_name}-cephfs",
            }
        )
    if disable_blockpools:
        skip_storage_classes.update(
            {
                f"{storage_cluster_name}-ceph-rbd",
            }
        )
    required_storage_classes = required_storage_classes.difference(
        skip_storage_classes
    )

    if external_mode:
        required_storage_classes.update(
            {
                f"{storage_cluster_name}-ceph-rgw",
                f"{namespace}.noobaa.io",
            }
        )
    storage_classes = storage_class.get()
    storage_class_names = {
        item["metadata"]["name"] for item in storage_classes["items"]
    }
    # required storage class names should be observed in the cluster under test
    missing_scs = required_storage_classes.difference(storage_class_names)
    if missing_scs:
        log.error("few storage classess are not present: %s", missing_scs)
    assert not missing_scs, (
        f"Required storage classes are missing: {sorted(missing_scs)}"
    )

    # Verify OSDs are distributed
    if not external_mode:
        if not skip_osd_distribution_check:
            log.info(
                "Verifying OSDs are distributed evenly across worker nodes"
            )
            ocp_pod_obj = OCP(kind=constants.POD, namespace=namespace)
            osds = ocp_pod_obj.get(selector=constants.OSD_APP_LABEL)["items"]
            deviceset_count = get_deviceset_count()
            node_names = [osd["spec"]["nodeName"] for osd in osds]
            # Each distinct node only needs to be checked once.
            for node in set(node_names):
                assert (
                    not node_names.count(node) > deviceset_count
                ), "OSD's are not distributed evenly across worker nodes"

    # Verify that CSI driver object contains provisioner names
    log.info("Verifying CSI driver object contains provisioner names.")
    csi_driver = OCP(kind="CSIDriver")
    csi_drivers = {
        item["metadata"]["name"] for item in csi_driver.get()["items"]
    }
    assert defaults.CSI_PROVISIONERS.issubset(csi_drivers)

    # Verify node and provisioner secret names in storage class
    log.info("Verifying node and provisioner secret names in storage class.")
    if external_mode:
        sc_rbd = storage_class.get(
            resource_name=constants.DEFAULT_EXTERNAL_MODE_STORAGECLASS_RBD
        )
        sc_cephfs = storage_class.get(
            resource_name=(constants.DEFAULT_EXTERNAL_MODE_STORAGECLASS_CEPHFS)
        )
    else:
        if not disable_blockpools:
            sc_rbd = storage_class.get(
                resource_name=constants.DEFAULT_STORAGECLASS_RBD
            )
        if not disable_cephfs:
            sc_cephfs = storage_class.get(
                resource_name=constants.DEFAULT_STORAGECLASS_CEPHFS
            )
    if not disable_blockpools:
        assert (
            sc_rbd["parameters"]["csi.storage.k8s.io/node-stage-secret-name"]
            == constants.RBD_NODE_SECRET
        )
        assert (
            sc_rbd["parameters"]["csi.storage.k8s.io/provisioner-secret-name"]
            == constants.RBD_PROVISIONER_SECRET
        )
    if not disable_cephfs:
        assert (
            sc_cephfs["parameters"]["csi.storage.k8s.io/node-stage-secret-name"]
            == constants.CEPHFS_NODE_SECRET
        )
        assert (
            sc_cephfs["parameters"][
                "csi.storage.k8s.io/provisioner-secret-name"
            ]
            == constants.CEPHFS_PROVISIONER_SECRET
        )
    log.info("Verified node and provisioner secret names in storage class.")

    ct_pod = get_ceph_tools_pod()

    # https://github.com/red-hat-storage/ocs-ci/issues/3820
    # Verify ceph osd tree output
    if not (config.DEPLOYMENT.get("ui_deployment") or external_mode):
        log.info(
            "Verifying ceph osd tree output and checking for device set PVC names "
            "in the output."
        )
        if config.DEPLOYMENT.get("local_storage"):
            deviceset_pvcs = [osd.get_node() for osd in get_osd_pods()]
            # removes duplicate hostname
            deviceset_pvcs = list(set(deviceset_pvcs))
            if platform == constants.BAREMETAL_PLATFORM:
                deviceset_pvcs = [
                    deviceset.replace(".", "-") for deviceset in deviceset_pvcs
                ]
        else:
            deviceset_pvcs = [pvc.name for pvc in get_deviceset_pvcs()]

        osd_tree = ct_pod.exec_ceph_cmd(
            ceph_cmd="ceph osd tree", format="json"
        )
        schemas = {
            "root": constants.OSD_TREE_ROOT,
            "rack": constants.OSD_TREE_RACK,
            "host": constants.OSD_TREE_HOST,
            "osd": constants.OSD_TREE_OSD,
            "region": constants.OSD_TREE_REGION,
            "zone": constants.OSD_TREE_ZONE,
        }
        # NOTE: this assigns into the dict referenced by
        # constants.OSD_TREE_HOST, and the enum is the very list that host
        # names are removed from in the loop below.
        schemas["host"]["properties"]["name"] = {"enum": deviceset_pvcs}
        for item in osd_tree["nodes"]:
            validate(instance=item, schema=schemas[item["type"]])
            if item["type"] == "host":
                deviceset_pvcs.remove(item["name"])
        # Whatever is left was expected but never reported by ceph.
        assert not deviceset_pvcs, (
            f"These device set PVCs are not given in ceph osd tree output "
            f"- {deviceset_pvcs}"
        )
        log.info(
            "Verified ceph osd tree output. Device set PVC names are given in the "
            "output."
        )

    # TODO: Verify ceph osd tree output have osd listed as ssd
    # TODO: Verify ceph osd tree output have zone or rack based on AZ

    # Verify CSI snapshotter sidecar container is not present
    # if the OCS version is < 4.6
    if ocs_version < version.VERSION_4_6:
        log.info("Verifying CSI snapshotter is not present.")
        provisioner_pods = get_all_pods(
            namespace=defaults.ROOK_CLUSTER_NAMESPACE,
            selector=[
                constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL,
                constants.CSI_RBDPLUGIN_PROVISIONER_LABEL,
            ],
        )
        for pod_obj in provisioner_pods:
            pod_info = pod_obj.get()
            for container, image in get_images(data=pod_info).items():
                assert ("snapshot" not in container) and (
                    "snapshot" not in image
                ), (
                    f"Snapshot container is present in {pod_obj.name} pod. "
                    f"Container {container}. Image {image}"
                )
        deployments = ocs_csv.get()["spec"]["install"]["spec"]["deployments"]
        rook_ceph_operator_deployment = [
            deployment_val
            for deployment_val in deployments
            if deployment_val["name"] == "rook-ceph-operator"
        ]
        assert {"name": "CSI_ENABLE_SNAPSHOTTER", "value": "false"} in (
            rook_ceph_operator_deployment[0]["spec"]["template"]["spec"][
                "containers"
            ][0]["env"]
        ), "CSI_ENABLE_SNAPSHOTTER value is not set to 'false'."
        log.info("Verified: CSI snapshotter is not present.")

    # Verify pool crush rule is with "type": "zone"
    if utils.get_az_count() == 3:
        log.info("Verifying pool crush rule is with type: zone")
        crush_dump = ct_pod.exec_ceph_cmd(
            ceph_cmd="ceph osd crush dump", format=""
        )
        pool_names = [
            constants.METADATA_POOL,
            constants.DEFAULT_BLOCKPOOL,
            constants.DATA_POOL,
        ]
        crush_rules = [
            rule
            for rule in crush_dump["rules"]
            if rule["rule_name"] in pool_names
        ]
        for crush_rule in crush_rules:
            assert [
                item
                for item in crush_rule["steps"]
                if item.get("type") == "zone"
            ], f"{crush_rule['rule_name']} is not with type as zone"
        log.info("Verified - pool crush rule is with type: zone")
    log.info("Validate cluster on PVC")
    validate_cluster_on_pvc()

    # Verify ceph health
    log.info("Verifying ceph health")
    health_check_tries = 20
    health_check_delay = 30
    if post_upgrade_verification:
        # In case of upgrade with FIO we have to wait longer time to see
        # health OK. See discussion in BZ:
        # https://bugzilla.redhat.com/show_bug.cgi?id=1817727
        health_check_tries = 180
    assert utils.ceph_health_check(
        namespace, health_check_tries, health_check_delay
    )
    if config.ENV_DATA.get("fips"):
        # In case that fips is enabled when deploying,
        # a verification of the installation of it will run
        # on all running state pods
        check_fips_enabled()
    if config.ENV_DATA.get("encryption_at_rest"):
        osd_encryption_verification()
        if config.DEPLOYMENT.get("kms_deployment"):
            kms = KMS.get_kms_deployment()
            kms.post_deploy_verification()

    storage_cluster_obj = get_storage_cluster()
    is_flexible_scaling = (
        storage_cluster_obj.get()["items"][0]
        .get("spec")
        .get("flexibleScaling", False)
    )
    if is_flexible_scaling is True:
        failure_domain = storage_cluster_obj.data["items"][0]["status"][
            "failureDomain"
        ]
        assert failure_domain == "host", (
            f"The expected failure domain on cluster with flexible scaling is 'host',"
            f" the actaul failure domain is {failure_domain}"
        )

    if ocs_version >= version.VERSION_4_7:
        log.info("Verifying images in storage cluster")
        verify_sc_images(storage_cluster)

    if config.ENV_DATA.get("is_multus_enabled"):
        verify_multus_network()
def osd_device_replacement(nodes):
    """
    Replacing randomly picked osd device

    A random device set PV is picked, its backing volume is detached from
    the node, the OSD deployment is scaled down and the ocs-osd-removal job
    is run. For OCP versions below 4.6 the OSD prepare job, OSD PVC and OSD
    deployment are deleted here and the rook ceph operator pod is restarted;
    for 4.6 and above only the status of the old PV is verified. Finally the
    function waits until the original number of OSD PVCs is Bound and the
    original number of OSD pods is Running, then runs a health check.

    Args:
        nodes: Platform nodes object. This function calls its
            ``get_data_volumes(pvs=...)`` and
            ``detach_volume(volume, node)`` methods.

    Raises:
        AssertionError: If the osd removal job is not created, completed or
            deleted, if the old PV is not in an expected status, or if the
            OSD PVCs / pods do not recover within the timeout.

    """
    logger.info("Picking a PV which to be deleted from the platform side")
    osd_pvs = get_deviceset_pvs()
    osd_pv = random.choice(osd_pvs)
    osd_pv_name = osd_pv.name
    # get the claim name
    logger.info(f"Getting the claim name for OSD PV {osd_pv_name}")
    claim_name = osd_pv.get().get("spec").get("claimRef").get("name")

    # Get the backing volume name
    logger.info(f"Getting the backing volume name for PV {osd_pv_name}")
    backing_volume = nodes.get_data_volumes(pvs=[osd_pv])[0]

    # Get the corresponding PVC
    logger.info(f"Getting the corresponding PVC of PV {osd_pv_name}")
    osd_pvcs = get_deviceset_pvcs()
    osd_pvcs_count = len(osd_pvcs)
    osd_pvc = [
        ds
        for ds in osd_pvcs
        if ds.get().get("metadata").get("name") == claim_name
    ][0]

    # Get the corresponding OSD pod and ID
    logger.info(f"Getting the OSD pod using PVC {osd_pvc.name}")
    osd_pods = get_osd_pods()
    osd_pods_count = len(osd_pods)
    osd_pod = [
        candidate
        for candidate in osd_pods
        if candidate.get()
        .get("metadata")
        .get("labels")
        .get(constants.CEPH_ROOK_IO_PVC_LABEL)
        == claim_name
    ][0]
    logger.info(f"OSD_POD {osd_pod.name}")
    osd_id = get_osd_pod_id(osd_pod)

    # Get the node that has the OSD pod running on
    logger.info(
        f"Getting the node that has the OSD pod {osd_pod.name} running on"
    )
    osd_node = get_pod_node(osd_pod)

    # The flow differs for OCP versions below 4.6; evaluate the comparison
    # once and reuse it for every version dependent step.
    ocp_version = get_ocp_version()
    is_pre_46 = Version.coerce(ocp_version) < Version.coerce("4.6")
    if is_pre_46:
        osd_prepare_pods = get_osd_prepare_pods()
        osd_prepare_pod = [
            prepare_pod
            for prepare_pod in osd_prepare_pods
            if prepare_pod.get()
            .get("metadata")
            .get("labels")
            .get(constants.CEPH_ROOK_IO_PVC_LABEL)
            == claim_name
        ][0]
        osd_prepare_job_name = (
            osd_prepare_pod.get().get("metadata").get("labels").get("job-name")
        )
        osd_prepare_job = get_job_obj(osd_prepare_job_name)

    # Get the corresponding OSD deployment
    logger.info(f"Getting the OSD deployment for OSD PVC {claim_name}")
    osd_deployment = [
        deployment
        for deployment in get_osd_deployments()
        if deployment.get()
        .get("metadata")
        .get("labels")
        .get(constants.CEPH_ROOK_IO_PVC_LABEL)
        == claim_name
    ][0]
    osd_deployment_name = osd_deployment.name

    # Delete the volume from the platform side
    logger.info(f"Deleting {backing_volume} from the platform side")
    nodes.detach_volume(backing_volume, osd_node)

    # Scale down OSD deployment
    logger.info(f"Scaling down OSD deployment {osd_deployment_name} to 0")
    ocp_obj = ocp.OCP(namespace=config.ENV_DATA["cluster_namespace"])
    ocp_obj.exec_oc_cmd(f"scale --replicas=0 deployment/{osd_deployment_name}")

    # Force delete OSD pod if necessary
    osd_pod_name = osd_pod.name
    logger.info(f"Waiting for OSD pod {osd_pod.name} to get deleted")
    try:
        osd_pod.ocp.wait_for_delete(resource_name=osd_pod_name)
    except TimeoutError:
        osd_pod.delete(force=True)
        osd_pod.ocp.wait_for_delete(resource_name=osd_pod_name)

    # Run ocs-osd-removal job
    osd_removal_job = run_osd_removal_job([osd_id])
    assert osd_removal_job, "ocs-osd-removal failed to create"
    is_completed = verify_osd_removal_job_completed_successfully(osd_id)
    assert is_completed, "ocs-osd-removal-job is not in status 'completed'"
    logger.info("ocs-osd-removal-job completed successfully")

    osd_pvc_name = osd_pvc.name

    if is_pre_46:
        # Delete the OSD prepare job
        logger.info(f"Deleting OSD prepare job {osd_prepare_job_name}")
        osd_prepare_job.delete()
        osd_prepare_job.ocp.wait_for_delete(
            resource_name=osd_prepare_job_name, timeout=120
        )

        # Delete the OSD PVC
        logger.info(f"Deleting OSD PVC {osd_pvc_name}")
        osd_pvc.delete()
        osd_pvc.ocp.wait_for_delete(resource_name=osd_pvc_name)

        # Delete the OSD deployment
        logger.info(f"Deleting OSD deployment {osd_deployment_name}")
        osd_deployment.delete()
        osd_deployment.ocp.wait_for_delete(
            resource_name=osd_deployment_name, timeout=120
        )
    else:
        # If ocp version is '4.6' and above the osd removal job should
        # delete the OSD prepare job, OSD PVC, OSD deployment
        # We just need to verify the old PV is in the expected status
        logger.info(
            f"Verify that the old PV '{osd_pv_name}' is in the expected status"
        )
        if cluster.is_lso_cluster():
            expected_old_pv_statuses = [constants.STATUS_RELEASED]
        else:
            expected_old_pv_statuses = [
                constants.STATUS_RELEASED,
                constants.STATUS_FAILED,
            ]

        # The explanation is passed as the assertion message itself; using
        # the return value of logger.warning() here would yield None.
        old_pv_status = osd_pv.ocp.get_resource_status(osd_pv_name)
        assert old_pv_status in expected_old_pv_statuses, (
            f"The old PV '{osd_pv_name}' is not in "
            f"the expected statuses: {expected_old_pv_statuses}"
        )

    # Delete PV
    logger.info(f"Verifying deletion of PV {osd_pv_name}")
    try:
        osd_pv.ocp.wait_for_delete(resource_name=osd_pv_name)
    except TimeoutError:
        osd_pv.delete()
        osd_pv.ocp.wait_for_delete(resource_name=osd_pv_name)

    # If we use LSO, we need to create and attach a new disk manually
    if cluster.is_lso_cluster():
        node.add_disk_to_node(osd_node)

    if is_pre_46:
        # Delete the rook ceph operator pod to trigger reconciliation
        rook_operator_pod = get_operator_pods()[0]
        logger.info(
            f"deleting Rook Ceph operator pod {rook_operator_pod.name}"
        )
        rook_operator_pod.delete()

    # Delete the OSD removal job
    logger.info(f"Deleting OSD removal job ocs-osd-removal-{osd_id}")
    is_deleted = delete_osd_removal_job(osd_id)
    assert is_deleted, "Failed to delete ocs-osd-removal-job"
    logger.info("ocs-osd-removal-job deleted successfully")

    timeout = 600
    # Wait for OSD PVC to get created and reach Bound state
    logger.info(
        "Waiting for a new OSD PVC to get created and reach Bound state"
    )
    assert osd_pvc.ocp.wait_for_resource(
        timeout=timeout,
        condition=constants.STATUS_BOUND,
        selector=constants.OSD_PVC_GENERIC_LABEL,
        resource_count=osd_pvcs_count,
    ), (
        f"Cluster recovery failed after {timeout} seconds. "
        f"Expected to have {osd_pvcs_count} OSD PVCs in status Bound. Current OSD PVCs status: "
        f"{[pvc.ocp.get_resource(pvc.get().get('metadata').get('name'), 'STATUS') for pvc in get_deviceset_pvcs()]}"
    )
    # Wait for OSD pod to get created and reach Running state
    logger.info(
        "Waiting for a new OSD pod to get created and reach Running state"
    )
    assert osd_pod.ocp.wait_for_resource(
        timeout=timeout,
        condition=constants.STATUS_RUNNING,
        selector=constants.OSD_APP_LABEL,
        resource_count=osd_pods_count,
    ), (
        f"Cluster recovery failed after {timeout} seconds. "
        f"Expected to have {osd_pods_count} OSD pods in status Running. Current OSD pods status: "
        f"{[osd_pod.ocp.get_resource(pod.get().get('metadata').get('name'), 'STATUS') for pod in get_osd_pods()]}"
    )

    # We need to silence the old osd crash warning due to BZ https://bugzilla.redhat.com/show_bug.cgi?id=1896810
    # This is a workaround - issue for tracking: https://github.com/red-hat-storage/ocs-ci/issues/3438
    if not is_pre_46:
        silence_osd_crash = cluster.wait_for_silence_ceph_osd_crash_warning(
            osd_pod_name
        )
        if not silence_osd_crash:
            logger.info("Didn't find ceph osd crash warning")

    sanity_helpers = Sanity()
    sanity_helpers.health_check(tries=120)
def ocs_install_verification(
    timeout=600,
    skip_osd_distribution_check=False,
    ocs_registry_image=None,
    post_upgrade_verification=False,
    version_before_upgrade=None,
):
    """
    Perform steps necessary to verify a successful OCS installation

    The checks run in this order: OCS CSV version, StorageCluster phase,
    pod states and counts, storage classes, OSD distribution, CSI drivers,
    storage class secret names, ``ceph osd tree`` output, absence of the
    CSI snapshotter (OCS < 4.6), pool crush rules (3 AZs), cluster on PVC,
    ceph health, FIPS and encryption at rest.

    Args:
        timeout (int): Number of seconds for timeout which will be used in the
            checks used in this function.
        skip_osd_distribution_check (bool): If true skip the check for osd
            distribution.
        ocs_registry_image (str): Specific image to check if it was installed
            properly.
        post_upgrade_verification (bool): Set to True if this function is
            called after upgrade.
        version_before_upgrade (float): Set to OCS version before upgrade

    Raises:
        AssertionError: If one of the verification steps fails.

    """
    from ocs_ci.ocs.node import get_nodes
    from ocs_ci.ocs.resources.pvc import get_deviceset_pvcs
    from ocs_ci.ocs.resources.pod import get_ceph_tools_pod, get_all_pods
    from ocs_ci.ocs.cluster import validate_cluster_on_pvc
    from ocs_ci.ocs.resources.fips import check_fips_enabled

    number_of_worker_nodes = len(get_nodes())
    namespace = config.ENV_DATA["cluster_namespace"]
    log.info("Verifying OCS installation")

    # Verify OCS CSV is in Succeeded phase
    log.info("verifying ocs csv")
    ocs_csv = get_ocs_csv()
    # Verify if OCS CSV has proper version.
    csv_version = ocs_csv.data["spec"]["version"]
    ocs_version = config.ENV_DATA["ocs_version"]
    log.info(
        f"Check if OCS version: {ocs_version} matches with CSV: {csv_version}"
    )
    assert (
        ocs_version in csv_version
    ), f"OCS version: {ocs_version} mismatch with CSV version {csv_version}"
    # Verify if OCS CSV has the same version in provided CI build.
    ocs_registry_image = ocs_registry_image or config.DEPLOYMENT.get(
        "ocs_registry_image"
    )
    if ocs_registry_image and ocs_registry_image.endswith(".ci"):
        # Keep only the tag: split on the LAST ":" so that an image whose
        # registry host carries a port (host:port/repo:tag) still yields
        # the tag instead of "port/repo".
        ocs_registry_image = ocs_registry_image.rsplit(":", 1)[1]
        log.info(
            f"Check if OCS registry image: {ocs_registry_image} matches with "
            f"CSV: {csv_version}"
        )
        ignore_csv_mismatch = config.DEPLOYMENT.get("ignore_csv_mismatch")
        if ignore_csv_mismatch:
            log.info(
                "The possible mismatch will be ignored as you deployed "
                "the different version than the default version from the CSV"
            )
        else:
            assert ocs_registry_image in csv_version, (
                f"OCS registry image version: {ocs_registry_image} mismatch "
                f"with CSV version {csv_version}"
            )

    # Verify OCS Cluster Service (ocs-storagecluster) is Ready
    storage_cluster_name = config.ENV_DATA["storage_cluster_name"]
    log.info("Verifying status of storage cluster: %s", storage_cluster_name)
    storage_cluster = StorageCluster(
        resource_name=storage_cluster_name,
        namespace=namespace,
    )
    # The message names the phase that is actually awaited below.
    log.info(
        f"Check if StorageCluster: {storage_cluster_name} is in Ready phase"
    )
    storage_cluster.wait_for_phase(phase="Ready", timeout=timeout)

    # Verify pods in running state and proper counts
    log.info("Verifying pod states and counts")
    pod = OCP(kind=constants.POD, namespace=namespace)
    external_mode = config.DEPLOYMENT["external_mode"]
    platform = config.ENV_DATA.get("platform")
    on_prem = platform in constants.ON_PREM_PLATFORMS
    if not external_mode:
        device_set = storage_cluster.data["spec"]["storageDeviceSets"][0]
        osd_count = int(device_set["count"]) * int(device_set["replica"])
    rgw_count = None
    # NOTE(review): versions are compared as floats, so a version string
    # such as "4.10" is read as 4.1 and ordered before 4.5 - confirm this
    # function is only used with versions where that cannot happen.
    ocs_version_num = float(ocs_version)
    if on_prem:
        # RGW count is 1 if OCS version < 4.5 or the cluster was upgraded from version <= 4.4
        if ocs_version_num < 4.5 or (
            ocs_version_num == 4.5
            and post_upgrade_verification
            and float(version_before_upgrade) < 4.5
        ):
            rgw_count = 1
        else:
            rgw_count = 2

    # # With 4.4 OCS cluster deployed over Azure, RGW is the default backingstore
    if platform == constants.AZURE_PLATFORM:
        if ocs_version_num == 4.4 or (
            ocs_version_num == 4.5
            and post_upgrade_verification
            and float(version_before_upgrade) < 4.5
        ):
            rgw_count = 1

    min_eps = constants.MIN_NB_ENDPOINT_COUNT_POST_DEPLOYMENT
    max_eps = constants.MAX_NB_ENDPOINT_COUNT if ocs_version_num >= 4.6 else 1

    if platform == constants.IBM_POWER_PLATFORM:
        min_eps = 1
        max_eps = 1

    # Mapping of pod label selector -> expected number of Running pods.
    resources_dict = {
        constants.OCS_OPERATOR_LABEL: 1,
        constants.OPERATOR_LABEL: 1,
        constants.NOOBAA_DB_LABEL: 1,
        constants.NOOBAA_OPERATOR_POD_LABEL: 1,
        constants.NOOBAA_CORE_POD_LABEL: 1,
        constants.NOOBAA_ENDPOINT_POD_LABEL: min_eps,
    }
    if not external_mode:
        resources_dict.update(
            {
                constants.MON_APP_LABEL: 3,
                constants.CSI_CEPHFSPLUGIN_LABEL: number_of_worker_nodes,
                constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL: 2,
                constants.CSI_RBDPLUGIN_LABEL: number_of_worker_nodes,
                constants.CSI_RBDPLUGIN_PROVISIONER_LABEL: 2,
                constants.OSD_APP_LABEL: osd_count,
                constants.MGR_APP_LABEL: 1,
                constants.MDS_APP_LABEL: 2,
                constants.RGW_APP_LABEL: rgw_count,
            }
        )
    for label, count in resources_dict.items():
        # RGW pods are only verified on on-prem platforms.
        if label == constants.RGW_APP_LABEL:
            if not on_prem:
                continue
        assert pod.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            selector=label,
            resource_count=count,
            timeout=timeout,
        )

    nb_ep_pods = get_pods_having_label(
        label=constants.NOOBAA_ENDPOINT_POD_LABEL,
        namespace=defaults.ROOK_CLUSTER_NAMESPACE,
    )
    assert len(nb_ep_pods) <= max_eps, (
        f"The number of running NooBaa endpoint pods ({len(nb_ep_pods)}) "
        f"is greater than the maximum defined in the NooBaa CR ({max_eps})"
    )

    # Verify StorageClasses (1 ceph-fs, 1 ceph-rbd)
    log.info("Verifying storage classes")
    storage_class = OCP(kind=constants.STORAGECLASS, namespace=namespace)
    storage_cluster_name = config.ENV_DATA["storage_cluster_name"]
    required_storage_classes = {
        f"{storage_cluster_name}-cephfs",
        f"{storage_cluster_name}-ceph-rbd",
    }
    if external_mode:
        required_storage_classes.update(
            {
                f"{storage_cluster_name}-ceph-rgw",
                f"{namespace}.noobaa.io",
            }
        )
    storage_classes = storage_class.get()
    storage_class_names = {
        item["metadata"]["name"] for item in storage_classes["items"]
    }
    assert required_storage_classes.issubset(storage_class_names)

    # Verify OSDs are distributed
    if not external_mode:
        if not skip_osd_distribution_check:
            log.info(
                "Verifying OSDs are distributed evenly across worker nodes"
            )
            ocp_pod_obj = OCP(kind=constants.POD, namespace=namespace)
            osds = ocp_pod_obj.get(selector=constants.OSD_APP_LABEL)["items"]
            deviceset_count = get_deviceset_count()
            node_names = [osd["spec"]["nodeName"] for osd in osds]
            # Each distinct node only needs to be checked once.
            for node in set(node_names):
                assert (
                    not node_names.count(node) > deviceset_count
                ), "OSD's are not distributed evenly across worker nodes"

    # Verify that CSI driver object contains provisioner names
    log.info("Verifying CSI driver object contains provisioner names.")
    csi_driver = OCP(kind="CSIDriver")
    csi_drivers = {
        item["metadata"]["name"] for item in csi_driver.get()["items"]
    }
    assert defaults.CSI_PROVISIONERS.issubset(csi_drivers)

    # Verify node and provisioner secret names in storage class
    log.info("Verifying node and provisioner secret names in storage class.")
    if external_mode:
        sc_rbd = storage_class.get(
            resource_name=constants.DEFAULT_EXTERNAL_MODE_STORAGECLASS_RBD
        )
        sc_cephfs = storage_class.get(
            resource_name=(constants.DEFAULT_EXTERNAL_MODE_STORAGECLASS_CEPHFS)
        )
    else:
        sc_rbd = storage_class.get(
            resource_name=constants.DEFAULT_STORAGECLASS_RBD
        )
        sc_cephfs = storage_class.get(
            resource_name=constants.DEFAULT_STORAGECLASS_CEPHFS
        )
    assert (
        sc_rbd["parameters"]["csi.storage.k8s.io/node-stage-secret-name"]
        == constants.RBD_NODE_SECRET
    )
    assert (
        sc_rbd["parameters"]["csi.storage.k8s.io/provisioner-secret-name"]
        == constants.RBD_PROVISIONER_SECRET
    )
    assert (
        sc_cephfs["parameters"]["csi.storage.k8s.io/node-stage-secret-name"]
        == constants.CEPHFS_NODE_SECRET
    )
    assert (
        sc_cephfs["parameters"]["csi.storage.k8s.io/provisioner-secret-name"]
        == constants.CEPHFS_PROVISIONER_SECRET
    )
    log.info("Verified node and provisioner secret names in storage class.")

    # The ceph tools pod is fetched lazily: inside the osd tree check when
    # that runs, otherwise by the crush rule check further down. Without
    # this the crush rule check would hit an unbound name in external mode.
    ct_pod = None

    # Verify ceph osd tree output
    if not external_mode:
        log.info(
            "Verifying ceph osd tree output and checking for device set PVC names "
            "in the output."
        )

        if config.DEPLOYMENT.get("local_storage"):
            deviceset_pvcs = get_compute_node_names()
        else:
            deviceset_pvcs = [pvc.name for pvc in get_deviceset_pvcs()]

        ct_pod = get_ceph_tools_pod()
        osd_tree = ct_pod.exec_ceph_cmd(
            ceph_cmd="ceph osd tree", format="json"
        )
        schemas = {
            "root": constants.OSD_TREE_ROOT,
            "rack": constants.OSD_TREE_RACK,
            "host": constants.OSD_TREE_HOST,
            "osd": constants.OSD_TREE_OSD,
            "region": constants.OSD_TREE_REGION,
            "zone": constants.OSD_TREE_ZONE,
        }
        # NOTE: this assigns into the dict referenced by
        # constants.OSD_TREE_HOST, and the enum is the very list that host
        # names are removed from in the loop below.
        schemas["host"]["properties"]["name"] = {"enum": deviceset_pvcs}
        for item in osd_tree["nodes"]:
            validate(instance=item, schema=schemas[item["type"]])
            if item["type"] == "host":
                deviceset_pvcs.remove(item["name"])
        # Whatever is left was expected but never reported by ceph.
        assert not deviceset_pvcs, (
            f"These device set PVCs are not given in ceph osd tree output "
            f"- {deviceset_pvcs}"
        )
        log.info(
            "Verified ceph osd tree output. Device set PVC names are given in the "
            "output."
        )

    # TODO: Verify ceph osd tree output have osd listed as ssd
    # TODO: Verify ceph osd tree output have zone or rack based on AZ

    # Verify CSI snapshotter sidecar container is not present
    # if the OCS version is < 4.6
    if ocs_version_num < 4.6:
        log.info("Verifying CSI snapshotter is not present.")
        provisioner_pods = get_all_pods(
            namespace=defaults.ROOK_CLUSTER_NAMESPACE,
            selector=[
                constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL,
                constants.CSI_RBDPLUGIN_PROVISIONER_LABEL,
            ],
        )
        for pod_obj in provisioner_pods:
            pod_info = pod_obj.get()
            for container, image in get_images(data=pod_info).items():
                assert ("snapshot" not in container) and (
                    "snapshot" not in image
                ), (
                    f"Snapshot container is present in {pod_obj.name} pod. "
                    f"Container {container}. Image {image}"
                )
        deployments = ocs_csv.get()["spec"]["install"]["spec"]["deployments"]
        rook_ceph_operator_deployment = [
            deployment_val
            for deployment_val in deployments
            if deployment_val["name"] == "rook-ceph-operator"
        ]
        assert {"name": "CSI_ENABLE_SNAPSHOTTER", "value": "false"} in (
            rook_ceph_operator_deployment[0]["spec"]["template"]["spec"][
                "containers"
            ][0]["env"]
        ), "CSI_ENABLE_SNAPSHOTTER value is not set to 'false'."
        log.info("Verified: CSI snapshotter is not present.")

    # Verify pool crush rule is with "type": "zone"
    if utils.get_az_count() == 3:
        log.info("Verifying pool crush rule is with type: zone")
        if ct_pod is None:
            ct_pod = get_ceph_tools_pod()
        crush_dump = ct_pod.exec_ceph_cmd(
            ceph_cmd="ceph osd crush dump", format=""
        )
        pool_names = [
            constants.METADATA_POOL,
            constants.DEFAULT_BLOCKPOOL,
            constants.DATA_POOL,
        ]
        crush_rules = [
            rule
            for rule in crush_dump["rules"]
            if rule["rule_name"] in pool_names
        ]
        for crush_rule in crush_rules:
            assert [
                item
                for item in crush_rule["steps"]
                if item.get("type") == "zone"
            ], f"{crush_rule['rule_name']} is not with type as zone"
        log.info("Verified - pool crush rule is with type: zone")
    log.info("Validate cluster on PVC")
    validate_cluster_on_pvc()

    # Verify ceph health
    log.info("Verifying ceph health")
    health_check_tries = 20
    health_check_delay = 30
    if post_upgrade_verification:
        # In case of upgrade with FIO we have to wait longer time to see
        # health OK. See discussion in BZ:
        # https://bugzilla.redhat.com/show_bug.cgi?id=1817727
        health_check_tries = 180
    assert utils.ceph_health_check(
        namespace, health_check_tries, health_check_delay
    )
    if config.ENV_DATA.get("fips"):
        # In case that fips is enabled when deploying,
        # a verification of the installation of it will run
        # on all running state pods
        check_fips_enabled()
    if config.ENV_DATA.get("encryption_at_rest"):
        osd_encryption_verification()