def deploy_ocs(self): """ Handle OCS deployment, since OCS deployment steps are common to any platform, implementing OCS deployment here in base class. """ _templating = templating.Templating() ceph_cluster = ocp.OCP( kind='CephCluster', namespace=config.ENV_DATA['cluster_namespace'] ) try: ceph_cluster.get().get('items')[0] logger.warning("OCS cluster already exists") return except (IndexError, CommandFailed): logger.info("Running OCS basic installation") create_oc_resource( 'common.yaml', self.cluster_path, _templating, config.ENV_DATA ) run_cmd( f'oc label namespace {config.ENV_DATA["cluster_namespace"]} ' f'"openshift.io/cluster-monitoring=true"' ) run_cmd( f"oc policy add-role-to-user view " f"system:serviceaccount:openshift-monitoring:prometheus-k8s " f"-n {config.ENV_DATA['cluster_namespace']}" ) # HACK: If you would like to drop this hack, make sure that you also # updated docs and write appropriate unit/integration tests for config # processing. if config.ENV_DATA.get('monitoring_enabled') in ("true", "True", True): # RBAC rules for monitoring, based on documentation change in rook: # https://github.com/rook/rook/commit/1b6fe840f6ae7372a9675ba727ecc65326708aa8 # HACK: This should be dropped when OCS is managed by OLM apply_oc_resource( 'rbac.yaml', self.cluster_path, _templating, config.ENV_DATA, template_dir="monitoring" ) # Increased to 15 seconds as 10 is not enough # TODO: do the sampler function and check if resource exist wait_time = 15 logger.info(f"Waiting {wait_time} seconds...") time.sleep(wait_time) create_oc_resource( 'operator-openshift.yaml', self.cluster_path, _templating, config.ENV_DATA ) logger.info(f"Waiting {wait_time} seconds...") time.sleep(wait_time) run_cmd( f"oc wait --for condition=ready pod " f"-l app=rook-ceph-operator " f"-n {config.ENV_DATA['cluster_namespace']} " f"--timeout=120s" ) run_cmd( f"oc wait --for condition=ready pod " f"-l app=rook-discover " f"-n {config.ENV_DATA['cluster_namespace']} " f"--timeout=120s" ) create_oc_resource( 'cluster.yaml', self.cluster_path, _templating, config.ENV_DATA ) pod = ocp.OCP( kind=constants.POD, namespace=config.ENV_DATA['cluster_namespace'] ) cfs = ocp.OCP( kind=constants.CEPHFILESYSTEM, namespace=config.ENV_DATA['cluster_namespace'] ) # Check for Ceph pods assert pod.wait_for_resource( condition='Running', selector='app=rook-ceph-mon', resource_count=3, timeout=600 ) assert pod.wait_for_resource( condition='Running', selector='app=rook-ceph-mgr', timeout=600 ) assert pod.wait_for_resource( condition='Running', selector='app=rook-ceph-osd', resource_count=3, timeout=600 ) create_oc_resource( 'toolbox.yaml', self.cluster_path, _templating, config.ENV_DATA ) logger.info(f"Waiting {wait_time} seconds...") time.sleep(wait_time) # HACK: This should be dropped (including service-monitor.yaml and # prometheus-rules.yaml files) when OCS is managed by OLM if config.ENV_DATA.get('monitoring_enabled') not in ("true", "True", True): # HACK: skip creation of rook-ceph-mgr service monitor when monitoring # is enabled (if this were not skipped, the step would fail because # rook would create the service monitor at this point already) create_oc_resource( "service-monitor.yaml", self.cluster_path, _templating, config.ENV_DATA ) # HACK: skip creation of prometheus-rules, rook-ceph is concerned # with it's setup now, based on clarification from Umanga # Chapagain create_oc_resource( "prometheus-rules.yaml", self.cluster_path, _templating, config.ENV_DATA ) logger.info(f"Waiting {wait_time} seconds...") time.sleep(wait_time) # 
Create MDS pods for CephFileSystem fs_data = templating.load_yaml_to_dict(constants.CEPHFILESYSTEM_YAML) fs_data['metadata']['namespace'] = config.ENV_DATA['cluster_namespace'] ceph_obj = OCS(**fs_data) ceph_obj.create() assert pod.wait_for_resource( condition=constants.STATUS_RUNNING, selector='app=rook-ceph-mds', resource_count=2, timeout=600 ) # Check for CephFilesystem creation in ocp cfs_data = cfs.get() cfs_name = cfs_data['items'][0]['metadata']['name'] if helpers.validate_cephfilesystem(cfs_name): logger.info(f"MDS deployment is successful!") defaults.CEPHFILESYSTEM_NAME = cfs_name else: logger.error( f"MDS deployment Failed! Please check logs!" ) # Verify health of ceph cluster # TODO: move destroy cluster logic to new CLI usage pattern? logger.info("Done creating rook resources, waiting for HEALTH_OK") assert ceph_health_check( namespace=config.ENV_DATA['cluster_namespace'] ) # patch gp2 (EBS) storage class as 'non-default' logger.info("Patch gp2 storageclass as non-default") patch = " '{\"metadata\": {\"annotations\":{\"storageclass.kubernetes.io/is-default-class\":\"false\"}}}' " run_cmd( f"oc patch storageclass gp2 " f"-p {patch} " f"--request-timeout=120s" )
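
# --- Illustrative sketch (not part of the original module) ---
# The deployment flow above repeatedly pairs create_oc_resource()/OCS.create()
# with a readiness wait on the rook pods. A minimal, standalone version of
# that wait step, reusing only the ocp/constants/config objects already
# imported above (the default selector and counts are examples only):
def wait_for_rook_pods(selector="app=rook-ceph-mon", count=3, timeout=600):
    pod = ocp.OCP(
        kind=constants.POD,
        namespace=config.ENV_DATA['cluster_namespace']
    )
    assert pod.wait_for_resource(
        condition='Running',
        selector=selector,
        resource_count=count,
        timeout=timeout
    ), f"Pods with selector {selector} did not reach Running state"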
def test_rwo_pvc_fencing_node_prolonged_network_failure(
        self, nodes, setup, teardown):
    """
    OCS-1427/OCS-1429:
    - Start DeploymentConfig based app pods on 1 OCS/Non-OCS node
    - Make the node (where app pods are running) unresponsive
        by bringing its main network interface down
    - Check new app pods and/or mon, osd pods scheduled on another node
        are stuck due to Multi-Attach error.
    - Power off the unresponsive node
    - Force delete the app pods and/or mon,osd pods on the unresponsive node
    - Check new app pods and/or mon, osd pods scheduled on another node
        come into Running state
    - Run IOs on new app pods

    OCS-1430/OCS-1435:
    - Start DeploymentConfig based app pods on multiple nodes
        Colocated scenario: Select 1 node where osd and/or mon is running,
            select 2 other nodes where mon/osd are not running
        Dedicated scenario: 3 Non-OCS nodes
    - Disrupt the leader provisioner pods if not running on above selected
        nodes
    - Make the nodes (where app pods are running) unresponsive
        by bringing their main network interface down
    - Check new app pods and/or mon, osd pods scheduled on another node
        are stuck due to Multi-Attach error.
    - Power off the unresponsive nodes
    - Force delete the app pods and/or mon,osd pods on the unresponsive nodes
    - Check new app pods and/or mon, osd pods scheduled on another node
        come into Running state
    - Run IOs on new app pods
    """
    ceph_cluster, dc_pods, ceph_pods, app_pod_nodes, test_nodes, disruptor = setup

    # Run IO on pods
    md5sum_data = self.run_and_verify_io(
        pod_list=dc_pods, fio_filename="io_file1", run_io_in_bg=True
    )

    # OCS-1430/OCS-1435
    # Disrupt leader plugin-provisioner pods, skip if running on node to be failed
    if disruptor:
        [disruption.delete_resource() for disruption in disruptor]

    # Induce network failure on the nodes
    node.node_network_failure(app_pod_nodes)
    logger.info(f"Waiting for {self.prolong_nw_fail_time} seconds")
    sleep(self.prolong_nw_fail_time)

    # Wait for pods to be rescheduled
    for pod_obj in dc_pods + ceph_pods:
        pod_obj.ocp.wait_for_resource(
            condition=constants.STATUS_TERMINATING,
            resource_name=pod_obj.name
        )

    # Fetch info of new pods and verify Multi-Attach error
    new_dc_pods = self.get_new_pods(dc_pods)
    assert len(new_dc_pods) == len(dc_pods), "Unexpected number of app pods"
    self.verify_multi_attach_error(new_dc_pods)

    if ceph_pods:
        new_ceph_pods = self.get_new_pods(ceph_pods)
        assert len(new_ceph_pods) > 0, "Unexpected number of osd pods"
        self.verify_multi_attach_error(new_ceph_pods)

    logger.info("Executing manual recovery steps")
    # Power off the unresponsive node(s)
    logger.info(f"Powering off the unresponsive node(s): {app_pod_nodes}")
    nodes.stop_nodes(node.get_node_objs(app_pod_nodes))

    # Force delete the app pods and/or mon,osd pods on the unresponsive node(s)
    if ceph_cluster.mon_count == 5 and float(
            config.ENV_DATA["ocs_version"]) < 4.4:
        for pod_obj in ceph_cluster.mons:
            if pod.get_pod_node(pod_obj).name in app_pod_nodes:
                ceph_pods.append(pod_obj)

    for pod_obj in dc_pods + ceph_pods:
        pod_obj.delete(force=True)

    # Wait for new app pods to reach Running state
    for pod_obj in new_dc_pods:
        assert pod_obj.ocp.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            resource_name=pod_obj.name,
            timeout=1200,
            sleep=30,
        ), f"App pod with name {pod_obj.name} did not reach Running state"

    # Wait for mon and osd pods to reach Running state
    selectors_to_check = [constants.MON_APP_LABEL, constants.OSD_APP_LABEL]
    for selector in selectors_to_check:
        assert ceph_cluster.POD.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            selector=selector,
            resource_count=3,
            timeout=1800,
            sleep=60,
        ), f"3 expected pods with selector {selector} are not in Running state"

    if ceph_cluster.mon_count == 3:
        # Check ceph health
        toolbox_status = ceph_cluster.POD.get_resource_status(
            ceph_cluster.toolbox.name
        )
        if toolbox_status == constants.STATUS_TERMINATING:
            ceph_cluster.toolbox.delete(force=True)

        assert ceph_health_check(), "Ceph cluster health is not OK"
        logger.info("Ceph cluster health is OK")

    # Verify data integrity from new pods
    for num, pod_obj in enumerate(new_dc_pods):
        pod.verify_data_integrity(
            pod_obj=pod_obj,
            file_name="io_file1",
            original_md5sum=md5sum_data[num]
        )

    # Run IO on new pods
    self.run_and_verify_io(
        pod_list=new_dc_pods, fio_filename="io_file2", return_md5sum=False
    )
def ocs_install_verification( timeout=600, skip_osd_distribution_check=False, ocs_registry_image=None, post_upgrade_verification=False, version_before_upgrade=None, ): """ Perform steps necessary to verify a successful OCS installation Args: timeout (int): Number of seconds for timeout which will be used in the checks used in this function. skip_osd_distribution_check (bool): If true skip the check for osd distribution. ocs_registry_image (str): Specific image to check if it was installed properly. post_upgrade_verification (bool): Set to True if this function is called after upgrade. version_before_upgrade (float): Set to OCS version before upgrade """ from ocs_ci.ocs.node import get_nodes from ocs_ci.ocs.resources.pvc import get_deviceset_pvcs from ocs_ci.ocs.resources.pod import get_ceph_tools_pod, get_all_pods from ocs_ci.ocs.cluster import validate_cluster_on_pvc from ocs_ci.ocs.resources.fips import check_fips_enabled number_of_worker_nodes = len(get_nodes()) namespace = config.ENV_DATA["cluster_namespace"] log.info("Verifying OCS installation") # Verify OCS CSV is in Succeeded phase log.info("verifying ocs csv") ocs_csv = get_ocs_csv() # Verify if OCS CSV has proper version. csv_version = ocs_csv.data["spec"]["version"] ocs_version = config.ENV_DATA["ocs_version"] log.info( f"Check if OCS version: {ocs_version} matches with CSV: {csv_version}") assert ( ocs_version in csv_version ), f"OCS version: {ocs_version} mismatch with CSV version {csv_version}" # Verify if OCS CSV has the same version in provided CI build. ocs_registry_image = ocs_registry_image or config.DEPLOYMENT.get( "ocs_registry_image") if ocs_registry_image and ocs_registry_image.endswith(".ci"): ocs_registry_image = ocs_registry_image.split(":")[1] log.info( f"Check if OCS registry image: {ocs_registry_image} matches with " f"CSV: {csv_version}") ignore_csv_mismatch = config.DEPLOYMENT.get("ignore_csv_mismatch") if ignore_csv_mismatch: log.info( "The possible mismatch will be ignored as you deployed " "the different version than the default version from the CSV") else: assert ocs_registry_image in csv_version, ( f"OCS registry image version: {ocs_registry_image} mismatch " f"with CSV version {csv_version}") # Verify OCS Cluster Service (ocs-storagecluster) is Ready storage_cluster_name = config.ENV_DATA["storage_cluster_name"] log.info("Verifying status of storage cluster: %s", storage_cluster_name) storage_cluster = StorageCluster( resource_name=storage_cluster_name, namespace=namespace, ) log.info(f"Check if StorageCluster: {storage_cluster_name} is in" f"Succeeded phase") storage_cluster.wait_for_phase(phase="Ready", timeout=timeout) # Verify pods in running state and proper counts log.info("Verifying pod states and counts") pod = OCP(kind=constants.POD, namespace=namespace) if not config.DEPLOYMENT["external_mode"]: osd_count = int( storage_cluster.data["spec"]["storageDeviceSets"][0]["count"] ) * int( storage_cluster.data["spec"]["storageDeviceSets"][0]["replica"]) rgw_count = None if config.ENV_DATA.get("platform") in constants.ON_PREM_PLATFORMS: # RGW count is 1 if OCS version < 4.5 or the cluster was upgraded from version <= 4.4 if (float(config.ENV_DATA["ocs_version"]) < 4.5 or float(config.ENV_DATA["ocs_version"]) == 4.5 and (post_upgrade_verification and float(version_before_upgrade) < 4.5)): rgw_count = 1 else: rgw_count = 2 # # With 4.4 OCS cluster deployed over Azure, RGW is the default backingstore if config.ENV_DATA.get("platform") == constants.AZURE_PLATFORM: if float(config.ENV_DATA["ocs_version"]) 
== 4.4 or ( float(config.ENV_DATA["ocs_version"]) == 4.5 and (post_upgrade_verification and float(version_before_upgrade) < 4.5)): rgw_count = 1 min_eps = constants.MIN_NB_ENDPOINT_COUNT_POST_DEPLOYMENT max_eps = (constants.MAX_NB_ENDPOINT_COUNT if float(config.ENV_DATA["ocs_version"]) >= 4.6 else 1) if config.ENV_DATA.get("platform") == constants.IBM_POWER_PLATFORM: min_eps = 1 max_eps = 1 nb_db_label = (constants.NOOBAA_DB_LABEL_46_AND_UNDER if float(config.ENV_DATA["ocs_version"]) < 4.7 else constants.NOOBAA_DB_LABEL_47_AND_ABOVE) resources_dict = { nb_db_label: 1, constants.OCS_OPERATOR_LABEL: 1, constants.OPERATOR_LABEL: 1, constants.NOOBAA_OPERATOR_POD_LABEL: 1, constants.NOOBAA_CORE_POD_LABEL: 1, constants.NOOBAA_ENDPOINT_POD_LABEL: min_eps, } if not config.DEPLOYMENT["external_mode"]: resources_dict.update({ constants.MON_APP_LABEL: 3, constants.CSI_CEPHFSPLUGIN_LABEL: number_of_worker_nodes, constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL: 2, constants.CSI_RBDPLUGIN_LABEL: number_of_worker_nodes, constants.CSI_RBDPLUGIN_PROVISIONER_LABEL: 2, constants.OSD_APP_LABEL: osd_count, constants.MGR_APP_LABEL: 1, constants.MDS_APP_LABEL: 2, constants.RGW_APP_LABEL: rgw_count, }) for label, count in resources_dict.items(): if label == constants.RGW_APP_LABEL: if not config.ENV_DATA.get( "platform") in constants.ON_PREM_PLATFORMS: continue assert pod.wait_for_resource( condition=constants.STATUS_RUNNING, selector=label, resource_count=count, timeout=timeout, ) nb_ep_pods = get_pods_having_label( label=constants.NOOBAA_ENDPOINT_POD_LABEL, namespace=defaults.ROOK_CLUSTER_NAMESPACE, ) assert len(nb_ep_pods) <= max_eps, ( f"The number of running NooBaa endpoint pods ({len(nb_ep_pods)}) " f"is greater than the maximum defined in the NooBaa CR ({max_eps})") # Verify StorageClasses (1 ceph-fs, 1 ceph-rbd) log.info("Verifying storage classes") storage_class = OCP(kind=constants.STORAGECLASS, namespace=namespace) storage_cluster_name = config.ENV_DATA["storage_cluster_name"] required_storage_classes = { f"{storage_cluster_name}-cephfs", f"{storage_cluster_name}-ceph-rbd", } if config.DEPLOYMENT["external_mode"]: required_storage_classes.update({ f"{storage_cluster_name}-ceph-rgw", f'{config.ENV_DATA["cluster_namespace"]}.noobaa.io', }) storage_classes = storage_class.get() storage_class_names = { item["metadata"]["name"] for item in storage_classes["items"] } assert required_storage_classes.issubset(storage_class_names) # Verify OSDs are distributed if not config.DEPLOYMENT["external_mode"]: if not skip_osd_distribution_check: log.info( "Verifying OSDs are distributed evenly across worker nodes") ocp_pod_obj = OCP(kind=constants.POD, namespace=namespace) osds = ocp_pod_obj.get(selector=constants.OSD_APP_LABEL)["items"] deviceset_count = get_deviceset_count() node_names = [osd["spec"]["nodeName"] for osd in osds] for node in node_names: assert ( not node_names.count(node) > deviceset_count ), "OSD's are not distributed evenly across worker nodes" # Verify that CSI driver object contains provisioner names log.info("Verifying CSI driver object contains provisioner names.") csi_driver = OCP(kind="CSIDriver") csi_drivers = { item["metadata"]["name"] for item in csi_driver.get()["items"] } assert defaults.CSI_PROVISIONERS.issubset(csi_drivers) # Verify node and provisioner secret names in storage class log.info("Verifying node and provisioner secret names in storage class.") if config.DEPLOYMENT["external_mode"]: sc_rbd = storage_class.get( resource_name=constants.DEFAULT_EXTERNAL_MODE_STORAGECLASS_RBD) 
sc_cephfs = storage_class.get(resource_name=( constants.DEFAULT_EXTERNAL_MODE_STORAGECLASS_CEPHFS)) else: sc_rbd = storage_class.get( resource_name=constants.DEFAULT_STORAGECLASS_RBD) sc_cephfs = storage_class.get( resource_name=constants.DEFAULT_STORAGECLASS_CEPHFS) assert (sc_rbd["parameters"]["csi.storage.k8s.io/node-stage-secret-name"] == constants.RBD_NODE_SECRET) assert (sc_rbd["parameters"]["csi.storage.k8s.io/provisioner-secret-name"] == constants.RBD_PROVISIONER_SECRET) assert ( sc_cephfs["parameters"]["csi.storage.k8s.io/node-stage-secret-name"] == constants.CEPHFS_NODE_SECRET) assert ( sc_cephfs["parameters"]["csi.storage.k8s.io/provisioner-secret-name"] == constants.CEPHFS_PROVISIONER_SECRET) log.info("Verified node and provisioner secret names in storage class.") # Verify ceph osd tree output if not config.DEPLOYMENT["external_mode"]: log.info( "Verifying ceph osd tree output and checking for device set PVC names " "in the output.") if config.DEPLOYMENT.get("local_storage"): deviceset_pvcs = [osd.get_node() for osd in get_osd_pods()] else: deviceset_pvcs = [pvc.name for pvc in get_deviceset_pvcs()] ct_pod = get_ceph_tools_pod() osd_tree = ct_pod.exec_ceph_cmd(ceph_cmd="ceph osd tree", format="json") schemas = { "root": constants.OSD_TREE_ROOT, "rack": constants.OSD_TREE_RACK, "host": constants.OSD_TREE_HOST, "osd": constants.OSD_TREE_OSD, "region": constants.OSD_TREE_REGION, "zone": constants.OSD_TREE_ZONE, } schemas["host"]["properties"]["name"] = {"enum": deviceset_pvcs} for item in osd_tree["nodes"]: validate(instance=item, schema=schemas[item["type"]]) if item["type"] == "host": deviceset_pvcs.remove(item["name"]) assert not deviceset_pvcs, ( f"These device set PVCs are not given in ceph osd tree output " f"- {deviceset_pvcs}") log.info( "Verified ceph osd tree output. Device set PVC names are given in the " "output.") # TODO: Verify ceph osd tree output have osd listed as ssd # TODO: Verify ceph osd tree output have zone or rack based on AZ # Verify CSI snapshotter sidecar container is not present # if the OCS version is < 4.6 if float(config.ENV_DATA["ocs_version"]) < 4.6: log.info("Verifying CSI snapshotter is not present.") provisioner_pods = get_all_pods( namespace=defaults.ROOK_CLUSTER_NAMESPACE, selector=[ constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL, constants.CSI_RBDPLUGIN_PROVISIONER_LABEL, ], ) for pod_obj in provisioner_pods: pod_info = pod_obj.get() for container, image in get_images(data=pod_info).items(): assert ("snapshot" not in container) and ( "snapshot" not in image ), (f"Snapshot container is present in {pod_obj.name} pod. " f"Container {container}. Image {image}") deployments = ocs_csv.get()["spec"]["install"]["spec"]["deployments"] rook_ceph_operator_deployment = [ deployment_val for deployment_val in deployments if deployment_val["name"] == "rook-ceph-operator" ] assert { "name": "CSI_ENABLE_SNAPSHOTTER", "value": "false" } in (rook_ceph_operator_deployment[0]["spec"]["template"]["spec"] ["containers"][0]["env"] ), "CSI_ENABLE_SNAPSHOTTER value is not set to 'false'." 
log.info("Verified: CSI snapshotter is not present.") # Verify pool crush rule is with "type": "zone" if utils.get_az_count() == 3: log.info("Verifying pool crush rule is with type: zone") crush_dump = ct_pod.exec_ceph_cmd(ceph_cmd="ceph osd crush dump", format="") pool_names = [ constants.METADATA_POOL, constants.DEFAULT_BLOCKPOOL, constants.DATA_POOL, ] crush_rules = [ rule for rule in crush_dump["rules"] if rule["rule_name"] in pool_names ] for crush_rule in crush_rules: assert [ item for item in crush_rule["steps"] if item.get("type") == "zone" ], f"{crush_rule['rule_name']} is not with type as zone" log.info("Verified - pool crush rule is with type: zone") log.info("Validate cluster on PVC") validate_cluster_on_pvc() # Verify ceph health log.info("Verifying ceph health") health_check_tries = 20 health_check_delay = 30 if post_upgrade_verification: # In case of upgrade with FIO we have to wait longer time to see # health OK. See discussion in BZ: # https://bugzilla.redhat.com/show_bug.cgi?id=1817727 health_check_tries = 180 assert utils.ceph_health_check(namespace, health_check_tries, health_check_delay) if config.ENV_DATA.get("fips"): # In case that fips is enabled when deploying, # a verification of the installation of it will run # on all running state pods check_fips_enabled() if config.ENV_DATA.get("encryption_at_rest"): osd_encryption_verification()
def finalizer():
    nodes.restart_nodes_by_stop_and_start_teardown()
    assert ceph_health_check(), "Ceph cluster health is not OK"
    log.info("Ceph cluster health is OK")
def deploy_ocs(self): """ Handle OCS deployment, since OCS deployment steps are common to any platform, implementing OCS deployment here in base class. """ set_registry_to_managed_state() image = None ceph_cluster = ocp.OCP(kind="CephCluster", namespace=self.namespace) try: ceph_cluster.get().get("items")[0] logger.warning("OCS cluster already exists") return except (IndexError, CommandFailed): logger.info("Running OCS basic installation") # disconnected installation? load_cluster_info() if config.DEPLOYMENT.get("disconnected"): image = prepare_disconnected_ocs_deployment() if config.DEPLOYMENT["external_mode"]: self.deploy_with_external_mode() else: self.deploy_ocs_via_operator(image) pod = ocp.OCP(kind=constants.POD, namespace=self.namespace) cfs = ocp.OCP(kind=constants.CEPHFILESYSTEM, namespace=self.namespace) # Check for Ceph pods mon_pod_timeout = (900 if self.platform == constants.IBMCLOUD_PLATFORM else 600) assert pod.wait_for_resource( condition="Running", selector="app=rook-ceph-mon", resource_count=3, timeout=mon_pod_timeout, ) assert pod.wait_for_resource(condition="Running", selector="app=rook-ceph-mgr", timeout=600) assert pod.wait_for_resource( condition="Running", selector="app=rook-ceph-osd", resource_count=3, timeout=600, ) # validate ceph mon/osd volumes are backed by pvc validate_cluster_on_pvc() # validate PDB creation of MON, MDS, OSD pods validate_pdb_creation() # Creating toolbox pod setup_ceph_toolbox() assert pod.wait_for_resource( condition=constants.STATUS_RUNNING, selector="app=rook-ceph-tools", resource_count=1, timeout=600, ) if not config.COMPONENTS["disable_cephfs"]: # Check for CephFilesystem creation in ocp cfs_data = cfs.get() cfs_name = cfs_data["items"][0]["metadata"]["name"] if helpers.validate_cephfilesystem(cfs_name): logger.info("MDS deployment is successful!") defaults.CEPHFILESYSTEM_NAME = cfs_name else: logger.error("MDS deployment Failed! Please check logs!") # Change monitoring backend to OCS if config.ENV_DATA.get("monitoring_enabled") and config.ENV_DATA.get( "persistent-monitoring"): setup_persistent_monitoring() elif config.ENV_DATA.get("monitoring_enabled") and config.ENV_DATA.get( "telemeter_server_url"): # Create configmap cluster-monitoring-config to reconfigure # telemeter server url when 'persistent-monitoring' is False create_configmap_cluster_monitoring_pod( telemeter_server_url=config.ENV_DATA["telemeter_server_url"]) if not config.COMPONENTS["disable_cephfs"]: # Change registry backend to OCS CEPHFS RWX PVC registry.change_registry_backend_to_ocs() # Verify health of ceph cluster logger.info("Done creating rook resources, waiting for HEALTH_OK") try: ceph_health_check(namespace=self.namespace, tries=30, delay=10) except CephHealthException as ex: err = str(ex) logger.warning(f"Ceph health check failed with {err}") if "clock skew detected" in err: logger.info(f"Changing NTP on compute nodes to" f" {constants.RH_NTP_CLOCK}") if self.platform == constants.VSPHERE_PLATFORM: update_ntp_compute_nodes() assert ceph_health_check(namespace=self.namespace, tries=60, delay=10) # patch gp2/thin storage class as 'non-default' self.patch_default_sc_to_non_default()
def deploy_ocs(self): """ Handle OCS deployment, since OCS deployment steps are common to any platform, implementing OCS deployment here in base class. """ _templating = templating.Templating() ceph_cluster = ocp.OCP(kind='CephCluster', namespace=self.namespace) try: ceph_cluster.get().get('items')[0] logger.warning("OCS cluster already exists") return except (IndexError, CommandFailed): logger.info("Running OCS basic installation") if not self.ocs_operator_deployment: create_oc_resource('common.yaml', self.cluster_path, _templating, config.ENV_DATA) run_cmd( f'oc label namespace {config.ENV_DATA["cluster_namespace"]} ' f'"openshift.io/cluster-monitoring=true"') run_cmd( f"oc policy add-role-to-user view " f"system:serviceaccount:openshift-monitoring:prometheus-k8s " f"-n {self.namespace}") # HACK: If you would like to drop this hack, make sure that you # also updated docs and write appropriate unit/integration tests # for config processing. if config.ENV_DATA.get('monitoring_enabled') in ("true", "True", True): # RBAC rules for monitoring, based on documentation change in # rook: # https://github.com/rook/rook/commit/1b6fe840f6ae7372a9675ba727ecc65326708aa8 # HACK: This should be dropped when OCS is managed by OLM apply_oc_resource('rbac.yaml', self.cluster_path, _templating, config.ENV_DATA, template_dir="monitoring") # Increased to 15 seconds as 10 is not enough # TODO: do the sampler function and check if resource exist wait_time = 15 logger.info(f"Waiting {wait_time} seconds...") time.sleep(wait_time) create_oc_resource('operator-openshift.yaml', self.cluster_path, _templating, config.ENV_DATA) logger.info(f"Waiting {wait_time} seconds...") time.sleep(wait_time) run_cmd(f"oc wait --for condition=ready pod " f"-l app=rook-ceph-operator " f"-n {self.namespace} " f"--timeout=120s") run_cmd(f"oc wait --for condition=ready pod " f"-l app=rook-discover " f"-n {self.namespace} " f"--timeout=120s") create_oc_resource('cluster.yaml', self.cluster_path, _templating, config.ENV_DATA) else: self.deploy_ocs_via_operator() pod = ocp.OCP(kind=constants.POD, namespace=self.namespace) cfs = ocp.OCP(kind=constants.CEPHFILESYSTEM, namespace=self.namespace) # Check for Ceph pods assert pod.wait_for_resource(condition='Running', selector='app=rook-ceph-mon', resource_count=3, timeout=600) assert pod.wait_for_resource(condition='Running', selector='app=rook-ceph-mgr', timeout=600) assert pod.wait_for_resource(condition='Running', selector='app=rook-ceph-osd', resource_count=3, timeout=600) # Validation for cluster on pvc logger.info("Validate mon and OSD are backed by PVCs") validate_cluster_on_pvc(label=constants.MON_APP_LABEL) validate_cluster_on_pvc(label=constants.DEFAULT_DEVICESET_LABEL) # Creating toolbox pod setup_ceph_toolbox() assert pod.wait_for_resource(condition=constants.STATUS_RUNNING, selector='app=rook-ceph-tools', resource_count=1, timeout=600) if not self.ocs_operator_deployment: logger.info(f"Waiting {wait_time} seconds...") time.sleep(wait_time) # HACK: This should be dropped (including service-monitor.yaml and # prometheus-rules.yaml files) when OCS is managed by OLM if config.ENV_DATA.get('monitoring_enabled') not in ("true", "True", True): # HACK: skip creation of rook-ceph-mgr service monitor when # monitoring is enabled (if this were not skipped, the step # would fail because rook would create the service monitor at # this point already) create_oc_resource("service-monitor.yaml", self.cluster_path, _templating, config.ENV_DATA) # HACK: skip creation of prometheus-rules, rook-ceph 
is # concerned with it's setup now, based on clarification from # Umanga Chapagain create_oc_resource("prometheus-rules.yaml", self.cluster_path, _templating, config.ENV_DATA) logger.info(f"Waiting {wait_time} seconds...") time.sleep(wait_time) # Create MDS pods for CephFileSystem fs_data = templating.load_yaml(constants.CEPHFILESYSTEM_YAML) fs_data['metadata']['namespace'] = self.namespace ceph_obj = OCS(**fs_data) ceph_obj.create() assert pod.wait_for_resource(condition=constants.STATUS_RUNNING, selector='app=rook-ceph-mds', resource_count=2, timeout=600) # Check for CephFilesystem creation in ocp cfs_data = cfs.get() cfs_name = cfs_data['items'][0]['metadata']['name'] if helpers.validate_cephfilesystem(cfs_name): logger.info(f"MDS deployment is successful!") defaults.CEPHFILESYSTEM_NAME = cfs_name else: logger.error(f"MDS deployment Failed! Please check logs!") if config.ENV_DATA.get('monitoring_enabled') and config.ENV_DATA.get( 'persistent-monitoring'): # Create a pool, secrets and sc secret_obj = helpers.create_secret( interface_type=constants.CEPHBLOCKPOOL) cbj_obj = helpers.create_ceph_block_pool() sc_obj = helpers.create_storage_class( interface_type=constants.CEPHBLOCKPOOL, interface_name=cbj_obj.name, secret_name=secret_obj.name) # Get the list of monitoring pods pods_list = get_all_pods( namespace=defaults.OCS_MONITORING_NAMESPACE, selector=['prometheus', 'alertmanager']) # Create configmap cluster-monitoring-config create_configmap_cluster_monitoring_pod(sc_obj.name) # Take some time to respin the pod waiting_time = 30 logger.info(f"Waiting {waiting_time} seconds...") time.sleep(waiting_time) # Validate the pods are respinned and in running state validate_pods_are_respinned_and_running_state(pods_list) # Validate the pvc is created on monitoring pods validate_pvc_created_and_bound_on_monitoring_pods() # Validate the pvc are mounted on pods validate_pvc_are_mounted_on_monitoring_pods(pods_list) # Verify health of ceph cluster # TODO: move destroy cluster logic to new CLI usage pattern? logger.info("Done creating rook resources, waiting for HEALTH_OK") assert ceph_health_check(namespace=self.namespace) # patch gp2/thin storage class as 'non-default' self.patch_default_sc_to_non_default()
def ocs_install_verification(timeout=600, skip_osd_distribution_check=False): """ Perform steps necessary to verify a successful OCS installation Args: timeout (int): Number of seconds for timeout which will be used in the checks used in this function. skip_osd_distribution_check (bool): If true skip the check for osd distribution. """ from ocs_ci.ocs.node import get_typed_nodes from ocs_ci.ocs.resources.pvc import get_deviceset_pvcs from ocs_ci.ocs.resources.pod import get_ceph_tools_pod, get_all_pods number_of_worker_nodes = len(get_typed_nodes()) namespace = config.ENV_DATA['cluster_namespace'] log.info("Verifying OCS installation") # Verify OCS CSV is in Succeeded phase log.info("verifying ocs csv") operator_selector = get_selector_for_ocs_operator() ocs_package_manifest = PackageManifest( resource_name=defaults.OCS_OPERATOR_NAME, selector=operator_selector, ) ocs_csv_name = ocs_package_manifest.get_current_csv() ocs_csv = CSV(resource_name=ocs_csv_name, namespace=namespace) log.info(f"Check if OCS operator: {ocs_csv_name} is in Succeeded phase.") ocs_csv.wait_for_phase(phase="Succeeded", timeout=timeout) # Verify OCS Cluster Service (ocs-storagecluster) is Ready storage_cluster_name = config.ENV_DATA['storage_cluster_name'] log.info("Verifying status of storage cluster: %s", storage_cluster_name) storage_cluster = StorageCluster( resource_name=storage_cluster_name, namespace=namespace, ) log.info(f"Check if StorageCluster: {storage_cluster_name} is in" f"Succeeded phase") storage_cluster.wait_for_phase(phase='Ready', timeout=timeout) # Verify pods in running state and proper counts log.info("Verifying pod states and counts") pod = OCP(kind=constants.POD, namespace=namespace) # ocs-operator assert pod.wait_for_resource(condition=constants.STATUS_RUNNING, selector=constants.OCS_OPERATOR_LABEL, timeout=timeout) # rook-ceph-operator assert pod.wait_for_resource(condition=constants.STATUS_RUNNING, selector=constants.OPERATOR_LABEL, timeout=timeout) # noobaa assert pod.wait_for_resource(condition=constants.STATUS_RUNNING, selector=constants.NOOBAA_APP_LABEL, resource_count=2, timeout=timeout) # mons assert pod.wait_for_resource(condition=constants.STATUS_RUNNING, selector=constants.MON_APP_LABEL, resource_count=3, timeout=timeout) # csi-cephfsplugin assert pod.wait_for_resource(condition=constants.STATUS_RUNNING, selector=constants.CSI_CEPHFSPLUGIN_LABEL, resource_count=number_of_worker_nodes, timeout=timeout) # csi-cephfsplugin-provisioner assert pod.wait_for_resource( condition=constants.STATUS_RUNNING, selector=constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL, resource_count=2, timeout=timeout) # csi-rbdplugin assert pod.wait_for_resource(condition=constants.STATUS_RUNNING, selector=constants.CSI_RBDPLUGIN_LABEL, resource_count=number_of_worker_nodes, timeout=timeout) # csi-rbdplugin-provisioner assert pod.wait_for_resource( condition=constants.STATUS_RUNNING, selector=constants.CSI_RBDPLUGIN_PROVISIONER_LABEL, resource_count=2, timeout=timeout) # osds osd_count = ( int(storage_cluster.data['spec']['storageDeviceSets'][0]['count']) * int(storage_cluster.data['spec']['storageDeviceSets'][0]['replica'])) assert pod.wait_for_resource(condition=constants.STATUS_RUNNING, selector=constants.OSD_APP_LABEL, resource_count=osd_count, timeout=timeout) # mgr assert pod.wait_for_resource(condition=constants.STATUS_RUNNING, selector=constants.MGR_APP_LABEL, timeout=timeout) # mds assert pod.wait_for_resource(condition=constants.STATUS_RUNNING, selector=constants.MDS_APP_LABEL, resource_count=2, 
timeout=timeout) # rgw check only for VmWare if config.ENV_DATA.get('platform') == constants.VSPHERE_PLATFORM: assert pod.wait_for_resource(condition=constants.STATUS_RUNNING, selector=constants.RGW_APP_LABEL, resource_count=1, timeout=timeout) # Verify ceph health log.info("Verifying ceph health") assert utils.ceph_health_check(namespace=namespace) # Verify StorageClasses (1 ceph-fs, 1 ceph-rbd) log.info("Verifying storage classes") storage_class = OCP(kind=constants.STORAGECLASS, namespace=namespace) storage_cluster_name = config.ENV_DATA['storage_cluster_name'] required_storage_classes = { f'{storage_cluster_name}-cephfs', f'{storage_cluster_name}-ceph-rbd' } storage_classes = storage_class.get() storage_class_names = { item['metadata']['name'] for item in storage_classes['items'] } assert required_storage_classes.issubset(storage_class_names) # Verify OSD's are distributed if not skip_osd_distribution_check: log.info("Verifying OSD's are distributed evenly across worker nodes") ocp_pod_obj = OCP(kind=constants.POD, namespace=namespace) osds = ocp_pod_obj.get(selector=constants.OSD_APP_LABEL)['items'] node_names = [osd['spec']['nodeName'] for osd in osds] for node in node_names: assert not node_names.count(node) > 1, ( "OSD's are not distributed evenly across worker nodes") # Verify that CSI driver object contains provisioner names log.info("Verifying CSI driver object contains provisioner names.") csi_driver = OCP(kind="CSIDriver") assert {defaults.CEPHFS_PROVISIONER, defaults.RBD_PROVISIONER} == ({ item['metadata']['name'] for item in csi_driver.get()['items'] }) # Verify node and provisioner secret names in storage class log.info("Verifying node and provisioner secret names in storage class.") sc_rbd = storage_class.get( resource_name=constants.DEFAULT_STORAGECLASS_RBD) sc_cephfs = storage_class.get( resource_name=constants.DEFAULT_STORAGECLASS_CEPHFS) assert sc_rbd['parameters'][ 'csi.storage.k8s.io/node-stage-secret-name'] == constants.RBD_NODE_SECRET assert sc_rbd['parameters'][ 'csi.storage.k8s.io/provisioner-secret-name'] == constants.RBD_PROVISIONER_SECRET assert sc_cephfs['parameters'][ 'csi.storage.k8s.io/node-stage-secret-name'] == constants.CEPHFS_NODE_SECRET assert sc_cephfs['parameters'][ 'csi.storage.k8s.io/provisioner-secret-name'] == constants.CEPHFS_PROVISIONER_SECRET log.info("Verified node and provisioner secret names in storage class.") # Verify ceph osd tree output log.info( "Verifying ceph osd tree output and checking for device set PVC names " "in the output.") deviceset_pvcs = [pvc.name for pvc in get_deviceset_pvcs()] ct_pod = get_ceph_tools_pod() osd_tree = ct_pod.exec_ceph_cmd(ceph_cmd='ceph osd tree', format='json') schemas = { 'root': constants.OSD_TREE_ROOT, 'rack': constants.OSD_TREE_RACK, 'host': constants.OSD_TREE_HOST, 'osd': constants.OSD_TREE_OSD, 'region': constants.OSD_TREE_REGION, 'zone': constants.OSD_TREE_ZONE } schemas['host']['properties']['name'] = {'enum': deviceset_pvcs} for item in osd_tree['nodes']: validate(instance=item, schema=schemas[item['type']]) if item['type'] == 'host': deviceset_pvcs.remove(item['name']) assert not deviceset_pvcs, ( f"These device set PVCs are not given in ceph osd tree output " f"- {deviceset_pvcs}") log.info( "Verified ceph osd tree output. 
Device set PVC names are given in the " "output.") # TODO: Verify ceph osd tree output have osd listed as ssd # TODO: Verify ceph osd tree output have zone or rack based on AZ # Verify CSI snapshotter sidecar container is not present log.info("Verifying CSI snapshotter is not present.") provisioner_pods = get_all_pods( namespace=defaults.ROOK_CLUSTER_NAMESPACE, selector=[ constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL, constants.CSI_RBDPLUGIN_PROVISIONER_LABEL ]) for pod_obj in provisioner_pods: pod_info = pod_obj.get() for container, image in get_images(data=pod_info).items(): assert ('snapshot' not in container) and ( 'snapshot' not in image), ( f"Snapshot container is present in {pod_obj.name} pod. " f"Container {container}. Image {image}") assert { 'name': 'CSI_ENABLE_SNAPSHOTTER', 'value': 'false' } in (ocs_csv.get()['spec']['install']['spec']['deployments'][0]['spec'] ['template']['spec']['containers'][0]['env'] ), "CSI_ENABLE_SNAPSHOTTER value is not set to 'false'." log.info("Verified: CSI snapshotter is not present.") # Verify pool crush rule is with "type": "zone" if utils.get_az_count() == 3: log.info("Verifying pool crush rule is with type: zone") crush_dump = ct_pod.exec_ceph_cmd(ceph_cmd='ceph osd crush dump', format='') pool_names = [ constants.METADATA_POOL, constants.DEFAULT_BLOCKPOOL, constants.DATA_POOL ] crush_rules = [ rule for rule in crush_dump['rules'] if rule['rule_name'] in pool_names ] for crush_rule in crush_rules: assert [ item for item in crush_rule['steps'] if item.get('type') == 'zone' ], f"{crush_rule['rule_name']} is not with type as zone" log.info("Verified - pool crush rule is with type: zone")
def test_upgrade_ocp(self, reduce_and_resume_cluster_load):
    """
    Tests OCS stability when upgrading OCP
    """
    ceph_cluster = CephCluster()
    with CephHealthMonitor(ceph_cluster):
        ocp_channel = config.UPGRADE.get(
            "ocp_channel", ocp.get_ocp_upgrade_channel()
        )
        ocp_upgrade_version = config.UPGRADE.get("ocp_upgrade_version")
        if not ocp_upgrade_version:
            ocp_upgrade_version = get_latest_ocp_version(channel=ocp_channel)
            ocp_arch = config.UPGRADE["ocp_arch"]
            target_image = f"{ocp_upgrade_version}-{ocp_arch}"
        elif ocp_upgrade_version.endswith(".nightly"):
            target_image = expose_ocp_version(ocp_upgrade_version)

        logger.info(f"Target image: {target_image}")

        image_path = config.UPGRADE["ocp_upgrade_path"]
        cluster_operators = ocp.get_all_cluster_operators()
        logger.info(f"oc version: {ocp.get_current_oc_version()}")

        # Verify upgrade subscription channel
        ocp.patch_ocp_upgrade_channel(ocp_channel)
        for sampler in TimeoutSampler(
            timeout=250,
            sleep=15,
            func=ocp.verify_ocp_upgrade_channel,
            channel_variable=ocp_channel,
        ):
            if sampler:
                logger.info(f"OCP Channel: {ocp_channel}")
                break

        # Upgrade OCP
        logger.info(f"full upgrade path: {image_path}:{target_image}")
        ocp.upgrade_ocp(image=target_image, image_path=image_path)

        # Wait for upgrade
        for ocp_operator in cluster_operators:
            logger.info(f"Checking upgrade status of {ocp_operator}:")
            # ############ Workaround for issue 2624 #######
            name_changed_between_versions = (
                "service-catalog-apiserver",
                "service-catalog-controller-manager",
            )
            if ocp_operator in name_changed_between_versions:
                logger.info(f"{ocp_operator} upgrade will not be verified")
                continue
            # ############ End of Workaround ###############
            ver = ocp.get_cluster_operator_version(ocp_operator)
            logger.info(f"current {ocp_operator} version: {ver}")
            for sampler in TimeoutSampler(
                timeout=2700,
                sleep=60,
                func=ocp.confirm_cluster_operator_version,
                target_version=target_image,
                cluster_operator=ocp_operator,
            ):
                if sampler:
                    logger.info(f"{ocp_operator} upgrade completed!")
                    break
                else:
                    logger.info(f"{ocp_operator} upgrade has not completed yet!")

        # Post upgrade validation: check cluster operator status
        cluster_operators = ocp.get_all_cluster_operators()
        for ocp_operator in cluster_operators:
            logger.info(f"Checking cluster status of {ocp_operator}")
            for sampler in TimeoutSampler(
                timeout=2700,
                sleep=60,
                func=ocp.verify_cluster_operator_status,
                cluster_operator=ocp_operator,
            ):
                if sampler:
                    break
                else:
                    logger.info(f"{ocp_operator} status is not valid")

        # Post upgrade validation: check cluster version status
        logger.info("Checking clusterversion status")
        for sampler in TimeoutSampler(
            timeout=900, sleep=15, func=ocp.validate_cluster_version_status
        ):
            if sampler:
                logger.info("Upgrade Completed Successfully!")
                break

        # Load new config file
        self.load_ocp_version_config_file(ocp_upgrade_version)

        new_ceph_cluster = CephCluster()
        new_ceph_cluster.wait_for_rebalance(timeout=1800)
        ceph_health_check(tries=90, delay=30)
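
# --- Illustrative sketch (not part of the original module) ---
# The upgrade test above uses the same polling idiom several times:
# iterate a TimeoutSampler(timeout=..., sleep=..., func=..., **kwargs) and
# break on the first truthy sample. A minimal helper expressing that idiom,
# polling a caller-supplied check_fn (a hypothetical name):
def poll_until_true(check_fn, timeout=300, sleep=15, **kwargs):
    # Break out on the first truthy sample; if the timeout expires first,
    # the sampler itself is expected to end the iteration.
    for sampler in TimeoutSampler(
            timeout=timeout, sleep=sleep, func=check_fn, **kwargs):
        if sampler:
            logger.info("Condition satisfied")
            break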
def test_add_capacity_with_resource_delete(
    self,
    workload_storageutilization_rbd,
    resource_name,
    resource_id,
    is_kill_resource_repeatedly,
):
    """
    The function gets the resource name and id, adds capacity to the
    cluster, and then deletes the resource while storage capacity is
    getting increased.

    Args:
        resource_name (str): the name of the resource to delete
        resource_id (int): the id of the resource to delete
        is_kill_resource_repeatedly (bool): If True, kill the resource
            repeatedly. Else, if False, delete the resource only once.
    """
    used_percentage = get_percent_used_capacity()
    logging.info(
        f"storageutilization is completed. used capacity = {used_percentage}"
    )

    osd_pods_before = pod_helpers.get_osd_pods()
    number_of_osd_pods_before = len(osd_pods_before)
    if number_of_osd_pods_before >= constants.MAX_OSDS:
        pytest.skip("We have the maximum number of OSDs in the cluster")

    d = Disruptions()
    d.set_resource(resource_name)

    self.new_pods_in_status_running = False

    osd_size = storage_cluster.get_osd_size()
    logging.info(f"Adding one new set of OSDs. osd size = {osd_size}")
    storagedeviceset_count = storage_cluster.add_capacity(osd_size)
    logging.info("Adding one new set of OSDs was issued without problems")

    # Wait for new osd's to come up. After the first new osd is in status
    # Init - delete the resource. After deleting the resource we expect that
    # all the new osd's will be in status running, and the deleted resource
    # will also be back in status running.
    pod_helpers.wait_for_new_osd_pods_to_come_up(number_of_osd_pods_before)
    logging.info(
        f"Delete a {resource_name} pod while storage capacity is getting increased"
    )
    if is_kill_resource_repeatedly:
        with ThreadPoolExecutor() as executor:
            executor.submit(
                self.kill_resource_repeatedly, resource_name, resource_id
            )
            self.wait_for_osd_pods_to_be_running(storagedeviceset_count)
    else:
        d.delete_resource(resource_id)
        self.wait_for_osd_pods_to_be_running(storagedeviceset_count)

    self.new_pods_in_status_running = True
    logging.info(
        "Finished verifying add capacity when one of the pods gets deleted"
    )
    logging.info("Waiting for ceph health check to finish...")
    ceph_health_check(
        namespace=config.ENV_DATA["cluster_namespace"], tries=90
    )
    ceph_cluster_obj = CephCluster()
    assert ceph_cluster_obj.wait_for_rebalance(
        timeout=1800
    ), "Data re-balance failed to complete"
def disruptive_base(self, interface, operation_to_disrupt, resource_to_delete): """ Base function for disruptive tests. Deletion of 'resource_to_delete' will be introduced while 'operation_to_disrupt' is progressing. """ pod_functions = { 'mds': get_mds_pods, 'mon': get_mon_pods, 'mgr': get_mgr_pods, 'osd': get_osd_pods } disruption = disruption_helpers.Disruptions() disruption.set_resource(resource=resource_to_delete) executor = ThreadPoolExecutor(max_workers=1) # Get number of pods of type 'resource_to_delete' num_of_resource_to_delete = len(pod_functions[resource_to_delete]()) # Fetch the number of Pods and PVCs initial_num_of_pods = len(get_all_pods(namespace=self.namespace)) initial_num_of_pvc = len( get_all_pvcs(namespace=self.namespace)['items']) # Fetch PV names pv_objs = [] for pvc_obj in self.pvc_objs: pvc_obj.reload() pv_objs.append(pvc_obj.backed_pv_obj) # Fetch volume details from pods for the purpose of verification node_pv_dict = {} for pod_obj in self.pod_objs: pod_info = pod_obj.get() node = pod_info['spec']['nodeName'] pvc = pod_info['spec']['volumes'][0]['persistentVolumeClaim'][ 'claimName'] for pvc_obj in self.pvc_objs: if pvc_obj.name == pvc: pvc_obj.reload() pv = pvc_obj.backed_pv break if node in node_pv_dict: node_pv_dict[node].append(pv) else: node_pv_dict[node] = [pv] # Do setup for running IO on pods log.info("Setting up pods for running IO") for pod_obj in self.pod_objs: pod_obj.workload_setup(storage_type='fs') log.info("Setup for running IO is completed on pods") # Start IO on each pod. RWX PVC will be used on two pods. So split the # size accordingly log.info("Starting IO on pods") for pod_obj in self.pod_objs: if pod_obj.pvc.access_mode == constants.ACCESS_MODE_RWX: io_size = int((self.pvc_size - 1) / 2) else: io_size = self.pvc_size - 1 pod_obj.run_io(storage_type='fs', size=f'{io_size}G', fio_filename=f'{pod_obj.name}_io') log.info("IO started on all pods.") # Start deleting pods pod_bulk_delete = executor.submit(self.delete_pods) if operation_to_disrupt == 'delete_pods': ret = self.verify_resource_deletion(get_all_pods, initial_num_of_pods) assert ret, "Wait timeout: Pods are not being deleted." logging.info(f"Pods deletion has started.") disruption.delete_resource() pods_deleted = pod_bulk_delete.result() assert pods_deleted, "Deletion of pods failed." # Verify pods are deleted for pod_obj in self.pod_objs: assert pod_obj.ocp.wait_for_delete( pod_obj.name), (f"Pod {pod_obj.name} is not deleted") logging.info("Verified: Pods are deleted.") # Verify that the mount point is removed from nodes after deleting pod for node, pvs in node_pv_dict.items(): cmd = f'oc debug nodes/{node} -- df' df_on_node = run_cmd(cmd) for pv in pvs: assert pv not in df_on_node, ( f"{pv} is still present on node {node} after " f"deleting the pods.") log.info( "Verified: mount points are removed from nodes after deleting " "the pods") # Fetch image uuid associated with PVCs pvc_uuid_map = {} for pvc_obj in self.pvc_objs: pvc_uuid_map[pvc_obj.name] = pvc_obj.image_uuid log.info("Fetched image uuid associated with each PVC") # Start deleting PVCs pvc_bulk_delete = executor.submit(delete_pvcs, self.pvc_objs) if operation_to_disrupt == 'delete_pvcs': ret = self.verify_resource_deletion(get_all_pvcs, initial_num_of_pvc) assert ret, "Wait timeout: PVCs are not being deleted." logging.info(f"PVCs deletion has started.") disruption.delete_resource() pvcs_deleted = pvc_bulk_delete.result() assert pvcs_deleted, "Deletion of PVCs failed." 
# Verify PVCs are deleted for pvc_obj in self.pvc_objs: assert pvc_obj.ocp.wait_for_delete( pvc_obj.name), (f"PVC {pvc_obj.name} is not deleted") logging.info("Verified: PVCs are deleted.") # Verify PVs are deleted for pv_obj in pv_objs: assert pv_obj.ocp.wait_for_delete( pv_obj.name), (f"PV {pv_obj.name} is not deleted") logging.info("Verified: PVs are deleted.") # Verify PV using ceph toolbox. Image/Subvolume should be deleted. for pvc_name, uuid in pvc_uuid_map.items(): if interface == constants.CEPHBLOCKPOOL: ret = verify_volume_deleted_in_backend( interface=interface, image_uuid=uuid, pool_name=self.sc_obj.ceph_pool.name) if interface == constants.CEPHFILESYSTEM: ret = verify_volume_deleted_in_backend(interface=interface, image_uuid=uuid) assert ret, (f"Volume associated with PVC {pvc_name} still exists " f"in backend") # Verify number of pods of type 'resource_to_delete' final_num_resource_to_delete = len(pod_functions[resource_to_delete]()) assert final_num_resource_to_delete == num_of_resource_to_delete, ( f"Total number of {resource_to_delete} pods is not matching with " f"initial value. Total number of pods before deleting a pod: " f"{num_of_resource_to_delete}. Total number of pods present now: " f"{final_num_resource_to_delete}") # Check ceph status ceph_health_check(namespace=config.ENV_DATA['cluster_namespace']) log.info("Ceph cluster health is OK")
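
# --- Illustrative sketch (not part of the original module) ---
# disruptive_base() above drives the failure injection through
# disruption_helpers.Disruptions: pick a resource type, then delete one
# instance of it while another operation is in flight. A minimal standalone
# use of the same two calls (the 'mgr' choice is just an example):
disruption = disruption_helpers.Disruptions()
disruption.set_resource(resource='mgr')
disruption.delete_resource()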
def test_bulk_clone_performance(self, namespace, tmp_path): """ Creates number of PVCs in a bulk using kube job Write 60% of PVC capacity to each one of the created PVCs Creates 1 clone per each PVC altogether in a bulk Measuring time for bulk of clones creation """ pvc_count = 50 vol_size = "5Gi" job_pod_file, job_pvc_file, job_clone_file = [None, None, None] log.info(f"Start creating {self.interface} {pvc_count} PVC") if self.interface == constants.CEPHBLOCKPOOL: sc_name = constants.DEFAULT_STORAGECLASS_RBD clone_yaml = constants.CSI_RBD_PVC_CLONE_YAML elif self.interface == constants.CEPHFILESYSTEM: sc_name = constants.DEFAULT_STORAGECLASS_CEPHFS clone_yaml = constants.CSI_CEPHFS_PVC_CLONE_YAML try: pvc_dict_list = scale_lib.construct_pvc_creation_yaml_bulk_for_kube_job( no_of_pvc=pvc_count, access_mode=constants.ACCESS_MODE_RWO, sc_name=sc_name, pvc_size=vol_size, ) job_pvc_file = ObjectConfFile( name="job_profile_pvc", obj_dict_list=pvc_dict_list, project=self.namespace, tmp_path=tmp_path, ) # Create kube_job job_pvc_file.create(namespace=self.namespace) # Check all the PVC reached Bound state pvc_bound_list = scale_lib.check_all_pvc_reached_bound_state_in_kube_job( kube_job_obj=job_pvc_file, namespace=self.namespace, no_of_pvc=pvc_count, ) logging.info(f"Number of PVCs in Bound state {len(pvc_bound_list)}") # Kube_job to Create pod pod_dict_list = scale_lib.attach_multiple_pvc_to_pod_dict( pvc_list=pvc_bound_list, namespace=self.namespace, pvcs_per_pod=1, start_io=False, pod_yaml=constants.NGINX_POD_YAML, ) job_pod_file = ObjectConfFile( name="job_profile_pod", obj_dict_list=pod_dict_list, project=self.namespace, tmp_path=tmp_path, ) job_pod_file.create(namespace=self.namespace) # Check all PODs in Running state scale_lib.check_all_pod_reached_running_state_in_kube_job( kube_job_obj=job_pod_file, namespace=self.namespace, no_of_pod=len(pod_dict_list), timeout=90, ) logging.info(f"Number of PODs in Running state {len(pod_dict_list)}") total_files_size = self.run_fio_on_pvcs(vol_size) clone_dict_list = scale_lib.construct_pvc_clone_yaml_bulk_for_kube_job( pvc_dict_list, clone_yaml, sc_name ) logging.info("Created clone dict list") job_clone_file = ObjectConfFile( name="job_profile_clone", obj_dict_list=clone_dict_list, project=self.namespace, tmp_path=tmp_path, ) # Create kube_job that creates clones job_clone_file.create(namespace=self.namespace) logging.info("Going to check bound status for clones") # Check all the clones reached Bound state clone_bound_list = scale_lib.check_all_pvc_reached_bound_state_in_kube_job( kube_job_obj=job_clone_file, namespace=self.namespace, no_of_pvc=pvc_count, timeout=180, ) logging.info(f"Number of clones in Bound state {len(clone_bound_list)}") clone_objs = [] all_pvc_objs = pvc.get_all_pvc_objs(namespace=self.namespace) for clone_yaml in clone_dict_list: name = clone_yaml["metadata"]["name"] size = clone_yaml["spec"]["resources"]["requests"]["storage"] logging.info(f"Clone {name} of size {size} created") for pvc_obj in all_pvc_objs: if pvc_obj.name == name: clone_objs.append(pvc_obj) assert len(clone_bound_list) == len( clone_objs ), "Not all clones reached BOUND state, cannot measure time" start_time = helpers.get_provision_time( self.interface, clone_objs, status="start" ) end_time = helpers.get_provision_time( self.interface, clone_objs, status="end" ) total_time = (end_time - start_time).total_seconds() speed = round(total_files_size / total_time, 2) logging.info( f"Total creation time = {total_time} secs, data size = {total_files_size} MB, speed 
= {speed} MB/sec " f"for {self.interface} clone in bulk of {pvc_count} clones." ) # Produce ES report # Collecting environment information self.get_env_info() # Initialize the results doc file. full_results = self.init_full_results( ResultsAnalyse( self.uuid, self.crd_data, self.full_log_path, "bulk_clone_perf_fullres", ) ) full_results.add_key("interface", self.interface) full_results.add_key("bulk_size", pvc_count) full_results.add_key("clone_size", vol_size) full_results.add_key("bulk_creation_time", total_time) full_results.add_key("data_size(MB)", total_files_size) full_results.add_key("speed", speed) full_results.add_key("es_results_link", full_results.results_link()) # Write the test results into the ES server full_results.es_write() # write the ES link to the test results in the test log. logging.info(f"The result can be found at : {full_results.results_link()}") # Finally is used to clean-up the resources created # Irrespective of try block pass/fail finally will be executed. finally: # Cleanup activities logging.info("Cleanup of all the resources created during test execution") if job_pod_file: job_pod_file.delete(namespace=self.namespace) job_pod_file.wait_for_delete( resource_name=job_pod_file.name, namespace=self.namespace ) if job_clone_file: job_clone_file.delete(namespace=self.namespace) job_clone_file.wait_for_delete( resource_name=job_clone_file.name, namespace=self.namespace ) if job_pvc_file: job_pvc_file.delete(namespace=self.namespace) job_pvc_file.wait_for_delete( resource_name=job_pvc_file.name, namespace=self.namespace ) # Check ceph health status utils.ceph_health_check(tries=20)
def finalizer():
    helpers.remove_label_from_worker_node(
        node_list=worker_nodes, label_key="nodetype"
    )

    # Check ceph health
    ceph_health_check(tries=80)
def test_all_worker_nodes_short_network_failure(self, nodes, setup, node_restart_teardown): """ OCS-1432/OCS-1433: - Start DeploymentConfig based app pods - Make all the worker nodes unresponsive by doing abrupt network failure - Reboot the unresponsive node after short duration of ~300 seconds - When unresponsive node recovers, app pods and ceph cluster should recover - Again run IOs from app pods """ pod_objs = setup worker_nodes = node.get_worker_nodes() # Run IO on pods logger.info(f"Starting IO on {len(pod_objs)} app pods") with ThreadPoolExecutor() as executor: for pod_obj in pod_objs: logger.info(f"Starting IO on pod {pod_obj.name}") storage_type = "block" if pod_obj.pvc.volume_mode == "Block" else "fs" executor.submit( pod_obj.run_io, storage_type=storage_type, size="2G", runtime=30, fio_filename=f"{pod_obj.name}_io_f1", ) logger.info(f"IO started on all {len(pod_objs)} app pods") # Wait for IO results for pod_obj in pod_objs: pod.get_fio_rw_iops(pod_obj) # Induce network failure on all worker nodes with ThreadPoolExecutor() as executor: for node_name in worker_nodes: executor.submit(node.node_network_failure, node_name, False) node.wait_for_nodes_status(node_names=worker_nodes, status=constants.NODE_NOT_READY) logger.info(f"Waiting for {self.short_nw_fail_time} seconds") sleep(self.short_nw_fail_time) # Reboot the worker nodes logger.info(f"Stop and start the worker nodes: {worker_nodes}") nodes.restart_nodes_by_stop_and_start(node.get_node_objs(worker_nodes)) try: node.wait_for_nodes_status(node_names=worker_nodes, status=constants.NODE_READY) logger.info( "Verifying StorageCluster pods are in running/completed state") pod.wait_for_storage_pods(timeout=720) except ResourceWrongStatusException: # Restart nodes nodes.restart_nodes(node.get_node_objs(worker_nodes)) assert ceph_health_check(tries=80), "Ceph cluster health is not OK" logger.info("Ceph cluster health is OK") # Get current info of app pods new_pod_objs = list() for pod_obj in pod_objs: pod_label = pod_obj.labels.get("deploymentconfig") pods_data = pod.get_pods_having_label( f"deploymentconfig={pod_label}", pod_obj.namespace) current_pods = [ pod_data.get("metadata").get("name") for pod_data in pods_data if "-deploy" not in pod_data.get("metadata").get("name") ] logger.info(f"Pods with label {pod_label}: {current_pods}") # Remove the older pod from the list if pod is rescheduled if len(current_pods) > 1: current_pods.remove(pod_obj.name) new_pod_obj = pod.get_pod_obj(current_pods.pop(), pod_obj.namespace) new_pod_obj.pvc = pod_obj.pvc new_pod_objs.append(new_pod_obj) logger.info("Wait for app pods are in running state") for pod_obj in new_pod_objs: pod_obj.ocp.wait_for_resource( condition=constants.STATUS_RUNNING, resource_name=pod_obj.name, timeout=720, sleep=20, ) logger.info("All the app pods reached running state") # Run more IOs on app pods with ThreadPoolExecutor() as executor: for pod_obj in new_pod_objs: logger.info(f"Starting IO on pod {pod_obj.name}") pod_obj.wl_setup_done = False storage_type = "block" if pod_obj.pvc.volume_mode == "Block" else "fs" executor.submit( pod_obj.run_io, storage_type=storage_type, size="1G", runtime=30, fio_filename=f"{pod_obj.name}_io_f2", ) for pod_obj in new_pod_objs: pod.get_fio_rw_iops(pod_obj)
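
# --- Illustrative sketch (not part of the original module) ---
# The test above fans fio out over all app pods with a thread pool. The same
# idiom, factored into a small helper (pod objects are assumed to expose
# run_io() and a pvc.volume_mode attribute, as they do in the test above):
def start_io_on_pods(pod_objs, size="1G", runtime=30):
    with ThreadPoolExecutor() as executor:
        for pod_obj in pod_objs:
            storage_type = "block" if pod_obj.pvc.volume_mode == "Block" else "fs"
            executor.submit(
                pod_obj.run_io,
                storage_type=storage_type,
                size=size,
                runtime=runtime,
                fio_filename=f"{pod_obj.name}_io_sketch",
            )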
def test_scale_osds_reboot_nodes(self, interface, project_factory, multi_pvc_factory, dc_pod_factory): """ Check storage utilization, if its less then runs IO, Scale osds from 3-6, check for rebalance and reboot workers """ current_osd_count = count_cluster_osd() proj_obj = project_factory() if current_osd_count == 3: while not validate_osd_utilization(osd_used=10): # Create pvc pvc_objs = multi_pvc_factory(project=proj_obj, interface=interface, size=self.pvc_size, num_of_pvc=self.num_of_pvcs) dc_pod_objs = list() for pvc_obj in pvc_objs: dc_pod_objs.append(dc_pod_factory(pvc=pvc_obj)) wait_for_dc_app_pods_to_reach_running_state(dc_pod_objs, timeout=1200) for pod_obj in dc_pod_objs: pod_obj.run_io(storage_type='fs', size='3G', runtime='60', fio_filename=f'{pod_obj.name}_io') # Add capacity osd_size = storage_cluster.get_osd_size() count = storage_cluster.add_capacity(osd_size) pod = OCP(kind=constants.POD, namespace=config.ENV_DATA['cluster_namespace']) pod.wait_for_resource(timeout=300, condition=constants.STATUS_RUNNING, selector='app=rook-ceph-osd', resource_count=count * 3) assert ceph_health_check( delay=120, tries=50), "New OSDs failed to reach running state" cluster = CephCluster() # Get rebalance status rebalance_status = cluster.get_rebalance_status() logger.info(rebalance_status) if rebalance_status: time_taken = cluster.time_taken_to_complete_rebalance() logger.info(f"The time taken to complete rebalance {time_taken}") # Rolling reboot on worker nodes worker_nodes = get_typed_nodes(node_type='worker') factory = platform_nodes.PlatformNodesFactory() nodes = factory.get_nodes_platform() for node in worker_nodes: nodes.restart_nodes(nodes=[node]) wait_for_nodes_status() assert ceph_health_check( delay=180), "Failed, Ceph health bad after nodes reboot"
def measure_corrupt_pg(measurement_dir): """ Create Ceph pool and corrupt Placement Group on one of OSDs, measures the time when it was corrupted and records alerts that were triggered during this event. Returns: dict: Contains information about `start` and `stop` time for corrupting Ceph Placement Group """ oc = ocp.OCP(kind=constants.DEPLOYMENT, namespace=config.ENV_DATA.get("cluster_namespace")) osd_deployments = oc.get(selector=constants.OSD_APP_LABEL).get("items") osd_deployment = osd_deployments[0].get("metadata").get("name") ct_pod = pod.get_ceph_tools_pod() pool_name = helpers.create_unique_resource_name("corrupted", "pool") ct_pod.exec_ceph_cmd(f"ceph osd pool create {pool_name} 1 1") logger.info("Setting osd noout flag") ct_pod.exec_ceph_cmd("ceph osd set noout") logger.info(f"Put object into {pool_name}") pool_object = "test_object" ct_pod.exec_ceph_cmd(f"rados -p {pool_name} put {pool_object} /etc/passwd") logger.info(f"Looking for Placement Group with {pool_object} object") pg = ct_pod.exec_ceph_cmd( f"ceph osd map {pool_name} {pool_object}")["pgid"] logger.info(f"Found Placement Group: {pg}") dummy_deployment, dummy_pod = helpers.create_dummy_osd(osd_deployment) def corrupt_pg(): """ Corrupt PG on one OSD in Ceph pool for 12 minutes and measure it. There should be only CephPGRepairTakingTooLong Pending alert as it takes 2 hours for it to become Firing. This configuration of alert can be observed in ceph-mixins which is used in the project: https://github.com/ceph/ceph-mixins/blob/d22afe8c0da34490cb77e52a202eefcf4f62a869/config.libsonnet#L23 There should be also CephClusterErrorState alert that takes 10 minutest to start firing. Returns: str: Name of corrupted deployment """ # run_time of operation run_time = 60 * 12 nonlocal oc nonlocal pool_name nonlocal pool_object nonlocal dummy_pod nonlocal pg nonlocal osd_deployment nonlocal dummy_deployment logger.info(f"Corrupting {pg} PG on {osd_deployment}") dummy_pod.exec_sh_cmd_on_pod( f"ceph-objectstore-tool --data-path /var/lib/ceph/osd/ceph-" f"{osd_deployment.split('-')[-1]} --pgid {pg} {pool_object} " f"set-bytes /etc/shadow --no-mon-config") logger.info("Unsetting osd noout flag") ct_pod.exec_ceph_cmd("ceph osd unset noout") ct_pod.exec_ceph_cmd(f"ceph pg deep-scrub {pg}") oc.exec_oc_cmd(f"scale --replicas=0 deployment/{dummy_deployment}") oc.exec_oc_cmd(f"scale --replicas=1 deployment/{osd_deployment}") logger.info(f"Waiting for {run_time} seconds") time.sleep(run_time) return osd_deployment test_file = os.path.join(measurement_dir, "measure_corrupt_pg.json") measured_op = measure_operation(corrupt_pg, test_file) logger.info(f"Deleting pool {pool_name}") ct_pod.exec_ceph_cmd(f"ceph osd pool delete {pool_name} {pool_name} " f"--yes-i-really-really-mean-it") logger.info(f"Checking that pool {pool_name} is deleted") logger.info(f"Deleting deployment {dummy_deployment}") oc.delete(resource_name=dummy_deployment) # wait for ceph to return into HEALTH_OK state after osd deployment # is returned back to normal ceph_health_check(tries=20, delay=15) return measured_op
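measure_operation() is what turns corrupt_pg() into a reusable measurement. Below is a simplified stand-in for that helper, assuming only that it times the wrapped callable and caches the result in a JSON file so a repeated run can reuse the earlier measurement; the real fixture is also expected to correlate the alerts fired between `start` and `stop`, which this sketch leaves out:

import json
import time
from pathlib import Path


def measure_operation_sketch(operation, result_file):
    """Run `operation`, record start/stop timestamps and cache them as JSON."""
    path = Path(result_file)
    if path.exists():
        measurement = json.loads(path.read_text())
        measurement["first_run"] = False
        return measurement
    start = time.time()
    result = operation()
    stop = time.time()
    measurement = {"start": start, "stop": stop, "result": result, "first_run": True}
    path.write_text(json.dumps(measurement))
    return measurement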
def test_pvc_creation_after_del_mon_services(self, interface, pod_factory): """ 1. Delete one mon service 2. Edit the configmap rook-ceph-endpoints remove all the deleted mon services entries 3. Delete deployment, pvc of deleted mon service 4. Restart rook-ceph-operator 5. Make sure all mon pods are running 6. Make sure ceph health Ok and storage pods are running 7. Sleep for 300 seconds before deleting another mon 8. Repeat above steps for all mons and at the end each mon should contain different endpoints 9. Create PVC, should succeeded. """ pod_obj = pod_factory(interface=interface) run_io_in_bg(pod_obj) # Get all mon services mon_svc = get_services_by_label( label=constants.MON_APP_LABEL, namespace=constants.OPENSHIFT_STORAGE_NAMESPACE, ) # Get all mon pods mon_pods = get_mon_pods() mon_count = len(mon_pods) list_old_svc = [] for svc in mon_svc: # Get rook-ceph-operator pod obj operator_pod_obj = get_operator_pods() operator_name = operator_pod_obj[0].name # Scale down rook-ceph-operator log.info("Scale down rook-ceph-operator") assert modify_deployment_replica_count( deployment_name="rook-ceph-operator", replica_count=0 ), "Failed to scale down rook-ceph-operator to 0" log.info("Successfully scaled down rook-ceph-operator to 0") # Validate rook-ceph-operator pod not running POD_OBJ.wait_for_delete(resource_name=operator_name) pvc_name = svc["metadata"]["labels"]["pvc_name"] cluster_ip = svc["spec"]["clusterIP"] port = svc["spec"]["ports"][0]["port"] mon_endpoint = f"{cluster_ip}:{port}" mon_id = svc["spec"]["selector"]["mon"] list_old_svc.append(cluster_ip) # Delete deployment log.info("Delete mon deployments") del_obj = OCP( kind=constants.DEPLOYMENT, namespace=constants.OPENSHIFT_STORAGE_NAMESPACE, ) del_obj.delete(resource_name=pvc_name) # Delete pvc log.info("Delete mon PVC") pvc_obj = OCP(kind=constants.PVC, namespace=constants.OPENSHIFT_STORAGE_NAMESPACE) pvc_obj.delete(resource_name=pvc_name) # Delete the mon service log.info("Delete mon service") svc_obj = OCP(kind=constants.SERVICE, namespace=constants.OPENSHIFT_STORAGE_NAMESPACE) svc_obj.delete(resource_name=pvc_name) # Edit the cm log.info(f"Edit the configmap {constants.ROOK_CEPH_MON_ENDPOINTS}") configmap_obj = OCP( kind=constants.CONFIGMAP, namespace=constants.OPENSHIFT_STORAGE_NAMESPACE, ) output_get = configmap_obj.get( resource_name=constants.ROOK_CEPH_MON_ENDPOINTS) new_data = output_get["data"] new_data["csi-cluster-config-json"] = ( new_data["csi-cluster-config-json"].replace( f'"{mon_endpoint}",', "") if new_data["csi-cluster-config-json"].find(f'"{mon_endpoint}",') != 1 else new_data["csi-cluster-config-json"].replace( f',"{mon_endpoint}"', "")) new_data["data"] = ",".join([ value for value in new_data["data"].split(",") if f"{mon_id}=" not in value ]) new_data["mapping"] = ( new_data["mapping"].replace(f'"{mon_id}":null,', "") if new_data["mapping"].find(f'"{mon_id}":null,') != -1 else new_data["mapping"].replace(f',"{mon_id}":null', "")) params = f'{{"data": {json.dumps(new_data)}}}' log.info(f"Removing {mon_id} entries from configmap") configmap_obj.patch( resource_name=constants.ROOK_CEPH_MON_ENDPOINTS, params=params, format_type="strategic", ) log.info( f"Configmap {constants.ROOK_CEPH_MON_ENDPOINTS} edited successfully" ) # Scale up rook-ceph-operator log.info("Scale up rook-ceph-operator") assert modify_deployment_replica_count( deployment_name="rook-ceph-operator", replica_count=1), "Failed to scale up rook-ceph-operator to 1" log.info("Successfully scaled up rook-ceph-operator to 1") log.info("Validate 
rook-ceph-operator pod is running") POD_OBJ.wait_for_resource( condition=constants.STATUS_RUNNING, selector=constants.OPERATOR_LABEL, resource_count=1, timeout=600, sleep=5, ) # Validate all mons are running log.info("Validate all mons are up and running") POD_OBJ.wait_for_resource( condition=constants.STATUS_RUNNING, selector=constants.MON_APP_LABEL, resource_count=mon_count, timeout=1200, sleep=5, ) log.info("All mons are up and running") # Check the ceph health OK ceph_health_check(tries=90, delay=15) # Validate all storage pods are running wait_for_storage_pods() # Sleep for some seconds before deleting another mon sleep_time = 300 log.info( f"Waiting for {sleep_time} seconds before deleting another mon" ) time.sleep(sleep_time) # Check the endpoints are different log.info("Validate the mon endpoints are changed") new_mon_svc = get_services_by_label( label=constants.MON_APP_LABEL, namespace=constants.OPENSHIFT_STORAGE_NAMESPACE, ) list_new_svc = [] for new_svc in new_mon_svc: cluster_ip = new_svc["spec"]["clusterIP"] list_new_svc.append(cluster_ip) diff = set(list_new_svc) ^ set(list_old_svc) assert len(diff) == len(list_old_svc + list_new_svc), ( f"Not all endpoints are changed. Set of old " f"endpoints {list_old_svc} and new endpoints {list_new_svc}") log.info(f"All new mon endpoints are created {list_new_svc}") # Create PVC and pods log.info(f"Create {interface} PVC") pod_obj = pod_factory(interface=interface) pod_obj.run_io(storage_type="fs", size="500M")
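The configmap surgery in the loop above is the easiest part to get wrong, so here is a self-contained sketch of the same string handling with made-up endpoints (the test patches the live rook-ceph-mon-endpoints configmap rather than a local dict). The presence check is `str.find(...) != -1`, and whether the leading or the trailing comma is stripped depends on where the entry sits in the JSON string:

def drop_mon_from_configmap(data, mon_id, mon_endpoint):
    """Remove one monitor's entries from a rook-ceph-mon-endpoints style dict."""
    csi = data["csi-cluster-config-json"]
    if csi.find(f'"{mon_endpoint}",') != -1:
        csi = csi.replace(f'"{mon_endpoint}",', "")
    else:
        csi = csi.replace(f',"{mon_endpoint}"', "")
    data["csi-cluster-config-json"] = csi

    data["data"] = ",".join(
        entry for entry in data["data"].split(",") if f"{mon_id}=" not in entry
    )

    mapping = data["mapping"]
    if mapping.find(f'"{mon_id}":null,') != -1:
        mapping = mapping.replace(f'"{mon_id}":null,', "")
    else:
        mapping = mapping.replace(f',"{mon_id}":null', "")
    data["mapping"] = mapping
    return data


cm = {
    "csi-cluster-config-json": '[{"mons":["10.0.0.1:6789","10.0.0.2:6789"]}]',
    "data": "a=10.0.0.1:6789,b=10.0.0.2:6789",
    "mapping": '{"a":null,"b":null}',
}
drop_mon_from_configmap(cm, "b", "10.0.0.2:6789")
assert "10.0.0.2" not in cm["data"] and '"b"' not in cm["mapping"]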
def measure_stop_ceph_mon(measurement_dir): """ Downscales Ceph Monitor deployment, measures the time when it was downscaled and monitors alerts that were triggered during this event. Returns: dict: Contains information about `start` and `stop` time for stopping Ceph Monitor pod """ oc = ocp.OCP(kind=constants.DEPLOYMENT, namespace=config.ENV_DATA["cluster_namespace"]) mon_deployments = oc.get(selector=constants.MON_APP_LABEL)["items"] mons = [deployment["metadata"]["name"] for deployment in mon_deployments] # get monitor deployments to stop, leave even number of monitors split_index = len(mons) // 2 if len(mons) > 3 else 2 mons_to_stop = mons[split_index:] logger.info(f"Monitors to stop: {mons_to_stop}") logger.info(f"Monitors left to run: {mons[:split_index]}") # run_time of operation run_time = 60 * 14 def stop_mon(): """ Downscale Ceph Monitor deployments for 14 minutes. First 15 minutes the alert CephMonQuorumAtRisk should be in 'Pending'. After 15 minutes the alert turns into 'Firing' state. This configuration of monitoring can be observed in ceph-mixins which are used in the project: https://github.com/ceph/ceph-mixins/blob/d22afe8c0da34490cb77e52a202eefcf4f62a869/config.libsonnet#L16 `Firing` state shouldn't actually happen because monitor should be automatically redeployed shortly after 10 minutes. Returns: str: Names of downscaled deployments """ nonlocal oc nonlocal mons_to_stop for mon in mons_to_stop: logger.info(f"Downscaling deployment {mon} to 0") oc.exec_oc_cmd(f"scale --replicas=0 deployment/{mon}") logger.info(f"Waiting for {run_time} seconds") time.sleep(run_time) return mons_to_stop test_file = os.path.join(measurement_dir, "measure_stop_ceph_mon.json") measured_op = measure_operation(stop_mon, test_file) # expected minimal downtime of a mon inflicted by this fixture measured_op["min_downtime"] = run_time - (60 * 2) # get new list of monitors to make sure that new monitors were deployed mon_deployments = oc.get(selector=constants.MON_APP_LABEL)["items"] mons = [deployment["metadata"]["name"] for deployment in mon_deployments] # check that downscaled monitors are removed as OCS should redeploy them # but only when we are running this for the first time check_old_mons_deleted = all(mon not in mons for mon in mons_to_stop) if measured_op["first_run"] and not check_old_mons_deleted: for mon in mons_to_stop: logger.info(f"Upscaling deployment {mon} back to 1") oc.exec_oc_cmd(f"scale --replicas=1 deployment/{mon}") msg = f"Downscaled monitors {mons_to_stop} were not replaced" assert check_old_mons_deleted, msg # wait for ceph to return into HEALTH_OK state after mon deployment # is returned back to normal ceph_health_check(tries=20, delay=15) return measured_op
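The monitor selection at the top of the fixture is compact enough to be easy to misread; a tiny, self-contained restatement of it (example monitor names only):

def split_mons(mons):
    """Return (mons left running, mons to downscale), keeping an even number running."""
    split_index = len(mons) // 2 if len(mons) > 3 else 2
    return mons[:split_index], mons[split_index:]


assert split_mons(["a", "b", "c"]) == (["a", "b"], ["c"])
assert split_mons(["a", "b", "c", "d", "e"]) == (["a", "b"], ["c", "d", "e"])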
def ocs_install_verification( timeout=600, skip_osd_distribution_check=False, ocs_registry_image=None, post_upgrade_verification=False, version_before_upgrade=None, ): """ Perform steps necessary to verify a successful OCS installation Args: timeout (int): Number of seconds for timeout which will be used in the checks used in this function. skip_osd_distribution_check (bool): If true skip the check for osd distribution. ocs_registry_image (str): Specific image to check if it was installed properly. post_upgrade_verification (bool): Set to True if this function is called after upgrade. version_before_upgrade (float): Set to OCS version before upgrade """ from ocs_ci.ocs.node import get_nodes from ocs_ci.ocs.resources.pvc import get_deviceset_pvcs from ocs_ci.ocs.resources.pod import get_ceph_tools_pod, get_all_pods from ocs_ci.ocs.cluster import validate_cluster_on_pvc from ocs_ci.ocs.resources.fips import check_fips_enabled number_of_worker_nodes = len(get_nodes()) namespace = config.ENV_DATA["cluster_namespace"] log.info("Verifying OCS installation") if config.ENV_DATA.get("disable_components"): for component in config.ENV_DATA["disable_components"]: config.COMPONENTS[f"disable_{component}"] = True disable_noobaa = config.COMPONENTS["disable_noobaa"] disable_rgw = config.COMPONENTS["disable_rgw"] disable_blockpools = config.COMPONENTS["disable_blockpools"] disable_cephfs = config.COMPONENTS["disable_cephfs"] managed_service = (config.ENV_DATA["platform"].lower() in constants.MANAGED_SERVICE_PLATFORMS) ocs_version = version.get_semantic_ocs_version_from_config() # Basic Verification for cluster basic_verification(ocs_registry_image) # Verify pods in running state and proper counts log.info("Verifying pod states and counts") storage_cluster_name = config.ENV_DATA["storage_cluster_name"] storage_cluster = StorageCluster( resource_name=storage_cluster_name, namespace=namespace, ) pod = OCP(kind=constants.POD, namespace=namespace) if not config.DEPLOYMENT["external_mode"]: osd_count = int( storage_cluster.data["spec"]["storageDeviceSets"][0]["count"] ) * int( storage_cluster.data["spec"]["storageDeviceSets"][0]["replica"]) rgw_count = None if config.ENV_DATA.get("platform") in constants.ON_PREM_PLATFORMS: if not disable_rgw: rgw_count = get_rgw_count(f"{ocs_version}", post_upgrade_verification, version_before_upgrade) min_eps = constants.MIN_NB_ENDPOINT_COUNT_POST_DEPLOYMENT if config.ENV_DATA.get("platform") == constants.IBM_POWER_PLATFORM: min_eps = 1 nb_db_label = (constants.NOOBAA_DB_LABEL_46_AND_UNDER if ocs_version < version.VERSION_4_7 else constants.NOOBAA_DB_LABEL_47_AND_ABOVE) resources_dict = { nb_db_label: 1, constants.OCS_OPERATOR_LABEL: 1, constants.OPERATOR_LABEL: 1, constants.NOOBAA_OPERATOR_POD_LABEL: 1, constants.NOOBAA_CORE_POD_LABEL: 1, constants.NOOBAA_ENDPOINT_POD_LABEL: min_eps, } if not config.DEPLOYMENT["external_mode"]: resources_dict.update({ constants.MON_APP_LABEL: 3, constants.CSI_CEPHFSPLUGIN_LABEL: number_of_worker_nodes, constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL: 2, constants.CSI_RBDPLUGIN_LABEL: number_of_worker_nodes, constants.CSI_RBDPLUGIN_PROVISIONER_LABEL: 2, constants.OSD_APP_LABEL: osd_count, constants.MGR_APP_LABEL: 1, constants.MDS_APP_LABEL: 2, constants.RGW_APP_LABEL: rgw_count, }) if ocs_version >= version.VERSION_4_9: resources_dict.update({ constants.ODF_OPERATOR_CONTROL_MANAGER_LABEL: 1, }) for label, count in resources_dict.items(): if label == constants.RGW_APP_LABEL: if (not config.ENV_DATA.get("platform") in constants.ON_PREM_PLATFORMS or 
managed_service or disable_rgw): continue if "noobaa" in label and (disable_noobaa or managed_service): continue if "mds" in label and disable_cephfs: continue assert pod.wait_for_resource( condition=constants.STATUS_RUNNING, selector=label, resource_count=count, timeout=timeout, ) # Verify StorageClasses (1 ceph-fs, 1 ceph-rbd) log.info("Verifying storage classes") storage_class = OCP(kind=constants.STORAGECLASS, namespace=namespace) storage_cluster_name = config.ENV_DATA["storage_cluster_name"] required_storage_classes = { f"{storage_cluster_name}-cephfs", f"{storage_cluster_name}-ceph-rbd", } skip_storage_classes = set() if disable_cephfs: skip_storage_classes.update({ f"{storage_cluster_name}-cephfs", }) if disable_blockpools: skip_storage_classes.update({ f"{storage_cluster_name}-ceph-rbd", }) required_storage_classes = required_storage_classes.difference( skip_storage_classes) if config.DEPLOYMENT["external_mode"]: required_storage_classes.update({ f"{storage_cluster_name}-ceph-rgw", f'{config.ENV_DATA["cluster_namespace"]}.noobaa.io', }) storage_classes = storage_class.get() storage_class_names = { item["metadata"]["name"] for item in storage_classes["items"] } # required storage class names should be observed in the cluster under test missing_scs = required_storage_classes.difference(storage_class_names) if len(missing_scs) > 0: log.error("few storage classess are not present: %s", missing_scs) assert list(missing_scs) == [] # Verify OSDs are distributed if not config.DEPLOYMENT["external_mode"]: if not skip_osd_distribution_check: log.info( "Verifying OSDs are distributed evenly across worker nodes") ocp_pod_obj = OCP(kind=constants.POD, namespace=namespace) osds = ocp_pod_obj.get(selector=constants.OSD_APP_LABEL)["items"] deviceset_count = get_deviceset_count() node_names = [osd["spec"]["nodeName"] for osd in osds] for node in node_names: assert ( not node_names.count(node) > deviceset_count ), "OSD's are not distributed evenly across worker nodes" # Verify that CSI driver object contains provisioner names log.info("Verifying CSI driver object contains provisioner names.") csi_driver = OCP(kind="CSIDriver") csi_drivers = { item["metadata"]["name"] for item in csi_driver.get()["items"] } assert defaults.CSI_PROVISIONERS.issubset(csi_drivers) # Verify node and provisioner secret names in storage class log.info("Verifying node and provisioner secret names in storage class.") if config.DEPLOYMENT["external_mode"]: sc_rbd = storage_class.get( resource_name=constants.DEFAULT_EXTERNAL_MODE_STORAGECLASS_RBD) sc_cephfs = storage_class.get(resource_name=( constants.DEFAULT_EXTERNAL_MODE_STORAGECLASS_CEPHFS)) else: if not disable_blockpools: sc_rbd = storage_class.get( resource_name=constants.DEFAULT_STORAGECLASS_RBD) if not disable_cephfs: sc_cephfs = storage_class.get( resource_name=constants.DEFAULT_STORAGECLASS_CEPHFS) if not disable_blockpools: assert ( sc_rbd["parameters"]["csi.storage.k8s.io/node-stage-secret-name"] == constants.RBD_NODE_SECRET) assert ( sc_rbd["parameters"]["csi.storage.k8s.io/provisioner-secret-name"] == constants.RBD_PROVISIONER_SECRET) if not disable_cephfs: assert (sc_cephfs["parameters"] ["csi.storage.k8s.io/node-stage-secret-name"] == constants.CEPHFS_NODE_SECRET) assert (sc_cephfs["parameters"] ["csi.storage.k8s.io/provisioner-secret-name"] == constants.CEPHFS_PROVISIONER_SECRET) log.info("Verified node and provisioner secret names in storage class.") ct_pod = get_ceph_tools_pod() # https://github.com/red-hat-storage/ocs-ci/issues/3820 # Verify ceph osd tree 
output if not (config.DEPLOYMENT.get("ui_deployment") or config.DEPLOYMENT["external_mode"] or managed_service): log.info( "Verifying ceph osd tree output and checking for device set PVC names " "in the output.") if config.DEPLOYMENT.get("local_storage"): deviceset_pvcs = [osd.get_node() for osd in get_osd_pods()] # removes duplicate hostname deviceset_pvcs = list(set(deviceset_pvcs)) if config.ENV_DATA.get( "platform") == constants.BAREMETAL_PLATFORM or ( config.ENV_DATA.get("flexy_deployment") and config.ENV_DATA.get("platform") == constants.AWS_PLATFORM): deviceset_pvcs = [ deviceset.replace(".", "-") for deviceset in deviceset_pvcs ] else: deviceset_pvcs = [pvc.name for pvc in get_deviceset_pvcs()] osd_tree = ct_pod.exec_ceph_cmd(ceph_cmd="ceph osd tree", format="json") schemas = { "root": constants.OSD_TREE_ROOT, "rack": constants.OSD_TREE_RACK, "host": constants.OSD_TREE_HOST, "osd": constants.OSD_TREE_OSD, "region": constants.OSD_TREE_REGION, "zone": constants.OSD_TREE_ZONE, } schemas["host"]["properties"]["name"] = {"enum": deviceset_pvcs} for item in osd_tree["nodes"]: validate(instance=item, schema=schemas[item["type"]]) if item["type"] == "host": deviceset_pvcs.remove(item["name"]) assert not deviceset_pvcs, ( f"These device set PVCs are not given in ceph osd tree output " f"- {deviceset_pvcs}") log.info( "Verified ceph osd tree output. Device set PVC names are given in the " "output.") # TODO: Verify ceph osd tree output have osd listed as ssd # TODO: Verify ceph osd tree output have zone or rack based on AZ # Verify CSI snapshotter sidecar container is not present # if the OCS version is < 4.6 if ocs_version < version.VERSION_4_6: log.info("Verifying CSI snapshotter is not present.") provisioner_pods = get_all_pods( namespace=defaults.ROOK_CLUSTER_NAMESPACE, selector=[ constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL, constants.CSI_RBDPLUGIN_PROVISIONER_LABEL, ], ) for pod_obj in provisioner_pods: pod_info = pod_obj.get() for container, image in get_images(data=pod_info).items(): assert ("snapshot" not in container) and ( "snapshot" not in image ), (f"Snapshot container is present in {pod_obj.name} pod. " f"Container {container}. Image {image}") ocs_csv = get_ocs_csv() deployments = ocs_csv.get()["spec"]["install"]["spec"]["deployments"] rook_ceph_operator_deployment = [ deployment_val for deployment_val in deployments if deployment_val["name"] == "rook-ceph-operator" ] assert { "name": "CSI_ENABLE_SNAPSHOTTER", "value": "false" } in (rook_ceph_operator_deployment[0]["spec"]["template"]["spec"] ["containers"][0]["env"] ), "CSI_ENABLE_SNAPSHOTTER value is not set to 'false'." 
log.info("Verified: CSI snapshotter is not present.") # Verify pool crush rule is with "type": "zone" if utils.get_az_count() == 3: log.info("Verifying pool crush rule is with type: zone") crush_dump = ct_pod.exec_ceph_cmd(ceph_cmd="ceph osd crush dump", format="") pool_names = [ constants.METADATA_POOL, constants.DEFAULT_BLOCKPOOL, constants.DATA_POOL, ] crush_rules = [ rule for rule in crush_dump["rules"] if rule["rule_name"] in pool_names ] for crush_rule in crush_rules: assert [ item for item in crush_rule["steps"] if item.get("type") == "zone" ], f"{crush_rule['rule_name']} is not with type as zone" log.info("Verified - pool crush rule is with type: zone") # TODO: update pvc validation for managed services if not managed_service: log.info("Validate cluster on PVC") validate_cluster_on_pvc() # Verify ceph health log.info("Verifying ceph health") health_check_tries = 20 health_check_delay = 30 if post_upgrade_verification: # In case of upgrade with FIO we have to wait longer time to see # health OK. See discussion in BZ: # https://bugzilla.redhat.com/show_bug.cgi?id=1817727 health_check_tries = 180 assert utils.ceph_health_check(namespace, health_check_tries, health_check_delay) if config.ENV_DATA.get("fips"): # In case that fips is enabled when deploying, # a verification of the installation of it will run # on all running state pods check_fips_enabled() if config.ENV_DATA.get("encryption_at_rest"): osd_encryption_verification() if config.DEPLOYMENT.get("kms_deployment"): kms = KMS.get_kms_deployment() kms.post_deploy_verification() if config.ENV_DATA.get("VAULT_CA_ONLY", None): verify_kms_ca_only() storage_cluster_obj = get_storage_cluster() is_flexible_scaling = ( storage_cluster_obj.get()["items"][0].get("spec").get( "flexibleScaling", False)) if is_flexible_scaling is True: failure_domain = storage_cluster_obj.data["items"][0]["status"][ "failureDomain"] assert failure_domain == "host", ( f"The expected failure domain on cluster with flexible scaling is 'host'," f" the actaul failure domain is {failure_domain}") if config.ENV_DATA.get("is_multus_enabled"): verify_multus_network() if managed_service: verify_managed_service_resources()
def finalizer(): assert ceph_health_check(), "Ceph cluster health is not OK" log.info("Ceph cluster health is OK")
def test_monitoring_after_draining_node_where_prometheus_hosted(self, pods): """ Test case to validate when node is drained where prometheus is hosted, prometheus pod should re-spin on new healthy node and shouldn't be any data/metrics loss """ # Get the prometheus pod pod_obj_list = pod.get_all_pods( namespace=defaults.OCS_MONITORING_NAMESPACE, selector=["prometheus"] ) for pod_obj in pod_obj_list: # Get the pvc which mounted on prometheus pod pod_info = pod_obj.get() pvc_name = pod_info["spec"]["volumes"][0]["persistentVolumeClaim"][ "claimName" ] # Get the node where the prometheus pod is hosted prometheus_pod_obj = pod_obj.get() prometheus_node = prometheus_pod_obj["spec"]["nodeName"] # Drain node where the prometheus pod hosted drain_nodes([prometheus_node]) # Validate node is in SchedulingDisabled state wait_for_nodes_status( [prometheus_node], status=constants.NODE_READY_SCHEDULING_DISABLED ) # Validate all prometheus pod is running POD = ocp.OCP( kind=constants.POD, namespace=defaults.OCS_MONITORING_NAMESPACE ) assert POD.wait_for_resource( condition="Running", selector="app=prometheus", timeout=180 ), "One or more prometheus pods are not in running state" # Validate prometheus pod is re-spinned on new healthy node pod_info = pod_obj.get() new_node = pod_info["spec"]["nodeName"] assert ( new_node not in prometheus_node ), "Promethues pod not re-spinned on new node" log.info(f"Prometheus pod re-spinned on new node {new_node}") # Validate same pvc is mounted on prometheus pod assert ( pod_info["spec"]["volumes"][0]["persistentVolumeClaim"]["claimName"] in pvc_name ), f"Old pvc not found after restarting the prometheus pod {pod_obj.name}" # Validate the prometheus health is ok assert prometheus_health_check(), "Prometheus cluster health is not OK" # Mark the nodes back to schedulable schedule_nodes([prometheus_node]) # Wait some time after node scheduling back waiting_time = 30 log.info(f"Waiting {waiting_time} seconds...") time.sleep(waiting_time) # Validate node is in Ready State wait_for_nodes_status([prometheus_node], status=constants.NODE_READY) # Validate ceph health OK ceph_health_check(tries=40, delay=30) # Check the node are Ready state and check cluster is health ok self.sanity_helpers.health_check(tries=40) # Check for the created pvc metrics after rebooting the master nodes for pod_obj in pods: assert check_pvcdata_collected_on_prometheus( pod_obj.pvc.name ), f"On prometheus pod for created pvc {pod_obj.pvc.name} related data is not collected"
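Two pod-spec fields carry the whole before/after comparison above: spec.nodeName (where the Prometheus pod runs) and the claimName of its first volume (the PVC that must survive the drain). A minimal illustration with a made-up pod manifest:

pod_info = {
    "spec": {
        "nodeName": "worker-1",
        "volumes": [{"persistentVolumeClaim": {"claimName": "prometheus-db-0"}}],
    }
}
prometheus_node = pod_info["spec"]["nodeName"]
pvc_name = pod_info["spec"]["volumes"][0]["persistentVolumeClaim"]["claimName"]
assert (prometheus_node, pvc_name) == ("worker-1", "prometheus-db-0")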
def test_add_capacity( self, project_factory, multi_dc_pod, multi_pvc_factory, pod_factory, mcg_obj, awscli_pod, bucket_factory, percent_to_fill, ): ##################################### # ENTRY CRITERIA # ##################################### # Prepare initial configuration : logger, cluster filling, loop for creating & deleting of PVCs and Pods, # noobaa IOs etc., # Perform Health checks: # Make sure cluster is healthy assert ceph_health_check( defaults.ROOK_CLUSTER_NAMESPACE ), "Entry criteria FAILED: Cluster is Unhealthy" # All OCS pods are in running state: # ToDo https://github.com/red-hat-storage/ocs-ci/issues/2361 assert ( pod_helpers.check_pods_in_running_state() ), "Entry criteria FAILED: one or more OCS pods are not in running state" # Create the namespace under which this test will execute: project = project_factory() # total pvc created will be 'num_of_pvcs' * 4 types of pvcs(rbd-rwo,rwx # & cephfs-rwo,rwx) num_of_pvcs = 40 rwo_rbd_pods = multi_dc_pod( num_of_pvcs=num_of_pvcs, pvc_size=175, project=project, access_mode="RWO", pool_type="rbd", timeout=360, ) # Note: Skipping cephfs pods creation # observing bug https://bugzilla.redhat.com/show_bug.cgi?id=1785399, # https://bugzilla.redhat.com/show_bug.cgi?id=1779421#c14 # Todo: https://github.com/red-hat-storage/ocs-ci/issues/2360 # Create rwx-rbd pods pods_ios_rwx_rbd = multi_dc_pod( num_of_pvcs=10, pvc_size=175, project=project, access_mode="RWX-BLK", pool_type="rbd", timeout=360, ) cluster_fill_io_pods = rwo_rbd_pods logger.info("The DC pods are up. Running IOs from them to fill the cluster") filler = cluster_exp_helpers.ClusterFiller( cluster_fill_io_pods, percent_to_fill, project.namespace ) assert filler.cluster_filler(), "IOs failed" # create separate threadpool for running IOs in the background executor_run_bg_ios_ops = ThreadPoolExecutor() bg_wrap = cluster_exp_helpers.BackgroundOps() status_cluster_ios = [] pods_for_copy = rwo_rbd_pods[0:5] + pods_ios_rwx_rbd for p in pods_for_copy: logger.info(f"running IOs on {p.name}") if p.pod_type == "rbd_block_rwx": status_cluster_ios.append( executor_run_bg_ios_ops.submit( bg_wrap.wrap, cluster_exp_helpers.raw_block_io, p, iterations=10 ) ) else: status_cluster_ios.append( executor_run_bg_ios_ops.submit( bg_wrap.wrap, cluster_exp_helpers.cluster_copy_ops, p, iterations=200, ) ) # Start pvc ops in the background.: logger.info("Started pvc create delete operations") executor_run_bg_ios_ops.submit( bg_wrap.wrap, test_create_delete_pvcs, multi_pvc_factory, pod_factory, project, iterations=200, ) # Start NooBaa IOs in the background.: logger.info("Started s3_io_create_delete...") executor_run_bg_ios_ops.submit( bg_wrap.wrap, s3_io_create_delete, mcg_obj, awscli_pod, bucket_factory, iterations=200, ) logger.info("Started obc_io_create_delete...") executor_run_bg_ios_ops.submit( bg_wrap.wrap, obc_io_create_delete, mcg_obj, awscli_pod, bucket_factory, iterations=200, ) # All ocs nodes are in Ready state (including master): executor_run_bg_ios_ops.submit( bg_wrap.wrap, cluster_exp_helpers.check_nodes_status, iterations=100 ) # Get restart count of ocs pods before expanstion restart_count_before = pod_helpers.get_pod_restarts_count( defaults.ROOK_CLUSTER_NAMESPACE ) # Get osd pods before expansion osd_pods_before = pod_helpers.get_osd_pods() # Get the total space in cluster before expansion ct_pod = pod_helpers.get_ceph_tools_pod() output = ct_pod.exec_ceph_cmd(ceph_cmd="ceph osd df") total_space_b4_expansion = int(output.get("summary").get("total_kb")) 
logger.info(f"total_space_b4_expansion == {total_space_b4_expansion}") logger.info("############## Calling add_capacity $$$$$$$$$$") ##################### # Call add_capacity # ##################### osd_size = storage_cluster.get_osd_size() result = storage_cluster.add_capacity(osd_size) pod = OCP(kind=constants.POD, namespace=config.ENV_DATA["cluster_namespace"]) # New osd (all) pods corresponding to the additional capacity should be # in running state pod.wait_for_resource( timeout=1200, condition=constants.STATUS_RUNNING, selector="app=rook-ceph-osd", resource_count=result * 3, ) ################################# # Exit criteria verification: # ################################# cluster_exp_helpers.BackgroundOps.EXPANSION_COMPLETED = True # No ocs pods should get restarted unexpectedly # Get restart count of ocs pods after expansion and see any pods got # restated restart_count_after = pod_helpers.get_pod_restarts_count( defaults.ROOK_CLUSTER_NAMESPACE ) # # # TO DO # # Handle Bug 1814254 - All Mons respinned during add capacity and OSDs took longtime to come up # # implement function to make sure no pods are respun after expansion logger.info( f"sum(restart_count_before.values()) = {sum(restart_count_before.values())}" ) logger.info( f" sum(restart_count_after.values()) = {sum(restart_count_after.values())}" ) assert sum(restart_count_before.values()) == sum( restart_count_after.values() ), "Exit criteria verification FAILED: One or more pods got restarted" logger.info("Exit criteria verification Success: No pods were restarted") # Make sure right number of OSDs are added: # Get osd pods after expansion osd_pods_after = pod_helpers.get_osd_pods() number_of_osds_added = len(osd_pods_after) - len(osd_pods_before) logger.info( f"### number_of_osds_added = {number_of_osds_added}, " f"before = {len(osd_pods_before)}, after = {len(osd_pods_after) }" ) # If the difference b/w updated count of osds and old osd count is not # 3 then expansion failed assert ( number_of_osds_added == 3 ), "Exit criteria verification FAILED: osd count mismatch" logger.info( "Exit criteria verification Success: Correct number of OSDs are added" ) # The newly added capacity takes into effect at the storage level ct_pod = pod_helpers.get_ceph_tools_pod() output = ct_pod.exec_ceph_cmd(ceph_cmd="ceph osd df") total_space_after_expansion = int(output.get("summary").get("total_kb")) osd_size = int(output.get("nodes")[0].get("kb")) expanded_space = osd_size * 3 # 3 OSDS are added of size = 'osd_size' logger.info(f"space output == {output} ") logger.info(f"osd size == {osd_size} ") logger.info(f"total_space_after_expansion == {total_space_after_expansion} ") expected_total_space_after_expansion = total_space_b4_expansion + expanded_space logger.info( f"expected_total_space_after_expansion == {expected_total_space_after_expansion} " ) assert ( total_space_after_expansion == expected_total_space_after_expansion ), "Exit criteria verification FAILED: Expected capacity mismatch" logger.info( "Exit criteria verification Success: Newly added capacity took into effect" ) logger.info("Exit criteria verification Success: IOs completed successfully") # 'ceph osd tree' should show the new osds under right nodes/hosts # Verification is different for 3 AZ and 1 AZ configs ct_pod = pod_helpers.get_ceph_tools_pod() tree_output = ct_pod.exec_ceph_cmd(ceph_cmd="ceph osd tree") logger.info(f"### OSD tree output = {tree_output}") if config.ENV_DATA["platform"].lower() == constants.VSPHERE_PLATFORM: assert 
cluster_helpers.check_osd_tree_1az_vmware( tree_output, len(osd_pods_after) ), "Exit criteria verification FAILED: Incorrect ceph osd tree formation found" aws_number_of_zones = 3 if config.ENV_DATA["platform"].lower() == constants.AWS_PLATFORM: # parse the osd tree. if it contains a node 'rack' then it's a # AWS_1AZ cluster. Else, 3 AWS_3AZ cluster for i in range(len(tree_output["nodes"])): if tree_output["nodes"][i]["name"] in "rack0": aws_number_of_zones = 1 if aws_number_of_zones == 1: assert cluster_helpers.check_osd_tree_1az_aws( tree_output, len(osd_pods_after) ), "Exit criteria verification FAILED: Incorrect ceph osd tree formation found" else: assert cluster_helpers.check_osd_tree_3az_aws( tree_output, len(osd_pods_after) ), "Exit criteria verification FAILED: Incorrect ceph osd tree formation found" logger.info("Exit criteria verification Success: osd tree verification success") # Make sure new pvcs and pods can be created and IOs can be run from # the pods num_of_pvcs = 1 rwo_rbd_pods = multi_dc_pod( num_of_pvcs=num_of_pvcs, pvc_size=5, project=project, access_mode="RWO", pool_type="rbd", ) rwo_cephfs_pods = multi_dc_pod( num_of_pvcs=num_of_pvcs, pvc_size=5, project=project, access_mode="RWO", pool_type="cephfs", ) rwx_cephfs_pods = multi_dc_pod( num_of_pvcs=num_of_pvcs, pvc_size=5, project=project, access_mode="RWX", pool_type="cephfs", ) # Create rwx-rbd pods pods_ios_rwx_rbd = multi_dc_pod( num_of_pvcs=num_of_pvcs, pvc_size=5, project=project, access_mode="RWX-BLK", pool_type="rbd", ) cluster_io_pods = ( rwo_rbd_pods + rwo_cephfs_pods + rwx_cephfs_pods + pods_ios_rwx_rbd ) with ThreadPoolExecutor() as pod_ios_executor: for p in cluster_io_pods: if p.pod_type == "rbd_block_rwx": logger.info(f"Calling block fio on pod {p.name}") pod_ios_executor.submit(cluster_exp_helpers.raw_block_io, p, "100M") else: logger.info(f"calling file fio on pod {p.name}") pod_ios_executor.submit(p.run_io, "fs", "100M") for pod_io in cluster_io_pods: pod_helpers.get_fio_rw_iops(pod_io) cluster_obj = cluster_helpers.CephCluster() assert ( cluster_obj.get_ceph_health() != "HEALTH_ERR" ), "Ceph cluster health checking failed" logger.info("ALL Exit criteria verification successfully") logger.info( "********************** TEST PASSED *********************************" )
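The capacity exit criterion above is plain arithmetic on the `ceph osd df` summary: the reported total (in KiB) should grow by exactly three times the per-OSD size. A worked example with illustrative numbers (the 2 TiB OSD size is assumed, not taken from a real cluster):

osd_size_kb = 2 * 1024 ** 3                 # one 2 TiB OSD expressed in KiB
total_space_b4_expansion = 3 * osd_size_kb  # cluster started with 3 OSDs
number_of_new_osds = 3

expected_total_space_after_expansion = (
    total_space_b4_expansion + number_of_new_osds * osd_size_kb
)
assert expected_total_space_after_expansion == 6 * osd_size_kb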
def deploy_ocs(self): """ Handle OCS deployment, since OCS deployment steps are common to any platform, implementing OCS deployment here in base class. """ _templating = templating.Templating() ceph_cluster = ocp.OCP(kind='CephCluster', namespace=config.ENV_DATA['cluster_namespace']) try: ceph_cluster.get().get('items')[0] logger.warning("OCS cluster already exists") return except (IndexError, CommandFailed): logger.info("Running OCS basic installation") create_oc_resource('common.yaml', self.cluster_path, _templating, config.ENV_DATA) run_cmd(f'oc label namespace {config.ENV_DATA["cluster_namespace"]} ' f'"openshift.io/cluster-monitoring=true"') run_cmd(f"oc policy add-role-to-user view " f"system:serviceaccount:openshift-monitoring:prometheus-k8s " f"-n {config.ENV_DATA['cluster_namespace']}") apply_oc_resource('csi-nodeplugin-rbac_rbd.yaml', self.cluster_path, _templating, config.ENV_DATA, template_dir="ocs-deployment/csi/rbd/") apply_oc_resource('csi-provisioner-rbac_rbd.yaml', self.cluster_path, _templating, config.ENV_DATA, template_dir="ocs-deployment/csi/rbd/") apply_oc_resource('csi-nodeplugin-rbac_cephfs.yaml', self.cluster_path, _templating, config.ENV_DATA, template_dir="ocs-deployment/csi/cephfs/") apply_oc_resource('csi-provisioner-rbac_cephfs.yaml', self.cluster_path, _templating, config.ENV_DATA, template_dir="ocs-deployment/csi/cephfs/") # Increased to 15 seconds as 10 is not enough # TODO: do the sampler function and check if resource exist wait_time = 15 logger.info(f"Waiting {wait_time} seconds...") time.sleep(wait_time) create_oc_resource('operator-openshift-with-csi.yaml', self.cluster_path, _templating, config.ENV_DATA) logger.info(f"Waiting {wait_time} seconds...") time.sleep(wait_time) run_cmd(f"oc wait --for condition=ready pod " f"-l app=rook-ceph-operator " f"-n {config.ENV_DATA['cluster_namespace']} " f"--timeout=120s") run_cmd(f"oc wait --for condition=ready pod " f"-l app=rook-discover " f"-n {config.ENV_DATA['cluster_namespace']} " f"--timeout=120s") create_oc_resource('cluster.yaml', self.cluster_path, _templating, config.ENV_DATA) pod = ocp.OCP(kind=constants.POD, namespace=config.ENV_DATA['cluster_namespace']) cfs = ocp.OCP(kind=constants.CEPHFILESYSTEM, namespace=config.ENV_DATA['cluster_namespace']) # Check for the Running status of Ceph Pods run_cmd(f"oc wait --for condition=ready pod " f"-l app=rook-ceph-agent " f"-n {config.ENV_DATA['cluster_namespace']} " f"--timeout=120s") assert pod.wait_for_resource(condition='Running', selector='app=rook-ceph-mon', resource_count=3, timeout=600) assert pod.wait_for_resource(condition='Running', selector='app=rook-ceph-mgr', timeout=600) assert pod.wait_for_resource(condition='Running', selector='app=rook-ceph-osd', resource_count=3, timeout=600) create_oc_resource('toolbox.yaml', self.cluster_path, _templating, config.ENV_DATA) logger.info(f"Waiting {wait_time} seconds...") time.sleep(wait_time) create_oc_resource('storage-manifest.yaml', self.cluster_path, _templating, config.ENV_DATA) create_oc_resource("service-monitor.yaml", self.cluster_path, _templating, config.ENV_DATA) create_oc_resource("prometheus-rules.yaml", self.cluster_path, _templating, config.ENV_DATA) logger.info(f"Waiting {wait_time} seconds...") time.sleep(wait_time) # Create MDS pods for CephFileSystem fs_data = templating.load_yaml_to_dict(constants.CEPHFILESYSTEM_YAML) fs_data['metadata']['namespace'] = config.ENV_DATA['cluster_namespace'] ceph_obj = OCS(**fs_data) ceph_obj.create() assert 
pod.wait_for_resource(condition=constants.STATUS_RUNNING, selector='app=rook-ceph-mds', resource_count=2, timeout=600) # Check for CephFilesystem creation in ocp cfs_data = cfs.get() cfs_name = cfs_data['items'][0]['metadata']['name'] if helpers.validate_cephfilesystem(cfs_name): logger.info(f"MDS deployment is successful!") defaults.CEPHFILESYSTEM_NAME = cfs_name else: logger.error(f"MDS deployment Failed! Please check logs!") # Verify health of ceph cluster # TODO: move destroy cluster logic to new CLI usage pattern? logger.info("Done creating rook resources, waiting for HEALTH_OK") assert ceph_health_check( namespace=config.ENV_DATA['cluster_namespace']) # patch gp2 (EBS) storage class as 'non-default' logger.info("Patch gp2 storageclass as non-default") patch = " '{\"metadata\": {\"annotations\":{\"storageclass.kubernetes.io/is-default-class\":\"false\"}}}' " run_cmd(f"oc patch storageclass gp2 " f"-p {patch} " f"--request-timeout=120s")
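The gp2 patch at the end builds its JSON payload from a hand-quoted string, which is fragile. An equivalent sketch that builds the same annotation payload with json.dumps (the command is only printed here, not executed):

import json

patch = json.dumps(
    {
        "metadata": {
            "annotations": {"storageclass.kubernetes.io/is-default-class": "false"}
        }
    }
)
cmd = f"oc patch storageclass gp2 -p '{patch}' --request-timeout=120s"
print(cmd)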
def deploy_ocs(self): """ Handle OCS deployment, since OCS deployment steps are common to any platform, implementing OCS deployment here in base class. """ ceph_cluster = ocp.OCP(kind='CephCluster', namespace=self.namespace) try: ceph_cluster.get().get('items')[0] logger.warning("OCS cluster already exists") return except (IndexError, CommandFailed): logger.info("Running OCS basic installation") self.deploy_ocs_via_operator() if config.DEPLOYMENT.get('ui_deployment'): config.ENV_DATA['skip_ocs_deployment'] = True return pod = ocp.OCP(kind=constants.POD, namespace=self.namespace) cfs = ocp.OCP(kind=constants.CEPHFILESYSTEM, namespace=self.namespace) # Check for Ceph pods assert pod.wait_for_resource(condition='Running', selector='app=rook-ceph-mon', resource_count=3, timeout=600) assert pod.wait_for_resource(condition='Running', selector='app=rook-ceph-mgr', timeout=600) assert pod.wait_for_resource(condition='Running', selector='app=rook-ceph-osd', resource_count=3, timeout=600) # validate ceph mon/osd volumes are backed by pvc validate_cluster_on_pvc() # Creating toolbox pod setup_ceph_toolbox() assert pod.wait_for_resource(condition=constants.STATUS_RUNNING, selector='app=rook-ceph-tools', resource_count=1, timeout=600) # Check for CephFilesystem creation in ocp cfs_data = cfs.get() cfs_name = cfs_data['items'][0]['metadata']['name'] if helpers.validate_cephfilesystem(cfs_name): logger.info(f"MDS deployment is successful!") defaults.CEPHFILESYSTEM_NAME = cfs_name else: logger.error(f"MDS deployment Failed! Please check logs!") # Change monitoring backend to OCS if config.ENV_DATA.get('monitoring_enabled') and config.ENV_DATA.get( 'persistent-monitoring'): sc_name = f"{config.ENV_DATA['storage_cluster_name']}-{constants.DEFAULT_SC_RBD}" # Get the list of monitoring pods pods_list = get_all_pods( namespace=defaults.OCS_MONITORING_NAMESPACE, selector=['prometheus', 'alertmanager']) # Create configmap cluster-monitoring-config create_configmap_cluster_monitoring_pod(sc_name) # Take some time to respin the pod waiting_time = 45 logger.info(f"Waiting {waiting_time} seconds...") time.sleep(waiting_time) # Validate the pods are respinned and in running state validate_pods_are_respinned_and_running_state(pods_list) # Validate the pvc is created on monitoring pods validate_pvc_created_and_bound_on_monitoring_pods() # Validate the pvc are mounted on pods validate_pvc_are_mounted_on_monitoring_pods(pods_list) # Change registry backend to OCS CEPHFS RWX PVC registry.change_registry_backend_to_ocs() # Verify health of ceph cluster # TODO: move destroy cluster logic to new CLI usage pattern? logger.info("Done creating rook resources, waiting for HEALTH_OK") assert ceph_health_check(namespace=self.namespace) # patch gp2/thin storage class as 'non-default' self.patch_default_sc_to_non_default()
def test_daemon_kill_during_pvc_pod_deletion_and_io( self, interface, resource_name, setup_base ): """ Kill 'resource_name' daemon while PVCs deletion, pods deletion and IO are progressing """ pvc_objs, pod_objs, rwx_pod_objs = setup_base namespace = pvc_objs[0].project.namespace num_of_pods_to_delete = 10 num_of_io_pods = 5 # Select pods to be deleted pods_to_delete = pod_objs[:num_of_pods_to_delete] pods_to_delete.extend( [ pod for pod in rwx_pod_objs for pod_obj in pods_to_delete if (pod_obj.pvc == pod.pvc) ] ) # Select pods to run IO io_pods = pod_objs[ num_of_pods_to_delete : num_of_pods_to_delete + num_of_io_pods ] io_pods.extend( [ pod for pod in rwx_pod_objs for pod_obj in io_pods if (pod_obj.pvc == pod.pvc) ] ) # Select pods which are having PVCs to delete pods_for_pvc = pod_objs[num_of_pods_to_delete + num_of_io_pods :] pvcs_to_delete = [pod_obj.pvc for pod_obj in pods_for_pvc] pods_for_pvc.extend( [ pod for pod in rwx_pod_objs for pod_obj in pods_for_pvc if (pod_obj.pvc == pod.pvc) ] ) log.info( f"{len(pods_to_delete)} pods selected for deletion in which " f"{len(pods_to_delete) - num_of_pods_to_delete} pairs of pod " f"share same RWX PVC" ) log.info( f"{len(io_pods)} pods selected for running IO in which " f"{len(io_pods) - num_of_io_pods} pairs of pod share same " f"RWX PVC" ) no_of_rwx_pvcs_delete = len(pods_for_pvc) - len(pvcs_to_delete) log.info( f"{len(pvcs_to_delete)} PVCs selected for deletion. " f"RWO PVCs: {len(pvcs_to_delete) - no_of_rwx_pvcs_delete}, " f"RWX PVCs: {no_of_rwx_pvcs_delete}" ) pod_functions = { "mds": partial(get_mds_pods), "mon": partial(get_mon_pods), "mgr": partial(get_mgr_pods), "osd": partial(get_osd_pods), "rbdplugin": partial(get_plugin_pods, interface=interface), "cephfsplugin": partial(get_plugin_pods, interface=interface), "cephfsplugin_provisioner": partial(get_cephfsplugin_provisioner_pods), "rbdplugin_provisioner": partial(get_rbdfsplugin_provisioner_pods), "operator": partial(get_operator_pods), } disruption = disruption_helpers.Disruptions() disruption.set_resource(resource=resource_name) executor = ThreadPoolExecutor(max_workers=len(pod_objs) + len(rwx_pod_objs)) # Get number of pods of type 'resource_name' num_of_resource_pods = len(pod_functions[resource_name]()) # Fetch the number of Pods and PVCs initial_num_of_pods = len(get_all_pods(namespace=namespace)) initial_num_of_pvc = len(get_all_pvcs(namespace=namespace)["items"]) # Fetch PV names to verify after deletion pv_objs = [] for pvc_obj in pvcs_to_delete: pvc_obj.reload() pv_objs.append(pvc_obj.backed_pv_obj) # Fetch volume details from pods for the purpose of verification node_pv_dict = {} for pod_obj in pods_to_delete: pod_info = pod_obj.get() node = pod_info["spec"]["nodeName"] pvc = pod_info["spec"]["volumes"][0]["persistentVolumeClaim"]["claimName"] for pvc_obj in pvc_objs: if pvc_obj.name == pvc: pvc_obj.reload() pv = pvc_obj.backed_pv break if node in node_pv_dict: node_pv_dict[node].append(pv) else: node_pv_dict[node] = [pv] # Fetch image uuid associated with PVCs to be deleted pvc_uuid_map = {} for pvc_obj in pvcs_to_delete: pvc_uuid_map[pvc_obj.name] = pvc_obj.image_uuid log.info("Fetched image uuid associated with each PVC") # Do setup on pods for running IO log.info("Setting up pods for running IO.") for pod_obj in pod_objs + rwx_pod_objs: pvc_info = pod_obj.pvc.get() if pvc_info["spec"]["volumeMode"] == "Block": storage_type = "block" else: storage_type = "fs" executor.submit(pod_obj.workload_setup, storage_type=storage_type) # Wait for setup on pods to complete for 
pod_obj in pod_objs + rwx_pod_objs: log.info(f"Waiting for IO setup to complete on pod {pod_obj.name}") for sample in TimeoutSampler(180, 2, getattr, pod_obj, "wl_setup_done"): if sample: log.info( f"Setup for running IO is completed on pod " f"{pod_obj.name}." ) break log.info("Setup for running IO is completed on all pods.") # Start IO on pods having PVCs to delete to load data log.info("Starting IO on pods having PVCs to delete.") self.run_io_on_pods(pods_for_pvc) log.info("IO started on pods having PVCs to delete.") log.info("Fetching IO results from the pods having PVCs to delete.") for pod_obj in pods_for_pvc: get_fio_rw_iops(pod_obj) log.info("Verified IO result on pods having PVCs to delete.") # Delete pods having PVCs to delete. assert self.delete_pods( pods_for_pvc ), "Couldn't delete pods which are having PVCs to delete." for pod_obj in pods_for_pvc: pod_obj.ocp.wait_for_delete(pod_obj.name) log.info("Verified: Deleted pods which are having PVCs to delete.") # Select daemon disruption.select_daemon() # Start IO on pods to be deleted log.info("Starting IO on pods to be deleted.") self.run_io_on_pods(pods_to_delete) log.info("IO started on pods to be deleted.") # Start deleting PVCs pvc_bulk_delete = executor.submit(delete_pvcs, pvcs_to_delete) log.info("Started deleting PVCs") # Start deleting pods pod_bulk_delete = executor.submit(self.delete_pods, pods_to_delete) log.info("Started deleting pods") # Start IO on IO pods self.run_io_on_pods(io_pods) log.info("Started IO on IO pods") # Verify pvc deletion has started pvc_deleting = executor.submit( wait_for_resource_count_change, func_to_use=get_all_pvcs, previous_num=initial_num_of_pvc, namespace=namespace, change_type="decrease", min_difference=1, timeout=30, interval=0.01, ) # Verify pod deletion has started pod_deleting = executor.submit( wait_for_resource_count_change, func_to_use=get_all_pods, previous_num=initial_num_of_pods, namespace=namespace, change_type="decrease", min_difference=1, timeout=30, interval=0.01, ) assert pvc_deleting.result(), "Wait timeout: PVCs are not being deleted." log.info("PVCs deletion has started.") assert pod_deleting.result(), "Wait timeout: Pods are not being deleted." log.info("Pods deletion has started.") # Kill daemon disruption.kill_daemon() pods_deleted = pod_bulk_delete.result() assert pods_deleted, "Deletion of pods failed." # Verify pods are deleted for pod_obj in pods_to_delete: pod_obj.ocp.wait_for_delete(pod_obj.name, 300) log.info("Verified: Pods are deleted.") # Verify that the mount point is removed from nodes after deleting pod node_pv_mounted = verify_pv_mounted_on_node(node_pv_dict) for node, pvs in node_pv_mounted.items(): assert not pvs, ( f"PVs {pvs} is still present on node {node} after " f"deleting the pods." ) log.info( "Verified: mount points are removed from nodes after deleting " "the pods" ) pvcs_deleted = pvc_bulk_delete.result() assert pvcs_deleted, "Deletion of PVCs failed." # Verify PVCs are deleted for pvc_obj in pvcs_to_delete: pvc_obj.ocp.wait_for_delete(pvc_obj.name) log.info("Verified: PVCs are deleted.") # Verify PVs are deleted for pv_obj in pv_objs: pv_obj.ocp.wait_for_delete(resource_name=pv_obj.name, timeout=300) log.info("Verified: PVs are deleted.") # Verify PV using ceph toolbox. Image/Subvolume should be deleted. 
pool_name = default_ceph_block_pool() for pvc_name, uuid in pvc_uuid_map.items(): if interface == constants.CEPHBLOCKPOOL: ret = verify_volume_deleted_in_backend( interface=interface, image_uuid=uuid, pool_name=pool_name ) if interface == constants.CEPHFILESYSTEM: ret = verify_volume_deleted_in_backend( interface=interface, image_uuid=uuid ) assert ret, ( f"Volume associated with PVC {pvc_name} still exists " f"in backend" ) log.info("Fetching IO results from the pods.") for pod_obj in io_pods: fio_result = pod_obj.get_fio_results() err_count = fio_result.get("jobs")[0].get("error") assert ( err_count == 0 ), f"FIO error on pod {pod_obj.name}. FIO result: {fio_result}" log.info("Verified IO result on pods.") # Verify number of pods of type 'resource_name' final_num_resource_name = len(pod_functions[resource_name]()) assert final_num_resource_name == num_of_resource_pods, ( f"Total number of {resource_name} pods is not matching with " f"initial value. Total number of pods before daemon kill: " f"{num_of_resource_pods}. Total number of pods present now: " f"{final_num_resource_name}" ) # Check ceph status ceph_health_check(namespace=config.ENV_DATA["cluster_namespace"]) log.info("Ceph cluster health is OK")
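The wait on wl_setup_done earlier in this test uses the TimeoutSampler polling pattern. A simplified, self-contained stand-in for that pattern (not the real TimeoutSampler class) is:

import time


def sample_until(timeout, interval, func, *args, **kwargs):
    """Poll func(*args, **kwargs) every `interval` seconds until it returns a
    truthy value or `timeout` seconds elapse."""
    deadline = time.time() + timeout
    while time.time() < deadline:
        result = func(*args, **kwargs)
        if result:
            return result
        time.sleep(interval)
    raise TimeoutError(f"{func.__name__} did not return a truthy value in {timeout}s")


# Example: wait for a flag attribute to flip, the way the test waits for
# wl_setup_done on each pod object.
class Workload:
    wl_setup_done = False


wl = Workload()
wl.wl_setup_done = True
assert sample_until(5, 0.1, getattr, wl, "wl_setup_done") is True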
def test_rwo_pvc_fencing_node_short_network_failure( self, nodes, setup, teardown): """ OCS-1423/OCS-1428/OCS-1426: - Start DeploymentConfig based app pods on 1 OCS/Non-OCS node - Make the node (where app pods are running) unresponsive by bringing its main network interface down - Check new app pods and/or mon, osd pods scheduled on another node are stuck due to Multi-Attach error. - Reboot the unresponsive node - When unresponsive node recovers, run IOs on new app pods OCS-1424/OCS-1434: - Start DeploymentConfig based app pods on multiple node Colocated scenario: Select 1 node where osd and/or mon is running, select other 2 nodes where mon/osd are not running Dedicated scenario: 3 Non-OCS nodes - Disrupt the leader provisioner pods if not running on above selected nodes - Make the nodes (where app pods are running) unresponsive by bringing their main network interface down - Check new app pods and/or mon, osd pods scheduled on another node and are stuck due to Multi-Attach error. - Reboot the unresponsive nodes - When unresponsive nodes recover, run IOs on new app pods """ ceph_cluster, dc_pods, ceph_pods, app_pod_nodes, test_nodes, disruptor = setup # Run IO on pods md5sum_data = self.run_and_verify_io(pod_list=dc_pods, fio_filename="io_file1", run_io_in_bg=True) # OCS-1424/OCS-1434 # Disrupt leader plugin-provisioner pods, skip if running on node to be failed if disruptor: [disruption.delete_resource() for disruption in disruptor] # Induce network failure on the nodes node.node_network_failure(app_pod_nodes) logger.info(f"Waiting for {self.short_nw_fail_time} seconds") sleep(self.short_nw_fail_time) # Wait for pods to be rescheduled for pod_obj in dc_pods + ceph_pods: pod_obj.ocp.wait_for_resource( condition=constants.STATUS_TERMINATING, resource_name=pod_obj.name, timeout=600, sleep=30, ) # Fetch info of new pods and verify Multi-Attach error new_dc_pods = self.get_new_pods(dc_pods) assert len(new_dc_pods) == len( dc_pods), "Unexpected number of app pods" self.verify_multi_attach_error(new_dc_pods) if ceph_pods: new_ceph_pods = self.get_new_pods(ceph_pods) assert len(new_ceph_pods) > 0, "Unexpected number of osd pods" self.verify_multi_attach_error(new_ceph_pods) # Reboot the unresponsive node(s) logger.info(f"Rebooting the unresponsive node(s): {app_pod_nodes}") nodes.restart_nodes_by_stop_and_start( node.get_node_objs(app_pod_nodes)) node.wait_for_nodes_status(node_names=app_pod_nodes, status=constants.NODE_READY) # Wait for new app pods to reach Running state for pod_obj in new_dc_pods: pod_obj.ocp.wait_for_resource( condition=constants.STATUS_RUNNING, resource_name=pod_obj.name, timeout=1200, sleep=30, ), (f"App pod with name {pod_obj.name} did not reach Running state" ) # Wait for mon and osd pods to reach Running state selectors_to_check = { constants.MON_APP_LABEL: ceph_cluster.mon_count, constants.OSD_APP_LABEL: ceph_cluster.osd_count, } for selector, count in selectors_to_check.items(): assert ceph_cluster.POD.wait_for_resource( condition=constants.STATUS_RUNNING, selector=selector, resource_count=count, timeout=1800, sleep=60, ), f"{count} expected pods with selector {selector} are not in Running state" assert ceph_health_check(), "Ceph cluster health is not OK" logger.info("Ceph cluster health is OK") # Verify data integrity from new pods for num, pod_obj in enumerate(new_dc_pods): assert pod.verify_data_integrity(pod_obj=pod_obj, file_name="io_file1", original_md5sum=md5sum_data[num] ), "Data integrity check failed" # Run IO on new pods 
self.run_and_verify_io(pod_list=new_dc_pods, fio_filename="io_file2", return_md5sum=False)
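The data-integrity check in these fencing tests boils down to comparing an md5sum recorded before the failure with one computed from the re-spun pod afterwards. A self-contained illustration of that comparison (hashing a local bytes buffer instead of a file inside a pod):

import hashlib


def md5sum(data: bytes) -> str:
    return hashlib.md5(data).hexdigest()


payload = b"fio payload written before the network failure"
original_md5sum = md5sum(payload)
recovered_md5sum = md5sum(payload)  # in the test: recomputed from the new pod
assert original_md5sum == recovered_md5sum, "Data integrity check failed"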
def test_add_capacity_node_restart( self, nodes, multi_pvc_factory, pod_factory, workload_storageutilization_rbd, num_of_nodes, ): """ Test add capacity when one of the worker nodes is restarted in the middle of the process """ logging.info( "Condition 1 to start the test is met: storageutilization is completed" ) # Note: once the branch 'wip-add-capacity-e_e' is merged into master, # the test will include much more data both before and after calling the 'add_capacity' function. node_list = get_ocs_nodes(num_of_nodes=num_of_nodes) assert node_list, "Condition 2 to start test failed: No node to restart" max_osds = 15 osd_pods_before = pod_helpers.get_osd_pods() assert ( len(osd_pods_before) < max_osds ), "Condition 3 to start test failed: The cluster already has the maximum number of OSDs" logging.info("All start conditions are met!") osd_size = storage_cluster.get_osd_size() logging.info("Calling add_capacity function...") result = storage_cluster.add_capacity(osd_size) if result: logging.info("add capacity finished successfully") else: logging.info("add capacity failed") # Restart nodes while additional storage is being added logging.info("Restart nodes:") logging.info([n.name for n in node_list]) nodes.restart_nodes(nodes=node_list, wait=True, timeout=420) logging.info("Finished restarting the node list") # The exit criteria verification conditions here are not complete. Once the branch # 'wip-add-capacity-e_e' is merged into master, the functions from that branch will be used. pod = OCP(kind=constants.POD, namespace=config.ENV_DATA["cluster_namespace"]) pod.wait_for_resource( timeout=600, condition=constants.STATUS_RUNNING, selector="app=rook-ceph-osd", resource_count=result * 3, ) # Verify OSDs are encrypted if config.ENV_DATA.get("encryption_at_rest"): osd_encryption_verification() logging.info( "Finished verifying add capacity osd storage with node restart") logging.info("Waiting for ceph health check to finish...") ceph_health_check(namespace=config.ENV_DATA["cluster_namespace"], tries=90)
def test_rwo_pvc_fencing_node_prolonged_and_short_network_failure(
    self, nodes, setup, teardown
):
    """
    OCS-1431/OCS-1436:
    - Start DeploymentConfig based app pods on 1 node
    - Make the node (where app pods are running) unresponsive by bringing
      its main network interface down
    - Disrupt the leader provisioner pods if not running on the above selected node
    - Check that new app pods and/or mon, osd pods scheduled on another node
      are stuck due to Multi-Attach error
    - Power off the unresponsive node
    - Force delete the app pods and/or mon, osd pods on the unresponsive node
    - Check that new app pods and/or mon, osd pods scheduled on another node
      come into Running state
    - Run IOs on new app pods
    - Again make the node (where app pods are running) unresponsive by
      bringing its main network interface down
    - Check that new app pods scheduled on another node are stuck due to
      Multi-Attach error
    - Reboot the unresponsive node
    - When the unresponsive node recovers, run IOs on new app pods
    """
    ceph_cluster, dc_pods, ceph_pods, app_pod_nodes, test_nodes, disruptor = setup

    extra_nodes = list(set(test_nodes) - set(app_pod_nodes))
    helpers.remove_label_from_worker_node(
        node_list=extra_nodes[:-1], label_key="nodetype"
    )

    # Run IO on pods
    md5sum_data = self.run_and_verify_io(
        pod_list=dc_pods, fio_filename="io_file1", run_io_in_bg=True
    )

    # Disrupt leader plugin-provisioner pods, skip if running on node to be failed
    if disruptor:
        for disruption in disruptor:
            disruption.delete_resource()

    # Induce network failure on the nodes
    node.node_network_failure(app_pod_nodes)
    logger.info(f"Waiting for {self.prolong_nw_fail_time} seconds")
    sleep(self.prolong_nw_fail_time)

    # Wait for pods to be rescheduled
    for pod_obj in dc_pods + ceph_pods:
        pod_obj.ocp.wait_for_resource(
            condition=constants.STATUS_TERMINATING, resource_name=pod_obj.name
        )

    # Fetch info of new pods and verify Multi-Attach error
    new_dc_pods = self.get_new_pods(dc_pods)
    assert len(new_dc_pods) == len(dc_pods), "Unexpected number of app pods"
    self.verify_multi_attach_error(new_dc_pods)

    new_ceph_pods = []
    if ceph_pods:
        new_ceph_pods = self.get_new_pods(ceph_pods)
        assert len(new_ceph_pods) > 0, "Unexpected number of osd pods"
        self.verify_multi_attach_error(new_ceph_pods)

    logger.info("Executing manual recovery steps")
    # Power off the unresponsive node
    logger.info(f"Powering off the unresponsive node: {app_pod_nodes}")
    nodes.stop_nodes(node.get_node_objs(app_pod_nodes))

    # Force delete the app pods and/or mon, osd pods on the unresponsive node
    for pod_obj in dc_pods + ceph_pods:
        pod_obj.delete(force=True)

    # Wait for new app pods to reach Running state
    for pod_obj in new_dc_pods:
        assert pod_obj.ocp.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            resource_name=pod_obj.name,
            timeout=1200,
            sleep=30,
        ), f"App pod with name {pod_obj.name} did not reach Running state"

    # Wait for mon and osd pods to reach Running state
    selectors_to_check = [constants.MON_APP_LABEL, constants.OSD_APP_LABEL]
    for selector in selectors_to_check:
        assert ceph_cluster.POD.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            selector=selector,
            resource_count=3,
            timeout=1800,
            sleep=60,
        ), f"3 expected pods with selector {selector} are not in Running state"

    if ceph_cluster.mon_count == 3:
        # Check ceph health
        toolbox_status = ceph_cluster.POD.get_resource_status(
            ceph_cluster.toolbox.name
        )
        if toolbox_status == constants.STATUS_TERMINATING:
            ceph_cluster.toolbox.delete(force=True)
        assert ceph_health_check(), "Ceph cluster health is not OK"
        logger.info("Ceph cluster health is OK")

    # Verify data integrity from new pods
    for num, pod_obj in enumerate(new_dc_pods):
        pod.verify_data_integrity(
            pod_obj=pod_obj, file_name="io_file1", original_md5sum=md5sum_data[num]
        )

    # Run IO on new pods
    md5sum_data2 = self.run_and_verify_io(
        pod_list=new_dc_pods, fio_filename="io_file2", run_io_in_bg=True
    )

    helpers.label_worker_node(
        node_list=extra_nodes[:-1], label_key="nodetype", label_value="app-pod"
    )

    # Induce network failure on the node
    node.node_network_failure(extra_nodes[-1])
    logger.info(f"Waiting for {self.short_nw_fail_time} seconds")
    sleep(self.short_nw_fail_time)

    # Wait for pods to be rescheduled
    for pod_obj in new_dc_pods:
        pod_obj.ocp.wait_for_resource(
            condition=constants.STATUS_TERMINATING,
            resource_name=pod_obj.name,
            timeout=600,
            sleep=30,
        )

    # Fetch info of new pods and verify Multi-Attach error
    new_dc_pods2 = self.get_new_pods(new_dc_pods)
    assert len(new_dc_pods2) == len(new_dc_pods), "Unexpected number of app pods"
    self.verify_multi_attach_error(new_dc_pods2)

    # Reboot the unresponsive node
    logger.info(f"Rebooting the unresponsive node: {extra_nodes[-1]}")
    nodes.restart_nodes_by_stop_and_start(node.get_node_objs([extra_nodes[-1]]))
    node.wait_for_nodes_status(
        node_names=[extra_nodes[-1]], status=constants.NODE_READY
    )

    # Wait for new app pods to reach Running state
    for pod_obj in new_dc_pods2:
        assert pod_obj.ocp.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            resource_name=pod_obj.name,
            timeout=1200,
            sleep=30,
        ), f"App pod with name {pod_obj.name} did not reach Running state"

    # Wait for mon and osd pods to reach Running state
    for selector in selectors_to_check:
        assert ceph_cluster.POD.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            selector=selector,
            resource_count=3,
            timeout=1800,
            sleep=60,
        ), f"3 expected pods with selector {selector} are not in Running state"

    if ceph_cluster.mon_count == 3:
        # Check ceph health
        assert ceph_health_check(), "Ceph cluster health is not OK"
        logger.info("Ceph cluster health is OK")

    # Verify data integrity from new pods
    for num, pod_obj in enumerate(new_dc_pods2):
        pod.verify_data_integrity(
            pod_obj=pod_obj, file_name="io_file2", original_md5sum=md5sum_data2[num]
        )

    for num, pod_obj in enumerate(new_dc_pods2):
        pod.verify_data_integrity(
            pod_obj=pod_obj, file_name="io_file1", original_md5sum=md5sum_data[num]
        )

    # Run IO on new pods
    self.run_and_verify_io(
        pod_list=new_dc_pods2, fio_filename="io_file3", return_md5sum=False
    )
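# The helpers used in the test above (run_and_verify_io, get_new_pods,
# verify_multi_attach_error) belong to the test class / ocs-ci. As a hedged
# sketch only (a hypothetical check, not the real helper), this shows the idea
# behind the Multi-Attach verification: an RWO volume still attached to the
# failed node surfaces a "Multi-Attach error" event on the replacement pod.
import subprocess


def has_multi_attach_error(pod_name, namespace):
    """Return True if the pod's events mention a Multi-Attach error."""
    describe = subprocess.run(
        ["oc", "describe", "pod", pod_name, "-n", namespace],
        capture_output=True, text=True, check=True,
    ).stdout
    # The kubelet reports this exact phrase in the pod's event stream
    return "Multi-Attach error" in describe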
def test_rgw_host_node_failure(
    self, nodes, node_restart_teardown, node_drain_teardown, mcg_obj, bucket_factory
):
    """
    Test case to fail the node where the RGW and the NooBaa DB pods are hosted
    and verify that the new pods spin up on a healthy node.
    """
    # Get NooBaa pods
    noobaa_pod_obj = get_noobaa_pods()

    # Get the node where the noobaa-db pod is hosted
    noobaa_pod_node = None
    for noobaa_pod in noobaa_pod_obj:
        if noobaa_pod.name in [
            constants.NB_DB_NAME_46_AND_BELOW,
            constants.NB_DB_NAME_47_AND_ABOVE,
        ]:
            noobaa_pod_node = get_pod_node(noobaa_pod)
    assert noobaa_pod_node is not None, "Could not find the NooBaa DB pod"

    # Validate whether the RGW pod and noobaa-db are hosted on the same node.
    # If not, make sure both pods are hosted on the same node.
    log.info("Validate if RGW pod and noobaa-db are hosted on the same node")
    rgw_pod_obj = get_rgw_pods()
    rgw_pod_node_list = [
        rgw_pod.get().get("spec").get("nodeName") for rgw_pod in rgw_pod_obj
    ]
    if not list(set(rgw_pod_node_list).intersection(noobaa_pod_node.name.split())):
        log.info(
            "Unschedule the other two nodes so that the RGW pod moves to the "
            "node where the NooBaa DB pod is hosted"
        )
        worker_node_list = get_worker_nodes()
        node_names = list(set(worker_node_list) - set(noobaa_pod_node.name.split()))
        unschedule_nodes(node_names=node_names)
        ocp_obj = OCP(kind=constants.POD, namespace=defaults.ROOK_CLUSTER_NAMESPACE)
        rgw_pod_obj[0].delete()
        ocp_obj.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            resource_count=len(rgw_pod_obj),
            selector=constants.RGW_APP_LABEL,
            timeout=300,
            sleep=5,
        )
        log.info("Schedule those nodes again")
        schedule_nodes(node_names=node_names)

        # Check that ceph health is OK
        ceph_health_check(tries=90, delay=15)

        # Verify all storage pods are running
        wait_for_storage_pods()

        # Check again that the RGW pod moved to the node where the NooBaa DB
        # pod is hosted
        rgw_pod_obj_list = get_rgw_pods()
        rgw_pod_node_list = [
            get_pod_node(rgw_pod_obj).name for rgw_pod_obj in rgw_pod_obj_list
        ]
        value = [
            rgw_pod_node == noobaa_pod_node.name
            for rgw_pod_node in rgw_pod_node_list
        ]
        assert any(value), (
            "RGW pod didn't move to the node where the NooBaa DB pod is hosted "
            "even after cordoning and uncordoning the nodes. "
            f"RGW pods hosted on: {rgw_pod_node_list}. "
            f"NooBaa DB pod hosted on: {noobaa_pod_node.name}"
        )

    log.info(
        "RGW and noobaa-db are hosted on the same node, starting the test execution"
    )
    rgw_pod_obj = get_rgw_pods()
    for rgw_pod in rgw_pod_obj:
        pod_node = rgw_pod.get().get("spec").get("nodeName")
        if pod_node == noobaa_pod_node.name:
            # Stop the node
            log.info(
                f"Stopping node {pod_node} where rgw pod {rgw_pod.name} "
                "and the NooBaa DB are hosted"
            )
            node_obj = get_node_objs(node_names=[pod_node])
            nodes.stop_nodes(node_obj)

            # Validate that the old rgw pod went into Terminating state
            wait_for_resource_state(
                resource=rgw_pod, state=constants.STATUS_TERMINATING, timeout=720
            )

            # Validate that a new rgw pod spun up
            ocp_obj = OCP(
                kind=constants.POD, namespace=defaults.ROOK_CLUSTER_NAMESPACE
            )
            ocp_obj.wait_for_resource(
                condition=constants.STATUS_RUNNING,
                resource_count=len(rgw_pod_obj),
                selector=constants.RGW_APP_LABEL,
            )

            # Start the node
            nodes.start_nodes(node_obj)

            # Check that ceph health is OK
            ceph_health_check(tries=90, delay=15)

            # Verify all storage pods are running
            wait_for_storage_pods()

            # Create an OBC and read and write objects
            self.create_obc_creation(bucket_factory, mcg_obj, "Object-key-2")

    # Verify cluster health
    self.sanity_helpers.health_check()
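# unschedule_nodes/schedule_nodes above are ocs-ci helpers; conceptually they
# cordon and uncordon worker nodes so the rescheduled RGW pod can only land on
# the node hosting the NooBaa DB. A minimal sketch of that idea follows
# (hypothetical wrappers around `oc adm cordon`/`uncordon`, not the ocs-ci
# implementation):
import subprocess


def cordon_nodes(node_names):
    """Mark the given nodes unschedulable so new pods avoid them."""
    for name in node_names:
        subprocess.run(["oc", "adm", "cordon", name], check=True)


def uncordon_nodes(node_names):
    """Mark the given nodes schedulable again."""
    for name in node_names:
        subprocess.run(["oc", "adm", "uncordon", name], check=True)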
def test_ceph_daemon_kill_during_resource_creation(
    self,
    interface,
    operation_to_disrupt,
    resource_to_delete,
    multi_pvc_factory,
    pod_factory,
):
    """
    Base function for ceph daemon kill disruptive tests.
    The daemon of the 'resource_to_delete' pod will be killed while
    'operation_to_disrupt' is progressing.
    """
    disruption = disruption_helpers.Disruptions()
    pod_functions = {
        'mds': partial(pod.get_mds_pods),
        'mon': partial(pod.get_mon_pods),
        'mgr': partial(pod.get_mgr_pods),
        'osd': partial(pod.get_osd_pods),
        'rbdplugin': partial(pod.get_plugin_pods, interface=interface),
        'cephfsplugin': partial(pod.get_plugin_pods, interface=interface),
        'cephfsplugin_provisioner': partial(pod.get_cephfsplugin_provisioner_pods),
        'rbdplugin_provisioner': partial(pod.get_rbdfsplugin_provisioner_pods),
        'operator': partial(pod.get_operator_pods),
    }

    # Get number of pods of type 'resource_to_delete'
    num_of_resource_to_delete = len(pod_functions[resource_to_delete]())

    num_of_pvc = 12
    namespace = self.proj_obj.namespace

    # Fetch the number of Pods and PVCs
    initial_num_of_pods = len(pod.get_all_pods(namespace=namespace))
    initial_num_of_pvc = len(get_all_pvcs(namespace=namespace)['items'])

    executor = ThreadPoolExecutor(max_workers=(2 * num_of_pvc))

    disruption.set_resource(resource=resource_to_delete)
    disruption.select_daemon()

    access_modes = [constants.ACCESS_MODE_RWO]
    if interface == constants.CEPHFILESYSTEM:
        access_modes.append(constants.ACCESS_MODE_RWX)

    # Modify the access_modes list to create rbd `block` type volumes with
    # RWX access mode. RWX is not supported in non-block type rbd.
    if interface == constants.CEPHBLOCKPOOL:
        access_modes.extend(
            [
                f'{constants.ACCESS_MODE_RWO}-Block',
                f'{constants.ACCESS_MODE_RWX}-Block',
            ]
        )

    # Start creation of PVCs
    bulk_pvc_create = executor.submit(
        multi_pvc_factory,
        interface=interface,
        project=self.proj_obj,
        storageclass=self.sc_obj,
        size=8,
        access_modes=access_modes,
        access_modes_selection='distribute_random',
        status=constants.STATUS_BOUND,
        num_of_pvc=num_of_pvc,
        wait_each=False,
    )

    if operation_to_disrupt == 'create_pvc':
        # Ensure PVCs are being created before deleting the resource
        ret = self.verify_resource_creation(
            get_all_pvcs, initial_num_of_pvc, namespace
        )
        assert ret, "Wait timeout: PVCs are not being created."
        log.info("PVCs creation has started.")
        disruption.kill_daemon()

    pvc_objs = bulk_pvc_create.result()

    # Confirm that PVCs are Bound
    for pvc_obj in pvc_objs:
        helpers.wait_for_resource_state(
            resource=pvc_obj, state=constants.STATUS_BOUND, timeout=120
        )
        pvc_obj.reload()
    log.info("Verified: PVCs are Bound.")

    # Start creating pods
    bulk_pod_create = executor.submit(
        self.pods_creation, pvc_objs, pod_factory, interface
    )

    if operation_to_disrupt == 'create_pod':
        # Ensure that pods are being created before deleting the resource
        ret = self.verify_resource_creation(
            pod.get_all_pods, initial_num_of_pods, namespace
        )
        assert ret, "Wait timeout: Pods are not being created."
        log.info("Pods creation has started.")
        disruption.kill_daemon()

    pod_objs = bulk_pod_create.result()

    # Verify pods are Running
    for pod_obj in pod_objs:
        helpers.wait_for_resource_state(
            resource=pod_obj, state=constants.STATUS_RUNNING
        )
        pod_obj.reload()
    log.info("Verified: All pods are Running.")

    # Do setup on pods for running IO
    log.info("Setting up pods for running IO.")
    for pod_obj in pod_objs:
        pvc_info = pod_obj.pvc.get()
        if pvc_info['spec']['volumeMode'] == 'Block':
            storage_type = 'block'
        else:
            storage_type = 'fs'
        executor.submit(pod_obj.workload_setup, storage_type=storage_type)

    # Wait for setup on pods to complete
    for pod_obj in pod_objs:
        for sample in TimeoutSampler(180, 2, getattr, pod_obj, 'wl_setup_done'):
            if sample:
                log.info(
                    f"Setup for running IO is completed on pod {pod_obj.name}."
                )
                break
    log.info("Setup for running IO is completed on all pods.")

    # Start IO on each pod
    for pod_obj in pod_objs:
        pvc_info = pod_obj.pvc.get()
        if pvc_info['spec']['volumeMode'] == 'Block':
            storage_type = 'block'
        else:
            storage_type = 'fs'
        pod_obj.run_io(
            storage_type=storage_type,
            size='2G',
            runtime=30,
            fio_filename=f'{pod_obj.name}_io_file1',
        )
    log.info("FIO started on all pods.")

    if operation_to_disrupt == 'run_io':
        disruption.kill_daemon()

    log.info("Fetching FIO results.")
    for pod_obj in pod_objs:
        fio_result = pod_obj.get_fio_results()
        err_count = fio_result.get('jobs')[0].get('error')
        assert err_count == 0, (
            f"FIO error on pod {pod_obj.name}. FIO result: {fio_result}"
        )
    log.info("Verified FIO result on pods.")

    # Delete pods
    for pod_obj in pod_objs:
        pod_obj.delete(wait=True)
    for pod_obj in pod_objs:
        pod_obj.ocp.wait_for_delete(pod_obj.name)

    # Verify that PVCs are reusable by creating new pods
    create_pods = executor.submit(
        self.pods_creation, pvc_objs, pod_factory, interface
    )
    pod_objs = create_pods.result()

    # Verify new pods are Running
    for pod_obj in pod_objs:
        helpers.wait_for_resource_state(
            resource=pod_obj, state=constants.STATUS_RUNNING
        )
        pod_obj.reload()
    log.info("Verified: All new pods are Running.")

    # Run IO on each of the new pods
    for pod_obj in pod_objs:
        pvc_info = pod_obj.pvc.get()
        if pvc_info['spec']['volumeMode'] == 'Block':
            storage_type = 'block'
        else:
            storage_type = 'fs'
        pod_obj.run_io(
            storage_type=storage_type,
            size='1G',
            runtime=10,
            fio_filename=f'{pod_obj.name}_io_file2',
        )

    log.info("Fetching FIO results from new pods")
    for pod_obj in pod_objs:
        fio_result = pod_obj.get_fio_results()
        err_count = fio_result.get('jobs')[0].get('error')
        assert err_count == 0, (
            f"FIO error on pod {pod_obj.name}. FIO result: {fio_result}"
        )
    log.info("Verified FIO result on new pods.")

    # Verify number of pods of type 'resource_to_delete'
    final_num_resource_to_delete = len(pod_functions[resource_to_delete]())
    assert final_num_resource_to_delete == num_of_resource_to_delete, (
        f"Total number of {resource_to_delete} pods is not matching with "
        f"initial value. Total number of pods before deleting a pod: "
        f"{num_of_resource_to_delete}. Total number of pods present now: "
        f"{final_num_resource_to_delete}"
    )

    # Check ceph status
    ceph_health_check(namespace=config.ENV_DATA['cluster_namespace'])
    log.info("Ceph cluster health is OK")
def teardown(self):
    """
    Cleaning up the environment:
        Delete all snapshots
        Delete the POD
        Delete the PVC and the PV
        Delete the StorageClass
        Delete the VolumeSnapshotClass
        Delete the data pool
        Switch to the default namespace
        Delete the tested namespace
    """
    log.info("Cleanup the test environment")

    # Getting the name of the PVC's backing PV
    pv = None
    try:
        pv = self.pvc_obj.get("spec")["spec"]["volumeName"]
    except KeyError:
        log.error(
            f"Cannot find the key in the PVC object "
            f"{json.dumps(self.pvc_obj.get('spec').get('spec'), indent=3)}"
        )

    # Getting the list of all snapshots
    try:
        snapshot_list = self.snapshot.get(all_namespaces=True)["items"]
    except Exception as err:
        log.error(f"Cannot get the list of snapshots : {err}")
        snapshot_list = []

    # Deleting all snapshots from the cluster
    log.info(f"Trying to delete all ({len(snapshot_list)}) Snapshots")
    log.debug(
        f"The list of all snapshots is : {json.dumps(snapshot_list, indent=3)}"
    )
    for vs in snapshot_list:
        snap_name = vs["metadata"]["name"]
        log.info(f"Trying to delete {snap_name}")
        try:
            self.snapshot.delete(resource_name=snap_name)
        except Exception as err:
            log.error(f"Cannot delete {snap_name} : {err}")

    # Deleting the pod which wrote data to the pvc
    log.info(f"Deleting the test POD : {self.pod_obj.name}")
    try:
        self.pod_obj.delete()
        log.info("Wait until the pod is deleted.")
        self.pod_obj.ocp.wait_for_delete(resource_name=self.pod_obj.name)
    except Exception as ex:
        log.error(f"Cannot delete the test pod : {ex}")

    # Deleting the PVC which was used in the test
    log.info(f"Delete the PVC : {self.pvc_obj.name}")
    try:
        self.pvc_obj.delete()
        log.info("Wait until the pvc is deleted.")
        self.pvc_obj.ocp.wait_for_delete(resource_name=self.pvc_obj.name)
    except Exception as ex:
        log.error(f"Cannot delete the test pvc : {ex}")

    # Delete the backend PV of the PVC
    log.info(f"Trying to delete the backend PV : {pv}")
    try:
        run_oc_command(f"delete pv {pv}")
    except Exception as ex:
        err_msg = f"Cannot delete PV {pv} - [{ex}]"
        log.error(err_msg)

    # Deleting the StorageClass used in the test
    log.info(f"Deleting the test StorageClass : {self.sc_obj.name}")
    try:
        self.sc_obj.delete()
        log.info("Wait until the SC is deleted.")
        self.sc_obj.ocp.wait_for_delete(resource_name=self.sc_obj.name)
    except Exception as ex:
        log.error(f"Cannot delete the test sc : {ex}")

    # Deleting the VolumeSnapshotClass used in the test
    log.info(f"Deleting the test Snapshot Class : {self.snap_class.name}")
    try:
        self.snap_class.delete()
        log.info("Wait until the VSC is deleted.")
        self.snap_class.ocp.wait_for_delete(resource_name=self.snap_class.name)
    except Exception as ex:
        log.error(f"Cannot delete the test vsc : {ex}")

    # Deleting the data pool
    log.info(f"Deleting the test storage pool : {self.sc_name}")
    self.delete_ceph_pool(self.sc_name)

    # Verify deletion by checking the backend CEPH pools using the toolbox
    results = self.ceph_cluster.toolbox.exec_cmd_on_pod("ceph osd pool ls")
    log.debug(f"Existing pools are : {results}")
    if self.sc_name in results.split():
        log.warning("The pool was not deleted by CSI, deleting it manually")
        self.ceph_cluster.toolbox.exec_cmd_on_pod(
            f"ceph osd pool delete {self.sc_name} {self.sc_name} "
            "--yes-i-really-really-mean-it"
        )
    else:
        log.info(f"The pool {self.sc_name} was deleted successfully")

    # Deleting the namespace used by the test
    log.info(f"Deleting the test namespace : {self.nss_name}")
    switch_to_default_rook_cluster_project()
    try:
        self.proj.delete(resource_name=self.nss_name)
        self.proj.wait_for_delete(
            resource_name=self.nss_name, timeout=60, sleep=10
        )
    except CommandFailed:
        log.error(f"Cannot delete project {self.nss_name}")
        raise CommandFailed(f"{self.nss_name} was not deleted")

    # After deleting all data from the cluster, wait until it rebalances
    ceph_health_check(
        namespace=constants.OPENSHIFT_STORAGE_NAMESPACE, tries=30, delay=60
    )
    super(TestPvcMultiSnapshotPerformance, self).teardown()
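# The manual pool cleanup above goes through the rook-ceph toolbox via
# exec_cmd_on_pod. As a hedged, standalone sketch (a hypothetical helper using
# `oc rsh` against a toolbox pod whose name must be looked up first), this is
# the same "check, then force-delete" pattern:
import subprocess


def force_delete_ceph_pool(toolbox_pod, pool, namespace="openshift-storage"):
    """Delete `pool` via the toolbox if `ceph osd pool ls` still lists it."""
    def toolbox(cmd):
        return subprocess.run(
            ["oc", "rsh", "-n", namespace, toolbox_pod] + cmd,
            capture_output=True, text=True, check=True,
        ).stdout

    if pool in toolbox(["ceph", "osd", "pool", "ls"]).split():
        # Requires mon_allow_pool_delete=true on the Ceph cluster
        toolbox(["ceph", "osd", "pool", "delete", pool, pool,
                 "--yes-i-really-really-mean-it"])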