def test_rgw_pod_existence(self):
    if (
        config.ENV_DATA["platform"].lower() in constants.CLOUD_PLATFORMS
        or storagecluster_independent_check()
    ):
        if (
            not config.ENV_DATA["platform"] == constants.AZURE_PLATFORM
            and not config.ENV_DATA["platform"] == constants.IBMCLOUD_PLATFORM
            and (version.get_semantic_ocs_version_from_config() > version.VERSION_4_5)
        ):
            logger.info("Checking whether RGW pod is not present")
            assert (
                not pod.get_rgw_pods()
            ), "RGW pods should not exist in the current platform/cluster"
    elif (
        config.ENV_DATA.get("platform") in constants.ON_PREM_PLATFORMS
        and not config.ENV_DATA["mcg_only_deployment"]
    ):
        rgw_count = get_rgw_count(
            config.ENV_DATA["ocs_version"], check_if_cluster_was_upgraded(), None
        )
        logger.info(
            f'Checking for RGW pod/s on {config.ENV_DATA.get("platform")} platform'
        )
        rgw_pod = OCP(
            kind=constants.POD, namespace=config.ENV_DATA["cluster_namespace"]
        )
        assert rgw_pod.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            selector=constants.RGW_APP_LABEL,
            resource_count=rgw_count,
            timeout=60,
        )
def template_pvc(
    name,
    namespace=config.ENV_DATA["cluster_namespace"],
    storageclass=constants.CEPHFILESYSTEM_SC,
    access_mode=constants.ACCESS_MODE_RWX,
    size="20Gi",
):
    """
    Build a PVC data dictionary based on the CSI PVC template

    Args:
        name (str): Name of the PVC
        namespace (str): Namespace to create the PVC in
        storageclass (str): Storage class to use for the PVC
        access_mode (str): Access mode for the PVC
        size (str): Size of the PVC in GiB

    Returns:
        dict: PVC data ready to be passed to a resource creation helper

    """
    pvc_data = templating.load_yaml(constants.CSI_PVC_YAML)
    pvc_data["metadata"]["name"] = name
    pvc_data["metadata"]["namespace"] = namespace
    pvc_data["spec"]["accessModes"] = [access_mode]
    pvc_data["spec"]["resources"]["requests"]["storage"] = size
    pvc_data["spec"]["storageClassName"] = (
        constants.DEFAULT_EXTERNAL_MODE_STORAGECLASS_CEPHFS
        if storagecluster_independent_check()
        else storageclass
    )
    return pvc_data
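# A minimal usage sketch (not part of ocs-ci): it assumes a PVC dict shaped like the
# one returned by template_pvc() and shows one way to serialize it and apply it with
# the `oc` CLI. The helper name `apply_pvc_dict` and the temp-file approach are
# illustrative assumptions, not the framework's own API.
import subprocess
import tempfile

import yaml


def apply_pvc_dict(pvc_data):
    """Dump a PVC dict to a temporary YAML file and apply it via `oc apply -f`."""
    with tempfile.NamedTemporaryFile("w", suffix=".yaml", delete=False) as tmp:
        yaml.safe_dump(pvc_data, tmp)
        manifest = tmp.name
    # `oc apply` is idempotent, so re-running with the same dict is safe
    subprocess.run(["oc", "apply", "-f", manifest], check=True)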
def create_couchbase_worker(self, replicas=1, sc_name=None):
    """
    Deploy a Couchbase server and pillowfight workload using operator

    The couchbase workers do not come up unless there is an admission
    controller running. The admission controller is started from the
    default project prior to bringing up the operator. Secrets,
    rolebindings and serviceaccounts need to also be generated.

    Once the couchbase operator is running, we need to wait for the
    requested number of worker pods to also be up. Then a pillowfight
    task is started.

    After the pillowfight task has finished, the log is collected and
    analyzed.

    Args:
        replicas (int): Number of couchbase worker pods to deploy
        sc_name (str): Name of a storage class to use for the worker PVCs

    Raises:
        Exception: If pillowfight results indicate that a minimum
            performance level is not reached (1 second response time,
            less than 1000 ops per second)

    """
    logging.info("Creating pods..")
    cb_example = templating.load_yaml(constants.COUCHBASE_WORKER_EXAMPLE)
    if storagecluster_independent_check():
        cb_example["spec"]["volumeClaimTemplates"][0]["spec"][
            "storageClassName"
        ] = constants.DEFAULT_EXTERNAL_MODE_STORAGECLASS_RBD
    cb_example["spec"]["servers"][0]["size"] = replicas
    if sc_name:
        cb_example["spec"]["volumeClaimTemplates"][0]["spec"][
            "storageClassName"
        ] = sc_name
    self.cb_examples = OCS(**cb_example)
    self.cb_examples.create()

    # Wait for the last of the workers to be running.
    logging.info("Waiting for the pods to Running")
    for cb_wrk_pods in TimeoutSampler(
        self.WAIT_FOR_TIME,
        3,
        get_pod_name_by_pattern,
        "cb-example",
        constants.COUCHBASE_OPERATOR,
    ):
        try:
            if len(cb_wrk_pods) == replicas:
                counter = 0
                for cb_pod in cb_wrk_pods:
                    if self.is_up_and_running(cb_pod, self.up_check):
                        counter += 1
                        logging.info(f"Couchbase worker {cb_pod} is up")
                if counter == replicas:
                    break
        except IndexError:
            logging.info(
                f"Expected number of couchbase pods are {replicas} "
                f"but only found {len(cb_wrk_pods)}"
            )
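# Hedged sketch of the polling pattern used above (this is NOT the ocs-ci
# TimeoutSampler implementation, just a simplified stand-in): yield the result of
# `func` every `sleep` seconds until `timeout` expires, letting the caller `break`
# once the expected condition is met.
import time


def sample_until_timeout(timeout, sleep, func, *args, **kwargs):
    """Yield func(*args, **kwargs) repeatedly until the timeout is reached."""
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        yield func(*args, **kwargs)
        time.sleep(sleep)
    raise TimeoutError(f"{func.__name__} did not satisfy the caller within {timeout}s")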
def init_sanity(self):
    """
    Initialize Sanity instance

    """
    if storagecluster_independent_check():
        self.sanity_helpers = SanityExternalCluster()
    else:
        self.sanity_helpers = Sanity()
def setup_postgresql(self, replicas, node_selector=None):
    """
    Deploy the PostgreSQL statefulset, optionally pinned to selected nodes

    Args:
        replicas (int): Number of postgresql pods to be deployed
        node_selector (dict): Node selector for postgresql pods

    """
    pgsql_sset = templating.load_yaml(constants.PGSQL_STATEFULSET_YAML)
    # Node selector for postgresql
    if node_selector is not None:
        pgsql_sset["spec"]["template"]["spec"]["nodeSelector"] = node_selector
    if helpers.storagecluster_independent_check():
        pgsql_sset["spec"]["volumeClaimTemplates"][0]["metadata"]["annotations"][
            "volume.beta.kubernetes.io/storage-class"
        ] = constants.DEFAULT_EXTERNAL_MODE_STORAGECLASS_RBD
    Postgresql.setup_postgresql(self, replicas=replicas)
def create_instance_in_clusterlogging():
    """
    Creation of instance for clusterlogging that creates PVCs,
    ElasticSearch, curator, fluentd and kibana pods and checks for all
    the pods and PVCs

    Returns:
        dict: Contains all detailed information of the instance such as
            pods that got created, its resources and limits values,
            storage class and size details etc.

    """
    nodes_in_cluster = len(get_all_nodes())
    inst_data = templating.load_yaml(constants.CL_INSTANCE_YAML)
    es_node_count = inst_data["spec"]["logStore"]["elasticsearch"]["nodeCount"]
    if helpers.storagecluster_independent_check():
        inst_data["spec"]["logStore"]["elasticsearch"]["storage"][
            "storageClassName"
        ] = constants.DEFAULT_EXTERNAL_MODE_STORAGECLASS_RBD
    helpers.create_resource(wait=False, **inst_data)
    oc = ocp.OCP("v1", "ClusterLogging", "openshift-logging")
    logging_instance = oc.get(resource_name="instance", out_yaml_format="True")
    if logging_instance:
        logger.info("Successfully created instance for cluster-logging")
        logger.debug(logging_instance)
    else:
        logger.error("Instance for clusterlogging is not created properly")

    pod_obj = ocp.OCP(kind=constants.POD, namespace="openshift-logging")
    pod_status = pod_obj.wait_for_resource(
        condition=constants.STATUS_RUNNING,
        resource_count=2 + es_node_count + nodes_in_cluster,
        timeout=500,
        sleep=2,
    )
    assert pod_status, "Pods are not in Running state."
    logger.info("All pods are in Running state")
    pvc_obj = ocp.OCP(kind=constants.PVC, namespace="openshift-logging")
    pvc_status = pvc_obj.wait_for_resource(
        condition=constants.STATUS_BOUND,
        resource_count=es_node_count,
        timeout=150,
        sleep=5,
    )
    assert pvc_status, "PVCs are not in bound state."
    logger.info("PVCs are Bound")
    return logging_instance
def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)
    obc_data = templating.load_yaml(constants.MCG_OBC_YAML)
    if self.name is None:
        self.name = create_unique_resource_name("oc", "obc")
    obc_data["metadata"]["name"] = self.name
    obc_data["spec"]["bucketName"] = self.name
    if storagecluster_independent_check():
        obc_data["spec"][
            "storageClassName"
        ] = constants.DEFAULT_EXTERNAL_MODE_STORAGECLASS_RGW
    else:
        obc_data["spec"]["storageClassName"] = constants.DEFAULT_STORAGECLASS_RGW
    obc_data["metadata"]["namespace"] = self.namespace
    create_resource(**obc_data)
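# Illustrative sketch (an assumption, not the MCG_OBC_YAML template itself) of the
# kind of ObjectBucketClaim manifest the constructor above fills in; the field names
# follow the upstream ObjectBucketClaim CRD, but verify them against the CRD version
# on your cluster before relying on this.
def build_obc_manifest(name, namespace, storage_class):
    """Return a dict describing an ObjectBucketClaim backed by the given storage class."""
    return {
        "apiVersion": "objectbucket.io/v1alpha1",
        "kind": "ObjectBucketClaim",
        "metadata": {"name": name, "namespace": namespace},
        "spec": {
            "bucketName": name,
            "storageClassName": storage_class,
        },
    }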
def create_ocs_jenkins_template(self):
    """
    Create OCS Jenkins Template
    """
    log.info("Create Jenkins Template, jenkins-persistent-ocs")
    ocp_obj = OCP(namespace="openshift", kind="template")
    tmp_dict = ocp_obj.get(resource_name="jenkins-persistent", out_yaml_format=True)
    tmp_dict["labels"]["app"] = "jenkins-persistent-ocs"
    tmp_dict["labels"]["template"] = "jenkins-persistent-ocs-template"
    tmp_dict["metadata"]["name"] = "jenkins-persistent-ocs"
    # Find Kind: 'PersistentVolumeClaim' position in the objects list,
    # differs in OCP 4.5 and OCP 4.6.
    sc_name = (
        constants.DEFAULT_EXTERNAL_MODE_STORAGECLASS_RBD
        if storagecluster_independent_check()
        else constants.DEFAULT_STORAGECLASS_RBD
    )
    for i in range(len(tmp_dict["objects"])):
        if tmp_dict["objects"][i]["kind"] == constants.PVC:
            tmp_dict["objects"][i]["metadata"]["annotations"] = {
                "volume.beta.kubernetes.io/storage-class": sc_name
            }
    tmp_dict["parameters"][4]["value"] = "10Gi"
    tmp_dict["parameters"].append(
        {
            "description": "Override jenkins options to speed up slave spawning",
            "displayName": "Override jenkins options to speed up slave spawning",
            "name": "JAVA_OPTS",
            "value": "-Dhudson.slaves.NodeProvisioner.initialDelay=0 "
            "-Dhudson.slaves.NodeProvisioner.MARGIN=50 -Dhudson."
            "slaves.NodeProvisioner.MARGIN0=0.85",
        }
    )
    if Version.coerce(self.ocp_version) >= Version.coerce("4.8"):
        # Added "Pipeline Utility Steps" plugin via Jenkins Template
        # OCP team changed the default plugin list on OCP4.9
        tmp_dict["objects"][3]["spec"]["template"]["spec"]["containers"][0][
            "env"
        ].append(
            {
                "name": "INSTALL_PLUGINS",
                "value": "scm-api:2.6.5,pipeline-utility-steps:2.12.0,workflow-step-api:622."
                "vb_8e7c15b_c95a_,workflow-cps:2648.va9433432b33c,workflow-api:2.47",
            }
        )
    ocs_jenkins_template_obj = OCS(**tmp_dict)
    ocs_jenkins_template_obj.create()
def setup_amq_kafka_persistent(self, sc_name, size=100, replicas=3):
    """
    Function to setup amq-kafka-persistent. The example file is pulled from
    GitHub; it creates a resource of kind: Kafka and makes sure its pods
    reach Running state.

    Args:
        sc_name (str): Name of sc
        size (int): Size of the storage in Gi
        replicas (int): Number of kafka and zookeeper pods to be created

    Returns:
        OCS: The kafka_persistent resource object

    """
    if storagecluster_independent_check():
        sc_name = constants.DEFAULT_EXTERNAL_MODE_STORAGECLASS_RBD
    try:
        kafka_persistent = templating.load_yaml(
            os.path.join(self.dir, self.amq_kafka_pers_yaml)
        )
        kafka_persistent["spec"]["kafka"]["replicas"] = replicas
        kafka_persistent["spec"]["kafka"]["storage"]["volumes"][0]["class"] = sc_name
        kafka_persistent["spec"]["kafka"]["storage"]["volumes"][0][
            "size"
        ] = f"{size}Gi"
        kafka_persistent["spec"]["zookeeper"]["replicas"] = replicas
        kafka_persistent["spec"]["zookeeper"]["storage"]["class"] = sc_name
        kafka_persistent["spec"]["zookeeper"]["storage"]["size"] = f"{size}Gi"
        self.kafka_persistent = OCS(**kafka_persistent)
        self.kafka_persistent.create()
    except (CommandFailed, CalledProcessError) as cf:
        log.error("Failed during setup of AMQ Kafka-persistent")
        raise cf
    time.sleep(40)

    if self.is_amq_pod_running(
        pod_pattern="my-cluster", expected_pods=(replicas * 2) + 1
    ):
        return self.kafka_persistent
    else:
        raise ResourceWrongStatusException(
            "my-cluster-kafka and my-cluster-zookeeper "
            "Pod is not getting to running state"
        )
def get_credentials(self, secret_name=constants.NOOBAA_OBJECTSTOREUSER_SECRET):
    """
    Get Endpoint, Access key and Secret key from OCS secret. Endpoint is
    taken from rgw exposed service. Use rgw_endpoint fixture in test to get
    it exposed.

    Args:
        secret_name (str): Name of secret to be used
            for getting RGW credentials

    Returns:
        tuple: Endpoint, Access key, Secret key

    """
    secret_ocp_obj = OCP(kind=constants.SECRET, namespace=self.namespace)
    route_ocp_obj = OCP(
        kind=constants.ROUTE, namespace=config.ENV_DATA["cluster_namespace"]
    )

    if storagecluster_independent_check():
        if version.get_semantic_ocs_version_from_config() < version.VERSION_4_7:
            endpoint = route_ocp_obj.get(
                resource_name=constants.RGW_SERVICE_EXTERNAL_MODE
            )
        else:
            endpoint = route_ocp_obj.get(
                resource_name=constants.RGW_ROUTE_EXTERNAL_MODE
            )
        if secret_name == constants.NOOBAA_OBJECTSTOREUSER_SECRET:
            secret_name = constants.EXTERNAL_MODE_NOOBAA_OBJECTSTOREUSER_SECRET
        elif secret_name == constants.CEPH_OBJECTSTOREUSER_SECRET:
            secret_name = constants.CEPH_EXTERNAL_OBJECTSTOREUSER_SECRET
    else:
        if version.get_semantic_ocs_version_from_config() < version.VERSION_4_7:
            endpoint = route_ocp_obj.get(
                resource_name=constants.RGW_SERVICE_INTERNAL_MODE
            )
        else:
            endpoint = route_ocp_obj.get(
                resource_name=constants.RGW_ROUTE_INTERNAL_MODE
            )

    creds_secret_obj = secret_ocp_obj.get(secret_name)
    endpoint = f"http://{endpoint['status']['ingress'][0]['host']}"
    access_key = base64.b64decode(
        creds_secret_obj.get("data").get("AccessKey")
    ).decode("utf-8")
    secret_key = base64.b64decode(
        creds_secret_obj.get("data").get("SecretKey")
    ).decode("utf-8")
    return (endpoint, access_key, secret_key)
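# Small self-contained sketch of the secret-decoding step above: Kubernetes stores
# Secret values base64-encoded, so both keys have to be decoded before use. The
# `example_secret` dict below is made up for illustration, not a real credential.
import base64


def decode_rgw_secret(secret):
    """Return (access_key, secret_key) decoded from a Secret's 'data' mapping."""
    data = secret["data"]
    access_key = base64.b64decode(data["AccessKey"]).decode("utf-8")
    secret_key = base64.b64decode(data["SecretKey"]).decode("utf-8")
    return access_key, secret_key


example_secret = {
    "data": {"AccessKey": "QUtJQUlPU0ZPRE5ON0VYQU1QTEU=", "SecretKey": "c2VjcmV0"}
}
print(decode_rgw_secret(example_secret))  # ('AKIAIOSFODNN7EXAMPLE', 'secret')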
def setup_postgresql(self, replicas, sc_name=None):
    """
    Deploy postgres sql server

    Args:
        replicas (int): Number of postgresql pods to be deployed
        sc_name (str): Name of a storage class to use for the postgresql PVCs

    Raises:
        CommandFailed: If PostgreSQL server setup fails

    """
    log.info("Deploying postgres database")
    try:
        pgsql_service = templating.load_yaml(constants.PGSQL_SERVICE_YAML)
        pgsql_cmap = templating.load_yaml(constants.PGSQL_CONFIGMAP_YAML)
        pgsql_sset = templating.load_yaml(constants.PGSQL_STATEFULSET_YAML)
        pgsql_sset["spec"]["replicas"] = replicas
        if (
            storagecluster_independent_check()
            and config.ENV_DATA["platform"].lower()
            not in constants.MANAGED_SERVICE_PLATFORMS
        ):
            pgsql_sset["spec"]["volumeClaimTemplates"][0]["spec"][
                "storageClassName"
            ] = constants.DEFAULT_EXTERNAL_MODE_STORAGECLASS_RBD
        if sc_name:
            pgsql_sset["spec"]["volumeClaimTemplates"][0]["spec"][
                "storageClassName"
            ] = sc_name
        self.pgsql_service = OCS(**pgsql_service)
        self.pgsql_service.create()
        self.pgsql_cmap = OCS(**pgsql_cmap)
        self.pgsql_cmap.create()
        self.pgsql_sset = OCS(**pgsql_sset)
        self.pgsql_sset.create()
        self.pod_obj.wait_for_resource(
            condition="Running",
            selector="app=postgres",
            resource_count=replicas,
            timeout=3600,
        )
    except (CommandFailed, CalledProcessError) as cf:
        log.error("Failed during setup of PostgreSQL server")
        raise cf
    self.pgsql_is_setup = True
    log.info("Successfully deployed postgres database")
def __init__(self):
    """
    Initializer function
    """
    self.namespace = constants.OPENSHIFT_OPERATORS
    self.quay_operator = None
    self.quay_registry = None
    self.quay_pod_obj = OCP(kind=constants.POD, namespace=self.namespace)
    self.quay_registry_name = ""
    self.quay_operator_csv = ""
    self.sc_default = False
    self.sc_name = (
        constants.DEFAULT_EXTERNAL_MODE_STORAGECLASS_RBD
        if storagecluster_independent_check()
        else constants.DEFAULT_STORAGECLASS_RBD
    )
def search_file_path(self):
    """
    Search File Path

    """
    version = get_ocs_parsed_version()
    if self.type_log == "OTHERS" and storagecluster_independent_check():
        files = GATHER_COMMANDS_VERSION[version]["OTHERS_EXTERNAL"]
    else:
        files = GATHER_COMMANDS_VERSION[version][self.type_log]
    for file in files:
        self.files_not_exist.append(file)
        for dir_name, subdir_list, files_list in os.walk(self.root):
            if file in files_list:
                self.files_path[file] = os.path.join(dir_name, file)
                self.files_not_exist.remove(file)
                break
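# Standalone sketch of the directory-walk lookup above (the names here are generic,
# not the must-gather helper's attributes): map each expected file name to the first
# path where it is found under `root`, and report anything missing.
import os


def locate_expected_files(root, expected_files):
    """Return (found, missing) for the expected file names under root."""
    found, missing = {}, set(expected_files)
    for dir_name, _subdirs, files_list in os.walk(root):
        for file_name in list(missing):
            if file_name in files_list:
                found[file_name] = os.path.join(dir_name, file_name)
                missing.discard(file_name)
    return found, sorted(missing)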
def create_cb_cluster(self, replicas=1, sc_name=None):
    """
    Deploy a Couchbase server using Couchbase operator

    Once the couchbase operator is running, we need to wait for the
    worker pods to be up. Once the Couchbase worker pods are up, a
    pillowfight task is started.

    After the pillowfight task has finished, the log is collected and
    analyzed.

    Args:
        replicas (int): Number of couchbase worker pods to deploy
        sc_name (str): Name of a storage class to use for the worker PVCs

    Raises:
        Exception: If pillowfight results indicate that a minimum
            performance level is not reached (1 second response time,
            less than 1000 ops per second)

    """
    log.info("Creating Couchbase worker pods...")
    cb_example = templating.load_yaml(constants.COUCHBASE_WORKER_EXAMPLE)
    if (
        storagecluster_independent_check()
        and config.ENV_DATA["platform"].lower()
        not in constants.MANAGED_SERVICE_PLATFORMS
    ):
        cb_example["spec"]["volumeClaimTemplates"][0]["spec"][
            "storageClassName"
        ] = constants.DEFAULT_EXTERNAL_MODE_STORAGECLASS_RBD
    cb_example["spec"]["servers"][0]["size"] = replicas
    if sc_name:
        cb_example["spec"]["volumeClaimTemplates"][0]["spec"][
            "storageClassName"
        ] = sc_name
    self.cb_example = OCS(**cb_example)
    self.cb_example.create()
    self.cb_create_cb_cluster = True

    # Wait for the Couchbase workers to be running.
    log.info("Waiting for the Couchbase pods to be Running")
    self.pod_obj.wait_for_resource(
        condition="Running",
        selector="app=couchbase",
        resource_count=replicas,
        timeout=900,
    )
    log.info(
        f"Expected number: {replicas} of couchbase workers reached running state"
    )
def search_file_path(self):
    """
    Search File Path

    """
    ocs_version = float(
        f"{version.get_ocs_version_from_csv(only_major_minor=True)}"
    )
    if self.type_log == "OTHERS" and storagecluster_independent_check():
        files = GATHER_COMMANDS_VERSION[ocs_version]["OTHERS_EXTERNAL"]
    else:
        files = GATHER_COMMANDS_VERSION[ocs_version][self.type_log]
    for file in files:
        self.files_not_exist.append(file)
        for dir_name, subdir_list, files_list in os.walk(self.root):
            if file in files_list:
                self.files_path[file] = os.path.join(dir_name, file)
                self.files_not_exist.remove(file)
                break
def test_monitoring_enabled():
    """
    OCS Monitoring is enabled after OCS installation (which is why this test
    has a post deployment marker) by asking for values of one ceph and one
    noobaa related metric.
    """
    prometheus = PrometheusAPI()

    if (
        storagecluster_independent_check()
        and float(config.ENV_DATA["ocs_version"]) < 4.6
    ):
        logger.info(
            f"Skipping ceph metrics because it is not enabled for external "
            f"mode for OCS {float(config.ENV_DATA['ocs_version'])}"
        )
    else:
        # ask for values of ceph_pool_stored metric
        logger.info("Checking that ceph data are provided in OCS monitoring")
        result = prometheus.query("ceph_pool_stored")
        msg = "check that we actually received some values for a ceph query"
        assert len(result) > 0, msg
        for metric in result:
            _, value = metric["value"]
            assert_msg = "number of bytes in a pool isn't a positive integer or zero"
            assert int(value) >= 0, assert_msg
        # additional check that values make at least some sense
        logger.info(
            "Checking that size of ceph_pool_stored result matches number of pools"
        )
        ct_pod = pod.get_ceph_tools_pod()
        ceph_pools = ct_pod.exec_ceph_cmd("ceph osd pool ls")
        assert len(result) == len(ceph_pools)

    # again for a noobaa metric
    logger.info("Checking that MCG/NooBaa data are provided in OCS monitoring")
    result = prometheus.query("NooBaa_bucket_status")
    msg = "check that we actually received some values for a MCG/NooBaa query"
    assert len(result) > 0, msg
    for metric in result:
        _, value = metric["value"]
        assert int(value) >= 0, "bucket status isn't a positive integer or zero"
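# Hedged sketch of the same kind of check done directly against the Prometheus HTTP
# API (the PrometheusAPI wrapper above hides these details); the route URL and token
# are placeholders you would have to supply for a real cluster.
import requests


def query_prometheus(base_url, token, query):
    """Run an instant query and return the list of (labels, value) samples."""
    resp = requests.get(
        f"{base_url}/api/v1/query",
        params={"query": query},
        headers={"Authorization": f"Bearer {token}"},
        verify=False,  # self-signed router certificates are common in test clusters
    )
    resp.raise_for_status()
    body = resp.json()
    assert body["status"] == "success", body
    return [
        (item["metric"], float(item["value"][1])) for item in body["data"]["result"]
    ]


# Example: every ceph_pool_stored sample should be a non-negative byte count.
# samples = query_prometheus("https://prometheus.example.com", "<token>", "ceph_pool_stored")
# assert all(value >= 0 for _labels, value in samples)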
def __init__(self, namespace=None):
    self.namespace = (
        namespace if namespace else config.ENV_DATA["cluster_namespace"]
    )

    if storagecluster_independent_check():
        sc_name = constants.DEFAULT_EXTERNAL_MODE_STORAGECLASS_RGW
    else:
        sc_name = constants.DEFAULT_STORAGECLASS_RGW

    self.storageclass = OCP(
        kind="storageclass", namespace=namespace, resource_name=sc_name
    )
    self.s3_internal_endpoint = (
        self.storageclass.get().get("parameters").get("endpoint")
    )
    self.region = self.storageclass.get().get("parameters").get("region")
    # Todo: Implement retrieval in cases where CephObjectStoreUser is available
    self.key_id = None
    self.secret_key = None
    self.s3_resource = None
def setup_amq_cluster(
    self, sc_name, namespace=constants.AMQ_NAMESPACE, size=100, replicas=3
):
    """
    Creates amq cluster with persistent storage.

    Args:
        sc_name (str): Name of sc
        namespace (str): Namespace for amq cluster
        size (int): Size of the storage
        replicas (int): Number of kafka and zookeeper pods to be created

    """
    if storagecluster_independent_check():
        sc_name = constants.DEFAULT_EXTERNAL_MODE_STORAGECLASS_RBD
    self.setup_amq_cluster_operator(namespace)
    self.setup_amq_kafka_persistent(sc_name, size, replicas)
    self.setup_amq_kafka_connect()
    self.setup_amq_kafka_bridge()
    self.amq_is_setup = True
    return self
def create_jenkins_pvc(self):
    """
    create jenkins pvc

    Returns:
        List: pvc_objs
    """
    pvc_objs = []
    sc_name = (
        constants.DEFAULT_EXTERNAL_MODE_STORAGECLASS_RBD
        if storagecluster_independent_check()
        else constants.DEFAULT_STORAGECLASS_RBD
    )
    for project in self.projects:
        log.info(f"create jenkins pvc on project {project}")
        pvc_obj = create_pvc(
            pvc_name="dependencies",
            size="10Gi",
            sc_name=sc_name,
            namespace=project,
        )
        pvc_objs.append(pvc_obj)
    return pvc_objs
def __init__(
    self,
    project,
    tmp_path,
    storage_size=2,
):
    """
    Init of the LogReaderWriterParallel object

    Args:
        project (pytest fixture): The project fixture.
        tmp_path (pytest fixture): The tmp_path fixture.
        storage_size (int): The size of the storage in GiB. The default value is 2 GiB.

    """
    self.project = project
    self.tmp_path = tmp_path

    self.pvc_dict = get_pvc_dict()
    # we need to mount the volume on every worker node, so RWX/cephfs
    self.pvc_dict["metadata"]["name"] = "logwriter-cephfs-many"
    self.pvc_dict["spec"]["accessModes"] = [constants.ACCESS_MODE_RWX]
    if storagecluster_independent_check() and not is_managed_service_cluster():
        sc_name = constants.DEFAULT_EXTERNAL_MODE_STORAGECLASS_CEPHFS
    else:
        sc_name = constants.CEPHFILESYSTEM_SC
    logger.info(f"Storage class name = {sc_name}")
    self.pvc_dict["spec"]["storageClassName"] = sc_name
    self.pvc_dict["spec"]["resources"]["requests"]["storage"] = f"{storage_size}Gi"

    self.deploy_dict = {}
    self.workload_file = None
    self.ocp_pod = None

    self.local_dir = self.tmp_path / "logwriter"
    self.local_dir.mkdir()
def setup(
    self,
    request,
    scenario,
    num_of_nodes,
    num_of_fail_nodes,
    disrupt_provisioner,
    project_factory,
    multi_pvc_factory,
    dc_pod_factory,
):
    """
    Identify the nodes and start DeploymentConfig based app pods using
    PVC with ReadWriteOnce (RWO) access mode on selected nodes

    Args:
        scenario (str): Scenario of app pods running on OCS or dedicated nodes
            (eg., 'colocated', 'dedicated')
        num_of_nodes (int): number of nodes required for running test
        num_of_fail_nodes (int): number of nodes to make unresponsive during test
        disrupt_provisioner (bool): True to disrupt the leader provisioner
            pods if not running on selected nodes, else False
        project_factory: A fixture to create new project
        multi_pvc_factory: A fixture to create a set of new PVCs
        dc_pod_factory: A fixture to create deploymentconfig pods

    Returns:
        tuple: containing the params used in test cases

    """
    ocs_nodes, non_ocs_nodes = self.identify_and_add_nodes(scenario, num_of_nodes)
    test_nodes = ocs_nodes if (scenario == "colocated") else non_ocs_nodes
    logger.info(f"Using nodes {test_nodes} for running test")

    def finalizer():
        helpers.remove_label_from_worker_node(
            node_list=test_nodes, label_key="nodetype"
        )

        # Check ceph health
        ceph_health_check(tries=40)

    request.addfinalizer(finalizer)

    project = project_factory()

    if helpers.storagecluster_independent_check():
        ceph_cluster = CephClusterExternal()
    else:
        ceph_cluster = CephCluster()
        # Wait for mon pods to reach expected count
        # Bug 1778273 - [RFE]: Configure 5 MONs for OCS cluster with 5 or more nodes
        # This wait is required for some of the previous OCS versions (< 4.5)
        current_mon_count = int(
            ceph_cluster.CEPHCLUSTER.get_resource(resource_name="", column="MONCOUNT")
        )
        assert ceph_cluster.POD.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            selector=constants.MON_APP_LABEL,
            resource_count=current_mon_count,
            timeout=900,
        )
        ceph_cluster.mons = []
        ceph_cluster.scan_cluster()

    # Select nodes for running app pods and inducing network failure later
    app_pod_nodes = self.select_nodes_for_app_pods(
        scenario, ceph_cluster, ocs_nodes, non_ocs_nodes, num_of_fail_nodes
    )

    # Create multiple RBD and CephFS backed PVCs with RWO accessmode
    num_of_pvcs = self.num_of_app_pods_per_node * num_of_fail_nodes
    rbd_pvcs = multi_pvc_factory(
        interface=constants.CEPHBLOCKPOOL,
        project=project,
        size=self.pvc_size,
        access_modes=[constants.ACCESS_MODE_RWO],
        num_of_pvc=num_of_pvcs,
    )
    cephfs_pvcs = multi_pvc_factory(
        interface=constants.CEPHFILESYSTEM,
        project=project,
        size=self.pvc_size,
        access_modes=[constants.ACCESS_MODE_RWO],
        num_of_pvc=num_of_pvcs,
    )

    # Create deploymentconfig based pods
    dc_pods = []
    # Start app-pods on selected node(s)
    for node_name in app_pod_nodes:
        logger.info(f"Starting app pods on the node {node_name}")
        helpers.label_worker_node(
            node_list=[node_name], label_key="nodetype", label_value="app-pod"
        )

        for num in range(self.num_of_app_pods_per_node):
            dc_pods.append(
                dc_pod_factory(
                    interface=constants.CEPHBLOCKPOOL,
                    pvc=rbd_pvcs.pop(0),
                    node_selector={"nodetype": "app-pod"},
                )
            )
            assert pod.verify_node_name(
                dc_pods[-1], node_name
            ), f"Pod {dc_pods[-1].name} is not running on labeled node {node_name}"
            dc_pods.append(
                dc_pod_factory(
                    interface=constants.CEPHFILESYSTEM,
                    pvc=cephfs_pvcs.pop(0),
                    node_selector={"nodetype": "app-pod"},
                )
            )
            assert pod.verify_node_name(
                dc_pods[-1], node_name
            ), f"Pod {dc_pods[-1].name} is not running on labeled node {node_name}"
        helpers.remove_label_from_worker_node(
            node_list=[node_name], label_key="nodetype"
        )

    # Label other test nodes to be able to run app pods later
    helpers.label_worker_node(
        node_list=test_nodes, label_key="nodetype", label_value="app-pod"
    )

    # Get ceph mon,osd pods running on selected node if colocated scenario
    # and extra OCS nodes are present
    # Recovery steps for MON and OSDS not required from OCS 4.4 onwards
    # Refer to BZ 1830015 and BZ 1835908
    ceph_pods = []
    if float(config.ENV_DATA["ocs_version"]) < 4.4 and (
        scenario == "colocated" and len(test_nodes) > 3
    ):
        pods_to_check = ceph_cluster.osds
        # Skip mon pods if mon_count is 5 as there may not be enough nodes
        # for all mons to run after multiple node failures
        if ceph_cluster.mon_count == 3:
            pods_to_check.extend(ceph_cluster.mons)
        for pod_obj in pods_to_check:
            if pod.get_pod_node(pod_obj).name in app_pod_nodes[0]:
                ceph_pods.append(pod_obj)
        logger.info(
            f"Colocated Mon, OSD pods: {[pod_obj.name for pod_obj in ceph_pods]}"
        )

    disruptor = []
    if disrupt_provisioner:
        disruptor = self.disrupt_plugin_provisioner_pods(app_pod_nodes)

    return ceph_cluster, dc_pods, ceph_pods, app_pod_nodes, test_nodes, disruptor
def __init__(self, *args, **kwargs):
    """
    Constructor for the MCG class
    """
    self.namespace = config.ENV_DATA["cluster_namespace"]
    self.operator_pod = Pod(
        **get_pods_having_label(constants.NOOBAA_OPERATOR_POD_LABEL, self.namespace)[0]
    )
    self.core_pod = Pod(
        **get_pods_having_label(constants.NOOBAA_CORE_POD_LABEL, self.namespace)[0]
    )
    self.retrieve_noobaa_cli_binary()

    """
    The certificate will be copied on each mcg_obj instantiation since
    the process is so light and quick, that the time required for the
    redundant copy is negligible in comparison to the time a hash
    comparison will take.
    """
    retrieve_default_ingress_crt()

    get_noobaa = OCP(kind="noobaa", namespace=self.namespace).get()
    self.s3_endpoint = (
        get_noobaa.get("items")[0]
        .get("status")
        .get("services")
        .get("serviceS3")
        .get("externalDNS")[0]
    )
    self.s3_internal_endpoint = (
        get_noobaa.get("items")[0]
        .get("status")
        .get("services")
        .get("serviceS3")
        .get("internalDNS")[0]
    )
    self.mgmt_endpoint = (
        get_noobaa.get("items")[0]
        .get("status")
        .get("services")
        .get("serviceMgmt")
        .get("externalDNS")[0]
    ) + "/rpc"
    self.region = config.ENV_DATA["region"]

    creds_secret_name = (
        get_noobaa.get("items")[0]
        .get("status")
        .get("accounts")
        .get("admin")
        .get("secretRef")
        .get("name")
    )
    secret_ocp_obj = OCP(kind="secret", namespace=self.namespace)
    creds_secret_obj = secret_ocp_obj.get(creds_secret_name)

    self.access_key_id = base64.b64decode(
        creds_secret_obj.get("data").get("AWS_ACCESS_KEY_ID")
    ).decode("utf-8")
    self.access_key = base64.b64decode(
        creds_secret_obj.get("data").get("AWS_SECRET_ACCESS_KEY")
    ).decode("utf-8")
    self.noobaa_user = base64.b64decode(
        creds_secret_obj.get("data").get("email")
    ).decode("utf-8")
    self.noobaa_password = base64.b64decode(
        creds_secret_obj.get("data").get("password")
    ).decode("utf-8")

    self.noobaa_token = self.retrieve_nb_token()

    self.s3_resource = boto3.resource(
        "s3",
        verify=retrieve_verification_mode(),
        endpoint_url=self.s3_endpoint,
        aws_access_key_id=self.access_key_id,
        aws_secret_access_key=self.access_key,
    )

    self.s3_client = self.s3_resource.meta.client

    if config.ENV_DATA["platform"].lower() == "aws" and kwargs.get(
        "create_aws_creds"
    ):
        (
            self.cred_req_obj,
            self.aws_access_key_id,
            self.aws_access_key,
        ) = self.request_aws_credentials()

        self.aws_s3_resource = boto3.resource(
            "s3",
            endpoint_url="https://s3.amazonaws.com",
            aws_access_key_id=self.aws_access_key_id,
            aws_secret_access_key=self.aws_access_key,
        )

    if (
        config.ENV_DATA["platform"].lower() in constants.CLOUD_PLATFORMS
        or storagecluster_independent_check()
    ):
        if not config.ENV_DATA["platform"] == constants.AZURE_PLATFORM and (
            float(config.ENV_DATA["ocs_version"]) > 4.5
        ):
            logger.info("Checking whether RGW pod is not present")
            pods = pod.get_pods_having_label(
                label=constants.RGW_APP_LABEL, namespace=self.namespace
            )
            assert (
                not pods
            ), "RGW pods should not exist in the current platform/cluster"

    elif config.ENV_DATA.get("platform") in constants.ON_PREM_PLATFORMS:
        rgw_count = get_rgw_count(
            config.ENV_DATA["ocs_version"], check_if_cluster_was_upgraded(), None
        )
        logger.info(
            f'Checking for RGW pod/s on {config.ENV_DATA.get("platform")} platform'
        )
        rgw_pod = OCP(kind=constants.POD, namespace=self.namespace)
        assert rgw_pod.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            selector=constants.RGW_APP_LABEL,
            resource_count=rgw_count,
            timeout=60,
        )
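# Minimal sketch of talking to an S3-compatible endpoint the way the MCG class does
# with boto3; the endpoint and credentials below are placeholders, and certificate
# verification is disabled only because test clusters commonly use self-signed
# certificates.
import boto3


def make_s3_resource(endpoint, access_key_id, secret_access_key):
    """Return a boto3 S3 resource pointed at a custom (NooBaa/RGW) endpoint."""
    return boto3.resource(
        "s3",
        endpoint_url=endpoint,
        aws_access_key_id=access_key_id,
        aws_secret_access_key=secret_access_key,
        verify=False,
    )


# s3 = make_s3_resource("https://s3-openshift-storage.apps.example.com", "KEY", "SECRET")
# print([bucket.name for bucket in s3.buckets.all()])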
def test_rwo_pvc_fencing_node_prolonged_and_short_network_failure(
    self, nodes, setup, node_restart_teardown
):
    """
    OCS-1431/OCS-1436:
    - Start DeploymentConfig based app pods on 1 node
    - Make the node (where app pods are running) unresponsive
        by bringing its main network interface down
    - Disrupt the leader provisioner pods if not running on above selected node
    - Check new app pods and/or mon, osd pods scheduled on another node
        are stuck due to Multi-Attach error.
    - Power off the unresponsive node
    - Force delete the app pods and/or mon,osd pods on the unresponsive node
    - Check new app pods and/or mon, osd pods scheduled on another node comes
        into Running state
    - Run IOs on new app pods
    - Again make the node (where app pods are running) unresponsive
        by bringing its main network interface down
    - Check new app pods scheduled on another node are stuck due to
        Multi-Attach error.
    - Reboot the unresponsive node
    - When unresponsive node recovers, run IOs on new app pods

    """
    ceph_cluster, dc_pods, ceph_pods, app_pod_nodes, test_nodes, disruptor = setup

    external_mode = helpers.storagecluster_independent_check()
    extra_nodes = list(set(test_nodes) - set(app_pod_nodes))
    helpers.remove_label_from_worker_node(
        node_list=extra_nodes[:-1], label_key="nodetype"
    )

    # Run IO on pods
    md5sum_data = self.run_and_verify_io(
        pod_list=dc_pods, fio_filename="io_file1", run_io_in_bg=True
    )

    # Disrupt leader plugin-provisioner pods, skip if running on node to be failed
    if disruptor:
        [disruption.delete_resource() for disruption in disruptor]

    # Induce network failure on the nodes
    node.node_network_failure(app_pod_nodes)
    logger.info(f"Waiting for {self.prolong_nw_fail_time} seconds")
    sleep(self.prolong_nw_fail_time)

    # Wait for pods to be rescheduled
    for pod_obj in dc_pods + ceph_pods:
        pod_obj.ocp.wait_for_resource(
            condition=constants.STATUS_TERMINATING, resource_name=pod_obj.name
        )

    # Fetch info of new pods and verify Multi-Attach error
    new_dc_pods = self.get_new_pods(dc_pods)
    assert len(new_dc_pods) == len(dc_pods), "Unexpected number of app pods"
    self.verify_multi_attach_error(new_dc_pods)

    new_ceph_pods = []
    if ceph_pods:
        new_ceph_pods = self.get_new_pods(ceph_pods)
        assert len(new_ceph_pods) > 0, "Unexpected number of osd pods"
        self.verify_multi_attach_error(new_ceph_pods)

    logger.info("Executing manual recovery steps")
    # Power off the unresponsive node
    logger.info(f"Powering off the unresponsive node: {app_pod_nodes}")
    nodes.stop_nodes(node.get_node_objs(app_pod_nodes))

    # Force delete the app pods and/or mon,osd pods on the unresponsive node
    for pod_obj in dc_pods + ceph_pods:
        pod_obj.delete(force=True)

    # Wait for new app pods to reach Running state
    for pod_obj in new_dc_pods:
        pod_obj.ocp.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            resource_name=pod_obj.name,
            timeout=1200,
            sleep=30,
        ), f"App pod with name {pod_obj.name} did not reach Running state"

    if not external_mode:
        # Wait for mon and osd pods to reach Running state
        selectors_to_check = {
            constants.MON_APP_LABEL: self.expected_mon_count,
            constants.OSD_APP_LABEL: ceph_cluster.osd_count,
        }
        for selector, count in selectors_to_check.items():
            assert ceph_cluster.POD.wait_for_resource(
                condition=constants.STATUS_RUNNING,
                selector=selector,
                resource_count=count,
                timeout=1800,
                sleep=60,
            ), f"{count} expected pods with selector {selector} are not in Running state"

        if ceph_cluster.mon_count == self.expected_mon_count:
            # Check ceph health
            toolbox_status = ceph_cluster.POD.get_resource_status(
                ceph_cluster.toolbox.name
            )
            if toolbox_status == constants.STATUS_TERMINATING:
                ceph_cluster.toolbox.delete(force=True)

            assert ceph_health_check(), "Ceph cluster health is not OK"
            logger.info("Ceph cluster health is OK")

    # Verify data integrity from new pods
    for num, pod_obj in enumerate(new_dc_pods):
        pod.verify_data_integrity(
            pod_obj=pod_obj, file_name="io_file1", original_md5sum=md5sum_data[num]
        )

    # Run IO on new pods
    md5sum_data2 = self.run_and_verify_io(
        pod_list=new_dc_pods, fio_filename="io_file2", run_io_in_bg=True
    )

    helpers.label_worker_node(
        node_list=extra_nodes[:-1], label_key="nodetype", label_value="app-pod"
    )

    # Induce network failure on the node
    node.node_network_failure(extra_nodes[-1])
    logger.info(f"Waiting for {self.short_nw_fail_time} seconds")
    sleep(self.short_nw_fail_time)

    # Wait for pods to be rescheduled
    for pod_obj in new_dc_pods:
        pod_obj.ocp.wait_for_resource(
            condition=constants.STATUS_TERMINATING,
            resource_name=pod_obj.name,
            timeout=600,
            sleep=30,
        )

    # Fetch info of new pods and verify Multi-Attach error
    new_dc_pods2 = self.get_new_pods(new_dc_pods)
    assert len(new_dc_pods2) == len(new_dc_pods), "Unexpected number of app pods"
    self.verify_multi_attach_error(new_dc_pods2)

    # Reboot the unresponsive node
    logger.info(f"Rebooting the unresponsive node: {extra_nodes[-1]}")
    nodes.restart_nodes_by_stop_and_start(node.get_node_objs([extra_nodes[-1]]))
    node.wait_for_nodes_status(
        node_names=[extra_nodes[-1]], status=constants.NODE_READY
    )

    # Wait for new app pods to reach Running state
    for pod_obj in new_dc_pods2:
        pod_obj.ocp.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            resource_name=pod_obj.name,
            timeout=1200,
            sleep=30,
        ), f"App pod with name {pod_obj.name} did not reach Running state"

    if not external_mode:
        # Wait for mon and osd pods to reach Running state
        for selector, count in selectors_to_check.items():
            assert ceph_cluster.POD.wait_for_resource(
                condition=constants.STATUS_RUNNING,
                selector=selector,
                resource_count=count,
                timeout=1800,
                sleep=60,
            ), f"{count} expected pods with selector {selector} are not in Running state"

        if ceph_cluster.mon_count == 3:
            # Check ceph health
            assert ceph_health_check(), "Ceph cluster health is not OK"
            logger.info("Ceph cluster health is OK")

    # Verify data integrity from new pods
    for num, pod_obj in enumerate(new_dc_pods2):
        pod.verify_data_integrity(
            pod_obj=pod_obj, file_name="io_file2", original_md5sum=md5sum_data2[num]
        )

    for num, pod_obj in enumerate(new_dc_pods2):
        pod.verify_data_integrity(
            pod_obj=pod_obj, file_name="io_file1", original_md5sum=md5sum_data[num]
        )

    # Run IO on new pods
    self.run_and_verify_io(
        pod_list=new_dc_pods2, fio_filename="io_file3", return_md5sum=False
    )
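# Small illustration of the idea behind the data-integrity check above (the ocs-ci
# helper performs a similar comparison against the file inside the pod): compare a
# file's current MD5 digest with the digest recorded right after the file was written.
import hashlib


def md5_of_file(path):
    """Return the hex MD5 digest of a file, reading it in chunks."""
    digest = hashlib.md5()
    with open(path, "rb") as handle:
        for chunk in iter(lambda: handle.read(1024 * 1024), b""):
            digest.update(chunk)
    return digest.hexdigest()


def verify_file_unchanged(path, original_md5sum):
    """Raise AssertionError if the file content no longer matches the original digest."""
    assert md5_of_file(path) == original_md5sum, f"Data integrity check failed for {path}"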
def test_rwo_pvc_fencing_node_prolonged_network_failure(
    self, nodes, setup, node_restart_teardown
):
    """
    OCS-1427/OCS-1429:
    - Start DeploymentConfig based app pods on 1 OCS/Non-OCS node
    - Make the node (where app pods are running) unresponsive
        by bringing its main network interface down
    - Check new app pods and/or mon, osd pods scheduled on another node
        are stuck due to Multi-Attach error.
    - Power off the unresponsive node
    - Force delete the app pods and/or mon,osd pods on the unresponsive node
    - Check new app pods and/or mon, osd pods scheduled on another node comes
        into Running state
    - Run IOs on new app pods

    OCS-1430/OCS-1435:
    - Start DeploymentConfig based app pods on multiple node
        Colocated scenario: Select 1 node where osd and/or mon is running,
            select other 2 nodes where mon/osd are not running
        Dedicated scenario: 3 Non-OCS nodes
    - Disrupt the leader provisioner pods if not running on above selected nodes
    - Make the nodes (where app pods are running) unresponsive
        by bringing their main network interface down
    - Check new app pods and/or mon, osd pods scheduled on another node
        are stuck due to Multi-Attach error.
    - Power off the unresponsive nodes
    - Force delete the app pods and/or mon,osd pods on the unresponsive node
    - Check new app pods and/or mon, osd pods scheduled on another node comes
        into Running state
    - Run IOs on new app pods

    """
    ceph_cluster, dc_pods, ceph_pods, app_pod_nodes, test_nodes, disruptor = setup
    external_mode = helpers.storagecluster_independent_check()

    # Run IO on pods
    md5sum_data = self.run_and_verify_io(
        pod_list=dc_pods, fio_filename="io_file1", run_io_in_bg=True
    )

    # OCS-1430/OCS-1435
    # Disrupt leader plugin-provisioner pods, skip if running on node to be failed
    if disruptor:
        [disruption.delete_resource() for disruption in disruptor]

    # Induce network failure on the nodes
    node.node_network_failure(app_pod_nodes)
    logger.info(f"Waiting for {self.prolong_nw_fail_time} seconds")
    sleep(self.prolong_nw_fail_time)

    # Wait for pods to be rescheduled
    for pod_obj in dc_pods + ceph_pods:
        pod_obj.ocp.wait_for_resource(
            condition=constants.STATUS_TERMINATING, resource_name=pod_obj.name
        )

    # Fetch info of new pods and verify Multi-Attach error
    new_dc_pods = self.get_new_pods(dc_pods)
    assert len(new_dc_pods) == len(dc_pods), "Unexpected number of app pods"
    self.verify_multi_attach_error(new_dc_pods)

    if ceph_pods:
        new_ceph_pods = self.get_new_pods(ceph_pods)
        assert len(new_ceph_pods) > 0, "Unexpected number of osd pods"
        self.verify_multi_attach_error(new_ceph_pods)

    logger.info("Executing manual recovery steps")
    # Power off the unresponsive node(s)
    logger.info(f"Powering off the unresponsive node(s): {app_pod_nodes}")
    nodes.stop_nodes(node.get_node_objs(app_pod_nodes))

    # Force delete the app pods and/or mon,osd pods on the unresponsive node
    if (
        float(config.ENV_DATA["ocs_version"]) < 4.4
        and ceph_cluster.mon_count == 5
    ):
        for pod_obj in ceph_cluster.mons:
            if pod.get_pod_node(pod_obj).name in app_pod_nodes:
                ceph_pods.append(pod_obj)

    for pod_obj in dc_pods + ceph_pods:
        pod_obj.delete(force=True)

    # Wait for new app pods to reach Running state
    for pod_obj in new_dc_pods:
        pod_obj.ocp.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            resource_name=pod_obj.name,
            timeout=1200,
            sleep=30,
        ), f"App pod with name {pod_obj.name} did not reach Running state"

    if not external_mode:
        # Wait for mon and osd pods to reach Running state
        selectors_to_check = {
            constants.MON_APP_LABEL: self.expected_mon_count,
            constants.OSD_APP_LABEL: ceph_cluster.osd_count,
        }
        for selector, count in selectors_to_check.items():
            assert ceph_cluster.POD.wait_for_resource(
                condition=constants.STATUS_RUNNING,
                selector=selector,
                resource_count=count,
                timeout=1800,
                sleep=60,
            ), f"{count} expected pods with selector {selector} are not in Running state"

        if ceph_cluster.mon_count == self.expected_mon_count:
            # Check ceph health
            toolbox_status = ceph_cluster.POD.get_resource_status(
                ceph_cluster.toolbox.name
            )
            if toolbox_status == constants.STATUS_TERMINATING:
                ceph_cluster.toolbox.delete(force=True)

            assert ceph_health_check(), "Ceph cluster health is not OK"
            logger.info("Ceph cluster health is OK")

    # Verify data integrity from new pods
    for num, pod_obj in enumerate(new_dc_pods):
        pod.verify_data_integrity(
            pod_obj=pod_obj, file_name="io_file1", original_md5sum=md5sum_data[num]
        )

    # Run IO on new pods
    self.run_and_verify_io(
        pod_list=new_dc_pods, fio_filename="io_file2", return_md5sum=False
    )
def test_log_reader_writer_parallel(project, tmp_path):
    """
    Write and read logfile stored on cephfs volume, from all worker nodes of a
    cluster via k8s Deployment, while fetching content of the stored data via
    oc rsync to check the data locally.

    Reproduces BZ 1989301. Test failure means new blocker high priority bug.
    """
    pvc_dict = get_pvc_dict()
    # we need to mount the volume on every worker node, so RWX/cephfs
    pvc_dict["metadata"]["name"] = "logwriter-cephfs-many"
    pvc_dict["spec"]["accessModes"] = [constants.ACCESS_MODE_RWX]
    if (
        config.ENV_DATA["platform"].lower() not in constants.MANAGED_SERVICE_PLATFORMS
    ) and storagecluster_independent_check():
        sc_name = constants.DEFAULT_EXTERNAL_MODE_STORAGECLASS_CEPHFS
    else:
        sc_name = constants.CEPHFILESYSTEM_SC
    pvc_dict["spec"]["storageClassName"] = sc_name
    # there is no need for lot of storage capacity for this test
    pvc_dict["spec"]["resources"]["requests"]["storage"] = "1Gi"

    # get deployment dict for the reproducer logwriter workload
    with open(constants.LOGWRITER_CEPHFS_REPRODUCER, "r") as deployment_file:
        deploy_dict = yaml.safe_load(deployment_file.read())
    # if we are running in disconnected environment, we need to mirror the
    # container image first, and then use the mirror instead of the original
    if config.DEPLOYMENT.get("disconnected"):
        update_container_with_mirrored_image(deploy_dict["spec"]["template"])
    # we need to match deployment replicas with number of worker nodes
    deploy_dict["spec"]["replicas"] = len(get_worker_nodes())
    # drop topology spread constraints related to zones
    topology.drop_topology_constraint(
        deploy_dict["spec"]["template"]["spec"], topology.ZONE_LABEL
    )
    # and link the deployment with the pvc
    try:
        link_spec_volume(
            deploy_dict["spec"]["template"]["spec"],
            "logwriter-cephfs-volume",
            pvc_dict["metadata"]["name"],
        )
    except Exception as ex:
        error_msg = "LOGWRITER_CEPHFS_REPRODUCER no longer matches code of this test"
        raise Exception(error_msg) from ex

    # prepare k8s yaml file for deployment
    workload_file = ObjectConfFile(
        "log_reader_writer_parallel", [pvc_dict, deploy_dict], project, tmp_path
    )
    # deploy the workload, starting the log reader/writer pods
    logger.info(
        "starting log reader/writer workload via Deployment, one pod per worker"
    )
    workload_file.create()

    logger.info("waiting for all pods of the workload Deployment to run")
    ocp_pod = ocp.OCP(kind="Pod", namespace=project.namespace)
    try:
        ocp_pod.wait_for_resource(
            resource_count=deploy_dict["spec"]["replicas"],
            condition=constants.STATUS_RUNNING,
            error_condition=constants.STATUS_ERROR,
            timeout=300,
            sleep=30,
        )
    except Exception as ex:
        # this is not a problem with feature under test, but with infra,
        # cluster configuration or unrelated bug which must have happened
        # before this test case
        error_msg = "unexpected problem with start of the workload, cluster is either misconfigured or broken"
        logger.exception(error_msg)
        logger.debug(workload_file.describe())
        raise exceptions.UnexpectedBehaviour(error_msg) from ex

    # while the workload is running, we will try to fetch and validate data
    # from the cephfs volume of the workload 120 times (this number of retries
    # is a bit larger than usual number required to reproduce bug from
    # BZ 1989301, but we need to be sure here)
    number_of_fetches = 120
    # if a given fetch fails, we will ignore the failure unless the number of
    # failures is too high (this has no direct impact on feature under test,
    # we should be able to detect the bug even with 10% of rsync failures,
    # since data corruption doesn't simply go away ...)
    number_of_failures = 0
    allowed_failures = 12
    is_local_data_ok = True
    local_dir = tmp_path / "logwriter"
    local_dir.mkdir()
    workload_pods = ocp_pod.get()
    workload_pod_name = workload_pods["items"][0]["metadata"]["name"]
    logger.info(
        "while the workload is running, we will fetch and check data from the cephfs volume %d times",
        number_of_fetches,
    )
    for _ in range(number_of_fetches):
        # fetch data from cephfs volume into the local dir
        oc_cmd = [
            "oc",
            "rsync",
            "--loglevel=4",
            "-n",
            project.namespace,
            f"pod/{workload_pod_name}:/mnt/target",
            local_dir,
        ]
        try:
            run_cmd(cmd=oc_cmd, timeout=300)
        except Exception as ex:
            number_of_failures += 1
            # in case this fails, we are going to fetch extra evidence, that
            # said such failure is most likely related to OCP or infrastructure
            error_msg = "oc rsync failed: something is wrong with the cluster"
            logger.exception(error_msg)
            logger.debug(workload_file.describe())
            oc_rpm_debug = [
                "oc",
                "rsh",
                "-n",
                project.namespace,
                f"pod/{workload_pod_name}",
                "bash",
                "-c",
                ";".join(
                    [
                        "rpm -qa",
                        "rpm -qaV",
                        "type -a tar",
                        "tar --version",
                        "type -a rsync",
                        "rsync --version",
                    ]
                ),
            ]
            try:
                run_cmd(cmd=oc_rpm_debug, timeout=600)
            except Exception:
                # if fetch of additional evidence fails, log and ignore the
                # exception (so that we can retry if needed)
                logger.exception("failed to fetch additional evidence")
            # in case the rsync run failed because of a container restart,
            # we assume the pod name hasn't changed, and just wait for the
            # container to be running again - unless the number of rsync
            # failures is too high
            if number_of_failures > allowed_failures:
                logger.error("number of ignored rsync failures is too high")
            else:
                ocp_pod.wait_for_resource(
                    resource_count=deploy_dict["spec"]["replicas"],
                    condition=constants.STATUS_RUNNING,
                    error_condition=constants.STATUS_ERROR,
                    timeout=300,
                    sleep=30,
                )
                continue
            logger.debug(
                "before this failure, we ignored %d previous failures",
                number_of_failures,
            )
            raise exceptions.UnexpectedBehaviour(error_msg) from ex
        # look for null bytes in the just fetched local files in target dir,
        # and if these binary bytes are found, the test failed (the bug
        # was reproduced)
        target_dir = os.path.join(local_dir, "target")
        for file_name in os.listdir(target_dir):
            with open(os.path.join(target_dir, file_name), "r") as fo:
                data = fo.read()
            if "\0" in data:
                is_local_data_ok = False
                logger.error(
                    "file %s is corrupted: null byte found in a text file",
                    file_name,
                )
        assert is_local_data_ok, "data corruption detected"
        time.sleep(2)

    logger.debug("number of ignored rsync failures: %d", number_of_failures)

    # if no obvious problem was detected, run the logreader job to validate
    # checksums in the log files (so that we are 100% sure that nothing went
    # wrong with the IO or the data)
    with open(constants.LOGWRITER_CEPHFS_READER, "r") as job_file:
        job_dict = yaml.safe_load(job_file.read())
    # mirroring for disconnected environment, if necessary
    if config.DEPLOYMENT.get("disconnected"):
        update_container_with_mirrored_image(job_dict["spec"]["template"])
    # drop topology spread constraints related to zones
    topology.drop_topology_constraint(
        job_dict["spec"]["template"]["spec"], topology.ZONE_LABEL
    )
    # we need to match number of jobs with the number used in the workload
    job_dict["spec"]["completions"] = deploy_dict["spec"]["replicas"]
    job_dict["spec"]["parallelism"] = deploy_dict["spec"]["replicas"]
    # and refer to the correct pvc name
    try:
        link_spec_volume(
            job_dict["spec"]["template"]["spec"],
            "logwriter-cephfs-volume",
            pvc_dict["metadata"]["name"],
        )
    except Exception as ex:
        error_msg = "LOGWRITER_CEPHFS_READER no longer matches code of this test"
        raise Exception(error_msg) from ex

    # prepare k8s yaml file for the job
    job_file = ObjectConfFile("log_reader", [job_dict], project, tmp_path)
    # deploy the job, starting the log reader pods
    logger.info(
        "starting log reader data validation job to fully check the log data",
    )
    job_file.create()
    # wait for the logreader job to complete (this should be rather quick)
    try:
        job.wait_for_job_completion(
            job_name=job_dict["metadata"]["name"],
            namespace=project.namespace,
            timeout=300,
            sleep_time=30,
        )
    except exceptions.TimeoutExpiredError:
        error_msg = (
            "verification failed to complete in time: data loss or broken cluster?"
        )
        logger.exception(error_msg)
    # and then check that the job completed with success
    logger.info("checking the result of data validation job")
    logger.debug(job_file.describe())
    ocp_job = ocp.OCP(
        kind="Job",
        namespace=project.namespace,
        resource_name=job_dict["metadata"]["name"],
    )
    job_status = ocp_job.get()["status"]
    logger.info("last status of data verification job: %s", job_status)
    if (
        "failed" in job_status
        or job_status["succeeded"] != deploy_dict["spec"]["replicas"]
    ):
        error_msg = "possible data corruption: data verification job failed!"
        logger.error(error_msg)
        job.log_output_of_job_pods(
            job_name=job_dict["metadata"]["name"], namespace=project.namespace
        )
        raise Exception(error_msg)
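# Compact sketch of the corruption check performed above: scan every file in a
# directory for NUL bytes, which should never appear in a plain-text log. The paths
# are generic; only the detection logic mirrors the test.
import os


def find_corrupted_logs(target_dir):
    """Return the names of files in target_dir that contain a null byte."""
    corrupted = []
    for file_name in os.listdir(target_dir):
        with open(os.path.join(target_dir, file_name), "rb") as handle:
            if b"\0" in handle.read():
                corrupted.append(file_name)
    return corrupted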
def test_rwo_pvc_fencing_node_short_network_failure(
    self, nodes, setup, node_restart_teardown
):
    """
    OCS-1423/OCS-1428/OCS-1426:
    - Start DeploymentConfig based app pods on 1 OCS/Non-OCS node
    - Make the node (where app pods are running) unresponsive
        by bringing its main network interface down
    - Check new app pods and/or mon, osd pods scheduled on another node
        are stuck due to Multi-Attach error.
    - Reboot the unresponsive node
    - When unresponsive node recovers, run IOs on new app pods

    OCS-1424/OCS-1434:
    - Start DeploymentConfig based app pods on multiple node
        Colocated scenario: Select 1 node where osd and/or mon is running,
            select other 2 nodes where mon/osd are not running
        Dedicated scenario: 3 Non-OCS nodes
    - Disrupt the leader provisioner pods if not running on above selected nodes
    - Make the nodes (where app pods are running) unresponsive
        by bringing their main network interface down
    - Check new app pods and/or mon, osd pods scheduled on another node
        are stuck due to Multi-Attach error.
    - Reboot the unresponsive nodes
    - When unresponsive nodes recover, run IOs on new app pods

    """
    ceph_cluster, dc_pods, ceph_pods, app_pod_nodes, test_nodes, disruptor = setup

    # Run IO on pods
    md5sum_data = self.run_and_verify_io(
        pod_list=dc_pods, fio_filename="io_file1", run_io_in_bg=True
    )

    # OCS-1424/OCS-1434
    # Disrupt leader plugin-provisioner pods, skip if running on node to be failed
    if disruptor:
        [disruption.delete_resource() for disruption in disruptor]

    # Induce network failure on the nodes
    node.node_network_failure(app_pod_nodes)
    logger.info(f"Waiting for {self.short_nw_fail_time} seconds")
    sleep(self.short_nw_fail_time)

    # Wait for pods to be rescheduled
    for pod_obj in dc_pods + ceph_pods:
        pod_obj.ocp.wait_for_resource(
            condition=constants.STATUS_TERMINATING,
            resource_name=pod_obj.name,
            timeout=600,
            sleep=30,
        )

    # Fetch info of new pods and verify Multi-Attach error
    new_dc_pods = self.get_new_pods(dc_pods)
    assert len(new_dc_pods) == len(dc_pods), "Unexpected number of app pods"
    self.verify_multi_attach_error(new_dc_pods)

    if ceph_pods:
        new_ceph_pods = self.get_new_pods(ceph_pods)
        assert len(new_ceph_pods) > 0, "Unexpected number of osd pods"
        self.verify_multi_attach_error(new_ceph_pods)

    # Reboot the unresponsive node(s)
    logger.info(f"Rebooting the unresponsive node(s): {app_pod_nodes}")
    nodes.restart_nodes_by_stop_and_start(node.get_node_objs(app_pod_nodes))
    node.wait_for_nodes_status(
        node_names=app_pod_nodes, status=constants.NODE_READY
    )

    # Wait for new app pods to reach Running state
    for pod_obj in new_dc_pods:
        pod_obj.ocp.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            resource_name=pod_obj.name,
            timeout=1200,
            sleep=30,
        ), f"App pod with name {pod_obj.name} did not reach Running state"

    if not helpers.storagecluster_independent_check():
        # Wait for mon and osd pods to reach Running state
        selectors_to_check = {
            constants.MON_APP_LABEL: ceph_cluster.mon_count,
            constants.OSD_APP_LABEL: ceph_cluster.osd_count,
        }
        for selector, count in selectors_to_check.items():
            assert ceph_cluster.POD.wait_for_resource(
                condition=constants.STATUS_RUNNING,
                selector=selector,
                resource_count=count,
                timeout=1800,
                sleep=60,
            ), f"{count} expected pods with selector {selector} are not in Running state"

        assert ceph_health_check(), "Ceph cluster health is not OK"
        logger.info("Ceph cluster health is OK")

    # Verify data integrity from new pods
    for num, pod_obj in enumerate(new_dc_pods):
        assert pod.verify_data_integrity(
            pod_obj=pod_obj, file_name="io_file1", original_md5sum=md5sum_data[num]
        ), "Data integrity check failed"

    # Run IO on new pods
    self.run_and_verify_io(
        pod_list=new_dc_pods, fio_filename="io_file2", return_md5sum=False
    )
def _create_backingstore(method, uls_dict):
    """
    Tracks creation and cleanup of all the backing stores that were created in the scope

    Args:
        method (str): String for selecting method of backing store creation (CLI/OC)
        uls_dict (dict): Dictionary containing storage provider as key and a list of
            tuples as value.
            Cloud backing stores form - 'CloudName': [(amount, region), (amount, region)]
            i.e. - 'aws': [(3, us-west-1),(2, eu-west-2)]
            PV form - 'pv': [(amount, size_in_gb, storagecluster), ...]
            i.e. - 'pv': [(3, 32, ocs-storagecluster-ceph-rbd),(2, 100, ocs-storagecluster-ceph-rbd)]

    Returns:
        list: A list of the created BackingStore objects

    """
    if method.lower() not in cmdMap:
        raise RuntimeError(
            f"Invalid method type received: {method}. "
            f'available types: {", ".join(cmdMap.keys())}'
        )
    for cloud, uls_lst in uls_dict.items():
        for uls_tup in uls_lst:
            # Todo: Replace multiple .append calls, create names in advance, according to amount
            if cloud.lower() not in cmdMap[method.lower()]:
                raise RuntimeError(
                    f"Invalid cloud type received: {cloud}. "
                    f'available types: {", ".join(cmdMap[method.lower()].keys())}'
                )
            if cloud == "pv":
                vol_num, size, storagecluster = uls_tup
                if (
                    storagecluster == constants.DEFAULT_STORAGECLASS_RBD
                    and storagecluster_independent_check()
                ):
                    storagecluster = constants.DEFAULT_EXTERNAL_MODE_STORAGECLASS_RBD
                backingstore_name = create_unique_resource_name(
                    resource_description="backingstore", resource_type=cloud.lower()
                )
                created_backingstores.append(
                    BackingStore(
                        name=backingstore_name,
                        method=method.lower(),
                        type="pv",
                        mcg_obj=mcg_obj,
                        vol_num=vol_num,
                        vol_size=size,
                    )
                )
                if method.lower() == "cli":
                    cmdMap[method.lower()][cloud.lower()](
                        mcg_obj, backingstore_name, vol_num, size, storagecluster
                    )
                else:
                    cmdMap[method.lower()][cloud.lower()](
                        backingstore_name, vol_num, size, storagecluster
                    )
            else:
                _, region = uls_tup
                uls_dict = cloud_uls_factory({cloud: [uls_tup]})
                for uls_name in uls_dict[cloud.lower()]:
                    backingstore_name = create_unique_resource_name(
                        resource_description="backingstore",
                        resource_type=cloud.lower(),
                    )
                    created_backingstores.append(
                        BackingStore(
                            name=backingstore_name,
                            method=method.lower(),
                            type="cloud",
                            uls_name=uls_name,
                            mcg_obj=mcg_obj,
                        )
                    )
                    if method.lower() == "cli":
                        cmdMap[method.lower()][cloud.lower()](
                            mcg_obj, cld_mgr, backingstore_name, uls_name, region
                        )
                    elif method.lower() == "oc":
                        cmdMap[method.lower()][cloud.lower()](
                            cld_mgr, backingstore_name, uls_name, region
                        )
                    mcg_obj.check_backingstore_state(
                        backingstore_name, constants.BS_OPTIMAL
                    )
                    # TODO: Verify OC\CLI BS health by using the appropriate methods

    return created_backingstores
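# Hedged sketch of the method/provider dispatch used by _create_backingstore(): a
# nested mapping from creation method ("cli"/"oc") and provider name to a callable.
# The callables here are stubs; the real cmdMap wires in the framework's own
# creation helpers.
def _noop_create(*args, **kwargs):
    """Stand-in for a provider-specific backing store creation function."""
    return None


example_cmd_map = {
    "cli": {"aws": _noop_create, "pv": _noop_create},
    "oc": {"aws": _noop_create, "pv": _noop_create},
}


def dispatch_create(method, cloud, *args, **kwargs):
    """Validate method/cloud and call the matching creation function."""
    if method.lower() not in example_cmd_map:
        raise RuntimeError(f"Invalid method type received: {method}")
    if cloud.lower() not in example_cmd_map[method.lower()]:
        raise RuntimeError(f"Invalid cloud type received: {cloud}")
    return example_cmd_map[method.lower()][cloud.lower()](*args, **kwargs)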