def cleanup(self):
    """
    Tear down the resources created by the test.

    For every entry in ``self.namespace_list`` the pods and the PVCs of
    that namespace are deleted in parallel and then the namespace itself is
    deleted. Afterwards the scale label is removed from the labeled worker
    nodes and every machineset named in ``self.ms_name`` is deleted.
    """
    for ns_entry in self.namespace_list:
        ns_name = ns_entry.namespace

        # Pods first, then PVCs, then the namespace that held them
        pods_to_remove = pod.get_all_pods(namespace=ns_name)
        delete_objs_parallel(
            obj_list=pods_to_remove, namespace=ns_name, kind=self.kind
        )
        pvcs_to_remove = pvc.get_all_pvc_objs(namespace=ns_name)
        delete_objs_parallel(
            obj_list=pvcs_to_remove, namespace=ns_name, kind=constants.PVC
        )
        OCP(kind=constants.NAMESPACE).delete(resource_name=ns_name)

    # Remove scale label from worker nodes in cleanup
    labeled_workers = machine.get_labeled_nodes(constants.SCALE_LABEL)
    helpers.remove_label_from_worker_node(
        node_list=labeled_workers, label_key="scale-label"
    )

    # Delete machineset which will delete respective nodes too for
    # aws-ipi platform
    for machineset_name in self.ms_name or ():
        machine.delete_custom_machineset(machineset_name)
def test_delete_rook_ceph_osd_deployment(self):
    """
    Delete each rook-ceph-osd deployment and verify that it is restored.

    For every OSD deployment: remember the OSD pod that currently matches
    the deployment name, delete the deployment, wait until the deployment
    reports READY ``1/1`` again, and assert that a different OSD pod exists
    and reaches Running state.
    """
    storage_ns = constants.OPENSHIFT_STORAGE_NAMESPACE
    osd_deployments = get_osd_deployments()
    deployment_ocp = OCP(kind=constants.DEPLOYMENT, namespace=storage_ns)
    pod_ocp = OCP(kind=constants.POD, namespace=storage_ns)

    for deployment in osd_deployments:
        dep_name = deployment.name

        # Get rook-ceph-osd pod name associated with the deployment
        pod_before = get_pod_name_by_pattern(
            pattern=dep_name, namespace=storage_ns
        )[0]

        logger.info(f"Deleting OSD deployment: {dep_name}")
        try:
            deployment_ocp.delete(resource_name=dep_name)
            deployment_ocp.wait_for_resource(
                condition="0/1", resource_name=dep_name, column="READY"
            )
        except CommandFailed as err:
            # A "NotFound" failure is tolerated, anything else is re-raised
            if "NotFound" not in str(err):
                raise

        # Wait for new OSD deployment to be Ready
        deployment_ocp.wait_for_resource(
            condition="1/1", resource_name=dep_name, column="READY"
        )

        # Check if a new OSD pod is created
        pod_after = get_pod_name_by_pattern(
            pattern=dep_name, namespace=storage_ns
        )[0]
        assert pod_before != pod_after, "New OSD pod not created"

        # Check if new OSD pod is up and running
        logger.info(
            "Waiting for a new OSD pod to get created and reach Running state"
        )
        pod_is_running = pod_ocp.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            resource_name=pod_after,
            column="STATUS",
        )
        assert (
            pod_is_running
        ), f"New OSD pod {pod_after} is not in {constants.STATUS_RUNNING} state"

        # NOTE(review): the flattened source does not show whether the two
        # checks below ran per deployment or once after the loop; they are
        # kept per deployment here - confirm against the original layout.

        # If clusterwide encryption is enabled, verify that the new OSDs
        # are encrypted
        if config.ENV_DATA.get("encryption_at_rest"):
            osd_encryption_verification()

        assert ceph_health_check(delay=120, tries=50), "Ceph health check failed"
def delete_machine(machine_name):
    """
    Delete a machine resource from the machine-API namespace.

    Args:
        machine_name (str): Name of the machine you want to delete

    Any exception raised by the underlying ``OCP.delete`` call propagates
    to the caller. Nothing is returned.
    """
    machine_ocp = OCP(
        kind="machine",
        namespace=constants.OPENSHIFT_MACHINE_API_NAMESPACE,
    )
    log.info(f"Deleting machine {machine_name}")
    machine_ocp.delete(resource_name=machine_name)
def delete_all_pvcs(namespace=None):
    """
    Delete every PVC found in a namespace.

    Args:
        namespace (str): Name of namespace; when empty or None the
            ``cluster_namespace`` entry of ``config.ENV_DATA`` is used

    Returns:
        bool: True once a delete was issued for every listed PVC

    """
    namespace = namespace or config.ENV_DATA['cluster_namespace']
    pvc_ocp = OCP(kind=constants.PVC, namespace=namespace)
    # Delete the PVCs one at a time, addressed by their metadata name
    for pvc_entry in get_all_pvcs(namespace=namespace)['items']:
        pvc_ocp.delete(resource_name=pvc_entry.get('metadata').get('name'))
    return True
def finalizer():
    """
    Bring the rook-ceph-operator and the mon pods back at teardown.

    If the rook-ceph-operator deployment does not have exactly one replica
    it is scaled back to 1. The mon pods are then given 60 seconds to reach
    Running state; if they do not, the operator deployment and all mon pods
    are deleted and the mons get another 360 seconds to come up.

    Raises:
        AssertionError: If scaling the rook-ceph-operator back to 1 fails

    """
    storage_ns = constants.OPENSHIFT_STORAGE_NAMESPACE
    op_obj = OCP(kind=constants.DEPLOYMENT, namespace=storage_ns)
    pod_obj = OCP(kind=constants.POD, namespace=storage_ns)

    operator_obj = op_obj.get(resource_name=constants.ROOK_CEPH_OPERATOR)
    if operator_obj.get("spec").get("replicas") != 1:
        # The result must be asserted: as a bare expression followed by a
        # message string it was only a discarded tuple, so a failed
        # scale-up went unnoticed.
        assert modify_deployment_replica_count(
            deployment_name=constants.ROOK_CEPH_OPERATOR, replica_count=1
        ), "Failed to scale up rook-ceph-operator to 1"

    log.info("Validate all mons are up and running")
    try:
        pod_obj.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            selector=constants.MON_APP_LABEL,
            resource_count=3,
            timeout=60,
            sleep=5,
        )
    except (TimeoutExpiredError, ResourceWrongStatusException) as ex:
        log.warning(ex)
        # Recovery path: delete the operator deployment and every mon pod,
        # then wait (longer) for three running mons again.
        op_obj.delete(resource_name=constants.ROOK_CEPH_OPERATOR)
        for mon_pod in get_mon_pods():
            mon_pod.delete()
        pod_obj.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            selector=constants.MON_APP_LABEL,
            resource_count=3,
            timeout=360,
            sleep=5,
        )
    log.info("All mons are up and running")
class ElasticSearch(object):
    """
    ElasticSearch Environment

    Creating an instance deploys the ECK operator, an Elasticsearch cluster
    and a dumper client pod into the ``elastic-system`` namespace and then
    opens a client connection to the cluster.
    """

    def __init__(self):
        """
        Initializer function

        Deploys ECK and the Elasticsearch cluster, polls the cluster health
        every 30 seconds for up to 600 seconds, deploys the dumper client
        and finally connects to the server.
        """
        log.info("Initializing the Elastic-Search environment object")
        self.namespace = "elastic-system"
        self.eck_file = "ocs_ci/templates/app-pods/eck.1.3.1-all-in-one.yaml"
        self.dumper_file = "ocs_ci/templates/app-pods/esclient.yaml"
        self.pvc = "ocs_ci/templates/app-pods/es-pvc.yaml"
        self.crd = "ocs_ci/templates/app-pods/esq.yaml"

        # Creating some different types of OCP objects
        self.ocp = OCP(
            kind="pod",
            resource_name="elastic-operator-0",
            namespace=self.namespace,
        )
        self.ns_obj = OCP(kind="namespace", namespace=self.namespace)
        self.es = OCP(resource_name="quickstart-es-http", namespace=self.namespace)
        self.elasticsearch = OCP(namespace=self.namespace, kind="elasticsearch")
        self.password = OCP(
            kind="secret",
            resource_name="quickstart-es-elastic-user",
            namespace=self.namespace,
        )

        # Deploy the ECK all-in-one.yaml file
        self._deploy_eck()
        # Deploy the Elastic-Search server
        self._deploy_es()

        # Verify that ES is Up & Running
        health_timeout = 600
        remaining = health_timeout
        while remaining > 0:
            if self.get_health():
                log.info("The ElasticSearch server is ready !")
                break
            log.warning("The ElasticSearch server is not ready yet")
            log.info("going to sleep for 30 sec. before next check")
            time.sleep(30)
            remaining -= 30
        else:
            # Make the timeout visible instead of continuing silently; the
            # deployment still proceeds as before.
            log.error(
                f"The ElasticSearch server did not report green health "
                f"within {health_timeout} sec."
            )

        self._deploy_data_dumper_client()

        # Connect to the server
        self.con = self._es_connect()

    def _wait_for_pod(self, pattern, sleep, not_ready_msg):
        """
        Sample the pod names matching ``pattern`` until one shows up.

        Args:
            pattern (str): Pod name pattern to look for in the namespace
            sleep (int): Seconds between two samples (overall timeout is
                300 seconds)
            not_ready_msg (str): Message logged while no pod matches

        Returns:
            str: Name of the first matching pod, or None if the sampler
                ends without yielding one

        """
        for pod_names in TimeoutSampler(
            300, sleep, get_pod_name_by_pattern, pattern, self.namespace
        ):
            try:
                if pod_names[0] is not None:
                    return pod_names[0]
            except IndexError:
                log.info(not_ready_msg)
        return None

    def _deploy_eck(self):
        """
        Deploying the ECK environment for the Elasticsearch, and wait until
        the operator pod exists

        """
        log.info("Deploying the ECK environment for the ES cluster")
        self.ocp.apply(self.eck_file)
        pod_name = self._wait_for_pod(
            "elastic-operator", 10, "ECK operator pod not ready yet"
        )
        if pod_name is not None:
            self.eckpod = pod_name
            log.info(f"The ECK pod {self.eckpod} is ready !")

    def _deploy_data_dumper_client(self):
        """
        Deploying elastic search client pod with utility which dump all the
        data from the server to .tgz file

        """
        log.info("Deploying the es client for dumping all data")
        self.ocp.apply(self.dumper_file)
        pod_name = self._wait_for_pod("es-dumper", 10, "Dumper pod not ready yet")
        if pod_name is not None:
            self.dump_pod = pod_name
            log.info(f"The dumper client pod {self.dump_pod} is ready !")

    def get_ip(self):
        """
        This function return the IP address of the Elasticsearch cluster.
        this IP is to use inside the OCP cluster

        Return
            str : The ``clusterIP`` of the ``quickstart-es-http`` resource

        """
        return self.es.get()["spec"]["clusterIP"]

    def get_port(self):
        """
        This function return the port of the Elasticsearch cluster.

        Return
            The ``port`` value of the first port entry of the
            ``quickstart-es-http`` resource

        """
        return self.es.get()["spec"]["ports"][0]["port"]

    def _deploy_es(self):
        """
        Deploy the PVC and the Elasticsearch cluster and wait until the
        Elasticsearch pod is in Running state

        """
        log.info("Deploy the PVC for the ElasticSearch cluster")
        self.ocp.apply(self.pvc)
        log.info("Deploy the ElasticSearch cluster")
        self.ocp.apply(self.crd)

        pod_name = self._wait_for_pod(
            "quickstart-es-default", 20, "elasticsearch pod not ready yet"
        )
        if pod_name is not None:
            self.espod = pod_name
            log.info(f"The ElasticSearch pod {self.espod} Started")

        es_pod = OCP(kind="pod", namespace=self.namespace)
        log.info("Waiting for ElasticSearch to Run")
        assert es_pod.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            resource_name=self.espod,
            sleep=30,
            timeout=600,
        )
        log.info("Elastic Search is ready !!!")

    def get_health(self):
        """
        This method return the health status of the Elasticsearch.

        Returns:
            bool : True if the status is green (OK) otherwise - False.
                A resource that does not report any health yet (no items,
                no ``status`` or no ``health`` field) counts as not healthy.

        """
        resource = self.elasticsearch.get()
        try:
            return resource["items"][0]["status"]["health"] == "green"
        except (KeyError, IndexError, TypeError):
            # Health is not published yet - callers poll this method, so
            # report "not ready" rather than raising.
            return False

    def get_password(self):
        """
        This method return the password used to connect the Elasticsearch.

        Returns:
            str : The password as text (base64-decoded ``elastic`` entry of
                the secret data)

        """
        encoded = self.password.get()["data"]["elastic"]
        return base64.b64decode(encoded).decode("utf-8")

    def cleanup(self):
        """
        Cleanup the environment from all Elasticsearch components: the
        dumper client, the ES resource and the project itself.

        """
        log.info("Teardown the Elasticsearch environment")
        log.info("Deleting all resources")
        log.info("Deleting the dumper client pod")
        self.ocp.delete(yaml_file=self.dumper_file)
        log.info("Deleting the es resource")
        self.ocp.delete(yaml_file=self.crd)
        log.info("Deleting the es project")
        self.ns_obj.delete_project(project_name=self.namespace)
        self.ns_obj.wait_for_delete(resource_name=self.namespace, timeout=180)

    def _es_connect(self):
        """
        Create a connection to the local ES

        Returns:
            Elasticsearch: elasticsearch connection object

        Raise:
            ConnectionError: if can not connect to the server

        """
        try:
            es = Elasticsearch([{"host": self.get_ip(), "port": self.get_port()}])
        except esexp.ConnectionError:
            log.error("Can not connect to ES server in the LocalServer")
            raise
        return es

    def get_indices(self):
        """
        Getting list of all indices in the ES server - all created by the
        test, the installation of the ES was without any indexes
        pre-installed.

        Returns:
            list : list of all indices defined in the ES server

        """
        log.info("Getting all indices")
        return list(self.con.indices.get_alias("*"))

    def _copy(self, es):
        """
        Copy All data from the internal ES server to the main ES.

        **This is deprecated function** , use the dump function, and load
        the data from the files for the main ES server

        Args:
            es (obj): elasticsearch object which connected to the main ES

        """
        # At most 1000 documents are requested per index
        query = {"size": 1000, "query": {"match_all": {}}}
        for ind in self.get_indices():
            log.info(f"Reading {ind} from internal ES server")
            try:
                result = self.con.search(index=ind, body=query)
            except esexp.NotFoundError:
                log.warning(f"{ind} Not found in the Internal ES.")
                continue

            log.debug(f"The results from internal ES for {ind} are :{result}")
            log.info(f"Writing {ind} into main ES server")
            for doc in result["hits"]["hits"]:
                log.debug(f"Going to write : {doc}")
                es.index(index=ind, doc_type="_doc", body=doc["_source"])

    def dumping_all_data(self, target_path):
        """
        Dump All data from the internal ES server to .tgz file.

        Args:
            target_path (str): the path where the results file will be copy
                into

        Return:
            bool: True if the dump operation succeed and return the results
                data to the host otherwise False

        """
        log.info("dumping data from ES server to .tgz file")
        rsh_cmd = f"rsh {self.dump_pod} /elasticsearch-dump/esdumper.py --ip {self.get_ip()} --port {self.get_port()}"
        result = self.ocp.exec_oc_cmd(rsh_cmd, out_yaml_format=False, timeout=1200)
        if "ES dump is done." not in result:
            log.error("There is no data in the Elasticsearch server")
            return False

        # The last word of the dumper output is taken as the dump file path
        src_file = result.split()[-1]
        log.info(f"Copy {src_file} from the client pod")
        cp_command = f"cp {self.dump_pod}:{src_file} {target_path}/FullResults.tgz"
        result = self.ocp.exec_oc_cmd(cp_command, timeout=120)
        log.info(f"The output from the POD is {result}")

        log.info("Extracting the FullResults.tgz file")
        results = run_command(f"tar zxvf {target_path}/FullResults.tgz", cwd=target_path)
        log.debug(f"The untar results is {results}")
        if "Error in command" in results:
            log.warning("Can not untar the dumped file")
            return False
        return True
class OCS(object):
    """
    Base OCSClass

    Wraps the dictionary of a single resource together with an ``OCP``
    object for its kind/namespace and a temporary yaml file used by
    ``create`` and ``apply``.
    """

    def __init__(self, **kwargs):
        """
        Initializer function

        Args:
            kwargs (dict):
                1) For existing resource, use OCP.reload() to get the
                resource's dictionary and use it to pass as **kwargs
                2) For new resource, use yaml files templates under
                /templates/CSI like:
                obj_dict = load_yaml(
                    os.path.join(
                        TEMPLATE_DIR, "some_resource.yaml"
                        )
                    )
        """
        self.data = kwargs
        self._api_version = self.data.get('api_version')
        self._kind = self.data.get('kind')
        self._namespace = None
        # Always defined, so that the ``name`` property cannot raise
        # AttributeError for data without a 'metadata' section
        self._name = None
        if 'metadata' in self.data:
            metadata = self.data.get('metadata')
            self._namespace = metadata.get('namespace')
            self._name = metadata.get('name')
        self.ocp = OCP(
            api_version=self._api_version,
            kind=self.kind,
            namespace=self._namespace,
        )
        # delete=False: the file is kept on disk after it is closed; it is
        # removed explicitly through delete_temp_yaml_file()
        self.temp_yaml = tempfile.NamedTemporaryFile(
            mode='w+', prefix=self._kind, delete=False
        )
        # This _is_deleted flag is set to True if the delete method was
        # called on object of this class and was successful.
        self._is_deleted = False

    @property
    def api_version(self):
        return self._api_version

    @property
    def kind(self):
        return self._kind

    @property
    def namespace(self):
        return self._namespace

    @property
    def name(self):
        return self._name

    @property
    def is_deleted(self):
        return self._is_deleted

    def reload(self):
        """
        Reloading the OCS instance with the new information from its actual
        data.
        After creating a resource from a yaml file, the actual yaml file is
        being changed and more information about the resource is added.

        The instance is re-initialized from the fetched data, which also
        creates a new temporary yaml file and resets the ``is_deleted``
        flag.
        """
        self.data = self.get()
        self.__init__(**self.data)

    def get(self, out_yaml_format=True):
        """
        Fetch this resource (by its name) through the OCP object.
        """
        return self.ocp.get(
            resource_name=self.name, out_yaml_format=out_yaml_format
        )

    def describe(self):
        """
        Describe this resource (by its name) through the OCP object.
        """
        return self.ocp.describe(resource_name=self.name)

    def create(self, do_reload=True):
        """
        Dump the resource data to the temporary yaml file and create the
        resource from it.

        Args:
            do_reload (bool): Reload the instance after the creation

        Returns:
            The result of ``OCP.create``

        """
        log.info(f"Adding {self.kind} with name {self.name}")
        templating.dump_data_to_temp_yaml(self.data, self.temp_yaml.name)
        status = self.ocp.create(yaml_file=self.temp_yaml.name)
        if do_reload:
            self.reload()
        return status

    def delete(self, wait=True, force=False):
        """
        Delete the OCS object if its not already deleted
        (using the internal is_deleted flag)

        Args:
            wait (bool): Wait for object to be deleted
            force (bool): Force delete object

        Returns:
            bool: True if deleted, False otherwise (including the refusal
                to delete one of the default storage classes)

        """
        # Avoid accidental delete of default storageclass and secret
        protected_names = (
            constants.DEFAULT_STORAGECLASS_CEPHFS,
            constants.DEFAULT_STORAGECLASS_RBD,
        )
        if self.name in protected_names:
            log.info("Attempt to delete default Secret or StorageClass")
            return False

        if self._is_deleted:
            log.info(
                f"Attempt to remove resource: {self.name} which is "
                f"already deleted! Skipping delete of this resource!"
            )
            result = True
        else:
            result = self.ocp.delete(
                resource_name=self.name, wait=wait, force=force
            )
        self._is_deleted = True
        return result

    def apply(self, **data):
        """
        Write ``data`` to the temporary yaml file, apply it and reload the
        instance.

        Raises:
            AssertionError: If ``OCP.apply`` returns a falsy result

        """
        with open(self.temp_yaml.name, 'w') as yaml_file:
            yaml.dump(data, yaml_file)
        assert self.ocp.apply(
            yaml_file=self.temp_yaml.name
        ), f"Failed to apply changes {data}"
        self.reload()

    def add_label(self, label):
        """
        Adds a new label

        Args:
            label (str): New label to be assigned for this pod
                E.g: "label=app='rook-ceph-mds'"

        Returns:
            The result of ``OCP.add_label``

        """
        status = self.ocp.add_label(resource_name=self.name, label=label)
        self.reload()
        return status

    def delete_temp_yaml_file(self):
        """
        Remove the temporary yaml file of this instance from disk.
        """
        utils.delete_file(self.temp_yaml.name)
def test_multiple_mon_pod_stays_on_same_node(self):
    """
    A testcase to verify multiple mon pods stays on same node

    1. Edit the rook-ceph-mon-endpoints configmap
       say, assign mon-a to another node that would be on
       the same node as another mon (compute-1 instead of compute-0)
    2. Delete the mon-a deployment
    3. Edit the mon-b deployment to remove the required mon anti-affinity
    4. Restart the operator
    5. Edit the mon-a deployment to remove the required mon anti-affinity
    6. See mon-a start on compute-1 with mon-b
    7. Soon after, see the operator failover one of these mons onto the
       node that doesn't currently have a mon (compute-0) and start mon-d

    """
    ocs_version = config.ENV_DATA["ocs_version"]
    # Check that we have LSO cluster and OCS version is 4.8 and below
    # This is a workaround due to issue https://github.com/red-hat-storage/ocs-ci/issues/4937
    if not (
        is_lso_cluster() and Version.coerce(ocs_version) <= Version.coerce("4.8")
    ):
        pytest.skip(
            "Skip the test because mons are not node assignment from Rook, if cluster is not "
            "LSO based. And also currently, we want to run the test only with OCS 4.8 and "
            "below. This is a workaround due to issue "
            "https://github.com/red-hat-storage/ocs-ci/issues/4937"
        )

    # Initialize
    rook_ceph_mon = "rook-ceph-mon"
    # JSON patch that drops the whole affinity section of a deployment
    remove_affinity_patch = (
        '[{"op": "remove", "path": "/spec/template/spec/affinity"}]'
    )

    # Get mons running on pod
    mon_pods = get_mon_pods()
    mon_name_to_del = mon_pods[0].get().get("metadata").get("labels").get("mon")
    mon_name_to_edit = mon_pods[1].get().get("metadata").get("labels").get("mon")
    mon_node = get_pod_node(mon_pods[1])

    # Edit the rook-ceph-mon-endpoints: give the mon that is going to be
    # deleted the node mapping of the other mon
    log.info(f"Edit the configmap {ROOK_CEPH_MON_ENDPOINTS}")
    configmap_obj = OCP(kind=CONFIGMAP, namespace=OPENSHIFT_STORAGE_NAMESPACE)
    rook_ceph_mon_configmap = configmap_obj.get(
        resource_name=ROOK_CEPH_MON_ENDPOINTS
    )
    json_val = json.loads(rook_ceph_mon_configmap["data"]["mapping"])
    json_val["node"][mon_name_to_del].update(json_val["node"][mon_name_to_edit])
    rook_ceph_mon_configmap["data"]["mapping"] = json.dumps(json_val)
    new_data = rook_ceph_mon_configmap["data"]
    params = f'{{"data": {json.dumps(new_data)}}}'
    configmap_obj.patch(
        resource_name=ROOK_CEPH_MON_ENDPOINTS,
        params=params,
        format_type="strategic",
    )
    log.info(f"Configmap {ROOK_CEPH_MON_ENDPOINTS} edited successfully")
    log.info(
        f"Rook-ceph-mon-endpoints updated configmap: {rook_ceph_mon_configmap}"
    )

    # Delete one mon deployment which had been edited
    dep_obj = OCP(kind=DEPLOYMENT, namespace=OPENSHIFT_STORAGE_NAMESPACE)
    mon_deployment_name_to_del = f"{rook_ceph_mon}-{mon_name_to_del}"
    log.info(f"Deleting mon {mon_deployment_name_to_del} deployments")
    dep_obj.delete(resource_name=mon_deployment_name_to_del)

    # Edit other mon deployment to remove mon anti-affinity
    mon_deployment_name_to_edit = f"{rook_ceph_mon}-{mon_name_to_edit}"
    log.info(
        f"Edit mon {mon_deployment_name_to_edit} deployment "
        "to remove the required mon anti-affinity"
    )
    dep_obj.patch(
        resource_name=mon_deployment_name_to_edit,
        params=remove_affinity_patch,
        format_type="json",
    )
    log.info(
        f"Successfully removed defined mon anti-affinity {mon_deployment_name_to_edit}"
    )

    # Restart operator
    operator_pod_obj = get_operator_pods()
    delete_pods(pod_objs=operator_pod_obj)
    POD_OBJ.wait_for_resource(condition=STATUS_RUNNING, selector=OPERATOR_LABEL)

    # Validate deleted deployment mon came up and in pending state
    # Initially mon stucks in pending state, remove defined anti-affinity
    POD_OBJ.wait_for_resource(
        condition=STATUS_PENDING,
        resource_count=1,
        selector=MON_APP_LABEL,
        timeout=1200,
    )

    # Edit mon deployment to remove mon anti-affinity
    log.info(
        f"Edit mon {mon_deployment_name_to_del} deployment "
        "to remove the required mon anti-affinity"
    )
    dep_obj.patch(
        resource_name=mon_deployment_name_to_del,
        params=remove_affinity_patch,
        format_type="json",
    )
    log.info(
        f"Successfully removed defined mon anti-affinity {mon_deployment_name_to_del}"
    )

    # Validate mon pod moved to another node such that 2 mons are running on same node
    log.info("Waiting for 5 seconds for mon recovery")
    time.sleep(5)
    new_mon_pods = get_mon_pods()
    new_node = [
        get_pod_node(mon)
        for mon in new_mon_pods
        if mon.get().get("metadata").get("labels").get("mon") == mon_name_to_del
    ]
    # Fail with a clear message instead of an IndexError when the recreated
    # mon pod is not listed yet
    assert new_node, f"No mon pod labeled mon={mon_name_to_del} was found"
    assert new_node[0].name == mon_node.name, (
        f"Mon {mon_name_to_del} runs on node {new_node[0].name} instead of "
        f"node {mon_node.name}, so 2 mons are not running on same node"
    )

    # Verify rook deletes one of the mon and move to another node
    timeout = 60
    log.info(f"Waiting for {timeout} seconds for mon recovery")
    time.sleep(timeout)

    POD_OBJ.wait_for_resource(
        condition=STATUS_RUNNING,
        resource_count=len(mon_pods),
        selector=MON_APP_LABEL,
        timeout=3600,
        sleep=5,
    )
    log.info(
        "Mons are up and running state and validate are running on different nodes"
    )
    mon_pods_running_on_same_node()
def test_pvc_creation_after_del_mon_services(self, interface, pod_factory):
    """
    1. Delete one mon service
    2. Edit the configmap rook-ceph-endpoints
       remove all the deleted mon services entries
    3. Delete deployment, pvc of deleted mon service
    4. Restart rook-ceph-operator
    5. Make sure all mon pods are running
    6. Make sure ceph health Ok and storage pods are running
    7. Sleep for 300 seconds before deleting another mon
    8. Repeat above steps for all mons and at the
       end each mon should contain different endpoints
    9. Create PVC, should succeeded.

    """

    def _drop_entry(text, entry):
        """
        Remove ``entry`` together with its adjoining comma from ``text``:
        the trailing comma when present, the leading comma otherwise.
        """
        with_trailing_comma = f"{entry},"
        if with_trailing_comma in text:
            return text.replace(with_trailing_comma, "")
        return text.replace(f",{entry}", "")

    pod_obj = pod_factory(interface=interface)
    run_io_in_bg(pod_obj)

    # Get all mon services
    mon_svc = get_services_by_label(
        label=constants.MON_APP_LABEL,
        namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
    )

    # Get all mon pods
    mon_pods = get_mon_pods()
    mon_count = len(mon_pods)

    list_old_svc = []
    for svc in mon_svc:

        # Get rook-ceph-operator pod obj
        operator_pod_obj = get_operator_pods()
        operator_name = operator_pod_obj[0].name

        # Scale down rook-ceph-operator
        log.info("Scale down rook-ceph-operator")
        assert modify_deployment_replica_count(
            deployment_name="rook-ceph-operator", replica_count=0
        ), "Failed to scale down rook-ceph-operator to 0"
        log.info("Successfully scaled down rook-ceph-operator to 0")

        # Validate rook-ceph-operator pod not running
        POD_OBJ.wait_for_delete(resource_name=operator_name)

        svc_name = svc["metadata"]["name"]
        cluster_ip = svc["spec"]["clusterIP"]
        port = svc["spec"]["ports"][0]["port"]
        mon_endpoint = f"{cluster_ip}:{port}"
        mon_id = svc["spec"]["selector"]["mon"]
        list_old_svc.append(cluster_ip)

        # Delete deployment (it is looked up under the service name)
        log.info("Delete mon deployments")
        del_obj = OCP(
            kind=constants.DEPLOYMENT,
            namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
        )
        mon_info = del_obj.get(resource_name=svc_name)
        del_obj.delete(resource_name=svc_name)

        # Delete pvc
        if is_lso_cluster():
            # On LSO clusters the mon data directory is removed from the
            # node instead of deleting a PVC
            mon_data_path = f"/var/lib/rook/mon-{mon_id}"
            mon_node = mon_info["spec"]["template"]["spec"]["nodeSelector"][
                "kubernetes.io/hostname"
            ]
            log.info(f"Delete the directory `{mon_data_path}` from {mon_node}")
            cmd = f"rm -rf {mon_data_path}"
            ocp_obj = OCP(namespace=constants.OPENSHIFT_STORAGE_NAMESPACE)
            ocp_obj.exec_oc_debug_cmd(node=mon_node, cmd_list=[cmd])
        else:
            log.info("Delete mon PVC")
            pvc_name = svc["metadata"]["labels"]["pvc_name"]
            pvc_obj = OCP(
                kind=constants.PVC, namespace=constants.OPENSHIFT_STORAGE_NAMESPACE
            )
            pvc_obj.delete(resource_name=pvc_name)

        # Delete the mon service
        log.info("Delete mon service")
        svc_obj = OCP(
            kind=constants.SERVICE, namespace=constants.OPENSHIFT_STORAGE_NAMESPACE
        )
        svc_obj.delete(resource_name=svc_name)

        # Edit the cm
        log.info(f"Edit the configmap {constants.ROOK_CEPH_MON_ENDPOINTS}")
        configmap_obj = OCP(
            kind=constants.CONFIGMAP,
            namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
        )
        output_get = configmap_obj.get(
            resource_name=constants.ROOK_CEPH_MON_ENDPOINTS
        )
        new_data = output_get["data"]
        # The endpoint may be followed or preceded by the separating comma.
        # (A "not found" result of str.find is -1; testing it against 1
        # would leave an endpoint that is last in the list in place.)
        new_data["csi-cluster-config-json"] = _drop_entry(
            new_data["csi-cluster-config-json"], f'"{mon_endpoint}"'
        )
        new_data["data"] = ",".join(
            [
                value
                for value in new_data["data"].split(",")
                if f"{mon_id}=" not in value
            ]
        )
        new_data["mapping"] = _drop_entry(
            new_data["mapping"], f'"{mon_id}":null'
        )
        params = f'{{"data": {json.dumps(new_data)}}}'
        log.info(f"Removing {mon_id} entries from configmap")
        configmap_obj.patch(
            resource_name=constants.ROOK_CEPH_MON_ENDPOINTS,
            params=params,
            format_type="strategic",
        )
        log.info(
            f"Configmap {constants.ROOK_CEPH_MON_ENDPOINTS} edited successfully"
        )

        # Scale up rook-ceph-operator
        log.info("Scale up rook-ceph-operator")
        assert modify_deployment_replica_count(
            deployment_name="rook-ceph-operator", replica_count=1
        ), "Failed to scale up rook-ceph-operator to 1"
        log.info("Successfully scaled up rook-ceph-operator to 1")
        log.info("Validate rook-ceph-operator pod is running")
        POD_OBJ.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            selector=constants.OPERATOR_LABEL,
            resource_count=1,
            timeout=600,
            sleep=5,
        )

        # Validate all mons are running
        log.info("Validate all mons are up and running")
        POD_OBJ.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            selector=constants.MON_APP_LABEL,
            resource_count=mon_count,
            timeout=1200,
            sleep=5,
        )
        log.info("All mons are up and running")

        # Check the ceph health OK
        ceph_health_check(tries=90, delay=15)

        # Validate all storage pods are running
        wait_for_storage_pods()

        # Sleep for some seconds before deleting another mon
        sleep_time = 300
        log.info(f"Waiting for {sleep_time} seconds before deleting another mon")
        time.sleep(sleep_time)

    # Check the endpoints are different
    log.info("Validate the mon endpoints are changed")
    new_mon_svc = get_services_by_label(
        label=constants.MON_APP_LABEL,
        namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
    )
    list_new_svc = [new_svc["spec"]["clusterIP"] for new_svc in new_mon_svc]

    # No cluster IP may appear in both lists: the symmetric difference must
    # then be as large as both lists together
    diff = set(list_new_svc) ^ set(list_old_svc)
    assert len(diff) == len(list_old_svc + list_new_svc), (
        f"Not all endpoints are changed. Set of old "
        f"endpoints {list_old_svc} and new endpoints {list_new_svc}"
    )
    log.info(f"All new mon endpoints are created {list_new_svc}")

    # Create PVC and pods
    log.info(f"Create {interface} PVC")
    pod_obj = pod_factory(interface=interface)
    pod_obj.run_io(storage_type="fs", size="500M")
def test_del_mon_svc(
    self, multi_pvc_factory, validate_all_mon_svc_are_up_at_teardown
):
    """
    Test to verify same mon comes up and running
    after deleting mon services manually and joins the quorum

    1. Delete the mon services
    2. Restart the rook operator
    3. Make sure all mon pods are running,
       and same service or endpoints are running
    4. Make sure ceph health Ok and storage pods are running
    5. Create PVC, should succeeded.

    """
    self.sanity_helpers = Sanity()
    storage_ns = constants.OPENSHIFT_STORAGE_NAMESPACE

    # Snapshot of the mon services and mon pods before anything is touched
    mon_svc_before = get_services_by_label(
        label=constants.MON_APP_LABEL, namespace=storage_ns
    )
    mon_pods = get_mon_pods()

    # Delete the mon services one by one, waiting for each to disappear
    svc_obj = OCP(kind=constants.SERVICE, namespace=storage_ns)
    mon_svc_ip_before = []
    for old_svc in mon_svc_before:
        old_name = old_svc["metadata"]["name"]
        mon_svc_ip_before.append(old_svc["spec"]["clusterIP"])
        log.info(f"Delete mon service {old_name}")
        svc_obj.delete(resource_name=old_name)
        svc_obj.wait_for_delete(resource_name=old_name)

    # Restart the rook-operator pod
    operator_pods = get_operator_pods()
    delete_pods(pod_objs=operator_pods)
    POD_OBJ.wait_for_resource(
        condition=constants.STATUS_RUNNING, selector=constants.OPERATOR_LABEL
    )

    # Verify same mon services are created again
    for old_svc in mon_svc_before:
        svc_obj.check_resource_existence(
            should_exist=True,
            timeout=300,
            resource_name=old_svc["metadata"]["name"],
        )
    log.info("Same old mon services are recreated")

    # Validate all mons are running
    log.info("Validate all mons are up and running")
    POD_OBJ.wait_for_resource(
        condition=constants.STATUS_RUNNING,
        selector=constants.MON_APP_LABEL,
        resource_count=len(mon_pods),
        timeout=600,
        sleep=3,
    )

    # Validate same mon services are running: the set of cluster IPs must
    # be the same before and after
    log.info("Validate same mon services are running")
    mon_svc_after = get_services_by_label(
        label=constants.MON_APP_LABEL, namespace=storage_ns
    )
    mon_svc_ip_after = [new_svc["spec"]["clusterIP"] for new_svc in mon_svc_after]
    assert set(mon_svc_ip_after) == set(mon_svc_ip_before), (
        "Different mon services are running. "
        f"Before mon services list: {mon_svc_ip_before}, "
        f"After mon services list: {mon_svc_ip_after}"
    )
    log.info("Same old mon services are running and all mons are in running state")

    # Verify everything running fine
    log.info("Verifying All resources are Running and matches expected result")
    self.sanity_helpers.health_check(tries=120)

    # Validate all storage pods are running
    wait_for_storage_pods()

    # Create and delete resources
    self.sanity_helpers.create_pvc_delete(multi_pvc_factory=multi_pvc_factory)
def test_noobaa_rebuild(self, bucket_factory):
    """
    Test case to verify noobaa rebuild. Verifies KCS: https://access.redhat.com/solutions/5948631

    1. Stop the noobaa-operator by setting the replicas of noobaa-operator deployment to 0.
    2. Delete the noobaa deployments/statefulsets.
    3. Delete the PVC db-noobaa-db-pg-0.
    4. Patch existing backingstores and bucketclasses to remove finalizer
    5. Delete the backingstores/bucketclass.
    6. Delete the noobaa secrets.
    7. Restart noobaa-operator by setting the replicas back to 1.
    8. Monitor the pods in openshift-storage for noobaa pods to be Running.

    """
    storage_ns = constants.OPENSHIFT_STORAGE_NAMESPACE
    operator_deployment = constants.NOOBAA_OPERATOR_DEPLOYMENT
    # Merge patch that clears the finalizers of a resource
    drop_finalizers = '{"metadata": {"finalizers":null}}'

    def _strip_finalizers_and_delete(ocp_obj, label):
        """
        Patch away the finalizers of every resource listed by ``ocp_obj``
        and delete the resource; ``label`` is used in the messages.
        """
        for item in ocp_obj.get()["items"]:
            item_name = item["metadata"]["name"]
            assert ocp_obj.patch(
                resource_name=item_name,
                params=drop_finalizers,
                format_type="merge",
            ), f"Failed to change the parameter in {label}"
            logger.info(f"Deleting {label}: {item_name}")
            ocp_obj.delete(resource_name=item_name)

    dep_ocp = OCP(kind=constants.DEPLOYMENT, namespace=storage_ns)
    state_ocp = OCP(kind=constants.STATEFULSET, namespace=storage_ns)
    noobaa_pvc_obj = get_pvc_objs(pvc_names=["db-noobaa-db-pg-0"])

    # Scale down noobaa operator
    logger.info(f"Scaling down {operator_deployment} deployment to replica: 0")
    dep_ocp.exec_oc_cmd(f"scale deployment {operator_deployment} --replicas=0")

    # Delete noobaa deployments and statefulsets
    logger.info("Deleting noobaa deployments and statefulsets")
    dep_ocp.delete(resource_name=constants.NOOBAA_ENDPOINT_DEPLOYMENT)
    state_ocp.delete(resource_name=constants.NOOBAA_DB_STATEFULSET)
    state_ocp.delete(resource_name=constants.NOOBAA_CORE_STATEFULSET)

    # Delete noobaa-db pvc
    pvc_obj = OCP(kind=constants.PVC, namespace=storage_ns)
    logger.info("Deleting noobaa-db pvc")
    pvc_obj.delete(resource_name=noobaa_pvc_obj[0].name, wait=True)
    pvc_obj.wait_for_delete(resource_name=noobaa_pvc_obj[0].name, timeout=300)

    # Patch and delete existing backingstores
    bs_obj = OCP(kind=constants.BACKINGSTORE, namespace=storage_ns)
    _strip_finalizers_and_delete(bs_obj, "backingstore")

    # Patch and delete existing bucketclass
    bc_obj = OCP(kind=constants.BUCKETCLASS, namespace=storage_ns)
    _strip_finalizers_and_delete(bc_obj, "bucketclass")

    # Delete noobaa secrets
    logger.info("Deleting noobaa related secrets")
    dep_ocp.exec_oc_cmd(
        "delete secrets noobaa-admin noobaa-endpoints noobaa-operator noobaa-server"
    )

    # Scale back noobaa-operator deployment
    logger.info(f"Scaling back {operator_deployment} deployment to replica: 1")
    dep_ocp.exec_oc_cmd(f"scale deployment {operator_deployment} --replicas=1")

    # Wait and validate noobaa PVC is in bound state
    pvc_obj.wait_for_resource(
        condition=constants.STATUS_BOUND,
        resource_name=noobaa_pvc_obj[0].name,
        timeout=600,
        sleep=120,
    )

    # Validate noobaa pods are up and running
    pod_obj = OCP(kind=constants.POD, namespace=defaults.ROOK_CLUSTER_NAMESPACE)
    noobaa_pods = get_noobaa_pods()
    pod_obj.wait_for_resource(
        condition=constants.STATUS_RUNNING,
        resource_count=len(noobaa_pods),
        selector=constants.NOOBAA_APP_LABEL,
        timeout=900,
    )

    # Verify everything running fine
    logger.info("Verifying all resources are Running and matches expected result")
    self.sanity_helpers.health_check(tries=120)

    # Verify default backingstore/bucketclass
    default_bs = OCP(kind=constants.BACKINGSTORE, namespace=storage_ns).get(
        resource_name=DEFAULT_NOOBAA_BACKINGSTORE
    )
    default_bc = OCP(kind=constants.BUCKETCLASS, namespace=storage_ns).get(
        resource_name=DEFAULT_NOOBAA_BUCKETCLASS
    )
    bs_phase = default_bs["status"]["phase"]
    bc_phase = default_bc["status"]["phase"]
    assert (
        bs_phase == bc_phase == constants.STATUS_READY
    ), "Failed: Default bs/bc are not in ready state"

    # Create OBCs
    logger.info("Creating OBCs after noobaa rebuild")
    bucket_factory(amount=3, interface="OC", verify_health=True)
class TestPvcMultiSnapshotPerformance(PASTest):
    """
    Tests to measure PVC snapshots creation performance & scale

    The test tries to take the maximal number of snapshots for one PVC.
    """

    def setup(self):
        """
        Setting up the test environment :
            Calculating the amount of storage which is available for the test
            Creating namespace (project) for the test

        Raises:
            Exception: if the storage capacity cannot be read
            CommandFailed: if the test namespace cannot be created

        """
        log.info("Setting up the test environment")

        super().setup()

        # Accumulators which are updated by run() for every snapshot taken
        self.total_creation_time = 0
        self.total_csi_creation_time = 0
        self.total_creation_speed = 0

        # Getting the total Storage capacity
        try:
            self.ceph_capacity = int(self.ceph_cluster.get_ceph_capacity())
        except Exception as err:
            err_msg = f"Failed to get Storage capacity : {err}"
            log.error(err_msg)
            raise Exception(err_msg) from err

        # Use 70% of the storage capacity in the test
        self.capacity_to_use = int(self.ceph_capacity * 0.7)

        # Creating new namespace for the test
        self.nss_name = "pas-test-namespace"
        log.info(f"Creating new namespace ({self.nss_name}) for the test")
        try:
            self.proj = helpers.create_project(project_name=self.nss_name)
        except CommandFailed as ex:
            # Membership test: str.find() returns -1 (truthy) when the text
            # is missing, so it must not be used as a boolean.
            if "(AlreadyExists)" in str(ex):
                log.warning("The namespace is already exists !")
            log.error("Cannot create new project")
            raise CommandFailed(f"{self.nss_name} was not created") from ex

        # Initialize a general Snapshot object to use in the test
        self.snapshot = OCP(kind="volumesnapshot", namespace=self.nss_name)

    def teardown(self):
        """
        Cleaning up the environment :
            Delete all snapshot
            Delete the POD
            Delete the PVC and the PV
            Delete the StorageClass
            Delete the VolumeSnapshotClass
            Delete the data pool
            Switch to the default namespace
            Delete the tested namespace

        The per-test resources are removed only when self.full_teardown is
        set; every individual deletion there is best-effort (logged, not
        raised) so one failure does not prevent the rest of the cleanup.

        Raises:
            CommandFailed: if the test namespace cannot be deleted

        """
        log.info("Cleanup the test environment")

        if self.full_teardown:
            self._delete_test_resources()

        # The namespace is created by setup(), so it is always deleted here.
        log.info(f"Deleting the test namespace : {self.nss_name}")
        switch_to_default_rook_cluster_project()
        try:
            self.proj.delete(resource_name=self.nss_name)
            self.proj.wait_for_delete(
                resource_name=self.nss_name, timeout=60, sleep=10
            )
        except CommandFailed as ex:
            log.error(f"Can not delete project {self.nss_name}")
            raise CommandFailed(f"{self.nss_name} was not deleted") from ex

        # After deleting all data from the cluster, we need to wait until it will re-balance
        ceph_health_check(
            namespace=constants.OPENSHIFT_STORAGE_NAMESPACE, tries=30, delay=60
        )

        super().teardown()

    def _delete_test_resources(self):
        """
        Delete the snapshots, pod, PVC, PV, StorageClass, VolumeSnapshotClass
        and ceph pool which were created by the performance test.

        """
        # Getting the name of the PVC's backed PV. It has to be read before
        # the PVC is deleted; None means the name could not be found.
        pv = None
        try:
            pv = self.pvc_obj.get("spec")["spec"]["volumeName"]
        except KeyError:
            log.error(
                f"Cannot found key in the PVC object {json.dumps(self.pvc_obj.get('spec').get('spec'), indent=3)}"
            )

        # Getting the list of all snapshots
        try:
            snapshot_list = self.snapshot.get(all_namespaces=True)["items"]
        except Exception as err:
            log.error(f"Cannot get the list of snapshots : {err}")
            snapshot_list = []

        # Deleting all snapshots from the cluster
        log.info(f"Trying to delete all ({len(snapshot_list)}) Snapshots")
        log.debug(
            f"The list of all snapshots is : {json.dumps(snapshot_list, indent=3)}"
        )
        for vs in snapshot_list:
            snap_name = vs["metadata"]["name"]
            log.info(f"Try to delete {snap_name}")
            try:
                self.snapshot.delete(resource_name=snap_name)
            except Exception as err:
                log.error(f"Cannot delete {snap_name} : {err}")

        # Deleting the pod which wrote data to the pvc
        log.info(f"Deleting the test POD : {self.pod_obj.name}")
        try:
            self.pod_obj.delete()
            log.info("Wait until the pod is deleted.")
            self.pod_obj.ocp.wait_for_delete(resource_name=self.pod_obj.name)
        except Exception as ex:
            log.error(f"Cannot delete the test pod : {ex}")

        # Deleting the PVC which used in the test.
        try:
            log.info(f"Delete the PVC : {self.pvc_obj.name}")
            self.pvc_obj.delete()
            log.info("Wait until the pvc is deleted.")
            self.pvc_obj.ocp.wait_for_delete(resource_name=self.pvc_obj.name)
        except Exception as ex:
            log.error(f"Cannot delete the test pvc : {ex}")

        # Delete the backend PV of the PVC (only when its name is known)
        if pv is None:
            log.warning("The backend PV name is unknown, skipping PV deletion")
        else:
            log.info(f"Try to delete the backend PV : {pv}")
            try:
                run_oc_command(f"delete pv {pv}")
            except Exception as ex:
                err_msg = f"cannot delete PV {pv} - [{ex}]"
                log.error(err_msg)

        # Deleting the StorageClass used in the test
        log.info(f"Deleting the test StorageClass : {self.sc_obj.name}")
        try:
            self.sc_obj.delete()
            log.info("Wait until the SC is deleted.")
            self.sc_obj.ocp.wait_for_delete(resource_name=self.sc_obj.name)
        except Exception as ex:
            log.error(f"Can not delete the test sc : {ex}")

        # Deleting the VolumeSnapshotClass used in the test
        log.info(f"Deleting the test Snapshot Class : {self.snap_class.name}")
        try:
            self.snap_class.delete()
            log.info("Wait until the VSC is deleted.")
            self.snap_class.ocp.wait_for_delete(resource_name=self.snap_class.name)
        except Exception as ex:
            log.error(f"Can not delete the test vsc : {ex}")

        # Deleting the Data pool
        log.info(f"Deleting the test storage pool : {self.sc_name}")
        self.delete_ceph_pool(self.sc_name)
        # Verify deletion by checking the backend CEPH pools using the toolbox
        results = self.ceph_cluster.toolbox.exec_cmd_on_pod("ceph osd pool ls")
        log.debug(f"Existing pools are : {results}")
        if self.sc_name in results.split():
            log.warning("The pool did not deleted by CSI, forcing delete it manually")
            self.ceph_cluster.toolbox.exec_cmd_on_pod(
                f"ceph osd pool delete {self.sc_name} {self.sc_name} "
                "--yes-i-really-really-mean-it"
            )
        else:
            log.info(f"The pool {self.sc_name} was deleted successfully")

    def init_full_results(self, full_results):
        """
        Initialize the full results object which will be sent to the ES server

        Args:
            full_results (obj): an empty FIOResultsAnalyse object

        Returns:
            FIOResultsAnalyse (obj): the input object filled with data

        """
        for key in self.environment:
            full_results.add_key(key, self.environment[key])
        full_results.add_key("index", full_results.new_index)
        full_results.add_key("snapshot_num", self.num_of_snaps)
        full_results.add_key("pvc_size", self.pvc_size)
        # Only the last dash-separated part of the SC name is reported
        full_results.add_key("storageclass", self.sc_name.split("-")[-1])
        full_results.add_key("dataset", self.capacity_to_use)
        return full_results

    def get_csi_pod(self, namespace):
        """
        Getting pod list in specific namespace, for the provision logs

        Args:
            namespace (str): the namespace where the pod is deployed.

        Returns:
            list : list of lines from the output of the command.

        Raises:
            Exception: if the command output contains the error marker

        """
        results = run_oc_command(cmd="get pod", namespace=namespace)
        if ERRMSG in results:
            err_msg = "Can not get the CSI controller pod"
            log.error(err_msg)
            raise Exception(err_msg)
        return results

    def build_fio_command(self):
        """
        Building the FIO command that will be run on the pod before each snapshot

        The command is stored in self.fio_cmd; it reads self.file_name and
        self.file_size, which have to be set before calling this method.

        """
        # Find the path that the PVC is mounted within the POD
        pod_spec = self.pod_obj.get("spec").get("spec")
        path = pod_spec.get("containers")[0].get("volumeMounts")[0].get("mountPath")
        self.fio_cmd = (
            "fio --name=fio-fillup --rw=write --bs=4m --direct=1 --numjobs=1"
            " --time_based=0 --runtime=36000 --ioengine=libaio --end_fsync=1"
            f" --filename={path}/{self.file_name} --size={self.file_size}"
            " --output-format=json"
        )
        log.info(f"The FIO command is : {self.fio_cmd}")

    def create_snapshotclass(self, interface):
        """
        Creates own VolumeSnapshotClass

        Args:
            interface (str): Interface type used

        Returns:
            ocs_obj (obj): SnapshotClass obj instances

        Raises:
            Exception: if the snapshot class cannot be created

        """
        if interface == constants.CEPHFILESYSTEM:
            snapclass_name = "pas-test-cephfs-snapshot-class"
        else:
            snapclass_name = "pas-test-rbd-snapshot-class"

        yaml_files = {
            constants.CEPHBLOCKPOOL: constants.CSI_RBD_SNAPSHOTCLASS_YAML,
            constants.CEPHFILESYSTEM: constants.CSI_CEPHFS_SNAPSHOTCLASS_YAML,
        }
        snapshotclass_data = templating.load_yaml(yaml_files[interface])
        snapshotclass_data["metadata"]["name"] = snapclass_name

        ocs_obj = ocs.OCS(**snapshotclass_data)
        log.info(f"Creating new snapshot class : {snapclass_name}")
        try:
            created_snapclass = ocs_obj.create(do_reload=True)
            log.debug(created_snapclass)
        except Exception as ex:
            err_msg = f"Failed to create new snapshot class : {snapclass_name} [{ex}]"
            log.error(err_msg)
            raise Exception(err_msg) from ex
        return ocs_obj

    def create_snapshot(self, snap_num):
        """
        Creating snapshot of volume
        measure the total snapshot creation time
        and the CSI creation time

        Args:
            snap_num (int) the number of snapshot to create

        Returns:
            tuple: (total creation time, CSI creation time) of the snapshot,
                as returned by the performance_lib measurement helpers

        Raises:
            Exception: if the snapshot cannot be created or queried
            TimeoutError: if the snapshot is not ready within the timeout

        """
        log.info(f"Taking snapshot number {snap_num}")
        # Getting UTC time before test starting for log retrieve
        start_time = self.get_time("csi")
        snap_name = f"pvc-snap-{snap_num}-"
        snap_name += self.pvc_obj.name.split("-")[-1]
        self.snap_templ["metadata"]["name"] = snap_name
        self.snap_templ["spec"]["volumeSnapshotClassName"] = self.snap_class.name

        fd, tmpfile = tempfile.mkstemp(suffix=".yaml", prefix="Snap")
        log.debug(f"Going to create {tmpfile}")
        try:
            # Write through the descriptor returned by mkstemp so it is
            # closed, instead of leaking it by re-opening the path.
            with os.fdopen(fd, "w") as f:
                yaml.dump(self.snap_templ, f, default_flow_style=False)

            res = run_oc_command(cmd=f"create -f {tmpfile}", namespace=self.nss_name)
        finally:
            # One file is created per snapshot, do not leave them behind
            os.remove(tmpfile)

        if ERRMSG in res[0]:
            err_msg = f"Failed to create snapshot : {res}"
            log.error(err_msg)
            raise Exception(err_msg)

        # wait until snapshot is ready
        timeout = 720
        sleep_time = 10
        snap_con_name = None
        snap_uid = None
        while timeout > 0:
            res = run_oc_command(
                f"get volumesnapshot {snap_name} -o yaml", namespace=self.nss_name
            )
            if ERRMSG in res[0]:
                err_msg = f"Can not get snapshot status {res}"
                log.error(err_msg)
                raise Exception(err_msg)

            res = yaml.safe_load("\n".join(res))
            log.debug(f"The command output is : {yaml.dump(res)}")
            try:
                if res["status"]["readyToUse"]:
                    log.info(f"{snap_name} Created and ready to use")
                    snap_con_name = res["status"]["boundVolumeSnapshotContentName"]
                    snap_uid = res["metadata"]["uid"]
                    break
            except (KeyError, TypeError):
                # The status section is not (fully) populated yet - re-check
                pass

            log.info(
                f"{snap_name} is not ready yet, sleep {sleep_time} sec before re-check"
            )
            time.sleep(sleep_time)
            timeout -= sleep_time

        if not snap_con_name:
            err_msg = "Snapshot was not created on time"
            log.error(err_msg)
            raise TimeoutError(err_msg)

        creation_time = performance_lib.measure_total_snapshot_creation_time(
            snap_name, start_time
        )
        csi_creation_time = performance_lib.measure_csi_snapshot_creation_time(
            self.interface, snap_uid, start_time
        )
        return (creation_time, csi_creation_time)

    def run(self):
        """
        Running the test
            for each snapshot : write data on the pod and take snapshot

        Returns:
            list: one dict per snapshot with the keys "Snap Num", "time",
                "csi_time" and "speed"

        """
        results = []
        for test_num in range(1, self.num_of_snaps + 1):
            log.info(f"Starting test number {test_num}")

            # Running IO on the POD - (re)-write data on the PVC
            self.pod_obj.exec_cmd_on_pod(
                self.fio_cmd, out_yaml_format=False, timeout=3600
            )

            # Taking Snapshot of the PVC
            ct, sci_ct = self.create_snapshot(test_num)
            speed = self.filesize / ct
            self.total_creation_time += ct
            self.total_csi_creation_time += sci_ct
            self.total_creation_speed += speed

            results.append(
                {"Snap Num": test_num, "time": ct, "csi_time": sci_ct, "speed": speed}
            )
            log.info(
                f"Results for snapshot number {test_num} are : "
                f"Creation time is {ct} , Creation speed {speed}, "
                f"Csi creation time is {sci_ct}"
            )

        log.debug(f"All results are : {json.dumps(results, indent=3)}")
        return results

    @pytest.mark.polarion_id("OCS-2623")
    @pytest.mark.parametrize(
        argnames=["interface_type", "snap_number"],
        argvalues=[
            pytest.param(*[constants.CEPHBLOCKPOOL, 512]),
            pytest.param(*[constants.CEPHFILESYSTEM, 100]),
        ],
    )
    def test_pvc_multiple_snapshot_performance(
        self,
        pvc_factory,
        pod_factory,
        secret_factory,
        interface_type,
        snap_number,
    ):
        """
        1. Creating PVC
           size is depend on storage capacity, but not less then 1 GiB
           it will use ~75% capacity of the Storage, Min storage capacity 1 TiB
        2. Fill the PVC with 80% of data
        3. Take a snapshot of the PVC and measure the total and CSI times of creation.
        4. re-write the data on the PVC
        5. Take a snapshot of the PVC and measure the total and the CSI times of creation.
        6. repeat steps 4-5 the numbers of snapshot we want to take : 512
           this will be run by outside script for low memory consumption
        7. print all information.

        Raises:
            StorageNotSufficientException: in case of not enough capacity

        """
        # Getting the full path for the test logs
        self.results_path = get_full_test_logs_path(cname=self)
        self.full_log_path = f"{self.results_path}-{interface_type}-{snap_number}"
        log.info(f"Logs file path name is : {self.full_log_path}")
        log.info(f"Reslut path is : {self.results_path}")

        self.full_teardown = True
        self.num_of_snaps = snap_number
        if self.dev_mode:
            self.num_of_snaps = 2

        log.info(f"Going to create {self.num_of_snaps} {interface_type} snapshots")

        # since we do not want to use more then 65%, we add 35% to the needed
        # capacity, and minimum PVC size is 1 GiB
        self.need_capacity = int((self.num_of_snaps + 2) * 1.35)

        # Test will run only on system with enough capacity
        if self.capacity_to_use < self.need_capacity:
            err_msg = (
                f"The system have only {self.ceph_capacity} GiB, "
                f"we want to use only {self.capacity_to_use} GiB, "
                f"and we need {self.need_capacity} GiB to run the test"
            )
            log.error(err_msg)
            raise exceptions.StorageNotSufficientException(err_msg)

        # Calculating the PVC size in GiB
        self.pvc_size = int(self.capacity_to_use / (self.num_of_snaps + 2))
        if self.dev_mode:
            self.pvc_size = 5

        self.interface = interface_type
        self.sc_name = "pas-testing-rbd"
        pool_name = self.sc_name
        if self.interface == constants.CEPHFILESYSTEM:
            self.sc_name = "pas-testing-cephfs"
            pool_name = f"{self.sc_name}-data0"

        # Creating new storage pool
        self.create_new_pool(self.sc_name)

        # Creating new StorageClass (pool) for the test.
        secret = secret_factory(interface=self.interface)
        self.sc_obj = helpers.create_storage_class(
            interface_type=self.interface,
            interface_name=pool_name,
            secret_name=secret.name,
            sc_name=self.sc_name,
            fs_name=self.sc_name,
        )
        log.info(f"The new SC is : {self.sc_obj.name}")
        log.debug(f"All SC data is {json.dumps(self.sc_obj.data, indent=3)}")

        # Create new VolumeSnapshotClass
        self.snap_class = self.create_snapshotclass(self.interface)

        # Create new PVC
        log.info(f"Creating {self.pvc_size} GiB PVC of {interface_type}")
        self.pvc_obj = pvc_factory(
            interface=self.interface,
            storageclass=self.sc_obj,
            size=self.pvc_size,
            status=constants.STATUS_BOUND,
            project=self.proj,
        )

        # Create POD which will attache to the new PVC
        log.info("Creating A POD")
        self.pod_obj = pod_factory(
            interface=self.interface,
            pvc=self.pvc_obj,
            status=constants.STATUS_RUNNING,
            pod_dict_path=constants.PERF_POD_YAML,
        )

        # Calculating the file size as 80% of the PVC size
        self.filesize = self.pvc_obj.size * 0.80
        # Change the file size to MB for the FIO function
        self.file_size = f"{int(self.filesize * constants.GB2MB)}M"
        self.file_name = self.pod_obj.name

        log.info(
            f"Total capacity size is : {self.ceph_capacity} GiB, "
            f"Going to use {self.need_capacity} GiB, "
            f"With {self.num_of_snaps} Snapshots to {self.pvc_size} GiB PVC. "
            f"File size to be written is : {self.file_size} "
            f"with the name of {self.file_name}"
        )

        # Reading basic snapshot yaml file
        self.snap_yaml = constants.CSI_CEPHFS_SNAPSHOT_YAML
        self.sc = constants.DEFAULT_VOLUMESNAPSHOTCLASS_CEPHFS
        self.fs_type = "cephfs"
        if interface_type == constants.CEPHBLOCKPOOL:
            self.snap_yaml = constants.CSI_RBD_SNAPSHOT_YAML
            self.fs_type = "rbd"
            self.sc = constants.DEFAULT_VOLUMESNAPSHOTCLASS_RBD
        with open(self.snap_yaml, "r") as stream:
            try:
                self.snap_templ = yaml.safe_load(stream)
            except yaml.YAMLError as exc:
                log.error(f"Can not read template yaml file {exc}")
                # Without the template no snapshot can be created, and the
                # next statements would fail on the missing attribute.
                raise
        self.snap_templ["spec"]["volumeSnapshotClassName"] = self.sc
        self.snap_templ["spec"]["source"][
            "persistentVolumeClaimName"
        ] = self.pvc_obj.name
        log.debug(
            f"Snapshot yaml file : {self.snap_yaml} "
            f"Content of snapshot yaml file {json.dumps(self.snap_templ, indent=4)}"
        )

        self.build_fio_command()

        self.start_time = self.get_time()

        # Initialize the results doc file.
        full_results = self.init_full_results(
            ResultsAnalyse(
                self.uuid, self.crd_data, self.full_log_path, "multiple_snapshots"
            )
        )
        full_results.all_results = self.run()
        self.end_time = self.get_time()

        # The averages are reported as strings with two decimal places
        full_results.add_key(
            "avg_creation_time",
            f"{float(self.total_creation_time / self.num_of_snaps):.2f}",
        )
        full_results.add_key(
            "avg_csi_creation_time",
            f"{float(self.total_csi_creation_time / self.num_of_snaps):.2f}",
        )
        full_results.add_key(
            "avg_creation_speed",
            f"{float(self.total_creation_speed / self.num_of_snaps):.2f}",
        )
        full_results.add_key(
            "test_time", {"start": self.start_time, "end": self.end_time}
        )

        # Writing the analyzed test results to the Elastic-Search server
        if full_results.es_write():
            res_link = full_results.results_link()
            log.info(f"The Result can be found at : {res_link}")

            # Create text file with results of all subtests (2 - according to the parameters)
            self.write_result_to_file(res_link)

    def test_pvc_multiple_snapshot_performance_results(self):
        """
        This is not a test - it only checks that previous tests were completed and finished
        as expected with reporting the full results (links in the ES) of previous 2 tests

        """
        # No per-test resources are created here, so skip their cleanup
        self.full_teardown = False
        self.number_of_tests = 2
        results_path = get_full_test_logs_path(
            cname=self, fname="test_pvc_multiple_snapshot_performance"
        )
        self.results_file = os.path.join(results_path, "all_results.txt")
        log.info(f"Check results in {self.results_file}.")
        self.check_tests_results()
        self.push_to_dashboard(test_name="PVC Multiple Snapshots Creation")
class ElasticSearch(object):
    """
    ElasticSearch Environment

    Creating an instance clones the ECK repository, deploys the ECK
    operator, the Elasticsearch server and the dumper client pod, and then
    opens a connection to the server.
    """

    def __init__(self, **kwargs):
        """
        Initializer function

        Args:
            **kwargs: optional settings; "repo" (default
                constants.OCS_WORKLOADS) and "branch" (default "master")
                select the repository which is cloned.

        Raises:
            Exception: if the Elasticsearch health does not become green
                within the sampling timeout

        """
        log.info("Initializing the Elastic-Search environment object")
        self.args = kwargs
        self.namespace = "elastic-system"
        self.repo = self.args.get("repo", constants.OCS_WORKLOADS)
        self.branch = self.args.get("branch", "master")
        self.dir = tempfile.mkdtemp(prefix="eck_")

        # Clone the ECK repo locally
        self._clone()

        self.eck_path = os.path.join(self.dir, "ocs-workloads/eck")
        self.eck_file = os.path.join(self.eck_path, "crds.yaml")
        self.dumper_file = os.path.join(constants.TEMPLATE_APP_POD_DIR, "esclient.yaml")
        self.crd = os.path.join(constants.TEMPLATE_APP_POD_DIR, "esq.yaml")

        # Creating some different types of OCP objects
        self.ocp = OCP(
            kind="pod", resource_name="elastic-operator-0", namespace=self.namespace
        )
        self.ns_obj = OCP(kind="namespace", namespace=self.namespace)
        self.es = OCP(resource_name="quickstart-es-http", namespace=self.namespace)
        self.elasticsearch = OCP(namespace=self.namespace, kind="elasticsearch")
        self.password = OCP(
            kind="secret",
            resource_name="quickstart-es-elastic-user",
            namespace=self.namespace,
        )

        # Deploy the ECK all-in-one.yaml file
        self._deploy_eck()
        # Deploy the Elastic-Search server
        self._deploy_es()

        # Verify that ES is Up & Running
        sample = TimeoutSampler(timeout=180, sleep=10, func=self.get_health)
        if not sample.wait_for_func_status(True):
            raise Exception("Elasticsearch deployment Failed")

        # Deploy the elasticsearch dumper pod
        self._deploy_data_dumper_client()

        # Connect to the server
        self.con = self._es_connect()

    def _clone(self):
        """
        clone the ECK repo into temp directory

        Raises:
            CommandFailed, CalledProcessError: if the clone command fails

        """
        log.info(f"Cloning ECK in {self.dir}")
        # An argument list (no shell) keeps the branch / repo values from
        # being interpreted by a shell.
        git_clone_cmd = ["git", "clone", "-b", self.branch, self.repo, "--depth", "1"]
        try:
            run(git_clone_cmd, cwd=self.dir, check=True)
        except (CommandFailed, CalledProcessError):
            log.error("Error during cloning of ECK repository")
            raise

    def _pod_is_found(self, pattern):
        """
        Boolean function which check if pod (by pattern) is exist.

        Args:
            pattern (str): the pattern of the pod to look for

        Returns:
            bool : True if pod found, otherwise False

        """
        return len(get_pod_name_by_pattern(pattern, self.namespace)) > 0

    def _deploy_eck(self):
        """
        Deploying the ECK environment for the Elasticsearch, and make sure it
        is in Running mode

        Raises:
            Exception: if the operator pod is not found within the timeout

        """
        log.info("Deploying the ECK environment for the ES cluster")
        log.info("Deploy the ECK CRD's")
        self.ocp.apply(self.eck_file)
        log.info("deploy the ECK operator")
        self.ocp.apply(f"{self.eck_path}/operator.yaml")

        sample = TimeoutSampler(
            timeout=300, sleep=10, func=self._pod_is_found, pattern="elastic-operator"
        )
        if not sample.wait_for_func_status(True):
            err_msg = "ECK deployment Failed"
            log.error(err_msg)
            self.cleanup()
            raise Exception(err_msg)

        log.info("The ECK pod is ready !")

    def _deploy_data_dumper_client(self):
        """
        Deploying elastic search client pod with utility which dump all the data
        from the server to .tgz file

        Raises:
            Exception: if the dumper pod is not found within the timeout

        """
        log.info("Deploying the es client for dumping all data")
        self.ocp.apply(self.dumper_file)

        sample = TimeoutSampler(
            timeout=300, sleep=10, func=self._pod_is_found, pattern="es-dumper"
        )
        if not sample.wait_for_func_status(True):
            self.cleanup()
            raise Exception("Dumper pod deployment Failed")
        self.dump_pod = get_pod_name_by_pattern("es-dumper", self.namespace)[0]
        log.info(f"The dumper client pod {self.dump_pod} is ready !")

    def get_ip(self):
        """
        This function return the IP address of the Elasticsearch cluster.
        this IP is to use inside the OCP cluster

        Returns:
            str : the "clusterIP" value of the Elasticsearch service spec

        """
        return self.es.get()["spec"]["clusterIP"]

    def get_port(self):
        """
        This function return the port of the Elasticsearch cluster.

        Returns:
            the "port" value of the first port in the Elasticsearch service
            spec, as stored in the resource

        """
        return self.es.get()["spec"]["ports"][0]["port"]

    def _deploy_es(self):
        """
        Deploying the Elasticsearch server

        Raises:
            Exception: if the Elasticsearch pod is not found within the timeout

        """
        # Creating PVC for the elasticsearch server and wait until it bound
        log.info("Creating 10 GiB PVC for the ElasticSearch cluster on")
        self.pvc_obj = create_pvc(
            sc_name=constants.CEPHBLOCKPOOL_SC,
            namespace=self.namespace,
            pvc_name="elasticsearch-data-quickstart-es-default-0",
            access_mode=constants.ACCESS_MODE_RWO,
            size="10Gi",
        )
        wait_for_resource_state(self.pvc_obj, constants.STATUS_BOUND)
        self.pvc_obj.reload()

        log.info("Deploy the ElasticSearch cluster")
        self.ocp.apply(self.crd)

        sample = TimeoutSampler(
            timeout=300,
            sleep=10,
            func=self._pod_is_found,
            pattern="quickstart-es-default",
        )
        if not sample.wait_for_func_status(True):
            self.cleanup()
            raise Exception("The ElasticSearch pod deployment Failed")
        self.espod = get_pod_name_by_pattern("quickstart-es-default", self.namespace)[0]
        log.info(f"The ElasticSearch pod {self.espod} Started")

        es_pod = OCP(kind="pod", namespace=self.namespace)
        log.info("Waiting for ElasticSearch to Run")
        assert es_pod.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            resource_name=self.espod,
            sleep=30,
            timeout=600,
        )
        log.info("Elastic Search is ready !!!")

    def get_health(self):
        """
        This method return the health status of the Elasticsearch.

        Returns:
            bool : True if the status is green (OK) otherwise - False

        """
        first_item = self.elasticsearch.get()["items"][0]
        return first_item["status"]["health"] == "green"

    def get_password(self):
        """
        This method return the password used to connect the Elasticsearch.

        Returns:
            str : The password as text

        """
        # The secret data is stored base64 encoded
        encoded = self.password.get()["data"]["elastic"]
        return base64.b64decode(encoded).decode("utf-8")

    def cleanup(self):
        """
        Cleanup the environment from all Elasticsearch components, and from the
        port forwarding process.

        """
        log.info("Teardown the Elasticsearch environment")
        log.info("Deleting all resources")
        log.info("Deleting the dumper client pod")
        self.ocp.delete(yaml_file=self.dumper_file)
        log.info("Deleting the es resource")
        self.ocp.delete(yaml_file=self.crd)
        log.info("Deleting the es project")
        # self.ns_obj.delete_project(project_name=self.namespace)
        self.ocp.delete(f"{self.eck_path}/operator.yaml")
        self.ocp.delete(yaml_file=self.eck_file)
        self.ns_obj.wait_for_delete(resource_name=self.namespace, timeout=180)

    def _es_connect(self):
        """
        Create a connection to the local ES

        Returns:
            Elasticsearch: elasticsearch connection object, None if Cannot connect to ES

        """
        try:
            es = Elasticsearch([{"host": self.get_ip(), "port": self.get_port()}])
        except esexp.ConnectionError:
            log.warning("Cannot connect to ES server in the LocalServer")
            es = None
        return es

    def get_indices(self):
        """
        Getting list of all indices in the ES server - all created by the test,
        the installation of the ES was without any indexes pre-installed.

        Returns:
            list : list of all indices defined in the ES server

        """
        log.info("Getting all indices")
        return list(self.con.indices.get_alias("*"))

    def dumping_all_data(self, target_path):
        """
        Dump All data from the internal ES server to .tgz file.

        Args:
            target_path (str): the path where the results file will be copy into

        Returns:
            bool: True if the dump operation succeed and return the results data to the host
                  otherwise False

        """
        log.info("dumping data from ES server to .tgz file")
        rsh_cmd = f"rsh {self.dump_pod} /elasticsearch-dump/esdumper.py --ip {self.get_ip()} --port {self.get_port()}"
        result = self.ocp.exec_oc_cmd(rsh_cmd, out_yaml_format=False, timeout=1200)
        if "ES dump is done." not in result:
            log.error("There is no data in the Elasticsearch server")
            return False

        # The last word of the dumper output is the path of the dump file
        src_file = result.split()[-1]
        log.info(f"Copy {src_file} from the client pod")

        cp_command = f"cp {self.dump_pod}:{src_file} {target_path}/FullResults.tgz"
        result = self.ocp.exec_oc_cmd(cp_command, timeout=120)
        log.info(f"The output from the POD is {result}")

        log.info("Extracting the FullResults.tgz file")
        results = run_command(f"tar zxvf {target_path}/FullResults.tgz", cwd=target_path)
        log.debug(f"The untar results is {results}")
        if "Error in command" in results:
            log.warning("Cannot untar the dumped file")
            return False

        return True
class TestPvcMultiSnapshotPerformance(PASTest):
    """
    Tests to measure PVC snapshots creation performance & scale

    The test tries to take the maximal number of snapshots for one PVC

    """

    def setup(self):
        """
        Setting up the test environment :
            Calculating the amount of storage which is available for the test
            Creating namespace (project) for the test

        Raises:
            Exception: if the storage capacity can not be read
            CommandFailed: if the test namespace can not be created

        """
        log.info("Setting up the test environment")
        super().setup()

        # Getting the total Storage capacity
        try:
            self.ceph_capacity = int(self.ceph_cluster.get_ceph_capacity())
        except Exception as err:
            err_msg = f"Failed to get Storage capacity : {err}"
            log.error(err_msg)
            raise Exception(err_msg) from err

        # Use 70% of the storage capacity in the test
        self.capacity_to_use = int(self.ceph_capacity * 0.7)

        # Creating new namespace for the test
        self.nss_name = "pas-test-namespace"
        log.info(f"Creating new namespace ({self.nss_name}) for the test")
        try:
            self.proj = helpers.create_project(project_name=self.nss_name)
        except CommandFailed as ex:
            # str.find() returns -1 (truthy) when the text is missing, so a
            # membership test is needed to detect the "already exists" case.
            if "(AlreadyExists)" in str(ex):
                log.warning("The Namespace is Already Exists !")
            log.error("Can not create new project")
            raise CommandFailed(f"{self.nss_name} was not created") from ex

        # Initialize a general Snapshot object to use in the test
        self.snapshot = OCP(kind="volumesnapshot", namespace=self.nss_name)

    def teardown(self):
        """
        Cleaning up the environment :
            Delete all snapshots
            Delete the POD
            Delete the PVC and the PV
            Delete the StorageClass
            Delete the VolumeSnapshotClass
            Delete the data pool
            Switch to the default namespace
            Delete the tested namespace

        Failures of the individual delete steps are logged and the cleanup
        continues; only a failure to delete the namespace is raised.

        Raises:
            CommandFailed: if the test namespace can not be deleted

        """
        log.info("Cleanup the test environment")

        # Getting the name of the PVC's backing PV. Keep it as None when the
        # key is missing, so the PV deletion step below can be skipped instead
        # of failing on an unbound name.
        pv = None
        try:
            pv = self.pvc_obj.get("spec")["spec"]["volumeName"]
        except KeyError:
            log.error(
                f"Can not found key in the PVC object {json.dumps(self.pvc_obj.get('spec').get('spec'), indent=3)}"
            )

        # Getting the list of all snapshots
        try:
            snapshot_list = self.snapshot.get(all_namespaces=True)["items"]
        except Exception as err:
            log.error(f"Cannot get the list of snapshots : {err}")
            snapshot_list = []

        # Deleting all snapshots from the cluster
        log.info(f"Trying to delete all ({len(snapshot_list)}) Snapshots")
        log.debug(
            f"The list of all snapshots is : {json.dumps(snapshot_list, indent=3)}"
        )
        for vs in snapshot_list:
            snap_name = vs["metadata"]["name"]
            log.info(f"Try to delete {snap_name}")
            try:
                self.snapshot.delete(resource_name=snap_name)
            except Exception as err:
                log.error(f"Can not delete {snap_name} : {err}")

        # Deleting the pod which wrote data to the pvc
        log.info(f"Deleting the test POD : {self.pod_obj.name}")
        try:
            self.pod_obj.delete()
            log.info("Wait until the pod is deleted.")
            self.pod_obj.ocp.wait_for_delete(resource_name=self.pod_obj.name)
        except Exception as ex:
            log.error(f"Can not delete the test pod : {ex}")

        # Deleting the PVC which used in the test.
        log.info(f"Delete the PVC : {self.pvc_obj.name}")
        try:
            self.pvc_obj.delete()
            log.info("Wait until the pvc is deleted.")
            self.pvc_obj.ocp.wait_for_delete(resource_name=self.pvc_obj.name)
        except Exception as ex:
            log.error(f"Can not delete the test pvc : {ex}")

        # Delete the backend PV of the PVC
        if pv is None:
            log.warning("The backend PV name is unknown, skipping PV deletion")
        else:
            log.info(f"Try to delete the backend PV : {pv}")
            try:
                run_oc_command(f"delete pv {pv}")
            except Exception as ex:
                err_msg = f"can not delete PV {pv} - [{ex}]"
                log.error(err_msg)

        # Deleting the StorageClass used in the test
        log.info(f"Deleting the test StorageClass : {self.sc_obj.name}")
        try:
            self.sc_obj.delete()
            log.info("Wait until the SC is deleted.")
            self.sc_obj.ocp.wait_for_delete(resource_name=self.sc_obj.name)
        except Exception as ex:
            log.error(f"Can not delete the test sc : {ex}")

        # Deleting the VolumeSnapshotClass used in the test
        log.info(f"Deleting the test Snapshot Class : {self.snap_class.name}")
        try:
            self.snap_class.delete()
            log.info("Wait until the VSC is deleted.")
            self.snap_class.ocp.wait_for_delete(
                resource_name=self.snap_class.name
            )
        except Exception as ex:
            log.error(f"Can not delete the test vsc : {ex}")

        # Deleting the Data pool
        log.info(f"Deleting the test storage pool : {self.sc_name}")
        self.delete_ceph_pool(self.sc_name)
        # Verify deletion by checking the backend CEPH pools using the toolbox
        results = self.ceph_cluster.toolbox.exec_cmd_on_pod("ceph osd pool ls")
        log.debug(f"Existing pools are : {results}")
        if self.sc_name in results.split():
            log.warning(
                "The pool did not deleted by CSI, forcing delete it manually"
            )
            self.ceph_cluster.toolbox.exec_cmd_on_pod(
                f"ceph osd pool delete {self.sc_name} {self.sc_name} "
                "--yes-i-really-really-mean-it"
            )
        else:
            log.info(f"The pool {self.sc_name} was deleted successfully")

        # Deleting the namespace used by the test
        log.info(f"Deleting the test namespace : {self.nss_name}")
        switch_to_default_rook_cluster_project()
        try:
            self.proj.delete(resource_name=self.nss_name)
            self.proj.wait_for_delete(
                resource_name=self.nss_name, timeout=60, sleep=10
            )
        except CommandFailed as ex:
            log.error(f"Can not delete project {self.nss_name}")
            raise CommandFailed(f"{self.nss_name} was not deleted") from ex

        super().teardown()

    def get_csi_pod(self, namespace):
        """
        Getting pod list in specific namespace, for the provision logs

        Args:
            namespace (str): the namespace where the pod is deployed.

        Returns:
            list : list of lines from the output of the command.

        Raises:
            Exception: if the command output contains the error marker

        """
        results = run_oc_command(cmd="get pod", namespace=namespace)
        # The output is consumed line by line by the callers, so look for the
        # error marker inside the lines (a plain `ERRMSG in results` would only
        # match a line that is exactly equal to the marker).
        if any(ERRMSG in line for line in results):
            err_msg = "Can not get the CSI controller pod"
            log.error(err_msg)
            raise Exception(err_msg)
        return results

    def get_log_names(self):
        """
        Finding the name of snapshot logging file
        the start time is in the 'csi-snapshot-controller' pod, and
        the end time is in the provisioner pod (csi-snapshotter container)

        The pod names are stored in self.log_names under 'start' and 'end'.

        """
        self.log_names = {"start": [], "end": []}
        log.info("Looking for logs pod name")

        # Getting csi log name for snapshot start creation messages
        results = self.get_csi_pod(
            namespace="openshift-cluster-storage-operator"
        )
        for line in results:
            if "csi-snapshot-controller" in line and "operator" not in line:
                self.log_names["start"].append(line.split()[0])

        # Getting csi log name for snapshot end creation messages
        results = self.get_csi_pod(namespace="openshift-storage")
        for line in results:
            if "prov" in line and self.fs_type in line:
                self.log_names["end"].append(line.split()[0])

        log.info(
            f"The CSI logs for the test are : {json.dumps(self.log_names, indent=4)}"
        )

    def build_fio_command(self):
        """
        Building the FIO command that will be run on the pod before each
        snapshot; the command is stored in self.fio_cmd

        """
        # Find the path that the PVC is mounted within the POD
        pod_spec = self.pod_obj.get("spec").get("spec")
        path = pod_spec.get("containers")[0].get("volumeMounts")[0].get(
            "mountPath"
        )
        self.fio_cmd = (
            "fio --name=fio-fillup --rw=write --bs=4m --direct=1 --numjobs=1"
            " --time_based=0 --runtime=36000 --ioengine=libaio --end_fsync=1"
            f" --filename={path}/{self.file_name} --size={self.file_size}"
            " --output-format=json"
        )
        log.info(f"The FIO command is : {self.fio_cmd}")

    def create_snapshotclass(self, interface):
        """
        Creates own VolumeSnapshotClass

        Args:
            interface (str): Interface type used

        Returns:
            ocs_obj (obj): SnapshotClass obj instances

        Raises:
            Exception: if the snapshot class can not be created

        """
        if interface == constants.CEPHFILESYSTEM:
            snapclass_name = "pas-test-cephfs-snapshot-class"
        else:
            snapclass_name = "pas-test-rbd-snapshot-class"

        yaml_files = {
            constants.CEPHBLOCKPOOL: constants.CSI_RBD_SNAPSHOTCLASS_YAML,
            constants.CEPHFILESYSTEM: constants.CSI_CEPHFS_SNAPSHOTCLASS_YAML,
        }
        snapshotclass_data = templating.load_yaml(yaml_files[interface])
        snapshotclass_data["metadata"]["name"] = snapclass_name

        ocs_obj = ocs.OCS(**snapshotclass_data)
        log.info(f"Creating new snapshot class : {snapclass_name}")
        try:
            created_snapclass = ocs_obj.create(do_reload=True)
            log.debug(created_snapclass)
        except Exception as ex:
            err_msg = f"Failed to create new snapshot class : {snapclass_name} [{ex}]"
            log.error(err_msg)
            raise Exception(err_msg) from ex
        return ocs_obj

    def create_snapshot(self, snap_num):
        """
        Creating snapshot of volume, and measure the creation time

        Args:
            snap_num (int) the number of snapshot to create

        Returns:
            int: the creation time of the snapshot (in sec.)

        Raises:
            Exception: if the snapshot can not be created or queried
            TimeoutError: if the snapshot is not ready within the timeout

        """
        log.info(f"Taking snapshot number {snap_num}")
        # Getting UTC time before test starting for log retrieve
        start_time = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ")
        snap_name = f"pvc-snap-{snap_num}-"
        snap_name += self.pvc_obj.name.split("-")[-1]
        self.snap_templ["metadata"]["name"] = snap_name
        self.snap_templ["spec"][
            "volumeSnapshotClassName"
        ] = self.snap_class.name

        # The context manager closes the descriptor and removes the temporary
        # file once `oc create` has read it (mkstemp leaked both).
        with tempfile.NamedTemporaryFile(
            mode="w", suffix=".yaml", prefix="Snap"
        ) as snap_file:
            log.debug(f"Going to create {snap_file.name}")
            yaml.dump(self.snap_templ, snap_file, default_flow_style=False)
            snap_file.flush()
            res = run_oc_command(
                cmd=f"create -f {snap_file.name}", namespace=self.nss_name
            )
        if ERRMSG in res[0]:
            err_msg = f"Failed to create snapshot : {res}"
            log.error(err_msg)
            raise Exception(err_msg)

        # wait until snapshot is ready
        timeout = 720
        sleep_time = 10
        snap_con_name = None
        while timeout > 0:
            res = run_oc_command(
                f"get volumesnapshot {snap_name} -o yaml",
                namespace=self.nss_name,
            )
            if ERRMSG in res[0]:
                err_msg = f"Can not get snapshot status {res}"
                log.error(err_msg)
                raise Exception(err_msg)

            snap_data = yaml.safe_load("\n".join(res))
            log.debug(f"The command output is : {yaml.dump(snap_data)}")
            try:
                if snap_data["status"]["readyToUse"]:
                    snap_con_name = snap_data["status"][
                        "boundVolumeSnapshotContentName"
                    ]
            except (KeyError, TypeError):
                # The status section is not (fully) populated yet
                snap_con_name = None

            if snap_con_name:
                log.info(f"{snap_name} Created and ready to use")
                break

            log.info(
                f"{snap_name} is not ready yet, sleep {sleep_time} sec before re-check"
            )
            time.sleep(sleep_time)
            timeout -= sleep_time

        if not snap_con_name:
            err_msg = "Snapshot did not created on time"
            log.error(err_msg)
            raise TimeoutError(err_msg)

        return self.get_creation_time(snap_name, snap_con_name, start_time)

    def read_logs(self, kind, namespace, start_time):
        """
        Reading the csi-driver logs, since we use different logs for the start
        time and the end time (creation snapshot), we call this function twice.

        Args:
            kind (str): the kind of logs to read 'start' or 'end'
            namespace (str): in which namespace the pod exists
            start_time (time): the start time of the specific test,
               so we dont need to read the full log

        Returns:
            list : the content of all read log(s) - can be more than one log

        """
        # The pod with the logs for 'start' creation time has only one
        # container; the pod with the 'end' logs has more than one, so the
        # container has to be selected explicitly.
        container = "-c csi-snapshotter" if kind == "end" else ""
        return [
            run_oc_command(
                f"logs {pod_name} {container} --since-time={start_time}",
                namespace=namespace,
            )
            for pod_name in self.log_names[kind]
        ]

    def get_creation_time(self, snap_name, content_name, start_time):
        """
        Calculate the creation time of the snapshot.
        find the start / end time in the logs, and calculate the total time.

        Args:
            snap_name (str): the snapshot name that create
            content_name (str): the content name of the snapshot, the end time
             logged on the content name and not on the snap name.
            start_time (time): time of test starting so, retrieving log will be
             short as possible

        Returns:
            int: creation time in seconds

        Raises:
            General exception : can not found start/end of creation time

        """
        # Start and End snapshot creation time
        times = {"start": None, "end": None}
        logs_info = {
            "start": {
                "ns": "openshift-cluster-storage-operator",
                "log_line": "Creating content for snapshot",
            },
            "end": {"ns": "openshift-storage", "log_line": "readyToUse true"},
        }

        for op in ["start", "end"]:
            marker = logs_info[op]["log_line"]
            logs = self.read_logs(op, logs_info[op]["ns"], start_time)
            for sublog in logs:
                for line in sublog:
                    names_match = snap_name in line or content_name in line
                    if names_match and marker in line:
                        # The second space-separated field holds the timestamp;
                        # the last matching line wins.
                        times[op] = datetime.datetime.strptime(
                            line.split(" ")[1], time_format
                        )
            if times[op] is None:
                err_msg = f"Can not find {op} time of {snap_name}"
                log.error(err_msg)
                raise Exception(err_msg)

        results = (times["end"] - times["start"]).total_seconds()
        log.debug(
            f"Start creation time is : {times['start']}, End creation time is : {times['end']}"
            f" and Total creation time is {results}"
        )
        return results

    def run(self):
        """
        Running the test
            for each snapshot : write data on the pod and take snapshot

        Returns:
            list : one dict per snapshot with its number, creation time and
                creation speed

        """
        results = []
        for test_num in range(1, self.num_of_snaps + 1):
            log.info(f"Starting test number {test_num}")

            # Running IO on the POD - (re)-write data on the PVC
            self.pod_obj.exec_cmd_on_pod(self.fio_cmd, out_yaml_format=False)

            # Taking Snapshot of the PVC
            ct = self.create_snapshot(test_num)
            speed = self.filesize / ct
            results.append({"Snap Num": test_num, "time": ct, "speed": speed})
            log.info(
                f"Results for snapshot number {test_num} are : "
                f"Creation time is {ct} , Creation speed {speed}"
            )

        log.debug(f"All results are : {json.dumps(results, indent=3)}")
        return results

    @ignore_leftovers
    @pytest.mark.polarion_id("OCS-2623")
    @pytest.mark.parametrize(
        argnames=["interface_type", "snap_number"],
        argvalues=[
            pytest.param(constants.CEPHBLOCKPOOL, 512),
            pytest.param(constants.CEPHFILESYSTEM, 100),
        ],
    )
    def test_pvc_multiple_snapshot_performance(
        self,
        pvc_factory,
        pod_factory,
        secret_factory,
        interface_type,
        snap_number,
    ):
        """
        1. Creating PVC
           size depends on the storage capacity, the test uses up to 70% of
           the capacity of the Storage
        2. Fill the PVC with 80% of data
        3. Take a snapshot of the PVC and measure the time of creation.
        4. re-write the data on the PVC
        5. Take a snapshot of the PVC and measure the time of creation.
        6. repeat steps 4-5 the numbers of snapshot we want to take : 512
           this will be run by outside script for low memory consumption
        7. print all information.

        Raises:
            StorageNotSufficientException: in case of not enough capacity

        """
        self.num_of_snaps = snap_number
        if self.dev_mode:
            self.num_of_snaps = 2

        log.info(
            f"Going to Create {self.num_of_snaps} {interface_type} snapshots"
        )

        # One capacity unit (GiB) per snapshot plus two extra, with 35%
        # head-room added on top of the needed capacity
        self.need_capacity = int((self.num_of_snaps + 2) * 1.35)

        # Test will run only on system with enough capacity
        if self.capacity_to_use < self.need_capacity:
            err_msg = (
                f"The system have only {self.ceph_capacity} GiB, "
                f"we want to use only {self.capacity_to_use} GiB, "
                f"and we need {self.need_capacity} GiB to run the test"
            )
            log.error(err_msg)
            raise exceptions.StorageNotSufficientException(err_msg)

        # Calculating the PVC size in GiB
        self.pvc_size = int(self.capacity_to_use / (self.num_of_snaps + 2))
        if self.dev_mode:
            self.pvc_size = 5

        self.interface = interface_type
        self.sc_name = "pas-testing-rbd"
        pool_name = self.sc_name
        if self.interface == constants.CEPHFILESYSTEM:
            self.sc_name = "pas-testing-cephfs"
            pool_name = f"{self.sc_name}-data0"

        # Creating new storage pool
        self.create_new_pool(self.sc_name)

        # Creating new StorageClass (pool) for the test.
        secret = secret_factory(interface=self.interface)
        self.sc_obj = helpers.create_storage_class(
            interface_type=self.interface,
            interface_name=pool_name,
            secret_name=secret.name,
            sc_name=self.sc_name,
            fs_name=self.sc_name,
        )
        log.info(f"The new SC is : {self.sc_obj.name}")
        log.debug(f"All Sc data is {json.dumps(self.sc_obj.data, indent=3)}")

        # Create new VolumeSnapshotClass
        self.snap_class = self.create_snapshotclass(self.interface)

        # Create new PVC
        log.info(f"Creating {self.pvc_size} GiB PVC of {interface_type}")
        self.pvc_obj = pvc_factory(
            interface=self.interface,
            storageclass=self.sc_obj,
            size=self.pvc_size,
            status=constants.STATUS_BOUND,
            project=self.proj,
        )

        # Create POD which will attach to the new PVC
        log.info("Creating A POD")
        self.pod_obj = pod_factory(
            interface=self.interface,
            pvc=self.pvc_obj,
            status=constants.STATUS_RUNNING,
            pod_dict_path=constants.PERF_POD_YAML,
        )

        # Calculating the file size as 80% of the PVC size
        self.filesize = self.pvc_obj.size * 0.80
        # Change the file size to MB for the FIO function
        self.file_size = f"{int(self.filesize * constants.GB2MB)}M"
        self.file_name = self.pod_obj.name

        log.info(
            f"Total capacity size is : {self.ceph_capacity} GiB, "
            f"Going to use {self.need_capacity} GiB, "
            f"With {self.num_of_snaps} Snapshots to {self.pvc_size} GiB PVC. "
            f"File size to be written is : {self.file_size} "
            f"with the name of {self.file_name}"
        )

        # Reading basic snapshot yaml file
        self.snap_yaml = constants.CSI_CEPHFS_SNAPSHOT_YAML
        self.sc = constants.DEFAULT_VOLUMESNAPSHOTCLASS_CEPHFS
        self.fs_type = "cephfs"
        if interface_type == constants.CEPHBLOCKPOOL:
            self.snap_yaml = constants.CSI_RBD_SNAPSHOT_YAML
            self.fs_type = "rbd"
            self.sc = constants.DEFAULT_VOLUMESNAPSHOTCLASS_RBD
        with open(self.snap_yaml, "r") as stream:
            try:
                self.snap_templ = yaml.safe_load(stream)
            except yaml.YAMLError as exc:
                # Without the template nothing below can work, so fail here
                # with the real cause instead of a later AttributeError.
                log.error(f"Can not read template yaml file {exc}")
                raise
        self.snap_templ["spec"]["volumeSnapshotClassName"] = self.sc
        self.snap_templ["spec"]["source"][
            "persistentVolumeClaimName"
        ] = self.pvc_obj.name
        log.debug(
            f"Snapshot yaml file : {self.snap_yaml} "
            f"Content of snapshot yaml file {json.dumps(self.snap_templ, indent=4)}"
        )

        self.get_log_names()
        self.build_fio_command()

        self.run()
class OCS(object):
    """
    Base OCSClass

    Wraps the dictionary of a single resource together with an OCP object
    for that resource's kind and namespace, and a temporary yaml file that
    is used for create/apply operations.
    """

    def __init__(self, **kwargs):
        """
        Initializer function

        Args:
            kwargs (dict):
                1) For existing resource, use OCP.reload() to get the
                resource's dictionary and use it to pass as **kwargs
                2) For new resource, use yaml files templates under
                /templates/CSI like:
                obj_dict = load_yaml_to_dict(
                    os.path.join(
                        TEMPLATE_DIR, "some_resource.yaml"
                        )
                    )
        """
        self.data = kwargs
        # NOTE(review): the key read here is 'api_version'; confirm whether
        # callers pass that key or the yaml-style 'apiVersion'.
        self._api_version = self.data.get('api_version')
        self._kind = self.data.get('kind')
        self._namespace = None
        # Always define the name, so the `name` property does not raise
        # AttributeError for data without a 'metadata' section.
        self._name = None
        if 'metadata' in self.data:
            metadata = self.data.get('metadata')
            self._namespace = metadata.get('namespace')
            self._name = metadata.get('name')
        self.ocp = OCP(
            api_version=self._api_version, kind=self.kind,
            namespace=self._namespace
        )
        # reload() re-runs __init__; keep the already created temporary file
        # instead of leaving a new undeleted file (and open handle) behind on
        # every reload.
        if getattr(self, 'temp_yaml', None) is None:
            self.temp_yaml = tempfile.NamedTemporaryFile(
                mode='w+', prefix=self._kind, delete=False
            )

    @property
    def api_version(self):
        return self._api_version

    @property
    def kind(self):
        return self._kind

    @property
    def namespace(self):
        return self._namespace

    @property
    def name(self):
        return self._name

    def reload(self):
        """
        Reloading the OCS instance with the new information from its actual
        data.
        After creating a resource from a yaml file, the actual yaml file is
        being changed and more information about the resource is added.
        """
        self.data = self.get()
        self.__init__(**self.data)

    def get(self, out_yaml_format=True):
        """
        Fetch this resource through the OCP object

        Args:
            out_yaml_format (bool): passed through to OCP.get()

        Returns:
            the value returned by OCP.get() for this resource name
        """
        return self.ocp.get(
            resource_name=self.name, out_yaml_format=out_yaml_format
        )

    def create(self, do_reload=True):
        """
        Dump the resource data to the temporary yaml file and create the
        resource from it

        Args:
            do_reload (bool): reload the instance data after the creation
                (default True, which is the previous behaviour)

        Returns:
            the value returned by OCP.create()
        """
        log.info(f"Adding {self.kind} with name {self.name}")
        templating.dump_dict_to_temp_yaml(self.data, self.temp_yaml.name)
        status = self.ocp.create(yaml_file=self.temp_yaml.name)
        if do_reload:
            self.reload()
        return status

    def delete(self, wait=True):
        """
        Delete this resource through the OCP object

        Args:
            wait (bool): passed through to OCP.delete()

        Returns:
            the value returned by OCP.delete()
        """
        return self.ocp.delete(resource_name=self.name, wait=wait)

    def apply(self, **data):
        """
        Write the given data to the temporary yaml file, apply it and reload
        the instance

        Args:
            data (dict): the resource data to apply

        Raises:
            AssertionError: if OCP.apply() returns a false value
        """
        with open(self.temp_yaml.name, 'w') as yaml_file:
            yaml.dump(data, yaml_file)
        # Call apply outside of an `assert` statement: asserts are removed
        # when Python runs with -O, which would silently skip the apply.
        applied = self.ocp.apply(yaml_file=self.temp_yaml.name)
        if not applied:
            raise AssertionError(f"Failed to apply changes {data}")
        self.reload()

    def add_label(self, label):
        """
        Adds a new label

        Args:
            label (str): New label to be assigned for this pod
                E.g: "label=app='rook-ceph-mds'"

        Returns:
            the value returned by OCP.add_label()
        """
        status = self.ocp.add_label(resource_name=self.name, label=label)
        self.reload()
        return status

    def delete_temp_yaml_file(self):
        """
        Remove the temporary yaml file of this instance from the disk
        """
        utils.delete_file(self.temp_yaml.name)
def test_rbd_based_rwo_pvc(self, reclaim_policy):
    """
    Verifies RBD Based RWO Dynamic PVC creation with Reclaim policy set to
    Delete/Retain

    Steps:
    1. Create Storage Class with reclaimPolicy: Delete/Retain
    2. Create PVC with 'accessModes' 'ReadWriteOnce'
    3. Create two pods using same PVC
    4. Run IO on first pod
    5. Verify second pod is not getting into Running state
    6. Delete first pod
    7. Verify second pod is in Running state
    8. Verify usage of volume in second pod is matching with usage in
       first pod
    9. Run IO on second pod
    10. Delete second pod
    11. Delete PVC
    12. Verify PV associated with deleted PVC is also deleted/released

    Args:
        reclaim_policy (str): the reclaim policy of the Storage Class,
            "Delete" or "Retain"
    """

    def _build_pod(claim_name):
        """Build (without creating) a uniquely named pod using the PVC."""
        pod_data = templating.load_yaml_to_dict(constants.CSI_RBD_POD_YAML)
        pod_data['metadata']['name'] = helpers.create_unique_resource_name(
            'test', 'pod'
        )
        pod_data['metadata']['namespace'] = self.namespace
        pod_data['spec']['volumes'][0]['persistentVolumeClaim'][
            'claimName'
        ] = claim_name
        return Pod(**pod_data)

    def _run_io(pod, ordinal):
        """Run fio on the pod and log the read/write IOPs it reports."""
        log.info(f"Running IO on {ordinal} pod {pod.name}")
        pod.run_io('fs', '1G')
        log.info(f"Waiting for IO results from pod {pod.name}")
        job = pod.get_fio_results().get('jobs')[0]
        log.info("IOPs after FIO:")
        log.info(f"Read: {job.get('read').get('iops')}")
        log.info(f"Write: {job.get('write').get('iops')}")

    def _volume_usage(pod):
        """Return the `df -kh` field that precedes the volume mount point."""
        df_fields = pod.exec_cmd_on_pod(command="df -kh").split()
        return df_fields[df_fields.index('/var/lib/www/html') - 1]

    def _verify_deleted(resource, label, fetch_error):
        """Check that fetching the resource fails with a 'not found' error."""
        try:
            resource.get()
            raise UnexpectedBehaviour(
                f"{label} {resource.name} is not deleted."
            )
        except CommandFailed as exp:
            assert "not found" in str(exp), fetch_error
            log.info(f"{label} {resource.name} is deleted.")

    # Create Storage Class with the requested reclaimPolicy
    sc_obj = helpers.create_storage_class(
        interface_type=constants.CEPHBLOCKPOOL,
        interface_name=self.cbp_obj.name,
        secret_name=self.rbd_secret_obj.name,
        reclaim_policy=reclaim_policy
    )

    # Create PVC with 'accessModes' 'ReadWriteOnce'
    pvc_data = templating.load_yaml_to_dict(constants.CSI_PVC_YAML)
    pvc_data['metadata']['name'] = helpers.create_unique_resource_name(
        'test', 'pvc'
    )
    pvc_data['metadata']['namespace'] = self.namespace
    pvc_data['spec']['storageClassName'] = sc_obj.name
    pvc_data['spec']['accessModes'] = ['ReadWriteOnce']
    pvc_obj = PVC(**pvc_data)
    pvc_obj.create()

    # Create first pod
    log.info(f"Creating two pods which use PVC {pvc_obj.name}")
    pod_obj = _build_pod(pvc_obj.name)
    pod_obj.create()
    assert helpers.wait_for_resource_state(pod_obj, constants.STATUS_RUNNING)

    node_pod1 = pod_obj.get()['spec']['nodeName']

    # Create second pod
    # Try creating pod until it is on a different node than first pod
    for retry in range(1, 6):
        pod_obj2 = _build_pod(pvc_obj.name)
        pod_obj2.create()
        assert helpers.wait_for_resource_state(
            pod_obj2, constants.STATUS_PENDING
        )

        node_pod2 = pod_obj2.get()['spec']['nodeName']
        if node_pod1 != node_pod2:
            break
        log.info(
            "Both pods are on same node. Deleting second pod and "
            f"creating another pod. Retry count:{retry}"
        )
        pod_obj2.delete()
    else:
        # All the attempts ended up on the node of the first pod
        raise UnexpectedBehaviour(
            "Second pod is always created on same node as of first "
            "pod even after trying 5 times."
        )

    # Run IO on first pod
    _run_io(pod_obj, "first")

    # Fetch usage details
    usage = _volume_usage(pod_obj)

    # Verify that second pod is not getting into Running state. Check it
    # for some period of time.
    try:
        assert not pod_obj2.ocp.wait_for_resource(
            condition='Running', resource_name=pod_obj2.name,
        ), "Unexpected: Second pod is in Running state"
    except TimeoutExpiredError:
        log.info(
            f"Verified: Second pod {pod_obj2.name} is not in "
            "Running state"
        )

    # Delete first pod
    pod_obj.delete(wait=True)

    # Verify pod is deleted
    _verify_deleted(pod_obj, "First pod", "Failed to fetch pod details")

    # Wait for second pod to be in Running state
    try:
        pod_obj2.ocp.wait_for_resource(
            condition='Running', resource_name=pod_obj2.name, timeout=180
        )
    except TimeoutExpiredError as exp:
        raise TimeoutExpiredError(
            f"Second pod {pod_obj2.name} is not in Running state "
            "after deleting first pod."
        ) from exp
    log.info(
        f"Second pod {pod_obj2.name} is in Running state after "
        "deleting the first pod."
    )

    # Verify that volume usage in second pod is matching with the usage in
    # first pod
    usage_re = _volume_usage(pod_obj2)
    assert usage_re == usage, (
        "Use percentage in new pod is not matching with old pod"
    )

    # Run IO on second pod
    _run_io(pod_obj2, "second")

    # Delete second pod
    pod_obj2.delete()

    # Verify pod is deleted
    _verify_deleted(pod_obj2, "Second pod", "Failed to fetch pod details")

    # Get PV name
    pvc_obj.reload()
    pv_name = pvc_obj.backed_pv

    # Delete PVC
    pvc_obj.delete()

    # Verify PVC is deleted
    _verify_deleted(pvc_obj, "PVC", "Failed to verify PVC deletion.")

    pv_obj = OCP(
        kind=constants.PV, namespace=self.namespace
    )

    if reclaim_policy == "Delete":
        # Verify PV is deleted
        for pv_info in TimeoutSampler(
            30, 2, pv_obj.get, out_yaml_format=False
        ):
            if pv_name not in pv_info:
                break
            log.warning(
                f"PV {pv_name} exists after deleting PVC {pvc_obj.name}. "
                "Checking again."
            )

        # TODO: Verify PV using ceph toolbox. PV should be deleted.
        # Blocked by bz 1723656

    elif reclaim_policy == "Retain":
        # Wait for PV to be in Released state
        assert pv_obj.wait_for_resource(
            condition='Released', resource_name=pv_name
        )
        log.info(f"PV {pv_name} is in Released state")

        # TODO: Delete PV from backend and verify
        # Blocked by bz 1723656
        pv_obj.delete(resource_name=pv_name)

    # Delete Storage Class
    sc_obj.delete()