def teardown(self):
    """
    Tear down everything this deployment created, in roughly the reverse
    order of creation.
    """
    # Remove the couchbase cluster example, worker secret and operator
    # deployment first.
    for cb_resource in (self.cb_examples, self.cb_worker, self.cb_deploy):
        cb_resource.delete()
    # Drop the rolebinding and serviceaccount backing the operator.
    self.pod_obj.exec_oc_cmd(
        command="delete rolebinding couchbase-operator-rolebinding")
    self.pod_obj.exec_oc_cmd(
        command="delete serviceaccount couchbase-operator")
    self.operator_role.delete()
    self.couchbase_obj.delete()
    # Leave the operator namespace before deleting it.
    switch_to_project('default')
    self.pod_obj.delete_project(constants.COUCHBASE_OPERATOR)
    # Delete the admission-controller pieces created from yaml templates.
    for adm_template in self.admission_parts:
        OCS(**templating.load_yaml(adm_template)).delete()
    # Before this wait was added, teardown would sometimes fail because a
    # leftover couchbase pod was still visible; poll until none remain.
    for remaining_pods in TimeoutSampler(self.WAIT_FOR_TIME, 3,
                                         get_pod_name_by_pattern,
                                         'couchbase', 'default'):
        if not remaining_pods:
            break
    PillowFight.cleanup(self)
    switch_to_default_rook_cluster_project()
def cleanup(self):
    """
    Remove the postgres/pgbench resources and the benchmark operator
    configuration.
    """
    switch_to_project(BMO_NAME)
    log.info("Deleting postgres pods and configuration")
    if self.pgsql_is_setup:
        # Reset the internal deleted flag before each delete (presumably so
        # delete() re-issues the request) -- same behavior as before.
        for pg_resource in (self.pgsql_sset, self.pgsql_cmap,
                            self.pgsql_service):
            pg_resource._is_deleted = False
            pg_resource.delete()
    log.info("Deleting pgbench pods")
    bench_pods = self.get_pgbench_pods()
    postgres_pvcs = self.get_postgres_pvc()
    for bench_pod in bench_pods:
        bench_pod.delete()
        bench_pod.ocp.wait_for_delete(bench_pod.name)
    for claim in postgres_pvcs:
        claim.delete()
        claim.ocp.wait_for_delete(claim.name)
        validate_pv_delete(claim.backed_pv)
    log.info("Deleting benchmark operator configuration")
    BenchmarkOperator.cleanup(self)
def cleanup(self):
    """
    Delete the Cosbench pod, its configmap and the project namespace.
    """
    switch_to_project(constants.COSBENCH_PROJECT)
    logger.info("Deleting Cosbench pod, configmap and namespace")
    for cosbench_resource in (self.cosbench_pod, self.cosbench_config):
        cosbench_resource.delete()
    self.ns_obj.delete_project(self.namespace)
    self.ns_obj.wait_for_delete(resource_name=self.namespace, timeout=90)
def finalizer():
    """
    Delete the 'tiller' project and any temporary helm files left behind.
    """
    project_obj = ocp.OCP(kind="Project", namespace=tiller_namespace)
    # Switch away from the project that is about to be deleted.
    ocp.switch_to_project("openshift-storage")
    log.info(f"Deleting project {tiller_namespace}")
    project_obj.delete_project(project_name=tiller_namespace)
    project_obj.wait_for_delete(resource_name=tiller_namespace)
    # Best-effort removal of the local helm working directory.
    if os.path.isdir(helm_dir):
        exec_cmd(cmd=f"rm -rf {helm_dir}")
def teardown(self):
    """
    Clean up the resources created during the Couchbase deployment.
    """
    # Only delete the resources that setup actually created; the flags are
    # checked first so the attributes are never touched when unset.
    if self.cb_create_cb_secret:
        self.cb_secrets.delete()
    if self.cb_create_cb_cluster:
        self.cb_example.delete()
    if self.cb_create_bucket:
        self.cb_bucket.delete()
    self.subscription_yaml.delete()
    # Leave the operator namespace before deleting it.
    switch_to_project("default")
    self.ns_obj.delete_project(constants.COUCHBASE_OPERATOR)
    self.ns_obj.wait_for_delete(
        resource_name=constants.COUCHBASE_OPERATOR, timeout=90
    )
    PillowFight.cleanup(self)
    switch_to_default_rook_cluster_project()
def teardown(self):
    """
    Delete Couchbase resources in roughly the reverse order in which they
    were created.
    """
    for workload_obj in (self.cb_examples, self.cb_worker, self.cb_deploy):
        workload_obj.delete()
    # Remove the operator rolebinding and serviceaccount.
    for oc_command in (
            "delete rolebinding couchbase-operator-rolebinding -n couchbase-operator-namespace",
            "delete serviceaccount couchbase-operator -n couchbase-operator-namespace",
    ):
        self.pod_obj.exec_oc_cmd(command=oc_command)
    self.operator_role.delete()
    self.couchbase_obj.delete()
    # Leave the operator namespace before deleting it.
    switch_to_project("default")
    self.ns_obj.delete_project(constants.COUCHBASE_OPERATOR)
    self.ns_obj.wait_for_delete(resource_name=constants.COUCHBASE_OPERATOR,
                                timeout=90)
    for adm_obj in self.adm_objects:
        adm_obj.delete()
    # Before this wait was added, teardown would sometimes fail because a
    # leftover couchbase pod was still visible; poll until none remain.
    for leftover in TimeoutSampler(self.WAIT_FOR_TIME, 3,
                                   get_pod_name_by_pattern, "couchbase",
                                   "default"):
        if not leftover:
            break
    PillowFight.cleanup(self)
    switch_to_default_rook_cluster_project()
def setup_cb(self):
    """
    Create the admission controller pieces, the couchbase operator pod and
    the couchbase worker secret.

    Side effects: switches the current project to 'default' and then
    creates and switches to the couchbase operator project; sets several
    attributes on self (up_adm_chk, up_check, admission_pod, couchbase_obj,
    operator_role, serviceaccount, rolebinding, cb_deploy, cb_worker).
    """
    # Create admission controller
    log.info("Create admission controller process for Couchbase")
    switch_to_project('default')
    # OCP handles used later to check pod status in each namespace.
    self.up_adm_chk = OCP(namespace="default")
    self.up_check = OCP(namespace=constants.COUCHBASE_OPERATOR)
    # Create every admission-controller resource from its yaml template.
    for adm_yaml in self.admission_parts:
        adm_data = templating.load_yaml(adm_yaml)
        adm_obj = OCS(**adm_data)
        adm_obj.create()

    # Wait for admission pod to be created
    for adm_pod in TimeoutSampler(self.WAIT_FOR_TIME, 3,
                                  get_pod_name_by_pattern,
                                  'couchbase-operator-admission', 'default'):
        try:
            if self.is_up_and_running(adm_pod[0], self.up_adm_chk):
                self.admission_pod = adm_pod[0]
                break
        except IndexError:
            # get_pod_name_by_pattern returned an empty list -- the pod
            # does not exist yet, keep sampling.
            log.info("Admission pod is not ready yet")

    # Wait for admission pod to be running
    log.info("Waiting for admission pod to be running")
    self.pod_obj.wait_for_resource(
        condition='Running',
        resource_name=self.admission_pod,
        timeout=self.WAIT_FOR_TIME,
        sleep=10,
    )
    self.pod_obj.new_project(constants.COUCHBASE_OPERATOR)
    couchbase_data = templating.load_yaml(constants.COUCHBASE_CRD_YAML)
    self.couchbase_obj = OCS(**couchbase_data)
    self.couchbase_obj.create()
    op_data = templating.load_yaml(constants.COUCHBASE_OPERATOR_ROLE)
    self.operator_role = OCS(**op_data)
    self.operator_role.create()
    self.serviceaccount = OCP(namespace=constants.COUCHBASE_OPERATOR)
    self.serviceaccount.exec_oc_cmd(
        "create serviceaccount couchbase-operator")
    # Scrape the dockercfg secret name out of the raw `oc get secrets`
    # output: everything from 'couchbase-operator-dockercfg' up to the
    # next space is the secret name.
    # NOTE(review): if the secret is missing, find() returns -1 and this
    # silently produces a wrong name -- no guard here; confirm upstream.
    dockercfgs = self.serviceaccount.exec_oc_cmd("get secrets")
    startloc = dockercfgs.find('couchbase-operator-dockercfg')
    newdockerstr = dockercfgs[startloc:]
    endloc = newdockerstr.find(' ')
    dockerstr = newdockerstr[:endloc]
    # Link the dockercfg secret to the operator serviceaccount.
    # (self.secretsadder is presumably an OCP handle set elsewhere --
    # not visible in this block.)
    self.secretsadder.exec_oc_cmd(
        f"secrets link serviceaccount/couchbase-operator secrets/{dockerstr}"
    )
    self.rolebinding = OCP(namespace=constants.COUCHBASE_OPERATOR)
    rolebind_cmd = "".join([
        "create rolebinding couchbase-operator-rolebinding ",
        "--role couchbase-operator ",
        "--serviceaccount couchbase-operator-namespace:couchbase-operator"
    ])
    self.rolebinding.exec_oc_cmd(rolebind_cmd)
    dep_data = templating.load_yaml(constants.COUCHBASE_OPERATOR_DEPLOY)
    self.cb_deploy = OCS(**dep_data)
    self.cb_deploy.create()
    # Wait for couchbase operator pod to be running
    for couchbase_pod in TimeoutSampler(self.WAIT_FOR_TIME, 3,
                                        get_pod_name_by_pattern,
                                        'couchbase-operator',
                                        constants.COUCHBASE_OPERATOR):
        try:
            if self.is_up_and_running(couchbase_pod[0], self.up_check):
                break
        except IndexError:
            log.info("Couchbase operator is not up")
    cb_work = templating.load_yaml(constants.COUCHBASE_WORKER_SECRET)
    self.cb_worker = OCS(**cb_work)
    self.cb_worker.create()
def uninstall_ocs():
    """
    Uninstall the OCS operator from an OpenShift cluster and remove all of
    its settings and dependencies.

    High-level flow:
      1. Delete all volume snapshots and collect OCS-provisioned PVCs.
      2. On ROSA, delete the PVCs, remove the ODF addon and return.
      3. Otherwise detach monitoring/registry/logging from OCS, delete the
         PVCs and the StorageCluster (honouring its cleanup-policy
         annotation), delete the namespace, LSO, the noobaa storage class,
         node labels/taints, leftover PVs, and finally the CRDs.
    """
    import time  # local import: only needed for the cleanup-pod poll delay

    ocp_obj = ocp.OCP()
    log.info("deleting volume snapshots")
    vs_ocp_obj = ocp.OCP(kind=constants.VOLUMESNAPSHOT)
    vs_list = vs_ocp_obj.get(all_namespaces=True)["items"]
    for vs in vs_list:
        vs_obj = ocp.OCP(kind=constants.VOLUMESNAPSHOT,
                         namespace=vs.get("metadata").get("namespace"))
        vs_obj.delete(resource_name=vs.get("metadata").get("name"))

    log.info("querying for OCS PVCs")
    provisioners = constants.OCS_PROVISIONERS
    sc_list = [
        sc for sc in get_all_storageclass()
        if sc.get("provisioner") in provisioners
    ]
    pvc_to_delete = []
    for sc in sc_list:
        # noobaa PVCs are intentionally excluded here.
        pvc_to_delete.extend(
            pvc for pvc in get_all_pvcs_in_storageclass(
                sc.get("metadata").get("name")) if "noobaa" not in pvc.name)

    if config.ENV_DATA["platform"].lower() == constants.ROSA_PLATFORM:
        # ROSA: deleting the ODF addon takes care of the rest.
        log.info("Deleting OCS PVCs")
        for pvc in pvc_to_delete:
            log.info(f"Deleting PVC: {pvc.name}")
            pvc.delete()
        rosa.delete_odf_addon(config.ENV_DATA["cluster_name"])
        return None

    log.info("Removing monitoring stack from OpenShift Container Storage")
    remove_monitoring_stack_from_ocs()

    log.info(
        "Removing OpenShift Container Platform registry from OpenShift Container Storage"
    )
    remove_ocp_registry_from_ocs(config.ENV_DATA["platform"])

    log.info(
        "Removing the cluster logging operator from OpenShift Container Storage"
    )
    try:
        remove_cluster_logging_operator_from_ocs()
    except CommandFailed:
        log.info("No cluster logging found")

    log.info("Deleting OCS PVCs")
    for pvc in pvc_to_delete:
        log.info(f"Deleting PVC: {pvc.name}")
        pvc.delete()

    storage_cluster = ocp.OCP(
        kind=constants.STORAGECLUSTER,
        resource_name=constants.DEFAULT_CLUSTERNAME,
        namespace="openshift-storage",
    )

    log.info("Checking for local storage")
    lso_sc = None
    if check_local_volume_local_volume_set():
        # BUGFIX: this message used to be a bare string literal (a no-op
        # statement); it is now actually logged.
        log.info("Local volume was found. Will be removed later")
        lso_sc = (storage_cluster.get().get("spec").get("storageDeviceSets")[0]
                  .get("dataPVCTemplate").get("spec").get("storageClassName"))

    cleanup_policy = (storage_cluster.get().get("metadata").get(
        "annotations").get("uninstall.ocs.openshift.io/cleanup-policy"))

    log.info("Deleting storageCluster object")
    storage_cluster.delete(resource_name=constants.DEFAULT_CLUSTERNAME)

    if cleanup_policy == "delete":
        log.info("Cleanup policy set to delete. checking cleanup pods")
        cleanup_pods = [
            pod for pod in get_all_pods() if "cluster-cleanup-job" in pod.name
        ]
        for pod in cleanup_pods:
            while pod.get().get("status").get("phase") != "Succeeded":
                log.info(f"waiting for cleanup pod {pod.name} to complete")
                # BUGFIX: the original constructed a TimeoutSampler here
                # without iterating it -- a no-op that made this a zero-delay
                # busy loop hammering the API server. Sleep between polls.
                time.sleep(30)
            log.info(f"Cleanup pod {pod.name} completed successfully ")
        # no need to confirm var/vib/rook was deleted from nodes if all
        # cleanup pods are completed.
    else:
        log.info("Cleanup policy set to retain. skipping nodes cleanup")

    log.info("Deleting openshift-storage namespace")
    ocp_obj.delete_project(constants.OPENSHIFT_STORAGE_NAMESPACE)
    ocp_obj.wait_for_delete(constants.OPENSHIFT_STORAGE_NAMESPACE)
    switch_to_project(constants.DEFAULT_NAMESPACE)

    # step 10: TODO remove crypto from nodes.
    """for node in storage_node_list:
    log.info(f"removing encryption from {node}")
    ocp_obj.exec_oc_debug_cmd(node=node, cmd_list=[])"""

    if lso_sc is not None:
        log.info("Removing LSO")
        try:
            uninstall_lso(lso_sc)
        except Exception as e:
            # best-effort: LSO removal failure should not abort the rest
            log.info(f"LSO removal failed.{e}")

    log.info("deleting noobaa storage class")
    noobaa_sc = ocp.OCP(kind=constants.STORAGECLASS)
    noobaa_sc.delete(resource_name=constants.NOOBAA_SC)

    nodes = get_all_nodes()
    node_objs = get_node_objs(nodes)

    log.info("Unlabeling storage nodes")
    # Trailing "-" on a label/taint removes it.
    label_nodes(nodes=node_objs,
                label=constants.OPERATOR_NODE_LABEL[:-3] + "-")
    label_nodes(nodes=node_objs, label=constants.TOPOLOGY_ROOK_LABEL + "-")

    log.info("Removing taints from storage nodes")
    taint_nodes(nodes=nodes, taint_label=constants.OPERATOR_NODE_TAINT + "-")

    log.info("Deleting remaining OCS PVs (if there are any)")
    try:
        rbd_pv = ocp.OCP(kind=constants.PV,
                         resource_name="ocs-storagecluster-ceph-rbd")
        fs_pv = ocp.OCP(kind=constants.PV,
                        resource_name="ocs-storagecluster-cephfs")
        rbd_pv.delete()
        fs_pv.delete()
        log.info("OCS PVs deleted")
    except Exception as e:
        log.info(f"OCS PV(s) not found. {e}")

    log.info("Removing CRDs")
    crd_list = [
        "backingstores.noobaa.io",
        "bucketclasses.noobaa.io",
        "cephblockpools.ceph.rook.io",
        "cephclusters.ceph.rook.io",
        "cephfilesystems.ceph.rook.io",
        "cephnfses.ceph.rook.io",
        "cephobjectstores.ceph.rook.io",
        "cephobjectstoreusers.ceph.rook.io",
        "noobaas.noobaa.io",
        "ocsinitializations.ocs.openshift.io",
        "storageclusters.ocs.openshift.io",
        "cephclients.ceph.rook.io",
        "cephobjectrealms.ceph.rook.io",
        "cephobjectzonegroups.ceph.rook.io",
        "cephobjectzones.ceph.rook.io",
        "cephrbdmirrors.ceph.rook.io",
    ]
    for crd in crd_list:
        try:
            ocp_obj.exec_oc_cmd(f"delete crd {crd} --timeout=300m")
        except Exception:
            log.info(f"crd {crd} was not found")
def test_monitor_recovery(
    self,
    dc_pod_factory,
    mcg_obj,
    bucket_factory,
):
    """
    Verifies Monitor recovery procedure as per:
    https://access.redhat.com/documentation/en-us/red_hat_openshift_container_storage/4.8/html/troubleshooting_openshift_container_storage/restoring-the-monitor-pods-in-openshift-container-storage_rhocs

    Args:
        dc_pod_factory: fixture that creates app pods (used to verify IO
            after recovery)
        mcg_obj: MCG wrapper used for the S3 get/put verification calls
        bucket_factory: fixture that creates object buckets

    The test corrupts all ceph monitors, rebuilds the mon store from the
    OSDs, recovers CephFS and MCG, then verifies data integrity (md5sums
    of pre-existing files, S3 GetObject) and that new workloads still run.
    """
    # Initialize mon recovery class
    mon_recovery = MonitorRecovery()

    logger.info("Corrupting ceph monitors by deleting store.db")
    corrupt_ceph_monitors()

    logger.info("Backing up all the deployments")
    mon_recovery.backup_deployments()
    dep_revert, mds_revert = mon_recovery.deployments_to_revert()

    logger.info("Starting the monitor recovery procedure")
    logger.info("Scaling down rook and ocs operators")
    mon_recovery.scale_rook_ocs_operators(replica=0)

    logger.info(
        "Preparing script and patching OSDs to remove LivenessProbe and sleep to infinity"
    )
    mon_recovery.prepare_monstore_script()
    mon_recovery.patch_sleep_on_osds()
    switch_to_project(constants.OPENSHIFT_STORAGE_NAMESPACE)

    logger.info("Getting mon-store from OSDs")
    mon_recovery.run_mon_store()

    logger.info("Patching MONs to sleep infinitely")
    mon_recovery.patch_sleep_on_mon()

    logger.info("Updating initial delay on all monitors")
    update_mon_initial_delay()

    logger.info("Generating monitor map command using the IPs")
    mon_map_cmd = generate_monmap_cmd()

    logger.info("Getting ceph keyring from ocs secrets")
    mon_recovery.get_ceph_keyrings()

    logger.info("Rebuilding Monitors to recover store db")
    mon_recovery.monitor_rebuild(mon_map_cmd)

    logger.info("Reverting mon, osd and mgr deployments")
    mon_recovery.revert_patches(dep_revert)

    logger.info("Scaling back rook and ocs operators")
    mon_recovery.scale_rook_ocs_operators(replica=1)

    # CephFS recovery requires the operators down again while the MDS
    # deployments are patched and the filesystem is reset.
    logger.info("Recovering CephFS")
    mon_recovery.scale_rook_ocs_operators(replica=0)
    logger.info(
        "Patching MDSs to remove LivenessProbe and setting sleep to infinity"
    )
    mon_recovery.patch_sleep_on_mds()
    logger.info("Resetting the fs")
    ceph_fs_recovery()
    logger.info("Reverting MDS deployments")
    mon_recovery.revert_patches(mds_revert)
    logger.info("Scaling back rook and ocs operators")
    mon_recovery.scale_rook_ocs_operators(replica=1)
    logger.info("Recovering mcg by re-spinning the pods")
    recover_mcg()
    remove_global_id_reclaim()
    # Respin the pre-existing app pods so they remount after recovery.
    for pod_obj in self.dc_pods:
        pod_obj.delete(force=True)
    new_md5_sum = []
    logger.info("Verifying md5sum of files after recovery")
    for pod_obj in get_spun_dc_pods(self.dc_pods):
        pod_obj.ocp.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            resource_name=pod_obj.name,
            timeout=600,
            sleep=10,
        )
        new_md5_sum.append(pod.cal_md5sum(pod_obj, self.filename))
    logger.info(f"Md5sum calculated after recovery: {new_md5_sum}")
    # Counter comparison: same multiset of checksums, order-independent.
    if collections.Counter(new_md5_sum) == collections.Counter(self.md5sum):
        logger.info(
            f"Verified: md5sum of {self.filename} on pods matches with the original md5sum"
        )
    else:
        assert False, f"Data corruption found {new_md5_sum} and {self.md5sum}"
    logger.info("Getting object after recovery")
    assert bucket_utils.s3_get_object(
        s3_obj=mcg_obj,
        bucketname=self.bucket_name,
        object_key=self.object_key,
    ), "Failed: GetObject"

    # New pvc, dc pods, obcs -- verify fresh workloads also run post-recovery.
    new_dc_pods = [
        dc_pod_factory(
            interface=constants.CEPHBLOCKPOOL,
        ),
        dc_pod_factory(
            interface=constants.CEPHFILESYSTEM,
        ),
    ]
    for pod_obj in new_dc_pods:
        pod_obj.exec_cmd_on_pod(command=self.dd_cmd)
    logger.info("Creating new bucket and write object")
    new_bucket = bucket_factory(interface="OC")[0].name
    assert bucket_utils.s3_put_object(
        s3_obj=mcg_obj,
        bucketname=new_bucket,
        object_key=self.object_key,
        data=self.object_data,
    ), "Failed: PutObject"
    wait_for_storage_pods()
    logger.info("Archiving the ceph crash warnings")
    tool_pod = get_ceph_tools_pod()
    tool_pod.exec_ceph_cmd(ceph_cmd="ceph crash archive-all", format=None)
    self.sanity_helpers.health_check(tries=10)
def test_couchbase_workload_simple(self, pillowfight):
    """
    Deploy a Couchbase server and pillowfight workload using operator

    The couchbase workers do not come up unless there is an admission
    controller running.  The admission controller is started from the
    default project prior to bringing up the operator.  Secrets,
    rolebindings and serviceaccounts need to also be generated.

    Once the couchbase operator is running, we need to wait for the
    three worker pods to also be up.  Then a pillowfight task is
    started.

    After the pillowfight task has finished, the log is collected and
    analyzed.

    Args:
        pillowfight: fixture providing run_pillowfights()/analyze_all()

    Raises:
        Exception: If pillowfight results indicate that a minimum
            performance level is not reached (1 second response time,
            less than 1000 ops per second)
    """
    # Create admission controller
    log.info("Create admission controller process for Couchbase")
    switch_to_project('default')
    # OCP handles used to check pod status in each namespace.
    self.up_adm_chk = OCP(namespace="default")
    self.up_check = OCP(namespace=self.COUCHBASE_OPERATOR)
    for adm_yaml in self.admission_parts:
        adm_data = templating.load_yaml(adm_yaml)
        adm_obj = OCS(**adm_data)
        adm_obj.create()
    # Wait for admission pod to be created
    for adm_pod in TimeoutSampler(
        self.WAIT_FOR_TIME,
        3,
        get_pod_name_by_pattern,
        'couchbase-operator-admission',
        'default'
    ):
        try:
            if self.is_up_and_running(adm_pod[0], self.up_adm_chk):
                self.admission_pod = adm_pod[0]
                break
        except IndexError:
            # empty pod list -- pod not created yet, keep sampling
            log.info("Admission pod is not ready yet")

    # Wait for admission pod to be running
    log.info("Waiting for admission pod to be running")
    self.pod_obj.wait_for_resource(
        condition='Running',
        resource_name=self.admission_pod,
        timeout=self.WAIT_FOR_TIME,
        sleep=10,
    )
    self.pod_obj.new_project(self.COUCHBASE_OPERATOR)
    couchbase_data = templating.load_yaml(
        constants.COUCHBASE_CRD_YAML
    )
    self.couchbase_obj = OCS(**couchbase_data)
    self.couchbase_obj.create()
    op_data = templating.load_yaml(constants.COUCHBASE_OPERATOR_ROLE)
    self.operator_role = OCS(**op_data)
    self.operator_role.create()
    self.serviceaccount = OCP(namespace=self.COUCHBASE_OPERATOR)
    self.serviceaccount.exec_oc_cmd(
        "create serviceaccount couchbase-operator"
    )
    # Scrape the dockercfg secret name from the raw `oc get secrets`
    # output: from 'couchbase-operator-dockercfg' up to the next space.
    dockercfgs = self.serviceaccount.exec_oc_cmd("get secrets")
    startloc = dockercfgs.find('couchbase-operator-dockercfg')
    newdockerstr = dockercfgs[startloc:]
    endloc = newdockerstr.find(' ')
    dockerstr = newdockerstr[:endloc]
    self.add_serviceaccount_secret("couchbase-operator", dockerstr)
    self.add_serviceaccount_secret("default", dockerstr)
    self.rolebinding = OCP(namespace=self.COUCHBASE_OPERATOR)
    rolebind_cmd = "".join([
        "create rolebinding couchbase-operator-rolebinding ",
        "--role couchbase-operator ",
        "--serviceaccount couchbase-operator-namespace:couchbase-operator"
    ])
    self.rolebinding.exec_oc_cmd(rolebind_cmd)
    dep_data = templating.load_yaml(constants.COUCHBASE_OPERATOR_DEPLOY)
    self.cb_deploy = OCS(**dep_data)
    self.cb_deploy.create()
    # Wait for couchbase operator pod to be running
    for couchbase_pod in TimeoutSampler(
        self.WAIT_FOR_TIME,
        3,
        get_pod_name_by_pattern,
        'couchbase-operator',
        self.COUCHBASE_OPERATOR
    ):
        try:
            if self.is_up_and_running(couchbase_pod[0], self.up_check):
                break
        except IndexError:
            log.info("Couchbase operator is not up")
    cb_work = templating.load_yaml(constants.COUCHBASE_WORKER_SECRET)
    self.cb_worker = OCS(**cb_work)
    self.cb_worker.create()
    cb_example = templating.load_yaml(constants.COUCHBASE_WORKER_EXAMPLE)
    self.cb_examples = OCS(**cb_example)
    self.cb_examples.create()
    # Wait for last of three workers to be running.
    for cb_wrk_pod in TimeoutSampler(
        self.WAIT_FOR_TIME,
        3,
        get_pod_name_by_pattern,
        'cb-example-0002',
        self.COUCHBASE_OPERATOR
    ):
        try:
            if self.is_up_and_running(cb_wrk_pod[0], self.up_check):
                # once last pod is up, make sure all are ready
                counter = 0
                for wpodn in range(0, 3):
                    cbw_pod = f"cb-example-{wpodn:04}"
                    if self.is_up_and_running(cbw_pod, self.up_check):
                        counter += 1
                if counter == 3:
                    break
        except IndexError:
            log.info("Couchbase workers are not up")
    pillowfight.run_pillowfights()
    pillowfight.analyze_all()
def uninstall_ocs():
    """
    Uninstall the OCS operator from an OpenShift cluster and remove all of
    its settings and dependencies.

    Flow: collect OCS-provisioned PVCs, detach monitoring/registry/logging
    from OCS, delete PVCs and the StorageCluster, remove the CRDs and the
    openshift-storage namespace, clean rook data off the nodes, remove LSO
    and the OCS storage classes, and unlabel the storage nodes.
    """
    ocp_obj = ocp.OCP()
    provisioners = constants.OCS_PROVISIONERS

    # List the storage classes backed by an OCS provisioner
    sc_list = [
        sc for sc in get_all_storageclass()
        if sc.get('provisioner') in provisioners
    ]

    # Query for PVCs and OBCs that are using the storage class provisioners
    # listed in the previous step (noobaa PVCs are excluded on purpose).
    pvc_to_delete = []
    for sc in sc_list:
        pvc_to_delete.extend(pvc for pvc in get_all_pvcs_in_storageclass(
            sc.get('metadata').get('name')) if 'noobaa' not in pvc.name)

    log.info("Removing monitoring stack from OpenShift Container Storage")
    remove_monitoring_stack_from_ocs()

    log.info(
        "Removing OpenShift Container Platform registry from OpenShift Container Storage"
    )
    remove_ocp_registry_from_ocs(config.ENV_DATA['platform'])

    log.info(
        "Removing the cluster logging operator from OpenShift Container Storage"
    )
    try:
        remove_cluster_logging_operator_from_ocs()
    except CommandFailed:
        log.info("No cluster logging found")

    log.info("Deleting pvcs")
    for pvc in pvc_to_delete:
        log.info(f"Deleting pvc: {pvc.name}")
        pvc.delete()

    storage_cluster = ocp.OCP(kind=constants.STORAGECLUSTER,
                              resource_name=constants.DEFAULT_CLUSTERNAME,
                              namespace='openshift-storage')

    log.info("Checking for local storage")
    lso_sc = None
    if check_local_volume():
        # BUGFIX: this message used to be a bare string literal (a no-op
        # statement); it is now actually logged.
        log.info("Local volume was found. Will be removed later")
        # Remember the LSO-backed storage class before deleting the cluster.
        lso_sc = storage_cluster.get().get('spec').get('storageDeviceSets')[
            0].get('dataPVCTemplate').get('spec').get('storageClassName')

    log.info("Deleting storageCluster object")
    storage_cluster.delete(resource_name=constants.DEFAULT_CLUSTERNAME)

    log.info("Removing CRDs")
    crd_list = [
        'backingstores.noobaa.io', 'bucketclasses.noobaa.io',
        'cephblockpools.ceph.rook.io', 'cephfilesystems.ceph.rook.io',
        'cephnfses.ceph.rook.io', 'cephobjectstores.ceph.rook.io',
        'cephobjectstoreusers.ceph.rook.io', 'noobaas.noobaa.io',
        'ocsinitializations.ocs.openshift.io',
        'storageclusterinitializations.ocs.openshift.io',
        'storageclusters.ocs.openshift.io', 'cephclusters.ceph.rook.io'
    ]
    for crd in crd_list:
        ocp_obj.exec_oc_cmd(f"delete crd {crd} --timeout=300m")

    log.info("Deleting openshift-storage namespace")
    ocp_obj.delete_project('openshift-storage')
    ocp_obj.wait_for_delete('openshift-storage')
    switch_to_project("default")

    log.info("Removing rook directory from nodes")
    nodes_list = get_labeled_nodes(constants.OPERATOR_NODE_LABEL)
    for node in nodes_list:
        log.info(f"Removing rook from {node}")
        ocp_obj.exec_oc_debug_cmd(node=node,
                                  cmd_list=["rm -rf /var/lib/rook"])

    log.info("Removing LSO ")
    if lso_sc is not None:
        uninstall_lso(lso_sc)

    log.info(
        "Delete the storage classes with an openshift-storage provisioner list"
    )
    for storage_class in sc_list:
        log.info(
            f"Deleting storage class {storage_class.get('metadata').get('name')}"
        )
        sc_obj = ocp.OCP(kind=constants.STORAGECLASS)
        sc_obj.delete(resource_name=storage_class.get('metadata').get('name'))

    log.info("Unlabeling storage nodes")
    nodes_list = get_all_nodes()
    for node in nodes_list:
        node_obj = ocp.OCP(kind=constants.NODE, resource_name=node)
        # Trailing "-" on a label removes it.
        node_obj.add_label(resource_name=node,
                           label=constants.OPERATOR_NODE_LABEL[:-3] + '-')
        node_obj.add_label(resource_name=node,
                           label=constants.TOPOLOGY_ROOK_LABEL + '-')

    log.info("OCS was removed successfully from cluster ")
def test_pvc_snapshot_performance_multiple_files(self, file_size, files,
                                                 threads, interface):
    """
    Run SmallFile Workload and the take snapshot.
    test will run with 1M of file on the volume - total data set is the
    same for all tests, ~30GiB, and then take snapshot and measure the
    time it takes.
    the test will run 3 time to check consistency.

    Args:
        file_size (int): the size of the file to be create - in KiB
        files (int): number of files each thread will create
        threads (int): number of threads will be used in the workload
        interface (str): the volume interface that will be used
            CephBlockPool / CephFileSystem

    Raises:
        TimeoutError : in case of creation files take too long time
            more then 2 Hours
    """
    # Deploying elastic-search server in the cluster for use by the
    # SmallFiles workload, since it is mandatory for the workload.
    # This is deployed once for all test iterations and will be deleted
    # in the end of the test.
    self.es = ElasticSearch()

    # Loading the main template yaml file for the benchmark and update some
    # fields with new values
    sf_data = templating.load_yaml(constants.SMALLFILE_BENCHMARK_YAML)

    if interface == constants.CEPHBLOCKPOOL:
        storageclass = constants.DEFAULT_STORAGECLASS_RBD
    else:
        storageclass = constants.DEFAULT_STORAGECLASS_CEPHFS
    log.info(f"Using {storageclass} Storageclass")

    # Setting up the parameters for this test
    sf_data["spec"]["workload"]["args"]["samples"] = 1
    sf_data["spec"]["workload"]["args"]["operation"] = ["create"]
    sf_data["spec"]["workload"]["args"]["file_size"] = file_size
    sf_data["spec"]["workload"]["args"]["files"] = files
    sf_data["spec"]["workload"]["args"]["threads"] = threads
    sf_data["spec"]["workload"]["args"]["storageclass"] = storageclass
    # Point the workload at the in-cluster elasticsearch instance.
    sf_data["spec"]["elasticsearch"] = {
        "url": f"http://{self.es.get_ip()}:{self.es.get_port()}"
    }
    """
    Calculating the size of the volume that need to be test, it should
    be at least twice in the size then the size of the files, and at
    least 100Gi.
    Since the file_size is in Kb and the vol_size need to be in Gb, more
    calculation is needed.
    """
    total_files = int(files * threads)
    total_data = int(files * threads * file_size / constants.GB2KB)
    data_set = int(total_data * 3)  # calculate data with replica
    vol_size = data_set if data_set >= 100 else 100
    sf_data["spec"]["workload"]["args"]["storagesize"] = f"{vol_size}Gi"

    environment = get_environment_info()
    if not environment["user"] == "":
        sf_data["spec"]["test_user"] = environment["user"]
    else:
        # since full results object need this parameter, initialize it from CR file
        environment["user"] = sf_data["spec"]["test_user"]

    sf_data["spec"]["clustername"] = environment["clustername"]
    log.debug(f"The smallfile yaml file is {sf_data}")

    # Deploy the benchmark-operator, so we can use the SmallFiles workload
    # to fill up the volume with files, and switch to the benchmark-operator namespace.
    log.info("Deploy the benchmark-operator")
    self.deploy_benchmark_operator()
    switch_to_project(BMO_NAME)

    # Per-iteration creation/csi-creation time dicts are collected here.
    all_results = []

    self.results_path = get_full_test_logs_path(cname=self)
    log.info(f"Logs file path name is : {self.full_log_path}")

    # Produce ES report
    # Collecting environment information
    self.get_env_info()

    # Initialize the results doc file.
    self.full_results = self.init_full_results(
        ResultsAnalyse(
            self.uuid,
            self.crd_data,
            self.full_log_path,
            "pvc_snapshot_perf_multiple_files",
        ))
    self.full_results.add_key("file_size_inKB", file_size)
    self.full_results.add_key("threads", threads)
    self.full_results.add_key("interface", interface)
    for test_num in range(self.tests_numbers):
        test_results = {"creation_time": None, "csi_creation_time": None}

        # deploy the smallfile workload
        log.info("Running SmallFile bench")
        sf_obj = OCS(**sf_data)
        sf_obj.create()

        # wait for benchmark pods to get created - takes a while
        for bench_pod in TimeoutSampler(
                240,
                10,
                get_pod_name_by_pattern,
                "smallfile-client",
                BMO_NAME,
        ):
            try:
                if bench_pod[0] is not None:
                    small_file_client_pod = bench_pod[0]
                    break
            except IndexError:
                # empty pod list -- client pod not created yet
                log.info("Bench pod not ready yet")

        bench_pod = OCP(kind="pod", namespace=BMO_NAME)
        log.info("Waiting for SmallFile benchmark to Run")
        assert bench_pod.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            resource_name=small_file_client_pod,
            sleep=30,
            timeout=600,
        )
        # Initialize the pvc_name variable so it will not be in loop scope only.
        pvc_name = ""
        # Find the PVC mounted by the benchmark client pod.
        for item in bench_pod.get()["items"]:
            if item.get("metadata").get("name") == small_file_client_pod:
                for volume in item.get("spec").get("volumes"):
                    if "persistentVolumeClaim" in volume:
                        pvc_name = volume["persistentVolumeClaim"][
                            "claimName"]
                        break
        log.info(f"Benchmark PVC name is : {pvc_name}")
        # Creation of 1M files on CephFS can take a lot of time
        # Poll the client pod log every 30s, up to 2 hours.
        timeout = 7200
        while timeout >= 0:
            logs = bench_pod.get_logs(name=small_file_client_pod)
            if "RUN STATUS DONE" in logs:
                break
            timeout -= 30
            if timeout == 0:
                raise TimeoutError(
                    "Timed out waiting for benchmark to complete")
            time.sleep(30)
        log.info(f"Smallfile test ({test_num + 1}) finished.")

        # Taking snapshot of the PVC (which contain files)
        snap_name = pvc_name.replace("claim", "snapshot-")
        log.info(f"Taking snapshot of the PVC {pvc_name}")
        log.info(f"Snapshot name : {snap_name}")

        start_time = datetime.datetime.utcnow().strftime(
            "%Y-%m-%dT%H:%M:%SZ")

        test_results["creation_time"] = self.measure_create_snapshot_time(
            pvc_name=pvc_name,
            snap_name=snap_name,
            namespace=BMO_NAME,
            interface=interface,
            start_time=start_time,
        )
        log.info(
            f"Snapshot with name {snap_name} and id {self.snap_uid} creation time is"
            f' {test_results["creation_time"]} seconds')

        test_results[
            "csi_creation_time"] = performance_lib.measure_csi_snapshot_creation_time(
                interface=interface,
                snapshot_id=self.snap_uid,
                start_time=start_time)
        log.info(
            f"Snapshot with name {snap_name} and id {self.snap_uid} csi creation time is"
            f' {test_results["csi_creation_time"]} seconds')
        all_results.append(test_results)

        # Delete the smallfile workload - which will delete also the PVC
        log.info("Deleting the smallfile workload")
        if sf_obj.delete(wait=True):
            log.info("The smallfile workload was deleted successfully")

        # Delete VolumeSnapshots
        log.info("Deleting the snapshots")
        if self.snap_obj.delete(wait=True):
            log.info("The snapshot deleted successfully")
        log.info("Verify (and wait if needed) that ceph health is OK")
        ceph_health_check(tries=45, delay=60)

        # Sleep for 1 Min. between test samples
        time.sleep(60)

    # Cleanup the elasticsearch instance.
    log.info("Deleting the elastic-search instance")
    self.es.cleanup()

    creation_times = [t["creation_time"] for t in all_results]
    avg_c_time = statistics.mean(creation_times)
    csi_creation_times = [t["csi_creation_time"] for t in all_results]
    avg_csi_c_time = statistics.mean(csi_creation_times)

    # data_set includes the x3 replica factor; report the raw dataset size.
    t_dateset = int(data_set / 3)

    log.info(f"Full test report for {interface}:")
    log.info(f"Test ran {self.tests_numbers} times, "
             f"All snapshot creation results are {creation_times} seconds")
    log.info(
        f"The average snapshot creation time is : {avg_c_time} seconds")
    log.info(f"Test ran {self.tests_numbers} times, "
             f"All snapshot csi creation results are {csi_creation_times}")
    log.info(
        f"The average csi snapshot creation time is : {avg_csi_c_time}")
    log.info(f"Number of Files on the volume : {total_files:,}, "
             f"Total dataset : {t_dateset} GiB")

    self.full_results.add_key("avg_snapshot_creation_time_insecs",
                              avg_c_time)
    self.full_results.all_results["total_files"] = total_files
    self.full_results.all_results["total_dataset"] = t_dateset
    self.full_results.all_results["creation_time"] = creation_times
    self.full_results.all_results["csi_creation_time"] = csi_creation_times

    # Write the test results into the ES server
    log.info("writing results to elastic search server")
    if self.full_results.es_write():
        res_link = self.full_results.results_link()
        # write the ES link to the test results in the test log.
        log.info(f"The result can be found at : {res_link}")

        # Create text file with results of all subtest
        self.write_result_to_file(res_link)
def test_pvc_snapshot_performance_multiple_files(self, file_size: int,
                                                 files: int, threads: int,
                                                 interface: str) -> None:
    """
    Run a SmallFile workload and then take a snapshot of its PVC.

    The test creates ~1M files on the volume - the total data set is the
    same for all parametrizations (~30GiB) - then takes a snapshot and
    measures how long the snapshot creation takes. It repeats this
    ``self.tests_numbers`` times to check consistency, and pushes the
    aggregated results to an Elasticsearch server.

    Args:
        file_size (int): the size of each file to create - in KiB
        files (int): number of files each thread will create
        threads (int): number of threads to be used in the workload
        interface (str): the volume interface that will be used -
                         CephBlockPool / CephFileSystem

    Raises:
        TimeoutError: in case the file creation takes too long
                      (more than 2 hours)

    """
    # Loading the main template yaml file for the benchmark and update some
    # fields with new values
    sf_data = templating.load_yaml(constants.SMALLFILE_BENCHMARK_YAML)

    # Deploying elastic-search server in the cluster for use by the
    # SmallFiles workload, since it is mandatory for the workload.
    # This is deployed once for all test iterations and will be deleted
    # in the end of the test.
    if config.PERF.get("deploy_internal_es"):
        self.es = ElasticSearch()
        sf_data["spec"]["elasticsearch"] = {
            "url": f"http://{self.es.get_ip()}:{self.es.get_port()}"
        }
    else:
        # No internal ES deployment requested - fall back to a
        # pre-existing ES server from the PERF configuration.
        if config.PERF.get("internal_es_server") == "":
            # No ES server available at all - the test silently skips.
            # NOTE(review): a pytest.skip here would be more visible than
            # a bare return - confirm this is the intended behavior.
            self.es = None
            return
        else:
            # self.es is a plain dict here (vs. an ElasticSearch object
            # in the branch above); the cleanup at the end of the test
            # distinguishes the two cases with isinstance().
            self.es = {
                "server": config.PERF.get("internal_es_server"),
                "port": config.PERF.get("internal_es_port"),
                "url": f"http://{config.PERF.get('internal_es_server')}:{config.PERF.get('internal_es_port')}",
            }
            # verify that the connection to the elasticsearch server is OK
            if not super(TestPvcSnapshotPerformance, self).es_connect():
                self.es = None
                log.error("ElasticSearch doesn't exist ! The test cannot run")
                return
            sf_data["spec"]["elasticsearch"] = {"url": self.es["url"]}

    # Map the volume interface to the matching default storageclass.
    if interface == constants.CEPHBLOCKPOOL:
        storageclass = constants.DEFAULT_STORAGECLASS_RBD
    else:
        storageclass = constants.DEFAULT_STORAGECLASS_CEPHFS
    log.info(f"Using {storageclass} Storageclass")

    # Setting up the parameters for this test
    sf_data["spec"]["workload"]["args"]["samples"] = 1
    sf_data["spec"]["workload"]["args"]["operation"] = ["create"]
    sf_data["spec"]["workload"]["args"]["file_size"] = file_size
    sf_data["spec"]["workload"]["args"]["files"] = files
    sf_data["spec"]["workload"]["args"]["threads"] = threads
    sf_data["spec"]["workload"]["args"]["storageclass"] = storageclass
    """
    Calculating the size of the volume that need to be test, it should
    be at least twice in the size then the size of the files, and at
    least 100Gi. Since the file_size is in Kb and the vol_size need to
    be in Gb, more calculation is needed.
    """
    total_files = int(files * threads)
    total_data = int(files * threads * file_size / constants.GB2KB)
    data_set = int(total_data * 3)  # calculate data with replica
    # The volume must hold the replicated data set, with a 100Gi floor.
    vol_size = data_set if data_set >= 100 else 100
    sf_data["spec"]["workload"]["args"]["storagesize"] = f"{vol_size}Gi"

    environment = get_environment_info()
    if not environment["user"] == "":
        sf_data["spec"]["test_user"] = environment["user"]
    else:
        # since full results object need this parameter, initialize it from CR file
        environment["user"] = sf_data["spec"]["test_user"]

    sf_data["spec"]["clustername"] = environment["clustername"]
    log.debug(f"The smallfile yaml file is {sf_data}")

    # Deploy the benchmark-operator, so we can use the SmallFiles workload
    # to fill up the volume with files, and switch to the benchmark-operator namespace.
    log.info("Deploy the benchmark-operator")
    self.deploy_benchmark_operator()
    switch_to_project(BMO_NAME)

    # Accumulates one {"creation_time", "csi_creation_time"} dict per iteration.
    all_results = []

    # Produce ES report
    # Collecting environment information
    self.get_env_info()

    # Initialize the results doc file.
    self.full_results = self.init_full_results(
        ResultsAnalyse(
            self.uuid,
            self.crd_data,
            self.full_log_path,
            "pvc_snapshot_perf_multiple_files",
        ))
    self.full_results.add_key("file_size_inKB", file_size)
    self.full_results.add_key("threads", threads)
    self.full_results.add_key("interface", interface)
    for test_num in range(self.tests_numbers):
        test_results = {"creation_time": None, "csi_creation_time": None}

        # deploy the smallfile workload
        self.crd_data = sf_data
        self.client_pod_name = "smallfile-client"
        self.deploy_and_wait_for_wl_to_start(timeout=240)

        # Initialize the pvc_name variable so it will not be in loop scope only.
        # NOTE(review): assumes the benchmark PVC is the only (first) PVC in
        # the benchmark-operator namespace - confirm for parallel runs.
        pvc_name = (OCP(kind="pvc", namespace=BMO_NAME).get().get("items")
                    [0].get("metadata").get("name"))
        log.info(f"Benchmark PVC name is : {pvc_name}")
        self.wait_for_wl_to_finish(sleep=30)

        # Taking snapshot of the PVC (which contain files)
        snap_name = pvc_name.replace("claim", "snapshot-")
        log.info(f"Taking snapshot of the PVC {pvc_name}")
        log.info(f"Snapshot name : {snap_name}")

        # Timestamp taken just before snapshot creation; used both for the
        # end-to-end measurement and for the CSI-side log correlation below.
        start_time = self.get_time("csi")

        # End-to-end snapshot creation time (Kubernetes-visible), in seconds.
        # NOTE(review): presumably this also sets self.snap_uid and
        # self.snap_obj, which are used below - confirm in the helper.
        test_results["creation_time"] = self.measure_create_snapshot_time(
            pvc_name=pvc_name,
            snap_name=snap_name,
            namespace=BMO_NAME,
            interface=interface,
            start_time=start_time,
        )
        log.info(
            f"Snapshot with name {snap_name} and id {self.snap_uid} creation time is"
            f' {test_results["creation_time"]} seconds')

        # CSI-driver-internal creation time for the same snapshot, in seconds.
        test_results[
            "csi_creation_time"] = performance_lib.measure_csi_snapshot_creation_time(
                interface=interface,
                snapshot_id=self.snap_uid,
                start_time=start_time)
        log.info(
            f"Snapshot with name {snap_name} and id {self.snap_uid} csi creation time is"
            f' {test_results["csi_creation_time"]} seconds')
        all_results.append(test_results)

        # Delete the smallfile workload - which will delete also the PVC
        log.info("Deleting the smallfile workload")
        if self.benchmark_obj.delete(wait=True):
            log.info("The smallfile workload was deleted successfully")

        # Delete VolumeSnapshots
        log.info("Deleting the snapshots")
        if self.snap_obj.delete(wait=True):
            log.info("The snapshot deleted successfully")

        log.info("Verify (and wait if needed) that ceph health is OK")
        ceph_health_check(tries=45, delay=60)

        # Sleep for 1 Min. between test samples
        time.sleep(60)

    # Cleanup the elasticsearch instance, if needed.
    # (Only when an internal ES instance was deployed above; the
    # external-server case leaves self.es as a plain dict.)
    if isinstance(self.es, ElasticSearch):
        log.info("Deleting the elastic-search instance")
        self.es.cleanup()

    # Aggregate the per-iteration measurements.
    # NOTE(review): if any iteration left a measurement as None,
    # statistics.mean would raise here - confirm the helpers always
    # return a number or raise themselves.
    creation_times = [t["creation_time"] for t in all_results]
    avg_c_time = statistics.mean(creation_times)
    csi_creation_times = [t["csi_creation_time"] for t in all_results]
    avg_csi_c_time = statistics.mean(csi_creation_times)
    # Un-replicated data set size in GiB (data_set above included the
    # 3x replica factor). NOTE(review): "t_dateset" looks like a typo
    # for "t_dataset" - kept as-is for byte compatibility.
    t_dateset = int(data_set / 3)

    log.info(f"Full test report for {interface}:")
    log.info(f"Test ran {self.tests_numbers} times, "
             f"All snapshot creation results are {creation_times} seconds")
    log.info(
        f"The average snapshot creation time is : {avg_c_time} seconds")
    log.info(f"Test ran {self.tests_numbers} times, "
             f"All snapshot csi creation results are {csi_creation_times}")
    log.info(
        f"The average csi snapshot creation time is : {avg_csi_c_time}")
    log.info(f"Number of Files on the volume : {total_files:,}, "
             f"Total dataset : {t_dateset} GiB")

    # Record the aggregated results on the full-results object so they
    # are included in the Elasticsearch report.
    self.full_results.add_key("avg_snapshot_creation_time_insecs",
                              avg_c_time)
    self.full_results.all_results["total_files"] = total_files
    self.full_results.all_results["total_dataset"] = t_dateset
    self.full_results.all_results["creation_time"] = creation_times
    self.full_results.all_results["csi_creation_time"] = csi_creation_times

    # Write the test results into the ES server
    log.info("writing results to elastic search server")
    self.results_path = helpers.get_full_test_logs_path(cname=self)
    if self.full_results.es_write():
        res_link = self.full_results.results_link()
        # write the ES link to the test results in the test log.
        log.info(f"The result can be found at : {res_link}")

        # Create text file with results of all subtest
        self.write_result_to_file(res_link)