def test_ceph_rgw_metrics_after_metrics_exporter_respin(rgw_deployments):
    """
    Check that RGW metrics are still reported through OCP Prometheus once the
    ocs-metrics-exporter pod has been force-deleted and is running again.
    """
    exporter_selector = "app.kubernetes.io/name=ocs-metrics-exporter"

    logger.info("Respin ocs-metrics-exporter pod")
    pod_obj = ocp.OCP(
        kind=constants.POD, namespace=defaults.ROOK_CLUSTER_NAMESPACE)
    exporter_pods = pod_obj.get(selector=exporter_selector)["items"]
    # exactly one exporter pod is expected before the respin
    assert len(exporter_pods) == 1
    OCS(**exporter_pods[0]).delete(force=True)

    logger.info("Wait for ocs-metrics-exporter pod to come up")
    exporter_is_running = pod_obj.wait_for_resource(
        condition="Running",
        selector=exporter_selector,
        resource_count=1,
        timeout=600,
    )
    assert exporter_is_running

    logger.info("Collect RGW metrics")
    prometheus = PrometheusAPI()
    missing_metrics = metrics.get_missing_metrics(
        prometheus, metrics.ceph_rgw_metrics)
    failure_msg = (
        "OCS Monitoring should provide some value(s) for tested rgw metrics, "
        "so that the list of metrics without results is empty.")
    assert missing_metrics == [], failure_msg
def test_basics_rbd(self, test_fixture):
    """
    Basic flow: create a secret, a storage class and a PVC, check that the
    PVC gets Bound, then delete all three in reverse order.

    NOTE(review): despite the ``rbd`` in the name, every template loaded here
    is a CephFS one (``CSI_CEPHFS_*``).
    """
    # Secret: drop the user credentials and fill in the admin ones
    self.cephfs_secret = templating.load_yaml_to_dict(
        constants.CSI_CEPHFS_SECRET_YAML)
    secret_payload = self.cephfs_secret['data']
    del secret_payload['userID']
    del secret_payload['userKey']
    secret_payload['adminKey'] = get_admin_key_from_ceph_tools()
    secret_payload['adminID'] = constants.ADMIN_BASE64
    logging.info(self.cephfs_secret)
    secret = OCS(**self.cephfs_secret)
    secret.create()

    # Storage class: point it at the monitors and the data pool
    self.cephfs_sc = templating.load_yaml_to_dict(
        constants.CSI_CEPHFS_STORAGECLASS_YAML)
    sc_params = self.cephfs_sc['parameters']
    sc_params['monitors'] = self.mons
    sc_params['pool'] = f"{self.fs_data['metadata']['name']}-data0"
    storage_class = OCS(**self.cephfs_sc)
    storage_class.create()

    # PVC: create it and verify that it is bound
    self.cephfs_pvc = templating.load_yaml_to_dict(
        constants.CSI_CEPHFS_PVC_YAML)
    pvc = PVC(**self.cephfs_pvc)
    pvc.create()
    log.info(pvc.status)
    assert 'Bound' in pvc.status

    # Remove the resources in reverse order of creation
    pvc.delete()
    storage_class.delete()
    secret.delete()
def test_basics_cephfs(self):
    """
    Basic flow: create a secret, a storage class and a PVC, check that the
    PVC gets Bound, then delete all three in reverse order.

    NOTE(review): despite the ``cephfs`` in the name, every template loaded
    here is an RBD one (``CSI_RBD_*``).
    """
    # Secret: replace the kubernetes key with the admin key
    self.rbd_secret = templating.load_yaml_to_dict(
        constants.CSI_RBD_SECRET_YAML)
    secret_payload = self.rbd_secret['data']
    del secret_payload['kubernetes']
    secret_payload['admin'] = get_admin_key_from_ceph_tools()
    logging.info(self.rbd_secret)
    secret = OCS(**self.rbd_secret)
    secret.create()

    # Storage class: set the monitors and drop the userid parameter
    self.rbd_sc = templating.load_yaml_to_dict(
        constants.CSI_RBD_STORAGECLASS_YAML)
    sc_params = self.rbd_sc['parameters']
    sc_params['monitors'] = self.mons
    del sc_params['userid']
    storage_class = OCS(**self.rbd_sc)
    storage_class.create()

    # PVC: create it and verify that it is bound
    self.rbd_pvc = templating.load_yaml_to_dict(constants.CSI_RBD_PVC_YAML)
    pvc = PVC(**self.rbd_pvc)
    pvc.create()
    assert 'Bound' in pvc.status

    # Remove the resources in reverse order of creation
    pvc.delete()
    storage_class.delete()
    secret.delete()
def invalid_storageclass(request):
    """
    Create a StorageClass from the ``storageclass.yaml`` template found in
    ``request.param['template_dir']``, overridden with
    ``request.param['values']``, and delete it again once the consumer of
    this generator is done.

    Yields:
        The data returned by the ``create()`` call of the StorageClass

    """
    param_values = request.param['values']
    logger.info(
        f"SETUP - creating storageclass {param_values['storageclass_name']}")

    template_path = os.path.join(
        request.param['template_dir'], "storageclass.yaml")
    with open(template_path, 'r') as template_file:
        sc_definition = yaml.safe_load(template_file)
    # values from the test parameters override the template
    sc_definition.update(param_values)

    storageclass = OCS(**sc_definition)
    sc_data = storageclass.create()

    logger.debug('Check that storageclass has assigned creationTimestamp')
    assert sc_data['metadata']['creationTimestamp']

    yield sc_data

    logger.info(
        f"TEARDOWN - removing storageclass "
        f"{param_values['storageclass_name']}")
    storageclass.delete()
def teardown(self):
    """
    Remove the couchbase objects, roughly in reverse order of creation,
    then wait for the couchbase pods to disappear before running the
    pillowfight cleanup.
    """
    self.cb_examples.delete()
    self.cb_worker.delete()
    self.cb_deploy.delete()

    for oc_command in (
        "delete rolebinding couchbase-operator-rolebinding",
        "delete serviceaccount couchbase-operator",
    ):
        self.pod_obj.exec_oc_cmd(command=oc_command)

    self.operator_role.delete()
    self.couchbase_obj.delete()
    switch_to_project('default')
    self.pod_obj.delete_project(constants.COUCHBASE_OPERATOR)

    for adm_yaml in self.admission_parts:
        OCS(**templating.load_yaml(adm_yaml)).delete()

    # Keep sampling until no couchbase pod is reported any more. Without
    # this wait the teardown would sometimes fail with leftover objects
    # because one of the couchbase pods was still visible.
    for remaining_pods in TimeoutSampler(
        self.WAIT_FOR_TIME, 3, get_pod_name_by_pattern,
        'couchbase', 'default'
    ):
        if not remaining_pods:
            break

    PillowFight.cleanup(self)
    switch_to_default_rook_cluster_project()
def test_fio_workload_simple(self, ripsaw, interface, io_pattern):
    """
    This is a basic fio perf test

    Deploys the ripsaw CRD, creates a fio benchmark resource for the
    requested interface and IO pattern, waits for the fio client pod to
    complete, inspects its log and finally deletes the benchmark and runs
    the regression analysis.

    Args:
        ripsaw: ripsaw object used to apply the CRD
        interface (str): 'CephBlockPool' selects the RBD storage class,
            anything else selects the CephFS one
        io_pattern (str): 'sequential' switches the jobs to write + read

    """
    # Deployment ripsaw
    log.info("Deploying ripsaw operator")
    ripsaw.apply_crd(
        'resources/crds/'
        'ripsaw_v1alpha1_ripsaw_crd.yaml'
    )
    if interface == 'CephBlockPool':
        sc = 'ocs-storagecluster-ceph-rbd'
    else:
        sc = 'ocs-storagecluster-cephfs'

    # Create fio benchmark
    log.info("Create resource file for fio workload")
    fio_cr = templating.load_yaml(constants.FIO_CR_YAML)
    # Todo: have pvc_size set to 'get_osd_pods_memory_sum * 5'
    #  once pr-2037 is merged
    fio_cr['spec']['clustername'] = (
        config.ENV_DATA['platform'] + get_build() + get_ocs_version()
    )
    fio_cr['spec']['test_user'] = get_ocs_version() + interface + io_pattern
    fio_cr['spec']['workload']['args']['storageclass'] = sc
    if io_pattern == 'sequential':
        fio_cr['spec']['workload']['args']['jobs'] = ['write', 'read']
    log.info(f'fio_cr: {fio_cr}')
    fio_cr_obj = OCS(**fio_cr)
    fio_cr_obj.create()

    # Wait for fio client pod to be created
    for fio_pod in TimeoutSampler(
        300, 20, get_pod_name_by_pattern, 'fio-client', 'my-ripsaw'
    ):
        try:
            if fio_pod[0] is not None:
                fio_client_pod = fio_pod[0]
                break
        except IndexError:
            # an empty sample means the pod has not been created yet
            log.info("Bench pod not ready yet")

    # Wait for fio pod to initialized and complete
    log.info("Waiting for fio_client to complete")
    pod_obj = OCP(kind='pod')
    pod_obj.wait_for_resource(
        condition='Completed',
        resource_name=fio_client_pod,
        timeout=18000,
        sleep=300,
    )

    output = run_cmd(f'oc logs {fio_client_pod}')

    # A substring test never raises IOError, so the former try/except
    # could not report a failed run; check the log text explicitly.
    # NOTE(review): a failure is only logged, as before; consider failing
    # the test here if that is the intended contract.
    if 'Fio failed to execute' in output:
        log.error("FIO failed to complete")
    else:
        log.info("FIO has completed successfully")

    # Clean up fio benchmark
    log.info("Deleting FIO benchmark")
    fio_cr_obj.delete()
    analyze_regression(
        io_pattern, sc, es_username=fio_cr['spec']['test_user']
    )
def test_sql_workload_simple(self, ripsaw):
    """
    Basic pgsql workload: deploy postgres through ripsaw, run the pgbench
    benchmark and verify that every parsed result carries a latency_avg
    value.

    Raises:
        UnexpectedBehaviour: when a parsed pgbench result has no latency_avg

    """
    # Deployment postgres
    log.info("Deploying postgres database")
    ripsaw.apply_crd('resources/crds/ripsaw_v1alpha1_ripsaw_crd.yaml')
    ripsaw.setup_postgresql()

    # Create pgbench benchmark
    log.info("Create resource file for pgbench workload")
    pg_obj = OCS(**templating.load_yaml(constants.PGSQL_BENCHMARK_YAML))
    pg_obj.create()

    # Wait for pgbench pod to be created
    pod_sampler = TimeoutSampler(
        300, 3, get_pod_name_by_pattern, 'pgbench-1-dbs-client', 'my-ripsaw'
    )
    for found_pods in pod_sampler:
        try:
            if found_pods[0] is not None:
                pgbench_client_pod = found_pods[0]
                break
        except IndexError:
            log.info("Bench pod not ready yet")

    # Wait for pg_bench pod to initialized and complete
    log.info("Waiting for pgbench_client to complete")
    OCP(kind='pod').wait_for_resource(
        condition='Completed',
        resource_name=pgbench_client_pod,
        timeout=800,
        sleep=10,
    )

    # Running pgbench and parsing logs
    pod_log = run_cmd(f'oc logs {pgbench_client_pod}')
    pg_output = utils.parse_pgsql_logs(pod_log)
    log.info(f"*******PGBench output log*********\n{pg_output}")
    for result in pg_output:
        if not result['latency_avg']:
            raise UnexpectedBehaviour(
                "PGBench failed to run, no data found on latency_avg"
            )
    log.info("PGBench has completed successfully")

    # Clean up pgbench benchmark
    log.info("Deleting PG bench benchmark")
    pg_obj.delete()
def test_sql_workload_simple(self, ripsaw):
    """
    Basic pgsql workload: deploy postgres through ripsaw, run the pgbench
    benchmark (synchronising with ``oc wait``) and verify that every parsed
    result carries a latency_avg value.

    Raises:
        UnexpectedBehaviour: when a parsed pgbench result has no latency_avg

    """
    # Deployment postgres
    log.info("Deploying postgres database")
    ripsaw.apply_crd('resources/crds/ripsaw_v1alpha1_ripsaw_crd.yaml')
    ripsaw.setup_postgresql()
    run_cmd(
        'bin/oc wait --for condition=ready pod '
        '-l app=postgres --timeout=120s'
    )

    # Create pgbench benchmark
    log.info("Create resource file for pgbench workload")
    pg_obj = OCS(
        **templating.load_yaml_to_dict(constants.PGSQL_BENCHMARK_YAML)
    )
    pg_obj.create()

    # Wait for pgbench pod to be created
    log.info(
        f"waiting for pgbench benchmark to create, "
        f"PGbench pod name: {pg_obj.name} "
    )
    wait_time = 30
    log.info(f"Waiting {wait_time} seconds...")
    time.sleep(wait_time)

    # the command output has the form <kind>/<name>; keep the name part
    pod_listing = run_cmd('bin/oc get pods -l app=pgbench-client -o name')
    pgbench_pod = pod_listing.split('/')[1]
    run_cmd(
        f'bin/oc wait --for condition=Initialized '
        f'pods/{pgbench_pod} --timeout=60s'
    )
    run_cmd(
        'bin/oc wait --for condition=Complete jobs '
        '-l app=pgbench-client --timeout=300s'
    )

    # Running pgbench and parsing logs
    pod_log = run_cmd(f'bin/oc logs {pgbench_pod}')
    pg_output = utils.parse_pgsql_logs(pod_log)
    log.info(f"*******PGBench output log*********\n{pg_output}")
    for result in pg_output:
        if not result['latency_avg']:
            raise UnexpectedBehaviour(
                "PGBench failed to run, no data found on latency_avg"
            )
    log.info("PGBench has completed successfully")

    # Clean up pgbench benchmark
    log.info("Deleting PG bench benchmark:")
    pg_obj.delete()
def test_verify_all_fields_in_sc_yaml_with_oc_describe(self, interface):
    """
    Create a storage class from the CSI template of the given interface and
    compare the keys of the created resource with the keys of the yaml.

    The created object is kept in the module-level ``SC_OBJ`` while the test
    runs and the global is removed again at the end.

    Args:
        interface (str): used to pick ``CSI_<interface>_STORAGECLASS_YAML``

    """
    global SC_OBJ

    log.info(f"Creating a {interface} storage class")
    template_path = getattr(constants, f"CSI_{interface}_STORAGECLASS_YAML")
    self.sc_data = templating.load_yaml(template_path)
    unique_name = helpers.create_unique_resource_name(
        'test', f'csi-{interface.lower()}'
    )
    self.sc_data['metadata']['name'] = unique_name

    SC_OBJ = OCS(**self.sc_data)
    assert SC_OBJ.create()
    log.info(
        f"{interface}Storage class: {SC_OBJ.name} created successfully"
    )
    log.info(self.sc_data)

    # Get oc describe sc output
    describe_out = SC_OBJ.get("sc")
    log.info(describe_out)

    # Confirm that sc yaml details matches oc describe sc output:
    # the only key missing from the yaml must be volumeBindingMode
    keys_only_in_output = set(describe_out) - set(self.sc_data)
    value = {key: describe_out[key] for key in keys_only_in_output}
    assert len(value) == 1 and value['volumeBindingMode'] == 'Immediate', (
        "OC describe sc output didn't match storage class yaml"
    )
    log.info("OC describe sc output matches storage class yaml")

    # Delete Storage Class
    log.info(f"Deleting Storageclass: {SC_OBJ.name}")
    assert SC_OBJ.delete()
    log.info(f"Storage Class: {SC_OBJ.name} deleted successfully")
    del SC_OBJ
def cleanup(self):
    """
    Delete the object described by every yaml file in the pillowfight
    template directory, then remove the temporary logs directory.
    """
    template_dir = constants.TEMPLATE_PILLOWFIGHT_DIR
    for entry in listdir(template_dir):
        pf_fullpath = join(template_dir, entry)
        # only regular files with a .yaml suffix describe objects
        if not (pf_fullpath.endswith('.yaml') and isfile(pf_fullpath)):
            continue
        pillowfight_obj = OCS(**templating.load_yaml(pf_fullpath))
        try:
            pillowfight_obj.delete()
        except CommandFailed:
            log.info(f"{pf_fullpath} object is already deleted")
    rmtree(self.logs)
def cleanup(self):
    """
    Delete the object described by every yaml file in the pillowfight
    template directory, remove the temporary logs directory and delete the
    couchbase operator namespace when it is still listed.
    """
    template_dir = constants.TEMPLATE_PILLOWFIGHT_DIR
    for entry in listdir(template_dir):
        pf_fullpath = join(template_dir, entry)
        # only regular files with a .yaml suffix describe objects
        if not (pf_fullpath.endswith('.yaml') and isfile(pf_fullpath)):
            continue
        pillowfight_obj = OCS(**templating.load_yaml(pf_fullpath))
        try:
            pillowfight_obj.delete()
        except CommandFailed:
            log.info(f"{pf_fullpath} object is already deleted")
    rmtree(self.logs)

    namespaces = self.pod_obj.exec_oc_cmd(command="get namespace")
    if self.COUCHBASE_OPERATOR not in namespaces:
        return
    self.pod_obj.exec_oc_cmd(
        command=f"delete namespace {self.COUCHBASE_OPERATOR}"
    )
class SmallFiles(BenchmarkOperator):
    """
    Small_Files workload benchmark
    """

    def __init__(self, es, **kwargs):
        """
        Load the benchmark template, configure elasticsearch and deploy.

        Args:
            es (obj): elastic search instance object
            kwargs (dict): passed through to the parent initializer

        """
        self.es = es
        self.dev_mode = config.RUN["cli_params"].get("dev_mode")
        super().__init__(**kwargs)

        # Loading the main template yaml file for the benchmark
        log.info("Loading the CRD Template file")
        self.crd_data = templating.load_yaml(
            constants.SMALLFILE_BENCHMARK_YAML
        )
        es_is_configured = self._setup_elasticsearch()
        assert es_is_configured, (
            "Can not execute the workload without ES server"
        )
        self.deploy()

    def _setup_elasticsearch(self):
        """
        Fill the elasticsearch section of the CRD object.

        The production or development server from the configuration is used
        first (and remembered in ``self.backup_es``); an ES instance passed
        to the initializer overrides it.

        Return:
            bool : True if there is ES to connect, False otherwise

        """
        log.info("Setting up the elasticsearch configuration")
        spec = self.crd_data["spec"]
        spec["elasticsearch"] = {}

        if self.dev_mode:
            if config.PERF.get("dev_lab_es"):
                log.info("Setting ES to development one !")
                spec["elasticsearch"] = {
                    "server": config.PERF.get("dev_es_server"),
                    "port": config.PERF.get("dev_es_port"),
                }
        elif config.PERF.get("production_es"):
            log.info("Setting ES to production !")
            spec["elasticsearch"] = {
                "server": config.PERF.get("production_es_server"),
                "port": config.PERF.get("production_es_port"),
            }

        configured_es = spec["elasticsearch"]
        if configured_es:
            configured_es["url"] = (
                f"http://{configured_es['server']}:{configured_es['port']}"
            )
            configured_es["parallel"] = True

        # Saving the Original elastic-search IP and PORT - if defined in yaml
        self.backup_es = spec["elasticsearch"]

        # Use the internal define elastic-search server in the test - if exist
        if self.es:
            spec["elasticsearch"] = {
                "url": f"http://{self.es.get_ip()}:{self.es.get_port()}",
                "server": self.es.get_ip(),
                "port": self.es.get_port(),
                "parallel": True,
            }

        if not spec["elasticsearch"]:
            log.error(
                "No ElasticSearch server is available. "
                "workload can not be execute"
            )
            return False
        return True

    def setup_storageclass(self, interface):
        """
        Select the storageclass in the CRD object from the interface.

        Args:
            interface (str): the storage interface

        """
        storageclass = (
            constants.DEFAULT_STORAGECLASS_RBD
            if interface == constants.CEPHBLOCKPOOL
            else constants.DEFAULT_STORAGECLASS_CEPHFS
        )
        log.info(f"Using {storageclass} Storageclass")
        workload_args = self.crd_data["spec"]["workload"]["args"]
        workload_args["storageclass"] = storageclass

    def setup_test_params(self, file_size, files, threads, samples):
        """
        Store the test parameters in the CRD object.

        Args:
            file_size (int): the file size in KB
            files (int): number of file to use in the test
            threads (int): number of threads to use in the test
            samples (int): number of sample to run the test

        """
        workload_args = self.crd_data["spec"]["workload"]["args"]
        workload_args["file_size"] = file_size
        workload_args["files"] = files
        workload_args["threads"] = threads
        workload_args["samples"] = samples

    def setup_vol_size(self, file_size, files, threads, total_capacity):
        """
        Calculate the volume size for the test and store it in the CRD object.

        The size is three times the total size of the files, converted with
        ``constants.GB2KB``, and never less than 100.

        Args:
            file_size (int): the file size in KB
            files (int): number of file to use in the test
            threads (int): number of threads to use in the test
            total_capacity (int): The total usable storage capacity in GiB

        """
        needed_kb = int(files * threads * file_size * 3)
        vol_size = max(100, int(needed_kb / constants.GB2KB))

        errmsg = (
            "There is not enough storage to run the test. "
            f"Storage capacity : {total_capacity:,.2f} GiB, "
            f"Needed capacity is more then {vol_size:,.2f} GiB"
        )
        assert vol_size < total_capacity, errmsg

        workload_args = self.crd_data["spec"]["workload"]["args"]
        workload_args["storagesize"] = f"{vol_size}Gi"

    def setup_operations(self, ops):
        """
        Store the test operations in the CRD object.

        Values that are neither a list nor a string leave the CRD untouched.

        Args:
            ops : can be list of operations or a string of one operation

        """
        if isinstance(ops, list):
            operations = ops
        elif isinstance(ops, str):
            operations = [ops]
        else:
            return
        self.crd_data["spec"]["workload"]["args"]["operation"] = operations

    def run(self):
        """
        Run the benchmark and wait until it completed
        """
        # Create the benchmark object
        self.sf_obj = OCS(**self.crd_data)
        self.sf_obj.create()

        # Wait for benchmark pods to get created - takes a while
        pod_sampler = TimeoutSampler(
            240,
            10,
            get_pod_name_by_pattern,
            "smallfile-client",
            benchmark_operator.BMO_NAME,
        )
        for found_pods in pod_sampler:
            try:
                if found_pods[0] is not None:
                    small_file_client_pod = found_pods[0]
                    break
            except IndexError:
                log.info("Bench pod not ready yet")

        pod_ocp = OCP(kind="pod", namespace=benchmark_operator.BMO_NAME)
        log.info("Waiting for SmallFile benchmark to Run")
        is_running = pod_ocp.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            resource_name=small_file_client_pod,
            sleep=30,
            timeout=600,
        )
        assert is_running

        log.info("The SmallFiles benchmark is running, wait for completion")
        pod_ocp.wait_for_resource(
            condition=constants.STATUS_COMPLETED,
            resource_name=small_file_client_pod,
            timeout=18000,
            sleep=60,
        )
        log.info("The SmallFiles benchmark is completed")

    def delete(self):
        """
        Delete the benchmark
        """
        log.info("Deleting The Small Files benchmark")
        self.sf_obj.delete()
class AMQ(object): """ Workload operation using AMQ """ def __init__(self, **kwargs): """ Initializer function Args: kwargs (dict): Following kwargs are valid namespace: namespace for the operator repo: AMQ repo where all necessary yaml file are there - a github link branch: branch to use from the repo """ self.args = kwargs self.repo = self.args.get("repo", constants.KAFKA_OPERATOR) self.branch = self.args.get("branch", "master") self.ocp = OCP() self.ns_obj = OCP(kind="namespace") self.pod_obj = OCP(kind="pod") self.kafka_obj = OCP(kind="Kafka") self.kafka_connect_obj = OCP(kind="KafkaConnect") self.kafka_bridge_obj = OCP(kind="KafkaBridge") self.kafka_topic_obj = OCP(kind="KafkaTopic") self.kafka_user_obj = OCP(kind="KafkaUser") self.amq_is_setup = False self.messaging = False self.benchmark = False self.consumer_pod = self.producer_pod = None self.kafka_topic = self.kafka_user = None self.kafka_connect = self.kafka_bridge = self.kafka_persistent = None self.dir = tempfile.mkdtemp(prefix="amq_") self._clone_amq() def _clone_amq(self): """ clone the amq repo """ try: log.info(f"cloning amq in {self.dir}") git_clone_cmd = f"git clone {self.repo} " run(git_clone_cmd, shell=True, cwd=self.dir, check=True) self.amq_dir = "strimzi-kafka-operator/packaging/install/cluster-operator/" self.amq_kafka_pers_yaml = ( "strimzi-kafka-operator/packaging/examples/kafka/kafka-persistent.yaml" ) self.amq_kafka_connect_yaml = ( "strimzi-kafka-operator/packaging/examples/connect/kafka-connect.yaml" ) self.amq_kafka_bridge_yaml = ( "strimzi-kafka-operator/packaging/examples/bridge/kafka-bridge.yaml" ) self.kafka_topic_yaml = ( "strimzi-kafka-operator/packaging/examples/topic/kafka-topic.yaml" ) self.kafka_user_yaml = ( "strimzi-kafka-operator/packaging/examples/user/kafka-user.yaml" ) self.hello_world_producer_yaml = constants.HELLO_WORLD_PRODUCER_YAML self.hello_world_consumer_yaml = constants.HELLO_WORLD_CONSUMER_YAML except (CommandFailed, CalledProcessError) as cf: 
log.error("Error during cloning of amq repository") raise cf def create_namespace(self, namespace): """ create namespace for amq Args: namespace (str): Namespace for amq pods """ self.ocp.new_project(namespace) def setup_amq_cluster_operator(self, namespace=constants.AMQ_NAMESPACE): """ Function to setup amq-cluster_operator, the file is pulling from github it will make sure cluster-operator pod is running Args: namespace (str): Namespace for AMQ pods """ # Namespace for amq try: self.create_namespace(namespace) except CommandFailed as ef: if f'project.project.openshift.io "{namespace}" already exists' not in str( ef ): raise ef # Create strimzi-cluster-operator pod run( f"for i in `(ls strimzi-kafka-operator/packaging/install/cluster-operator/)`;" f"do sed 's/{namespace}/myproject/g' " f"strimzi-kafka-operator/packaging/install/cluster-operator/$i;done", shell=True, check=True, cwd=self.dir, ) self.strimzi_kafka_operator = os.path.join(self.dir, self.amq_dir) pf_files = os.listdir(self.strimzi_kafka_operator) crds = [] for crd in pf_files: crds.append(crd) self.crd_objects = [] for adm_yaml in crds: try: adm_data = templating.load_yaml(self.strimzi_kafka_operator + adm_yaml) adm_obj = OCS(**adm_data) adm_obj.create() self.crd_objects.append(adm_obj) except (CommandFailed, CalledProcessError) as cfe: if "Error is Error from server (AlreadyExists):" in str(cfe): log.warn( "Some amq leftovers are present, please cleanup the cluster" ) pytest.skip( "AMQ leftovers are present needs to cleanup the cluster" ) time.sleep(30) # Check strimzi-cluster-operator pod created if self.is_amq_pod_running(pod_pattern="cluster-operator", expected_pods=1): log.info("strimzi-cluster-operator pod is in running state") else: raise ResourceWrongStatusException( "strimzi-cluster-operator pod is not getting to running state" ) def is_amq_pod_running( self, pod_pattern, expected_pods, namespace=constants.AMQ_NAMESPACE ): """ The function checks if provided pod_pattern finds a pod and if the 
status is running or not Args: pod_pattern (str): the pattern for pod expected_pods (int): Number of pods namespace (str): Namespace for amq pods Returns: bool: status of pod: True if found pod is running """ _rc = True for pod in TimeoutSampler( 300, 10, get_pod_name_by_pattern, pod_pattern, namespace ): try: if pod is not None and len(pod) == expected_pods: amq_pod = pod break except IndexError as ie: log.error(" pod not ready yet") raise ie # checking pod status for pod in amq_pod: if self.pod_obj.wait_for_resource( condition="Running", resource_name=pod, timeout=1600, sleep=30, ): log.info(f"{pod} pod is up and running") else: _rc = False log.error(f"{pod} pod is not running") return _rc def setup_amq_kafka_persistent(self, sc_name, size=100, replicas=3): """ Function to setup amq-kafka-persistent, the file is pulling from github it will make kind: Kafka and will make sure the status is running Args: sc_name (str): Name of sc size (int): Size of the storage in Gi replicas (int): Number of kafka and zookeeper pods to be created return : kafka_persistent """ if storagecluster_independent_check(): sc_name = constants.DEFAULT_EXTERNAL_MODE_STORAGECLASS_RBD try: kafka_persistent = templating.load_yaml( os.path.join(self.dir, self.amq_kafka_pers_yaml) ) kafka_persistent["spec"]["kafka"]["replicas"] = replicas kafka_persistent["spec"]["kafka"]["storage"]["volumes"][0][ "class" ] = sc_name kafka_persistent["spec"]["kafka"]["storage"]["volumes"][0][ "size" ] = f"{size}Gi" kafka_persistent["spec"]["zookeeper"]["replicas"] = replicas kafka_persistent["spec"]["zookeeper"]["storage"]["class"] = sc_name kafka_persistent["spec"]["zookeeper"]["storage"]["size"] = f"{size}Gi" self.kafka_persistent = OCS(**kafka_persistent) self.kafka_persistent.create() except (CommandFailed, CalledProcessError) as cf: log.error("Failed during setup of AMQ Kafka-persistent") raise cf time.sleep(40) if self.is_amq_pod_running( pod_pattern="my-cluster", expected_pods=(replicas * 2) + 1 ): return 
self.kafka_persistent else: raise ResourceWrongStatusException( "my-cluster-kafka and my-cluster-zookeeper " "Pod is not getting to running state" ) def setup_amq_kafka_connect(self): """ The function is to setup amq-kafka-connect, the yaml file is pulling from github it will make kind: KafkaConnect and will make sure the status is running Returns: kafka_connect object """ try: kafka_connect = templating.load_yaml( os.path.join(self.dir, self.amq_kafka_connect_yaml) ) self.kafka_connect = OCS(**kafka_connect) self.kafka_connect.create() except (CommandFailed, CalledProcessError) as cf: log.error("Failed during setup of AMQ KafkaConnect") raise cf if self.is_amq_pod_running( pod_pattern="my-connect-cluster-connect", expected_pods=1 ): return self.kafka_connect else: raise ResourceWrongStatusException( "my-connect-cluster-connect pod is not getting to running state" ) def setup_amq_kafka_bridge(self): """ Function to setup amq-kafka, the file file is pulling from github it will make kind: KafkaBridge and will make sure the pod status is running Return: kafka_bridge object """ try: kafka_bridge = templating.load_yaml( os.path.join(self.dir, self.amq_kafka_bridge_yaml) ) self.kafka_bridge = OCS(**kafka_bridge) self.kafka_bridge.create() except (CommandFailed, CalledProcessError) as cf: log.error("Failed during setup of AMQ KafkaConnect") raise cf # Making sure the kafka_bridge is running if self.is_amq_pod_running(pod_pattern="my-bridge-bridge", expected_pods=1): return self.kafka_bridge else: raise ResourceWrongStatusException( "kafka_bridge_pod pod is not getting to running state" ) def create_kafka_topic(self, name="my-topic", partitions=1, replicas=1): """ Creates kafka topic Args: name (str): Name of the kafka topic partitions (int): Number of partitions replicas (int): Number of replicas Return: kafka_topic object """ try: kafka_topic = templating.load_yaml( os.path.join(self.dir, self.kafka_topic_yaml) ) kafka_topic["metadata"]["name"] = name 
kafka_topic["spec"]["partitions"] = partitions kafka_topic["spec"]["replicas"] = replicas self.kafka_topic = OCS(**kafka_topic) self.kafka_topic.create() except (CommandFailed, CalledProcessError) as cf: if f'kafkatopics.kafka.strimzi.io "{name}" already exists' not in str(cf): log.error("Failed during creating of Kafka topic") raise cf # Making sure kafka topic created if self.kafka_topic_obj.get(resource_name=name): return self.kafka_topic else: raise ResourceWrongStatusException("kafka topic is not created") def create_kafka_user(self, name="my-user"): """ Creates kafka user Args: name (str): Name of the kafka user Return: kafka_user object """ try: kafka_user = templating.load_yaml( os.path.join(self.dir, self.kafka_user_yaml) ) kafka_user["metadata"]["name"] = name self.kafka_user = OCS(**kafka_user) self.kafka_user.create() except (CommandFailed, CalledProcessError) as cf: log.error("Failed during creating of Kafka user") raise cf # Making sure kafka user created if self.kafka_user_obj.get(resource_name=name): return self.kafka_user else: raise ResourceWrongStatusException("kafka user is not created") def create_producer_pod(self, num_of_pods=1, value="10000"): """ Creates producer pods Args: num_of_pods (int): Number of producer pods to be created value (str): Number of the messages to be sent Returns: producer pod object """ try: producer_pod = templating.load_yaml(constants.HELLO_WORLD_PRODUCER_YAML) producer_pod["spec"]["replicas"] = num_of_pods producer_pod["spec"]["template"]["spec"]["containers"][0]["env"][4][ "value" ] = value self.producer_pod = OCS(**producer_pod) self.producer_pod.create() except (CommandFailed, CalledProcessError) as cf: log.error("Failed during creation of producer pod") raise cf # Making sure the producer pod is running if self.is_amq_pod_running( pod_pattern="hello-world-producer", expected_pods=num_of_pods ): return self.producer_pod else: raise ResourceWrongStatusException( "producer pod is not getting to running state" ) def 
create_consumer_pod(self, num_of_pods=1, value="10000"): """ Creates producer pods Args: num_of_pods (int): Number of consumer pods to be created value (str): Number of messages to be received Returns: consumer pod object """ try: consumer_pod = templating.load_yaml(constants.HELLO_WORLD_CONSUMER_YAML) consumer_pod["spec"]["replicas"] = num_of_pods consumer_pod["spec"]["template"]["spec"]["containers"][0]["env"][4][ "value" ] = value self.consumer_pod = OCS(**consumer_pod) self.consumer_pod.create() except (CommandFailed, CalledProcessError) as cf: log.error("Failed during creation of consumer pod") raise cf # Making sure the producer pod is running if self.is_amq_pod_running( pod_pattern="hello-world-consumer", expected_pods=num_of_pods ): return self.consumer_pod else: raise ResourceWrongStatusException( "consumer pod is not getting to running state" ) def validate_msg( self, pod, namespace=constants.AMQ_NAMESPACE, value="10000", since_time=1800 ): """ Validate if messages are sent or received Args: pod (str): Name of the pod namespace (str): Namespace of the pod value (str): Number of messages are sent since_time (int): Number of seconds to required to sent the msg Returns: bool : True if all messages are sent/received """ cmd = f"oc logs -n {namespace} {pod} --since={since_time}s" msg = run_cmd(cmd) substring = f"Hello world - {int(value) - 1}" if msg.find(substring) == -1: return False else: return True def validate_messages_are_produced( self, namespace=constants.AMQ_NAMESPACE, value="10000", since_time=1800 ): """ Validates if all messages are sent in producer pod Args: namespace (str): Namespace of the pod value (str): Number of messages are sent since_time (int): Number of seconds to required to sent the msg Raises exception on failures """ # ToDo: Support multiple topics and users producer_pod_objs = [ get_pod_obj(pod) for pod in get_pod_name_by_pattern("hello-world-produce", namespace) ] for pod in producer_pod_objs: for msg in TimeoutSampler( 900, 30, 
self.validate_msg, pod.name, namespace, value, since_time ): if msg: break assert msg, "Few messages are not sent by producer" log.info("Producer sent all messages") def validate_messages_are_consumed( self, namespace=constants.AMQ_NAMESPACE, value="10000", since_time=1800 ): """ Validates if all messages are received in consumer pod Args: namespace (str): Namespace of the pod value (str): Number of messages are recieved since_time (int): Number of seconds to required to receive the msg Raises exception on failures """ # ToDo: Support multiple topics and users consumer_pod_objs = [ get_pod_obj(pod) for pod in get_pod_name_by_pattern("hello-world-consumer", namespace) ] for pod in consumer_pod_objs: for msg in TimeoutSampler( 900, 30, self.validate_msg, pod.name, namespace, value, since_time ): if msg: break assert msg, "Consumer didn't receive all messages" log.info("Consumer received all messages") def run_in_bg( self, namespace=constants.AMQ_NAMESPACE, value="10000", since_time=1800 ): """ Validate messages are produced and consumed in bg Args: namespace (str): Namespace of the pod value (str): Number of messages to be sent and received since_time (int): Number of seconds to required to sent and receive msg """ # Todo: Check for each messages sent and received log.info("Running open messages on pod in bg") threads = [] executor = ThreadPoolExecutor(2) threads.append( executor.submit( self.validate_messages_are_produced, namespace, value, since_time ) ) threads.append( executor.submit( self.validate_messages_are_consumed, namespace, value, since_time ) ) return threads def run_amq_benchmark( self, benchmark_pod_name="benchmark", kafka_namespace=constants.AMQ_NAMESPACE, tiller_namespace=AMQ_BENCHMARK_NAMESPACE, num_of_clients=8, worker=None, timeout=1800, amq_workload_yaml=None, run_in_bg=False, ): """ Run benchmark pod and get the results Args: benchmark_pod_name (str): Name of the benchmark pod kafka_namespace (str): Namespace where kafka cluster created 
tiller_namespace (str): Namespace where tiller pod needs to be created num_of_clients (int): Number of clients to be created worker (str) : Loads to create on workloads separated with commas e.g http://benchmark-worker-0.benchmark-worker:8080, http://benchmark-worker-1.benchmark-worker:8080 timeout (int): Time to complete the run amq_workload_yaml (dict): Contains amq workloads information keys and values :name (str): Name of the workloads :topics (int): Number of topics created :partitions_per_topic (int): Number of partitions per topic :message_size (int): Message size :payload_file (str): Load to run on workload :subscriptions_per_topic (int): Number of subscriptions per topic :consumer_per_subscription (int): Number of consumers per subscription :producers_per_topic (int): Number of producers per topic :producer_rate (int): Producer rate :consumer_backlog_sizegb (int): Size of block in gb :test_duration_minutes (int): Time to run the workloads run_in_bg (bool): On true the workload will run in background Return: result (str/Thread obj): Returns benchmark run information if run_in_bg is False. 
Otherwise a thread of the amq workload execution """ # Namespace for to helm/tiller try: self.create_namespace(tiller_namespace) except CommandFailed as ef: if ( f'project.project.openshift.io "{tiller_namespace}" already exists' not in str(ef) ): raise ef # Create rbac file try: sa_tiller = list( templating.load_yaml(constants.AMQ_RBAC_YAML, multi_document=True) ) sa_tiller[0]["metadata"]["namespace"] = tiller_namespace sa_tiller[1]["subjects"][0]["namespace"] = tiller_namespace self.sa_tiller = OCS(**sa_tiller[0]) self.crb_tiller = OCS(**sa_tiller[1]) self.sa_tiller.create() self.crb_tiller.create() except (CommandFailed, CalledProcessError) as cf: log.error("Failed during creation of service account tiller") raise cf # Install helm cli (version v2.16.0 as we need tiller component) # And create tiller pods wget_cmd = f"wget -c --read-timeout=5 --tries=0 {URL}" untar_cmd = "tar -zxvf helm-v2.16.1-linux-amd64.tar.gz" tiller_cmd = ( f"linux-amd64/helm init --tiller-namespace {tiller_namespace}" f" --service-account {tiller_namespace}" ) exec_cmd(cmd=wget_cmd, cwd=self.dir) exec_cmd(cmd=untar_cmd, cwd=self.dir) exec_cmd(cmd=tiller_cmd, cwd=self.dir) # Validate tiller pod is running log.info("Waiting for 30s for tiller pod to come up") time.sleep(30) if self.is_amq_pod_running( pod_pattern="tiller", expected_pods=1, namespace=tiller_namespace ): log.info("Tiller pod is running") else: raise ResourceWrongStatusException("Tiller pod is not in running state") # Create benchmark pods log.info("Create benchmark pods") values = templating.load_yaml(constants.AMQ_BENCHMARK_VALUE_YAML) values["numWorkers"] = num_of_clients benchmark_cmd = ( f"linux-amd64/helm install {constants.AMQ_BENCHMARK_POD_YAML}" f" --name {benchmark_pod_name} --tiller-namespace {tiller_namespace}" ) exec_cmd(cmd=benchmark_cmd, cwd=self.dir) # Making sure the benchmark pod and clients are running if self.is_amq_pod_running( pod_pattern="benchmark", expected_pods=(1 + num_of_clients), 
namespace=tiller_namespace, ): log.info("All benchmark pod is up and running") else: raise ResourceWrongStatusException( "Benchmark pod is not getting to running state" ) # Update commonConfig with kafka-bootstrap server details driver_kafka = templating.load_yaml(constants.AMQ_DRIVER_KAFKA_YAML) driver_kafka[ "commonConfig" ] = f"bootstrap.servers=my-cluster-kafka-bootstrap.{kafka_namespace}.svc.cluster.local:9092" json_file = f"{self.dir}/driver_kafka" templating.dump_data_to_json(driver_kafka, json_file) cmd = f"cp {json_file} {benchmark_pod_name}-driver:/" self.pod_obj.exec_oc_cmd(cmd) # Update the workload yaml if not amq_workload_yaml: amq_workload_yaml = templating.load_yaml(constants.AMQ_WORKLOAD_YAML) yaml_file = f"{self.dir}/amq_workload.yaml" templating.dump_data_to_temp_yaml(amq_workload_yaml, yaml_file) cmd = f"cp {yaml_file} {benchmark_pod_name}-driver:/" self.pod_obj.exec_oc_cmd(cmd) self.benchmark = True # Run the benchmark if worker: cmd = f"bin/benchmark --drivers /driver_kafka --workers {worker} /amq_workload.yaml" else: cmd = "bin/benchmark --drivers /driver_kafka /amq_workload.yaml" log.info(f"Run benchmark and running command {cmd} inside the benchmark pod ") if run_in_bg: executor = ThreadPoolExecutor(1) result = executor.submit( self.run_amq_workload, cmd, benchmark_pod_name, tiller_namespace, timeout, ) return result pod_obj = get_pod_obj( name=f"{benchmark_pod_name}-driver", namespace=tiller_namespace ) result = pod_obj.exec_cmd_on_pod( command=cmd, out_yaml_format=False, timeout=timeout ) return result def run_amq_workload(self, command, benchmark_pod_name, tiller_namespace, timeout): """ Runs amq workload in bg Args: command (str): Command to run on pod benchmark_pod_name (str): Pod name tiller_namespace (str): Namespace of pod timeout (int): Time to complete the run Returns: result (str): Returns benchmark run information """ pod_obj = get_pod_obj( name=f"{benchmark_pod_name}-driver", namespace=tiller_namespace ) return 
pod_obj.exec_cmd_on_pod( command=command, out_yaml_format=False, timeout=timeout ) def validate_amq_benchmark( self, result, amq_workload_yaml, benchmark_pod_name="benchmark" ): """ Validates amq benchmark run Args: result (str): Benchmark run information amq_workload_yaml (dict): AMQ workload information benchmark_pod_name (str): Name of the benchmark pod Returns: res_dict (dict): Returns the dict output on success, Otherwise none """ res_dict = {} res_dict["topic"] = amq_workload_yaml["topics"] res_dict["partitionsPerTopic"] = amq_workload_yaml["partitionsPerTopic"] res_dict["messageSize"] = amq_workload_yaml["messageSize"] res_dict["payloadFile"] = amq_workload_yaml["payloadFile"] res_dict["subscriptionsPerTopic"] = amq_workload_yaml["subscriptionsPerTopic"] res_dict["producersPerTopic"] = amq_workload_yaml["producersPerTopic"] res_dict["consumerPerSubscription"] = amq_workload_yaml[ "consumerPerSubscription" ] res_dict["producerRate"] = amq_workload_yaml["producerRate"] # Validate amq benchmark is completed for part in result.split(): if ".json" in part: workload_json_file = part if workload_json_file: cmd = f"rsync {benchmark_pod_name}-driver:{workload_json_file} {self.dir} -n {AMQ_BENCHMARK_NAMESPACE}" self.pod_obj.exec_oc_cmd(command=cmd, out_yaml_format=False) # Parse the json file with open(f"{self.dir}/{workload_json_file}") as json_file: data = json.load(json_file) res_dict["AvgpublishRate"] = sum(data.get("publishRate")) / len( data.get("publishRate") ) res_dict["AvgConsumerRate"] = sum(data.get("consumeRate")) / len( data.get("consumeRate") ) res_dict["AvgMsgBacklog"] = sum(data.get("backlog")) / len( data.get("backlog") ) res_dict["publishLatencyAvg"] = sum(data.get("publishLatencyAvg")) / len( data.get("publishLatencyAvg") ) res_dict["aggregatedPublishLatencyAvg"] = data.get( "aggregatedPublishLatencyAvg" ) res_dict["aggregatedPublishLatency50pct"] = data.get( "aggregatedPublishLatency50pct" ) res_dict["aggregatedPublishLatency75pct"] = data.get( 
"aggregatedPublishLatency75pct" ) res_dict["aggregatedPublishLatency95pct"] = data.get( "aggregatedPublishLatency95pct" ) res_dict["aggregatedPublishLatency99pct"] = data.get( "aggregatedPublishLatency99pct" ) res_dict["aggregatedPublishLatency999pct"] = data.get( "aggregatedPublishLatency999pct" ) res_dict["aggregatedPublishLatency9999pct"] = data.get( "aggregatedPublishLatency9999pct" ) res_dict["aggregatedPublishLatencyMax"] = data.get( "aggregatedPublishLatencyMax" ) res_dict["aggregatedEndToEndLatencyAvg"] = data.get( "aggregatedEndToEndLatencyAvg" ) res_dict["aggregatedEndToEndLatency50pct"] = data.get( "aggregatedEndToEndLatency50pct" ) res_dict["aggregatedEndToEndLatency75pct"] = data.get( "aggregatedEndToEndLatency75pct" ) res_dict["aggregatedEndToEndLatency95pct"] = data.get( "aggregatedEndToEndLatency95pct" ) res_dict["aggregatedEndToEndLatency99pct"] = data.get( "aggregatedEndToEndLatency99pct" ) res_dict["aggregatedEndToEndLatency999pct"] = data.get( "aggregatedEndToEndLatency999pct" ) res_dict["aggregatedEndToEndLatency9999pct"] = data.get( "aggregatedEndToEndLatency9999pct" ) res_dict["aggregatedEndToEndLatencyMax"] = data.get( "aggregatedEndToEndLatencyMax" ) else: log.error("Benchmark didn't run completely") return None amq_benchmark_pod_table = PrettyTable(["key", "value"]) for key, val in res_dict.items(): amq_benchmark_pod_table.add_row([key, val]) log.info(f"\n{amq_benchmark_pod_table}\n") return res_dict def export_amq_output_to_gsheet(self, amq_output, sheet_name, sheet_index): """ Collect amq data to google spreadsheet Args: amq_output (dict): amq output in dict sheet_name (str): Name of the sheet sheet_index (int): Index of sheet """ # Collect data and export to Google doc spreadsheet g_sheet = GoogleSpreadSheetAPI(sheet_name=sheet_name, sheet_index=sheet_index) log.info("Exporting amq data to google spreadsheet") headers_to_key = [] values = [] for key, val in amq_output.items(): headers_to_key.append(key) values.append(val) # Update 
amq_result to gsheet g_sheet.insert_row(values, 2) g_sheet.insert_row(headers_to_key, 2) # Capturing versions(OCP, OCS and Ceph) and test run name g_sheet.insert_row( [ f"ocp_version:{utils.get_cluster_version()}", f"ocs_build_number:{utils.get_ocs_build_number()}", f"ceph_version:{utils.get_ceph_version()}", f"test_run_name:{utils.get_testrun_name()}", ], 2, ) def create_messaging_on_amq( self, topic_name="my-topic", user_name="my-user", partitions=1, replicas=1, num_of_producer_pods=1, num_of_consumer_pods=1, value="10000", ): """ Creates workload using Open Messaging tool on amq cluster Args: topic_name (str): Name of the topic to be created user_name (str): Name of the user to be created partitions (int): Number of partitions of topic replicas (int): Number of replicas of topic num_of_producer_pods (int): Number of producer pods to be created num_of_consumer_pods (int): Number of consumer pods to be created value (str): Number of messages to be sent and received """ self.create_kafka_topic(topic_name, partitions, replicas) self.create_kafka_user(user_name) self.create_producer_pod(num_of_producer_pods, value) self.create_consumer_pod(num_of_consumer_pods, value) self.messaging = True def setup_amq_cluster( self, sc_name, namespace=constants.AMQ_NAMESPACE, size=100, replicas=3 ): """ Creates amq cluster with persistent storage. 
Args: sc_name (str): Name of sc namespace (str): Namespace for amq cluster size (int): Size of the storage replicas (int): Number of kafka and zookeeper pods to be created """ if storagecluster_independent_check(): sc_name = constants.DEFAULT_EXTERNAL_MODE_STORAGECLASS_RBD self.setup_amq_cluster_operator(namespace) self.setup_amq_kafka_persistent(sc_name, size, replicas) self.setup_amq_kafka_connect() self.setup_amq_kafka_bridge() self.amq_is_setup = True return self def create_kafkadrop(self, wait=True): """ Create kafkadrop pod, service and routes Args: wait (bool): If true waits till kafkadrop pod running Return: tuple: Contains objects of kafkadrop pod, service and route """ # Create kafkadrop pod try: kafkadrop = list( templating.load_yaml(constants.KAFKADROP_YAML, multi_document=True) ) self.kafkadrop_pod = OCS(**kafkadrop[0]) self.kafkadrop_svc = OCS(**kafkadrop[1]) self.kafkadrop_route = OCS(**kafkadrop[2]) self.kafkadrop_pod.create() self.kafkadrop_svc.create() self.kafkadrop_route.create() except (CommandFailed, CalledProcessError) as cf: log.error("Failed during creation of kafkadrop which kafka UI") raise cf # Validate kafkadrop pod running if wait: ocp_obj = OCP(kind=constants.POD, namespace=constants.AMQ_NAMESPACE) ocp_obj.wait_for_resource( condition=constants.STATUS_RUNNING, selector="app=kafdrop", timeout=120, sleep=5, ) return self.kafkadrop_pod, self.kafkadrop_svc, self.kafkadrop_route def cleanup( self, kafka_namespace=constants.AMQ_NAMESPACE, tiller_namespace=AMQ_BENCHMARK_NAMESPACE, ): """ Clean up function, will start to delete from amq cluster operator then amq-connector, persistent, bridge, at the end it will delete the created namespace Args: kafka_namespace (str): Created namespace for amq tiller_namespace (str): Created namespace for benchmark """ if self.consumer_pod: self.consumer_pod.delete() if self.producer_pod: self.producer_pod.delete() if self.kafka_user: self.kafka_user.delete() if self.kafka_topic: self.kafka_topic.delete() if 
self.benchmark: # Delete the helm app try: purge_cmd = f"linux-amd64/helm delete benchmark --purge --tiller-namespace {tiller_namespace}" run(purge_cmd, shell=True, cwd=self.dir, check=True) except (CommandFailed, CalledProcessError) as cf: log.error("Failed to delete help app") raise cf # Delete the pods and namespace created self.sa_tiller.delete() self.crb_tiller.delete() run_cmd(f"oc delete project {tiller_namespace}") self.ns_obj.wait_for_delete(resource_name=tiller_namespace) if self.kafka_connect: self.kafka_connect.delete() if self.kafka_bridge: self.kafka_bridge.delete() if self.kafka_persistent: self.kafka_persistent.delete() log.info("Waiting for 20 seconds to delete persistent") time.sleep(20) ocs_pvc_obj = get_all_pvc_objs(namespace=kafka_namespace) if ocs_pvc_obj: delete_pvcs(ocs_pvc_obj) for pvc in ocs_pvc_obj: logging.info(pvc.name) validate_pv_delete(pvc.backed_pv) if self.crd_objects: for adm_obj in self.crd_objects: adm_obj.delete() time.sleep(20) # Reset namespace to default switch_to_default_rook_cluster_project() run_cmd(f"oc delete project {kafka_namespace}") self.ns_obj.wait_for_delete(resource_name=kafka_namespace, timeout=90)
class Postgresql(RipSaw):
    """
    Postgresql workload operation
    """

    # Column headers of the pgbench summary table written to the log
    _PGBENCH_TABLE_FIELDS = (
        "pod_name",
        "scaling_factor",
        "num_clients",
        "num_threads",
        "trans_client",
        "actually_trans",
        "latency_avg",
        "lat_stddev",
        "tps_incl",
        "tps_excl",
    )
    # Keys of one parsed pgbench run, in the order of the columns that
    # follow "pod_name" above
    _PGBENCH_RUN_KEYS = (
        "scaling_factor",
        "num_clients",
        "num_threads",
        "number_of_transactions_per_client",
        "number_of_transactions_actually_processed",
        "latency_avg",
        "lat_stddev",
        "tps_incl",
        "tps_excl",
    )

    def __init__(self, **kwargs):
        """
        Initializer function

        Keyword arguments are passed through to RipSaw; the ripsaw CRD is
        applied right after the base initializer ran.
        """
        super().__init__(**kwargs)
        self._apply_crd(crd=RIPSAW_CRD)

    def _apply_crd(self, crd):
        """
        Apply the CRD

        Args:
            crd (str): yaml to apply

        """
        RipSaw.apply_crd(self, crd=crd)

    @classmethod
    def _new_pgbench_table(cls):
        """Return an empty PrettyTable with the pgbench summary columns."""
        table = PrettyTable()
        table.field_names = list(cls._PGBENCH_TABLE_FIELDS)
        return table

    @classmethod
    def _add_pgbench_rows(cls, table, pod_name, pg_output):
        """Add one table row per run found in a pod's parsed pgbench output."""
        for run_output in pg_output:
            for run_data in run_output.values():
                table.add_row(
                    [pod_name] + [run_data[key] for key in cls._PGBENCH_RUN_KEYS]
                )

    def setup_postgresql(self, replicas):
        """
        Deploy postgres sql server

        Creates the service, config map and stateful set from their yaml
        templates and waits for the postgres pods to be Running.

        Args:
            replicas (int): Number of postgresql pods to be deployed

        Raises:
            CommandFailed: If PostgreSQL server setup fails

        """
        log.info("Deploying postgres database")
        try:
            pgsql_service = templating.load_yaml(constants.PGSQL_SERVICE_YAML)
            pgsql_cmap = templating.load_yaml(constants.PGSQL_CONFIGMAP_YAML)
            pgsql_sset = templating.load_yaml(constants.PGSQL_STATEFULSET_YAML)
            pgsql_sset["spec"]["replicas"] = replicas
            self.pgsql_service = OCS(**pgsql_service)
            self.pgsql_service.create()
            self.pgsql_cmap = OCS(**pgsql_cmap)
            self.pgsql_cmap.create()
            self.pgsql_sset = OCS(**pgsql_sset)
            self.pgsql_sset.create()
            self.pod_obj.wait_for_resource(
                condition="Running",
                selector="app=postgres",
                resource_count=replicas,
                timeout=3600,
            )
        except (CommandFailed, CalledProcessError):
            log.error("Failed during setup of PostgreSQL server")
            raise
        self.pgsql_is_setup = True
        log.info("Successfully deployed postgres database")

    def create_pgbench_benchmark(
        self,
        replicas,
        pgbench_name=None,
        postgres_name=None,
        clients=None,
        threads=None,
        transactions=None,
        scaling_factor=None,
        timeout=None,
        wait=True,
    ):
        """
        Create pgbench benchmark pods

        Args:
            replicas (int): Number of pgbench pods to be deployed
            pgbench_name (str): Name of pgbench bechmark
                (default "pgbench-benchmark")
            postgres_name (str): Name of postgres pod (default "postgres")
            clients (int): Number of clients
            threads (int): Number of threads
            transactions (int): Number of transactions
            scaling_factor (int): scaling factor
            timeout (int): Time in seconds to wait (default 300)
            wait (bool): On true waits until the expected number of
                pgbench client pods is found

        Returns:
            List: pgbench pod objects list

        """
        pg_obj_list = []
        pgbench_name = pgbench_name if pgbench_name else "pgbench-benchmark"
        postgres_name = postgres_name if postgres_name else "postgres"
        for i in range(replicas):
            log.info("Create resource file for pgbench workload")
            pg_data = templating.load_yaml(constants.PGSQL_BENCHMARK_YAML)
            pg_data["metadata"]["name"] = f"{pgbench_name}{i}"
            workload_args = pg_data["spec"]["workload"]["args"]
            workload_args["databases"][0]["host"] = f"{postgres_name}-{i}.postgres"

            # Only override the template values the caller asked for
            if clients is not None:
                workload_args["clients"][0] = clients
            if threads is not None:
                workload_args["threads"] = threads
            if transactions is not None:
                workload_args["transactions"] = transactions
            if scaling_factor is not None:
                workload_args["scaling_factor"] = scaling_factor
            pg_obj = OCS(**pg_data)
            pg_obj_list.append(pg_obj)
            pg_obj.create()

        if wait:
            # Confirm that expected pgbench pods are spinned
            log.info("Searching the pgbench pods by its name pattern")
            timeout = timeout if timeout else 300
            for pgbench_pods in TimeoutSampler(
                timeout,
                replicas,
                get_pod_name_by_pattern,
                "pgbench-1-dbs-client",
                RIPSAW_NAMESPACE,
            ):
                if len(pgbench_pods) == replicas:
                    log.info(
                        f"Expected number of pgbench pods are found: {replicas}"
                    )
                    break
                # len() cannot raise IndexError, so report the shortfall
                # directly instead of from an exception handler
                log.info(
                    f"Expected number of pgbench pods are {replicas} "
                    f"but only found {len(pgbench_pods)}"
                )
        return pg_obj_list

    def get_postgres_pvc(self):
        """
        Get all postgres pvc

        Returns:
             List: postgres pvc objects list
        """
        return get_all_pvc_objs(namespace=RIPSAW_NAMESPACE)

    def get_postgres_pods(self):
        """
        Get all postgres pods

        Returns:
            List: postgres pod objects list
        """
        return get_all_pods(namespace=RIPSAW_NAMESPACE, selector=["postgres"])

    def get_pgbench_pods(self):
        """
        Get all pgbench pods

        Returns:
            List: pgbench pod objects list

        """
        return [
            get_pod_obj(pod, RIPSAW_NAMESPACE)
            for pod in get_pod_name_by_pattern("pgbench", RIPSAW_NAMESPACE)
        ]

    def delete_pgbench_pods(self, pg_obj_list):
        """
        Delete all pgbench pods on cluster

        Args:
            pg_obj_list (list): pgbench objects to delete (force delete)

        """
        log.info("Delete pgbench Benchmark")
        for pgbench_pod in pg_obj_list:
            pgbench_pod.delete(force=True)

    def is_pgbench_running(self):
        """
        Check if pgbench is running

        Returns:
            bool: True if at least one pgbench pod is running; False
                otherwise (also when there are no pgbench pods)

        """
        for pod in self.get_pgbench_pods():
            # The container "state" is a mapping keyed by the state name
            # (see get_pgbench_status), so test for the key rather than
            # comparing the whole mapping with a string.
            state = pod.get().get("status").get("containerStatuses")[0].get("state")
            if state and "running" in state:
                log.info("One or more pgbench pods are in running state")
                return True
        return False

    def get_pgbench_status(self, pgbench_pod_name):
        """
        Get pgbench status

        Args:
            pgbench_pod_name (str): Name of the pgbench pod

        Returns:
            str: "running" when that is the first key of the container
                state, otherwise the reason of the terminated state

        """
        pod_obj = get_pod_obj(pgbench_pod_name, namespace=RIPSAW_NAMESPACE)
        status = pod_obj.get().get("status").get("containerStatuses")[0].get("state")

        if list(status.keys())[0] == "running":
            return "running"
        return status["terminated"]["reason"]

    def wait_for_postgres_status(self, status=constants.STATUS_RUNNING, timeout=300):
        """
        Wait for postgres pods status to reach running/completed

        Args:
            status (str): status to reach Running or Completed
            timeout (int): Time in seconds to wait (applied per pod)

        """
        log.info(f"Waiting for postgres pods to be reach {status} state")
        postgres_pod_objs = self.get_postgres_pods()
        for postgres_pod_obj in postgres_pod_objs:
            wait_for_resource_state(
                resource=postgres_pod_obj, state=status, timeout=timeout
            )

    def wait_for_pgbench_status(self, status, timeout=None):
        """
        Wait for pgbench benchmark pods status to reach running/completed

        Args:
            status (str): status to reach Running or Completed
            timeout (int): Time in seconds to wait (default 1800, per pod)

        Raises:
            UnexpectedBehaviour: If a pod does not reach the state in time;
                the message carries the pod's log

        """
        # Sometimes with the default values in the benchmark yaml the pgbench
        # pod is not getting completed within the specified time and the tests
        # are failing. I think it is varying with the infrastructure.
        # So, for now we set the timeout to 30 mins and will start monitoring
        # each pg bench pods for each run. Based on the results we will define
        # the timeout again
        timeout = timeout if timeout else 1800

        # Wait for pg_bench pods to initialized and running
        log.info(f"Waiting for pgbench pods to be reach {status} state")
        pgbench_pod_objs = self.get_pgbench_pods()
        for pgbench_pod_obj in pgbench_pod_objs:
            try:
                wait_for_resource_state(
                    resource=pgbench_pod_obj, state=status, timeout=timeout
                )
            except ResourceWrongStatusException:
                # The pods were looked up in RIPSAW_NAMESPACE, so read the
                # log from there instead of the current project
                output = run_cmd(
                    f"oc logs {pgbench_pod_obj.name} -n {RIPSAW_NAMESPACE}"
                )
                error_msg = f"{pgbench_pod_obj.name} did not reach to {status} state after {timeout} sec\n{output}"
                log.error(error_msg)
                raise UnexpectedBehaviour(error_msg)

    def validate_pgbench_run(self, pgbench_pods, print_table=True):
        """
        Validate pgbench run

        Args:
            pgbench pods (list): List of pgbench pods
            print_table (bool): On true, log a summary table of all runs

        Returns:
            pg_output (list): One ``(parsed_output, pod_name)`` tuple per pod

        Raises:
            UnexpectedBehaviour: If a run has no latency_avg value

        """
        all_pgbench_pods_output = []
        for pgbench_pod in pgbench_pods:
            log.info(f"pgbench_client_pod===={pgbench_pod.name}====")
            output = run_cmd(f"oc logs {pgbench_pod.name} -n {RIPSAW_NAMESPACE}")
            pg_output = utils.parse_pgsql_logs(output)
            log.info("*******PGBench output log*********\n" f"{pg_output}")
            for data in pg_output:
                run_id = list(data.keys())
                latency_avg = data[run_id[0]]["latency_avg"]
                if not latency_avg:
                    raise UnexpectedBehaviour(
                        "PGBench failed to run, " "no data found on latency_avg"
                    )
            log.info(f"PGBench on {pgbench_pod.name} completed successfully")
            all_pgbench_pods_output.append((pg_output, pgbench_pod.name))

        if print_table:
            pgbench_pod_table = self._new_pgbench_table()
            for pod_runs, pod_name in all_pgbench_pods_output:
                self._add_pgbench_rows(pgbench_pod_table, pod_name, pod_runs)
            log.info(f"\n{pgbench_pod_table}\n")

        return all_pgbench_pods_output

    def get_pgsql_nodes(self):
        """
        Get nodes that contain a pgsql app pod

        Returns:
            list: Names of the nodes hosting a pgsql app pod, without
                duplicates

        """
        pgsql_pod_objs = self.pod_obj.get(
            selector=constants.PGSQL_APP_LABEL, all_namespaces=True
        )

        log.info("Create a list of nodes that contain a pgsql app pod")
        nodes_set = set()
        for pod in pgsql_pod_objs["items"]:
            log.info(
                f"pod {pod['metadata']['name']} located on "
                f"node {pod['spec']['nodeName']}"
            )
            nodes_set.add(pod["spec"]["nodeName"])
        return list(nodes_set)

    def respin_pgsql_app_pod(self):
        """
        Respin the pgsql app pod

        Deletes one randomly chosen pgsql app pod and waits (up to 300s)
        for that pod object to report the Running state again.

        """
        app_pod_list = get_operator_pods(
            constants.PGSQL_APP_LABEL, constants.RIPSAW_NAMESPACE
        )
        app_pod = random.choice(app_pod_list)
        log.info(f"respin pod {app_pod.name}")
        app_pod.delete(wait=True, force=False)
        wait_for_resource_state(
            resource=app_pod, state=constants.STATUS_RUNNING, timeout=300
        )

    def get_pgbech_pod_status_table(self, pgbench_pods):
        """
        Get pgbench pod data and print results on a table

        Args:
            pgbench pods (list): List of pgbench pods

        """
        pgbench_pod_table = self._new_pgbench_table()
        for pgbench_pod in pgbench_pods:
            output = run_cmd(f"oc logs {pgbench_pod.name}")
            pg_output = utils.parse_pgsql_logs(output)
            self._add_pgbench_rows(pgbench_pod_table, pgbench_pod.name, pg_output)
        log.info(f"\n{pgbench_pod_table}\n")

    def export_pgoutput_to_googlesheet(self, pg_output, sheet_name, sheet_index):
        """
        Collect pgbench output to google spreadsheet

        Args:
            pg_output (list):  pgbench outputs in list, as returned by
                validate_pgbench_run
            sheet_name (str): Name of the sheet
            sheet_index (int): Index of sheet

        """
        # Collect data and export to Google doc spreadsheet
        g_sheet = GoogleSpreadSheetAPI(sheet_name=sheet_name, sheet_index=sheet_index)
        log.info("Exporting pgoutput data to google spreadsheet")
        for pod_entry in pg_output:
            pod_runs, pod_name = pod_entry[0], pod_entry[1]
            for run_output in pod_runs:
                run_id = list(run_output.keys())[0]
                run_data = run_output[run_id]
                # Every row goes to index 2, pushing earlier rows down
                g_sheet.insert_row(
                    [
                        f"Pgbench-pod{pod_name}-run-{run_id}",
                        int(run_data["latency_avg"]),
                        int(run_data["lat_stddev"]),
                        # was read from "lat_stddev" by mistake
                        int(run_data["tps_incl"]),
                        int(run_data["tps_excl"]),
                    ],
                    2,
                )
        g_sheet.insert_row(
            ["", "latency_avg", "lat_stddev", "tps_incl", "tps_excl"], 2
        )

        # Capturing versions(OCP, OCS and Ceph) and test run name
        g_sheet.insert_row(
            [
                f"ocp_version:{utils.get_cluster_version()}",
                f"ocs_build_number:{utils.get_ocs_build_number()}",
                f"ceph_version:{utils.get_ceph_version()}",
                f"test_run_name:{utils.get_testrun_name()}",
            ],
            2,
        )

    def cleanup(self):
        """
        Clean up

        Deletes the postgres resources (only if setup_postgresql completed),
        every pgbench pod, and finally the ripsaw configuration.

        """
        log.info("Deleting postgres pods and configuration")
        # The flag is only ever set by setup_postgresql; treat a missing
        # attribute as "nothing was deployed"
        if getattr(self, "pgsql_is_setup", False):
            self.pgsql_sset.delete()
            self.pgsql_cmap.delete()
            self.pgsql_service.delete()
        log.info("Deleting pgbench pods")
        pods_obj = self.get_pgbench_pods()
        for pod in pods_obj:
            pod.delete()
            pod.ocp.wait_for_delete(pod.name)
        log.info("Deleting ripsaw configuration")
        RipSaw.cleanup(self)

    def attach_pgsql_pod_to_claim_pvc(
        self, pvc_objs, postgres_name, run_benchmark=True, pgbench_name=None
    ):
        """
        Attaches pgsql pod to created claim PVC

        Args:
            pvc_objs (list): List of PVC objs which needs to attached to pod
            postgres_name (str): Name of the postgres pod
            run_benchmark (bool): On true, runs pgbench benchmark on postgres pod
            pgbench_name (str): Name of pgbench benchmark

        Returns:
            pgsql_obj_list (list): List of pod objs created

        """
        pgsql_obj_list = []
        # enumerate() gives every PVC its own position, also when two
        # entries compare equal (list.index() returns the first match)
        for index, pvc_obj in enumerate(pvc_objs):
            try:
                pgsql_sset = templating.load_yaml(constants.PGSQL_STATEFULSET_YAML)
                # The pod mounts the given PVC instead of a templated claim
                del pgsql_sset["spec"]["volumeClaimTemplates"]
                pgsql_sset["metadata"]["name"] = f"{postgres_name}{index}"
                pgsql_sset["spec"]["template"]["spec"]["containers"][0][
                    "volumeMounts"
                ][0]["name"] = pvc_obj.name
                pgsql_sset["spec"]["template"]["spec"]["volumes"] = [
                    {
                        "name": f"{pvc_obj.name}",
                        "persistentVolumeClaim": {"claimName": f"{pvc_obj.name}"},
                    }
                ]
                pgsql_sset = OCS(**pgsql_sset)
                pgsql_sset.create()
                pgsql_obj_list.append(pgsql_sset)

                self.wait_for_postgres_status(
                    status=constants.STATUS_RUNNING, timeout=300
                )

                if run_benchmark:
                    pg_data = templating.load_yaml(constants.PGSQL_BENCHMARK_YAML)
                    pg_data["metadata"]["name"] = (
                        f"{pgbench_name}{index}"
                        if pgbench_name
                        else create_unique_resource_name("benchmark", "pgbench")
                    )
                    pg_data["spec"]["workload"]["args"]["databases"][0][
                        "host"
                    ] = f"{postgres_name}{index}-0.postgres"
                    pg_obj = OCS(**pg_data)
                    pg_obj.create()
                    pgsql_obj_list.append(pg_obj)

                    wait_time = 120
                    log.info(f"Wait {wait_time} seconds before mounting pod")
                    time.sleep(wait_time)

            except (CommandFailed, CalledProcessError):
                log.error("Failed during creation of postgres pod")
                raise

        if run_benchmark:
            log.info("Checking all pgbench benchmark reached Completed state")
            self.wait_for_pgbench_status(
                status=constants.STATUS_COMPLETED, timeout=1800
            )

        return pgsql_obj_list

    def get_postgres_used_file_space(self, pod_obj_list):
        """
        Get the used file space on a mount point

        Args:
            pod_obj_list (POD): List of pod objects

        Returns:
            list: The same pod objects, each with a ``filespace`` attribute
                holding the first field of the ``du -sh`` output

        """
        # Get the used file space on a mount point
        for pod_obj in pod_obj_list:
            filepath = get_file_path(pod_obj, "pgdata")
            filespace = pod_obj.exec_cmd_on_pod(
                command=f"du -sh {filepath}", out_yaml_format=False
            )
            pod_obj.filespace = filespace.split()[0]
        return pod_obj_list
def test_fio_workload_simple(self, ripsaw, es, interface, io_pattern):
    """
    This is a basic fio perf test

    Deploys the ripsaw CRD, sizes a fio workload to 40% of the ceph
    capacity, runs it, and pushes the analyzed results to Elastic-Search
    and codespeed.

    Args:
        ripsaw: benchmark operator fixture (provides apply_crd / get_uuid)
        es: internal elastic-search fixture; when falsy the elastic-search
            settings of the CR yaml are used
        interface (str): "CephBlockPool" selects the ceph block pool
            storage class, anything else the ceph filesystem one
        io_pattern (str): "sequential" switches the jobs to write/read
            with an io depth of 1

    """

    # Deployment ripsaw
    log.info("Deploying ripsaw operator")
    ripsaw.apply_crd("resources/crds/" "ripsaw_v1alpha1_ripsaw_crd.yaml")
    if interface == "CephBlockPool":
        sc = constants.CEPHBLOCKPOOL_SC
    else:
        sc = constants.CEPHFILESYSTEM_SC

    # Create fio benchmark
    log.info("Create resource file for fio workload")
    fio_cr = templating.load_yaml(constants.FIO_CR_YAML)

    # Saving the Original elastic-search IP and PORT - if defined in yaml
    if "elasticsearch" in fio_cr["spec"]:
        backup_es = fio_cr["spec"]["elasticsearch"]
    else:
        log.warning("Elastic Search information does not exists in YAML file")
        fio_cr["spec"]["elasticsearch"] = {}
        # Keep the restore step further down working when the yaml has
        # no elasticsearch section (backup_es used to be unbound here)
        backup_es = {}

    # Use the internal define elastic-search server in the test - if exist
    if es:
        fio_cr["spec"]["elasticsearch"] = {
            "server": es.get_ip(),
            "port": es.get_port(),
        }

    # Same dict object as in fio_cr, so updates below land in the CR
    workload_args = fio_cr["spec"]["workload"]["args"]

    # Setting the data set to 40% of the total storage capacity
    ceph_cluster = CephCluster()
    ceph_capacity = ceph_cluster.get_ceph_capacity()
    total_data_set = int(ceph_capacity * 0.4)
    filesize = int(workload_args["filesize"].replace("GiB", ""))
    # To make sure the number of App pods will not be more then 50, in case
    # of large data set, changing the size of the file each pod will work on
    if total_data_set > 500:
        filesize = int(ceph_capacity * 0.008)
        workload_args["filesize"] = f"{filesize}GiB"
        # make sure that the storage size is larger then the file size
        workload_args["storagesize"] = f"{int(filesize * 1.2)}Gi"
    workload_args["servers"] = int(total_data_set / filesize)
    log.info(f"Total Data set to work on is : {total_data_set} GiB")

    environment = get_environment_info()
    if environment["user"] != "":
        fio_cr["spec"]["test_user"] = environment["user"]
    fio_cr["spec"]["clustername"] = environment["clustername"]

    log.debug(f"Environment information is : {environment}")

    workload_args["storageclass"] = sc
    if io_pattern == "sequential":
        workload_args["jobs"] = ["write", "read"]
        workload_args["iodepth"] = 1
    log.info(f"The FIO CR file is {fio_cr}")
    fio_cr_obj = OCS(**fio_cr)
    fio_cr_obj.create()

    # Wait for fio client pod to be created
    for fio_pod in TimeoutSampler(
        300, 20, get_pod_name_by_pattern, "fio-client", constants.RIPSAW_NAMESPACE
    ):
        try:
            if fio_pod[0] is not None:
                fio_client_pod = fio_pod[0]
                break
        except IndexError:
            log.info("Bench pod not ready yet")

    # Getting the start time of the test
    start_time = time.strftime("%Y-%m-%dT%H:%M:%SGMT", time.gmtime())

    # Getting the UUID from inside the benchmark pod
    uuid = ripsaw.get_uuid(fio_client_pod)
    # Setting back the original elastic-search information
    fio_cr["spec"]["elasticsearch"] = backup_es

    full_results = FIOResultsAnalyse(uuid, fio_cr)

    # Initialize the results doc file.
    for key in environment:
        full_results.add_key(key, environment[key])

    # Setting the global parameters of the test
    full_results.add_key("io_pattern", io_pattern)
    full_results.add_key("dataset", f"{total_data_set}GiB")
    full_results.add_key("file_size", workload_args["filesize"])
    full_results.add_key("servers", workload_args["servers"])
    full_results.add_key("samples", workload_args["samples"])
    full_results.add_key("operations", workload_args["jobs"])
    full_results.add_key("block_sizes", workload_args["bs"])
    full_results.add_key("io_depth", workload_args["iodepth"])
    full_results.add_key("jobs", workload_args["numjobs"])
    full_results.add_key(
        "runtime",
        {
            "read": workload_args["read_runtime"],
            "write": workload_args["write_runtime"],
        },
    )
    full_results.add_key("storageclass", workload_args["storageclass"])
    full_results.add_key("vol_size", workload_args["storagesize"])

    # Wait for fio pod to initialized and complete
    log.info("Waiting for fio_client to complete")
    pod_obj = OCP(kind="pod")
    pod_obj.wait_for_resource(
        condition="Completed",
        resource_name=fio_client_pod,
        timeout=18000,
        sleep=300,
    )

    # Getting the end time of the test
    end_time = time.strftime("%Y-%m-%dT%H:%M:%SGMT", time.gmtime())
    full_results.add_key("test_time", {"start": start_time, "end": end_time})

    output = run_cmd(f"oc logs {fio_client_pod}")
    log.info(f"The Test log is : {output}")

    # A substring test never raises IOError, so report the failure from
    # the check itself; the test flow is otherwise unchanged
    if "Fio failed to execute" in output:
        log.error("FIO failed to complete")
    else:
        log.info("FIO has completed successfully")

    # Clean up fio benchmark
    log.info("Deleting FIO benchmark")
    fio_cr_obj.delete()

    log.debug(f"Full results is : {full_results.results}")

    # if Internal ES is exists, Copy all data from the Internal to main ES
    if es:
        log.info("Copy all data from Internal ES to Main ES")
        es._copy(full_results.es)
    # Adding this sleep between the copy and the analyzing of the results
    # since sometimes the results of the read (just after write) are empty
    time.sleep(30)
    full_results.analyze_results()  # Analyze the results
    # Writing the analyzed test results to the Elastic-Search server
    full_results.es_write()
    full_results.codespeed_push()  # Push results to codespeed
    # Creating full link to the results on the ES server
    log.info(f"The Result can be found at ; {full_results.results_link()}")
def test_pvc_snapshot_performance_multiple_files(
    self, ripsaw, file_size, files, threads, interface
):
    """
    Run the SmallFile workload and then take a snapshot of its PVC.

    The workload fills a volume with files, then a snapshot of the volume
    is taken and its creation time is measured. The whole cycle is repeated
    ``self.tests_numbers`` times to check consistency, and the average
    creation time is reported in the log.

    Args:
        ripsaw : benchmark operator fixture which will run the workload
        file_size (int): the size of the file to be create - in KiB
        files (int): number of files each thread will create
        threads (int): number of threads will be used in the workload
        interface (str): the volume interface that will be used
                         CephBlockPool / CephFileSystem

    Raises:
        TimeoutError : in case of creation files take too long time
                       more then 2 Hours

    """

    # Loading the main template yaml file for the benchmark and update some
    # fields with new values
    sf_data = templating.load_yaml(constants.SMALLFILE_BENCHMARK_YAML)

    if interface == constants.CEPHBLOCKPOOL:
        storageclass = constants.DEFAULT_STORAGECLASS_RBD
    else:
        storageclass = constants.DEFAULT_STORAGECLASS_CEPHFS
    log.info(f"Using {storageclass} Storageclass")

    # Setting up the parameters for this test
    workload_args = sf_data["spec"]["workload"]["args"]
    workload_args["samples"] = 1
    workload_args["operation"] = ["create"]
    workload_args["file_size"] = file_size
    workload_args["files"] = files
    workload_args["threads"] = threads
    workload_args["storageclass"] = storageclass

    # The elasticsearch section is removed from the CR for this test;
    # tolerate a template that does not define it at all.
    sf_data["spec"].pop("elasticsearch", None)

    # Calculating the size of the volume that need to be test: it is three
    # times the size of the files, and at least 100Gi.
    # Since the file_size is in Kb and the vol_size need to be in Gb, more
    # calculation is needed.
    total_files = int(files * threads)
    total_data = int(files * threads * file_size / constants.GB2KB)
    data_set = int(total_data * 3)  # calculate data with replica
    vol_size = max(data_set, 100)
    workload_args["storagesize"] = f"{vol_size}Gi"

    environment = get_environment_info()
    if environment["user"] != "":
        sf_data["spec"]["test_user"] = environment["user"]
    else:
        # since full results object need this parameter, initialize it from
        # CR file
        environment["user"] = sf_data["spec"]["test_user"]

    sf_data["spec"]["clustername"] = environment["clustername"]
    log.debug(f"The smallfile yaml file is {sf_data}")

    # Deploy the ripsaw operator
    log.info("Apply Operator CRD")
    ripsaw.apply_crd("resources/crds/ripsaw_v1alpha1_ripsaw_crd.yaml")

    all_results = []

    for test_num in range(self.tests_numbers):

        # deploy the smallfile workload
        log.info("Running SmallFile bench")
        sf_obj = OCS(**sf_data)
        sf_obj.create()

        # wait for benchmark pods to get created - takes a while
        for pod_names in TimeoutSampler(
            240,
            10,
            get_pod_name_by_pattern,
            "smallfile-client",
            constants.RIPSAW_NAMESPACE,
        ):
            try:
                if pod_names[0] is not None:
                    small_file_client_pod = pod_names[0]
                    break
            except IndexError:
                log.info("Bench pod not ready yet")

        bench_pod = OCP(kind="pod", namespace=constants.RIPSAW_NAMESPACE)
        log.info("Waiting for SmallFile benchmark to Run")
        assert bench_pod.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            resource_name=small_file_client_pod,
            sleep=30,
            timeout=600,
        )

        # Find the PVC used by the client pod. The pod is looked up by name
        # so the result does not depend on the order in which the pods of
        # the namespace are listed.
        pvc_name = None
        for pod_data in bench_pod.get()["items"]:
            if pod_data["metadata"]["name"] != small_file_client_pod:
                continue
            for volume in pod_data["spec"]["volumes"]:
                if "persistentVolumeClaim" in volume:
                    pvc_name = volume["persistentVolumeClaim"]["claimName"]
                    break
            break
        assert pvc_name, (
            f"No PVC was found on the benchmark pod {small_file_client_pod}"
        )
        log.info(f"Benchmark PVC name is : {pvc_name}")

        # Creation of 4M files on CephFS can take a lot of time, so poll the
        # client log every 30 seconds for up to 2 hours.
        remaining = 7200
        while "RUN STATUS DONE" not in bench_pod.get_logs(
            name=small_file_client_pod
        ):
            remaining -= 30
            if remaining <= 0:
                raise TimeoutError(
                    "Timed out waiting for benchmark to complete")
            time.sleep(30)
        log.info(f"Smallfile test ({test_num + 1}) finished.")

        snap_name = pvc_name.replace("claim", "snapshot-")
        log.info(f"Taking snapshot of the PVC {pvc_name}")
        log.info(f"Snapshot name : {snap_name}")
        creation_time = self.measure_create_snapshot_time(
            pvc_name=pvc_name, snap_name=snap_name, interface=interface
        )
        log.info(f"Snapshot creation time is {creation_time} seconds")
        all_results.append(creation_time)

        # Delete the smallfile workload
        log.info("Deleting the smallfile workload")
        if sf_obj.delete(wait=True):
            log.info("The smallfile workload was deleted successfully")

        # Delete VolumeSnapshots
        log.info("Deleting the snapshots")
        if self.snap_obj.delete(wait=True):
            log.info("The snapshot deleted successfully")

        # Let the cluster settle before the next repetition starts
        log.info("Verify (and wait if needed) that ceph health is OK")
        ceph_health_check(tries=45, delay=60)

    log.info(f"Full test report for {interface}:")
    log.info(f"Test ran {self.tests_numbers} times, "
             f"All results are {all_results}")
    log.info(
        f"The average creation time is : {statistics.mean(all_results)}")
    log.info(f"Number of Files on the volume : {total_files:,}, "
             f"Total dataset : {int(data_set / 3)} GiB")
class CouchBase(PillowFight):
    """
    CouchBase workload operation
    """

    def __init__(self, **kwargs):
        """
        Initializer function

        Creates the OCP helper objects, makes sure the Couchbase operator
        namespace exists and resets the flags that record which resources
        were created (they are consulted by teardown).

        Args:
            kwargs (dict): Forwarded to the PillowFight initializer and
                kept in ``self.args``

        """
        super().__init__(**kwargs)
        self.args = kwargs
        self.pod_obj = OCP(kind="pod")
        self.ns_obj = OCP(kind="namespace")
        self.couchbase_pod = OCP(kind="pod")
        self.create_namespace(namespace=constants.COUCHBASE_OPERATOR)
        self.cb_create_cb_secret = False
        self.cb_create_cb_cluster = False
        self.cb_create_bucket = False

    def create_namespace(self, namespace):
        """
        create namespace for couchbase

        Args:
            namespace (str): Namespace for deploying couchbase pods

        Raises:
            CommandFailed: If the project creation failed for any reason
                other than the project already existing

        """
        try:
            self.ns_obj.new_project(namespace)
        except CommandFailed as ef:
            already_exists = (
                f'project.project.openshift.io "{namespace}" already exists'
            )
            # Only an already existing project is tolerated; report it as
            # present only once that is known to be the actual failure.
            if already_exists not in str(ef):
                raise
            log.info("Already present")

    def couchbase_operatorgroup(self):
        """
        Creates an operator group for Couchbase

        """
        operatorgroup_yaml = templating.load_yaml(
            constants.COUCHBASE_OPERATOR_GROUP_YAML)
        self.operatorgroup_yaml = OCS(**operatorgroup_yaml)
        self.operatorgroup_yaml.create()

    def couchbase_subscription(self):
        """
        Creates subscription for Couchbase operator

        The operator group is created first, then the subscription, and
        finally the method waits for the Couchbase CSV to reach the
        Succeeded phase.

        """
        # Create an operator group for Couchbase
        log.info("Creating operator group for couchbase")
        self.couchbase_operatorgroup()
        subscription_yaml = templating.load_yaml(
            constants.COUCHBASE_OPERATOR_SUBSCRIPTION_YAML)
        self.subscription_yaml = OCS(**subscription_yaml)
        self.subscription_yaml.create()

        # Wait for the CSV to reach succeeded state
        cb_csv = self.get_couchbase_csv()
        cb_csv_obj = CSV(resource_name=cb_csv,
                         namespace=constants.COUCHBASE_OPERATOR)
        cb_csv_obj.wait_for_phase("Succeeded", timeout=720)

    def get_couchbase_csv(self):
        """
        Get the current Couchbase CSV from the package manifest

        Returns:
            The value returned by ``PackageManifest.get_current_csv`` for
            the "stable" channel; couchbase_subscription uses it as the CSV
            resource name.

        Raises:
            CSVNotFound: In case no CSV found.

        """
        cb_package_manifest = PackageManifest(
            resource_name="couchbase-enterprise-certified")
        return cb_package_manifest.get_current_csv(
            channel="stable", csv_pattern=constants.COUCHBASE_CSV_PREFIX)

    def create_cb_secrets(self):
        """
        Create secrets for running Couchbase workers

        """
        cb_secrets = templating.load_yaml(constants.COUCHBASE_WORKER_SECRET)
        self.cb_secrets = OCS(**cb_secrets)
        self.cb_secrets.create()
        log.info("Successfully created secrets for Couchbase")
        self.cb_create_cb_secret = True

    def create_cb_cluster(self, replicas=1, sc_name=None):
        """
        Deploy a Couchbase server using Couchbase operator

        Creates the Couchbase cluster resource and waits until the expected
        number of Couchbase pods are in Running state.

        Args:
            replicas (int): Number of Couchbase server pods
            sc_name (str): Storage class name to use for the volume claim
                template. When not given, the template value is used, or the
                external mode RBD storage class on an independent mode
                cluster.

        """
        log.info("Creating Couchbase worker pods...")
        cb_example = templating.load_yaml(constants.COUCHBASE_WORKER_EXAMPLE)

        if storagecluster_independent_check():
            cb_example["spec"]["volumeClaimTemplates"][0]["spec"][
                "storageClassName"
            ] = constants.DEFAULT_EXTERNAL_MODE_STORAGECLASS_RBD
        cb_example["spec"]["servers"][0]["size"] = replicas
        # An explicitly requested storage class always wins
        if sc_name:
            cb_example["spec"]["volumeClaimTemplates"][0]["spec"][
                "storageClassName"
            ] = sc_name
        self.cb_example = OCS(**cb_example)
        self.cb_example.create()
        self.cb_create_cb_cluster = True

        # Wait for the Couchbase workers to be running.
        log.info("Waiting for the Couchbase pods to be Running")
        self.pod_obj.wait_for_resource(
            condition="Running",
            selector="app=couchbase",
            resource_count=replicas,
            timeout=900,
        )
        log.info(
            f"Expected number: {replicas} of couchbase workers reached running state"
        )

    def create_data_buckets(self):
        """
        Create data buckets

        """
        cb_bucket = templating.load_yaml(constants.COUCHBASE_DATA_BUCKET)
        self.cb_bucket = OCS(**cb_bucket)
        self.cb_bucket.create()
        log.info("Successfully created data buckets")
        self.cb_create_bucket = True

    def run_workload(self, replicas, num_items=None, num_threads=None,
                     run_in_bg=False):
        """
        Running workload with pillow fight operator

        Args:
            replicas (int): Number of pods
            num_items (int): Number of items to be loaded to the cluster
            num_threads (int): Number of threads
            run_in_bg (bool) : Optional run IOs in background

        Returns:
            concurrent.futures.Future: The background run, when run_in_bg is
                True. Otherwise the workload runs in the foreground and
                None is returned.

        """
        self.result = None
        log.info("Running IOs using Pillow-fight")
        if run_in_bg:
            executor = ThreadPoolExecutor(1)
            self.result = executor.submit(
                PillowFight.run_pillowfights,
                self,
                replicas=replicas,
                num_items=num_items,
                num_threads=num_threads,
            )
            # The executor is single use: let the submitted job finish and
            # release the worker thread afterwards without blocking here.
            executor.shutdown(wait=False)
            return self.result
        PillowFight.run_pillowfights(self,
                                     replicas=replicas,
                                     num_items=num_items,
                                     num_threads=num_threads)

    def analyze_run(self, skip_analyze=False):
        """
        Analyzing the workload run logs

        Args:
            skip_analyze (bool): Option to skip logs analysis

        """
        if not skip_analyze:
            log.info("Analyzing workload run logs..")
            PillowFight.analyze_all(self)

    def respin_couchbase_app_pod(self):
        """
        Respin a randomly chosen couchbase app pod

        The pod is deleted and the method waits for the pod object to be
        in Running state again. Nothing is returned.

        """
        app_pod_list = get_pod_name_by_pattern("cb-example",
                                               constants.COUCHBASE_OPERATOR)
        app_pod = random.choice(app_pod_list)
        log.info(f"respin pod {app_pod}")
        app_pod_obj = get_pod_obj(app_pod,
                                  namespace=constants.COUCHBASE_OPERATOR)
        app_pod_obj.delete(wait=True, force=False)
        wait_for_resource_state(resource=app_pod_obj,
                                state=constants.STATUS_RUNNING,
                                timeout=300)

    def get_couchbase_nodes(self):
        """
        Get nodes that contain a couchbase app pod

        Returns:
            list: List of nodes

        """
        app_pods_list = get_pod_name_by_pattern("cb-example",
                                                constants.COUCHBASE_OPERATOR)
        app_pod_objs = [
            get_pod_obj(pod, namespace=constants.COUCHBASE_OPERATOR)
            for pod in app_pods_list
        ]

        log.info("Create a list of nodes that contain a couchbase app pod")
        nodes_set = set()
        for pod in app_pod_objs:
            # Fetch the pod data once and use it for both the log and the set
            node_name = pod.get().get("spec").get("nodeName")
            log.info(f"pod {pod.name} located on "
                     f"node {node_name}")
            nodes_set.add(node_name)
        return list(nodes_set)

    def teardown(self):
        """
        Cleaning up the resources created during Couchbase deployment

        Only the resources that were actually created are deleted, then the
        Couchbase operator namespace is removed and the PillowFight cleanup
        is run.

        """
        if self.cb_create_cb_secret:
            self.cb_secrets.delete()
        if self.cb_create_cb_cluster:
            self.cb_example.delete()
        if self.cb_create_bucket:
            self.cb_bucket.delete()
        # The subscription exists only if couchbase_subscription() ran; do
        # not let its absence abort the rest of the cleanup.
        subscription = getattr(self, "subscription_yaml", None)
        if subscription is not None:
            subscription.delete()
        switch_to_project("default")
        self.ns_obj.delete_project(constants.COUCHBASE_OPERATOR)
        self.ns_obj.wait_for_delete(resource_name=constants.COUCHBASE_OPERATOR,
                                    timeout=90)
        PillowFight.cleanup(self)
        switch_to_default_rook_cluster_project()
class CouchBase(PillowFight):
    """
    CouchBase workload operation
    """
    # Timeout (in seconds) used for all the wait loops of this class
    WAIT_FOR_TIME = 1800
    # Yaml files of the admission controller, in creation order
    admission_parts = [
        constants.COUCHBASE_ADMISSION_SERVICE_ACCOUNT_YAML,
        constants.COUCHBASE_ADMISSION_CLUSTER_ROLE_YAML,
        constants.COUCHBASE_ADMISSION_CLUSTER_ROLE_BINDING_YAML,
        constants.COUCHBASE_ADMISSION_SECRET_YAML,
        constants.COUCHBASE_ADMISSION_DEPLOYMENT_YAML,
        constants.COUCHBASE_ADMISSION_SERVICE_YAML,
        constants.COUCHBASE_MUTATING_WEBHOOK_YAML,
        constants.COUCHBASE_VALIDATING_WEBHOOK_YAML
    ]
    pod_obj = OCP(kind='pod')
    couchbase_pod = OCP(kind='pod')
    secretsadder = OCP(kind='pod')
    # Replaced by the admission pod name once setup_cb found it
    admission_pod = []
    cb_worker = OCS()
    cb_examples = OCS()

    def __init__(self, **kwargs):
        """
        Initializer function

        Args:
            kwargs (dict): Forwarded to the PillowFight initializer

        """
        super().__init__(**kwargs)

    def is_up_and_running(self, pod_name, ocp_value):
        """
        Test if the pod specified is up and running.

        Args:
            pod_name (str): Name of pod being checked.
            ocp_value (object): object used for running oc commands

        Returns:
            bool; True if the first container of the pod is ready and in
                running state, False otherwise (including when no pod name
                is given or the pod has no container status yet)

        """
        if not pod_name:
            return False
        pod_info = ocp_value.exec_oc_cmd(f"get pods {pod_name} -o json")
        # A pod that was just created may not report any container status
        # yet; treat that as "not running" so the callers keep polling.
        container_statuses = pod_info['status'].get('containerStatuses')
        if not container_statuses:
            return False
        first_container = container_statuses[0]
        return bool(
            first_container['ready'] and 'running' in first_container['state']
        )

    def setup_cb(self):
        """
        Creating admission parts,couchbase operator pod, couchbase worker secret

        """
        # Create admission controller
        log.info("Create admission controller process for Couchbase")
        switch_to_project('default')
        self.up_adm_chk = OCP(namespace="default")
        self.up_check = OCP(namespace=constants.COUCHBASE_OPERATOR)
        for adm_yaml in self.admission_parts:
            adm_data = templating.load_yaml(adm_yaml)
            adm_obj = OCS(**adm_data)
            adm_obj.create()

        # Wait for admission pod to be created
        for adm_pod in TimeoutSampler(self.WAIT_FOR_TIME, 3,
                                      get_pod_name_by_pattern,
                                      'couchbase-operator-admission',
                                      'default'):
            try:
                if self.is_up_and_running(adm_pod[0], self.up_adm_chk):
                    self.admission_pod = adm_pod[0]
                    break
            except IndexError:
                log.info("Admission pod is not ready yet")

        # Wait for admission pod to be running
        log.info("Waiting for admission pod to be running")
        self.pod_obj.wait_for_resource(
            condition='Running',
            resource_name=self.admission_pod,
            timeout=self.WAIT_FOR_TIME,
            sleep=10,
        )
        self.pod_obj.new_project(constants.COUCHBASE_OPERATOR)
        couchbase_data = templating.load_yaml(constants.COUCHBASE_CRD_YAML)
        self.couchbase_obj = OCS(**couchbase_data)
        self.couchbase_obj.create()
        op_data = templating.load_yaml(constants.COUCHBASE_OPERATOR_ROLE)
        self.operator_role = OCS(**op_data)
        self.operator_role.create()
        self.serviceaccount = OCP(namespace=constants.COUCHBASE_OPERATOR)
        self.serviceaccount.exec_oc_cmd(
            "create serviceaccount couchbase-operator")

        # Extract the name of the dockercfg secret of the service account
        # from the secrets listing: it starts at the known prefix and ends
        # at the next space.
        dockercfgs = self.serviceaccount.exec_oc_cmd("get secrets")
        startloc = dockercfgs.find('couchbase-operator-dockercfg')
        newdockerstr = dockercfgs[startloc:]
        endloc = newdockerstr.find(' ')
        dockerstr = newdockerstr[:endloc]
        self.secretsadder.exec_oc_cmd(
            f"secrets link serviceaccount/couchbase-operator secrets/{dockerstr}"
        )
        self.rolebinding = OCP(namespace=constants.COUCHBASE_OPERATOR)
        rolebind_cmd = "".join([
            "create rolebinding couchbase-operator-rolebinding ",
            "--role couchbase-operator ",
            "--serviceaccount couchbase-operator-namespace:couchbase-operator"
        ])
        self.rolebinding.exec_oc_cmd(rolebind_cmd)
        dep_data = templating.load_yaml(constants.COUCHBASE_OPERATOR_DEPLOY)
        self.cb_deploy = OCS(**dep_data)
        self.cb_deploy.create()

        # Wait for couchbase operator pod to be running
        for couchbase_pod in TimeoutSampler(self.WAIT_FOR_TIME, 3,
                                            get_pod_name_by_pattern,
                                            'couchbase-operator',
                                            constants.COUCHBASE_OPERATOR):
            try:
                if self.is_up_and_running(couchbase_pod[0], self.up_check):
                    break
            except IndexError:
                log.info("Couchbase operator is not up")

        cb_work = templating.load_yaml(constants.COUCHBASE_WORKER_SECRET)
        self.cb_worker = OCS(**cb_work)
        self.cb_worker.create()

    def create_couchbase_worker(self, replicas=1):
        """
        Deploy the Couchbase server pods and wait for them to be running

        The couchbase workers do not come up unless there is an admission
        controller running. The admission controller is started from the
        default project prior to bringing up the operator. Secrets,
        rolebindings and serviceaccounts need to also be generated (all of
        this is done by setup_cb).

        Args:
            replicas (int): Number of Couchbase server pods to deploy and
                to wait for

        """
        log.info('Creating pods..')
        cb_example = templating.load_yaml(constants.COUCHBASE_WORKER_EXAMPLE)
        cb_example['spec']['servers'][0]['size'] = replicas
        self.cb_examples = OCS(**cb_example)
        self.cb_examples.create()

        # Wait for the last of the workers to be running.
        log.info('Waiting for the pods to Running')
        for cb_wrk_pods in TimeoutSampler(self.WAIT_FOR_TIME, 3,
                                          get_pod_name_by_pattern,
                                          'cb-example',
                                          constants.COUCHBASE_OPERATOR):
            try:
                if len(cb_wrk_pods) == replicas:
                    counter = 0
                    for cb_pod in cb_wrk_pods:
                        if self.is_up_and_running(cb_pod, self.up_check):
                            counter += 1
                            log.info(f'Couchbase worker {cb_pod} is up')
                    if counter == replicas:
                        break
            except IndexError:
                log.info(
                    f'Expected number of couchbase pods are {replicas} '
                    f'but only found {len(cb_wrk_pods)}')

    def run_workload(self, replicas, num_items=None, num_threads=None,
                     run_in_bg=False):
        """
        Running workload with pillow fight operator

        Args:
            replicas (int): Number of pods
            num_items (int): Number of items to be loaded to the cluster
            num_threads (int): Number of threads
            run_in_bg (bool) : Optional run IOs in background

        Returns:
            concurrent.futures.Future: The background run, when run_in_bg is
                True. Otherwise the workload runs in the foreground and
                None is returned.

        """
        self.result = None
        log.info('Running IOs...')
        if run_in_bg:
            executor = ThreadPoolExecutor(1)
            self.result = executor.submit(PillowFight.run_pillowfights,
                                          self,
                                          replicas=replicas,
                                          num_items=num_items,
                                          num_threads=num_threads)
            # The executor is single use: let the submitted job finish and
            # release the worker thread afterwards without blocking here.
            executor.shutdown(wait=False)
            return self.result
        PillowFight.run_pillowfights(self,
                                     replicas=replicas,
                                     num_items=num_items,
                                     num_threads=num_threads)

    def analyze_run(self, skip_analyze=False):
        """
        Analyzing the workload run logs

        Args:
            skip_analyze (bool): Option to skip logs analysis

        """
        if not skip_analyze:
            log.info('Analyzing workload run logs..')
            PillowFight.analyze_all(self)

    def respin_couchbase_app_pod(self):
        """
        Respin a randomly chosen couchbase app pod

        The pod is deleted and the method waits for the pod object to be
        in Running state again. Nothing is returned.

        """
        app_pod_list = get_pod_name_by_pattern('cb-example',
                                               constants.COUCHBASE_OPERATOR)
        app_pod = random.choice(app_pod_list)
        log.info(f"respin pod {app_pod}")
        app_pod_obj = get_pod_obj(app_pod,
                                  namespace=constants.COUCHBASE_OPERATOR)
        app_pod_obj.delete(wait=True, force=False)
        wait_for_resource_state(resource=app_pod_obj,
                                state=constants.STATUS_RUNNING,
                                timeout=300)

    def get_couchbase_nodes(self):
        """
        Get nodes that contain a couchbase app pod

        Returns:
            list: List of nodes

        """
        app_pods_list = get_pod_name_by_pattern('cb-example',
                                                constants.COUCHBASE_OPERATOR)
        app_pod_objs = [
            get_pod_obj(pod, namespace=constants.COUCHBASE_OPERATOR)
            for pod in app_pods_list
        ]

        log.info("Create a list of nodes that contain a couchbase app pod")
        nodes_set = set()
        for pod in app_pod_objs:
            # Fetch the pod data once and use it for both the log and the set
            node_name = pod.get().get('spec').get('nodeName')
            log.info(f"pod {pod.name} located on "
                     f"node {node_name}")
            nodes_set.add(node_name)
        return list(nodes_set)

    def teardown(self):
        """
        Delete objects created in roughly reverse order of how they were created.

        """
        self.cb_examples.delete()
        self.cb_worker.delete()
        self.cb_deploy.delete()
        self.pod_obj.exec_oc_cmd(
            command="delete rolebinding couchbase-operator-rolebinding")
        self.pod_obj.exec_oc_cmd(
            command="delete serviceaccount couchbase-operator")
        self.operator_role.delete()
        self.couchbase_obj.delete()
        switch_to_project('default')
        self.pod_obj.delete_project(constants.COUCHBASE_OPERATOR)
        for adm_yaml in self.admission_parts:
            adm_data = templating.load_yaml(adm_yaml)
            adm_obj = OCS(**adm_data)
            adm_obj.delete()

        # Before the code below was added, the teardown task would sometimes
        # fail with the leftover objects because it would still see one of the
        # couchbase pods. Poll until no couchbase pod is left in 'default'.
        for admin_pod in TimeoutSampler(self.WAIT_FOR_TIME, 3,
                                        get_pod_name_by_pattern, 'couchbase',
                                        'default'):
            if not admin_pod:
                break
        PillowFight.cleanup(self)
        switch_to_default_rook_cluster_project()
class AMQ(object):
    """
    Workload operation using AMQ
    """

    def __init__(self, **kwargs):
        """
        Initializer function

        Creates the namespace for the workload and clones the AMQ
        repository into a temporary directory.

        Args:
            kwargs (dict):
                Following kwargs are valid
                namespace: namespace for the operator
                repo: AMQ repo where all necessary yaml file are there - a github link
                branch: branch to use from the repo

        """
        self.args = kwargs
        self.repo = self.args.get('repo', constants.OCS_WORKLOADS)
        self.branch = self.args.get('branch', 'master')
        self.namespace = self.args.get('namespace', 'my-project')
        self.amq_is_setup = False
        self.ocp = OCP()
        self.ns_obj = OCP(kind='namespace')
        self.pod_obj = OCP(kind='pod')
        self.kafka_obj = OCP(kind='Kafka')
        self.kafka_connect_obj = OCP(kind="KafkaConnect")
        self.kafka_bridge_obj = OCP(kind="KafkaBridge")
        self._create_namespace()
        self._clone_amq()

    def _create_namespace(self):
        """
        create namespace for amq

        """
        self.ocp.new_project(self.namespace)

    def _clone_amq(self):
        """
        clone the amq repo

        The repository is cloned into a new temporary directory
        (``self.dir``) and the relative paths of the yaml files used by the
        setup functions are recorded.

        Raises:
            CommandFailed, CalledProcessError: If the clone command failed

        """
        self.dir = tempfile.mkdtemp(prefix='amq_')
        try:
            log.info(f'cloning amq in {self.dir}')
            git_clone_cmd = f'git clone -b {self.branch} {self.repo} '
            run(
                git_clone_cmd,
                shell=True,
                cwd=self.dir,
                check=True
            )
        except (CommandFailed, CalledProcessError):
            log.error('Error during cloning of amq repository')
            raise

        # Paths are relative to self.dir, where the repository was cloned
        self.amq_dir = "ocs-workloads/amq/v1/install/cluster-operator"
        self.amq_dir_examples = "ocs-workloads/amq/v1/examples/templates/cluster-operator"
        self.amq_kafka_pers_yaml = "ocs-workloads/amq/v1/kafka-persistent.yaml"
        self.amq_kafka_connect_yaml = "ocs-workloads/amq/v1/kafka-connect.yaml"
        self.amq_kafka_bridge_yaml = "ocs-workloads/amq/v1/kafka-bridge.yaml"

    def setup_amq_cluster_operator(self):
        """
        Function to setup amq-cluster_operator,
        the yaml files are taken from the cloned repository
        it will make sure cluster-operator pod is running

        Raises:
            ResourceWrongStatusException: If the cluster-operator pod does
                not get to running state

        """
        run(f'oc apply -f {self.amq_dir} -n {self.namespace}', shell=True, check=True, cwd=self.dir)
        time.sleep(5)

        # Wait for strimzi-cluster-operator pod to be created
        if self.is_amq_pod_running(pod_pattern="cluster-operator"):
            log.info("strimzi-cluster-operator pod is in running state")
        else:
            raise ResourceWrongStatusException("strimzi-cluster-operator pod is not getting to running state")

        run(f'oc apply -f {self.amq_dir_examples} -n {self.namespace}', shell=True, check=True, cwd=self.dir)

        # checking pod status one more time
        if self.is_amq_pod_running(pod_pattern="cluster-operator"):
            log.info("strimzi-cluster-operator pod is in running state")
        else:
            raise ResourceWrongStatusException("strimzi-cluster-operator pod is not getting to running state")

    def is_amq_pod_running(self, pod_pattern="cluster-operator"):
        """
        The function checks if provided pod_pattern finds a pod and if the status is running or not

        The pod is looked up every 10 seconds for up to 300 seconds, so a
        pod that was not created yet is waited for instead of failing on
        the first lookup.

        Args:
            pod_pattern (str): the pattern for pod

        Returns:
            bool: status of pod: True if found pod is running

        """
        for pod in TimeoutSampler(
            300, 10, get_pod_name_by_pattern, pod_pattern, self.namespace
        ):
            try:
                if pod[0] is not None:
                    amq_pod = pod[0]
                    break
            except IndexError:
                # No matching pod yet - keep sampling until the timeout
                log.info(pod_pattern + " pod not ready yet")

        # checking pod status
        if self.pod_obj.wait_for_resource(
            condition='Running',
            resource_name=amq_pod,
            timeout=1600,
            sleep=30,
        ):
            log.info(amq_pod + " pod is up and running")
            return True
        return False

    def setup_amq_kafka_persistent(self):
        """
        Function to setup amq-kafka-persistent, the yaml file is taken from
        the cloned repository
        it will make kind: Kafka and will make sure the zookeeper pod is running

        Returns:
            kafka_persistent object

        Raises:
            ResourceWrongStatusException: If the zookeeper pod does not get
                to running state

        """
        try:
            kafka_persistent = templating.load_yaml(os.path.join(self.dir, self.amq_kafka_pers_yaml))
            self.kafka_persistent = OCS(**kafka_persistent)
            self.kafka_persistent.create()
        except (CommandFailed, CalledProcessError):
            log.error('Failed during setup of AMQ Kafka-persistent')
            raise
        time.sleep(5)
        if self.is_amq_pod_running(pod_pattern="zookeeper"):
            return self.kafka_persistent
        raise ResourceWrongStatusException("my-cluster-zookeeper Pod is not getting to running state")

    def setup_amq_kafka_connect(self):
        """
        The function is to setup amq-kafka-connect, the yaml file is taken
        from the cloned repository
        it will make kind: KafkaConnect and will make sure the status is running

        Returns:
            kafka_connect object

        Raises:
            ResourceWrongStatusException: If the connect pod does not get
                to running state

        """
        try:
            kafka_connect = templating.load_yaml(os.path.join(self.dir, self.amq_kafka_connect_yaml))
            self.kafka_connect = OCS(**kafka_connect)
            self.kafka_connect.create()
        except (CommandFailed, CalledProcessError):
            log.error('Failed during setup of AMQ KafkaConnect')
            raise

        if self.is_amq_pod_running(pod_pattern="my-connect-cluster-connect"):
            return self.kafka_connect
        raise ResourceWrongStatusException("my-connect-cluster-connect pod is not getting to running state")

    def setup_amq_kafka_bridge(self):
        """
        Function to setup amq-kafka-bridge, the yaml file is taken from the
        cloned repository
        it will make kind: KafkaBridge and will make sure the pod status is running

        Returns:
            kafka_bridge object

        Raises:
            ResourceWrongStatusException: If the bridge pod does not get to
                running state

        """
        try:
            kafka_bridge = templating.load_yaml(os.path.join(self.dir, self.amq_kafka_bridge_yaml))
            self.kafka_bridge = OCS(**kafka_bridge)
            self.kafka_bridge.create()
        except (CommandFailed, CalledProcessError):
            log.error('Failed during setup of AMQ KafkaBridge')
            raise

        # Making sure the kafka_bridge is running
        if self.is_amq_pod_running(pod_pattern="my-bridge-bridge"):
            return self.kafka_bridge
        raise ResourceWrongStatusException("kafka_bridge_pod pod is not getting to running state")

    def setup_amq(self):
        """
        Setup AMQ from local folder,
        function will call all necessary sub functions to make sure amq installation is complete

        Returns:
            AMQ: This object, with ``amq_is_setup`` set to True

        """
        self.setup_amq_cluster_operator()
        self.setup_amq_kafka_persistent()
        self.setup_amq_kafka_connect()
        self.setup_amq_kafka_bridge()
        self.amq_is_setup = True
        return self

    def cleanup(self):
        """
        Clean up function.

        When the full setup was done, the kafka persistent, connect and
        bridge objects are deleted first, followed by the cluster operator
        resources. At the end the created namespace is deleted and the
        function waits for the deletion to finish.

        """
        if self.amq_is_setup:
            self.kafka_persistent.delete()
            self.kafka_connect.delete()
            self.kafka_bridge.delete()
            run_cmd(f'oc delete -f {self.amq_dir}', shell=True, check=True, cwd=self.dir)
            run_cmd(f'oc delete -f {self.amq_dir_examples}', shell=True, check=True, cwd=self.dir)
        run_cmd(f'oc delete project {self.namespace}')
        # Reset namespace to default
        switch_to_default_rook_cluster_project()
        self.ns_obj.wait_for_delete(resource_name=self.namespace)
def test_fio_workload_simple(self, ripsaw, es, interface, io_pattern):
    """
    This is a basic fio perf test

    A fio benchmark CR is created from the template, sized according to
    the ceph capacity, and run to completion. Afterwards the results are
    passed to the regression analysis.

    Args:
        ripsaw : benchmark operator fixture, used to apply the CRD
        es : internal elastic-search fixture, providing the server IP and
            port that the benchmark reports to
        interface (str): 'CephBlockPool' selects the RBD storage class, any
            other value selects the CephFS one
        io_pattern (str): the IO pattern; 'sequential' limits the fio jobs
            to write and read

    """
    # Deployment ripsaw
    log.info("Deploying ripsaw operator")
    ripsaw.apply_crd('resources/crds/' 'ripsaw_v1alpha1_ripsaw_crd.yaml')
    if interface == 'CephBlockPool':
        sc = 'ocs-storagecluster-ceph-rbd'
    else:
        sc = 'ocs-storagecluster-cephfs'

    # Create fio benchmark
    log.info("Create resource file for fio workload")
    fio_cr = templating.load_yaml(constants.FIO_CR_YAML)

    # Saving the Original elastic-search IP and PORT - if defined in yaml
    original_es = fio_cr['spec'].get('elasticsearch', {})
    es_server = original_es['server'] if 'server' in original_es else ""
    es_port = original_es['port'] if 'port' in original_es else ""

    # Use the internal define elastic-search server in the test
    fio_cr['spec']['elasticsearch'] = {
        'server': es.get_ip(),
        'port': es.get_port()
    }

    # Setting the data set to 40% of the total storage capacity
    ceph_cluster = CephCluster()
    ceph_capacity = ceph_cluster.get_ceph_capacity()
    total_data_set = int(ceph_capacity * 0.4)
    workload_args = fio_cr['spec']['workload']['args']
    filesize = int(workload_args['filesize'].replace('GiB', ''))

    # To make sure the number of App pods will not be more then 50, in case
    # of large data set, changing the size of the file each pod will work on
    if total_data_set > 500:
        filesize = int(ceph_capacity * 0.008)
        workload_args['filesize'] = f'{filesize}GiB'
        # make sure that the storage size is larger then the file size
        workload_args['storagesize'] = f'{int(filesize * 1.2)}Gi'
    workload_args['servers'] = int(total_data_set / filesize)
    log.info(f'Total Data set to work on is : {total_data_set} GiB')

    fio_cr['spec']['clustername'] = (
        config.ENV_DATA['platform'] + get_build() + get_ocs_version()
    )
    fio_cr['spec']['test_user'] = get_ocs_version() + interface + io_pattern
    workload_args['storageclass'] = sc
    if io_pattern == 'sequential':
        workload_args['jobs'] = ['write', 'read']
    log.info(f'fio_cr: {fio_cr}')
    fio_cr_obj = OCS(**fio_cr)
    fio_cr_obj.create()

    # Wait for fio client pod to be created
    for fio_pod in TimeoutSampler(300, 20, get_pod_name_by_pattern,
                                  'fio-client', constants.RIPSAW_NAMESPACE):
        try:
            if fio_pod[0] is not None:
                fio_client_pod = fio_pod[0]
                break
        except IndexError:
            log.info("Bench pod not ready yet")

    # Wait for fio pod to initialized and complete
    log.info("Waiting for fio_client to complete")
    pod_obj = OCP(kind='pod')
    pod_obj.wait_for_resource(
        condition='Completed',
        resource_name=fio_client_pod,
        timeout=18000,
        sleep=300,
    )

    # The benchmark reports an execution failure in its log; make sure such
    # a failure is visible in the test log instead of passing silently.
    output = run_cmd(f'oc logs {fio_client_pod}')
    if 'Fio failed to execute' in output:
        log.error("FIO failed to complete")
    else:
        log.info("FIO has completed successfully")

    # Clean up fio benchmark
    log.info("Deleting FIO benchmark")
    fio_cr_obj.delete()

    # Setting back the original elastic-search information
    fio_cr['spec']['elasticsearch'] = {
        'server': es_server,
        'port': es_port
    }
    analyze_regression(io_pattern, sc,
                       es_username=fio_cr['spec']['test_user'])
def test_fio_workload_simple(self, ripsaw, es, interface, io_pattern):
    """
    This is a basic fio perf test

    A fio benchmark CR is created from the template, sized according to
    the ceph capacity, and run to completion. The test parameters and the
    results are collected in a FIOResultsAnalyse object, which is then
    analyzed, written to the Elastic-Search server and pushed to codespeed.

    Args:
        ripsaw : benchmark operator fixture, used to apply the CRD and to
            get the UUID of the benchmark
        es : internal elastic-search fixture; when it is set, the benchmark
            reports to it and its data is copied to the main server at the
            end of the test
        interface (str): 'CephBlockPool' selects the RBD storage class, any
            other value selects the CephFS one
        io_pattern (str): the IO pattern; 'sequential' limits the fio jobs
            to write and read with an IO depth of 1

    """
    # Deployment ripsaw
    log.info("Deploying ripsaw operator")
    ripsaw.apply_crd('resources/crds/' 'ripsaw_v1alpha1_ripsaw_crd.yaml')
    if interface == 'CephBlockPool':
        sc = constants.CEPHBLOCKPOOL_SC
    else:
        sc = constants.CEPHFILESYSTEM_SC

    # Create fio benchmark
    log.info("Create resource file for fio workload")
    fio_cr = templating.load_yaml(constants.FIO_CR_YAML)

    # Saving the Original elastic-search IP and PORT - if defined in yaml.
    # The backup is always defined, since it is restored later on even when
    # the yaml file has no elastic-search information.
    if 'elasticsearch' in fio_cr['spec']:
        backup_es = fio_cr['spec']['elasticsearch']
    else:
        log.warning(
            'Elastic Search information does not exists in YAML file')
        backup_es = {}
        fio_cr['spec']['elasticsearch'] = {}

    # Use the internal define elastic-search server in the test - if exist
    if es:
        fio_cr['spec']['elasticsearch'] = {
            'server': es.get_ip(),
            'port': es.get_port()
        }

    # Setting the data set to 40% of the total storage capacity
    ceph_cluster = CephCluster()
    ceph_capacity = ceph_cluster.get_ceph_capacity()
    total_data_set = int(ceph_capacity * 0.4)
    workload_args = fio_cr['spec']['workload']['args']
    filesize = int(workload_args['filesize'].replace('GiB', ''))

    # To make sure the number of App pods will not be more then 50, in case
    # of large data set, changing the size of the file each pod will work on
    if total_data_set > 500:
        filesize = int(ceph_capacity * 0.008)
        workload_args['filesize'] = f'{filesize}GiB'
        # make sure that the storage size is larger then the file size
        workload_args['storagesize'] = f'{int(filesize * 1.2)}Gi'
    workload_args['servers'] = int(total_data_set / filesize)
    log.info(f'Total Data set to work on is : {total_data_set} GiB')

    environment = get_environment_info()
    if environment['user'] != '':
        fio_cr['spec']['test_user'] = environment['user']
    fio_cr['spec']['clustername'] = environment['clustername']
    log.debug(f'Environment information is : {environment}')

    workload_args['storageclass'] = sc
    if io_pattern == 'sequential':
        workload_args['jobs'] = ['write', 'read']
        workload_args['iodepth'] = 1
    log.info(f'The FIO CR file is {fio_cr}')
    fio_cr_obj = OCS(**fio_cr)
    fio_cr_obj.create()

    # Wait for fio client pod to be created
    for fio_pod in TimeoutSampler(300, 20, get_pod_name_by_pattern,
                                  'fio-client', constants.RIPSAW_NAMESPACE):
        try:
            if fio_pod[0] is not None:
                fio_client_pod = fio_pod[0]
                break
        except IndexError:
            log.info("Bench pod not ready yet")

    # Getting the start time of the test
    start_time = time.strftime('%Y-%m-%dT%H:%M:%SGMT', time.gmtime())

    # Getting the UUID from inside the benchmark pod
    uuid = ripsaw.get_uuid(fio_client_pod)

    # Setting back the original elastic-search information
    fio_cr['spec']['elasticsearch'] = backup_es

    full_results = FIOResultsAnalyse(uuid, fio_cr)

    # Initialize the results doc file.
    for key, value in environment.items():
        full_results.add_key(key, value)

    # Setting the global parameters of the test
    full_results.add_key('io_pattern', io_pattern)
    full_results.add_key('dataset', f'{total_data_set}GiB')
    full_results.add_key('file_size', workload_args['filesize'])
    full_results.add_key('servers', workload_args['servers'])
    full_results.add_key('samples', workload_args['samples'])
    full_results.add_key('operations', workload_args['jobs'])
    full_results.add_key('block_sizes', workload_args['bs'])
    full_results.add_key('io_depth', workload_args['iodepth'])
    full_results.add_key('jobs', workload_args['numjobs'])
    full_results.add_key(
        'runtime', {
            'read': workload_args['read_runtime'],
            'write': workload_args['write_runtime']
        })
    full_results.add_key('storageclass', workload_args['storageclass'])
    full_results.add_key('vol_size', workload_args['storagesize'])

    # Wait for fio pod to initialized and complete
    log.info("Waiting for fio_client to complete")
    pod_obj = OCP(kind='pod')
    pod_obj.wait_for_resource(
        condition='Completed',
        resource_name=fio_client_pod,
        timeout=18000,
        sleep=300,
    )

    # Getting the end time of the test
    end_time = time.strftime('%Y-%m-%dT%H:%M:%SGMT', time.gmtime())
    full_results.add_key('test_time', {
        'start': start_time,
        'end': end_time
    })

    # The benchmark reports an execution failure in its log; make sure such
    # a failure is visible in the test log instead of passing silently.
    output = run_cmd(f'oc logs {fio_client_pod}')
    log.info(f'The Test log is : {output}')
    if 'Fio failed to execute' in output:
        log.error("FIO failed to complete")
    else:
        log.info("FIO has completed successfully")

    # Clean up fio benchmark
    log.info("Deleting FIO benchmark")
    fio_cr_obj.delete()

    log.debug(f'Full results is : {full_results.results}')

    # if Internal ES is exists, Copy all data from the Internal to main ES
    if es:
        log.info('Copy all data from Internal ES to Main ES')
        es._copy(full_results.es)
    # Adding this sleep between the copy and the analyzing of the results
    # since sometimes the results of the read (just after write) are empty
    time.sleep(30)
    full_results.analyze_results()  # Analyze the results
    # Writing the analyzed test results to the Elastic-Search server
    full_results.es_write()
    full_results.codespeed_push()  # Push results to codespeed
    # Creating full link to the results on the ES server
    log.info(f'The Result can be found at ; {full_results.results_link()}')
class QuayOperator(object):
    """
    Quay operator class

    Installs the Quay operator subscription and a Quay registry in the
    ``constants.OPENSHIFT_OPERATORS`` namespace, and removes everything it
    created again in ``teardown``.
    """

    def __init__(self):
        """
        Quay operator initializer function
        """
        self.namespace = constants.OPENSHIFT_OPERATORS
        self.ocp_obj = ocp.OCP(namespace=self.namespace)
        # Resources created by this object; they stay falsy until created so
        # that teardown() only removes what actually exists
        self.quay_operator = None
        self.quay_registry = None
        self.quay_registry_secret = None
        self.quay_pod_obj = OCP(kind=constants.POD, namespace=self.namespace)
        self.quay_registry_name = ""
        self.quay_operator_csv = ""
        self.quay_registry_secret_name = ""
        # True only when create_quay_registry() had to mark sc_name as the
        # default storage class; teardown() reverts the annotation in that case
        self.sc_default = False
        self.sc_name = (
            constants.DEFAULT_EXTERNAL_MODE_STORAGECLASS_RBD
            if storagecluster_independent_check()
            else constants.DEFAULT_STORAGECLASS_RBD
        )

    def _patch_default_storage_class(self, value):
        """
        Set the "is-default-class" annotation on the storage class

        Args:
            value (str): Annotation value, "true" or "false"

        """
        patch = (
            ' \'{"metadata": {"annotations":'
            '{"storageclass.kubernetes.io/is-default-class":"%s"}}}\' ' % value
        )
        run_cmd(
            f"oc patch storageclass {self.sc_name} "
            f"-p {patch} "
            f"--request-timeout=120s"
        )

    def _registry_endpoint(self):
        """
        Read the registry endpoint from the Quay registry resource

        Returns:
            str: The "registryEndpoint" value, or None when the resource has
                no status or no endpoint yet

        """
        # A freshly created registry may not carry a status section at all,
        # so do not chain .get() on a possible None
        status = self.quay_registry.get().get("status") or {}
        return status.get("registryEndpoint")

    def setup_quay_operator(self):
        """
        Deploys Quay operator
        """
        quay_operator_data = templating.load_yaml(file=constants.QUAY_SUB)
        self.quay_operator = OCS(**quay_operator_data)
        logger.info(f"Installing Quay operator: {self.quay_operator.name}")
        self.quay_operator.create()
        # Poll until an operator pod shows up, then wait for it to run
        for quay_pod in TimeoutSampler(
            300, 10, get_pod_name_by_pattern, constants.QUAY_OPERATOR, self.namespace
        ):
            if quay_pod:
                self.quay_pod_obj.wait_for_resource(
                    condition=constants.STATUS_RUNNING,
                    resource_name=quay_pod[0],
                    sleep=30,
                    timeout=600,
                )
                break
        self.quay_operator_csv = get_csvs_start_with_prefix(
            csv_prefix=constants.QUAY_OPERATOR,
            namespace=self.namespace,
        )[0]["metadata"]["name"]

    def create_quay_registry(self):
        """
        Creates Quay registry
        """
        if not helpers.get_default_storage_class():
            # No default storage class is set: make sc_name the default and
            # remember that, so teardown() can undo it
            self._patch_default_storage_class("true")
            self.sc_default = True
        self.quay_registry_secret_name = create_unique_resource_name(
            "quay-user", "secret"
        )
        logger.info(
            f"Creating Quay registry config for super-user access: {self.quay_registry_secret_name}"
        )
        self.quay_registry_secret = self.ocp_obj.exec_oc_cmd(
            command=f"create secret generic --from-file config.yaml={constants.QUAY_SUPER_USER} "
            f"{self.quay_registry_secret_name}"
        )
        quay_registry_data = templating.load_yaml(file=constants.QUAY_REGISTRY)
        self.quay_registry_name = quay_registry_data["metadata"]["name"]
        quay_registry_data["spec"][
            "configBundleSecret"
        ] = self.quay_registry_secret_name
        self.quay_registry = OCS(**quay_registry_data)
        logger.info(f"Creating Quay registry: {self.quay_registry.name}")
        self.quay_registry.create()
        logger.info("Waiting for 15s for registry to get initialized")
        sleep(15)
        self.wait_for_quay_endpoint()

    def wait_for_quay_endpoint(self):
        """
        Waits for quay registry endpoint

        Raises:
            TimeoutExpiredError: If the endpoint is not reported in time

        """
        logger.info("Waiting for quay registry endpoint to be up")
        sample = TimeoutSampler(
            timeout=300,
            sleep=15,
            func=self.check_quay_registry_endpoint,
        )
        if not sample.wait_for_func_status(result=True):
            logger.error("Quay registry endpoint did not get created.")
            raise TimeoutExpiredError
        else:
            logger.info("Quay registry endpoint is up")

    def check_quay_registry_endpoint(self):
        """
        Checks if quay registry endpoint is up

        Returns:
            bool: True if quay endpoint is up else False

        """
        return bool(self._registry_endpoint())

    def get_quay_endpoint(self):
        """
        Returns quay endpoint

        Returns:
            str: The registry endpoint, or None when it is not reported yet

        """
        return self._registry_endpoint()

    def teardown(self):
        """
        Quay operator teardown
        """
        if self.sc_default:
            # Revert the default storage class annotation set during
            # create_quay_registry()
            self._patch_default_storage_class("false")
        if self.quay_registry_secret:
            self.ocp_obj.exec_oc_cmd(
                f"delete secret {self.quay_registry_secret_name}"
            )
        if self.quay_registry:
            self.quay_registry.delete()
        if self.quay_operator:
            self.quay_operator.delete()
        if self.quay_operator_csv:
            self.ocp_obj.exec_oc_cmd(
                f"delete {constants.CLUSTER_SERVICE_VERSION} "
                f"{self.quay_operator_csv}"
            )
def test_run_pgsql(self, transactions, pod_name):
    """
    Run a pgbench workload while a Ceph pod is respun

    Creates the pgbench benchmark resource, deletes the resource named by
    ``pod_name`` through the disruption helper while the benchmark is active,
    waits for the pgbench client pod to complete, checks that every parsed
    result carries a ``latency_avg`` value, exports the results to the
    "OCS PGSQL" spreadsheet and finally deletes the benchmark.

    Args:
        transactions (int): Number of pgbench transactions; also used as the
            pod discovery timeout and, multiplied by 3, as completion timeout
        pod_name (str): Name of the Ceph resource to respin

    Raises:
        UnexpectedBehaviour: If a parsed result has no latency_avg data

    """
    # Create pgbench benchmark
    log.info("Create resource file for pgbench workload")
    completion_timeout = transactions * 3
    benchmark_cr = templating.load_yaml(constants.PGSQL_BENCHMARK_YAML)
    benchmark_cr['spec']['workload']['args']['transactions'] = transactions
    benchmark = OCS(**benchmark_cr)
    benchmark.create()

    # Wait for pgbench pod to be created
    pod_sampler = TimeoutSampler(
        transactions, 3, get_pod_name_by_pattern, 'pgbench', 'my-ripsaw'
    )
    for found_pods in pod_sampler:
        try:
            candidate = found_pods[0]
        except IndexError:
            log.info("Bench pod not ready yet")
            continue
        if candidate is not None:
            pgbench_client_pod = candidate
            break

    # Respin Ceph pod
    target_resource = f'{pod_name}'
    log.info(f"Respin Ceph pod {pod_name}")
    disruption = disruption_helpers.Disruptions()
    disruption.set_resource(resource=target_resource)
    disruption.delete_resource()

    # Wait for pg_bench pod to initialized and complete
    log.info("Waiting for pgbench_client to complete")
    OCP(kind='pod').wait_for_resource(
        condition='Completed',
        resource_name=pgbench_client_pod,
        timeout=completion_timeout,
        sleep=10,
    )

    # Running pgbench and parsing logs
    raw_logs = run_cmd(f'oc logs {pgbench_client_pod}')
    parsed_runs = utils.parse_pgsql_logs(raw_logs)
    log.info(
        "*******PGBench output log*********\n"
        f"{parsed_runs}"
    )
    for run in parsed_runs:
        if not run['latency_avg']:
            raise UnexpectedBehaviour(
                "PGBench failed to run, no data found on latency_avg"
            )
    log.info("PGBench has completed successfully")

    # Collect data and export to Google doc spreadsheet
    sheet = GoogleSpreadSheetAPI(sheet_name="OCS PGSQL", sheet_index=2)
    columns = ('latency_avg', 'lat_stddev', 'tps_incl', 'tps_excl')
    for run in parsed_runs:
        # Read all four values first, then convert, as a missing key must
        # surface before any conversion error
        raw_values = [run[column] for column in columns]
        sheet.insert_row([int(value) for value in raw_values], 2)

    # Clean up pgbench benchmark
    log.info("Deleting PG bench benchmark")
    benchmark.delete()
class Postgresql(RipSaw):
    """
    Postgresql workload operation
    """

    def __init__(self, **kwargs):
        """
        Initializer function

        Args:
            **kwargs: Passed through unchanged to the RipSaw initializer

        """
        super().__init__(**kwargs)
        # cleanup() reads this flag, so it must exist even when
        # setup_postgresql() was never called or failed part-way
        self.pgsql_is_setup = False
        self._apply_crd(crd=RIPSAW_CRD)

    def _apply_crd(self, crd):
        """
        Apply the CRD

        Args:
            crd (str): yaml to apply

        """
        RipSaw.apply_crd(self, crd=crd)

    @staticmethod
    def _new_pgbench_table():
        """
        Create an empty results table with the pgbench column headers

        Returns:
            PrettyTable: Table with field names set and no rows

        """
        table = PrettyTable()
        table.field_names = [
            'pod_name', 'scaling_factor', 'num_clients', 'num_threads',
            'trans_client', 'actually_trans', 'latency_avg', 'lat_stddev',
            'tps_incl', 'tps_excl'
        ]
        return table

    @staticmethod
    def _add_pgbench_rows(table, pod_name, pg_output):
        """
        Append one table row per parsed pgbench run of a pod

        Args:
            table (PrettyTable): Table to extend
            pod_name (str): Name of the pgbench pod the output belongs to
            pg_output (list): Parsed pgbench output, a list of dicts whose
                values are the per-run statistics dicts

        """
        for run_output in pg_output:
            for stats in run_output.values():
                table.add_row([
                    pod_name,
                    stats['scaling_factor'],
                    stats['num_clients'],
                    stats['num_threads'],
                    stats['number_of_transactions_per_client'],
                    stats['number_of_transactions_actually_processed'],
                    stats['latency_avg'],
                    stats['lat_stddev'],
                    stats['tps_incl'],
                    stats['tps_excl'],
                ])

    def setup_postgresql(self, replicas):
        """
        Deploy postgres sql server

        Args:
            replicas (int): Number of postgresql pods to be deployed

        Raises:
            CommandFailed: If PostgreSQL server setup fails

        """
        log.info("Deploying postgres database")
        try:
            pgsql_service = templating.load_yaml(constants.PGSQL_SERVICE_YAML)
            pgsql_cmap = templating.load_yaml(constants.PGSQL_CONFIGMAP_YAML)
            pgsql_sset = templating.load_yaml(constants.PGSQL_STATEFULSET_YAML)
            pgsql_sset['spec']['replicas'] = replicas
            self.pgsql_service = OCS(**pgsql_service)
            self.pgsql_service.create()
            self.pgsql_cmap = OCS(**pgsql_cmap)
            self.pgsql_cmap.create()
            self.pgsql_sset = OCS(**pgsql_sset)
            self.pgsql_sset.create()
            self.pod_obj.wait_for_resource(
                condition='Running',
                selector='app=postgres',
                resource_count=replicas,
                timeout=3600
            )
        except (CommandFailed, CalledProcessError) as cf:
            log.error('Failed during setup of PostgreSQL server')
            raise cf
        self.pgsql_is_setup = True
        log.info("Successfully deployed postgres database")

    def create_pgbench_benchmark(
        self, replicas, clients=None, threads=None,
        transactions=None, scaling_factor=None, timeout=None
    ):
        """
        Create pgbench benchmark pods

        Args:
            replicas (int): Number of pgbench pods to be deployed
            clients (int): Number of clients
            threads (int): Number of threads
            transactions (int): Number of transactions
            scaling_factor (int): scaling factor
            timeout (int): Time in seconds to wait

        Returns:
            List: pgbench pod objects list

        """
        pg_obj_list = []
        for i in range(replicas):
            log.info("Create resource file for pgbench workload")
            pg_data = templating.load_yaml(constants.PGSQL_BENCHMARK_YAML)
            pg_data['metadata']['name'] = f"pgbench-benchmark{i}"
            # Each benchmark targets its own postgres pod
            pg_data['spec']['workload']['args']['databases'][0][
                'host'
            ] = f"postgres-{i}.postgres"

            # Only override the values the caller asked for; everything else
            # keeps the default from the benchmark yaml
            if clients is not None:
                pg_data['spec']['workload']['args']['clients'][0] = clients
            if threads is not None:
                pg_data['spec']['workload']['args']['threads'] = threads
            if transactions is not None:
                pg_data['spec']['workload']['args'][
                    'transactions'
                ] = transactions
            if scaling_factor is not None:
                pg_data['spec']['workload']['args'][
                    'scaling_factor'
                ] = scaling_factor
            pg_obj = OCS(**pg_data)
            pg_obj_list.append(pg_obj)
            pg_obj.create()

        # Confirm that expected pgbench pods are spinned
        log.info("Checking if Getting pgbench pods name")
        timeout = timeout if timeout else 300
        # NOTE(review): `replicas` is passed as the sampler's second
        # positional argument - confirm this is the intended polling interval
        for pgbench_pods in TimeoutSampler(
            timeout, replicas, get_pod_name_by_pattern,
            'pgbench-1-dbs-client', RIPSAW_NAMESPACE
        ):
            if len(pgbench_pods) == replicas:
                log.info(
                    f"Expected number of pgbench pods are "
                    f"found: {replicas}"
                )
                break
            # len() never raises IndexError, so report the shortfall here
            # instead of in an exception handler that could not trigger
            log.info(
                f'Expected number of pgbench pods are {replicas} '
                f'but only found {len(pgbench_pods)}'
            )
        return pg_obj_list

    def get_postgres_pods(self):
        """
        Get all postgres pods

        Returns:
            List: postgres pod objects list

        """
        return get_all_pods(namespace=RIPSAW_NAMESPACE, selector=['postgres'])

    def get_pgbench_pods(self):
        """
        Get all pgbench pods

        Returns:
            List: pgbench pod objects list

        """
        return [
            get_pod_obj(pod)
            for pod in get_pod_name_by_pattern('pgbench', RIPSAW_NAMESPACE)
        ]

    def delete_pgbench_pods(self, pg_obj_list):
        """
        Delete all pgbench pods on cluster

        Args:
            pg_obj_list (list): Objects to delete, as returned by
                create_pgbench_benchmark

        """
        log.info("Delete pgbench Benchmark")
        for pgbench_pod in pg_obj_list:
            pgbench_pod.delete(force=True)

    def is_pgbench_running(self):
        """
        Check if pgbench is running

        Returns:
            bool: True if pgbench is running; False otherwise

        """
        for pod in self.get_pgbench_pods():
            container_statuses = pod.get().get('status').get('containerStatuses')
            # The container state is a mapping keyed by the state name (see
            # get_pgbench_status), so test for the key rather than comparing
            # the whole mapping with a string
            state = container_statuses[0].get('state')
            if state and 'running' in state:
                log.info("One or more pgbench pods are in running state")
                return True
        return False

    def get_pgbench_status(self, pgbench_pod_name):
        """
        Get pgbench status

        Args:
            pgbench_pod_name (str): Name of the pgbench pod

        Returns:
            str: state of pgbench pod (running/completed)

        """
        pod_obj = get_pod_obj(pgbench_pod_name, namespace=RIPSAW_NAMESPACE)
        status = pod_obj.get().get('status').get('containerStatuses')[0].get(
            'state'
        )
        # For a container that is not running, report the reason recorded
        # under its 'terminated' state
        return 'running' if list(
            status.keys()
        )[0] == 'running' else status['terminated']['reason']

    def wait_for_postgres_status(
        self, status=constants.STATUS_RUNNING, timeout=300
    ):
        """
        Wait for postgres pods status to reach running/completed

        Args:
            status (str): status to reach Running or Completed
            timeout (int): Time in seconds to wait

        """
        log.info(f"Waiting for postgres pods to be reach {status} state")
        postgres_pod_objs = self.get_postgres_pods()
        for postgres_pod_obj in postgres_pod_objs:
            wait_for_resource_state(
                resource=postgres_pod_obj, state=status, timeout=timeout
            )

    def wait_for_pgbench_status(self, status, timeout=None):
        """
        Wait for pgbench benchmark pods status to reach running/completed

        Args:
            status (str): status to reach Running or Completed
            timeout (int): Time in seconds to wait

        Raises:
            UnexpectedBehaviour: If a pgbench pod does not reach the
                requested state within the timeout

        """
        # Sometimes with the default values in the benchmark yaml the pgbench
        # pod is not getting completed within the specified time and the tests
        # are failing. This seems to vary with the infrastructure, so for now
        # the timeout defaults to 30 mins; it is to be revisited based on the
        # results of monitoring the pgbench pods on each run.
        timeout = timeout if timeout else 1800

        # Wait for pg_bench pods to initialized and running
        log.info(f"Waiting for pgbench pods to be reach {status} state")
        pgbench_pod_objs = self.get_pgbench_pods()
        for pgbench_pod_obj in pgbench_pod_objs:
            try:
                wait_for_resource_state(
                    resource=pgbench_pod_obj, state=status, timeout=timeout
                )
            except ResourceWrongStatusException:
                # Attach the pod log so the failure can be diagnosed
                output = run_cmd(f'oc logs {pgbench_pod_obj.name}')
                error_msg = f'{pgbench_pod_obj.name} did not reach to {status} state after {timeout} sec\n{output}'
                log.error(error_msg)
                raise UnexpectedBehaviour(error_msg)

    def validate_pgbench_run(self, pgbench_pods, print_table=True):
        """
        Validate pgbench run

        Args:
            pgbench_pods (list): List of pgbench pods
            print_table (bool): Log a summary table of all results

        Returns:
            list: Tuples of (parsed pgbench output, pod name), one per pod

        Raises:
            UnexpectedBehaviour: If a run has no latency_avg data

        """
        all_pgbench_pods_output = []
        for pgbench_pod in pgbench_pods:
            log.info(f"pgbench_client_pod===={pgbench_pod.name}====")
            output = run_cmd(f'oc logs {pgbench_pod.name}')
            pg_output = utils.parse_pgsql_logs(output)
            log.info(
                "*******PGBench output log*********\n"
                f"{pg_output}"
            )
            for data in pg_output:
                # Each entry is keyed by its run id; the first key holds the
                # statistics that are validated
                run_id = list(data.keys())
                latency_avg = data[run_id[0]]['latency_avg']
                if not latency_avg:
                    raise UnexpectedBehaviour(
                        "PGBench failed to run, "
                        "no data found on latency_avg"
                    )
            log.info(f"PGBench on {pgbench_pod.name} completed successfully")
            all_pgbench_pods_output.append((pg_output, pgbench_pod.name))

        if print_table:
            pgbench_pod_table = self._new_pgbench_table()
            for pg_output, pod_name in all_pgbench_pods_output:
                self._add_pgbench_rows(pgbench_pod_table, pod_name, pg_output)
            log.info(f'\n{pgbench_pod_table}\n')

        return all_pgbench_pods_output

    def get_pgsql_nodes(self):
        """
        Get nodes that contain a pgsql app pod

        Returns:
            list: Names of the nodes that run a pgsql app pod

        """
        pgsql_pod_objs = self.pod_obj.get(
            selector=constants.PGSQL_APP_LABEL, all_namespaces=True
        )

        log.info("Create a list of nodes that contain a pgsql app pod")
        # A set, because several pods may share a node
        nodes_set = set()
        for pod in pgsql_pod_objs['items']:
            log.info(
                f"pod {pod['metadata']['name']} located on node {pod['spec']['nodeName']}"
            )
            nodes_set.add(pod['spec']['nodeName'])
        return list(nodes_set)

    def respin_pgsql_app_pod(self):
        """
        Respin the pgsql app pod

        Deletes one randomly chosen pgsql app pod and waits for it to reach
        the Running state again.
        """
        app_pod_list = get_operator_pods(
            constants.PGSQL_APP_LABEL, constants.RIPSAW_NAMESPACE
        )
        app_pod = random.choice(app_pod_list)
        log.info(f"respin pod {app_pod.name}")
        app_pod.delete(wait=True, force=False)
        wait_for_resource_state(
            resource=app_pod, state=constants.STATUS_RUNNING, timeout=300
        )

    def get_pgbech_pod_status_table(self, pgbench_pods):
        """
        Get pgbench pod data and print results on a table

        Args:
            pgbench_pods (list): List of pgbench pods

        """
        pgbench_pod_table = self._new_pgbench_table()
        for pgbench_pod in pgbench_pods:
            output = run_cmd(f'oc logs {pgbench_pod.name}')
            pg_output = utils.parse_pgsql_logs(output)
            self._add_pgbench_rows(
                pgbench_pod_table, pgbench_pod.name, pg_output
            )
        log.info(f'\n{pgbench_pod_table}\n')

    def cleanup(self):
        """
        Clean up

        Deletes the postgres resources (only when setup_postgresql completed),
        all pgbench pods and finally the ripsaw configuration.
        """
        log.info("Deleting postgres pods and configuration")
        # getattr keeps cleanup usable even on an object whose initializer
        # did not run to completion
        if getattr(self, 'pgsql_is_setup', False):
            self.pgsql_sset.delete()
            self.pgsql_cmap.delete()
            self.pgsql_service.delete()
        log.info("Deleting pgbench pods")
        pods_obj = self.get_pgbench_pods()
        for pod in pods_obj:
            pod.delete()
            pod.ocp.wait_for_delete(pod.name)
        log.info("Deleting ripsaw configuration")
        RipSaw.cleanup(self)
def test_recovery_from_volume_deletion(self, nodes, pvc_factory, pod_factory):
    """
    Test cluster recovery from disk deletion from the platform side.
    Based on documented procedure detailed in
    https://bugzilla.redhat.com/show_bug.cgi?id=1823183

    Flow: pick a random OSD PV, detach its backing volume, scale down the
    matching OSD deployment, run the ocs-osd-removal job, make sure the old
    OSD resources are gone, then wait for a replacement OSD PVC and pod and
    validate cluster health.

    Args:
        nodes: Platform nodes object providing get_data_volumes and
            detach_volume
        pvc_factory: Factory passed to the sanity helper for PVC creation
        pod_factory: Factory passed to the sanity helper for pod creation

    """
    logger.info("Picking a PV which to be deleted from the platform side")
    osd_pvs = get_deviceset_pvs()
    osd_pv = random.choice(osd_pvs)
    osd_pv_name = osd_pv.name

    # get the claim name
    logger.info(f"Getting the claim name for OSD PV {osd_pv_name}")
    claim_name = osd_pv.get().get("spec").get("claimRef").get("name")

    # Get the backing volume name
    logger.info(f"Getting the backing volume name for PV {osd_pv_name}")
    backing_volume = nodes.get_data_volumes(pvs=[osd_pv])[0]

    # Get the corresponding PVC
    logger.info(f"Getting the corresponding PVC of PV {osd_pv_name}")
    osd_pvcs = get_deviceset_pvcs()
    osd_pvcs_count = len(osd_pvcs)
    osd_pvc = [
        ds for ds in osd_pvcs
        if ds.get().get("metadata").get("name") == claim_name
    ][0]

    # Get the corresponding OSD pod and ID
    logger.info(f"Getting the OSD pod using PVC {osd_pvc.name}")
    osd_pods = get_osd_pods()
    osd_pods_count = len(osd_pods)
    osd_pod = [
        osd_pod for osd_pod in osd_pods
        if osd_pod.get().get("metadata").get("labels").get(
            constants.CEPH_ROOK_IO_PVC_LABEL
        ) == claim_name
    ][0]
    logger.info(f"OSD_POD {osd_pod.name}")
    osd_id = osd_pod.get().get("metadata").get("labels").get("ceph-osd-id")

    # Get the node that has the OSD pod running on
    logger.info(
        f"Getting the node that has the OSD pod {osd_pod.name} running on"
    )
    osd_node = get_pod_node(osd_pod)
    osd_prepare_pods = get_osd_prepare_pods()
    osd_prepare_pod = [
        pod for pod in osd_prepare_pods
        if pod.get().get("metadata").get("labels").get(
            constants.CEPH_ROOK_IO_PVC_LABEL
        ) == claim_name
    ][0]
    osd_prepare_job_name = (
        osd_prepare_pod.get().get("metadata").get("labels").get("job-name")
    )
    osd_prepare_job = get_job_obj(osd_prepare_job_name)

    # Get the corresponding OSD deployment
    logger.info(f"Getting the OSD deployment for OSD PVC {claim_name}")
    osd_deployment = [
        deployment for deployment in get_osd_deployments()
        if deployment.get().get("metadata").get("labels").get(
            constants.CEPH_ROOK_IO_PVC_LABEL
        ) == claim_name
    ][0]
    osd_deployment_name = osd_deployment.name

    # Delete the volume from the platform side
    logger.info(f"Deleting {backing_volume} from the platform side")
    nodes.detach_volume(backing_volume, osd_node)

    # Scale down OSD deployment
    logger.info(f"Scaling down OSD deployment {osd_deployment_name} to 0")
    ocp.OCP().exec_oc_cmd(
        f"scale --replicas=0 deployment/{osd_deployment_name}"
    )

    # Force delete OSD pod if necessary
    osd_pod_name = osd_pod.name
    logger.info(f"Waiting for OSD pod {osd_pod.name} to get deleted")
    try:
        osd_pod.ocp.wait_for_delete(resource_name=osd_pod_name)
    except TimeoutError:
        osd_pod.delete(force=True)
        osd_pod.ocp.wait_for_delete(resource_name=osd_pod_name)

    # Run ocs-osd-removal job
    # Compare the version as a (major, minor) integer tuple: parsing it as a
    # float would turn "4.10" into 4.1 and wrongly sort it below 4.6
    ocp_version = tuple(
        int(part) for part in str(get_ocp_version()).split(".")[:2]
    )
    # From 4.6 on, the removal job takes a list parameter and cleans up the
    # OSD prepare job, PVC and deployment by itself
    removal_job_cleans_up = ocp_version >= (4, 6)
    if removal_job_cleans_up:
        cmd = f"process ocs-osd-removal -p FAILED_OSD_IDS={osd_id} -o yaml"
    else:
        cmd = f"process ocs-osd-removal -p FAILED_OSD_ID={osd_id} -o yaml"

    logger.info(f"Executing OSD removal job on OSD-{osd_id}")
    ocp_obj = ocp.OCP(namespace=config.ENV_DATA["cluster_namespace"])
    osd_removal_job_yaml = ocp_obj.exec_oc_cmd(cmd)
    osd_removal_job = OCS(**osd_removal_job_yaml)
    osd_removal_job.create(do_reload=False)

    # Get ocs-osd-removal pod name
    logger.info("Getting the ocs-osd-removal pod name")
    osd_removal_pod_name = get_osd_removal_pod_name(osd_id)
    osd_removal_pod_obj = get_pod_obj(
        osd_removal_pod_name, namespace="openshift-storage"
    )
    osd_removal_pod_obj.ocp.wait_for_resource(
        condition=constants.STATUS_COMPLETED,
        resource_name=osd_removal_pod_name
    )

    # Verify OSD removal from the ocs-osd-removal pod logs
    logger.info(
        f"Verifying removal of OSD from {osd_removal_pod_name} pod logs"
    )
    logs = get_pod_logs(osd_removal_pod_name)
    pattern = f"purged osd.{osd_id}"
    assert re.search(pattern, logs), (
        f"'{pattern}' was not found in the logs of pod {osd_removal_pod_name}"
    )

    osd_pvc_name = osd_pvc.name

    if not removal_job_cleans_up:
        # Delete the OSD prepare job
        logger.info(f"Deleting OSD prepare job {osd_prepare_job_name}")
        osd_prepare_job.delete()
        osd_prepare_job.ocp.wait_for_delete(
            resource_name=osd_prepare_job_name, timeout=120
        )

        # Delete the OSD PVC
        logger.info(f"Deleting OSD PVC {osd_pvc_name}")
        osd_pvc.delete()
        osd_pvc.ocp.wait_for_delete(resource_name=osd_pvc_name)

        # Delete the OSD deployment
        logger.info(f"Deleting OSD deployment {osd_deployment_name}")
        osd_deployment.delete()
        osd_deployment.ocp.wait_for_delete(
            resource_name=osd_deployment_name, timeout=120
        )
    else:
        # If ocp version is '4.6' and above the osd removal job should
        # delete the OSD prepare job, OSD PVC, OSD deployment
        logger.info(
            f"Verifying deletion of OSD prepare job {osd_prepare_job_name}"
        )
        osd_prepare_job.ocp.wait_for_delete(
            resource_name=osd_prepare_job_name, timeout=30
        )
        logger.info(f"Verifying deletion of OSD PVC {osd_pvc_name}")
        osd_pvc.ocp.wait_for_delete(resource_name=osd_pvc_name, timeout=30)
        logger.info(
            f"Verifying deletion of OSD deployment {osd_deployment_name}"
        )
        osd_deployment.ocp.wait_for_delete(
            resource_name=osd_deployment_name, timeout=30
        )

    # Delete PV
    logger.info(f"Verifying deletion of PV {osd_pv_name}")
    try:
        osd_pv.ocp.wait_for_delete(resource_name=osd_pv_name)
    except TimeoutError:
        # The PV was not removed on its own; delete it explicitly
        osd_pv.delete()
        osd_pv.ocp.wait_for_delete(resource_name=osd_pv_name)

    if not removal_job_cleans_up:
        # Delete the rook ceph operator pod to trigger reconciliation
        rook_operator_pod = get_operator_pods()[0]
        logger.info(
            f"deleting Rook Ceph operator pod {rook_operator_pod.name}"
        )
        rook_operator_pod.delete()

    # Delete the OSD removal job
    logger.info(f"Deleting OSD removal job ocs-osd-removal-{osd_id}")
    osd_removal_job = get_job_obj(f"ocs-osd-removal-{osd_id}")
    osd_removal_job.delete()
    osd_removal_job.ocp.wait_for_delete(
        resource_name=f"ocs-osd-removal-{osd_id}"
    )

    timeout = 600

    # Wait for OSD PVC to get created and reach Bound state
    logger.info(
        "Waiting for a new OSD PVC to get created and reach Bound state"
    )
    assert osd_pvc.ocp.wait_for_resource(
        timeout=timeout,
        condition=constants.STATUS_BOUND,
        selector=constants.OSD_PVC_GENERIC_LABEL,
        resource_count=osd_pvcs_count,
    ), (
        f"Cluster recovery failed after {timeout} seconds. "
        f"Expected to have {osd_pvcs_count} OSD PVCs in status Bound. Current OSD PVCs status: "
        f"{[pvc.ocp.get_resource(pvc.get().get('metadata').get('name'), 'STATUS') for pvc in get_deviceset_pvcs()]}"
    )

    # Wait for OSD pod to get created and reach Running state
    logger.info(
        "Waiting for a new OSD pod to get created and reach Running state"
    )
    assert osd_pod.ocp.wait_for_resource(
        timeout=timeout,
        condition=constants.STATUS_RUNNING,
        selector=constants.OSD_APP_LABEL,
        resource_count=osd_pods_count,
    ), (
        f"Cluster recovery failed after {timeout} seconds. "
        f"Expected to have {osd_pods_count} OSD pods in status Running. Current OSD pods status: "
        f"{[osd_pod.ocp.get_resource(pod.get().get('metadata').get('name'), 'STATUS') for pod in get_osd_pods()]}"
    )

    # We need to silence the old osd crash warning due to BZ https://bugzilla.redhat.com/show_bug.cgi?id=1896810
    # This is a workaround - issue for tracking: https://github.com/red-hat-storage/ocs-ci/issues/3438
    if removal_job_cleans_up:
        silence_osd_crash = cluster.wait_for_silence_ceph_osd_crash_warning(
            osd_pod_name
        )
        if not silence_osd_crash:
            logger.info("Didn't find ceph osd crash warning")

    # Validate cluster is still functional
    self.sanity_helpers.health_check(tries=100)
    self.sanity_helpers.create_resources(pvc_factory, pod_factory)
class Cosbench(object): """ Cosbench S3 benchmark tool """ def __init__(self): """ Initializer function """ self.ns_obj = OCP(kind="namespace") self.namespace = constants.COSBENCH_PROJECT self.configmap_obj = OCP(namespace=self.namespace, kind=constants.CONFIGMAP) self.ocp_obj = OCP(namespace=self.namespace) self.cosbench_config = None self.cosbench_pod = None self.cosbench_dir = mkdtemp(prefix="cosbench-tool-") self.xml_file = "" self.workload_id = "" self.init_container = 1 self.range_selector = "r" self.init_object = 1 mcg_obj = MCG() self.access_key_id = mcg_obj.access_key_id self.access_key = mcg_obj.access_key self.endpoint = ( "http://" + mcg_obj.s3_internal_endpoint.split("/")[2].split(":")[0]) def setup_cosbench(self): """ Setups Cosbench namespace, configmap and pod """ # Create cosbench project self.ns_obj.new_project(project_name=self.namespace) # Create configmap config_data = templating.load_yaml(file=constants.COSBENCH_CONFIGMAP) cosbench_configmap_name = create_unique_resource_name( constants.COSBENCH, "configmap") config_data["metadata"]["name"] = cosbench_configmap_name config_data["metadata"]["namespace"] = self.namespace self.cosbench_config = OCS(**config_data) logger.info( f"Creating Cosbench configmap: {self.cosbench_config.name}") self.cosbench_config.create() self.configmap_obj.wait_for_resource( resource_name=self.cosbench_config.name, column="DATA", condition="4") # Create Cosbench pod cosbench_pod_data = templating.load_yaml(file=constants.COSBENCH_POD) cosbench_pod_data["spec"]["containers"][0]["envFrom"][0][ "configMapRef"]["name"] = self.cosbench_config.name cosbench_pod_name = create_unique_resource_name( constants.COSBENCH, "pod") cosbench_pod_data["metadata"]["name"] = cosbench_pod_name cosbench_pod_data["metadata"]["namespace"] = self.namespace self.cosbench_pod = OCS(**cosbench_pod_data) logger.info(f"Creating Cosbench pod: {self.cosbench_pod.name}") self.cosbench_pod.create() 
helpers.wait_for_resource_state(resource=self.cosbench_pod, state=constants.STATUS_RUNNING, timeout=300) def _apply_mcg_auth(self, xml_root): """ Applies MCG credentials Args: xml_root (Element): Root element of workload xml """ xml_root[0].set( "config", f"accesskey={self.access_key_id};secretkey={self.access_key};" f"endpoint={self.endpoint};path_style_access=true", ) def run_init_workload( self, prefix, containers, objects, start_container=None, start_object=None, size=64, size_unit="KB", sleep=15, timeout=300, validate=True, ): """ Creates specific containers and objects in bulk Args: prefix (str): Prefix of bucket name. containers (int): Number of containers/buckets to be created. objects (int): Number of objects to be created on each bucket. start_container (int): Start of containers. Default: 1. start_object (int): Start of objects. Default: 1. size (int): Size of each objects. size_unit (str): Object size unit (B/KB/MB/GB) sleep (int): Sleep in seconds. timeout (int): Timeout in seconds. validate (bool): Validates whether init and prepare is completed. 
Returns: Tuple[str, str]: Workload xml and its name """ init_template = """ <workload name="Fill" description="Init and prepare operation"> <storage type="s3" config="" /> <workflow> <workstage name="init-containers"> <work type="init" workers="1" config="" /> </workstage> <workstage name="prepare-objects"> <work type="prepare" workers="16" config="" /> </workstage> </workflow> </workload> """ xml_root, xml_tree = self._create_element_tree(template=init_template) workload_name = xml_root.get("name") self._apply_mcg_auth(xml_root) self.init_container = (start_container if start_container else self.init_container) self.init_object = start_object if start_object else self.init_object init_container_config = self.generate_container_stage_config( self.range_selector, self.init_container, containers, ) init_config = self.generate_stage_config( self.range_selector, self.init_container, containers, self.init_object, objects, ) for stage in xml_root.iter("work"): if stage.get("type") == "init": stage.set("config", f"cprefix={prefix};{init_container_config}") elif stage.get("type") == "prepare": stage.set( "config", f"cprefix={prefix};{init_config};sizes=c({str(size)}){size_unit}", ) self._create_tmp_xml(xml_tree=xml_tree, xml_file_prefix=workload_name) self.submit_workload(workload_path=self.xml_file) self.wait_for_workload(workload_id=self.workload_id, sleep=sleep, timeout=timeout) if validate: self.validate_workload(workload_id=self.workload_id, workload_name=workload_name) else: return self.workload_id, workload_name def run_cleanup_workload( self, prefix, containers, objects, start_container=None, start_object=None, sleep=15, timeout=300, validate=True, ): """ Deletes specific objects and containers in bulk. Args: prefix (str): Prefix of bucket name. containers (int): Number of containers/buckets to be created. objects (int): Number of objects to be created on each bucket. start_container (int): Start of containers. Default: 1. start_object (int): Start of objects. 
Default: 1. sleep (int): Sleep in seconds. timeout (int): Timeout in seconds. validate (bool): Validates whether cleanup and dispose is completed. Returns: Tuple[str, str]: Workload xml and its name """ cleanup_template = """ <workload name="Cleanup" description="Cleanup and Dispose"> <storage type="s3" config="" /> <workflow> <workstage name="cleanup-objects"> <work type="cleanup" workers="4" config="" /> </workstage> <workstage name="dispose-containers"> <work type="dispose" workers="1" config="" /> </workstage> </workflow> </workload> """ xml_root, xml_tree = self._create_element_tree( template=cleanup_template) workload_name = xml_root.get("name") self._apply_mcg_auth(xml_root) self.init_container = (start_container if start_container else self.init_container) self.init_object = start_object if start_object else self.init_object cleanuo_config = self.generate_stage_config( self.range_selector, self.init_container, containers, self.init_object, objects, ) for stage in xml_root.iter("work"): if stage.get("type") == "cleanup": stage.set( "config", f"cprefix={prefix};{cleanuo_config}", ) elif stage.get("type") == "dispose": stage.set("config", f"cprefix={prefix};{cleanuo_config}") self._create_tmp_xml(xml_tree=xml_tree, xml_file_prefix=workload_name) self.submit_workload(workload_path=self.xml_file) self.wait_for_workload(workload_id=self.workload_id, sleep=sleep, timeout=timeout) if validate: self.validate_workload(workload_id=self.workload_id, workload_name=workload_name) else: return self.workload_id, workload_name def run_main_workload( self, operation_type, prefix, containers, objects, workers=4, selector="s", start_container=None, start_object=None, size=64, size_unit="KB", sleep=15, timeout=300, extend_objects=None, validate=True, result=True, ): """ Creates and runs main Cosbench workload. Args: operation_type (dict): Cosbench operation and its ratio. Operation (str): Supported ops are read, write, list and delete. Ratio (int): Percentage of each operation. 
Should add up to 100. workers (int): Number of users to perform operations. containers (int): Number of containers/buckets to be created. objects (int): Number of objects to be created on each bucket. selector (str): The way object is accessed/selected. u=uniform, r=range, s=sequential. prefix (str): Prefix of bucket name. start_container (int): Start of containers. Default: 1. start_object (int): Start of objects. Default: 1. size (int): Size of each objects. size_unit (str): Object size unit (B/KB/MB/GB) sleep (int): Sleep in seconds timeout (int): Timeout in seconds validate (bool): Validates whether each stage is completed extend_objects (int): Extends the total number of objects to prevent overlap. Use only for Write and Delete operations. result (bool): Get performance results when running workload is completed. Returns: Tuple[str, str]: Workload xml and its name """ main_template = """ <workload name="workload_name" description="Main workload"> <storage type="s3" config="" /> <workflow> <workstage name="Main"> <work name="work_name" workers="4" division="object" runtime="60"> </work> </workstage> </workflow> </workload> """ xml_root, xml_tree = self._create_element_tree(template=main_template) workload_name = xml_root.get("name") self._apply_mcg_auth(xml_root) start_container = start_container if start_container else self.init_container start_object = start_object if start_object else self.init_object for stage in xml_root.iter("work"): stage.set("workers", f"{workers}") for operation, ratio in operation_type.items(): if operation == "write" or "delete": if extend_objects: start_object = objects + 1 stage_config = self.generate_stage_config( selector, start_container, containers, start_object, extend_objects, ) attributes = { "type": f"{operation}", "ratio": f"{ratio}", "config": f"cprefix={prefix};{stage_config};sizes=c({str(size)}){size_unit}", } ElementTree.SubElement(stage, "operation", attributes) else: stage_config = self.generate_stage_config( 
selector, start_container, containers, start_object, objects, ) attributes = { "type": f"{operation}", "ratio": f"{ratio}", "config": f"cprefix={prefix};{stage_config};sizes=c({str(size)}){size_unit}", } ElementTree.SubElement(stage, "operation", attributes) else: stage_config = self.generate_stage_config( selector, start_container, containers, start_object, objects, ) attributes = { "type": f"{operation}", "ratio": f"{ratio}", "config": f"cprefix={prefix};{stage_config}", } ElementTree.SubElement(stage, "operation", attributes) self._create_tmp_xml(xml_tree=xml_tree, xml_file_prefix=workload_name) self.submit_workload(workload_path=self.xml_file) self.wait_for_workload(workload_id=self.workload_id, sleep=sleep, timeout=timeout) if validate: self.validate_workload(workload_id=self.workload_id, workload_name=workload_name) else: return self.workload_id, workload_name if result: throughput, bandwidth = self.get_performance_result( workload_id=self.workload_id, workload_name=workload_name, size=size, ) return throughput, bandwidth else: return self.workload_id, workload_name @staticmethod def generate_stage_config(selector, start_container, end_container, start_objects, end_object): """ Generates config which is used in stage creation Args: selector (str): The way object is accessed/selected. u=uniform, r=range, s=sequential. start_container (int): Start of containers end_container (int): End of containers start_objects (int): Start of objects end_object (int): End of objects Returns: (str): Container and object configuration """ xml_config = ( f"containers={selector}({str(start_container)},{str(end_container)});" f"objects={selector}({str(start_objects)},{str(end_object)})") return xml_config @staticmethod def generate_container_stage_config(selector, start_container, end_container): """ Generates container config which creates buckets in bulk Args: selector (str): The way object is accessed/selected. u=uniform, r=range, s=sequential. 
start_container (int): Start of containers end_container (int): End of containers Returns: (str): Container and object configuration """ container_config = ( f"containers={selector}({str(start_container)},{str(end_container)});" ) return container_config def _create_tmp_xml(self, xml_tree, xml_file_prefix): """ Creates a xml file and writes the workload Args: xml_file_prefix (str): Prefix of xml file xml_tree (Element): Element tree """ self.xml_file = NamedTemporaryFile( dir=self.cosbench_dir, prefix=f"{xml_file_prefix}", suffix=".xml", delete=False, ).name logger.info(self.xml_file) xml_tree.write(self.xml_file) @staticmethod def _create_element_tree(template): """ Creates element tree and root element of xml Args: template (str): Template of Cosbench workload Returns: Tuple[Element, ElementTree]: Root element and element tree of xml """ xml_root = ElementTree.fromstring(text=template) xml_tree = ElementTree.ElementTree(element=xml_root) return xml_root, xml_tree def _copy_workload(self, workload_path): """ Copies workload xml to Cosbench pod Args: workload_path (str): Absolute path of xml to copy """ self.ocp_obj.exec_oc_cmd( command=f"cp {workload_path} {self.cosbench_pod.name}:/cos", out_yaml_format=False, timeout=180, ) def submit_workload(self, workload_path): """ Submits Cosbench xml to initiate workload Args: workload_path (str): Absolute path of xml to submit """ self._copy_workload(workload_path=workload_path) workload = os.path.split(workload_path)[1] self._cosbench_cli(workload) @retry(AttributeError, tries=15, delay=5, backoff=1) def _cosbench_cli(self, workload): """ Runs Cosbench cli to initiate workload Args: workload (str): Workload file """ submit_key = "Accepted with ID" cobench_pod_obj = get_pod_obj(name=self.cosbench_pod.name, namespace=self.namespace) submit = cobench_pod_obj.exec_cmd_on_pod( command=f"/cos/cli.sh submit /cos/{workload}", out_yaml_format=True, timeout=180, ) if submit_key in submit.keys(): self.workload_id = 
submit[submit_key] else: assert f"Failed to submit the workload, ID not found. stdout: {submit}" def wait_for_workload(self, workload_id, sleep=1, timeout=60): """ Waits for the cosbench workload to complete Args: workload_id (str): ID of cosbench workload sleep: sleep in seconds timeout: timeout in seconds to check if mirroring Returns: bool: Whether cosbench workload processed successfully """ logger.info(f"Waiting for workload {workload_id} to be processed") pattern = f"sucessfully processed workload {workload_id}" try: for ret in TimeoutSampler( timeout=timeout, sleep=sleep, func=get_pod_logs, pod_name=self.cosbench_pod.name, namespace=self.namespace, ): if re.search(pattern=pattern, string=ret): break logger.info( f"Verified: Workload {workload_id} processed successfully") return True except TimeoutExpiredError: logger.error( f"Workload {workload_id} did not complete. Dumping cosbench pod log" ) # Log cosbench pod for debugging purpose cosbench_log = get_pod_logs(pod_name=self.cosbench_pod.name, namespace=self.namespace) logger.debug(cosbench_log) return False def validate_workload(self, workload_id, workload_name): """ Validates each stage of cosbench workload Args: workload_id (str): ID of cosbench workload workload_name (str): Name of the workload Raises: UnexpectedBehaviour: When workload csv is incorrect/malformed. """ workload_csv = self.get_result_csv(workload_id=workload_id, workload_name=workload_name) with open(workload_csv, "r") as file: reader = csv.reader(file) header = next(reader) if header is not None: # Iterate over each row after the header logger.info( f"Verifying whether each stage of workload {workload_id} completed" ) for row in reader: if row[16] == "completed": logger.info(f"Stage {row[0]} completed successfully") else: assert ( f"Failed: Stage {row[0]} did not complete. Status {row[16]}" ) else: raise UnexpectedBehaviour( f"Workload csv is incorrect/malformed. 
Dumping csv {reader}" ) def get_result_csv(self, workload_id, workload_name): """ Gets cosbench workload result csv Args: workload_id (str): ID of cosbench workload workload_name (str): Name of the workload Returns: str: Absolute path of the result csv """ archive_file = f"{workload_id}-{workload_name}" cmd = ( f"cp {self.cosbench_pod.name}:/cos/archive/{archive_file}/{archive_file}.csv " f"{self.cosbench_dir}/{archive_file}.csv ") self.ocp_obj.exec_oc_cmd( command=cmd, out_yaml_format=False, timeout=300, ) return f"{self.cosbench_dir}/{archive_file}.csv" def cleanup(self): """ Cosbench cleanup """ switch_to_project(constants.COSBENCH_PROJECT) logger.info("Deleting Cosbench pod, configmap and namespace") self.cosbench_pod.delete() self.cosbench_config.delete() self.ns_obj.delete_project(self.namespace) self.ns_obj.wait_for_delete(resource_name=self.namespace, timeout=90) def get_performance_result(self, workload_name, workload_id, size): workload_file = self.get_result_csv(workload_id=workload_id, workload_name=workload_name) throughput_data = {} bandwidth_data = {} with open(workload_file, "r") as file: reader = csv.reader(file) header = next(reader) if header is not None: for row in reader: throughput_data[row[1]] = row[13] bandwidth_data[row[1]] = row[14] else: raise UnexpectedBehaviour( f"Workload csv is incorrect/malformed. 
Dumping csv {reader}" ) # Store throughput data on csv file log_path = f"{self.cosbench_dir}" with open(f"{log_path}/{workload_name}-{size}-throughput.csv", "a") as fd: csv_obj = csv.writer(fd) for k, v in throughput_data.items(): csv_obj.writerow([k, v]) logger.info( f"Throughput data present in {log_path}/{workload_name}-{size}-throughput.csv" ) # Store bandwidth data on csv file with open(f"{log_path}/{workload_name}-{size}-bandwidth.csv", "a") as fd: csv_obj = csv.writer(fd) for k, v in bandwidth_data.items(): csv_obj.writerow([k, v]) logger.info( f"Bandwidth data present in {log_path}/{workload_name}-{size}-bandwidth.csv" ) return throughput_data, bandwidth_data def cosbench_full(self): """ Run full Cosbench workload """ bucket_prefix = "bucket-" buckets = 10 objects = 1000 # Operations to perform and its ratio(%) operations = {"read": 50, "write": 50} # Deployment of cosbench self.setup_cosbench() # Create initial containers and objects self.run_init_workload(prefix=bucket_prefix, containers=buckets, objects=objects, validate=True) # Start measuring time start_time = datetime.now() # Run main workload self.run_main_workload( operation_type=operations, prefix=bucket_prefix, containers=buckets, objects=objects, validate=True, timeout=10800, ) # Calculate the total run time of Cosbench workload end_time = datetime.now() diff_time = end_time - start_time logger.info(f"Cosbench workload completed after {diff_time}") # Dispose containers and objects self.run_cleanup_workload(prefix=bucket_prefix, containers=buckets, objects=objects, validate=True)
class TestFIOBenchmark(E2ETest):
    """
    Run FIO perf test using ripsaw benchmark

    The helper methods share state through instance attributes which are set
    while a test runs: ``fio_cr`` (the benchmark CR loaded from YAML),
    ``backup_es`` / ``main_es`` (elasticsearch settings and client),
    ``full_log_path``, ``environment``, ``total_data_set``, ``filesize``,
    ``start_time`` and ``fio_cr_obj``.

    """

    def ripsaw_deploy(self, ripsaw):
        """
        Deploy the benchmark operator (formally ripsaw) CRD

        Args:
            ripsaw (obj): benchmark operator object

        """
        log.info("Deploying benchmark operator (ripsaw)")
        ripsaw.apply_crd("resources/crds/ripsaw_v1alpha1_ripsaw_crd.yaml")

    def es_info_backup(self, elasticsearch):
        """
        Saving the Original elastic-search IP and PORT - if defined in yaml

        After this call ``self.backup_es`` and ``self.main_es`` are always
        defined: when the CR has no elasticsearch section they are set to an
        empty dict and ``None`` so that later steps can test for them instead
        of failing with AttributeError.

        Args:
            elasticsearch (obj): elasticsearch object

        """
        spec = self.fio_cr["spec"]

        # for development mode use the Dev ES server
        if dev_mode and "elasticsearch" in spec:
            spec["elasticsearch"] = {
                "server": defaults.ELASTICSEARCH_DEV_IP,
                "port": defaults.ELASTICSEARCE_PORT,
                "url": f"http://{defaults.ELASTICSEARCH_DEV_IP}:{defaults.ELASTICSEARCE_PORT}",
            }

        if "elasticsearch" in spec:
            es_conf = spec["elasticsearch"]
            es_conf["url"] = f"http://{es_conf['server']}:{es_conf['port']}"
            # backup_es is the very same dict object as the CR entry
            self.backup_es = es_conf
            log.info(
                f"Creating object for the Main ES server on {self.backup_es['url']}"
            )
            self.main_es = Elasticsearch([self.backup_es["url"]],
                                         verify_certs=True)
            if not self.main_es.ping():
                log.warning("Cannot connect to Main elasticsearch server")
                self.main_es = None
        else:
            log.warning(
                "Elastic Search information does not exists in YAML file")
            spec["elasticsearch"] = {}
            # Without these defaults the tests fail later on with
            # AttributeError when they read backup_es / main_es.
            self.backup_es = spec["elasticsearch"]
            self.main_es = None

        # Use the internal define elastic-search server in the test - if exist
        if elasticsearch:
            spec["elasticsearch"] = {
                "server": elasticsearch.get_ip(),
                "port": elasticsearch.get_port(),
                "url": f"http://{elasticsearch.get_ip()}:{elasticsearch.get_port()}",
            }

    def setting_storage_usage(self):
        """
        Getting the storage capacity, calculate the usage of the storage and
        setting the workload CR file parameters.

        The data set is 40% of the ceph capacity. ``servers`` (number of FIO
        pods) is the data set divided by the per-pod file size.

        """
        wl_args = self.fio_cr["spec"]["workload"]["args"]

        ceph_cluster = CephCluster()
        ceph_capacity = ceph_cluster.get_ceph_capacity()
        log.info(f"Total storage capacity is {ceph_capacity} GiB")

        self.total_data_set = int(ceph_capacity * 0.4)
        self.filesize = int(wl_args["filesize"].replace("GiB", ""))

        # To make sure the number of App pods will not be more then 50, in case
        # of large data set, changing the size of the file each pod will work on
        if self.total_data_set > 500:
            self.filesize = int(ceph_capacity * 0.008)
            wl_args["filesize"] = f"{self.filesize}GiB"
            # make sure that the storage size is larger then the file size
            wl_args["storagesize"] = f"{int(self.filesize * 1.2)}Gi"

        wl_args["servers"] = int(self.total_data_set / self.filesize)
        log.info(f"Total Data set to work on is : {self.total_data_set} GiB")

    def get_env_info(self):
        """
        Getting the environment information and update the workload CR if
        necessary.

        """
        self.environment = get_environment_info()
        if self.environment["user"] != "":
            self.fio_cr["spec"]["test_user"] = self.environment["user"]
        self.fio_cr["spec"]["clustername"] = self.environment["clustername"]

        log.debug(f"Environment information is : {self.environment}")

    def setting_io_pattern(self, io_pattern):
        """
        Setting the test jobs according to the io pattern - random / sequential

        Any other value leaves the CR untouched.

        Args:
            io_pattern (str): the I/O pattern to run (random / sequential)

        """
        wl_args = self.fio_cr["spec"]["workload"]["args"]
        if io_pattern == "sequential":
            wl_args["jobs"] = ["write", "read"]
            wl_args["iodepth"] = 1
        elif io_pattern == "random":
            wl_args["jobs"] = ["randwrite", "randread"]

    def deploy_and_wait_for_wl_to_start(self):
        """
        Deploy the workload and wait until it start working

        Also records the (GMT) start time of the test in ``self.start_time``.

        Returns:
            obj : the FIO client pod object

        """
        log.info(f"The FIO CR file is {self.fio_cr}")
        self.fio_cr_obj = OCS(**self.fio_cr)
        self.fio_cr_obj.create()

        # Wait for fio client pod to be created
        for fio_pod in TimeoutSampler(900, 20, get_pod_name_by_pattern,
                                      "fio-client",
                                      constants.RIPSAW_NAMESPACE):
            try:
                if fio_pod[0] is not None:
                    fio_client_pod = fio_pod[0]
                    break
            except IndexError:
                # an empty sample means the pod was not created yet
                log.info("Bench pod not ready yet")

        # Getting the start time of the test
        self.start_time = time.strftime("%Y-%m-%dT%H:%M:%SGMT", time.gmtime())
        return fio_client_pod

    def wait_for_wl_to_finish(self, fio_client_pod):
        """
        Waiting until the workload is finished

        The client pod log is saved to ``<full_log_path>/test-pod.log`` and is
        scanned for the FIO failure marker; a failure is logged as an error.

        Args:
            fio_client_pod (obj): the FIO client pod object

        Returns:
            str: the end time of the workload

        """
        if dev_mode:
            timeout = 3600
            sleeptime = 30
        else:
            timeout = 18000
            sleeptime = 300

        log.info("Waiting for fio_client to complete")
        pod_obj = OCP(kind="pod")
        pod_obj.wait_for_resource(
            condition="Completed",
            resource_name=fio_client_pod,
            timeout=timeout,
            sleep=sleeptime,
        )

        # Getting the end time of the test
        end_time = time.strftime("%Y-%m-%dT%H:%M:%SGMT", time.gmtime())

        output = run_cmd(f"oc logs {fio_client_pod}")
        log_file_name = f"{self.full_log_path}/test-pod.log"
        with open(log_file_name, "w") as f:
            f.write(output)
        log.info(f"The Test log is can be found at : {log_file_name}")

        # A substring test can not raise IOError, so the failure has to be
        # reported from the test itself and not from an except clause.
        if "Fio failed to execute" in output:
            log.error("FIO failed to complete")
        else:
            log.info("FIO has completed successfully")

        return end_time

    def init_full_results(self, full_results):
        """
        Initialize the full results object which will send to the ES server

        Args:
            full_results (obj): an empty FIOResultsAnalyse object

        Returns:
            FIOResultsAnalyse (obj): the input object fill with data

        """
        for key in self.environment:
            full_results.add_key(key, self.environment[key])

        # Setting the global parameters of the test
        wl_args = self.fio_cr["spec"]["workload"]["args"]
        full_results.add_key("dataset", f"{self.total_data_set}GiB")
        full_results.add_key("file_size", wl_args["filesize"])
        full_results.add_key("servers", wl_args["servers"])
        full_results.add_key("samples", wl_args["samples"])
        full_results.add_key("operations", wl_args["jobs"])
        full_results.add_key("block_sizes", wl_args["bs"])
        full_results.add_key("io_depth", wl_args["iodepth"])
        full_results.add_key("jobs", wl_args["numjobs"])
        full_results.add_key(
            "runtime",
            {
                "read": wl_args["read_runtime"],
                "write": wl_args["write_runtime"],
            },
        )
        full_results.add_key("storageclass", wl_args["storageclass"])
        full_results.add_key("vol_size", wl_args["storagesize"])
        return full_results

    def copy_es_data(self, elasticsearch):
        """
        Copy data from Internal ES (if exists) to the main ES

        Args:
            elasticsearch (obj): elasticsearch object (if exits)

        Returns:
            bool: True if data was copy to the main ES False otherwise

        """
        if not elasticsearch:
            # nothing to copy - keep the documented bool contract
            return False

        log.info("Copy all data from Internal ES to Main ES")
        log.info("Dumping data from the Internal ES to tar ball file")
        elasticsearch.dumping_all_data(self.full_log_path)

        # NOTE(review): es_connection is the same dict object as
        # self.backup_es, so the key changes below are visible through it.
        # The value itself is not used by the upload call - confirm whether
        # this in-place conversion is still needed.
        es_connection = self.backup_es
        if "server" in es_connection:
            es_connection["host"] = es_connection.pop("server")
        es_connection.pop("url", None)

        if elasticsearch_load(self.main_es, self.full_log_path):
            # Adding this sleep between the copy and the analyzing of the results
            # since sometimes the results of the read (just after write) are empty
            time.sleep(10)
            return True

        log.warning("Cannot upload data into the Main ES server")
        return False

    def cleanup(self):
        """
        Delete the FIO benchmark CR and any PVC / PV which the test left in
        the benchmark namespace.

        """
        log.info("Deleting FIO benchmark")
        self.fio_cr_obj.delete()
        time.sleep(180)

        # Getting all PVCs created in the test (if left).
        NL = "\\n"  # NewLine character
        command = [
            "oc",
            "get",
            "pvc",
            "-n",
            constants.RIPSAW_NAMESPACE,
            "-o",
            "template",
            "--template",
            "'{{range .items}}{{.metadata.name}}{{\"" + NL + "\"}}{{end}}'",
        ]
        pvcs_list = run_command(command, out_format="list")
        log.info(f"list of all PVCs :{pvcs_list}")
        for pvc in pvcs_list:
            pvc = pvc.replace("'", "")
            if not pvc.strip():
                # the quotes around the template can leave an empty entry
                continue
            run_command(f"oc -n {constants.RIPSAW_NAMESPACE} delete pvc {pvc}")

        # Getting all PVs created in the test (if left).
        command = [
            "oc",
            "get",
            "pv",
            "-o",
            "template",
            "--template",
            "'{{range .items}}{{.metadata.name}} {{.spec.claimRef.namespace}}{{\""
            + NL + "\"}}{{end}}'",
        ]
        pvs_list = run_command(command, out_format="list")
        log.info(f"list of all PVs :{pvs_list}")
        for line in pvs_list:
            fields = line.split(" ")
            if len(fields) != 2:
                # not a "<pv> <namespace>" line (e.g. a PV without claimRef)
                log.debug(f"Skipping PV line : {line}")
                continue
            pv, ns = fields
            pv = pv.replace("'", "")
            if ns == constants.RIPSAW_NAMESPACE:
                log.info(f"Going to delete {pv}")
                run_command(f"oc delete pv {pv}")

    @pytest.mark.parametrize(
        argnames=["interface", "io_pattern"],
        argvalues=[
            pytest.param(
                *[constants.CEPHBLOCKPOOL, "sequential"],
                marks=pytest.mark.polarion_id("OCS-844"),
            ),
            pytest.param(
                *[constants.CEPHFILESYSTEM, "sequential"],
                marks=pytest.mark.polarion_id("OCS-845"),
            ),
            pytest.param(
                *[constants.CEPHBLOCKPOOL, "random"],
                marks=pytest.mark.polarion_id("OCS-846"),
            ),
            pytest.param(
                *[constants.CEPHFILESYSTEM, "random"],
                marks=pytest.mark.polarion_id("OCS-847"),
            ),
        ],
    )
    def test_fio_workload_simple(self, ripsaw, es, interface, io_pattern):
        """
        This is a basic fio perf test - non-compressed volumes

        Args:
            interface (str): the storage interface to test
            io_pattern (str): the I/O pattern to do - random / sequential

        """
        self.full_log_path = get_full_test_logs_path(cname=self)
        self.full_log_path += f"-{interface}-{io_pattern}"
        log.info(f"Logs file path name is : {self.full_log_path}")

        self.ripsaw_deploy(ripsaw)

        if interface == "CephBlockPool":
            sc = constants.CEPHBLOCKPOOL_SC
        else:
            sc = constants.CEPHFILESYSTEM_SC

        # Create fio benchmark
        log.info("Create resource file for fio workload")
        self.fio_cr = templating.load_yaml(constants.FIO_CR_YAML)

        # Saving the Original elastic-search IP and PORT - if defined in yaml
        self.es_info_backup(es)

        # Setting the data set to 40% of the total storage capacity
        self.setting_storage_usage()

        self.get_env_info()

        self.fio_cr["spec"]["workload"]["args"]["storageclass"] = sc
        self.setting_io_pattern(io_pattern)
        fio_client_pod = self.deploy_and_wait_for_wl_to_start()

        # Getting the UUID from inside the benchmark pod
        uuid = ripsaw.get_uuid(fio_client_pod)

        # Setting back the original elastic-search information
        self.fio_cr["spec"]["elasticsearch"] = self.backup_es

        # Initialize the results doc file.
        full_results = self.init_full_results(
            FIOResultsAnalyse(uuid, self.fio_cr, self.full_log_path,
                              self.main_es))

        # Setting the global parameters of the test
        full_results.add_key("io_pattern", io_pattern)

        end_time = self.wait_for_wl_to_finish(fio_client_pod)
        full_results.add_key("test_time", {
            "start": self.start_time,
            "end": end_time
        })

        # Clean up fio benchmark
        self.cleanup()

        log.debug(f"Full results is : {full_results.results}")

        self.copy_es_data(es)
        full_results.analyze_results()  # Analyze the results

        # Writing the analyzed test results to the Elastic-Search server
        if self.main_es is not None:
            full_results.es_write()
        full_results.codespeed_push()  # Push results to codespeed

        # Creating full link to the results on the ES server
        log.info(
            f"The Result can be found at ; {full_results.results_link()}")

    @skipif_ocs_version("<4.6")
    @pytest.mark.parametrize(
        argnames=["io_pattern", "bs", "cmp_ratio"],
        argvalues=[
            pytest.param(*["random", "1024KiB", 60]),
            pytest.param(*["random", "64KiB", 60]),
            pytest.param(*["random", "16KiB", 60]),
            pytest.param(*["sequential", "1024KiB", 60]),
            pytest.param(*["sequential", "64KiB", 60]),
            pytest.param(*["sequential", "16KiB", 60]),
        ],
    )
    def test_fio_compressed_workload(self, ripsaw, es, storageclass_factory,
                                     io_pattern, bs, cmp_ratio):
        """
        This is a basic fio perf test which run on compression enabled volume

        Args:
            io_pattern (str): the I/O pattern to do - random / sequential
            bs (str): block size to use in the test
            cmp_ratio (int): the expected compression ratio

        """
        self.full_log_path = get_full_test_logs_path(cname=self)
        self.full_log_path += f"-{io_pattern}-{bs}-{cmp_ratio}"
        log.info(f"Logs file path name is : {self.full_log_path}")

        self.ripsaw_deploy(ripsaw)

        log.info("Creating compressed pool & SC")
        sc_obj = storageclass_factory(
            interface=constants.CEPHBLOCKPOOL,
            new_rbd_pool=True,
            replica=3,
            compression="aggressive",
        )

        sc = sc_obj.name
        pool_name = run_cmd(
            f"oc get sc {sc} -o jsonpath={{'.parameters.pool'}}")

        # Create fio benchmark
        log.info("Create resource file for fio workload")
        self.fio_cr = templating.load_yaml(
            "ocs_ci/templates/workloads/fio/benchmark_fio_cmp.yaml")
        self.fio_cr["spec"]["workload"]["args"]["bs"] = [bs]
        self.fio_cr["spec"]["workload"]["args"]["prefill_bs"] = bs
        self.fio_cr["spec"]["workload"]["args"]["cmp_ratio"] = cmp_ratio

        # Saving the Original elastic-search IP and PORT - if defined in yaml
        self.es_info_backup(es)

        # Setting the data set to 40% of the total storage capacity
        self.setting_storage_usage()

        self.get_env_info()

        self.fio_cr["spec"]["workload"]["args"]["storageclass"] = sc
        self.setting_io_pattern(io_pattern)
        fio_client_pod = self.deploy_and_wait_for_wl_to_start()

        # Getting the UUID from inside the benchmark pod
        uuid = ripsaw.get_uuid(fio_client_pod)

        # Setting back the original elastic-search information
        self.fio_cr["spec"]["elasticsearch"] = self.backup_es

        # Initialize the results doc file.
        full_results = self.init_full_results(
            FIOResultsAnalyse(uuid, self.fio_cr, self.full_log_path,
                              self.main_es))

        # Setting the global parameters of the test
        full_results.add_key("io_pattern", io_pattern)

        end_time = self.wait_for_wl_to_finish(fio_client_pod)
        full_results.add_key("test_time", {
            "start": self.start_time,
            "end": end_time
        })

        # Copy the results to the main ES before the benchmark is removed
        self.copy_es_data(es)

        log.info("verifying compression ratio")
        ratio = calculate_compression_ratio(pool_name)

        full_results.add_key("cmp_ratio", {
            "expected": cmp_ratio,
            "actual": ratio
        })
        full_results.analyze_results()  # Analyze the results

        # TODO: change the info message to Warning/Error after
        # prefill at ripsaw will be fixed Ripsaw PR - #505
        if (cmp_ratio + 5) < ratio or ratio < (cmp_ratio - 5):
            log.info(f"The compression ratio is {ratio}% "
                     f"while the expected ratio is {cmp_ratio}%")
        else:
            log.info(f"The compression ratio is {ratio}%")

        # Writing the analyzed test results to the Elastic-Search server
        if self.main_es is not None:
            full_results.es_write()

        # Creating full link to the results on the ES server
        log.info(
            f"The Result can be found at : {full_results.results_link()}")

        # Clean up fio benchmark and the compressed storage class
        self.cleanup()
        sc_obj.delete()
        sc_obj.ocp.wait_for_delete(resource_name=sc, timeout=300, sleep=5)

        log.debug(f"Full results is : {full_results.results}")
class AMQ(object):
    """
    Workload operation using AMQ

    The strimzi-kafka-operator repository is cloned into a temporary
    directory when the object is created; all the ``setup_*`` / ``create_*``
    methods load their YAML files from that clone.

    """

    def __init__(self, **kwargs):
        """
        Initializer function

        Args:
            kwargs (dict):
                Following kwargs are valid
                repo: AMQ repo where all necessary yaml file are there - a github link
                branch: branch to use from the repo

        """
        self.args = kwargs
        self.repo = self.args.get('repo', constants.KAFKA_OPERATOR)
        self.branch = self.args.get('branch', 'master')
        self.ocp = OCP()
        self.ns_obj = OCP(kind='namespace')
        self.pod_obj = OCP(kind='pod')
        self.kafka_obj = OCP(kind='Kafka')
        self.kafka_connect_obj = OCP(kind="KafkaConnect")
        self.kafka_bridge_obj = OCP(kind="KafkaBridge")
        self.kafka_topic_obj = OCP(kind="KafkaTopic")
        self.kafka_user_obj = OCP(kind="KafkaUser")
        # flags used by cleanup() to know what has to be deleted
        self.amq_is_setup = False
        self.messaging = False
        self._clone_amq()

    def _clone_amq(self):
        """
        clone the amq repo

        Sets ``self.dir`` (the temporary clone directory) and the paths,
        relative to it, of the YAML files used by the other methods.

        """
        self.dir = tempfile.mkdtemp(prefix='amq_')
        try:
            log.info(f'cloning amq in {self.dir}')
            git_clone_cmd = f'git clone -b {self.branch} {self.repo} '
            run(git_clone_cmd, shell=True, cwd=self.dir, check=True)
            self.amq_dir = "strimzi-kafka-operator/install/cluster-operator/"
            self.amq_kafka_pers_yaml = "strimzi-kafka-operator/examples/kafka/kafka-persistent.yaml"
            self.amq_kafka_connect_yaml = "strimzi-kafka-operator/examples/connect/kafka-connect.yaml"
            self.amq_kafka_bridge_yaml = "strimzi-kafka-operator/examples/bridge/kafka-bridge.yaml"
            self.kafka_topic_yaml = "strimzi-kafka-operator/examples/topic/kafka-topic.yaml"
            self.kafka_user_yaml = "strimzi-kafka-operator/examples/user/kafka-user.yaml"
            self.hello_world_producer_yaml = constants.HELLO_WORLD_PRODUCER_YAML
            self.hello_world_consumer_yaml = constants.HELLO_WORLD_CONSUMER_YAML

        except (CommandFailed, CalledProcessError):
            log.error('Error during cloning of amq repository')
            raise

    def create_namespace(self, namespace):
        """
        create namespace for amq

        Args:
            namespace (str): Namespace for amq pods

        """
        self.ocp.new_project(namespace)

    def _set_operator_namespace(self, namespace):
        """
        Replace the 'myproject' namespace of the cluster-operator install
        files with the requested namespace (in place, inside the clone).

        Args:
            namespace (str): Namespace for AMQ pods

        """
        if namespace == "myproject":
            # the install files already target this namespace
            return

        operator_dir = os.path.join(self.dir, self.amq_dir)
        for file_name in sorted(os.listdir(operator_dir)):
            file_path = os.path.join(operator_dir, file_name)
            if not os.path.isfile(file_path):
                continue
            with open(file_path, "r") as fd:
                content = fd.read()
            if "myproject" in content:
                with open(file_path, "w") as fd:
                    fd.write(content.replace("myproject", namespace))

    def setup_amq_cluster_operator(self, namespace=constants.AMQ_NAMESPACE):
        """
        Function to setup amq-cluster_operator,
        the file is pulling from github
        it will make sure cluster-operator pod is running

        Args:
            namespace (str): Namespace for AMQ pods

        Raises:
            ResourceWrongStatusException: when the cluster-operator pod does
                not get to running state

        """
        # Namespace for amq
        try:
            self.create_namespace(namespace)
        except CommandFailed as ef:
            # an already existing project is fine, anything else is an error
            if f'project.project.openshift.io "{namespace}" already exists' not in str(
                    ef):
                raise ef

        # Create strimzi-cluster-operator pod
        # The previous shell loop ran sed without '-i' and with the pattern
        # and the replacement swapped, so the files were never updated.
        self._set_operator_namespace(namespace)
        run(f'oc apply -f {self.amq_dir} -n {namespace}',
            shell=True,
            check=True,
            cwd=self.dir)
        time.sleep(10)

        # Check strimzi-cluster-operator pod created
        if self.is_amq_pod_running(pod_pattern="cluster-operator",
                                   expected_pods=1,
                                   namespace=namespace):
            log.info("strimzi-cluster-operator pod is in running state")
        else:
            raise ResourceWrongStatusException(
                "strimzi-cluster-operator pod is not getting to running state")

    def is_amq_pod_running(self,
                           pod_pattern,
                           expected_pods,
                           namespace=constants.AMQ_NAMESPACE):
        """
        The function checks if provided pod_pattern finds a pod and if the status is running or not

        Args:
            pod_pattern (str): the pattern for pod
            expected_pods (int): Number of pods
            namespace (str): Namespace for amq pods

        Returns:
            bool: status of pod: True if found pod is running

        """
        # Wait until exactly the expected number of pods match the pattern
        for pods in TimeoutSampler(300, 10, get_pod_name_by_pattern,
                                   pod_pattern, namespace):
            if pods is not None and len(pods) == expected_pods:
                amq_pods = pods
                break
        else:
            # the sampler ended without ever seeing the expected pods
            log.error(
                f"{expected_pods} pod(s) matching {pod_pattern} were not found")
            return False

        # checking pod status
        all_running = True
        for pod in amq_pods:
            if self.pod_obj.wait_for_resource(
                    condition='Running',
                    resource_name=pod,
                    timeout=1600,
                    sleep=30,
            ):
                log.info(f"{pod} pod is up and running")
            else:
                all_running = False
                log.error(f"{pod} pod is not running")

        return all_running

    def setup_amq_kafka_persistent(self, sc_name, size=100, replicas=3):
        """
        Function to setup amq-kafka-persistent, the file is pulling from github
        it will make kind: Kafka and will make sure the status is running

        Args:
            sc_name (str): Name of sc
            size (int): Size of the storage in Gi
            replicas (int): Number of kafka and zookeeper pods to be created

        Returns:
            kafka_persistent object

        Raises:
            ResourceWrongStatusException: when the kafka / zookeeper pods do
                not get to running state

        """
        try:
            kafka_persistent = templating.load_yaml(
                os.path.join(self.dir, self.amq_kafka_pers_yaml))
            kafka_spec = kafka_persistent['spec']['kafka']
            kafka_spec['replicas'] = replicas
            kafka_spec['storage']['volumes'][0]['class'] = sc_name
            kafka_spec['storage']['volumes'][0]['size'] = f"{size}Gi"

            zookeeper_spec = kafka_persistent['spec']['zookeeper']
            zookeeper_spec['replicas'] = replicas
            zookeeper_spec['storage']['class'] = sc_name
            zookeeper_spec['storage']['size'] = f"{size}Gi"

            self.kafka_persistent = OCS(**kafka_persistent)
            self.kafka_persistent.create()

        except (CommandFailed, CalledProcessError):
            log.error('Failed during setup of AMQ Kafka-persistent')
            raise
        time.sleep(40)

        if self.is_amq_pod_running(
                pod_pattern="my-cluster-zookeeper",
                expected_pods=replicas) and self.is_amq_pod_running(
                    pod_pattern="my-cluster-kafka", expected_pods=replicas):
            return self.kafka_persistent

        raise ResourceWrongStatusException(
            "my-cluster-kafka and my-cluster-zookeeper "
            "Pod is not getting to running state")

    def setup_amq_kafka_connect(self):
        """
        The function is to setup amq-kafka-connect, the yaml file is pulling from github
        it will make kind: KafkaConnect and will make sure the status is running

        Returns:
            kafka_connect object

        Raises:
            ResourceWrongStatusException: when the connect pod does not get
                to running state

        """
        try:
            kafka_connect = templating.load_yaml(
                os.path.join(self.dir, self.amq_kafka_connect_yaml))
            self.kafka_connect = OCS(**kafka_connect)
            self.kafka_connect.create()
        except (CommandFailed, CalledProcessError):
            log.error('Failed during setup of AMQ KafkaConnect')
            raise

        if self.is_amq_pod_running(pod_pattern="my-connect-cluster-connect",
                                   expected_pods=1):
            return self.kafka_connect

        raise ResourceWrongStatusException(
            "my-connect-cluster-connect pod is not getting to running state"
        )

    def setup_amq_kafka_bridge(self):
        """
        Function to setup amq-kafka, the file file is pulling from github
        it will make kind: KafkaBridge and will make sure the pod status is running

        Returns:
            kafka_bridge object

        Raises:
            ResourceWrongStatusException: when the bridge pod does not get to
                running state

        """
        try:
            kafka_bridge = templating.load_yaml(
                os.path.join(self.dir, self.amq_kafka_bridge_yaml))
            self.kafka_bridge = OCS(**kafka_bridge)
            self.kafka_bridge.create()
        except (CommandFailed, CalledProcessError):
            log.error('Failed during setup of AMQ KafkaBridge')
            raise

        # Making sure the kafka_bridge is running
        if self.is_amq_pod_running(pod_pattern="my-bridge-bridge",
                                   expected_pods=1):
            return self.kafka_bridge

        raise ResourceWrongStatusException(
            "kafka_bridge_pod pod is not getting to running state")

    def create_kafka_topic(self, name='my-topic', partitions=1, replicas=1):
        """
        Creates kafka topic

        Args:
            name (str): Name of the kafka topic
            partitions (int): Number of partitions
            replicas (int): Number of replicas

        Returns:
            kafka_topic object

        Raises:
            ResourceWrongStatusException: when the topic can not be found
                after it was created

        """
        try:
            kafka_topic = templating.load_yaml(
                os.path.join(self.dir, self.kafka_topic_yaml))
            kafka_topic["metadata"]["name"] = name
            kafka_topic["spec"]["partitions"] = partitions
            kafka_topic["spec"]["replicas"] = replicas
            self.kafka_topic = OCS(**kafka_topic)
            self.kafka_topic.create()
        except (CommandFailed, CalledProcessError):
            log.error('Failed during creating of Kafka topic')
            raise

        # Making sure kafka topic created
        if self.kafka_topic_obj.get(resource_name=name):
            return self.kafka_topic

        raise ResourceWrongStatusException("kafka topic is not created")

    def create_kafka_user(self, name="my-user"):
        """
        Creates kafka user

        Args:
            name (str): Name of the kafka user

        Returns:
            kafka_user object

        Raises:
            ResourceWrongStatusException: when the user can not be found
                after it was created

        """
        try:
            kafka_user = templating.load_yaml(
                os.path.join(self.dir, self.kafka_user_yaml))
            kafka_user["metadata"]["name"] = name
            self.kafka_user = OCS(**kafka_user)
            self.kafka_user.create()
        except (CommandFailed, CalledProcessError):
            log.error('Failed during creating of Kafka user')
            raise

        # Making sure kafka user created
        if self.kafka_user_obj.get(resource_name=name):
            return self.kafka_user

        raise ResourceWrongStatusException("kafka user is not created")

    def create_producer_pod(self, num_of_pods=1, value='10000'):
        """
        Creates producer pods

        Args:
            num_of_pods (int): Number of producer pods to be created
            value (str): Number of the messages to be sent

        Returns:
            producer pod object

        Raises:
            ResourceWrongStatusException: when the producer pods do not get
                to running state

        """
        try:
            producer_pod = templating.load_yaml(
                constants.HELLO_WORLD_PRODUCER_YAML)
            producer_pod["spec"]["replicas"] = num_of_pods
            # presumably env[4] of the template is the message count entry
            # - TODO confirm against the producer YAML
            producer_pod["spec"]["template"]["spec"]["containers"][0]["env"][
                4]["value"] = value
            self.producer_pod = OCS(**producer_pod)
            self.producer_pod.create()
        except (CommandFailed, CalledProcessError):
            log.error('Failed during creation of producer pod')
            raise

        # Making sure the producer pod is running
        if self.is_amq_pod_running(pod_pattern="hello-world-producer",
                                   expected_pods=num_of_pods):
            return self.producer_pod

        raise ResourceWrongStatusException(
            "producer pod is not getting to running state")

    def create_consumer_pod(self, num_of_pods=1, value='10000'):
        """
        Creates consumer pods

        Args:
            num_of_pods (int): Number of consumer pods to be created
            value (str): Number of messages to be received

        Returns:
            consumer pod object

        Raises:
            ResourceWrongStatusException: when the consumer pods do not get
                to running state

        """
        try:
            consumer_pod = templating.load_yaml(
                constants.HELLO_WORLD_CONSUMER_YAML)
            consumer_pod["spec"]["replicas"] = num_of_pods
            # presumably env[4] of the template is the message count entry
            # - TODO confirm against the consumer YAML
            consumer_pod["spec"]["template"]["spec"]["containers"][0]["env"][
                4]["value"] = value
            self.consumer_pod = OCS(**consumer_pod)
            self.consumer_pod.create()
        except (CommandFailed, CalledProcessError):
            log.error('Failed during creation of consumer pod')
            raise

        # Making sure the consumer pod is running
        if self.is_amq_pod_running(pod_pattern="hello-world-consumer",
                                   expected_pods=num_of_pods):
            return self.consumer_pod

        raise ResourceWrongStatusException(
            "consumer pod is not getting to running state")

    def validate_msg(self,
                     pod,
                     namespace=constants.AMQ_NAMESPACE,
                     value='10000',
                     since_time=1800):
        """
        Validate if messages are sent or received

        The pod log is searched for the last expected message, which is
        numbered ``value - 1``.

        Args:
            pod (str): Name of the pod
            namespace (str): Namespace of the pod
            value (str): Number of messages are sent
            since_time (int): Number of seconds to required to sent the msg

        Returns:
            bool : True if all messages are sent/received

        """
        cmd = f"oc logs -n {namespace} {pod} --since={since_time}s"
        msg = run_cmd(cmd)
        # 'in' instead of 'find(...) is -1': identity comparison with an int
        # literal only works by accident of the interpreter.
        return f"Hello world - {int(value) - 1} " in msg

    def validate_messages_are_produced(self,
                                       namespace=constants.AMQ_NAMESPACE,
                                       value='10000',
                                       since_time=1800):
        """
        Validates if all messages are sent in producer pod

        Every producer pod is sampled until its log shows the last message.

        Args:
            namespace (str): Namespace of the pod
            value (str): Number of messages are sent
            since_time (int): Number of seconds to required to sent the msg

        Raises exception on failures

        """
        # ToDo: Support multiple topics and users
        producer_pod_objs = [
            get_pod_obj(pod) for pod in get_pod_name_by_pattern(
                'hello-world-produce', namespace)
        ]
        for pod in producer_pod_objs:
            for all_sent in TimeoutSampler(900, 30, self.validate_msg,
                                           pod.name, namespace, value,
                                           since_time):
                if all_sent:
                    break
            else:
                # Fail only when sampling ended without success; an error
                # raised by the sampler itself propagates to the caller.
                log.error("Few messages are not sent")
                raise Exception(
                    "All messages are not sent from the producer pod")

    def validate_messages_are_consumed(self,
                                       namespace=constants.AMQ_NAMESPACE,
                                       value='10000',
                                       since_time=1800):
        """
        Validates if all messages are received in consumer pod

        Every consumer pod is sampled until its log shows the last message.

        Args:
            namespace (str): Namespace of the pod
            value (str): Number of messages are recieved
            since_time (int): Number of seconds to required to receive the msg

        Raises exception on failures

        """
        # ToDo: Support multiple topics and users
        consumer_pod_objs = [
            get_pod_obj(pod) for pod in get_pod_name_by_pattern(
                'hello-world-consumer', namespace)
        ]
        for pod in consumer_pod_objs:
            for all_received in TimeoutSampler(900, 30, self.validate_msg,
                                               pod.name, namespace, value,
                                               since_time):
                if all_received:
                    log.info(
                        "Consumer pod received all messages sent by producer")
                    break
            else:
                # Fail only when sampling ended without success; an error
                # raised by the sampler itself propagates to the caller.
                log.error("Few messages are not received")
                raise Exception(
                    "Consumer pod did not receive all messages sent by producer"
                )

    def run_in_bg(self,
                  namespace=constants.AMQ_NAMESPACE,
                  value='10000',
                  since_time=1800):
        """
        Validate messages are produced and consumed in bg

        Args:
            namespace (str): Namespace of the pod
            value (str): Number of messages to be sent and received
            since_time (int): Number of seconds to required to sent and receive msg

        Returns:
            list: the started producer and consumer validation threads

        """
        # Todo: Check for each messages sent and received
        log.info("Running open messages on pod in bg")
        threads = []

        for validator in (self.validate_messages_are_produced,
                          self.validate_messages_are_consumed):
            thread = Thread(target=validator,
                            args=(namespace, value, since_time))
            thread.start()
            time.sleep(10)
            threads.append(thread)

        return threads

    # ToDo: Install helm and get kafka metrics

    def create_messaging_on_amq(self,
                                topic_name='my-topic',
                                user_name="my-user",
                                partitions=1,
                                replicas=1,
                                num_of_producer_pods=1,
                                num_of_consumer_pods=1,
                                value='10000'):
        """
        Creates workload using Open Messaging tool on amq cluster

        Args:
            topic_name (str): Name of the topic to be created
            user_name (str): Name of the user to be created
            partitions (int): Number of partitions of topic
            replicas (int): Number of replicas of topic
            num_of_producer_pods (int): Number of producer pods to be created
            num_of_consumer_pods (int): Number of consumer pods to be created
            value (str): Number of messages to be sent and received

        """
        self.create_kafka_topic(topic_name, partitions, replicas)
        self.create_kafka_user(user_name)
        self.create_producer_pod(num_of_producer_pods, value)
        self.create_consumer_pod(num_of_consumer_pods, value)
        # tells cleanup() that the messaging resources exist
        self.messaging = True

    def setup_amq_cluster(self,
                          sc_name,
                          namespace=constants.AMQ_NAMESPACE,
                          size=100,
                          replicas=3):
        """
        Creates amq cluster with persistent storage.

        Args:
            sc_name (str): Name of sc
            namespace (str): Namespace for amq cluster
            size (int): Size of the storage
            replicas (int): Number of kafka and zookeeper pods to be created

        Returns:
            AMQ: this object, after the cluster was set up

        """
        self.setup_amq_cluster_operator(namespace)
        self.setup_amq_kafka_persistent(sc_name, size, replicas)
        self.setup_amq_kafka_connect()
        self.setup_amq_kafka_bridge()
        # tells cleanup() that the cluster resources exist
        self.amq_is_setup = True
        return self

    def cleanup(self, namespace=constants.AMQ_NAMESPACE):
        """
        Clean up function,
        will start to delete from amq cluster operator
        then amq-connector, persistent, bridge, at the end it will delete the created namespace

        Args:
            namespace (str): Created namespace for amq

        """
        if self.amq_is_setup:
            if self.messaging:
                self.consumer_pod.delete()
                self.producer_pod.delete()
                self.kafka_user.delete()
                self.kafka_topic.delete()
            self.kafka_persistent.delete()
            self.kafka_connect.delete()
            self.kafka_bridge.delete()
            # NOTE(review): the setup code uses run() for the matching
            # 'oc apply'; verify that run_cmd forwards shell / check / cwd.
            run_cmd(f'oc delete -f {self.amq_dir}',
                    shell=True,
                    check=True,
                    cwd=self.dir)
        run_cmd(f'oc delete project {namespace}')

        # Reset namespace to default
        switch_to_default_rook_cluster_project()
        self.ns_obj.wait_for_delete(resource_name=namespace)
class TestCouchbaseWorkload(E2ETest):
    """
    Main couchbase workload class
    """
    COUCHBASE_OPERATOR = 'couchbase-operator-namespace'
    WAIT_FOR_TIME = 600
    # Yaml files of the admission controller, in the order they are created
    admission_parts = [
        constants.COUCHBASE_ADMISSION_SERVICE_ACCOUNT_YAML,
        constants.COUCHBASE_ADMISSION_CLUSTER_ROLE_YAML,
        constants.COUCHBASE_ADMISSION_CLUSTER_ROLE_BINDING_YAML,
        constants.COUCHBASE_ADMISSION_SECRET_YAML,
        constants.COUCHBASE_ADMISSION_DEPLOYMENT_YAML,
        constants.COUCHBASE_ADMISSION_SERVICE_YAML,
        constants.COUCHBASE_MUTATING_WEBHOOK_YAML,
        constants.COUCHBASE_VALIDATING_WEBHOOK_YAML
    ]
    pod_obj = OCP(kind='pod')
    couchbase_pod = OCP(kind='pod')
    secretsadder = OCP(kind='pod')
    # Replaced by the admission pod name once that pod is found
    admission_pod = []
    cb_worker = OCS()
    cb_examples = OCS()

    def add_serviceaccount_secret(self, acct_name, dockerstr):
        """
        Add secret for serviceaccount

        Args:
            acct_name (str): Name of the service account
            dockerstr (str): Docker secret

        """
        self.secretsadder.exec_oc_cmd(
            f"secrets add serviceaccount/{acct_name} secrets/{dockerstr} --for=pull"
        )

    def is_up_and_running(self, pod_name, ocp_value):
        """
        Test if the pod specified is up and running.

        Args:
            pod_name (str): Name of pod being checked.
            ocp_value (OCP): object used for running oc commands

        Returns:
            bool: True if the first container of the pod is ready and in
                running state, False otherwise (including when the pod
                reports no container statuses yet)

        """
        if not pod_name:
            return False
        pod_info = ocp_value.exec_oc_cmd(f"get pods {pod_name} -o json")
        # containerStatuses can be missing or empty (e.g. a pod that has not
        # started its containers); treat that as "not running" instead of
        # failing with KeyError, which the wait loops do not catch.
        container_statuses = pod_info.get('status', {}).get(
            'containerStatuses')
        if not container_statuses:
            return False
        first_container = container_statuses[0]
        if not first_container.get('ready'):
            return False
        return 'running' in first_container.get('state', {})

    def test_couchbase_workload_simple(self, pillowfight):
        """
        Deploy a Couchbase server and pillowfight workload using operator

        The couchbase workers do not come up unless there is an admission
        controller running.  The admission controller is started from the
        default project prior to bringing up the operator.  Secrets,
        rolebindings and serviceaccounts need to also be generated.

        Once the couchbase operator is running, we need to wait for the
        three worker pods to also be up.  Then a pillowfight task is started.

        After the pillowfight task has finished, the log is collected and
        analyzed.

        Args:
            pillowfight: object providing run_pillowfights() and
                analyze_all()

        Raises:
            Exception: If pillowfight results indicate that a minimum
                performance level is not reached (1 second response time,
                less than 1000 ops per second)

        """
        # Create admission controller
        log.info("Create admission controller process for Couchbase")
        switch_to_project('default')
        self.up_adm_chk = OCP(namespace="default")
        self.up_check = OCP(namespace=self.COUCHBASE_OPERATOR)
        for adm_yaml in self.admission_parts:
            adm_data = templating.load_yaml(adm_yaml)
            adm_obj = OCS(**adm_data)
            adm_obj.create()

        # Wait for admission pod to be created
        for adm_pod in TimeoutSampler(
            self.WAIT_FOR_TIME,
            3,
            get_pod_name_by_pattern,
            'couchbase-operator-admission',
            'default'
        ):
            try:
                if self.is_up_and_running(adm_pod[0], self.up_adm_chk):
                    self.admission_pod = adm_pod[0]
                    break
            except IndexError:
                # No pod matches the pattern yet
                log.info("Admission pod is not ready yet")

        # Wait for admission pod to be running
        log.info("Waiting for admission pod to be running")
        self.pod_obj.wait_for_resource(
            condition='Running',
            resource_name=self.admission_pod,
            timeout=self.WAIT_FOR_TIME,
            sleep=10,
        )
        self.pod_obj.new_project(self.COUCHBASE_OPERATOR)
        couchbase_data = templating.load_yaml(
            constants.COUCHBASE_CRD_YAML
        )
        self.couchbase_obj = OCS(**couchbase_data)
        self.couchbase_obj.create()
        op_data = templating.load_yaml(constants.COUCHBASE_OPERATOR_ROLE)
        self.operator_role = OCS(**op_data)
        self.operator_role.create()
        self.serviceaccount = OCP(namespace=self.COUCHBASE_OPERATOR)
        self.serviceaccount.exec_oc_cmd(
            "create serviceaccount couchbase-operator"
        )

        # Cut the dockercfg secret name out of the "get secrets" output: it
        # starts at 'couchbase-operator-dockercfg' and ends at the next space.
        # NOTE(review): if the secret is not listed, find() returns -1 and
        # an unrelated string is used - confirm the secret always exists here.
        dockercfgs = self.serviceaccount.exec_oc_cmd("get secrets")
        startloc = dockercfgs.find('couchbase-operator-dockercfg')
        newdockerstr = dockercfgs[startloc:]
        endloc = newdockerstr.find(' ')
        dockerstr = newdockerstr[:endloc]
        self.add_serviceaccount_secret("couchbase-operator", dockerstr)
        self.add_serviceaccount_secret("default", dockerstr)
        self.rolebinding = OCP(namespace=self.COUCHBASE_OPERATOR)
        rolebind_cmd = "".join([
            "create rolebinding couchbase-operator-rolebinding ",
            "--role couchbase-operator ",
            "--serviceaccount couchbase-operator-namespace:couchbase-operator"
        ])
        self.rolebinding.exec_oc_cmd(rolebind_cmd)
        dep_data = templating.load_yaml(constants.COUCHBASE_OPERATOR_DEPLOY)
        self.cb_deploy = OCS(**dep_data)
        self.cb_deploy.create()

        # Wait for couchbase operator pod to be running
        for couchbase_pod in TimeoutSampler(
            self.WAIT_FOR_TIME,
            3,
            get_pod_name_by_pattern,
            'couchbase-operator',
            self.COUCHBASE_OPERATOR
        ):
            try:
                if self.is_up_and_running(couchbase_pod[0], self.up_check):
                    break
            except IndexError:
                log.info("Couchbase operator is not up")

        cb_work = templating.load_yaml(constants.COUCHBASE_WORKER_SECRET)
        self.cb_worker = OCS(**cb_work)
        self.cb_worker.create()
        cb_example = templating.load_yaml(constants.COUCHBASE_WORKER_EXAMPLE)
        self.cb_examples = OCS(**cb_example)
        self.cb_examples.create()

        # Wait for last of three workers to be running.
        for cb_wrk_pod in TimeoutSampler(
            self.WAIT_FOR_TIME,
            3,
            get_pod_name_by_pattern,
            'cb-example-0002',
            self.COUCHBASE_OPERATOR
        ):
            try:
                if self.is_up_and_running(cb_wrk_pod[0], self.up_check):
                    # once last pod is up, make sure all are ready
                    # (cb-example-0000 .. cb-example-0002)
                    workers_ready = [
                        self.is_up_and_running(
                            f"cb-example-{wpodn:04}", self.up_check
                        )
                        for wpodn in range(3)
                    ]
                    if all(workers_ready):
                        break
            except IndexError:
                log.info("Couchbase workers are not up")

        pillowfight.run_pillowfights()
        pillowfight.analyze_all()

    def teardown(self):
        """
        Delete objects created in roughly reverse order of how they were
        created.

        """
        self.cb_examples.delete()
        self.cb_worker.delete()
        self.cb_deploy.delete()
        self.pod_obj.exec_oc_cmd(
            command="delete rolebinding couchbase-operator-rolebinding"
        )
        self.pod_obj.exec_oc_cmd(
            command="delete serviceaccount couchbase-operator"
        )
        self.operator_role.delete()
        self.couchbase_obj.delete()
        switch_to_project('default')
        self.pod_obj.delete_project(self.COUCHBASE_OPERATOR)
        for adm_yaml in self.admission_parts:
            adm_data = templating.load_yaml(adm_yaml)
            adm_obj = OCS(**adm_data)
            adm_obj.delete()
        # Before the code below was added, the teardown task would sometimes
        # fail with the leftover objects because it would still see one of the
        # couchbase pods.
        for admin_pod in TimeoutSampler(
            self.WAIT_FOR_TIME,
            3,
            get_pod_name_by_pattern,
            'couchbase',
            'default'
        ):
            # Keep sampling until no pod matching 'couchbase' is left
            if not admin_pod:
                break
def test_pvc_snapshot_performance_multiple_files(self, file_size, files,
                                                 threads, interface):
    """
    Fill a volume with the SmallFile workload, snapshot it and measure
    how long the snapshot creation takes.

    The fill / snapshot / measure cycle is repeated self.tests_numbers
    times; the individual and average creation times (overall and CSI)
    are logged and pushed to the elastic-search results document.

    Args:
        file_size (int): the size of the file to be create - in KiB
        files (int): number of files each thread will create
        threads (int): number of threads will be used in the workload
        interface (str): the volume interface that will be used
                         CephBlockPool / CephFileSystem

    Raises:
        TimeoutError : if the file creation does not report completion
            within 7200 seconds (2 hours)

    """
    # The SmallFiles workload needs an elastic-search server, so one is
    # deployed here for all iterations and removed after the last one.
    self.es = ElasticSearch()

    # Load the benchmark template; its fields are overridden below.
    sf_data = templating.load_yaml(constants.SMALLFILE_BENCHMARK_YAML)

    storageclass = (
        constants.DEFAULT_STORAGECLASS_RBD
        if interface == constants.CEPHBLOCKPOOL
        else constants.DEFAULT_STORAGECLASS_CEPHFS
    )
    log.info(f"Using {storageclass} Storageclass")

    # Workload parameters for this test
    workload_args = sf_data["spec"]["workload"]["args"]
    workload_args["samples"] = 1
    workload_args["operation"] = ["create"]
    workload_args["file_size"] = file_size
    workload_args["files"] = files
    workload_args["threads"] = threads
    workload_args["storageclass"] = storageclass
    sf_data["spec"]["elasticsearch"] = {
        "url": f"http://{self.es.get_ip()}:{self.es.get_port()}"
    }

    # Volume size: three times the data written (replica factor), but never
    # less than 100Gi. file_size is in KiB while the volume size is in GiB,
    # hence the constants.GB2KB conversion.
    total_files = int(files * threads)
    total_data = int(files * threads * file_size / constants.GB2KB)
    data_set = int(total_data * 3)  # calculate data with replica
    vol_size = max(data_set, 100)
    workload_args["storagesize"] = f"{vol_size}Gi"

    environment = get_environment_info()
    if environment["user"] != "":
        sf_data["spec"]["test_user"] = environment["user"]
    else:
        # since full results object need this parameter, initialize it from CR file
        environment["user"] = sf_data["spec"]["test_user"]
    sf_data["spec"]["clustername"] = environment["clustername"]
    log.debug(f"The smallfile yaml file is {sf_data}")

    # The benchmark-operator runs the SmallFiles workload; deploy it and
    # work inside its namespace from here on.
    log.info("Deploy the benchmark-operator")
    self.deploy_benchmark_operator()
    switch_to_project(BMO_NAME)

    all_results = []

    self.results_path = get_full_test_logs_path(cname=self)
    log.info(f"Logs file path name is : {self.full_log_path}")

    # Produce ES report

    # Collecting environment information
    self.get_env_info()

    # Initialize the results doc file.
    results_doc = ResultsAnalyse(
        self.uuid,
        self.crd_data,
        self.full_log_path,
        "pvc_snapshot_perf_multiple_files",
    )
    self.full_results = self.init_full_results(results_doc)
    self.full_results.add_key("file_size_inKB", file_size)
    self.full_results.add_key("threads", threads)
    self.full_results.add_key("interface", interface)

    for test_num in range(self.tests_numbers):
        test_results = {"creation_time": None, "csi_creation_time": None}

        # deploy the smallfile workload
        log.info("Running SmallFile bench")
        sf_obj = OCS(**sf_data)
        sf_obj.create()

        # wait for benchmark pods to get created - takes a while
        for pod_names in TimeoutSampler(
            240,
            10,
            get_pod_name_by_pattern,
            "smallfile-client",
            BMO_NAME,
        ):
            try:
                if pod_names[0] is not None:
                    small_file_client_pod = pod_names[0]
                    break
            except IndexError:
                log.info("Bench pod not ready yet")

        pod_ocp = OCP(kind="pod", namespace=BMO_NAME)
        log.info("Waiting for SmallFile benchmark to Run")
        assert pod_ocp.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            resource_name=small_file_client_pod,
            sleep=30,
            timeout=600,
        )

        # Find the PVC used by the client pod; defined up front so it is
        # available after the loops.
        pvc_name = ""
        for pod_item in pod_ocp.get()["items"]:
            if pod_item.get("metadata").get("name") != small_file_client_pod:
                continue
            for volume in pod_item.get("spec").get("volumes"):
                if "persistentVolumeClaim" in volume:
                    pvc_name = volume["persistentVolumeClaim"]["claimName"]
                    break
        log.info(f"Benchmark PVC name is : {pvc_name}")

        # Creation of 1M files on CephFS can take a lot of time: poll the
        # client log every 30 seconds, giving up once 7200 seconds of
        # budget have been used.
        for remaining in range(7200 - 30, -1, -30):
            logs = pod_ocp.get_logs(name=small_file_client_pod)
            if "RUN STATUS DONE" in logs:
                break
            if remaining == 0:
                raise TimeoutError(
                    "Timed out waiting for benchmark to complete")
            time.sleep(30)
        log.info(f"Smallfile test ({test_num + 1}) finished.")

        # Taking snapshot of the PVC (which contain files)
        snap_name = pvc_name.replace("claim", "snapshot-")
        log.info(f"Taking snapshot of the PVC {pvc_name}")
        log.info(f"Snapshot name : {snap_name}")

        start_time = datetime.datetime.utcnow().strftime(
            "%Y-%m-%dT%H:%M:%SZ")

        test_results["creation_time"] = self.measure_create_snapshot_time(
            pvc_name=pvc_name,
            snap_name=snap_name,
            namespace=BMO_NAME,
            interface=interface,
            start_time=start_time,
        )
        log.info(
            f"Snapshot with name {snap_name} and id {self.snap_uid} creation time is"
            f' {test_results["creation_time"]} seconds')

        test_results["csi_creation_time"] = (
            performance_lib.measure_csi_snapshot_creation_time(
                interface=interface,
                snapshot_id=self.snap_uid,
                start_time=start_time,
            )
        )
        log.info(
            f"Snapshot with name {snap_name} and id {self.snap_uid} csi creation time is"
            f' {test_results["csi_creation_time"]} seconds')

        all_results.append(test_results)

        # Delete the smallfile workload - which will delete also the PVC
        log.info("Deleting the smallfile workload")
        if sf_obj.delete(wait=True):
            log.info("The smallfile workload was deleted successfully")

        # Delete VolumeSnapshots
        log.info("Deleting the snapshots")
        if self.snap_obj.delete(wait=True):
            log.info("The snapshot deleted successfully")
        log.info("Verify (and wait if needed) that ceph health is OK")
        ceph_health_check(tries=45, delay=60)

        # Sleep for 1 Min. between test samples
        time.sleep(60)

    # Cleanup the elasticsearch instance.
    log.info("Deleting the elastic-search instance")
    self.es.cleanup()

    creation_times = [result["creation_time"] for result in all_results]
    avg_c_time = statistics.mean(creation_times)
    csi_creation_times = [
        result["csi_creation_time"] for result in all_results
    ]
    avg_csi_c_time = statistics.mean(csi_creation_times)

    # Report the dataset size without the replica factor
    dataset_gib = int(data_set / 3)

    log.info(f"Full test report for {interface}:")
    log.info(f"Test ran {self.tests_numbers} times, "
             f"All snapshot creation results are {creation_times} seconds")
    log.info(
        f"The average snapshot creation time is : {avg_c_time} seconds")
    log.info(f"Test ran {self.tests_numbers} times, "
             f"All snapshot csi creation results are {csi_creation_times}")
    log.info(
        f"The average csi snapshot creation time is : {avg_csi_c_time}")
    log.info(f"Number of Files on the volume : {total_files:,}, "
             f"Total dataset : {dataset_gib} GiB")

    self.full_results.add_key("avg_snapshot_creation_time_insecs",
                              avg_c_time)
    self.full_results.all_results["total_files"] = total_files
    self.full_results.all_results["total_dataset"] = dataset_gib
    self.full_results.all_results["creation_time"] = creation_times
    self.full_results.all_results["csi_creation_time"] = csi_creation_times

    # Write the test results into the ES server
    log.info("writing results to elastic search server")
    if self.full_results.es_write():
        res_link = self.full_results.results_link()
        # write the ES link to the test results in the test log.
        log.info(f"The result can be found at : {res_link}")

        # Create text file with results of all subtest
        self.write_result_to_file(res_link)