def test_add_capacity_with_resource_delete(
    self,
    add_capacity_setup,
    workload_storageutilization_rbd,
    resource_name,
    resource_id,
    is_kill_resource_repeatedly,
):
    """
    The function get the resource name, and id.
    The function adds capacity to the cluster, and then delete the resource
    while storage capacity is getting increased.

    Args:
        resource_name (str): the name of the resource to delete
        resource_id (int): the id of the resource to delete
        is_kill_resource_repeatedly (bool): If True then kill the resource
            repeatedly. Else, if False delete the resource only once.

    """
    # The storage-utilization workload fixture has already run to completion
    used_percentage = get_percent_used_capacity()
    logging.info(
        f"storageutilization is completed. used capacity = {used_percentage}"
    )

    # Remember the current OSD pod count so the new OSDs can be detected later
    osd_pods_before = pod_helpers.get_osd_pods()
    number_of_osd_pods_before = len(osd_pods_before)
    d = Disruptions()
    d.set_resource(resource_name)

    # Cleared before the capacity change and set True at the end;
    # presumably consulted by kill_resource_repeatedly in the background
    # thread to know when to stop — TODO confirm
    self.new_pods_in_status_running = False
    osd_size = storage_cluster.get_osd_size()
    logging.info(f"Adding one new set of OSDs. osd size = {osd_size}")
    storagedeviceset_count = storage_cluster.add_capacity(osd_size)
    logging.info("Adding one new set of OSDs was issued without problems")

    # Wait for new osd's to come up. After the first new osd in status Init -
    # delete the resource. After deleting the resource we expect that all the
    # new osd's will be in status running, and the delete resource will be
    # also in status running.
    pod_helpers.wait_for_new_osd_pods_to_come_up(number_of_osd_pods_before)
    logging.info(
        f"Delete a {resource_name} pod while storage capacity is getting increased"
    )
    if is_kill_resource_repeatedly:
        # Kill the resource from a worker thread while this thread waits for
        # the OSD pods; leaving the 'with' block joins the worker thread
        with ThreadPoolExecutor() as executor:
            executor.submit(self.kill_resource_repeatedly, resource_name, resource_id)
            self.wait_for_osd_pods_to_be_running(storagedeviceset_count)
    else:
        d.delete_resource(resource_id)
        self.wait_for_osd_pods_to_be_running(storagedeviceset_count)

    # Signal completion (see flag note above)
    self.new_pods_in_status_running = True
    logging.info(
        "Finished verifying add capacity when one of the pods gets deleted"
    )
    logging.info("Waiting for ceph health check to finished...")
    check_ceph_health_after_add_capacity()
def test_new_sc_new_rbd_pool(
    self,
    replica,
    compression,
    volume_binding_mode,
    pvc_status,
    storageclass_factory,
    pvc_factory,
    pod_factory,
):
    """
    End-to-end check of a Storage Class backed by a freshly created rbd pool:

    1. Creates Storage Class with creating new rbd pool
    2. Creates PVCs using new Storage Class
    3. Mount PVC to an app pod
    4. Run IO on an app pod and validate compression/replica data on the pool
    """
    interface = constants.CEPHBLOCKPOOL

    # Storage Class on top of a brand new cephblockpool
    sc = storageclass_factory(
        interface=interface,
        new_rbd_pool=True,
        replica=replica,
        compression=compression,
        volume_binding_mode=volume_binding_mode,
    )

    log.info(f"Creating a PVC using {sc.name}")
    pvc = pvc_factory(
        interface=interface, storageclass=sc, size=10, status=pvc_status
    )
    log.info(f"PVC: {pvc.name} created successfully using {sc.name}")

    # Attach the PVC to an application pod
    log.info(f"Creating an app pod and mount {pvc.name}")
    pod = pod_factory(interface=interface, pvc=pvc)
    log.info(f"{pod.name} created successfully and mounted {pvc.name}")

    # Generate compressible IO so pool compression has work to do
    log.info(f"Running FIO on {pod.name}")
    pod.run_io(
        "fs",
        size="1G",
        rate="1500m",
        runtime=60,
        buffer_compress_percentage=60,
        buffer_pattern="0xdeadface",
        bs="8K",
        jobs=5,
        readwrite="readwrite",
    )

    used_space = get_percent_used_capacity()
    log.info(
        f"Cluster used space with replica size {replica}, "
        f"compression mode {compression}={used_space}"
    )

    pool_name = sc.get().get("parameters").get("pool")
    if compression != "none":
        validate_compression(pool_name)
    validate_replica_data(pool_name, replica)
def calculate_crd_data(self):
    """
    Getting the storage capacity and calculate pod count and pvc size.

    Reads the cluster's used and total capacity, derives how much data still
    needs to be written to reach ``self.percent_to_fill``, and sizes the
    smallfile workload accordingly (number of servers, file size, storage
    size, block size, jobs and iodepth).

    Side effects:
        Updates ``self.percent_to_fill``, ``self.total_data_set``,
        ``self.filesize``, ``self.servers`` and the workload args in
        ``self.crd_data``.
    """
    ceph_used_capacity_percent = get_percent_used_capacity()
    logger.info(f"Ceph used capacity percent is {ceph_used_capacity_percent}%")

    ceph_capacity = self.ceph_cluster.get_ceph_capacity()
    logger.info(f"Total storage capacity is {ceph_capacity} GiB")

    # Only the remaining percentage has to be filled by this workload
    self.percent_to_fill = self.percent_to_fill - ceph_used_capacity_percent
    logger.info(f"Percentage to fill is {self.percent_to_fill}%")

    self.total_data_set = int(ceph_capacity * (int(self.percent_to_fill) / 100))

    # NOTE: the original code also parsed the 'filesize' value out of
    # self.crd_data here, but the result was unconditionally overwritten
    # below — that dead read has been removed.

    # Make sure that filesize >= 10 GiB and servers <= 60
    self.servers = 60
    self.filesize = int(self.total_data_set / self.servers)
    if self.filesize < 10:
        self.filesize = 10
        self.servers = int(self.total_data_set / self.filesize)

    self.crd_data["spec"]["workload"]["args"]["filesize"] = f"{self.filesize}GiB"
    self.crd_data["spec"]["workload"]["args"][
        "storagesize"
    ] = f"{int(self.total_data_set)}Gi"
    self.crd_data["spec"]["workload"]["args"]["servers"] = self.servers
    self.crd_data["spec"]["workload"]["args"]["bs"] = "1024KiB"
    self.crd_data["spec"]["workload"]["args"]["jobs"] = ["write", "read"]
    self.crd_data["spec"]["workload"]["args"]["iodepth"] = 1
def test_new_sc_new_rbd_pool_e2e_wl(
    self,
    storageclass_factory,
    amq_factory_fixture,
    couchbase_factory_fixture,
    pgsql_factory_fixture,
    replica,
    compression,
):
    """
    Testing workloads on new storage class with new cephblockpool
    """
    # Storage Class backed by a newly created rbd pool
    sc = storageclass_factory(
        interface=constants.CEPHBLOCKPOOL,
        new_rbd_pool=True,
        replica=replica,
        compression=compression,
    )

    # Start the three workloads against the new storage class
    self.amq, self.threads = amq_factory_fixture(sc_name=sc.name)
    self.cb = couchbase_factory_fixture(sc_name=sc.name, run_in_bg=True)
    self.pgsql = pgsql_factory_fixture(
        replicas=3, clients=3, transactions=600, sc_name=sc.name
    )

    # Couchbase runs in the background — block until it completes
    monitor = flowtest.BackgroundOps()
    pending = [self.cb.result]
    monitor.wait_for_bg_operations(pending, timeout=3600)

    used_space = get_percent_used_capacity()
    log.info(
        f" Cluster used percentage space with replica size {replica}, "
        f"compression mode {compression}={used_space}"
    )
def test_new_sc_new_rbd_pool_e2e_wl(
    self,
    storageclass_factory,
    amq_factory_fixture,
    couchbase_factory_fixture,
    pgsql_factory_fixture,
    replica,
    compression,
):
    """
    Testing workloads on new storage class with new cephblockpool

    Creates a Storage Class backed by a new rbd pool, runs AMQ in the
    foreground plus Couchbase and PostgreSQL workloads in background
    threads, waits for all of them, and logs the resulting used capacity.
    """
    interface_type = constants.CEPHBLOCKPOOL
    sc_obj = storageclass_factory(
        interface=interface_type,
        new_rbd_pool=True,
        replica=replica,
        compression=compression,
    )

    bg_handler = flowtest.BackgroundOps()
    # Use the executor as a context manager so its worker threads are always
    # cleaned up — the original never called shutdown() and leaked them
    with ThreadPoolExecutor(max_workers=5) as executor_run_bg_ios_ops:
        self.amq, self.threads = amq_factory_fixture(sc_name=sc_obj.name)
        cb_workload = executor_run_bg_ios_ops.submit(
            bg_handler.handler,
            couchbase_factory_fixture,
            sc_name=sc_obj.name,
            replicas=3,
            skip_analyze=True,
            run_in_bg=False,
            num_items="1000",
            num_threads="1",
            iterations=1,
        )
        pgsql_workload = executor_run_bg_ios_ops.submit(
            bg_handler.handler,
            pgsql_factory_fixture,
            replicas=1,
            clients=1,
            transactions=100,
            timeout=100,
            sc_name=sc_obj.name,
            iterations=1,
        )

        # Wait on the two futures; the original created a second (redundant)
        # BackgroundOps instance here — waiting only touches the futures
        bg_ops = [pgsql_workload, cb_workload]
        bg_handler.wait_for_bg_operations(bg_ops, timeout=3600)

    # AMQ Validate the results
    log.info("Validate message run completely")
    for thread in self.threads:
        thread.result(timeout=1800)

    cluster_used_space = get_percent_used_capacity()
    log.info(
        f" Cluster used percentage space with replica size {replica}, "
        f"compression mode {compression}={cluster_used_space}"
    )
def test_add_capacity_osd_pod_delete(self, workload_storageutilization_rbd):
    """
    Test add capacity when one of the osd pods gets deleted in the middle of the process.
    """
    used = get_percent_used_capacity()
    logging.info(f"storageutilization is completed. used capacity = {used}")

    # Skip when the cluster already holds the maximum number of OSDs
    max_osds = 15
    initial_osd_count = len(pod_helpers.get_osd_pods())
    if initial_osd_count >= max_osds:
        pytest.skip("We have maximum of osd's in the cluster")

    disruption = Disruptions()
    disruption.set_resource("osd")

    size = storage_cluster.get_osd_size()
    logging.info(f"Adding one new set of OSDs. osd size = {size}")
    device_set_count = storage_cluster.add_capacity(size)
    logging.info("Adding one new set of OSDs was issued without problems")

    # OSD number go down by one and then gradually go up by 1
    # and finally the OSD number will be storagedeviceset_count*3
    pod_helpers.wait_for_new_osd_pods_to_come_up(initial_osd_count)
    logging.info("Delete an osd pod while storage capacity is getting increased")
    disruption.delete_resource(1)

    osd_pods = OCP(
        kind=constants.POD, namespace=config.ENV_DATA["cluster_namespace"]
    )
    osd_pods.wait_for_resource(
        timeout=420,
        condition=constants.STATUS_RUNNING,
        selector="app=rook-ceph-osd",
        resource_count=device_set_count * 3,
    )

    logging.info("Finished verifying add capacity when one of the osd pods gets deleted")
    logging.info("Waiting for ceph health check to finished...")
    ceph_health_check(namespace=config.ENV_DATA["cluster_namespace"], tries=80)
def cluster_filler(self):
    """
    Fill the cluster with data until the required used-capacity percentage
    is reached.

    Downloads a remote file locally, uploads it to every pod in
    ``self.pods_to_fill`` in parallel, then repeatedly submits copy
    operations (``self.filler``) on those pods until
    ``get_percent_used_capacity()`` exceeds ``self.percent_required_filled``.

    Returns:
        bool: True once the cluster is filled to the expected capacity

    """
    curl_cmd = (
        f""" curl {constants.REMOTE_FILE_URL} --output {constants.FILE_PATH} """
    )
    logging.info("downloading......")
    run_cmd(cmd=curl_cmd)
    logging.info("finished")

    # Upload the file to all pods concurrently; leaving the 'with' block
    # joins all upload threads before the fill loop starts
    with ThreadPoolExecutor() as executor:
        for pod in self.pods_to_fill:
            executor.submit(
                pod_helpers.upload,
                pod.name,
                constants.FILE_PATH,
                "/mnt/",
                namespace=self.namespace,
            )
            logging.info(f"### initiated downloader for {pod.name}")

    # NOTE(review): filler_executor is never shut down, so copy operations
    # submitted to it may still be running when this function returns —
    # confirm whether that background continuation is intended
    filler_executor = ThreadPoolExecutor()
    while not self.cluster_filled:
        for copy_iter in range(self.concurrent_copies):
            for each_pod in self.pods_to_fill:
                # Re-sample the used capacity before every submission so we
                # stop as soon as the target is crossed
                self.used_capacity = get_percent_used_capacity()
                logging.info(
                    f"### used capacity %age = {self.used_capacity}")
                if self.used_capacity <= self.percent_required_filled:
                    filler_executor.submit(self.filler, each_pod)
                    logging.info(
                        f"#### Ran copy operation on pod {each_pod.name}. copy_iter # {copy_iter}"
                    )
                else:
                    logging.info(
                        f"############ Cluster filled to the expected capacity "
                        f"{self.percent_required_filled}")
                    self.cluster_filled = True
                    break
            if self.cluster_filled:
                return True
def reach_cluster_load_percentage(self):
    """
    Reach the cluster limit and then drop to the given target percentage.
    The number of pods needed for the desired target percentage is determined by
    creating pods one by one, while examining the cluster latency. Once the latency
    is greater than 250 ms and it is growing exponentially, it means that
    the cluster limit has been reached.
    Then, dropping to the target percentage by deleting all pods and re-creating
    ones with smaller value of FIO 'rate' param.
    This leaves the number of pods needed running IO for cluster load to
    be around the desired percentage.
    """
    # Guard clauses: bail out on a missing or out-of-range target
    if not self.target_percentage:
        logger.warning("The target percentage was not provided. Breaking")
        return
    if not 0.1 < self.target_percentage < 0.95:
        logger.warning(
            f"The target percentage is {self.target_percentage * 100}% which is "
            "not within the accepted range. Therefore, IO will not be started"
        )
        return

    low_diff_counter = 0
    cluster_limit = None
    latency_vals = list()
    time_to_wait = 60 * 30  # 30-minute cap on the limit-finding loop
    time_before = time.time()

    self.current_iops = self.get_query(query=constants.IOPS_QUERY)

    # Creating FIO DeploymentConfig pods one by one, with a large value of FIO
    # 'rate' arg. This in order to determine the cluster limit faster.
    # Once determined, these pods will be deleted. Then, new FIO DC pods will be
    # created, with a smaller value of 'rate' param. This in order to be more
    # accurate with reaching the target percentage
    while True:
        # Only wait between pod creations once more than one DC pod exists
        wait = False if len(self.dc_objs) <= 1 else True
        self.increase_load_and_print_data(rate='250M', wait=wait)
        if self.current_iops > self.previous_iops:
            cluster_limit = self.current_iops

        latency = self.calc_trim_metric_mean(metric=constants.LATENCY_QUERY) * 1000
        latency_vals.append(latency)
        logger.info(f"Latency values: {latency_vals}")

        # IOPS change (in %) between iterations; a streak of small diffs
        # suggests the cluster cannot deliver more IOPS
        iops_diff = (self.current_iops / self.previous_iops * 100) - 100
        low_diff_counter += 1 if -15 < iops_diff < 10 else 0

        cluster_used_space = get_percent_used_capacity()

        if len(latency_vals) > 1 and latency > 250:
            # Checking for an exponential growth. In case the latest latency sample
            # value is more than 128 times the first latency value sample, we can conclude
            # that the cluster limit in terms of IOPS, has been reached.
            # See https://blog.docbert.org/vdbench-curve/ for more details.
            # In other cases, when the first latency sample value is greater than 3 ms,
            # the multiplication factor we check according to, is lower, in order to
            # determine the cluster load faster.
            if latency > latency_vals[0] * 2 ** 7 or (
                3 < latency_vals[0] < 50 and len(latency_vals) > 5
            ):
                logger.info(
                    wrap_msg("The cluster limit was determined by latency growth")
                )
                break

            # In case the latency is greater than 2 seconds,
            # most chances the limit has been reached
            elif latency > 2000:
                logger.info(
                    wrap_msg(f"The limit was determined by the high latency - {latency} ms")
                )
                break

            # For clusters that their nodes do not meet the minimum
            # resource requirements, the cluster limit is being reached
            # while the latency remains low. For that, the cluster limit
            # needs to be determined by the following condition of IOPS
            # diff between FIO pod creation iterations
            elif low_diff_counter > 3:
                logger.warning(
                    wrap_msg(
                        "Limit was determined by low IOPS diff between "
                        f"iterations - {iops_diff:.2f}%"
                    )
                )
                break

        elif time.time() > time_before + time_to_wait:
            logger.warning(
                wrap_msg(
                    "Could not determine the cluster IOPS limit within"
                    f"the given {time_to_wait} seconds timeout. Breaking"
                )
            )
            break

        elif cluster_used_space > 60:
            logger.warning(
                wrap_msg(
                    f"Cluster used space is {cluster_used_space}%. Could "
                    "not reach the cluster IOPS limit before the "
                    "used spaced reached 60%. Breaking"
                )
            )
            break

    # NOTE(review): cluster_limit stays None if IOPS never increased between
    # iterations before a break — the :.2f format below would then raise
    self.cluster_limit = cluster_limit
    logger.info(wrap_msg(f"The cluster IOPS limit is {self.cluster_limit:.2f}"))
    logger.info("Deleting all DC FIO pods that have large FIO rate")
    while self.dc_objs:
        self.decrease_load(wait=False)

    target_iops = self.cluster_limit * self.target_percentage

    # Per-IOPS-range tuning: (FIO rate in MB, stop threshold factor,
    # no-wait threshold factor)
    range_map = RangeKeyDict(
        {
            (0, 500): (6, 0.82, 0.4),
            (500, 1000): (8, 0.84, 0.45),
            (1000, 1500): (10, 0.86, 0.5),
            (1500, 2000): (12, 0.88, 0.55),
            (2000, 2500): (14, 0.90, 0.6),
            (2500, 3000): (16, 0.92, 0.65),
            (3000, 3500): (18, 0.94, 0.7),
            (3500, math.inf): (20, 0.96, 0.75),
        }
    )
    self.rate = f'{range_map[target_iops][0]}M'
    # Creating the first pod of small FIO 'rate' param, to speed up the process.
    # In the meantime, the load will drop, following the deletion of the
    # FIO pods with large FIO 'rate' param
    logger.info("Creating FIO pods, one by one, until the target percentage is reached")
    self.increase_load_and_print_data(rate=self.rate)
    msg = (
        f"The target load, in IOPS, is: {target_iops}, which is "
        f"{self.target_percentage*100}% of the {self.cluster_limit} cluster limit"
    )
    logger.info(wrap_msg(msg))

    # Add pods until current IOPS reaches the range-specific fraction of the
    # target; skip waiting while still far below the target
    while self.current_iops < target_iops * range_map[target_iops][1]:
        wait = False if self.current_iops < target_iops * range_map[target_iops][2] else True
        self.increase_load_and_print_data(rate=self.rate, wait=wait)

    msg = f"The target load, of {self.target_percentage * 100}%, has been reached"
    logger.info(wrap_msg(msg))
    self.target_pods_number = len(self.dc_objs)
def reach_cluster_load_percentage(self):
    """
    Reach the cluster limit and then drop to the given target percentage.
    The number of pods needed for the desired target percentage is determined by
    creating pods one by one, while examining the cluster latency. Once the latency
    is greater than 250 ms and it is growing exponentially, it means that
    the cluster limit has been reached.
    Then, dropping to the target percentage by deleting all pods and re-creating
    ones with smaller value of FIO 'rate' param.
    This leaves the number of pods needed running IO for cluster load to
    be around the desired percentage.
    """
    # Guard clauses: bail out on a missing or out-of-range target
    if not self.target_percentage:
        logger.warning("The target percentage was not provided. Breaking")
        return
    if not 0.1 < self.target_percentage < 0.95:
        logger.warning(
            f"The target percentage is {self.target_percentage * 100}% which is "
            "not within the accepted range. Therefore, IO will not be started"
        )
        return

    low_diff_counter = 0
    limit_reached = False
    cluster_limit = None
    latency_vals = list()
    time_to_wait = 60 * 30  # 30-minute cap on the limit-finding loop
    time_before = time.time()

    self.current_iops = self.get_query(query=constants.IOPS_QUERY)

    # Creating FIO DeploymentConfig pods one by one, with a large value of FIO
    # 'rate' arg. This in order to determine the cluster limit faster.
    # Once determined, these pods will be deleted. Then, new FIO DC pods will be
    # created, with a smaller value of 'rate' param. This in order to be more
    # accurate with reaching the target percentage
    rate = '250M'
    while not limit_reached:
        self.increase_load_and_print_data(rate=rate)
        if self.current_iops > self.previous_iops:
            cluster_limit = self.current_iops

        latency = self.calc_trim_metric_mean(
            metric=constants.LATENCY_QUERY) * 1000
        latency_vals.append(latency)
        logger.info(f"Latency values: {latency_vals}")

        if len(latency_vals) > 1 and latency > 250:
            # Checking for an exponential growth
            if latency > latency_vals[0] * 2**7:
                logger.info("Latency exponential growth was detected")
                limit_reached = True

        # In case the latency is greater than 3 seconds,
        # most chances the limit has been reached
        if latency > 3000:
            logger.info("Limit was determined by latency, which is "
                        f"higher than 3 seconds - {latency} ms")
            limit_reached = True

        # For clusters that their nodes do not meet the minimum
        # resource requirements, the cluster limit is being reached
        # while the latency remains low. For that, the cluster limit
        # needs to be determined by the following condition of IOPS
        # diff between FIO pod creation iterations
        iops_diff = (self.current_iops / self.previous_iops * 100) - 100
        low_diff_counter += 1 if -15 < iops_diff < 10 else 0
        if low_diff_counter > 3:
            logger.warning("Limit was determined by low IOPS diff between "
                           f"iterations - {iops_diff:.2f}%")
            limit_reached = True

        if time.time() > time_before + time_to_wait:
            logger.warning(
                "Could not determine the cluster IOPS limit within"
                f"\nthe given {time_to_wait} seconds timeout. Breaking")
            limit_reached = True

        cluster_used_space = get_percent_used_capacity()
        if cluster_used_space > 60:
            logger.warning(
                f"Cluster used space is {cluster_used_space}%. Could "
                "not reach the cluster IOPS limit before the "
                "used spaced reached 60%. Breaking")
            limit_reached = True

    # NOTE(review): cluster_limit stays None if IOPS never increased between
    # iterations before the loop ended — the :.2f format below would then raise
    self.cluster_limit = cluster_limit
    logger.info(
        wrap_msg(f"The cluster IOPS limit is {self.cluster_limit:.2f}"))
    logger.info(
        f"Deleting all DC FIO pods that have FIO rate parameter of {rate}")
    while self.dc_objs:
        self.decrease_load(wait=False)

    # Creating the first pod of small FIO 'rate' param, to speed up the process.
    # In the meantime, the load will drop, following the deletion of the
    # FIO pods with large FIO 'rate' param
    rate = '15M'
    logger.info(
        f"Creating FIO pods with a rate parameter of {rate}, one by "
        "one, until the target percentage is reached")
    self.increase_load(rate=rate)
    target_iops = self.cluster_limit * self.target_percentage
    self.current_iops = self.get_query(query=constants.IOPS_QUERY)
    msg = (
        f"The target load, in IOPS, is: {target_iops}, which is "
        f"{self.target_percentage*100}% of the {self.cluster_limit} cluster limit"
    )
    logger.info(wrap_msg(msg))

    # Add pods until IOPS reaches 95% of the target; only start waiting
    # between additions once past half of the target
    while self.current_iops < target_iops * 0.95:
        wait = False if self.current_iops < target_iops / 2 else True
        self.increase_load_and_print_data(rate=rate, wait=wait)

    msg = f"The target load, of {self.target_percentage * 100}%, has been reached"
    logger.info(wrap_msg(msg))
    self.target_pods_number = len(self.dc_objs)