def add_capacity_test():
    osd_size = storage_cluster.get_osd_size()
    existing_osd_pods = get_osd_pods()
    existing_osd_pod_names = [pod.name for pod in existing_osd_pods]
    if ui_add_capacity_conditions():
        try:
            result = ui_add_capacity(osd_size)
        except Exception as e:
            logging.error(
                f"Add capacity via UI is not applicable, falling back to the CLI method. The error is {e}"
            )
            result = storage_cluster.add_capacity(osd_size)
    else:
        result = storage_cluster.add_capacity(osd_size)

    osd_pods_post_expansion = get_osd_pods()
    osd_pod_names_post_expansion = [pod.name for pod in osd_pods_post_expansion]
    restarted_osds = []
    logger.info(
        "Checking if existing OSD pods were restarted (deleted) post add capacity (bug 1931601)"
    )
    for pod_name in existing_osd_pod_names:
        if pod_name not in osd_pod_names_post_expansion:
            restarted_osds.append(pod_name)
    assert (
        len(restarted_osds) == 0
    ), f"The following OSD pods were restarted (deleted) post add capacity: {restarted_osds}"

    pod = OCP(kind=constants.POD, namespace=config.ENV_DATA["cluster_namespace"])
    if is_flexible_scaling_enabled():
        replica_count = 1
    else:
        replica_count = 3
    pod.wait_for_resource(
        timeout=300,
        condition=constants.STATUS_RUNNING,
        selector="app=rook-ceph-osd",
        resource_count=result * replica_count,
    )

    # Verify status of rook-ceph-osd-prepare pods. Verifies bug 1769061
    # pod.wait_for_resource(
    #     timeout=300,
    #     condition=constants.STATUS_COMPLETED,
    #     selector=constants.OSD_PREPARE_APP_LABEL,
    #     resource_count=result * 3
    # )
    # Commented these lines as a workaround due to bug 1842500

    # Verify OSDs are encrypted
    if config.ENV_DATA.get("encryption_at_rest"):
        osd_encryption_verification()

    check_ceph_health_after_add_capacity(ceph_rebalance_timeout=3600)
def add_capacity_test():
    osd_size = storage_cluster.get_osd_size()
    result = storage_cluster.add_capacity(osd_size)
    pod = OCP(kind=constants.POD, namespace=config.ENV_DATA["cluster_namespace"])
    pod.wait_for_resource(
        timeout=300,
        condition=constants.STATUS_RUNNING,
        selector="app=rook-ceph-osd",
        resource_count=result * 3,
    )

    # Verify status of rook-ceph-osd-prepare pods. Verifies bug 1769061
    # pod.wait_for_resource(
    #     timeout=300,
    #     condition=constants.STATUS_COMPLETED,
    #     selector=constants.OSD_PREPARE_APP_LABEL,
    #     resource_count=result * 3
    # )
    # Commented these lines as a workaround due to bug 1842500

    # Verify OSDs are encrypted
    if config.ENV_DATA.get("encryption_at_rest"):
        osd_encryption_verification()

    ceph_health_check(namespace=config.ENV_DATA["cluster_namespace"], tries=80)
    ceph_cluster_obj = CephCluster()
    assert ceph_cluster_obj.wait_for_rebalance(
        timeout=5400
    ), "Data re-balance failed to complete"
def test_add_capacity_with_resource_delete(
    self,
    add_capacity_setup,
    workload_storageutilization_rbd,
    resource_name,
    resource_id,
    is_kill_resource_repeatedly,
):
    """
    Add capacity to the cluster and delete the given resource while the
    storage capacity is being increased.

    Args:
        resource_name (str): the name of the resource to delete
        resource_id (int): the id of the resource to delete
        is_kill_resource_repeatedly (bool): If True, kill the resource
            repeatedly. If False, delete the resource only once.

    """
    used_percentage = get_percent_used_capacity()
    logging.info(
        f"storageutilization is completed. used capacity = {used_percentage}"
    )

    osd_pods_before = pod_helpers.get_osd_pods()
    number_of_osd_pods_before = len(osd_pods_before)
    d = Disruptions()
    d.set_resource(resource_name)

    self.new_pods_in_status_running = False
    osd_size = storage_cluster.get_osd_size()
    logging.info(f"Adding one new set of OSDs. osd size = {osd_size}")
    storagedeviceset_count = storage_cluster.add_capacity(osd_size)
    logging.info("Adding one new set of OSDs was issued without problems")

    # Wait for the new OSDs to come up. After the first new OSD reaches the Init
    # state, delete the resource. After deleting the resource we expect all the
    # new OSDs to reach the Running state, and the deleted resource to be
    # recreated and Running as well.
    pod_helpers.wait_for_new_osd_pods_to_come_up(number_of_osd_pods_before)
    logging.info(
        f"Delete a {resource_name} pod while storage capacity is getting increased"
    )
    if is_kill_resource_repeatedly:
        with ThreadPoolExecutor() as executor:
            executor.submit(
                self.kill_resource_repeatedly, resource_name, resource_id
            )
            self.wait_for_osd_pods_to_be_running(storagedeviceset_count)
    else:
        d.delete_resource(resource_id)
        self.wait_for_osd_pods_to_be_running(storagedeviceset_count)

    self.new_pods_in_status_running = True
    logging.info(
        "Finished verifying add capacity when one of the pods gets deleted"
    )
    logging.info("Waiting for ceph health check to finish...")
    check_ceph_health_after_add_capacity()
def test_scale_osds_reboot_nodes(
    self, interface, project_factory, multi_pvc_factory, dc_pod_factory
):
    """
    Check storage utilization; if it is below the threshold, run IO.
    Scale OSDs from 3 to 6, check for rebalance and reboot worker nodes.
    """
    current_osd_count = count_cluster_osd()
    proj_obj = project_factory()
    if current_osd_count == 3:
        while not validate_osd_utilization(osd_used=50):
            # Create PVCs
            pvc_objs = multi_pvc_factory(
                project=proj_obj,
                interface=interface,
                size=self.pvc_size,
                num_of_pvc=self.num_of_pvcs,
            )
            dc_pod_objs = []
            for pvc_obj in pvc_objs:
                dc_pod_objs.append(dc_pod_factory(pvc=pvc_obj))
            wait_for_dc_app_pods_to_reach_running_state(dc_pod_objs)
            for pod_obj in dc_pod_objs:
                pod_obj.run_io(
                    storage_type='fs',
                    size='3G',
                    runtime='60',
                    fio_filename=f'{pod_obj.name}_io',
                )

    # Add capacity
    osd_size = storage_cluster.get_osd_size()
    count = storage_cluster.add_capacity(osd_size)
    pod = OCP(kind=constants.POD, namespace=config.ENV_DATA['cluster_namespace'])
    pod.wait_for_resource(
        timeout=300,
        condition=constants.STATUS_RUNNING,
        selector='app=rook-ceph-osd',
        resource_count=count * 3,
    )
    assert ceph_health_check(), "New OSDs failed to reach running state"

    cluster = CephCluster()
    # Get rebalance status
    rebalance_status = cluster.get_rebalance_status()
    logger.info(rebalance_status)
    if rebalance_status:
        time_taken = cluster.time_taken_to_complete_rebalance()
        logger.info(f"The time taken to complete rebalance: {time_taken}")

    # Rolling reboot on worker nodes
    worker_nodes = get_typed_nodes(node_type='worker')
    factory = platform_nodes.PlatformNodesFactory()
    nodes = factory.get_nodes_platform()
    for node in worker_nodes:
        nodes.restart_nodes(nodes=[node])
        wait_for_nodes_status()
    assert ceph_health_check(delay=180), "Failed, Ceph health bad after nodes reboot"
def test_add_capacity_node_restart(
    self,
    nodes,
    multi_pvc_factory,
    pod_factory,
    workload_storageutilization_rbd,
    num_of_nodes,
):
    """
    Test add capacity when one of the worker nodes gets restarted in the
    middle of the process.
    """
    logging.info(
        "Condition 1 to start the test is met: storageutilization is completed"
    )
    # Please note: when the branch 'wip-add-capacity-e_e' is merged into master,
    # the test will include much more data both before and after calling the
    # 'add_capacity' function.
    node_list = get_ocs_nodes(num_of_nodes=num_of_nodes)
    assert node_list, "Condition 2 to start test failed: No node to restart"

    max_osds = 15
    osd_pods_before = pod_helpers.get_osd_pods()
    assert (
        len(osd_pods_before) < max_osds
    ), "Condition 3 to start test failed: The cluster already has the maximum number of OSDs"
    logging.info("All start conditions are met!")

    osd_size = storage_cluster.get_osd_size()
    logging.info("Calling add_capacity function...")
    result = storage_cluster.add_capacity(osd_size)
    if result:
        logging.info("add capacity finished successfully")
    else:
        logging.info("add capacity failed")

    # Restart nodes while additional storage is being added
    logging.info("Restart nodes:")
    logging.info([n.name for n in node_list])
    nodes.restart_nodes(nodes=node_list, wait=True)
    logging.info("Finished restarting the node list")

    # The exit criteria verification conditions here are not complete. When the
    # branch 'wip-add-capacity-e_e' is merged into master, the functions from
    # that branch will be used instead.
    pod = OCP(kind=constants.POD, namespace=config.ENV_DATA["cluster_namespace"])
    pod.wait_for_resource(
        timeout=600,
        condition=constants.STATUS_RUNNING,
        selector="app=rook-ceph-osd",
        resource_count=result * 3,
    )

    # Verify OSDs are encrypted
    if config.ENV_DATA.get("encryption_at_rest"):
        osd_encryption_verification()

    logging.info("Finished verifying add capacity osd storage with node restart")
    logging.info("Waiting for ceph health check to finish...")
    ceph_health_check(namespace=config.ENV_DATA["cluster_namespace"], tries=180)
def test_add_capacity(self):
    """
    Test adding variable capacity to the OSD cluster while IOs are running
    """
    self.ceph_cluster = CephCluster()
    osd_size = storage_cluster.get_osd_size()
    result = storage_cluster.add_capacity(osd_size)
    pod = OCP(kind=constants.POD, namespace=config.ENV_DATA['cluster_namespace'])
    pod.wait_for_resource(
        timeout=300,
        condition=constants.STATUS_RUNNING,
        selector='app=rook-ceph-osd',
        resource_count=result * 3,
    )
    self.ceph_cluster.cluster_health_check(timeout=1200)
def add_capacity_test():
    osd_size = storage_cluster.get_osd_size()
    existing_osd_pods = get_osd_pods()
    existing_osd_pod_names = [pod.name for pod in existing_osd_pods]
    result = storage_cluster.add_capacity(osd_size)

    osd_pods_post_expansion = get_osd_pods()
    osd_pod_names_post_expansion = [pod.name for pod in osd_pods_post_expansion]
    restarted_osds = []
    logger.info(
        "Checking if existing OSD pods were restarted (deleted) post add capacity (bug 1931601)"
    )
    for pod_name in existing_osd_pod_names:
        if pod_name not in osd_pod_names_post_expansion:
            restarted_osds.append(pod_name)
    assert (
        len(restarted_osds) == 0
    ), f"The following OSD pods were restarted (deleted) post add capacity: {restarted_osds}"

    pod = OCP(kind=constants.POD, namespace=config.ENV_DATA["cluster_namespace"])
    pod.wait_for_resource(
        timeout=300,
        condition=constants.STATUS_RUNNING,
        selector="app=rook-ceph-osd",
        resource_count=result * 3,
    )

    # Verify status of rook-ceph-osd-prepare pods. Verifies bug 1769061
    # pod.wait_for_resource(
    #     timeout=300,
    #     condition=constants.STATUS_COMPLETED,
    #     selector=constants.OSD_PREPARE_APP_LABEL,
    #     resource_count=result * 3
    # )
    # Commented these lines as a workaround due to bug 1842500

    # Verify OSDs are encrypted
    if config.ENV_DATA.get("encryption_at_rest"):
        osd_encryption_verification()

    ceph_health_check(namespace=config.ENV_DATA["cluster_namespace"], tries=80)
    ceph_cluster_obj = CephCluster()
    assert ceph_cluster_obj.wait_for_rebalance(
        timeout=5400
    ), "Data re-balance failed to complete"
def test_add_capacity_osd_pod_delete(self, workload_storageutilization_rbd):
    """
    Test add capacity when one of the osd pods gets deleted in the middle of
    the process.
    """
    used_percentage = get_percent_used_capacity()
    logging.info(
        f"storageutilization is completed. used capacity = {used_percentage}"
    )

    max_osds = 15
    osd_pods_before = pod_helpers.get_osd_pods()
    number_of_osd_pods_before = len(osd_pods_before)
    if number_of_osd_pods_before >= max_osds:
        pytest.skip("The cluster already has the maximum number of OSDs")

    d = Disruptions()
    d.set_resource('osd')
    osd_size = storage_cluster.get_osd_size()
    logging.info(f"Adding one new set of OSDs. osd size = {osd_size}")
    storagedeviceset_count = storage_cluster.add_capacity(osd_size)
    logging.info("Adding one new set of OSDs was issued without problems")

    # The OSD count goes down by one and then gradually comes back up by one,
    # until it finally reaches storagedeviceset_count * 3
    pod_helpers.wait_for_new_osd_pods_to_come_up(number_of_osd_pods_before)
    logging.info("Delete an osd pod while storage capacity is getting increased")
    d.delete_resource(1)

    pod = OCP(kind=constants.POD, namespace=config.ENV_DATA['cluster_namespace'])
    pod.wait_for_resource(
        timeout=420,
        condition=constants.STATUS_RUNNING,
        selector='app=rook-ceph-osd',
        resource_count=storagedeviceset_count * 3,
    )
    logging.info(
        "Finished verifying add capacity when one of the osd pods gets deleted"
    )
    logging.info("Waiting for ceph health check to finish...")
    ceph_health_check(namespace=config.ENV_DATA['cluster_namespace'], tries=80)
def add_capacity_test():
    osd_size = storage_cluster.get_osd_size()
    result = storage_cluster.add_capacity(osd_size)
    pod = OCP(kind=constants.POD, namespace=config.ENV_DATA['cluster_namespace'])
    pod.wait_for_resource(
        timeout=300,
        condition=constants.STATUS_RUNNING,
        selector='app=rook-ceph-osd',
        resource_count=result * 3,
    )

    # Verify status of rook-ceph-osd-prepare pods. Verifies bug 1769061
    # pod.wait_for_resource(
    #     timeout=300,
    #     condition=constants.STATUS_COMPLETED,
    #     selector=constants.OSD_PREPARE_APP_LABEL,
    #     resource_count=result * 3
    # )
    # Commented these lines as a workaround due to bug 1842500

    ceph_health_check(namespace=config.ENV_DATA['cluster_namespace'], tries=80)
def test_add_capacity(self):
    """
    Test adding variable capacity to the OSD cluster while IOs are running
    """
    osd_size = storage_cluster.get_osd_size()
    result = storage_cluster.add_capacity(osd_size)
    pod = OCP(kind=constants.POD, namespace=config.ENV_DATA['cluster_namespace'])
    pod.wait_for_resource(
        timeout=300,
        condition=constants.STATUS_RUNNING,
        selector='app=rook-ceph-osd',
        resource_count=result * 3,
    )
    # Verify status of rook-ceph-osd-prepare pods. Verifies bug 1769061
    pod.wait_for_resource(
        timeout=300,
        condition=constants.STATUS_COMPLETED,
        selector=constants.OSD_PREPARE_APP_LABEL,
        resource_count=result * 3,
    )
    ceph_health_check(namespace=config.ENV_DATA['cluster_namespace'], tries=80)
def test_base_operation_node_drain(
    self,
    node_drain_teardown,
    node_restart_teardown,
    nodes,
    pgsql_factory_fixture,
    project_factory,
    multi_pvc_factory,
    mcg_obj,
    bucket_factory,
):
    """
    Test covers the following flow of operations while running workloads in
    the background:
    1. Node drain
    2. Add capacity
    3. Node reboot
    4. Node n/w failure
    """
    logger.info("Starting IO operations in Background")
    project = project_factory()
    bg_handler = flowtest.BackgroundOps()
    executor_run_bg_ios_ops = ThreadPoolExecutor(max_workers=3)

    pgsql_workload = executor_run_bg_ios_ops.submit(
        bg_handler.handler,
        pgsql_factory_fixture,
        replicas=1,
        clients=1,
        transactions=100,
        timeout=100,
        iterations=1,
    )
    logging.info("Started pgsql workload in background")

    flow_ops = flowtest.FlowOperations()
    obc_ios = executor_run_bg_ios_ops.submit(
        bg_handler.handler,
        flow_ops.sanity_helpers.obc_put_obj_create_delete,
        mcg_obj,
        bucket_factory,
        iterations=30,
    )
    logging.info("Started object IOs in background")

    pvc_create_delete = executor_run_bg_ios_ops.submit(
        bg_handler.handler,
        flow_ops.sanity_helpers.create_pvc_delete,
        multi_pvc_factory,
        project,
        iterations=70,
    )
    logging.info("Started pvc create and delete in background")

    logger.info("Starting operation 1: Node Drain")
    node_name = flow_ops.node_operations_entry_criteria(
        node_type="worker", number_of_nodes=1, operation_name="Node Drain"
    )
    # Node maintenance - to gracefully terminate all pods on the node
    node.drain_nodes([node_name[0].name])
    # Make the node schedulable again
    node.schedule_nodes([node_name[0].name])
    logger.info("Verifying exit criteria for operation 1: Node Drain")
    flow_ops.validate_cluster(
        node_status=True, pod_status=True, operation_name="Node Drain"
    )

    logger.info("Starting operation 2: Add Capacity")
    osd_pods_before, restart_count_before = flow_ops.add_capacity_entry_criteria()
    # Add capacity
    osd_size = storage_cluster.get_osd_size()
    result = storage_cluster.add_capacity(osd_size)
    pod = OCP(kind=constants.POD, namespace=config.ENV_DATA["cluster_namespace"])
    if is_flexible_scaling_enabled():
        replica_count = 1
    else:
        replica_count = 3
    pod.wait_for_resource(
        timeout=300,
        condition=constants.STATUS_RUNNING,
        selector="app=rook-ceph-osd",
        resource_count=result * replica_count,
    )
    logger.info("Verifying exit criteria for operation 2: Add Capacity")
    flow_ops.add_capacity_exit_criteria(restart_count_before, osd_pods_before)

    logger.info("Starting operation 3: Node Restart")
    node_name = flow_ops.node_operations_entry_criteria(
        node_type="worker", number_of_nodes=1, operation_name="Node Restart"
    )
    # Node failure (reboot)
    nodes.restart_nodes(nodes=node_name)
    logger.info("Verifying exit criteria for operation 3: Node Restart")
    flow_ops.validate_cluster(
        node_status=True, pod_status=True, operation_name="Node Restart"
    )

    logger.info("Starting operation 4: Node network fail")
    node_name, nw_fail_time = flow_ops.node_operations_entry_criteria(
        node_type="worker",
        number_of_nodes=1,
        network_fail_time=300,
        operation_name="Node N/W failure",
    )
    # Node n/w interface failure
    node.node_network_failure(node_name[0].name)
    logger.info(f"Waiting for {nw_fail_time} seconds")
    sleep(nw_fail_time)
    # Reboot the unresponsive node(s)
    logger.info(f"Stop and start the unresponsive node(s): {node_name[0].name}")
    nodes.restart_nodes_by_stop_and_start(nodes=node_name)
    logger.info("Verifying exit criteria for operation 4: Node network fail")
    flow_ops.validate_cluster(
        node_status=True, pod_status=True, operation_name="Node N/W failure"
    )

    logger.info(
        "Waiting for final iteration of background operations to be completed"
    )
    bg_ops = [pvc_create_delete, obc_ios, pgsql_workload]
    bg_handler.wait_for_bg_operations(bg_ops, timeout=600)
def test_non_ocs_taint_and_tolerations(self):
    """
    Test runs the following steps
    1. Taint ocs nodes with non-ocs taint
    2. Set tolerations on storagecluster, subscription, configmap and ocsinit
    3. Respin all ocs pods and check if they run on ocs nodes with tolerations
    4. Add capacity
    """
    # Taint all nodes with non-ocs taint
    ocs_nodes = get_worker_nodes()
    taint_nodes(nodes=ocs_nodes, taint_label="xyz=true:NoSchedule")

    # Add tolerations to the storagecluster
    storagecluster_obj = ocp.OCP(
        resource_name=constants.DEFAULT_CLUSTERNAME,
        namespace=defaults.ROOK_CLUSTER_NAMESPACE,
        kind=constants.STORAGECLUSTER,
    )
    tolerations = (
        '{"tolerations": [{"effect": "NoSchedule", "key": "xyz",'
        '"operator": "Equal", "value": "true"}, '
        '{"effect": "NoSchedule", "key": "node.ocs.openshift.io/storage", '
        '"operator": "Equal", "value": "true"}]}'
    )
    param = (
        f'{{"spec": {{"placement": {{"all": {tolerations}, "mds": {tolerations}, '
        f'"noobaa-core": {tolerations}, "rgw": {tolerations}}}}}}}'
    )
    storagecluster_obj.patch(params=param, format_type="merge")

    # Add tolerations to the subscription
    sub_list = ocp.get_all_resource_names_of_a_kind(kind=constants.SUBSCRIPTION)
    param = (
        '{"spec": {"config": {"tolerations": '
        '[{"effect": "NoSchedule", "key": "xyz", "operator": "Equal", '
        '"value": "true"}]}}}'
    )
    for sub in sub_list:
        sub_obj = ocp.OCP(
            resource_name=sub,
            namespace=defaults.ROOK_CLUSTER_NAMESPACE,
            kind=constants.SUBSCRIPTION,
        )
        sub_obj.patch(params=param, format_type="merge")

    # Add tolerations to the ocsinitializations.ocs.openshift.io
    param = (
        '{"spec": {"tolerations": '
        '[{"effect": "NoSchedule", "key": "xyz", "operator": "Equal", '
        '"value": "true"}]}}'
    )
    ocsini_obj = ocp.OCP(
        resource_name=constants.OCSINIT,
        namespace=defaults.ROOK_CLUSTER_NAMESPACE,
        kind=constants.OCSINITIALIZATION,
    )
    ocsini_obj.patch(params=param, format_type="merge")

    # Add tolerations to the configmap rook-ceph-operator-config
    configmap_obj = ocp.OCP(
        kind=constants.CONFIGMAP,
        namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
        resource_name=constants.ROOK_OPERATOR_CONFIGMAP,
    )
    toleration = configmap_obj.get().get("data").get("CSI_PLUGIN_TOLERATIONS")
    toleration += (
        '\n- key: xyz\n  operator: Equal\n  value: "true"\n  effect: NoSchedule'
    )
    # Escape quotes and newlines so the YAML string can be embedded in the JSON
    # patch (a standalone sketch of this escaping step follows this function)
    toleration = toleration.replace('"', '\\"').replace("\n", "\\n")
    param_cmd = (
        f'[{{"op": "replace", "path": "/data/CSI_PLUGIN_TOLERATIONS", "value": "{toleration}" }}, '
        f'{{"op": "replace", "path": "/data/CSI_PROVISIONER_TOLERATIONS", "value": "{toleration}" }}]'
    )
    configmap_obj.patch(params=param_cmd, format_type="json")

    # A few pod respins are expected after the edit
    assert wait_for_pods_to_be_running(timeout=600, sleep=15)

    # Respin all pods and check if they are still running
    pod_list = get_all_pods(namespace=defaults.ROOK_CLUSTER_NAMESPACE)
    for pod in pod_list:
        pod.delete(wait=False)
    assert wait_for_pods_to_be_running(timeout=600, sleep=15)
    self.sanity_helpers.health_check()

    # Add capacity to check whether the new OSDs have the toleration
    osd_size = storage_cluster.get_osd_size()
    count = storage_cluster.add_capacity(osd_size)
    pod = ocp.OCP(kind=constants.POD, namespace=config.ENV_DATA["cluster_namespace"])
    if is_flexible_scaling_enabled():
        replica_count = 1
    else:
        replica_count = 3
    assert pod.wait_for_resource(
        timeout=300,
        condition=constants.STATUS_RUNNING,
        selector=constants.OSD_APP_LABEL,
        resource_count=count * replica_count,
    ), "New OSDs failed to reach running state"

    check_ceph_health_after_add_capacity(ceph_rebalance_timeout=2500)
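# The double-escaping above is easy to misread, so here is a minimal standalone
# sketch of that step, not part of the test itself. The initial YAML value is an
# assumption for illustration only; in the test it is read from the
# rook-ceph-operator-config configmap.
existing_yaml = (
    '- key: node.ocs.openshift.io/storage\n  operator: Equal\n'
    '  value: "true"\n  effect: NoSchedule'
)
# Append the extra toleration as YAML, then escape quotes and newlines so the
# whole multi-line string can be embedded as a value inside a JSON patch.
extended = existing_yaml + (
    '\n- key: xyz\n  operator: Equal\n  value: "true"\n  effect: NoSchedule'
)
escaped = extended.replace('"', '\\"').replace("\n", "\\n")
print(escaped)  # one line, safe to place inside '"value": "..."' of the patch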
def test_add_capacity(
    self,
    project_factory,
    multi_dc_pod,
    multi_pvc_factory,
    pod_factory,
    mcg_obj,
    awscli_pod,
    bucket_factory,
    percent_to_fill,
):
    #####################################
    #          ENTRY CRITERIA           #
    #####################################
    # Prepare the initial configuration: logger, cluster filling, loop for
    # creating & deleting PVCs and pods, noobaa IOs etc.

    # Perform health checks:
    # Make sure the cluster is healthy
    assert ceph_health_check(
        defaults.ROOK_CLUSTER_NAMESPACE
    ), "Entry criteria FAILED: Cluster is Unhealthy"

    # All OCS pods are in running state:
    # ToDo https://github.com/red-hat-storage/ocs-ci/issues/2361
    assert (
        pod_helpers.check_pods_in_running_state()
    ), "Entry criteria FAILED: one or more OCS pods are not in running state"

    # Create the namespace under which this test will execute:
    project = project_factory()

    # Total PVCs created will be 'num_of_pvcs' * 4 types of PVCs
    # (rbd-rwo, rwx & cephfs-rwo, rwx)
    num_of_pvcs = 40

    rwo_rbd_pods = multi_dc_pod(
        num_of_pvcs=num_of_pvcs,
        pvc_size=175,
        project=project,
        access_mode="RWO",
        pool_type="rbd",
        timeout=360,
    )
    # Note: Skipping cephfs pods creation
    # observing bug https://bugzilla.redhat.com/show_bug.cgi?id=1785399,
    # https://bugzilla.redhat.com/show_bug.cgi?id=1779421#c14
    # Todo: https://github.com/red-hat-storage/ocs-ci/issues/2360

    # Create rwx-rbd pods
    pods_ios_rwx_rbd = multi_dc_pod(
        num_of_pvcs=10,
        pvc_size=175,
        project=project,
        access_mode="RWX-BLK",
        pool_type="rbd",
        timeout=360,
    )

    cluster_fill_io_pods = rwo_rbd_pods
    logger.info("The DC pods are up. Running IOs from them to fill the cluster")
    filler = cluster_exp_helpers.ClusterFiller(
        cluster_fill_io_pods, percent_to_fill, project.namespace
    )
    assert filler.cluster_filler(), "IOs failed"

    # Create a separate threadpool for running IOs in the background
    executor_run_bg_ios_ops = ThreadPoolExecutor()
    bg_wrap = cluster_exp_helpers.BackgroundOps()
    status_cluster_ios = []
    pods_for_copy = rwo_rbd_pods[0:5] + pods_ios_rwx_rbd

    for p in pods_for_copy:
        logger.info(f"running IOs on {p.name}")
        if p.pod_type == "rbd_block_rwx":
            status_cluster_ios.append(
                executor_run_bg_ios_ops.submit(
                    bg_wrap.wrap, cluster_exp_helpers.raw_block_io, p, iterations=10
                )
            )
        else:
            status_cluster_ios.append(
                executor_run_bg_ios_ops.submit(
                    bg_wrap.wrap,
                    cluster_exp_helpers.cluster_copy_ops,
                    p,
                    iterations=200,
                )
            )

    # Start PVC ops in the background
    logger.info("Started pvc create delete operations")
    executor_run_bg_ios_ops.submit(
        bg_wrap.wrap,
        test_create_delete_pvcs,
        multi_pvc_factory,
        pod_factory,
        project,
        iterations=200,
    )

    # Start NooBaa IOs in the background
    logger.info("Started s3_io_create_delete...")
    executor_run_bg_ios_ops.submit(
        bg_wrap.wrap,
        s3_io_create_delete,
        mcg_obj,
        awscli_pod,
        bucket_factory,
        iterations=200,
    )
    logger.info("Started obc_io_create_delete...")
    executor_run_bg_ios_ops.submit(
        bg_wrap.wrap,
        obc_io_create_delete,
        mcg_obj,
        awscli_pod,
        bucket_factory,
        iterations=200,
    )

    # All ocs nodes are in Ready state (including master):
    executor_run_bg_ios_ops.submit(
        bg_wrap.wrap, cluster_exp_helpers.check_nodes_status, iterations=100
    )

    # Get restart count of ocs pods before expansion
    restart_count_before = pod_helpers.get_pod_restarts_count(
        defaults.ROOK_CLUSTER_NAMESPACE
    )

    # Get osd pods before expansion
    osd_pods_before = pod_helpers.get_osd_pods()

    # Get the total space in the cluster before expansion
    ct_pod = pod_helpers.get_ceph_tools_pod()
    output = ct_pod.exec_ceph_cmd(ceph_cmd="ceph osd df")
    total_space_b4_expansion = int(output.get("summary").get("total_kb"))
logger.info(f"total_space_b4_expansion == {total_space_b4_expansion}") logger.info("############## Calling add_capacity $$$$$$$$$$") ##################### # Call add_capacity # ##################### osd_size = storage_cluster.get_osd_size() result = storage_cluster.add_capacity(osd_size) pod = OCP(kind=constants.POD, namespace=config.ENV_DATA["cluster_namespace"]) # New osd (all) pods corresponding to the additional capacity should be # in running state pod.wait_for_resource( timeout=1200, condition=constants.STATUS_RUNNING, selector="app=rook-ceph-osd", resource_count=result * 3, ) ################################# # Exit criteria verification: # ################################# cluster_exp_helpers.BackgroundOps.EXPANSION_COMPLETED = True # No ocs pods should get restarted unexpectedly # Get restart count of ocs pods after expansion and see any pods got # restated restart_count_after = pod_helpers.get_pod_restarts_count( defaults.ROOK_CLUSTER_NAMESPACE ) # # # TO DO # # Handle Bug 1814254 - All Mons respinned during add capacity and OSDs took longtime to come up # # implement function to make sure no pods are respun after expansion logger.info( f"sum(restart_count_before.values()) = {sum(restart_count_before.values())}" ) logger.info( f" sum(restart_count_after.values()) = {sum(restart_count_after.values())}" ) assert sum(restart_count_before.values()) == sum( restart_count_after.values() ), "Exit criteria verification FAILED: One or more pods got restarted" logger.info("Exit criteria verification Success: No pods were restarted") # Make sure right number of OSDs are added: # Get osd pods after expansion osd_pods_after = pod_helpers.get_osd_pods() number_of_osds_added = len(osd_pods_after) - len(osd_pods_before) logger.info( f"### number_of_osds_added = {number_of_osds_added}, " f"before = {len(osd_pods_before)}, after = {len(osd_pods_after) }" ) # If the difference b/w updated count of osds and old osd count is not # 3 then expansion failed assert ( number_of_osds_added == 3 ), "Exit criteria verification FAILED: osd count mismatch" logger.info( "Exit criteria verification Success: Correct number of OSDs are added" ) # The newly added capacity takes into effect at the storage level ct_pod = pod_helpers.get_ceph_tools_pod() output = ct_pod.exec_ceph_cmd(ceph_cmd="ceph osd df") total_space_after_expansion = int(output.get("summary").get("total_kb")) osd_size = int(output.get("nodes")[0].get("kb")) expanded_space = osd_size * 3 # 3 OSDS are added of size = 'osd_size' logger.info(f"space output == {output} ") logger.info(f"osd size == {osd_size} ") logger.info(f"total_space_after_expansion == {total_space_after_expansion} ") expected_total_space_after_expansion = total_space_b4_expansion + expanded_space logger.info( f"expected_total_space_after_expansion == {expected_total_space_after_expansion} " ) assert ( total_space_after_expansion == expected_total_space_after_expansion ), "Exit criteria verification FAILED: Expected capacity mismatch" logger.info( "Exit criteria verification Success: Newly added capacity took into effect" ) logger.info("Exit criteria verification Success: IOs completed successfully") # 'ceph osd tree' should show the new osds under right nodes/hosts # Verification is different for 3 AZ and 1 AZ configs ct_pod = pod_helpers.get_ceph_tools_pod() tree_output = ct_pod.exec_ceph_cmd(ceph_cmd="ceph osd tree") logger.info(f"### OSD tree output = {tree_output}") if config.ENV_DATA["platform"].lower() == constants.VSPHERE_PLATFORM: assert 
            tree_output, len(osd_pods_after)
        ), "Exit criteria verification FAILED: Incorrect ceph osd tree formation found"

    aws_number_of_zones = 3
    if config.ENV_DATA["platform"].lower() == constants.AWS_PLATFORM:
        # Parse the osd tree. If it contains a 'rack' node then it is an
        # AWS 1 AZ cluster; otherwise it is an AWS 3 AZ cluster.
        for i in range(len(tree_output["nodes"])):
            if tree_output["nodes"][i]["name"] in "rack0":
                aws_number_of_zones = 1
        if aws_number_of_zones == 1:
            assert cluster_helpers.check_osd_tree_1az_aws(
                tree_output, len(osd_pods_after)
            ), "Exit criteria verification FAILED: Incorrect ceph osd tree formation found"
        else:
            assert cluster_helpers.check_osd_tree_3az_aws(
                tree_output, len(osd_pods_after)
            ), "Exit criteria verification FAILED: Incorrect ceph osd tree formation found"
    logger.info("Exit criteria verification Success: osd tree verification success")
    # (An illustrative sketch of this kind of osd tree check follows this function.)

    # Make sure new pvcs and pods can be created and IOs can be run from the pods
    num_of_pvcs = 1
    rwo_rbd_pods = multi_dc_pod(
        num_of_pvcs=num_of_pvcs,
        pvc_size=5,
        project=project,
        access_mode="RWO",
        pool_type="rbd",
    )
    rwo_cephfs_pods = multi_dc_pod(
        num_of_pvcs=num_of_pvcs,
        pvc_size=5,
        project=project,
        access_mode="RWO",
        pool_type="cephfs",
    )
    rwx_cephfs_pods = multi_dc_pod(
        num_of_pvcs=num_of_pvcs,
        pvc_size=5,
        project=project,
        access_mode="RWX",
        pool_type="cephfs",
    )
    # Create rwx-rbd pods
    pods_ios_rwx_rbd = multi_dc_pod(
        num_of_pvcs=num_of_pvcs,
        pvc_size=5,
        project=project,
        access_mode="RWX-BLK",
        pool_type="rbd",
    )
    cluster_io_pods = (
        rwo_rbd_pods + rwo_cephfs_pods + rwx_cephfs_pods + pods_ios_rwx_rbd
    )

    with ThreadPoolExecutor() as pod_ios_executor:
        for p in cluster_io_pods:
            if p.pod_type == "rbd_block_rwx":
                logger.info(f"Calling block fio on pod {p.name}")
                pod_ios_executor.submit(cluster_exp_helpers.raw_block_io, p, "100M")
            else:
                logger.info(f"calling file fio on pod {p.name}")
                pod_ios_executor.submit(p.run_io, "fs", "100M")

    for pod_io in cluster_io_pods:
        pod_helpers.get_fio_rw_iops(pod_io)

    # Verify OSDs are encrypted
    if config.ENV_DATA.get("encryption_at_rest"):
        osd_encryption_verification()

    cluster_obj = cluster_helpers.CephCluster()
    assert (
        cluster_obj.get_ceph_health() != "HEALTH_ERR"
    ), "Ceph cluster health checking failed"
    logger.info("ALL Exit criteria verification completed successfully")
    logger.info(
        "********************** TEST PASSED *********************************"
    )
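# Illustrative only: a minimal sketch of the kind of OSD-tree validation that
# the cluster_helpers.check_osd_tree_* calls above perform. This is NOT the
# ocs-ci implementation; it only assumes the standard JSON layout of
# 'ceph osd tree' ({"nodes": [{"id", "name", "type", "children", ...}]}) and
# checks that the expected number of OSDs exists and that they are spread
# evenly across the hosts in the tree.
def osds_evenly_spread_across_hosts(tree_output, expected_osd_count):
    # Index every CRUSH node by id so children ids can be resolved to nodes
    nodes_by_id = {n["id"]: n for n in tree_output["nodes"]}
    osds_per_host = {}
    for node in tree_output["nodes"]:
        if node["type"] == "host":
            osd_children = [
                child_id
                for child_id in node.get("children", [])
                if nodes_by_id[child_id]["type"] == "osd"
            ]
            osds_per_host[node["name"]] = len(osd_children)
    # The total number of OSDs under hosts should match the expected count
    if sum(osds_per_host.values()) != expected_osd_count:
        return False
    # Every host should carry the same number of OSDs
    return len(set(osds_per_host.values())) == 1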