def test_base_operation_node_drain(
    self,
    node_drain_teardown,
    node_restart_teardown,
    nodes,
    pgsql_factory_fixture,
    project_factory,
    multi_pvc_factory,
    mcg_obj,
    bucket_factory,
):
    """
    Test covers the following flow operations while running workloads
    in the background:
    1. Node drain
    2. Add capacity
    3. Node reboot
    4. Node n/w failure
    """
    logger.info("Starting IO operations in background")
    project = project_factory()
    bg_handler = flowtest.BackgroundOps()
    executor_run_bg_ios_ops = ThreadPoolExecutor(max_workers=3)

    pgsql_workload = executor_run_bg_ios_ops.submit(
        bg_handler.handler,
        pgsql_factory_fixture,
        replicas=1,
        clients=1,
        transactions=100,
        timeout=100,
        iterations=1,
    )
    logger.info("Started pgsql workload in background")

    flow_ops = flowtest.FlowOperations()

    obc_ios = executor_run_bg_ios_ops.submit(
        bg_handler.handler,
        flow_ops.sanity_helpers.obc_put_obj_create_delete,
        mcg_obj,
        bucket_factory,
        iterations=30,
    )
    logger.info("Started object IOs in background")

    pvc_create_delete = executor_run_bg_ios_ops.submit(
        bg_handler.handler,
        flow_ops.sanity_helpers.create_pvc_delete,
        multi_pvc_factory,
        project,
        iterations=70,
    )
    logger.info("Started PVC create and delete in background")

    logger.info("Starting operation 1: Node Drain")
    node_name = flow_ops.node_operations_entry_criteria(
        node_type="worker", number_of_nodes=1, operation_name="Node Drain"
    )
    # Node maintenance - to gracefully terminate all pods on the node
    node.drain_nodes([node_name[0].name])
    # Make the node schedulable again
    node.schedule_nodes([node_name[0].name])
    logger.info("Verifying exit criteria for operation 1: Node Drain")
    flow_ops.validate_cluster(
        node_status=True, pod_status=True, operation_name="Node Drain"
    )

    logger.info("Starting operation 2: Add Capacity")
    osd_pods_before, restart_count_before = flow_ops.add_capacity_entry_criteria()
    # Add capacity
    osd_size = storage_cluster.get_osd_size()
    result = storage_cluster.add_capacity(osd_size)
    pod = OCP(kind=constants.POD, namespace=config.ENV_DATA["cluster_namespace"])
    if is_flexible_scaling_enabled():
        replica_count = 1
    else:
        replica_count = 3
    pod.wait_for_resource(
        timeout=300,
        condition=constants.STATUS_RUNNING,
        selector="app=rook-ceph-osd",
        resource_count=result * replica_count,
    )
    logger.info("Verifying exit criteria for operation 2: Add Capacity")
    flow_ops.add_capacity_exit_criteria(restart_count_before, osd_pods_before)

    logger.info("Starting operation 3: Node Restart")
    node_name = flow_ops.node_operations_entry_criteria(
        node_type="worker", number_of_nodes=1, operation_name="Node Restart"
    )
    # Node failure (reboot)
    nodes.restart_nodes(nodes=node_name)
    logger.info("Verifying exit criteria for operation 3: Node Restart")
    flow_ops.validate_cluster(
        node_status=True, pod_status=True, operation_name="Node Restart"
    )

    logger.info("Starting operation 4: Node network fail")
    node_name, nw_fail_time = flow_ops.node_operations_entry_criteria(
        node_type="worker",
        number_of_nodes=1,
        network_fail_time=300,
        operation_name="Node N/W failure",
    )
    # Node n/w interface failure
    node.node_network_failure(node_name[0].name)
    logger.info(f"Waiting for {nw_fail_time} seconds")
    sleep(nw_fail_time)
    # Reboot the unresponsive node(s)
    logger.info(f"Stop and start the unresponsive node(s): {node_name[0].name}")
    nodes.restart_nodes_by_stop_and_start(nodes=node_name)
    logger.info("Verifying exit criteria for operation 4: Node network fail")
    flow_ops.validate_cluster(
        node_status=True, pod_status=True, operation_name="Node N/W failure"
    )

    logger.info(
        "Waiting for final iteration of background operations to be completed"
    )
    bg_ops = [pvc_create_delete, obc_ios, pgsql_workload]
    bg_handler.wait_for_bg_operations(bg_ops, timeout=600)
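
# The pattern above (submit a handler that loops an operation for a fixed
# number of iterations, run disruptive operations in the foreground, then
# wait on the futures) can be reproduced with a minimal standard-library
# sketch. `run_workload` and the iteration counts below are hypothetical
# stand-ins for the ocs-ci background handlers, not the real API.
from concurrent.futures import ThreadPoolExecutor, wait


def run_workload(name, iterations):
    """Hypothetical workload: repeat a unit of work a fixed number of times."""
    for i in range(iterations):
        print(f"{name}: iteration {i + 1}/{iterations}")


def run_in_background():
    executor = ThreadPoolExecutor(max_workers=3)
    futures = [
        executor.submit(run_workload, "pgsql", 1),
        executor.submit(run_workload, "obc-io", 30),
        executor.submit(run_workload, "pvc-create-delete", 70),
    ]
    # Foreground operations (drain, add capacity, restart...) would run here.
    done, not_done = wait(futures, timeout=600)
    assert not not_done, f"{len(not_done)} background operations timed out"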
def test_node_replacement_reactive_aws_ipi(
    self,
    nodes,
    pvc_factory,
    pod_factory,
    dc_pod_factory,
    failure,
    interface,
    bucket_factory,
    rgw_bucket_factory,
):
    """
    Knip-894 Node replacement - AWS-IPI-Reactive
    """
    # Get worker nodes
    initial_nodes = get_worker_nodes()

    # Get OSD running nodes
    osd_running_nodes = get_osd_running_nodes()
    log.info(f"OSDs are running on nodes {osd_running_nodes}")

    # Label osd nodes with fedora app
    label_worker_node(osd_running_nodes, label_key="dc", label_value="fedora")

    # Create DC app pods
    log.info("Creating DC based app pods")
    if interface == "rbd":
        interface = constants.CEPHBLOCKPOOL
    elif interface == "cephfs":
        interface = constants.CEPHFILESYSTEM
    dc_pod_obj = []
    for i in range(2):
        dc_pod = dc_pod_factory(interface=interface, node_selector={"dc": "fedora"})
        pod.run_io_in_bg(dc_pod, fedora_dc=True)
        dc_pod_obj.append(dc_pod)

    # Get app pods running nodes
    dc_pod_node_name = get_app_pod_running_nodes(dc_pod_obj)
    log.info(f"DC app pod running nodes are {dc_pod_node_name}")

    # Get both osd and app pod running node
    common_nodes = get_both_osd_and_app_pod_running_node(
        osd_running_nodes, dc_pod_node_name
    )
    log.info(f"Both OSD and app pods are running on nodes {common_nodes}")

    # Get the machine name using the node name
    machine_name = machine.get_machine_from_node_name(common_nodes[0])
    log.info(f"{common_nodes[0]} associated machine is {machine_name}")

    # Get the machineset name using the machine name
    machineset_name = machine.get_machineset_from_machine_name(machine_name)
    log.info(f"{common_nodes[0]} associated machineset is {machineset_name}")

    # Get the failure node obj
    failure_node_obj = get_node_objs(node_names=[common_nodes[0]])

    # Induce failure on the selected failure node
    log.info(f"Inducing failure on node {failure_node_obj[0].name}")
    if failure == "power off":
        # Power off AWS worker node instance
        nodes.stop_nodes(failure_node_obj, wait=True)
        log.info(f"Successfully powered off node: {failure_node_obj[0].name}")
    elif failure == "network failure":
        # Induce network failure
        node_network_failure([failure_node_obj[0].name])

    # Annotate the failed machine so that its (unreachable) node is not drained
    annotation = "machine.openshift.io/exclude-node-draining=''"
    machine.add_annotation_to_machine(
        annotation=annotation, machine_name=machine_name
    )

    # Delete the machine
    machine.delete_machine(machine_name)
    log.info(f"Successfully deleted machine {machine_name}")

    # Wait for the new machine to spin up
    log.info("Waiting for the new node to be in Ready state")
    machine.wait_for_new_node_to_be_ready(machineset_name)

    # Get the name of the newly spun node
    nodes_after_new_spun_node = get_worker_nodes()
    new_spun_node = list(set(nodes_after_new_spun_node) - set(initial_nodes))
    log.info(f"Newly spun node is {new_spun_node}")

    # Label it
    node_obj = ocp.OCP(kind="node")
    node_obj.add_label(
        resource_name=new_spun_node[0], label=constants.OPERATOR_NODE_LABEL
    )
    log.info(f"Successfully labeled {new_spun_node} with OCS storage label")

    # DC app pods on the failed node will get automatically created on another
    # running node. Wait for all dc app pods to reach Running state.
    pod.wait_for_dc_app_pods_to_reach_running_state(dc_pod_obj, timeout=1200)
    log.info("All the dc pods reached running state")
    pod.wait_for_storage_pods()

    # Check basic cluster functionality by creating resources
    # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
    # run IO and delete the resources
    self.sanity_helpers.create_resources(
        pvc_factory, pod_factory, bucket_factory, rgw_bucket_factory
    )
    self.sanity_helpers.delete_resources()

    # Perform cluster and Ceph health checks
    self.sanity_helpers.health_check()
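
# Under the hood, reactive node replacement on an IPI cluster leans on the
# Machine API: find the Machine backing the failed Node, annotate it so the
# machine controller skips draining the unreachable node, delete it, and let
# the MachineSet spin up a replacement. A minimal sketch using the `oc` CLI
# via subprocess; the annotation lookup relies on the standard
# machine.openshift.io/machine annotation that the Machine API sets on nodes,
# and the jsonpath escaping is believed correct but untested here.
import subprocess


def replace_failed_node(node_name):
    ns = "openshift-machine-api"
    # A node's backing Machine is recorded as "<namespace>/<machine-name>"
    machine = subprocess.check_output(
        [
            "oc", "get", "node", node_name, "-o",
            "jsonpath={.metadata.annotations.machine\\.openshift\\.io/machine}",
        ],
        text=True,
    ).split("/")[-1]
    # Skip draining the unreachable node, then delete the machine
    subprocess.run(
        ["oc", "-n", ns, "annotate", "machine", machine,
         "machine.openshift.io/exclude-node-draining="],
        check=True,
    )
    subprocess.run(["oc", "-n", ns, "delete", "machine", machine], check=True)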
def test_rwo_pvc_fencing_node_prolonged_and_short_network_failure(
    self, nodes, setup, node_restart_teardown
):
    """
    OCS-1431/OCS-1436:
    - Start DeploymentConfig based app pods on 1 node
    - Make the node (where app pods are running) unresponsive
        by bringing its main network interface down
    - Disrupt the leader provisioner pods if not running on above selected
        node
    - Check new app pods and/or mon, osd pods scheduled on another node
        are stuck due to Multi-Attach error.
    - Power off the unresponsive node
    - Force delete the app pods and/or mon,osd pods on the unresponsive node
    - Check new app pods and/or mon, osd pods scheduled on another node come
        into Running state
    - Run IOs on new app pods
    - Again make the node (where app pods are running) unresponsive
        by bringing its main network interface down
    - Check new app pods scheduled on another node are stuck due to
        Multi-Attach error.
    - Reboot the unresponsive node
    - When unresponsive node recovers, run IOs on new app pods
    """
    ceph_cluster, dc_pods, ceph_pods, app_pod_nodes, test_nodes, disruptor = setup

    external_mode = helpers.storagecluster_independent_check()
    extra_nodes = list(set(test_nodes) - set(app_pod_nodes))
    helpers.remove_label_from_worker_node(
        node_list=extra_nodes[:-1], label_key="nodetype"
    )

    # Run IO on pods
    md5sum_data = self.run_and_verify_io(
        pod_list=dc_pods, fio_filename="io_file1", run_io_in_bg=True
    )

    # Disrupt leader plugin-provisioner pods, skip if running on node to be failed
    if disruptor:
        [disruption.delete_resource() for disruption in disruptor]

    # Induce network failure on the nodes
    node.node_network_failure(app_pod_nodes)
    logger.info(f"Waiting for {self.prolong_nw_fail_time} seconds")
    sleep(self.prolong_nw_fail_time)

    # Wait for pods to be rescheduled
    for pod_obj in dc_pods + ceph_pods:
        pod_obj.ocp.wait_for_resource(
            condition=constants.STATUS_TERMINATING, resource_name=pod_obj.name
        )

    # Fetch info of new pods and verify Multi-Attach error
    new_dc_pods = self.get_new_pods(dc_pods)
    assert len(new_dc_pods) == len(dc_pods), "Unexpected number of app pods"
    self.verify_multi_attach_error(new_dc_pods)

    new_ceph_pods = []
    if ceph_pods:
        new_ceph_pods = self.get_new_pods(ceph_pods)
        assert len(new_ceph_pods) > 0, "Unexpected number of osd pods"
        self.verify_multi_attach_error(new_ceph_pods)

    logger.info("Executing manual recovery steps")
    # Power off the unresponsive node
    logger.info(f"Powering off the unresponsive node: {app_pod_nodes}")
    nodes.stop_nodes(node.get_node_objs(app_pod_nodes))

    # Force delete the app pods and/or mon,osd pods on the unresponsive node
    for pod_obj in dc_pods + ceph_pods:
        pod_obj.delete(force=True)

    # Wait for new app pods to reach Running state
    for pod_obj in new_dc_pods:
        assert pod_obj.ocp.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            resource_name=pod_obj.name,
            timeout=1200,
            sleep=30,
        ), f"App pod with name {pod_obj.name} did not reach Running state"

    if not external_mode:
        # Wait for mon and osd pods to reach Running state
        selectors_to_check = {
            constants.MON_APP_LABEL: self.expected_mon_count,
            constants.OSD_APP_LABEL: ceph_cluster.osd_count,
        }
        for selector, count in selectors_to_check.items():
            assert ceph_cluster.POD.wait_for_resource(
                condition=constants.STATUS_RUNNING,
                selector=selector,
                resource_count=count,
                timeout=1800,
                sleep=60,
            ), f"{count} expected pods with selector {selector} are not in Running state"

        if ceph_cluster.mon_count == self.expected_mon_count:
            # Check ceph health
            toolbox_status = ceph_cluster.POD.get_resource_status(
                ceph_cluster.toolbox.name
            )
            if toolbox_status == constants.STATUS_TERMINATING:
                ceph_cluster.toolbox.delete(force=True)
            assert ceph_health_check(), "Ceph cluster health is not OK"
            logger.info("Ceph cluster health is OK")

    # Verify data integrity from new pods
    for num, pod_obj in enumerate(new_dc_pods):
        pod.verify_data_integrity(
            pod_obj=pod_obj, file_name="io_file1", original_md5sum=md5sum_data[num]
        )

    # Run IO on new pods
    md5sum_data2 = self.run_and_verify_io(
        pod_list=new_dc_pods, fio_filename="io_file2", run_io_in_bg=True
    )

    helpers.label_worker_node(
        node_list=extra_nodes[:-1], label_key="nodetype", label_value="app-pod"
    )

    # Induce network failure on the node
    node.node_network_failure(extra_nodes[-1])
    logger.info(f"Waiting for {self.short_nw_fail_time} seconds")
    sleep(self.short_nw_fail_time)

    # Wait for pods to be rescheduled
    for pod_obj in new_dc_pods:
        pod_obj.ocp.wait_for_resource(
            condition=constants.STATUS_TERMINATING,
            resource_name=pod_obj.name,
            timeout=600,
            sleep=30,
        )

    # Fetch info of new pods and verify Multi-Attach error
    new_dc_pods2 = self.get_new_pods(new_dc_pods)
    assert len(new_dc_pods2) == len(new_dc_pods), "Unexpected number of app pods"
    self.verify_multi_attach_error(new_dc_pods2)

    # Reboot the unresponsive node
    logger.info(f"Rebooting the unresponsive node: {extra_nodes[-1]}")
    nodes.restart_nodes_by_stop_and_start(node.get_node_objs([extra_nodes[-1]]))
    node.wait_for_nodes_status(
        node_names=[extra_nodes[-1]], status=constants.NODE_READY
    )

    # Wait for new app pods to reach Running state
    for pod_obj in new_dc_pods2:
        assert pod_obj.ocp.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            resource_name=pod_obj.name,
            timeout=1200,
            sleep=30,
        ), f"App pod with name {pod_obj.name} did not reach Running state"

    if not external_mode:
        # Wait for mon and osd pods to reach Running state
        for selector, count in selectors_to_check.items():
            assert ceph_cluster.POD.wait_for_resource(
                condition=constants.STATUS_RUNNING,
                selector=selector,
                resource_count=count,
                timeout=1800,
                sleep=60,
            ), f"{count} expected pods with selector {selector} are not in Running state"

        if ceph_cluster.mon_count == 3:
            # Check ceph health
            assert ceph_health_check(), "Ceph cluster health is not OK"
            logger.info("Ceph cluster health is OK")

    # Verify data integrity from new pods
    for num, pod_obj in enumerate(new_dc_pods2):
        pod.verify_data_integrity(
            pod_obj=pod_obj, file_name="io_file2", original_md5sum=md5sum_data2[num]
        )
    for num, pod_obj in enumerate(new_dc_pods2):
        pod.verify_data_integrity(
            pod_obj=pod_obj, file_name="io_file1", original_md5sum=md5sum_data[num]
        )

    # Run IO on new pods
    self.run_and_verify_io(
        pod_list=new_dc_pods2, fio_filename="io_file3", return_md5sum=False
    )
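
# The Multi-Attach checks above rest on the FailedAttachVolume event that
# Kubernetes emits while an RWO volume is still attached to the unreachable
# node, leaving the replacement pod stuck in ContainerCreating. A minimal
# sketch of such a check using the `oc` CLI; `has_multi_attach_error` is a
# hypothetical helper, not the ocs-ci implementation.
import subprocess


def has_multi_attach_error(pod_name, namespace):
    expected_msg = "Multi-Attach error for volume"
    events = subprocess.check_output(
        ["oc", "-n", namespace, "get", "events",
         "--field-selector", f"involvedObject.name={pod_name}",
         "-o", 'jsonpath={range .items[*]}{.message}{"\\n"}{end}'],
        text=True,
    )
    return any(expected_msg in line for line in events.splitlines())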
def test_rwo_pvc_fencing_node_prolonged_network_failure(
    self, nodes, setup, node_restart_teardown
):
    """
    OCS-1427/OCS-1429:
    - Start DeploymentConfig based app pods on 1 OCS/Non-OCS node
    - Make the node (where app pods are running) unresponsive
        by bringing its main network interface down
    - Check new app pods and/or mon, osd pods scheduled on another node
        are stuck due to Multi-Attach error.
    - Power off the unresponsive node
    - Force delete the app pods and/or mon,osd pods on the unresponsive node
    - Check new app pods and/or mon, osd pods scheduled on another node come
        into Running state
    - Run IOs on new app pods

    OCS-1430/OCS-1435:
    - Start DeploymentConfig based app pods on multiple nodes
        Colocated scenario: Select 1 node where osd and/or mon is running,
            select other 2 nodes where mon/osd are not running
        Dedicated scenario: 3 Non-OCS nodes
    - Disrupt the leader provisioner pods if not running on above selected
        nodes
    - Make the nodes (where app pods are running) unresponsive
        by bringing their main network interface down
    - Check new app pods and/or mon, osd pods scheduled on another node
        are stuck due to Multi-Attach error.
    - Power off the unresponsive nodes
    - Force delete the app pods and/or mon,osd pods on the unresponsive node
    - Check new app pods and/or mon, osd pods scheduled on another node come
        into Running state
    - Run IOs on new app pods
    """
    ceph_cluster, dc_pods, ceph_pods, app_pod_nodes, test_nodes, disruptor = setup
    external_mode = helpers.storagecluster_independent_check()

    # Run IO on pods
    md5sum_data = self.run_and_verify_io(
        pod_list=dc_pods, fio_filename="io_file1", run_io_in_bg=True
    )

    # OCS-1430/OCS-1435
    # Disrupt leader plugin-provisioner pods, skip if running on node to be failed
    if disruptor:
        [disruption.delete_resource() for disruption in disruptor]

    # Induce network failure on the nodes
    node.node_network_failure(app_pod_nodes)
    logger.info(f"Waiting for {self.prolong_nw_fail_time} seconds")
    sleep(self.prolong_nw_fail_time)

    # Wait for pods to be rescheduled
    for pod_obj in dc_pods + ceph_pods:
        pod_obj.ocp.wait_for_resource(
            condition=constants.STATUS_TERMINATING, resource_name=pod_obj.name
        )

    # Fetch info of new pods and verify Multi-Attach error
    new_dc_pods = self.get_new_pods(dc_pods)
    assert len(new_dc_pods) == len(dc_pods), "Unexpected number of app pods"
    self.verify_multi_attach_error(new_dc_pods)

    if ceph_pods:
        new_ceph_pods = self.get_new_pods(ceph_pods)
        assert len(new_ceph_pods) > 0, "Unexpected number of osd pods"
        self.verify_multi_attach_error(new_ceph_pods)

    logger.info("Executing manual recovery steps")
    # Power off the unresponsive node(s)
    logger.info(f"Powering off the unresponsive node(s): {app_pod_nodes}")
    nodes.stop_nodes(node.get_node_objs(app_pod_nodes))

    # Force delete the app pods and/or mon,osd pods on the unresponsive node
    if (
        float(config.ENV_DATA["ocs_version"]) < 4.4
        and ceph_cluster.mon_count == 5
    ):
        for pod_obj in ceph_cluster.mons:
            if pod.get_pod_node(pod_obj).name in app_pod_nodes:
                ceph_pods.append(pod_obj)
    for pod_obj in dc_pods + ceph_pods:
        pod_obj.delete(force=True)

    # Wait for new app pods to reach Running state
    for pod_obj in new_dc_pods:
        assert pod_obj.ocp.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            resource_name=pod_obj.name,
            timeout=1200,
            sleep=30,
        ), f"App pod with name {pod_obj.name} did not reach Running state"

    if not external_mode:
        # Wait for mon and osd pods to reach Running state
        selectors_to_check = {
            constants.MON_APP_LABEL: self.expected_mon_count,
            constants.OSD_APP_LABEL: ceph_cluster.osd_count,
        }
        for selector, count in selectors_to_check.items():
            assert ceph_cluster.POD.wait_for_resource(
                condition=constants.STATUS_RUNNING,
                selector=selector,
                resource_count=count,
                timeout=1800,
                sleep=60,
            ), f"{count} expected pods with selector {selector} are not in Running state"

        if ceph_cluster.mon_count == self.expected_mon_count:
            # Check ceph health
            toolbox_status = ceph_cluster.POD.get_resource_status(
                ceph_cluster.toolbox.name
            )
            if toolbox_status == constants.STATUS_TERMINATING:
                ceph_cluster.toolbox.delete(force=True)
            assert ceph_health_check(), "Ceph cluster health is not OK"
            logger.info("Ceph cluster health is OK")

    # Verify data integrity from new pods
    for num, pod_obj in enumerate(new_dc_pods):
        pod.verify_data_integrity(
            pod_obj=pod_obj, file_name="io_file1", original_md5sum=md5sum_data[num]
        )

    # Run IO on new pods
    self.run_and_verify_io(
        pod_list=new_dc_pods, fio_filename="io_file2", return_md5sum=False
    )
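
# The manual recovery step above boils down to force-deleting pods that are
# stuck in Terminating on a powered-off node: removing the API object with a
# zero grace period frees the RWO volume so the replacement pod can attach it.
# A minimal sketch with the `oc` CLI; pod name and namespace are hypothetical
# placeholders.
import subprocess


def force_delete_pod(pod_name, namespace):
    subprocess.run(
        ["oc", "-n", namespace, "delete", "pod", pod_name,
         "--grace-period=0", "--force", "--wait=false"],
        check=True,
    )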
def test_rwo_pvc_fencing_node_short_network_failure(
    self, nodes, setup, node_restart_teardown
):
    """
    OCS-1423/OCS-1428/OCS-1426:
    - Start DeploymentConfig based app pods on 1 OCS/Non-OCS node
    - Make the node (where app pods are running) unresponsive
        by bringing its main network interface down
    - Check new app pods and/or mon, osd pods scheduled on another node
        are stuck due to Multi-Attach error.
    - Reboot the unresponsive node
    - When unresponsive node recovers, run IOs on new app pods

    OCS-1424/OCS-1434:
    - Start DeploymentConfig based app pods on multiple nodes
        Colocated scenario: Select 1 node where osd and/or mon is running,
            select other 2 nodes where mon/osd are not running
        Dedicated scenario: 3 Non-OCS nodes
    - Disrupt the leader provisioner pods if not running on above selected
        nodes
    - Make the nodes (where app pods are running) unresponsive
        by bringing their main network interface down
    - Check new app pods and/or mon, osd pods scheduled on another node
        are stuck due to Multi-Attach error.
    - Reboot the unresponsive nodes
    - When unresponsive nodes recover, run IOs on new app pods
    """
    ceph_cluster, dc_pods, ceph_pods, app_pod_nodes, test_nodes, disruptor = setup

    # Run IO on pods
    md5sum_data = self.run_and_verify_io(
        pod_list=dc_pods, fio_filename="io_file1", run_io_in_bg=True
    )

    # OCS-1424/OCS-1434
    # Disrupt leader plugin-provisioner pods, skip if running on node to be failed
    if disruptor:
        [disruption.delete_resource() for disruption in disruptor]

    # Induce network failure on the nodes
    node.node_network_failure(app_pod_nodes)
    logger.info(f"Waiting for {self.short_nw_fail_time} seconds")
    sleep(self.short_nw_fail_time)

    # Wait for pods to be rescheduled
    for pod_obj in dc_pods + ceph_pods:
        pod_obj.ocp.wait_for_resource(
            condition=constants.STATUS_TERMINATING,
            resource_name=pod_obj.name,
            timeout=600,
            sleep=30,
        )

    # Fetch info of new pods and verify Multi-Attach error
    new_dc_pods = self.get_new_pods(dc_pods)
    assert len(new_dc_pods) == len(dc_pods), "Unexpected number of app pods"
    self.verify_multi_attach_error(new_dc_pods)

    if ceph_pods:
        new_ceph_pods = self.get_new_pods(ceph_pods)
        assert len(new_ceph_pods) > 0, "Unexpected number of osd pods"
        self.verify_multi_attach_error(new_ceph_pods)

    # Reboot the unresponsive node(s)
    logger.info(f"Rebooting the unresponsive node(s): {app_pod_nodes}")
    nodes.restart_nodes_by_stop_and_start(node.get_node_objs(app_pod_nodes))
    node.wait_for_nodes_status(
        node_names=app_pod_nodes, status=constants.NODE_READY
    )

    # Wait for new app pods to reach Running state
    for pod_obj in new_dc_pods:
        assert pod_obj.ocp.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            resource_name=pod_obj.name,
            timeout=1200,
            sleep=30,
        ), f"App pod with name {pod_obj.name} did not reach Running state"

    if not helpers.storagecluster_independent_check():
        # Wait for mon and osd pods to reach Running state
        selectors_to_check = {
            constants.MON_APP_LABEL: ceph_cluster.mon_count,
            constants.OSD_APP_LABEL: ceph_cluster.osd_count,
        }
        for selector, count in selectors_to_check.items():
            assert ceph_cluster.POD.wait_for_resource(
                condition=constants.STATUS_RUNNING,
                selector=selector,
                resource_count=count,
                timeout=1800,
                sleep=60,
            ), f"{count} expected pods with selector {selector} are not in Running state"

        assert ceph_health_check(), "Ceph cluster health is not OK"
        logger.info("Ceph cluster health is OK")

    # Verify data integrity from new pods
    for num, pod_obj in enumerate(new_dc_pods):
        assert pod.verify_data_integrity(
            pod_obj=pod_obj, file_name="io_file1", original_md5sum=md5sum_data[num]
        ), "Data integrity check failed"

    # Run IO on new pods
    self.run_and_verify_io(
        pod_list=new_dc_pods, fio_filename="io_file2", return_md5sum=False
    )
def test_node_replacement_reactive_aws_ipi(
    self, nodes, pvc_factory, pod_factory, dc_pod_factory, failure, interface
):
    """
    Knip-894 Node replacement - AWS-IPI-Reactive
    """
    # Get worker nodes
    initial_nodes = get_worker_nodes()

    # Get OSD running nodes
    osd_running_nodes = get_osd_running_nodes()
    log.info(f"OSDs are running on nodes {osd_running_nodes}")

    # Label osd nodes with fedora app
    label_worker_node(osd_running_nodes, label_key="dc", label_value="fedora")

    # Create DC app pods
    log.info("Creating DC based app pods")
    if interface == "rbd":
        interface = constants.CEPHBLOCKPOOL
    elif interface == "cephfs":
        interface = constants.CEPHFILESYSTEM
    dc_pod_obj = []
    for i in range(2):
        dc_pod = dc_pod_factory(interface=interface, node_selector={"dc": "fedora"})
        pod.run_io_in_bg(dc_pod, fedora_dc=True)
        dc_pod_obj.append(dc_pod)

    # Get app pods running nodes
    dc_pod_node_name = get_app_pod_running_nodes(dc_pod_obj)
    log.info(f"DC app pod running nodes are {dc_pod_node_name}")

    # Get both osd and app pod running node
    common_nodes = get_both_osd_and_app_pod_running_node(
        osd_running_nodes, dc_pod_node_name
    )
    log.info(f"Both OSD and app pods are running on nodes {common_nodes}")

    # Get the machine name using the node name
    machine_name = machine.get_machine_from_node_name(common_nodes[0])
    log.info(f"{common_nodes[0]} associated machine is {machine_name}")

    # Get the machineset name using the machine name
    machineset_name = machine.get_machineset_from_machine_name(machine_name)
    log.info(f"{common_nodes[0]} associated machineset is {machineset_name}")

    # Get the failure node obj
    failure_node_obj = get_node_objs(node_names=[common_nodes[0]])

    # Induce failure on the selected failure node
    log.info(f"Inducing failure on node {failure_node_obj[0].name}")
    if failure == "power off":
        # Power off AWS worker node instance
        nodes.stop_nodes(failure_node_obj, wait=True)
        log.info(f"Successfully powered off node: {failure_node_obj[0].name}")
    elif failure == "network failure":
        # Induce network failure
        node_network_failure([failure_node_obj[0].name])

    # Annotate the failed machine so that its (unreachable) node is not drained
    annotation = "machine.openshift.io/exclude-node-draining=''"
    machine.add_annotation_to_machine(
        annotation=annotation, machine_name=machine_name
    )

    # Delete the machine
    machine.delete_machine(machine_name)
    log.info(f"Successfully deleted machine {machine_name}")

    # Wait for the new machine to spin up
    log.info("Waiting for the new node to be in Ready state")
    machine.wait_for_new_node_to_be_ready(machineset_name)

    # Get the name of the newly spun node
    nodes_after_new_spun_node = get_worker_nodes()
    new_spun_node = list(set(nodes_after_new_spun_node) - set(initial_nodes))
    log.info(f"Newly spun node is {new_spun_node}")

    # Label it
    node_obj = ocp.OCP(kind="node")
    node_obj.add_label(
        resource_name=new_spun_node[0], label=constants.OPERATOR_NODE_LABEL
    )
    log.info(f"Successfully labeled {new_spun_node} with OCS storage label")

    # DC app pods on the failed node will get automatically created on another
    # running node. Wait for all dc app pods to reach Running state.
    pod.wait_for_dc_app_pods_to_reach_running_state(dc_pod_obj, timeout=1200)
    log.info("All the dc pods reached running state")

    # Check all OCS pods status, they should be in Running state
    all_pod_obj = pod.get_all_pods(namespace=defaults.ROOK_CLUSTER_NAMESPACE)
    for pod_obj in all_pod_obj:
        if (
            "-1-deploy" not in pod_obj.name
            and "ocs-deviceset" not in pod_obj.name
        ):
            try:
                helpers.wait_for_resource_state(
                    resource=pod_obj, state=constants.STATUS_RUNNING, timeout=1800
                )
            except ResourceWrongStatusException:
                # 'rook-ceph-crashcollector' on the failed node gets stuck in
                # Pending state. BZ 1810014 tracks it.
                # As a workaround, ignore the 'rook-ceph-crashcollector' pod
                # health check and delete its deployment so that the pod
                # disappears. Will revert this WA once the BZ is fixed.
                if "rook-ceph-crashcollector" in pod_obj.name:
                    ocp_obj = ocp.OCP(namespace=defaults.ROOK_CLUSTER_NAMESPACE)
                    pod_name = pod_obj.name
                    deployment_name = "-".join(pod_name.split("-")[:-2])
                    command = f"delete deployment {deployment_name}"
                    ocp_obj.exec_oc_cmd(command=command)
                    log.info(f"Deleted deployment for pod {pod_obj.name}")

    # Check basic cluster functionality by creating resources
    # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
    # run IO and delete the resources
    self.sanity_helpers.create_resources(pvc_factory, pod_factory)
    self.sanity_helpers.delete_resources()

    # Perform cluster and Ceph health checks
    self.sanity_helpers.health_check()
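
# The crashcollector workaround above derives the owning Deployment's name by
# stripping the last two dash-separated segments of the pod name (the
# replica-set hash and the pod suffix). A minimal sketch of that derivation;
# the example pod name is hypothetical.
def deployment_name_from_pod(pod_name):
    """e.g. 'rook-ceph-crashcollector-node1-7f9c8b-xyz12'
    -> 'rook-ceph-crashcollector-node1'"""
    return "-".join(pod_name.split("-")[:-2])


assert (
    deployment_name_from_pod("rook-ceph-crashcollector-node1-7f9c8b-xyz12")
    == "rook-ceph-crashcollector-node1"
)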