def test_automated_recovery_from_failed_nodes_IPI_proactive(
    self, pvc_factory, pod_factory
):
    """
    Knip-678 Automated recovery from failed nodes
    Proactive case - IPI
    """
    # Get the osd associated node name
    osd_pods_obj = pod.get_osd_pods()
    osd_node_name = pod.get_pod_node(random.choice(osd_pods_obj)).name
    log.info(f"Selected OSD is {osd_node_name}")

    # Get the machine name using the node name
    machine_name = machine.get_machine_from_node_name(osd_node_name)
    log.info(f"{osd_node_name} associated machine is {machine_name}")

    # Get the machineset name using machine name
    machineset_name = machine.get_machineset_from_machine_name(machine_name)
    log.info(f"{osd_node_name} associated machineset is {machineset_name}")

    # Add a new node and label it
    add_new_node_and_label_it(machineset_name)

    # Delete the machine
    machine.delete_machine(machine_name)
    log.info(f"Successfully deleted machine {machine_name}")

    # Check that the pods are in the Running state
    all_pod_obj = pod.get_all_pods(wait=True)
    for pod_obj in all_pod_obj:
        if ('-1-deploy' not in pod_obj.name
                and 'ocs-deviceset' not in pod_obj.name):
            try:
                wait_for_resource_state(
                    resource=pod_obj,
                    state=constants.STATUS_RUNNING,
                    timeout=200
                )
            # 'rook-ceph-crashcollector' on the failed node stays stuck in
            # Pending state. BZ 1810014 tracks it.
            # Ignoring the 'rook-ceph-crashcollector' pod health check as a
            # workaround and deleting its deployment so that the pod
            # disappears. Will revert this workaround once the BZ is fixed.
            except ResourceWrongStatusException:
                if 'rook-ceph-crashcollector' in pod_obj.name:
                    ocp_obj = ocp.OCP()
                    name = pod_obj.name[:-17]
                    command = f"delete deployment {name}"
                    ocp_obj.exec_oc_cmd(command=command)
                    log.info(f"Deleted deployment for pod {pod_obj.name}")

    # Check basic cluster functionality by creating resources
    # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
    # run IO and delete the resources
    self.sanity_helpers.create_resources(pvc_factory, pod_factory)
    self.sanity_helpers.delete_resources()

    # Perform cluster and Ceph health checks
    self.sanity_helpers.health_check()
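# Note on the workaround above: the fixed-width slice `pod_obj.name[:-17]`
# assumes the crashcollector pod name always ends in a 17-character
# ReplicaSet-hash/pod suffix. The later revisions of this test (below) instead
# derive the owning deployment name by dropping the last two dash-separated
# tokens, which does not depend on the suffix length. A minimal standalone
# sketch of that approach (the helper name and example pod name are
# illustrative, not part of ocs-ci):
def deployment_name_from_pod_name(pod_name):
    """Strip the ReplicaSet hash and pod suffix from a pod name, e.g.
    'rook-ceph-crashcollector-worker-0-7d8f9c6b4d-x2klm'
    -> 'rook-ceph-crashcollector-worker-0'."""
    return '-'.join(pod_name.split('-')[:-2])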
def test_automated_recovery_from_failed_nodes_IPI_proactive(
    self, pvc_factory, pod_factory
):
    """
    Knip-678 Automated recovery from failed nodes
    Proactive case - IPI
    """
    # Get the osd associated node name
    osd_pods_obj = pod.get_osd_pods()
    osd_node_name = pod.get_pod_node(random.choice(osd_pods_obj)).name
    log.info(f"Selected OSD is {osd_node_name}")

    # Get the machine name using the node name
    machine_name = machine.get_machine_from_node_name(osd_node_name)
    log.info(f"{osd_node_name} associated machine is {machine_name}")

    # Get the machineset name using machine name
    machineset_name = machine.get_machineset_from_machine_name(machine_name)
    log.info(f"{osd_node_name} associated machineset is {machineset_name}")

    # Add a new node and label it
    add_new_node_and_label_it(machineset_name)

    # Delete the machine
    machine.delete_machine(machine_name)
    log.info(f"Successfully deleted machine {machine_name}")

    # Check that the pods are in running state
    all_pod_obj = pod.get_all_pods(wait=True)
    for pod_obj in all_pod_obj:
        wait_for_resource_state(
            resource=pod_obj, state=constants.STATUS_RUNNING, timeout=200
        )

    # Check basic cluster functionality by creating resources
    # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
    # run IO and delete the resources
    self.sanity_helpers.create_resources(pvc_factory, pod_factory)
    self.sanity_helpers.delete_resources()

    # Perform cluster and Ceph health checks
    self.sanity_helpers.health_check()
def test_automated_recovery_from_failed_nodes_IPI_proactive(
    self, interface, pvc_factory, pod_factory, dc_pod_factory
):
    """
    Knip-678 Automated recovery from failed nodes
    Proactive case - IPI
    """
    # Get OSD running nodes
    osd_running_nodes = get_osd_running_nodes()
    log.info(f"OSDs are running on nodes {osd_running_nodes}")

    # Label osd nodes with fedora app
    label_worker_node(osd_running_nodes, label_key='dc', label_value='fedora')

    # Create DC app pods
    log.info("Creating DC based app pods")
    interface = (
        constants.CEPHBLOCKPOOL if interface == 'rbd'
        else constants.CEPHFILESYSTEM
    )
    dc_pod_obj = []
    for i in range(2):
        dc_pod = dc_pod_factory(
            interface=interface, node_selector={'dc': 'fedora'}
        )
        pod.run_io_in_bg(dc_pod, fedora_dc=True)
        dc_pod_obj.append(dc_pod)

    # Get app pods running nodes
    dc_pod_node_name = get_app_pod_running_nodes(dc_pod_obj)
    log.info(f"DC app pod running nodes are {dc_pod_node_name}")

    # Get both osd and app pod running node
    common_nodes = get_both_osd_and_app_pod_running_node(
        osd_running_nodes, dc_pod_node_name
    )
    msg = "Common OSD and app running node(s) NOT found"
    assert len(common_nodes) > 0, msg
    log.info(f"Common OSD and app pod running nodes are {common_nodes}")

    # Get the machine name using the node name
    machine_name = machine.get_machine_from_node_name(common_nodes[0])
    log.info(f"{common_nodes[0]} associated machine is {machine_name}")

    # Get the machineset name using machine name
    machineset_name = machine.get_machineset_from_machine_name(machine_name)
    log.info(f"{common_nodes[0]} associated machineset is {machineset_name}")

    # Add a new node and label it
    add_new_node_and_label_it(machineset_name)

    # Delete the machine
    machine.delete_machine(machine_name)
    log.info(f"Successfully deleted machine {machine_name}")

    # DC app pods on the failed node will get automatically created on
    # other running node. Waiting for all dc app pod to reach running
    # state
    pod.wait_for_dc_app_pods_to_reach_running_state(dc_pod_obj)
    log.info("All the dc pods reached running state")
    pod.wait_for_storage_pods()

    # Check basic cluster functionality by creating resources
    # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
    # run IO and delete the resources
    self.sanity_helpers.create_resources(pvc_factory, pod_factory)
    self.sanity_helpers.delete_resources()

    # Perform cluster and Ceph health checks
    self.sanity_helpers.health_check()
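# `get_app_pod_running_nodes` and `get_both_osd_and_app_pod_running_node` are
# used above but not shown here. Assuming they simply map pods to their node
# names and intersect the two node lists, a minimal illustrative sketch (not
# the ocs-ci implementation; it reuses pod.get_pod_node as in the first test
# variant above):
def app_pod_running_nodes(pod_objs):
    """Return the node name hosting each of the given pods."""
    return [pod.get_pod_node(p).name for p in pod_objs]


def common_osd_and_app_nodes(osd_nodes, app_pod_nodes):
    """Return the node names that host both an OSD and a DC app pod."""
    return list(set(osd_nodes) & set(app_pod_nodes))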
def test_node_replacement_reactive_aws_ipi(
    self,
    nodes,
    pvc_factory,
    pod_factory,
    dc_pod_factory,
    failure,
    interface,
    bucket_factory,
    rgw_bucket_factory,
):
    """
    Knip-894 Node replacement - AWS-IPI-Reactive
    """
    # Get worker nodes
    initial_nodes = get_worker_nodes()

    # Get OSD running nodes
    osd_running_nodes = get_osd_running_nodes()
    log.info(f"OSDs are running on nodes {osd_running_nodes}")

    # Label osd nodes with fedora app
    label_worker_node(osd_running_nodes, label_key="dc", label_value="fedora")

    # Create DC app pods
    log.info("Creating DC based app pods")
    if interface == "rbd":
        interface = constants.CEPHBLOCKPOOL
    elif interface == "cephfs":
        interface = constants.CEPHFILESYSTEM
    dc_pod_obj = []
    for i in range(2):
        dc_pod = dc_pod_factory(
            interface=interface, node_selector={"dc": "fedora"}
        )
        pod.run_io_in_bg(dc_pod, fedora_dc=True)
        dc_pod_obj.append(dc_pod)

    # Get app pods running nodes
    dc_pod_node_name = get_app_pod_running_nodes(dc_pod_obj)
    log.info(f"DC app pod running nodes are {dc_pod_node_name}")

    # Get both osd and app pod running node
    common_nodes = get_both_osd_and_app_pod_running_node(
        osd_running_nodes, dc_pod_node_name
    )
    log.info(f"Both OSD and app pod is running on nodes {common_nodes}")

    # Get the machine name using the node name
    machine_name = machine.get_machine_from_node_name(common_nodes[0])
    log.info(f"{common_nodes[0]} associated machine is {machine_name}")

    # Get the machineset name using machine name
    machineset_name = machine.get_machineset_from_machine_name(machine_name)
    log.info(f"{common_nodes[0]} associated machineset is {machineset_name}")

    # Get the failure node obj
    failure_node_obj = get_node_objs(node_names=[common_nodes[0]])

    # Induce failure on the selected failure node
    log.info(f"Inducing failure on node {failure_node_obj[0].name}")
    if failure == "power off":
        # Power off AWS worker node instance
        nodes.stop_nodes(failure_node_obj, wait=True)
        log.info(f"Successfully powered off node: {failure_node_obj[0].name}")
    elif failure == "network failure":
        # Induce Network failure
        node_network_failure([failure_node_obj[0].name])

    # Add annotation to the failed node
    annotation = "machine.openshift.io/exclude-node-draining=''"
    machine.add_annotation_to_machine(
        annotation=annotation, machine_name=machine_name
    )

    # Delete the machine
    machine.delete_machine(machine_name)
    log.info(f"Successfully deleted machine {machine_name}")

    # Wait for the new machine to spin
    log.info("Waiting for the new node to be in ready state")
    machine.wait_for_new_node_to_be_ready(machineset_name)

    # Get the node name of new spun node
    nodes_after_new_spun_node = get_worker_nodes()
    new_spun_node = list(set(nodes_after_new_spun_node) - set(initial_nodes))
    log.info(f"New spun node is {new_spun_node}")

    # Label it
    node_obj = ocp.OCP(kind="node")
    node_obj.add_label(
        resource_name=new_spun_node[0], label=constants.OPERATOR_NODE_LABEL
    )
    log.info(f"Successfully labeled {new_spun_node} with OCS storage label")

    # DC app pods on the failed node will get automatically created on other
    # running node. Waiting for all dc app pod to reach running state
    pod.wait_for_dc_app_pods_to_reach_running_state(dc_pod_obj, timeout=1200)
    log.info("All the dc pods reached running state")
    pod.wait_for_storage_pods()

    # Check basic cluster functionality by creating resources
    # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
    # run IO and delete the resources
    self.sanity_helpers.create_resources(
        pvc_factory, pod_factory, bucket_factory, rgw_bucket_factory
    )
    self.sanity_helpers.delete_resources()

    # Perform cluster and Ceph health checks
    self.sanity_helpers.health_check()
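# The `interface` and `failure` arguments of the test above are expected to be
# injected by pytest parametrization. A hedged sketch of what such a decorator
# could look like, built only from the values the test checks for; the exact
# argvalues, ids and marks used in ocs-ci may differ, and the decorated
# function here is a placeholder standing in for the test method above:
import pytest


@pytest.mark.parametrize(
    argnames=["interface", "failure"],
    argvalues=[
        pytest.param("rbd", "power off"),
        pytest.param("rbd", "network failure"),
        pytest.param("cephfs", "power off"),
        pytest.param("cephfs", "network failure"),
    ],
)
def _parametrize_placeholder(interface, failure):
    # In the real test these values select the CSI interface
    # ('rbd'/'cephfs') and the induced failure mode.
    ...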
def test_automated_recovery_from_failed_nodes_IPI_proactive(
    self, interface, pvc_factory, pod_factory, dc_pod_factory
):
    """
    Knip-678 Automated recovery from failed nodes
    Proactive case - IPI
    """
    # Get OSD running nodes
    osd_running_nodes = get_osd_running_nodes()
    log.info(f"OSDs are running on nodes {osd_running_nodes}")

    # Label osd nodes with fedora app
    label_worker_node(osd_running_nodes, label_key='dc', label_value='fedora')

    # Create DC app pods
    log.info("Creating DC based app pods")
    interface = (
        constants.CEPHBLOCKPOOL if interface == 'rbd'
        else constants.CEPHFILESYSTEM
    )
    dc_pod_obj = []
    for i in range(2):
        dc_pod = dc_pod_factory(
            interface=interface, node_selector={'dc': 'fedora'}
        )
        pod.run_io_in_bg(dc_pod, fedora_dc=True)
        dc_pod_obj.append(dc_pod)

    # Get app pods running nodes
    dc_pod_node_name = get_app_pod_running_nodes(dc_pod_obj)
    log.info(f"DC app pod running nodes are {dc_pod_node_name}")

    # Get both osd and app pod running node
    common_nodes = get_both_osd_and_app_pod_running_node(
        osd_running_nodes, dc_pod_node_name
    )
    msg = "Common OSD and app running node(s) NOT found"
    assert len(common_nodes) > 0, msg
    log.info(f"Common OSD and app pod running nodes are {common_nodes}")

    # Get the machine name using the node name
    machine_name = machine.get_machine_from_node_name(common_nodes[0])
    log.info(f"{common_nodes[0]} associated machine is {machine_name}")

    # Get the machineset name using machine name
    machineset_name = machine.get_machineset_from_machine_name(machine_name)
    log.info(f"{common_nodes[0]} associated machineset is {machineset_name}")

    # Add a new node and label it
    add_new_node_and_label_it(machineset_name)

    # Delete the machine
    machine.delete_machine(machine_name)
    log.info(f"Successfully deleted machine {machine_name}")

    # DC app pods on the failed node will get automatically created on
    # other running node. Waiting for all dc app pod to reach running
    # state
    pod.wait_for_dc_app_pods_to_reach_running_state(dc_pod_obj)
    log.info("All the dc pods reached running state")

    # Check the status of all OCS pods; they should be in the Running state
    all_pod_obj = pod.get_all_pods(namespace=defaults.ROOK_CLUSTER_NAMESPACE)
    for pod_obj in all_pod_obj:
        if ('-1-deploy' not in pod_obj.name
                and 'ocs-deviceset' not in pod_obj.name):
            try:
                helpers.wait_for_resource_state(
                    resource=pod_obj,
                    state=constants.STATUS_RUNNING,
                    timeout=200
                )
            except ResourceWrongStatusException:
                # 'rook-ceph-crashcollector' on the failed node stays stuck
                # in Pending state. BZ 1810014 tracks it.
                # Ignoring the 'rook-ceph-crashcollector' pod health check as
                # a workaround and deleting its deployment so that the pod
                # disappears. Will revert this workaround once the BZ is fixed
                if 'rook-ceph-crashcollector' in pod_obj.name:
                    ocp_obj = ocp.OCP(
                        namespace=defaults.ROOK_CLUSTER_NAMESPACE
                    )
                    pod_name = pod_obj.name
                    deployment_name = '-'.join(pod_name.split("-")[:-2])
                    command = f"delete deployment {deployment_name}"
                    ocp_obj.exec_oc_cmd(command=command)
                    log.info(f"Deleted deployment for pod {pod_obj.name}")
                else:
                    raise

    # Check basic cluster functionality by creating resources
    # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
    # run IO and delete the resources
    self.sanity_helpers.create_resources(pvc_factory, pod_factory)
    self.sanity_helpers.delete_resources()

    # Perform cluster and Ceph health checks
    self.sanity_helpers.health_check()
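# `label_worker_node` is used above but not shown. Assuming it applies a
# key=value node label through the same OCP wrapper these tests already use
# for constants.OPERATOR_NODE_LABEL, a minimal illustrative sketch
# (hypothetical helper, not necessarily the ocs-ci implementation):
def label_worker_node(node_names, label_key, label_value):
    """Apply <label_key>=<label_value> to each of the given worker nodes."""
    node_obj = ocp.OCP(kind='node')
    for name in node_names:
        node_obj.add_label(
            resource_name=name, label=f"{label_key}={label_value}"
        )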
def test_node_replacement_reactive_aws_ipi(
    self, nodes, pvc_factory, pod_factory, dc_pod_factory, failure, interface
):
    """
    Knip-894 Node replacement - AWS-IPI-Reactive
    """
    # Get worker nodes
    initial_nodes = get_worker_nodes()

    # Get OSD running nodes
    osd_running_nodes = get_osd_running_nodes()
    log.info(f"OSDs are running on nodes {osd_running_nodes}")

    # Label osd nodes with fedora app
    label_worker_node(osd_running_nodes, label_key='dc', label_value='fedora')

    # Create DC app pods
    log.info("Creating DC based app pods")
    if interface == 'rbd':
        interface = constants.CEPHBLOCKPOOL
    elif interface == 'cephfs':
        interface = constants.CEPHFILESYSTEM
    dc_pod_obj = []
    for i in range(2):
        dc_pod = dc_pod_factory(
            interface=interface, node_selector={'dc': 'fedora'}
        )
        pod.run_io_in_bg(dc_pod, fedora_dc=True)
        dc_pod_obj.append(dc_pod)

    # Get app pods running nodes
    dc_pod_node_name = get_app_pod_running_nodes(dc_pod_obj)
    log.info(f"DC app pod running nodes are {dc_pod_node_name}")

    # Get both osd and app pod running node
    common_nodes = get_both_osd_and_app_pod_running_node(
        osd_running_nodes, dc_pod_node_name
    )
    log.info(f"Both OSD and app pod is running on nodes {common_nodes}")

    # Get the machine name using the node name
    machine_name = machine.get_machine_from_node_name(common_nodes[0])
    log.info(f"{common_nodes[0]} associated machine is {machine_name}")

    # Get the machineset name using machine name
    machineset_name = machine.get_machineset_from_machine_name(machine_name)
    log.info(f"{common_nodes[0]} associated machineset is {machineset_name}")

    # Get the failure node obj
    failure_node_obj = get_node_objs(node_names=[common_nodes[0]])

    # Induce failure on the selected failure node
    log.info(f"Inducing failure on node {failure_node_obj[0].name}")
    if failure == "power off":
        # Power off AWS worker node instance
        nodes.stop_nodes(failure_node_obj, wait=True)
        log.info(f"Successfully powered off node: {failure_node_obj[0].name}")
    elif failure == "network failure":
        # Induce Network failure
        node_network_failure([failure_node_obj[0].name])

    # Add annotation to the failed node
    annotation = "machine.openshift.io/exclude-node-draining=''"
    machine.add_annotation_to_machine(
        annotation=annotation, machine_name=machine_name
    )

    # Delete the machine
    machine.delete_machine(machine_name)
    log.info(f"Successfully deleted machine {machine_name}")

    # Wait for the new machine to spin
    log.info("Waiting for the new node to be in ready state")
    machine.wait_for_new_node_to_be_ready(machineset_name)

    # Get the node name of new spun node
    nodes_after_new_spun_node = get_worker_nodes()
    new_spun_node = list(set(nodes_after_new_spun_node) - set(initial_nodes))
    log.info(f"New spun node is {new_spun_node}")

    # Label it
    node_obj = ocp.OCP(kind='node')
    node_obj.add_label(
        resource_name=new_spun_node[0], label=constants.OPERATOR_NODE_LABEL
    )
    log.info(f"Successfully labeled {new_spun_node} with OCS storage label")

    # DC app pods on the failed node will get automatically created on other
    # running node. Waiting for all dc app pod to reach running state
    pod.wait_for_dc_app_pods_to_reach_running_state(dc_pod_obj, timeout=1200)
    log.info("All the dc pods reached running state")

    # Check the status of all OCS pods; they should be in the Running state
    all_pod_obj = pod.get_all_pods(namespace=defaults.ROOK_CLUSTER_NAMESPACE)
    for pod_obj in all_pod_obj:
        if ('-1-deploy' not in pod_obj.name
                and 'ocs-deviceset' not in pod_obj.name):
            try:
                helpers.wait_for_resource_state(
                    resource=pod_obj,
                    state=constants.STATUS_RUNNING,
                    timeout=1800
                )
            except ResourceWrongStatusException:
                # 'rook-ceph-crashcollector' on the failed node stays stuck
                # in Pending state. BZ 1810014 tracks it.
                # Ignoring the 'rook-ceph-crashcollector' pod health check as
                # a workaround and deleting its deployment so that the pod
                # disappears. Will revert this workaround once the BZ is fixed
                if 'rook-ceph-crashcollector' in pod_obj.name:
                    ocp_obj = ocp.OCP(
                        namespace=defaults.ROOK_CLUSTER_NAMESPACE
                    )
                    pod_name = pod_obj.name
                    deployment_name = '-'.join(pod_name.split("-")[:-2])
                    command = f"delete deployment {deployment_name}"
                    ocp_obj.exec_oc_cmd(command=command)
                    log.info(f"Deleted deployment for pod {pod_obj.name}")

    # Check basic cluster functionality by creating resources
    # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
    # run IO and delete the resources
    self.sanity_helpers.create_resources(pvc_factory, pod_factory)
    self.sanity_helpers.delete_resources()

    # Perform cluster and Ceph health checks
    self.sanity_helpers.health_check()
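# The BZ 1810014 crashcollector workaround is repeated in both variants above.
# One possible way to factor it out, sketched under the assumption that the
# same ocp, defaults and log objects are in scope; the helper name is
# hypothetical and not part of ocs-ci:
def delete_stuck_crashcollector_deployment(pod_obj):
    """Workaround for BZ 1810014: delete the deployment behind a
    rook-ceph-crashcollector pod that is stuck in Pending state."""
    ocp_obj = ocp.OCP(namespace=defaults.ROOK_CLUSTER_NAMESPACE)
    deployment_name = '-'.join(pod_obj.name.split('-')[:-2])
    ocp_obj.exec_oc_cmd(command=f"delete deployment {deployment_name}")
    log.info(f"Deleted deployment for pod {pod_obj.name}")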