def test_automated_recovery_from_stopped_node_and_start(
    self, nodes, additional_node
):
    """
    Knip-678 Automated recovery from failed nodes
    Reactive case - IPI

    0) A - add new node, B - don't add new node
    1) Stop node
    2) Validate result:
         A - pods should respin on the new node
         B - pods should remain in Pending state on the stopped node
    3) Start node
    4) Validate result:
         A - pods should start on the new node
         B - pods should start on the stopped node after starting it

    """
    wnode_name = get_worker_nodes()[0]
    machine_name = machine.get_machine_from_node_name(wnode_name)
    self.machineset_name = machine.get_machineset_from_machine_name(machine_name)
    self.start_ready_replica_count = machine.get_ready_replica_count(
        self.machineset_name
    )

    temp_osd = get_osd_pods()[0]
    osd_real_name = "-".join(temp_osd.name.split("-")[:-1])
    self.osd_worker_node = [get_pod_node(temp_osd)]
    if additional_node:
        self.add_new_storage_node(self.osd_worker_node[0].name)
        self.extra_node = True
    nodes.stop_nodes(self.osd_worker_node, wait=True)
    log.info(f"Successfully powered off node: {self.osd_worker_node[0].name}")

    timeout = 420
    assert wait_for_rook_ceph_pod_status(
        temp_osd, constants.STATUS_TERMINATING, timeout
    ), (
        f"The pod {osd_real_name} didn't reach the status {constants.STATUS_TERMINATING} "
        f"after {timeout} seconds"
    )

    # Validate that the OSD in terminate state has a new OSD in Pending
    all_pod_obj = get_all_pods(namespace=defaults.ROOK_CLUSTER_NAMESPACE)
    new_osd = None
    for pod_obj in all_pod_obj:
        if osd_real_name == "-".join(pod_obj.name.split("-")[:-1]) and (
            temp_osd.name != pod_obj.name
        ):
            new_osd = pod_obj
            break
    assert new_osd is not None, (
        f"Didn't find a new OSD pod replacing {temp_osd.name} after stopping the node"
    )

    nodes.start_nodes(nodes=self.osd_worker_node, wait=True)
    log.info(f"Successfully powered on node: {self.osd_worker_node[0].name}")
    wait_for_resource_state(new_osd, constants.STATUS_RUNNING, timeout=180)

    if additional_node:
        new_osd_node = get_pod_node(new_osd)
        assert (
            new_osd_node.name != self.osd_worker_node[0].name
        ), "New OSD is expected to run on the new additional node"
def noobaa_running_node_restart(pod_name):
    """
    Function to restart the node on which the given noobaa pod is running

    Args:
        pod_name (str): Name of noobaa pod

    """
    nb_pod_obj = pod.get_pod_obj(
        (
            get_pod_name_by_pattern(
                pattern=pod_name, namespace=constants.OPENSHIFT_STORAGE_NAMESPACE
            )
        )[0],
        namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
    )
    nb_node_name = pod.get_pod_node(nb_pod_obj).name
    factory = platform_nodes.PlatformNodesFactory()
    nodes = factory.get_nodes_platform()
    nb_nodes = get_node_objs(node_names=nb_node_name)
    log.info(f"{pod_name} is running on {nb_node_name}")
    log.info(f"Restarting node: {nb_node_name}....")
    nodes.restart_nodes_by_stop_and_start(nodes=nb_nodes, force=True)

    # Validate nodes are up and running
    wait_for_nodes_status()
    ceph_health_check(tries=30, delay=60)
    helpers.wait_for_resource_state(nb_pod_obj, constants.STATUS_RUNNING, timeout=180)
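# Illustrative usage sketch (not part of the original module): restart the host
# node of each noobaa pod matching a set of name patterns, reusing the helper
# above. The default patterns "noobaa-core" and "noobaa-db" are assumptions and
# should be adjusted to the pod names present in the cluster under test.
def restart_nodes_hosting_noobaa_pods(pod_patterns=("noobaa-core", "noobaa-db")):
    """
    Restart the node hosting each noobaa pod matching the given patterns.

    Args:
        pod_patterns (tuple): noobaa pod name patterns whose host nodes to restart

    """
    for pattern in pod_patterns:
        # noobaa_running_node_restart() resolves the pod, finds its node,
        # restarts that node and waits for the pod to be Running again
        noobaa_running_node_restart(pod_name=pattern)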
def mgr_pod_node_restart(self):
    """
    Restart node that runs mgr pod
    """
    mgr_pod_obj = pod.get_mgr_pods()
    mgr_node_obj = pod.get_pod_node(mgr_pod_obj[0])

    self.nodes.restart_nodes([mgr_node_obj])

    wait_for_nodes_status()

    # Check for Ceph pods
    pod_obj = ocp.OCP(kind=constants.POD, namespace=defaults.ROOK_CLUSTER_NAMESPACE)
    assert pod_obj.wait_for_resource(
        condition="Running", selector="app=rook-ceph-mgr", timeout=600
    )
    assert pod_obj.wait_for_resource(
        condition="Running",
        selector="app=rook-ceph-mon",
        resource_count=3,
        timeout=600,
    )
    assert pod_obj.wait_for_resource(
        condition="Running",
        selector="app=rook-ceph-osd",
        resource_count=3,
        timeout=600,
    )
def test_monitoring_after_rebooting_node_where_mgr_is_running(self):
    """
    Test case to validate that rebooting the node where mgr is running
    does not delete the data collected on the prometheus pod
    """
    aws_obj = aws.AWS()

    # Get the mgr pod obj
    mgr_pod_obj = pod.get_mgr_pods()

    # Get the node where the mgr pod is hosted
    mgr_node_obj = pod.get_pod_node(mgr_pod_obj[0])

    # Reboot the node where the mgr pod is hosted
    instances = aws.get_instances_ids_and_names([mgr_node_obj])
    aws_obj.restart_ec2_instances(instances=instances, wait=True, force=True)

    # Validate all nodes are in READY state
    wait_for_nodes_status()

    # Check the nodes are in Ready state and the cluster health is OK
    self.sanity_helpers.health_check()

    # Check that the ceph health metrics are updated with the new mgr pod
    wait_to_update_mgrpod_info_prometheus_pod()

    # Check for the created pvc metrics after rebooting the node where mgr pod was running
    for pod_obj in self.pod_objs:
        assert check_pvcdata_collected_on_prometheus(pod_obj.pvc.name), (
            f"On prometheus pod for created pvc {pod_obj.pvc.name} related data is not collected"
        )
def test_node_maintenance(self, node_type, pvc_factory, pod_factory):
    """
    OCS-1269/OCS-1272:
    - Maintenance (mark as unschedulable and drain) 1 worker/master node
    - Check cluster functionality by creating resources
      (pools, storageclasses, PVCs, pods - both CephFS and RBD)
    - Mark the node as schedulable
    - Check cluster and Ceph health

    """
    # Get a list of 2 nodes. Pick one of them after checking
    # which one doesn't have the rook operator running on it
    typed_nodes = get_typed_nodes(node_type=node_type, num_of_nodes=2)
    typed_node_name = typed_nodes[0].name

    # Workaround for BZ 1778488 - https://github.com/red-hat-storage/ocs-ci/issues/1222
    rook_operator_pod = pod.get_operator_pods()[0]
    operator_node = pod.get_pod_node(rook_operator_pod)
    if operator_node.get().get('metadata').get('name') == typed_node_name:
        typed_node_name = typed_nodes[1].name
    # End of workaround for BZ 1778488

    # Maintenance the node (unschedule and drain)
    drain_nodes([typed_node_name])

    # Check basic cluster functionality by creating resources
    # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
    # run IO and delete the resources
    self.sanity_helpers.create_resources(pvc_factory, pod_factory)
    self.sanity_helpers.delete_resources()

    # Mark the node back to schedulable
    schedule_nodes([typed_node_name])

    # Perform cluster and Ceph health checks
    self.sanity_helpers.health_check()
def disrupt_plugin_provisioner_pods(self, node_list):
    """
    Set leader plugin-provisioner resources for disruption, skip if
    running on node from the node_list

    Args:
        node_list (list): list of node names to check

    Returns:
        list: list of Disruption objects

    """
    provisioner_resource = []
    for interface in [constants.CEPHBLOCKPOOL, constants.CEPHFILESYSTEM]:
        provisioner_pod = pod.get_plugin_provisioner_leader(interface=interface)
        node_name = pod.get_pod_node(provisioner_pod).name
        if node_name not in node_list:
            if interface == constants.CEPHBLOCKPOOL:
                provisioner_resource.append("rbdplugin_provisioner")
            else:
                provisioner_resource.append("cephfsplugin_provisioner")

    disruptor = []
    for resource in provisioner_resource:
        disruption = disruption_helpers.Disruptions()
        disruption.set_resource(resource=resource)
        disruptor.append(disruption)

    return disruptor
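# Illustrative usage sketch (not part of the original class): respin the leader
# provisioner pods selected above. It assumes the Disruptions objects expose a
# delete_resource() method that deletes (respins) the selected resource; adjust
# to the actual disruption_helpers API if it differs.
def respin_provisioners_not_on_nodes(self, node_list):
    """Respin leader plugin-provisioner pods that do not run on the given nodes."""
    disruptors = self.disrupt_plugin_provisioner_pods(node_list)
    for disruption in disruptors:
        # Delete (respin) the selected plugin-provisioner leader pod
        disruption.delete_resource()
    return disruptors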
def test_ceph_csidriver_runs_on_non_ocs_nodes(
    self, pvc_factory, pod_factory, add_nodes
):
    """
    1. Add non ocs nodes
    2. Taint new nodes with app label
    3. Check if plugin pods running on new nodes
    4. Create app-pods on app_nodes
    """

    # Add worker nodes and taint them as app nodes
    add_nodes(ocs_nodes=False, taint_label="nodetype=app:NoSchedule")

    # Check for new plugin pods respinning on the new app nodes
    app_nodes = [node.name for node in get_worker_nodes_not_in_ocs()]
    interfaces = [constants.CEPHFILESYSTEM, constants.CEPHBLOCKPOOL]
    logger.info("Checking for plugin pods on non-ocs worker nodes")
    for interface in interfaces:
        pod_objs = get_plugin_pods(interface)
        for pod_obj in pod_objs:
            node_obj = get_pod_node(pod_obj)
            try:
                if node_obj.name in app_nodes:
                    logger.info(
                        f"The plugin pod {pod_obj.name} is running on app_node {node_obj.name}"
                    )
                    continue
            except Exception as e:
                logger.info(f"Plugin pod was not found on {node_obj.name} - {e}")

    # Create app-pods on the app nodes
    for node in app_nodes:
        pvc_obj = pvc_factory()
        pod_factory(pvc=pvc_obj, node_name=node)
def finalizer():
    not_ready_nodes = [
        n
        for n in node.get_node_objs()
        if n.ocp.get_resource_status(n.name) == constants.NODE_NOT_READY
    ]
    logger.warning(
        f"Nodes in NotReady status found: {[n.name for n in not_ready_nodes]}"
    )
    if not_ready_nodes:
        nodes.restart_nodes(not_ready_nodes)
        node.wait_for_nodes_status()

    # Restart node if the osd stays at CLBO state
    osd_pods_obj_list = get_osd_pods()
    for pod in osd_pods_obj_list:
        if (
            pod.get().get("status").get("containerStatuses")[0].get("state")
            == constants.STATUS_CLBO
        ):
            node_obj = get_pod_node(pod)
            nodes.restart_nodes([node_obj])
            node.wait_for_nodes_status([node_obj.name])

    # Verify OSD encrypted
    if config.ENV_DATA.get("encryption_at_rest"):
        osd_encryption_verification()

    logger.info("Clear crash warnings and osd removal leftovers")
    clear_crash_warning_and_osd_removal_leftovers()
def test_rgw_host_node_failure(
    self, nodes, node_restart_teardown, mcg_obj, bucket_factory
):
    """
    Test case to fail the node hosting RGW and noobaa-db-0 and
    verify that a new pod spins up on a healthy node
    """
    # Get rgw pods
    rgw_pod_obj = get_rgw_pods()

    # Get noobaa pods
    noobaa_pod_obj = get_noobaa_pods()

    # Get the node where noobaa-db is hosted
    for noobaa_pod in noobaa_pod_obj:
        if noobaa_pod.name == "noobaa-db-0":
            noobaa_pod_node = get_pod_node(noobaa_pod)

    for rgw_pod in rgw_pod_obj:
        pod_node = rgw_pod.get().get("spec").get("nodeName")
        if pod_node == noobaa_pod_node.name:
            # Stop the node
            log.info(
                f"Stopping node {pod_node} where"
                f" rgw pod {rgw_pod.name} and noobaa-db-0 hosted"
            )
            node_obj = get_node_objs(node_names=[pod_node])
            nodes.stop_nodes(node_obj)

            # Validate old rgw pod went into Terminating state
            wait_for_resource_state(
                resource=rgw_pod, state=constants.STATUS_TERMINATING, timeout=720
            )

            # Validate new rgw pod spun up
            ocp_obj = OCP(
                kind=constants.POD, namespace=defaults.ROOK_CLUSTER_NAMESPACE
            )
            ocp_obj.wait_for_resource(
                condition=constants.STATUS_RUNNING,
                resource_count=len(rgw_pod_obj),
                selector=constants.RGW_APP_LABEL,
            )

            # Create OBC and read and write
            self.create_obc_creation(bucket_factory, mcg_obj, "Object-key-1")

            # Start the node
            nodes.start_nodes(node_obj)

            # Create OBC and read and write
            self.create_obc_creation(bucket_factory, mcg_obj, "Object-key-2")

    # Verify cluster health
    self.sanity_helpers.health_check()

    # Verify all storage pods are running
    wait_for_storage_pods()
def get_osd_running_nodes():
    """
    Gets the osd running node names

    Returns:
        list: OSD node names

    """
    return [pod.get_pod_node(osd_node).name for osd_node in pod.get_osd_pods()]
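# Illustrative sketch (not part of the original helpers): verify that the OSD
# pods are spread across distinct nodes. The default expected count of 3 is an
# assumption for a default 3-OSD cluster and should be adjusted per deployment.
def verify_osds_spread_across_nodes(expected_node_count=3):
    """Return True if OSD pods run on at least `expected_node_count` distinct nodes."""
    distinct_osd_nodes = set(get_osd_running_nodes())
    log.info(f"OSDs are running on nodes: {sorted(distinct_osd_nodes)}")
    return len(distinct_osd_nodes) >= expected_node_count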
def test_automated_recovery_from_failed_nodes_IPI_proactive(
    self, pvc_factory, pod_factory
):
    """
    Knip-678 Automated recovery from failed nodes
    Proactive case - IPI
    """
    # Get the osd associated node name
    osd_pods_obj = pod.get_osd_pods()
    osd_node_name = pod.get_pod_node(random.choice(osd_pods_obj)).name
    log.info(f"Selected OSD is {osd_node_name}")

    # Get the machine name using the node name
    machine_name = machine.get_machine_from_node_name(osd_node_name)
    log.info(f"{osd_node_name} associated machine is {machine_name}")

    # Get the machineset name using machine name
    machineset_name = machine.get_machineset_from_machine_name(machine_name)
    log.info(f"{osd_node_name} associated machineset is {machineset_name}")

    # Add a new node and label it
    add_new_node_and_label_it(machineset_name)

    # Delete the machine
    machine.delete_machine(machine_name)
    log.info(f"Successfully deleted machine {machine_name}")

    # Check that the pods are in Running state
    all_pod_obj = pod.get_all_pods(wait=True)
    for pod_obj in all_pod_obj:
        if (
            '-1-deploy' not in pod_obj.name
            and 'ocs-deviceset' not in pod_obj.name
        ):
            try:
                wait_for_resource_state(
                    resource=pod_obj, state=constants.STATUS_RUNNING, timeout=200
                )
            # 'rook-ceph-crashcollector' on the failed node gets stuck in Pending
            # state. BZ 1810014 tracks it.
            # Ignoring 'rook-ceph-crashcollector' pod health check as a workaround
            # and deleting its deployment so that the pod disappears.
            # Will revert this WA once the BZ is fixed
            except ResourceWrongStatusException:
                if 'rook-ceph-crashcollector' in pod_obj.name:
                    ocp_obj = ocp.OCP()
                    name = pod_obj.name[:-17]
                    command = f"delete deployment {name}"
                    ocp_obj.exec_oc_cmd(command=command)
                    log.info(f"Deleted deployment for pod {pod_obj.name}")

    # Check basic cluster functionality by creating resources
    # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
    # run IO and delete the resources
    self.sanity_helpers.create_resources(pvc_factory, pod_factory)
    self.sanity_helpers.delete_resources()

    # Perform cluster and Ceph health checks
    self.sanity_helpers.health_check()
def get_osds_per_node():
    """
    Gets the osd running pod names per node name

    Returns:
        dict: {"Node name": ["osd running pod name running on the node", ...]}

    """
    dic_node_osd = defaultdict(list)
    for osd_pod in pod.get_osd_pods():
        dic_node_osd[pod.get_pod_node(osd_pod).name].append(osd_pod.name)

    return dic_node_osd
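# Illustrative sketch (not part of the original helpers): pick the node that
# currently hosts the most OSD pods, e.g. as a candidate for a node-failure
# test. Tie-breaking between nodes with equal OSD counts is arbitrary.
def get_node_with_most_osds():
    """Return (node_name, osd_pod_names) for the node hosting the most OSD pods."""
    osds_per_node = get_osds_per_node()
    node_name, osd_pods = max(osds_per_node.items(), key=lambda item: len(item[1]))
    log.info(f"Node {node_name} hosts {len(osd_pods)} OSD pods: {osd_pods}")
    return node_name, osd_pods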
def get_pgbench_running_nodes(self):
    """
    Get the nodes that contain pgbench pods

    Returns:
        list: List of pgbench running nodes

    """
    pgbench_nodes = [
        get_pod_node(pgbench_pod).name for pgbench_pod in self.get_pgbench_pods()
    ]
    return list(set(pgbench_nodes))
def select_osd_node_name():
    """
    Select randomly one of the osd nodes

    Returns:
        str: the selected osd node name

    """
    osd_pods_obj = pod.get_osd_pods()
    osd_node_name = pod.get_pod_node(random.choice(osd_pods_obj)).name
    log.info(f"Selected OSD is {osd_node_name}")
    return osd_node_name
def get_app_pod_running_nodes(pod_obj):
    """
    Gets the app pod running node names

    Args:
        pod_obj (list): List of app pod objects

    Returns:
        list: App pod running node names

    """
    return [pod.get_pod_node(obj_pod).name for obj_pod in pod_obj]
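# Illustrative sketch (not part of the original helpers): find nodes that host
# both an app pod and an OSD pod, reusing get_app_pod_running_nodes() and
# get_osd_running_nodes() defined above. Useful when a test needs app pods and
# OSDs on disjoint node sets.
def get_app_pods_colocated_with_osds(app_pod_objs):
    """Return the sorted node names hosting both an app pod and an OSD pod."""
    app_nodes = set(get_app_pod_running_nodes(app_pod_objs))
    osd_nodes = set(get_osd_running_nodes())
    return sorted(app_nodes & osd_nodes)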
def test_monitoring_after_rebooting_node_where_mgr_is_running(self, nodes, pods):
    """
    Test case to validate that rebooting the node where mgr is running
    does not delete the data collected on the prometheus pod
    """
    # Get the mgr pod obj
    mgr_pod_obj = pod.get_mgr_pods()

    # Get the node where the mgr pod is hosted
    mgr_node_obj = pod.get_pod_node(mgr_pod_obj[0])

    # Reboot the node where the mgr pod is hosted
    nodes.restart_nodes([mgr_node_obj])

    # Validate all nodes are in READY state
    retry(
        (CommandFailed, ResourceWrongStatusException), tries=20, delay=15
    )(wait_for_nodes_status)()

    # Check for Ceph pods
    pod_obj = ocp.OCP(kind=constants.POD, namespace=defaults.ROOK_CLUSTER_NAMESPACE)
    assert pod_obj.wait_for_resource(
        condition="Running", selector="app=rook-ceph-mgr", timeout=600
    )
    assert pod_obj.wait_for_resource(
        condition="Running",
        selector="app=rook-ceph-mon",
        resource_count=3,
        timeout=600,
    )
    assert pod_obj.wait_for_resource(
        condition="Running",
        selector="app=rook-ceph-osd",
        resource_count=3,
        timeout=600,
    )

    # Check the nodes are in Ready state and the cluster health is OK
    self.sanity_helpers.health_check(tries=40)

    # Check that the ceph health metrics are updated with the new mgr pod
    wait_to_update_mgrpod_info_prometheus_pod()

    # Check for the created pvc metrics after rebooting the node where mgr pod was running
    for pod_obj in pods:
        assert check_pvcdata_collected_on_prometheus(
            pod_obj.pvc.name
        ), f"On prometheus pod for created pvc {pod_obj.pvc.name} related data is not collected"
def get_ocs_operator_node_name():
    """
    Get the name of the node running the ocs-operator pod

    Returns:
        str: the name of the node running the ocs-operator pod

    """
    ocs_operator_pod = get_ocs_operator_pod()
    log.debug(f"ocs operator pod info: {ocs_operator_pod}")
    ocs_operator_node = get_pod_node(ocs_operator_pod)
    return get_node_name(ocs_operator_node)
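# Illustrative sketch (not part of the original helpers): drain the node that
# runs the ocs-operator pod and then mark it schedulable again. It reuses
# get_ocs_operator_node_name() above together with drain_nodes() and
# schedule_nodes() as shown elsewhere in these snippets.
def drain_and_reschedule_ocs_operator_node():
    """Drain the ocs-operator node, then mark it schedulable again."""
    operator_node_name = get_ocs_operator_node_name()
    log.info(f"Draining ocs-operator node: {operator_node_name}")
    drain_nodes([operator_node_name])
    schedule_nodes([operator_node_name])
    return operator_node_name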
def node_replacement_verification_steps_ceph_side(old_node_name, new_node_name):
    """
    Check the verification steps from the Ceph side, after the process of
    node replacement as described in the docs

    Args:
        old_node_name (str): The name of the old node that has been deleted
        new_node_name (str): The name of the new node that has been created

    Returns:
        bool: True if all the verification steps passed. False otherwise

    """
    if old_node_name == new_node_name:
        log.warning("Hostname didn't change")
        return False

    wait_for_nodes_status([new_node_name])
    # It can take some time until all the ocs pods are up and running
    # after the process of node replacement
    if not pod.wait_for_pods_to_be_running():
        log.warning("Not all the pods in running state")
        return False

    ct_pod = pod.get_ceph_tools_pod()
    ceph_osd_status = ct_pod.exec_ceph_cmd(ceph_cmd="ceph osd status")
    if new_node_name not in ceph_osd_status:
        log.warning("new node name not found in 'ceph osd status' output")
        return False
    if old_node_name in ceph_osd_status:
        log.warning("old node name found in 'ceph osd status' output")
        return False

    osd_pods_obj = pod.get_osd_pods()
    osd_node_names = [pod.get_pod_node(p).name for p in osd_pods_obj]
    if new_node_name not in osd_node_names:
        log.warning("the new hostname not found in osd node names")
        return False
    if old_node_name in osd_node_names:
        log.warning("the old hostname found in osd node names")
        return False

    from ocs_ci.ocs.cluster import check_ceph_osd_tree_after_node_replacement

    if not check_ceph_osd_tree_after_node_replacement():
        return False

    log.info("Verification steps from the ceph side finished successfully")
    return True
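# Illustrative sketch (not part of the original helpers): poll the Ceph-side
# verification above until it passes or a timeout expires. The 20-second poll
# interval and 900-second default timeout are arbitrary assumptions.
def wait_for_ceph_side_node_replacement(old_node_name, new_node_name, timeout=900):
    """Return True once the Ceph-side verification passes, False on timeout."""
    import time

    end_time = time.time() + timeout
    while time.time() < end_time:
        if node_replacement_verification_steps_ceph_side(old_node_name, new_node_name):
            return True
        log.info("Ceph-side verification not complete yet, retrying in 20 seconds")
        time.sleep(20)
    return False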
def test_nodereplacement_proactive(self, pvc_factory, pod_factory, dc_pod_factory):
    """
    Knip-894 Node Replacement proactive
    """

    # Get worker nodes
    worker_node_list = get_worker_nodes()
    log.info(f"Current available worker nodes are {worker_node_list}")

    osd_pods_obj = pod.get_osd_pods()
    osd_node_name = pod.get_pod_node(random.choice(osd_pods_obj)).name
    log.info(f"Selected OSD is {osd_node_name}")

    log.info("Creating dc pod backed with rbd pvc and running io in bg")
    for worker_node in worker_node_list:
        if worker_node != osd_node_name:
            rbd_dc_pod = dc_pod_factory(
                interface=constants.CEPHBLOCKPOOL, node_name=worker_node, size=20
            )
            pod.run_io_in_bg(rbd_dc_pod, expect_to_fail=False, fedora_dc=True)

    log.info("Creating dc pod backed with cephfs pvc and running io in bg")
    for worker_node in worker_node_list:
        if worker_node != osd_node_name:
            cephfs_dc_pod = dc_pod_factory(
                interface=constants.CEPHFILESYSTEM, node_name=worker_node, size=20
            )
            pod.run_io_in_bg(cephfs_dc_pod, expect_to_fail=False, fedora_dc=True)

    if config.ENV_DATA['platform'].lower() == constants.AWS_PLATFORM:
        if config.ENV_DATA['deployment_type'] == 'ipi':
            node.delete_and_create_osd_node_aws_ipi(osd_node_name)

        elif config.ENV_DATA['deployment_type'] == 'upi':
            node.delete_and_create_osd_node_aws_upi(osd_node_name)
        else:
            pytest.fail(
                f"ocs-ci config 'deployment_type' value "
                f"'{config.ENV_DATA['deployment_type']}' is not valid, "
                f"results of this test run are all invalid."
            )
    elif config.ENV_DATA['platform'].lower() == constants.VSPHERE_PLATFORM:
        pytest.skip(
            "Skipping add node in Vmware platform due to "
            "https://bugzilla.redhat.com/show_bug.cgi?id=1844521"
        )

    # Creating Resources
    log.info("Creating Resources using sanity helpers")
    self.sanity_helpers.create_resources(pvc_factory, pod_factory)
    # Deleting Resources
    self.sanity_helpers.delete_resources()
    # Verify everything is running fine
    log.info("Verifying all resources are Running and match the expected result")
    self.sanity_helpers.health_check(tries=30)
def get_node_pods(node_name, pods_to_search=None):
    """
    Get all the pods of a specified node

    Args:
        node_name (str): The node name to get the pods
        pods_to_search (list): list of pods to search for the node pods.
            If not specified, will search in all the pods.

    Returns:
        list: list of all the pods of the specified node

    """
    pods_to_search = pods_to_search or pod.get_all_pods()
    return [p for p in pods_to_search if pod.get_pod_node(p).name == node_name]
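# Illustrative sketch (not part of the original helpers): group an arbitrary
# pod list by its hosting node, built on the same pod.get_pod_node() call used
# by get_node_pods() above.
def get_pods_per_node(pods_to_search=None):
    """Return a dict mapping node name -> list of pod names running on that node."""
    from collections import defaultdict

    pods_to_search = pods_to_search or pod.get_all_pods()
    pods_per_node = defaultdict(list)
    for p in pods_to_search:
        pods_per_node[pod.get_pod_node(p).name].append(p.name)
    return dict(pods_per_node)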
def restart_ocs_operator_node(self):
    """
    Restart node that runs OCS operator pod
    """
    pod_obj = pod.get_ocs_operator_pod()
    node_obj = pod.get_pod_node(pod_obj)

    self.nodes.restart_nodes([node_obj])

    wait_for_nodes_status()

    pod.wait_for_pods_to_be_running(
        namespace=constants.OPENSHIFT_STORAGE_NAMESPACE, pod_names=[pod_obj.name]
    )
def test_monitoring_shutdown_and_recovery_prometheus_node(self, nodes, pods):
    """
    Test case to validate that shutdown and recovery of a node where
    monitoring pods are running has no functional impact
    """
    # Get all prometheus pods
    prometheus_pod_obj_list = pod.get_all_pods(
        namespace=defaults.OCS_MONITORING_NAMESPACE, selector=['prometheus']
    )

    for prometheus_pod_obj in prometheus_pod_obj_list:
        # Get the node where the prometheus pod is hosted
        prometheus_node_obj = pod.get_pod_node(prometheus_pod_obj)

        # Shutdown and recover the node (i.e. restart it) where the prometheus pod is hosted
        nodes.stop_nodes([prometheus_node_obj])

        waiting_time = 20
        log.info(f"Waiting for {waiting_time} seconds")
        time.sleep(waiting_time)

        nodes.start_nodes(nodes=[prometheus_node_obj])

    # Validate all nodes are in READY state
    retry(
        (CommandFailed, ResourceWrongStatusException), tries=20, delay=15
    )(wait_for_nodes_status)()

    # Check all the prometheus pods are up
    for pod_obj in prometheus_pod_obj_list:
        wait_for_resource_state(
            resource=pod_obj, state=constants.STATUS_RUNNING, timeout=180
        )

    # Check the nodes are in Ready state and the cluster health is OK
    self.sanity_helpers.health_check(tries=40)

    # Check for the created pvc metrics after shutdown and recovery of prometheus nodes
    for pod_obj in pods:
        assert check_pvcdata_collected_on_prometheus(pod_obj.pvc.name), (
            f"On prometheus pod for created pvc {pod_obj.pvc.name} related data is not collected"
        )
def test_automated_recovery_from_failed_nodes_IPI_proactive(
    self, pvc_factory, pod_factory
):
    """
    Knip-678 Automated recovery from failed nodes
    Proactive case - IPI
    """
    # Get the osd associated node name
    osd_pods_obj = pod.get_osd_pods()
    osd_node_name = pod.get_pod_node(random.choice(osd_pods_obj)).name
    log.info(f"Selected OSD is {osd_node_name}")

    # Get the machine name using the node name
    machine_name = machine.get_machine_from_node_name(osd_node_name)
    log.info(f"{osd_node_name} associated machine is {machine_name}")

    # Get the machineset name using machine name
    machineset_name = machine.get_machineset_from_machine_name(machine_name)
    log.info(f"{osd_node_name} associated machineset is {machineset_name}")

    # Add a new node and label it
    add_new_node_and_label_it(machineset_name)

    # Delete the machine
    machine.delete_machine(machine_name)
    log.info(f"Successfully deleted machine {machine_name}")

    # Check that the pods are in Running state
    all_pod_obj = pod.get_all_pods(wait=True)
    for pod_obj in all_pod_obj:
        wait_for_resource_state(
            resource=pod_obj, state=constants.STATUS_RUNNING, timeout=200
        )

    # Check basic cluster functionality by creating resources
    # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
    # run IO and delete the resources
    self.sanity_helpers.create_resources(pvc_factory, pod_factory)
    self.sanity_helpers.delete_resources()

    # Perform cluster and Ceph health checks
    self.sanity_helpers.health_check()
def test_node_maintenance_restart_activate(
    self, nodes, pvc_factory, pod_factory, node_type
):
    """
    OCS-1292/OCS-1293:
    - Maintenance (mark as unschedulable and drain) 1 worker/master node
    - Restart the node
    - Mark the node as schedulable
    - Check cluster and Ceph health
    - Check cluster functionality by creating and deleting resources
      (pools, storageclasses, PVCs, pods - both CephFS and RBD)

    """
    # Get a list of 2 nodes. Pick one of them after checking
    # which one doesn't have the rook operator running on it
    typed_nodes = get_typed_nodes(node_type=node_type, num_of_nodes=2)
    assert typed_nodes, f"Failed to find a {node_type} node for the test"
    typed_node_name = typed_nodes[0].name

    # Workaround for BZ 1778488 - https://github.com/red-hat-storage/ocs-ci/issues/1222
    rook_operator_pod = pod.get_operator_pods()[0]
    operator_node = pod.get_pod_node(rook_operator_pod)
    if operator_node.get().get('metadata').get('name') == typed_node_name:
        typed_node_name = typed_nodes[1].name
    # End of workaround for BZ 1778488

    # Maintenance the node (unschedule and drain). The function contains logging
    drain_nodes([typed_node_name])

    # Restarting the node
    nodes.restart_nodes(nodes=typed_nodes, wait=True)

    wait_for_nodes_status(
        node_names=[typed_node_name],
        status=constants.NODE_READY_SCHEDULING_DISABLED,
    )

    # Mark the node back to schedulable
    schedule_nodes([typed_node_name])

    # Check cluster and Ceph health and checking basic cluster
    # functionality by creating resources (pools, storageclasses,
    # PVCs, pods - both CephFS and RBD), run IO and delete the resources
    self.sanity_helpers.health_check()
    self.sanity_helpers.create_resources(pvc_factory, pod_factory)
    self.sanity_helpers.delete_resources()
def test_monitoring_when_one_of_the_prometheus_node_down(self, nodes, pods):
    """
    Test case to validate that prometheus metrics are still collected
    when a node hosting a prometheus pod goes down
    """
    # Get all prometheus pods
    pod_obj_list = pod.get_all_pods(
        namespace=defaults.OCS_MONITORING_NAMESPACE, selector=['prometheus']
    )

    for pod_obj in pod_obj_list:
        # Get the node where the prometheus pod is hosted
        pod_node_obj = pod.get_pod_node(pod_obj)

        # Bring down the node where the prometheus pod is hosted
        nodes.restart_nodes([pod_node_obj])

        # Validate all nodes are in READY state
        retry(
            (CommandFailed, ResourceWrongStatusException), tries=20, delay=15
        )(wait_for_nodes_status)()

    # Check the nodes are in Ready state and the cluster health is OK
    self.sanity_helpers.health_check(tries=40)

    # Check all the prometheus pods are up
    for pod_obj in pod_obj_list:
        wait_for_resource_state(
            resource=pod_obj, state=constants.STATUS_RUNNING, timeout=180
        )

    # Check for the created pvc metrics after restarting the node where the prometheus pod is hosted
    for pod_obj in pods:
        assert check_pvcdata_collected_on_prometheus(pod_obj.pvc.name), (
            f"On prometheus pod for created pvc {pod_obj.pvc.name} related data is not collected"
        )
        log.info(
            f"On prometheus pod for created pvc {pod_obj.pvc.name} related data is collected"
        )
def finalizer():
    not_ready_nodes = [
        n
        for n in node.get_node_objs()
        if n.ocp.get_resource_status(n.name) == constants.NODE_NOT_READY
    ]
    logger.warning(
        f"Nodes in NotReady status found: {[n.name for n in not_ready_nodes]}"
    )
    if not_ready_nodes:
        nodes.restart_nodes(not_ready_nodes)
        node.wait_for_nodes_status()

    # Restart node if the osd stays at CLBO state
    osd_pods_obj_list = get_osd_pods()
    for pod in osd_pods_obj_list:
        if (
            pod.get().get("status").get("containerStatuses")[0].get("state")
            == constants.STATUS_CLBO
        ):
            node_obj = get_pod_node(pod)
            nodes.restart_nodes([node_obj])
            node.wait_for_nodes_status([node_obj.name])
def setup(self, interface, pvc_factory, service_account_factory, teardown_factory):
    """
    Create dc pod with replica 5
    """
    self.replica_count = 5
    pvc_obj = pvc_factory(interface=interface, size=3)
    sa_obj = service_account_factory(project=pvc_obj.project)
    try:
        pod1 = create_pod(
            interface_type=interface,
            pvc_name=pvc_obj.name,
            namespace=pvc_obj.namespace,
            sa_name=sa_obj.name,
            dc_deployment=True,
            replica_count=self.replica_count,
            deploy_pod_status=constants.STATUS_RUNNING,
        )
    except TimeoutExpiredError:
        # The test cannot be continued if all the pods are created on the same node
        pods = pod.get_all_pods(namespace=pvc_obj.namespace)
        pod_nodes = [pod.get_pod_node(pod_obj).name for pod_obj in pods]
        if len(set(pod_nodes)) == 1:
            pytest.skip(
                "All pods are created on same node and reached Running state"
            )
        raise

    self.name = pod1.labels["name"]
    self.namespace = pod1.namespace

    dc_obj = OCP(
        kind=constants.DEPLOYMENTCONFIG,
        namespace=self.namespace,
        resource_name=self.name,
    )
    dc_info = dc_obj.get(resource_name=self.name, selector=f"app={self.name}")[
        "items"
    ][0]

    dc_obj = OCS(**dc_info)
    teardown_factory(dc_obj)
def test_delete_local_volume_sym_link(self):
    """
    Delete sym link on LSO Cluster
    """
    # Get rook-ceph-crashcollector pod objects
    crashcollector_pods = get_pod_name_by_pattern(
        pattern="rook-ceph-crashcollector", namespace=ROOK_CLUSTER_NAMESPACE
    )
    crashcollector_pods_objs = []
    for crashcollector_pod in crashcollector_pods:
        crashcollector_pods_objs.append(
            get_pod_obj(name=crashcollector_pod, namespace=ROOK_CLUSTER_NAMESPACE)
        )

    # Get Node object
    node_obj = get_pod_node(pod_obj=crashcollector_pods_objs[0])

    # Get Sym link
    osd_pvcs = get_deviceset_pvcs()
    pv_name = osd_pvcs[0].data["spec"]["volumeName"]
    ocp_obj = ocp.OCP(namespace=ROOK_CLUSTER_NAMESPACE, kind=constants.PV)
    pv_obj = ocp_obj.get(resource_name=pv_name)
    path = pv_obj["spec"]["local"]["path"]

    log.info("Delete sym link")
    oc_cmd = ocp.OCP(namespace=ROOK_CLUSTER_NAMESPACE)
    cmd = f"rm -rfv {path}"
    oc_cmd.exec_oc_debug_cmd(node=node_obj.name, cmd_list=[cmd])

    log.info("Waiting for rook-ceph-crashcollector pods to reach Running state")
    for crashcollector_pods_obj in crashcollector_pods_objs:
        wait_for_resource_state(
            resource=crashcollector_pods_obj, state=constants.STATUS_RUNNING
        )

    # Check all OCS pods status, they should be in Running or Completed state
    wait_for_storage_pods()

    # Check ceph status
    ceph_health_check(namespace=config.ENV_DATA["cluster_namespace"])
def measure_stop_worker_nodes(request, measurement_dir, nodes):
    """
    Stop worker nodes that don't contain RGW (so that alerts are triggered
    correctly), measure the time when they were stopped and monitor alerts
    that were triggered during this event.

    Returns:
        dict: Contains information about `start` and `stop` time for stopping
            worker node

    """
    mgr_pod = pod.get_mgr_pods()[0]
    mgr_node = pod.get_pod_node(mgr_pod)
    test_nodes = [
        worker_node
        for worker_node in get_nodes(node_type=constants.WORKER_MACHINE)
        if worker_node.name != mgr_node.name
    ]

    def stop_nodes():
        """
        Turn off test nodes for 5 minutes.

        Returns:
            list: Names of nodes that were turned down

        """
        # run_time of operation
        run_time = 60 * 5
        nonlocal test_nodes
        node_names = [node.name for node in test_nodes]
        logger.info(f"Turning off nodes {node_names}")
        nodes.stop_nodes(nodes=test_nodes)
        # Validate node reached NotReady state
        wait_for_nodes_status(node_names=node_names, status=constants.NODE_NOT_READY)

        logger.info(f"Waiting for {run_time} seconds")
        time.sleep(run_time)
        return node_names

    def finalizer():
        nodes.restart_nodes_by_stop_and_start_teardown()
        assert ceph_health_check(), "Ceph cluster health is not OK"
        logger.info("Ceph cluster health is OK")

    request.addfinalizer(finalizer)

    test_file = os.path.join(measurement_dir, "measure_stop_nodes.json")
    if config.ENV_DATA["platform"].lower() in constants.MANAGED_SERVICE_PLATFORMS:
        # It seems that it takes longer to propagate incidents to PagerDuty.
        # Adding 3 extra minutes
        measured_op = measure_operation(stop_nodes, test_file, minimal_time=60 * 8)
    else:
        measured_op = measure_operation(stop_nodes, test_file)

    logger.info("Turning on nodes")
    try:
        nodes.start_nodes(nodes=test_nodes)
    except CommandFailed:
        logger.warning(
            "Nodes were not found: they were probably recreated. Check ceph health below"
        )

    # Validate all nodes are in READY state and up
    retry(
        (
            CommandFailed,
            ResourceWrongStatusException,
        ),
        tries=60,
        delay=15,
    )(wait_for_nodes_status)(timeout=900)

    # wait for ceph to return into HEALTH_OK state after mgr deployment
    # is returned back to normal
    ceph_health_check(tries=20, delay=15)

    return measured_op
def test_recovery_from_volume_deletion(self, nodes, pvc_factory, pod_factory):
    """
    Test cluster recovery from disk deletion from the platform side.
    Based on documented procedure detailed in
    https://bugzilla.redhat.com/show_bug.cgi?id=1823183

    """
    logger.info("Picking a PV which will be deleted from the platform side")
    osd_pvs = get_deviceset_pvs()
    osd_pv = random.choice(osd_pvs)
    osd_pv_name = osd_pv.name
    # get the claim name
    logger.info(f"Getting the claim name for OSD PV {osd_pv_name}")
    claim_name = osd_pv.get().get("spec").get("claimRef").get("name")

    # Get the backing volume name
    logger.info(f"Getting the backing volume name for PV {osd_pv_name}")
    backing_volume = nodes.get_data_volumes(pvs=[osd_pv])[0]

    # Get the corresponding PVC
    logger.info(f"Getting the corresponding PVC of PV {osd_pv_name}")
    osd_pvcs = get_deviceset_pvcs()
    osd_pvcs_count = len(osd_pvcs)
    osd_pvc = [
        ds for ds in osd_pvcs if ds.get().get("metadata").get("name") == claim_name
    ][0]

    # Get the corresponding OSD pod and ID
    logger.info(f"Getting the OSD pod using PVC {osd_pvc.name}")
    osd_pods = get_osd_pods()
    osd_pods_count = len(osd_pods)
    osd_pod = [
        osd_pod
        for osd_pod in osd_pods
        if osd_pod.get().get("metadata").get("labels").get(
            constants.CEPH_ROOK_IO_PVC_LABEL
        )
        == claim_name
    ][0]
    logger.info(f"OSD_POD {osd_pod.name}")
    osd_id = osd_pod.get().get("metadata").get("labels").get("ceph-osd-id")

    # Get the node that has the OSD pod running on it
    logger.info(f"Getting the node that has the OSD pod {osd_pod.name} running on it")
    osd_node = get_pod_node(osd_pod)
    osd_prepare_pods = get_osd_prepare_pods()
    osd_prepare_pod = [
        pod
        for pod in osd_prepare_pods
        if pod.get().get("metadata").get("labels").get(
            constants.CEPH_ROOK_IO_PVC_LABEL
        )
        == claim_name
    ][0]
    osd_prepare_job_name = (
        osd_prepare_pod.get().get("metadata").get("labels").get("job-name")
    )
    osd_prepare_job = get_job_obj(osd_prepare_job_name)

    # Get the corresponding OSD deployment
    logger.info(f"Getting the OSD deployment for OSD PVC {claim_name}")
    osd_deployment = [
        osd_pod
        for osd_pod in get_osd_deployments()
        if osd_pod.get().get("metadata").get("labels").get(
            constants.CEPH_ROOK_IO_PVC_LABEL
        )
        == claim_name
    ][0]
    osd_deployment_name = osd_deployment.name

    # Delete the volume from the platform side
    logger.info(f"Deleting {backing_volume} from the platform side")
    nodes.detach_volume(backing_volume, osd_node)

    # Scale down OSD deployment
    logger.info(f"Scaling down OSD deployment {osd_deployment_name} to 0")
    ocp.OCP().exec_oc_cmd(f"scale --replicas=0 deployment/{osd_deployment_name}")

    # Force delete OSD pod if necessary
    osd_pod_name = osd_pod.name
    logger.info(f"Waiting for OSD pod {osd_pod.name} to get deleted")
    try:
        osd_pod.ocp.wait_for_delete(resource_name=osd_pod_name)
    except TimeoutError:
        osd_pod.delete(force=True)
        osd_pod.ocp.wait_for_delete(resource_name=osd_pod_name)

    # Run ocs-osd-removal job
    ocp_version = float(get_ocp_version())
    if ocp_version >= 4.6:
        cmd = f"process ocs-osd-removal -p FAILED_OSD_IDS={osd_id} -o yaml"
    else:
        cmd = f"process ocs-osd-removal -p FAILED_OSD_ID={osd_id} -o yaml"

    logger.info(f"Executing OSD removal job on OSD-{osd_id}")
    ocp_obj = ocp.OCP(namespace=config.ENV_DATA["cluster_namespace"])
    osd_removal_job_yaml = ocp_obj.exec_oc_cmd(cmd)
    osd_removal_job = OCS(**osd_removal_job_yaml)
    osd_removal_job.create(do_reload=False)

    # Get ocs-osd-removal pod name
    logger.info("Getting the ocs-osd-removal pod name")
    osd_removal_pod_name = get_osd_removal_pod_name(osd_id)
    osd_removal_pod_obj = get_pod_obj(
        osd_removal_pod_name, namespace="openshift-storage"
    )
    osd_removal_pod_obj.ocp.wait_for_resource(
        condition=constants.STATUS_COMPLETED, resource_name=osd_removal_pod_name
    )

    # Verify OSD removal from the ocs-osd-removal pod logs
    logger.info(f"Verifying removal of OSD from {osd_removal_pod_name} pod logs")
    logs = get_pod_logs(osd_removal_pod_name)
    pattern = f"purged osd.{osd_id}"
    assert re.search(pattern, logs)

    osd_pvc_name = osd_pvc.name

    if ocp_version < 4.6:
        # Delete the OSD prepare job
        logger.info(f"Deleting OSD prepare job {osd_prepare_job_name}")
        osd_prepare_job.delete()
        osd_prepare_job.ocp.wait_for_delete(
            resource_name=osd_prepare_job_name, timeout=120
        )

        # Delete the OSD PVC
        logger.info(f"Deleting OSD PVC {osd_pvc_name}")
        osd_pvc.delete()
        osd_pvc.ocp.wait_for_delete(resource_name=osd_pvc_name)

        # Delete the OSD deployment
        logger.info(f"Deleting OSD deployment {osd_deployment_name}")
        osd_deployment.delete()
        osd_deployment.ocp.wait_for_delete(
            resource_name=osd_deployment_name, timeout=120
        )
    else:
        # If ocp version is '4.6' and above the osd removal job should
        # delete the OSD prepare job, OSD PVC, OSD deployment
        logger.info(f"Verifying deletion of OSD prepare job {osd_prepare_job_name}")
        osd_prepare_job.ocp.wait_for_delete(
            resource_name=osd_prepare_job_name, timeout=30
        )
        logger.info(f"Verifying deletion of OSD PVC {osd_pvc_name}")
        osd_pvc.ocp.wait_for_delete(resource_name=osd_pvc_name, timeout=30)
        logger.info(f"Verifying deletion of OSD deployment {osd_deployment_name}")
        osd_deployment.ocp.wait_for_delete(
            resource_name=osd_deployment_name, timeout=30
        )

    # Delete PV
    logger.info(f"Verifying deletion of PV {osd_pv_name}")
    try:
        osd_pv.ocp.wait_for_delete(resource_name=osd_pv_name)
    except TimeoutError:
        osd_pv.delete()
        osd_pv.ocp.wait_for_delete(resource_name=osd_pv_name)

    if ocp_version < 4.6:
        # Delete the rook ceph operator pod to trigger reconciliation
        rook_operator_pod = get_operator_pods()[0]
        logger.info(f"deleting Rook Ceph operator pod {rook_operator_pod.name}")
        rook_operator_pod.delete()

    # Delete the OSD removal job
    logger.info(f"Deleting OSD removal job ocs-osd-removal-{osd_id}")
    osd_removal_job = get_job_obj(f"ocs-osd-removal-{osd_id}")
    osd_removal_job.delete()
    osd_removal_job.ocp.wait_for_delete(resource_name=f"ocs-osd-removal-{osd_id}")

    timeout = 600
    # Wait for OSD PVC to get created and reach Bound state
    logger.info("Waiting for a new OSD PVC to get created and reach Bound state")
    assert osd_pvc.ocp.wait_for_resource(
        timeout=timeout,
        condition=constants.STATUS_BOUND,
        selector=constants.OSD_PVC_GENERIC_LABEL,
        resource_count=osd_pvcs_count,
    ), (
        f"Cluster recovery failed after {timeout} seconds. "
        f"Expected to have {osd_pvcs_count} OSD PVCs in status Bound. Current OSD PVCs status: "
        f"{[pvc.ocp.get_resource(pvc.get().get('metadata').get('name'), 'STATUS') for pvc in get_deviceset_pvcs()]}"
    )
    # Wait for OSD pod to get created and reach Running state
    logger.info("Waiting for a new OSD pod to get created and reach Running state")
    assert osd_pod.ocp.wait_for_resource(
        timeout=timeout,
        condition=constants.STATUS_RUNNING,
        selector=constants.OSD_APP_LABEL,
        resource_count=osd_pods_count,
    ), (
        f"Cluster recovery failed after {timeout} seconds. "
        f"Expected to have {osd_pods_count} OSD pods in status Running. Current OSD pods status: "
        f"{[osd_pod.ocp.get_resource(pod.get().get('metadata').get('name'), 'STATUS') for pod in get_osd_pods()]}"
    )

    # We need to silence the old osd crash warning due to BZ https://bugzilla.redhat.com/show_bug.cgi?id=1896810
    # This is a workaround - issue for tracking: https://github.com/red-hat-storage/ocs-ci/issues/3438
    if ocp_version >= 4.6:
        silence_osd_crash = cluster.wait_for_silence_ceph_osd_crash_warning(
            osd_pod_name
        )
        if not silence_osd_crash:
            logger.info("Didn't find ceph osd crash warning")

    # Validate cluster is still functional
    self.sanity_helpers.health_check(tries=100)
    self.sanity_helpers.create_resources(pvc_factory, pod_factory)