def kill_resource_repeatedly(self, resource_name, resource_id, max_iterations=30):
    """
    Kill the given resource repeatedly until the new OSD pods reach status Running,
    or until the maximum number of iterations is exhausted.

    Args:
        resource_name (str): the name of the resource to kill
        resource_id (int): the id of the resource to kill
        max_iterations (int): maximum number of iterations in which to delete the given resource

    """
    d = Disruptions()
    for i in range(max_iterations):
        logging.info(
            f"iteration {i}: Delete resource {resource_name} with id {resource_id}"
        )
        d.set_resource(resource_name)
        d.delete_resource(resource_id)
        if self.new_pods_in_status_running:
            logging.info("New osd pods reached status running")
            break

    if not self.new_pods_in_status_running:
        logging.warning(
            f"New osd pods didn't reach status running after {max_iterations} iterations"
        )
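# For context: the loop above only exits early when another thread flips
# self.new_pods_in_status_running to True. A minimal, self-contained sketch of that
# shared-flag coordination pattern is shown below; the sleeps and the "osd"/0 arguments
# are placeholders standing in for the real disruption and wait helpers, not the
# actual ocs-ci calls.
import logging
import time
from concurrent.futures import ThreadPoolExecutor


class FlagCoordinationSketch:
    def __init__(self):
        self.new_pods_in_status_running = False

    def kill_resource_repeatedly(self, resource_name, resource_id, max_iterations=30):
        for i in range(max_iterations):
            logging.info(f"iteration {i}: delete {resource_name} with id {resource_id}")
            time.sleep(1)  # stands in for d.delete_resource(resource_id)
            if self.new_pods_in_status_running:
                logging.info("New osd pods reached status running")
                break

    def run(self):
        with ThreadPoolExecutor() as executor:
            executor.submit(self.kill_resource_repeatedly, "osd", 0)
            time.sleep(3)  # stands in for wait_for_osd_pods_to_be_running(...)
            # Flip the flag before the executor shuts down so the loop can exit early
            self.new_pods_in_status_running = True


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    FlagCoordinationSketch().run()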
def test_run_amq_respin_pod(self, pod_name):
    """
    Test the AMQ workload while respinning Ceph pods and restarting AMQ pods

    """
    # Respin the relevant pod
    if pod_name == "amq":
        pod_pattern_list = [
            "cluster-operator",
            "my-cluster-kafka",
            "my-cluster-zookeeper",
            "my-connect-cluster-connect",
            "my-bridge-bridge",
        ]
        for pod_pattern in pod_pattern_list:
            respin_amq_app_pod(
                kafka_namespace=constants.AMQ_NAMESPACE, pod_pattern=pod_pattern
            )
    else:
        log.info(f"Respin Ceph pod {pod_name}")
        disruption = Disruptions()
        disruption.set_resource(resource=f"{pod_name}")
        disruption.delete_resource()

    # Validate the results
    log.info("Validate that the messages ran completely")
    for thread in self.threads:
        thread.result(timeout=1800)
def test_add_capacity_with_resource_delete(
    self,
    add_capacity_setup,
    workload_storageutilization_rbd,
    resource_name,
    resource_id,
    is_kill_resource_repeatedly,
):
    """
    Add capacity to the cluster, and then delete the given resource
    while the storage capacity is getting increased.

    Args:
        resource_name (str): the name of the resource to delete
        resource_id (int): the id of the resource to delete
        is_kill_resource_repeatedly (bool): if True, kill the resource repeatedly;
            if False, delete the resource only once

    """
    used_percentage = get_percent_used_capacity()
    logging.info(
        f"storageutilization is completed. used capacity = {used_percentage}"
    )

    osd_pods_before = pod_helpers.get_osd_pods()
    number_of_osd_pods_before = len(osd_pods_before)
    d = Disruptions()
    d.set_resource(resource_name)

    self.new_pods_in_status_running = False
    osd_size = storage_cluster.get_osd_size()
    logging.info(f"Adding one new set of OSDs. osd size = {osd_size}")
    storagedeviceset_count = storage_cluster.add_capacity(osd_size)
    logging.info("Adding one new set of OSDs was issued without problems")

    # Wait for the new OSDs to come up. After the first new OSD reaches status Init,
    # delete the resource. After deleting the resource, we expect all the new OSDs
    # to reach status Running, and the deleted resource to be recreated in status Running.
    pod_helpers.wait_for_new_osd_pods_to_come_up(number_of_osd_pods_before)
    logging.info(
        f"Delete a {resource_name} pod while storage capacity is getting increased"
    )
    if is_kill_resource_repeatedly:
        with ThreadPoolExecutor() as executor:
            executor.submit(
                self.kill_resource_repeatedly, resource_name, resource_id
            )
            self.wait_for_osd_pods_to_be_running(storagedeviceset_count)
    else:
        d.delete_resource(resource_id)
        self.wait_for_osd_pods_to_be_running(storagedeviceset_count)

    self.new_pods_in_status_running = True
    logging.info(
        "Finished verifying add capacity when one of the pods gets deleted"
    )
    logging.info("Waiting for the Ceph health check to finish...")
    check_ceph_health_after_add_capacity()
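# Hypothetical sketch only: self.wait_for_osd_pods_to_be_running is defined elsewhere in
# the test class and is not shown in this section. The version below merely waits for the
# OSD pod count to reach the expected number, assuming 3 OSDs per storage device set (the
# usual replica count); the real helper presumably also verifies each pod's phase. The
# import paths mirror the aliases used above and are assumptions about this code base.
from ocs_ci.ocs.resources import pod as pod_helpers
from ocs_ci.utility.utils import TimeoutSampler


def wait_for_osd_pods_to_be_running_sketch(storagedeviceset_count, timeout=1800):
    expected_osd_count = storagedeviceset_count * 3  # assumption: replica count of 3

    def _osd_count_reached():
        return len(pod_helpers.get_osd_pods()) >= expected_osd_count

    sample = TimeoutSampler(timeout, 30, _osd_count_reached)
    if not sample.wait_for_func_status(True):
        raise Exception(
            f"Did not reach {expected_osd_count} OSD pods within {timeout} seconds"
        )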
def test_run_couchbase_respin_pod(self, cb_setup, pod_name):
    """
    Test the Couchbase workload while respinning the Couchbase pod or a Ceph pod

    """
    log.info(f"Respin pod {pod_name}")
    if pod_name == "couchbase":
        self.cb.respin_couchbase_app_pod()
    else:
        disruption = Disruptions()
        disruption.set_resource(resource=f"{pod_name}")
        disruption.delete_resource()

    bg_handler = flowtest.BackgroundOps()
    bg_ops = [self.cb.result]
    bg_handler.wait_for_bg_operations(bg_ops, timeout=3600)
    self.sanity_helpers.health_check(tries=40)
def test_monitoring_after_respinning_ceph_pods(self, pods):
    """
    Test case to validate respinning the Ceph pods and their interaction
    with the prometheus pod

    """
    # Respin the Ceph pods (i.e. mgr, mon, osd) one by one
    resource_to_delete = ["mgr", "mon", "osd"]
    disruption = Disruptions()
    for res_to_del in resource_to_delete:
        disruption.set_resource(resource=res_to_del)
        disruption.delete_resource()

    # Check for the created PVC metrics on the prometheus pod
    for pod_obj in pods:
        assert check_pvcdata_collected_on_prometheus(pod_obj.pvc.name), (
            f"Data for created PVC {pod_obj.pvc.name} is not collected on the prometheus pod"
        )
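# Optional hardening sketch, not taken from the original test: if the prometheus scrape
# can lag behind PVC creation, the same check can be retried with the TimeoutSampler
# helper already used elsewhere in this section instead of asserting immediately. It
# reuses the check_pvcdata_collected_on_prometheus import from the test above; the
# import path for TimeoutSampler is an assumption about this code base.
from ocs_ci.utility.utils import TimeoutSampler


def wait_for_pvc_data_on_prometheus(pvc_name, timeout=300):
    # Pass the PVC name positionally so no assumption is made about the parameter name
    sample = TimeoutSampler(timeout, 10, check_pvcdata_collected_on_prometheus, pvc_name)
    assert sample.wait_for_func_status(True), (
        f"Data for created PVC {pvc_name} is not collected on the prometheus pod"
    )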
def test_coredump_check_for_ceph_daemon_crash(self):
    """
    Verify that the coredumpctl list is updated after killing a daemon

    """
    log.info("Get the node names where mon pods are running")
    mon_pod_nodes = [get_pod_node(pod) for pod in get_mon_pods()]
    mon_pod_node_names = [node.name for node in mon_pod_nodes]

    log.info("Get the node names where mgr pods are running")
    mgr_pod_nodes = [get_pod_node(pod) for pod in get_mgr_pods()]
    mgr_pod_node_names = [node.name for node in mgr_pod_nodes]

    log.info("Get the node names where osd pods are running")
    osd_pod_nodes = [get_pod_node(pod) for pod in get_osd_pods()]
    osd_pod_node_names = [node.name for node in osd_pod_nodes]

    # Prefer a node that runs as many of the mgr/mon/osd daemons as possible
    node_mgr_mon_osd_names = set(mgr_pod_node_names).intersection(
        osd_pod_node_names, mon_pod_node_names
    )
    node_mgr_osd_names = set(mgr_pod_node_names).intersection(osd_pod_node_names)
    node_mgr_mon_names = set(mgr_pod_node_names).intersection(mon_pod_node_names)

    if len(node_mgr_mon_osd_names) > 0:
        daemon_types = ["mgr", "osd", "mon"]
        node_name = list(node_mgr_mon_osd_names)[0]
    elif len(node_mgr_osd_names) > 0:
        daemon_types = ["mgr", "osd"]
        node_name = list(node_mgr_osd_names)[0]
    elif len(node_mgr_mon_names) > 0:
        daemon_types = ["mgr", "mon"]
        node_name = list(node_mgr_mon_names)[0]
    else:
        daemon_types = ["mgr"]
        node_name = mgr_pod_node_names[0]
    log.info(f"Test the daemon types {daemon_types} on node {node_name}")

    log.info(
        "Delete the contents of the 'posted' directory "
        "`/var/lib/rook/openshift-storage/crash/posted/`"
    )
    cmd_bash = f"oc debug nodes/{node_name} -- chroot /host /bin/bash -c "
    cmd_delete_files = '"rm -rf /var/lib/rook/openshift-storage/crash/posted/*"'
    cmd = cmd_bash + cmd_delete_files
    run_cmd(cmd=cmd)

    for daemon_type in daemon_types:
        log.info(f"Find the ceph-{daemon_type} process id")
        cmd_pid = f"pidof ceph-{daemon_type}"
        cmd_gen = "oc debug node/" + node_name + " -- chroot /host "
        cmd = cmd_gen + cmd_pid
        out = run_cmd(cmd=cmd)
        pids = out.strip().split()
        pid = pids[0]
        if not pid.isnumeric():
            raise Exception(f"The ceph-{daemon_type} process id was not found.")

        log.info(f"Kill the ceph-{daemon_type} process id {pid}")
        disruptions_obj = Disruptions()
        disruptions_obj.daemon_pid = pid
        disruptions_obj.kill_daemon(
            node_name=node_name, check_new_pid=False, kill_signal="11"
        )

    log.info(
        f"Verify that we have a crash event for each of the ceph-{daemon_types} crashes (tool pod)"
    )
    sample = TimeoutSampler(
        timeout=600,
        sleep=10,
        func=run_cmd_verify_cli_output,
        cmd="ceph crash ls",
        expected_output_lst=daemon_types,
        cephtool_cmd=True,
    )
    if not sample.wait_for_func_status(True):
        raise Exception(
            f"ceph-{daemon_types} processes do not exist in the crash list (tool pod)"
        )

    log.info(
        f"Verify that the coredumpctl list is updated after killing the {daemon_types} daemons on {node_name}"
    )
    sample = TimeoutSampler(
        timeout=600,
        sleep=10,
        func=run_cmd_verify_cli_output,
        cmd="coredumpctl list",
        expected_output_lst=daemon_types,
        debug_node=node_name,
    )
    if not sample.wait_for_func_status(True):
        raise Exception(
            f"coredump not getting generated for the ceph-{daemon_types} daemon crashes"
        )

    log.info(f"Verify that the 'posted' directory is not empty on {node_name}")
    sample = TimeoutSampler(
        timeout=600,
        sleep=10,
        func=run_cmd_verify_cli_output,
        cmd="ls -ltr /var/lib/rook/openshift-storage/crash/posted/",
        expected_output_lst=[":"],
        debug_node=node_name,
    )
    if not sample.wait_for_func_status(True):
        raise Exception(
            f"crash reports not getting posted for the {daemon_types} daemon crashes"
        )

    log.info(
        "Verify that the ceph status moved to the HEALTH_WARN state with the relevant "
        "information (daemons have recently crashed)"
    )
    sample = TimeoutSampler(
        timeout=20,
        sleep=5,
        func=run_cmd_verify_cli_output,
        cmd="ceph health detail",
        expected_output_lst=daemon_types + ["HEALTH_WARN", "daemons have recently crashed"],
        cephtool_cmd=True,
    )
    if not sample.wait_for_func_status(True):
        raise Exception(
            "The output of the command 'ceph health detail' did not show the "
            "warning 'daemons have recently crashed'"
        )
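# Optional cleanup sketch, not part of the test above: once the HEALTH_WARN state has been
# verified, the crash reports can be archived so the cluster returns to HEALTH_OK.
# "ceph crash archive-all" is a standard Ceph command; the toolbox helpers
# get_ceph_tools_pod()/exec_cmd_on_pod() and the out_yaml_format flag are assumptions
# about the ocs-ci helpers available in this code base.
from ocs_ci.ocs.resources.pod import get_ceph_tools_pod


def archive_ceph_crash_reports():
    ct_pod = get_ceph_tools_pod()
    # Archiving clears the "daemons have recently crashed" warning raised by the kills above
    ct_pod.exec_cmd_on_pod("ceph crash archive-all", out_yaml_format=False)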
def test_coredump_check_for_ceph_daemon_crash(self, daemon_type):
    """
    Verify that the coredumpctl list is updated after killing the daemon

    """
    log.info(f"Get the node name where a {daemon_type} pod is running")
    if daemon_type == "mon":
        mon_pod_nodes = [get_pod_node(pod) for pod in get_mon_pods()]
        node_obj = mon_pod_nodes[0]
    elif daemon_type == "mgr":
        mgr_pod_nodes = [get_pod_node(pod) for pod in get_mgr_pods()]
        node_obj = mgr_pod_nodes[0]
    elif daemon_type == "osd":
        osd_pod_nodes = [get_pod_node(pod) for pod in get_osd_pods()]
        node_obj = osd_pod_nodes[0]
    node_name = node_obj.name

    log.info(
        "Delete the contents of the 'posted' directory "
        "`/var/lib/rook/openshift-storage/crash/posted/`"
    )
    cmd_bash = f"oc debug nodes/{node_name} -- chroot /host /bin/bash -c "
    cmd_delete_files = '"rm -rf /var/lib/rook/openshift-storage/crash/posted/*"'
    cmd = cmd_bash + cmd_delete_files
    run_cmd(cmd=cmd)

    log.info(f"Find the ceph-{daemon_type} process id")
    cmd_pid = f"pidof ceph-{daemon_type}"
    cmd_gen = "oc debug node/" + node_name + " -- chroot /host "
    cmd = cmd_gen + cmd_pid
    out = run_cmd(cmd=cmd)
    # pidof may return more than one pid (e.g. several OSDs on the same node);
    # take the first one and validate that it is numeric
    pid = out.strip().split()[0]
    if not pid.isnumeric():
        raise Exception(f"The ceph-{daemon_type} process id was not found.")

    log.info(f"Kill the ceph-{daemon_type} process id {pid}")
    disruptions_obj = Disruptions()
    disruptions_obj.daemon_pid = pid
    disruptions_obj.kill_daemon(
        node_name=node_name, check_new_pid=False, kill_signal="11"
    )

    log.info(f"Verify that we have a crash event for the ceph-{daemon_type} crash")
    sample = TimeoutSampler(
        timeout=600,
        sleep=10,
        func=run_cmd_verify_cli_output,
        cmd="ceph crash ls",
        expected_output_lst=[daemon_type],
        cephtool_cmd=True,
    )
    if not sample.wait_for_func_status(True):
        raise Exception(
            f"ceph-{daemon_type} process does not exist in the crash list (tool pod)"
        )

    log.info(
        f"Verify that the coredumpctl list is updated after killing the ceph-{daemon_type} daemon"
    )
    sample = TimeoutSampler(
        timeout=600,
        sleep=10,
        func=run_cmd_verify_cli_output,
        cmd="coredumpctl list",
        expected_output_lst=[daemon_type],
        debug_node=node_name,
    )
    if not sample.wait_for_func_status(True):
        raise Exception(
            f"coredump not getting generated for the ceph-{daemon_type} daemon crash"
        )

    log.info(f"Verify that the 'posted' directory is not empty on {node_name}")
    sample = TimeoutSampler(
        timeout=600,
        sleep=10,
        func=run_cmd_verify_cli_output,
        cmd="ls -ltr /var/lib/rook/openshift-storage/crash/posted/",
        expected_output_lst=[":"],
        debug_node=node_name,
    )
    if not sample.wait_for_func_status(True):
        raise Exception(
            f"crash report not getting posted for the ceph-{daemon_type} daemon crash"
        )
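# A guess at how daemon_type is parametrized for the test above; the real test module may
# wrap each value in pytest.param with additional marks (e.g. polarion IDs) that are not
# shown in this section. The placeholder body only illustrates the expected inputs.
import pytest


@pytest.mark.parametrize("daemon_type", ["mgr", "mon", "osd"])
def test_coredump_check_for_ceph_daemon_crash_sketch(daemon_type):
    ...  # body as in the method above (self omitted for this standalone sketch)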
def test_pvc_snapshot_and_clone(
    self, nodes, multiple_snapshot_and_clone_of_postgres_pvc_factory
):
    """
    1. Deploy the PGSQL workload
    2. Take a snapshot of the pgsql PVC
    3. Create a new PVC out of that snapshot (restore the snapshot)
    4. Create a clone of the restored PVC
    5. Attach a new pgsql pod to it
    6. Resize the cloned PVC
    7. Create snapshots of the cloned PVC and restore those snapshots
    8. Attach a new pgsql pod to it and resize the newly restored PVC
    9. Repeat the above steps in the background while performing the base operations:
       pod restarts, worker node reboot, node drain, device replacement

    """
    log.info("Starting multiple creation & clone of postgres PVC in the background")
    bg_handler = flowtest.BackgroundOps()
    executor_run_bg_ops = ThreadPoolExecutor(max_workers=1)
    pgsql_snapshot_and_clone = executor_run_bg_ops.submit(
        bg_handler.handler,
        multiple_snapshot_and_clone_of_postgres_pvc_factory,
        pvc_size_new=25,
        pgsql=self.pgsql,
        iterations=1,
    )
    log.info("Started creation of snapshots & clones in the background")

    flow_ops = flowtest.FlowOperations()

    log.info("Starting operation 1: Pod Restarts")
    disruption = Disruptions()
    pod_obj_list = [
        "osd",
        "mon",
        "mgr",
        "operator",
        "rbdplugin",
        "rbdplugin_provisioner",
    ]
    for pod in pod_obj_list:
        disruption.set_resource(resource=f"{pod}")
        disruption.delete_resource()
    log.info("Verifying exit criteria for operation 1: Pod Restarts")
    flow_ops.validate_cluster(
        node_status=True, pod_status=True, operation_name="Pod Restarts"
    )

    log.info("Starting operation 2: Node Reboot")
    node_names = flow_ops.node_operations_entry_criteria(
        node_type="worker", number_of_nodes=3, operation_name="Node Reboot"
    )
    # Reboot the nodes
    nodes.restart_nodes(node_names)
    log.info("Verifying exit criteria for operation 2: Node Reboot")
    flow_ops.validate_cluster(
        node_status=True, pod_status=True, operation_name="Node Reboot"
    )

    log.info("Starting operation 3: Node Drain")
    node_name = flow_ops.node_operations_entry_criteria(
        node_type="worker", number_of_nodes=1, operation_name="Node Drain"
    )
    # Node maintenance - to gracefully terminate all the pods on the node
    drain_nodes([node_name[0].name])
    # Make the node schedulable again
    schedule_nodes([node_name[0].name])
    log.info("Verifying exit criteria for operation 3: Node Drain")
    flow_ops.validate_cluster(
        node_status=True, pod_status=True, operation_name="Node Drain"
    )

    log.info("Waiting for the background operations to complete")
    bg_handler.wait_for_bg_operations([pgsql_snapshot_and_clone], timeout=600)
def run_in_bg(
    self, nodes, multiple_snapshot_and_clone_of_postgres_pvc_factory, sc_name=None
):
    """
    Run pod restarts, node reboot and node drain while snapshots & clones
    of the postgres PVC are created in the background

    """
    log.info("Starting multiple creation & clone of postgres PVC in the background")
    bg_handler = flowtest.BackgroundOps()
    executor_run_bg_ops = ThreadPoolExecutor(max_workers=1)
    pgsql_snapshot_and_clone = executor_run_bg_ops.submit(
        bg_handler.handler,
        multiple_snapshot_and_clone_of_postgres_pvc_factory,
        pvc_size_new=25,
        pgsql=self.pgsql,
        sc_name=sc_name,
        iterations=1,
    )
    log.info("Started creation of snapshots & clones in the background")

    flow_ops = flowtest.FlowOperations()

    log.info("Starting operation 1: Pod Restarts")
    disruption = Disruptions()
    pod_obj_list = [
        "osd",
        "mon",
        "mgr",
        "operator",
        "rbdplugin",
        "rbdplugin_provisioner",
    ]
    for pod in pod_obj_list:
        disruption.set_resource(resource=f"{pod}")
        disruption.delete_resource()
    log.info("Verifying exit criteria for operation 1: Pod Restarts")
    flow_ops.validate_cluster(
        node_status=True, pod_status=True, operation_name="Pod Restarts"
    )

    log.info("Starting operation 2: Node Reboot")
    node_names = flow_ops.node_operations_entry_criteria(
        node_type="worker", number_of_nodes=3, operation_name="Node Reboot"
    )
    # Reboot the nodes
    nodes.restart_nodes(node_names)
    log.info("Verifying exit criteria for operation 2: Node Reboot")
    flow_ops.validate_cluster(
        node_status=True, pod_status=True, operation_name="Node Reboot"
    )

    log.info("Starting operation 3: Node Drain")
    node_name = flow_ops.node_operations_entry_criteria(
        node_type="worker", number_of_nodes=1, operation_name="Node Drain"
    )
    # Node maintenance - to gracefully terminate all the pods on the node
    drain_nodes([node_name[0].name])
    # Make the node schedulable again
    schedule_nodes([node_name[0].name])
    log.info("Verifying exit criteria for operation 3: Node Drain")
    flow_ops.validate_cluster(
        node_status=True, pod_status=True, operation_name="Node Drain"
    )

    log.info("Waiting for the background operations to complete")
    bg_handler.wait_for_bg_operations([pgsql_snapshot_and_clone], timeout=600)