def test_coredump_check_for_ceph_daemon_crash(self): """ Verify coredumpctl list updated after killing daemon """ log.info("Get Node name where mon pod running") mon_pod_nodes = [get_pod_node(pod) for pod in get_mon_pods()] mon_pod_node_names = [node.name for node in mon_pod_nodes] log.info("Get Node name where mgr pod running") mgr_pod_nodes = [get_pod_node(pod) for pod in get_mgr_pods()] mgr_pod_node_names = [node.name for node in mgr_pod_nodes] log.info("Get Node name where osd pod running") osd_pod_nodes = [get_pod_node(pod) for pod in get_osd_pods()] osd_pod_node_names = [node.name for node in osd_pod_nodes] node_mgr_mon_osd_names = set(mgr_pod_node_names).intersection( osd_pod_node_names, mon_pod_node_names) node_mgr_osd_names = set(mgr_pod_node_names).intersection( osd_pod_node_names) node_mgr_mon_names = set(mgr_pod_node_names).intersection( mon_pod_node_names) if len(node_mgr_mon_osd_names) > 0: daemon_types = ["mgr", "osd", "mon"] node_name = list(node_mgr_mon_osd_names)[0] elif len(node_mgr_osd_names) > 0: daemon_types = ["mgr", "osd"] node_name = list(node_mgr_osd_names)[0] elif len(node_mgr_mon_names) > 0: daemon_types = ["mgr", "mon"] node_name = list(node_mgr_mon_names)[0] else: daemon_types = ["mgr"] node_name = mgr_pod_node_names[0] log.info(f"Test the daemon_types {daemon_types} on node {node_name}") log.info("Delete the contents of 'posted' directory " "`/var/lib/rook/openshift-storage/crash/posted/`") cmd_bash = f"oc debug nodes/{node_name} -- chroot /host /bin/bash -c " cmd_delete_files = '"rm -rf /var/lib/rook/openshift-storage/crash/posted/*"' cmd = cmd_bash + cmd_delete_files run_cmd(cmd=cmd) for daemon_type in daemon_types: log.info(f"find ceph-{daemon_type} process-id") cmd_pid = f"pidof ceph-{daemon_type}" cmd_gen = "oc debug node/" + node_name + " -- chroot /host " cmd = cmd_gen + cmd_pid out = run_cmd(cmd=cmd) pids = out.strip().split() pid = pids[0] if not pid.isnumeric(): raise Exception( f"The ceph-{daemon_type} process-id was not found.") log.info(f"Kill ceph-{daemon_type} process-id {pid}") disruptions_obj = Disruptions() disruptions_obj.daemon_pid = pid disruptions_obj.kill_daemon(node_name=node_name, check_new_pid=False, kill_signal="11") log.info( f"Verify that we have a crash event for ceph-{daemon_types} crash (tool pod)" ) sample = TimeoutSampler( timeout=600, sleep=10, func=run_cmd_verify_cli_output, cmd="ceph crash ls", expected_output_lst=daemon_types, cephtool_cmd=True, ) if not sample.wait_for_func_status(True): raise Exception( f"ceph-{daemon_types} process does not exist on crash list (tool pod)" ) log.info( f"Verify coredumpctl list updated after killing {daemon_types} daemons on {node_name}" ) sample = TimeoutSampler( timeout=600, sleep=10, func=run_cmd_verify_cli_output, cmd="coredumpctl list", expected_output_lst=daemon_types, debug_node=node_name, ) if not sample.wait_for_func_status(True): raise Exception( f"coredump not getting generated for ceph-{daemon_types} daemon crash" ) log.info( f"Verify the directory postedcoredumpctl is not empty on {node_name}" ) sample = TimeoutSampler( timeout=600, sleep=10, func=run_cmd_verify_cli_output, cmd="ls -ltr /var/lib/rook/openshift-storage/crash/posted/", expected_output_lst=[":"], debug_node=node_name, ) if not sample.wait_for_func_status(True): raise Exception( f"coredump not getting generated for {daemon_types} daemons crash" ) log.info( "Verify ceph status moved to HEALTH_WARN state with the relevant " "information (daemons have recently crashed)") sample = TimeoutSampler( timeout=20, sleep=5, func=run_cmd_verify_cli_output, cmd="ceph health detail", expected_output_lst=daemon_types + ["HEALTH_WARN", "daemons have recently crashed"], cephtool_cmd=True, ) if not sample.wait_for_func_status(True): raise Exception( "The output of command ceph health detail did not show " "warning 'daemons have recently crashed'")
def test_coredump_check_for_ceph_daemon_crash(self, daemon_type): """ Verify coredumpctl list updated after killing daemon """ log.info(f"Get Node name where {daemon_type} pod running") if daemon_type == "mon": mon_pod_nodes = [get_pod_node(pod) for pod in get_mon_pods()] node_obj = mon_pod_nodes[0] elif daemon_type == "mgr": mgr_pod_nodes = [get_pod_node(pod) for pod in get_mgr_pods()] node_obj = mgr_pod_nodes[0] elif daemon_type == "osd": osd_pod_nodes = [get_pod_node(pod) for pod in get_osd_pods()] node_obj = osd_pod_nodes[0] node_name = node_obj.name log.info("Delete the contents of 'posted' directory " "`/var/lib/rook/openshift-storage/crash/posted/`") cmd_bash = f"oc debug nodes/{node_name} -- chroot /host /bin/bash -c " cmd_delete_files = '"rm -rf /var/lib/rook/openshift-storage/crash/posted/*"' cmd = cmd_bash + cmd_delete_files run_cmd(cmd=cmd) log.info(f"find ceph-{daemon_type} process-id") cmd_pid = f"pidof ceph-{daemon_type}" cmd_gen = "oc debug node/" + node_name + " -- chroot /host " cmd = cmd_gen + cmd_pid out = run_cmd(cmd=cmd) pid = out.strip() if not pid.isnumeric(): raise Exception( f"The ceph-{daemon_type} process-id was not found.") log.info(f"Kill ceph-{daemon_type} process-id {pid}") disruptions_obj = Disruptions() disruptions_obj.daemon_pid = pid disruptions_obj.kill_daemon(node_name=node_name, check_new_pid=False, kill_signal="11") log.info( f"Verify that we have a crash event for ceph-{daemon_type} crash") sample = TimeoutSampler( timeout=600, sleep=10, func=run_cmd_verify_cli_output, cmd="ceph crash ls", expected_output_lst=[daemon_type], cephtool_cmd=True, ) if not sample.wait_for_func_status(True): raise Exception( f"ceph-{daemon_type} process does not exist on crash list (tool pod)" ) log.info( f"Verify coredumpctl list updated after killing ceph-{daemon_type} daemon" ) sample = TimeoutSampler( timeout=600, sleep=10, func=run_cmd_verify_cli_output, cmd="coredumpctl list", expected_output_lst=[daemon_type], debug_node=node_name, ) if not sample.wait_for_func_status(True): raise Exception( f"coredump not getting generated for ceph-{daemon_type} daemon crash" ) sample = TimeoutSampler( timeout=600, sleep=10, func=run_cmd_verify_cli_output, cmd="ls -ltr /var/lib/rook/openshift-storage/crash/posted/", expected_output_lst=[":"], debug_node=node_name, ) if not sample.wait_for_func_status(True): raise Exception( f"coredump not getting generated for ceph-{daemon_type} daemon crash" )