def test_monitoring_after_rebooting_node_where_mgr_is_running(self):
    """
    Test case to validate that rebooting a node where mgr is running
    does not delete the data collected on the prometheus pod
    """
    aws_obj = aws.AWS()

    # Get the mgr pod obj
    mgr_pod_obj = pod.get_mgr_pods()

    # Get the node where the mgr pod is hosted
    mgr_node_obj = pod.get_pod_node(mgr_pod_obj[0])

    # Reboot the node where the mgr pod is hosted
    instances = aws.get_instances_ids_and_names([mgr_node_obj])
    aws_obj.restart_ec2_instances(instances=instances, wait=True, force=True)

    # Validate all nodes are in READY state
    wait_for_nodes_status()

    # Check the nodes are in Ready state and the cluster health is OK
    self.sanity_helpers.health_check()

    # Check that the ceph health check metrics are updated with the new mgr pod
    wait_to_update_mgrpod_info_prometheus_pod()

    # Check the created PVC metrics after rebooting the node where the mgr pod was running
    for pod_obj in self.pod_objs:
        assert check_pvcdata_collected_on_prometheus(pod_obj.pvc.name), (
            f"Data for the created pvc {pod_obj.pvc.name} is not collected on the prometheus pod"
        )
def delete_pods(self):
    """
    Try to delete pods:
        - Rook operator
        - OSD
        - MGR
        - MON
    """
    pod_list = []
    rook_operator_pod = pod.get_ocs_operator_pod(
        ocs_label=constants.OPERATOR_LABEL,
        namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
    )
    pod_list.append(rook_operator_pod)

    osd_pods = pod.get_osd_pods()
    pod_list.extend(osd_pods)

    mgr_pods = pod.get_mgr_pods()
    pod_list.extend(mgr_pods)

    mon_pods = pod.get_mon_pods()
    pod_list.extend(mon_pods)

    logger.info(f"Deleting pods: {[p.name for p in pod_list]}")
    pod.delete_pods(pod_objs=pod_list)
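After deleting Ceph pods it is usually worth confirming that the operator respins them. A minimal post-delete verification sketch, reusing the `wait_for_resource` pattern and the `app=rook-ceph-*` selectors shown in the other examples here; the timeout and resource counts are assumptions, adjust them to your cluster:

# Post-delete verification sketch (assumed counts/timeouts; the selectors and
# wait_for_resource usage mirror the other examples in this collection).
pod_obj = ocp.OCP(
    kind=constants.POD, namespace=constants.OPENSHIFT_STORAGE_NAMESPACE
)
assert pod_obj.wait_for_resource(
    condition="Running", selector="app=rook-ceph-mgr", timeout=600
)
assert pod_obj.wait_for_resource(
    condition="Running", selector="app=rook-ceph-mon", resource_count=3, timeout=600
)
assert pod_obj.wait_for_resource(
    condition="Running", selector="app=rook-ceph-osd", resource_count=3, timeout=600
)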
def mgr_pod_node_restart(self):
    """
    Restart node that runs mgr pod
    """
    mgr_pod_obj = pod.get_mgr_pods()
    mgr_node_obj = pod.get_pod_node(mgr_pod_obj[0])

    self.nodes.restart_nodes([mgr_node_obj])

    wait_for_nodes_status()

    # Check for Ceph pods
    pod_obj = ocp.OCP(kind=constants.POD, namespace=defaults.ROOK_CLUSTER_NAMESPACE)
    assert pod_obj.wait_for_resource(
        condition="Running", selector="app=rook-ceph-mgr", timeout=600
    )
    assert pod_obj.wait_for_resource(
        condition="Running",
        selector="app=rook-ceph-mon",
        resource_count=3,
        timeout=600,
    )
    assert pod_obj.wait_for_resource(
        condition="Running",
        selector="app=rook-ceph-osd",
        resource_count=3,
        timeout=600,
    )
def set_resource(self, resource):
    self.resource = resource
    if self.resource == 'mgr':
        self.resource_obj = pod.get_mgr_pods()
        self.type = 'rook-ceph'
    if self.resource == 'mon':
        self.resource_obj = pod.get_mon_pods()
        self.type = 'rook-ceph'
    if self.resource == 'osd':
        self.resource_obj = pod.get_osd_pods()
        self.type = 'rook-ceph'
    if self.resource == 'mds':
        self.resource_obj = pod.get_mds_pods()
        self.type = 'rook-ceph'
    if self.resource == 'cephfsplugin':
        self.resource_obj = pod.get_plugin_pods(
            interface=constants.CEPHFILESYSTEM
        )
        self.type = 'csi'
    if self.resource == 'rbdplugin':
        self.resource_obj = pod.get_plugin_pods(
            interface=constants.CEPHBLOCKPOOL
        )
        self.type = 'csi'
    self.resource_count = len(self.resource_obj)
def scan_cluster(self):
    """
    Get accurate info on current state of pods
    """
    self._ceph_pods = pod.get_all_pods(self._namespace)
    # TODO: Workaround for BZ1748325:
    mons = pod.get_mon_pods(self.mon_selector, self.namespace)
    for mon in mons:
        if mon.ocp.get_resource_status(mon.name) == constant.STATUS_RUNNING:
            self.mons.append(mon)
    # TODO: End of workaround for BZ1748325
    self.mdss = pod.get_mds_pods(self.mds_selector, self.namespace)
    self.mgrs = pod.get_mgr_pods(self.mgr_selector, self.namespace)
    self.osds = pod.get_osd_pods(self.osd_selector, self.namespace)
    self.toolbox = pod.get_ceph_tools_pod()

    # set port attrib on mon pods
    self.mons = list(map(self.set_port, self.mons))
    self.cluster.reload()
    if self.cephfs:
        self.cephfs.reload()
    else:
        try:
            self.cephfs_config = self.CEPHFS.get().get('items')[0]
            self.cephfs = ocs.OCS(**self.cephfs_config)
            self.cephfs.reload()
        except IndexError as e:
            logging.warning(e)
            logging.warning("No CephFS found")

    self.mon_count = len(self.mons)
    self.mds_count = len(self.mdss)
    self.mgr_count = len(self.mgrs)
    self.osd_count = len(self.osds)
def scan_cluster(self):
    """
    Get accurate info on current state of pods
    """
    self._ceph_pods = pod.get_all_pods(self._namespace)
    self.mons = pod.get_mon_pods(self.mon_selector, self.namespace)
    self.mdss = pod.get_mds_pods(self.mds_selector, self.namespace)
    self.mgrs = pod.get_mgr_pods(self.mgr_selector, self.namespace)
    self.osds = pod.get_osd_pods(self.osd_selector, self.namespace)
    self.toolbox = pod.get_ceph_tools_pod()

    # set port attrib on mon pods
    self.mons = list(map(self.set_port, self.mons))
    self.cluster.reload()
    if self.cephfs:
        self.cephfs.reload()
    else:
        try:
            self.cephfs_config = self.CEPHFS.get().get('items')[0]
            self.cephfs = ocs.OCS(**self.cephfs_config)
            self.cephfs.reload()
        except IndexError as e:
            logging.warning(e)
            logging.warning("No CephFS found")

    self.mon_count = len(self.mons)
    self.mds_count = len(self.mdss)
    self.mgr_count = len(self.mgrs)
    self.osd_count = len(self.osds)
def verify_multus_network():
    """
    Verify that the Multus network(s) were created successfully and are
    present on the relevant pods.
    """
    with open(constants.MULTUS_YAML, mode="r") as f:
        multus_public_data = yaml.safe_load(f)
        multus_namespace = multus_public_data["metadata"]["namespace"]
        multus_name = multus_public_data["metadata"]["name"]
        multus_public_network_name = f"{multus_namespace}/{multus_name}"

    log.info("Verifying multus NetworkAttachmentDefinitions")
    ocp.OCP(
        resource_name=multus_public_network_name,
        kind="network-attachment-definitions",
        namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
    )
    # TODO: also check if private NAD exists

    log.info("Verifying multus public network exists on ceph pods")
    osd_pods = get_osd_pods()
    for _pod in osd_pods:
        assert (
            _pod.data["metadata"]["annotations"]["k8s.v1.cni.cncf.io/networks"]
            == multus_public_network_name
        )
    # TODO: also check private network if it exists on OSD pods

    mon_pods = get_mon_pods()
    mds_pods = get_mds_pods()
    mgr_pods = get_mgr_pods()
    rgw_pods = get_rgw_pods()
    ceph_pods = [*mon_pods, *mds_pods, *mgr_pods, *rgw_pods]
    for _pod in ceph_pods:
        assert (
            _pod.data["metadata"]["annotations"]["k8s.v1.cni.cncf.io/networks"]
            == multus_public_network_name
        )

    log.info("Verifying multus public network exists on CSI pods")
    csi_pods = []
    interfaces = [constants.CEPHBLOCKPOOL, constants.CEPHFILESYSTEM]
    for interface in interfaces:
        plugin_pods = get_plugin_pods(interface)
        csi_pods += plugin_pods

    cephfs_provisioner_pods = get_cephfsplugin_provisioner_pods()
    rbd_provisioner_pods = get_rbdfsplugin_provisioner_pods()
    csi_pods += cephfs_provisioner_pods
    csi_pods += rbd_provisioner_pods

    for _pod in csi_pods:
        assert (
            _pod.data["metadata"]["annotations"]["k8s.v1.cni.cncf.io/networks"]
            == multus_public_network_name
        )

    log.info("Verifying StorageCluster multus network data")
    sc = get_storage_cluster()
    sc_data = sc.get().get("items")[0]
    network_data = sc_data["spec"]["network"]
    assert network_data["provider"] == "multus"
    selectors = network_data["selectors"]
    assert selectors["public"] == f"{defaults.ROOK_CLUSTER_NAMESPACE}/ocs-public"
def set_resource(self, resource, leader_type="provisioner"):
    self.resource = resource
    if (
        config.ENV_DATA["platform"] in constants.MANAGED_SERVICE_PLATFORMS
    ) and (resource in CEPH_PODS):
        # If the platform is Managed Services, then the ceph pods will be present in the provider cluster.
        # Consumer cluster will be the primary cluster context in a multicluster run. Setting 'cluster_kubeconfig'
        # attribute to use as the value of the parameter '--kubeconfig' in the 'oc' commands to get ceph pods.
        provider_kubeconfig = os.path.join(
            config.clusters[config.get_provider_index()].ENV_DATA["cluster_path"],
            config.clusters[config.get_provider_index()].RUN.get(
                "kubeconfig_location"
            ),
        )
        self.cluster_kubeconfig = provider_kubeconfig
    resource_count = 0
    if self.resource == "mgr":
        self.resource_obj = pod.get_mgr_pods()
        self.selector = constants.MGR_APP_LABEL
    if self.resource == "mon":
        self.resource_obj = pod.get_mon_pods()
        self.selector = constants.MON_APP_LABEL
    if self.resource == "osd":
        self.resource_obj = pod.get_osd_pods()
        self.selector = constants.OSD_APP_LABEL
    if self.resource == "mds":
        self.resource_obj = pod.get_mds_pods()
        self.selector = constants.MDS_APP_LABEL
    if self.resource == "cephfsplugin":
        self.resource_obj = pod.get_plugin_pods(interface=constants.CEPHFILESYSTEM)
        self.selector = constants.CSI_CEPHFSPLUGIN_LABEL
    if self.resource == "rbdplugin":
        self.resource_obj = pod.get_plugin_pods(interface=constants.CEPHBLOCKPOOL)
        self.selector = constants.CSI_RBDPLUGIN_LABEL
    if self.resource == "cephfsplugin_provisioner":
        self.resource_obj = [
            pod.get_plugin_provisioner_leader(
                interface=constants.CEPHFILESYSTEM, leader_type=leader_type
            )
        ]
        self.selector = constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL
        resource_count = len(pod.get_cephfsplugin_provisioner_pods())
    if self.resource == "rbdplugin_provisioner":
        self.resource_obj = [
            pod.get_plugin_provisioner_leader(
                interface=constants.CEPHBLOCKPOOL, leader_type=leader_type
            )
        ]
        self.selector = constants.CSI_RBDPLUGIN_PROVISIONER_LABEL
        resource_count = len(pod.get_rbdfsplugin_provisioner_pods())
    if self.resource == "operator":
        self.resource_obj = pod.get_operator_pods()
        self.selector = constants.OPERATOR_LABEL

    self.resource_count = resource_count or len(self.resource_obj)
def set_resource(self, resource):
    self.resource = resource
    if self.resource == 'mgr':
        self.resource_obj = pod.get_mgr_pods()
    if self.resource == 'mon':
        self.resource_obj = pod.get_mon_pods()
    if self.resource == 'osd':
        self.resource_obj = pod.get_osd_pods()
    if self.resource == 'mds':
        self.resource_obj = pod.get_mds_pods()
    self.resource_count = len(self.resource_obj)
def test_monitoring_after_rebooting_node_where_mgr_is_running(self, nodes, pods):
    """
    Test case to validate that rebooting a node where mgr is running
    does not delete the data collected on the prometheus pod
    """
    # Get the mgr pod obj
    mgr_pod_obj = pod.get_mgr_pods()

    # Get the node where the mgr pod is hosted
    mgr_node_obj = pod.get_pod_node(mgr_pod_obj[0])

    # Reboot the node where the mgr pod is hosted
    nodes.restart_nodes([mgr_node_obj])

    # Validate all nodes are in READY state
    retry(
        (CommandFailed, ResourceWrongStatusException), tries=20, delay=15
    )(wait_for_nodes_status)()

    # Check for Ceph pods
    pod_obj = ocp.OCP(kind=constants.POD, namespace=defaults.ROOK_CLUSTER_NAMESPACE)
    assert pod_obj.wait_for_resource(
        condition="Running", selector="app=rook-ceph-mgr", timeout=600
    )
    assert pod_obj.wait_for_resource(
        condition="Running",
        selector="app=rook-ceph-mon",
        resource_count=3,
        timeout=600,
    )
    assert pod_obj.wait_for_resource(
        condition="Running",
        selector="app=rook-ceph-osd",
        resource_count=3,
        timeout=600,
    )

    # Check the nodes are in Ready state and the cluster health is OK
    self.sanity_helpers.health_check(tries=40)

    # Check that the ceph health check metrics are updated with the new mgr pod
    wait_to_update_mgrpod_info_prometheus_pod()

    # Check the created PVC metrics after rebooting the node where the mgr pod was running
    for pod_obj in pods:
        assert check_pvcdata_collected_on_prometheus(
            pod_obj.pvc.name
        ), f"Data for the created pvc {pod_obj.pvc.name} is not collected on the prometheus pod"
def test_pod_log_after_upgrade():
    """
    Check OSD/MON/MGR pod logs after upgrade and verify the expected log exists
    """
    pod_objs = get_osd_pods() + get_mon_pods() + get_mgr_pods()
    pod_names = [pod_obj.name for pod_obj in pod_objs]
    expected_log_after_upgrade = "set uid:gid to 167:167 (ceph:ceph)"
    logging.info(
        f"Check that the log '{expected_log_after_upgrade}' "
        "appears after the osd/mon/mgr pod is initialized"
    )
    for pod_name in pod_names:
        pod_logs = get_pod_logs(pod_name=pod_name, all_containers=True)
        assert expected_log_after_upgrade in pod_logs, (
            f"The expected log after upgrade '{expected_log_after_upgrade}' does not exist"
            f" on pod {pod_name}"
        )
    logging.info(
        f"The log '{expected_log_after_upgrade}' appears in all relevant pods."
    )
def get_node_pods_to_scale_down(node_name):
    """
    Get the pods of a node to scale down as described in the documents
    of node replacement with LSO

    Args:
        node_name (str): The node name

    Returns:
        list: The node's pods to scale down

    """
    pods_to_scale_down = [
        *pod.get_mon_pods(),
        *pod.get_osd_pods(),
        *pod.get_mgr_pods(),
    ]

    return get_node_pods(node_name, pods_to_scale_down)
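For context, a rough sketch of how the returned pods might then be scaled down through their deployments, following the `oc scale --replicas=0 deployment/...` pattern used in the mon scale-down example later in this collection. `deployment_name_for_pod` is a hypothetical helper introduced only for illustration (for Rook-managed pods the deployment name is usually the pod name with the replicaset and pod hash suffixes stripped); adapt it to whatever mapping your tree provides:

# Hypothetical scale-down sketch; 'deployment_name_for_pod' is an assumed
# helper, and the oc scale call mirrors the mon scale-down example below.
def deployment_name_for_pod(pod_obj):
    # e.g. 'rook-ceph-osd-0-6c8b7f7b9d-x2k4q' -> 'rook-ceph-osd-0'
    return "-".join(pod_obj.name.split("-")[:-2])

oc_deploy = ocp.OCP(
    kind=constants.DEPLOYMENT, namespace=defaults.ROOK_CLUSTER_NAMESPACE
)
for pod_obj in get_node_pods_to_scale_down(node_name):
    deployment_name = deployment_name_for_pod(pod_obj)
    oc_deploy.exec_oc_cmd(f"scale --replicas=0 deployment/{deployment_name}")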
def set_resource(self, resource, leader_type="provisioner"):
    self.resource = resource
    resource_count = 0
    if self.resource == "mgr":
        self.resource_obj = pod.get_mgr_pods()
        self.selector = constants.MGR_APP_LABEL
    if self.resource == "mon":
        self.resource_obj = pod.get_mon_pods()
        self.selector = constants.MON_APP_LABEL
    if self.resource == "osd":
        self.resource_obj = pod.get_osd_pods()
        self.selector = constants.OSD_APP_LABEL
    if self.resource == "mds":
        self.resource_obj = pod.get_mds_pods()
        self.selector = constants.MDS_APP_LABEL
    if self.resource == "cephfsplugin":
        self.resource_obj = pod.get_plugin_pods(interface=constants.CEPHFILESYSTEM)
        self.selector = constants.CSI_CEPHFSPLUGIN_LABEL
    if self.resource == "rbdplugin":
        self.resource_obj = pod.get_plugin_pods(interface=constants.CEPHBLOCKPOOL)
        self.selector = constants.CSI_RBDPLUGIN_LABEL
    if self.resource == "cephfsplugin_provisioner":
        self.resource_obj = [
            pod.get_plugin_provisioner_leader(
                interface=constants.CEPHFILESYSTEM, leader_type=leader_type
            )
        ]
        self.selector = constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL
        resource_count = len(pod.get_cephfsplugin_provisioner_pods())
    if self.resource == "rbdplugin_provisioner":
        self.resource_obj = [
            pod.get_plugin_provisioner_leader(
                interface=constants.CEPHBLOCKPOOL, leader_type=leader_type
            )
        ]
        self.selector = constants.CSI_RBDPLUGIN_PROVISIONER_LABEL
        resource_count = len(pod.get_rbdfsplugin_provisioner_pods())
    if self.resource == "operator":
        self.resource_obj = pod.get_operator_pods()
        self.selector = constants.OPERATOR_LABEL

    self.resource_count = resource_count or len(self.resource_obj)
def set_resource(self, resource):
    self.resource = resource
    resource_count = 0
    if self.resource == 'mgr':
        self.resource_obj = pod.get_mgr_pods()
        self.selector = constants.MGR_APP_LABEL
    if self.resource == 'mon':
        self.resource_obj = pod.get_mon_pods()
        self.selector = constants.MON_APP_LABEL
    if self.resource == 'osd':
        self.resource_obj = pod.get_osd_pods()
        self.selector = constants.OSD_APP_LABEL
    if self.resource == 'mds':
        self.resource_obj = pod.get_mds_pods()
        self.selector = constants.MDS_APP_LABEL
    if self.resource == 'cephfsplugin':
        self.resource_obj = pod.get_plugin_pods(
            interface=constants.CEPHFILESYSTEM
        )
        self.selector = constants.CSI_CEPHFSPLUGIN_LABEL
    if self.resource == 'rbdplugin':
        self.resource_obj = pod.get_plugin_pods(
            interface=constants.CEPHBLOCKPOOL
        )
        self.selector = constants.CSI_RBDPLUGIN_LABEL
    if self.resource == 'cephfsplugin_provisioner':
        self.resource_obj = [
            pod.plugin_provisioner_leader(interface=constants.CEPHFILESYSTEM)
        ]
        self.selector = constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL
        resource_count = len(pod.get_cephfsplugin_provisioner_pods())
    if self.resource == 'rbdplugin_provisioner':
        self.resource_obj = [
            pod.plugin_provisioner_leader(interface=constants.CEPHBLOCKPOOL)
        ]
        self.selector = constants.CSI_RBDPLUGIN_PROVISIONER_LABEL
        resource_count = len(pod.get_rbdfsplugin_provisioner_pods())
    if self.resource == 'operator':
        self.resource_obj = pod.get_operator_pods()
        self.selector = constants.OPERATOR_LABEL

    self.resource_count = resource_count or len(self.resource_obj)
def scan_cluster(self):
    """
    Get accurate info on current state of pods
    """
    self._ceph_pods = pod.get_all_pods(self._namespace)
    self.mons = pod.get_mon_pods(self.mon_selector, self.namespace)
    self.mdss = pod.get_mds_pods(self.mds_selector, self.namespace)
    self.mgrs = pod.get_mgr_pods(self.mgr_selector, self.namespace)
    self.osds = pod.get_osd_pods(self.osd_selector, self.namespace)
    self.toolbox = pod.get_ceph_tools_pod()

    # set port attrib on mon pods
    self.mons = list(map(self.set_port, self.mons))
    self.cluster.reload()
    if self.cephfs_config:
        self.cephfs.reload()

    self.mon_count = len(self.mons)
    self.mds_count = len(self.mdss)
    self.mgr_count = len(self.mgrs)
    self.osd_count = len(self.osds)
def measure_stop_worker_nodes(request, measurement_dir, nodes):
    """
    Stop worker nodes that don't contain RGW (so that alerts are triggered
    correctly), measure the time when they were stopped and monitor alerts
    that were triggered during this event.

    Returns:
        dict: Contains information about `start` and `stop` time for stopping
            worker nodes

    """
    mgr_pod = pod.get_mgr_pods()[0]
    mgr_node = pod.get_pod_node(mgr_pod)
    test_nodes = [
        worker_node
        for worker_node in get_nodes(node_type=constants.WORKER_MACHINE)
        if worker_node.name != mgr_node.name
    ]

    def stop_nodes():
        """
        Turn off test nodes for 5 minutes.

        Returns:
            list: Names of nodes that were turned down

        """
        # run_time of operation
        run_time = 60 * 5
        nonlocal test_nodes
        node_names = [node.name for node in test_nodes]
        logger.info(f"Turning off nodes {node_names}")
        nodes.stop_nodes(nodes=test_nodes)
        # Validate the nodes reached NotReady state
        wait_for_nodes_status(node_names=node_names, status=constants.NODE_NOT_READY)
        logger.info(f"Waiting for {run_time} seconds")
        time.sleep(run_time)
        return node_names

    def finalizer():
        nodes.restart_nodes_by_stop_and_start_teardown()
        assert ceph_health_check(), "Ceph cluster health is not OK"
        logger.info("Ceph cluster health is OK")

    request.addfinalizer(finalizer)

    test_file = os.path.join(measurement_dir, "measure_stop_nodes.json")
    if config.ENV_DATA["platform"].lower() in constants.MANAGED_SERVICE_PLATFORMS:
        # It seems that it takes longer to propagate incidents to PagerDuty.
        # Adding 3 extra minutes
        measured_op = measure_operation(stop_nodes, test_file, minimal_time=60 * 8)
    else:
        measured_op = measure_operation(stop_nodes, test_file)

    logger.info("Turning on nodes")
    try:
        nodes.start_nodes(nodes=test_nodes)
    except CommandFailed:
        logger.warning(
            "Nodes were not found: they were probably recreated. Check ceph health below"
        )

    # Validate all nodes are in READY state and up
    retry(
        (CommandFailed, ResourceWrongStatusException),
        tries=60,
        delay=15,
    )(wait_for_nodes_status)(timeout=900)

    # wait for ceph to return into HEALTH_OK state after mgr deployment
    # is returned back to normal
    ceph_health_check(tries=20, delay=15)

    return measured_op
def set_resource(self, resource, leader_type="provisioner", cluster_index=None): self.resource = resource if (config.ENV_DATA["platform"] in constants.MANAGED_SERVICE_PLATFORMS) and ( resource in CEPH_PODS ): # If the platform is Managed Services, then the ceph pods will be present in the provider cluster. # Consumer cluster will be the primary cluster context in a multicluster run. Setting 'cluster_kubeconfig' # attribute to use as the value of the parameter '--kubeconfig' in the 'oc' commands to get ceph pods. provider_kubeconfig = os.path.join( config.clusters[config.get_provider_index()].ENV_DATA["cluster_path"], config.clusters[config.get_provider_index()].RUN.get( "kubeconfig_location" ), ) self.cluster_kubeconfig = provider_kubeconfig elif config.ENV_DATA["platform"] in constants.MANAGED_SERVICE_PLATFORMS: # cluster_index is used to identify the the cluster in which the pod is residing. If cluster_index is not # passed, assume that the context is already changed to the cluster where the pod is residing. cluster_index = ( cluster_index if cluster_index is not None else config.cur_index ) self.cluster_kubeconfig = os.path.join( config.clusters[cluster_index].ENV_DATA["cluster_path"], config.clusters[cluster_index].RUN.get("kubeconfig_location"), ) resource_count = 0 if self.resource == "mgr": self.resource_obj = pod.get_mgr_pods() self.selector = constants.MGR_APP_LABEL if self.resource == "mon": self.resource_obj = pod.get_mon_pods() self.selector = constants.MON_APP_LABEL if self.resource == "osd": self.resource_obj = pod.get_osd_pods() self.selector = constants.OSD_APP_LABEL if self.resource == "mds": self.resource_obj = pod.get_mds_pods() self.selector = constants.MDS_APP_LABEL if self.resource == "cephfsplugin": self.resource_obj = pod.get_plugin_pods(interface=constants.CEPHFILESYSTEM) self.selector = constants.CSI_CEPHFSPLUGIN_LABEL if self.resource == "rbdplugin": self.resource_obj = pod.get_plugin_pods(interface=constants.CEPHBLOCKPOOL) self.selector = constants.CSI_RBDPLUGIN_LABEL if self.resource == "cephfsplugin_provisioner": self.resource_obj = [ pod.get_plugin_provisioner_leader( interface=constants.CEPHFILESYSTEM, leader_type=leader_type ) ] self.selector = constants.CSI_CEPHFSPLUGIN_PROVISIONER_LABEL resource_count = len(pod.get_cephfsplugin_provisioner_pods()) if self.resource == "rbdplugin_provisioner": self.resource_obj = [ pod.get_plugin_provisioner_leader( interface=constants.CEPHBLOCKPOOL, leader_type=leader_type ) ] self.selector = constants.CSI_RBDPLUGIN_PROVISIONER_LABEL resource_count = len(pod.get_rbdfsplugin_provisioner_pods()) if self.resource == "operator": self.resource_obj = pod.get_operator_pods() self.selector = constants.OPERATOR_LABEL if self.resource == "ocs_operator": self.resource_obj = [pod.get_ocs_operator_pod()] self.selector = constants.OCS_OPERATOR_LABEL if self.resource == "alertmanager_managed_ocs_alertmanager": self.resource_obj = pod.get_alertmanager_managed_ocs_alertmanager_pods() self.selector = constants.MANAGED_ALERTMANAGER_LABEL if self.resource == "ocs_osd_controller_manager": self.resource_obj = [pod.get_ocs_osd_controller_manager_pod()] self.selector = constants.MANAGED_CONTROLLER_LABEL # Setting resource_count because odf-operator-controller-manager pod also have the same label. 
resource_count = len( pod.get_pods_having_label( constants.MANAGED_CONTROLLER_LABEL, config.ENV_DATA["cluster_namespace"], ) ) if self.resource == "prometheus_managed_ocs_prometheus": self.resource_obj = [pod.get_prometheus_managed_ocs_prometheus_pod()] self.selector = constants.MANAGED_PROMETHEUS_LABEL if self.resource == "prometheus_operator": self.resource_obj = [pod.get_prometheus_operator_pod()] self.selector = constants.PROMETHEUS_OPERATOR_LABEL if self.resource == "ocs_provider_server": self.resource_obj = [pod.get_ocs_provider_server_pod()] self.selector = constants.PROVIDER_SERVER_LABEL self.resource_count = resource_count or len(self.resource_obj)
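A minimal usage sketch for these `set_resource` variants, assuming the surrounding `Disruptions` helper (referenced in the crash tests below) also exposes a `delete_resource()` style method that deletes one pod from `self.resource_obj` and waits for `self.resource_count` pods matching `self.selector` to return; the method name is an assumption, not a confirmed API:

# Illustrative only: delete_resource() is assumed to exist on the Disruptions
# helper; set_resource() is the method defined above.
disruption = Disruptions()
disruption.set_resource(resource="mgr")
disruption.delete_resource()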
def test_coredump_check_for_ceph_daemon_crash(self):
    """
    Verify coredumpctl list is updated after killing daemons
    """
    log.info("Get Node name where mon pod running")
    mon_pod_nodes = [get_pod_node(pod) for pod in get_mon_pods()]
    mon_pod_node_names = [node.name for node in mon_pod_nodes]

    log.info("Get Node name where mgr pod running")
    mgr_pod_nodes = [get_pod_node(pod) for pod in get_mgr_pods()]
    mgr_pod_node_names = [node.name for node in mgr_pod_nodes]

    log.info("Get Node name where osd pod running")
    osd_pod_nodes = [get_pod_node(pod) for pod in get_osd_pods()]
    osd_pod_node_names = [node.name for node in osd_pod_nodes]

    node_mgr_mon_osd_names = set(mgr_pod_node_names).intersection(
        osd_pod_node_names, mon_pod_node_names
    )
    node_mgr_osd_names = set(mgr_pod_node_names).intersection(osd_pod_node_names)
    node_mgr_mon_names = set(mgr_pod_node_names).intersection(mon_pod_node_names)

    if len(node_mgr_mon_osd_names) > 0:
        daemon_types = ["mgr", "osd", "mon"]
        node_name = list(node_mgr_mon_osd_names)[0]
    elif len(node_mgr_osd_names) > 0:
        daemon_types = ["mgr", "osd"]
        node_name = list(node_mgr_osd_names)[0]
    elif len(node_mgr_mon_names) > 0:
        daemon_types = ["mgr", "mon"]
        node_name = list(node_mgr_mon_names)[0]
    else:
        daemon_types = ["mgr"]
        node_name = mgr_pod_node_names[0]
    log.info(f"Test the daemon_types {daemon_types} on node {node_name}")

    log.info(
        "Delete the contents of 'posted' directory "
        "`/var/lib/rook/openshift-storage/crash/posted/`"
    )
    cmd_bash = f"oc debug nodes/{node_name} -- chroot /host /bin/bash -c "
    cmd_delete_files = '"rm -rf /var/lib/rook/openshift-storage/crash/posted/*"'
    cmd = cmd_bash + cmd_delete_files
    run_cmd(cmd=cmd)

    for daemon_type in daemon_types:
        log.info(f"find ceph-{daemon_type} process-id")
        cmd_pid = f"pidof ceph-{daemon_type}"
        cmd_gen = "oc debug node/" + node_name + " -- chroot /host "
        cmd = cmd_gen + cmd_pid
        out = run_cmd(cmd=cmd)
        pids = out.strip().split()
        pid = pids[0]
        if not pid.isnumeric():
            raise Exception(f"The ceph-{daemon_type} process-id was not found.")

        log.info(f"Kill ceph-{daemon_type} process-id {pid}")
        disruptions_obj = Disruptions()
        disruptions_obj.daemon_pid = pid
        disruptions_obj.kill_daemon(
            node_name=node_name, check_new_pid=False, kill_signal="11"
        )

    log.info(
        f"Verify that we have a crash event for ceph-{daemon_types} crash (tool pod)"
    )
    sample = TimeoutSampler(
        timeout=600,
        sleep=10,
        func=run_cmd_verify_cli_output,
        cmd="ceph crash ls",
        expected_output_lst=daemon_types,
        cephtool_cmd=True,
    )
    if not sample.wait_for_func_status(True):
        raise Exception(
            f"ceph-{daemon_types} process does not exist on crash list (tool pod)"
        )

    log.info(
        f"Verify coredumpctl list updated after killing {daemon_types} daemons on {node_name}"
    )
    sample = TimeoutSampler(
        timeout=600,
        sleep=10,
        func=run_cmd_verify_cli_output,
        cmd="coredumpctl list",
        expected_output_lst=daemon_types,
        debug_node=node_name,
    )
    if not sample.wait_for_func_status(True):
        raise Exception(
            f"coredump not getting generated for ceph-{daemon_types} daemon crash"
        )

    log.info(f"Verify the 'posted' directory is not empty on {node_name}")
    sample = TimeoutSampler(
        timeout=600,
        sleep=10,
        func=run_cmd_verify_cli_output,
        cmd="ls -ltr /var/lib/rook/openshift-storage/crash/posted/",
        expected_output_lst=[":"],
        debug_node=node_name,
    )
    if not sample.wait_for_func_status(True):
        raise Exception(
            f"coredump not getting generated for {daemon_types} daemons crash"
        )

    log.info(
        "Verify ceph status moved to HEALTH_WARN state with the relevant "
        "information (daemons have recently crashed)"
    )
    sample = TimeoutSampler(
        timeout=20,
        sleep=5,
        func=run_cmd_verify_cli_output,
        cmd="ceph health detail",
        expected_output_lst=daemon_types
        + ["HEALTH_WARN", "daemons have recently crashed"],
        cephtool_cmd=True,
    )
    if not sample.wait_for_func_status(True):
        raise Exception(
            "The output of command ceph health detail did not show "
            "warning 'daemons have recently crashed'"
        )
def test_restart_mgr_while_two_mons_down(
    self, pvc_factory, pod_factory, bucket_factory, rgw_bucket_factory
):
    """
    Test Procedure:
    1. Scale down two mons:
       oc scale --replicas=0 deploy/rook-ceph-mon-a
       oc scale --replicas=0 deploy/rook-ceph-mon-b
    2. Restart mgr:
       oc delete pod -l app=rook-ceph-mgr
    3. Sleep 5 seconds
    4. Scale the mons back up:
       oc scale --replicas=1 deploy/rook-ceph-mon-a
       oc scale --replicas=1 deploy/rook-ceph-mon-b
    5. Sleep 10 seconds
    6. Wait for the mgr pod to move to Running state:
       oc get pod -l app=rook-ceph-mgr

    """
    self.oc = ocp.OCP(
        kind=constants.DEPLOYMENT, namespace=config.ENV_DATA["cluster_namespace"]
    )
    mons = [
        mon["metadata"]["name"]
        for mon in get_deployments_having_label(
            constants.MON_APP_LABEL, defaults.ROOK_CLUSTER_NAMESPACE
        )
    ]
    self.mons_scale = mons[0:2]
    tries = 11
    for index in range(1, tries):
        log.info(f"Scaling down two mons {self.mons_scale}, index={index}")
        for mon_scale in self.mons_scale:
            self.oc.exec_oc_cmd(f"scale --replicas=0 deployment/{mon_scale}")

        log.info(f"Restarting mgr pod, index={index}")
        mgr_pod = get_mgr_pods()
        mgr_pod[0].delete(wait=True)

        time.sleep(5)

        log.info(f"Scaling up two mons {self.mons_scale}, index={index}")
        for mon_scale in self.mons_scale:
            self.oc.exec_oc_cmd(f"scale --replicas=1 deployment/{mon_scale}")

        time.sleep(10)

        log.info(f"Waiting for mgr pod to move to Running state, index={index}")
        mgr_pod_obj = ocp.OCP(
            kind=constants.POD, namespace=defaults.ROOK_CLUSTER_NAMESPACE
        )
        assert mgr_pod_obj.wait_for_resource(
            condition=constants.STATUS_RUNNING,
            selector=constants.MGR_APP_LABEL,
            resource_count=1,
            timeout=100,
        ), f"Mgr pod didn't move to Running state after 100 seconds, index={index}"

    log.info("Creating Resources using sanity helpers")
    self.sanity_helpers.create_resources(
        pvc_factory, pod_factory, bucket_factory, rgw_bucket_factory
    )
    log.info("Deleting Resources using sanity helpers")
    self.sanity_helpers.delete_resources()
def test_coredump_check_for_ceph_daemon_crash(self, daemon_type):
    """
    Verify coredumpctl list updated after killing daemon
    """
    log.info(f"Get Node name where {daemon_type} pod running")
    if daemon_type == "mon":
        mon_pod_nodes = [get_pod_node(pod) for pod in get_mon_pods()]
        node_obj = mon_pod_nodes[0]
    elif daemon_type == "mgr":
        mgr_pod_nodes = [get_pod_node(pod) for pod in get_mgr_pods()]
        node_obj = mgr_pod_nodes[0]
    elif daemon_type == "osd":
        osd_pod_nodes = [get_pod_node(pod) for pod in get_osd_pods()]
        node_obj = osd_pod_nodes[0]
    node_name = node_obj.name

    log.info(
        "Delete the contents of 'posted' directory "
        "`/var/lib/rook/openshift-storage/crash/posted/`"
    )
    cmd_bash = f"oc debug nodes/{node_name} -- chroot /host /bin/bash -c "
    cmd_delete_files = '"rm -rf /var/lib/rook/openshift-storage/crash/posted/*"'
    cmd = cmd_bash + cmd_delete_files
    run_cmd(cmd=cmd)

    log.info(f"find ceph-{daemon_type} process-id")
    cmd_pid = f"pidof ceph-{daemon_type}"
    cmd_gen = "oc debug node/" + node_name + " -- chroot /host "
    cmd = cmd_gen + cmd_pid
    out = run_cmd(cmd=cmd)
    pid = out.strip()
    if not pid.isnumeric():
        raise Exception(f"The ceph-{daemon_type} process-id was not found.")

    log.info(f"Kill ceph-{daemon_type} process-id {pid}")
    disruptions_obj = Disruptions()
    disruptions_obj.daemon_pid = pid
    disruptions_obj.kill_daemon(
        node_name=node_name, check_new_pid=False, kill_signal="11"
    )

    log.info(f"Verify that we have a crash event for ceph-{daemon_type} crash")
    sample = TimeoutSampler(
        timeout=600,
        sleep=10,
        func=run_cmd_verify_cli_output,
        cmd="ceph crash ls",
        expected_output_lst=[daemon_type],
        cephtool_cmd=True,
    )
    if not sample.wait_for_func_status(True):
        raise Exception(
            f"ceph-{daemon_type} process does not exist on crash list (tool pod)"
        )

    log.info(
        f"Verify coredumpctl list updated after killing ceph-{daemon_type} daemon"
    )
    sample = TimeoutSampler(
        timeout=600,
        sleep=10,
        func=run_cmd_verify_cli_output,
        cmd="coredumpctl list",
        expected_output_lst=[daemon_type],
        debug_node=node_name,
    )
    if not sample.wait_for_func_status(True):
        raise Exception(
            f"coredump not getting generated for ceph-{daemon_type} daemon crash"
        )

    sample = TimeoutSampler(
        timeout=600,
        sleep=10,
        func=run_cmd_verify_cli_output,
        cmd="ls -ltr /var/lib/rook/openshift-storage/crash/posted/",
        expected_output_lst=[":"],
        debug_node=node_name,
    )
    if not sample.wait_for_func_status(True):
        raise Exception(
            f"coredump not getting generated for ceph-{daemon_type} daemon crash"
        )