def test_monitoring_before_ocp_upgrade(pre_upgrade_monitoring_pvc):
    """
    Test monitoring before ocp upgrade
    """
    assert pre_upgrade_monitoring_pvc, "No monitoring pvc found before the ocp upgrade"
    assert prometheus_health_check(), "Prometheus health is degraded"
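# Hedged sketch, not part of the original module: 'pre_upgrade_monitoring_pvc'
# is assumed to be a pytest fixture that snapshots the monitoring PVCs before
# the upgrade, e.g. by reusing the helper called in
# test_monitoring_after_ocp_upgrade() below (fixture name and scope are
# illustrative assumptions):
#
# @pytest.fixture(scope="session")
# def pre_upgrade_monitoring_pvc():
#     return get_list_pvc_objs_created_on_monitoring_pods()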
def finalizer():
    # Re-enable scheduling on any nodes left in SchedulingDisabled state
    scheduling_disabled_nodes = [
        n.name
        for n in get_node_objs()
        if n.ocp.get_resource_status(n.name)
        == constants.NODE_READY_SCHEDULING_DISABLED
    ]
    if scheduling_disabled_nodes:
        schedule_nodes(scheduling_disabled_nodes)

    # Validate all nodes are in READY state
    not_ready_nodes = [
        n
        for n in get_node_objs()
        if n.ocp.get_resource_status(n.name) == constants.NODE_NOT_READY
    ]
    if not_ready_nodes:
        log.warning(
            f"Nodes in NotReady status found: {[n.name for n in not_ready_nodes]}"
        )
        nodes.restart_nodes_by_stop_and_start(not_ready_nodes)
        wait_for_nodes_status()
    log.info("All nodes are in Ready status")

    assert prometheus_health_check(), "Prometheus health is degraded"
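# Hedged note, not part of the original module: a bare finalizer() like the
# one above is normally registered for pytest teardown via
# request.addfinalizer inside a fixture, roughly as sketched below (fixture
# name is an assumption; 'nodes' and 'log' come from the enclosing scope):
#
# @pytest.fixture()
# def test_fixture(self, request, nodes):
#     def finalizer():
#         ...  # body as above
#     request.addfinalizer(finalizer)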
def test_monitoring_delete_pvc(self):
    """
    Test case to validate that deleting the monitoring PVCs and the
    'cluster-monitoring-config' configmap, then recreating the configmap,
    has no functional impact on the monitoring stack
    """
    # Get 'cluster-monitoring-config' configmap and save it to a yaml file
    ocp_configmap = ocp.OCP(
        namespace=constants.MONITORING_NAMESPACE, kind="configmap"
    )
    configmap_dict = ocp_configmap.get(resource_name="cluster-monitoring-config")
    dir_configmap = tempfile.mkdtemp(prefix="configmap_")
    yaml_file = f"{dir_configmap}/configmap.yaml"
    templating.dump_data_to_temp_yaml(configmap_dict, yaml_file)

    # Get prometheus and alertmanager pods
    prometheus_alertmanager_pods = pod.get_all_pods(
        namespace=defaults.OCS_MONITORING_NAMESPACE,
        selector=["prometheus", "alertmanager"],
    )

    # Get all pvc on monitoring namespace
    pvc_objs_list = pvc.get_all_pvc_objs(namespace=constants.MONITORING_NAMESPACE)

    # Delete configmap
    ocp_configmap.delete(resource_name="cluster-monitoring-config")

    # Delete all pvcs on monitoring namespace
    pvc.delete_pvcs(pvc_objs=pvc_objs_list)

    # Check all the prometheus and alertmanager pods are up
    for pod_obj in prometheus_alertmanager_pods:
        wait_for_resource_state(
            resource=pod_obj, state=constants.STATUS_RUNNING, timeout=180
        )

    # Recreate the configmap from the saved yaml file
    ocp_configmap.create(yaml_file=yaml_file)

    # Check all the PVCs are up
    for pvc_obj in pvc_objs_list:
        wait_for_resource_state(
            resource=pvc_obj, state=constants.STATUS_BOUND, timeout=180
        )

    # Check all the prometheus and alertmanager pods are up
    # and pvc are mounted on monitoring pods
    for pod_obj in prometheus_alertmanager_pods:
        wait_for_resource_state(
            resource=pod_obj, state=constants.STATUS_RUNNING, timeout=180
        )
        mount_point = pod_obj.exec_cmd_on_pod(
            command="df -kh",
            out_yaml_format=False,
        )
        assert "/dev/rbd" in mount_point, f"pvc is not mounted on pod {pod_obj.name}"
    log.info("Verified all pvc are mounted on monitoring pods")

    # Validate the prometheus health is ok
    assert prometheus_health_check(), "Prometheus cluster health is not OK"
def wait_for_nodes_status_and_prometheus_health_check(pods):
    """
    Waits for all the nodes to be in Ready state and checks prometheus health
    """
    # Validate all nodes are in READY state
    wait_for_nodes_status(timeout=900)

    # Check for the created pvc metrics after rebooting the master nodes
    for pod_obj in pods:
        assert check_pvcdata_collected_on_prometheus(pod_obj.pvc.name), (
            f"Data for created pvc {pod_obj.pvc.name} is not collected on prometheus"
        )

    assert prometheus_health_check(), "Prometheus health is degraded"
def test_monitoring_after_ocp_upgrade(pre_upgrade_monitoring_pvc):
    """
    After the ocp upgrade, validate that all monitoring pods are up and
    running, prometheus health is OK, and no new monitoring pvc was created,
    i.e. the pvc from before the upgrade are still in use.
    """
    pod_obj_list = pod.get_all_pods(namespace=defaults.OCS_MONITORING_NAMESPACE)
    POD = ocp.OCP(kind=constants.POD, namespace=defaults.OCS_MONITORING_NAMESPACE)
    POD.wait_for_resource(
        condition=constants.STATUS_RUNNING,
        resource_count=len(pod_obj_list),
        timeout=180,
    )

    post_upgrade_monitoring_pvc = get_list_pvc_objs_created_on_monitoring_pods()
    assert len(pre_upgrade_monitoring_pvc) == len(post_upgrade_monitoring_pvc), (
        "Before and after ocp upgrade pvc are not matching. "
        f"pre_upgrade_monitoring_pvc are {[pvc_obj.name for pvc_obj in pre_upgrade_monitoring_pvc]}. "
        f"post_upgrade_monitoring_pvc are {[pvc_obj.name for pvc_obj in post_upgrade_monitoring_pvc]}"
    )

    before_upgrade_pv_list = []
    after_upgrade_pv_list = []
    for before_upgrade_pvc_obj in pre_upgrade_monitoring_pvc:
        before_upgrade_pv_list.append(
            before_upgrade_pvc_obj.get().get("spec").get("volumeName")
        )

    for after_upgrade_pvc_obj in post_upgrade_monitoring_pvc:
        after_upgrade_pv_list.append(
            after_upgrade_pvc_obj.get().get("spec").get("volumeName")
        )
        assert (
            after_upgrade_pvc_obj.get().get("status").get("phase") == "Bound"
        ), f"pvc {after_upgrade_pvc_obj.name} is not in Bound state after the upgrade"

    assert set(before_upgrade_pv_list) == set(after_upgrade_pv_list), (
        "Before and after ocp upgrade pv list are not matching"
    )
    assert prometheus_health_check(), "Prometheus health is degraded"
def test_monitoring_after_draining_node_where_prometheus_hosted(self, pods):
    """
    Test case to validate that when the node hosting prometheus is drained,
    the prometheus pod re-spins on a new healthy node and there is no
    data/metrics loss
    """
    # Get the prometheus pods
    pod_obj_list = pod.get_all_pods(
        namespace=defaults.OCS_MONITORING_NAMESPACE, selector=["prometheus"]
    )

    for pod_obj in pod_obj_list:
        # Get the pvc which is mounted on the prometheus pod
        pod_info = pod_obj.get()
        pvc_name = pod_info["spec"]["volumes"][0]["persistentVolumeClaim"]["claimName"]

        # Get the node where the prometheus pod is hosted
        prometheus_node = pod_info["spec"]["nodeName"]

        # Drain the node where the prometheus pod is hosted
        drain_nodes([prometheus_node])

        # Validate the node is in SchedulingDisabled state
        wait_for_nodes_status(
            [prometheus_node], status=constants.NODE_READY_SCHEDULING_DISABLED
        )

        # Validate all prometheus pods are running
        POD = ocp.OCP(kind=constants.POD, namespace=defaults.OCS_MONITORING_NAMESPACE)
        assert POD.wait_for_resource(
            condition="Running", selector="app=prometheus", timeout=180
        ), "One or more prometheus pods are not in running state"

        # Validate the prometheus pod is re-spun on a new healthy node
        pod_info = pod_obj.get()
        new_node = pod_info["spec"]["nodeName"]
        assert new_node != prometheus_node, "Prometheus pod was not re-spun on a new node"
        log.info(f"Prometheus pod re-spun on new node {new_node}")

        # Validate the same pvc is mounted on the new prometheus pod
        assert (
            pod_info["spec"]["volumes"][0]["persistentVolumeClaim"]["claimName"]
            == pvc_name
        ), f"Old pvc not found after restarting the prometheus pod {pod_obj.name}"

        # Validate the prometheus health is ok
        assert prometheus_health_check(), "Prometheus cluster health is not OK"

        # Mark the node schedulable again
        schedule_nodes([prometheus_node])

        # Wait some time after scheduling the node back
        waiting_time = 30
        log.info(f"Waiting {waiting_time} seconds...")
        time.sleep(waiting_time)

        # Validate the node is in Ready state
        wait_for_nodes_status([prometheus_node], status=constants.NODE_READY)

        # Validate ceph health is OK
        ceph_health_check(tries=40, delay=30)

    # Check the nodes are in Ready state and the cluster health is ok
    self.sanity_helpers.health_check()

    # Check for the created pvc metrics after draining the node
    for pod_obj in pods:
        assert check_pvcdata_collected_on_prometheus(pod_obj.pvc.name), (
            f"Data for created pvc {pod_obj.pvc.name} is not collected on prometheus"
        )
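# Hedged note (assumption about the ocs-ci helpers, not confirmed by this
# module): drain_nodes() and schedule_nodes() used above are expected to wrap
# the OpenShift CLI, roughly equivalent to running:
#
#   oc adm drain <node> --force --ignore-daemonsets --delete-local-data
#   oc adm uncordon <node>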
def finalizer():
    assert prometheus_health_check(), "Prometheus health is degraded"