def health(self) -> Health:
    """Aggregate the individual health checks into one Health report.

    A fresh Health keyed on this instance id is created.  The checks are only
    run (and merged in) when ``self._ansible`` is truthy; otherwise the
    untouched Health object is returned.
    """
    combined = Health(source=self._instance_id)
    if not self._ansible:
        return combined

    checks = (self._health_all_ping,)
    for check in checks:
        combined.merge(check())
    return combined
def health(self) -> Health:
    """Translate the helm release status into a Health report.

    The release status returned by ``self.status()`` is mapped as follows:
    UNKNOWN -> unknown; SUPERSEDED / UNINSTALLING -> warning;
    DEPLOYED / UNINSTALLED -> healthy; FAILED -> error;
    any PENDING_* state -> warning.
    """
    report: Health = Health(source=self._instance_id)

    release = self.status()
    state = release.status
    prefix = f"Helm: {self._instance_id} release status"

    if state in (Status.UNKNOWN,):
        report.unknown(f"{prefix} is unknown: {state} : {release.description}")
    elif state in (Status.SUPERSEDED, Status.UNINSTALLING):
        report.warning(f"{prefix} is at issue: {state} : {release.description}")
    elif state in (Status.DEPLOYED, Status.UNINSTALLED):
        report.healthy(f"{prefix} is good: {state} : {release.description}")
    elif state == Status.FAILED:
        report.error(f"{prefix} is not good: {state} : {release.description}")
    elif state in (
        Status.PENDING_INSTALL,
        Status.PENDING_UPGRADE,
        Status.PENDING_ROLLBACK,
    ):
        report.warning(f"Helm status pending: {state}")

    return report
def _health_k8s_allpod_health(self) -> Health:
    """Report on pods, across all namespaces, whose phase is "Failed".

    Each failed pod gets its own error entry.  A summary entry follows:
    healthy when nothing failed, warning for a single failure and error for
    two or more.
    """
    report = Health(source=self._instance_id)
    pods = self.get_api("CoreV1Api").list_pod_for_all_namespaces().items

    failed_total = 0
    for pod in pods:
        if pod.status.phase != "Failed":
            continue
        report.error(f"KubeAPI: pod failed: {pod.metadata.name}")
        failed_total += 1

    if failed_total >= 2:
        report.error("KubeAPI: Kubernetes Reports cluster is unhealthy (pod health)")
    elif failed_total == 1:
        report.warning("KubeAPI: some pods report as failed")
    else:
        report.healthy("KubeAPI: all pods report as healthy")

    return report
def _health_k8s_livez(self) -> Health:
    """Report the outcome of ``self.livez()`` as a Health entry.

    A truthy result is recorded as healthy and a falsy one as a warning.  Any
    exception raised along the way is recorded as an error instead of being
    propagated.
    """
    report = Health(source=self._instance_id)

    try:
        alive = self.livez()
        if not alive:
            report.warning("KubeAPI: livez reports NOT live.")
        else:
            report.healthy("KubeAPI: livez reports live")
    # pylint: disable=broad-except
    except Exception as err:
        report.error(f"Could not retrieve livez: {err}")

    return report
def health(self) -> Health:
    """Determine the health of the swarm instance.

    Runs every health test function (currently only the swarm node check) and
    merges each result into one Health that starts out as UNKNOWN.

    A test function that raises an ``Exception`` does not abort the run: the
    failure is recorded as a critical entry and merged like any other result.

    Returns:
        The merged Health for this instance.
    """
    swarm_health = Health(source=self._instance_id, status=HealthStatus.UNKNOWN)

    for test_health_function in [self._health_swarm_nodes]:
        try:
            test_health = test_health_function()
        # pylint: disable=broad-except
        except Exception as err:
            test_health = Health(source=self._instance_id)
            test_health.critical(f"{test_health_function} exception: {err}")

        # Merge after the try/except rather than in a `finally`: a `finally`
        # would also run for KeyboardInterrupt/SystemExit, where `test_health`
        # was never bound, and the resulting UnboundLocalError would mask the
        # real exception.
        swarm_health.merge(test_health)

    return swarm_health
def health(self) -> Health:
    """Determine the health of the K8s instance.

    Runs every kubernetes health test function in turn (readyz, livez, nodes,
    deployments, daemonsets, statefulsets, pods) and merges each result into
    one Health that starts out as UNKNOWN.

    A test function that raises an ``Exception`` does not abort the run: the
    failure is recorded as a critical entry and merged like any other result.

    Returns:
        The merged Health for this instance.
    """
    k8s_health = Health(source=self._instance_id, status=HealthStatus.UNKNOWN)

    test_health_functions = [
        self._health_k8s_readyz,
        self._health_k8s_livez,
        self._health_k8s_node_health,
        self._health_k8s_alldeployment_health,
        self._health_k8s_alldaemonset_health,
        self._health_k8s_allstatefulset_health,
        self._health_k8s_allpod_health,
    ]
    for test_health_function in test_health_functions:
        try:
            test_health = test_health_function()
        # pylint: disable=broad-except
        except Exception as err:
            test_health = Health(source=self._instance_id)
            test_health.critical(f"{test_health_function} exception: {err}")

        # Merge after the try/except rather than in a `finally`: a `finally`
        # would also run for KeyboardInterrupt/SystemExit, where `test_health`
        # is either unbound or still holds the previous check's result, so it
        # would mask the real exception or merge a stale result twice.
        k8s_health.merge(test_health)

    return k8s_health
def health(self) -> Health:
    """Build the Health report for this K8s deployment.

    The report starts as UNKNOWN.  While ``self._deployment`` is None only an
    informational "not yet started" entry is added; otherwise the deployment
    status check is run and merged in.
    """
    report = Health(source=self._instance_id, status=HealthStatus.UNKNOWN)

    if self._deployment is not None:
        for check in (self._health_deployment_status,):
            report.merge(check())
    else:
        report.info(f"Deployment: {self._instance_id} not yet started.")

    return report
def _health_deployment_status(self) -> "Health":
    """Check if kubernetes thinks the deployment is healthy.

    Reads the namespaced deployment through the AppsV1 API and turns its
    status conditions into health entries:

    * no status at all -> error
    * no status conditions -> warning
    * "Available" is "True" -> healthy
    * otherwise "Progressing" is "True" -> warning
    * otherwise -> error (neither progressing nor available)
    * every other condition type -> healthy when "True", error otherwise

    Returns:
        A Health for this instance.  A kubernetes ``ApiException`` raised by
        the read is recorded as an error entry rather than propagated.
    """
    health = Health(source=self._instance_id)
    apps_v1 = self._kubeapi_client.get_api("AppsV1Api")
    label = f"Deployment: [{self.namespace}/{self.name}]"

    try:
        deployment = apps_v1.read_namespaced_deployment(self.name, self.namespace)
    except kubernetes.client.rest.ApiException as err:
        health.error(f"Deployment: K8S REST API exception occured: {err}")
        return health

    status = deployment.status
    if status is None:
        # Nothing more can be inspected; stop before touching status.conditions.
        health.error(f"{label} retrieved no status.")
        return health

    conditions = status.conditions
    if not conditions:
        health.warning(f"{label} retrieved no status conditions.")
        return health

    # Either condition may be absent from the list, so default to None
    # instead of letting next() raise StopIteration.
    available_condition = next(
        (condition for condition in conditions if condition.type == "Available"),
        None,
    )
    progressing_condition = next(
        (condition for condition in conditions if condition.type == "Progressing"),
        None,
    )

    if available_condition and available_condition.status == "True":
        health.healthy(
            f"{label} Deployment is available -> {available_condition.message}"
        )
    elif progressing_condition and progressing_condition.status == "True":
        health.warning(
            f"{label} Deployment is progressing -> {progressing_condition.message}"
        )
    else:
        # Only report the messages of the conditions that actually exist.
        messages = " && ".join(
            str(condition.message)
            for condition in (available_condition, progressing_condition)
            if condition is not None
        )
        health.error(
            f"{label} Deployment is neither progressing nor available -> {messages}"
        )

    for condition in conditions:
        if condition.type in ("Available", "Progressing"):
            # Already reported above.
            continue
        if condition.status == "True":
            health.healthy(f"{label} {condition.type} -> {condition.message}")
        else:
            health.error(f"{label} {condition.type} -> {condition.message}")

    return health
def _health_swarm_nodes(self) -> Health:
    """Check whether every swarm node reports as healthy.

    For each node returned by ``self.nodes.list()`` the node attributes are
    inspected:

    * ``Status.State`` other than "ready" -> error
    * ``ManagerStatus.Reachability`` other than "reachable" -> error
    * ``Spec.Availability`` other than "active" -> warning

    A per-node summary entry is then added: healthy when nothing was flagged,
    a warning carrying the issue count otherwise.

    Returns:
        A Health for this instance.  Any exception raised while listing or
        inspecting nodes is recorded as an error entry rather than propagated.
    """
    health = Health(source=self._instance_id)

    try:
        for node in self.nodes.list():
            attrs = node.attrs
            description = attrs["Description"]["Hostname"]
            role = attrs["Spec"]["Role"]

            errors = 0

            if "Status" in attrs:
                node_status = attrs["Status"]
                if node_status.get("State") != "ready":
                    # The status payload may not carry a "Message" entry
                    # (assumption: empty fields can be omitted - confirm
                    # against the Engine API), so never index it directly.
                    message = node_status.get("Message", "")
                    health.error(f"Docker:Node: {role} {description} : {message}")
                    errors += 1

            if "ManagerStatus" in attrs:
                manager_status = attrs["ManagerStatus"]
                if manager_status.get("Reachability") != "reachable":
                    health.error(
                        f"Docker:Node: {role} {description} : manager is not reachable"
                    )
                    errors += 1

            if attrs["Spec"].get("Availability") != "active":
                health.warning(
                    f"Docker:Node: {role} {description} : is not available"
                )
                errors += 1

            if errors == 0:
                health.healthy(
                    f"Docker:Node: {role} {description} : reports healthy")
            else:
                health.warning(
                    f"Docker:Node: {role} {description} : is not health ({errors} issues.)"
                )

    # pylint: disable=broad-except
    except Exception as err:
        health.error(f"Docker: could not retrieve node health: {err}")

    return health
def _health_k8s_allstatefulset_health(self) -> Health:
    """Report on every statefulset in every namespace.

    A statefulset adds to the problem count once for a positive collision
    count and once for each condition whose status is not "True".  A summary
    entry is added at the end: healthy for zero problems, warning for one or
    two, error for three or more.
    """
    report = Health(source=self._instance_id)
    apps_api: api.apps_v1_api.AppsV1Api = self.get_api("AppsV1Api")

    problems = 0
    # pylint: disable=no-member
    for stateful in apps_api.list_stateful_set_for_all_namespaces().items:
        meta = stateful.metadata
        prefix = f"KubeAPI:Statefulset: [{meta.namespace}/{meta.name}]"
        # Example status fields: collision_count, conditions (may be None),
        # current_replicas, current_revision, observed_generation,
        # ready_replicas, replicas, update_revision, updated_replicas.
        sts_status = stateful.status

        collisions = sts_status.collision_count
        if collisions is not None and collisions > 0:
            report.warning(f"{prefix} -> Reports some collisions: {collisions}")
            problems += 1

        for condition in sts_status.conditions or ():
            detail = f"{prefix} {condition.type} -> {condition.message}"
            if condition.status == "True":
                report.healthy(detail)
                continue
            report.warning(detail)
            problems += 1

    if problems >= 3:
        report.error("KubeAPI: Kubernetes Reports cluster is unhealthy (statefulset health)")
    elif problems > 0:
        report.warning("KubeAPI: some statefulsets report condition failures")
    else:
        report.healthy("KubeAPI: all statefulsets report as healthy")

    return report
def _health_k8s_alldaemonset_health(self) -> Health:
    """Check if kubernetes thinks all the daemonsets are healthy.

    For every daemonset in every namespace a warning is recorded (and the
    unhealthy count incremented) when:

    * ``collision_count`` is positive,
    * ``number_unavailable`` is positive,
    * fewer nodes are currently scheduled than desired,
    * any status condition is not "True".

    A summary entry is added at the end: healthy for zero issues, warning for
    one or two, error for three or more.

    Returns:
        A Health for this instance.
    """
    health = Health(source=self._instance_id)
    apps_v1_api: api.apps_v1_api.AppsV1Api = self.get_api("AppsV1Api")

    unhealthy_dae_count = 0
    # pylint: disable=no-member
    for daemonset in apps_v1_api.list_daemon_set_for_all_namespaces().items:
        namespace = daemonset.metadata.namespace
        name = daemonset.metadata.name
        status = daemonset.status

        if status.collision_count is not None and status.collision_count > 0:
            health.warning(
                f"Daemonset: [{namespace}/{name}] collision_count "
                "-> Reports some collisions: "
                f"{status.collision_count}"
            )
            unhealthy_dae_count += 1

        if status.number_unavailable is not None and status.number_unavailable > 0:
            health.warning(
                f"Daemonset: [{namespace}/{name}] number_unavailable "
                "-> Reports some unavailable pods: "
                f"{status.number_unavailable}"
            )
            unhealthy_dae_count += 1

        # The problem case is being under-scheduled: fewer nodes currently
        # scheduled than the desired number.
        if status.current_number_scheduled < status.desired_number_scheduled:
            health.warning(
                f"Daemonset: [{namespace}/{name}] desired_number_scheduled "
                "-> Does not have the desired number scheduled: "
                f"{status.current_number_scheduled} < "
                f"{status.desired_number_scheduled}"
            )
            unhealthy_dae_count += 1

        if status.conditions:
            for condition in status.conditions:
                if condition.status == "True":
                    health.healthy(
                        f"Daemonset: [{namespace}/{name}] {condition.type} "
                        f"-> {condition.message}"
                    )
                else:
                    health.warning(
                        f"Daemonset: [{namespace}/{name}] {condition.type} "
                        f"-> {condition.message}"
                    )
                    unhealthy_dae_count += 1

    if unhealthy_dae_count == 0:
        health.healthy("KubeAPI: all daemonsets report as healthy")
    elif unhealthy_dae_count < 3:
        health.warning("KubeAPI: some daemonsets report condition failures")
    else:
        health.error("KubeAPI: Kubernetes Reports cluster is unhealthy (daemonset health)")

    return health
def _health_k8s_alldeployment_health(self) -> Health:
    """Report on every deployment in every namespace.

    Deployments without conditions are recorded as unknown and skipped.  For
    the rest, a deployment has issues when it is not "Available" (a warning
    distinguishes "Progressing" from neither) or when any other condition is
    not "True".  Each deployment gets a healthy / not healthy entry, and a
    summary entry is added at the end: healthy for zero troubled deployments,
    warning for one or two, error for three or more.
    """
    report = Health(source=self._instance_id)
    apps_api: api.apps_v1_api.AppsV1Api = self.get_api("AppsV1Api")

    troubled = 0
    # pylint: disable=no-member
    for dep in apps_api.list_deployment_for_all_namespaces().items:
        prefix = f"KubeAPI:Deployment: [{dep.metadata.namespace}/{dep.metadata.name}]"

        conditions = dep.status.conditions
        if not conditions:
            report.unknown(f"{prefix} Deployment does not have any conditions (yet?)")
            continue

        available = next((c for c in conditions if c.type == "Available"), None)
        progressing = next((c for c in conditions if c.type == "Progressing"), None)

        clean = True
        if not (available and available.status == "True"):
            clean = False
            if progressing and progressing.status == "True":
                report.warning(
                    f"{prefix} Deployment is progressing -> {progressing.message}"
                )
            else:
                messages = "\n".join(
                    [c.message for c in (progressing, available) if c is not None]
                )
                report.warning(
                    f"{prefix} Deployment is neither progressing nor available "
                    f"-> {messages}"
                )

        # Every remaining condition type must be "True" to count as fine.
        for condition in conditions:
            if condition.type not in ("Available", "Progressing") and condition.status != "True":
                report.warning(f"{prefix} {condition.type} -> {condition.message}")
                clean = False

        if clean:
            report.healthy(f"{prefix} is healthy")
        else:
            report.warning(f"{prefix} is not healthy")
            troubled += 1

    if troubled >= 3:
        report.error("KubeAPI: Kubernetes Reports cluster is unhealthy (deployment health)")
    elif troubled > 0:
        report.warning("KubeAPI: some deployments report condition failures")
    else:
        report.healthy("KubeAPI: all deployments report healthy")

    return report
def _health_k8s_node_health(self) -> Health:
    """Report on the conditions of every node returned by ``self.nodes()``.

    A node has issues when its "Ready" condition is not "True", or when any
    of "NetworkUnavailable", "MemoryPressure", "DiskPressure" or
    "PIDPressure" is "True".  Each issue is recorded as a warning, followed
    by one healthy or error entry per node.  Any exception raised during the
    check is recorded as an error instead of being propagated.
    """
    report = Health(source=self._instance_id)

    # (condition type, whether a "True" status signals trouble)
    watched = (
        ("Ready", False),
        ("NetworkUnavailable", True),
        ("MemoryPressure", True),
        ("DiskPressure", True),
        ("PIDPressure", True),
    )

    try:
        for node in self.nodes():
            node_name = node.metadata.name
            healthy = True

            for wanted_type, true_is_bad in watched:
                found = next(
                    (c for c in node.status.conditions if c.type == wanted_type),
                    None,
                )
                if found is None:
                    continue
                if true_is_bad:
                    in_trouble = found.status == "True"
                else:
                    in_trouble = found.status != "True"
                if in_trouble:
                    report.warning(f"KubeAPI: {node_name}: {found.message}")
                    healthy = False

            if healthy:
                report.healthy(f"KubeAPI: Node {node_name} reports healthy.")
            else:
                report.error(f"KubeAPI: Node {node_name} reporting issues.")

    # pylint: disable=broad-except
    except Exception as err:
        report.error(f"KubeAPI:Exception occured when check kubelet health: {err}")

    return report
def health(self) -> Health:
    """Translate the workload status into a Health report.

    POSTPROCESS, COMPLETE and PASSED are informational, FAILED is an error,
    and every other status is reported as still running.  If the status
    cannot be read (``CalledProcessError`` or ``AttributeError``) the health
    is marked unknown.
    """
    report = Health(source=self._instance_id)

    try:
        state = self.status().status

        if state in (Status.POSTPROCESS,):
            report.info(
                "Sonobuoy: run has finished, but result is not yet avaialble."
            )
        elif state in (Status.COMPLETE, Status.PASSED):
            report.info("Sonobuoy: completed.")
        elif state in (Status.FAILED,):
            report.error("Sonobuoy: run has produced a failure.")
        else:
            # Any other status (e.g. pending or running).
            report.info("Sonobuoy: Running")

    except (subprocess.CalledProcessError, AttributeError) as err:
        report.unknown(
            f"No status found. Sonobuoy is likely not running: {err}")

    return report
def health(self) -> Health:
    """Return a fresh Health whose source is "<client id>-<node id>".

    No checks are run; the Health object is returned as constructed.
    """
    source_id = f"{self._client_id}-{self._id}"
    return Health(source=source_id)
def _health_all_ping(self) -> Health:
    """Health check that tries to ping all of the hosts.

    Runs ``self.ping()`` and inspects the per-host ``stats`` of the result.
    The first non-zero counter, checked in this order, decides the entry:

    * ``ok`` -> healthy
    * ``unreachable`` -> warning
    * ``ignored`` -> warning
    * ``failures`` -> error
    * ``skipped`` -> error
    * none of the above -> warning including the raw task result for the host

    Returns:
        A Health for this instance.  A ping result without the expected
        ``plays``/``tasks``/``hosts``/``stats`` structure yields a single
        error entry.
    """
    ping_health = Health(source=self._instance_id)
    ping = self.ping()

    try:
        ping_task_result_hosts = ping["plays"][0]["tasks"][0]["hosts"]
        stats_hosts = ping["stats"]
    except (KeyError, IndexError, TypeError):
        # Missing keys, empty play/task lists or a non-mapping result all mean
        # the same thing: the output is not in the expected shape.
        ping_health.error("ansible ping gave unexpected results.")
        return ping_health

    for host, host_stats in stats_hosts.items():
        # .get() keeps a missing counter from raising; it simply counts as 0.
        if host_stats.get("ok"):
            ping_health.healthy(f"Ansible: {host} ping response ok.")
        elif host_stats.get("unreachable"):
            ping_health.warning(f"Ansible: {host} unreachable during ping.")
        elif host_stats.get("ignored"):
            ping_health.warning(f"Ansible: {host} ping ignored.")
        elif host_stats.get("failures"):
            ping_health.error(f"Ansible: {host} ping failed.")
        elif host_stats.get("skipped"):
            ping_health.error(f"Ansible: {host} ping skipped.")
        else:
            ping_health.warning(
                f"Ansible: {host} status not understood: {ping_task_result_hosts.get(host)}."
            )

    return ping_health