def get_kubernetes_pods_and_nodes(
    namespace: str,
) -> Tuple[Sequence[V1Pod], Sequence[V1Node]]:
    """Fetch every pod in *namespace* along with every node in the cluster.

    :param namespace: Kubernetes namespace to list pods from.
    :return: tuple of (pods in the namespace, all cluster nodes).
    """
    client = KubeClient()
    pods = get_all_pods(kube_client=client, namespace=namespace)
    nodes = get_all_nodes(client)
    return pods, nodes
def check_all_kubernetes_services_replication(soa_dir: str) -> None:
    """Run a replication check for every deployed Kubernetes service instance.

    Instances without a docker image (i.e. never deployed) are skipped with a
    debug log line instead of being checked.

    :param soa_dir: path to the soa-configs directory to enumerate services from.
    """
    client = KubeClient()
    pods = get_all_pods(client)
    nodes = get_all_nodes(client)
    paasta_config = load_system_paasta_config()
    cluster = paasta_config.get_cluster()
    replication_checker = KubeSmartstackReplicationChecker(
        nodes=nodes,
        system_paasta_config=paasta_config,
    )
    for service in list_services(soa_dir=soa_dir):
        config_loader = PaastaServiceConfigLoader(service=service, soa_dir=soa_dir)
        for instance_config in config_loader.instance_configs(
            cluster=cluster,
            instance_type_class=kubernetes_tools.KubernetesDeploymentConfig,
        ):
            # Only instances that have actually been deployed get monitored.
            if not instance_config.get_docker_image():
                log.debug(
                    '%s is not deployed. Skipping replication monitoring.'
                    % instance_config.job_id,
                )
                continue
            check_service_replication(
                instance_config=instance_config,
                all_pods=pods,
                smartstack_replication_checker=replication_checker,
            )
def get_all_running_kubernetes_pods(
    kube_client: KubeClient, namespace: str
) -> Iterable[V1Pod]:
    """Return the pods in *namespace* that are currently in the RUNNING state.

    :param kube_client: connected Kubernetes API client.
    :param namespace: namespace whose pods are inspected.
    :return: list of pods whose status is ``PodStatus.RUNNING``.
    """
    # A comprehension replaces the original manual append loop (same result,
    # idiomatic and built at C speed).
    return [
        pod
        for pod in kubernetes_tools.get_all_pods(kube_client, namespace)
        if kubernetes_tools.get_pod_status(pod) == kubernetes_tools.PodStatus.RUNNING
    ]
def assert_kube_pods_running(kube_client: KubeClient,) -> HealthCheckResult:
    """Summarize pod phases; healthy iff at least one pod is RUNNING.

    :param kube_client: connected Kubernetes API client.
    :return: HealthCheckResult with a running/pending/failed count message.
    """
    # Tally the three phases we report on in a single pass over the pods.
    running = pending = failed = 0
    for pod in get_all_pods(kube_client):
        status = get_pod_status(pod)
        if status == PodStatus.RUNNING:
            running += 1
        elif status == PodStatus.PENDING:
            pending += 1
        elif status == PodStatus.FAILED:
            failed += 1
    return HealthCheckResult(
        message=f"Pods: running: {running} pending: {pending} failed: {failed}",
        healthy=running > 0,
    )
def evicted_pods_per_service(
    client: KubeClient,
) -> Mapping[str, Sequence[EvictedPod]]:
    """Group all evicted pods in the cluster by their owning service.

    Pods whose service cannot be determined are logged and omitted.

    :param client: connected Kubernetes API client.
    :return: mapping of service name -> list of EvictedPod records.
    """
    all_pods = get_all_pods(kube_client=client, namespace="")
    evicted = get_evicted_pods(all_pods)
    log.info(f"Pods in evicted state: {[pod.metadata.name for pod in evicted]}")
    by_service: Dict[str, List[EvictedPod]] = defaultdict(list)
    for pod in evicted:
        service = get_pod_service(pod)
        if not service:
            log.info(f"Could not get service name for pod {pod.metadata.name}")
            continue
        by_service[service].append(
            EvictedPod(pod.metadata.name, pod.metadata.namespace, pod.status.message)
        )
    return by_service
def check_all_kubernetes_based_services_replication(
    soa_dir: str,
    service_instances: Sequence[str],
    instance_type_class: Type[InstanceConfig_T],
    check_service_replication: CheckServiceReplication,
    namespace: str,
) -> None:
    """Run replication checks for Kubernetes-based instances of every service.

    :param soa_dir: path to the soa-configs directory.
    :param service_instances: if non-empty, only check these
        ``service{SPACER}instance`` names.
    :param instance_type_class: instance-config class to load per service.
    :param check_service_replication: callback performing the actual check.
    :param namespace: Kubernetes namespace to pull pods from.
    """
    client = KubeClient()
    pods = get_all_pods(kube_client=client, namespace=namespace)
    nodes = get_all_nodes(client)
    paasta_config = load_system_paasta_config()
    cluster = paasta_config.get_cluster()
    replication_checker = KubeSmartstackReplicationChecker(
        nodes=nodes, system_paasta_config=paasta_config)
    wanted = set(service_instances)
    for service in list_services(soa_dir=soa_dir):
        loader = PaastaServiceConfigLoader(service=service, soa_dir=soa_dir)
        for instance_config in loader.instance_configs(
                cluster=cluster, instance_type_class=instance_type_class):
            # An empty filter means "check everything"; otherwise skip
            # instances that were not explicitly requested.
            if wanted and f"{service}{SPACER}{instance_config.instance}" not in wanted:
                continue
            if instance_config.get_docker_image():
                check_service_replication(
                    instance_config=instance_config,
                    all_pods=pods,
                    smartstack_replication_checker=replication_checker,
                )
            else:
                log.debug(
                    "%s is not deployed. Skipping replication monitoring."
                    % instance_config.job_id)
def main():
    """Terminate pods that finished (or errored) longer ago than the configured thresholds.

    Completed pods older than ``--minutes`` are always candidates; failed pods
    older than ``--error-minutes`` are candidates only when that flag is given.
    With ``--dry-run`` the candidates are logged and nothing is deleted.
    """
    args = parse_args()
    setup_logging(args.verbose)
    kube_client = KubeClient()
    pods = get_all_pods(kube_client, args.namespace)
    allowed_uptime_minutes = args.minutes
    allowed_error_minutes = args.error_minutes
    completed_pods = []
    errored_pods = []
    for pod in pods:
        if is_pod_completed(pod) and _completed_longer_than_threshold(
            pod, allowed_uptime_minutes
        ):
            completed_pods.append(pod)
        elif (
            # errored-pod cleanup is currently optional
            allowed_error_minutes is not None
            # there's no direct way to get what type of "bad" state these Pods ended up in
            # (kubectl looks at phase and then container statuses to give something
            # descriptive) but, in the end, we really just care that a Pod is in a
            # Failed phase
            and pod.status.phase == "Failed"
        ):
            try:
                # and that said Pod has been around for a while (generally longer than
                # we'd leave Pods that exited successfully)
                # NOTE: we do this in a try-except since we're intermittently seeing pods
                # in an error state without a PodScheduled condition (even though that
                # should be impossible). This is not ideal, but it's fine to skip these
                # since this isn't a critical process.
                if _scheduled_longer_than_threshold(pod, allowed_error_minutes):
                    errored_pods.append(pod)
            except AttributeError:
                # BUGFIX: the log message literal was previously broken (a stray quote
                # and a raw line break inside the f-string); reconstructed as two
                # implicitly-concatenated f-strings.
                log.exception(
                    f"Unable to check {pod.metadata.name}'s schedule time. "
                    f"Pod status: {pod.status}."
                )
    if not (completed_pods or errored_pods):
        log.debug("No pods to terminate.")
        sys.exit(0)
    if args.dry_run:
        log.debug(
            "Dry run would have terminated the following completed pods:\n "
            + "\n ".join([pod.metadata.name for pod in completed_pods])
        )
        log.debug(
            "Dry run would have terminated the following errored pods:\n "
            + "\n ".join([pod.metadata.name for pod in errored_pods])
        )
        sys.exit(0)
    completed_successes, completed_errors = terminate_pods(completed_pods, kube_client)
    errored_successes, errored_errors = terminate_pods(errored_pods, kube_client)
    successes = {
        "completed": completed_successes,
        "errored": errored_successes,
    }
    errors = {
        "completed": completed_errors,
        "errored": errored_errors,
    }
    for typ, pod_names in successes.items():
        if pod_names:
            log.debug(
                f"Successfully terminated the following {typ} pods:\n"
                + "\n ".join(pod_names)
            )
    # we've only really seen this fail recently due to the k8s API being flaky and
    # returning 404s for Pods that it's returning to us when we get all Pods, so we
    # just print the error here for now and don't exit with a non-zero exit code
    # since, again, this isn't a critical process
    for typ, pod_names_and_errors in errors.items():
        if pod_names_and_errors:
            log.error(
                f"Failed to terminate the following {typ} pods:\n"
                + "\n ".join(
                    f"{pod_name}: {error}"
                    for pod_name, error in pod_names_and_errors
                )
            )