def get_resource_utilization_by_grouping_kube(
    grouping_func: _GenericNodeGroupingFunctionT,
    kube_client: KubeClient,
    filters: Sequence[_GenericNodeFilterFunctionT] = [],
    sort_func: _GenericNodeSortFunctionT = None,
) -> Mapping[_KeyFuncRetT, ResourceUtilizationDict]:
    """
    Given a function used to group nodes, calculate resource utilization
    for each value of a given attribute.

    :param grouping_func: a function that, given a node, returns the value of an
        attribute to group by.
    :param kube_client: the Kubernetes client
    :param filters: filters to apply to the nodes in the calculation, with
        filtering performed by filter_slaves
    :param sort_func: a function that, given a list of nodes, returns the sorted
        list of nodes.
    :returns: a dict of {attribute_value: resource_usage}, where resource usage
        is the dict returned by ``calculate_resource_utilization_for_kube_nodes``
        for nodes grouped by attribute value.
    """
    nodes: Sequence[V1Node] = get_all_nodes(kube_client)
    nodes = filter_slaves(nodes, filters)
    if len(nodes) == 0:
        raise ValueError("There are no nodes registered in the Kubernetes cluster.")

    node_groupings = group_slaves_by_key_func(grouping_func, nodes, sort_func)
    return {
        attribute_value: calculate_resource_utilization_for_kube_nodes(nodes)
        for attribute_value, nodes in node_groupings.items()
    }
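# Illustrative usage sketch (not part of the original module): one plausible way to
# call the helper above is to group nodes by a label. The "paasta.yelp.com/pool"
# label key, the "unknown" fallback, and the use of the raw label value as the
# grouping key are all assumptions made for this example only.
def _example_utilization_by_pool(kube_client: KubeClient) -> None:
    utilization = get_resource_utilization_by_grouping_kube(
        grouping_func=lambda node: node.metadata.labels.get(
            "paasta.yelp.com/pool", "unknown"  # hypothetical label key
        ),
        kube_client=kube_client,
    )
    for pool, usage in utilization.items():
        log.info(f"pool {pool}: {usage}")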
def get_kubernetes_pods_and_nodes(
    namespace: str,
) -> Tuple[Sequence[V1Pod], Sequence[V1Node]]:
    kube_client = KubeClient()
    all_pods = get_all_pods(kube_client=kube_client, namespace=namespace)
    all_nodes = get_all_nodes(kube_client)

    return all_pods, all_nodes
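# Illustrative usage sketch (the "paasta" namespace value is an assumption, not a
# requirement): fetching pods and nodes in one place gives downstream replication
# checks a consistent snapshot of the cluster.
def _example_snapshot() -> Tuple[Sequence[V1Pod], Sequence[V1Node]]:
    return get_kubernetes_pods_and_nodes(namespace="paasta")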
def check_all_kubernetes_services_replication(soa_dir: str) -> None:
    kube_client = KubeClient()
    all_pods = get_all_pods(kube_client)
    all_nodes = get_all_nodes(kube_client)
    system_paasta_config = load_system_paasta_config()
    cluster = system_paasta_config.get_cluster()
    smartstack_replication_checker = KubeSmartstackReplicationChecker(
        nodes=all_nodes,
        system_paasta_config=system_paasta_config,
    )

    for service in list_services(soa_dir=soa_dir):
        service_config = PaastaServiceConfigLoader(service=service, soa_dir=soa_dir)
        for instance_config in service_config.instance_configs(
            cluster=cluster,
            instance_type_class=kubernetes_tools.KubernetesDeploymentConfig,
        ):
            if instance_config.get_docker_image():
                check_service_replication(
                    instance_config=instance_config,
                    all_pods=all_pods,
                    smartstack_replication_checker=smartstack_replication_checker,
                )
            else:
                log.debug(
                    "%s is not deployed. Skipping replication monitoring."
                    % instance_config.job_id,
                )
def smartstack_status(
    service: str,
    instance: str,
    job_config: LongRunningServiceConfig,
    service_namespace_config: ServiceNamespaceConfig,
    pods: Sequence[V1Pod],
    settings: Any,
    should_return_individual_backends: bool = False,
) -> Mapping[str, Any]:
    registration = job_config.get_registrations()[0]
    instance_pool = job_config.get_pool()

    smartstack_replication_checker = KubeSmartstackReplicationChecker(
        nodes=kubernetes_tools.get_all_nodes(settings.kubernetes_client),
        system_paasta_config=settings.system_paasta_config,
    )
    node_hostname_by_location = smartstack_replication_checker.get_allowed_locations_and_hosts(
        job_config
    )

    expected_smartstack_count = marathon_tools.get_expected_instance_count_for_namespace(
        service=service,
        namespace=instance,
        cluster=settings.cluster,
        instance_type_class=KubernetesDeploymentConfig,
    )
    expected_count_per_location = int(
        expected_smartstack_count / len(node_hostname_by_location)
    )

    smartstack_status: MutableMapping[str, Any] = {
        "registration": registration,
        "expected_backends_per_location": expected_count_per_location,
        "locations": [],
    }

    for location, hosts in node_hostname_by_location.items():
        synapse_host = smartstack_replication_checker.get_first_host_in_pool(
            hosts, instance_pool
        )
        sorted_backends = sorted(
            smartstack_tools.get_backends(
                registration,
                synapse_host=synapse_host,
                synapse_port=settings.system_paasta_config.get_synapse_port(),
                synapse_haproxy_url_format=settings.system_paasta_config.get_synapse_haproxy_url_format(),
            ),
            key=lambda backend: backend["status"],
            reverse=True,  # put 'UP' backends above 'MAINT' backends
        )
        matched_backends_and_pods = match_backends_and_pods(sorted_backends, pods)
        location_dict = smartstack_tools.build_smartstack_location_dict(
            location, matched_backends_and_pods, should_return_individual_backends
        )
        smartstack_status["locations"].append(location_dict)

    return smartstack_status
def get_kube_resource_utilization_health(
    kube_client: KubeClient,
) -> Sequence[HealthCheckResult]:
    """Perform healthchecks against Kubernetes.

    :param kube_client: the Kubernetes client
    :returns: a list of HealthCheckResult tuples
    """
    nodes = get_all_nodes(kube_client)
    return [
        assert_cpu_health(get_kube_cpu_status(nodes)),
        assert_memory_health(get_kube_memory_status(nodes)),
        assert_disk_health(get_kube_disk_status(nodes)),
        assert_gpu_health(get_kube_gpu_status(nodes)),
        assert_nodes_health(get_kube_nodes_health_status(nodes)),
    ]
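# Illustrative usage sketch (not part of the original module): one way a caller
# might consume these results. The "message" and "healthy" attribute names on
# HealthCheckResult are assumptions about the tuple's shape; treat them as
# hypothetical.
def _example_report_kube_health(kube_client: KubeClient) -> bool:
    results = get_kube_resource_utilization_health(kube_client)
    for result in results:
        log.info(result.message)  # assumed field on HealthCheckResult
    return all(result.healthy for result in results)  # assumed field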
def main() -> None:
    args = parse_args()
    if args.verbose:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.INFO)

    dry_run = args.dry_run

    kube_client = KubeClient()
    all_nodes = get_all_nodes(kube_client)
    log.debug(f"found nodes in cluster {[node.metadata.name for node in all_nodes]}")

    # We depend on IAM credentials existing on the host for this to run;
    # anywhere else, you'll need to set credentials using environment variables.
    # We also assume that all nodes are in the same region.
    region = all_nodes[0].metadata.labels["failure-domain.beta.kubernetes.io/region"]
    ec2_client = boto3.client("ec2", region)

    filtered_nodes = nodes_for_cleanup(ec2_client, all_nodes)
    if logging.DEBUG >= logging.root.level:
        log.debug(
            f"nodes to be deleted: {[node.metadata.name for node in filtered_nodes]}"
        )

    if not dry_run:
        success, errors = terminate_nodes(
            kube_client, [node.metadata.name for node in filtered_nodes]
        )
    else:
        success, errors = [], []
        log.info("dry run mode detected: not deleting nodes")

    for node_name in success:
        log.info(f"successfully deleted node {node_name}")

    for node_name, exception in errors:
        log.error(f"error deleting node: {node_name}: {exception}")

    if errors:
        sys.exit(1)
def check_all_kubernetes_based_services_replication(
    soa_dir: str,
    service_instances: Sequence[str],
    instance_type_class: Type[InstanceConfig_T],
    check_service_replication: CheckServiceReplication,
    namespace: str,
) -> None:
    kube_client = KubeClient()
    all_pods = get_all_pods(kube_client=kube_client, namespace=namespace)
    all_nodes = get_all_nodes(kube_client)
    system_paasta_config = load_system_paasta_config()
    cluster = system_paasta_config.get_cluster()
    smartstack_replication_checker = KubeSmartstackReplicationChecker(
        nodes=all_nodes,
        system_paasta_config=system_paasta_config,
    )
    service_instances_set = set(service_instances)

    for service in list_services(soa_dir=soa_dir):
        service_config = PaastaServiceConfigLoader(service=service, soa_dir=soa_dir)
        for instance_config in service_config.instance_configs(
            cluster=cluster,
            instance_type_class=instance_type_class,
        ):
            if (
                service_instances_set
                and f"{service}{SPACER}{instance_config.instance}"
                not in service_instances_set
            ):
                continue
            if instance_config.get_docker_image():
                check_service_replication(
                    instance_config=instance_config,
                    all_pods=all_pods,
                    smartstack_replication_checker=smartstack_replication_checker,
                )
            else:
                log.debug(
                    "%s is not deployed. Skipping replication monitoring."
                    % instance_config.job_id
                )
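# Illustrative call sketch (not part of the original module): the namespace and the
# replication-check callable below are assumptions chosen to show the expected
# argument shapes, not values copied from the real callers.
def _example_run_replication_checks(soa_dir: str) -> None:
    check_all_kubernetes_based_services_replication(
        soa_dir=soa_dir,
        service_instances=[],  # an empty list means no instance is filtered out
        instance_type_class=kubernetes_tools.KubernetesDeploymentConfig,
        check_service_replication=check_service_replication,  # assumed to be the helper used above
        namespace="paasta",  # hypothetical namespace
    )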
def mesh_status(
    service: str,
    service_mesh: ServiceMesh,
    instance: str,
    job_config: LongRunningServiceConfig,
    service_namespace_config: ServiceNamespaceConfig,
    pods: Sequence[V1Pod],
    settings: Any,
    should_return_individual_backends: bool = False,
) -> Mapping[str, Any]:
    registration = job_config.get_registrations()[0]
    instance_pool = job_config.get_pool()

    replication_checker = KubeSmartstackEnvoyReplicationChecker(
        nodes=kubernetes_tools.get_all_nodes(settings.kubernetes_client),
        system_paasta_config=settings.system_paasta_config,
    )
    node_hostname_by_location = replication_checker.get_allowed_locations_and_hosts(
        job_config
    )

    expected_smartstack_count = marathon_tools.get_expected_instance_count_for_namespace(
        service=service,
        namespace=job_config.get_nerve_namespace(),
        cluster=settings.cluster,
        instance_type_class=KubernetesDeploymentConfig,
    )
    expected_count_per_location = int(
        expected_smartstack_count / len(node_hostname_by_location)
    )

    mesh_status: MutableMapping[str, Any] = {
        "registration": registration,
        "expected_backends_per_location": expected_count_per_location,
        "locations": [],
    }

    for location, hosts in node_hostname_by_location.items():
        host = replication_checker.get_first_host_in_pool(hosts, instance_pool)
        if service_mesh == ServiceMesh.SMARTSTACK:
            mesh_status["locations"].append(
                _build_smartstack_location_dict(
                    synapse_host=host,
                    synapse_port=settings.system_paasta_config.get_synapse_port(),
                    synapse_haproxy_url_format=settings.system_paasta_config.get_synapse_haproxy_url_format(),
                    registration=registration,
                    pods=pods,
                    location=location,
                    should_return_individual_backends=should_return_individual_backends,
                )
            )
        elif service_mesh == ServiceMesh.ENVOY:
            mesh_status["locations"].append(
                _build_envoy_location_dict(
                    envoy_host=host,
                    envoy_admin_port=settings.system_paasta_config.get_envoy_admin_port(),
                    envoy_admin_endpoint_format=settings.system_paasta_config.get_envoy_admin_endpoint_format(),
                    registration=registration,
                    pods=pods,
                    location=location,
                    should_return_individual_backends=should_return_individual_backends,
                )
            )

    return mesh_status