Example #1
def get_resource_utilization_by_grouping_kube(
    grouping_func: _GenericNodeGroupingFunctionT,
    kube_client: KubeClient,
    filters: Sequence[_GenericNodeFilterFunctionT] = [],
    sort_func: _GenericNodeSortFunctionT = None,
) -> Mapping[_KeyFuncRetT, ResourceUtilizationDict]:
    """ Given a function used to group nodes, calculate resource utilization
    for each value of a given attribute.

    :param grouping_func: a function that, given a node, returns the value of
    the attribute to group by.
    :param kube_client: the Kubernetes client
    :param filters: filters to apply to the nodes in the calculation, with
    filtering performed by ``filter_slaves``
    :param sort_func: a function that, given a list of nodes, returns the
    sorted list of nodes.
    :returns: a dict of {attribute_value: resource_usage}, where resource usage
    is the dict returned by ``calculate_resource_utilization_for_kube_nodes`` for
    nodes grouped by attribute value.
    """
    nodes: Sequence[V1Node] = get_all_nodes(kube_client)
    nodes = filter_slaves(nodes, filters)
    if len(nodes) == 0:
        raise ValueError(
            "There are no nodes registered in the Kubernetes cluster."
        )

    node_groupings = group_slaves_by_key_func(grouping_func, nodes, sort_func)

    return {
        attribute_value: calculate_resource_utilization_for_kube_nodes(nodes)
        for attribute_value, nodes in node_groupings.items()
    }
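
A minimal usage sketch: group utilization by each node's pool. The label key "yelp.com/pool" and the grouping lambda are illustrative assumptions, not taken from the snippet above.

# Hypothetical usage sketch: group node utilization by a pool label.
kube_client = KubeClient()
utilization_by_pool = get_resource_utilization_by_grouping_kube(
    grouping_func=lambda node: node.metadata.labels.get("yelp.com/pool", "default"),
    kube_client=kube_client,
)
for pool, usage in utilization_by_pool.items():
    print(pool, usage)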
Example #2
def get_kubernetes_pods_and_nodes(
    namespace: str,
) -> Tuple[Sequence[V1Pod], Sequence[V1Node]]:
    kube_client = KubeClient()
    all_pods = get_all_pods(kube_client=kube_client, namespace=namespace)
    all_nodes = get_all_nodes(kube_client)

    return all_pods, all_nodes
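
Usage is a single call; the namespace below is a placeholder.

# Hypothetical usage: fetch pods and nodes for one namespace.
pods, nodes = get_kubernetes_pods_and_nodes(namespace="paasta")
print(f"found {len(pods)} pods across {len(nodes)} nodes")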
Example #3
def check_all_kubernetes_services_replication(soa_dir: str) -> None:
    kube_client = KubeClient()
    all_pods = get_all_pods(kube_client)
    all_nodes = get_all_nodes(kube_client)
    system_paasta_config = load_system_paasta_config()
    cluster = system_paasta_config.get_cluster()
    smartstack_replication_checker = KubeSmartstackReplicationChecker(
        nodes=all_nodes,
        system_paasta_config=system_paasta_config,
    )

    for service in list_services(soa_dir=soa_dir):
        service_config = PaastaServiceConfigLoader(service=service, soa_dir=soa_dir)
        for instance_config in service_config.instance_configs(
            cluster=cluster,
            instance_type_class=kubernetes_tools.KubernetesDeploymentConfig,
        ):
            if instance_config.get_docker_image():
                check_service_replication(
                    instance_config=instance_config,
                    all_pods=all_pods,
                    smartstack_replication_checker=smartstack_replication_checker,
                )
            else:
                log.debug(
                    "%s is not deployed. Skipping replication monitoring."
                    % instance_config.job_id
                )
Example #4
def smartstack_status(
    service: str,
    instance: str,
    job_config: LongRunningServiceConfig,
    service_namespace_config: ServiceNamespaceConfig,
    pods: Sequence[V1Pod],
    settings: Any,
    should_return_individual_backends: bool = False,
) -> Mapping[str, Any]:

    registration = job_config.get_registrations()[0]
    instance_pool = job_config.get_pool()

    smartstack_replication_checker = KubeSmartstackReplicationChecker(
        nodes=kubernetes_tools.get_all_nodes(settings.kubernetes_client),
        system_paasta_config=settings.system_paasta_config,
    )
    node_hostname_by_location = smartstack_replication_checker.get_allowed_locations_and_hosts(
        job_config
    )

    expected_smartstack_count = marathon_tools.get_expected_instance_count_for_namespace(
        service=service,
        namespace=instance,
        cluster=settings.cluster,
        instance_type_class=KubernetesDeploymentConfig,
    )
    expected_count_per_location = int(
        expected_smartstack_count / len(node_hostname_by_location)
    )
    smartstack_status: MutableMapping[str, Any] = {
        "registration": registration,
        "expected_backends_per_location": expected_count_per_location,
        "locations": [],
    }

    for location, hosts in node_hostname_by_location.items():
        synapse_host = smartstack_replication_checker.get_first_host_in_pool(
            hosts, instance_pool
        )
        sorted_backends = sorted(
            smartstack_tools.get_backends(
                registration,
                synapse_host=synapse_host,
                synapse_port=settings.system_paasta_config.get_synapse_port(),
                synapse_haproxy_url_format=settings.system_paasta_config.get_synapse_haproxy_url_format(),
            ),
            key=lambda backend: backend["status"],
            reverse=True,  # put 'UP' backends above 'MAINT' backends
        )

        matched_backends_and_pods = match_backends_and_pods(sorted_backends, pods)
        location_dict = smartstack_tools.build_smartstack_location_dict(
            location, matched_backends_and_pods, should_return_individual_backends
        )
        smartstack_status["locations"].append(location_dict)

    return smartstack_status
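
The per-location expectation is integer division of the total expected count by the number of locations, so remainders are truncated. A worked example with made-up numbers:

# Worked example of the per-location calculation (illustrative values).
expected_smartstack_count = 10
locations = ["uswest1a", "uswest1b", "uswest1c"]
expected_count_per_location = int(expected_smartstack_count / len(locations))
assert expected_count_per_location == 3  # int() truncates the remainder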
Example #5
def get_kube_resource_utilization_health(
    kube_client: KubeClient,
) -> Sequence[HealthCheckResult]:
    """Perform healthchecks against Kubernetes.
    :param kube_client: the Kubernetes client
    :returns: a list of HealthCheckResult tuples
    """

    nodes = get_all_nodes(kube_client)

    return [
        assert_cpu_health(get_kube_cpu_status(nodes)),
        assert_memory_health(get_kube_memory_status(nodes)),
        assert_disk_health(get_kube_disk_status(nodes)),
        assert_gpu_health(get_kube_gpu_status(nodes)),
        assert_nodes_health(get_kube_nodes_health_status(nodes)),
    ]
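
A short usage sketch; it assumes HealthCheckResult is a named tuple with message and healthy fields, which is how the return value reads above.

# Hypothetical usage: report each healthcheck result.
kube_client = KubeClient()
for result in get_kube_resource_utilization_health(kube_client):
    print(result.message, "OK" if result.healthy else "CRITICAL")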
Example #6
def main() -> None:
    args = parse_args()
    if args.verbose:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.INFO)

    dry_run = args.dry_run

    kube_client = KubeClient()
    all_nodes = get_all_nodes(kube_client)
    log.debug(f"found nodes in cluster {[node.metadata.name for node in all_nodes]}")

    # We depend on IAM credentials existing on the host for this to run;
    # anywhere else, you'll need to set credentials using environment variables.
    # We also assume here that all nodes are in the same region.
    region = all_nodes[0].metadata.labels["failure-domain.beta.kubernetes.io/region"]
    ec2_client = boto3.client("ec2", region)

    filtered_nodes = nodes_for_cleanup(ec2_client, all_nodes)
    if logging.DEBUG >= logging.root.level:
        log.debug(
            f"nodes to be deleted: {[node.metadata.name for node in filtered_nodes]}"
        )

    if not dry_run:
        success, errors = terminate_nodes(
            kube_client, [node.metadata.name for node in filtered_nodes]
        )
    else:
        success, errors = [], []
        log.info("dry run mode detected: not deleting nodes")

    for node_name in success:
        log.info(f"successfully deleted node {node_name}")

    for node_name, exception in errors:
        log.error(f"error deleting node: {node_name}: {exception}")

    if errors:
        sys.exit(1)
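
The region lookup above raises KeyError when the first node lacks the failure-domain label. A defensive variant, as a sketch rather than the script's actual behavior:

# Defensive sketch (an assumption, not the original behavior): exit with a
# clear message when the region label is missing from the first node.
labels = all_nodes[0].metadata.labels or {}
region = labels.get("failure-domain.beta.kubernetes.io/region")
if region is None:
    log.error("first node has no region label; cannot create an EC2 client")
    sys.exit(1)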
Example #7
def check_all_kubernetes_based_services_replication(
    soa_dir: str,
    service_instances: Sequence[str],
    instance_type_class: Type[InstanceConfig_T],
    check_service_replication: CheckServiceReplication,
    namespace: str,
) -> None:
    kube_client = KubeClient()
    all_pods = get_all_pods(kube_client=kube_client, namespace=namespace)
    all_nodes = get_all_nodes(kube_client)
    system_paasta_config = load_system_paasta_config()
    cluster = system_paasta_config.get_cluster()
    smartstack_replication_checker = KubeSmartstackReplicationChecker(
        nodes=all_nodes, system_paasta_config=system_paasta_config
    )
    service_instances_set = set(service_instances)

    for service in list_services(soa_dir=soa_dir):
        service_config = PaastaServiceConfigLoader(service=service, soa_dir=soa_dir)
        for instance_config in service_config.instance_configs(
            cluster=cluster, instance_type_class=instance_type_class
        ):
            if (
                service_instances_set
                and f"{service}{SPACER}{instance_config.instance}"
                not in service_instances_set
            ):
                continue
            if instance_config.get_docker_image():
                check_service_replication(
                    instance_config=instance_config,
                    all_pods=all_pods,
                    smartstack_replication_checker=smartstack_replication_checker,
                )
            else:
                log.debug(
                    "%s is not deployed. Skipping replication monitoring."
                    % instance_config.job_id
                )
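
A hypothetical invocation that limits checking to two instances; it assumes SPACER is PaaSTA's "." job-id separator, so entries take the "service.instance" form (the service names and soa_dir below are made up):

# Illustrative call: only check replication for two named instances.
check_all_kubernetes_based_services_replication(
    soa_dir="/nail/etc/services",
    service_instances=["myservice.main", "myservice.canary"],
    instance_type_class=KubernetesDeploymentConfig,
    check_service_replication=check_service_replication,
    namespace="paasta",
)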
Example #8
def mesh_status(
    service: str,
    service_mesh: ServiceMesh,
    instance: str,
    job_config: LongRunningServiceConfig,
    service_namespace_config: ServiceNamespaceConfig,
    pods: Sequence[V1Pod],
    settings: Any,
    should_return_individual_backends: bool = False,
) -> Mapping[str, Any]:

    registration = job_config.get_registrations()[0]
    instance_pool = job_config.get_pool()

    replication_checker = KubeSmartstackEnvoyReplicationChecker(
        nodes=kubernetes_tools.get_all_nodes(settings.kubernetes_client),
        system_paasta_config=settings.system_paasta_config,
    )
    node_hostname_by_location = replication_checker.get_allowed_locations_and_hosts(
        job_config
    )

    expected_smartstack_count = marathon_tools.get_expected_instance_count_for_namespace(
        service=service,
        namespace=job_config.get_nerve_namespace(),
        cluster=settings.cluster,
        instance_type_class=KubernetesDeploymentConfig,
    )
    expected_count_per_location = int(
        expected_smartstack_count / len(node_hostname_by_location)
    )
    mesh_status: MutableMapping[str, Any] = {
        "registration": registration,
        "expected_backends_per_location": expected_count_per_location,
        "locations": [],
    }

    for location, hosts in node_hostname_by_location.items():
        host = replication_checker.get_first_host_in_pool(hosts, instance_pool)
        if service_mesh == ServiceMesh.SMARTSTACK:
            mesh_status["locations"].append(
                _build_smartstack_location_dict(
                    synapse_host=host,
                    synapse_port=settings.system_paasta_config.get_synapse_port(),
                    synapse_haproxy_url_format=settings.system_paasta_config.get_synapse_haproxy_url_format(),
                    registration=registration,
                    pods=pods,
                    location=location,
                    should_return_individual_backends=should_return_individual_backends,
                )
            )
        elif service_mesh == ServiceMesh.ENVOY:
            mesh_status["locations"].append(
                _build_envoy_location_dict(
                    envoy_host=host,
                    envoy_admin_port=settings.system_paasta_config.get_envoy_admin_port(),
                    envoy_admin_endpoint_format=settings.system_paasta_config.get_envoy_admin_endpoint_format(),
                    registration=registration,
                    pods=pods,
                    location=location,
                    should_return_individual_backends=should_return_individual_backends,
                )
            )
    return mesh_status
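
mesh_status dispatches on the ServiceMesh enum; a sketch of an Envoy-side call, where every argument value is a placeholder pulled from the surrounding context:

# Hypothetical call: query Envoy-side mesh status for one instance.
status = mesh_status(
    service="myservice",
    service_mesh=ServiceMesh.ENVOY,
    instance="main",
    job_config=job_config,  # a LongRunningServiceConfig
    service_namespace_config=service_namespace_config,
    pods=pods,  # Sequence[V1Pod]
    settings=settings,
    should_return_individual_backends=True,
)
print(status["expected_backends_per_location"], len(status["locations"]))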