def format_custom_resource(
    instance_config: Mapping[str, Any],
    service: str,
    instance: str,
    cluster: str,
    kind: str,
    version: str,
    group: str,
    namespace: str,
) -> Mapping[str, Any]:
    """Build the custom resource body for one service instance.

    The resource is named ``<sanitised service>-<sanitised instance>``,
    labelled with the raw service, instance and cluster names, annotated with
    a ``running`` desired state, and carries ``instance_config`` as its spec.
    A hash of ``instance_config`` is stored in the
    ``yelp.com/paasta_config_sha`` label.

    :param instance_config: the spec to embed in the resource
    :param service: service name (sanitised for the resource name only)
    :param instance: instance name (sanitised for the resource name only)
    :param cluster: cluster name, stored as a label
    :param kind: value of the resource's ``kind`` field
    :param version: API version, combined with ``group`` into ``apiVersion``
    :param group: API group, combined with ``version`` into ``apiVersion``
    :param namespace: namespace recorded in the resource metadata
    :returns: the assembled resource dictionary
    """
    resource_name = (
        f"{sanitise_kubernetes_name(service)}-{sanitise_kubernetes_name(instance)}"
    )
    labels = {
        "yelp.com/paasta_service": service,
        "yelp.com/paasta_instance": instance,
        "yelp.com/paasta_cluster": cluster,
    }
    metadata = {
        "name": resource_name,
        "namespace": namespace,
        "labels": labels,
        "annotations": {"yelp.com/desired_state": "running"},
    }
    # The hash covers only the spec, not the surrounding metadata.
    labels["yelp.com/paasta_config_sha"] = get_config_hash(instance_config)
    return {
        "apiVersion": f"{group}/{version}",
        "kind": kind,
        "metadata": metadata,
        "spec": instance_config,
    }
def get_secret_env(self) -> Mapping[str, dict]:
    """Describe where each secret-backed environment variable is stored.

    Walks the ``env`` section of ``self.config_dict`` and, for every value
    that is a secret reference, records the name of the secret object
    (``tron-secret-<sanitised service>-<sanitised secret>``) together with the
    key to read from it. Shared secrets are attributed to
    ``SHARED_SECRET_SERVICE`` instead of ``self.service``.

    :returns: mapping of env var name to ``{"secret_name": ..., "key": ...}``;
        env vars that are not secret references are omitted
    """
    secret_env = {}
    for var_name, var_value in self.config_dict.get("env", {}).items():
        if not is_secret_ref(var_value):
            continue

        secret = get_secret_name_from_ref(var_value)
        sanitised_secret = sanitise_kubernetes_name(secret)

        if is_shared_secret(var_value):
            owning_service = SHARED_SECRET_SERVICE
        else:
            owning_service = self.service
        sanitised_owner = sanitise_kubernetes_name(owning_service)

        secret_env[var_name] = {
            "secret_name": f"tron-secret-{sanitised_owner}-{sanitised_secret}",
            "key": secret,
        }
    return secret_env
def format_custom_resource(
    instance_config: Mapping[str, Any],
    service: str,
    instance: str,
    cluster: str,
    kind: str,
    version: str,
    group: str,
    namespace: str,
    git_sha: str,
) -> Mapping[str, Any]:
    """Build the custom resource body for one service instance.

    Every label and annotation is written twice: once under the
    ``yelp.com/...`` key and once under the key produced by
    ``paasta_prefixed``. The ``git_sha`` label is the exception and is only
    written under the ``paasta_prefixed`` key.

    The config hash is computed over the resource as it stands *before* the
    desired-state annotations and the config/git sha labels are added, so
    those fields do not feed into the hash.

    :param instance_config: the spec to embed in the resource
    :param service: service name (sanitised for the resource name only)
    :param instance: instance name (sanitised for the resource name only)
    :param cluster: cluster name, stored as a label
    :param kind: value of the resource's ``kind`` field
    :param version: API version, combined with ``group`` into ``apiVersion``
    :param group: API group, combined with ``version`` into ``apiVersion``
    :param namespace: namespace recorded in the resource metadata
    :param git_sha: value stored in the ``git_sha`` label
    :returns: the assembled resource dictionary
    """
    resource_name = (
        f"{sanitise_kubernetes_name(service)}-{sanitise_kubernetes_name(instance)}"
    )
    resource: Mapping[str, Any] = {
        "apiVersion": f"{group}/{version}",
        "kind": kind,
        "metadata": {
            "name": resource_name,
            "namespace": namespace,
            "labels": {
                "yelp.com/paasta_service": service,
                "yelp.com/paasta_instance": instance,
                "yelp.com/paasta_cluster": cluster,
                paasta_prefixed("service"): service,
                paasta_prefixed("instance"): instance,
                paasta_prefixed("cluster"): cluster,
            },
            "annotations": {},
        },
        "spec": instance_config,
    }
    labels = resource["metadata"]["labels"]
    annotations = resource["metadata"]["annotations"]

    dashboard_url = get_dashboard_url(kind, service, instance, cluster)
    if dashboard_url:
        annotations["yelp.com/dashboard_url"] = dashboard_url
        annotations[paasta_prefixed("dashboard_url")] = dashboard_url

    # Hash first: everything added below is deliberately left out of the hash.
    config_hash = get_config_hash(resource)

    annotations["yelp.com/desired_state"] = "running"
    annotations[paasta_prefixed("desired_state")] = "running"
    labels["yelp.com/paasta_config_sha"] = config_hash
    labels[paasta_prefixed("config_sha")] = config_hash
    labels[paasta_prefixed("git_sha")] = git_sha
    return resource
def get_container_type(container_name: str, instance_name: str) -> str:
    """Tell the main service container apart from sidecars.

    :param container_name: name of the container being classified
    :param instance_name: name of the instance the container belongs to
    :returns: ``MAIN_CONTAINER_TYPE`` when ``container_name`` equals the
        sanitised ``instance_name``; otherwise ``container_name`` unchanged
        (this includes the case where ``instance_name`` is empty)
    """
    if not instance_name:
        return container_name

    main_container_name = kubernetes_tools.sanitise_kubernetes_name(instance_name)
    if container_name == main_container_name:
        return MAIN_CONTAINER_TYPE
    return container_name
def validate_service_name(service):
    """Check that the sanitised service name is at most 63 characters long.

    The length check is applied to ``sanitise_kubernetes_name(service)``, not
    to ``service`` as written. When the check fails, a failure message is
    printed through ``paasta_print``.

    :param service: the service name to validate
    :returns: True when the sanitised name is within the limit, else False
    """
    if len(sanitise_kubernetes_name(service)) <= 63:
        return True

    # NOTE(review): the hint used to say "_ is replaced with -", which
    # disagreed with validate_instance_names ("--") even though both checks
    # use the same sanitiser. It now says "--"; confirm against
    # sanitise_kubernetes_name.
    paasta_print(
        failure(
            f"Length of service name {service} should be no more than 63."
            " Note _ is replaced with -- due to Kubernetes restriction",
            "http://paasta.readthedocs.io/en/latest/yelpsoa_configs.html",
        )
    )
    return False
def validate_instance_names(config_file_object, file_path):
    """Check that every sanitised instance name is at most 63 characters long.

    Names starting with ``_`` are skipped. All offending names are reported
    together in a single failure message printed through ``paasta_print``.

    :param config_file_object: iterable of instance names (e.g. a config
        mapping keyed by instance name)
    :param file_path: not used by this check
    :returns: True when no instance name exceeds the limit, else False
    """
    too_long = [
        name
        for name in config_file_object
        if not name.startswith("_") and len(sanitise_kubernetes_name(name)) > 63
    ]
    if not too_long:
        return True

    error_string = "\n".join(too_long)
    paasta_print(
        failure(
            f"Length of instance name \n{error_string}\n should be no more than 63."
            " Note _ is replaced with -- due to Kubernetes restriction",
            "http://paasta.readthedocs.io/en/latest/yelpsoa_configs.html",
        )
    )
    return False
def get_app_name(service: str, instance: str):
    """Return the sanitised ``<service>-<instance>`` name.

    The two names are joined with ``-`` first and the combined string is then
    sanitised as a whole.
    """
    combined_name = f"{service}-{instance}"
    return sanitise_kubernetes_name(combined_name)
def create_instance_cpu_scaling_rule(
    service: str,
    instance: str,
    autoscaling_config: AutoscalingParamsDict,
    paasta_cluster: str,
) -> PrometheusAdapterRule:
    """
    Creates a Prometheus adapter rule config for a given service instance.

    The rule exposes a metric named ``<deployment name>-cpu-prom`` whose value
    is a moving average of CPU load (usage divided by available CPUs),
    divided by the current replica count and multiplied by 100.

    :param service: service name, used to derive the deployment name
    :param instance: instance name, used to derive the deployment name and,
        sanitised, as the container name to filter on
    :param autoscaling_config: read for ``moving_average_window_seconds``;
        falls back to ``DEFAULT_CPU_AUTOSCALING_MOVING_AVERAGE_WINDOW``
    :param paasta_cluster: value of the ``paasta_cluster`` label to filter on
    :returns: a dict with ``name``, ``seriesQuery``, ``metricsQuery`` and
        ``resources`` keys; both queries are passed through ``_minify_promql``
    """
    deployment_name = get_kubernetes_app_name(service=service, instance=instance)
    sanitized_instance_name = sanitise_kubernetes_name(instance)
    metric_name = f"{deployment_name}-cpu-prom"
    moving_average_window = autoscaling_config.get(
        "moving_average_window_seconds", DEFAULT_CPU_AUTOSCALING_MOVING_AVERAGE_WINDOW
    )

    # this series query is a bit of a hack: we don't use the Prometheus adapter as expected (i.e., very generic rules)
    # but we still need to give it a query that returns something even though we're not going to use the series/label
    # templates that are auto-extracted for us. That said: we still need this query to return labels that can be tied
    # back to k8s objects WITHOUT using label_replace
    series_query = f"""
        kube_deployment_labels{{
            deployment='{deployment_name}',
            paasta_cluster='{paasta_cluster}',
            namespace='paasta'
        }}
    """

    # per-(pod, container) CPU usage rate for this instance's containers
    cpu_usage = f"""
        avg(
            irate(
                container_cpu_usage_seconds_total{{
                    namespace='paasta',
                    container='{sanitized_instance_name}',
                    paasta_cluster='{paasta_cluster}'
                }}[1m]
            )
        ) by (pod, container)
    """

    # per-(pod, container) CPU quota divided by CPU period
    cpus_available = f"""
        sum(
            container_spec_cpu_quota{{
                namespace='paasta',
                container='{sanitized_instance_name}',
                paasta_cluster='{paasta_cluster}'
            }}
            /
            container_spec_cpu_period{{
                namespace='paasta',
                paasta_cluster='{paasta_cluster}'
            }}
        ) by (pod, container)
    """

    # NOTE: we only have Pod names in our container_cpu* metrics, but we can't get a
    # Deployment from those consistently due to k8s limitations on certain field lengths
    # - thus we need to extract this information from the ReplicaSet name (which is made
    # possible by the fact that our ReplicaSets are named
    # {{deployment}}-{{10 character hex string}}) so that our query only considers the
    # service that we want to autoscale - without this we're only filtering by instance
    # name and these are very much not unique
    # k8s:pod:info is an internal recording rule that joins kube_pod_info with
    # kube_pod_status_phase
    pod_info_join = f"""
        on (pod) group_left(kube_deployment) label_replace(
            k8s:pod:info{{
                created_by_name=~'{deployment_name}.*',
                created_by_kind='ReplicaSet',
                namespace='paasta',
                paasta_cluster='{paasta_cluster}',
                phase='Running'
            }},
            'kube_deployment',
            '$1',
            'created_by_name',
            '(.+)-[a-f0-9]{{10}}'
        )
    """

    # get the total usage of all of our Pods divided by the number of CPUs available to
    # those Pods (i.e., the k8s CPU limit) in order to get the % of CPU used and then add
    # some labels to this vector
    load = f"""
        sum(
            (({cpu_usage}) / ({cpus_available})) * {pod_info_join}
        ) by (kube_deployment)
    """

    # current replica count; when the instantaneous series is absent, fall back to its
    # max over the last DEFAULT_EXTRAPOLATION_TIME seconds
    current_replicas = f"""
        (
            scalar(
                kube_deployment_spec_replicas{{paasta_cluster='{paasta_cluster}',deployment='{deployment_name}'}} >= 0
                or
                max_over_time(
                    kube_deployment_spec_replicas{{paasta_cluster='{paasta_cluster}',deployment='{deployment_name}'}}[{DEFAULT_EXTRAPOLATION_TIME}s]
                )
            )
        )
    """

    # we want to calculate:
    # * the desired replicas based on instantaneous load,
    # * smooth that over time,
    # * and then divide by the non-smoothed current number of replicas.
    # otherwise, if we do the naive thing and take the average of the load inside avg_over_time,
    # then we'll see the oscillations that we fixed in PR #2862
    moving_average_load = f"""
        avg_over_time(({load})[{moving_average_window}s:]) / {current_replicas}
    """

    # for some reason, during bounces we lose the labels from the previous timeseries (and thus end up with two
    # timeseries), so we avg these to merge them together
    # NOTE: we multiply by 100 to return a number between [0, 100] to the HPA
    moving_average_load_percent = f"avg({moving_average_load}) * 100"

    # we need to do some somewhat hacky label_replaces to inject labels that will then be used for association
    # without these, the adapter doesn't know what deployment to associate the query result with
    # NOTE: these labels MUST match the equivalent ones in the seriesQuery
    metrics_query = f"""
        label_replace(
            label_replace(
                {moving_average_load_percent},
                'deployment',
                '{deployment_name}',
                '',
                ''
            ),
            'namespace',
            'paasta',
            '',
            ''
        )
    """

    return {
        "name": {"as": metric_name},
        "seriesQuery": _minify_promql(series_query),
        "metricsQuery": _minify_promql(metrics_query),
        "resources": {
            "overrides": {
                "namespace": {"resource": "namespace"},
                "deployment": {"group": "apps", "resource": "deployments"},
            },
        },
    }
def get_sanitised_instance_name(self) -> str:
    """Return this config's instance name passed through ``sanitise_kubernetes_name``."""
    instance_name = self.get_instance()
    return sanitise_kubernetes_name(instance_name)
def get_sanitised_service_name(self) -> str:
    """Return this config's service name passed through ``sanitise_kubernetes_name``."""
    service_name = self.get_service()
    return sanitise_kubernetes_name(service_name)
def reconcile_kubernetes_resource(
    kube_client: KubeClient,
    service: str,
    instance_configs: Mapping[str, Any],
    custom_resources: Sequence[KubeCustomResource],
    kind: KubeKind,
    version: str,
    group: str,
    cluster: str,
    instance: str = None,
) -> bool:
    """Create or update the custom resources for a service's instances.

    For each instance in ``instance_configs`` (or only ``instance`` when it is
    given) the desired resource is formatted and compared with
    ``custom_resources``:

    * no entry with the same (service, instance, kind): the resource is created;
    * an entry exists but the desired resource is not in ``custom_resources``:
      the resource is updated;
    * otherwise nothing is done.

    Exceptions raised while creating or updating are logged and do not stop
    the remaining instances from being processed. Formatting the resource
    happens outside that handler, so errors there propagate to the caller.

    :returns: True when no create/update raised (including when there was
        nothing to process), False otherwise
    """
    all_succeeded = True
    for inst, config in instance_configs.items():
        if instance is not None and instance != inst:
            continue

        namespace = f"paasta-{kind.plural}"
        formatted_resource = format_custom_resource(
            instance_config=config,
            service=service,
            instance=inst,
            cluster=cluster,
            kind=kind.singular,
            version=version,
            group=group,
            namespace=namespace,
        )
        metadata = formatted_resource["metadata"]
        desired_resource = KubeCustomResource(
            service=service,
            instance=inst,
            config_sha=metadata["labels"]["paasta.yelp.com/config_sha"],
            kind=kind.singular,
            name=metadata["name"],
            namespace=namespace,
        )

        try:
            resource_key = (service, inst, kind.singular)
            existing_keys = [(c.service, c.instance, c.kind) for c in custom_resources]
            if resource_key not in existing_keys:
                log.info(f"{desired_resource} does not exist so creating")
                create_custom_resource(
                    kube_client=kube_client,
                    version=version,
                    kind=kind,
                    formatted_resource=formatted_resource,
                    group=group,
                )
            elif desired_resource not in custom_resources:
                resource_name = (
                    f"{sanitise_kubernetes_name(service)}-{sanitise_kubernetes_name(inst)}"
                )
                log.info(f"{desired_resource} exists but config_sha doesn't match")
                update_custom_resource(
                    kube_client=kube_client,
                    name=resource_name,
                    version=version,
                    kind=kind,
                    formatted_resource=formatted_resource,
                    group=group,
                )
            else:
                log.info(f"{desired_resource} is up to date, no action taken")
        except Exception as e:
            log.error(str(e))
            all_succeeded = False
    return all_succeeded
def sanitised_name(service: str, instance: str) -> str:
    """Return ``<sanitised service>-<sanitised instance>``.

    Each name is sanitised on its own before the two are joined with ``-``.
    """
    return f"{sanitise_kubernetes_name(service)}-{sanitise_kubernetes_name(instance)}"
def sanitise_kubernetes_service_name(name: str) -> str:
    """Sanitise ``name`` for use as a Kubernetes service name.

    The name is sanitised with ``sanitise_kubernetes_name``, every ``.`` is
    then replaced with ``---``, and the result is passed through
    ``limit_size_with_hash``.
    """
    sanitised = sanitise_kubernetes_name(name)
    without_dots = sanitised.replace(".", "---")
    return limit_size_with_hash(without_dots)
def reconcile_kubernetes_resource(
    kube_client: KubeClient,
    service: str,
    instance_configs: Mapping[str, Any],
    custom_resources: Sequence[KubeCustomResource],
    kind: KubeKind,
    version: str,
    group: str,
    crd: CustomResourceDefinition,
    cluster: str,
    instance: str = None,
) -> bool:
    """Create or update the custom resources for a service's instances.

    For each instance in ``instance_configs`` (or only ``instance`` when it is
    given) the instance's soa config is loaded with the handler registered for
    ``crd.file_prefix``, the git sha is derived from its docker url, and the
    desired resource is formatted and compared with ``custom_resources``:

    * no entry with the same (service, instance, kind): the resource is created;
    * an entry exists but the desired resource is not in ``custom_resources``:
      the resource is updated;
    * otherwise nothing is done.

    Any exception raised while handling an instance (loading, formatting,
    creating or updating) is logged and does not stop the remaining instances
    from being processed.

    :returns: True when every processed instance succeeded, False otherwise
    """
    all_succeeded = True
    handler = LONG_RUNNING_INSTANCE_TYPE_HANDLERS[crd.file_prefix]
    for inst, config in instance_configs.items():
        if instance is not None and instance != inst:
            continue

        try:
            soa_config = handler.loader(
                service=service,
                instance=inst,
                cluster=cluster,
                load_deployments=True,
                soa_dir=DEFAULT_SOA_DIR,
            )
            docker_url = soa_config.get_docker_url()
            git_sha = get_git_sha_from_dockerurl(docker_url, long=True)

            namespace = f"paasta-{kind.plural}"
            formatted_resource = format_custom_resource(
                instance_config=config,
                service=service,
                instance=inst,
                cluster=cluster,
                kind=kind.singular,
                version=version,
                group=group,
                namespace=namespace,
                git_sha=git_sha,
            )
            metadata = formatted_resource["metadata"]
            labels = metadata["labels"]
            desired_resource = KubeCustomResource(
                service=service,
                instance=inst,
                config_sha=labels[paasta_prefixed("config_sha")],
                git_sha=labels.get(paasta_prefixed("git_sha")),
                kind=kind.singular,
                name=metadata["name"],
                namespace=namespace,
            )

            resource_key = (service, inst, kind.singular)
            existing_keys = [(c.service, c.instance, c.kind) for c in custom_resources]
            if resource_key not in existing_keys:
                log.info(f"{desired_resource} does not exist so creating")
                create_custom_resource(
                    kube_client=kube_client,
                    version=version,
                    kind=kind,
                    formatted_resource=formatted_resource,
                    group=group,
                )
            elif desired_resource not in custom_resources:
                resource_name = (
                    f"{sanitise_kubernetes_name(service)}-{sanitise_kubernetes_name(inst)}"
                )
                log.info(f"{desired_resource} exists but config_sha doesn't match")
                update_custom_resource(
                    kube_client=kube_client,
                    name=resource_name,
                    version=version,
                    kind=kind,
                    formatted_resource=formatted_resource,
                    group=group,
                )
            else:
                log.info(f"{desired_resource} is up to date, no action taken")
        except Exception as e:
            log.error(str(e))
            all_succeeded = False
    return all_succeeded