def get_deployment_names_from_list(service_instance_list):
    app_names = []
    for service_instance in service_instance_list:
        try:
            service, instance, _, __ = decompose_job_id(service_instance)
            app_name = get_kubernetes_app_name(service, instance)
            app_names.append(app_name)
        except InvalidJobNameError:
            log.error(
                f"Invalid service instance specified. Format is service{SPACER}instance."
            )
            sys.exit(1)
    return app_names
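# A minimal sketch of the naming convention the helper above relies on: compose_job_id joins a
# service and instance with SPACER, decompose_job_id splits it back apart, and
# get_kubernetes_app_name turns the pair into a Kubernetes-safe deployment name. The two
# functions below are hypothetical stand-ins written for illustration only; the real helpers
# live in paasta_tools and may differ in detail.
EXAMPLE_SPACER = "."


def example_compose_job_id(service: str, instance: str) -> str:
    # "my_service" + "canary" -> "my_service.canary"
    return f"{service}{EXAMPLE_SPACER}{instance}"


def example_kubernetes_app_name(service: str, instance: str) -> str:
    # Assumption: "_" and "." are not allowed in Kubernetes resource names, so they are
    # rewritten ("." -> "-", "_" -> "--") and lowercased, mirroring the sanitised secret keys
    # built later in sync_boto_secrets. e.g. "my_service" + "canary" -> "my--service-canary"
    def sanitise(name: str) -> str:
        return name.replace(".", "-").replace("_", "--").lower()

    return f"{sanitise(service)}-{sanitise(instance)}"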
Example #2
def main():
    args = parse_args()
    soa_dir = args.soa_dir
    cluster = args.cluster
    instances = get_services_for_cluster(cluster=cluster,
                                         instance_type="kubernetes",
                                         soa_dir=soa_dir)
    service_instances = []
    for name, instance in instances:
        if args.sanitise:
            app_name = kubernetes_tools.get_kubernetes_app_name(name, instance)
        else:
            app_name = compose_job_id(name, instance)
        service_instances.append(app_name)
    print("\n".join(service_instances))
    sys.exit(0)
def create_instance_cpu_scaling_rule(
    service: str,
    instance: str,
    autoscaling_config: AutoscalingParamsDict,
    paasta_cluster: str,
) -> PrometheusAdapterRule:
    """
    Creates a Prometheus adapter rule config for a given service instance.
    """
    deployment_name = get_kubernetes_app_name(service=service,
                                              instance=instance)
    sanitized_instance_name = sanitise_kubernetes_name(instance)
    metric_name = f"{deployment_name}-cpu-prom"
    moving_average_window = autoscaling_config.get(
        "moving_average_window_seconds",
        DEFAULT_CPU_AUTOSCALING_MOVING_AVERAGE_WINDOW)

    # this series query is a bit of a hack: we don't use the Prometheus adapter as intended (i.e., with very generic
    # rules), but we still need to give it a query that returns something even though we're not going to use the
    # series/label templates that are auto-extracted for us. That said, we still need this query to return labels
    # that can be tied back to k8s objects WITHOUT using label_replace
    series_query = f"""
        kube_deployment_labels{{
            deployment='{deployment_name}',
            paasta_cluster='{paasta_cluster}',
            namespace='paasta'
        }}
    """

    cpu_usage = f"""
        avg(
            irate(
                container_cpu_usage_seconds_total{{
                    namespace='paasta',
                    container='{sanitized_instance_name}',
                    paasta_cluster='{paasta_cluster}'
                }}[1m]
            )
        ) by (pod, container)
    """

    cpus_available = f"""
        sum(
            container_spec_cpu_quota{{
                namespace='paasta',
                container='{sanitized_instance_name}',
                paasta_cluster='{paasta_cluster}'
            }}
            / container_spec_cpu_period{{
                namespace='paasta',
                paasta_cluster='{paasta_cluster}'
            }}
        ) by (pod, container)
    """

    # NOTE: we only have Pod names in our container_cpu* metrics, but we can't get a
    # Deployment from those consistently due to k8s limitations on certain field lengths
    # - thus we need to extract this information from the ReplicaSet name (which is made
    # possible by the fact that our ReplicaSets are named
    # {{deployment}}-{{10 character hex string}}) so that our query only considers the
    # service that we want to autoscale - without this we're only filtering by instance
    # name and these are very much not unique
    # k8s:pod:info is an internal recording rule that joins kube_pod_info with
    # kube_pod_status_phase
    pod_info_join = f"""
        on (pod) group_left(kube_deployment) label_replace(
            k8s:pod:info{{
                created_by_name=~'{deployment_name}.*',
                created_by_kind='ReplicaSet',
                namespace='paasta',
                paasta_cluster='{paasta_cluster}',
                phase='Running'
            }},
            'kube_deployment',
            '$1',
            'created_by_name',
            '(.+)-[a-f0-9]{{10}}'
        )
    """

    # get the total usage of all of our Pods divided by the number of CPUs available to
    # those Pods (i.e., the k8s CPU limit) in order to get the % of CPU used and then add
    # some labels to this vector
    load = f"""
        sum(
            (({cpu_usage}) / ({cpus_available})) * {pod_info_join}
        ) by (kube_deployment)
    """

    current_replicas = f"""
        (
            scalar(
                kube_deployment_spec_replicas{{paasta_cluster='{paasta_cluster}',deployment='{deployment_name}'}} >= 0
                or
                max_over_time(
                    kube_deployment_spec_replicas{{paasta_cluster='{paasta_cluster}',deployment='{deployment_name}'}}[{DEFAULT_EXTRAPOLATION_TIME}s]
                )
            )
        )
    """

    # we want to calculate:
    # * the desired replicas based on instantaneous load,
    # * smooth that over time,
    # * and then divide by the non-smoothed current number of replicas.
    # otherwise, if we do the naive thing and take the average of the load inside avg_over_time,
    # then we'll see the oscillations that we fixed in PR #2862
    moving_average_load = f"""
        avg_over_time(({load})[{moving_average_window}s:]) / {current_replicas}
    """

    # for some reason, during bounces we lose the labels from the previous timeseries (and thus end up with two
    # timeseries), so we avg these to merge them together
    # NOTE: we multiply by 100 to return a number between [0, 100] to the HPA
    moving_average_load_percent = f"avg({moving_average_load}) * 100"

    # we need to do some somewhat hacky label_replaces to inject labels that will then be used for association
    # without these, the adapter doesn't know what deployment to associate the query result with
    # NOTE: these labels MUST match the equivalent ones in the seriesQuery
    metrics_query = f"""
        label_replace(
            label_replace(
                {moving_average_load_percent},
                'deployment',
                '{deployment_name}',
                '',
                ''
            ),
            'namespace',
            'paasta',
            '',
            ''
        )
    """

    return {
        "name": {
            "as": metric_name
        },
        "seriesQuery": _minify_promql(series_query),
        "metricsQuery": _minify_promql(metrics_query),
        "resources": {
            "overrides": {
                "namespace": {
                    "resource": "namespace"
                },
                "deployment": {
                    "group": "apps",
                    "resource": "deployments"
                },
            },
        },
    }
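# A small illustration (with made-up names) of the label_replace() regex in pod_info_join above:
# PaaSTA ReplicaSets are assumed to be named "<deployment>-<10 hex characters>", so the
# deployment can be recovered from kube_pod_info's created_by_name label without any extra
# joins. This helper is hypothetical and only demonstrates the regex; it is not part of the
# adapter rule.
import re


def example_deployment_from_replicaset_name(created_by_name: str) -> str:
    # label_replace() anchors its regex, so re.fullmatch is the Python equivalent.
    match = re.fullmatch(r"(.+)-[a-f0-9]{10}", created_by_name)
    return match.group(1) if match else created_by_name


# example_deployment_from_replicaset_name("my--service-canary-1a2b3c4d5e") == "my--service-canary"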
def create_instance_uwsgi_scaling_rule(
    service: str,
    instance: str,
    autoscaling_config: AutoscalingParamsDict,
    paasta_cluster: str,
) -> PrometheusAdapterRule:
    """
    Creates a Prometheus adapter rule config for a given service instance.
    """
    setpoint = autoscaling_config["setpoint"]
    moving_average_window = autoscaling_config.get(
        "moving_average_window_seconds",
        DEFAULT_UWSGI_AUTOSCALING_MOVING_AVERAGE_WINDOW)
    # this should always be set, but we default to 0 for safety as the worst thing that would happen
    # is that we take a couple more iterations than required to hit the desired setpoint
    offset = autoscaling_config.get("offset", 0)
    deployment_name = get_kubernetes_app_name(service=service,
                                              instance=instance)
    worker_filter_terms = f"paasta_cluster='{paasta_cluster}',paasta_service='{service}',paasta_instance='{instance}'"
    replica_filter_terms = (
        f"paasta_cluster='{paasta_cluster}',deployment='{deployment_name}'")

    current_replicas = f"""
        sum(
            label_join(
                (
                    kube_deployment_spec_replicas{{{replica_filter_terms}}} >= 0
                    or
                    max_over_time(
                        kube_deployment_spec_replicas{{{replica_filter_terms}}}[{DEFAULT_EXTRAPOLATION_TIME}s]
                    )
                ),
                "kube_deployment", "", "deployment"
            )
        ) by (kube_deployment)
    """
    # k8s:deployment:pods_status_ready is a metric created by summing kube_pod_status_ready
    # over paasta service/instance/cluster. it counts the number of ready pods in a paasta
    # deployment.
    ready_pods = f"""
        (sum(
            k8s:deployment:pods_status_ready{{{worker_filter_terms}}} >= 0
            or
            max_over_time(
                k8s:deployment:pods_status_ready{{{worker_filter_terms}}}[{DEFAULT_EXTRAPOLATION_TIME}s]
            )
        ) by (kube_deployment))
    """
    load_per_instance = f"""
        avg(
            uwsgi_worker_busy{{{worker_filter_terms}}}
        ) by (kube_pod, kube_deployment)
    """
    missing_instances = f"""
        clamp_min(
            {ready_pods} - count({load_per_instance}) by (kube_deployment),
            0
        )
    """
    total_load = f"""
    (
        sum(
            {load_per_instance}
        ) by (kube_deployment)
        +
        {missing_instances}
    )
    """
    desired_instances_at_each_point_in_time = f"""
        {total_load} / {setpoint - offset}
    """
    desired_instances = f"""
        avg_over_time(
            (
                {desired_instances_at_each_point_in_time}
            )[{moving_average_window}s:]
        )
    """
    metrics_query = f"""
        {desired_instances} / {current_replicas}
    """

    metric_name = f"{deployment_name}-uwsgi-prom"

    return {
        "name": {
            "as": metric_name
        },
        "seriesQuery": f"uwsgi_worker_busy{{{worker_filter_terms}}}",
        "resources": {
            "template": "kube_<<.Resource>>"
        },
        "metricsQuery": _minify_promql(metrics_query),
    }
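# A worked example of the arithmetic the uwsgi metrics_query above encodes, using made-up
# numbers: 10 ready pods, each averaging 0.9 busy workers, setpoint 0.8, offset 0. Assuming the
# HPA is configured to target a value of 1 for this metric, a result above 1 means "scale up"
# and below 1 means "scale down".
def example_uwsgi_metric_value() -> float:
    ready_pods = 10
    load_per_pod = [0.9] * 10                              # avg(uwsgi_worker_busy) per pod
    setpoint, offset = 0.8, 0.0
    missing = max(ready_pods - len(load_per_pod), 0)       # clamp_min(ready - observed, 0)
    total_load = sum(load_per_pod) + missing               # unscraped pods count as fully busy
    desired_instances = total_load / (setpoint - offset)   # 9.0 / 0.8 = 11.25
    current_replicas = 10
    return desired_instances / current_replicas            # 1.125 -> scale up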
def create_instance_arbitrary_promql_scaling_rule(
    service: str,
    instance: str,
    autoscaling_config: AutoscalingParamsDict,
    paasta_cluster: str,
) -> PrometheusAdapterRule:
    prometheus_adapter_config = autoscaling_config["prometheus_adapter_config"]
    deployment_name = get_kubernetes_app_name(service=service,
                                              instance=instance)

    if "seriesQuery" in prometheus_adapter_config:
        # If the user specifies seriesQuery, don't wrap their metricsQuery, under the assumption that they may not want
        # us to mess with their labels.
        series_query = prometheus_adapter_config["seriesQuery"]
        metrics_query = prometheus_adapter_config["metricsQuery"]
    else:
        # If the user doesn't specify seriesQuery, assume they want to just write some promql that returns a number.
        # Set up series_query to match the default `resources`
        series_query = f"""
            kube_deployment_labels{{
                deployment='{deployment_name}',
                paasta_cluster='{paasta_cluster}',
                namespace='paasta'
            }}
        """
        # Wrap their promql with label_replace() calls that add `deployment` / `namespace` labels which match the default `resources`.
        metrics_query = f"""
            label_replace(
                label_replace(
                    {prometheus_adapter_config["metricsQuery"]},
                    'deployment',
                    '{deployment_name}',
                    '',
                    ''
                ),
                'namespace',
                'paasta',
                '',
                ''
            )
        """

    return {
        "name": {
            "as": f"{deployment_name}-arbitrary-promql",
        },
        "seriesQuery": _minify_promql(series_query),
        "metricsQuery": _minify_promql(metrics_query),
        "resources": prometheus_adapter_config.get(
            "resources",
            {
                "overrides": {
                    "namespace": {
                        "resource": "namespace"
                    },
                    "deployment": {
                        "group": "apps",
                        "resource": "deployments"
                    },
                },
            },
        ),
    }
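# For a hypothetical user config that supplies only a metricsQuery, the wrapping above produces
# a query like the one returned below (whitespace aside): constant `deployment` and `namespace`
# labels are injected so the adapter can map the scalar result back onto the deployment named in
# the default `resources` overrides. This helper is illustrative only.
def example_wrap_user_metrics_query(user_metrics_query: str, deployment_name: str) -> str:
    return (
        "label_replace(label_replace("
        f"{user_metrics_query}, 'deployment', '{deployment_name}', '', ''), "
        "'namespace', 'paasta', '', '')"
    )


# example_wrap_user_metrics_query("sum(rate(my_requests_total[5m]))", "my--service-canary")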
def create_instance_piscina_scaling_rule(
    service: str,
    instance: str,
    autoscaling_config: AutoscalingParamsDict,
    paasta_cluster: str,
) -> PrometheusAdapterRule:
    """
    Creates a Prometheus adapter rule config for a given service instance.
    """
    setpoint = autoscaling_config["setpoint"]
    moving_average_window = autoscaling_config.get(
        "moving_average_window_seconds",
        DEFAULT_PISCINA_AUTOSCALING_MOVING_AVERAGE_WINDOW,
    )
    deployment_name = get_kubernetes_app_name(service=service,
                                              instance=instance)
    worker_filter_terms = f"paasta_cluster='{paasta_cluster}',paasta_service='{service}',paasta_instance='{instance}'"
    replica_filter_terms = (
        f"paasta_cluster='{paasta_cluster}',deployment='{deployment_name}'")

    current_replicas = f"""
        sum(
            label_join(
                (
                    kube_deployment_spec_replicas{{{replica_filter_terms}}} >= 0
                    or
                    max_over_time(
                        kube_deployment_spec_replicas{{{replica_filter_terms}}}[{DEFAULT_EXTRAPOLATION_TIME}s]
                    )
                ),
                "kube_deployment", "", "deployment"
            )
        ) by (kube_deployment)
    """
    # k8s:deployment:pods_status_ready is a metric created by summing kube_pod_status_ready
    # over paasta service/instance/cluster. it counts the number of ready pods in a paasta
    # deployment.
    ready_pods = f"""
        (sum(
            k8s:deployment:pods_status_ready{{{worker_filter_terms}}} >= 0
            or
            max_over_time(
                k8s:deployment:pods_status_ready{{{worker_filter_terms}}}[{DEFAULT_EXTRAPOLATION_TIME}s]
            )
        ) by (kube_deployment))
    """
    load_per_instance = f"""
        (piscina_pool_utilization{{{worker_filter_terms}}})
    """
    missing_instances = f"""
        clamp_min(
            {ready_pods} - count({load_per_instance}) by (kube_deployment),
            0
        )
    """
    total_load = f"""
    (
        sum(
            {load_per_instance}
        ) by (kube_deployment)
        +
        {missing_instances}
    )
    """
    desired_instances_at_each_point_in_time = f"""
        {total_load} / {setpoint}
    """
    desired_instances = f"""
        avg_over_time(
            (
                {desired_instances_at_each_point_in_time}
            )[{moving_average_window}s:]
        )
    """
    metrics_query = f"""
        {desired_instances} / {current_replicas}
    """

    return {
        "name": {
            "as": f"{deployment_name}-piscina-prom"
        },
        "seriesQuery": f"piscina_pool_utilization{{{worker_filter_terms}}}",
        "resources": {
            "template": "kube_<<.Resource>>"
        },
        "metricsQuery": _minify_promql(metrics_query),
    }
Example #7
def sync_boto_secrets(
    kube_client: KubeClient,
    cluster: str,
    service: str,
    secret_provider_name: str,
    vault_cluster_config: Mapping[str, str],
    soa_dir: str,
    namespace: str,
) -> bool:
    # Update boto key secrets
    config_loader = PaastaServiceConfigLoader(service=service, soa_dir=soa_dir)
    for instance_config in config_loader.instance_configs(
            cluster=cluster, instance_type_class=KubernetesDeploymentConfig):
        instance = instance_config.instance
        boto_keys = instance_config.config_dict.get("boto_keys", [])
        if not boto_keys:
            continue
        boto_keys.sort()
        secret_data = {}
        for key in boto_keys:
            for filetype in ["sh", "yaml", "json", "cfg"]:
                this_key = key + "." + filetype
                sanitised_key = this_key.replace(".", "-").replace("_", "--")
                try:
                    with open(f"/etc/boto_cfg_private/{this_key}") as f:
                        secret_data[sanitised_key] = base64.b64encode(
                            f.read().encode("utf-8")).decode("utf-8")
                except IOError:
                    log.warning(
                        f"Boto key {this_key} required for {service} could not be found."
                    )
        if not secret_data:
            continue
        # In order to prevent slamming the k8s API, add some artificial delay here
        time.sleep(0.3)
        app_name = get_kubernetes_app_name(service, instance)
        secret = limit_size_with_hash(f"paasta-boto-key-{app_name}")
        hashable_data = "".join([secret_data[key] for key in secret_data])
        signature = hashlib.sha1(hashable_data.encode("utf-8")).hexdigest()
        kubernetes_signature = get_kubernetes_secret_signature(
            kube_client=kube_client,
            secret=secret,
            service=service,
            namespace=namespace,
        )
        if not kubernetes_signature:
            log.info(
                f"{secret} for {service} in {namespace} not found, creating")
            try:
                create_plaintext_dict_secret(
                    kube_client=kube_client,
                    secret_name=secret,
                    secret_data=secret_data,
                    service=service,
                    namespace=namespace,
                )
            except ApiException as e:
                if e.status == 409:
                    log.warning(
                        f"Secret {secret} for {service} already exists in {namespace} but no signature found. Updating secret and signature."
                    )
                    update_plaintext_dict_secret(
                        kube_client=kube_client,
                        secret_name=secret,
                        secret_data=secret_data,
                        service=service,
                        namespace=namespace,
                    )
                else:
                    raise
            create_kubernetes_secret_signature(
                kube_client=kube_client,
                secret=secret,
                service=service,
                secret_signature=signature,
                namespace=namespace,
            )
        elif signature != kubernetes_signature:
            log.info(
                f"{secret} for {service} in {namespace} needs updating as signature changed"
            )
            update_plaintext_dict_secret(
                kube_client=kube_client,
                secret_name=secret,
                secret_data=secret_data,
                service=service,
                namespace=namespace,
            )
            update_kubernetes_secret_signature(
                kube_client=kube_client,
                secret=secret,
                service=service,
                secret_signature=signature,
                namespace=namespace,
            )
        else:
            log.info(f"{secret} for {service} in {namespace} up to date")
    return True
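# A small illustration of the key and signature handling in sync_boto_secrets above, with
# made-up file contents. Filenames become Kubernetes-safe secret keys ("." -> "-", "_" -> "--"),
# and the signature is a sha1 over the concatenated base64 payloads, so re-running the sync
# against unchanged files produces the same signature and the Kubernetes secret is left alone.
import base64
import hashlib


def example_boto_secret_signature() -> str:
    files = {
        "some_key.yaml": b"aws_access_key_id: example\n",
        "some_key.json": b'{"aws_access_key_id": "example"}\n',
    }
    secret_data = {
        name.replace(".", "-").replace("_", "--"): base64.b64encode(contents).decode("utf-8")
        for name, contents in files.items()
    }
    # secret_data keys are now "some--key-yaml" and "some--key-json"
    hashable_data = "".join(secret_data[key] for key in secret_data)
    return hashlib.sha1(hashable_data.encode("utf-8")).hexdigest()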