Example #1
def check_service_replication(
    instance_config: MarathonServiceConfig,
    all_tasks_or_pods: Sequence[MarathonTask],
    replication_checker: MesosSmartstackEnvoyReplicationChecker,
) -> Optional[bool]:
    """Checks a service's replication levels based on how the service's replication
    should be monitored. (smartstack/envoy or mesos)

    :param instance_config: an instance of MarathonServiceConfig
    :param replication_checker: an instance of MesosSmartstackEnvoyReplicationChecker
    """
    expected_count = instance_config.get_instances()
    log.info("Expecting %d total tasks for %s" %
             (expected_count, instance_config.job_id))
    proxy_port = get_proxy_port_for_instance(instance_config)

    registrations = instance_config.get_registrations()
    # if the primary registration does not match the service_instance name then
    # the best we can do is check marathon for replication (for now).
    if proxy_port is not None and registrations[0] == instance_config.job_id:
        is_well_replicated = monitoring_tools.check_replication_for_instance(
            instance_config=instance_config,
            expected_count=expected_count,
            replication_checker=replication_checker,
        )
        return is_well_replicated
    else:
        check_healthy_marathon_tasks_for_service_instance(
            instance_config=instance_config,
            expected_count=expected_count,
            all_tasks=all_tasks_or_pods,
        )
        return None
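A minimal sketch (not from the original source) of the routing predicate above: the smartstack/envoy path is taken only when a proxy port exists and the primary registration matches the job id; otherwise the function falls back to checking healthy marathon tasks.

# Hypothetical values for illustration only.
proxy_port = 20001
registrations = ["myservice.main"]
job_id = "myservice.main"

uses_smartstack_check = proxy_port is not None and registrations[0] == job_id
assert uses_smartstack_check  # would call monitoring_tools.check_replication_for_instance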
Example #2
def status_desired_state(
    service: str,
    instance: str,
    client: marathon_tools.MarathonClient,
    job_config: marathon_tools.MarathonServiceConfig,
) -> str:
    status = get_bouncing_status(service, instance, client, job_config)
    desired_state = desired_state_human(job_config.get_desired_state(), job_config.get_instances())
    return f"Desired State:      {status} and {desired_state}"
Example #3
def autoscale_marathon_instance(
    marathon_service_config: MarathonServiceConfig,
    system_paasta_config: SystemPaastaConfig,
    marathon_tasks: Sequence[MarathonTask],
    mesos_tasks: Sequence[Task],
) -> None:
    try:
        with create_autoscaling_lock(marathon_service_config.service,
                                     marathon_service_config.instance):
            current_instances = marathon_service_config.get_instances()
            task_data_insufficient = is_task_data_insufficient(
                marathon_service_config=marathon_service_config,
                marathon_tasks=marathon_tasks,
                current_instances=current_instances,
            )
            autoscaling_params = marathon_service_config.get_autoscaling_params()
            log_utilization_data: Mapping = {}
            utilization = get_utilization(
                marathon_service_config=marathon_service_config,
                system_paasta_config=system_paasta_config,
                autoscaling_params=autoscaling_params,
                log_utilization_data=log_utilization_data,
                marathon_tasks=marathon_tasks,
                mesos_tasks=mesos_tasks,
            )
            error = get_error_from_utilization(
                utilization=utilization,
                setpoint=autoscaling_params["setpoint"],
                current_instances=current_instances,
            )
            num_healthy_instances = len(marathon_tasks)
            new_instance_count = get_new_instance_count(
                utilization=utilization,
                error=error,
                autoscaling_params=autoscaling_params,
                current_instances=current_instances,
                marathon_service_config=marathon_service_config,
                num_healthy_instances=num_healthy_instances,
                persist_data=(not task_data_insufficient),
            )
            safe_downscaling_threshold = int(current_instances * 0.7)
            _record_autoscaling_decision(
                marathon_service_config=marathon_service_config,
                autoscaling_params=autoscaling_params,
                utilization=utilization,
                log_utilization_data=log_utilization_data,
                error=error,
                current_instances=current_instances,
                num_healthy_instances=num_healthy_instances,
                new_instance_count=new_instance_count,
                safe_downscaling_threshold=safe_downscaling_threshold,
                task_data_insufficient=task_data_insufficient,
            )
            if new_instance_count != current_instances:
                if new_instance_count < current_instances and task_data_insufficient:
                    write_to_log(
                        config=marathon_service_config,
                        line="Delaying scaling *down* as we found too few healthy tasks running in marathon. "
                        "This can happen because tasks are delayed/waiting/unhealthy or because we are "
                        "waiting for tasks to be killed. Will wait for sufficient healthy tasks before "
                        "we make a decision to scale down.",
                        level="debug",
                    )
                    return
                else:
                    set_instances_for_marathon_service(
                        service=marathon_service_config.service,
                        instance=marathon_service_config.instance,
                        instance_count=new_instance_count,
                    )
                    write_to_log(
                        config=marathon_service_config,
                        line="Scaling from %d to %d instances (%s)" % (
                            current_instances,
                            new_instance_count,
                            humanize_error(error),
                        ),
                        level="event",
                    )
            else:
                write_to_log(
                    config=marathon_service_config,
                    line="Staying at %d instances (%s)" %
                    (current_instances, humanize_error(error)),
                    level="debug",
                )
    except LockHeldException:
        log.warning(
            "Skipping autoscaling run for {service}.{instance} because the lock is held".format(
                service=marathon_service_config.service,
                instance=marathon_service_config.instance,
            )
        )
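A minimal sketch of the scale-down guard in the middle of this function, using hypothetical numbers: a downscale is deferred whenever the healthy-task data is insufficient, while upscales and well-grounded downscales proceed.

current_instances = 10
new_instance_count = 7
task_data_insufficient = True

if new_instance_count < current_instances and task_data_insufficient:
    decision = "delay scale-down until healthy-task data is sufficient"
else:
    decision = f"scale from {current_instances} to {new_instance_count} instances"
assert decision.startswith("delay")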
Example #4
def perform_command(
    command: str,
    service: str,
    instance: str,
    cluster: str,
    verbose: int,
    soa_dir: str,
    clients: marathon_tools.MarathonClients,
    job_config: marathon_tools.MarathonServiceConfig,
    app_id: Optional[str] = None,
) -> int:
    """Performs a start/stop/restart/status on an instance
    :param command: String of start, stop, restart, status
    :param service: service name
    :param instance: instance name, like "main" or "canary"
    :param cluster: cluster name
    :param verbose: int verbosity level
    :param client: MarathonClient or CachingMarathonClient
    :returns: A unix-style return code
    """
    system_config = load_system_paasta_config()

    if not app_id:
        try:
            app_id = job_config.format_marathon_app_dict()["id"]
        except NoDockerImageError:
            job_id = compose_job_id(service, instance)
            paasta_print(
                "Docker image for %s not in deployments.json. Exiting. Has Jenkins deployed it?"
                % job_id
            )
            return 1

    normal_instance_count = job_config.get_instances()

    current_client = clients.get_current_client_for_service(job_config)

    if command == "restart":
        restart_marathon_job(service, instance, app_id, current_client, cluster)
    elif command == "status":
        paasta_print(
            status_desired_state(service, instance, current_client, job_config)
        )
        dashboards = get_marathon_dashboard_links(clients, system_config)
        tasks, out = status_marathon_job(
            service=service,
            instance=instance,
            cluster=cluster,
            soa_dir=soa_dir,
            dashboards=dashboards,
            normal_instance_count=normal_instance_count,
            clients=clients,
            job_config=job_config,
            desired_app_id=app_id,
            verbose=verbose,
        )
        paasta_print(out)
        service_namespace_config = marathon_tools.load_service_namespace_config(
            service=service, namespace=job_config.get_nerve_namespace(), soa_dir=soa_dir
        )

        paasta_print(
            status_mesos_tasks(service, instance, normal_instance_count, verbose)
        )

        proxy_port = service_namespace_config.get("proxy_port")
        if proxy_port is not None:
            normal_smartstack_count = marathon_tools.get_expected_instance_count_for_namespace(
                service, instance, cluster
            )
            paasta_print(
                status_smartstack_backends(
                    service=service,
                    instance=instance,
                    cluster=cluster,
                    job_config=job_config,
                    service_namespace_config=service_namespace_config,
                    tasks=tasks,
                    expected_count=normal_smartstack_count,
                    soa_dir=soa_dir,
                    verbose=verbose > 0,
                    synapse_port=system_config.get_synapse_port(),
                    synapse_haproxy_url_format=system_config.get_synapse_haproxy_url_format(),
                    system_deploy_blacklist=system_config.get_deploy_blacklist(),
                    system_deploy_whitelist=system_config.get_deploy_whitelist(),
                )
            )
    else:
        # The command parser shouldn't have let us get this far...
        raise NotImplementedError("Command %s is not implemented!" % command)
    return 0
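A sketch of the early-exit contract, using a stand-in exception class and a mock rather than the real paasta_tools imports: when format_marathon_app_dict raises NoDockerImageError and no app_id was passed in, perform_command prints a hint and returns 1.

from unittest import mock

class NoDockerImageError(Exception):  # stand-in for the paasta_tools exception
    pass

job_config = mock.Mock()
job_config.format_marathon_app_dict.side_effect = NoDockerImageError

try:
    app_id = job_config.format_marathon_app_dict()["id"]
    exit_code = 0
except NoDockerImageError:
    exit_code = 1  # mirrors the "return 1" branch above
assert exit_code == 1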
Example #5
def autoscale_marathon_instance(
    marathon_service_config: MarathonServiceConfig,
    system_paasta_config: SystemPaastaConfig,
    marathon_tasks: Sequence[MarathonTask],
    mesos_tasks: Sequence[Task],
) -> None:
    try:
        with create_autoscaling_lock(marathon_service_config.service, marathon_service_config.instance):
            current_instances = marathon_service_config.get_instances()
            task_data_insufficient = is_task_data_insufficient(
                marathon_service_config=marathon_service_config,
                marathon_tasks=marathon_tasks,
                current_instances=current_instances,
            )
            autoscaling_params = marathon_service_config.get_autoscaling_params()
            log_utilization_data: Mapping = {}
            utilization = get_utilization(
                marathon_service_config=marathon_service_config,
                system_paasta_config=system_paasta_config,
                autoscaling_params=autoscaling_params,
                log_utilization_data=log_utilization_data,
                marathon_tasks=marathon_tasks,
                mesos_tasks=mesos_tasks,
            )
            error = get_error_from_utilization(
                utilization=utilization,
                setpoint=autoscaling_params['setpoint'],
                current_instances=current_instances,
            )
            new_instance_count = get_new_instance_count(
                utilization=utilization,
                error=error,
                autoscaling_params=autoscaling_params,
                current_instances=current_instances,
                marathon_service_config=marathon_service_config,
                num_healthy_instances=len(marathon_tasks),
            )

            safe_downscaling_threshold = int(current_instances * 0.7)
            if new_instance_count != current_instances:
                if new_instance_count < current_instances and task_data_insufficient:
                    write_to_log(
                        config=marathon_service_config,
                        line='Delaying scaling *down* as we found too few healthy tasks running in marathon. '
                             'This can happen because tasks are delayed/waiting/unhealthy or because we are '
                             'waiting for tasks to be killed. Will wait for sufficient healthy tasks before '
                             'we make a decision to scale down.',
                    )
                    return
                if new_instance_count == safe_downscaling_threshold:
                    write_to_log(
                        config=marathon_service_config,
                        line='Autoscaler clamped: %s' % str(log_utilization_data),
                        level='debug',
                    )

                write_to_log(
                    config=marathon_service_config,
                    line='Scaling from %d to %d instances (%s)' % (
                        current_instances, new_instance_count, humanize_error(error),
                    ),
                )
                set_instances_for_marathon_service(
                    service=marathon_service_config.service,
                    instance=marathon_service_config.instance,
                    instance_count=new_instance_count,
                )
            else:
                write_to_log(
                    config=marathon_service_config,
                    line='Staying at %d instances (%s)' % (current_instances, humanize_error(error)),
                    level='debug',
                )
            meteorite_dims = {
                'service_name': marathon_service_config.service,
                'decision_policy': autoscaling_params[DECISION_POLICY_KEY],  # type: ignore
                'paasta_cluster': marathon_service_config.cluster,
                'instance_name': marathon_service_config.instance,
            }
            if yelp_meteorite:
                gauge = yelp_meteorite.create_gauge('paasta.service.instances', meteorite_dims)
                gauge.set(new_instance_count)
                gauge = yelp_meteorite.create_gauge('paasta.service.max_instances', meteorite_dims)
                gauge.set(marathon_service_config.get_max_instances())
                gauge = yelp_meteorite.create_gauge('paasta.service.min_instances', meteorite_dims)
                gauge.set(marathon_service_config.get_min_instances())
    except LockHeldException:
        log.warning("Skipping autoscaling run for {service}.{instance} because the lock is held".format(
            service=marathon_service_config.service,
            instance=marathon_service_config.instance,
        ))
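A worked example of the 70% clamp check above, with a hypothetical instance count: a proposed downscale that lands exactly on the threshold triggers the "Autoscaler clamped" debug log before the scale event is written.

current_instances = 10
safe_downscaling_threshold = int(current_instances * 0.7)  # 10 * 0.7 == 7
new_instance_count = 7
assert new_instance_count == safe_downscaling_threshold  # would log "Autoscaler clamped"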
Example #6
def marathon_job_status(
    service: str,
    instance: str,
    job_config: marathon_tools.MarathonServiceConfig,
    marathon_apps_with_clients: List[Tuple[MarathonApp, MarathonClient]],
    verbose: int,
) -> MutableMapping[str, Any]:
    job_status_fields: MutableMapping[str, Any] = {
        "app_statuses": [],
        "app_count": len(marathon_apps_with_clients),
        "desired_state": job_config.get_desired_state(),
        "bounce_method": job_config.get_bounce_method(),
        "expected_instance_count": job_config.get_instances(),
        "active_shas": list(get_active_shas_for_marathon_apps(marathon_apps_with_clients)),
    }

    try:
        desired_app_id = job_config.format_marathon_app_dict()["id"]
    except NoDockerImageError:
        error_msg = "Docker image is not in deployments.json."
        job_status_fields["error_message"] = error_msg
        return job_status_fields

    job_status_fields["desired_app_id"] = desired_app_id

    deploy_status_for_desired_app = None
    dashboard_links = get_marathon_dashboard_links(
        settings.marathon_clients, settings.system_paasta_config
    )
    tasks_running = 0
    for app, marathon_client in marathon_apps_with_clients:
        deploy_status = marathon_tools.get_marathon_app_deploy_status(
            marathon_client, app
        )

        app_status = marathon_app_status(
            app,
            marathon_client,
            dashboard_links.get(marathon_client) if dashboard_links else None,
            deploy_status,
            list_tasks=verbose > 0,
        )
        job_status_fields["app_statuses"].append(app_status)

        if app.id.lstrip("/") == desired_app_id.lstrip("/"):
            deploy_status_for_desired_app = marathon_tools.MarathonDeployStatus.tostring(
                deploy_status
            )
        tasks_running += app.tasks_running

    job_status_fields["deploy_status"] = (deploy_status_for_desired_app
                                          or "Waiting for bounce")
    job_status_fields["running_instance_count"] = tasks_running

    if verbose > 0:
        autoscaling_info = get_autoscaling_info(marathon_apps_with_clients, job_config)
        if autoscaling_info is not None:
            autoscaling_info_dict = autoscaling_info._asdict()

            for field in ("current_utilization", "target_instances"):
                if autoscaling_info_dict[field] is None:
                    del autoscaling_info_dict[field]

            job_status_fields["autoscaling_info"] = autoscaling_info_dict

    return job_status_fields
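A hedged illustration of the mapping this function returns on the happy path. Every value below is hypothetical, and active_shas is shown as (git_sha, config_sha) pairs on the assumption that get_active_shas_for_marathon_apps yields such tuples:

# All values are made up for illustration; shapes follow the code above.
job_status_fields = {
    "app_statuses": [],                         # one entry per matching app
    "app_count": 1,
    "desired_state": "start",
    "bounce_method": "crossover",
    "expected_instance_count": 3,
    "active_shas": [("abcdef12", "01234567")],  # assumed (git_sha, config_sha)
    "desired_app_id": "myservice.main.gitabcdef12.config01234567",
    "deploy_status": "Running",
    "running_instance_count": 3,
}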