Example #1
def test_old_and_new_ways_load_the_same_marathon_configs(
    mock_marathon_tools_read_extra_service_information,
    mock_read_extra_service_information,
    mock_marathon_tools_load_deployments_json,
    mock_load_deployments_json,
):
    mock_read_extra_service_information.return_value = marathon_cluster_config()
    mock_marathon_tools_read_extra_service_information.return_value = marathon_cluster_config()
    mock_load_deployments_json.return_value = deployment_json()
    mock_marathon_tools_load_deployments_json.return_value = deployment_json()
    s = create_test_service()
    expected = [
        load_marathon_service_config(
            service=TEST_SERVICE_NAME,
            instance="main",
            cluster=TEST_CLUSTER_NAME,
            load_deployments=True,
            soa_dir=TEST_SOA_DIR,
        ),
        load_marathon_service_config(
            service=TEST_SERVICE_NAME,
            instance="canary",
            cluster=TEST_CLUSTER_NAME,
            load_deployments=True,
            soa_dir=TEST_SOA_DIR,
        ),
    ]
    assert list(s.instance_configs(TEST_CLUSTER_NAME, MarathonServiceConfig)) == expected
Example #2
def perform_command(command, service, instance, cluster, verbose, soa_dir, app_id=None, delta=None):
    """Performs a start/stop/restart/status on an instance
    :param command: String of start, stop, restart, status
    :param service: service name
    :param instance: instance name, like "main" or "canary"
    :param cluster: cluster name
    :param verbose: int verbosity level
    :returns: A unix-style return code
    """
    system_config = load_system_paasta_config()

    marathon_config = marathon_tools.load_marathon_config()
    job_config = marathon_tools.load_marathon_service_config(service, instance, cluster, soa_dir=soa_dir)
    if not app_id:
        try:
            app_id = job_config.format_marathon_app_dict()['id']
        except NoDockerImageError:
            job_id = compose_job_id(service, instance)
            print "Docker image for %s not in deployments.json. Exiting. Has Jenkins deployed it?" % job_id
            return 1

    normal_instance_count = job_config.get_instances()
    normal_smartstack_count = marathon_tools.get_expected_instance_count_for_namespace(service, instance, cluster)
    proxy_port = marathon_tools.get_proxy_port_for_instance(service, instance, cluster, soa_dir=soa_dir)

    client = marathon_tools.get_marathon_client(marathon_config.get_url(), marathon_config.get_username(),
                                                marathon_config.get_password())
    if command == 'restart':
        restart_marathon_job(service, instance, app_id, client, cluster)
    elif command == 'status':
        print status_desired_state(service, instance, client, job_config)
        print status_marathon_job(service, instance, app_id, normal_instance_count, client)
        tasks, out = status_marathon_job_verbose(service, instance, client)
        if verbose > 0:
            print out
        print status_mesos_tasks(service, instance, normal_instance_count)
        if verbose > 0:
            tail_lines = calculate_tail_lines(verbose_level=verbose)
            print status_mesos_tasks_verbose(
                job_id=app_id,
                get_short_task_id=get_short_task_id,
                tail_lines=tail_lines,
            )
        if proxy_port is not None:
            print status_smartstack_backends(
                service=service,
                instance=instance,
                cluster=cluster,
                job_config=job_config,
                tasks=tasks,
                expected_count=normal_smartstack_count,
                soa_dir=soa_dir,
                verbose=verbose > 0,
                synapse_port=system_config.get_synapse_port(),
                synapse_haproxy_url_format=system_config.get_synapse_haproxy_url_format(),
            )
    else:
        # The command parser shouldn't have let us get this far...
        raise NotImplementedError("Command %s is not implemented!" % command)
    return 0
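perform_command returns a Unix-style exit code rather than exiting itself, so callers are expected to forward that code to the shell. A minimal sketch of a hypothetical wrapper, assuming an argparse-style args object with the same fields as the parameters above:

import sys

def run_from_args(args):
    # Hypothetical wrapper; every attribute on args is an assumption mirroring perform_command's signature.
    return_code = perform_command(
        command=args.command,
        service=args.service,
        instance=args.instance,
        cluster=args.cluster,
        verbose=args.verbose,
        soa_dir=args.soa_dir,
    )
    sys.exit(return_code)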
Example #3
def send_event(name, instance, soa_dir, status, output):
    """Send an event to sensu via pysensu_yelp with the given information.

    :param name: The service name the event is about
    :param instance: The instance of the service the event is about
    :param soa_dir: The service directory to read monitoring information from
    :param status: The status to emit for this event
    :param output: The output to emit for this event
    """
    cluster = load_system_paasta_config().get_cluster()
    monitoring_overrides = marathon_tools.load_marathon_service_config(
        name,
        instance,
        cluster,
        soa_dir=soa_dir,
        load_deployments=False,
    ).get_monitoring()
    # In order to let sensu know how often to expect this check to fire,
    # we need to set the ``check_every`` to the frequency of our cron job, which
    # is 10s.
    monitoring_overrides['check_every'] = '10s'
    # Most setup_marathon_job failures are transient and represent issues
    # that will probably be fixed eventually, so we set an alert_after
    # to suppress extra noise
    monitoring_overrides['alert_after'] = '10m'
    check_name = 'setup_marathon_job.%s' % compose_job_id(name, instance)
    monitoring_tools.send_event(name, check_name, monitoring_overrides, status,
                                output, soa_dir)
def send_event(service, namespace, cluster, soa_dir, status, output):
    """Send an event to sensu via pysensu_yelp with the given information.

    :param service: The service name the event is about
    :param namespace: The namespace of the service the event is about
    :param soa_dir: The service directory to read monitoring information from
    :param status: The status to emit for this event
    :param output: The output to emit for this event"""
    # This function assumes the input is a string like "mumble.main"
    monitoring_overrides = marathon_tools.load_marathon_service_config(service, namespace, cluster).get_monitoring()
    if "alert_after" not in monitoring_overrides:
        monitoring_overrides["alert_after"] = "2m"
    monitoring_overrides["check_every"] = "1m"
    monitoring_overrides["runbook"] = monitoring_tools.get_runbook(monitoring_overrides, service, soa_dir=soa_dir)

    check_name = "check_marathon_services_replication.%s" % compose_job_id(service, namespace)
    monitoring_tools.send_event(service, check_name, monitoring_overrides, status, output, soa_dir)
    _log(
        service=service,
        line="Replication: %s" % output,
        component="monitoring",
        level="debug",
        cluster=cluster,
        instance=namespace,
    )
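Both variants above expect a pysensu_yelp status constant and a free-form output string. A minimal sketch of calling the first variant (the setup_marathon_job one); the service name and soa_dir path are placeholders:

# Hypothetical call site for the first send_event above; all values are placeholders.
send_event(
    name='example_service',
    instance='main',
    soa_dir='/nail/etc/services',
    status=pysensu_yelp.Status.OK,
    output='setup_marathon_job completed without errors',
)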
Example #5
def send_sensu_bounce_keepalive(service, instance, cluster, soa_dir):
    """Send a Sensu event with a special ``ttl``, to let Sensu know that
    everything is fine. This event is **not** fired when the bounce is in
    progress.

    If the bounce goes on for too long, the ``ttl`` will expire and Sensu
    will emit a new event saying that this one didn't check in within the expected
    time-to-live."""
    ttl = '1h'
    monitoring_overrides = marathon_tools.load_marathon_service_config(
        service=service,
        instance=instance,
        cluster=cluster,
        soa_dir=soa_dir,
        load_deployments=False,
    ).get_monitoring()
    # Sensu currently emits events for expired ttl checks every 30s
    monitoring_overrides['check_every'] = '30s'
    monitoring_overrides['alert_after'] = '2m'
    monitoring_overrides['runbook'] = 'http://y/paasta-troubleshooting'
    monitoring_overrides['tip'] = ("Check out `paasta logs`. If the bounce hasn't made progress, "
                                   "it may mean that the new version isn't healthy.")
    # Dogfooding this alert till I'm comfortable it doesn't spam people
    monitoring_overrides['team'] = 'noop'
    monitoring_overrides['notification_email'] = '*****@*****.**'

    monitoring_tools.send_event(
        service=service,
        check_name='paasta_bounce_progress.%s' % compose_job_id(service, instance),
        overrides=monitoring_overrides,
        status=pysensu_yelp.Status.OK,
        output="The bounce is in a steady state",
        soa_dir=soa_dir,
        ttl=ttl,
    )
def send_event(service, namespace, cluster, soa_dir, status, output):
    """Send an event to sensu via pysensu_yelp with the given information.

    :param service: The service name the event is about
    :param namespace: The namespace of the service the event is about
    :param soa_dir: The service directory to read monitoring information from
    :param status: The status to emit for this event
    :param output: The output to emit for this event"""
    # This function assumes the input is a string like "mumble.main"
    monitoring_overrides = marathon_tools.load_marathon_service_config(
        service=service,
        instance=namespace,
        cluster=cluster,
        soa_dir=soa_dir,
        load_deployments=False,
    ).get_monitoring()
    if 'alert_after' not in monitoring_overrides:
        monitoring_overrides['alert_after'] = '2m'
    monitoring_overrides['check_every'] = '1m'
    monitoring_overrides['runbook'] = monitoring_tools.get_runbook(monitoring_overrides, service, soa_dir=soa_dir)

    check_name = 'check_marathon_services_replication.%s' % compose_job_id(service, namespace)
    monitoring_tools.send_event(service, check_name, monitoring_overrides, status, output, soa_dir)
    _log(
        service=service,
        line='Replication: %s' % output,
        component='monitoring',
        level='debug',
        cluster=cluster,
        instance=namespace,
    )
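The keepalive above is intended to be emitted from bounce code whenever a bounce pass finds nothing left to do, so the Sensu TTL check keeps getting refreshed. A sketch of a hypothetical call site; actions_remaining is an assumed flag computed by the surrounding bounce logic:

# Hypothetical call site: only refresh the TTL check when the bounce is in a steady state.
if not actions_remaining:  # assumption: set by the surrounding bounce loop
    send_sensu_bounce_keepalive(
        service=service,
        instance=instance,
        cluster=cluster,
        soa_dir=soa_dir,
    )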
Example #7
def run_marathon_app(context, job_id, instances):
    (service, instance, _, __) = decompose_job_id(job_id)
    job_config = marathon_tools.load_marathon_service_config(
        service=service,
        instance=instance,
        cluster=load_system_paasta_config().get_cluster(),
        soa_dir=context.soa_dir,
    )
    app_id = job_config.format_marathon_app_dict()['id']
    app_config = {
        'id': app_id,
        'cmd': '/bin/sleep 1m',
        'container': {
            'type': 'DOCKER',
            'docker': {
                'network': 'BRIDGE',
                'image': 'busybox',
            },
        },
        'instances': instances,
        'constraints': [["hostname", "UNIQUE"]],
    }
    paasta_tools.bounce_lib.create_marathon_app(
        app_id=app_id,
        config=app_config,
        client=context.marathon_clients.get_current_client_for_service(job_config),
    )
Example #8
def status_marathon_job(context, status, job_id):
    normal_instance_count = 1
    (service, instance, _, __) = decompose_job_id(job_id)
    job_config = marathon_tools.load_marathon_service_config(
        service=service,
        instance=instance,
        cluster=load_system_paasta_config().get_cluster(),
        soa_dir=context.soa_dir,
    )
    app_id = job_config.format_marathon_app_dict()['id']

    with requests_cache.disabled():
        tasks, output = marathon_serviceinit.status_marathon_job(
            service=service,
            instance=instance,
            cluster=load_system_paasta_config().get_cluster(),
            soa_dir=context.soa_dir,
            dashboards=None,
            normal_instance_count=normal_instance_count,
            clients=context.marathon_clients,
            job_config=job_config,
            desired_app_id=app_id,
            verbose=0,
        )
    assert status in output, f"{status!r} not found in {output!r}"
Example #10
def get_configs_of_services_to_scale(cluster, soa_dir=DEFAULT_SOA_DIR):
    services = get_services_for_cluster(
        cluster=cluster,
        instance_type='marathon',
        soa_dir=soa_dir,
    )
    configs = []
    for service, instance in services:
        try:
            service_config = load_marathon_service_config(
                service=service,
                instance=instance,
                cluster=cluster,
                soa_dir=soa_dir,
            )
        except NoDeploymentsAvailable:
            log.debug(
                "%s is not deployed yet, refusing to do autoscaling calculations for it"
                % compose_job_id(service, instance))
            continue

        if service_config.get_max_instances() and service_config.get_desired_state() == 'start' \
                and service_config.get_autoscaling_params()['decision_policy'] != 'bespoke':
            configs.append(service_config)

    return configs
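A sketch of a hypothetical consumer of the returned list, using only attributes that already appear in these examples (config.service, config.instance, get_max_instances):

# Hypothetical usage: log which instances qualify for autoscaling in a given cluster.
for config in get_configs_of_services_to_scale(cluster='example_cluster'):
    log.debug('%s.%s can scale up to %d instances' % (
        config.service, config.instance, config.get_max_instances()))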
Example #11
def get_instance_config_for_service(soa_dir, service):
    for cluster in list_clusters(
            service=service,
            soa_dir=soa_dir,
    ):
        for _, instance in get_service_instance_list(
                service=service,
                cluster=cluster,
                instance_type='marathon',
        ):
            yield load_marathon_service_config(
                service=service,
                instance=instance,
                cluster=cluster,
                soa_dir=soa_dir,
            )
        for _, instance in get_service_instance_list(
                service=service,
                cluster=cluster,
                instance_type='chronos',
        ):
            yield load_chronos_job_config(
                service=service,
                instance=instance,
                cluster=cluster,
                soa_dir=soa_dir,
            )
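Because the function above is a generator, callers iterate it lazily or materialize it into a list. A minimal hypothetical usage with a placeholder service name:

# Hypothetical usage: collect every marathon and chronos instance config for one service.
all_instance_configs = list(get_instance_config_for_service(
    soa_dir=DEFAULT_SOA_DIR,  # assumed to be the same module-level default used elsewhere here
    service='example_service',
))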
Example #12
def autoscale_services(soa_dir=DEFAULT_SOA_DIR):
    try:
        with create_autoscaling_lock():
            cluster = load_system_paasta_config().get_cluster()
            services = get_services_for_cluster(
                cluster=cluster,
                instance_type='marathon',
                soa_dir=soa_dir,
            )
            configs = []
            for service, instance in services:
                service_config = load_marathon_service_config(
                    service=service,
                    instance=instance,
                    cluster=cluster,
                    soa_dir=soa_dir,
                )
                if service_config.get_max_instances() and service_config.get_desired_state() == 'start':
                    configs.append(service_config)

            if configs:
                marathon_config = load_marathon_config()
                all_marathon_tasks = get_marathon_client(
                    url=marathon_config.get_url(),
                    user=marathon_config.get_username(),
                    passwd=marathon_config.get_password(),
                ).list_tasks()
                all_mesos_tasks = get_running_tasks_from_active_frameworks(
                    '')  # empty string matches all app ids
                with ZookeeperPool():
                    for config in configs:
                        if config.get_autoscaling_params()['decision_policy'] != 'bespoke':
                            try:
                                job_id = format_job_id(config.service,
                                                       config.instance)
                                marathon_tasks = {
                                    task.id: task
                                    for task in all_marathon_tasks
                                    if job_id == get_short_job_id(task.id)
                                    and task.health_check_results
                                }
                                if not marathon_tasks:
                                    raise MetricsProviderNoDataError(
                                        "Couldn't find any healthy marathon tasks"
                                    )
                                mesos_tasks = [
                                    task for task in all_mesos_tasks
                                    if task['id'] in marathon_tasks
                                ]
                                autoscale_marathon_instance(
                                    config, list(marathon_tasks.values()),
                                    mesos_tasks)
                            except Exception as e:
                                write_to_log(config=config,
                                             line='Caught Exception %s' % e)
                                raise
    except LockHeldException:
        pass
Example #13
def perform_command(command, service, instance, cluster, verbose, soa_dir, app_id=None, delta=None):
    """Performs a start/stop/restart/status/scale on an instance
    :param command: String of start, stop, restart, status or scale
    :param service: service name
    :param instance: instance name, like "main" or "canary"
    :param cluster: cluster name
    :param verbose: bool if the output should be verbose or not
    :returns: A unix-style return code
    """
    marathon_config = marathon_tools.load_marathon_config()
    job_config = marathon_tools.load_marathon_service_config(service, instance, cluster, soa_dir=soa_dir)
    if not app_id:
        try:
            app_id = marathon_tools.create_complete_config(service, instance, marathon_config, soa_dir=soa_dir)['id']
        except NoDockerImageError:
            job_id = compose_job_id(service, instance)
            print "Docker image for %s not in deployments.json. Exiting. Has Jenkins deployed it?" % job_id
            return 1

    normal_instance_count = job_config.get_instances()
    normal_smartstack_count = marathon_tools.get_expected_instance_count_for_namespace(service, instance)
    proxy_port = marathon_tools.get_proxy_port_for_instance(service, instance, soa_dir=soa_dir)

    client = marathon_tools.get_marathon_client(marathon_config.get_url(), marathon_config.get_username(),
                                                marathon_config.get_password())
    if command == 'start':
        start_marathon_job(service, instance, app_id, normal_instance_count, client, cluster)
    elif command == 'stop':
        stop_marathon_job(service, instance, app_id, client, cluster)
    elif command == 'restart':
        restart_marathon_job(service, instance, app_id, normal_instance_count, client, cluster)
    elif command == 'status':
        # Setting up transparent cache for http API calls
        requests_cache.install_cache('paasta_serviceinit', backend='memory')

        print status_desired_state(service, instance, client, job_config)
        print status_marathon_job(service, instance, app_id, normal_instance_count, client)
        tasks, out = status_marathon_job_verbose(service, instance, client)
        if verbose:
            print out
        print status_mesos_tasks(service, instance, normal_instance_count)
        if verbose:
            print status_mesos_tasks_verbose(app_id, get_short_task_id)
        if proxy_port is not None:
            print status_smartstack_backends(
                service=service,
                instance=instance,
                cluster=cluster,
                job_config=job_config,
                tasks=tasks,
                expected_count=normal_smartstack_count,
                soa_dir=soa_dir,
                verbose=verbose,
            )
    elif command == 'scale':
        scale_marathon_job(service, instance, app_id, delta, client, cluster)
    else:
        # The command parser shouldn't have let us get this far...
        raise NotImplementedError("Command %s is not implemented!" % command)
    return 0
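This older variant also accepts 'scale', which forwards the delta keyword to scale_marathon_job. A hypothetical invocation, assuming delta is the signed change in instance count:

# Hypothetical call: adjust example_service.main by two tasks (delta is only used by 'scale').
return_code = perform_command(
    command='scale',
    service='example_service',
    instance='main',
    cluster='example_cluster',
    verbose=False,
    soa_dir=DEFAULT_SOA_DIR,  # assumed default soa_dir constant
    delta=2,
)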
Example #14
def send_event_if_under_replication(
    service,
    instance,
    cluster,
    expected_count,
    num_available,
    soa_dir,
):
    full_name = compose_job_id(service, instance)
    job_config = marathon_tools.load_marathon_service_config(service, instance, cluster)
    crit_threshold = job_config.get_replication_crit_percentage()
    output = ('Service %s has %d out of %d expected instances available!\n' +
              '(threshold: %d%%)') % (full_name, num_available, expected_count, crit_threshold)
    under_replicated, _ = is_under_replicated(num_available, expected_count, crit_threshold)
    if under_replicated:
        log.error(output)
        status = pysensu_yelp.Status.CRITICAL
    else:
        log.info(output)
        status = pysensu_yelp.Status.OK
    send_event(
        service=service,
        namespace=instance,
        cluster=cluster,
        soa_dir=soa_dir,
        status=status,
        output=output)
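A sketch of a hypothetical call site; the expected count is taken from the same marathon_tools helper used in the newer examples above:

# Hypothetical call site: report on replication when only 2 backends are available.
expected = marathon_tools.get_expected_instance_count_for_namespace(service, instance, cluster)
send_event_if_under_replication(
    service=service,
    instance=instance,
    cluster=cluster,
    expected_count=expected,
    num_available=2,
    soa_dir=soa_dir,
)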
Example #15
def autoscale_services(soa_dir=DEFAULT_SOA_DIR):
    try:
        with create_autoscaling_lock():
            cluster = load_system_paasta_config().get_cluster()
            services = get_services_for_cluster(
                cluster=cluster,
                instance_type='marathon',
                soa_dir=soa_dir,
            )
            configs = []
            for service, instance in services:
                service_config = load_marathon_service_config(
                    service=service,
                    instance=instance,
                    cluster=cluster,
                    soa_dir=soa_dir,
                )
                if service_config.get_max_instances() and service_config.get_desired_state() == 'start':
                    configs.append(service_config)

            if configs:
                marathon_config = load_marathon_config()
                marathon_tasks = get_marathon_client(
                    url=marathon_config.get_url(),
                    user=marathon_config.get_username(),
                    passwd=marathon_config.get_password(),
                ).list_tasks()
                mesos_tasks = get_running_tasks_from_active_frameworks('')
                for config in configs:
                    try:
                        autoscale_marathon_instance(config, marathon_tasks, mesos_tasks)
                    except Exception as e:
                        write_to_log(config=config, line='Caught Exception %s' % e, level='event')
    except LockHeldException:
        pass
Example #16
def get_desired_marathon_configs(soa_dir):
    cluster = load_system_paasta_config().get_cluster()
    instances = get_services_for_cluster(instance_type="marathon",
                                         cluster=cluster,
                                         soa_dir=soa_dir)

    job_configs = dict()
    formatted_marathon_configs = dict()

    for service, instance in instances:
        try:
            job_config = load_marathon_service_config(service=service,
                                                      instance=instance,
                                                      cluster=cluster,
                                                      soa_dir=soa_dir)

            formatted_config = job_config.format_marathon_app_dict()
            formatted_marathon_configs[formatted_config["id"].lstrip("/")] = formatted_config
            job_configs[formatted_config["id"].lstrip("/")] = job_config
        # Not ideal but we rely on a lot of user input to create the app dict
        # and we really can't afford to bail if just one app definition is malformed
        except Exception as errormsg:
            _log(
                service=service,
                line=str(errormsg),
                component="deploy",
                level="debug",
                cluster=cluster,
                instance=instance,
            )
    return formatted_marathon_configs, job_configs
def get_instance_config_for_service(soa_dir, service):
    for cluster in list_clusters(
        service=service,
        soa_dir=soa_dir,
    ):
        for _, instance in get_service_instance_list(
            service=service,
            cluster=cluster,
            instance_type='marathon',
        ):
            yield load_marathon_service_config(
                service=service,
                instance=instance,
                cluster=cluster,
                soa_dir=soa_dir,
            )
        for _, instance in get_service_instance_list(
            service=service,
            cluster=cluster,
            instance_type='chronos',
        ):
            yield load_chronos_job_config(
                service=service,
                instance=instance,
                cluster=cluster,
                soa_dir=soa_dir,
            )
Example #18
def send_event(service, namespace, cluster, soa_dir, status, output):
    """Send an event to sensu via pysensu_yelp with the given information.

    :param service: The service name the event is about
    :param namespace: The namespace of the service the event is about
    :param soa_dir: The service directory to read monitoring information from
    :param status: The status to emit for this event
    :param output: The output to emit for this event"""
    # This function assumes the input is a string like "mumble.main"
    monitoring_overrides = marathon_tools.load_marathon_service_config(
        service, namespace, cluster).get_monitoring()
    if 'alert_after' not in monitoring_overrides:
        monitoring_overrides['alert_after'] = '2m'
    monitoring_overrides['check_every'] = '1m'
    monitoring_overrides['runbook'] = monitoring_tools.get_runbook(monitoring_overrides, service, soa_dir=soa_dir)

    check_name = 'check_marathon_services_replication.%s' % compose_job_id(service, namespace)
    monitoring_tools.send_event(service, check_name, monitoring_overrides, status, output, soa_dir)
    _log(
        service=service,
        line='Replication: %s' % output,
        component='monitoring',
        level='debug',
        cluster=cluster,
        instance=namespace,
    )
Example #19
def send_event(name, instance, soa_dir, status, output):
    """Send an event to sensu via pysensu_yelp with the given information.

    :param name: The service name the event is about
    :param instance: The instance of the service the event is about
    :param soa_dir: The service directory to read monitoring information from
    :param status: The status to emit for this event
    :param output: The output to emit for this event
    """
    cluster = load_system_paasta_config().get_cluster()
    monitoring_overrides = marathon_tools.load_marathon_service_config(
        name,
        instance,
        cluster,
        soa_dir=soa_dir,
        load_deployments=False,
    ).get_monitoring()
    # In order to let sensu know how often to expect this check to fire,
    # we need to set the ``check_every`` to the frequency of our cron job, which
    # is 10s.
    monitoring_overrides['check_every'] = '10s'
    # Most setup_marathon_job failures are transient and represent issues
    # that will probably be fixed eventually, so we set an alert_after
    # to suppress extra noise
    monitoring_overrides['alert_after'] = '10m'
    check_name = 'setup_marathon_job.%s' % compose_job_id(name, instance)
    monitoring_tools.send_event(name, check_name, monitoring_overrides, status, output, soa_dir)
Example #20
def get_desired_marathon_configs(soa_dir):
    cluster = load_system_paasta_config().get_cluster()
    instances = get_services_for_cluster(
        instance_type='marathon',
        cluster=cluster,
        soa_dir=soa_dir,
    )
    marathon_configs = dict()

    for service, instance in instances:
        try:
            marathon_config = load_marathon_service_config(
                service=service,
                instance=instance,
                cluster=cluster,
                soa_dir=soa_dir,
            ).format_marathon_app_dict()
            marathon_configs[marathon_config['id'].lstrip('/')] = marathon_config
        except NoSlavesAvailableError as errormsg:
            _log(
                service=service,
                line=errormsg,
                component='deploy',
                level='event',
                cluster=cluster,
                instance=instance,
            )
        except (NoDeploymentsAvailable, NoDockerImageError):
            pass
    return marathon_configs
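The returned dict is keyed by app id with the leading slash stripped. A hypothetical consumer that sums desired instance counts across all app definitions; the 'instances' key is the same field shown in the app dicts elsewhere in these examples:

# Hypothetical usage: total desired instances across every formatted marathon app dict.
marathon_configs = get_desired_marathon_configs(soa_dir=DEFAULT_SOA_DIR)
total_desired = sum(app.get('instances', 0) for app in marathon_configs.values())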
Example #21
def deploy_marathon_service(service, instance, client, soa_dir,
                            marathon_config):
    try:
        service_instance_config = marathon_tools.load_marathon_service_config(
            service,
            instance,
            load_system_paasta_config().get_cluster(),
            soa_dir=soa_dir,
        )
    except NoDeploymentsAvailable:
        log.debug(
            "No deployments found for %s.%s in cluster %s. Skipping." %
            (service, instance, load_system_paasta_config().get_cluster()))
        return 0
    except NoConfigurationForServiceError:
        error_msg = "Could not read marathon configuration file for %s.%s in cluster %s" % \
                    (service, instance, load_system_paasta_config().get_cluster())
        log.error(error_msg)
        return 1

    try:
        status, output = setup_service(service, instance, client,
                                       service_instance_config, soa_dir)
        sensu_status = pysensu_yelp.Status.CRITICAL if status else pysensu_yelp.Status.OK
        send_event(service, instance, soa_dir, sensu_status, output)
        return 0
    except (KeyError, TypeError, AttributeError, InvalidInstanceConfig):
        error_str = traceback.format_exc()
        log.error(error_str)
        send_event(service, instance, soa_dir, pysensu_yelp.Status.CRITICAL,
                   error_str)
        return 1
Example #22
def send_sensu_bounce_keepalive(service, instance, cluster, soa_dir):
    """Send a Sensu event with a special ``ttl``, to let Sensu know that
    everything is fine. This event is **not** fired when the bounce is in
    progress.

    If the bounce goes on for too long, the ``ttl`` will expire and Sensu
    will emit a new event saying that this one didn't check in within the expected
    time-to-live."""
    ttl = '1h'
    monitoring_overrides = marathon_tools.load_marathon_service_config(
        service=service,
        instance=instance,
        cluster=cluster,
        load_deployments=False,
    ).get_monitoring()
    # Sensu currently emits events for expired ttl checks every 30s
    monitoring_overrides['check_every'] = '30s'
    monitoring_overrides['alert_after'] = '2m'
    monitoring_overrides['runbook'] = 'http://y/paasta-troubleshooting'
    monitoring_overrides['tip'] = ("Check out `paasta logs`. If the bounce hasn't made progress, "
                                   "it may mean that the new version isn't healthy.")
    # Dogfooding this alert till I'm comfortable it doesn't spam people
    monitoring_overrides['team'] = 'noop'
    monitoring_overrides['notification_email'] = '*****@*****.**'

    monitoring_tools.send_event(
        service=service,
        check_name='paasta_bounce_progress.%s' % compose_job_id(service, instance),
        overrides=monitoring_overrides,
        status=pysensu_yelp.Status.OK,
        output="The bounce is in a steady state",
        soa_dir=soa_dir,
        ttl=ttl,
    )
Example #23
def get_configs_of_services_to_scale(cluster, soa_dir=DEFAULT_SOA_DIR):
    services = get_services_for_cluster(
        cluster=cluster,
        instance_type='marathon',
        soa_dir=soa_dir,
    )
    configs = []
    for service, instance in services:
        try:
            service_config = load_marathon_service_config(
                service=service,
                instance=instance,
                cluster=cluster,
                soa_dir=soa_dir,
            )
        except NoDeploymentsAvailable:
            log.debug("%s is not deployed yet, refusing to do autoscaling calculations for it" %
                      compose_job_id(service, instance))
            continue

        if service_config.get_max_instances() and service_config.get_desired_state() == 'start' \
                and service_config.get_autoscaling_params()['decision_policy'] != 'bespoke':
            configs.append(service_config)

    return configs
Example #24
def check_sha_changed(context, service_instance):
    service, instance, _, _ = decompose_job_id(service_instance)
    service_configuration_lib._yaml_cache = {}
    context.marathon_config = load_marathon_service_config(
        service, instance, context.cluster)
    assert context.app_id != context.marathon_config.format_marathon_app_dict()['id']
Example #25
def run_marathon_app(context, job_id, instances):
    (service, instance, _, __) = decompose_job_id(job_id)
    job_config = marathon_tools.load_marathon_service_config(
        service=service,
        instance=instance,
        cluster=load_system_paasta_config().get_cluster(),
        soa_dir=context.soa_dir,
    )
    app_id = job_config.format_marathon_app_dict()["id"]
    app_config = {
        "id": app_id,
        "cmd": "/bin/sleep 1m",
        "container": {
            "type": "DOCKER",
            "docker": {
                "network": "BRIDGE",
                "image": "busybox"
            },
        },
        "instances": instances,
        "constraints": [["hostname", "UNIQUE"]],
    }
    paasta_tools.bounce_lib.create_marathon_app(
        app_id=app_id,
        config=app_config,
        client=context.marathon_clients.get_current_client_for_service(
            job_config),
    )
Example #26
def deploy_marathon_service(service, instance, client, soa_dir, marathon_config):
    try:
        service_instance_config = marathon_tools.load_marathon_service_config(
            service,
            instance,
            load_system_paasta_config().get_cluster(),
            soa_dir=soa_dir,
        )
    except NoDeploymentsAvailable:
        log.debug("No deployments found for %s.%s in cluster %s. Skipping." %
                  (service, instance, load_system_paasta_config().get_cluster()))
        return 0
    except NoConfigurationForServiceError:
        error_msg = "Could not read marathon configuration file for %s.%s in cluster %s" % \
                    (service, instance, load_system_paasta_config().get_cluster())
        log.error(error_msg)
        return 1

    try:
        status, output = setup_service(service, instance, client, marathon_config,
                                       service_instance_config, soa_dir)
        sensu_status = pysensu_yelp.Status.CRITICAL if status else pysensu_yelp.Status.OK
        send_event(service, instance, soa_dir, sensu_status, output)
        return 0
    except (KeyError, TypeError, AttributeError, InvalidInstanceConfig):
        error_str = traceback.format_exc()
        log.error(error_str)
        send_event(service, instance, soa_dir, pysensu_yelp.Status.CRITICAL, error_str)
        return 1
def kill_marathon_app(full_appid, cluster, client, soa_dir):
    service, instance, _, __ = (s.replace("--", "_")
                                for s in decompose_job_id(full_appid))
    service_instance_config = marathon_tools.load_marathon_service_config(
        service=service, instance=instance, cluster=cluster, soa_dir=soa_dir)
    complete_config = service_instance_config.format_marathon_app_dict()
    registrations = service_instance_config.get_registrations()
    service_namespace_config = marathon_tools.load_service_namespace_config(
        service=service, namespace=registrations[0])
    drain_method = drain_lib.get_drain_method(
        service_instance_config.get_drain_method(service_namespace_config),
        service=service,
        instance=instance,
        registrations=registrations,
        drain_method_params=service_instance_config.get_drain_method_params(
            service_namespace_config),
    )

    bounce_func = bounce_lib.get_bounce_method_func("down")

    while marathon_tools.is_app_id_running(app_id=full_appid, client=client):
        app_to_kill = client.get_app(full_appid)
        (
            old_app_live_happy_tasks,
            old_app_live_unhappy_tasks,
            old_app_draining_tasks,
            old_app_at_risk_tasks,
        ) = get_tasks_by_state(
            other_apps=[app_to_kill],
            drain_method=drain_method,
            service=service,
            nerve_ns=registrations[0],
            bounce_health_params=service_instance_config.get_bounce_health_params(
                service_namespace_config),
        )
        do_bounce(
            bounce_func=bounce_func,
            drain_method=drain_method,
            config=complete_config,
            new_app_running="",
            happy_new_tasks=[],
            old_app_live_happy_tasks=old_app_live_happy_tasks,
            old_app_live_unhappy_tasks=old_app_live_unhappy_tasks,
            old_app_draining_tasks=old_app_draining_tasks,
            old_app_at_risk_tasks=old_app_at_risk_tasks,
            serviceinstance=f"{service}.{instance}",
            bounce_method="down",
            service=service,
            cluster=cluster,
            instance=instance,
            marathon_jobid=full_appid,
            client=client,
            soa_dir=soa_dir,
        )

        paasta_print("Sleeping for 10 seconds to give the tasks time to drain")
        time.sleep(10)

    paasta_print(f"Successfully killed {full_appid}")
Example #28
def main():
    """Attempt to set up the marathon service instance given.
    Exits 1 if the deployment failed.
    This is done in the following order:

    - Load the marathon configuration
    - Connect to marathon
    - Load the service instance's configuration
    - Create the complete marathon job configuration
    - Deploy/bounce the service
    - Emit an event about the deployment to sensu"""
    args = parse_args()
    soa_dir = args.soa_dir
    if args.verbose:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.WARNING)

    try:
        service, instance, _, __ = decompose_job_id(args.service_instance)
    except InvalidJobNameError:
        log.error("Invalid service instance specified. Format is service%sinstance." % SPACER)
        sys.exit(1)

    marathon_config = get_main_marathon_config()
    client = marathon_tools.get_marathon_client(marathon_config.get_url(), marathon_config.get_username(),
                                                marathon_config.get_password())

    try:
        service_instance_config = marathon_tools.load_marathon_service_config(
            service,
            instance,
            load_system_paasta_config().get_cluster(),
            soa_dir=soa_dir,
        )
    except NoDeploymentsAvailable:
        log.debug("No deployments found for %s in cluster %s. Skipping." % (args.service_instance,
                                                                            load_system_paasta_config().get_cluster()))
        sys.exit(0)
    except NoConfigurationForServiceError:
        error_msg = "Could not read marathon configuration file for %s in cluster %s" % \
            (args.service_instance, load_system_paasta_config().get_cluster())
        log.error(error_msg)
        sys.exit(1)

    try:
        status, output = setup_service(service, instance, client, marathon_config,
                                       service_instance_config, soa_dir)
        sensu_status = pysensu_yelp.Status.CRITICAL if status else pysensu_yelp.Status.OK
        send_event(service, instance, soa_dir, sensu_status, output)
        # We exit 0 because the script finished ok and the event was sent to the right team.
        sys.exit(0)
    except (KeyError, TypeError, AttributeError, InvalidInstanceConfig):
        import traceback
        error_str = traceback.format_exc()
        log.error(error_str)
        send_event(service, instance, soa_dir, pysensu_yelp.Status.CRITICAL, error_str)
        # We exit 0 because the script finished ok and the event was sent to the right team.
        sys.exit(0)
Example #29
def check_smartstack_replication_for_instance(
    service,
    instance,
    cluster,
    soa_dir,
    expected_count,
):
    """Check a set of namespaces to see if their number of available backends is too low,
    emitting events to Sensu based on the fraction available and the thresholds defined in
    the corresponding yelpsoa config.

    :param service: A string like example_service
    :param instance: An instance name, like "main"
    :param expected_count: The expected number of available instances
    :param cluster: name of the cluster
    :param soa_dir: The SOA configuration directory to read from
    """
    namespace = marathon_tools.read_namespace_for_service_instance(service, instance, soa_dir=soa_dir)
    if namespace != instance:
        log.debug("Instance %s is announced under namespace: %s. "
                  "Not checking replication for it" % (instance, namespace))
        return
    full_name = compose_job_id(service, instance)
    job_config = marathon_tools.load_marathon_service_config(service, instance, cluster)
    crit_threshold = job_config.get_replication_crit_percentage()
    monitoring_blacklist = job_config.get_monitoring_blacklist()
    log.info('Checking instance %s in smartstack', full_name)
    smartstack_replication_info = load_smartstack_info_for_service(
        service=service, namespace=namespace, soa_dir=soa_dir, blacklist=monitoring_blacklist)
    log.debug('Got smartstack replication info for %s: %s' % (full_name, smartstack_replication_info))

    if len(smartstack_replication_info) == 0:
        status = pysensu_yelp.Status.CRITICAL
        output = ('Service %s has no Smartstack replication info. Make sure the discover key in your smartstack.yaml '
                  'is valid!\n') % full_name
        log.error(output)
    else:
        expected_count_per_location = int(expected_count / len(smartstack_replication_info))
        output = ''
        under_replication_per_location = []

        for location, available_backends in sorted(smartstack_replication_info.iteritems()):
            num_available_in_location = available_backends.get(full_name, 0)
            under_replicated, ratio = is_under_replicated(
                num_available_in_location, expected_count_per_location, crit_threshold)
            if under_replicated:
                output += '- Service %s has %d out of %d expected instances in %s (CRITICAL: %d%%)\n' % (
                    full_name, num_available_in_location, expected_count_per_location, location, ratio)
            else:
                output += '- Service %s has %d out of %d expected instances in %s (OK: %d%%)\n' % (
                    full_name, num_available_in_location, expected_count_per_location, location, ratio)
            under_replication_per_location.append(under_replicated)

        if any(under_replication_per_location):
            status = pysensu_yelp.Status.CRITICAL
            log.error(output)
        else:
            status = pysensu_yelp.Status.OK
            log.info(output)
    send_event(service=service, namespace=instance, cluster=cluster, soa_dir=soa_dir, status=status, output=output)
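A sketch of a hypothetical driver that runs the check above for every marathon instance in a cluster, reusing helpers whose signatures appear in the other examples here:

# Hypothetical driver loop; helper signatures match the newer examples above.
for service, instance in get_services_for_cluster(cluster=cluster, instance_type='marathon', soa_dir=soa_dir):
    expected = marathon_tools.get_expected_instance_count_for_namespace(service, instance, cluster)
    check_smartstack_replication_for_instance(
        service=service,
        instance=instance,
        cluster=cluster,
        soa_dir=soa_dir,
        expected_count=expected,
    )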
Example #30
def kill_marathon_app(full_appid, cluster, client, soa_dir):
    service, instance, _, __ = (s.replace('--', '_') for s in decompose_job_id(full_appid))
    service_instance_config = marathon_tools.load_marathon_service_config(
        service=service,
        instance=instance,
        cluster=cluster,
        soa_dir=soa_dir,
    )
    complete_config = service_instance_config.format_marathon_app_dict()
    nerve_ns = service_instance_config.get_nerve_namespace()
    service_namespace_config = marathon_tools.load_service_namespace_config(service=service, namespace=nerve_ns)
    drain_method = drain_lib.get_drain_method(
        service_instance_config.get_drain_method(service_namespace_config),
        service=service,
        instance=instance,
        nerve_ns=nerve_ns,
        drain_method_params=service_instance_config.get_drain_method_params(service_namespace_config),
    )

    bounce_func = bounce_lib.get_bounce_method_func('down')

    while marathon_tools.is_app_id_running(app_id=full_appid, client=client):
        app_to_kill = client.get_app(full_appid)
        (
            old_app_live_happy_tasks,
            old_app_live_unhappy_tasks,
            old_app_draining_tasks,
            old_app_at_risk_tasks,
        ) = get_tasks_by_state(
            other_apps=[app_to_kill],
            drain_method=drain_method,
            service=service,
            nerve_ns=nerve_ns,
            bounce_health_params=service_instance_config.get_bounce_health_params(service_namespace_config),
        )
        do_bounce(
            bounce_func=bounce_func,
            drain_method=drain_method,
            config=complete_config,
            new_app_running='',
            happy_new_tasks=[],
            old_app_live_happy_tasks=old_app_live_happy_tasks,
            old_app_live_unhappy_tasks=old_app_live_unhappy_tasks,
            old_app_draining_tasks=old_app_draining_tasks,
            old_app_at_risk_tasks=old_app_at_risk_tasks,
            serviceinstance="{}.{}".format(service, instance),
            bounce_method='down',
            service=service,
            cluster=cluster,
            instance=instance,
            marathon_jobid=full_appid,
            client=client,
            soa_dir=soa_dir,
        )

        paasta_print("Sleeping for 10 seconds to give the tasks time to drain")
        time.sleep(10)

    paasta_print("Sucessfully killed {}".format(full_appid))
Example #31
def main():
    """Attempt to set up the marathon service instance given.
    Exits 1 if the deployment failed.
    This is done in the following order:

    - Load the marathon configuration
    - Connect to marathon
    - Load the service instance's configuration
    - Create the complete marathon job configuration
    - Deploy/bounce the service
    - Emit an event about the deployment to sensu"""
    args = parse_args()
    soa_dir = args.soa_dir
    if args.verbose:
        log.setLevel(logging.DEBUG)
    else:
        log.setLevel(logging.WARNING)
    try:
        service, instance, _, __ = decompose_job_id(args.service_instance)
    except InvalidJobNameError:
        log.error("Invalid service instance specified. Format is service%sinstance." % SPACER)
        sys.exit(1)

    marathon_config = get_main_marathon_config()
    client = marathon_tools.get_marathon_client(marathon_config.get_url(), marathon_config.get_username(),
                                                marathon_config.get_password())

    try:
        service_instance_config = marathon_tools.load_marathon_service_config(
            service,
            instance,
            load_system_paasta_config().get_cluster(),
            soa_dir=soa_dir,
        )
    except NoDeploymentsAvailable:
        log.debug("No deployments found for %s in cluster %s. Skipping." % (args.service_instance,
                                                                            load_system_paasta_config().get_cluster()))
        sys.exit(0)
    except NoConfigurationForServiceError:
        error_msg = "Could not read marathon configuration file for %s in cluster %s" % \
            (args.service_instance, load_system_paasta_config().get_cluster())
        log.error(error_msg)
        sys.exit(1)

    try:
        status, output = setup_service(service, instance, client, marathon_config,
                                       service_instance_config, soa_dir)
        sensu_status = pysensu_yelp.Status.CRITICAL if status else pysensu_yelp.Status.OK
        send_event(service, instance, soa_dir, sensu_status, output)
        # We exit 0 because the script finished ok and the event was sent to the right team.
        sys.exit(0)
    except (KeyError, TypeError, AttributeError, InvalidInstanceConfig):
        import traceback
        error_str = traceback.format_exc()
        log.error(error_str)
        send_event(service, instance, soa_dir, pysensu_yelp.Status.CRITICAL, error_str)
        # We exit 0 because the script finished ok and the event was sent to the right team.
        sys.exit(0)
Example #32
def marathon_instance_status(instance_status, service, instance, verbose):
    apps = marathon_tools.get_matching_appids(service, instance, settings.marathon_client)
    job_config = marathon_tools.load_marathon_service_config(
        service, instance, settings.cluster, soa_dir=settings.soa_dir)

    # bouncing status can be inferred from app_count, ref get_bouncing_status
    instance_status['app_count'] = len(apps)
    instance_status['bounce_method'] = job_config.get_bounce_method()
    instance_status['desired_state'] = job_config.get_desired_state()

    instance_status['marathon'] = marathon_job_status(settings.marathon_client, job_config)
def send_event_if_under_replication(
    service,
    instance,
    cluster,
    expected_count,
    num_available,
    soa_dir,
):
    full_name = compose_job_id(service, instance)
    job_config = marathon_tools.load_marathon_service_config(
        service, instance, cluster)
    crit_threshold = job_config.get_replication_crit_percentage()
    output = ('Service %s has %d out of %d expected instances available!\n' +
              '(threshold: %d%%)') % (full_name, num_available, expected_count,
                                      crit_threshold)
    under_replicated, _ = is_under_replicated(num_available, expected_count,
                                              crit_threshold)
    if under_replicated:
        output += (
            "\n\n"
            "What this alert means:\n"
            "\n"
            "  This replication alert means that the service PaaSTA can't keep the\n"
            "  requested number of copies up and healthy in the cluster.\n"
            "\n"
            "Reasons this might be happening:\n"
            "\n"
            "  The service may simply unhealthy. There also may not be enough resources\n"
            "  in the cluster to support the requested instance count.\n"
            "\n"
            "Things you can do:\n"
            "\n"
            "  * Increase the instance count\n"
            "  * Fix the cause of the unhealthy service. Try running:\n"
            "\n"
            "      paasta status -s %(service)s -i %(instance)s -c %(cluster)s -vv\n"
        ) % {
            'service': service,
            'instance': instance,
            'cluster': cluster,
        }
        log.error(output)
        status = pysensu_yelp.Status.CRITICAL
    else:
        log.info(output)
        status = pysensu_yelp.Status.OK
    send_event(
        service=service,
        namespace=instance,
        cluster=cluster,
        soa_dir=soa_dir,
        status=status,
        output=output,
    )
Example #34
def wait_launch_tasks(context, job_id, task_count):
    (service, instance, _, __) = decompose_job_id(job_id)
    job_config = marathon_tools.load_marathon_service_config(
        service=service,
        instance=instance,
        cluster=load_system_paasta_config().get_cluster(),
        soa_dir=context.soa_dir,
    )
    app_id = job_config.format_marathon_app_dict()['id']
    client = context.marathon_clients.get_current_client_for_service(job_config)
    itest_utils.wait_for_app_to_launch_tasks(client, app_id, task_count, exact_matches_only=True)
Example #35
File: sysdig.py  Project: white105/paasta
def paasta_sysdig(args):
    system_paasta_config = load_system_paasta_config()

    if not args.local:
        mesos_master = get_any_mesos_master(
            cluster=args.cluster, system_paasta_config=system_paasta_config)
        ssh_cmd = ('ssh -At -o StrictHostKeyChecking=no -o LogLevel=QUIET {0} '
                   '"sudo paasta {1} --local"').format(mesos_master,
                                                       ' '.join(sys.argv[1:]))
        return_code, output = _run(ssh_cmd)
        if return_code != 0:
            paasta_print(output)
            sys.exit(return_code)
        slave, command = output.split(':', 1)
        subprocess.call(
            shlex.split("ssh -tA {} '{}'".format(slave, command.strip())))
        return
    status = get_status_for_instance(
        cluster=args.cluster,
        service=args.service,
        instance=args.instance,
    )
    slave = pick_slave_from_status(
        status=status,
        host=args.host,
    )

    job_config = load_marathon_service_config(
        service=args.service,
        instance=args.instance,
        cluster=args.cluster,
    )

    marathon_servers = get_marathon_servers(system_paasta_config)
    marathon_clients = get_marathon_clients(marathon_servers)

    # Unfortunately, sysdig seems to only be able to take one marathon URL, so hopefully the service in question is not
    # currently moving between shards.
    client = marathon_clients.get_current_client_for_service(job_config=job_config)
    marathon_url = client.servers[0]
    marathon_user, marathon_pass = client.auth

    mesos_url = get_mesos_master().host
    marathon_parsed_url = urlparse(marathon_url)
    marathon_creds_url = marathon_parsed_url._replace(netloc="{}:{}@{}".format(
        marathon_user,
        marathon_pass,
        marathon_parsed_url.netloc,
    ))
    paasta_print(
        format_mesos_command(slave, status.marathon.app_id, mesos_url,
                             marathon_creds_url.geturl()))
Example #36
def get_priority(service, instance, cluster):
    try:
        config = load_marathon_service_config(
            service=service,
            instance=instance,
            cluster=cluster,
            soa_dir=DEFAULT_SOA_DIR,
        )
    except (NoDockerImageError, InvalidJobNameError, NoDeploymentsAvailable):
        return 0
    return config.get_bounce_priority()
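get_priority swallows missing-image and missing-deployment errors and falls back to 0, which makes it usable as a sort key. A hypothetical usage, assuming that a larger bounce priority should be handled first:

# Hypothetical usage: order (service, instance) pairs by bounce priority, highest first.
service_instances = [('example_service', 'main'), ('example_service', 'canary')]
service_instances.sort(
    key=lambda si: get_priority(si[0], si[1], cluster='example_cluster'),
    reverse=True,  # assumption: larger bounce_priority values should bounce sooner
)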
Example #37
def marathon_instance_status(instance_status, service, instance, verbose):
    mstatus = {}
    apps = marathon_tools.get_matching_appids(service, instance, settings.marathon_client)
    job_config = marathon_tools.load_marathon_service_config(
        service, instance, settings.cluster, soa_dir=settings.soa_dir)

    # bouncing status can be inferred from app_count, ref get_bouncing_status
    mstatus['app_count'] = len(apps)
    mstatus['desired_state'] = job_config.get_desired_state()
    mstatus['bounce_method'] = job_config.get_bounce_method()
    marathon_job_status(mstatus, settings.marathon_client, job_config)
    return mstatus
Example #38
0
def get_autoscaling_info(marathon_client, service, instance, cluster, soa_dir):
    service_config = load_marathon_service_config(
        service=service,
        instance=instance,
        cluster=cluster,
        soa_dir=soa_dir,
    )
    if service_config.get_max_instances() and service_config.get_desired_state() == 'start':
        all_marathon_tasks, all_mesos_tasks = get_all_marathon_mesos_tasks(
            marathon_client)
        autoscaling_params = service_config.get_autoscaling_params()
        autoscaling_params.update({'noop': True})
        try:
            marathon_tasks, mesos_tasks = filter_autoscaling_tasks(
                marathon_client,
                all_marathon_tasks,
                all_mesos_tasks,
                service_config,
            )
            utilization = get_utilization(
                marathon_service_config=service_config,
                autoscaling_params=autoscaling_params,
                log_utilization_data={},
                marathon_tasks=list(marathon_tasks.values()),
                mesos_tasks=mesos_tasks,
            )
            error = get_error_from_utilization(
                utilization=utilization,
                setpoint=autoscaling_params['setpoint'],
                current_instances=service_config.get_instances(),
            )
            new_instance_count = get_new_instance_count(
                utilization=utilization,
                error=error,
                autoscaling_params=autoscaling_params,
                current_instances=service_config.get_instances(),
                marathon_service_config=service_config,
                num_healthy_instances=len(marathon_tasks),
            )
            current_utilization = "{:.1f}%".format(utilization * 100)
        except MetricsProviderNoDataError:
            current_utilization = "Exception"
            new_instance_count = "Exception"
        return ServiceAutoscalingInfo(
            current_instances=str(service_config.get_instances()),
            max_instances=str(service_config.get_max_instances()),
            min_instances=str(service_config.get_min_instances()),
            current_utilization=current_utilization,
            target_instances=str(new_instance_count),
        )
    return None
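The chain above (utilization -> error -> new instance count) is a feedback loop around the configured setpoint. A toy arithmetic sketch of that idea only; the real get_error_from_utilization/get_new_instance_count helpers also apply tolerances and min/max bounds:

# Toy proportional-scaling arithmetic, for intuition only (made-up numbers).
setpoint = 0.8            # target utilization from autoscaling_params
utilization = 0.95        # measured utilization across healthy tasks
current_instances = 10

error = utilization - setpoint                        # roughly 0.15 over target
target = current_instances * (utilization / setpoint)
new_instance_count = max(1, round(target))            # 12 in this toy example
print(error, new_instance_count)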
Example #39
0
def marathon_instance_status(
    instance_status: Mapping[str, Any],
    service: str,
    instance: str,
    verbose: int,
    include_smartstack: bool,
    include_mesos: bool,
) -> Mapping[str, Any]:
    mstatus: Dict[str, Any] = {}

    job_config = marathon_tools.load_marathon_service_config(
        service, instance, settings.cluster, soa_dir=settings.soa_dir
    )
    marathon_apps_with_clients = marathon_tools.get_marathon_apps_with_clients(
        clients=settings.marathon_clients.get_all_clients_for_service(job_config),
        embed_tasks=True,
        service_name=service,
    )
    matching_apps_with_clients = marathon_tools.get_matching_apps_with_clients(
        service, instance, marathon_apps_with_clients
    )

    mstatus.update(
        marathon_job_status(
            service, instance, job_config, matching_apps_with_clients, verbose
        )
    )

    if include_smartstack:
        service_namespace_config = marathon_tools.load_service_namespace_config(
            service=service,
            namespace=job_config.get_nerve_namespace(),
            soa_dir=settings.soa_dir,
        )
        if "proxy_port" in service_namespace_config:
            tasks = [
                task for app, _ in matching_apps_with_clients for task in app.tasks
            ]

            mstatus["smartstack"] = marathon_smartstack_status(
                service,
                instance,
                job_config,
                service_namespace_config,
                tasks,
                should_return_individual_backends=verbose > 0,
            )

    if include_mesos:
        mstatus["mesos"] = marathon_mesos_status(service, instance, verbose)

    return mstatus
Example #40
0
def deploy_marathon_service(service, instance, client, soa_dir, marathon_config, marathon_apps):
    """Deploy the given service instance and process the return code;
    if there was an error we send a Sensu alert.

    :param service: The service name to setup
    :param instance: The instance of the service to setup
    :param client: A MarathonClient object
    :param soa_dir: Path to yelpsoa configs
    :param marathon_config: The service instance's configuration dict
    :param marathon_apps: A list of all marathon app objects
    :returns: A tuple of (status, bounce_in_seconds) to be used by paasta-deployd
        bounce_in_seconds instructs how long until the deployd should try another bounce
        None means that it is in a steady state and doesn't need to bounce again
    """
    short_id = marathon_tools.format_job_id(service, instance)
    try:
        with bounce_lib.bounce_lock_zookeeper(short_id):
            try:
                service_instance_config = marathon_tools.load_marathon_service_config(
                    service,
                    instance,
                    load_system_paasta_config().get_cluster(),
                    soa_dir=soa_dir,
                )
            except NoDeploymentsAvailable:
                log.debug("No deployments found for %s.%s in cluster %s. Skipping." %
                          (service, instance, load_system_paasta_config().get_cluster()))
                return 0, None
            except NoConfigurationForServiceError:
                error_msg = "Could not read marathon configuration file for %s.%s in cluster %s" % \
                            (service, instance, load_system_paasta_config().get_cluster())
                log.error(error_msg)
                return 1, None

            try:
                status, output, bounce_again_in_seconds = setup_service(service,
                                                                        instance,
                                                                        client,
                                                                        service_instance_config,
                                                                        marathon_apps,
                                                                        soa_dir)
                sensu_status = pysensu_yelp.Status.CRITICAL if status else pysensu_yelp.Status.OK
                send_event(service, instance, soa_dir, sensu_status, output)
                return 0, bounce_again_in_seconds
            except (KeyError, TypeError, AttributeError, InvalidInstanceConfig, NoSlavesAvailableError):
                error_str = traceback.format_exc()
                log.error(error_str)
                send_event(service, instance, soa_dir, pysensu_yelp.Status.CRITICAL, error_str)
                return 1, None
    except bounce_lib.LockHeldException:
        log.error("Instance %s already being bounced. Exiting", short_id)
        return 0, None
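The (status, bounce_in_seconds) tuple described in the docstring is the whole contract with the deploy loop. A hedged sketch of a hypothetical caller; a real deployer would typically re-enqueue the work rather than sleep inline:

import time

def deploy_and_maybe_retry(service, instance, client, soa_dir, marathon_config, marathon_apps):
    # Hypothetical wrapper around the function above, for illustration only.
    status, bounce_in_seconds = deploy_marathon_service(
        service, instance, client, soa_dir, marathon_config, marathon_apps)
    if bounce_in_seconds is None:
        return status  # steady state: no further bounce needed right now
    time.sleep(bounce_in_seconds)  # simplified stand-in for re-enqueueing the instance
    return status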
Example #41
0
def create_marathon_dashboard(
    cluster: str,
    soa_dir: str = DEFAULT_SOA_DIR,
    marathon_clients: MarathonClients = None,
    system_paasta_config: SystemPaastaConfig = None,
) -> Marathon_Dashboard:
    try:
        instances: List = get_services_for_cluster(
            cluster=cluster,
            instance_type='marathon',
            soa_dir=soa_dir,
        )
    except FileNotFoundError:
        instances = []
    dashboard: Marathon_Dashboard = {cluster: []}
    if system_paasta_config is None:
        system_paasta_config = load_system_paasta_config()
    marathon_servers = get_marathon_servers(
        system_paasta_config=system_paasta_config)
    if marathon_clients is None:
        marathon_clients = get_marathon_clients(
            marathon_servers=marathon_servers, cached=False)
    for service_instance in instances:
        service: str = service_instance[0]
        instance: str = service_instance[1]
        service_config: MarathonServiceConfig = load_marathon_service_config(
            service=service,
            instance=instance,
            cluster=cluster,
            load_deployments=False,
            soa_dir=soa_dir,
        )
        client: MarathonClient = marathon_clients.get_current_client_for_service(
            job_config=service_config)
        dashboard_links: Dict = system_paasta_config.get_dashboard_links()
        shard_url: str = client.servers[0]
        if 'Marathon RO' in dashboard_links[cluster]:
            marathon_links = dashboard_links[cluster]['Marathon RO']
            if isinstance(marathon_links, list):
                for shard_number, shard in enumerate(marathon_servers.current):
                    if shard.url[0] == shard_url:
                        shard_url = marathon_links[shard_number]
            elif isinstance(marathon_links, str):
                shard_url = marathon_links.split(' ')[0]
        service_info: Marathon_Dashboard_Item = {
            'service': service,
            'instance': instance,
            'shard_url': shard_url,
        }
        dashboard[cluster].append(service_info)
    return dashboard
Example #42
0
def get_autoscaler_count(request):
    service = request.swagger_data.get('service')
    instance = request.swagger_data.get('instance')
    cluster = settings.cluster
    soa_dir = settings.soa_dir
    service_config = load_marathon_service_config(
        service=service,
        instance=instance,
        cluster=cluster,
        soa_dir=soa_dir,
        load_deployments=False,
    )
    response_body = {'desired_instances': service_config.get_instances()}
    return Response(json_body=response_body, status_code=200)
Example #43
0
def send_event_if_under_replication(
    service,
    instance,
    cluster,
    expected_count,
    num_available,
    soa_dir,
):
    full_name = compose_job_id(service, instance)
    job_config = marathon_tools.load_marathon_service_config(service, instance, cluster)
    crit_threshold = job_config.get_replication_crit_percentage()
    output = ('Service %s has %d out of %d expected instances available!\n' +
              '(threshold: %d%%)') % (full_name, num_available, expected_count, crit_threshold)
    under_replicated, _ = is_under_replicated(num_available, expected_count, crit_threshold)
    if under_replicated:
        output += (
            "\n\n"
            "What this alert means:\n"
            "\n"
            "  This replication alert means that the service PaaSTA can't keep the\n"
            "  requested number of copies up and healthy in the cluster.\n"
            "\n"
            "Reasons this might be happening:\n"
            "\n"
            "  The service may simply be unhealthy. There also may not be enough resources\n"
            "  in the cluster to support the requested instance count.\n"
            "\n"
            "Things you can do:\n"
            "\n"
            "  * Increase the instance count\n"
            "  * Fix the cause of the unhealthy service. Try running:\n"
            "\n"
            "      paasta status -s %(service)s -i %(instance)s -c %(cluster)s -vv\n"
        ) % {
            'service': service,
            'instance': instance,
            'cluster': cluster,
        }
        log.error(output)
        status = pysensu_yelp.Status.CRITICAL
    else:
        log.info(output)
        status = pysensu_yelp.Status.OK
    send_event(
        service=service,
        namespace=instance,
        cluster=cluster,
        soa_dir=soa_dir,
        status=status,
        output=output)
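A worked example of the threshold check above, with made-up numbers and assuming the semantics implied by the output string (the alert fires when the available/expected ratio falls below the crit percentage):

expected_count = 10
num_available = 6
crit_threshold = 80   # percent, from get_replication_crit_percentage()

ratio = num_available / expected_count * 100   # 60.0%
under_replicated = ratio < crit_threshold      # True -> Sensu CRITICAL
print(under_replicated, round(ratio))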
Example #44
0
File: check.py Project: somic/paasta
def get_marathon_steps(service, soa_dir):
    """This is a kind of funny function that gets all the marathon instances
    for a service and massages it into a form that matches up with what
    deploy.yaml's steps look like. This is only so we can compare it 1-1
    with what deploy.yaml has for linting."""
    steps = []
    for cluster in list_clusters(service, soa_dir):
        for _, instance in get_service_instance_list(
            service=service, cluster=cluster, instance_type="marathon", soa_dir=soa_dir
        ):
            config = load_marathon_service_config(
                service=service, instance=instance, cluster=cluster, soa_dir=soa_dir, load_deployments=False
            )
            steps.append(config.get_deploy_group())
    return steps
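As the docstring says, the point of this helper is a 1-1 comparison against deploy.yaml. A hedged sketch of what that lint might look like; the deploy-step set and the soa_dir path below are placeholders:

# Hypothetical lint: deploy groups used by marathon configs should match deploy.yaml steps.
marathon_steps = set(get_marathon_steps("example_service", "path/to/yelpsoa-configs"))
deploy_yaml_steps = {"dev.everything", "prod.canary", "prod.main"}   # made-up step names

missing_from_deploy_yaml = marathon_steps - deploy_yaml_steps
unused_in_marathon = deploy_yaml_steps - marathon_steps
if missing_from_deploy_yaml or unused_in_marathon:
    print("deploy.yaml and marathon configs disagree:",
          missing_from_deploy_yaml, unused_in_marathon)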
Example #45
0
def get_autoscaler_count(request):
    service = request.swagger_data.get("service")
    instance = request.swagger_data.get("instance")
    cluster = settings.cluster
    soa_dir = settings.soa_dir
    try:
        service_config = load_marathon_service_config(
            service=service, instance=instance, cluster=cluster, soa_dir=soa_dir, load_deployments=False
        )
    except Exception:
        error_message = "Unable to load service config for %s.%s" % (service, instance)
        raise ApiFailure(error_message, 404)

    response_body = {"desired_instances": service_config.get_instances()}
    return Response(json_body=response_body, status_code=200)
Example #46
0
def autoscale_services(soa_dir=DEFAULT_SOA_DIR):
    try:
        with create_autoscaling_lock():
            cluster = load_system_paasta_config().get_cluster()
            services = get_services_for_cluster(
                cluster=cluster,
                instance_type='marathon',
                soa_dir=soa_dir,
            )
            configs = []
            for service, instance in services:
                service_config = load_marathon_service_config(
                    service=service,
                    instance=instance,
                    cluster=cluster,
                    soa_dir=soa_dir,
                )
                if service_config.get_max_instances() and service_config.get_desired_state() == 'start' \
                        and service_config.get_autoscaling_params()['decision_policy'] != 'bespoke':
                    configs.append(service_config)

            if configs:
                marathon_config = load_marathon_config()
                marathon_client = get_marathon_client(
                    url=marathon_config.get_url(),
                    user=marathon_config.get_username(),
                    passwd=marathon_config.get_password())
                all_marathon_tasks = marathon_client.list_tasks()
                all_mesos_tasks = get_running_tasks_from_active_frameworks('')  # empty string matches all app ids
                with ZookeeperPool():
                    for config in configs:
                        try:
                            job_id = format_job_id(config.service, config.instance)
                            # Get a dict of healthy tasks, we assume tasks with no healthcheck defined
                            # are healthy. We assume tasks with no healthcheck results but a defined
                            # healthcheck to be unhealthy.
                            marathon_tasks = {task.id: task for task in all_marathon_tasks
                                              if job_id == get_short_job_id(task.id) and
                                              (is_task_healthy(task) or not
                                               marathon_client.get_app(task.app_id).health_checks)}
                            if not marathon_tasks:
                                raise MetricsProviderNoDataError("Couldn't find any healthy marathon tasks")
                            mesos_tasks = [task for task in all_mesos_tasks if task['id'] in marathon_tasks]
                            autoscale_marathon_instance(config, list(marathon_tasks.values()), mesos_tasks)
                        except Exception as e:
                            write_to_log(config=config, line='Caught Exception %s' % e)
    except LockHeldException:
        pass
Example #47
0
File: utils.py Project: oktopuz/paasta
def get_instance_configs_for_service(service, soa_dir):
    for cluster in list_clusters(
        service=service,
        soa_dir=soa_dir,
    ):
        for _, instance in get_service_instance_list(
            service=service,
            cluster=cluster,
            instance_type='marathon',
            soa_dir=soa_dir,
        ):
            yield load_marathon_service_config(
                service=service,
                instance=instance,
                cluster=cluster,
                soa_dir=soa_dir,
                load_deployments=False,
            )
        for _, instance in get_service_instance_list(
            service=service,
            cluster=cluster,
            instance_type='chronos',
            soa_dir=soa_dir,
        ):
            yield load_chronos_job_config(
                service=service,
                instance=instance,
                cluster=cluster,
                soa_dir=soa_dir,
                load_deployments=False,
            )
        for _, instance in get_service_instance_list(
            service=service,
            cluster=cluster,
            instance_type='adhoc',
            soa_dir=soa_dir,
        ):
            yield load_adhoc_job_config(
                service=service,
                instance=instance,
                cluster=cluster,
                soa_dir=soa_dir,
                load_deployments=False,
            )
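Because this is a generator that yields one config object per marathon, chronos, and adhoc instance, callers usually just iterate it. A minimal usage sketch with placeholder names; the accessor calls assume the shared instance-config interface:

for instance_config in get_instance_configs_for_service(
    service="example_service",            # placeholder service name
    soa_dir="path/to/yelpsoa-configs",    # placeholder soa_dir
):
    # Assumed common accessors from the shared config interface.
    print(instance_config.get_instance(), instance_config.get_deploy_group())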
Example #48
0
def get_desired_marathon_configs(soa_dir):
    cluster = load_system_paasta_config().get_cluster()
    instances = get_services_for_cluster(
        instance_type='marathon',
        cluster=cluster,
        soa_dir=soa_dir,
    )
    marathon_configs = dict()

    for service, instance in instances:
        try:
            marathon_config = load_marathon_service_config(
                service=service,
                instance=instance,
                cluster=cluster,
                soa_dir=soa_dir,
            ).format_marathon_app_dict()
            marathon_configs[marathon_config['id'].lstrip('/')] = marathon_config
        except NoDockerImageError:
            # This service hasn't been deployed yet
            pass
    return marathon_configs
Example #49
0
File: autoscaler.py Project: Yelp/paasta
def update_autoscaler_count(request):
    service = request.swagger_data.get('service')
    instance = request.swagger_data.get('instance')
    desired_instances = request.swagger_data.get('json_body')['desired_instances']

    try:
        service_config = load_marathon_service_config(
            service=service,
            instance=instance,
            cluster=settings.cluster,
            soa_dir=settings.soa_dir,
            load_deployments=False,
        )
    except Exception:
        error_message = 'Unable to load service config for %s.%s' % (service, instance)
        raise ApiFailure(error_message, 404)

    max_instances = service_config.get_max_instances()
    if max_instances is None:
        error_message = 'Autoscaling is not enabled for %s.%s' % (service, instance)
        raise ApiFailure(error_message, 404)

    min_instances = service_config.get_min_instances()

    # Dump whatever number from the client to zk. get_instances() will limit
    # readings from zk to [min_instances, max_instances].
    set_instances_for_marathon_service(service=service, instance=instance, instance_count=desired_instances)
    status = 'SUCCESS'
    if desired_instances > max_instances:
        desired_instances = max_instances
        status = 'WARNING desired_instances is greater than max_instances %d' % max_instances
    elif desired_instances < min_instances:
        desired_instances = min_instances
        status = 'WARNING desired_instances is less than min_instances %d' % min_instances

    response_body = {'desired_instances': desired_instances, 'status': status}
    return Response(json_body=response_body, status_code=202)
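The ZooKeeper comment is the subtle part: the raw desired count is written as-is, and only reads are clamped to [min_instances, max_instances], which is what the WARNING statuses report. A standalone illustration of that clamping (plain arithmetic, not the real get_instances()):

def clamp_desired_instances(desired, min_instances, max_instances):
    # Illustration only: the effective count is bounded even if a larger or
    # smaller value was written to ZooKeeper.
    return max(min_instances, min(max_instances, desired))

print(clamp_desired_instances(50, min_instances=2, max_instances=20))  # 20
print(clamp_desired_instances(1, min_instances=2, max_instances=20))   # 2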
Example #50
0
def main():
    args = parse_args()
    full_appid = args.appname.lstrip('/')
    soa_dir = args.soa_dir
    marathon_config = marathon_tools.load_marathon_config()
    client = marathon_tools.get_marathon_client(
        url=marathon_config.get_url(),
        user=marathon_config.get_username(),
        passwd=marathon_config.get_password(),
    )

    if not marathon_tools.is_app_id_running(app_id=full_appid, client=client):
        print("Couldn't find an app named {0}".format(full_appid))
        sys.exit(1)

    service, instance, _, __ = (s.replace('--', '_') for s in decompose_job_id(full_appid))
    complete_config = marathon_tools.create_complete_config(service, instance, marathon_config)
    cluster = load_system_paasta_config().get_cluster()
    service_instance_config = marathon_tools.load_marathon_service_config(
        service=service,
        instance=instance,
        cluster=cluster,
        soa_dir=soa_dir,
    )
    nerve_ns = service_instance_config.get_nerve_namespace()
    service_namespace_config = marathon_tools.load_service_namespace_config(service=service, namespace=nerve_ns)
    drain_method = drain_lib.get_drain_method(
        service_instance_config.get_drain_method(service_namespace_config),
        service=service,
        instance=instance,
        nerve_ns=nerve_ns,
        drain_method_params=service_instance_config.get_drain_method_params(service_namespace_config),
    )

    bounce_func = bounce_lib.get_bounce_method_func('down')

    while marathon_tools.is_app_id_running(app_id=full_appid, client=client):
        app_to_kill = client.get_app(full_appid)
        old_app_live_tasks, old_app_draining_tasks = get_old_live_draining_tasks([app_to_kill], drain_method)
        do_bounce(
            bounce_func=bounce_func,
            drain_method=drain_method,
            config=complete_config,
            new_app_running='',
            happy_new_tasks=[],
            old_app_live_tasks=old_app_live_tasks,
            old_app_draining_tasks=old_app_draining_tasks,
            serviceinstance="{0}.{1}".format(service, instance),
            bounce_method='down',
            service=service,
            cluster=cluster,
            instance=instance,
            marathon_jobid=full_appid,
            client=client,
            soa_dir=soa_dir,
        )

        print("Sleeping for 10 seconds to give the tasks time to drain")
        time.sleep(10)

    print("Successfully killed {0}".format(full_appid))
Example #51
0
def check_smartstack_replication_for_instance(
    service,
    instance,
    cluster,
    soa_dir,
    expected_count,
    system_paasta_config,
):
    """Check a service instance's smartstack namespace to see if its number of available
    backends is too low, emitting an event to Sensu based on the fraction available and the
    thresholds defined in the corresponding yelpsoa config.

    :param service: A string like example_service
    :param instance: An instance name, like "main"
    :param cluster: name of the cluster
    :param soa_dir: The SOA configuration directory to read from
    :param expected_count: The expected number of instances for this service instance
    :param system_paasta_config: A SystemPaastaConfig object representing the system configuration.
    """
    namespace = marathon_tools.read_namespace_for_service_instance(service, instance, soa_dir=soa_dir)
    if namespace != instance:
        log.debug("Instance %s is announced under namespace: %s. "
                  "Not checking replication for it" % (instance, namespace))
        return
    full_name = compose_job_id(service, instance)
    job_config = marathon_tools.load_marathon_service_config(service, instance, cluster)
    crit_threshold = job_config.get_replication_crit_percentage()
    monitoring_blacklist = job_config.get_monitoring_blacklist()
    log.info('Checking instance %s in smartstack', full_name)
    smartstack_replication_info = load_smartstack_info_for_service(
        service=service,
        namespace=namespace,
        soa_dir=soa_dir,
        blacklist=monitoring_blacklist,
        system_paasta_config=system_paasta_config,
    )
    log.debug('Got smartstack replication info for %s: %s' % (full_name, smartstack_replication_info))

    if len(smartstack_replication_info) == 0:
        status = pysensu_yelp.Status.CRITICAL
        output = ('Service %s has no Smartstack replication info. Make sure the discover key in your smartstack.yaml '
                  'is valid!\n') % full_name
        log.error(output)
    else:
        expected_count_per_location = int(expected_count / len(smartstack_replication_info))
        output = ''
        under_replication_per_location = []

        for location, available_backends in sorted(smartstack_replication_info.iteritems()):
            num_available_in_location = available_backends.get(full_name, 0)
            under_replicated, ratio = is_under_replicated(
                num_available_in_location, expected_count_per_location, crit_threshold)
            if under_replicated:
                output += '- Service %s has %d out of %d expected instances in %s (CRITICAL: %d%%)\n' % (
                    full_name, num_available_in_location, expected_count_per_location, location, ratio)
            else:
                output += '- Service %s has %d out of %d expected instances in %s (OK: %d%%)\n' % (
                    full_name, num_available_in_location, expected_count_per_location, location, ratio)
            under_replication_per_location.append(under_replicated)

        if any(under_replication_per_location):
            status = pysensu_yelp.Status.CRITICAL
            output += (
                "\n\n"
                "What this alert means:\n"
                "\n"
                "  This replication alert means that a SmartStack powered loadbalancer (haproxy)\n"
                "  doesn't have enough healthy backends. Not having enough healthy backends\n"
                "  means that clients of that service will get 503s (http) or connection refused\n"
                "  (tcp) when trying to connect to it.\n"
                "\n"
                "Reasons this might be happening:\n"
                "\n"
                "  The service may simply not have enough copies or it could simply be\n"
                "  unhealthy in that location. There also may not be enough resources\n"
                "  in the cluster to support the requested instance count.\n"
                "\n"
                "Things you can do:\n"
                "\n"
                "  * Fix the cause of the unhealthy service. Try running:\n"
                "\n"
                "      paasta status -s %(service)s -i %(instance)s -c %(cluster)s -vv\n"
                "\n"
                "  * Widen SmartStack discovery settings\n"
                "  * Increase the instance count\n"
                "\n"
            ) % {
                'service': service,
                'instance': instance,
                'cluster': cluster,
            }
            log.error(output)
        else:
            status = pysensu_yelp.Status.OK
            log.info(output)
    send_event(service=service, namespace=instance, cluster=cluster, soa_dir=soa_dir, status=status, output=output)
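A worked example of the per-location arithmetic above, with made-up numbers: expected_count is split evenly across the discovered locations, then each location is checked against the crit threshold on its own.

expected_count = 9
crit_threshold = 50   # percent
full_name = "example_service.main"
smartstack_replication_info = {            # made-up backend counts per location
    "uswest1-prod": {full_name: 3},
    "uswest2-prod": {full_name: 1},
}

expected_per_location = int(expected_count / len(smartstack_replication_info))  # 4
for location, backends in sorted(smartstack_replication_info.items()):
    available = backends.get(full_name, 0)
    ratio = available / expected_per_location * 100
    print(location, available, expected_per_location, ratio < crit_threshold)
# uswest1-prod: 3 of 4 (75%)  -> OK
# uswest2-prod: 1 of 4 (25%)  -> under-replicated, which drives the CRITICAL status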