Exemplo n.º 1
0
def synapse_replication_is_low(service, instance, system_paasta_config,
                               local_backends):
    crit_threshold = 80
    namespace = read_namespace_for_service_instance(service=service,
                                                    instance=instance)
    smartstack_replication_info = load_smartstack_info_for_service(
        service=service,
        namespace=namespace,
        blacklist=[],
        system_paasta_config=system_paasta_config,
    )
    expected_count = get_expected_instance_count_for_namespace(
        service=service, namespace=namespace)
    expected_count_per_location = int(expected_count /
                                      len(smartstack_replication_info))
    synapse_name = "%s.%s" % (service, namespace)
    local_replication = get_replication_for_services(
        synapse_host=system_paasta_config.get_default_synapse_host(),
        synapse_port=system_paasta_config.get_synapse_port(),
        synapse_haproxy_url_format=system_paasta_config.
        get_synapse_haproxy_url_format(),
        services=[synapse_name],
    )
    num_available = local_replication.get(synapse_name, 0)
    under_replicated, ratio = utils.is_under_replicated(
        num_available, expected_count_per_location, crit_threshold)
    log.info('Service %s.%s has %d out of %d expected instances' %
             (service, instance, num_available, expected_count_per_location))
    return under_replicated
Exemplo n.º 2
0
def check_smartstack_replication_for_instance(
    service,
    instance,
    cluster,
    soa_dir,
    expected_count,
):
    """Check a set of namespaces to see if their number of available backends is too low,
    emitting events to Sensu based on the fraction available and the thresholds defined in
    the corresponding yelpsoa config.

    :param service: A string like example_service
    :param namespace: A nerve namespace, like "main"
    :param cluster: name of the cluster
    :param soa_dir: The SOA configuration directory to read from
    """
    namespace = marathon_tools.read_namespace_for_service_instance(service, instance, soa_dir=soa_dir)
    if namespace != instance:
        log.debug("Instance %s is announced under namespace: %s. "
                  "Not checking replication for it" % (instance, namespace))
        return
    full_name = compose_job_id(service, instance)
    job_config = marathon_tools.load_marathon_service_config(service, instance, cluster)
    crit_threshold = job_config.get_replication_crit_percentage()
    monitoring_blacklist = job_config.get_monitoring_blacklist()
    log.info('Checking instance %s in smartstack', full_name)
    smartstack_replication_info = load_smartstack_info_for_service(
        service=service, namespace=namespace, soa_dir=soa_dir, blacklist=monitoring_blacklist)
    log.debug('Got smartstack replication info for %s: %s' % (full_name, smartstack_replication_info))

    if len(smartstack_replication_info) == 0:
        status = pysensu_yelp.Status.CRITICAL
        output = ('Service %s has no Smartstack replication info. Make sure the discover key in your smartstack.yaml '
                  'is valid!\n') % full_name
        log.error(output)
    else:
        expected_count_per_location = int(expected_count / len(smartstack_replication_info))
        output = ''
        under_replication_per_location = []

        for location, available_backends in sorted(smartstack_replication_info.iteritems()):
            num_available_in_location = available_backends.get(full_name, 0)
            under_replicated, ratio = is_under_replicated(
                num_available_in_location, expected_count_per_location, crit_threshold)
            if under_replicated:
                output += '- Service %s has %d out of %d expected instances in %s (CRITICAL: %d%%)\n' % (
                    full_name, num_available_in_location, expected_count_per_location, location, ratio)
            else:
                output += '- Service %s has %d out of %d expected instances in %s (OK: %d%%)\n' % (
                    full_name, num_available_in_location, expected_count_per_location, location, ratio)
            under_replication_per_location.append(under_replicated)

        if any(under_replication_per_location):
            status = pysensu_yelp.Status.CRITICAL
            log.error(output)
        else:
            status = pysensu_yelp.Status.OK
            log.info(output)
    send_event(service=service, namespace=instance, cluster=cluster, soa_dir=soa_dir, status=status, output=output)
Exemplo n.º 3
0
def status_smartstack_backends(service, instance, job_config, cluster, tasks,
                               expected_count, soa_dir, verbose, synapse_port,
                               synapse_haproxy_url_format):
    """Returns detailed information about smartstack backends for a service
    and instance.
    return: A newline separated string of the smarststack backend status
    """
    output = []
    nerve_ns = marathon_tools.read_namespace_for_service_instance(
        service, instance, cluster)
    service_instance = compose_job_id(service, nerve_ns)

    service_namespace_config = marathon_tools.load_service_namespace_config(
        service, instance, soa_dir=soa_dir)
    discover_location_type = service_namespace_config.get_discover()
    monitoring_blacklist = job_config.get_monitoring_blacklist()

    filtered_slaves = get_all_slaves_for_blacklist_whitelist(
        blacklist=monitoring_blacklist, whitelist=[])

    grouped_slaves = get_mesos_slaves_grouped_by_attribute(
        slaves=filtered_slaves,
        attribute=discover_location_type,
    )

    # rebuild the dict, replacing the slave object
    # with just their hostname
    grouped_slave_hostname = {
        attribute_value: [slave['hostname'] for slave in slaves]
        for attribute_value, slaves in grouped_slaves.items()
    }

    if len(grouped_slave_hostname) == 0:
        output.append("Smartstack: ERROR - %s is NOT in smartstack at all!" %
                      service_instance)
    else:
        output.append("Smartstack:")
        if verbose:
            output.append("  Haproxy Service Name: %s" % service_instance)
            output.append("  Backends:")

        output.extend(
            pretty_print_smartstack_backends_for_locations(
                service_instance=service_instance,
                tasks=tasks,
                locations=grouped_slave_hostname,
                expected_count=expected_count,
                verbose=verbose,
                synapse_port=synapse_port,
                synapse_haproxy_url_format=synapse_haproxy_url_format,
            ))
    return "\n".join(output)
Exemplo n.º 4
0
def status_smartstack_backends(service, instance, job_config, cluster, tasks,
                               expected_count, soa_dir, verbose, synapse_port,
                               synapse_haproxy_url_format):
    """Returns detailed information about smartstack backends for a service
    and instance.
    return: A newline separated string of the smarststack backend status
    """
    output = []
    nerve_ns = marathon_tools.read_namespace_for_service_instance(
        service, instance, cluster)
    service_instance = compose_job_id(service, nerve_ns)

    if instance != nerve_ns:
        ns_string = PaastaColors.bold(nerve_ns)
        output.append(
            "Smartstack: N/A - %s is announced in the %s namespace." %
            (instance, ns_string))
        # If verbose mode is specified, then continue to show backends anyway, otherwise stop early
        if not verbose:
            return "\n".join(output)

    service_namespace_config = marathon_tools.load_service_namespace_config(
        service, instance, soa_dir=soa_dir)
    discover_location_type = service_namespace_config.get_discover()
    monitoring_blacklist = job_config.get_monitoring_blacklist()
    unique_attributes = get_mesos_slaves_grouped_by_attribute(
        attribute=discover_location_type, blacklist=monitoring_blacklist)
    if len(unique_attributes) == 0:
        output.append("Smartstack: ERROR - %s is NOT in smartstack at all!" %
                      service_instance)
    else:
        output.append("Smartstack:")
        if verbose:
            output.append("  Haproxy Service Name: %s" % service_instance)
            output.append("  Backends:")

        output.extend(
            pretty_print_smartstack_backends_for_locations(
                service_instance,
                tasks,
                unique_attributes,
                expected_count,
                verbose,
                synapse_port,
                synapse_haproxy_url_format,
            ))
    return "\n".join(output)
Exemplo n.º 5
0
def status_smartstack_backends(service, instance, job_config, cluster, tasks, expected_count, soa_dir, verbose,
                               synapse_port, synapse_haproxy_url_format):
    """Returns detailed information about smartstack backends for a service
    and instance.
    return: A newline separated string of the smarststack backend status
    """
    output = []
    nerve_ns = marathon_tools.read_namespace_for_service_instance(service, instance, cluster)
    service_instance = compose_job_id(service, nerve_ns)

    if instance != nerve_ns:
        ns_string = PaastaColors.bold(nerve_ns)
        output.append("Smartstack: N/A - %s is announced in the %s namespace." % (instance, ns_string))
        # If verbose mode is specified, then continue to show backends anyway, otherwise stop early
        if not verbose:
            return "\n".join(output)

    service_namespace_config = marathon_tools.load_service_namespace_config(service, instance, soa_dir=soa_dir)
    discover_location_type = service_namespace_config.get_discover()
    monitoring_blacklist = job_config.get_monitoring_blacklist()
    unique_attributes = get_mesos_slaves_grouped_by_attribute(
        attribute=discover_location_type, blacklist=monitoring_blacklist)
    if len(unique_attributes) == 0:
        output.append("Smartstack: ERROR - %s is NOT in smartstack at all!" % service_instance)
    else:
        output.append("Smartstack:")
        if verbose:
            output.append("  Haproxy Service Name: %s" % service_instance)
            output.append("  Backends:")

        output.extend(pretty_print_smartstack_backends_for_locations(
            service_instance,
            tasks,
            unique_attributes,
            expected_count,
            verbose,
            synapse_port,
            synapse_haproxy_url_format,
        ))
    return "\n".join(output)
def check_smartstack_replication_for_instance(
    service,
    instance,
    cluster,
    soa_dir,
    expected_count,
    system_paasta_config,
):
    """Check a set of namespaces to see if their number of available backends is too low,
    emitting events to Sensu based on the fraction available and the thresholds defined in
    the corresponding yelpsoa config.

    :param service: A string like example_service
    :param namespace: A nerve namespace, like "main"
    :param cluster: name of the cluster
    :param soa_dir: The SOA configuration directory to read from
    :param system_paasta_config: A SystemPaastaConfig object representing the system configuration.
    """
    namespace = marathon_tools.read_namespace_for_service_instance(service, instance, soa_dir=soa_dir)
    if namespace != instance:
        log.debug("Instance %s is announced under namespace: %s. "
                  "Not checking replication for it" % (instance, namespace))
        return
    full_name = compose_job_id(service, instance)
    job_config = marathon_tools.load_marathon_service_config(service, instance, cluster)
    crit_threshold = job_config.get_replication_crit_percentage()
    monitoring_blacklist = job_config.get_monitoring_blacklist()
    log.info('Checking instance %s in smartstack', full_name)
    smartstack_replication_info = load_smartstack_info_for_service(
        service=service,
        namespace=namespace,
        soa_dir=soa_dir,
        blacklist=monitoring_blacklist,
        system_paasta_config=system_paasta_config,
    )
    log.debug('Got smartstack replication info for %s: %s' % (full_name, smartstack_replication_info))

    if len(smartstack_replication_info) == 0:
        status = pysensu_yelp.Status.CRITICAL
        output = ('Service %s has no Smartstack replication info. Make sure the discover key in your smartstack.yaml '
                  'is valid!\n') % full_name
        log.error(output)
    else:
        expected_count_per_location = int(expected_count / len(smartstack_replication_info))
        output = ''
        under_replication_per_location = []

        for location, available_backends in sorted(smartstack_replication_info.iteritems()):
            num_available_in_location = available_backends.get(full_name, 0)
            under_replicated, ratio = is_under_replicated(
                num_available_in_location, expected_count_per_location, crit_threshold)
            if under_replicated:
                output += '- Service %s has %d out of %d expected instances in %s (CRITICAL: %d%%)\n' % (
                    full_name, num_available_in_location, expected_count_per_location, location, ratio)
            else:
                output += '- Service %s has %d out of %d expected instances in %s (OK: %d%%)\n' % (
                    full_name, num_available_in_location, expected_count_per_location, location, ratio)
            under_replication_per_location.append(under_replicated)

        if any(under_replication_per_location):
            status = pysensu_yelp.Status.CRITICAL
            output += (
                "\n\n"
                "What this alert means:\n"
                "\n"
                "  This replication alert means that a SmartStack powered loadbalancer (haproxy)\n"
                "  doesn't have enough healthy backends. Not having enough healthy backends\n"
                "  means that clients of that service will get 503s (http) or connection refused\n"
                "  (tcp) when trying to connect to it.\n"
                "\n"
                "Reasons this might be happening:\n"
                "\n"
                "  The service may simply not have enough copies or it could simply be\n"
                "  unhealthy in that location. There also may not be enough resources\n"
                "  in the cluster to support the requested instance count.\n"
                "\n"
                "Things you can do:\n"
                "\n"
                "  * Fix the cause of the unhealthy service. Try running:\n"
                "\n"
                "      paasta status -s %(service)s -i %(instance)s -c %(cluster)s -vv\n"
                "\n"
                "  * Widen SmartStack discovery settings\n"
                "  * Increase the instance count\n"
                "\n"
            ) % {
                'service': service,
                'instance': instance,
                'cluster': cluster,
            }
            log.error(output)
        else:
            status = pysensu_yelp.Status.OK
            log.info(output)
    send_event(service=service, namespace=instance, cluster=cluster, soa_dir=soa_dir, status=status, output=output)
def check_smartstack_replication_for_instance(
    service,
    instance,
    cluster,
    soa_dir,
    expected_count,
    system_paasta_config,
):
    """Check a set of namespaces to see if their number of available backends is too low,
    emitting events to Sensu based on the fraction available and the thresholds defined in
    the corresponding yelpsoa config.

    :param service: A string like example_service
    :param namespace: A nerve namespace, like "main"
    :param cluster: name of the cluster
    :param soa_dir: The SOA configuration directory to read from
    :param system_paasta_config: A SystemPaastaConfig object representing the system configuration.
    """
    namespace = marathon_tools.read_namespace_for_service_instance(service, instance, soa_dir=soa_dir)
    if namespace != instance:
        log.debug("Instance %s is announced under namespace: %s. "
                  "Not checking replication for it" % (instance, namespace))
        return
    full_name = compose_job_id(service, instance)
    job_config = marathon_tools.load_marathon_service_config(service, instance, cluster)
    crit_threshold = job_config.get_replication_crit_percentage()
    monitoring_blacklist = job_config.get_monitoring_blacklist()
    log.info('Checking instance %s in smartstack', full_name)
    smartstack_replication_info = load_smartstack_info_for_service(
        service=service,
        namespace=namespace,
        soa_dir=soa_dir,
        blacklist=monitoring_blacklist,
        system_paasta_config=system_paasta_config,
    )
    log.debug('Got smartstack replication info for %s: %s' % (full_name, smartstack_replication_info))

    if len(smartstack_replication_info) == 0:
        status = pysensu_yelp.Status.CRITICAL
        output = ('Service %s has no Smartstack replication info. Make sure the discover key in your smartstack.yaml '
                  'is valid!\n') % full_name
        log.error(output)
    else:
        expected_count_per_location = int(expected_count / len(smartstack_replication_info))
        output = ''
        under_replication_per_location = []

        for location, available_backends in sorted(smartstack_replication_info.iteritems()):
            num_available_in_location = available_backends.get(full_name, 0)
            under_replicated, ratio = is_under_replicated(
                num_available_in_location, expected_count_per_location, crit_threshold)
            if under_replicated:
                output += '- Service %s has %d out of %d expected instances in %s (CRITICAL: %d%%)\n' % (
                    full_name, num_available_in_location, expected_count_per_location, location, ratio)
            else:
                output += '- Service %s has %d out of %d expected instances in %s (OK: %d%%)\n' % (
                    full_name, num_available_in_location, expected_count_per_location, location, ratio)
            under_replication_per_location.append(under_replicated)

        if any(under_replication_per_location):
            status = pysensu_yelp.Status.CRITICAL
            output += (
                "\n\n"
                "What this alert means:\n"
                "\n"
                "  This replication alert means that a SmartStack powered loadbalancer (haproxy)\n"
                "  doesn't have enough healthy backends. Not having enough healthy backends\n"
                "  means that clients of that service will get 503s (http) or connection refused\n"
                "  (tcp) when trying to connect to it.\n"
                "\n"
                "Reasons this might be happening:\n"
                "\n"
                "  The service may simply not have enough copies or it could simply be\n"
                "  unhealthy in that location. There also may not be enough resources\n"
                "  in the cluster to support the requested instance count.\n"
                "\n"
                "Things you can do:\n"
                "\n"
                "  * Fix the cause of the unhealthy service. Try running:\n"
                "\n"
                "      paasta status -s %(service)s -i %(instance)s -c %(cluster)s -vv\n"
                "\n"
                "  * Widen SmartStack discovery settings\n"
                "  * Increase the instance count\n"
                "\n"
            ) % {
                'service': service,
                'instance': instance,
                'cluster': cluster,
            }
            log.error(output)
        else:
            status = pysensu_yelp.Status.OK
            log.info(output)
    send_event(service=service, namespace=instance, cluster=cluster, soa_dir=soa_dir, status=status, output=output)