Example #1
    def test_get_human_time(self):
        timestamp = 1565092435
        human_time = "2019-08-06T11:53:55.000000Z"
        timestamp_2 = 1565095633.9568892
        human_time_2 = "2019-08-06T12:47:13.956889Z"
        self.assertEqual(get_human_time(timestamp), human_time)
        self.assertEqual(get_human_time(timestamp_2), human_time_2)
Example #2
    def __init__(self, config, step, timestamp, client=None, delete=False):

        # Init dataclass fields from SLO config and Error Budget Policy
        self.__set_fields(**config,
                          **step,
                          lambdas={
                              'slo_target': float,
                              'alerting_burn_rate_threshold': float
                          })
        # Set other fields
        self.window = int(step['measurement_window_seconds'])
        self.timestamp = int(timestamp)
        self.timestamp_human = utils.get_human_time(timestamp)
        self.valid = True
        self.metadata = config.get('metadata', {})

        # Get backend results
        data = self.run_backend(config, client=client, delete=delete)
        if not self._validate(data):
            self.valid = False
            return

        # Build SLO report
        self.build(step, data)

        # Post validation
        if not self._post_validate():
            self.valid = False
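
A minimal sketch of what a __set_fields helper like the one used above might do, assuming it copies keyword arguments onto the instance and applies the lambdas mapping for type coercion; the real helper may differ.

    def __set_fields(self, lambdas=None, **kwargs):
        """Set instance attributes from keyword args, coercing via `lambdas`.

        Args:
            lambdas (dict): Map of field name to a callable used to convert
                the raw value, e.g. {'slo_target': float}.
            **kwargs: Field values merged from the SLO config and policy step.
        """
        lambdas = lambdas or {}
        for name, value in kwargs.items():
            if name in lambdas:
                value = lambdas[name](value)  # e.g. float('0.999') -> 0.999
            setattr(self, name, value)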
Example #3
    def test_get_human_time(self):
        # Timezones
        tz_1 = 'Europe/Paris'
        tz_2 = 'America/Chicago'

        # Timestamp 1
        timestamp = 1565092435
        utc_time = "2019-08-06T11:53:55.000000"
        human_paris_1 = get_human_time(timestamp, timezone=tz_1)
        human_chicago_1 = get_human_time(timestamp, timezone=tz_2)

        # Timestamp 2
        timestamp_2 = 1565095633.9568892
        utc_time_2 = "2019-08-06T12:47:13.956889"
        human_paris_2 = get_human_time(timestamp_2, timezone=tz_1)
        human_chicago_2 = get_human_time(timestamp_2, timezone=tz_2)

        self.assertEqual(human_paris_1, utc_time + "+02:00")
        self.assertEqual(human_chicago_1, utc_time + "-05:00")
        self.assertEqual(human_paris_2, utc_time_2 + "+02:00")
        self.assertEqual(human_chicago_2, utc_time_2 + "-05:00")
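
The tests above pin down get_human_time()'s contract: with no timezone it returns the UTC time suffixed with 'Z', and with a timezone name it keeps the UTC wall time and appends that zone's UTC offset. A minimal sketch satisfying both, assuming dateutil for timezone lookup; the real implementation may differ.

from datetime import datetime

from dateutil import tz


def get_human_time(timestamp, timezone=None):
    """Convert a UNIX timestamp to an ISO 8601 human-readable string."""
    dt_utc = datetime.fromtimestamp(timestamp, tz=tz.UTC)
    base = dt_utc.strftime('%Y-%m-%dT%H:%M:%S.%f')
    if timezone is None:
        return base + 'Z'
    # Keep the UTC wall time; only the offset reflects the target zone,
    # matching the expected values in the tests above.
    offset = dt_utc.astimezone(tz.gettz(timezone)).strftime('%z')
    return base + offset[:3] + ':' + offset[3:]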
Example #4
    def __init__(self, config, step, timestamp, client=None, delete=False):

        # Init dataclass fields from SLO config and Error Budget Policy
        self.__set_fields(**config,
                          **step,
                          lambdas={
                              'slo_target': float,
                              'alerting_burn_rate_threshold': int
                          })

        # Set other fields
        self.window = int(step['measurement_window_seconds'])
        self.timestamp = int(timestamp)
        self.timestamp_human = utils.get_human_time(timestamp)

        # Get backend results
        result = self.run_backend(config, client=client, delete=delete)
        if result:
            self.build(step, result)
Example #5
def make_measurement(slo_config, step, backend_result, timestamp):
    """Measure following metrics: SLI, SLO, Error Budget, Burn Rate.

    Args:
        slo_config (dict): SLO configuration.
        step (dict): Step config.
        backend_result (tuple or float): A tuple (good_event_count,
            bad_event_count) or the SLI value as a float.
        timestamp (int): UNIX timestamp.

    Returns:
        dict: Report dictionary.
    """
    slo_full_name = get_full_slo_name(slo_config)
    step_name = step['error_budget_policy_step_name']
    info = f"{slo_full_name :<25} | {step_name :<8}"

    LOGGER.debug(f"{info} | SLO report starting ...")

    # Some backends return the SLI value directly; others return a tuple
    # (good_event_count, bad_event_count) from which we compute the SLI.
    if not isinstance(backend_result, tuple):
        if backend_result == 0:
            LOGGER.error(f"{info} | Null SLI value.")
            return None
        good_event_count, bad_event_count = None, None
        sli = round(backend_result, 6)
    else:
        good_event_count, bad_event_count = backend_result
        if (good_event_count + bad_event_count) == 0:
            LOGGER.error(f"{info} | {step_name} | No events found.")
            return None
        LOGGER.debug(f"{info} Good event count: {good_event_count}")
        LOGGER.debug(f"{info} Bad event count: {bad_event_count}")
        sli = round(good_event_count / (good_event_count + bad_event_count), 6)

    slo_target = float(slo_config['slo_target'])
    window = int(step['measurement_window_seconds'])
    alerting_burn_rate_threshold = int(step['alerting_burn_rate_threshold'])
    overburned_consequence_message = step['overburned_consequence_message']
    achieved_consequence_message = step['achieved_consequence_message']
    timestamp_human = utils.get_human_time(timestamp)

    # Compute SLI and gap between SLI / SLO target.
    gap = sli - slo_target

    # Compute Error Budget (target, current value, remaining minutes, available
    # minutes).
    error_budget_target = 1 - slo_target
    error_budget_value = 1 - sli
    error_budget_remaining_minutes = window * gap / 60
    error_minutes = window * error_budget_value / 60
    error_budget_minutes = window * error_budget_target / 60

    # Compute Error Budget Burn rate: the % of consumed error budget.
    if error_budget_target == 0:
        error_budget_burn_rate = 0
    else:
        error_budget_burn_rate = round(
            error_budget_value / error_budget_target, 1)

    # Alert boolean on burn rate excessive speed.
    alert = error_budget_burn_rate > alerting_burn_rate_threshold

    # Set consequence message as derived from the Error Budget Policy file.
    if alert:
        consequence_message = overburned_consequence_message
    elif error_budget_burn_rate <= 1:
        consequence_message = achieved_consequence_message
    else:
        consequence_message = (
            'Missed for this measurement window, but not enough to alert')

    # Build out result
    result = {
        'service_name': slo_config['service_name'],
        'feature_name': slo_config['feature_name'],
        'slo_name': slo_config['slo_name'],
        'slo_target': slo_config['slo_target'],
        'slo_description': slo_config['slo_description'],
        'error_budget_policy_step_name': step_name,
        'error_budget_remaining_minutes': error_budget_remaining_minutes,
        'error_budget_minutes': error_budget_minutes,
        'error_minutes': error_minutes,
        'error_budget_target': error_budget_target,
        'timestamp_human': timestamp_human,
        'timestamp': timestamp,
        'consequence_message': consequence_message,
        'window': window,
        'bad_events_count': bad_event_count,
        'good_events_count': good_event_count,
        'sli_measurement': sli,
        'gap': gap,
        'error_budget_measurement': error_budget_value,
        'error_budget_burn_rate': error_budget_burn_rate,
        'alerting_burn_rate_threshold': alerting_burn_rate_threshold,
        'alert': alert
    }
    LOGGER.debug(pprint.pformat(result))
    sli_percent = round(sli * 100, 6)
    LOGGER.info(f"{info} | "
                f"SLI: {sli_percent} % | "
                f"Target: {slo_target * 100} % | "
                f"Burnrate: {error_budget_burn_rate :<2} | "
                f"Target burnrate: {alerting_burn_rate_threshold} | "
                f"Alert: {alert}")
    return result
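
An illustrative call to make_measurement() above; the config and policy-step values below are made-up examples, not values from the source project.

slo_config = {
    'service_name': 'web',
    'feature_name': 'checkout',
    'slo_name': 'availability',
    'slo_description': '99% of checkout requests succeed',
    'slo_target': 0.99,
}
step = {
    'error_budget_policy_step_name': '1h',
    'measurement_window_seconds': 3600,
    'alerting_burn_rate_threshold': 2,
    'overburned_consequence_message': 'Page the on-call engineer',
    'achieved_consequence_message': 'No action needed',
}

# Tuple form: 980 good / 20 bad events -> SLI = 0.98, burn rate = 2.0.
report = make_measurement(slo_config, step, (980, 20), timestamp=1565092435)

# Float form: the backend already computed the SLI.
report = make_measurement(slo_config, step, 0.98, timestamp=1565092435)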
Example #6
def make_measurement(slo_config, step, good_event_count, bad_event_count,
                     timestamp):
    """Measure following metrics: SLI, SLO, Error Budget, Burn Rate.

    Args:
        slo_config (dict): SLO configuration.
        step (dict): Step config.
        good_event_count (int): Good events count.
        bad_event_count (int): Bad events count.
        timestamp (int): UNIX timestamp.

    Returns:
        dict: Report dictionary.
    """
    LOGGER.info("Making SLO measurements for step '%s'",
                step['error_budget_policy_step_name'])
    if (good_event_count + bad_event_count) == 0:
        error = "No valid events for {}/{}/{}/{}".format(
            slo_config['service_name'], slo_config['feature_name'],
            slo_config['slo_name'], step['error_budget_policy_step_name'])
        LOGGER.error(error)
        return None

    LOGGER.debug("Good event count: %s" % good_event_count)
    LOGGER.debug("Bad event count: %s" % bad_event_count)

    slo_target = float(slo_config['slo_target'])
    window = int(step['measurement_window_seconds'])
    alerting_burn_rate_threshold = int(step['alerting_burn_rate_threshold'])
    overburned_consequence_message = step['overburned_consequence_message']
    achieved_consequence_message = step['achieved_consequence_message']
    step_name = step['error_budget_policy_step_name']
    timestamp_human = utils.get_human_time(timestamp)

    # Compute SLI and gap between SLI / SLO target.
    sli = good_event_count / (good_event_count + bad_event_count)
    gap = sli - slo_target

    # Compute Error Budget (target, current value, remaining minutes, available
    # minutes).
    error_budget_target = 1 - slo_target
    error_budget_measurement = 1 - sli
    error_budget_remaining_minutes = window * gap / 60
    error_minutes = window * error_budget_measurement / 60
    error_budget_minutes = window * error_budget_target / 60

    # Compute Error Budget Burn rate: the % of consumed error budget.
    # Guard against division by zero when slo_target is 1.0.
    if error_budget_target == 0:
        error_budget_burn_rate = 0
    else:
        error_budget_burn_rate = error_budget_measurement / error_budget_target

    # Alert boolean on burn rate excessive speed.
    alert = error_budget_burn_rate > alerting_burn_rate_threshold

    # Set consequence message as derived from the Error Budget Policy file.
    if alert:
        consequence_message = overburned_consequence_message
    elif error_budget_burn_rate <= 1:
        consequence_message = achieved_consequence_message
    else:
        consequence_message = (
            'Missed for this measurement window, but not enough to alert')

    # Build out result
    result = OrderedDict({
        'service_name': slo_config['service_name'],
        'feature_name': slo_config['feature_name'],
        'slo_name': slo_config['slo_name'],
        'slo_target': slo_config['slo_target'],
        'slo_description': slo_config['slo_description'],
        'error_budget_policy_step_name': step_name,
        'error_budget_remaining_minutes': error_budget_remaining_minutes,
        'error_budget_minutes': error_budget_minutes,
        'error_minutes': error_minutes,
        'error_budget_target': error_budget_target,
        'timestamp_human': timestamp_human,
        'timestamp': timestamp,
        'consequence_message': consequence_message,
        'window': window,
        'bad_events_count': bad_event_count,
        'good_events_count': good_event_count,
        'sli_measurement': sli,
        'gap': gap,
        'error_budget_measurement': error_budget_measurement,
        'error_budget_burn_rate': error_budget_burn_rate,
        'alerting_burn_rate_threshold': alerting_burn_rate_threshold,
        'alert': alert
    })
    LOGGER.debug(pprint.pformat(result))
    return result
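
A quick worked pass through the error budget arithmetic above, with illustrative numbers:

# With slo_target = 0.99, a 3600 s window, and sli = 0.98:
#   error_budget_target            = 1 - 0.99          = 0.01
#   error_budget_measurement       = 1 - 0.98          = 0.02
#   error_budget_burn_rate         = 0.02 / 0.01       = 2.0
#   gap                            = 0.98 - 0.99       = -0.01
#   error_budget_minutes           = 3600 * 0.01 / 60  = 0.6
#   error_minutes                  = 3600 * 0.02 / 60  = 1.2
#   error_budget_remaining_minutes = 3600 * -0.01 / 60 = -0.6
# A burn rate of 2.0 means the error budget is being consumed twice as fast
# as allowed; with an alerting_burn_rate_threshold of 1 this would alert.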