def test_get_human_time(self):
    """Check `get_human_time` against known timestamp/string pairs."""
    # (input timestamp, expected rendering): a whole-second timestamp first,
    # then one carrying a fractional part.
    expectations = (
        (1565092435, "2019-08-06T13:53:55.000000Z"),
        (1565095633.9568892, "2019-08-06T14:47:13.956889Z"),
    )
    for raw_timestamp, expected in expectations:
        self.assertEqual(get_human_time(raw_timestamp), expected)
def __init__(self, config, step, timestamp, client=None, delete=False):
    """Initialise the report fields, query the backend and build the report.

    Args:
        config (dict): SLO config; unpacked into the field initialiser and
            read for its optional 'metadata' entry.
        step (dict): Error Budget Policy step; unpacked into the field
            initialiser and read for 'measurement_window_seconds'.
        timestamp: Value converted with `int()` for `self.timestamp` and
            passed to `utils.get_human_time` for `self.timestamp_human`.
        client: Forwarded to `run_backend` (defaults to None).
        delete (bool): Forwarded to `run_backend` (defaults to False).
    """
    # Dataclass fields come from both the SLO config and the Error Budget
    # Policy step. The `lambdas` mapping is presumably used by
    # `__set_fields` to convert the named fields -- confirm against it.
    field_converters = {
        'slo_target': float,
        'alerting_burn_rate_threshold': float,
    }
    self.__set_fields(**config, **step, lambdas=field_converters)

    # Remaining attributes that are not plain copies of the inputs.
    self.window = int(step['measurement_window_seconds'])
    self.timestamp = int(timestamp)
    self.timestamp_human = utils.get_human_time(timestamp)
    self.valid = True
    self.metadata = config.get('metadata', {})

    # Query the backend; stop here and flag the report as invalid when the
    # returned data does not pass validation.
    backend_data = self.run_backend(config, client=client, delete=delete)
    if not self._validate(backend_data):
        self.valid = False
        return

    # Derive the SLO report from the backend data.
    self.build(step, backend_data)

    # A failed post-validation also marks the report as invalid.
    if not self._post_validate():
        self.valid = False
def test_get_human_time(self):
    """Check `get_human_time` for two timestamps in two timezones."""
    paris_tz, chicago_tz = 'Europe/Paris', 'America/Chicago'

    # Each timestamp is paired with the date-time prefix expected in the
    # output; the timezone offset suffix is appended per timezone below.
    first_ts, first_prefix = 1565092435, "2019-08-06T11:53:55.000000"
    second_ts, second_prefix = (1565095633.9568892,
                                "2019-08-06T12:47:13.956889")

    # Run every conversion before asserting anything.
    paris_first = get_human_time(first_ts, timezone=paris_tz)
    chicago_first = get_human_time(first_ts, timezone=chicago_tz)
    paris_second = get_human_time(second_ts, timezone=paris_tz)
    chicago_second = get_human_time(second_ts, timezone=chicago_tz)

    checks = (
        (paris_first, first_prefix + "+02:00"),
        (chicago_first, first_prefix + "-05:00"),
        (paris_second, second_prefix + "+02:00"),
        (chicago_second, second_prefix + "-05:00"),
    )
    for actual, expected in checks:
        self.assertEqual(actual, expected)
def __init__(self, config, step, timestamp, client=None, delete=False):
    """Initialise the report fields, query the backend and build the report.

    Args:
        config (dict): SLO config; unpacked into the field initialiser and
            handed to `run_backend`.
        step (dict): Error Budget Policy step; unpacked into the field
            initialiser and read for 'measurement_window_seconds'.
        timestamp: Value converted with `int()` for `self.timestamp` and
            passed to `utils.get_human_time` for `self.timestamp_human`.
        client: Forwarded to `run_backend` (defaults to None).
        delete (bool): Forwarded to `run_backend` (defaults to False).
    """
    # Dataclass fields come from both the SLO config and the Error Budget
    # Policy step. The `lambdas` mapping is presumably used by
    # `__set_fields` to convert the named fields -- confirm against it.
    field_converters = {
        'slo_target': float,
        'alerting_burn_rate_threshold': int,
    }
    self.__set_fields(**config, **step, lambdas=field_converters)

    # Remaining attributes that are not plain copies of the inputs.
    self.window = int(step['measurement_window_seconds'])
    self.timestamp = int(timestamp)
    self.timestamp_human = utils.get_human_time(timestamp)

    # Query the backend; a falsy result means there is nothing to build.
    backend_result = self.run_backend(config, client=client, delete=delete)
    if not backend_result:
        return
    self.build(step, backend_result)
def make_measurement(slo_config, step, backend_result, timestamp):
    """Measure following metrics: SLI, SLO, Error Budget, Burn Rate.

    Args:
        slo_config (dict): SLO configuration.
        step (dict): Step config.
        backend_result (tuple or float): A tuple
            (good_event_count, bad_event_count) or the SLI value as a float.
        timestamp (int): UNIX timestamp.

    Returns:
        dict: Report dictionary, or None when the backend result cannot be
            used (an SLI value equal to 0, or no good and bad events).
    """
    slo_full_name = get_full_slo_name(slo_config)
    step_name = step['error_budget_policy_step_name']
    info = f"{slo_full_name :<25} | {step_name :<8}"
    LOGGER.debug(f"{info} | SLO report starting ...")

    # For some backends we are sending the SLI value directly, for others we're
    # sending a tuple (good_event_count, bad_event_count) and we'll compute the
    # SLI from there
    if not isinstance(backend_result, tuple):
        if backend_result == 0:
            LOGGER.error(f"{info} | Null SLI value.")
            return None
        good_event_count, bad_event_count = None, None
        sli = round(backend_result, 6)
    else:
        good_event_count, bad_event_count = backend_result
        if (good_event_count + bad_event_count) == 0:
            # `info` already carries the step name, so it is not repeated.
            LOGGER.error(f"{info} | No events found.")
            return None
        LOGGER.debug(f"{info} Good event count: {good_event_count}")
        LOGGER.debug(f"{info} Bad event count: {bad_event_count}")
        sli = round(good_event_count / (good_event_count + bad_event_count), 6)

    slo_target = float(slo_config['slo_target'])
    window = int(step['measurement_window_seconds'])
    alerting_burn_rate_threshold = int(step['alerting_burn_rate_threshold'])
    overburned_consequence_message = step['overburned_consequence_message']
    achieved_consequence_message = step['achieved_consequence_message']
    timestamp_human = utils.get_human_time(timestamp)

    # Compute SLI and gap between SLI / SLO target.
    gap = sli - slo_target

    # Compute Error Budget (target, current value, remaining minutes, available
    # minutes). The window is in seconds, hence the division by 60.
    error_budget_target = 1 - slo_target
    error_budget_value = 1 - sli
    error_budget_remaining_minutes = window * gap / 60
    error_minutes = window * error_budget_value / 60
    error_budget_minutes = window * error_budget_target / 60

    # Compute Error Budget Burn rate: the % of consumed error budget.
    # A target of 100 % leaves no budget to divide by; report 0 in that case.
    if error_budget_target == 0:
        error_budget_burn_rate = 0
    else:
        error_budget_burn_rate = round(
            error_budget_value / error_budget_target, 1)

    # Alert boolean on burn rate excessive speed.
    alert = error_budget_burn_rate > alerting_burn_rate_threshold

    # Set consequence message as derived from the Error Budget Policy file.
    if alert:
        consequence_message = overburned_consequence_message
    elif error_budget_burn_rate <= 1:
        consequence_message = achieved_consequence_message
    else:
        consequence_message = (
            'Missed for this measurement window, but not enough to alert')

    # Build out result
    result = {
        'service_name': slo_config['service_name'],
        'feature_name': slo_config['feature_name'],
        'slo_name': slo_config['slo_name'],
        'slo_target': slo_config['slo_target'],
        'slo_description': slo_config['slo_description'],
        'error_budget_policy_step_name': step_name,
        'error_budget_remaining_minutes': error_budget_remaining_minutes,
        'error_budget_minutes': error_budget_minutes,
        'error_minutes': error_minutes,
        'error_budget_target': error_budget_target,
        'timestamp_human': timestamp_human,
        'timestamp': timestamp,
        'consequence_message': consequence_message,
        'window': window,
        'bad_events_count': bad_event_count,
        'good_events_count': good_event_count,
        'sli_measurement': sli,
        'gap': gap,
        'error_budget_measurement': error_budget_value,
        'error_budget_burn_rate': error_budget_burn_rate,
        'alerting_burn_rate_threshold': alerting_burn_rate_threshold,
        'alert': alert
    }
    LOGGER.debug(pprint.pformat(result))

    sli_percent = round(sli * 100, 6)
    LOGGER.info(f"{info} | "
                f"SLI: {sli_percent} % | "
                f"Target: {slo_target * 100} % | "
                f"Burnrate: {error_budget_burn_rate :<2} | "
                f"Target burnrate: {alerting_burn_rate_threshold} | "
                f"Alert: {alert}")
    return result
def make_measurement(slo_config, step, good_event_count, bad_event_count,
                     timestamp):
    """Measure following metrics: SLI, SLO, Error Budget, Burn Rate.

    Args:
        slo_config (dict): SLO configuration.
        step (dict): Step config.
        good_event_count (int): Good events count.
        bad_event_count (int): Bad events count.
        timestamp (int): UNIX timestamp.

    Returns:
        dict: Report dictionary (an OrderedDict), or None when the good and
            bad event counts sum to 0.
    """
    step_name = step['error_budget_policy_step_name']
    LOGGER.info("Making SLO measurements for step '%s'", step_name)

    total_event_count = good_event_count + bad_event_count
    if total_event_count == 0:
        LOGGER.error("No valid events for %s/%s/%s/%s",
                     slo_config['service_name'],
                     slo_config['feature_name'],
                     slo_config['slo_name'],
                     step_name)
        return None

    LOGGER.debug("Good event count: %s", good_event_count)
    LOGGER.debug("Bad event count: %s", bad_event_count)

    slo_target = float(slo_config['slo_target'])
    window = int(step['measurement_window_seconds'])
    alerting_burn_rate_threshold = int(step['alerting_burn_rate_threshold'])
    overburned_consequence_message = step['overburned_consequence_message']
    achieved_consequence_message = step['achieved_consequence_message']
    timestamp_human = utils.get_human_time(timestamp)

    # Compute SLI and gap between SLI / SLO target.
    sli = good_event_count / total_event_count
    gap = sli - slo_target

    # Compute Error Budget (target, current value, remaining minutes, available
    # minutes). The window is in seconds, hence the division by 60.
    error_budget_target = 1 - slo_target
    error_budget_measurement = 1 - sli
    error_budget_remaining_minutes = window * gap / 60
    error_minutes = window * error_budget_measurement / 60
    error_budget_minutes = window * error_budget_target / 60

    # Compute Error Budget Burn rate: the % of consumed error budget.
    # A 100 % SLO target leaves a zero budget; dividing by it would raise
    # ZeroDivisionError, so the burn rate is reported as 0 in that case.
    if error_budget_target == 0:
        error_budget_burn_rate = 0
    else:
        error_budget_burn_rate = (
            error_budget_measurement / error_budget_target)

    # Alert boolean on burn rate excessive speed.
    alert = error_budget_burn_rate > alerting_burn_rate_threshold

    # Set consequence message as derived from the Error Budget Policy file.
    if alert:
        consequence_message = overburned_consequence_message
    elif error_budget_burn_rate <= 1:
        consequence_message = achieved_consequence_message
    else:
        consequence_message = (
            'Missed for this measurement window, but not enough to alert')

    # Build out result
    result = OrderedDict({
        'service_name': slo_config['service_name'],
        'feature_name': slo_config['feature_name'],
        'slo_name': slo_config['slo_name'],
        'slo_target': slo_config['slo_target'],
        'slo_description': slo_config['slo_description'],
        'error_budget_policy_step_name': step_name,
        'error_budget_remaining_minutes': error_budget_remaining_minutes,
        'error_budget_minutes': error_budget_minutes,
        'error_minutes': error_minutes,
        'error_budget_target': error_budget_target,
        'timestamp_human': timestamp_human,
        'timestamp': timestamp,
        'consequence_message': consequence_message,
        'window': window,
        'bad_events_count': bad_event_count,
        'good_events_count': good_event_count,
        'sli_measurement': sli,
        'gap': gap,
        'error_budget_measurement': error_budget_measurement,
        'error_budget_burn_rate': error_budget_burn_rate,
        'alerting_burn_rate_threshold': alerting_burn_rate_threshold,
        'alert': alert
    })
    LOGGER.debug(pprint.pformat(result))
    return result