def main():
    """Check per-exercise bug report rates and alert Slack when elevated.

    Reads persisted per-exercise error counts from the "exercise_reports"
    state file (creating filler state on first run), fetches new error
    reports, and for each known exercise computes the probability that the
    current report rate is abnormally elevated.  Highly elevated exercises
    are announced in #support on Slack.  Updated counts are written back
    to the same state file.
    """
    try:
        exercise_file = open(util.relative_path("exercise_reports"), 'r+')
        ex_reports = json.loads(exercise_file.read())
    except IOError:
        # First run (or missing state file): start from filler state.
        exercise_file = open(util.relative_path("exercise_reports"), 'w')
        ex_reports = {"elapsed_time": 1,  # Filler value
                      "max_id": -1,
                      "last_time": 0}

    # Ensure the file handle is closed even if fetching or analyzing the
    # new reports raises -- previously an exception here leaked the fd.
    try:
        new_reports = get_errors(copy.deepcopy(ex_reports))
        period_len = new_reports["time_this_period"]

        for ex in new_reports:
            if ex in SPECIAL_VALUES:
                continue
            if ex in ex_reports and ex_reports[ex]["num_errors"] > 0:
                errors_this_period = new_reports[ex]["this_period"]
                mean, probability = util.probability(
                    ex_reports[ex]["num_errors"],
                    ex_reports["elapsed_time"],
                    errors_this_period,
                    period_len)
                print ("%s] TOTAL %s/%ss; %s-: %s/%ss; m=%.3f p=%.3f"
                       % (time.strftime("%Y-%m-%d %H:%M:%S %Z"),
                          ex_reports[ex]["num_errors"],
                          ex_reports["elapsed_time"],
                          ex_reports["last_time"],
                          errors_this_period,
                          period_len,
                          mean, probability))
                if (probability > 0.997 and errors_this_period > 1):
                    # Too many errors!
                    util.send_to_slack(
                        "*Elevated exercise bug report rate in exercise"
                        " `%s`\n"
                        "Reports: %s. We saw %s in the last %s minutes,"
                        " while the mean indicates we should see around %s."
                        " *Probability that this is abnormally elevated:"
                        " %.4f.*"
                        % (ex,
                           generate_slack_links(new_reports[ex]["href"]),
                           util.thousand_commas(errors_this_period),
                           util.thousand_commas(int(period_len / 60)),
                           util.thousand_commas(round(mean, 2)),
                           probability),
                        channel="#support")
            # `in` on the dict directly; no need for .keys()
            if "href" in new_reports[ex]:
                # don't need to keep the links around
                del new_reports[ex]["href"]

        del new_reports["time_this_period"]

        # Overwrite with new contents
        exercise_file.seek(0)
        exercise_file.truncate()
        exercise_file.write(json.dumps(new_reports))
    finally:
        exercise_file.close()
def main():
    """Alert #support on Slack about abnormally high exercise bug rates.

    Prior per-exercise counts live in the "exercise_reports" file; when
    that file is absent we bootstrap with filler state.  New reports are
    merged in, each exercise's elevation probability is logged, and the
    refreshed counts are persisted back to the same file.
    """
    try:
        report_file = open(util.relative_path("exercise_reports"), 'r+')
        prior = json.loads(report_file.read())
    except IOError:
        report_file = open(util.relative_path("exercise_reports"), 'w')
        prior = {
            "elapsed_time": 1,  # Filler value
            "max_id": -1,
            "last_time": 0,
        }

    current = get_errors(copy.deepcopy(prior))
    period_len = current["time_this_period"]

    for name in current:
        if name in SPECIAL_VALUES:
            continue
        if name in prior and prior[name]["num_errors"] > 0:
            seen_now = current[name]["this_period"]
            mean, probability = util.probability(
                prior[name]["num_errors"], prior["elapsed_time"],
                seen_now, period_len)
            print("%s] TOTAL %s/%ss; %s-: %s/%ss; m=%.3f p=%.3f"
                  % (time.strftime("%Y-%m-%d %H:%M:%S %Z"),
                     prior[name]["num_errors"], prior["elapsed_time"],
                     prior["last_time"], seen_now, period_len,
                     mean, probability))
            if probability > 0.997 and seen_now > 1:
                util.send_to_slack(
                    "*Elevated exercise bug report rate in exercise `%s`\n"
                    "Reports: %s. We saw %s in the last %s minutes,"
                    " while the mean indicates we should see around %s."
                    " *Probability that this is abnormally elevated: %.4f.*"
                    % (name,
                       generate_slack_links(current[name]["href"]),
                       util.thousand_commas(seen_now),
                       util.thousand_commas(int(period_len / 60)),
                       util.thousand_commas(round(mean, 2)),
                       probability),
                    channel="#support")
        if "href" in current[name].keys():
            # don't need to keep the links around
            del current[name]["href"]

    del current["time_this_period"]

    # Overwrite with new contents
    report_file.seek(0)
    report_file.truncate()
    report_file.write(json.dumps(current))
    report_file.close()
def main():
    """Send a HipChat alert when an exercise's bug report rate spikes.

    Loads prior counts from the "exercise_reports" file (filler state on
    first run), pulls fresh error reports, and messages the "Exercises"
    HipChat room for any exercise whose rate is abnormally elevated.
    The merged counts are then written back out.
    """
    try:
        state_file = open(util.relative_path("exercise_reports"), 'r+')
        old_reports = json.loads(state_file.read())
    except IOError:
        state_file = open(util.relative_path("exercise_reports"), 'w')
        old_reports = {"elapsed_time": 1,  # Filler value
                       "max_id": -1,
                       "last_time": 0}

    fresh = get_errors(copy.deepcopy(old_reports))
    period_len = fresh["time_this_period"]

    for exercise in fresh:
        if exercise in SPECIAL_VALUES:
            continue
        if exercise in old_reports and old_reports[exercise]["num_errors"] > 0:
            recent = fresh[exercise]["this_period"]
            mean, probability = util.probability(
                old_reports[exercise]["num_errors"],
                old_reports["elapsed_time"], recent, period_len)
            if probability > 0.997 and recent > 1:
                # Too many errors!
                hipchat_message.send_message(
                    "Elevated exercise bug report rate in exercise %s!"
                    " Reports: %s. We saw %s in the last %s minutes,"
                    " while the mean indicates we should see around %s."
                    " Probability that this is abnormally elevated: %.4f."
                    % (exercise,
                       generate_links(fresh[exercise]["href"]),
                       util.thousand_commas(recent),
                       util.thousand_commas(int(period_len / 60)),
                       util.thousand_commas(round(mean, 2)),
                       probability),
                    room_id="Exercises")
        if "href" in fresh[exercise].keys():
            # don't need to keep the links around
            del fresh[exercise]["href"]

    del fresh["time_this_period"]

    # Overwrite with new contents
    state_file.seek(0)
    state_file.truncate()
    state_file.write(json.dumps(fresh))
    state_file.close()
def handle_alerts(num_new_tickets, time_this_period, mean, probability,
                  start_time, end_time):
    """Determine which alerts to send at various thresholds.

    If probability of elevated ticket count is high, a notification is
    sent to Slack and Alerta.  A Pagerduty alert is only sent out if a
    significantly elevated rate is detected.
    """
    # TODO(jacqueline): Including SIGNIFICANT_TICKET_COUNT hard
    # threshold here so as to catch false positives, especially during
    # transition.  Maybe consider removing this once change in mean
    # starts flattening out; August 2017?
    url = 'https://khanacademy.zendesk.com/agent/filters/37051364'
    message = (
        "We saw %s in the last %s minutes,"
        " while the mean indicates we should see around %s."
        " *Probability that this is abnormally elevated: %.4f.*"
        % (util.thousand_commas(num_new_tickets),
           util.thousand_commas(int(time_this_period / 60)),
           util.thousand_commas(round(mean, 2)),
           probability))

    rate_is_elevated = (mean != 0 and probability > 0.999 and
                        num_new_tickets >= SIGNIFICANT_TICKET_COUNT)
    if rate_is_elevated:
        # Too many errors!  Point people to the 'all tickets' filter.
        message = ("*Elevated bug report rate on <%s|Zendesk>*\n" % url
                   + message)
        util.send_to_slack(message, channel='#1s-and-0s')
        util.send_to_slack(message, channel='#user-issues')
        util.send_to_alerta(message, severity=logging.ERROR)

        # Before we start texting people, make sure we've hit higher
        # threshold.
        # TODO(benkraft/jacqueline): Potentially could base this off more
        # historical data from analogous dow/time datapoints, but doesn't
        # look like Zendesk API has a good way of doing this, running
        # into request quota issues.  Readdress this option if threshold
        # is too noisy.
        if probability > 0.9995:
            util.send_to_pagerduty(message, service='beep-boop')
    else:
        # If ticket rate is normal, still send alert to alerta to resolve
        # any prior existing alerts.
        message = ("Normal bug report rate on <%s|Zendesk>\n" % url
                   + message)
        util.send_to_alerta(message, severity=logging.INFO)
def main():
    """Check the Google Code issue report rate and alert HipChat if high.

    Loads the previous issue count and elapsed time from the
    "google_code" state file (bootstrapping with filler state on first
    run), fetches new issues, and messages HipChat when the probability
    of an abnormally elevated report rate exceeds 0.99.  The refreshed
    counts are written back to the same file.
    """
    try:
        google_code_file = open(util.relative_path("google_code"), 'r+')
        old_reports = json.loads(google_code_file.read())
    except IOError:
        google_code_file = open(util.relative_path("google_code"), 'w')
        # elapsed_time is filler value: doesn't matter what it is
        # since issue_count is 0.
        old_reports = {"elapsed_time": 1,
                       "last_id": -1,
                       "issue_count": 0,
                       "last_time": 0}

    # Close the handle even if fetching/analyzing raises -- previously an
    # exception below this point leaked the file descriptor.
    try:
        new_reports = get_errors(copy.deepcopy(old_reports))
        time_this_period = new_reports["time_this_period"]

        mean, probability = util.probability(
            old_reports["issue_count"],
            old_reports["elapsed_time"],
            new_reports["issues_this_period"],
            time_this_period)

        if mean != 0 and probability > 0.99:
            # Too many errors!
            hipchat_message.send_message(
                "Elevated bug report rate on"
                " <a href='http://khanacademy.org/r/bugs'>Google"
                " code!</a>"
                " We saw %s in the last %s minutes,"
                " while the mean indicates we should see around %s."
                " Probability that this is abnormally elevated: %.4f."
                % (util.thousand_commas(new_reports["issues_this_period"]),
                   util.thousand_commas(int(time_this_period / 60)),
                   util.thousand_commas(round(mean, 2)),
                   probability))

        # Delete fields we don't need anymore.  (`del x` statement form,
        # rather than the function-call-looking `del(x)`.)
        del new_reports["issues_this_period"]
        del new_reports["time_this_period"]

        google_code_file.seek(0)
        google_code_file.truncate()
        google_code_file.write(json.dumps(new_reports))
    finally:
        google_code_file.close()
def handle_alerts(new_tickets, time_this_period, mean, probability,
                  start_time, end_time):
    """Determine which alerts to send at various thresholds.

    If probability of elevated ticket count is high, a notification is
    sent to Slack and Alerta.  A Pagerduty alert is only sent out if a
    significantly elevated rate is detected.
    """
    # TODO(jacqueline): Including SIGNIFICANT_TICKET_COUNT hard
    # threshold here so as to catch false positives, especially during
    # transition.  Maybe consider removing this once change in mean
    # starts flattening out; August 2017?
    num_new_tickets = len(new_tickets)
    message = (
        "We saw %s in the last %s minutes,"
        " while the mean indicates we should see around %s."
        " *Probability that this is abnormally elevated: %.4f.*"
        % (util.thousand_commas(num_new_tickets),
           util.thousand_commas(int(time_this_period / 60)),
           util.thousand_commas(round(mean, 2)),
           probability))

    rate_is_elevated = (mean != 0 and probability > 0.999 and
                        num_new_tickets >= SIGNIFICANT_TICKET_COUNT)
    if rate_is_elevated:
        # Too many errors!  Point people to the slack channel.
        message = ("Elevated Zendesk report rate (#zendesk-technical)\n"
                   + message)

        # Generated a list of tickets that we will send to Slack along
        # with the original message
        entries = []
        for ticket in new_tickets:
            opened_at = datetime.datetime.fromtimestamp(
                _parse_time(ticket['created_at']))
            entries.append("\n*[%s][Ticket #%d]:* %s" % (
                opened_at.strftime("%I:%M %p"),
                ticket['id'],
                # Strip any non-safe characters from the subject line
                re.sub(r"[^\w\-\.'%&:,\[\]/\\\(\)\" ]", '',
                       ticket['subject'])))
        ticket_list = ''.join(entries)

        util.send_to_slack(message + ticket_list, channel='#1s-and-0s')
        util.send_to_slack(message + ticket_list, channel='#user-issues')
        util.send_to_alerta(message, severity=logging.ERROR)

        # Before we start texting people, make sure we've hit higher
        # threshold.
        # TODO(benkraft/jacqueline): Potentially could base this off more
        # historical data from analogous dow/time datapoints, but doesn't
        # look like Zendesk API has a good way of doing this, running
        # into request quota issues.  Readdress this option if threshold
        # is too noisy.
        if (probability > 0.9995 and
                num_new_tickets >= MIN_TICKET_COUNT_TO_PAGE_SOMEONE):
            util.send_to_pagerduty(message, service='beep-boop')
    else:
        # If ticket rate is normal, still send alert to alerta to resolve
        # any prior existing alerts.
        message = ("Normal Zendesk report rate (#zendesk-technical)\n"
                   + message)
        util.send_to_alerta(message, severity=logging.INFO,
                            mark_resolved=True)
def main():
    """Compare recent per-exercise JIRA ticket rates against history.

    Loads pickled per-exercise ticket counts and elapsed times from the
    'jira' state file, fetches tickets filed since the last run
    (bootstrapping from the last 100 days on the first run), logs each
    exercise's elevation probability, and pings #content-beep-boop on
    Slack for any exercise whose report rate is abnormally elevated.
    The merged counts are pickled back to the same state file.
    """
    try:
        jira_status_file = util.relative_path('jira')
        # The state file is a pickle, not text: open in binary mode.
        # (Text mode corrupts pickles on platforms with newline
        # translation, and binary is required for protocols > 0.)
        with open(jira_status_file, 'rb') as f:
            old_data = cPickle.load(f)
    except IOError:
        old_data = {'elapsed_times': {},
                    'ticket_counts': collections.defaultdict(int),
                    'last_time_t': None,
                    }

    # We compare the number of tickets in the last few minutes against
    # the historical average for all time.  But we don't start "all
    # time" at AD 1, we start it 100 days ago.
    # Note: this is a way wider window than we use for Zendesk, but we're
    # making exercise-specific recommendations, so we need more data.
    now = int(time.time())
    num_days_in_past = 100
    (num_new_tickets, oldest_ticket_time_t) = num_tickets_between(
        old_data['last_time_t'] or (now - 86400 * num_days_in_past), now)

    # Elapsed time is computed per-exercise, so store values as we go.
    # We use a copy so that exercises that don't appear as new tickets
    # still have their old elapsed times preserved.
    elapsed_times = copy.copy(old_data['elapsed_times'])
    for exercise in num_new_tickets:
        # If this is the first time we're running, we don't have a
        # last_time_t, so we take the oldest ticket for each exercise as
        # its last_time_t
        last_time_t = (old_data['last_time_t']
                       or oldest_ticket_time_t[exercise])
        time_this_period = now - last_time_t

        # Avoid divide-by-0 if this is the first time we've seen an
        # exercise
        time_last_period = old_data['elapsed_times'].get(exercise, 0.0001)

        num_old_tickets_for_exercise = old_data['ticket_counts'][exercise]
        num_new_tickets_for_exercise = num_new_tickets[exercise]
        (mean, probability) = util.probability(
            num_old_tickets_for_exercise, time_last_period,
            num_new_tickets_for_exercise, time_this_period)

        print('%s] %s TOTAL %s/%ss; %s-: %s/%ss; m=%.3f p=%.3f'
              % (time.strftime('%Y-%m-%d %H:%M:%S %Z'),
                 exercise,
                 num_old_tickets_for_exercise, int(time_last_period),
                 last_time_t, num_new_tickets_for_exercise,
                 time_this_period, mean, probability))

        if (mean != 0 and probability > 0.9995 and
                num_new_tickets_for_exercise > THRESHOLD):
            quoted = urllib.quote(exercise.encode("utf-8"))
            ka_url = "https://khanacademy.org/e/%s" % quoted
            jira_url = ("https://khanacademy.atlassian.net/browse/"
                        "AI-941528?jql=Exercise%%20%%3D%%20%s" % quoted)
            util.send_to_slack(
                "*Elevated bug report rate on exercise `%s`*\n"
                "We saw %s in the last %s minutes,"
                " while the mean indicates we should see around %s."
                " *Probability that this is abnormally elevated: %.4f.*\n"
                " Links: <%s|exercise on Khan Academy>, <%s|JIRA tickets>."
                % (exercise,
                   util.thousand_commas(num_new_tickets_for_exercise),
                   util.thousand_commas(int(time_this_period / 60)),
                   util.thousand_commas(round(mean, 2)),
                   probability, ka_url, jira_url),
                channel='#content-beep-boop')

        elapsed_times[exercise] = time_last_period + time_this_period

    new_ticket_counts = util.merge_int_dicts(old_data['ticket_counts'],
                                             num_new_tickets)
    new_data = {'elapsed_times': elapsed_times,
                'ticket_counts': new_ticket_counts,
                'last_time_t': now,
                }
    # Binary mode to match the binary read above.
    with open(jira_status_file, 'wb') as f:
        cPickle.dump(new_data, f)