def check_for_alert(cls, resource, *args, **kwargs):
    """ Check for open uptime alerts. Create new one if nothing is found
    and send alerts """
    db_session = get_db_session(kwargs.get("db_session"))
    request = get_current_request()
    event_type = "uptime_alert"
    metric = kwargs["metric"]
    event = EventService.for_resource(
        [resource.resource_id],
        event_type=Event.types[event_type],
        status=Event.statuses["active"],
    )
    if event.first():
        log.info("ALERT: PROGRESS: %s %s" % (event_type, resource))
    else:
        log.warning("ALERT: OPEN: %s %s" % (event_type, resource))
        event_values = {
            "status_code": metric["status_code"],
            "tries": metric["tries"],
            "response_time": metric["response_time"],
        }
        new_event = Event(
            resource_id=resource.resource_id,
            event_type=Event.types[event_type],
            status=Event.statuses["active"],
            values=event_values,
        )
        db_session.add(new_event)
        new_event.send_alerts(request=request, resource=resource)
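
# Hedged usage sketch (not part of the original code): the keys of the `metric`
# kwarg that check_for_alert reads when opening a new uptime event, as called
# from add_uptime_stats further below. The values are illustrative assumptions.
example_uptime_metric = {
    "status_code": 500,   # HTTP status returned by the failed check
    "tries": 3,           # number of attempts the uptime checker made
    "response_time": 0,   # seconds; zero here since the check failed
}
# UptimeMetricService.check_for_alert(application, metric=example_uptime_metric)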
def check_for_groups_alert(cls, resource, event_type, *args, **kwargs):
    """ Check for open alerts depending on group type.
    Create new one if nothing is found and send alerts """
    db_session = get_db_session(kwargs.get("db_session"))
    request = get_current_request()
    report_groups = kwargs["report_groups"]
    occurence_dict = kwargs["occurence_dict"]

    error_reports = 0
    slow_reports = 0
    for group in report_groups:
        occurences = occurence_dict.get(group.id, 1)
        if group.get_report().report_type == ReportType.error:
            error_reports += occurences
        elif group.get_report().report_type == ReportType.slow:
            slow_reports += occurences

    log_msg = "LIMIT INFO: %s : %s error reports. %s slow_reports" % (
        resource,
        error_reports,
        slow_reports,
    )
    logging.warning(log_msg)
    threshold = 10
    for event_type in ["error_report_alert", "slow_report_alert"]:
        if (
            error_reports < resource.error_report_threshold
            and event_type == "error_report_alert"
        ):
            continue
        elif (
            slow_reports <= resource.slow_report_threshold
            and event_type == "slow_report_alert"
        ):
            continue
        if event_type == "error_report_alert":
            amount = error_reports
            threshold = resource.error_report_threshold
        elif event_type == "slow_report_alert":
            amount = slow_reports
            threshold = resource.slow_report_threshold

        event = EventService.for_resource(
            [resource.resource_id],
            event_type=Event.types[event_type],
            status=Event.statuses["active"],
        )
        if event.first():
            log.info("ALERT: PROGRESS: %s %s" % (event_type, resource))
        else:
            log.warning("ALERT: OPEN: %s %s" % (event_type, resource))
            new_event = Event(
                resource_id=resource.resource_id,
                event_type=Event.types[event_type],
                status=Event.statuses["active"],
                values={"reports": amount, "threshold": threshold},
            )
            db_session.add(new_event)
            new_event.send_alerts(request=request, resource=resource)
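
# Hedged sketch (assumption: the service class that owns check_for_groups_alert
# is not shown here). It illustrates the kwargs the method consumes:
# `report_groups` is an iterable of report groups touched by the current batch
# and `occurence_dict` maps group.id to the number of new occurrences, e.g.
#   example_kwargs = {
#       "report_groups": [group],            # ReportGroup-like objects
#       "occurence_dict": {group.id: 5},     # 5 new occurrences for this group
#   }
#   SomeReportService.check_for_groups_alert(resource, "error_report_alert",
#                                            **example_kwargs)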
def fetch_events(request):
    """ Returns a paginated list of events for the current user """
    event_paginator = EventService.get_paginator(
        user=request.user, page=1, items_per_page=100
    )
    headers = gen_pagination_headers(request, event_paginator)
    request.response.headers.update(headers)

    return [ev.get_dict() for ev in event_paginator.items]
def add_uptime_stats(params, metric):
    """ Stores a single uptime check result and opens or closes uptime alerts """
    proto_version = parse_proto(params.get("protocol_version"))
    try:
        application = ApplicationService.by_id_cached()(metric["resource_id"])
        application = DBSession.merge(application, load=False)
        if not application:
            return
        start_interval = convert_date(metric["timestamp"])
        start_interval = start_interval.replace(second=0, microsecond=0)
        new_metric = UptimeMetric(
            start_interval=start_interval,
            response_time=metric["response_time"],
            status_code=metric["status_code"],
            is_ok=metric["is_ok"],
            location=metric.get("location", 1),
            tries=metric["tries"],
            resource_id=application.resource_id,
            owner_user_id=application.owner_user_id,
        )
        DBSession.add(new_metric)
        DBSession.flush()
        add_metrics_uptime([new_metric.es_doc()])
        if metric["is_ok"]:
            event_types = [Event.types["uptime_alert"]]
            statuses = [Event.statuses["active"]]
            # close active uptime alerts opened more than 6 minutes ago
            events = EventService.by_type_and_status(
                event_types,
                statuses,
                older_than=(datetime.utcnow() - timedelta(minutes=6)),
                app_ids=[application.resource_id],
            )
            for event in events:
                event.close()
        else:
            UptimeMetricService.check_for_alert(application, metric=metric)
        action = "METRICS UPTIME"
        metrics_msg = "%s: %s, proto:%s" % (action, str(application), proto_version)
        log.info(metrics_msg)
        session = DBSession()
        mark_changed(session)
        return True
    except Exception as exc:
        print_traceback(log)
        add_uptime_stats.retry(exc=exc)
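
# Hedged example (illustrative, not from the original source): a minimal
# `metric` payload for add_uptime_stats showing the keys it reads. The exact
# timestamp format accepted by convert_date() is an assumption.
example_metric = {
    "resource_id": 1,
    "timestamp": "2023-01-01T10:00:00",
    "response_time": 0.25,
    "status_code": 200,
    "is_ok": True,
    "location": 1,
    "tries": 1,
}
# add_uptime_stats({"protocol_version": "0.5"}, example_metric)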
def event_PATCH(request):
    resources = request.user.resources_with_perms(
        ['view'], resource_types=request.registry.resource_types)
    event = EventService.for_resource(
        [r.resource_id for r in resources],
        event_id=request.matchdict['event_id']).first()
    if not event:
        return HTTPNotFound()
    allowed_keys = ['status']
    for k, v in request.unsafe_json_body.items():
        if k in allowed_keys:
            if k == 'status':
                event.close()
            else:
                setattr(event, k, v)
        else:
            return HTTPBadRequest()
    return event.get_dict()
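
# Hedged usage note: the only JSON key this PATCH view accepts is "status";
# sending it closes the event (the value itself is not inspected), e.g. an
# illustrative request body:
#   PATCH /events/123   {"status": "closed"}
# Any other key results in HTTPBadRequest.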
def close_alerts():
    log.warning("Checking alerts")
    since_when = datetime.utcnow()
    try:
        event_types = [
            Event.types["error_report_alert"],
            Event.types["slow_report_alert"],
        ]
        statuses = [Event.statuses["active"]]
        # get events older than 5 min
        events = EventService.by_type_and_status(
            event_types, statuses, older_than=(since_when - timedelta(minutes=5))
        )
        for event in events:
            # see if we can close them
            event.validate_or_close(since_when=(since_when - timedelta(minutes=1)))
    except Exception as exc:
        print_traceback(log)
        raise
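
# Hedged note (assumption about scheduling): close_alerts reads like a periodic
# job. It only considers report alerts that have been active for more than
# 5 minutes and lets each event decide, via validate_or_close(), whether reports
# were still arriving during the last minute before closing it.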
def latest_events(self):
    return EventService.latest_for_user(self)
def charts_data(request):
    """ Handles charting from UI generated charts """
    # path for user testing out the chart
    ids_to_override = None
    chart = request.context.chart
    chart.migrate_json_config()
    req_type = request.matchdict["key"]
    if (
        request.method == "POST"
        and not request.context.used_uuid
        and req_type == "data_test_config"
    ):
        chart_config = copy.deepcopy(request.unsafe_json_body)
        # for now just throw error in case something weird is found
        applications = UserService.resources_with_perms(
            request.user, ["view"], resource_types=["application"]
        )
        # CRITICAL - this ensures our resultset is limited to only the ones
        # user has view permissions
        all_possible_app_ids = set([app.resource_id for app in applications])
        schema = ChartConfigSchema().bind(resources=all_possible_app_ids)
        schema.deserialize(chart_config)
        filter_settings = build_filter_settings_from_chart_config(
            request, chart_config
        )
    else:
        # path for everyone else viewing the chart using UUID/public or not
        # ids_to_override will only work here because initially it was
        # validated in the dashboard_chart_save() request - so at this point
        # it's considered valid
        chart_config = chart.config
        if not chart_config:
            return {}
        ids_to_override = [chart_config["resource"]]
        filter_settings = build_filter_settings_from_chart_config(
            request, chart_config, override_app_ids=ids_to_override
        )

    if not chart_config:
        return HTTPNotFound()

    # send chart type so client knows how to render the result
    # we always want to use the POST version of chart type for preview purposes
    # as priority
    chart_type = chart_config.get("chartType")
    if not chart_type:
        chart_type = chart.config.get("chartType")

    es_config = transform_json_to_es_config(
        request, chart_config, filter_settings, ids_to_override=ids_to_override
    )
    query = es_config["query"]
    if not es_config["index_names"]:
        return {
            "name": "",
            "chart_type": chart_type,
            "parent_agg": es_config["parent_agg"],
            "series": [],
            "system_labels": {},
            "groups": [],
            "rect_regions": [],
            "categories": [],
        }

    result = Datastores.es.search(
        body=query, index=es_config["index_names"], doc_type="log", size=0
    )
    series, info_dict = parse_es_result(result, es_config, json_config=chart_config)

    regions = []
    if req_type == "data_rule_config":
        json_body = copy.deepcopy(request.unsafe_json_body)
        rule_config = json_body.get("rule")
        field_mappings = json_body.get("mappings")
        rule_obj = RuleService.rule_from_config(
            rule_config, field_mappings, info_dict["system_labels"]
        )
        parent_agg = chart_config.get("parentAgg")
        if parent_agg and parent_agg["type"] == "time_histogram":
            for step in series:
                if rule_obj.match(step):
                    iv = time_deltas[parent_agg["config"]["interval"]]
                    step_start = step["key"].replace(second=0, microsecond=0)
                    regions.append(
                        {
                            "start": step_start,
                            "end": step_start + iv["delta"],
                            "class": "rule1",
                        }
                    )
    else:
        events = EventService.for_resource(
            [chart.resource_id], target_uuid=chart.uuid
        )
        for event in events:
            if event.end_date or event.values.get("end_interval"):
                end_date = event.end_date.replace(second=0, microsecond=0)
                step_end = event.values.get("end_interval") or end_date
            else:
                step_end = datetime.utcnow().replace(second=0, microsecond=0)
            start_date = event.values["start_interval"]
            regions.append(
                {"start": start_date, "end": step_end, "class": "rule1"}
            )

    return {
        "name": chart.name,
        "chart_type": chart_type,
        "parent_agg": es_config["parent_agg"],
        "series": series,
        "system_labels": info_dict["system_labels"],
        "rect_regions": regions,
        "groups": [list(v) for v in info_dict["groups"].values()],
        "categories": info_dict["categories"],
    }
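
# Hedged sketch (assumption): the parts of a chart config that charts_data()
# actually reads. The full structure is validated by ChartConfigSchema and the
# interval value is an illustrative assumption.
example_chart_config = {
    "resource": 1,           # application resource_id the user can view
    "chartType": "bar",      # echoed back so the client knows how to render
    "parentAgg": {
        "type": "time_histogram",
        "config": {"interval": "1h"},   # key into time_deltas for region width
    },
}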
def alerting_test(request):
    """ Allows sending test data to the various registered alerting channels """
    applications = UserService.resources_with_perms(
        request.user, ["view"], resource_types=["application"]
    )
    # what we can select in total
    all_possible_app_ids = [app.resource_id for app in applications]
    resource = applications[0]

    alert_channels = []
    for channel in request.user.alert_channels:
        alert_channels.append(channel.get_dict())

    cname = request.params.get("channel_name")
    cvalue = request.params.get("channel_value")
    event_name = request.params.get("event_name")
    if cname and cvalue:
        for channel in request.user.alert_channels:
            if channel.channel_value == cvalue and channel.channel_name == cname:
                break

        if event_name in ["error_report_alert", "slow_report_alert"]:
            # opened
            new_event = Event(
                resource_id=resource.resource_id,
                event_type=Event.types[event_name],
                start_date=datetime.datetime.utcnow(),
                status=Event.statuses["active"],
                values={"reports": 5, "threshold": 10},
            )
            channel.notify_alert(
                resource=resource, event=new_event, user=request.user, request=request
            )
            # closed
            ev_type = Event.types[event_name.replace("open", "close")]
            new_event = Event(
                resource_id=resource.resource_id,
                event_type=ev_type,
                start_date=datetime.datetime.utcnow(),
                status=Event.statuses["closed"],
                values={"reports": 5, "threshold": 10},
            )
            channel.notify_alert(
                resource=resource, event=new_event, user=request.user, request=request
            )
        elif event_name == "notify_reports":
            report = (
                ReportGroupService.by_app_ids(all_possible_app_ids)
                .filter(ReportGroup.report_type == ReportType.error)
                .first()
            )
            confirmed_reports = [(5, report), (1, report)]
            channel.notify_reports(
                resource=resource,
                user=request.user,
                request=request,
                since_when=datetime.datetime.utcnow(),
                reports=confirmed_reports,
            )
            confirmed_reports = [(5, report)]
            channel.notify_reports(
                resource=resource,
                user=request.user,
                request=request,
                since_when=datetime.datetime.utcnow(),
                reports=confirmed_reports,
            )
        elif event_name == "notify_uptime":
            new_event = Event(
                resource_id=resource.resource_id,
                event_type=Event.types["uptime_alert"],
                start_date=datetime.datetime.utcnow(),
                status=Event.statuses["active"],
                values={"status_code": 500, "tries": 2, "response_time": 0},
            )
            channel.notify_uptime_alert(
                resource=resource, event=new_event, user=request.user, request=request
            )
        elif event_name == "chart_alert":
            event = EventService.by_type_and_status(
                event_types=(Event.types["chart_alert"],),
                status_types=(Event.statuses["active"],),
            ).first()
            channel.notify_chart_alert(
                resource=event.resource, event=event, user=request.user, request=request
            )
        elif event_name == "daily_digest":
            since_when = datetime.datetime.utcnow() - datetime.timedelta(hours=8)
            filter_settings = {
                "resource": [resource.resource_id],
                "tags": [{"name": "type", "value": ["error"], "op": None}],
                "type": "error",
                "start_date": since_when,
            }

            reports = ReportGroupService.get_trending(
                request, filter_settings=filter_settings, limit=50
            )
            channel.send_digest(
                resource=resource,
                user=request.user,
                request=request,
                since_when=datetime.datetime.utcnow(),
                reports=reports,
            )

    return {
        "alert_channels": alert_channels,
        "applications": dict(
            [(app.resource_id, app.resource_name) for app in applications.all()]
        ),
    }
def alerting_test(request):
    """ Allows sending test data to the various registered alerting channels """
    applications = request.user.resources_with_perms(
        ['view'], resource_types=['application'])
    # what we can select in total
    all_possible_app_ids = [app.resource_id for app in applications]
    resource = applications[0]

    alert_channels = []
    for channel in request.user.alert_channels:
        alert_channels.append(channel.get_dict())

    cname = request.params.get('channel_name')
    cvalue = request.params.get('channel_value')
    event_name = request.params.get('event_name')
    if cname and cvalue:
        for channel in request.user.alert_channels:
            if (channel.channel_value == cvalue
                    and channel.channel_name == cname):
                break

        if event_name in ['error_report_alert', 'slow_report_alert']:
            # opened
            new_event = Event(resource_id=resource.resource_id,
                              event_type=Event.types[event_name],
                              start_date=datetime.datetime.utcnow(),
                              status=Event.statuses['active'],
                              values={'reports': 5,
                                      'threshold': 10})
            channel.notify_alert(resource=resource,
                                 event=new_event,
                                 user=request.user,
                                 request=request)
            # closed
            ev_type = Event.types[event_name.replace('open', 'close')]
            new_event = Event(resource_id=resource.resource_id,
                              event_type=ev_type,
                              start_date=datetime.datetime.utcnow(),
                              status=Event.statuses['closed'],
                              values={'reports': 5,
                                      'threshold': 10})
            channel.notify_alert(resource=resource,
                                 event=new_event,
                                 user=request.user,
                                 request=request)
        elif event_name == 'notify_reports':
            report = ReportGroupService.by_app_ids(all_possible_app_ids) \
                .filter(ReportGroup.report_type == ReportType.error).first()
            confirmed_reports = [(5, report), (1, report)]
            channel.notify_reports(resource=resource,
                                   user=request.user,
                                   request=request,
                                   since_when=datetime.datetime.utcnow(),
                                   reports=confirmed_reports)
            confirmed_reports = [(5, report)]
            channel.notify_reports(resource=resource,
                                   user=request.user,
                                   request=request,
                                   since_when=datetime.datetime.utcnow(),
                                   reports=confirmed_reports)
        elif event_name == 'notify_uptime':
            new_event = Event(resource_id=resource.resource_id,
                              event_type=Event.types['uptime_alert'],
                              start_date=datetime.datetime.utcnow(),
                              status=Event.statuses['active'],
                              values={"status_code": 500,
                                      "tries": 2,
                                      "response_time": 0})
            channel.notify_uptime_alert(resource=resource,
                                        event=new_event,
                                        user=request.user,
                                        request=request)
        elif event_name == 'chart_alert':
            event = EventService.by_type_and_status(
                event_types=(Event.types['chart_alert'],),
                status_types=(Event.statuses['active'],)).first()
            channel.notify_chart_alert(resource=event.resource,
                                       event=event,
                                       user=request.user,
                                       request=request)
        elif event_name == 'daily_digest':
            since_when = datetime.datetime.utcnow() - datetime.timedelta(
                hours=8)
            filter_settings = {
                'resource': [resource.resource_id],
                'tags': [{'name': 'type',
                          'value': ['error'],
                          'op': None}],
                'type': 'error',
                'start_date': since_when
            }

            reports = ReportGroupService.get_trending(
                request, filter_settings=filter_settings, limit=50)
            channel.send_digest(resource=resource,
                                user=request.user,
                                request=request,
                                since_when=datetime.datetime.utcnow(),
                                reports=reports)

    return {
        'alert_channels': alert_channels,
        'applications': dict([(app.resource_id, app.resource_name)
                              for app in applications.all()])
    }
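
# Hedged usage sketch: alerting_test is driven by query parameters, e.g.
# (illustrative values)
#   ?channel_name=email&channel_value=user@example.com&event_name=error_report_alert
# channel_name/channel_value select which of the user's alert channels to test
# and event_name picks which kind of test notification is sent.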
def alert_chart(pkey, chart_uuid):
    """ Evaluates an alert action rule against chart data and opens or closes
    chart_alert events """
    start = datetime.utcnow()
    request = get_current_request()
    alert_action = AlertChannelActionService.by_pkey(pkey)
    chart = DashboardChartService.by_uuid(chart_uuid)
    chart.migrate_json_config()
    resource = chart.dashboard
    json_body = chart.config
    ids_to_override = [json_body["resource"]]
    filter_settings = build_filter_settings_from_chart_config(
        request, json_body, override_app_ids=ids_to_override
    )

    log.warning("alert_chart, resource:{}, chart:{}".format(resource, chart_uuid))

    # determine start and end date for dataset
    start_date, end_date = determine_date_boundries_json(json_body)
    if not filter_settings["start_date"]:
        filter_settings["start_date"] = start_date.replace(
            hour=0, minute=0, second=0, microsecond=0
        )
    if not filter_settings["end_date"]:
        filter_settings["end_date"] = end_date

    event_type = Event.types["chart_alert"]
    open_event = None
    latest_closed_event = None
    events_query = EventService.for_resource(
        [resource.resource_id],
        event_type=event_type,
        target_uuid=chart_uuid,
        limit=20,
    )
    for event in events_query:
        if event.status == Event.statuses["active"] and not open_event:
            open_event = event
        if event.status == Event.statuses["closed"] and not latest_closed_event:
            latest_closed_event = event

    if latest_closed_event:
        filter_settings["start_date"] = latest_closed_event.end_date

    es_config = transform_json_to_es_config(
        request, json_body, filter_settings, ids_to_override=ids_to_override
    )
    if not es_config["index_names"]:
        return

    result = Datastores.es.search(
        body=es_config["query"],
        index=es_config["index_names"],
        doc_type="log",
        size=0,
    )
    series, info_dict = parse_es_result(result, es_config, json_config=json_body)

    # we need to make a deepcopy since we will mutate it
    rule_config = copy.deepcopy(alert_action.rule)
    field_mappings = alert_action.config
    rule_obj = RuleService.rule_from_config(
        rule_config, field_mappings, info_dict["system_labels"]
    )

    matched_interval = None
    finished_interval = None
    for step in reversed(series):
        if rule_obj.match(step):
            log.info("matched start")
            if not matched_interval:
                matched_interval = step
                break
        else:
            finished_interval = step

    if matched_interval:
        if open_event:
            log.info("ALERT: PROGRESS: %s %s" % (event_type, resource))
            if finished_interval:
                open_event.values = copy.deepcopy(open_event.values)
                end_interval = finished_interval["key"].strftime(DATE_FORMAT)
                open_event.values["end_interval"] = end_interval
                open_event.close()
        else:
            log.warning("ALERT: OPEN: %s %s" % (event_type, resource))
            step_size = None
            parent_agg = json_body.get("parentAgg")
            if parent_agg and parent_agg["type"] == "time_histogram":
                step_size = time_deltas[parent_agg["config"]["interval"]][
                    "delta"
                ].total_seconds()
            matched_step_values = {
                "values": matched_interval,
                "labels": info_dict["system_labels"],
            }
            values_dict = {
                "matched_rule": alert_action.get_dict(),
                "matched_step_values": matched_step_values,
                "start_interval": step["key"],
                "end_interval": None,
                "resource": chart.config.get("resource"),
                "chart_name": chart.name,
                "chart_uuid": chart_uuid,
                "step_size": step_size,
                "action_name": alert_action.name,
            }
            new_event = Event(
                resource_id=resource.resource_id,
                event_type=event_type,
                status=Event.statuses["active"],
                values=values_dict,
                target_uuid=chart_uuid,
            )
            DBSession.add(new_event)
            DBSession.flush()
            new_event.send_alerts(request=request, resource=resource)
    elif open_event:
        if finished_interval:
            open_event.values = copy.deepcopy(open_event.values)
            end_interval = finished_interval["key"].strftime(DATE_FORMAT)
            open_event.values["end_interval"] = end_interval
        open_event.close()

    took = datetime.utcnow() - start
    log.warning("chart alert rule check took: {}".format(took))
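
# Hedged usage note (assumption): alert_chart appears to be invoked once per
# alert action / chart pair, e.g.
#   alert_chart(alert_action.pkey, chart.uuid)
# It opens a chart_alert event when the configured rule matches the most recent
# interval and closes it once a newer interval no longer matches.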