示例#1
0
 def check_for_alert(cls, resource, *args, **kwargs):
     """Open an uptime alert for ``resource`` unless an active one exists.

     Reads ``kwargs["metric"]`` (required) and the optional
     ``kwargs["db_session"]``.  Active ``uptime_alert`` events are looked
     up for the resource; if one is found only a progress line is logged.
     Otherwise a new active event carrying the metric's ``status_code``,
     ``tries`` and ``response_time`` is added to the db session and its
     alerts are sent.
     """
     session = get_db_session(kwargs.get("db_session"))
     current_request = get_current_request()
     alert_name = "uptime_alert"
     metric = kwargs["metric"]

     active_alerts = EventService.for_resource(
         [resource.resource_id],
         event_type=Event.types[alert_name],
         status=Event.statuses["active"],
     )
     if active_alerts.first():
         # an alert is already open - nothing new to create
         log.info("ALERT: PROGRESS: %s %s" % (alert_name, resource))
         return

     log.warning("ALERT: OPEN: %s %s" % (alert_name, resource))
     copied_keys = ("status_code", "tries", "response_time")
     alert_values = {key: metric[key] for key in copied_keys}
     alert_event = Event(
         resource_id=resource.resource_id,
         event_type=Event.types[alert_name],
         status=Event.statuses["active"],
         values=alert_values,
     )
     session.add(alert_event)
     alert_event.send_alerts(request=current_request, resource=resource)
示例#2
0
    def check_for_groups_alert(cls, resource, event_type, *args, **kwargs):
        """Check for open alerts depending on group type.
        Create new one if nothing is found and send alerts.

        Occurrences of the supplied report groups are summed separately for
        error and slow reports and compared with the resource's
        ``error_report_threshold`` / ``slow_report_threshold``.  For every
        alert type whose count passes its threshold and that has no active
        event yet, a new active event is added to the db session and its
        alerts are sent.

        :param resource: object providing ``resource_id`` and both thresholds
        :param event_type: accepted for interface compatibility; it is not
            read - both ``error_report_alert`` and ``slow_report_alert``
            are always evaluated
        :param kwargs: requires ``report_groups`` (iterable of groups) and
            ``occurence_dict`` (group id -> occurrence count, missing ids
            count as 1); ``db_session`` is optional
        """
        db_session = get_db_session(kwargs.get("db_session"))
        request = get_current_request()
        report_groups = kwargs["report_groups"]
        occurence_dict = kwargs["occurence_dict"]

        error_reports = 0
        slow_reports = 0
        for group in report_groups:
            occurences = occurence_dict.get(group.id, 1)
            # resolve the report once per group instead of once per comparison
            report_type = group.get_report().report_type
            if report_type == ReportType.error:
                error_reports += occurences
            elif report_type == ReportType.slow:
                slow_reports += occurences

        log_msg = "LIMIT INFO: %s : %s error reports. %s slow_reports" % (
            resource,
            error_reports,
            slow_reports,
        )
        # NOTE(review): this goes through the ``logging`` module while the
        # alert lines below use ``log`` - kept as is, confirm it is intended
        logging.warning(log_msg)

        for alert_type in ["error_report_alert", "slow_report_alert"]:
            if alert_type == "error_report_alert":
                amount = error_reports
                threshold = resource.error_report_threshold
                # error alerts open as soon as the count reaches the threshold
                below_threshold = amount < threshold
            else:
                amount = slow_reports
                threshold = resource.slow_report_threshold
                # NOTE(review): slow alerts require the count to exceed the
                # threshold (<=) whereas error alerts use <; the asymmetry
                # is preserved - confirm it is intended
                below_threshold = amount <= threshold
            if below_threshold:
                continue

            event = EventService.for_resource(
                [resource.resource_id],
                event_type=Event.types[alert_type],
                status=Event.statuses["active"],
            )
            if event.first():
                log.info("ALERT: PROGRESS: %s %s" % (alert_type, resource))
            else:
                log.warning("ALERT: OPEN: %s %s" % (alert_type, resource))
                new_event = Event(
                    resource_id=resource.resource_id,
                    event_type=Event.types[alert_type],
                    status=Event.statuses["active"],
                    values={"reports": amount, "threshold": threshold},
                )
                db_session.add(new_event)
                new_event.send_alerts(request=request, resource=resource)
示例#3
0
def fetch_events(request):
    """
    Return the first page (at most 100 items) of events as dictionaries.

    The page is obtained from ``EventService.get_paginator`` for
    ``request.user``; pagination headers generated for that paginator are
    merged into the response headers before the items are serialized with
    ``get_dict()``.
    """
    paginator = EventService.get_paginator(
        user=request.user, page=1, items_per_page=100)
    pagination_headers = gen_pagination_headers(request, paginator)
    request.response.headers.update(pagination_headers)

    serialized = []
    for item in paginator.items:
        serialized.append(item.get_dict())
    return serialized
示例#4
0
def add_uptime_stats(params, metric):
    """Store one uptime measurement and open/close uptime alerts for it.

    The application is looked up by ``metric["resource_id"]``; when it is
    not found the function returns ``None`` without storing anything.
    Otherwise an ``UptimeMetric`` row is added and flushed, its document is
    passed to ``add_metrics_uptime`` and then:

    * if ``metric["is_ok"]`` is truthy, active uptime alerts of the
      application older than 6 minutes are closed;
    * otherwise ``UptimeMetricService.check_for_alert`` is invoked.

    :param params: mapping; only ``protocol_version`` is read (for logging)
    :param metric: mapping with ``resource_id``, ``timestamp``,
        ``response_time``, ``status_code``, ``is_ok``, ``tries`` and an
        optional ``location`` (defaults to 1)
    :returns: ``True`` on success, ``None`` when the application is missing
        (the result of the retry call is not returned either)

    Any exception is logged with ``print_traceback`` and handed to
    ``add_uptime_stats.retry`` (presumably this function is registered as
    a task by a decorator outside this view - verify).
    """
    proto_version = parse_proto(params.get("protocol_version"))
    try:
        application = ApplicationService.by_id_cached()(metric["resource_id"])
        # Bail out BEFORE merging: handing a missing application to
        # ``DBSession.merge`` would fail inside the ``try`` and trigger a
        # retry instead of the intended early return.
        if not application:
            return
        application = DBSession.merge(application, load=False)
        start_interval = convert_date(metric["timestamp"])
        # metrics are stored with minute resolution
        start_interval = start_interval.replace(second=0, microsecond=0)
        new_metric = UptimeMetric(
            start_interval=start_interval,
            response_time=metric["response_time"],
            status_code=metric["status_code"],
            is_ok=metric["is_ok"],
            location=metric.get("location", 1),
            tries=metric["tries"],
            resource_id=application.resource_id,
            owner_user_id=application.owner_user_id,
        )
        DBSession.add(new_metric)
        DBSession.flush()
        add_metrics_uptime([new_metric.es_doc()])
        if metric["is_ok"]:
            event_types = [Event.types["uptime_alert"]]
            statuses = [Event.statuses["active"]]
            # the check succeeded: close active alerts older than 6 minutes
            events = EventService.by_type_and_status(
                event_types,
                statuses,
                older_than=(datetime.utcnow() - timedelta(minutes=6)),
                app_ids=[application.resource_id],
            )
            for event in events:
                event.close()
        else:
            UptimeMetricService.check_for_alert(application, metric=metric)
        action = "METRICS UPTIME"
        metrics_msg = "%s: %s, proto:%s" % (action, str(application),
                                            proto_version)
        log.info(metrics_msg)
        session = DBSession()
        mark_changed(session)
        return True
    except Exception as exc:
        print_traceback(log)
        add_uptime_stats.retry(exc=exc)
示例#5
0
def event_PATCH(request):
    """
    Apply a JSON patch to a single event and return its dictionary form.

    The event is searched among resources the user holds the ``view``
    permission for; ``HTTPNotFound`` is returned when it is not found.
    Only the ``status`` key is accepted - whatever its value, the event is
    closed.  The first unknown key makes the view return
    ``HTTPBadRequest`` (keys processed before it have already been
    applied).
    """
    viewable = request.user.resources_with_perms(
        ['view'], resource_types=request.registry.resource_types)
    resource_ids = [res.resource_id for res in viewable]
    event = EventService.for_resource(
        resource_ids, event_id=request.matchdict['event_id']).first()
    if not event:
        return HTTPNotFound()

    allowed_keys = ('status',)
    for key, value in request.unsafe_json_body.items():
        if key not in allowed_keys:
            return HTTPBadRequest()
        if key == 'status':
            event.close()
            continue
        setattr(event, key, value)
    return event.get_dict()
示例#6
0
def close_alerts():
    """
    Re-validate (and possibly close) active report alerts.

    Active ``error_report_alert`` and ``slow_report_alert`` events older
    than five minutes are fetched and each one is asked to
    ``validate_or_close`` using a point one minute in the past.  Any
    exception is logged via ``print_traceback`` and re-raised.
    """
    log.warning("Checking alerts")
    now = datetime.utcnow()
    try:
        alert_names = ("error_report_alert", "slow_report_alert")
        wanted_types = [Event.types[name] for name in alert_names]
        wanted_statuses = [Event.statuses["active"]]
        # only alerts that have been open for more than 5 minutes
        cutoff = now - timedelta(minutes=5)
        candidates = EventService.by_type_and_status(
            wanted_types, wanted_statuses, older_than=cutoff)
        validation_start = now - timedelta(minutes=1)
        for alert in candidates:
            # the event itself decides whether it can be closed
            alert.validate_or_close(since_when=validation_start)
    except Exception:
        print_traceback(log)
        raise
示例#7
0
 def latest_events(self):
     """Return whatever ``EventService.latest_for_user`` yields for ``self``."""
     events = EventService.latest_for_user(self)
     return events
示例#8
0
def charts_data(request):
    """
    Handles charting from UI generated charts

    Two request paths exist:

    * a ``POST`` for ``data_test_config`` without a used UUID takes the
      chart config from the request body and validates it against the
      applications the user may view;
    * every other request uses the stored ``chart.config``.

    The config is translated into an Elasticsearch query and the parsed
    series are returned together with ``rect_regions``: for
    ``data_rule_config`` requests these are the time-histogram steps that
    match the posted rule, otherwise they come from events attached to
    the chart.

    Returns a dict for the client, ``{}`` when the stored config is empty,
    or ``HTTPNotFound`` when the posted config is empty.
    """
    # path for user testing out the chart
    ids_to_override = None
    chart = request.context.chart
    chart.migrate_json_config()
    req_type = request.matchdict["key"]
    if (request.method == "POST" and not request.context.used_uuid
            and req_type == "data_test_config"):
        chart_config = copy.deepcopy(request.unsafe_json_body)
        # for now just throw error in case something weird is found
        applications = UserService.resources_with_perms(
            request.user, ["view"], resource_types=["application"])

        # CRITICAL - this ensures our resultset is limited to only the ones
        # user has view permissions
        all_possible_app_ids = {app.resource_id for app in applications}

        schema = ChartConfigSchema().bind(resources=all_possible_app_ids)
        schema.deserialize(chart_config)
        filter_settings = build_filter_settings_from_chart_config(
            request, chart_config)
    else:
        # path for everyone else viewing the chart using UUID/public or not
        # ids_to_override will only work here because
        # initially it was validated
        # in dashboard_chart_save() request - so at this point its considered
        # valid
        chart_config = chart.config
        if not chart_config:
            return {}
        ids_to_override = [chart_config["resource"]]
        filter_settings = build_filter_settings_from_chart_config(
            request, chart_config, override_app_ids=ids_to_override)

    if not chart_config:
        return HTTPNotFound()

    # send chartype so client knows how to render the result
    chart_type = chart_config.get("chartType")
    # we always want to use the POST version of chart type for preview purposes
    # as priority
    if not chart_type:
        # the stored config may be empty while a preview is being posted
        chart_type = (chart.config or {}).get("chartType")

    es_config = transform_json_to_es_config(request,
                                            chart_config,
                                            filter_settings,
                                            ids_to_override=ids_to_override)

    query = es_config["query"]

    if not es_config["index_names"]:
        # nothing to query - answer with an empty result set
        return {
            "name": "",
            "chart_type": chart_type,
            "parent_agg": es_config["parent_agg"],
            "series": [],
            "system_labels": {},
            "groups": [],
            "rect_regions": [],
            "categories": [],
        }
    result = Datastores.es.search(body=query,
                                  index=es_config["index_names"],
                                  doc_type="log",
                                  size=0)
    series, info_dict = parse_es_result(result,
                                        es_config,
                                        json_config=chart_config)

    regions = []

    if req_type == "data_rule_config":
        json_body = copy.deepcopy(request.unsafe_json_body)
        rule_config = json_body.get("rule")
        field_mappings = json_body.get("mappings")
        rule_obj = RuleService.rule_from_config(rule_config, field_mappings,
                                                info_dict["system_labels"])

        parent_agg = chart_config.get("parentAgg")
        if parent_agg and parent_agg["type"] == "time_histogram":

            for step in series:
                if rule_obj.match(step):
                    iv = time_deltas[parent_agg["config"]["interval"]]
                    step_start = step["key"].replace(second=0, microsecond=0)
                    regions.append({
                        "start": step_start,
                        "end": step_start + iv["delta"],
                        "class": "rule1",
                    })
    else:
        events = EventService.for_resource([chart.resource_id],
                                           target_uuid=chart.uuid)
        for event in events:
            # Prefer the recorded end interval, then the event's end date,
            # and fall back to "now" for events that are still open.  The
            # end date is only touched when it is set, so an event having
            # an end interval but no end date no longer fails.
            end_interval = event.values.get("end_interval")
            if end_interval:
                step_end = end_interval
            elif event.end_date:
                step_end = event.end_date.replace(second=0, microsecond=0)
            else:
                step_end = datetime.utcnow().replace(second=0, microsecond=0)
            start_date = event.values["start_interval"]
            regions.append({
                "start": start_date,
                "end": step_end,
                "class": "rule1"
            })

    return {
        "name": chart.name,
        "chart_type": chart_type,
        "parent_agg": es_config["parent_agg"],
        "series": series,
        "system_labels": info_dict["system_labels"],
        "rect_regions": regions,
        "groups": [list(v) for v in info_dict["groups"].values()],
        "categories": info_dict["categories"],
    }
示例#9
0
def alerting_test(request):
    """
    Allows to test send data on various registered alerting channels

    Request params ``channel_name`` and ``channel_value`` select one of
    the user's alert channels and ``event_name`` selects which kind of
    sample notification is pushed through it (``error_report_alert``,
    ``slow_report_alert``, ``notify_reports``, ``notify_uptime``,
    ``chart_alert`` or ``daily_digest``).  Nothing is sent when no channel
    of the user matches the given name/value pair.

    Returns a dict with the user's ``alert_channels`` (as dicts) and a
    ``resource_id -> resource_name`` mapping of viewable ``applications``.
    """
    applications = UserService.resources_with_perms(
        request.user, ["view"], resource_types=["application"])
    # what we can select in total
    all_possible_app_ids = [app.resource_id for app in applications]
    # sample notifications are issued on behalf of the first application
    resource = applications[0]

    alert_channels = [
        user_channel.get_dict() for user_channel in request.user.alert_channels
    ]

    cname = request.params.get("channel_name")
    cvalue = request.params.get("channel_value")
    event_name = request.params.get("event_name")

    # Resolve the channel under test.  A pair that matches nothing yields
    # ``None`` so a test is never delivered through an unrelated channel.
    channel = None
    if cname and cvalue:
        channel = next(
            (candidate for candidate in request.user.alert_channels
             if candidate.channel_value == cvalue
             and candidate.channel_name == cname),
            None,
        )

    if channel is not None:
        if event_name in ["error_report_alert", "slow_report_alert"]:
            # opened
            new_event = Event(
                resource_id=resource.resource_id,
                event_type=Event.types[event_name],
                start_date=datetime.datetime.utcnow(),
                status=Event.statuses["active"],
                values={
                    "reports": 5,
                    "threshold": 10
                },
            )
            channel.notify_alert(resource=resource,
                                 event=new_event,
                                 user=request.user,
                                 request=request)

            # closed
            # NOTE(review): neither accepted event name contains "open", so
            # this replace() leaves the name unchanged - kept as is
            ev_type = Event.types[event_name.replace("open", "close")]
            new_event = Event(
                resource_id=resource.resource_id,
                event_type=ev_type,
                start_date=datetime.datetime.utcnow(),
                status=Event.statuses["closed"],
                values={
                    "reports": 5,
                    "threshold": 10
                },
            )
            channel.notify_alert(resource=resource,
                                 event=new_event,
                                 user=request.user,
                                 request=request)
        elif event_name == "notify_reports":
            report = (
                ReportGroupService.by_app_ids(all_possible_app_ids).filter(
                    ReportGroup.report_type == ReportType.error).first())
            # first a notification with two entries, then with a single one
            confirmed_reports = [(5, report), (1, report)]
            channel.notify_reports(
                resource=resource,
                user=request.user,
                request=request,
                since_when=datetime.datetime.utcnow(),
                reports=confirmed_reports,
            )
            confirmed_reports = [(5, report)]
            channel.notify_reports(
                resource=resource,
                user=request.user,
                request=request,
                since_when=datetime.datetime.utcnow(),
                reports=confirmed_reports,
            )
        elif event_name == "notify_uptime":
            new_event = Event(
                resource_id=resource.resource_id,
                event_type=Event.types["uptime_alert"],
                start_date=datetime.datetime.utcnow(),
                status=Event.statuses["active"],
                values={
                    "status_code": 500,
                    "tries": 2,
                    "response_time": 0
                },
            )
            channel.notify_uptime_alert(resource=resource,
                                        event=new_event,
                                        user=request.user,
                                        request=request)
        elif event_name == "chart_alert":
            event = EventService.by_type_and_status(
                event_types=(Event.types["chart_alert"], ),
                status_types=(Event.statuses["active"], ),
            ).first()
            # there may be no active chart alert available to replay
            if event is not None:
                channel.notify_chart_alert(resource=event.resource,
                                           event=event,
                                           user=request.user,
                                           request=request)
        elif event_name == "daily_digest":
            since_when = datetime.datetime.utcnow() - datetime.timedelta(
                hours=8)
            filter_settings = {
                "resource": [resource.resource_id],
                "tags": [{
                    "name": "type",
                    "value": ["error"],
                    "op": None
                }],
                "type": "error",
                "start_date": since_when,
            }

            reports = ReportGroupService.get_trending(
                request, filter_settings=filter_settings, limit=50)
            channel.send_digest(
                resource=resource,
                user=request.user,
                request=request,
                since_when=datetime.datetime.utcnow(),
                reports=reports,
            )

    return {
        "alert_channels": alert_channels,
        "applications": {
            app.resource_id: app.resource_name
            for app in applications.all()
        },
    }
示例#10
0
def alerting_test(request):
    """
    Allows to test send data on various registered alerting channels

    Request params ``channel_name`` and ``channel_value`` select one of
    the user's alert channels and ``event_name`` selects which kind of
    sample notification is pushed through it (``error_report_alert``,
    ``slow_report_alert``, ``notify_reports``, ``notify_uptime``,
    ``chart_alert`` or ``daily_digest``).  Nothing is sent when no channel
    of the user matches the given name/value pair.

    Returns a dict with the user's ``alert_channels`` (as dicts) and a
    ``resource_id -> resource_name`` mapping of viewable ``applications``.
    """
    applications = request.user.resources_with_perms(
        ['view'], resource_types=['application'])
    # what we can select in total
    all_possible_app_ids = [app.resource_id for app in applications]
    # sample notifications are issued on behalf of the first application
    resource = applications[0]

    alert_channels = [
        user_channel.get_dict() for user_channel in request.user.alert_channels
    ]

    cname = request.params.get('channel_name')
    cvalue = request.params.get('channel_value')
    event_name = request.params.get('event_name')

    # Resolve the channel under test.  A pair that matches nothing yields
    # ``None`` so a test is never delivered through an unrelated channel.
    channel = None
    if cname and cvalue:
        channel = next(
            (candidate for candidate in request.user.alert_channels
             if candidate.channel_value == cvalue
             and candidate.channel_name == cname),
            None)

    if channel is not None:
        if event_name in ['error_report_alert', 'slow_report_alert']:
            # opened
            new_event = Event(resource_id=resource.resource_id,
                              event_type=Event.types[event_name],
                              start_date=datetime.datetime.utcnow(),
                              status=Event.statuses['active'],
                              values={
                                  'reports': 5,
                                  'threshold': 10
                              })
            channel.notify_alert(resource=resource,
                                 event=new_event,
                                 user=request.user,
                                 request=request)

            # closed
            # NOTE(review): neither accepted event name contains 'open', so
            # this replace() leaves the name unchanged - kept as is
            ev_type = Event.types[event_name.replace('open', 'close')]
            new_event = Event(resource_id=resource.resource_id,
                              event_type=ev_type,
                              start_date=datetime.datetime.utcnow(),
                              status=Event.statuses['closed'],
                              values={
                                  'reports': 5,
                                  'threshold': 10
                              })
            channel.notify_alert(resource=resource,
                                 event=new_event,
                                 user=request.user,
                                 request=request)
        elif event_name == 'notify_reports':
            report = ReportGroupService.by_app_ids(all_possible_app_ids) \
                .filter(ReportGroup.report_type == ReportType.error).first()
            # first a notification with two entries, then with a single one
            confirmed_reports = [(5, report), (1, report)]
            channel.notify_reports(resource=resource,
                                   user=request.user,
                                   request=request,
                                   since_when=datetime.datetime.utcnow(),
                                   reports=confirmed_reports)
            confirmed_reports = [(5, report)]
            channel.notify_reports(resource=resource,
                                   user=request.user,
                                   request=request,
                                   since_when=datetime.datetime.utcnow(),
                                   reports=confirmed_reports)
        elif event_name == 'notify_uptime':
            new_event = Event(resource_id=resource.resource_id,
                              event_type=Event.types['uptime_alert'],
                              start_date=datetime.datetime.utcnow(),
                              status=Event.statuses['active'],
                              values={
                                  "status_code": 500,
                                  "tries": 2,
                                  "response_time": 0
                              })
            channel.notify_uptime_alert(resource=resource,
                                        event=new_event,
                                        user=request.user,
                                        request=request)
        elif event_name == 'chart_alert':
            event = EventService.by_type_and_status(
                event_types=(Event.types['chart_alert'], ),
                status_types=(Event.statuses['active'], )).first()
            # there may be no active chart alert available to replay
            if event is not None:
                channel.notify_chart_alert(resource=event.resource,
                                           event=event,
                                           user=request.user,
                                           request=request)
        elif event_name == 'daily_digest':
            since_when = datetime.datetime.utcnow() - datetime.timedelta(
                hours=8)
            filter_settings = {
                'resource': [resource.resource_id],
                'tags': [{
                    'name': 'type',
                    'value': ['error'],
                    'op': None
                }],
                'type': 'error',
                'start_date': since_when
            }

            reports = ReportGroupService.get_trending(
                request, filter_settings=filter_settings, limit=50)
            channel.send_digest(resource=resource,
                                user=request.user,
                                request=request,
                                since_when=datetime.datetime.utcnow(),
                                reports=reports)

    return {
        'alert_channels': alert_channels,
        'applications': {
            app.resource_id: app.resource_name
            for app in applications.all()
        }
    }
示例#11
0
def alert_chart(pkey, chart_uuid):
    """Evaluate a chart alert rule and open or close its event.

    The alert action (``pkey``) and chart (``chart_uuid``) are loaded, the
    chart's config is turned into an Elasticsearch query and the resulting
    series is walked from its last step backwards until a step matches
    the action's rule.  Steps visited before the match did not match; the
    last such step visited is remembered as the "finished" interval.

    * open event + match: progress is logged; if a finished interval
      exists its key is stored as ``end_interval`` and the event is closed
    * open event + no match: ``end_interval`` is stored when a finished
      interval exists and the event is closed
    * no open event + match: a new active event is created, flushed and
      its alerts are sent

    Returns ``None``; returns early (without the timing log line) when
    the query config holds no index names.
    """
    started_at = datetime.utcnow()
    request = get_current_request()
    alert_action = AlertChannelActionService.by_pkey(pkey)
    chart = DashboardChartService.by_uuid(chart_uuid)
    chart.migrate_json_config()
    resource = chart.dashboard
    chart_config = chart.config
    app_ids = [chart_config["resource"]]
    filter_settings = build_filter_settings_from_chart_config(
        request, chart_config, override_app_ids=app_ids
    )

    log.warning("alert_chart, resource:{}, chart:{}".format(resource, chart_uuid))

    # fill in dataset boundaries that the filter settings left empty
    boundary_start, boundary_end = determine_date_boundries_json(chart_config)
    if not filter_settings["start_date"]:
        midnight = boundary_start.replace(
            hour=0, minute=0, second=0, microsecond=0
        )
        filter_settings["start_date"] = midnight
    if not filter_settings["end_date"]:
        filter_settings["end_date"] = boundary_end

    event_type = Event.types["chart_alert"]
    recent_events = EventService.for_resource(
        [resource.resource_id], event_type=event_type, target_uuid=chart_uuid, limit=20
    )

    # remember the first active and the first closed event the query yields
    open_event = None
    latest_closed_event = None
    for candidate in recent_events:
        if candidate.status == Event.statuses["active"] and not open_event:
            open_event = candidate
        if candidate.status == Event.statuses["closed"] and not latest_closed_event:
            latest_closed_event = candidate

    if latest_closed_event:
        # only look at data newer than the previously closed alert
        filter_settings["start_date"] = latest_closed_event.end_date

    es_config = transform_json_to_es_config(
        request, chart_config, filter_settings, ids_to_override=app_ids
    )
    if not es_config["index_names"]:
        return

    result = Datastores.es.search(
        body=es_config["query"], index=es_config["index_names"], doc_type="log", size=0
    )
    series, info_dict = parse_es_result(result, es_config, json_config=chart_config)

    # we need to make a deepcopy since we will mutate it
    rule_config = copy.deepcopy(alert_action.rule)
    rule_obj = RuleService.rule_from_config(
        rule_config, alert_action.config, info_dict["system_labels"]
    )

    matched_interval = None
    finished_interval = None
    for step in reversed(series):
        if not rule_obj.match(step):
            finished_interval = step
            continue
        log.info("matched start")
        matched_interval = step
        break

    if open_event:
        if matched_interval:
            log.info("ALERT: PROGRESS: %s %s" % (event_type, resource))
        if finished_interval:
            # re-assign a copy before changing it
            open_event.values = copy.deepcopy(open_event.values)
            end_interval = finished_interval["key"].strftime(DATE_FORMAT)
            open_event.values["end_interval"] = end_interval
        # a still-matching rule keeps the event open unless it already ended
        if finished_interval or not matched_interval:
            open_event.close()
    elif matched_interval:
        log.warning("ALERT: OPEN: %s %s" % (event_type, resource))
        parent_agg = chart_config.get("parentAgg")
        step_size = None
        if parent_agg and parent_agg["type"] == "time_histogram":
            interval_name = parent_agg["config"]["interval"]
            step_size = time_deltas[interval_name]["delta"].total_seconds()
        step_snapshot = {
            "values": matched_interval,
            "labels": info_dict["system_labels"],
        }
        event_values = {
            "matched_rule": alert_action.get_dict(),
            "matched_step_values": step_snapshot,
            "start_interval": matched_interval["key"],
            "end_interval": None,
            "resource": chart.config.get("resource"),
            "chart_name": chart.name,
            "chart_uuid": chart_uuid,
            "step_size": step_size,
            "action_name": alert_action.name,
        }
        alert_event = Event(
            resource_id=resource.resource_id,
            event_type=event_type,
            status=Event.statuses["active"],
            values=event_values,
            target_uuid=chart_uuid,
        )
        DBSession.add(alert_event)
        DBSession.flush()
        alert_event.send_alerts(request=request, resource=resource)

    took = datetime.utcnow() - started_at
    log.warning("chart alert rule check took: {}".format(took))