def test(self):
    """Bulk incident stats must match the per-incident helpers.

    A closed and an open incident are created, then every entry produced by
    ``bulk_get_incident_stats`` is compared with the results of
    ``get_incident_event_stats`` and ``get_incident_aggregates`` for the
    incident at the same position.
    """

    def make_incident(title):
        # Both incidents share every argument except the title.
        return create_incident(
            self.organization,
            IncidentType.CREATED,
            title,
            "",
            groups=[self.group],
            date_started=timezone.now() - timedelta(days=30),
        )

    closed_incident = make_incident("Closed")
    update_incident_status(closed_incident, IncidentStatus.CLOSED)
    open_incident = make_incident("Open")

    incidents = [closed_incident, open_incident]
    bulk_results = bulk_get_incident_stats(incidents)

    for incident, bulk_entry in zip(incidents, bulk_results):
        single_stats = get_incident_event_stats(incident)
        bulk_stats = bulk_entry["event_stats"]
        assert bulk_stats.data["data"] == single_stats.data["data"]
        assert bulk_stats.start == single_stats.start
        assert bulk_stats.end == single_stats.end
        assert bulk_stats.rollup == single_stats.rollup

        single_aggregates = get_incident_aggregates(incident)
        assert bulk_entry["total_events"] == single_aggregates["count"]
        assert bulk_entry["unique_users"] == single_aggregates["unique_users"]
def trigger_alert_threshold(self):
    """
    Count one more consecutive breach of the alert threshold.

    Called when a subscription update exceeds the value defined in
    `alert_rule.alert_threshold` and there is not already an active incident.
    Once the running count reaches `alert_rule.threshold_period` an incident
    is created, stored on `self.active_incident`, and the count is reset.

    :return: None
    """
    self.alert_triggers += 1
    if self.alert_triggers < self.alert_rule.threshold_period:
        # Not enough consecutive breaches yet; keep counting.
        return

    started_at = to_datetime(self.last_update)
    rule = self.alert_rule
    subscription = self.subscription
    self.active_incident = create_incident(
        rule.organization,
        IncidentType.ALERT_TRIGGERED,
        # TODO: Include more info in name?
        rule.name,
        alert_rule=rule,
        # TODO: Incidents need to keep track of which metric to display
        query=subscription.query,
        # The incident's start and detection time are both the update time.
        date_started=started_at,
        date_detected=started_at,
        projects=[subscription.project],
    )
    # TODO: We should create an audit log, and maybe something that keeps
    # all of the details available for showing on the incident. Might be a
    # json blob or w/e? Or might be able to use the audit log.

    # The incident has fired, so the consecutive-breach count starts over.
    self.alert_triggers = 0
def test_simple(self):
    """Create an incident and verify its fields and link rows.

    The incident is given one project explicitly and two groups, one of
    which belongs to a second project. Two IncidentGroup rows and two
    IncidentProject rows are expected afterwards.
    """
    status = IncidentStatus.CREATED
    title = 'hello'
    query = 'goodbye'
    date_started = timezone.now()
    other_project = self.create_project()
    other_group = self.create_group(project=other_project)
    linked_groups = [self.group, other_group]

    incident = create_incident(
        self.organization,
        status=status,
        title=title,
        query=query,
        date_started=date_started,
        projects=[self.project],
        groups=linked_groups,
    )

    assert incident.identifier == 1
    assert incident.status == status.value
    assert incident.title == title
    assert incident.query == query
    assert incident.date_started == date_started
    # No date_detected was passed; it is expected to equal date_started.
    assert incident.date_detected == date_started

    group_links = IncidentGroup.objects.filter(
        incident=incident,
        group__in=linked_groups,
    )
    assert group_links.count() == 2

    project_links = IncidentProject.objects.filter(
        incident=incident,
        project__in=[self.project, other_project],
    )
    assert project_links.count() == 2
def test_incidents_list(self):
    """Snapshot the incident list page and the details page of one incident.

    An alert rule and an incident attached to it are created first. The
    browser then loads ``self.path``, snapshots the list, follows the link
    to the incident and snapshots the details view.
    """
    alert_rule = create_alert_rule(
        self.organization,
        [self.project],
        "hello",
        "level:error",
        "count()",
        10,
        1,
    )
    incident = create_incident(
        self.organization,
        type_=IncidentType.DETECTED,
        title="Incident #1",
        query="hello",
        aggregation=QueryAggregations.TOTAL,
        date_started=timezone.now(),
        date_detected=timezone.now(),
        projects=[self.project],
        groups=[self.group],
        alert_rule=alert_rule,
    )

    browser = self.browser
    with self.feature(FEATURE_NAME):
        # List view: wait for loading to finish and the sparkline to render.
        browser.get(self.path)
        browser.wait_until_not(".loading-indicator")
        browser.wait_until_test_id("incident-sparkline")
        browser.snapshot("incidents - list")

        # Selector for the link that leads to this incident's details page.
        details_url = u'[href="/organizations/{}/alerts/{}/'.format(
            self.organization.slug, incident.identifier)
        browser.wait_until(details_url)
        browser.click(details_url)

        # Details view: wait until title is present and placeholders are gone.
        browser.wait_until_not(".loading-indicator")
        browser.wait_until_test_id("incident-title")
        browser.wait_until_not('[data-test-id="loading-placeholder"]')
        browser.snapshot("incidents - details")
def test_simple(self):
    """Create an incident of type CREATED and verify what was persisted.

    Checks the incident's own fields, the IncidentGroup and IncidentProject
    link rows, and that exactly one CREATED activity carrying an event stats
    snapshot was recorded.
    """
    incident_type = IncidentType.CREATED
    title = 'hello'
    query = 'goodbye'
    date_started = timezone.now()
    other_project = self.create_project()
    other_group = self.create_group(project=other_project)
    incident = create_incident(
        self.organization,
        type=incident_type,
        title=title,
        query=query,
        date_started=date_started,
        projects=[self.project],
        groups=[self.group, other_group],
    )
    assert incident.identifier == 1
    # The value passed as `type` is checked against the incident's type.
    # Comparing it with `status` would only pass when the two enums happen
    # to share an integer value.
    assert incident.type == incident_type.value
    assert incident.title == title
    assert incident.query == query
    assert incident.date_started == date_started
    # No date_detected was passed; it is expected to equal date_started.
    assert incident.date_detected == date_started
    assert IncidentGroup.objects.filter(
        incident=incident,
        group__in=[self.group, other_group],
    ).count() == 2
    # Only self.project is passed explicitly; the second link is presumably
    # derived from other_group's project.
    assert IncidentProject.objects.filter(
        incident=incident,
        project__in=[self.project, other_project],
    ).count() == 2
    assert IncidentActivity.objects.filter(
        incident=incident,
        type=IncidentActivityType.CREATED.value,
        event_stats_snapshot__isnull=False,
    ).count() == 1
def post(self, request, organization):
    """Create an incident for ``organization`` from the request payload.

    Responds with 404 when the ``organizations:incidents`` feature is not
    enabled for the requesting user, 400 with the serializer errors when the
    payload is invalid, and 201 with the serialized incident on success.

    :raises PermissionDenied: if the user lacks access to any project that
        is listed directly or that owns one of the listed groups.
    """
    if not features.has("organizations:incidents", organization, actor=request.user):
        return self.respond(status=404)

    serializer = IncidentSerializer(
        data=request.data,
        context={"organization": organization},
    )
    if not serializer.is_valid():
        return Response(serializer.errors, status=400)

    result = serializer.validated_data
    groups = result["groups"]
    # Access is required both for the listed projects and for the projects
    # that own the listed groups.
    all_projects = set(result["projects"]) | {g.project for g in groups}
    # Test the access result itself so the check never depends on the
    # truthiness of a project object.
    if any(not request.access.has_project_access(p) for p in all_projects):
        raise PermissionDenied

    incident = create_incident(
        organization=organization,
        type=IncidentType.CREATED,
        title=result["title"],
        query=result.get("query", ""),
        aggregation=result["aggregation"],
        date_started=result.get("dateStarted"),
        date_detected=result.get("dateDetected"),
        projects=result["projects"],
        groups=groups,
        user=request.user,
    )
    return Response(serialize(incident, request.user), status=201)
def test_simple(self):
    """Verify that create_incident stores the fields and relations it is given.

    Two groups are attached, the second living in a freshly created project,
    while only ``self.project`` is passed as a project. Both group links and
    both project links are expected to exist afterwards.
    """
    status = IncidentStatus.CREATED
    title = 'hello'
    query = 'goodbye'
    date_started = timezone.now()
    other_project = self.create_project()
    other_group = self.create_group(project=other_project)

    create_kwargs = dict(
        status=status,
        title=title,
        query=query,
        date_started=date_started,
        projects=[self.project],
        groups=[self.group, other_group],
    )
    incident = create_incident(self.organization, **create_kwargs)

    assert incident.identifier == 1
    assert incident.status == status.value
    assert incident.title == title
    assert incident.query == query
    assert incident.date_started == date_started
    # date_detected was not supplied and is expected to match date_started.
    assert incident.date_detected == date_started

    group_link_count = IncidentGroup.objects.filter(
        incident=incident,
        group__in=[self.group, other_group],
    ).count()
    assert group_link_count == 2

    project_link_count = IncidentProject.objects.filter(
        incident=incident,
        project__in=[self.project, other_project],
    ).count()
    assert project_link_count == 2
def post(self, request, organization):
    """Create an incident for ``organization`` from the request payload.

    Responds with 404 when the ``organizations:incidents`` feature is not
    enabled for the requesting user, 400 with the serializer errors when the
    payload is invalid, and 201 with the serialized incident on success.
    ``dateDetected`` falls back to ``dateStarted`` when it is absent.

    :raises PermissionDenied: if the user lacks access to any project that
        is listed directly or that owns one of the listed groups.
    """
    if not features.has('organizations:incidents', organization, actor=request.user):
        return self.respond(status=404)

    serializer = IncidentSerializer(
        data=request.data,
        context={'organization': organization},
    )
    if not serializer.is_valid():
        return Response(serializer.errors, status=400)

    result = serializer.validated_data
    groups = result['groups']
    # Access is required both for the listed projects and for the projects
    # that own the listed groups.
    all_projects = set(result['projects']) | {g.project for g in groups}
    # Test the access result itself so the check never depends on the
    # truthiness of a project object.
    if any(not request.access.has_project_access(p) for p in all_projects):
        raise PermissionDenied

    incident = create_incident(
        organization=organization,
        type=IncidentType.CREATED,
        title=result['title'],
        query=result.get('query', ''),
        date_started=result['dateStarted'],
        date_detected=result.get('dateDetected', result['dateStarted']),
        projects=result['projects'],
        groups=groups,
        user=request.user,
    )
    return Response(serialize(incident, request.user), status=201)
def post(self, request, organization):
    """Create an incident for ``organization`` from the request payload.

    Responds with 404 when the ``organizations:incidents`` feature is not
    enabled for the requesting user, 400 with the serializer errors when the
    payload is invalid, and 201 with the serialized incident on success.
    ``dateDetected`` falls back to ``dateStarted`` when it is absent.

    :raises PermissionDenied: if the user lacks access to any project that
        is listed directly or that owns one of the listed groups.
    """
    if not features.has('organizations:incidents', organization, actor=request.user):
        return self.respond(status=404)

    serializer = IncidentSerializer(
        data=request.DATA,
        context={'organization': organization},
    )
    if not serializer.is_valid():
        return Response(serializer.errors, status=400)

    result = serializer.object
    groups = result['groups']
    # Access is required both for the listed projects and for the projects
    # that own the listed groups.
    all_projects = set(result['projects']) | {g.project for g in groups}
    # Test the access result itself so the check never depends on the
    # truthiness of a project object.
    if any(not request.access.has_project_access(p) for p in all_projects):
        raise PermissionDenied

    incident = create_incident(
        organization=organization,
        status=IncidentStatus.CREATED,
        title=result['title'],
        query=result.get('query', ''),
        date_started=result['dateStarted'],
        date_detected=result.get('dateDetected', result['dateStarted']),
        projects=result['projects'],
        groups=groups,
    )
    return Response(serialize(incident, request.user), status=201)
def test_simple(self):
    """Create an incident tied to an alert rule and verify every side effect.

    Covers the incident's own fields, the IncidentGroup and IncidentProject
    link rows, the CREATED activity with its event stats snapshot, the single
    IncidentCreatedEvent passed to ``record_event``, and the scheduling of
    ``calculate_incident_suspects``.
    """
    incident_type = IncidentType.CREATED
    title = "hello"
    query = "goodbye"
    date_started = timezone.now()
    other_project = self.create_project()
    other_group = self.create_group(project=other_project)
    # Discard calls recorded so far so the call count below is exact.
    self.record_event.reset_mock()
    alert_rule = create_alert_rule(
        self.organization,
        [self.project],
        "hello",
        AlertRuleThresholdType.ABOVE,
        "level:error",
        QueryAggregations.TOTAL,
        10,
        1000,
        400,
        1,
    )
    incident = create_incident(
        self.organization,
        type=incident_type,
        title=title,
        query=query,
        date_started=date_started,
        projects=[self.project],
        groups=[self.group, other_group],
        alert_rule=alert_rule,
    )
    assert incident.identifier == 1
    # The value passed as `type` is checked against the incident's type.
    # Comparing it with `status` would only pass when the two enums happen
    # to share an integer value.
    assert incident.type == incident_type.value
    assert incident.title == title
    assert incident.query == query
    assert incident.date_started == date_started
    # No date_detected was passed; it is expected to equal date_started.
    assert incident.date_detected == date_started
    assert incident.alert_rule == alert_rule
    assert (
        IncidentGroup.objects.filter(
            incident=incident, group__in=[self.group, other_group]
        ).count()
        == 2
    )
    # Only self.project is passed explicitly; the second link is presumably
    # derived from other_group's project.
    assert (
        IncidentProject.objects.filter(
            incident=incident, project__in=[self.project, other_project]
        ).count()
        == 2
    )
    assert (
        IncidentActivity.objects.filter(
            incident=incident,
            type=IncidentActivityType.CREATED.value,
            event_stats_snapshot__isnull=False,
        ).count()
        == 1
    )
    assert len(self.record_event.call_args_list) == 1
    # First positional argument of the most recent record_event call.
    event = self.record_event.call_args[0][0]
    assert isinstance(event, IncidentCreatedEvent)
    assert event.data == {
        "organization_id": six.text_type(self.organization.id),
        "incident_id": six.text_type(incident.id),
        "incident_type": six.text_type(IncidentType.CREATED.value),
    }
    self.calculate_incident_suspects.apply_async.assert_called_once_with(
        kwargs={"incident_id": incident.id}
    )
def trigger_alert_threshold(self, trigger):
    """
    Count one more consecutive breach for `trigger` and fire it when due.

    Called when a subscription update exceeds the value defined in
    `trigger.alert_threshold` and the trigger hasn't already been activated.
    Once the running count for this trigger reaches
    `alert_rule.threshold_period`, the trigger is marked active, an incident
    is created if none is active, and the count is reset.

    :param trigger: the alert rule trigger whose threshold was exceeded
    :return: None
    """
    trigger_id = trigger.id
    self.trigger_alert_counts[trigger_id] += 1
    if self.trigger_alert_counts[trigger_id] < self.alert_rule.threshold_period:
        # Not enough consecutive breaches yet; keep counting.
        return

    metrics.incr("incidents.alert_rules.trigger", tags={"type": "fire"})

    # Only create a new incident if we don't already have an active one
    if not self.active_incident:
        # Subscriptions label buckets by the end of the bucket, whereas
        # discover labels them by the front. This causes an off-by-one error
        # with alert start dates, so a bucket is subtracted from the start
        # date. Multiplying by threshold_period shows when the alert actually
        # started happening rather than when it was detected.
        detected_at = self.last_update - timedelta(
            seconds=self.alert_rule.snuba_query.time_window
            * self.alert_rule.threshold_period
        )
        self.active_incident = create_incident(
            self.alert_rule.organization,
            IncidentType.ALERT_TRIGGERED,
            # TODO: Include more info in name?
            self.alert_rule.name,
            alert_rule=self.alert_rule,
            date_started=detected_at,
            date_detected=detected_at,
            projects=[self.subscription.project],
        )

    # Create the incident trigger (or re-activate the existing one) so that
    # there is a record of this trigger firing for this incident.
    incident_trigger = self.incident_triggers.get(trigger_id)
    if not incident_trigger:
        incident_trigger = IncidentTrigger.objects.create(
            incident=self.active_incident,
            alert_rule_trigger=trigger,
            status=TriggerStatus.ACTIVE.value,
        )
    else:
        incident_trigger.status = TriggerStatus.ACTIVE.value
        incident_trigger.save()

    self.handle_incident_severity_update()
    self.handle_trigger_actions(incident_trigger)
    self.incident_triggers[trigger_id] = incident_trigger

    # TODO: We should create an audit log, and maybe something that keeps
    # all of the details available for showing on the incident. Might be a
    # json blob or w/e? Or might be able to use the audit log

    # The trigger has fired, so its consecutive-breach count starts over.
    self.trigger_alert_counts[trigger_id] = 0
def test_simple(self):
    """Create an ALERT_TRIGGERED incident and verify what it leaves behind.

    Checks that the new incident is OPEN with the requested type and fields,
    that group and project link rows exist, that one DETECTED activity was
    recorded, and that exactly one IncidentCreatedEvent reached
    ``record_event``.
    """
    incident_type = IncidentType.ALERT_TRIGGERED
    title = "hello"
    query = "goodbye"
    aggregation = QueryAggregations.UNIQUE_USERS
    date_started = timezone.now()
    other_project = self.create_project(fire_project_created=True)
    other_group = self.create_group(project=other_project)
    alert_rule = create_alert_rule(
        self.organization,
        [self.project],
        "hello",
        "level:error",
        QueryAggregations.TOTAL,
        10,
        1,
    )
    # Discard calls recorded during setup so the call count below is exact.
    self.record_event.reset_mock()

    incident = create_incident(
        self.organization,
        type=incident_type,
        title=title,
        query=query,
        aggregation=aggregation,
        date_started=date_started,
        projects=[self.project],
        groups=[self.group, other_group],
        alert_rule=alert_rule,
    )

    # Fields stored on the incident itself.
    assert incident.identifier == 1
    assert incident.status == IncidentStatus.OPEN.value
    assert incident.type == incident_type.value
    assert incident.title == title
    assert incident.query == query
    assert incident.aggregation == aggregation.value
    assert incident.date_started == date_started
    assert incident.date_detected == date_started
    assert incident.alert_rule == alert_rule

    # Related rows.
    group_links = IncidentGroup.objects.filter(
        incident=incident, group__in=[self.group, other_group]
    )
    assert group_links.count() == 2
    project_links = IncidentProject.objects.filter(
        incident=incident, project__in=[self.project, other_project]
    )
    assert project_links.count() == 2
    detected_activities = IncidentActivity.objects.filter(
        incident=incident, type=IncidentActivityType.DETECTED.value
    )
    assert detected_activities.count() == 1

    # Analytics event: first positional argument of the latest call.
    assert len(self.record_event.call_args_list) == 1
    event = self.record_event.call_args[0][0]
    assert isinstance(event, IncidentCreatedEvent)
    expected_payload = {
        "organization_id": six.text_type(self.organization.id),
        "incident_id": six.text_type(incident.id),
        "incident_type": six.text_type(IncidentType.ALERT_TRIGGERED.value),
    }
    assert event.data == expected_payload
def trigger_alert_threshold(self, trigger, metric_value):
    """
    Count one more consecutive breach for `trigger` and fire it when due.

    Called when a subscription update exceeds the value defined in
    `trigger.alert_threshold` and the trigger hasn't already been activated.
    Once the running count for this trigger reaches
    `alert_rule.threshold_period`, the trigger is marked active, an incident
    is created if none is active, and the count is reset.

    :param trigger: the alert rule trigger whose threshold was exceeded
    :param metric_value: forwarded unchanged to `handle_trigger_actions`
    :return: None
    """
    trigger_id = trigger.id
    self.trigger_alert_counts[trigger_id] += 1
    if self.trigger_alert_counts[trigger_id] < self.alert_rule.threshold_period:
        # Not enough consecutive breaches yet; keep counting.
        return

    metrics.incr("incidents.alert_rules.trigger", tags={"type": "fire"})

    # Only create a new incident if we don't already have an active one
    if not self.active_incident:
        detected_at = self.calculate_event_date_from_update_date(self.last_update)
        self.active_incident = create_incident(
            self.alert_rule.organization,
            IncidentType.ALERT_TRIGGERED,
            # TODO: Include more info in name?
            self.alert_rule.name,
            alert_rule=self.alert_rule,
            date_started=detected_at,
            # TODO: This should probably be either the current time or the
            # message time. Current time likely makes most sense, since this
            # is when we actually noticed the problem.
            date_detected=detected_at,
            projects=[self.subscription.project],
        )

    # Create the incident trigger (or re-activate the existing one) so that
    # there is a record of this trigger firing for this incident.
    incident_trigger = self.incident_triggers.get(trigger_id)
    if not incident_trigger:
        incident_trigger = IncidentTrigger.objects.create(
            incident=self.active_incident,
            alert_rule_trigger=trigger,
            status=TriggerStatus.ACTIVE.value,
        )
    else:
        incident_trigger.status = TriggerStatus.ACTIVE.value
        incident_trigger.save()

    self.handle_incident_severity_update()
    self.handle_trigger_actions(incident_trigger, metric_value)
    self.incident_triggers[trigger_id] = incident_trigger

    # TODO: We should create an audit log, and maybe something that keeps
    # all of the details available for showing on the incident. Might be a
    # json blob or w/e? Or might be able to use the audit log

    # The trigger has fired, so its consecutive-breach count starts over.
    self.trigger_alert_counts[trigger_id] = 0
def test(self):
    """Bulk incident stats must match the per-incident helpers.

    A closed and an open incident are created and each bulk entry is compared
    with ``get_incident_event_stats`` / ``get_incident_aggregates``. For the
    first incident only (the closed one), the expected start is the bulk
    start minus ``calculate_incident_prewindow``; for the rest the bulk start
    is expected unchanged.
    """

    def make_incident(title):
        # Both incidents share every argument except the title.
        return create_incident(
            self.organization,
            IncidentType.ALERT_TRIGGERED,
            title,
            "",
            QueryAggregations.TOTAL,
            groups=[self.group],
            date_started=timezone.now() - timedelta(days=30),
        )

    closed_incident = make_incident("Closed")
    update_incident_status(closed_incident, IncidentStatus.CLOSED)
    open_incident = make_incident("Open")

    incidents = [closed_incident, open_incident]
    bulk_results = bulk_get_incident_stats(incidents)

    for position, (incident, bulk_entry) in enumerate(zip(incidents, bulk_results)):
        single_stats = get_incident_event_stats(incident)
        bulk_stats = bulk_entry["event_stats"]
        assert bulk_stats.data["data"] == single_stats.data["data"]

        expected_start = bulk_stats.start
        expected_end = bulk_stats.end
        if position == 0:
            # Only the first entry has the prewindow removed from its start.
            prewindow = calculate_incident_prewindow(
                expected_start, expected_end, incident
            )
            expected_start = expected_start - prewindow
        assert single_stats.start == expected_start
        assert single_stats.end == expected_end
        assert bulk_stats.rollup == single_stats.rollup

        single_aggregates = get_incident_aggregates(incident)
        assert bulk_entry["total_events"] == single_aggregates["count"]
        assert bulk_entry["unique_users"] == single_aggregates["unique_users"]
def trigger_alert_threshold(self, trigger):
    """
    Count one more consecutive breach for `trigger` and fire it when due.

    Called when a subscription update exceeds the value defined in
    `trigger.alert_threshold` and the trigger hasn't already been activated.
    Once the running count for this trigger reaches
    `alert_rule.threshold_period`, the trigger is marked active, an incident
    is created if none is active, and the count is reset.

    :param trigger: the alert rule trigger whose threshold was exceeded
    :return: None
    """
    trigger_id = trigger.id
    self.trigger_alert_counts[trigger_id] += 1
    if self.trigger_alert_counts[trigger_id] < self.alert_rule.threshold_period:
        # Not enough consecutive breaches yet; keep counting.
        return

    # Only create a new incident if we don't already have an active one
    if not self.active_incident:
        # The incident's start and detection time are both the update time.
        detected_at = self.last_update
        self.active_incident = create_incident(
            self.alert_rule.organization,
            IncidentType.ALERT_TRIGGERED,
            # TODO: Include more info in name?
            self.alert_rule.name,
            alert_rule=self.alert_rule,
            query=self.subscription.query,
            aggregation=QueryAggregations(self.alert_rule.aggregation),
            date_started=detected_at,
            date_detected=detected_at,
            projects=[self.subscription.project],
        )

    # Create the incident trigger (or re-activate the existing one) so that
    # there is a record of this trigger firing for this incident.
    incident_trigger = self.incident_triggers.get(trigger_id)
    if not incident_trigger:
        incident_trigger = IncidentTrigger.objects.create(
            incident=self.active_incident,
            alert_rule_trigger=trigger,
            status=TriggerStatus.ACTIVE.value,
        )
    else:
        incident_trigger.status = TriggerStatus.ACTIVE.value
        incident_trigger.save()

    self.handle_incident_severity_update()
    self.handle_trigger_actions(incident_trigger)
    self.incident_triggers[trigger_id] = incident_trigger

    # TODO: We should create an audit log, and maybe something that keeps
    # all of the details available for showing on the incident. Might be a
    # json blob or w/e? Or might be able to use the audit log

    # The trigger has fired, so its consecutive-breach count starts over.
    self.trigger_alert_counts[trigger_id] = 0
def test_closed(self):
    """Closing an incident should make an IncidentSnapshot row appear.

    The snapshot lookup is expected to go from ``False`` to ``True`` while
    ``run_test`` is driven with the CLOSED status and the current time.
    """
    incident = create_incident(
        self.organization,
        IncidentType.CREATED,
        "Test",
        "",
        timezone.now(),
        projects=[self.project],
    )

    def snapshot_exists():
        return IncidentSnapshot.objects.filter(incident=incident).exists()

    with self.assertChanges(snapshot_exists, before=False, after=True):
        self.run_test(incident, IncidentStatus.CLOSED, timezone.now())
def test_reopened(self):
    """Reopening a closed incident should remove its IncidentSnapshot row.

    The incident is closed first. The snapshot lookup is then expected to go
    from ``True`` to ``False`` while ``run_test`` is driven with the OPEN
    status and no date.
    """
    incident = create_incident(
        self.organization,
        IncidentType.CREATED,
        'Test',
        '',
        timezone.now(),
        projects=[self.project],
    )
    update_incident_status(incident, IncidentStatus.CLOSED)

    def snapshot_exists():
        return IncidentSnapshot.objects.filter(incident=incident).exists()

    with self.assertChanges(snapshot_exists, before=True, after=False):
        self.run_test(incident, IncidentStatus.OPEN, None)
def test_simple(self):
    """Create an incident of type CREATED and verify every side effect.

    Covers the incident's own fields, the IncidentGroup and IncidentProject
    link rows, the CREATED activity with its event stats snapshot, the single
    IncidentCreatedEvent passed to ``record_event``, and the scheduling of
    ``calculate_incident_suspects``.
    """
    incident_type = IncidentType.CREATED
    title = 'hello'
    query = 'goodbye'
    date_started = timezone.now()
    other_project = self.create_project()
    other_group = self.create_group(project=other_project)
    # Discard calls recorded so far so the call count below is exact.
    self.record_event.reset_mock()
    incident = create_incident(
        self.organization,
        type=incident_type,
        title=title,
        query=query,
        date_started=date_started,
        projects=[self.project],
        groups=[self.group, other_group],
    )
    assert incident.identifier == 1
    # The value passed as `type` is checked against the incident's type.
    # Comparing it with `status` would only pass when the two enums happen
    # to share an integer value.
    assert incident.type == incident_type.value
    assert incident.title == title
    assert incident.query == query
    assert incident.date_started == date_started
    # No date_detected was passed; it is expected to equal date_started.
    assert incident.date_detected == date_started
    assert IncidentGroup.objects.filter(
        incident=incident,
        group__in=[self.group, other_group],
    ).count() == 2
    # Only self.project is passed explicitly; the second link is presumably
    # derived from other_group's project.
    assert IncidentProject.objects.filter(
        incident=incident,
        project__in=[self.project, other_project],
    ).count() == 2
    assert IncidentActivity.objects.filter(
        incident=incident,
        type=IncidentActivityType.CREATED.value,
        event_stats_snapshot__isnull=False,
    ).count() == 1
    assert len(self.record_event.call_args_list) == 1
    # First positional argument of the most recent record_event call.
    event = self.record_event.call_args[0][0]
    assert isinstance(event, IncidentCreatedEvent)
    assert event.data == {
        'organization_id': six.text_type(self.organization.id),
        'incident_id': six.text_type(incident.id),
        'incident_type': six.text_type(IncidentType.CREATED.value),
    }
    self.calculate_incident_suspects.apply_async.assert_called_once_with(
        kwargs={'incident_id': incident.id},
    )
def test_incidents_list(self):
    """Snapshot the incident list page and the details page of one incident.

    A single incident is created, the browser loads ``self.path`` and
    snapshots the list, then follows the link to the incident and snapshots
    the details view.
    """
    incident = create_incident(
        self.organization,
        type=IncidentType.CREATED,
        title="Incident #1",
        query="",
        date_started=timezone.now(),
        projects=[self.project],
        groups=[self.group],
    )

    browser = self.browser
    with self.feature(FEATURE_NAME):
        # List view: wait for loading to finish before the snapshot.
        browser.get(self.path)
        browser.wait_until_not(".loading-indicator")
        browser.snapshot("incidents - list")

        # Selector for the link that leads to this incident's details page.
        details_url = u'[href="/organizations/{}/incidents/{}/'.format(
            self.organization.slug, incident.identifier)
        browser.wait_until(details_url)
        browser.click(details_url)

        # Details view: wait until title is present and placeholders are gone.
        browser.wait_until_not(".loading-indicator")
        browser.wait_until_test_id("incident-title")
        browser.wait_until_not('[data-test-id="loading-placeholder"]')
        browser.snapshot("incidents - details")