def test_alert_receive_now(self):
    """
    Ensure receive time is stamped.
    """
    alert = Alert(self.RESOURCE, self.EVENT, severity=self.SEVERITY, environment=self.ENVIRONMENT)
    alert.receive_now()
    self.assertIsInstance(alert.receive_time, datetime.datetime)
def test_alert_receive_now(self):
    """
    Ensure receive time is stamped.
    """
    alert = Alert(self.RESOURCE, self.EVENT, environment=self.ENVIRONMENT, severity=self.SEVERITY)
    alert.receive_now()
    self.assertIsInstance(alert.receive_time, datetime.datetime)
def parse_notification(notification):
    notification = json.loads(notification)

    if notification['Type'] == 'SubscriptionConfirmation':
        return Alert(
            resource=notification['TopicArn'],
            event=notification['Type'],
            environment='Production',
            severity='informational',
            service=['Unknown'],
            group='AWS/CloudWatch',
            text='%s <a href="%s" target="_blank">SubscribeURL</a>' % (notification['Message'], notification['SubscribeURL']),
            origin=notification['TopicArn'],
            event_type='cloudwatchAlarm',
            create_time=datetime.datetime.strptime(notification['Timestamp'], '%Y-%m-%dT%H:%M:%S.%fZ'),
            raw_data=notification,
        )
    elif notification['Type'] == 'Notification':
        alarm = json.loads(notification['Message'])

        if 'Trigger' not in alarm:
            raise ValueError("SNS message is not a Cloudwatch notification")

        return Alert(
            resource='%s:%s' % (alarm['Trigger']['Dimensions'][0]['name'], alarm['Trigger']['Dimensions'][0]['value']),
            event=alarm['AlarmName'],
            environment='Production',
            severity=cw_state_to_severity(alarm['NewStateValue']),
            service=[alarm['AWSAccountId']],
            group=alarm['Trigger']['Namespace'],
            value=alarm['NewStateValue'],
            text=alarm['AlarmDescription'],
            tags=[alarm['Region']],
            attributes={
                'incidentKey': alarm['AlarmName'],
                'thresholdInfo': alarm['Trigger']
            },
            origin=notification['TopicArn'],
            event_type='cloudwatchAlarm',
            create_time=datetime.datetime.strptime(notification['Timestamp'], '%Y-%m-%dT%H:%M:%S.%fZ'),
            raw_data=alarm)
def parse_pingdom(check):
    check = json.loads(check)

    # the three branches build the same alert; only severity and correlate vary
    if check['action'] == 'assign':
        severity = 'critical'
        correlate = ['up', 'down']
    elif check['action'] == 'notify_of_close':
        severity = 'normal'
        correlate = ['up', 'down']
    else:
        severity = 'indeterminate'
        correlate = ['up', 'down', check['description']]

    return Alert(
        resource=check['host'],
        event=check['description'],
        correlate=correlate,
        environment='Production',
        severity=severity,
        service=[check['checkname']],
        group='Network',
        text='%s is %s.' % (check['checkname'], check['description']),
        attributes={'incidentKey': check['incidentid']},
        origin='Pingdom',
        event_type='availabilityAlert',
        raw_data=check,
    )
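# Illustrative only: a minimal sketch of a Pingdom-style webhook payload with
# made-up values, limited to the fields parse_pingdom() actually reads.
example_check = json.dumps({
    'action': 'assign',            # 'assign', 'notify_of_close' or anything else
    'host': 'www.example.com',
    'description': 'down',
    'checkname': 'Example HTTP check',
    'incidentid': 12345,
})
# alert = parse_pingdom(example_check)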
def parse_prometheus(notification):
    labels = notification['labels']
    annotations = notification['annotations']

    text = annotations.get('description', None) or annotations.get('summary', None) or \
        '%s: %s on %s' % (labels['job'], labels['alertname'], labels['instance'])

    if 'description' in annotations:
        del annotations['description']
    if 'summary' in annotations:
        del annotations['summary']

    if 'generatorURL' in notification:
        annotations['generatorUrl'] = notification['generatorURL']

    return Alert(
        resource=labels.get('exported_instance', None) or labels['instance'],
        event=labels['alertname'],
        environment=labels.get('environment', 'Production'),
        severity=labels.get('severity', 'warning'),
        correlate=labels['correlate'].split(',') if 'correlate' in labels else None,
        service=labels.get('service', '').split(','),
        group=labels.get('job', None),
        value=labels.get('value', None),
        text=text,
        tags=labels.get('tags', '').split(','),
        attributes=annotations,
        origin='Prometheus',
        event_type='performanceAlert',
        raw_data=notification
    )
def parse_serverdensity(payload):
    alert = json.loads(payload)

    if alert['fixed']:
        severity = 'ok'
    else:
        severity = 'critical'

    return Alert(
        resource=alert['item_name'],
        event=alert['alert_type'],
        environment='Production',
        severity=severity,
        service=[alert['item_type']],
        group=alert['alert_section'],
        value=alert['configured_trigger_value'],
        text='Alert created for %s:%s' % (alert['item_type'], alert['item_name']),
        tags=['cloud'] if alert['item_cloud'] else [],
        attributes={
            'alertId': alert['alert_id'],
            'itemId': alert['item_id']
        },
        origin='ServerDensity',
        event_type='serverDensityAlert',
        raw_data=alert)
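# Illustrative only: a minimal sketch of a Server Density-style payload built
# from the fields parse_serverdensity() reads; all values are made up.
example_payload = json.dumps({
    'item_name': 'web01',
    'alert_type': 'CPU load',
    'item_type': 'device',
    'alert_section': 'System',
    'configured_trigger_value': '4',
    'item_cloud': False,
    'alert_id': 'abc123',
    'item_id': 'def456',
    'fixed': False,          # False -> severity 'critical', True -> 'ok'
})
# alert = parse_serverdensity(example_payload)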
def receive_alert():
    recv_started = receive_timer.start_timer()
    try:
        incomingAlert = Alert.parse_alert(request.data)
    except ValueError as e:
        receive_timer.stop_timer(recv_started)
        return jsonify(status="error", message=str(e)), 400

    if request.headers.getlist("X-Forwarded-For"):
        incomingAlert.attributes.update(ip=request.headers.getlist("X-Forwarded-For")[0])
    else:
        incomingAlert.attributes.update(ip=request.remote_addr)

    try:
        alert = process_alert(incomingAlert)
    except RejectException as e:
        receive_timer.stop_timer(recv_started)
        return jsonify(status="error", message=str(e)), 403
    except RuntimeWarning as e:
        receive_timer.stop_timer(recv_started)
        return jsonify(status="ok", id=incomingAlert.id, message=str(e)), 202
    except Exception as e:
        receive_timer.stop_timer(recv_started)
        return jsonify(status="error", message=str(e)), 500

    receive_timer.stop_timer(recv_started)

    if alert:
        body = alert.get_body()
        body['href'] = "%s/%s" % (request.base_url, alert.id)
        return jsonify(status="ok", id=alert.id, alert=body), 201, {'Location': '%s/%s' % (request.base_url, alert.id)}
    else:
        return jsonify(status="error", message="insert or update of received alert failed"), 500
def test_alert_with_some_values(self):
    """
    Ensure a valid alert is created with some assigned values
    """
    alert = Alert(self.RESOURCE, self.EVENT, environment=self.ENVIRONMENT, severity=self.SEVERITY,
                  correlate=self.CORRELATE, status=self.STATUS, service=self.SERVICE, group=self.GROUP,
                  value=self.VALUE, text=self.TEXT, tags=self.TAGS, attributes=self.ATTRIBUTES,
                  origin=self.ORIGIN, event_type=self.EVENT_TYPE, create_time=self.CREATE_TIME,
                  timeout=self.TIMEOUT, raw_data=self.RAW_DATA)

    self.assertEquals(alert.resource, self.RESOURCE)
    self.assertEquals(alert.event, self.EVENT)
    self.assertEquals(alert.environment, self.ENVIRONMENT)
    self.assertEquals(alert.severity, self.SEVERITY)
    self.assertEquals(alert.correlate, self.CORRELATE)
    self.assertEquals(alert.service, self.SERVICE)
    self.assertEquals(alert.tags, self.TAGS)
    self.assertEquals(alert.attributes, self.ATTRIBUTES)
def parse_prometheus(alert):
    status = alert.get('status', 'firing')
    labels = copy(alert['labels'])
    annotations = copy(alert['annotations'])

    starts_at = parse_date(alert['startsAt'])
    if alert['endsAt'] == '0001-01-01T00:00:00Z':
        ends_at = None
    else:
        ends_at = parse_date(alert['endsAt'])

    if status == 'firing':
        severity = labels.pop('severity', 'warning')
        create_time = starts_at
    elif status == 'resolved':
        severity = 'normal'
        create_time = ends_at
    else:
        severity = 'unknown'
        create_time = ends_at or starts_at

    summary = annotations.pop('summary', None)
    description = annotations.pop('description', None)
    text = description or summary or '%s: %s on %s' % (labels['job'], labels['alertname'], labels['instance'])

    try:
        timeout = int(labels.pop('timeout', 0)) or None
    except ValueError:
        timeout = None

    if 'generatorURL' in alert:
        annotations['moreInfo'] = '<a href="%s" target="_blank">Prometheus Graph</a>' % alert['generatorURL']

    return Alert(
        resource=labels.pop('exported_instance', None) or labels.pop('instance'),
        event=labels.pop('alertname'),
        environment=labels.pop('environment', 'Production'),
        severity=severity,
        correlate=labels.pop('correlate').split(',') if 'correlate' in labels else None,
        service=labels.pop('service', '').split(','),
        group=labels.pop('group', None),
        value=labels.pop('value', None),
        text=text,
        attributes=annotations,
        origin='prometheus/' + labels.get('job', '-'),
        event_type='prometheusAlert',
        create_time=create_time,
        timeout=timeout,
        raw_data=alert,
        customer=labels.pop('customer', None),
        tags=["%s=%s" % t for t in labels.items()]  # any labels left are used for tags
    )
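# Illustrative only: a sketch of an Alertmanager-style alert this parser
# expects, with made-up values; the keys mirror what parse_prometheus() reads.
example_alert = {
    'status': 'firing',
    'labels': {
        'alertname': 'HighErrorRate',
        'instance': 'web01:9100',
        'job': 'node',
        'severity': 'critical',
        'environment': 'Production',
    },
    'annotations': {
        'summary': 'Error rate above threshold',
    },
    'startsAt': '2018-01-01T00:00:00Z',
    'endsAt': '0001-01-01T00:00:00Z',
    'generatorURL': 'http://prometheus.example.com/graph',
}
# alert = parse_prometheus(example_alert)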
def receive_alert():
    recv_started = receive_timer.start_timer()
    try:
        incomingAlert = Alert.parse_alert(request.data)
    except ValueError as e:
        receive_timer.stop_timer(recv_started)
        return jsonify(status="error", message=str(e)), 400
def test_zero_timeout(self):
    alert = Alert(self.RESOURCE, self.EVENT, timeout=0)
    self.assertEquals(alert.timeout, 0)
def test_alert_with_some_values(self):
    """
    Ensure a valid alert is created with some assigned values
    """
    alert = Alert(self.RESOURCE, self.EVENT, severity=self.SEVERITY, environment=self.ENVIRONMENT)

    self.assertEquals(alert.resource, self.RESOURCE)
    self.assertEquals(alert.event, self.EVENT)
    self.assertEquals(alert.severity, self.SEVERITY)
    self.assertEquals(alert.environment, self.ENVIRONMENT)
def test_alert_defaults(self):
    """
    Ensures a valid alert is created with default values
    """
    alert = Alert(self.RESOURCE, self.EVENT)

    self.assertEquals(alert.resource, self.RESOURCE)
    self.assertEquals(alert.event, self.EVENT)
    self.assertEquals(alert.group, 'Misc')
    self.assertEquals(alert.timeout, self.TIMEOUT)
def alert_kickstarter_days(url, browser=None):
    """
    Alert if a Kickstarter campaign will end soon.

    We need Selenium because the number we want is updated dynamically on the
    HTML page, and we need to scrape because there is no public API to get it.
    The project page gives some data, but not the number of days left:

    https://www.kickstarter.com/projects/214379695/micropython-on-the-esp8266-beautifully-easy-iot/
    """
    browser = browser or utils.webdriver.Firefox()
    num_days_left = utils.get_kickstarter_days_left(url, browser)
    campaign_title = basename(url) if basename(url) else dirname(url)

    event = 'CampaignEndingSoon'
    resource = campaign_title
    environment = 'Production'
    severity = 'normal'
    text = 'More than a month left.'
    if num_days_left < 7:
        severity = 'critical'
        text = 'Less than a week left.'
    elif 7 <= num_days_left < 14:
        severity = 'major'
        text = 'Less than two weeks left.'
    elif 14 <= num_days_left < 28:
        severity = 'minor'
        text = 'Less than a month left.'
    service = ['Kickstarter']
    value = str(num_days_left)
    alert_desc = dict(resource=resource, event=event, environment=environment,
                      service=service, severity=severity, text=text, value=str(num_days_left))

    if num_days_left < 28:
        # send new alert
        alert = Alert(**alert_desc)
        result = api.send(alert)
        if result['status'] == 'error':
            print(result)
            # raise IndexError  # TODO: make more meaningful exception
    else:
        # delete existing alert
        del alert_desc['value']
        del alert_desc['severity']
        del alert_desc['text']
        query = dict(**alert_desc)
        for alert in api.get_alerts(query=query)['alerts']:
            api.delete_alert(alert['id'])
def alert_conda_outdated(path):
    """
    Alert for an outdated conda package, delete previous alert if up to date.
    """
    for (n, installed_version, latest_version) in utils.get_conda_updates(path):
        event = 'UpdateAvailable'
        resource = n
        environment = 'Production'
        severity = 'minor'
        text = 'Installed: %s' % installed_version
        service = ['Conda']
        value = latest_version
        alert_desc = dict(resource=resource, event=event, environment=environment,
                          service=service, severity=severity, text=text, value=value)
        if Version(installed_version) < Version(latest_version):
            if not DRY_RUN:
                # send new alert
                alert = Alert(**alert_desc)
                result = api.send(alert)
            else:
                print(alert_desc)

    # delete previous alerts
    for (n, installed_version) in utils.get_conda_list(path):
        event = 'UpdateAvailable'
        resource = n
        environment = 'Production'
        severity = 'minor'
        # text = 'Installed: %s' % installed_version
        service = ['Conda']
        value = installed_version
        alert_desc = dict(resource=resource, event=event, environment=environment,
                          service=service, severity=severity, value=value)
        if not DRY_RUN:
            query = dict(**alert_desc)
            for alert in api.get_alerts(query=query)['alerts']:
                api.delete_alert(alert['id'])
        else:
            print(alert_desc)
def parse_notification(self, notification):
    notification = json.loads(notification.get_body())
    alarm = json.loads(notification['Message'])

    if 'Trigger' not in alarm:
        return

    # Defaults
    resource = '%s:%s' % (alarm['Trigger']['Dimensions'][0]['name'], alarm['Trigger']['Dimensions'][0]['value'])
    event = alarm['AlarmName']
    severity = self.cw_state_to_severity(alarm['NewStateValue'])
    group = 'CloudWatch'
    value = alarm['Trigger']['MetricName']
    text = alarm['AlarmDescription']
    service = [AWS_ACCOUNT_ID.get(alarm['AWSAccountId'], 'AWSAccountId:' + alarm['AWSAccountId'])]
    tags = [alarm['Trigger']['Namespace']]
    correlate = list()
    origin = notification['TopicArn']
    timeout = None
    create_time = datetime.datetime.strptime(notification['Timestamp'], '%Y-%m-%dT%H:%M:%S.%fZ')
    raw_data = notification['Message']

    cloudwatchAlert = Alert(
        resource=resource,
        event=event,
        correlate=correlate,
        group=group,
        value=value,
        severity=severity,
        environment='Production',
        service=service,
        text=text,
        event_type='cloudwatchAlarm',
        tags=tags,
        attributes={
            'awsMessageId': notification['MessageId'],
            'awsRegion': alarm['Region'],
            'thresholdInfo': alarm['NewStateReason']
        },
        origin=origin,
        timeout=timeout,
        create_time=create_time,
        raw_data=raw_data,
    )

    return cloudwatchAlert
def on_message(self, headers, body):
    LOG.info("Received %s %s", headers['type'], headers['correlation-id'])
    LOG.debug("Received body : %s", body)

    if headers['type'] == 'Heartbeat':
        # TODO(nsatterl): Heartbeat.parse_heartbeat(body) etc.
        pass
    elif headers['type'].endswith('Alert'):
        alert = Alert.parse_alert(body)
        if alert:
            alert.receive_now()
            LOG.debug('Queueing alert %s', alert.get_body())
            self.queue.put(alert)
def main():
    listener = Listener()

    while True:
        listener.send_cmd('READY\n')

        data = sys.stdin.readline()
        headers = dict([x.split(':') for x in data.split()])

        data = sys.stdin.read(int(headers['len']))
        body = dict([x.split(':') for x in data.split()])

        event = headers['eventname']
        if event.startswith('TICK'):
            supervisorAlert = Heartbeat(origin='supervisord', tags=[headers['ver'], event])
        else:
            if event.endswith('FATAL'):
                severity = 'critical'
            elif event.endswith('BACKOFF'):
                severity = 'warning'
            elif event.endswith('EXITED'):
                severity = 'minor'
            else:
                severity = 'normal'
            supervisorAlert = Alert(
                resource=body['processname'],
                environment='Production',
                service=['supervisord'],
                event=event,
                correlate=[
                    'PROCESS_STATE_STARTING', 'PROCESS_STATE_RUNNING', 'PROCESS_STATE_BACKOFF',
                    'PROCESS_STATE_STOPPING', 'PROCESS_STATE_EXITED', 'PROCESS_STATE_STOPPED',
                    'PROCESS_STATE_FATAL', 'PROCESS_STATE_UNKNOWN'
                ],
                value='serial=%s' % headers['serial'],
                severity=severity,
                origin=headers['server'],
                text='State changed from %s to %s.' % (body['from_state'], event),
                raw_data='%s\n\n%s' % (json.dumps(headers), json.dumps(body)))

        try:
            listener.api.send(supervisorAlert)
        except Exception as e:
            listener.log_stderr(e)
            listener.send_cmd('RESULT 4\nFAIL')
        else:
            listener.send_cmd('RESULT 2\nOK')
def parse_stackdriver(notification):
    notification = json.loads(notification)
    incident = notification['incident']
    state = incident['state']

    if state == 'acknowledged':
        try:
            alert = db.get_alerts(query={'attributes.incidentId': incident['incident_id']}, limit=1)[0]
        except IndexError:
            raise ValueError('unknown Stackdriver Incident ID: %s' % incident['incident_id'])
        return state, alert
    else:
        if state == 'open':
            severity = 'critical'
            create_time = datetime.datetime.fromtimestamp(incident['started_at'])
        elif state == 'closed':
            severity = 'ok'
            create_time = datetime.datetime.fromtimestamp(incident['ended_at'])
        else:
            severity = 'indeterminate'
            create_time = None

        return state, Alert(
            resource=incident['resource_name'],
            event=incident['condition_name'],
            environment='Production',
            severity=severity,
            service=[incident['policy_name']],
            group='Cloud',
            text=incident['summary'],
            attributes={
                'incidentId': incident['incident_id'],
                'resourceId': incident['resource_id'],
                'moreInfo': '<a href="%s" target="_blank">Stackdriver Console</a>' % incident['url']
            },
            origin='Stackdriver',
            event_type='stackdriverAlert',
            create_time=create_time,
            raw_data=notification)
def receive_alert():
    if not Switch.get('sender-api-allow').is_on():
        return jsonify(status="error", message="API not accepting alerts. Try again later."), 503

    recv_started = receive_timer.start_timer()
    try:
        incomingAlert = Alert.parse_alert(request.data)
    except ValueError as e:
        receive_timer.stop_timer(recv_started)
        return jsonify(status="error", message=str(e)), 400

    if g.get('customer', None):
        incomingAlert.customer = g.get('customer')

    if request.headers.getlist("X-Forwarded-For"):
        incomingAlert.attributes.update(ip=request.headers.getlist("X-Forwarded-For")[0])
    else:
        incomingAlert.attributes.update(ip=request.remote_addr)

    try:
        alert = process_alert(incomingAlert)
    except RejectException as e:
        receive_timer.stop_timer(recv_started)
        return jsonify(status="error", message=str(e)), 403
    except RuntimeWarning as e:
        receive_timer.stop_timer(recv_started)
        return jsonify(status="ok", id=incomingAlert.id, message=str(e)), 202
    except Exception as e:
        receive_timer.stop_timer(recv_started)
        return jsonify(status="error", message=str(e)), 500

    receive_timer.stop_timer(recv_started)

    if alert:
        body = alert.get_body()
        body['href'] = absolute_url('/alert/' + alert.id)
        return jsonify(status="ok", id=alert.id, alert=body), 201, {'Location': body['href']}
    else:
        return jsonify(status="error", message="insert or update of received alert failed"), 500
def alert_webpage(url, *args, **kwargs):
    """
    Alert if a webpage has unexpected content.
    """
    info = utils.get_webpage_info(url, *args, **kwargs)

    event = 'CheckFailed'
    resource = url
    environment = 'Production'
    severity = 'normal'
    text = 'Nothing to report.'
    service = ['WWW']
    value = 'ok'
    if info['status'] != 200:
        severity = 'major'
        text = 'Status is %d.' % info['status']
        value = info['status']
    if info['text_contains'] == False:
        severity = 'major'
        text = 'Expected text not found.'
        value = kwargs['text_contains']
    alert_desc = dict(resource=resource, event=event, environment=environment,
                      service=service, severity=severity, text=text, value=value)

    if severity == 'major':
        # send new alert
        alert = Alert(**alert_desc)
        result = api.send(alert)
        # if result['status'] == 'error':
        #     print(result)
        #     # raise IndexError  # TODO: make more meaningful exception
    else:
        # delete existing alert
        del alert_desc['value']
        del alert_desc['severity']
        del alert_desc['text']
        query = dict(**alert_desc)
        for alert in api.get_alerts(query=query)['alerts']:
            api.delete_alert(alert['id'])
def parse_stackdriver(notification):
    incident = notification['incident']
    state = incident['state']

    if state == 'open':
        severity = 'critical'
        status = None
        create_time = datetime.datetime.fromtimestamp(incident['started_at'])
    elif state == 'acknowledged':
        severity = 'critical'
        status = 'ack'
        create_time = None
    elif state == 'closed':
        severity = 'ok'
        status = None
        create_time = datetime.datetime.fromtimestamp(incident['ended_at'])
    else:
        severity = 'indeterminate'
        status = None
        create_time = None

    return state, Alert(
        resource=incident['resource_name'],
        event=incident['condition_name'],
        environment='Production',
        severity=severity,
        status=status,
        service=[incident['policy_name']],
        group='Cloud',
        text=incident['summary'],
        attributes={
            'incidentId': incident['incident_id'],
            'resourceId': incident['resource_id'],
            'moreInfo': '<a href="%s" target="_blank">Stackdriver Console</a>' % incident['url']
        },
        origin='Stackdriver',
        event_type='stackdriverAlert',
        create_time=create_time,
        raw_data=notification)
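# Illustrative only: a minimal sketch of a Stackdriver-style notification with
# made-up values, shaped from the keys parse_stackdriver() reads.
example_notification = {
    'incident': {
        'incident_id': 'abc123',
        'resource_id': 'instance-1',
        'resource_name': 'web01',
        'condition_name': 'CPU usage',
        'policy_name': 'Frontend policy',
        'state': 'open',               # 'open', 'acknowledged' or 'closed'
        'started_at': 1514764800,      # POSIX timestamp
        'ended_at': None,
        'summary': 'CPU usage above threshold on web01',
        'url': 'https://app.google.stackdriver.com/incidents/abc123',
    }
}
# state, alert = parse_stackdriver(example_notification)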
def send(self, args):
    try:
        alert = Alert(
            resource=args.resource,
            event=args.event,
            environment=args.environment,
            severity=args.severity,
            correlate=args.correlate,
            status=args.status,
            service=args.service,
            group=args.group,
            value=args.value,
            text=args.text,
            tags=args.tags,
            attributes=dict([attrib.split('=') for attrib in args.attributes]),
            origin=args.origin,
            event_type=args.event_type,
            timeout=args.timeout,
            raw_data=args.raw_data)
    except Exception as e:
        LOG.error(e)
        sys.exit(1)

    try:
        response = self.api.send(alert)
    except Exception as e:
        LOG.error(e)
        sys.exit(1)

    if response['status'] == 'ok':
        if 'alert' not in response:
            info = response['message']
        elif response['alert']['repeat']:
            info = "%s duplicates" % response['alert']['duplicateCount']
        else:
            info = "%s -> %s" % (response['alert']['previousSeverity'], response['alert']['severity'])
        print("{} ({})".format(response['id'], info))
    else:
        LOG.error(response['message'])
        sys.exit(1)
def alert_no_internet_access(url='http://www.google.com'):
    """
    Alert if no internet access is available, delete the previous alert if it is.

    Command-line alternative (replace the url argument)::

        alerta send -r network -e InternetUnavailable -E Network \
            -S Network -s critical -t "Network not available." -v <url>
    """
    event = 'InternetUnavailable'
    resource = 'network'
    environment = 'Production'
    severity = 'critical'
    service = ['Network']
    text = 'Network not available.'
    value = url
    alert_desc = dict(resource=resource, event=event, environment=environment,
                      service=service, severity=severity, text=text, value=value)

    if not utils.internet_available(url):
        if not DRY_RUN:
            # send new alert
            alert = Alert(**alert_desc)
            result = api.send(alert)
            if result['status'] == 'error':
                print(result)
                # raise IndexError  # TODO: make more meaningful exception
        else:
            print(alert_desc)
    else:
        if not DRY_RUN:
            # delete existing alert
            query = dict(**alert_desc)
            for alert in api.get_alerts(query=query)['alerts']:
                api.delete_alert(alert['id'])
def receive_alert():
    recv_started = receive_timer.start_timer()
    try:
        incomingAlert = Alert.parse_alert(request.data)
    except ValueError as e:
        receive_timer.stop_timer(recv_started)
        return jsonify(status="error", message=str(e)), 400

    if g.get('customer', None):
        incomingAlert.customer = g.get('customer')

    if request.headers.getlist("X-Forwarded-For"):
        incomingAlert.attributes.update(ip=request.headers.getlist("X-Forwarded-For")[0])
    else:
        incomingAlert.attributes.update(ip=request.remote_addr)

    try:
        alert = process_alert(incomingAlert)
    except RejectException as e:
        receive_timer.stop_timer(recv_started)
        return jsonify(status="error", message=str(e)), 403
    except RuntimeWarning as e:
        receive_timer.stop_timer(recv_started)
        return jsonify(status="ok", id=incomingAlert.id, message=str(e)), 202
    except Exception as e:
        receive_timer.stop_timer(recv_started)
        return jsonify(status="error", message=str(e)), 500

    receive_timer.stop_timer(recv_started)

    if alert:
        body = alert.get_body()
        body['href'] = "%s/%s" % (request.base_url, alert.id)
        return jsonify(status="ok", id=alert.id, alert=body), 201, {'Location': '%s/%s' % (request.base_url, alert.id)}
    else:
        return jsonify(status="error", message="insert or update of received alert failed"), 500
def on_message(self, headers, body):
    LOG.debug("Received: %s", body)

    alert = Alert.parse_alert(body).get_body()
    if alert:
        LOG.info('%s : [%s] %s', alert['lastReceiveId'], alert['status'], alert['summary'])

        # TODO(nsatterl): is this still required?
        if 'tags' not in alert or not alert['tags']:
            # Kibana GUI borks if tags are null
            alert['tags'] = 'none'

        LOG.debug('alert last receivetime %s', alert['lastReceiveTime'])

        logstash = {
            '@message': alert['summary'],
            '@source': alert['resource'],
            '@source_host': 'not_used',
            '@source_path': alert['origin'],
            '@tags': alert['tags'],
            '@timestamp': json.dumps(alert['lastReceiveTime'], cls=DateEncoder),
            '@type': alert['type'],
            '@fields': str(alert)
        }
        LOG.debug('Index payload %s', logstash)

        index_url = "http://%s:%s/alerta/%s" % (CONF.es_host, CONF.es_port,
                                                'alerta-' + datetime.datetime.utcnow().strftime('%Y.%m.%d'))
        try:
            LOG.debug('Index URL: %s', index_url)
            response = urllib2.urlopen(index_url, json.dumps(logstash)).read()
        except Exception as e:
            LOG.error('%s : Alert indexing to %s failed - %s', alert['lastReceiveId'], index_url, e)
            return

        try:
            es_id = json.loads(response)['_id']
            LOG.info('%s : Alert indexed at %s/%s', alert['lastReceiveId'], index_url, es_id)
        except Exception:
            pass
def alert_python_sites_status():
    """
    Alert if any Python-related website infrastructure is not 'operational'.
    """
    statuses = utils.get_python_sites_status()
    for domain in statuses:
        event = 'NonOperational'
        resource = domain
        environment = 'Production'
        severity = 'normal'
        text = 'Nothing to report.'
        if statuses[domain] != 'Operational':
            severity = 'major'
            text = 'Something is wrong.'
        service = ['PythonCommunity']
        value = statuses[domain]
        alert_desc = dict(resource=resource, event=event, environment=environment,
                          service=service, severity=severity, text=text, value=value)

        if severity != 'normal':
            # send new alert
            alert = Alert(**alert_desc)
            result = api.send(alert)
            if result['status'] == 'error':
                print(result)
                # raise IndexError  # TODO: make more meaningful exception
        else:
            # delete existing alert
            del alert_desc['value']
            del alert_desc['severity']
            del alert_desc['text']
            query = dict(**alert_desc)
            for alert in api.get_alerts(query=query)['alerts']:
                api.delete_alert(alert['id'])
def parse_newrelic(alert):
    if 'version' not in alert:
        raise ValueError("New Relic Legacy Alerting is not supported")

    status = alert['current_state'].lower()
    if status == 'open':
        severity = alert['severity'].lower()
    elif status == 'acknowledged':
        severity = alert['severity'].lower()
        status = 'ack'
    elif status == 'closed':
        severity = 'ok'
    else:
        severity = alert['severity'].lower()

    return Alert(
        resource=alert['targets'][0]['name'],
        event=alert['condition_name'],
        environment='Production',
        severity=severity,
        status=status,
        service=[alert['account_name']],
        group=alert['targets'][0]['type'],
        text=alert['details'],
        tags=['%s:%s' % (key, value) for (key, value) in alert['targets'][0]['labels'].items()],
        attributes={
            'moreInfo': '<a href="%s" target="_blank">Incident URL</a>' % alert['incident_url'],
            'runBook': '<a href="%s" target="_blank">Runbook URL</a>' % alert['runbook_url']
        },
        origin='New Relic/v%s' % alert['version'],
        event_type=alert['event_type'].lower(),
        raw_data=alert)
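# Illustrative only: a minimal sketch of a New Relic-style webhook payload with
# made-up values, limited to the fields parse_newrelic() reads.
example_newrelic = {
    'version': '1.0',
    'current_state': 'open',       # 'open', 'acknowledged' or 'closed'
    'severity': 'CRITICAL',
    'condition_name': 'CPU usage > 90%',
    'account_name': 'example-account',
    'details': 'CPU usage is above 90% on web01',
    'targets': [{'name': 'web01', 'type': 'Server', 'labels': {'role': 'frontend'}}],
    'incident_url': 'https://alerts.newrelic.com/accounts/1/incidents/1',
    'runbook_url': 'https://wiki.example.com/runbooks/high-cpu',
    'event_type': 'INCIDENT',
}
# alert = parse_newrelic(example_newrelic)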
def alert_volume_not_existing(path):
    """
    Alert if a volume does not exist, delete the previous alert if it does.

    Command-line alternative (replace the path argument)::

        alerta send -r localhost -e VolumeUnavailable -E Localhost \
            -S Filesystem -s minor -t "Volume not available." -v <path>
    """
    event = 'VolumeUnavailable'
    resource = 'localhost'
    environment = 'Production'
    severity = 'minor'
    service = ['Filesystem']
    text = 'Volume not available.'
    value = path
    alert_desc = dict(resource=resource, event=event, environment=environment,
                      service=service, severity=severity, text=text, value=value)

    if not utils.volume_is_mounted(path):
        # send new alert
        alert = Alert(**alert_desc)
        result = api.send(alert)
        if result['status'] == 'error':
            print(result)
            # raise IndexError  # TODO: make more meaningful exception
    else:
        # delete existing alert
        query = dict(**alert_desc)
        for alert in api.get_alerts(query=query)['alerts']:
            api.delete_alert(alert['id'])
def on_message(self, headers, body):
    global tokens

    if tokens:
        _Lock.acquire()
        tokens -= 1
        _Lock.release()
        LOG.debug('Taken a token, there are only %d left', tokens)
    else:
        LOG.warning('%s : No tokens left, rate limiting this alert', 'FIXME')  # TODO(nsatterl): alert['lastReceiveId'])
        return

    LOG.debug("Received alert : %s", body)

    if headers['type'].endswith('Alert'):
        alert = Alert.parse_alert(body)
        if alert:
            LOG.info('%s : Send IRC message to %s', alert['lastReceiveId'], IRC_CHANNEL)
            shortid = alert['id'].split('-')[0]
            try:
                self.irc.send('PRIVMSG %s :%s [%s] %s\r\n' % (IRC_CHANNEL, shortid, alert['status'], alert['summary']))
            except Exception as e:
                LOG.error('%s : IRC send failed - %s', alert['lastReceiveId'], e)
def alert_no_vpn(**kwdict):
    """
    Alert if we don't use a VPN connection.
    """
    event = 'VPNDisconnected'
    resource = 'network'
    environment = 'Production'
    severity = 'critical'
    service = ['Network']
    text = 'VPN client is disconnected.'
    value = '?'
    alert_desc = dict(resource=resource, event=event, environment=environment,
                      service=service, severity=severity, text=text, value=value)

    if not utils.using_vpn(**kwdict):
        if not DRY_RUN:
            # send new alert
            alert = Alert(**alert_desc)
            result = api.send(alert)
            if result['status'] == 'error':
                print(result)
                # raise IndexError  # TODO: make more meaningful exception
        else:
            print(alert_desc)
    else:
        if not DRY_RUN:
            # delete existing alert
            query = dict(**alert_desc)
            for alert in api.get_alerts(query=query)['alerts']:
                api.delete_alert(alert['id'])
resource = check['resource']
correlate = _HTTP_ALERTS
group = 'Web'
environment = check['environment']
service = check['service']
text = text
tags = check.get('tags', list())
threshold_info = "%s : RT > %d RT > %d x %s" % (check['url'], warn_thold, crit_thold, check.get('count', 1))

urlmonAlert = Alert(
    resource=resource,
    event=event,
    correlate=correlate,
    group=group,
    value=value,
    severity=severity,
    environment=environment,
    service=service,
    text=text,
    event_type='serviceAlert',
    tags=tags,
    attributes={'thresholdInfo': threshold_info})
try:
    self.api.send(urlmonAlert)
except Exception as e:
    LOG.warning('Failed to send alert: %s', e)

self.queue.task_done()
LOG.info('%s check complete.', self.getName())

self.queue.task_done()
get_ipython().system(u' cd $ALERTA_TEST_DIR && ./miniconda2/bin/alerta --endpoint-url "http://localhost:8090" delete')


# ### Same Thing, Python style

# In[ ]:

from alerta.api import ApiClient
from alerta.alert import Alert

api = ApiClient(endpoint='http://localhost:8090')
alert = Alert(resource='localhost', event='VolUnavailable', service=['Filesystem'],
              environment='Production', value='ERROR', severity='minor')
res = api.send(alert)


# ## Custom Alerts

# ### Remember, you can do amazing stuff…

# In[ ]:

import utils
utils.volume_is_mounted('/Volumes/Intenso64')


# In[ ]:
def parse_snmptrap(data):
    pdu_data = data.splitlines()
    varbind_list = pdu_data[:]

    trapvars = dict()
    for line in pdu_data:
        if line.startswith('$'):
            special, value = line.split(None, 1)
            trapvars[special] = value
            varbind_list.pop(0)

    if '$s' in trapvars:
        if trapvars['$s'] == '0':
            version = 'SNMPv1'
        elif trapvars['$s'] == '1':
            version = 'SNMPv2c'
        elif trapvars['$s'] == '2':
            version = 'SNMPv2u'  # not supported
        else:
            version = 'SNMPv3'
        trapvars['$s'] = version
    else:
        LOG.warning('Failed to parse unknown trap type.')
        return

    # Get varbinds
    varbinds = dict()
    idx = 0
    for varbind in '\n'.join(varbind_list).split('~%~'):
        if varbind == '':
            break
        idx += 1
        try:
            oid, value = varbind.split(None, 1)
        except ValueError:
            oid = varbind
            value = ''
        varbinds[oid] = value
        trapvars['$' + str(idx)] = value  # $n
        LOG.debug('$%s %s', str(idx), value)

    trapvars['$q'] = trapvars['$q'].lstrip('.')  # if numeric, remove leading '.'
    trapvars['$#'] = str(idx)

    LOG.debug('varbinds = %s', varbinds)
    LOG.debug('version = %s', version)

    correlate = list()

    if version == 'SNMPv1':
        if trapvars['$w'] == '0':
            trapvars['$O'] = 'coldStart'
            correlate = ['coldStart', 'warmStart']
        elif trapvars['$w'] == '1':
            trapvars['$O'] = 'warmStart'
            correlate = ['coldStart', 'warmStart']
        elif trapvars['$w'] == '2':
            trapvars['$O'] = 'linkDown'
            correlate = ['linkUp', 'linkDown']
        elif trapvars['$w'] == '3':
            trapvars['$O'] = 'linkUp'
            correlate = ['linkUp', 'linkDown']
        elif trapvars['$w'] == '4':
            trapvars['$O'] = 'authenticationFailure'
        elif trapvars['$w'] == '5':
            trapvars['$O'] = 'egpNeighborLoss'
        elif trapvars['$w'] == '6':  # enterpriseSpecific(6)
            if trapvars['$q'].isdigit():  # XXX - specific trap number was not decoded
                trapvars['$O'] = '%s.0.%s' % (trapvars['$N'], trapvars['$q'])
            else:
                trapvars['$O'] = trapvars['$q']
    elif version == 'SNMPv2c':
        if 'coldStart' in trapvars['$2']:
            trapvars['$w'] = '0'
            trapvars['$W'] = 'Cold Start'
        elif 'warmStart' in trapvars['$2']:
            trapvars['$w'] = '1'
            trapvars['$W'] = 'Warm Start'
        elif 'linkDown' in trapvars['$2']:
            trapvars['$w'] = '2'
            trapvars['$W'] = 'Link Down'
        elif 'linkUp' in trapvars['$2']:
            trapvars['$w'] = '3'
            trapvars['$W'] = 'Link Up'
        elif 'authenticationFailure' in trapvars['$2']:
            trapvars['$w'] = '4'
            trapvars['$W'] = 'Authentication Failure'
        elif 'egpNeighborLoss' in trapvars['$2']:
            trapvars['$w'] = '5'
            trapvars['$W'] = 'EGP Neighbor Loss'
        else:
            trapvars['$w'] = '6'
            trapvars['$W'] = 'Enterprise Specific'
        trapvars['$O'] = trapvars['$2']  # SNMPv2-MIB::snmpTrapOID.0

    LOG.debug('trapvars = %s', trapvars)

    LOG.info('%s-Trap-PDU %s from %s at %s %s', version, trapvars['$O'], trapvars['$B'], trapvars['$x'], trapvars['$X'])

    if trapvars['$B'] != '<UNKNOWN>':
        resource = trapvars['$B']
    elif trapvars['$A'] != '0.0.0.0':
        resource = trapvars['$A']
    else:
        m = re.match(r'UDP: \[(\d+\.\d+\.\d+\.\d+)\]', trapvars['$b'])
        if m:
            resource = m.group(1)
        else:
            resource = '<NONE>'

    # Defaults
    event = trapvars['$O']
    severity = 'normal'
    group = 'SNMP'
    value = trapvars['$w']
    text = trapvars['$W']
    environment = 'Production'
    service = ['Network']
    attributes = {'source': trapvars['$B']}
    tags = [version]
    timeout = None
    create_time = datetime.datetime.strptime('%sT%s.000Z' % (trapvars['$x'], trapvars['$X']), '%Y-%m-%dT%H:%M:%S.%fZ')

    snmptrapAlert = Alert(
        resource=resource,
        event=event,
        correlate=correlate,
        group=group,
        value=value,
        severity=severity,
        environment=environment,
        service=service,
        text=text,
        event_type='snmptrapAlert',
        attributes=attributes,
        tags=tags,
        timeout=timeout,
        create_time=create_time,
        raw_data=data,
    )

    SnmpTrapHandler.translate_alert(snmptrapAlert, trapvars)

    if snmptrapAlert.get_type() == 'Heartbeat':
        snmptrapAlert = Heartbeat(origin=snmptrapAlert.origin, tags=[__version__], timeout=snmptrapAlert.timeout)

    return snmptrapAlert
def parse_syslog(self, addr, data):
    LOG.debug('Parsing syslog message...')
    syslogAlerts = list()

    event = None
    resource = None

    for msg in data.split('\n'):
        # NOTE: if syslog msgs aren't being split on newlines and #012 appears instead then
        # try adding "$EscapeControlCharactersOnReceive off" to rsyslog.conf
        if not msg or 'last message repeated' in msg:
            continue

        if re.match(r'<\d+>1', msg):
            # Parse RFC 5424 compliant message
            m = re.match(r'<(\d+)>1 (\S+) (\S+) (\S+) (\S+) (\S+) (.*)', msg)
            if m:
                PRI = int(m.group(1))
                ISOTIMESTAMP = m.group(2)
                HOSTNAME = m.group(3)
                APPNAME = m.group(4)
                PROCID = m.group(5)
                MSGID = m.group(6)
                TAG = '%s[%s] %s' % (APPNAME, PROCID, MSGID)
                MSG = m.group(7)
                LOG.info("Parsed RFC 5424 message OK")
            else:
                LOG.error("Could not parse RFC 5424 syslog message: %s", msg)
                continue
        elif re.match(r'<(\d{1,3})>\S{3}\s', msg):
            # Parse RFC 3164 compliant message
            m = re.match(r'<(\d{1,3})>\S{3}\s{1,2}\d?\d \d{2}:\d{2}:\d{2} (\S+)( (\S+):)? (.*)', msg)
            if m:
                PRI = int(m.group(1))
                HOSTNAME = m.group(2)
                TAG = m.group(4)
                MSG = m.group(5)
                LOG.info("Parsed RFC 3164 message OK")
            else:
                LOG.error("Could not parse RFC 3164 syslog message: %s", msg)
                continue
        elif re.match(r'<\d+>.*%[A-Z0-9_-]+', msg):
            # Parse Cisco Syslog message
            m = re.match(r'<(\d+)>.*(%([A-Z0-9_-]+)):? (.*)', msg)
            if m:
                LOG.debug(m.groups())
                PRI = int(m.group(1))
                CISCO_SYSLOG = m.group(2)
                try:
                    CISCO_FACILITY, CISCO_SEVERITY, CISCO_MNEMONIC = m.group(3).split('-')
                except ValueError as e:
                    LOG.error('Could not parse Cisco syslog - %s: %s', e, m.group(3))
                    CISCO_FACILITY = CISCO_SEVERITY = CISCO_MNEMONIC = 'na'
                TAG = CISCO_MNEMONIC
                MSG = m.group(4)
                event = CISCO_SYSLOG
                # replace IP address with a hostname, if necessary
                try:
                    socket.inet_aton(addr)
                    (resource, _, _) = socket.gethostbyaddr(addr)
                except (socket.error, socket.herror):
                    resource = addr
                resource = '%s:%s' % (resource, CISCO_FACILITY)
            else:
                LOG.error("Could not parse Cisco syslog message: %s", msg)
                continue

        facility, level = decode_priority(PRI)

        # Defaults
        event = event or '%s%s' % (facility.capitalize(), level.capitalize())
        resource = resource or '%s%s' % (HOSTNAME, ':' + TAG if TAG else '')
        severity = priority_to_code(level)
        group = 'Syslog'
        value = level
        text = MSG
        environment = 'Production'
        service = ['Platform']
        tags = ['%s.%s' % (facility, level)]
        correlate = list()
        timeout = None
        raw_data = msg

        syslogAlert = Alert(
            resource=resource,
            event=event,
            correlate=correlate,
            group=group,
            value=value,
            severity=severity,
            environment=environment,
            service=service,
            text=text,
            event_type='syslogAlert',
            tags=tags,
            timeout=timeout,
            raw_data=raw_data,
        )

        suppress = False
        try:
            suppress = Transformers.normalise_alert(syslogAlert, facility=facility, level=level)
        except RuntimeWarning:
            pass
        if suppress:
            LOG.info('Suppressing %s.%s alert', facility, level)
            LOG.debug('%s', syslogAlert)
            continue

        if syslogAlert.get_type() == 'Heartbeat':
            syslogAlert = Heartbeat(origin=syslogAlert.origin, timeout=syslogAlert.timeout)

        syslogAlerts.append(syslogAlert)

    return syslogAlerts
def heartbeats(self, args):
    response = self._heartbeats()
    heartbeats = response['heartbeats']

    print('{:<28} {:<26} {:<19} {:>8} {:7} {}'.format('ORIGIN', 'TAGS', 'CREATED', 'LATENCY', 'TIMEOUT', 'SINCE'))

    for heartbeat in heartbeats:
        hb = HeartbeatDocument.parse_heartbeat(heartbeat)
        latency = (hb.receive_time - hb.create_time).microseconds / 1000
        since = datetime.utcnow() - hb.receive_time
        since = since - timedelta(microseconds=since.microseconds)

        latency_exceeded = latency > MAX_LATENCY
        timeout_exceeded = since.seconds > hb.timeout

        print('{:<28} {:<26} {} {}{:6}ms {:6}s {}{}'.format(
            hb.origin,
            ' '.join(hb.tags),
            hb.get_date('create_time', 'local', args.timezone),
            '*' if latency_exceeded else ' ',
            latency,
            hb.timeout,
            '*' if timeout_exceeded else ' ',
            since))

        if args.alert:
            if timeout_exceeded:
                alert = Alert(
                    resource=hb.origin,
                    event='HeartbeatFail',
                    correlate=['HeartbeatFail', 'HeartbeatSlow', 'HeartbeatOK'],
                    group='System',
                    environment='Production',
                    service=['Alerta'],
                    severity='major',
                    value='{}'.format(since),
                    text='Heartbeat not received in {} seconds'.format(hb.timeout),
                    tags=hb.tags,
                    type='heartbeatAlert')
            elif latency_exceeded:
                alert = Alert(
                    resource=hb.origin,
                    event='HeartbeatSlow',
                    correlate=['HeartbeatFail', 'HeartbeatSlow', 'HeartbeatOK'],
                    group='System',
                    environment='Production',
                    service=['Alerta'],
                    severity='major',
                    value='{}ms'.format(latency),
                    text='Heartbeat took more than {}ms to be processed'.format(MAX_LATENCY),
                    tags=hb.tags,
                    type='heartbeatAlert')
            else:
                alert = Alert(
                    resource=hb.origin,
                    event='HeartbeatOK',
                    correlate=['HeartbeatFail', 'HeartbeatSlow', 'HeartbeatOK'],
                    group='System',
                    environment='Production',
                    service=['Alerta'],
                    severity='normal',
                    value='',
                    text='Heartbeat OK',
                    tags=hb.tags,
                    type='heartbeatAlert')

            self.send(alert)
from alerta.alert import Alert, severity

LOG = logging.getLogger('alerta')
LOG = logging.getLogger(__name__)
CONF = config.CONF
print(CONF)

config.parse_args(['--use-stderr', '--debug'])
logging.setup('alerta')

db = Mongo()
# print(db.save_alert({''}))

alert3 = Alert('router55', 'Node_Down',
               severity=severity.INDETERMINATE,
               value='FAILED',
               timeout=600,
               environment=['PROD'],
               receive_time="2013-02-23T09:18:05.303Z",
               last_receive_time="2013-02-23T09:18:05.303Z",
               service=['Network', 'Common'],
               tags=['london', 'location:london', 'dc:location=london'],
               text="Node is not responding via ping.",
               origin="test3",
               correlate=['Node_Up', 'Node_Down'],
               event_type='myAlert',
               trend_indication='moreSevere')

print(alert3)
print(alert3.get_id())
print(alert3.get_header())
print(alert3.get_body())

print('Saving alert...')
print(db.save_alert(alert3))

print('Get alert...')
print(db.get_alert('PROD', 'router55', 'Node_Down', 'Indeterminate'))