def delete_incident(incident_id, system_id, **kwargs):
    """
    Removes an incident that belongs to an active system.

    @param incident_id: The id of the incident to be deleted
    @type incident_id: str
    @param system_id: Id of the system the incident is defined in
    @type system_id: str
    @param kwargs: Extra key-value arguments; accepted but not used by this function
    @return: Response code dictionary: '800.200.001' when the incident was deleted,
        '800.400.002' when no active system matches, '800.400.001' otherwise
    @rtype: dict
    """
    try:
        active_system = SystemService().filter(
            pk=system_id, state__name='Active').first()
        if active_system is None:
            return {"code": "800.400.002"}
        target = IncidentService().filter(
            pk=incident_id, system=active_system).first()
        if target and target.delete():
            return {
                'code': '800.200.001',
                'Message': 'Incident deleted successfully'
            }
    except Exception as ex:
        lgr.exception("Incident Delete exception %s" % ex)
    # Reached when the incident is missing, delete() reports failure or an error was logged.
    return {"code": "800.400.001"}
def get_events(system_id):
    """
    Lists the active events logged for an active system, newest first.

    @param system_id: Id of the system
    @type system_id: str
    @return: Response code dictionary; on success the 'data' key holds the list of events
    @rtype: dict
    """
    try:
        active_system = SystemService().get(pk=system_id, state__name='Active')
        if not active_system:
            return {'code': '800.400.200'}
        event_rows = EventService().filter(
            system=active_system, state__name='Active').values(
            'id', 'date_created', 'interface', 'method', 'request',
            'response', 'stack_trace', 'description', 'code',
            status=F('state__name'), system_name=F('system__name'),
            eventtype=F('event_type__name'))
        newest_first = event_rows.order_by('-date_created')
        return {'code': '800.200.001', 'data': list(newest_first)}
    except Exception as ex:
        lgr.exception("Get events Exception %s" % ex)
    return {'code': '800.400.001'}
def test_get(self):
    """ Checks that the System get service finds a stored system by its name """
    system_name = 'Helaplan'
    mixer.blend('core.System', name=system_name)
    fetched = SystemService().get(name=system_name)
    assert fetched is not None, 'Should have a System object'
def test_update(self):
    """ Checks that the System update service applies a new name """
    new_name = "Helaplan"
    stored = mixer.blend('core.System')
    updated = SystemService().update(stored.id, name=new_name)
    assert updated.name == new_name, 'Should have the same name'
def test_filter(self):
    """ Checks that the System filter service returns every stored system """
    expected_count = 3
    mixer.cycle(expected_count).blend('core.System')
    found = SystemService().filter()
    assert len(found) == expected_count, 'Should have 3 System objects'
def get_endpoints(system):
    """
    Retrieves the endpoints configured for an active system.

    @param system: Id of the system where the endpoints are configured
    @type system: str
    @return: dictionary containing a response code and, on success, a 'data' list of
        dictionaries with the endpoint fields
    @rtype: dict
    """
    try:
        # Resolve the lookup to a single instance (as the other interfaces do): an
        # unsliced queryset is not a valid value for the exact `system=` lookup below.
        system = SystemService().filter(pk=system, state__name='Active').first()
        if system is None:
            return {'code': '800.400.002', 'message': 'Invalid parameters'}
        endpoints = list(EndpointService().filter(system=system).values(
            'id', 'name', 'description', 'url', 'optimal_response_time',
            'date_created', 'date_modified',
            system_name=F('system__name'),
            type=F('endpoint_type__name'),
            state_name=F('state__name')))
        return {'code': '800.200.001', 'data': endpoints}
    except Exception as ex:
        lgr.exception("Endpoint Administration exception: %s" % ex)
    return {
        'code': '800.400.001',
        "message": "Error. Could not retrieve endpoints"
    }
def delete_rule(rule_id, system_id, **kwargs):
    """
    Removes an escalation rule that belongs to an active system.

    @param rule_id: The id of the rule to be deleted
    @type rule_id: str
    @param system_id: Id of the system the escalation rule is defined in
    @type system_id: str
    @param kwargs: Extra key-value arguments; accepted but not used by this function
    @return: Response code dictionary: '800.200.001' when the rule was deleted,
        '800.400.002' when the system or rule is not found, '800.400.001' otherwise
    @rtype: dict
    """
    try:
        active_system = SystemService().filter(
            pk=system_id, state__name='Active').first()
        rule = EscalationRuleService().filter(
            pk=rule_id, system=active_system).first()
        if active_system is None:
            return {"code": "800.400.002"}
        if rule is None:
            return {"code": "800.400.002"}
        deleted = rule.delete()
        if deleted:
            return {
                'code': '800.200.001',
                'Message': 'Rule deleted successfully'
            }
    except Exception as ex:
        lgr.exception("Delete Escalation Rule exception %s" % ex)
    return {"code": "800.400.001"}
def get_rules(system_id, **kwargs):
    """
    Lists every escalation rule of an active system, newest first.

    @param system_id: Id of the system where the rules are defined
    @type system_id: str | None
    @param kwargs: Extra key-value arguments; accepted but not used by this function
    @return: Response code dictionary; on success 'data' holds the rules with their
        duration expressed in seconds
    @rtype: dict
    """
    try:
        active_system = SystemService().filter(
            pk=system_id, state__name='Active').first()
        if active_system is None:
            return {"code": "800.400.002"}
        rule_query = EscalationRuleService().filter(system=active_system).values(
            'id', 'name', 'description', 'duration', 'date_created',
            'date_modified', 'nth_event',
            system_id=F('system'),
            escalation_level_name=F('escalation_level__name'),
            state_name=F('state__name'),
            event_type_name=F('event_type__name'))
        rule_rows = list(rule_query.order_by('-date_created'))
        for row in rule_rows:
            # Replace the timedelta with its length in seconds.
            seconds = timedelta.total_seconds(row.get('duration'))
            row.update(duration=seconds)
        return {'code': '800.200.001', 'data': rule_rows}
    except Exception as ex:
        lgr.exception("Get Escalation Rules exception %s" % ex)
    return {"code": "800.400.001"}
def create_rule(name, description, system, event_type, nth_event,
                escalation_level, duration, **kwargs):
    """
    Creates an escalation rule for an active system.

    @param name: Name of the escalation rule to be created
    @type name: str
    @param description: Details on the escalation rule
    @type description: str
    @param system: Id of the system the escalation rule will be applied in
    @type system: str
    @param event_type: Id of the type of event(s) affected by the rule
    @type event_type: str
    @param nth_event: Number of events of the given type that must be logged to raise
        an escalation; converted with int()
    @type nth_event: str
    @param escalation_level: Id of the escalation level the rule escalates to
    @type escalation_level: str
    @param duration: Time period, in seconds, within which the events must occur
    @type duration: int
    @param kwargs: Extra key-value arguments; accepted but not used by this function
    @return: Response code dictionary; on success 'data' holds the created rule with
        its duration expressed in seconds
    @rtype: dict
    """
    try:
        target_system = SystemService().get(pk=system, state__name="Active")
        level = EscalationLevelService().get(
            pk=escalation_level, state__name="Active")
        kind = EventTypeService().get(pk=event_type, state__name='Active')
        if target_system is None or level is None or kind is None:
            return {"code": "800.400.002"}
        rule_fields = dict(
            name=name, description=description, system=target_system,
            nth_event=int(nth_event),
            duration=timedelta(seconds=duration),
            state=StateService().get(name='Active'),
            escalation_level=level, event_type=kind)
        created = EscalationRuleService().create(**rule_fields)
        if created is None:
            return {"code": "800.400.001"}
        # Read the rule back in the same shape the rule listing uses.
        summary = EscalationRuleService().filter(
            pk=created.id, system=target_system).values(
            'id', 'name', 'description', 'duration', 'date_created',
            'date_modified', 'nth_event',
            system_id=F('system'),
            escalation_level_name=F('escalation_level__name'),
            state_name=F('state__name'),
            event_type_name=F('event_type__name')).first()
        summary.update(
            duration=timedelta.total_seconds(summary.get('duration')))
        return {'code': '800.200.001', 'data': summary}
    except Exception as ex:
        lgr.exception("Escalation Rule Creation exception %s" % ex)
    return {"code": "800.400.001"}
def get_event(event_id, system_id):
    """
    Fetches a single active event logged for an active system.

    @param event_id: Id of the event
    @type event_id: str
    @param system_id: Id of the system
    @type system_id: str
    @return: Response code dictionary; on success 'data' holds the event fields
    @rtype: dict
    """
    try:
        active_system = SystemService().get(pk=system_id, state__name='Active')
        matching = EventService().filter(
            pk=event_id, system=active_system, state__name='Active')
        record = matching.values(
            'id', 'date_created', 'interface', 'method', 'request',
            'response', 'stack_trace', 'description', 'code',
            status=F('state__name'), system_name=F('system__name'),
            eventtype=F('event_type__name')).first()
        if not (active_system is not None and record is not None):
            return {'code': '800.400.200', 'event': str(event_id)}
        return {'code': '800.200.001', 'data': record}
    except Exception as ex:
        lgr.exception("Get event Exception %s" % ex)
    return {'code': '800.400.001'}
def create_endpoint(name, description, url, system_id, color, response_time,
                    endpoint_type_id, state_id):
    """
    Registers a new endpoint under an active system.

    @param name: name of the endpoint to be created
    @type name: str
    @param description: description of the endpoint to be created
    @type description: str
    @param url: url of the endpoint to be created
    @type url: str
    @param system_id: id of the system the endpoint will belong to
    @type system_id: int
    @param color: color the line graph will use when plotting
    @type color: str
    @param response_time: optimal response time of the endpoint; converted with int()
        and stored as a number of seconds
    @type response_time: int
    @param endpoint_type_id: id of the endpoint type the endpoint will belong to
    @type endpoint_type_id: int
    @param state_id: id of the initial state of the created endpoint
    @type state_id: int
    @return: Response code dictionary to indicate if the endpoint was created or not
    @rtype: dict
    """
    try:
        system = SystemService().get(id=system_id, state__name="Active")
        endpoint_type = EndpointTypeService().get(
            id=endpoint_type_id, state__name="Active")
        state = StateService().get(id=state_id)
        required = (system, endpoint_type, state, name, description,
                    response_time, url)
        if not all(required):
            return {"code": "800.400.002", "message": "Missing parameters"}
        # The name is only checked when no endpoint of the system already uses the url.
        duplicates = EndpointService().filter(system=system, url=url) \
            or EndpointService().filter(system=system, name=name)
        if duplicates:
            return {
                "code": "800.400.001",
                "message": "An endpoint with this url or name exists"
            }
        optimal = datetime.timedelta(seconds=int(response_time))
        endpoint = EndpointService().create(
            name=name, description=description, url=url, system=system,
            endpoint_type=endpoint_type, color=color,
            optimal_response_time=optimal, state=state)
        return {
            "code": "800.200.001",
            "message": "successfully created endpoint: %s" % endpoint.name
        }
    except Exception as ex:
        lgr.exception("Endpoint Administration exception: %s" % ex)
    return {
        "code": "800.400.001",
        "message": "Error when creating an endpoint"
    }
def send_notification(message, message_type, recipients, system_id):
    """
    Creates a notification record for every recipient and marks it as sent.

    @param message: the content to be sent
    @type message: str
    @param message_type: name of the notification type
    @type message_type: str
    @param recipients: a list containing either emails or phone numbers, depending on
        the message type
    @type recipients: list
    @param system_id: id of the system the notification is created from
    @type system_id: str
    @return: Response code dictionary: '800.400.002' when the message, its type or the
        recipients are missing, '200.400.005' when a notification could not be created,
        '800.200.001' on success and '800.400.001' on an unexpected error
    @rtype: dict
    """
    # All three inputs are required, so reject the call when any one of them is empty.
    if not (recipients and message and message_type):
        return {"code": "800.400.002"}
    try:
        # These lookups do not depend on the recipient, so resolve them once.
        notification_type = NotificationTypeService().get(name=message_type)
        system = SystemService().get(pk=system_id)
        active_state = StateService().get(name='Active')
        for recipient in recipients:
            data = NotificationService().create(
                message=message, notification_type=notification_type,
                recipient=recipient, system=system, state=active_state)
            if data is None:
                return {"code": "200.400.005"}
            now = datetime.now()
            message_data = {
                "destination": data.recipient,
                "message_type": data.notification_type.name,
                "lang": None,
                "corporate_id": None,
                "message_code": 'HPS0006',
                "replace_tags": {
                    "code": None,
                    'corporate': None,
                    'date': now.strftime('%d/%m/%y'),
                    'time': now.time().strftime('%I:%M%p')
                }
            }
            # TODO: call the notification API and check whether it returns a success
            # code. Until then the payload is always truthy, so the failure branch is
            # kept only for when the real call is wired in.
            if message_data:
                NotificationService().update(
                    data.id, state=StateService().get(name='Sent'))
            else:
                data = NotificationService().update(
                    data.id, state=StateService().get(name='Failed'))
                lgr.warning("Message sending failed: %s" % data)
        return {"code": "800.200.001", "message": message}
    except Exception as e:
        lgr.exception("Notification logger exception %s" % e)
    return {
        "code": "800.400.001",
        "message": "error in sending notification interface"
    }
def get_incidents(system, incident_type=None, **kwargs):
    """
    Lists the incidents of an active system together with their update logs.

    @param system: Id of the system where the incidents were created
    @type system: str
    @param incident_type: Name of the incident type to narrow the result to
    @type incident_type: str
    @param kwargs: Extra key-value arguments; 'states' may carry state names to filter by
    @return: Response code dictionary; on success 'data' holds the incidents, each with
        an 'incident_updates' list
    @rtype: dict
    """
    try:
        active_system = SystemService().get(pk=system, state__name='Active')
        type_record = IncidentTypeService().get(
            name=incident_type, state__name='Active')
        if not active_system:
            return {'code': '800.400.002'}
        wanted_states = kwargs.get('states', None)
        if wanted_states is not None:
            candidates = IncidentService().filter(state__name__in=wanted_states)
        else:
            candidates = IncidentService().filter()
        # The type filter applies only when the named type resolved to a record.
        if type_record:
            candidates = candidates.filter(incident_type=type_record)
        incident_rows = list(
            candidates.filter(system=active_system).values(
                'id', 'name', 'description', 'priority_level', 'date_created',
                'date_modified', 'scheduled_for', 'scheduled_until',
                system_id=F('system__id'),
                incident_type_name=F('incident_type__name'),
                state_id=F('state__id'), state_name=F('state__name'),
                system_name=F('system__name'),
                event_type_id=F('event_type__id')).order_by('-date_created'))
        for row in incident_rows:
            log_query = IncidentLogService().filter(
                incident__id=row.get('id')).values(
                'id', 'description', 'priority_level', 'date_created',
                'date_modified',
                user_id=F('user__id'), username=F('user__username'),
                escalation_level_id=F('escalation_level__id'),
                state_name=F('state__name'), state_id=F('state__id'))
            row['incident_updates'] = list(log_query.order_by('-date_created'))
        return {'code': '800.200.001', 'data': incident_rows}
    except Exception as ex:
        lgr.exception("Get incidents exception %s" % ex)
    return {'code': '800.400.001'}
def get_system_recipient(user_id, system_id):
    """
    Fetches a user's recipient record for a system with all its escalation settings.

    @param user_id: the id of the recipient
    @type user_id: str
    @param system_id: the id of the system the recipient belongs to
    @type system_id: str
    @return: a dictionary containing a response code and the system recipient data; an
        'escalationLevels' list carries one entry per stored system recipient row
    @rtype: dict
    """
    try:
        system = SystemService().get(id=system_id)
        user = User.objects.get(id=user_id)
        if not system or not user:
            return {"code": "800.400.002", "message": "missing parameters"}
        summary = SystemRecipientService().filter(
            system=system, recipient=user).values(
            userName=F('recipient__username'),
            recipientId=F('recipient'),
            systemRecipientId=F('id')).first()
        rows = list(SystemRecipientService().filter(
            system=system, recipient=user).values(
            'state',
            userName=F('recipient__username'),
            notificationType=F('notification_type'),
            systemRecipientId=F('id'),
            escalationLevel=F('escalation_level')))
        if not summary:
            return {
                'code': '800.200.001',
                'data': 'There is no such system recipient'
            }
        levels = [{
            'escalation_level_id': row.get('escalationLevel'),
            'notification_type_id': row.get('notificationType'),
            'state_id': row.get('state'),
            'system_recipient_id': row.get('systemRecipientId')
        } for row in rows]
        summary.update(escalationLevels=levels)
        return {'code': '800.200.001', 'data': summary}
    except Exception as ex:
        lgr.exception("Recipient Administration Exception: %s" % ex)
    return {
        "code": "800.400.001",
        "message": "Error while fetching recipient"
    }
def dashboard_widgets_data(system, date_from=None, date_to=None):
    """
    Counts events and incidents of an active system created within a date window.

    Records are matched with date_created <= date_from and date_created >= date_to.
    When either date is missing the window runs from the start of the current day to
    the start of the next day.

    @param system: Id of the system the figures are computed for
    @type system: str
    @param date_from: Upper date limit applied
    @type date_from: str | None
    @param date_to: Lower date limit applied
    @type date_to: str | None
    @return: Response code dictionary; on success 'data' holds the four counters
    @rtype: dict
    """
    try:
        system = SystemService().get(pk=system, state__name='Active')
        if not system:
            return {'code': '800.400.002'}
        if date_from and date_to:
            date_from = dateutil.parser.parse(date_from)
            date_to = dateutil.parser.parse(date_to)
        else:
            one_day = timedelta(days=1)
            date_from = datetime.combine(
                datetime.now(), datetime.min.time()) + one_day
            date_to = date_from - timedelta(days=1)
        period = dict(date_created__lte=date_from, date_created__gte=date_to)
        reported = EventService().filter(system=system, **period).count()
        still_open = IncidentService().filter(
            system=system, incident_type__name='Realtime',
            **period).exclude(state__name='Resolved').count()
        resolved = IncidentService().filter(
            system=system, incident_type__name='Realtime',
            state__name='Resolved', **period).count()
        scheduled = IncidentService().filter(
            system=system, incident_type__name='Scheduled',
            **period).exclude(state__name='Completed').count()
        return {
            'code': '800.200.001',
            'data': {
                'reported_events': reported,
                'open_incidents': still_open,
                'closed_incidents': resolved,
                'scheduled_incidents': scheduled
            }
        }
    except Exception as ex:
        lgr.exception("Get incidents exception %s" % ex)
    return {'code': '800.400.001'}
def test_create(self):
    """ Checks that the System create service stores a system with the given name """
    system_name = 'Helaplan'
    fields = {
        'name': system_name,
        'state': mixer.blend('base.State'),
        'admin': mixer.blend(User),
    }
    created = SystemService().create(**fields)
    assert created is not None, 'Should have a System object'
    assert created.name == system_name, 'Created System name is equals to Helaplan'
def create_system_recipient(system_id, user_id, escalations):
    """
    Registers a user as a recipient of a system for each given escalation setting.

    @param system_id: The id of the system the recipient will belong to
    @type system_id: str
    @param user_id: The id of the recipient
    @type user_id: str
    @param escalations: A list of dictionaries, each carrying a 'NotificationType' id
        and an 'EscalationLevel' id
    @type escalations: list
    @return: a dictionary containing a response code and a message indicating a success
        or failure in creation
    @rtype: dict
    """
    try:
        system = SystemService().get(id=system_id)
        user = User.objects.get(id=user_id)
        if not system or not user or not escalations:
            return {
                "code": "800.400.002",
                "message": "Invalid parameters given"
            }
        for entry in escalations:
            level_id = entry.get('EscalationLevel')
            already_registered = SystemRecipientService().filter(
                system=system, recipient=user,
                escalation_level=EscalationLevelService().get(id=level_id))
            # Stops at the first duplicate; entries handled before it stay created.
            if already_registered:
                return {
                    "code": "800.400.001",
                    "message": "system recipient already exist consider updating the recipient"
                }
            SystemRecipientService().create(
                system=system, recipient=user,
                escalation_level=EscalationLevelService().get(id=level_id),
                notification_type=NotificationTypeService().get(
                    id=entry.get('NotificationType')),
                state=StateService().get(name='Active'))
        return {
            "code": "800.200.001",
            "message": " successfully created a system recipient"
        }
    except Exception as ex:
        lgr.exception("Recipient Administration Exception: %s" % ex)
    return {
        "code": "800.400.001",
        "message": "Error while creating a system recipient"
    }
def get_look_up_data():
    """
    Collects the id/name (id/username for users) pairs used to fill look-up lists.

    @return: a dictionary containing a response code and, on success, a 'data'
        dictionary with one list of look-up entries per category
    @rtype: dict
    """
    def id_and_name(queryset):
        # Reduce a queryset to a plain list of {'id', 'name'} dictionaries.
        return list(queryset.values('id', 'name'))

    try:
        states = id_and_name(StateService().filter())
        notification_types = id_and_name(NotificationTypeService().filter())
        escalation_levels = id_and_name(EscalationLevelService().filter())
        event_types = id_and_name(EventTypeService().filter())
        endpoint_types = id_and_name(EndpointTypeService().filter())
        incident_types = id_and_name(IncidentTypeService().filter())
        users = list(User.objects.all().values('id', 'username'))
        systems = id_and_name(SystemService().filter())

        endpoint_names = (
            Q(name='Operational') | Q(name='Minor Outage')
            | Q(name='Major Outage') | Q(name='Under Maintenance')
            | Q(name='Degraded Performance'))
        endpoint_states = id_and_name(StateService().filter(endpoint_names))

        realtime_names = (
            Q(name='Investigating') | Q(name='Identified')
            | Q(name='Monitoring') | Q(name='Resolved'))
        realtime_incident_states = id_and_name(
            StateService().filter(realtime_names))

        scheduled_names = (
            Q(name='Scheduled') | Q(name='InProgress') | Q(name='Completed'))
        scheduled_incident_states = id_and_name(
            StateService().filter(scheduled_names))

        return {
            "code": "800.200.001",
            "data": {
                'states': states,
                'incident_types': incident_types,
                'escalation_levels': escalation_levels,
                'notification_types': notification_types,
                'endpoint_types': endpoint_types,
                'event_types': event_types,
                'users': users,
                'systems': systems,
                'realtime_incident_states': realtime_incident_states,
                'endpoint_states': endpoint_states,
                'scheduled_incident_states': scheduled_incident_states
            }
        }
    except Exception as ex:
        lgr.exception("Look up interface Exception: %s" % ex)
        return {
            "code": "800.400.001",
            "message": "Error while fetching data %s" % str(ex)
        }
def get_incident_events(incident_id, system_id, **kwargs):
    """
    Lists the active events linked to an incident of an active system.

    @param incident_id: The id of the incident
    @type incident_id: str
    @param system_id: Id of the system the incident was created in
    @type system_id: str
    @param kwargs: Extra key-value arguments; accepted but not used by this function
    @return: Response code dictionary; on success 'data' holds the incident-event links,
        each carrying the event details under 'incident_event'
    @rtype: dict
    """
    from api.backend.interfaces.event_administration import EventLog
    try:
        active_system = SystemService().filter(
            pk=system_id, state__name='Active').first()
        incident = IncidentService().filter(
            pk=incident_id, system=active_system).first()
        if not (active_system is not None and incident is not None):
            return {"code": "800.400.002"}
        links = list(IncidentEventService().filter(
            incident=incident, state__name='Active').values(
            incident_id=F('incident'), status=F('state__name'),
            event_id=F('event')).order_by('-date_created'))
        events = []
        for link in links:
            lookup = EventLog.get_event(link.get('event_id'), active_system.id)
            if lookup.get('code') == '800.200.001':
                # Only links whose event could be fetched are part of the result.
                link.update(incident_event=lookup.get('data'))
                events.append(link)
            else:
                lgr.error('Event get Failed')
        return {'code': '800.200.001', 'data': events}
    except Exception as ex:
        lgr.exception("Get Incident Event exception %s" % ex)
    return {"code": "800.400.001"}
def availability_trend(system, interval):
    """
    Builds the availability percentage trend of an active system as graph data points.

    The period is walked backwards from the current UTC time: 'day' yields 24 points
    of one hour, 'week' 7 points of 24 hours and 'month' 30 points of 24 hours. Each
    point is computed by DashboardAdministration.calculate_system_availability.

    @param system: Id of the system whose availability percentage is to be computed
    @type system: str
    @param interval: Time interval to be applied: 'day', 'week' or 'month'
    @type interval: str
    @return: Response code dictionary; on success 'data' holds a single-element list
        with the series to be plotted
    @rtype: dict
    """
    try:
        system = SystemService().get(pk=system, state__name='Active')
        # Both the system and the interval are required.
        if not system or not interval:
            return {
                'code': '800.400.002',
                'message': 'Missing or invalid parameters'
            }
        if interval == 'day':
            time_intervals, interval_length = 24, 1
        elif interval == 'week':
            time_intervals, interval_length = 7, 24
        elif interval == 'month':
            time_intervals, interval_length = 30, 24
        else:
            return {
                'code': '800.400.002',
                'message': 'Invalid time interval'
            }
        today = datetime.now(timezone.utc)
        series = []
        for i in range(1, time_intervals + 1):
            past_interval = today - timedelta(hours=i * interval_length)
            current_interval = past_interval + timedelta(hours=interval_length)
            availability_percentage_result = DashboardAdministration.calculate_system_availability(
                system=system.id,
                date_from=current_interval.isoformat(),
                date_to=past_interval.isoformat())
            if availability_percentage_result.get('code') != '800.200.001':
                # Abort the whole trend when one data point cannot be computed.
                return {
                    'code': '800.400.001',
                    'message': availability_percentage_result
                }
            availability_percentage = availability_percentage_result.get(
                'data').get('uptime_percentage')
            # The point is labelled with the following hour, minutes set to zero.
            current_interval = (
                current_interval + timedelta(hours=1)).replace(minute=0)
            series.append(
                dict(value=availability_percentage, name=current_interval))
        datasets = [{
            "name": 'Availability Percentage Trend',
            "color": '#008000',
            "series": series,
            'time_intervals': time_intervals,
            'identifier': interval,
            "yAxisValue": "Availability Trend in Percentage"
        }]
        return {'code': '800.200.001', 'data': datasets}
    except Exception as ex:
        lgr.exception("Get uptime trend data exception %s" % ex)
        return {
            'code': '800.400.001',
            'msg': 'Error. Could not retrieve system up time trend data %s' % str(ex)
        }
def calculate_system_availability(system, interval=None, date_from=None, date_to=None):
    """
    Calculates the availability figures of an active system over a time period.

    The period is either given explicitly (date_from together with date_to) or derived
    from interval, ending at the current time. Within this function date_from is the
    later bound and date_to the earlier bound of the period.

    @param system: Id of the system whose availability percentage is to be computed
    @type system: str
    @param interval: time interval to be applied when no dates are given: 'day',
        'week' or 'month'
    @type interval: str | None
    @param date_from: Later bound of the time period, parsed with dateutil
    @type date_from: str
    @param date_to: Earlier bound of the time period, parsed with dateutil
    @type date_to: str
    @return: Response code dictionary; on success 'data' holds the period bounds, total
        uptime and downtime, their percentages, the seconds since the last recorded
        state change date and the incident count
    @rtype: dict
    """
    # NOTE(review): the nesting of the monitor loop below was reconstructed from a
    # whitespace-collapsed copy of this function - confirm it against version control.
    try:
        system = SystemService().get(pk=system, state__name='Active')
        # Either an interval or both dates must be supplied.
        if not system or not (interval or date_from and date_to):
            return {'code': '800.400.002', 'message': 'Invalid parameters'}
        if date_from and date_to:
            date_from = dateutil.parser.parse(date_from)
            date_to = dateutil.parser.parse(date_to)
        else:
            # No explicit dates: the period ends now and reaches back by the interval.
            date_from = timezone.now()
            if interval == 'day':
                date_to = date_from - timedelta(hours=24)
            elif interval == 'week':
                date_to = date_from - timedelta(hours=24 * 7)
            elif interval == 'month':
                date_to = date_from - timedelta(hours=24 * 30)
            else:
                return {
                    'code': '800.400.002',
                    'message': 'Invalid time interval'
                }
        endpoints = EndpointService().filter(system=system)
        total_system_downtime = timedelta()
        latest_downtime = timezone.now()
        total_incidents = IncidentService().filter(
            system=system, date_created__gt=date_to,
            date_created__lt=date_from).count()
        for endpoint in endpoints:
            # Tracks the state ('Up'/'Down') and date of the previously seen monitor.
            previous_monitor = {'state': None, 'date': None}
            # Monitor records of this endpoint inside the period, oldest first.
            saved_monitors = list(SystemMonitorService().filter(
                endpoint=endpoint, date_created__gt=date_to,
                date_created__lt=date_from).order_by(
                'date_created').values('date_created', state_name=F('state__name')))
            for monitor in saved_monitors:
                total_monitor_downtime = timedelta()
                if monitor.get('state_name') != 'Operational':
                    # Any state other than 'Operational' is treated as down.
                    previous_monitor = {'state': 'Down', 'date': monitor.get('date_created')} if \
                        previous_monitor.get('state') == 'Up' else previous_monitor
                    if previous_monitor.get('state') == 'Up':
                        previous_monitor.update(
                            state='Down', date=monitor.get('date_created'))
                    # NOTE(review): list.index() raises ValueError instead of returning
                    # -1, so only the len(saved_monitors) == 1 clause can be true here.
                    if not previous_monitor.get('date') and (
                            saved_monitors.index(monitor) == -1 or len(saved_monitors) == 1):
                        total_monitor_downtime += date_from - monitor.get(
                            'date_created')
                    else:
                        if not previous_monitor.get('date'):
                            previous_monitor.update(
                                state='Down', date=monitor.get('date_created'))
                        # Time elapsed since the previously tracked monitor date.
                        total_monitor_downtime += monitor.get(
                            'date_created') - previous_monitor.get('date')
                        previous_monitor.update(
                            state='Down', date=monitor.get('date_created'))
                else:
                    if not previous_monitor.get('date'):
                        previous_monitor.update(
                            state='Up', date=monitor.get('date_created'))
                    # Time elapsed since the previously tracked monitor date.
                    total_monitor_downtime += monitor.get(
                        'date_created') - previous_monitor.get('date')
                    previous_monitor.update(
                        state='Up', date=monitor.get('date_created'))
                latest_downtime = previous_monitor.get('date')
                total_system_downtime += total_monitor_downtime
        return {
            'code': '800.200.001',
            'data': {
                'start_date': date_to.isoformat(),
                'end_date': date_from.isoformat(),
                'total_period': str(date_from - date_to),
                'total_uptime': str((date_from - date_to) - total_system_downtime),
                # Downtime in seconds, never reported below zero.
                'total_downtime': total_system_downtime.total_seconds() if
                total_system_downtime.total_seconds() > 0 else 0,
                'uptime_percentage': round(((date_from - date_to) - total_system_downtime).total_seconds() / (date_from - date_to).total_seconds() * 100, 2),
                'downtime_percentage': round(
                    total_system_downtime.total_seconds() / (date_from - date_to).total_seconds() * 100, 2),
                'duration_since_downtime': (timezone.now() - latest_downtime).total_seconds(),
                'incident_count': total_incidents
            }
        }
    except Exception as ex:
        lgr.exception("Calculate downtime percentage exception %s" % ex)
        return {
            'code': '800.400.001',
            'msg': 'Error. Could not calculate total system availability %s ' % str(ex)
        }
def get_current_status(system, **kwargs):
    """
    Reports the current status of an active system and its unresolved incidents.

    The overall state is derived from the states of the system's endpoints, checked
    from the most to the least severe.

    @param system: Id of the system whose status is being retrieved
    @type system: str
    @param kwargs: Extra key-value arguments; accepted but not used by this function
    @return: a dictionary with the response code and, on success, the system details,
        its current incidents with their updates and the derived current state
    @rtype: dict
    """
    try:
        system = SystemService().get(pk=system, state__name='Active')
        if system is None:
            return {'code': '800.400.002', 'message': 'Invalid parameters'}
        finished = Q(state__name='Resolved') | Q(state__name='Completed')
        current_incidents = list(
            IncidentService().filter(system=system).exclude(finished).values(
                'id', 'name', 'description', 'scheduled_for',
                'scheduled_until', 'priority_level', 'event_type__name',
                'system__name', 'state__name',
                'date_created').order_by('-date_created'))
        for row in current_incidents:
            log_query = IncidentLogService().filter(
                incident__id=row.get('id')).values(
                'description', 'priority_level', 'date_created',
                'date_modified',
                user_name=F('user__username'), status=F('state__name'))
            row['incident_updates'] = list(log_query.order_by('-date_created'))
        status_data = {
            'system_id': system.id,
            'name': system.name,
            'description': system.description,
            'incidents': current_incidents,
            'current_state': {}
        }
        endpoint_states = [
            str(state_name) for state_name in list(EndpointService().filter(
                system=system).values_list('state__name', flat=True))
        ]
        # Default used when the system has no endpoints at all.
        current_state = {
            'state': 'status-operational',
            'description': 'All systems are operational'
        }
        if endpoint_states:
            # (endpoint state, reported state, description), most severe first.
            severity_ladder = (
                ('Major Outage', 'status-major',
                 'There is a Partial System Outage'),
                ('Minor Outage', 'status-minor',
                 'There is a Minor System Outage'),
                ('Degraded Performance', 'status-minor',
                 'Partially Degraded Service'),
                ('Under Maintenance', 'status-maintenance',
                 'A Service is undergoing maintenance'),
            )
            if all(name == 'Major Outage' for name in endpoint_states):
                current_state = {
                    'state': 'status-critical',
                    'description': 'There is a Major System Outage'
                }
            else:
                for state_name, state_code, text in severity_ladder:
                    if state_name in endpoint_states:
                        current_state = {
                            'state': state_code,
                            'description': text
                        }
                        break
                else:
                    current_state = {
                        'state': 'status-operational',
                        'description': 'All Systems Operational'
                    }
        status_data['current_state'] = current_state
        return {'code': '800.200.001', 'data': status_data}
    except Exception as ex:
        lgr.exception('Get current system status exception %s' % ex)
    return {'code': '800.400.001'}
def _error_rate_windows(now, period_days):
    """
    Yields one (window_start, window_end, point_name) triple per data point of the error rate
    graph. All windows are counted back from ``now``; ``period_days`` only selects the
    granularity (hourly, daily or monthly).
    @param now: Reference time the windows are counted back from
    @type now: datetime
    @param period_days: Length in whole days of the requested period
    @type period_days: int
    @return: Generator of (window_start, window_end, point_name) tuples
    """
    if period_days <= 1 or period_days > 365:
        # Up to a day (and, as a fallback, more than a year): the last 24 one-hour windows.
        for hours_back in range(1, 25):
            window_start = now - timedelta(hours=hours_back)
            yield (window_start, window_start + timedelta(hours=1),
                   window_start.replace(minute=0))
    elif period_days <= 31:
        # Up to a week gives 7 one-day windows, up to a month gives 31.
        # NOTE(review): the first window starts at ``now`` and so lies in the future; kept as
        # it was, confirm whether the range should start one day back.
        for days_back in range(7 if period_days <= 7 else 31):
            window_start = now - timedelta(days=days_back)
            window_end = window_start + timedelta(days=1)
            yield (window_start, window_end,
                   window_end.replace(hour=0, minute=0))
    else:
        # Up to a year: 12 calendar months walking back from the current month.
        month_end = now.replace(
            day=1, hour=0, minute=0, second=0, microsecond=0) + timedelta(
                days=calendar.monthrange(now.year, now.month)[1] - 1)
        for _ in range(12):
            days_in_month = calendar.monthrange(
                month_end.year, month_end.month)[1]
            month_start = month_end - timedelta(days=days_in_month - 1)
            previous_month_end = month_end - timedelta(days=days_in_month)
            # NOTE(review): the point is named after the last day of the *previous* month, as
            # before; confirm this is what the graph expects.
            yield month_start, month_end, previous_month_end
            month_end = previous_month_end


def get_error_rate(system_id, start_date, end_date):
    """
    Calculates and returns the error rate of a system based on logged events.
    The length of the requested period selects the granularity of the graph; the counted
    windows themselves always run back from the current time.
    @param: system_id: Id of the system
    @type system_id: str
    @param start_date: Start point of the data to be presented
    @type: start_date: str
    @param: end_date: End date of the period for which the data is to be extracted
    @type end_date: str
    @return: Response code indicating status and error rate graph data
    @rtype: dict
    """
    try:
        system = SystemService().get(pk=system_id, state__name='Active')
        if not system:
            return {'code': '800.400.200'}
        now = timezone.now()
        # abs() keeps the period positive whichever way round the dates are given. The former
        # ``start - end`` was negative for every normal range, so only the hourly branch ran.
        period = abs(
            dateutil.parser.parse(end_date) - dateutil.parser.parse(start_date))
        series = []
        for window_start, window_end, point_name in _error_rate_windows(
                now, period.days):
            error_count = EventService().filter(
                system=system, event_type__name='Error',
                date_created__lte=window_end,
                date_created__gte=window_start).count()
            series.append(dict(value=error_count, name=point_name))
        dataset = [{
            "name": "Number of errors",
            "color": "#E44D25",
            "series": series,
            "yAxisValue": "Number of Errors Occurred"
        }]
        return {'code': '800.200.001', 'data': dataset}
    except Exception as ex:
        lgr.exception("Get Error rate Exception %s" % ex)
    # The exception text is logged above and kept out of the response code.
    return {'code': '800.400.001'}
def log_event(
        event_type, system, interface=None, method=None, response=None,
        request=None, code=None, description=None, stack_trace=None,
        **kwargs):
    """
    Records an event reported for a system and hands it over for escalation.
    @param event_type: Name of the type of the event to be logged
    @type event_type: str
    @param system: Id of the system where the event occurred
    @type system: str
    @param interface: Name of the interface in the system where the event occurred
    @type interface: str | None
    @param method: Specific method within an interface where the event occurred
    @type method: str | None
    @param response: Response body, if any, of the reported event occurrence
    @type response: str | None
    @param request: Request body, if any, of the reported event occurrence
    @type request: str | None
    @param code: Response code of the event
    @type code: str | None
    @param description: Detailed information on the event occurrence
    @type description: str | None
    @param stack_trace: Stack trace from the event occurrence
    @type stack_trace: str | None
    @param kwargs: Extra key-value arguments; they are not read by this function
    @return: Response code dictionary; on success ``data`` carries the stored event fields
    @rtype: dict
    """
    try:
        active_system = SystemService().get(pk=system, state__name="Active")
        active_event_type = EventTypeService().get(
            name=event_type, state__name="Active")
        if active_system is None or active_event_type is None:
            return {"code": "800.400.002"}

        active_state = StateService().get(name="Active")
        event_interface = InterfaceService().get(
            name=interface, state__name="Active", system=active_system)
        event = EventService().create(
            event_type=active_event_type, system=active_system, method=method,
            response=response, request=request, code=code,
            description=description, state=active_state,
            interface=event_interface, stack_trace=stack_trace)
        if event is not None:
            # A failed escalation is only logged; the event itself is still reported.
            escalation = EventLog.escalate_event(event)
            if escalation.get('code') != '800.200.001':
                lgr.error('%s event escalation Failed' % active_event_type)
            event_fields = (
                'id', 'event_type', 'state__id', 'system__id', 'method',
                'response', 'request', 'code', 'description', 'interface__id',
                'stack_trace')
            created_event = EventService().filter(
                id=event.id).values(*event_fields).first()
            return {'code': '800.200.001', 'data': created_event}
    except Exception as ex:
        lgr.exception('Event processor exception %s' % ex)
    return {'code': '800.400.001'}
def past_incidents(system, date_from=None, date_to=None):
    """
    Lists, day by day, the resolved or completed incidents of a system. The days are walked
    backwards from ``date_from`` towards ``date_to``; when either date is missing the walk
    starts at midnight today and covers 15 days.
    @param system: Id of the system the incidents belong to
    @type system: str
    @param date_from: Date the walk starts from
    @type date_from: str | None
    @param date_to: Date the walk stops before
    @type date_to: str | None
    @return: Response code dictionary; on success ``data`` is a list of
    ``{'date': ..., 'incidents': [...]}`` entries
    @rtype: dict
    """
    try:
        system = SystemService().get(pk=system, state__name='Active')
        if not system:
            return {'code': '800.400.002'}
        if date_from and date_to:
            date_from = dateutil.parser.parse(date_from)
            date_to = dateutil.parser.parse(date_to)
        else:
            date_from = datetime.combine(datetime.now(), datetime.min.time())
            date_to = date_from - timedelta(days=15)

        closed_states = Q(state__name='Resolved') | Q(state__name='Completed')
        data = []
        for days_back in range((date_from - date_to).days):
            date = date_from - timedelta(days_back)
            # Excluding everything that is not closed leaves only the closed incidents.
            incidents = list(IncidentService().filter(
                system=system, date_created__gte=date,
                date_created__lt=date + timedelta(1)).exclude(
                    ~closed_states).values(
                        'id', 'name', 'description', 'system_id',
                        'priority_level', 'date_created', 'date_modified',
                        'scheduled_for', 'scheduled_until',
                        type=F('incident_type__name'),
                        eventtype=F('event_type__name'),
                        incident_id=F('id'), status=F('state__name'),
                        affected_system=F('system__name')).order_by(
                            '-date_created'))
            for incident in incidents:
                update_rows = IncidentLogService().filter(
                    incident__id=incident.get('incident_id')).values(
                        'description', 'priority_level', 'date_created',
                        'date_modified', user_name=F('user__username'),
                        status=F('state__name')).order_by('-date_created')
                incident['incident_updates'] = list(update_rows)
            data.append({'date': date, 'incidents': incidents})
        return {'code': '800.200.001', 'data': data}
    except Exception as ex:
        lgr.exception("Get incidents exception %s" % ex)
    return {'code': '800.400.001'}
def perform_health_check():
    """
    Requests every queried endpoint of the active systems, stores the outcome as a system
    monitor record and logs an event for endpoints that respond slowly or with a status
    other than 200.
    @return: Systems: a dictionary containing a success code and a list of dictionaries
    containing system status data
    @rtype: dict
    """
    systems = []
    try:
        for endpoint in EndpointService().filter(
                system__state__name="Active",
                endpoint_type__is_queried=True):
            try:
                health_state = requests.get(endpoint.url)

                # Classify the response. The state is tracked by name throughout and resolved
                # to a State record once, when the monitor entry is created.
                state_name = 'Operational'
                response_time_speed = 'Normal'
                event_type_name = None
                description = None
                if health_state.status_code != 200:
                    state_name = 'Major Outage'
                    response_time_speed = None
                    event_type_name = 'Critical'
                    description = 'The system is not accessible'
                elif health_state.elapsed > endpoint.optimal_response_time:
                    state_name = 'Degraded Performance'
                    response_time_speed = 'Slow'
                    event_type_name = 'Warning'
                    description = 'Response time is not within the expected time'

                # The elapsed time is recorded for every response. It used to be missing for
                # non-200 responses, which raised and aborted the whole health check.
                # NOTE(review): int() drops sub-second precision, as before; confirm whether
                # whole seconds are really what should be stored.
                response_time = timedelta(
                    seconds=int(health_state.elapsed.total_seconds()))

                system_status = SystemMonitorService().create(
                    system=endpoint.system,
                    response_time=response_time,
                    response_time_speed=response_time_speed,
                    state=StateService().get(name=state_name),
                    response_body=health_state.content,
                    endpoint=endpoint,
                    response_code=health_state.status_code)
                if system_status is not None:
                    systems.append({
                        "system": system_status.system.name,
                        "status": system_status.state.name,
                        "endpoint": endpoint.url
                    })
                else:
                    # No monitor record to read from here, so report from the endpoint itself.
                    systems.append({
                        "system": endpoint.system.name,
                        "status": "failed",
                        "endpoint": endpoint.url
                    })

                if event_type_name is not None:
                    # log_event looks the system up by primary key, so the id is passed
                    # rather than the system name.
                    event = EventLog.log_event(
                        event_type=event_type_name,
                        system=endpoint.system.id,
                        description=description,
                        response=health_state.content,
                        request=health_state.request)
                    if event['code'] != "800.200.001":
                        lgr.warning("Event creation failed %s" % event)
            except requests.ConnectionError as e:
                # An unreachable endpoint is logged and the remaining endpoints still run.
                lgr.exception('Endpoint health check failed: %s' % e)
        return {"code": "800.200.001", "data": {"systems": systems}}
    except Exception as ex:
        lgr.exception("Health Status exception: %s" % ex)
    return {
        "code": "800.400.001",
        "message": "Error while performing health check"
    }
def _response_time_windows(now, period_days):
    """
    Yields one (window_start, window_end, label) triple per interval of the response time
    graph. All windows are counted back from ``now``; ``period_days`` only selects the
    granularity. Nothing is yielded for periods longer than 365 days.
    @param now: Reference time the windows are counted back from
    @type now: datetime
    @param period_days: Length in whole days of the requested period
    @type period_days: int
    @return: Generator of (window_start, window_end, label) tuples
    """
    label_format = "%m/%d/%y %H:%M"
    if period_days <= 1:
        # The last 24 one-hour windows.
        for hours_back in range(1, 25):
            window_start = now - timedelta(hours=hours_back)
            yield (window_start, window_start + timedelta(hours=1),
                   window_start.replace(minute=0).strftime(label_format))
    elif period_days <= 31:
        # Up to a week gives 7 one-day windows, up to a month gives 31.
        # NOTE(review): the first window starts at ``now`` and so lies in the future; kept as
        # it was, confirm whether the range should start one day back.
        for days_back in range(7 if period_days <= 7 else 31):
            window_start = now - timedelta(days=days_back)
            window_end = window_start + timedelta(days=1)
            yield (window_start, window_end,
                   window_end.replace(hour=0, minute=0).strftime(label_format))
    elif period_days <= 365:
        # 12 calendar months walking back from the current month.
        month_end = now.replace(
            day=1, hour=0, minute=0, second=0, microsecond=0) + timedelta(
                days=calendar.monthrange(now.year, now.month)[1] - 1)
        for _ in range(12):
            days_in_month = calendar.monthrange(
                month_end.year, month_end.month)[1]
            month_start = month_end - timedelta(days=days_in_month - 1)
            # The label uses the year of the month being queried. It used to take the year
            # after stepping back a month, which labelled January with the previous year.
            yield (month_start, month_end, '%s, %s' % (
                calendar.month_name[month_end.month], month_end.year))
            month_end = month_end - timedelta(days=days_in_month)


def get_system_endpoint_response_time(system_id, start_date, end_date):
    """
    Returns the response time of every endpoint for a specific system.
    The length of the requested period selects the granularity; the queried windows always
    run back from the current time. When monitor records exist they are grouped per endpoint
    name and the labels are the distinct record timestamps; otherwise the labels are the
    window labels and a single placeholder dataset is returned.
    @param end_date: End date of the period of which the data is to be extracted
    @type: str
    @param start_date: Start point of the data to be presented
    @type: str
    @param: system_id: Id of the system
    @type system_id: str
    @return: Response code indicating status and response time graph data
    @rtype: dict
    """
    try:
        system = SystemService().get(pk=system_id, state__name='Active')
        if not system:
            return {'code': '800.400.200'}
        now = timezone.now()
        # abs() keeps the period positive whichever way round the dates are given. The former
        # ``start - end`` was negative for every normal range, so only the hourly branch ran.
        period = abs(
            dateutil.parser.parse(end_date) - dateutil.parser.parse(start_date))

        label = []
        dataset = []
        for window_start, window_end, window_label in _response_time_windows(
                now, period.days):
            label.append(window_label)
            rows = SystemMonitorService().filter(
                system=system, date_created__lte=window_end,
                date_created__gte=window_start).values(
                    name=F('endpoint__name'),
                    responseTime=F('response_time'),
                    dateCreated=F('date_created'))
            for row in rows:
                row.update(
                    responseTime=row['responseTime'].total_seconds(),
                    dateCreated=row['dateCreated'].strftime("%m/%d/%y %H:%M"))
                dataset.append(row)

        # Placeholder for when no records exist. Defining it up front also means a period of
        # more than 365 days no longer fails on an unbound name.
        result = {"Initial": {"data": [0]}}
        if dataset:
            # With real data the labels become the distinct record timestamps, in the order
            # they were first seen.
            label = []
            seen = set()
            result = {}
            for row in dataset:
                if row["dateCreated"] not in seen:
                    seen.add(row["dateCreated"])
                    label.append(row["dateCreated"])
                endpoint_series = result.setdefault(row["name"], {
                    "label": row["name"],
                    "data": [],
                    "dateCreated": [],
                })
                endpoint_series["data"].append(row["responseTime"])
                endpoint_series["dateCreated"].append(row["dateCreated"])
        return {
            'code': '800.200.001',
            'data': {
                'labels': label,
                'datasets': result
            }
        }
    except Exception as ex:
        lgr.exception("Get endpoint response time Exception %s" % ex)
    return {'code': '800.400.001'}
def log_incident(
        incident_type, system, escalation_level, name, description,
        priority_level, event_type=None, state="Investigating",
        escalated_events=None, scheduled_for=None, scheduled_until=None,
        **kwargs):
    """
    Creates a realtime incident based on escalated events or scheduled incident based on user
    reports. For a realtime incident with an event type, an existing unresolved incident of
    the same event type and system is escalated instead when the given priority level is
    below 5.
    @param incident_type: Type of the incident to be created
    @type incident_type: str
    @param system: The system which the incident will be associated with
    @type system: str
    @param name: Title of the incident
    @type name: str
    @param description: Details on the incident
    @type description: str
    @param event_type: Type of the event(s) that triggered creation of the incident, if its
    event driven.
    @type event_type: str | None
    @param escalated_events: One or more events in the escalation if the incident is event
    driven.
    @type escalated_events: list | None
    @param state: Initial resolution state of the incident, given as a state id or a state
    name. Defaults to Investigating if left blank
    @type state: str
    @param priority_level: The level of importance to be assigned to the incident.
    @type priority_level: str
    @param escalation_level: Level at which an escalation is configured with a set of
    recipients
    @type escalation_level: str
    @param scheduled_for: Time the scheduled maintenance should begin if the incident is
    scheduled
    @type scheduled_for: str | None
    @param scheduled_until: Time the scheduled maintenance should end if the incident is
    scheduled
    @type scheduled_until: str | None
    @param kwargs: Extra key-value arguments to pass for incident logging
    @return: Response code dictionary to indicate if the incident was created or not
    @rtype: dict
    """
    try:
        system = SystemService().get(pk=system, state__name="Active")
        incident_type = IncidentTypeService().get(
            name=incident_type, state__name="Active")
        escalation_level = EscalationLevelService().get(
            pk=escalation_level, state__name="Active")
        # Validate before anything reads incident_type.name, so a missing incident type gives
        # the "invalid parameters" code rather than the generic error code.
        if system is None or incident_type is None or escalation_level is None:
            return {"code": "800.400.002"}

        # ``state`` may be a state id or a state name. A scheduled incident given a name
        # always starts in the 'Scheduled' state.
        try:
            state = StateService().get(pk=uuid.UUID(state))
        except ValueError:
            state = StateService().get(
                name=state if incident_type.name == 'Realtime' else 'Scheduled')

        if incident_type.name == "Realtime" and event_type is not None:
            # The two conditions must be OR-ed. Passed as separate arguments they are AND-ed,
            # which never matches, so closed incidents were not excluded.
            incident = IncidentService().filter(
                event_type__name=event_type, system=system).exclude(
                    Q(state__name='Resolved') | Q(state__name='Completed')
                ).order_by('-date_created').first()
            if incident and int(priority_level) < 5:
                priority_level = incident.priority_level + 1
                return IncidentAdministrator().update_incident(
                    incident_id=incident.id,
                    escalation_level=escalation_level.name,
                    name=incident.name, state=incident.state.id,
                    priority_level=str(priority_level),
                    description="Priority level of %s incident changed to %s" % (
                        incident.name, priority_level))

        if incident_type.name == 'Scheduled':
            scheduled_for = dateutil.parser.parse(scheduled_for)
            scheduled_until = dateutil.parser.parse(scheduled_until)

        # ``state`` is already a State record here, so it is used directly instead of being
        # looked up again by name.
        incident = IncidentService().create(
            name=name, description=description, state=state, system=system,
            incident_type=incident_type, scheduled_for=scheduled_for,
            scheduled_until=scheduled_until,
            event_type=EventTypeService().get(name=event_type),
            priority_level=int(priority_level))
        incident_log = IncidentLogService().create(
            description=description, incident=incident,
            priority_level=priority_level, state=state,
            escalation_level=escalation_level)
        if incident is not None and incident_log is not None:
            if escalated_events:
                for event in escalated_events:
                    incident_event = IncidentEventService().create(
                        event=event, incident=incident,
                        state=StateService().get(name="Active"))
                    if not incident_event:
                        lgr.error("Error creating incident-events")

            email_system_recipients = SystemRecipientService().filter(
                escalation_level=escalation_level, system=incident.system,
                state__name='Active',
                notification_type__name='Email').values('recipient__id')
            sms_system_recipients = SystemRecipientService().filter(
                escalation_level=escalation_level, system=incident.system,
                state__name='Active',
                notification_type__name='Sms').values('recipient__id')
            sms_notification = NotificationLogger().send_notification(
                message=incident.description, message_type="Sms",
                system_id=incident.system.id,
                recipients=[
                    str(recipient["phone_number"])
                    for recipient in User.objects.filter(
                        id__in=sms_system_recipients,
                        is_active=True).values("phone_number")
                ])
            # The rows come from values('email'), so the key is 'email'; the former
            # 'user__email' key raised KeyError as soon as a recipient existed.
            email_notification = NotificationLogger().send_notification(
                message=incident.description, message_type="Email",
                system_id=incident.system.id,
                recipients=[
                    str(recipient['email'])
                    for recipient in User.objects.filter(
                        id__in=email_system_recipients,
                        is_active=True).values('email')
                ])
            if sms_notification.get('code') != '800.200.001' or \
                    email_notification.get('code') != '800.200.001':
                # No exception is active here, so log a plain error.
                lgr.error("Notification sending failed")
            return {'code': '800.200.001'}
    except Exception as ex:
        lgr.exception("Incident Logger exception %s" % ex)
    return {"code": "800.400.001"}