def process_set_alarm_state_msg(self, metric_key, message):
    """Apply a SetAlarmState request to the in-memory alarm and persist it.

    Looks up the alarm by (project_id, alarm_name), updates its
    state_reason / state_value / state_reason_data both in the
    MetricMonitor cache and in the database.

    :param metric_key: key of the metric the alarm belongs to
    :param message: dict with project_id, alarm_name, state_reason,
                    state_value and optional state_reason_data
    """
    project_id = message.get("project_id")
    alarm_name = message.get("alarm_name")
    state_reason_data = message.get("state_reason_data")

    # Lazily create the monitor for this metric if we have not seen it.
    if metric_key not in self.metrics:
        self.metrics[metric_key] = MetricMonitor(metric_key, self.cass)
    metric = self.metrics[metric_key]

    alarm_key = self.cass.get_metric_alarm_key(project_id, alarm_name)
    if not alarm_key:
        # BUG FIX: the original logged the unbound name `alarm_key` in
        # this branch, raising NameError; log the name we looked up.
        storm.log("alarm key for [%s] is not found." % alarm_name)
        return

    try:
        metricalarm = metric.alarms[alarm_key]
    except KeyError:
        storm.log("alarm key [%s] is found, but alarm is not found." %
                  alarm_key)
        return

    # Update the cached alarm state.
    metricalarm["state_reason"] = message.get("state_reason")
    metricalarm["state_value"] = message.get("state_value")
    metricalarm["state_reason_data"] = state_reason_data

    # Persist the new state; state_reason_data column is optional.
    alarm_columns = {"state_reason": message.get("state_reason"),
                     "state_value": message.get("state_value")}
    if state_reason_data:
        alarm_columns["state_reason_data"] = state_reason_data
    self.cass.put_metric_alarm(alarm_key, alarm_columns)
def put_metric_data(self, metric_key, timestamp, value, unit=None):
    """Fold one datapoint into the per-minute statistics row and store it.

    Updates SampleCount/Sum/Average/Minimum/Maximum for the minute
    bucket containing *timestamp*, then writes the row to the DB with a
    TTL. Datapoints older than STATISTICS_TTL are dropped.

    :param metric_key: metric row key (NOTE(review): the DB read uses this
        parameter while the final insert uses self.metric_key — presumably
        they are the same key; confirm against callers)
    :param timestamp: datetime of the datapoint
    :param value: raw datapoint value (converted to the default unit)
    :param unit: optional unit of *value*
    """
    def get_stats(tmp_stat):
        # Convert a raw DB statistics row into {column: value},
        # falling back to an all-NaN row when the index is missing.
        try:
            ret = dict(zip(self.COLUMNS,
                           map(lambda x: list(x.values())[0], tmp_stat)))
            # BUG FIX: the original looped `for v in ret` and rebound
            # the loop variable, which never replaced None entries;
            # convert None -> NaN for real so isnull() checks work.
            for k in ret:
                if ret[k] is None:
                    ret[k] = float("nan")
        except IndexError:
            storm.log("index %s is not in DB." % time_idx)
            ret = {
                "SampleCount": float("nan"),
                "Sum": float("nan"),
                "Average": float("nan"),
                "Minimum": float("nan"),
                "Maximum": float("nan"),
            }
        return ret

    time_idx = timestamp.replace(second=0, microsecond=0)
    time_diff = utils.utcnow() - time_idx
    if timedelta(seconds=self.STATISTICS_TTL) < time_diff:
        msg = "index %s is older than TTL. It doesn't need to insert DB"
        storm.log(msg % time_idx)
        return
    if time_idx not in self.df.index:
        self._reindex()

    value = utils.to_default_unit(value, unit)

    try:
        stat = self.df.ix[time_idx]
        # BUG FIX: the original rebound the loop variable, leaving None
        # values in place; replace them in the row so the isnull()
        # checks below behave. Assigning by key keeps in-place
        # semantics on the pandas row.
        for k in stat.index:
            if stat[k] is None:
                stat[k] = float("nan")
    except KeyError:
        stat = self.cass.get_metric_statistics_for_key(metric_key,
                                                       time_idx)
        stat = get_stats(stat)

    # Fold the new datapoint into the aggregates (NaN means "no data yet").
    stat["SampleCount"] = (1.0 if isnull(stat["SampleCount"])
                           else stat["SampleCount"] + 1.0)
    stat["Sum"] = value if isnull(stat["Sum"]) else stat["Sum"] + value
    stat["Average"] = stat["Sum"] / stat["SampleCount"]
    stat["Minimum"] = (value
                       if (isnull(stat["Minimum"]) or
                           stat["Minimum"] > value)
                       else stat["Minimum"])
    stat["Maximum"] = (value
                       if (isnull(stat["Maximum"]) or
                           stat["Maximum"] < value)
                       else stat["Maximum"])

    # Persist the updated row; TTL shrinks as the bucket ages.
    stat_dict = {
        "SampleCount": {time_idx: stat["SampleCount"]},
        "Sum": {time_idx: stat["Sum"]},
        "Average": {time_idx: stat["Average"]},
        "Minimum": {time_idx: stat["Minimum"]},
        "Maximum": {time_idx: stat["Maximum"]},
    }
    ttl = self.STATISTICS_TTL - time_diff.total_seconds()
    self.cass.insert_stat(self.metric_key, stat_dict, ttl)
    storm.log("metric data inserted %s" % (self.metric_key))
def emit(self, record):
    """Send a formatted log record to the Storm log stream.

    Follows the standard logging.Handler.emit contract: let
    KeyboardInterrupt/SystemExit propagate, route anything else
    through handleError().
    """
    try:
        storm.log(self.format(record))
    except (KeyboardInterrupt, SystemExit):
        raise
    except:
        self.handleError(record)
def get_stats(tmp_stat):
    """Convert a raw statistics row into a {column: value} dict.

    Missing rows (IndexError while unpacking) yield an all-NaN dict.
    NOTE(review): relies on `self`, `time_idx` and `storm` from the
    enclosing scope — this is a closure, not a free function.
    """
    try:
        ret = dict(zip(self.COLUMNS,
                       map(lambda x: list(x.values())[0], tmp_stat)))
        # BUG FIX: the original looped `for v in ret` and rebound the
        # loop variable, which never replaced None values; convert
        # None -> NaN for real so downstream isnull() checks work.
        for k in ret:
            if ret[k] is None:
                ret[k] = float("nan")
    except IndexError:
        storm.log("index %s is not in DB." % time_idx)
        ret = {
            "SampleCount": float("nan"),
            "Sum": float("nan"),
            "Average": float("nan"),
            "Minimum": float("nan"),
            "Maximum": float("nan"),
        }
    return ret
def do_alarm_action(self, alarmkey, alarm, new_state, old_state, query_date):
    """Emit an alarm-action message downstream for a state transition.

    Builds a notification payload (state, human-readable subject/body,
    query date) and emits it keyed by the alarm UUID.

    :param alarmkey: alarm UUID (e.g. f459c0e0-f927-481f-9158-deb8abe102a2)
    :param alarm: alarm definition/state mapping loaded from the DB
    :param new_state: dict with stateValue/stateReason/stateReasonData
    :param old_state: dict with the previous stateValue/stateReason/
        stateReasonData
    :param query_date: string timestamp of the evaluation
    """
    subject = "%s state has been changed from %s to %s at %s" % (
        alarm["alarm_name"], old_state["stateValue"],
        new_state["stateValue"], query_date)
    body = "%s at %s" % (new_state["stateReason"], query_date)
    msg = {
        "state": new_state["stateValue"],
        "subject": subject,
        "body": body,
        "query_date": query_date,
    }
    storm.log("emit to Alarm Action: %s %s" % (alarmkey, msg))
    storm.emit([str(alarmkey), json.dumps(msg)])
def delete_metric_alarm(self, alarmkey):
    """Delete an alarm from memory and from the database.

    :param alarmkey: alarm key; should be a UUID
    """
    if alarmkey not in self.alarms:
        storm.log("alarmkey %s doesn't exist" % alarmkey)
        return
    alarm = self.alarms.pop(alarmkey)
    self.cass.delete_metric_alarm(alarmkey)
    self.alarm_history_delete(alarmkey, alarm)
    storm.log("delete alarm %s for metric %s" % (str(alarmkey),
                                                 self.metric_key))
    self.update_left_offset(self.alarms)
def process(self, tup):
    """Route an incoming tuple to the metric stream that owns it.

    Decodes the JSON message and, depending on message_id, emits
    [metric_key, message] so downstream bolts are partitioned per metric.

    :param tup: Storm tuple whose first value is a JSON message buffer
    """
    message_buf = tup.values[0]
    message = json.loads(message_buf)
    message_id = message['message_id']
    message_uuid = message['message_uuid']
    self.log("start processing msg[%s:%s]" % (message_id, message_uuid))

    if message_id == PUT_METRIC_DATA_MSG_ID:
        metric_key = str(self.get_metric_key(message))
        storm.emit([metric_key, message_buf])
    elif message_id == PUT_METRIC_ALARM_MSG_ID:
        metric_key = message.get('metric_key')
        storm.emit([metric_key, message_buf])
    elif message_id == DELETE_ALARMS_MSG_ID:
        project_id = message['project_id']
        alarmkeys = message['alarmkeys']
        for alarmkey in alarmkeys:
            try:
                alarmkey_uuid = UUID(alarmkey)
                metric_key = self.get_alarm_metric_key(alarmkey_uuid)
                # BUG FIX: the original applied str() before this truth
                # test, so a missing metric became the truthy string
                # "None" and was emitted anyway.
                if metric_key:
                    message['alarmkey'] = alarmkey
                    storm.emit([str(metric_key), json.dumps(message)])
            except Exception:
                storm.log("Alarm %s does not exists" % alarmkey)
                # BUG FIX: traceback.format_exc() takes an optional
                # `limit` argument, not the exception; passing the
                # exception object misused that parameter.
                storm.log(traceback.format_exc())
    elif message_id == SET_ALARM_STATE_MSG_ID:
        project_id = message['project_id']
        alarm_name = message['alarm_name']
        alarm_key = self.cass.get_metric_alarm_key(project_id, alarm_name)
        if alarm_key:
            alarm = self.cass.get_metric_alarm(alarm_key)
            metric_key = str(alarm['metric_key'])
            storm.emit([metric_key, json.dumps(message)])
def alarm_history_state_update(self, alarmkey, alarm, notification_message):
    """Record an 'Action' entry in the alarm history table.

    notification_message example::

        {'method': "email",
         'receivers': email_receivers,
         'subject': message['subject'],
         'body': message['body']}

    :param alarmkey: alarm key as a string (converted to UUID for storage)
    :param alarm: alarm mapping providing project_id and alarm_name
    :param notification_message: dict describing the sent notification
    """
    history_summary = ("Message '%(subject)s' is sent via %(method)s"
                       % notification_message)
    column = {
        'project_id': alarm['project_id'],
        'alarm_key': UUID(alarmkey),
        'alarm_name': alarm['alarm_name'],
        'history_data': json.dumps(notification_message),
        'history_item_type': 'Action',
        'history_summary': history_summary,
        'timestamp': utils.utcnow(),
    }
    self.cass.insert_alarm_history(uuid4(), column)
    storm.log("alarm history \n %s" % history_summary)
def alarm_history_state_update(self, alarmkey, alarm, new_state, old_state):
    """Record a 'StateUpdate' entry in the alarm history table.

    :param alarmkey: alarm key (stored as-is in the alarm_key column)
    :param alarm: alarm mapping providing project_id and alarm_name
    :param new_state: dict describing the new state
    :param old_state: dict describing the previous state
    """
    old_value = old_state.get("stateValue", "INSUFFICIENT_DATA")
    new_value = new_state.get("stateValue", "INSUFFICIENT_DATA")
    summary = "Alarm updated from %s to %s" % (old_value, new_value)
    history_data = json.dumps({"newState": new_state,
                               "oldState": old_state,
                               "version": "1.0"})
    column = {
        "project_id": alarm["project_id"],
        "alarm_key": alarmkey,
        "alarm_name": alarm["alarm_name"],
        "history_data": history_data,
        "history_item_type": "StateUpdate",
        "history_summary": summary,
        "timestamp": utils.utcnow(),
    }
    self.cass.insert_alarm_history(uuid.uuid4(), column)
    storm.log("alarm history \n %s" % summary)
def put_alarm(self, project_id, metricalarm):
    """Load (or reload) an alarm definition into the in-memory map.

    Resolves the alarm key by (project_id, alarm_name), fetches the full
    alarm row and caches it in self.alarms.

    :param project_id: owning project id
    :param metricalarm: mapping that provides at least "alarm_name"
    """
    alarm_name = metricalarm.get("alarm_name")
    alarm_key = self.cass.get_metric_alarm_key(project_id, alarm_name)
    if not alarm_key:
        # BUG FIX: alarm_key is always falsy (typically None) on this
        # path, so the original message printed "[None]"; log the
        # alarm name that failed to resolve instead.
        storm.log("no alarm key for [%s]" % alarm_name)
        return
    ret = self.cass.get_metric_alarm(alarm_key)
    if not ret:
        storm.log("alarm key [%s] is found, but alarm is not found." %
                  alarm_key)
        return
    self.alarms[alarm_key] = ret
    storm.log("alarm key is [%s]" % alarm_key)
    self.update_left_offset(self.alarms)
def log(self, msg):
    """Log *msg* prefixed with this bolt's name and process id."""
    prefix = "[%s:%d]" % (self.BOLT_NAME, self.pid)
    storm.log("%s %s" % (prefix, msg))
def _check_alarm(self, alarmkey, alarm, query_time=None):
    """Evaluate one alarm against the metric dataframe and transition
    its state (OK / ALARM / INSUFFICIENT_DATA) when it changes.

    Rolls up the configured statistic over `evaluation_periods` windows
    of `period` seconds, compares against the threshold with the
    alarm's comparison operator, and — only on an actual state change —
    updates memory and DB, records history and fires the alarm action.

    :param alarmkey: alarm UUID
    :param alarm: alarm definition/state mapping (period, statistic,
        threshold, comparison_operator, unit, state_value, ...)
    :param query_time: evaluation time; defaults to utils.utcnow()
    """
    period = int(alarm["period"] / 60)
    evaluation_periods = alarm["evaluation_periods"]
    statistic = alarm["statistic"]
    threshold = alarm["threshold"]
    cmp_op = self.CMP_MAP[alarm["comparison_operator"]]
    unit = alarm["unit"]
    state_value = alarm["state_value"]
    # Look back up to 3 minutes for the latest minute that has data.
    time_difference_buffer_min = 3

    query_time = query_time if query_time else utils.utcnow()

    for i in range(time_difference_buffer_min):
        end_idx = (query_time.replace(second=0, microsecond=0)
                   - (i + 1) * datetools.Minute())
        if not isnull(self.df[statistic].ix[end_idx]):
            break

    start_idx = end_idx - (period * evaluation_periods) * datetools.Minute()
    start_ana_idx = start_idx - datetools.Minute() * period
    func = self.ROLLING_FUNC_MAP[statistic]
    data = func(self.df[statistic].ix[start_ana_idx:end_idx], period,
                min_periods=0).ix[start_idx:end_idx:period][1:]
    recent_datapoints = list(data)

    # BUG FIX: the original used `statistic is not "SampleCount"`, an
    # identity comparison against a string literal whose outcome is
    # implementation-dependent (string interning); compare by value.
    if unit and statistic != "SampleCount":
        data = data / utils.UNIT_CONV_MAP[unit]
        threshold = threshold / utils.UNIT_CONV_MAP[unit]
    data = data.dropna()

    query_date = utils.strtime(query_time)
    reason_data = {
        "period": alarm["period"],
        "queryDate": query_date,
        "recentDatapoints": recent_datapoints,
        "startDate": utils.strtime(start_idx),
        "statistic": statistic,
        "threshold": threshold,
        "version": "1.0",
    }
    old_state = {"stateReason": alarm.get("reason", ""),
                 "stateValue": alarm.get("state_value",
                                         "INSUFFICIENT_DATA"),
                 "stateReasonData":
                     json.loads(alarm.get("reason_data", "{}"))}
    json_reason_data = json.dumps(reason_data)

    if len(data) < evaluation_periods:
        # Not enough datapoints to evaluate the threshold.
        if state_value != "INSUFFICIENT_DATA":
            template = _("Insufficient Data: %d datapoints were unknown.")
            reason = template % (evaluation_periods - len(data))
            self._transition_alarm_state(alarmkey, alarm,
                                         "INSUFFICIENT_DATA", reason,
                                         reason_data, json_reason_data,
                                         old_state, query_time,
                                         query_date)
    else:
        # All datapoints must cross the threshold for ALARM.
        crossed = reduce(operator.and_, cmp_op(data, threshold))
        com_op = alarm["comparison_operator"]
        if crossed:
            template = _("Threshold Crossed: %d datapoints were %s "
                         "the threshold(%f). "
                         "The most recent datapoints: %s.")
            reason = template % (len(data), self.CMP_STR_MAP[com_op],
                                 threshold, recent_datapoints)
            if state_value != "ALARM":
                self._transition_alarm_state(alarmkey, alarm, "ALARM",
                                             reason, reason_data,
                                             json_reason_data, old_state,
                                             query_time, query_date)
        else:
            template = _("Threshold Crossed: %d datapoints were not %s "
                         "the threshold(%f). "
                         "The most recent datapoints: %s.")
            reason = template % (len(data), self.CMP_STR_MAP[com_op],
                                 threshold, recent_datapoints)
            if state_value != "OK":
                self._transition_alarm_state(alarmkey, alarm, "OK",
                                             reason, reason_data,
                                             json_reason_data, old_state,
                                             query_time, query_date)

def _transition_alarm_state(self, alarmkey, alarm, new_value, reason,
                            reason_data, json_reason_data, old_state,
                            query_time, query_date):
    """Apply one alarm state transition (shared tail of the three
    branches in _check_alarm): update memory and DB, record the history
    entry and fire the alarm action.
    """
    new_state = {"stateReason": reason,
                 "stateReasonData": reason_data,
                 "stateValue": new_value}
    self.update_alarm_state(alarmkey, new_value, reason,
                            json_reason_data, query_time)
    self.cass.update_alarm_state(alarmkey, new_value, reason,
                                 json_reason_data, query_time)
    self.alarm_history_state_update(alarmkey, alarm, new_state, old_state)
    self.do_alarm_action(alarmkey, alarm, new_state, old_state, query_date)
    storm.log("%s alarm" % new_value)
def load_alarms(self):
    """Fetch every alarm bound to this metric from the DB.

    :returns: dict mapping alarm key -> alarm row
    """
    loaded = dict(self.cass.load_alarms(self.metric_key))
    storm.log("load_alarms %s for metric %s" % (str(loaded),
                                                self.metric_key))
    return loaded