def __init__(self, **conf):
    """Build the storage handles, bookkeeping tables, queues and XMPP bot.

    ``conf`` is the parsed YAML configuration; its ``'database'`` section is
    forwarded as keyword arguments to every storage/XMPP helper created here.
    """
    # Keep the whole configuration around for later use.
    self.conf = conf
    db_conf = conf['database']

    # Storage handles (created in the same order as before, since each
    # constructor may open a connection).
    self.ranger = MagicShow(**db_conf)
    self.alertr = RedisClient(db='alert', **db_conf)
    self.hostr = RedisClient(db='hosts', **db_conf)
    # Blacklist manager.
    self.bm = BlackManager(**db_conf)

    # Per-policy bookkeeping tables.
    self.alarm_info = {}
    self.alarm_times = {}   # how many times each alarm has been sent
    self.conf_info = {}
    self.data_info = {}
    self.time_stamp = {'check_time': {}, 'alarm_time': {}}
    self.alarm_check = {}   # which policies are currently being checked

    # Main-loop run flag.
    self.alive = True

    # Queues to and from the worker processes.
    self.check_enqueue = multiprocessing.Queue()
    self.check_dequeue = multiprocessing.Queue()
    self.alarm_enqueue = multiprocessing.Queue()
    self.alarm_dequeue = multiprocessing.Queue()

    # Worker processes are tracked here.
    self.procs = {}

    # XMPP bot: connect and join the room straight away.
    self.xmpp = XMPPBOT(address=("125.39.223.167", 5222), **db_conf)
    self.xmpp.start()
    self.xmpp.enter_room()
class AlarmMain(object):
    """Main loop of the alarm service.

    Loads policy/collector configuration through the ``alert`` store, hands
    work to check and alarm worker processes over multiprocessing queues, and
    sends the resulting notifications by e-mail, SMS and XMPP.
    """

    # XMPP server endpoint used for the notification bot.
    XMPP_ADDRESS = ("125.39.223.167", 5222)
    # Seconds between two refreshes of the configuration and result data.
    CONF_REFRESH_SECONDS = 15
    # Matches ``$name$`` placeholders inside alarm templates.
    _PLACEHOLDER_RE = re.compile(r'\$(\w+)\$', re.I)

    def __init__(self, **conf):
        """Create storage handles, bookkeeping tables, queues and the XMPP bot.

        ``conf`` is the parsed YAML configuration; ``conf['database']`` is
        forwarded as keyword arguments to every helper created here.
        """
        # save yaml conf.
        self.conf = conf
        # redis handlers
        self.ranger = MagicShow(**conf['database'])
        self.alertr = RedisClient(db='alert', **conf['database'])
        self.hostr = RedisClient(db='hosts', **conf['database'])
        # blacklist manager.
        self.bm = BlackManager(**conf['database'])
        # latest check result per policy key
        self.alarm_info = dict()
        # alarm counters: item -> host -> real_key -> level -> count
        self.alarm_times = dict()
        self.conf_info = dict()
        self.data_info = dict()
        self.time_stamp = dict()
        self.time_stamp['check_time'] = dict()
        self.time_stamp['alarm_time'] = dict()
        # which ones are being checked
        self.alarm_check = dict()
        # main-loop run flag
        self.alive = True
        # queues to/from the worker processes
        self.check_enqueue = multiprocessing.Queue()
        self.check_dequeue = multiprocessing.Queue()
        self.alarm_enqueue = multiprocessing.Queue()
        self.alarm_dequeue = multiprocessing.Queue()
        # started worker processes
        self.procs = dict()
        # create xmpp object
        self.xmpp = XMPPBOT(address=self.XMPP_ADDRESS, **conf['database'])
        self.xmpp.start()
        self.xmpp.enter_room()

    def get_conf(self):
        """Load every policy and merge in its collector settings.

        Returns a dict keyed by ``'<policy item>|<conf type>'``. Each value is
        the decoded policy with ``interval``/``type`` (and ``port`` for tcp)
        copied from its collector and ``target`` expanded through
        ``self.ranger``. Entries that cannot be decoded, lack required fields
        or reference an unknown collector are skipped. As a side effect,
        ``self.alarm_info`` and ``self.alarm_times`` get an empty dict for
        every key that is new.
        """
        conf_info = dict()
        for conf_type in ['client', 'tcp', 'http']:
            policy_configs = self.alertr.hgetall('total:%s:policy:configs' % conf_type)
            collector_configs = self.alertr.hgetall('total:%s:collector:configs' % conf_type)
            for item, raw_policy in policy_configs.items():
                # Cheap pre-filter on the raw JSON text before decoding it.
                if 'collector' not in raw_policy:
                    continue
                try:
                    policy_conf = json.loads(raw_policy)
                except (ValueError, TypeError):
                    continue
                # The substring test above is not a key test: verify the
                # decoded record really carries the fields used below.
                if (not isinstance(policy_conf, dict)
                        or 'collector' not in policy_conf
                        or 'target' not in policy_conf):
                    print("Error malformed policy:%s, type:%s" % (item, conf_type))
                    continue
                collector_name = policy_conf['collector']
                target = policy_conf.pop('target')
                if collector_name not in collector_configs:
                    continue
                try:
                    collector_conf = json.loads(collector_configs[collector_name])
                except (ValueError, TypeError):
                    continue
                try:
                    interval = collector_conf['interval']
                    collector_type = collector_conf['type']
                    port = collector_conf['port'] if conf_type == 'tcp' else None
                except (KeyError, TypeError) as err:
                    print("Error malformed collector:%s, missing:%s" % (collector_name, err))
                    continue
                policy_conf['interval'] = interval
                policy_conf['target'] = self.ranger.load(target).show()
                policy_conf['type'] = collector_type
                if conf_type == 'tcp':
                    policy_conf['port'] = port
                uniq_key = '%s|%s' % (item, conf_type)
                conf_info[uniq_key] = policy_conf
                self.alarm_info.setdefault(uniq_key, dict())
                self.alarm_times.setdefault(uniq_key, dict())
        return conf_info

    def init_process(self):
        """Start ``2 * cpu_count`` check workers and as many alarm workers."""
        worker_count = multiprocessing.cpu_count() * 2
        pools = (
            (check_data_proc, self.check_enqueue, self.check_dequeue),
            (alarm_data_proc, self.alarm_enqueue, self.alarm_dequeue),
        )
        for worker, inbox, outbox in pools:
            for _ in range(worker_count):
                proc = Process(target=worker, args=(inbox, outbox))
                self.procs[proc] = 1
                proc.start()

    def get_data(self):
        """Attach the raw collector result hash to each loaded policy.

        For every policy whose ``result:<collector>`` key exists, the whole
        hash is stored under ``self.conf_info[item]['result']``.
        """
        for item, conf in self.conf_info.items():
            result_key = "result:%s" % conf['collector']
            if not self.alertr.exists(result_key):
                continue
            # Evaluating the results here would be too slow; the raw data is
            # handed over to the worker processes instead.
            self.conf_info[item]['result'] = self.alertr.hgetall(result_key)

    def save_data(self, item, data):
        """Remember the latest check result ``data`` for policy ``item``."""
        self.alarm_info[item] = data

    @classmethod
    def _render_template(cls, template, info):
        """Expand ``$name$`` placeholders in ``template`` from ``info``.

        A placeholder whose name is a key of ``info`` is replaced by
        ``str(info[name])``. An unknown placeholder is removed together with
        one directly adjacent comma on each side. Substituted text is not
        scanned again, so the loop always terminates. Newlines are stripped
        from the result.
        """
        pos = 0
        while True:
            found = cls._PLACEHOLDER_RE.search(template, pos)
            if not found:
                break
            start, end = found.span()
            key = found.group(1)
            if key in info:
                value = str(info[key])
            else:
                value = ''
                if start > 0 and template[start - 1] == ',':
                    start -= 1
                if end < len(template) and template[end] == ',':
                    end += 1
            template = template[:start] + value + template[end:]
            pos = start + len(value)
        return template.replace("\n", "")

    def _collect_recipients(self, groups):
        """Return ``(mobiles, emails, xmpps)`` for the given contact groups.

        Each list keeps first-seen order without duplicates. A group with no
        stored record or with undecodable JSON is skipped; the remaining
        groups are still processed.
        """
        mobiles, emails, xmpps = list(), list(), list()
        for group in groups:
            group_info = self.alertr.get('group:' + group)
            if not group_info:
                # One unknown group must not drop the recipients of the others.
                continue
            try:
                group_info = json.loads(group_info)
            except Exception as err:
                print("Error Exception:%s, Group:%s" % (err, group))
                continue
            for field, bucket in (('mobile', mobiles), ('email', emails), ('xmpp', xmpps)):
                for contact in group_info.get(field, []):
                    if contact not in bucket:
                        bucket.append(contact)
        return mobiles, emails, xmpps

    def send_alarm(self, item, data):
        """Turn one alarm worker result into notifications.

        ``data`` maps a state name (``'ALARM'`` and ``'RECOVERY'`` get special
        handling) to a list of info dicts. For each entry the template is
        rendered, blacklisted hosts are only recorded, alarm counters are
        updated and checked against ``info['limit']``, and, when the current
        hour is inside ``info['period']``, the message is queued for each
        channel named in ``info['method']``. Finally the
        ``total:alarm:errors`` record for ``item`` is updated or removed and
        the queued messages are sent; send failures are printed, not raised.
        """
        email_error = dict()
        smess_error = dict()
        xmpp_error = dict()
        total_error = list()
        for k, v in data.items():
            for info in v:
                template = self._render_template(info['template'], info)
                real_key, host, name, groups = (
                    info['real_key'], info['host'], info['name'], info['group'])
                host_info = self.hostr.hgetall(host)
                cur_time = strftime('%H:%M:%S', localtime())
                if host_info:
                    # Tolerate host records that lack some descriptive fields.
                    production = host_info.get('production', '')
                    idc = host_info.get('idc', '')
                    service = host_info.get('service', '')
                    emessage = '%s|%s|%s|%s|%s' % (k, template, production, idc, service)
                    xmpp_mess = '[%s]%s -- 机房:%s, 产品线:%s, 服务:%s 报警时间:%s' \
                        % (k, template, idc, production, service, cur_time)
                else:
                    emessage = '%s|%s|%s|%s|%s' % (k, template, '', '', '')
                    xmpp_mess = '[%s]%s -- 报警时间:%s' % (k, template, cur_time)
                # SMS text: drop anything after '?' in the host name.
                message = '[%s]%s' % (k, template.replace(host, host.split('?')[0]))
                if 'message' in info and len(info['message']) >= 100:
                    message = message.encode().replace(
                        info['message'], '(output too long,read email)', 1)
                level = str(info['level'])
                readable_time = strftime('%Y-%m-%d %H:%M:%S')

                if self.bm.check(host, name):
                    # Blacklisted: record it, never notify.
                    total_error.append('%s|%s' % (emessage, 'is_black'))
                    continue

                if k == 'ALARM':
                    total_error.append('%s|%s' % (emessage, 'is_not_black'))
                    level_counts = (self.alarm_times.setdefault(item, dict())
                                    .setdefault(host, dict())
                                    .setdefault(real_key, dict()))
                    level_counts.setdefault(level, 0)
                    # A limit of 0 means unlimited.
                    if info['limit'] != 0 and level_counts[level] >= info['limit']:
                        print("%s * message:%s, limitation:%s"
                              % (readable_time, message, info['limit']))
                        continue
                elif k == 'RECOVERY':
                    # Forget the stored alarm; it may already be gone.
                    try:
                        del self.alarm_info[item][host][real_key]
                    except (LookupError, TypeError):
                        pass
                    # Only report a recovery for a level that was alarmed.
                    try:
                        host_times = self.alarm_times[item][host]
                        if level not in host_times[real_key]:
                            continue
                        del host_times[real_key]
                    except KeyError:
                        continue

                mobiles, emails, xmpps = self._collect_recipients(groups)
                print("%s * message:%s, method:%s"
                      % (readable_time, xmpp_mess, info['method']))
                if not self.check_alarm_time(info['period']):
                    continue
                if k == "ALARM":
                    self.alarm_times[item][host][real_key][level] += 1
                method = info['method']
                if 'email' in method:
                    for address in emails:
                        email_error.setdefault(address, dict()) \
                            .setdefault(k, dict())[emessage] = 1
                if 'sms' in method:
                    for mobile in mobiles:
                        smess_error.setdefault(mobile, dict())[message] = 1
                if 'xmpp' in method:
                    for jid in xmpps:
                        xmpp_error.setdefault(jid, dict())[xmpp_mess] = 1

        # Nothing recorded for this item: delete its record in total:alarm:errors.
        if not total_error:
            self.alertr.hdel('total:alarm:errors', item)
        else:
            self.alertr.hset('total:alarm:errors', item, json.dumps(total_error))
        # Delivery is best effort: report a failing channel and carry on.
        if email_error:
            try:
                send_email(email_error)
            except Exception as err:
                print(str(err))
        if smess_error:
            try:
                send_smess(smess_error)
            except Exception as err:
                print(str(err))
        if xmpp_error:
            try:
                send_xmpp(self.xmpp, xmpp_error)
            except Exception as err:
                print(str(err))

    def clean_memory(self):
        """Drop stale error records and timestamps.

        Removes ``total:alarm:errors`` fields whose item has no (or an empty)
        entry in ``self.alarm_info`` and prunes check/alarm timestamps of
        items that are no longer configured.
        """
        for item in self.alertr.hkeys('total:alarm:errors'):
            if not self.alarm_info.get(item):
                self.alertr.hdel('total:alarm:errors', item)
        # delete timestamps of policies that no longer exist
        for stamp_kind in ('alarm_time', 'check_time'):
            self.time_stamp[stamp_kind] = dict(
                (key, stamp) for key, stamp in self.time_stamp[stamp_kind].items()
                if key in self.conf_info)

    def check_alarm_time(self, period):
        """Return True when the current local hour is inside any range.

        ``period`` is an iterable of ``'<start>-<stop>'`` hour strings, both
        ends inclusive. A range with equal ends is ignored.
        NOTE(review): a range with start > stop (e.g. ``'22-6'``) never
        matches; confirm whether wrap-around ranges are expected.
        """
        cur_hour = localtime()[3]
        for times in period:
            start, stop = times.split('-')
            start, stop = int(start), int(stop)
            if start == stop:
                continue
            if start <= cur_hour <= stop:
                return True
        return False

    def close_alarm(self, a, b):
        """Signal handler: disconnect the XMPP bot and exit with status 1.

        ``a`` and ``b`` are the signal number and stack frame passed by the
        ``signal`` module; neither is used.
        """
        self.alive = False
        try:
            self.xmpp.disconnect()
        finally:
            # Exit even when the disconnect itself fails.
            raise SystemExit(1)

    def run(self):
        """Start the workers and run the scheduling loop until stopped.

        Each iteration refreshes configuration and result data when they are
        at least ``CONF_REFRESH_SECONDS`` old, enqueues due checks, stores
        finished check results, enqueues due alarms, sends finished alarms and
        prunes stale state.
        """
        self.init_process()
        # Handlers only need to be installed once, not on every iteration.
        for sig in (signal.SIGTERM, signal.SIGINT, signal.SIGHUP):
            signal.signal(sig, self.close_alarm)
        while self.alive:
            # get all agent policy
            cur_time = int(time())
            if ('conf_data' not in self.time_stamp
                    or cur_time - self.time_stamp['conf_data'] >= self.CONF_REFRESH_SECONDS):
                self.conf_info = self.get_conf()
                self.get_data()
                self.time_stamp['conf_data'] = time()

            # put data into check queue.
            check_stamps = self.time_stamp['check_time']
            for item, conf in self.conf_info.items():
                if (item not in check_stamps
                        or cur_time - check_stamps[item] >= conf['interval']):
                    self.check_enqueue.put([item, conf, self.alarm_info[item]])
                    check_stamps[item] = time()

            # read data from check queue.
            while True:
                try:
                    item, data = self.check_dequeue.get(1, 0.1)
                except Queue.Empty:
                    break
                else:
                    self.save_data(item, data)

            # put data into alarm queue.
            alarm_stamps = self.time_stamp['alarm_time']
            # Iterate over a snapshot because entries are deleted in the loop.
            for item in list(self.alarm_info.keys()):
                if item not in self.conf_info:
                    del self.alarm_info[item]
                    continue
                if not self.alarm_info[item]:
                    alarm_stamps.pop(item, None)
                    continue
                if (item not in alarm_stamps
                        or cur_time - alarm_stamps[item] >= self.conf_info[item]['rate']):
                    self.alarm_enqueue.put(
                        [item, self.conf_info[item], self.alarm_info[item]])
                    alarm_stamps[item] = cur_time

            # read data from alarm queue.
            while True:
                try:
                    item, data = self.alarm_dequeue.get(1, 0.1)
                except Queue.Empty:
                    break
                else:
                    # send alarm data
                    self.send_alarm(item, data)

            # delete recovery errors.
            self.clean_memory()